diff --git a/.cursor/commands/qa.md b/.cursor/commands/qa.md
index 459bf56..46fb874 100644
--- a/.cursor/commands/qa.md
+++ b/.cursor/commands/qa.md
@@ -230,7 +230,7 @@ Once all deployments are complete, present the human with these invoke commands
kernel invoke ts-basic get-page-title --payload '{"url": "https://www.google.com"}'
kernel invoke ts-captcha-solver test-captcha-solver
kernel invoke ts-stagehand teamsize-task --payload '{"company": "Kernel"}'
-kernel invoke ts-anthropic-cua cua-task --payload '{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}'
+kernel invoke ts-anthropic-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. You are done successfully when the items are moved.", "record_replay": true}'
kernel invoke ts-magnitude mag-url-extract --payload '{"url": "https://en.wikipedia.org/wiki/Special:Random"}'
kernel invoke ts-openai-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}'
kernel invoke ts-gemini-cua gemini-cua-task --payload '{"startingUrl": "https://www.magnitasks.com/", "instruction": "Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board? You are done successfully when the items are moved."}'
@@ -240,7 +240,7 @@ kernel invoke ts-claude-agent-sdk agent-task --payload '{"task": "Go to https://
kernel invoke python-basic get-page-title --payload '{"url": "https://www.google.com"}'
kernel invoke python-captcha-solver test-captcha-solver
kernel invoke python-bu bu-task --payload '{"task": "Compare the price of gpt-4o and DeepSeek-V3"}'
-kernel invoke python-anthropic-cua cua-task --payload '{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}'
+kernel invoke python-anthropic-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. You are done successfully when the items are moved.", "record_replay": true}'
kernel invoke python-openai-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}'
kernel invoke python-openagi-cua openagi-default-task -p '{"instruction": "Navigate to https://agiopen.org and click the What is Computer Use? button"}'
kernel invoke py-claude-agent-sdk agent-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 3 stories"}'
diff --git a/pkg/create/templates.go b/pkg/create/templates.go
index ca9067c..f99c4e6 100644
--- a/pkg/create/templates.go
+++ b/pkg/create/templates.go
@@ -178,7 +178,7 @@ var Commands = map[string]map[string]DeployConfig{
TemplateAnthropicComputerUse: {
EntryPoint: "index.ts",
NeedsEnvFile: true,
- InvokeCommand: `kernel invoke ts-anthropic-cua cua-task --payload '{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}'`,
+ InvokeCommand: `kernel invoke ts-anthropic-cua cua-task --payload '{"query": "Navigate to http://magnitasks.com and click on Tasks in the sidebar"}'`,
},
TemplateMagnitude: {
EntryPoint: "index.ts",
@@ -220,7 +220,7 @@ var Commands = map[string]map[string]DeployConfig{
TemplateAnthropicComputerUse: {
EntryPoint: "main.py",
NeedsEnvFile: true,
- InvokeCommand: `kernel invoke python-anthropic-cua cua-task --payload '{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}'`,
+ InvokeCommand: `kernel invoke python-anthropic-cua cua-task --payload '{"query": "Navigate to http://magnitasks.com and click on Tasks in the sidebar"}'`,
},
TemplateOpenAIComputerUse: {
EntryPoint: "main.py",
diff --git a/pkg/templates/python/anthropic-computer-use/README.md b/pkg/templates/python/anthropic-computer-use/README.md
index a5d8b11..376f30f 100644
--- a/pkg/templates/python/anthropic-computer-use/README.md
+++ b/pkg/templates/python/anthropic-computer-use/README.md
@@ -1,7 +1,47 @@
# Kernel Python Sample App - Anthropic Computer Use
-This is a simple Kernel application that implements a prompt loop using Anthropic Computer Use.
+This is a Kernel application that implements a prompt loop using Anthropic Computer Use with Kernel's Computer Controls API.
-It generally follows the [Anthropic Reference Implementation](https://github.com/anthropics/anthropic-quickstarts/tree/main/computer-use-demo) but replaces `xodotool` and `gnome-screenshot` with Playwright.
+It generally follows the [Anthropic Reference Implementation](https://github.com/anthropics/anthropic-quickstarts/tree/main/computer-use-demo) but uses Kernel's Computer Controls API instead of `xdotool` and `gnome-screenshot`.
-See the [docs](https://www.kernel.sh/docs/quickstart) for information.
\ No newline at end of file
+## Setup
+
+1. Get your API keys:
+ - **Kernel**: [dashboard.onkernel.com](https://dashboard.onkernel.com)
+ - **Anthropic**: [console.anthropic.com](https://console.anthropic.com)
+
+2. Deploy the app:
+```bash
+kernel login
+cp .env.example .env # then edit .env to add your ANTHROPIC_API_KEY
+kernel deploy main.py --env-file .env
+```
+
+## Usage
+
+```bash
+kernel invoke python-anthropic-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}'
+```
+
+## Recording Replays
+
+> **Note:** Replay recording is only available to Kernel users on paid plans.
+
+Add `"record_replay": true` to your payload to capture a video of the browser session:
+
+```bash
+kernel invoke python-anthropic-cua cua-task --payload '{"query": "Navigate to https://example.com", "record_replay": true}'
+```
+
+When enabled, the response will include a `replay_url` field with a link to view the recorded session.
+
+## Known Limitations
+
+### Cursor Position
+
+The `cursor_position` action is not supported with Kernel's Computer Controls API. If the model attempts to use this action, an error will be returned. This is a known limitation that does not significantly impact most computer use workflows, as the model typically tracks cursor position through screenshots.
+
+## Resources
+
+- [Anthropic Computer Use Documentation](https://docs.anthropic.com/en/docs/build-with-claude/computer-use)
+- [Kernel Documentation](https://www.kernel.sh/docs/quickstart)
diff --git a/pkg/templates/python/anthropic-computer-use/loop.py b/pkg/templates/python/anthropic-computer-use/loop.py
index e4711b1..4062088 100644
--- a/pkg/templates/python/anthropic-computer-use/loop.py
+++ b/pkg/templates/python/anthropic-computer-use/loop.py
@@ -1,25 +1,16 @@
"""
Agentic sampling loop that calls the Anthropic API and local implementation of anthropic-defined computer use tools.
From https://github.com/anthropics/anthropic-quickstarts/blob/main/computer-use-demo/computer_use_demo/loop.py
+Modified to use Kernel Computer Controls API instead of Playwright.
"""
import os
-import platform
-from collections.abc import Callable
from datetime import datetime
from enum import StrEnum
from typing import Any, cast
-from playwright.async_api import Page
-
-import httpx
-from anthropic import (
- Anthropic,
- AnthropicBedrock,
- AnthropicVertex,
- APIError,
- APIResponseValidationError,
- APIStatusError,
-)
+
+from kernel import Kernel
+from anthropic import Anthropic
from anthropic.types.beta import (
BetaCacheControlEphemeralParam,
BetaContentBlockParam,
@@ -78,6 +69,8 @@ async def sampling_loop(
model: str,
messages: list[BetaMessageParam],
api_key: str,
+ kernel: Kernel,
+ session_id: str,
provider: APIProvider = APIProvider.ANTHROPIC,
system_prompt_suffix: str = "",
only_n_most_recent_images: int | None = None,
@@ -85,7 +78,6 @@ async def sampling_loop(
tool_version: ToolVersion = "computer_use_20250124",
thinking_budget: int | None = None,
token_efficient_tools_beta: bool = False,
- playwright_page: Page,
):
"""
Agentic sampling loop for the assistant/tool interaction of computer use.
@@ -94,6 +86,8 @@ async def sampling_loop(
model: The model to use for the API call
messages: The conversation history
api_key: The API key for authentication
+ kernel: The Kernel client instance
+ session_id: The Kernel browser session ID
provider: The API provider (defaults to ANTHROPIC)
system_prompt_suffix: Additional system prompt text (defaults to empty string)
only_n_most_recent_images: Optional limit on number of recent images to keep
@@ -101,12 +95,11 @@ async def sampling_loop(
tool_version: Version of tools to use (defaults to V20250124)
thinking_budget: Optional token budget for thinking
token_efficient_tools_beta: Whether to use token efficient tools beta
- playwright_page: The Playwright page instance for browser automation
"""
tool_group = TOOL_GROUPS_BY_VERSION[tool_version]
tool_collection = ToolCollection(
*(
- ToolCls(page=playwright_page if ToolCls.__name__.startswith("ComputerTool") else None)
+ ToolCls(kernel=kernel, session_id=session_id) if ToolCls.__name__.startswith("ComputerTool") else ToolCls()
for ToolCls in tool_group.tools
)
)
@@ -252,21 +245,31 @@ def _response_to_params(
) -> list[BetaContentBlockParam]:
res: list[BetaContentBlockParam] = []
for block in response.content:
- if isinstance(block, BetaTextBlock):
- if block.text:
+ block_type = getattr(block, "type", None)
+
+ if block_type == "thinking":
+ thinking_block = {
+ "type": "thinking",
+ "thinking": getattr(block, "thinking", None),
+ }
+ if hasattr(block, "signature"):
+ thinking_block["signature"] = getattr(block, "signature", None)
+ res.append(cast(BetaContentBlockParam, thinking_block))
+ elif block_type == "text" or isinstance(block, BetaTextBlock):
+ if getattr(block, "text", None):
res.append(BetaTextBlockParam(type="text", text=block.text))
- elif getattr(block, "type", None) == "thinking":
- # Handle thinking blocks - include signature field
- thinking_block = {
- "type": "thinking",
- "thinking": getattr(block, "thinking", None),
- }
- if hasattr(block, "signature"):
- thinking_block["signature"] = getattr(block, "signature", None)
- res.append(cast(BetaContentBlockParam, thinking_block))
+ elif block_type == "tool_use":
+ tool_use_block: BetaToolUseBlockParam = {
+ "type": "tool_use",
+ "id": block.id,
+ "name": block.name,
+ "input": block.input,
+ }
+ res.append(tool_use_block)
else:
- # Handle tool use blocks normally
- res.append(cast(BetaToolUseBlockParam, block.model_dump()))
+ # Preserve unexpected block types to avoid silently dropping content
+ if hasattr(block, "model_dump"):
+ res.append(cast(BetaContentBlockParam, block.model_dump()))
return res
@@ -334,4 +337,4 @@ def _make_api_tool_result(
def _maybe_prepend_system_tool_result(result: ToolResult, result_text: str):
if result.system:
result_text = f"{result.system}\n{result_text}"
- return result_text
\ No newline at end of file
+ return result_text
diff --git a/pkg/templates/python/anthropic-computer-use/main.py b/pkg/templates/python/anthropic-computer-use/main.py
index e53090a..51b571d 100644
--- a/pkg/templates/python/anthropic-computer-use/main.py
+++ b/pkg/templates/python/anthropic-computer-use/main.py
@@ -1,97 +1,92 @@
import os
-from typing import Dict, TypedDict
+from typing import Dict, Optional, TypedDict
import kernel
-from kernel import Kernel
from loop import sampling_loop
-from playwright.async_api import async_playwright
+from session import KernelBrowserSession
class QueryInput(TypedDict):
query: str
+ record_replay: Optional[bool]
class QueryOutput(TypedDict):
result: str
+ replay_url: Optional[str]
api_key = os.getenv("ANTHROPIC_API_KEY")
if not api_key:
raise ValueError("ANTHROPIC_API_KEY is not set")
-client = Kernel()
app = kernel.App("python-anthropic-cua")
+
@app.action("cua-task")
async def cua_task(
ctx: kernel.KernelContext,
payload: QueryInput,
) -> QueryOutput:
- # A function that processes a user query using a browser-based sampling loop
-
- # Args:
- # ctx: Kernel context containing invocation information
- # payload: An object containing a query string to process
-
- # Returns:
- # A dictionary containing the result of the sampling loop as a string
+ """
+ Process a user query using Anthropic Computer Use with Kernel's browser automation.
+
+ Args:
+ ctx: Kernel context containing invocation information
+ payload: An object containing:
+ - query: The task/query string to process
+ - record_replay: Optional boolean to enable video replay recording
+
+ Returns:
+ A dictionary containing:
+ - result: The result of the sampling loop as a string
+ - replay_url: URL to view the replay (if recording was enabled)
+ """
if not payload or not payload.get("query"):
raise ValueError("Query is required")
- kernel_browser = client.browsers.create(
- invocation_id=ctx.invocation_id, stealth=True
- )
- print("Kernel browser live view url: ", kernel_browser.browser_live_view_url)
-
- try:
- async with async_playwright() as playwright:
- browser = await playwright.chromium.connect_over_cdp(
- kernel_browser.cdp_ws_url
- )
- context = (
- browser.contexts[0] if browser.contexts else await browser.new_context()
+ record_replay = payload.get("record_replay", False)
+
+ async with KernelBrowserSession(
+ stealth=True,
+ record_replay=record_replay,
+ ) as session:
+ print("Kernel browser live view url:", session.live_view_url)
+
+ final_messages = await sampling_loop(
+ model="claude-sonnet-4-5-20250929",
+ messages=[
+ {
+ "role": "user",
+ "content": payload["query"],
+ }
+ ],
+ api_key=str(api_key),
+ thinking_budget=1024,
+ kernel=session.kernel,
+ session_id=session.session_id,
+ )
+
+ if not final_messages:
+ raise ValueError("No messages were generated during the sampling loop")
+
+ last_message = final_messages[-1]
+ if not last_message:
+ raise ValueError(
+ "Failed to get the last message from the sampling loop"
)
- page = context.pages[0] if context.pages else await context.new_page()
-
- # Run the sampling loop
- final_messages = await sampling_loop(
- model="claude-sonnet-4-20250514",
- messages=[
- {
- "role": "user",
- "content": payload["query"],
- }
- ],
- api_key=str(api_key),
- thinking_budget=1024,
- playwright_page=page,
+
+ result = ""
+ if isinstance(last_message.get("content"), str):
+ result = last_message["content"] # type: ignore[assignment]
+ else:
+ result = "".join(
+ block["text"]
+ for block in last_message["content"] # type: ignore[index]
+ if isinstance(block, Dict) and block.get("type") == "text"
)
- # Extract the final result
- if not final_messages:
- raise ValueError("No messages were generated during the sampling loop")
-
- last_message = final_messages[-1]
- if not last_message:
- raise ValueError(
- "Failed to get the last message from the sampling loop"
- )
-
- result = ""
- if isinstance(last_message.get("content"), str):
- result = last_message["content"] # type: ignore[assignment]
- else:
- result = "".join(
- block["text"]
- for block in last_message["content"] # type: ignore[index]
- if isinstance(block, Dict) and block.get("type") == "text"
- )
-
- return {"result": result}
- except Exception as exc:
- print(f"Error in sampling loop: {exc}")
- raise
- finally:
- if browser is not None:
- await browser.close()
- client.browsers.delete_by_id(kernel_browser.session_id)
+ return {
+ "result": result,
+ "replay_url": session.replay_view_url,
+ }
diff --git a/pkg/templates/python/anthropic-computer-use/pyproject.toml b/pkg/templates/python/anthropic-computer-use/pyproject.toml
index f9a7686..f5b75de 100644
--- a/pkg/templates/python/anthropic-computer-use/pyproject.toml
+++ b/pkg/templates/python/anthropic-computer-use/pyproject.toml
@@ -5,11 +5,9 @@ description = "Kernel reference app for Anthropic Computer Use"
requires-python = ">=3.9"
dependencies = [
"anthropic>=0.75.0",
- "playwright>=1.56.0",
"python-dateutil>=2.9.0",
"pydantic>=2.12.5",
"typing-extensions>=4.15.0",
- "kernel>=0.23.0",
+ "kernel>=0.24.0",
"python-dotenv>=1.2.1",
- "httpx>=0.28.1",
]
diff --git a/pkg/templates/python/anthropic-computer-use/session.py b/pkg/templates/python/anthropic-computer-use/session.py
new file mode 100644
index 0000000..3227b28
--- /dev/null
+++ b/pkg/templates/python/anthropic-computer-use/session.py
@@ -0,0 +1,149 @@
+"""
+Kernel Browser Session Manager.
+
+Provides an async context manager for managing Kernel browser lifecycle
+with optional video replay recording.
+"""
+
+import asyncio
+import time
+from dataclasses import dataclass, field
+from typing import Optional
+
+from kernel import Kernel
+
+
+@dataclass
+class KernelBrowserSession:
+ """
+ Manages Kernel browser lifecycle as an async context manager.
+
+ Creates a browser session on entry and cleans it up on exit.
+ Optionally records a video replay of the entire session.
+ Provides session_id to computer tools.
+
+ Usage:
+ async with KernelBrowserSession(record_replay=True) as session:
+ # Use session.session_id and session.kernel for operations
+ pass
+ # Browser is automatically cleaned up on exit; the replay URL is then available in session.replay_view_url
+ """
+
+ stealth: bool = True
+ timeout_seconds: int = 300
+
+ # Replay recording options
+ record_replay: bool = False
+ replay_grace_period: float = 5.0 # Seconds to wait before stopping replay
+
+ # Set after browser creation
+ session_id: Optional[str] = field(default=None, init=False)
+ live_view_url: Optional[str] = field(default=None, init=False)
+ replay_id: Optional[str] = field(default=None, init=False)
+ replay_view_url: Optional[str] = field(default=None, init=False)
+ _kernel: Optional[Kernel] = field(default=None, init=False)
+
+ async def __aenter__(self) -> "KernelBrowserSession":
+ """Create a Kernel browser session and optionally start recording."""
+ self._kernel = Kernel()
+
+ # Create browser with specified settings
+ browser = self._kernel.browsers.create(
+ stealth=self.stealth,
+ timeout_seconds=self.timeout_seconds,
+ viewport={
+ "width": 1024,
+ "height": 768,
+ "refresh_rate": 60,
+ },
+ )
+
+ self.session_id = browser.session_id
+ self.live_view_url = browser.browser_live_view_url
+
+ print(f"Kernel browser created: {self.session_id}")
+ print(f"Live view URL: {self.live_view_url}")
+
+ # Start replay recording if enabled
+ if self.record_replay:
+ try:
+ await self._start_replay()
+ except Exception as e:
+ print(f"Warning: Failed to start replay recording: {e}")
+ print("Continuing without replay recording.")
+
+ return self
+
+ async def _start_replay(self) -> None:
+ """Start recording a replay of the browser session."""
+ if not self._kernel or not self.session_id:
+ return
+
+ print("Starting replay recording...")
+ replay = self._kernel.browsers.replays.start(self.session_id)
+ self.replay_id = replay.replay_id
+ print(f"Replay recording started: {self.replay_id}")
+
+ async def _stop_and_get_replay_url(self) -> None:
+ """Stop recording and get the replay URL."""
+ if not self._kernel or not self.session_id or not self.replay_id:
+ return
+
+ print("Stopping replay recording...")
+ self._kernel.browsers.replays.stop(
+ replay_id=self.replay_id,
+ id=self.session_id,
+ )
+ print("Replay recording stopped. Processing video...")
+
+ # Wait a moment for processing
+ await asyncio.sleep(2)
+
+ # Poll for replay to be ready (with timeout)
+ max_wait = 60 # seconds
+ start_time = time.time()
+ replay_ready = False
+
+ while time.time() - start_time < max_wait:
+ try:
+ replays = self._kernel.browsers.replays.list(self.session_id)
+ for replay in replays:
+ if replay.replay_id == self.replay_id:
+ self.replay_view_url = replay.replay_view_url
+ replay_ready = True
+ break
+ if replay_ready:
+ break
+ except Exception:
+ pass
+ await asyncio.sleep(1)
+
+ if not replay_ready:
+ print("Warning: Replay may still be processing")
+ elif self.replay_view_url:
+ print(f"Replay view URL: {self.replay_view_url}")
+
+ async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+ """Stop recording and delete the browser session."""
+ if self._kernel and self.session_id:
+ try:
+ # Stop replay if recording was enabled
+ if self.record_replay and self.replay_id:
+ # Wait grace period before stopping to capture final state
+ if self.replay_grace_period > 0:
+ print(f"Waiting {self.replay_grace_period}s grace period...")
+ await asyncio.sleep(self.replay_grace_period)
+ await self._stop_and_get_replay_url()
+ finally:
+ print(f"Destroying browser session: {self.session_id}")
+ self._kernel.browsers.delete_by_id(self.session_id)
+ print("Browser session destroyed.")
+
+ self._kernel = None
+
+ @property
+ def kernel(self) -> Kernel:
+ """Get the Kernel client instance."""
+ if self._kernel is None:
+ raise RuntimeError("Session not initialized. Use async with context.")
+ return self._kernel
diff --git a/pkg/templates/python/anthropic-computer-use/tools/computer.py b/pkg/templates/python/anthropic-computer-use/tools/computer.py
index 60f7522..654a289 100644
--- a/pkg/templates/python/anthropic-computer-use/tools/computer.py
+++ b/pkg/templates/python/anthropic-computer-use/tools/computer.py
@@ -1,53 +1,42 @@
"""
+Computer tool using Kernel's Computer Controls API.
Modified from https://github.com/anthropics/anthropic-quickstarts/blob/main/computer-use-demo/computer_use_demo/tools/computer.py
-Replaces xdotool and gnome-screenshot with Playwright.
+Replaces xdotool and gnome-screenshot (and this template's earlier Playwright implementation) with Kernel's Computer Controls API.
"""
import asyncio
import base64
-import os
-from enum import StrEnum
from typing import Literal, TypedDict, cast, get_args
-from playwright.async_api import Page
-
+from kernel import Kernel
from anthropic.types.beta import BetaToolComputerUse20241022Param, BetaToolUnionParam
from .base import BaseAnthropicTool, ToolError, ToolResult
TYPING_DELAY_MS = 12
-TYPING_GROUP_SIZE = 50
-
-# Map alternative names to standard Playwright modifier keys
-MODIFIER_KEY_MAP = {
- 'ctrl': 'Control',
- 'alt': 'Alt',
- 'cmd': 'Meta',
- 'command': 'Meta',
- 'win': 'Meta',
-}
-# Essential key mappings for Playwright compatibility
+# Key mappings for Kernel Computer Controls API
+# Map common key names to xdotool-compatible format that Kernel uses
KEY_MAP = {
- 'return': 'Enter',
- 'space': ' ',
- 'left': 'ArrowLeft',
- 'right': 'ArrowRight',
- 'up': 'ArrowUp',
- 'down': 'ArrowDown',
+ 'return': 'Return',
+ 'enter': 'Return',
+ 'space': 'space',
+ 'left': 'Left',
+ 'right': 'Right',
+ 'up': 'Up',
+ 'down': 'Down',
'home': 'Home',
'end': 'End',
- 'pageup': 'PageUp',
- 'page_up': 'PageUp',
- 'pagedown': 'PageDown',
- 'page_down': 'PageDown',
+ 'pageup': 'Page_Up',
+ 'page_up': 'Page_Up',
+ 'pagedown': 'Page_Down',
+ 'page_down': 'Page_Down',
'delete': 'Delete',
- 'backspace': 'Backspace',
+ 'backspace': 'BackSpace',
'tab': 'Tab',
'esc': 'Escape',
'escape': 'Escape',
'insert': 'Insert',
- 'super_l': 'Meta',
'f1': 'F1',
'f2': 'F2',
'f3': 'F3',
@@ -60,9 +49,21 @@
'f10': 'F10',
'f11': 'F11',
'f12': 'F12',
- 'minus': '-',
- 'equal': '=',
- 'plus': '+',
+ 'minus': 'minus',
+ 'equal': 'equal',
+ 'plus': 'plus',
+}
+
+# Modifier key mappings
+MODIFIER_KEY_MAP = {
+ 'ctrl': 'ctrl',
+ 'control': 'ctrl',
+ 'alt': 'alt',
+ 'cmd': 'super',
+ 'command': 'super',
+ 'win': 'super',
+ 'meta': 'super',
+ 'shift': 'shift',
}
Action_20241022 = Literal[
@@ -92,33 +93,30 @@
ScrollDirection = Literal["up", "down", "left", "right"]
-# Map Playwright mouse buttons to our actions
-MOUSE_BUTTONS = {
- "left_click": "left",
- "right_click": "right",
- "middle_click": "middle",
-}
class ComputerToolOptions(TypedDict):
display_height_px: int
display_width_px: int
display_number: int | None
-def chunks(s: str, chunk_size: int) -> list[str]:
- return [s[i : i + chunk_size] for i in range(0, len(s), chunk_size)]
class BaseComputerTool:
"""
- A tool that allows the agent to interact with the screen, keyboard, and mouse using Playwright.
+ A tool that allows the agent to interact with the screen, keyboard, and mouse using Kernel's Computer Controls API.
The tool parameters are defined by Anthropic and are not editable.
"""
name: Literal["computer"] = "computer"
- width: int = 1280
- height: int = 720
+ width: int = 1024
+ height: int = 768
display_num: int | None = None
- page: Page | None = None
-
+
+ # Kernel client and session
+ kernel: Kernel | None = None
+ session_id: str | None = None
+
+ # Track last mouse position for drag operations
+ _last_mouse_position: tuple[int, int] = (0, 0)
_screenshot_delay = 2.0
@property
@@ -129,9 +127,10 @@ def options(self) -> ComputerToolOptions:
"display_number": self.display_num,
}
- def __init__(self, page: Page | None = None):
+ def __init__(self, kernel: Kernel | None = None, session_id: str | None = None):
super().__init__()
- self.page = page
+ self.kernel = kernel
+ self.session_id = session_id
def validate_coordinates(self, coordinate: tuple[int, int] | list[int] | None = None) -> tuple[int, int] | None:
"""Validate that coordinates are non-negative integers and convert lists to tuples if needed."""
@@ -152,23 +151,30 @@ def validate_coordinates(self, coordinate: tuple[int, int] | list[int] | None =
return coordinate
def map_key(self, key: str) -> str:
- """Map a key to its Playwright equivalent."""
+ """Map a key to its Kernel/xdotool equivalent."""
+ key_lower = key.lower().strip()
+
# Handle modifier keys
- if key.lower() in MODIFIER_KEY_MAP:
- return MODIFIER_KEY_MAP[key.lower()]
+ if key_lower in MODIFIER_KEY_MAP:
+ return MODIFIER_KEY_MAP[key_lower]
# Handle special keys
- if key.lower() in KEY_MAP:
- return KEY_MAP[key.lower()]
+ if key_lower in KEY_MAP:
+ return KEY_MAP[key_lower]
# Handle key combinations (e.g. "ctrl+a")
if '+' in key:
parts = key.split('+')
- if len(parts) == 2:
- modifier, main_key = parts
- mapped_modifier = MODIFIER_KEY_MAP.get(modifier.lower(), modifier)
- mapped_key = KEY_MAP.get(main_key.lower(), main_key)
- return f"{mapped_modifier}+{mapped_key}"
+ mapped_parts = []
+ for part in parts:
+ part = part.strip().lower()
+ if part in MODIFIER_KEY_MAP:
+ mapped_parts.append(MODIFIER_KEY_MAP[part])
+ elif part in KEY_MAP:
+ mapped_parts.append(KEY_MAP[part])
+ else:
+ mapped_parts.append(part)
+ return '+'.join(mapped_parts)
# Return the key as is if no mapping exists
return key
@@ -181,8 +187,8 @@ async def __call__(
coordinate: tuple[int, int] | list[int] | None = None,
**kwargs,
):
- if not self.page:
- raise ToolError("Playwright page not initialized")
+ if not self.kernel or not self.session_id:
+ raise ToolError("Kernel client or session not initialized")
if action in ("mouse_move", "left_click_drag"):
if coordinate is None:
@@ -194,12 +200,25 @@ async def __call__(
x, y = coordinate
if action == "mouse_move":
- await self.page.mouse.move(x, y)
+ self.kernel.browsers.computer.move_mouse(
+ id=self.session_id,
+ x=x,
+ y=y,
+ )
+ self._last_mouse_position = (x, y)
return await self.screenshot()
elif action == "left_click_drag":
- await self.page.mouse.down(button="left")
- await self.page.mouse.move(x, y)
- await self.page.mouse.up(button="left")
+ start_coord = kwargs.get("start_coordinate")
+ start_x, start_y = self.validate_coordinates(start_coord) if start_coord else self._last_mouse_position
+
+ print(f"Dragging from ({start_x}, {start_y}) to ({x}, {y})")
+
+ self.kernel.browsers.computer.drag_mouse(
+ id=self.session_id,
+ path=[[start_x, start_y], [x, y]],
+ button="left",
+ )
+ self._last_mouse_position = (x, y)
return await self.screenshot()
if action in ("key", "type"):
@@ -208,22 +227,22 @@ async def __call__(
if coordinate is not None:
raise ToolError(f"coordinate is not accepted for {action}")
if not isinstance(text, str):
- raise ToolError(output=f"{text} must be a string")
+ raise ToolError(f"{text} must be a string")
if action == "key":
mapped_key = self.map_key(text)
- await self.page.keyboard.press(mapped_key)
+ self.kernel.browsers.computer.press_key(
+ id=self.session_id,
+ keys=[mapped_key],
+ )
return await self.screenshot()
elif action == "type":
- results: list[ToolResult] = []
- for chunk in chunks(text, TYPING_GROUP_SIZE):
- await self.page.keyboard.type(chunk, delay=TYPING_DELAY_MS)
- results.append(await self.screenshot())
- return ToolResult(
- output="".join(result.output or "" for result in results),
- error="".join(result.error or "" for result in results),
- base64_image=results[-1].base64_image if results else None,
+ self.kernel.browsers.computer.type_text(
+ id=self.session_id,
+ text=text,
+ delay=TYPING_DELAY_MS,
)
+ return await self.screenshot()
if action in (
"left_click",
@@ -239,40 +258,62 @@ async def __call__(
if action == "screenshot":
return await self.screenshot()
elif action == "cursor_position":
- # Playwright doesn't provide a direct way to get cursor position
- # We'll return a placeholder since this isn't critical functionality
- return ToolResult(output="Cursor position not available in Playwright")
+ # Kernel Computer Controls API doesn't track cursor position
+ raise ToolError("Cursor position is not available with Kernel Computer Controls API")
else:
if coordinate is not None:
coordinate = self.validate_coordinates(coordinate)
x, y = coordinate
- await self.page.mouse.move(x, y)
+ else:
+ x, y = self._last_mouse_position
+ button = "left"
+ if action == "right_click":
+ button = "right"
+ elif action == "middle_click":
+ button = "middle"
+
+ num_clicks = 1
if action == "double_click":
- await self.page.mouse.dblclick(x, y)
- else:
- await self.page.mouse.click(x, y, button=MOUSE_BUTTONS[action])
+ num_clicks = 2
+
+ self.kernel.browsers.computer.click_mouse(
+ id=self.session_id,
+ x=x,
+ y=y,
+ button=button,
+ num_clicks=num_clicks,
+ )
+ self._last_mouse_position = (x, y)
return await self.screenshot()
raise ToolError(f"Invalid action: {action}")
async def screenshot(self):
- """Take a screenshot using Playwright and return the base64 encoded image."""
- if not self.page:
- raise ToolError("Playwright page not initialized")
+ """Take a screenshot using Kernel Computer Controls API and return the base64 encoded image."""
+ if not self.kernel or not self.session_id:
+ raise ToolError("Kernel client or session not initialized")
- # Take screenshot using Playwright and get the buffer directly
- screenshot_bytes = await self.page.screenshot(type="png")
+ print("Starting screenshot...")
+ await asyncio.sleep(self._screenshot_delay)
+
+ response = self.kernel.browsers.computer.capture_screenshot(id=self.session_id)
+ screenshot_bytes = response.read()
+
+ print(f"Screenshot taken, size: {len(screenshot_bytes)} bytes")
+
return ToolResult(
base64_image=base64.b64encode(screenshot_bytes).decode()
)
+
class ComputerTool20241022(BaseComputerTool, BaseAnthropicTool):
api_type: Literal["computer_20241022"] = "computer_20241022"
def to_params(self) -> BetaToolComputerUse20241022Param:
return {"name": self.name, "type": self.api_type, **self.options}
+
class ComputerTool20250124(BaseComputerTool, BaseAnthropicTool):
api_type: Literal["computer_20250124"] = "computer_20250124"
@@ -294,22 +335,29 @@ async def __call__(
key: str | None = None,
**kwargs,
):
- if not self.page:
- raise ToolError("Playwright page not initialized")
+ if not self.kernel or not self.session_id:
+ raise ToolError("Kernel client or session not initialized")
if action in ("left_mouse_down", "left_mouse_up"):
if coordinate is not None:
- raise ToolError(f"coordinate is not accepted for {action=}.")
- if action == "left_mouse_down":
- await self.page.mouse.down(button="left")
+ coordinate = self.validate_coordinates(coordinate)
+ x, y = coordinate
else:
- await self.page.mouse.up(button="left")
+ x, y = self._last_mouse_position
+
+ click_type = "down" if action == "left_mouse_down" else "up"
+ self.kernel.browsers.computer.click_mouse(
+ id=self.session_id,
+ x=x,
+ y=y,
+ button="left",
+ click_type=click_type,
+ )
+ self._last_mouse_position = (x, y)
return await self.screenshot()
if action == "scroll":
- if scroll_direction is None or scroll_direction not in get_args(
- ScrollDirection
- ):
+ if scroll_direction is None or scroll_direction not in get_args(ScrollDirection):
raise ToolError(
f"{scroll_direction=} must be 'up', 'down', 'left', or 'right'"
)
@@ -319,31 +367,32 @@ async def __call__(
if coordinate is not None:
coordinate = self.validate_coordinates(coordinate)
x, y = coordinate
- await self.page.mouse.move(x, y)
-
- # Map scroll directions to Playwright's wheel events
- page_dimensions = await self.page.evaluate(
- "() => Promise.resolve({ h: window.innerHeight, w: window.innerWidth })"
- )
- page_partitions = 25
- scroll_factor = scroll_amount / page_partitions
- page_width = page_dimensions['w']
- page_height = page_dimensions['h']
+ else:
+ x, y = self._last_mouse_position
+ # Each scroll_amount unit = 1 scroll wheel click ≈ 120 pixels (matches Anthropic's xdotool behavior)
+ scroll_factor = scroll_amount * 120
+
delta_x = 0
delta_y = 0
if scroll_direction == "up":
- delta_y = -scroll_factor * page_height
+ delta_y = -scroll_factor
elif scroll_direction == "down":
- delta_y = scroll_factor * page_height
+ delta_y = scroll_factor
elif scroll_direction == "left":
- delta_x = -scroll_factor * page_width
+ delta_x = -scroll_factor
elif scroll_direction == "right":
- delta_x = scroll_factor * page_width
+ delta_x = scroll_factor
- print(f"Scrolling {abs(delta_x) if delta_x != 0 else abs(delta_y):.02f} pixels {scroll_direction}")
+ print(f"Scrolling {abs(delta_x) if delta_x != 0 else abs(delta_y)} pixels {scroll_direction}")
- await self.page.mouse.wheel(delta_x=delta_x, delta_y=delta_y)
+ self.kernel.browsers.computer.scroll(
+ id=self.session_id,
+ x=x,
+ y=y,
+ delta_x=delta_x,
+ delta_y=delta_y,
+ )
return await self.screenshot()
if action in ("hold_key", "wait"):
@@ -358,9 +407,11 @@ async def __call__(
if text is None:
raise ToolError(f"text is required for {action}")
mapped_key = self.map_key(text)
- await self.page.keyboard.down(mapped_key)
- await asyncio.sleep(duration)
- await self.page.keyboard.up(mapped_key)
+ self.kernel.browsers.computer.press_key(
+ id=self.session_id,
+ keys=[mapped_key],
+ duration=int(duration * 1000), # Convert to milliseconds
+ )
return await self.screenshot()
if action == "wait":
@@ -380,23 +431,45 @@ async def __call__(
if coordinate is not None:
coordinate = self.validate_coordinates(coordinate)
x, y = coordinate
- await self.page.mouse.move(x, y)
+ else:
+ x, y = self._last_mouse_position
+
+ button = "left"
+ if action == "right_click":
+ button = "right"
+ elif action == "middle_click":
+ button = "middle"
+
+ num_clicks = 1
+ if action == "double_click":
+ num_clicks = 2
+ elif action == "triple_click":
+ num_clicks = 3
if key:
mapped_key = self.map_key(key)
- await self.page.keyboard.down(mapped_key)
+ self.kernel.browsers.computer.press_key(
+ id=self.session_id,
+ keys=[mapped_key],
+ click_type="down",
+ )
- if action == "triple_click":
- # Playwright doesn't have triple click, so we'll simulate it
- await self.page.mouse.click(x, y, click_count=3)
- elif action == "double_click":
- await self.page.mouse.dblclick(x, y)
- else:
- await self.page.mouse.click(x, y, button=MOUSE_BUTTONS[action])
+ self.kernel.browsers.computer.click_mouse(
+ id=self.session_id,
+ x=x,
+ y=y,
+ button=button,
+ num_clicks=num_clicks,
+ )
if key:
- await self.page.keyboard.up(mapped_key)
+ self.kernel.browsers.computer.press_key(
+ id=self.session_id,
+ keys=[mapped_key],
+ click_type="up",
+ )
+ self._last_mouse_position = (x, y)
return await self.screenshot()
return await super().__call__(
diff --git a/pkg/templates/typescript/anthropic-computer-use/README.md b/pkg/templates/typescript/anthropic-computer-use/README.md
index 2f71cad..d4cd552 100644
--- a/pkg/templates/typescript/anthropic-computer-use/README.md
+++ b/pkg/templates/typescript/anthropic-computer-use/README.md
@@ -1,7 +1,47 @@
# Kernel TypeScript Sample App - Anthropic Computer Use
-This is a simple Kernel application that implements a prompt loop using Anthropic Computer Use.
+This is a Kernel application that implements a prompt loop using Anthropic Computer Use with Kernel's Computer Controls API.
-It generally follows the [Anthropic Reference Implementation](https://github.com/anthropics/anthropic-quickstarts/tree/main/computer-use-demo) but replaces `xodotool` and `gnome-screenshot` with Playwright.
+It generally follows the [Anthropic Reference Implementation](https://github.com/anthropics/anthropic-quickstarts/tree/main/computer-use-demo) but uses Kernel's Computer Controls API instead of `xdotool` and `gnome-screenshot`.
-See the [docs](https://www.kernel.sh/docs/quickstart) for information.
+## Setup
+
+1. Get your API keys:
+ - **Kernel**: [dashboard.onkernel.com](https://dashboard.onkernel.com)
+ - **Anthropic**: [console.anthropic.com](https://console.anthropic.com)
+
+2. Deploy the app:
+```bash
+kernel login
+cp .env.example .env # Add your ANTHROPIC_API_KEY
+kernel deploy index.ts --env-file .env
+```
+
+## Usage
+
+```bash
+kernel invoke ts-anthropic-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}'
+```
+
+## Recording Replays
+
+> **Note:** Replay recording is only available to Kernel users on paid plans.
+
+Add `"record_replay": true` to your payload to capture a video of the browser session:
+
+```bash
+kernel invoke ts-anthropic-cua cua-task --payload '{"query": "Navigate to https://example.com", "record_replay": true}'
+```
+
+When enabled, the response will include a `replay_url` field with a link to view the recorded session.
+
+## Known Limitations
+
+### Cursor Position
+
+The `cursor_position` action is not supported with Kernel's Computer Controls API. If the model attempts to use this action, an error will be returned. This is a known limitation that does not significantly impact most computer use workflows, as the model typically tracks cursor position through screenshots.
+
+## Resources
+
+- [Anthropic Computer Use Documentation](https://docs.anthropic.com/en/docs/build-with-claude/computer-use)
+- [Kernel Documentation](https://www.kernel.sh/docs/quickstart)
diff --git a/pkg/templates/typescript/anthropic-computer-use/index.ts b/pkg/templates/typescript/anthropic-computer-use/index.ts
index cc7a0dd..b126626 100644
--- a/pkg/templates/typescript/anthropic-computer-use/index.ts
+++ b/pkg/templates/typescript/anthropic-computer-use/index.ts
@@ -1,6 +1,6 @@
import { Kernel, type KernelContext } from '@onkernel/sdk';
-import { chromium } from 'playwright-core';
import { samplingLoop } from './loop';
+import { KernelBrowserSession } from './session';
const kernel = new Kernel();
@@ -8,10 +8,12 @@ const app = kernel.app('ts-anthropic-cua');
interface QueryInput {
query: string;
+ record_replay?: boolean;
}
interface QueryOutput {
result: string;
+ replay_url?: string;
}
// LLM API Keys are set in the environment during `kernel deploy -e ANTHROPIC_API_KEY=XXX`
@@ -29,31 +31,27 @@ app.action(
throw new Error('Query is required');
}
- const kernelBrowser = await kernel.browsers.create({
- invocation_id: ctx.invocation_id,
+ // Create browser session with optional replay recording
+ const session = new KernelBrowserSession(kernel, {
stealth: true,
+ recordReplay: payload.record_replay ?? false,
});
- console.log("Kernel browser live view url: ", kernelBrowser.browser_live_view_url);
-
- const browser = await chromium.connectOverCDP(kernelBrowser.cdp_ws_url);
- const context = await browser.contexts()[0];
- const page = await context?.pages()[0];
- if (!page) {
- throw new Error('Error getting initial page');
- }
+ await session.start();
+ console.log('Kernel browser live view url:', session.liveViewUrl);
try {
// Run the sampling loop
const finalMessages = await samplingLoop({
- model: 'claude-sonnet-4-20250514',
+ model: 'claude-sonnet-4-5-20250929',
messages: [{
role: 'user',
content: payload.query,
}],
apiKey: ANTHROPIC_API_KEY,
thinkingBudget: 1024,
- playwrightPage: page,
+ kernel,
+ sessionId: session.sessionId,
});
// Extract the final result from the messages
@@ -72,13 +70,17 @@ app.action(
block.type === 'text' ? block.text : ''
).join('');
- return { result };
+ // Stop session and get replay URL if recording was enabled
+ const sessionInfo = await session.stop();
+
+ return {
+ result,
+ replay_url: sessionInfo.replayViewUrl,
+ };
} catch (error) {
console.error('Error in sampling loop:', error);
+ await session.stop();
throw error;
- } finally {
- await browser.close();
- await kernel.browsers.deleteByID(kernelBrowser.session_id);
}
},
);
diff --git a/pkg/templates/typescript/anthropic-computer-use/loop.ts b/pkg/templates/typescript/anthropic-computer-use/loop.ts
index d5034eb..fa775d9 100644
--- a/pkg/templates/typescript/anthropic-computer-use/loop.ts
+++ b/pkg/templates/typescript/anthropic-computer-use/loop.ts
@@ -1,6 +1,6 @@
import { Anthropic } from '@anthropic-ai/sdk';
import { DateTime } from 'luxon';
-import type { Page } from 'playwright-core';
+import type { Kernel } from '@onkernel/sdk';
import { DEFAULT_TOOL_VERSION, TOOL_GROUPS_BY_VERSION, ToolCollection, type ToolVersion } from './tools/collection';
import { ComputerTool20241022, ComputerTool20250124 } from './tools/computer';
import type { ActionParams } from './tools/types/computer';
@@ -55,7 +55,8 @@ export async function samplingLoop({
toolVersion,
thinkingBudget,
tokenEfficientToolsBeta = false,
- playwrightPage,
+ kernel,
+ sessionId,
}: {
model: string;
systemPromptSuffix?: string;
@@ -66,11 +67,12 @@ export async function samplingLoop({
toolVersion?: ToolVersion;
thinkingBudget?: number;
tokenEfficientToolsBeta?: boolean;
- playwrightPage: Page;
+ kernel: Kernel;
+ sessionId: string;
}): Promise {
const selectedVersion = toolVersion || DEFAULT_TOOL_VERSION;
const toolGroup = TOOL_GROUPS_BY_VERSION[selectedVersion];
- const toolCollection = new ToolCollection(...toolGroup.tools.map((Tool: typeof ComputerTool20241022 | typeof ComputerTool20250124) => new Tool(playwrightPage)));
+ const toolCollection = new ToolCollection(...toolGroup.tools.map((Tool: typeof ComputerTool20241022 | typeof ComputerTool20250124) => new Tool(kernel, sessionId)));
const system: BetaTextBlock = {
type: 'text',
diff --git a/pkg/templates/typescript/anthropic-computer-use/package.json b/pkg/templates/typescript/anthropic-computer-use/package.json
index e6ce639..8012da1 100644
--- a/pkg/templates/typescript/anthropic-computer-use/package.json
+++ b/pkg/templates/typescript/anthropic-computer-use/package.json
@@ -5,9 +5,8 @@
"private": true,
"dependencies": {
"@anthropic-ai/sdk": "^0.71.2",
- "@onkernel/sdk": "^0.23.0",
- "luxon": "^3.7.2",
- "playwright-core": "^1.57.0"
+ "@onkernel/sdk": "^0.24.0",
+ "luxon": "^3.7.2"
},
"devDependencies": {
"@types/node": "^22.15.17",
diff --git a/pkg/templates/typescript/anthropic-computer-use/session.ts b/pkg/templates/typescript/anthropic-computer-use/session.ts
new file mode 100644
index 0000000..06e30a6
--- /dev/null
+++ b/pkg/templates/typescript/anthropic-computer-use/session.ts
@@ -0,0 +1,222 @@
+/**
+ * Kernel Browser Session Manager.
+ *
+ * Provides a class for managing Kernel browser lifecycle
+ * with optional video replay recording.
+ */
+
+import type { Kernel } from '@onkernel/sdk';
+
+export interface SessionOptions {
+ /** Enable stealth mode to avoid bot detection. Defaults to `true` when omitted. */
+ stealth?: boolean;
+ /** Browser session timeout in seconds, sent as `timeout_seconds` when the browser is created. Defaults to 300. */
+ timeoutSeconds?: number;
+ /** Enable replay recording (requires paid plan). Defaults to `false`. */
+ recordReplay?: boolean;
+ /** Seconds to wait before stopping the replay so the final state is captured. Defaults to 5. */
+ replayGracePeriod?: number;
+}
+
+export interface SessionInfo {
+ sessionId: string; // `session_id` of the browser returned by browsers.create
+ liveViewUrl: string; // live view URL of the browser; empty string when none is stored
+ replayId?: string; // `replay_id` returned by replays.start; absent when no replay was started
+ replayViewUrl?: string; // view URL of the replay; only set once the replay shows up while polling replays.list
+}
+
+/**
+ * Defaults applied for every option the caller leaves out.
+ * Typed as `Required<SessionOptions>` so a newly added option cannot be
+ * forgotten here without a compile error.
+ */
+const DEFAULT_OPTIONS: Required<SessionOptions> = {
+  stealth: true,
+  timeoutSeconds: 300, // seconds
+  recordReplay: false,
+  replayGracePeriod: 5.0, // seconds
+};
+
+/**
+ * Manages Kernel browser lifecycle with optional replay recording.
+ *
+ * Usage:
+ * ```typescript
+ * const session = new KernelBrowserSession(kernel, options);
+ * await session.start();
+ * try {
+ *   // Use session.sessionId for computer controls
+ * } finally {
+ *   await session.stop();
+ * }
+ * ```
+ *
+ * `stop()` may be called before `start()` or more than once; in both cases it
+ * does nothing and returns whatever state is still known. Replay recording is
+ * best-effort on both ends: a failure to start or stop the replay is logged as
+ * a warning and never prevents the browser from being used or destroyed.
+ */
+export class KernelBrowserSession {
+  private readonly kernel: Kernel;
+  private readonly options: Required<SessionOptions>;
+
+  // Session state: null until start() has created the browser, and null again after stop().
+  private _sessionId: string | null = null;
+  private _liveViewUrl: string | null = null;
+  private _replayId: string | null = null;
+  private _replayViewUrl: string | null = null;
+
+  /**
+   * @param kernel Kernel SDK client used for all browser and replay calls.
+   * @param options Session settings; omitted or `undefined` fields fall back to `DEFAULT_OPTIONS`.
+   */
+  constructor(kernel: Kernel, options: SessionOptions = {}) {
+    this.kernel = kernel;
+    // Resolve field by field: a plain spread would let an explicit `undefined`
+    // overwrite the default and break the `Required<...>` guarantee.
+    this.options = {
+      stealth: options.stealth ?? DEFAULT_OPTIONS.stealth,
+      timeoutSeconds: options.timeoutSeconds ?? DEFAULT_OPTIONS.timeoutSeconds,
+      recordReplay: options.recordReplay ?? DEFAULT_OPTIONS.recordReplay,
+      replayGracePeriod: options.replayGracePeriod ?? DEFAULT_OPTIONS.replayGracePeriod,
+    };
+  }
+
+  /**
+   * Id of the running browser session.
+   * @throws Error when the session has not been started (or was already stopped).
+   */
+  get sessionId(): string {
+    if (!this._sessionId) {
+      throw new Error('Session not started. Call start() first.');
+    }
+    return this._sessionId;
+  }
+
+  /** Live view URL of the browser, or null when no session is running. */
+  get liveViewUrl(): string | null {
+    return this._liveViewUrl;
+  }
+
+  /** Replay view URL, or null until a replay has been stopped and located. */
+  get replayViewUrl(): string | null {
+    return this._replayViewUrl;
+  }
+
+  /**
+   * Snapshot of the current session state.
+   * @throws Error when the session has not been started (via `sessionId`).
+   */
+  get info(): SessionInfo {
+    return {
+      sessionId: this.sessionId,
+      liveViewUrl: this._liveViewUrl || '',
+      replayId: this._replayId || undefined,
+      replayViewUrl: this._replayViewUrl || undefined,
+    };
+  }
+
+  /**
+   * Create a Kernel browser session and optionally start recording.
+   *
+   * @returns The state of the freshly created session.
+   * @throws Whatever `browsers.create` rejects with; replay start failures are only logged.
+   */
+  async start(): Promise<SessionInfo> {
+    // Create browser with specified settings
+    const browser = await this.kernel.browsers.create({
+      stealth: this.options.stealth,
+      timeout_seconds: this.options.timeoutSeconds,
+      viewport: {
+        width: 1024,
+        height: 768,
+        refresh_rate: 60,
+      },
+    });
+
+    this._sessionId = browser.session_id;
+    this._liveViewUrl = browser.browser_live_view_url ?? null;
+
+    console.log(`Kernel browser created: ${this._sessionId}`);
+    console.log(`Live view URL: ${this._liveViewUrl}`);
+
+    // Start replay recording if enabled
+    if (this.options.recordReplay) {
+      try {
+        await this.startReplay();
+      } catch (error) {
+        console.warn(`Warning: Failed to start replay recording: ${error}`);
+        console.warn('Continuing without replay recording.');
+      }
+    }
+
+    return this.info;
+  }
+
+  /**
+   * Start recording a replay of the browser session.
+   * Does nothing when no session is running.
+   */
+  private async startReplay(): Promise<void> {
+    if (!this._sessionId) {
+      return;
+    }
+
+    console.log('Starting replay recording...');
+    const replay = await this.kernel.browsers.replays.start(this._sessionId);
+    this._replayId = replay.replay_id;
+    console.log(`Replay recording started: ${this._replayId}`);
+  }
+
+  /**
+   * Stop recording and get the replay URL.
+   *
+   * After stopping, the replay list is polled once per second for up to
+   * 60 seconds until the replay appears; errors during polling are ignored
+   * and simply lead to another attempt.
+   */
+  private async stopReplay(): Promise<void> {
+    const sessionId = this._sessionId;
+    const replayId = this._replayId;
+    if (!sessionId || !replayId) {
+      return;
+    }
+
+    console.log('Stopping replay recording...');
+    await this.kernel.browsers.replays.stop(replayId, {
+      id: sessionId,
+    });
+    console.log('Replay recording stopped. Processing video...');
+
+    // Wait a moment for processing
+    await this.sleep(2000);
+
+    // Poll for replay to be ready (with timeout)
+    const maxWait = 60000; // 60 seconds
+    const startTime = Date.now();
+    let replayReady = false;
+
+    while (!replayReady && Date.now() - startTime < maxWait) {
+      try {
+        const replays = await this.kernel.browsers.replays.list(sessionId);
+        for (const replay of replays) {
+          if (replay.replay_id === replayId) {
+            this._replayViewUrl = replay.replay_view_url ?? null;
+            replayReady = true;
+            break;
+          }
+        }
+      } catch {
+        // Ignore errors while polling; the next iteration retries.
+      }
+      if (!replayReady) {
+        await this.sleep(1000);
+      }
+    }
+
+    if (!replayReady) {
+      console.log('Warning: Replay may still be processing');
+    } else if (this._replayViewUrl) {
+      console.log(`Replay view URL: ${this._replayViewUrl}`);
+    }
+  }
+
+  /**
+   * Stop recording, and delete the browser session.
+   *
+   * Safe to call when the session was never started or was already stopped:
+   * it then returns the (empty) known state without contacting Kernel.
+   *
+   * @returns The session state as it was just before teardown, including the
+   *          replay view URL when one could be obtained.
+   * @throws Whatever `browsers.deleteByID` rejects with. The local state is
+   *         cleared even then, so a later `stop()` does not retry the teardown.
+   */
+  async stop(): Promise<SessionInfo> {
+    // Build the snapshot by hand: the `info` getter throws when no session is
+    // running, which would turn a repeated or early stop() into an error.
+    const info: SessionInfo = {
+      sessionId: this._sessionId || '',
+      liveViewUrl: this._liveViewUrl || '',
+      replayId: this._replayId || undefined,
+      replayViewUrl: this._replayViewUrl || undefined,
+    };
+
+    const sessionId = this._sessionId;
+    if (!sessionId) {
+      return info;
+    }
+
+    try {
+      // Stop replay if recording was enabled
+      if (this.options.recordReplay && this._replayId) {
+        try {
+          // Wait grace period before stopping to capture final state
+          if (this.options.replayGracePeriod > 0) {
+            console.log(`Waiting ${this.options.replayGracePeriod}s grace period...`);
+            await this.sleep(this.options.replayGracePeriod * 1000);
+          }
+          await this.stopReplay();
+          info.replayViewUrl = this._replayViewUrl || undefined;
+        } catch (error) {
+          // Mirror start(): a replay problem must not cost the caller the
+          // session result or block the browser cleanup below.
+          console.warn(`Warning: Failed to stop replay recording: ${error}`);
+        }
+      }
+
+      // Always clean up the browser session, even if replay stopping failed
+      console.log(`Destroying browser session: ${sessionId}`);
+      await this.kernel.browsers.deleteByID(sessionId);
+      console.log('Browser session destroyed.');
+    } finally {
+      // Reset state even when deletion fails so that a second stop() (for
+      // example from a caller's catch block) is a no-op instead of repeating
+      // the grace wait and masking the original error.
+      this._sessionId = null;
+      this._liveViewUrl = null;
+      this._replayId = null;
+      this._replayViewUrl = null;
+    }
+
+    return info;
+  }
+
+  /** Resolve after `ms` milliseconds. */
+  private sleep(ms: number): Promise<void> {
+    return new Promise<void>(resolve => setTimeout(resolve, ms));
+  }
+}
diff --git a/pkg/templates/typescript/anthropic-computer-use/tools/collection.ts b/pkg/templates/typescript/anthropic-computer-use/tools/collection.ts
index 45f3afe..8d4c395 100644
--- a/pkg/templates/typescript/anthropic-computer-use/tools/collection.ts
+++ b/pkg/templates/typescript/anthropic-computer-use/tools/collection.ts
@@ -46,7 +46,7 @@ export class ToolCollection {
return Array.from(this.tools.values()).map(tool => tool.toParams());
}
- async run(name: string, toolInput: { action: Action } & Record): Promise {
+ async run(name: string, toolInput: ActionParams): Promise {
const tool = this.tools.get(name);
if (!tool) {
throw new Error(`Tool ${name} not found`);
diff --git a/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts b/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts
index 66725b9..8e415ad 100644
--- a/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts
+++ b/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts
@@ -1,16 +1,19 @@
-import type { Page } from 'playwright-core';
+import { Buffer } from 'buffer';
+import type { Kernel } from '@onkernel/sdk';
import type { ActionParams, BaseAnthropicTool, ToolResult } from './types/computer';
import { Action, ToolError } from './types/computer';
-import { KeyboardUtils } from './utils/keyboard';
import { ActionValidator } from './utils/validator';
const TYPING_DELAY_MS = 12;
export class ComputerTool implements BaseAnthropicTool {
name: 'computer' = 'computer';
- protected page: Page;
+ protected kernel: Kernel;
+ protected sessionId: string;
protected _screenshotDelay = 2.0;
protected version: '20241022' | '20250124';
+
+ private lastMousePosition: [number, number] = [0, 0];
private readonly mouseActions = new Set([
Action.LEFT_CLICK,
@@ -19,7 +22,6 @@ export class ComputerTool implements BaseAnthropicTool {
Action.DOUBLE_CLICK,
Action.TRIPLE_CLICK,
Action.MOUSE_MOVE,
- Action.LEFT_CLICK_DRAG,
Action.LEFT_MOUSE_DOWN,
Action.LEFT_MOUSE_UP,
]);
@@ -37,8 +39,9 @@ export class ComputerTool implements BaseAnthropicTool {
Action.WAIT,
]);
- constructor(page: Page, version: '20241022' | '20250124' = '20250124') {
- this.page = page;
+ constructor(kernel: Kernel, sessionId: string, version: '20241022' | '20250124' = '20250124') {
+ this.kernel = kernel;
+ this.sessionId = sessionId;
this.version = version;
}
@@ -50,8 +53,8 @@ export class ComputerTool implements BaseAnthropicTool {
const params = {
name: this.name,
type: this.apiType,
- display_width_px: 1280,
- display_height_px: 720,
+ display_width_px: 1024,
+ display_height_px: 768,
display_number: null,
};
return params;
@@ -77,59 +80,181 @@ export class ComputerTool implements BaseAnthropicTool {
private async handleMouseAction(action: Action, coordinate: [number, number]): Promise {
const [x, y] = ActionValidator.validateAndGetCoordinates(coordinate);
- await this.page.mouse.move(x, y);
- await this.page.waitForTimeout(100);
- if (action === Action.LEFT_MOUSE_DOWN) {
- await this.page.mouse.down();
+ if (action === Action.MOUSE_MOVE) {
+ await this.kernel.browsers.computer.moveMouse(this.sessionId, {
+ x,
+ y,
+ });
+ this.lastMousePosition = [x, y];
+ } else if (action === Action.LEFT_MOUSE_DOWN) {
+ await this.kernel.browsers.computer.clickMouse(this.sessionId, {
+ x,
+ y,
+ button: 'left',
+ click_type: 'down',
+ });
+ this.lastMousePosition = [x, y];
} else if (action === Action.LEFT_MOUSE_UP) {
- await this.page.mouse.up();
+ await this.kernel.browsers.computer.clickMouse(this.sessionId, {
+ x,
+ y,
+ button: 'left',
+ click_type: 'up',
+ });
+ this.lastMousePosition = [x, y];
} else {
const button = this.getMouseButton(action);
+ let numClicks = 1;
if (action === Action.DOUBLE_CLICK) {
- await this.page.mouse.dblclick(x, y, { button });
+ numClicks = 2;
} else if (action === Action.TRIPLE_CLICK) {
- await this.page.mouse.click(x, y, { button, clickCount: 3 });
- } else {
- await this.page.mouse.click(x, y, { button });
+ numClicks = 3;
}
+
+ await this.kernel.browsers.computer.clickMouse(this.sessionId, {
+ x,
+ y,
+ button,
+ click_type: 'click',
+ num_clicks: numClicks,
+ });
+ this.lastMousePosition = [x, y];
}
- await this.page.waitForTimeout(500);
+ await new Promise(resolve => setTimeout(resolve, 500));
return await this.screenshot();
}
private async handleKeyboardAction(action: Action, text: string, duration?: number): Promise {
if (action === Action.HOLD_KEY) {
- const key = KeyboardUtils.getPlaywrightKey(text);
- await this.page.keyboard.down(key);
- await new Promise(resolve => setTimeout(resolve, duration! * 1000));
- await this.page.keyboard.up(key);
+ const key = this.convertToKernelKey(text);
+ await this.kernel.browsers.computer.pressKey(this.sessionId, {
+ keys: [key],
+ duration: duration ? duration * 1000 : undefined,
+ });
} else if (action === Action.KEY) {
- const keys = KeyboardUtils.parseKeyCombination(text);
- for (const key of keys) {
- await this.page.keyboard.down(key);
- }
- for (const key of keys.reverse()) {
- await this.page.keyboard.up(key);
- }
+ const key = this.convertKeyCombinationToKernel(text);
+ await this.kernel.browsers.computer.pressKey(this.sessionId, {
+ keys: [key],
+ });
} else {
- await this.page.keyboard.type(text, { delay: TYPING_DELAY_MS });
+ await this.kernel.browsers.computer.typeText(this.sessionId, {
+ text,
+ delay: TYPING_DELAY_MS,
+ });
}
- await this.page.waitForTimeout(500);
+ await new Promise(resolve => setTimeout(resolve, 500));
return await this.screenshot();
}
+  /**
+   * Key mappings for Kernel Computer Controls API (xdotool format).
+   *
+   * Maps the key spellings accepted by this tool to the key name sent to
+   * Kernel. Lookups are exact-match, which is why several casings of the same
+   * key are listed. Names that are missing here are passed through unchanged
+   * by `convertToKernelKey`.
+   */
+  private static readonly KEY_MAP: Record<string, string> = {
+    // Enter/Return
+    'return': 'Return',
+    'enter': 'Return',
+    'Enter': 'Return',
+    // Arrow keys
+    'left': 'Left',
+    'right': 'Right',
+    'up': 'Up',
+    'down': 'Down',
+    'ArrowLeft': 'Left',
+    'ArrowRight': 'Right',
+    'ArrowUp': 'Up',
+    'ArrowDown': 'Down',
+    // Navigation
+    'home': 'Home',
+    'end': 'End',
+    'pageup': 'Page_Up',
+    'page_up': 'Page_Up',
+    'PageUp': 'Page_Up',
+    'pagedown': 'Page_Down',
+    'page_down': 'Page_Down',
+    'PageDown': 'Page_Down',
+    // Editing
+    'delete': 'Delete',
+    'backspace': 'BackSpace',
+    'Backspace': 'BackSpace',
+    'tab': 'Tab',
+    'insert': 'Insert',
+    // Escape
+    'esc': 'Escape',
+    'escape': 'Escape',
+    // Function keys
+    'f1': 'F1',
+    'f2': 'F2',
+    'f3': 'F3',
+    'f4': 'F4',
+    'f5': 'F5',
+    'f6': 'F6',
+    'f7': 'F7',
+    'f8': 'F8',
+    'f9': 'F9',
+    'f10': 'F10',
+    'f11': 'F11',
+    'f12': 'F12',
+    // Misc
+    'space': 'space',
+    'minus': 'minus',
+    'equal': 'equal',
+    'plus': 'plus',
+  };
+
+  /**
+   * Modifier key mappings (xdotool format).
+   *
+   * Maps the modifier spellings accepted by this tool to the modifier name
+   * sent to Kernel. Lookups are exact-match; `convertToKernelKey` consults
+   * this table before the special-key table.
+   */
+  private static readonly MODIFIER_MAP: Record<string, string> = {
+    'ctrl': 'ctrl',
+    'control': 'ctrl',
+    'Control': 'ctrl',
+    'alt': 'alt',
+    'Alt': 'alt',
+    'shift': 'shift',
+    'Shift': 'shift',
+    'meta': 'super',
+    'Meta': 'super',
+    'cmd': 'super',
+    'command': 'super',
+    'win': 'super',
+    'super': 'super',
+  };
+
+  /**
+   * Translate a single key name into the spelling sent to the Kernel
+   * Computer Controls API.
+   *
+   * Modifier aliases are checked first, then special-key aliases. Matching is
+   * exact (case-sensitive). A name with no entry in either table is returned
+   * unchanged.
+   *
+   * @param key Key name as supplied by the caller, e.g. "Control" or "enter".
+   * @returns The mapped key name, or `key` itself when no mapping exists.
+   */
+  private convertToKernelKey(key: string): string {
+    // Only own entries count: a bare `table[key]` lookup would resolve names
+    // such as "constructor" or "toString" to Object.prototype members and
+    // return a non-string instead of passing the key through.
+    const lookup = (table: Record<string, string>): string | undefined =>
+      Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;
+
+    // Check modifier keys first, then special keys; fall back to the input.
+    return lookup(ComputerTool.MODIFIER_MAP) ?? lookup(ComputerTool.KEY_MAP) ?? key;
+  }
+
+  /**
+   * Convert a key or a "+"-separated combination (e.g. "ctrl+a", "Control+t")
+   * into the form sent to Kernel.
+   *
+   * Each part of a combination is trimmed and converted on its own, then the
+   * parts are re-joined with "+". A string without "+" is converted as-is,
+   * without trimming.
+   *
+   * @param combo Key or key combination as supplied by the caller.
+   * @returns The converted key or combination.
+   */
+  private convertKeyCombinationToKernel(combo: string): string {
+    // Single key - nothing to split
+    if (!combo.includes('+')) {
+      return this.convertToKernelKey(combo);
+    }
+
+    const converted: string[] = [];
+    for (const segment of combo.split('+')) {
+      converted.push(this.convertToKernelKey(segment.trim()));
+    }
+    return converted.join('+');
+  }
+
async screenshot(): Promise {
try {
console.log('Starting screenshot...');
await new Promise(resolve => setTimeout(resolve, this._screenshotDelay * 1000));
- const screenshot = await this.page.screenshot({ type: 'png' });
- console.log('Screenshot taken, size:', screenshot.length, 'bytes');
+ const response = await this.kernel.browsers.computer.captureScreenshot(this.sessionId);
+ const blob = await response.blob();
+ const arrayBuffer = await blob.arrayBuffer();
+ const buffer = Buffer.from(arrayBuffer);
+ console.log('Screenshot taken, size:', buffer.length, 'bytes');
return {
- base64Image: screenshot.toString('base64'),
+ base64Image: buffer.toString('base64'),
};
} catch (error) {
throw new ToolError(`Failed to take screenshot: ${error}`);
@@ -155,18 +280,7 @@ export class ComputerTool implements BaseAnthropicTool {
}
if (action === Action.CURSOR_POSITION) {
- const position = await this.page.evaluate(() => {
- const selection = window.getSelection();
- const range = selection?.getRangeAt(0);
- const rect = range?.getBoundingClientRect();
- return rect ? { x: rect.x, y: rect.y } : null;
- });
-
- if (!position) {
- throw new ToolError('Failed to get cursor position');
- }
-
- return { output: `X=${position.x},Y=${position.y}` };
+ throw new ToolError('Cursor position is not available with Kernel Computer Controls API');
}
if (action === Action.SCROLL) {
@@ -184,29 +298,33 @@ export class ComputerTool implements BaseAnthropicTool {
throw new ToolError(`Scroll amount "${scrollAmountValue}" must be a non-negative number`);
}
- if (coordinate) {
- const [x, y] = ActionValidator.validateAndGetCoordinates(coordinate);
- await this.page.mouse.move(x, y);
- await this.page.waitForTimeout(100);
+ const [x, y] = coordinate
+ ? ActionValidator.validateAndGetCoordinates(coordinate)
+ : this.lastMousePosition;
+
+ let delta_x = 0;
+ let delta_y = 0;
+ // Each scroll_amount unit = 1 scroll wheel click ≈ 120 pixels (matches Anthropic's xdotool behavior)
+ const scrollDelta = (scrollAmountValue ?? 1) * 120;
+
+ if (scrollDirection === 'down') {
+ delta_y = scrollDelta;
+ } else if (scrollDirection === 'up') {
+ delta_y = -scrollDelta;
+ } else if (scrollDirection === 'right') {
+ delta_x = scrollDelta;
+ } else if (scrollDirection === 'left') {
+ delta_x = -scrollDelta;
}
- const pageDimensions = await this.page.evaluate(() => {
- return { h: window.innerHeight, w: window.innerWidth };
+ await this.kernel.browsers.computer.scroll(this.sessionId, {
+ x,
+ y,
+ delta_x,
+ delta_y,
});
- const pagePartitions = 25;
- const scrollFactor = (scrollAmountValue || 10) / pagePartitions;
-
- if (scrollDirection === 'down' || scrollDirection === 'up') {
- const amount = pageDimensions.h * scrollFactor;
- console.log(`Scrolling ${amount.toFixed(2)} pixels ${scrollDirection}`);
- await this.page.mouse.wheel(0, scrollDirection === 'down' ? amount : -amount);
- } else {
- const amount = pageDimensions.w * scrollFactor;
- console.log(`Scrolling ${amount.toFixed(2)} pixels ${scrollDirection}`);
- await this.page.mouse.wheel(scrollDirection === 'right' ? amount : -amount, 0);
- }
- await this.page.waitForTimeout(500);
+ await new Promise(resolve => setTimeout(resolve, 500));
return await this.screenshot();
}
@@ -218,6 +336,30 @@ export class ComputerTool implements BaseAnthropicTool {
return await this.screenshot();
}
+ if (action === Action.LEFT_CLICK_DRAG) {
+ if (!coordinate) {
+ throw new ToolError(`coordinate is required for ${action}`);
+ }
+
+ const [endX, endY] = ActionValidator.validateAndGetCoordinates(coordinate);
+ const startCoordinate = kwargs.start_coordinate as [number, number] | undefined;
+ const [startX, startY] = startCoordinate
+ ? ActionValidator.validateAndGetCoordinates(startCoordinate)
+ : this.lastMousePosition;
+
+ console.log(`Dragging from (${startX}, ${startY}) to (${endX}, ${endY})`);
+
+ await this.kernel.browsers.computer.dragMouse(this.sessionId, {
+ path: [[startX, startY], [endX, endY]],
+ button: 'left',
+ });
+
+ this.lastMousePosition = [endX, endY];
+
+ await new Promise(resolve => setTimeout(resolve, 500));
+ return await this.screenshot();
+ }
+
if (this.mouseActions.has(action)) {
if (!coordinate) {
throw new ToolError(`coordinate is required for ${action}`);
@@ -238,13 +380,13 @@ export class ComputerTool implements BaseAnthropicTool {
// For backward compatibility
export class ComputerTool20241022 extends ComputerTool {
- constructor(page: Page) {
- super(page, '20241022');
+ constructor(kernel: Kernel, sessionId: string) {
+ super(kernel, sessionId, '20241022');
}
}
export class ComputerTool20250124 extends ComputerTool {
- constructor(page: Page) {
- super(page, '20250124');
+ constructor(kernel: Kernel, sessionId: string) {
+ super(kernel, sessionId, '20250124');
}
}