From fba8a7c8dbf6f812f92c7f504aaa88a3e8b74fcf Mon Sep 17 00:00:00 2001 From: Tanmay Sardesai Date: Thu, 18 Dec 2025 11:31:19 -0800 Subject: [PATCH 1/8] first pass at using kernel computer controls instead of playwright --- .../anthropic-computer-use/index.ts | 3 +- .../typescript/anthropic-computer-use/loop.ts | 12 +- .../tools/collection.ts | 2 +- .../anthropic-computer-use/tools/computer.ts | 217 ++++++++++++------ 4 files changed, 161 insertions(+), 73 deletions(-) diff --git a/pkg/templates/typescript/anthropic-computer-use/index.ts b/pkg/templates/typescript/anthropic-computer-use/index.ts index cc7a0dd..35f36d0 100644 --- a/pkg/templates/typescript/anthropic-computer-use/index.ts +++ b/pkg/templates/typescript/anthropic-computer-use/index.ts @@ -53,7 +53,8 @@ app.action( }], apiKey: ANTHROPIC_API_KEY, thinkingBudget: 1024, - playwrightPage: page, + kernel, + sessionId: kernelBrowser.session_id, }); // Extract the final result from the messages diff --git a/pkg/templates/typescript/anthropic-computer-use/loop.ts b/pkg/templates/typescript/anthropic-computer-use/loop.ts index d5034eb..199e92c 100644 --- a/pkg/templates/typescript/anthropic-computer-use/loop.ts +++ b/pkg/templates/typescript/anthropic-computer-use/loop.ts @@ -1,6 +1,6 @@ import { Anthropic } from '@anthropic-ai/sdk'; import { DateTime } from 'luxon'; -import type { Page } from 'playwright-core'; +import type { Kernel } from '@onkernel/sdk'; import { DEFAULT_TOOL_VERSION, TOOL_GROUPS_BY_VERSION, ToolCollection, type ToolVersion } from './tools/collection'; import { ComputerTool20241022, ComputerTool20250124 } from './tools/computer'; import type { ActionParams } from './tools/types/computer'; @@ -55,7 +55,8 @@ export async function samplingLoop({ toolVersion, thinkingBudget, tokenEfficientToolsBeta = false, - playwrightPage, + kernel, + sessionId, }: { model: string; systemPromptSuffix?: string; @@ -66,11 +67,12 @@ export async function samplingLoop({ toolVersion?: ToolVersion; thinkingBudget?: 
number; tokenEfficientToolsBeta?: boolean; - playwrightPage: Page; + kernel: Kernel; + sessionId: string; }): Promise { const selectedVersion = toolVersion || DEFAULT_TOOL_VERSION; const toolGroup = TOOL_GROUPS_BY_VERSION[selectedVersion]; - const toolCollection = new ToolCollection(...toolGroup.tools.map((Tool: typeof ComputerTool20241022 | typeof ComputerTool20250124) => new Tool(playwrightPage))); + const toolCollection = new ToolCollection(...toolGroup.tools.map((Tool: typeof ComputerTool20241022 | typeof ComputerTool20250124) => new Tool(kernel, sessionId))); const system: BetaTextBlock = { type: 'text', @@ -116,7 +118,7 @@ export async function samplingLoop({ messages, model, system: [system], - tools: toolParams, + tools: toolParams as any, // Type assertion needed due to ActionParams being used for both tool definition and input betas, ...extraBody, }); diff --git a/pkg/templates/typescript/anthropic-computer-use/tools/collection.ts b/pkg/templates/typescript/anthropic-computer-use/tools/collection.ts index 45f3afe..8d4c395 100644 --- a/pkg/templates/typescript/anthropic-computer-use/tools/collection.ts +++ b/pkg/templates/typescript/anthropic-computer-use/tools/collection.ts @@ -46,7 +46,7 @@ export class ToolCollection { return Array.from(this.tools.values()).map(tool => tool.toParams()); } - async run(name: string, toolInput: { action: Action } & Record): Promise { + async run(name: string, toolInput: ActionParams): Promise { const tool = this.tools.get(name); if (!tool) { throw new Error(`Tool ${name} not found`); diff --git a/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts b/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts index 66725b9..333bcdf 100644 --- a/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts +++ b/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts @@ -1,4 +1,5 @@ -import type { Page } from 'playwright-core'; +import { Buffer } from 'buffer'; +import type { Kernel } from 
'@onkernel/sdk'; import type { ActionParams, BaseAnthropicTool, ToolResult } from './types/computer'; import { Action, ToolError } from './types/computer'; import { KeyboardUtils } from './utils/keyboard'; @@ -8,7 +9,8 @@ const TYPING_DELAY_MS = 12; export class ComputerTool implements BaseAnthropicTool { name: 'computer' = 'computer'; - protected page: Page; + protected kernel: Kernel; + protected sessionId: string; protected _screenshotDelay = 2.0; protected version: '20241022' | '20250124'; @@ -19,7 +21,6 @@ export class ComputerTool implements BaseAnthropicTool { Action.DOUBLE_CLICK, Action.TRIPLE_CLICK, Action.MOUSE_MOVE, - Action.LEFT_CLICK_DRAG, Action.LEFT_MOUSE_DOWN, Action.LEFT_MOUSE_UP, ]); @@ -37,8 +38,9 @@ export class ComputerTool implements BaseAnthropicTool { Action.WAIT, ]); - constructor(page: Page, version: '20241022' | '20250124' = '20250124') { - this.page = page; + constructor(kernel: Kernel, sessionId: string, version: '20241022' | '20250124' = '20250124') { + this.kernel = kernel; + this.sessionId = sessionId; this.version = version; } @@ -77,59 +79,127 @@ export class ComputerTool implements BaseAnthropicTool { private async handleMouseAction(action: Action, coordinate: [number, number]): Promise { const [x, y] = ActionValidator.validateAndGetCoordinates(coordinate); - await this.page.mouse.move(x, y); - await this.page.waitForTimeout(100); - if (action === Action.LEFT_MOUSE_DOWN) { - await this.page.mouse.down(); + if (action === Action.MOUSE_MOVE) { + await this.kernel.browsers.computer.moveMouse(this.sessionId, { + x, + y, + }); + } else if (action === Action.LEFT_MOUSE_DOWN) { + await this.kernel.browsers.computer.clickMouse(this.sessionId, { + x, + y, + button: 'left', + click_type: 'down', + }); } else if (action === Action.LEFT_MOUSE_UP) { - await this.page.mouse.up(); + await this.kernel.browsers.computer.clickMouse(this.sessionId, { + x, + y, + button: 'left', + click_type: 'up', + }); } else { const button = 
this.getMouseButton(action); + let numClicks = 1; if (action === Action.DOUBLE_CLICK) { - await this.page.mouse.dblclick(x, y, { button }); + numClicks = 2; } else if (action === Action.TRIPLE_CLICK) { - await this.page.mouse.click(x, y, { button, clickCount: 3 }); - } else { - await this.page.mouse.click(x, y, { button }); + numClicks = 3; } + + await this.kernel.browsers.computer.clickMouse(this.sessionId, { + x, + y, + button, + click_type: 'click', + num_clicks: numClicks, + }); } - await this.page.waitForTimeout(500); + await new Promise(resolve => setTimeout(resolve, 500)); return await this.screenshot(); } private async handleKeyboardAction(action: Action, text: string, duration?: number): Promise { if (action === Action.HOLD_KEY) { - const key = KeyboardUtils.getPlaywrightKey(text); - await this.page.keyboard.down(key); - await new Promise(resolve => setTimeout(resolve, duration! * 1000)); - await this.page.keyboard.up(key); + // For HOLD_KEY, we need to press and hold for the duration + // OnKernel doesn't have a direct hold API, so we'll use pressKey with duration + const key = this.convertToOnKernelKey(text); + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: [key], + duration: duration ? 
duration * 1000 : undefined, + }); } else if (action === Action.KEY) { - const keys = KeyboardUtils.parseKeyCombination(text); - for (const key of keys) { - await this.page.keyboard.down(key); - } - for (const key of keys.reverse()) { - await this.page.keyboard.up(key); - } + // Convert key combination to OnKernel format (e.g., "Ctrl+t") + const key = this.convertKeyCombinationToOnKernel(text); + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: [key], + }); } else { - await this.page.keyboard.type(text, { delay: TYPING_DELAY_MS }); + // TYPE action - use typeText + await this.kernel.browsers.computer.typeText(this.sessionId, { + text, + delay: TYPING_DELAY_MS, + }); } - await this.page.waitForTimeout(500); + await new Promise(resolve => setTimeout(resolve, 500)); return await this.screenshot(); } + private convertToOnKernelKey(key: string): string { + // Convert Playwright key names to OnKernel format + const keyMap: Record = { + 'Control': 'Ctrl', + 'Meta': 'Meta', + 'Alt': 'Alt', + 'Shift': 'Shift', + 'Enter': 'Enter', + 'ArrowLeft': 'ArrowLeft', + 'ArrowRight': 'ArrowRight', + 'ArrowUp': 'ArrowUp', + 'ArrowDown': 'ArrowDown', + 'Home': 'Home', + 'End': 'End', + 'PageUp': 'PageUp', + 'PageDown': 'PageDown', + 'Delete': 'Delete', + 'Backspace': 'Backspace', + 'Tab': 'Tab', + 'Escape': 'Escape', + 'Insert': 'Insert', + }; + return keyMap[key] || key; + } + + private convertKeyCombinationToOnKernel(combo: string): string { + // Convert key combinations like "Control+t" to "Ctrl+t" + const parts = combo.split('+').map(part => { + const trimmed = part.trim(); + if (trimmed.toLowerCase() === 'control' || trimmed.toLowerCase() === 'ctrl') { + return 'Ctrl'; + } + if (trimmed.toLowerCase() === 'meta' || trimmed.toLowerCase() === 'command' || trimmed.toLowerCase() === 'cmd') { + return 'Meta'; + } + return trimmed; + }); + return parts.join('+'); + } + async screenshot(): Promise { try { console.log('Starting screenshot...'); await new 
Promise(resolve => setTimeout(resolve, this._screenshotDelay * 1000)); - const screenshot = await this.page.screenshot({ type: 'png' }); - console.log('Screenshot taken, size:', screenshot.length, 'bytes'); + const response = await this.kernel.browsers.computer.captureScreenshot(this.sessionId); + const blob = await response.blob(); + const arrayBuffer = await blob.arrayBuffer(); + const buffer = Buffer.from(arrayBuffer); + console.log('Screenshot taken, size:', buffer.length, 'bytes'); return { - base64Image: screenshot.toString('base64'), + base64Image: buffer.toString('base64'), }; } catch (error) { throw new ToolError(`Failed to take screenshot: ${error}`); @@ -155,18 +225,10 @@ export class ComputerTool implements BaseAnthropicTool { } if (action === Action.CURSOR_POSITION) { - const position = await this.page.evaluate(() => { - const selection = window.getSelection(); - const range = selection?.getRangeAt(0); - const rect = range?.getBoundingClientRect(); - return rect ? { x: rect.x, y: rect.y } : null; - }); - - if (!position) { - throw new ToolError('Failed to get cursor position'); - } - - return { output: `X=${position.x},Y=${position.y}` }; + // OnKernel computer controls don't have a direct cursor position API + // This would need to be handled differently or removed + // For now, we'll return an error indicating this feature isn't available + throw new ToolError('Cursor position is not available with OnKernel computer controls API'); } if (action === Action.SCROLL) { @@ -184,29 +246,35 @@ export class ComputerTool implements BaseAnthropicTool { throw new ToolError(`Scroll amount "${scrollAmountValue}" must be a non-negative number`); } - if (coordinate) { - const [x, y] = ActionValidator.validateAndGetCoordinates(coordinate); - await this.page.mouse.move(x, y); - await this.page.waitForTimeout(100); + const [x, y] = coordinate + ? 
ActionValidator.validateAndGetCoordinates(coordinate) + : [0, 0]; // Default to top-left if no coordinate provided + + // Convert scroll direction and amount to delta_x and delta_y + // OnKernel uses positive delta_y for scrolling down, negative for up + // Positive delta_x for scrolling right, negative for left + let delta_x = 0; + let delta_y = 0; + const scrollDelta = scrollAmountValue || 120; // Default scroll amount + + if (scrollDirection === 'down') { + delta_y = scrollDelta; + } else if (scrollDirection === 'up') { + delta_y = -scrollDelta; + } else if (scrollDirection === 'right') { + delta_x = scrollDelta; + } else if (scrollDirection === 'left') { + delta_x = -scrollDelta; } - const pageDimensions = await this.page.evaluate(() => { - return { h: window.innerHeight, w: window.innerWidth }; + await this.kernel.browsers.computer.scroll(this.sessionId, { + x, + y, + delta_x, + delta_y, }); - const pagePartitions = 25; - const scrollFactor = (scrollAmountValue || 10) / pagePartitions; - - if (scrollDirection === 'down' || scrollDirection === 'up') { - const amount = pageDimensions.h * scrollFactor; - console.log(`Scrolling ${amount.toFixed(2)} pixels ${scrollDirection}`); - await this.page.mouse.wheel(0, scrollDirection === 'down' ? amount : -amount); - } else { - const amount = pageDimensions.w * scrollFactor; - console.log(`Scrolling ${amount.toFixed(2)} pixels ${scrollDirection}`); - await this.page.mouse.wheel(scrollDirection === 'right' ? 
amount : -amount, 0); - } - await this.page.waitForTimeout(500); + await new Promise(resolve => setTimeout(resolve, 500)); return await this.screenshot(); } @@ -218,6 +286,23 @@ export class ComputerTool implements BaseAnthropicTool { return await this.screenshot(); } + if (action === Action.LEFT_CLICK_DRAG) { + if (!coordinate) { + throw new ToolError(`coordinate is required for ${action}`); + } + // For drag, we need a path - for now, we'll handle it as a simple click + // The drag action would need additional path information + const [x, y] = ActionValidator.validateAndGetCoordinates(coordinate); + await this.kernel.browsers.computer.clickMouse(this.sessionId, { + x, + y, + button: 'left', + click_type: 'click', + }); + await new Promise(resolve => setTimeout(resolve, 500)); + return await this.screenshot(); + } + if (this.mouseActions.has(action)) { if (!coordinate) { throw new ToolError(`coordinate is required for ${action}`); @@ -238,13 +323,13 @@ export class ComputerTool implements BaseAnthropicTool { // For backward compatibility export class ComputerTool20241022 extends ComputerTool { - constructor(page: Page) { - super(page, '20241022'); + constructor(kernel: Kernel, sessionId: string) { + super(kernel, sessionId, '20241022'); } } export class ComputerTool20250124 extends ComputerTool { - constructor(page: Page) { - super(page, '20250124'); + constructor(kernel: Kernel, sessionId: string) { + super(kernel, sessionId, '20250124'); } } From 1c9f88a9243a80c47a12966a31bf7064454d5b89 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Wed, 14 Jan 2026 18:15:41 -0500 Subject: [PATCH 2/8] Anthropic CUA - Update Python Template + fix remaining TS aspects Fix remaining TS items + update Python template for Anthropic CUA to utilize computer controls instead of Playwright. Still to do: optimize click location issues.
--- .../python/anthropic-computer-use/README.md | 46 ++- .../python/anthropic-computer-use/loop.py | 64 ++-- .../python/anthropic-computer-use/main.py | 130 ++++--- .../anthropic-computer-use/pyproject.toml | 4 +- .../python/anthropic-computer-use/session.py | 139 ++++++++ .../anthropic-computer-use/tools/computer.py | 326 +++++++++++------- .../anthropic-computer-use/README.md | 46 ++- .../anthropic-computer-use/index.ts | 34 +- .../anthropic-computer-use/package.json | 5 +- .../anthropic-computer-use/session.ts | 207 +++++++++++ .../anthropic-computer-use/tools/computer.ts | 44 ++- 11 files changed, 789 insertions(+), 256 deletions(-) create mode 100644 pkg/templates/python/anthropic-computer-use/session.py create mode 100644 pkg/templates/typescript/anthropic-computer-use/session.ts diff --git a/pkg/templates/python/anthropic-computer-use/README.md b/pkg/templates/python/anthropic-computer-use/README.md index a5d8b11..376f30f 100644 --- a/pkg/templates/python/anthropic-computer-use/README.md +++ b/pkg/templates/python/anthropic-computer-use/README.md @@ -1,7 +1,47 @@ # Kernel Python Sample App - Anthropic Computer Use -This is a simple Kernel application that implements a prompt loop using Anthropic Computer Use. +This is a Kernel application that implements a prompt loop using Anthropic Computer Use with Kernel's Computer Controls API. -It generally follows the [Anthropic Reference Implementation](https://github.com/anthropics/anthropic-quickstarts/tree/main/computer-use-demo) but replaces `xodotool` and `gnome-screenshot` with Playwright. +It generally follows the [Anthropic Reference Implementation](https://github.com/anthropics/anthropic-quickstarts/tree/main/computer-use-demo) but uses Kernel's Computer Controls API instead of `xdotool` and `gnome-screenshot`. -See the [docs](https://www.kernel.sh/docs/quickstart) for information. \ No newline at end of file +## Setup + +1. 
Get your API keys: + - **Kernel**: [dashboard.onkernel.com](https://dashboard.onkernel.com) + - **Anthropic**: [console.anthropic.com](https://console.anthropic.com) + +2. Deploy the app: +```bash +kernel login +cp .env.example .env # Add your ANTHROPIC_API_KEY +kernel deploy main.py --env-file .env +``` + +## Usage + +```bash +kernel invoke python-anthropic-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}' +``` + +## Recording Replays + +> **Note:** Replay recording is only available to Kernel users on paid plans. + +Add `"record_replay": true` to your payload to capture a video of the browser session: + +```bash +kernel invoke python-anthropic-cua cua-task --payload '{"query": "Navigate to https://example.com", "record_replay": true}' +``` + +When enabled, the response will include a `replay_url` field with a link to view the recorded session. + +## Known Limitations + +### Cursor Position + +The `cursor_position` action is not supported with Kernel's Computer Controls API. If the model attempts to use this action, an error will be returned. This is a known limitation that does not significantly impact most computer use workflows, as the model typically tracks cursor position through screenshots. + +## Resources + +- [Anthropic Computer Use Documentation](https://docs.anthropic.com/en/docs/build-with-claude/computer-use) +- [Kernel Documentation](https://www.kernel.sh/docs/quickstart) diff --git a/pkg/templates/python/anthropic-computer-use/loop.py b/pkg/templates/python/anthropic-computer-use/loop.py index e4711b1..206c585 100644 --- a/pkg/templates/python/anthropic-computer-use/loop.py +++ b/pkg/templates/python/anthropic-computer-use/loop.py @@ -1,25 +1,16 @@ """ Agentic sampling loop that calls the Anthropic API and local implementation of anthropic-defined computer use tools. 
From https://github.com/anthropics/anthropic-quickstarts/blob/main/computer-use-demo/computer_use_demo/loop.py +Modified to use Kernel Computer Controls API instead of Playwright. """ import os -import platform -from collections.abc import Callable from datetime import datetime from enum import StrEnum from typing import Any, cast -from playwright.async_api import Page - -import httpx -from anthropic import ( - Anthropic, - AnthropicBedrock, - AnthropicVertex, - APIError, - APIResponseValidationError, - APIStatusError, -) + +from kernel import Kernel +from anthropic import Anthropic from anthropic.types.beta import ( BetaCacheControlEphemeralParam, BetaContentBlockParam, @@ -78,6 +69,8 @@ async def sampling_loop( model: str, messages: list[BetaMessageParam], api_key: str, + kernel: Kernel, + session_id: str, provider: APIProvider = APIProvider.ANTHROPIC, system_prompt_suffix: str = "", only_n_most_recent_images: int | None = None, @@ -85,7 +78,6 @@ async def sampling_loop( tool_version: ToolVersion = "computer_use_20250124", thinking_budget: int | None = None, token_efficient_tools_beta: bool = False, - playwright_page: Page, ): """ Agentic sampling loop for the assistant/tool interaction of computer use. 
@@ -94,6 +86,8 @@ async def sampling_loop( model: The model to use for the API call messages: The conversation history api_key: The API key for authentication + kernel: The Kernel client instance + session_id: The Kernel browser session ID provider: The API provider (defaults to ANTHROPIC) system_prompt_suffix: Additional system prompt text (defaults to empty string) only_n_most_recent_images: Optional limit on number of recent images to keep @@ -101,12 +95,11 @@ async def sampling_loop( tool_version: Version of tools to use (defaults to V20250124) thinking_budget: Optional token budget for thinking token_efficient_tools_beta: Whether to use token efficient tools beta - playwright_page: The Playwright page instance for browser automation """ tool_group = TOOL_GROUPS_BY_VERSION[tool_version] tool_collection = ToolCollection( *( - ToolCls(page=playwright_page if ToolCls.__name__.startswith("ComputerTool") else None) + ToolCls(kernel=kernel, session_id=session_id) if ToolCls.__name__.startswith("ComputerTool") else ToolCls() for ToolCls in tool_group.tools ) ) @@ -252,21 +245,30 @@ def _response_to_params( ) -> list[BetaContentBlockParam]: res: list[BetaContentBlockParam] = [] for block in response.content: - if isinstance(block, BetaTextBlock): - if block.text: + block_type = getattr(block, "type", None) + + # Handle thinking blocks + if block_type == "thinking": + thinking_block = { + "type": "thinking", + "thinking": getattr(block, "thinking", None), + } + if hasattr(block, "signature"): + thinking_block["signature"] = getattr(block, "signature", None) + res.append(cast(BetaContentBlockParam, thinking_block)) + # Handle text blocks + elif block_type == "text" or isinstance(block, BetaTextBlock): + if getattr(block, "text", None): res.append(BetaTextBlockParam(type="text", text=block.text)) - elif getattr(block, "type", None) == "thinking": - # Handle thinking blocks - include signature field - thinking_block = { - "type": "thinking", - "thinking": getattr(block, 
"thinking", None), - } - if hasattr(block, "signature"): - thinking_block["signature"] = getattr(block, "signature", None) - res.append(cast(BetaContentBlockParam, thinking_block)) - else: - # Handle tool use blocks normally - res.append(cast(BetaToolUseBlockParam, block.model_dump())) + # Handle tool use blocks + elif block_type == "tool_use": + tool_use_block: BetaToolUseBlockParam = { + "type": "tool_use", + "id": block.id, + "name": block.name, + "input": block.input, + } + res.append(tool_use_block) return res @@ -334,4 +336,4 @@ def _make_api_tool_result( def _maybe_prepend_system_tool_result(result: ToolResult, result_text: str): if result.system: result_text = f"{result.system}\n{result_text}" - return result_text \ No newline at end of file + return result_text diff --git a/pkg/templates/python/anthropic-computer-use/main.py b/pkg/templates/python/anthropic-computer-use/main.py index e53090a..654dda7 100644 --- a/pkg/templates/python/anthropic-computer-use/main.py +++ b/pkg/templates/python/anthropic-computer-use/main.py @@ -1,97 +1,95 @@ import os -from typing import Dict, TypedDict +from typing import Dict, Optional, TypedDict import kernel -from kernel import Kernel from loop import sampling_loop -from playwright.async_api import async_playwright +from session import KernelBrowserSession class QueryInput(TypedDict): query: str + record_replay: Optional[bool] class QueryOutput(TypedDict): result: str + replay_url: Optional[str] api_key = os.getenv("ANTHROPIC_API_KEY") if not api_key: raise ValueError("ANTHROPIC_API_KEY is not set") -client = Kernel() app = kernel.App("python-anthropic-cua") + @app.action("cua-task") async def cua_task( ctx: kernel.KernelContext, payload: QueryInput, ) -> QueryOutput: - # A function that processes a user query using a browser-based sampling loop - - # Args: - # ctx: Kernel context containing invocation information - # payload: An object containing a query string to process - - # Returns: - # A dictionary containing the 
result of the sampling loop as a string + """ + Process a user query using Anthropic Computer Use with Kernel's browser automation. + + Args: + ctx: Kernel context containing invocation information + payload: An object containing: + - query: The task/query string to process + - record_replay: Optional boolean to enable video replay recording + + Returns: + A dictionary containing: + - result: The result of the sampling loop as a string + - replay_url: URL to view the replay (if recording was enabled) + """ if not payload or not payload.get("query"): raise ValueError("Query is required") - kernel_browser = client.browsers.create( - invocation_id=ctx.invocation_id, stealth=True - ) - print("Kernel browser live view url: ", kernel_browser.browser_live_view_url) - - try: - async with async_playwright() as playwright: - browser = await playwright.chromium.connect_over_cdp( - kernel_browser.cdp_ws_url - ) - context = ( - browser.contexts[0] if browser.contexts else await browser.new_context() + record_replay = payload.get("record_replay", False) + + async with KernelBrowserSession( + stealth=True, + record_replay=record_replay, + ) as session: + print("Kernel browser live view url:", session.live_view_url) + + # Run the sampling loop + final_messages = await sampling_loop( + model="claude-sonnet-4-20250514", + messages=[ + { + "role": "user", + "content": payload["query"], + } + ], + api_key=str(api_key), + thinking_budget=1024, + kernel=session.kernel, + session_id=session.session_id, + ) + + # Extract the final result + if not final_messages: + raise ValueError("No messages were generated during the sampling loop") + + last_message = final_messages[-1] + if not last_message: + raise ValueError( + "Failed to get the last message from the sampling loop" ) - page = context.pages[0] if context.pages else await context.new_page() - - # Run the sampling loop - final_messages = await sampling_loop( - model="claude-sonnet-4-20250514", - messages=[ - { - "role": "user", - 
"content": payload["query"], - } - ], - api_key=str(api_key), - thinking_budget=1024, - playwright_page=page, + + result = "" + if isinstance(last_message.get("content"), str): + result = last_message["content"] # type: ignore[assignment] + else: + result = "".join( + block["text"] + for block in last_message["content"] # type: ignore[index] + if isinstance(block, Dict) and block.get("type") == "text" ) - # Extract the final result - if not final_messages: - raise ValueError("No messages were generated during the sampling loop") - - last_message = final_messages[-1] - if not last_message: - raise ValueError( - "Failed to get the last message from the sampling loop" - ) - - result = "" - if isinstance(last_message.get("content"), str): - result = last_message["content"] # type: ignore[assignment] - else: - result = "".join( - block["text"] - for block in last_message["content"] # type: ignore[index] - if isinstance(block, Dict) and block.get("type") == "text" - ) - - return {"result": result} - except Exception as exc: - print(f"Error in sampling loop: {exc}") - raise - finally: - if browser is not None: - await browser.close() - client.browsers.delete_by_id(kernel_browser.session_id) + # Session is cleaned up, replay_url is available if recording was enabled + return { + "result": result, + "replay_url": session.replay_view_url, + } diff --git a/pkg/templates/python/anthropic-computer-use/pyproject.toml b/pkg/templates/python/anthropic-computer-use/pyproject.toml index f9a7686..f5b75de 100644 --- a/pkg/templates/python/anthropic-computer-use/pyproject.toml +++ b/pkg/templates/python/anthropic-computer-use/pyproject.toml @@ -5,11 +5,9 @@ description = "Kernel reference app for Anthropic Computer Use" requires-python = ">=3.9" dependencies = [ "anthropic>=0.75.0", - "playwright>=1.56.0", "python-dateutil>=2.9.0", "pydantic>=2.12.5", "typing-extensions>=4.15.0", - "kernel>=0.23.0", + "kernel>=0.24.0", "python-dotenv>=1.2.1", - "httpx>=0.28.1", ] diff --git 
a/pkg/templates/python/anthropic-computer-use/session.py b/pkg/templates/python/anthropic-computer-use/session.py new file mode 100644 index 0000000..c34ac01 --- /dev/null +++ b/pkg/templates/python/anthropic-computer-use/session.py @@ -0,0 +1,139 @@ +""" +Kernel Browser Session Manager. + +Provides an async context manager for managing Kernel browser lifecycle +with optional video replay recording. +""" + +import asyncio +import time +from dataclasses import dataclass, field +from typing import Optional + +from kernel import Kernel + + +@dataclass +class KernelBrowserSession: + """ + Manages Kernel browser lifecycle as an async context manager. + + Creates a browser session on entry and cleans it up on exit. + Optionally records a video replay of the entire session. + Provides session_id to computer tools. + + Usage: + async with KernelBrowserSession(record_replay=True) as session: + # Use session.session_id and session.kernel for operations + pass + # Browser is automatically cleaned up, replay URL available in session.replay_view_url + """ + + stealth: bool = True + timeout_seconds: int = 300 + + # Replay recording options + record_replay: bool = False + replay_grace_period: float = 5.0 # Seconds to wait before stopping replay + + # Set after browser creation + session_id: Optional[str] = field(default=None, init=False) + live_view_url: Optional[str] = field(default=None, init=False) + replay_id: Optional[str] = field(default=None, init=False) + replay_view_url: Optional[str] = field(default=None, init=False) + _kernel: Optional[Kernel] = field(default=None, init=False) + + async def __aenter__(self) -> "KernelBrowserSession": + """Create a Kernel browser session and optionally start recording.""" + self._kernel = Kernel() + + # Create browser with specified settings + browser = self._kernel.browsers.create( + stealth=self.stealth, + timeout_seconds=self.timeout_seconds, + ) + + self.session_id = browser.session_id + self.live_view_url = 
browser.browser_live_view_url + + print(f"Kernel browser created: {self.session_id}") + print(f"Live view URL: {self.live_view_url}") + + # Start replay recording if enabled + if self.record_replay: + await self._start_replay() + + return self + + async def _start_replay(self) -> None: + """Start recording a replay of the browser session.""" + if not self._kernel or not self.session_id: + return + + print("Starting replay recording...") + replay = self._kernel.browsers.replays.start(self.session_id) + self.replay_id = replay.replay_id + print(f"Replay recording started: {self.replay_id}") + + async def _stop_and_get_replay_url(self) -> None: + """Stop recording and get the replay URL.""" + if not self._kernel or not self.session_id or not self.replay_id: + return + + print("Stopping replay recording...") + self._kernel.browsers.replays.stop( + replay_id=self.replay_id, + id=self.session_id, + ) + print("Replay recording stopped. Processing video...") + + # Wait a moment for processing + await asyncio.sleep(2) + + # Poll for replay to be ready (with timeout) + max_wait = 60 # seconds + start_time = time.time() + replay_ready = False + + while time.time() - start_time < max_wait: + try: + replays = self._kernel.browsers.replays.list(self.session_id) + for replay in replays: + if replay.replay_id == self.replay_id: + self.replay_view_url = replay.replay_view_url + replay_ready = True + break + if replay_ready: + break + except Exception: + pass + await asyncio.sleep(1) + + if not replay_ready: + print("Warning: Replay may still be processing") + elif self.replay_view_url: + print(f"Replay view URL: {self.replay_view_url}") + + async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: + """Stop recording and delete the browser session.""" + if self._kernel and self.session_id: + # Stop replay if recording was enabled + if self.record_replay and self.replay_id: + # Wait grace period before stopping to capture final state + if self.replay_grace_period > 0: + 
print(f"Waiting {self.replay_grace_period}s grace period...") + await asyncio.sleep(self.replay_grace_period) + await self._stop_and_get_replay_url() + + print(f"Destroying browser session: {self.session_id}") + self._kernel.browsers.delete_by_id(self.session_id) + print("Browser session destroyed.") + + self._kernel = None + + @property + def kernel(self) -> Kernel: + """Get the Kernel client instance.""" + if self._kernel is None: + raise RuntimeError("Session not initialized. Use async with context.") + return self._kernel diff --git a/pkg/templates/python/anthropic-computer-use/tools/computer.py b/pkg/templates/python/anthropic-computer-use/tools/computer.py index 60f7522..083f13f 100644 --- a/pkg/templates/python/anthropic-computer-use/tools/computer.py +++ b/pkg/templates/python/anthropic-computer-use/tools/computer.py @@ -1,53 +1,42 @@ """ +Computer tool using Kernel's Computer Controls API. Modified from https://github.com/anthropics/anthropic-quickstarts/blob/main/computer-use-demo/computer_use_demo/tools/computer.py -Replaces xdotool and gnome-screenshot with Playwright. +Replaces Playwright with Kernel Computer Controls API. 
""" import asyncio import base64 -import os -from enum import StrEnum from typing import Literal, TypedDict, cast, get_args -from playwright.async_api import Page - +from kernel import Kernel from anthropic.types.beta import BetaToolComputerUse20241022Param, BetaToolUnionParam from .base import BaseAnthropicTool, ToolError, ToolResult TYPING_DELAY_MS = 12 -TYPING_GROUP_SIZE = 50 - -# Map alternative names to standard Playwright modifier keys -MODIFIER_KEY_MAP = { - 'ctrl': 'Control', - 'alt': 'Alt', - 'cmd': 'Meta', - 'command': 'Meta', - 'win': 'Meta', -} -# Essential key mappings for Playwright compatibility +# Key mappings for Kernel Computer Controls API +# Map common key names to xdotool-compatible format that Kernel uses KEY_MAP = { - 'return': 'Enter', - 'space': ' ', - 'left': 'ArrowLeft', - 'right': 'ArrowRight', - 'up': 'ArrowUp', - 'down': 'ArrowDown', + 'return': 'Return', + 'enter': 'Return', + 'space': 'space', + 'left': 'Left', + 'right': 'Right', + 'up': 'Up', + 'down': 'Down', 'home': 'Home', 'end': 'End', - 'pageup': 'PageUp', - 'page_up': 'PageUp', - 'pagedown': 'PageDown', - 'page_down': 'PageDown', + 'pageup': 'Page_Up', + 'page_up': 'Page_Up', + 'pagedown': 'Page_Down', + 'page_down': 'Page_Down', 'delete': 'Delete', - 'backspace': 'Backspace', + 'backspace': 'BackSpace', 'tab': 'Tab', 'esc': 'Escape', 'escape': 'Escape', 'insert': 'Insert', - 'super_l': 'Meta', 'f1': 'F1', 'f2': 'F2', 'f3': 'F3', @@ -60,9 +49,21 @@ 'f10': 'F10', 'f11': 'F11', 'f12': 'F12', - 'minus': '-', - 'equal': '=', - 'plus': '+', + 'minus': 'minus', + 'equal': 'equal', + 'plus': 'plus', +} + +# Modifier key mappings +MODIFIER_KEY_MAP = { + 'ctrl': 'ctrl', + 'control': 'ctrl', + 'alt': 'alt', + 'cmd': 'super', + 'command': 'super', + 'win': 'super', + 'meta': 'super', + 'shift': 'shift', } Action_20241022 = Literal[ @@ -92,33 +93,30 @@ ScrollDirection = Literal["up", "down", "left", "right"] -# Map Playwright mouse buttons to our actions -MOUSE_BUTTONS = { - 
"left_click": "left", - "right_click": "right", - "middle_click": "middle", -} class ComputerToolOptions(TypedDict): display_height_px: int display_width_px: int display_number: int | None -def chunks(s: str, chunk_size: int) -> list[str]: - return [s[i : i + chunk_size] for i in range(0, len(s), chunk_size)] class BaseComputerTool: """ - A tool that allows the agent to interact with the screen, keyboard, and mouse using Playwright. + A tool that allows the agent to interact with the screen, keyboard, and mouse using Kernel's Computer Controls API. The tool parameters are defined by Anthropic and are not editable. """ name: Literal["computer"] = "computer" - width: int = 1280 - height: int = 720 + width: int = 1920 + height: int = 1080 display_num: int | None = None - page: Page | None = None - + + # Kernel client and session + kernel: Kernel | None = None + session_id: str | None = None + + # Track last mouse position for drag operations + _last_mouse_position: tuple[int, int] = (0, 0) _screenshot_delay = 2.0 @property @@ -129,9 +127,10 @@ def options(self) -> ComputerToolOptions: "display_number": self.display_num, } - def __init__(self, page: Page | None = None): + def __init__(self, kernel: Kernel | None = None, session_id: str | None = None): super().__init__() - self.page = page + self.kernel = kernel + self.session_id = session_id def validate_coordinates(self, coordinate: tuple[int, int] | list[int] | None = None) -> tuple[int, int] | None: """Validate that coordinates are non-negative integers and convert lists to tuples if needed.""" @@ -152,23 +151,30 @@ def validate_coordinates(self, coordinate: tuple[int, int] | list[int] | None = return coordinate def map_key(self, key: str) -> str: - """Map a key to its Playwright equivalent.""" + """Map a key to its Kernel/xdotool equivalent.""" + key_lower = key.lower().strip() + # Handle modifier keys - if key.lower() in MODIFIER_KEY_MAP: - return MODIFIER_KEY_MAP[key.lower()] + if key_lower in MODIFIER_KEY_MAP: + 
return MODIFIER_KEY_MAP[key_lower] # Handle special keys - if key.lower() in KEY_MAP: - return KEY_MAP[key.lower()] + if key_lower in KEY_MAP: + return KEY_MAP[key_lower] # Handle key combinations (e.g. "ctrl+a") if '+' in key: parts = key.split('+') - if len(parts) == 2: - modifier, main_key = parts - mapped_modifier = MODIFIER_KEY_MAP.get(modifier.lower(), modifier) - mapped_key = KEY_MAP.get(main_key.lower(), main_key) - return f"{mapped_modifier}+{mapped_key}" + mapped_parts = [] + for part in parts: + part = part.strip().lower() + if part in MODIFIER_KEY_MAP: + mapped_parts.append(MODIFIER_KEY_MAP[part]) + elif part in KEY_MAP: + mapped_parts.append(KEY_MAP[part]) + else: + mapped_parts.append(part) + return '+'.join(mapped_parts) # Return the key as is if no mapping exists return key @@ -181,8 +187,8 @@ async def __call__( coordinate: tuple[int, int] | list[int] | None = None, **kwargs, ): - if not self.page: - raise ToolError("Playwright page not initialized") + if not self.kernel or not self.session_id: + raise ToolError("Kernel client or session not initialized") if action in ("mouse_move", "left_click_drag"): if coordinate is None: @@ -194,12 +200,30 @@ async def __call__( x, y = coordinate if action == "mouse_move": - await self.page.mouse.move(x, y) + self.kernel.browsers.computer.move_mouse( + id=self.session_id, + x=x, + y=y, + ) + self._last_mouse_position = (x, y) return await self.screenshot() elif action == "left_click_drag": - await self.page.mouse.down(button="left") - await self.page.mouse.move(x, y) - await self.page.mouse.up(button="left") + # Get start position from kwargs or use last known position + start_coord = kwargs.get("start_coordinate") + if start_coord: + start_coord = self.validate_coordinates(start_coord) + start_x, start_y = start_coord + else: + start_x, start_y = self._last_mouse_position + + print(f"Dragging from ({start_x}, {start_y}) to ({x}, {y})") + + self.kernel.browsers.computer.drag_mouse( + id=self.session_id, + 
path=[[start_x, start_y], [x, y]], + button="left", + ) + self._last_mouse_position = (x, y) return await self.screenshot() if action in ("key", "type"): @@ -208,22 +232,22 @@ async def __call__( if coordinate is not None: raise ToolError(f"coordinate is not accepted for {action}") if not isinstance(text, str): - raise ToolError(output=f"{text} must be a string") + raise ToolError(f"{text} must be a string") if action == "key": mapped_key = self.map_key(text) - await self.page.keyboard.press(mapped_key) + self.kernel.browsers.computer.press_key( + id=self.session_id, + keys=[mapped_key], + ) return await self.screenshot() elif action == "type": - results: list[ToolResult] = [] - for chunk in chunks(text, TYPING_GROUP_SIZE): - await self.page.keyboard.type(chunk, delay=TYPING_DELAY_MS) - results.append(await self.screenshot()) - return ToolResult( - output="".join(result.output or "" for result in results), - error="".join(result.error or "" for result in results), - base64_image=results[-1].base64_image if results else None, + self.kernel.browsers.computer.type_text( + id=self.session_id, + text=text, + delay=TYPING_DELAY_MS, ) + return await self.screenshot() if action in ( "left_click", @@ -239,40 +263,62 @@ async def __call__( if action == "screenshot": return await self.screenshot() elif action == "cursor_position": - # Playwright doesn't provide a direct way to get cursor position - # We'll return a placeholder since this isn't critical functionality - return ToolResult(output="Cursor position not available in Playwright") + # Kernel Computer Controls API doesn't track cursor position + raise ToolError("Cursor position is not available with Kernel Computer Controls API") else: if coordinate is not None: coordinate = self.validate_coordinates(coordinate) x, y = coordinate - await self.page.mouse.move(x, y) + else: + x, y = self._last_mouse_position + + button = "left" + if action == "right_click": + button = "right" + elif action == "middle_click": + button = 
"middle" + num_clicks = 1 if action == "double_click": - await self.page.mouse.dblclick(x, y) - else: - await self.page.mouse.click(x, y, button=MOUSE_BUTTONS[action]) + num_clicks = 2 + + self.kernel.browsers.computer.click_mouse( + id=self.session_id, + x=x, + y=y, + button=button, + num_clicks=num_clicks, + ) + self._last_mouse_position = (x, y) return await self.screenshot() raise ToolError(f"Invalid action: {action}") async def screenshot(self): - """Take a screenshot using Playwright and return the base64 encoded image.""" - if not self.page: - raise ToolError("Playwright page not initialized") + """Take a screenshot using Kernel Computer Controls API and return the base64 encoded image.""" + if not self.kernel or not self.session_id: + raise ToolError("Kernel client or session not initialized") - # Take screenshot using Playwright and get the buffer directly - screenshot_bytes = await self.page.screenshot(type="png") + print("Starting screenshot...") + await asyncio.sleep(self._screenshot_delay) + + response = self.kernel.browsers.computer.capture_screenshot(id=self.session_id) + screenshot_bytes = response.read() + + print(f"Screenshot taken, size: {len(screenshot_bytes)} bytes") + return ToolResult( base64_image=base64.b64encode(screenshot_bytes).decode() ) + class ComputerTool20241022(BaseComputerTool, BaseAnthropicTool): api_type: Literal["computer_20241022"] = "computer_20241022" def to_params(self) -> BetaToolComputerUse20241022Param: return {"name": self.name, "type": self.api_type, **self.options} + class ComputerTool20250124(BaseComputerTool, BaseAnthropicTool): api_type: Literal["computer_20250124"] = "computer_20250124" @@ -294,22 +340,29 @@ async def __call__( key: str | None = None, **kwargs, ): - if not self.page: - raise ToolError("Playwright page not initialized") + if not self.kernel or not self.session_id: + raise ToolError("Kernel client or session not initialized") if action in ("left_mouse_down", "left_mouse_up"): if coordinate is not 
None: - raise ToolError(f"coordinate is not accepted for {action=}.") - if action == "left_mouse_down": - await self.page.mouse.down(button="left") + coordinate = self.validate_coordinates(coordinate) + x, y = coordinate else: - await self.page.mouse.up(button="left") + x, y = self._last_mouse_position + + click_type = "down" if action == "left_mouse_down" else "up" + self.kernel.browsers.computer.click_mouse( + id=self.session_id, + x=x, + y=y, + button="left", + click_type=click_type, + ) + self._last_mouse_position = (x, y) return await self.screenshot() if action == "scroll": - if scroll_direction is None or scroll_direction not in get_args( - ScrollDirection - ): + if scroll_direction is None or scroll_direction not in get_args(ScrollDirection): raise ToolError( f"{scroll_direction=} must be 'up', 'down', 'left', or 'right'" ) @@ -319,31 +372,33 @@ async def __call__( if coordinate is not None: coordinate = self.validate_coordinates(coordinate) x, y = coordinate - await self.page.mouse.move(x, y) - - # Map scroll directions to Playwright's wheel events - page_dimensions = await self.page.evaluate( - "() => Promise.resolve({ h: window.innerHeight, w: window.innerWidth })" - ) - page_partitions = 25 - scroll_factor = scroll_amount / page_partitions - page_width = page_dimensions['w'] - page_height = page_dimensions['h'] + else: + x, y = self._last_mouse_position + # Calculate scroll delta based on direction and amount + # Use a reasonable scroll factor + scroll_factor = scroll_amount * 10 # Adjust multiplier as needed + delta_x = 0 delta_y = 0 if scroll_direction == "up": - delta_y = -scroll_factor * page_height + delta_y = -scroll_factor elif scroll_direction == "down": - delta_y = scroll_factor * page_height + delta_y = scroll_factor elif scroll_direction == "left": - delta_x = -scroll_factor * page_width + delta_x = -scroll_factor elif scroll_direction == "right": - delta_x = scroll_factor * page_width + delta_x = scroll_factor - print(f"Scrolling 
{abs(delta_x) if delta_x != 0 else abs(delta_y):.02f} pixels {scroll_direction}") + print(f"Scrolling {abs(delta_x) if delta_x != 0 else abs(delta_y)} pixels {scroll_direction}") - await self.page.mouse.wheel(delta_x=delta_x, delta_y=delta_y) + self.kernel.browsers.computer.scroll( + id=self.session_id, + x=x, + y=y, + delta_x=delta_x, + delta_y=delta_y, + ) return await self.screenshot() if action in ("hold_key", "wait"): @@ -358,9 +413,11 @@ async def __call__( if text is None: raise ToolError(f"text is required for {action}") mapped_key = self.map_key(text) - await self.page.keyboard.down(mapped_key) - await asyncio.sleep(duration) - await self.page.keyboard.up(mapped_key) + self.kernel.browsers.computer.press_key( + id=self.session_id, + keys=[mapped_key], + duration=int(duration * 1000), # Convert to milliseconds + ) return await self.screenshot() if action == "wait": @@ -380,23 +437,48 @@ async def __call__( if coordinate is not None: coordinate = self.validate_coordinates(coordinate) x, y = coordinate - await self.page.mouse.move(x, y) + else: + x, y = self._last_mouse_position + button = "left" + if action == "right_click": + button = "right" + elif action == "middle_click": + button = "middle" + + num_clicks = 1 + if action == "double_click": + num_clicks = 2 + elif action == "triple_click": + num_clicks = 3 + + # Handle modifier key if provided if key: mapped_key = self.map_key(key) - await self.page.keyboard.down(mapped_key) + # Press modifier key down + self.kernel.browsers.computer.press_key( + id=self.session_id, + keys=[mapped_key], + click_type="down", + ) - if action == "triple_click": - # Playwright doesn't have triple click, so we'll simulate it - await self.page.mouse.click(x, y, click_count=3) - elif action == "double_click": - await self.page.mouse.dblclick(x, y) - else: - await self.page.mouse.click(x, y, button=MOUSE_BUTTONS[action]) + self.kernel.browsers.computer.click_mouse( + id=self.session_id, + x=x, + y=y, + button=button, + 
num_clicks=num_clicks, + ) if key: - await self.page.keyboard.up(mapped_key) + # Release modifier key + self.kernel.browsers.computer.press_key( + id=self.session_id, + keys=[mapped_key], + click_type="up", + ) + self._last_mouse_position = (x, y) return await self.screenshot() return await super().__call__( diff --git a/pkg/templates/typescript/anthropic-computer-use/README.md b/pkg/templates/typescript/anthropic-computer-use/README.md index 2f71cad..d4cd552 100644 --- a/pkg/templates/typescript/anthropic-computer-use/README.md +++ b/pkg/templates/typescript/anthropic-computer-use/README.md @@ -1,7 +1,47 @@ # Kernel TypeScript Sample App - Anthropic Computer Use -This is a simple Kernel application that implements a prompt loop using Anthropic Computer Use. +This is a Kernel application that implements a prompt loop using Anthropic Computer Use with Kernel's Computer Controls API. -It generally follows the [Anthropic Reference Implementation](https://github.com/anthropics/anthropic-quickstarts/tree/main/computer-use-demo) but replaces `xodotool` and `gnome-screenshot` with Playwright. +It generally follows the [Anthropic Reference Implementation](https://github.com/anthropics/anthropic-quickstarts/tree/main/computer-use-demo) but uses Kernel's Computer Controls API instead of `xdotool` and `gnome-screenshot`. -See the [docs](https://www.kernel.sh/docs/quickstart) for information. +## Setup + +1. Get your API keys: + - **Kernel**: [dashboard.onkernel.com](https://dashboard.onkernel.com) + - **Anthropic**: [console.anthropic.com](https://console.anthropic.com) + +2. Deploy the app: +```bash +kernel login +cp .env.example .env # Add your ANTHROPIC_API_KEY +kernel deploy index.ts --env-file .env +``` + +## Usage + +```bash +kernel invoke ts-anthropic-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}' +``` + +## Recording Replays + +> **Note:** Replay recording is only available to Kernel users on paid plans. 
+ +Add `"record_replay": true` to your payload to capture a video of the browser session: + +```bash +kernel invoke ts-anthropic-cua cua-task --payload '{"query": "Navigate to https://example.com", "record_replay": true}' +``` + +When enabled, the response will include a `replay_url` field with a link to view the recorded session. + +## Known Limitations + +### Cursor Position + +The `cursor_position` action is not supported with Kernel's Computer Controls API. If the model attempts to use this action, an error will be returned. This is a known limitation that does not significantly impact most computer use workflows, as the model typically tracks cursor position through screenshots. + +## Resources + +- [Anthropic Computer Use Documentation](https://docs.anthropic.com/en/docs/build-with-claude/computer-use) +- [Kernel Documentation](https://www.kernel.sh/docs/quickstart) diff --git a/pkg/templates/typescript/anthropic-computer-use/index.ts b/pkg/templates/typescript/anthropic-computer-use/index.ts index 35f36d0..015b69c 100644 --- a/pkg/templates/typescript/anthropic-computer-use/index.ts +++ b/pkg/templates/typescript/anthropic-computer-use/index.ts @@ -1,6 +1,6 @@ import { Kernel, type KernelContext } from '@onkernel/sdk'; -import { chromium } from 'playwright-core'; import { samplingLoop } from './loop'; +import { KernelBrowserSession } from './session'; const kernel = new Kernel(); @@ -8,10 +8,12 @@ const app = kernel.app('ts-anthropic-cua'); interface QueryInput { query: string; + record_replay?: boolean; } interface QueryOutput { result: string; + replay_url?: string; } // LLM API Keys are set in the environment during `kernel deploy -e ANTHROPIC_API_KEY=XXX` @@ -29,19 +31,14 @@ app.action( throw new Error('Query is required'); } - const kernelBrowser = await kernel.browsers.create({ - invocation_id: ctx.invocation_id, + // Create browser session with optional replay recording + const session = new KernelBrowserSession(kernel, { stealth: true, + 
recordReplay: payload.record_replay ?? false, }); - console.log("Kernel browser live view url: ", kernelBrowser.browser_live_view_url); - - const browser = await chromium.connectOverCDP(kernelBrowser.cdp_ws_url); - const context = await browser.contexts()[0]; - const page = await context?.pages()[0]; - if (!page) { - throw new Error('Error getting initial page'); - } + await session.start(); + console.log('Kernel browser live view url:', session.liveViewUrl); try { // Run the sampling loop @@ -54,7 +51,7 @@ app.action( apiKey: ANTHROPIC_API_KEY, thinkingBudget: 1024, kernel, - sessionId: kernelBrowser.session_id, + sessionId: session.sessionId, }); // Extract the final result from the messages @@ -73,13 +70,18 @@ app.action( block.type === 'text' ? block.text : '' ).join(''); - return { result }; + // Stop session and get replay URL if recording was enabled + const sessionInfo = await session.stop(); + + return { + result, + replay_url: sessionInfo.replayViewUrl, + }; } catch (error) { console.error('Error in sampling loop:', error); + // Make sure to clean up the session even on error + await session.stop(); throw error; - } finally { - await browser.close(); - await kernel.browsers.deleteByID(kernelBrowser.session_id); } }, ); diff --git a/pkg/templates/typescript/anthropic-computer-use/package.json b/pkg/templates/typescript/anthropic-computer-use/package.json index e6ce639..8012da1 100644 --- a/pkg/templates/typescript/anthropic-computer-use/package.json +++ b/pkg/templates/typescript/anthropic-computer-use/package.json @@ -5,9 +5,8 @@ "private": true, "dependencies": { "@anthropic-ai/sdk": "^0.71.2", - "@onkernel/sdk": "^0.23.0", - "luxon": "^3.7.2", - "playwright-core": "^1.57.0" + "@onkernel/sdk": "^0.24.0", + "luxon": "^3.7.2" }, "devDependencies": { "@types/node": "^22.15.17", diff --git a/pkg/templates/typescript/anthropic-computer-use/session.ts b/pkg/templates/typescript/anthropic-computer-use/session.ts new file mode 100644 index 0000000..b61819d --- 
/dev/null +++ b/pkg/templates/typescript/anthropic-computer-use/session.ts @@ -0,0 +1,207 @@ +/** + * Kernel Browser Session Manager. + * + * Provides a class for managing Kernel browser lifecycle + * with optional video replay recording. + */ + +import type { Kernel } from '@onkernel/sdk'; + +export interface SessionOptions { + /** Enable stealth mode to avoid bot detection */ + stealth?: boolean; + /** Browser session timeout in seconds */ + timeoutSeconds?: number; + /** Enable replay recording (requires paid plan) */ + recordReplay?: boolean; + /** Grace period in seconds before stopping replay */ + replayGracePeriod?: number; +} + +export interface SessionInfo { + sessionId: string; + liveViewUrl: string; + replayId?: string; + replayViewUrl?: string; +} + +const DEFAULT_OPTIONS: Required = { + stealth: true, + timeoutSeconds: 300, + recordReplay: false, + replayGracePeriod: 5.0, +}; + +/** + * Manages Kernel browser lifecycle with optional replay recording. + * + * Usage: + * ```typescript + * const session = new KernelBrowserSession(kernel, options); + * await session.start(); + * try { + * // Use session.sessionId for computer controls + * } finally { + * await session.stop(); + * } + * ``` + */ +export class KernelBrowserSession { + private kernel: Kernel; + private options: Required; + + // Session state + private _sessionId: string | null = null; + private _liveViewUrl: string | null = null; + private _replayId: string | null = null; + private _replayViewUrl: string | null = null; + + constructor(kernel: Kernel, options: SessionOptions = {}) { + this.kernel = kernel; + this.options = { ...DEFAULT_OPTIONS, ...options }; + } + + get sessionId(): string { + if (!this._sessionId) { + throw new Error('Session not started. 
Call start() first.'); + } + return this._sessionId; + } + + get liveViewUrl(): string | null { + return this._liveViewUrl; + } + + get replayViewUrl(): string | null { + return this._replayViewUrl; + } + + get info(): SessionInfo { + return { + sessionId: this.sessionId, + liveViewUrl: this._liveViewUrl || '', + replayId: this._replayId || undefined, + replayViewUrl: this._replayViewUrl || undefined, + }; + } + + /** + * Create a Kernel browser session and optionally start recording. + */ + async start(): Promise { + // Create browser with specified settings + const browser = await this.kernel.browsers.create({ + stealth: this.options.stealth, + timeout_seconds: this.options.timeoutSeconds, + }); + + this._sessionId = browser.session_id; + this._liveViewUrl = browser.browser_live_view_url; + + console.log(`Kernel browser created: ${this._sessionId}`); + console.log(`Live view URL: ${this._liveViewUrl}`); + + // Start replay recording if enabled + if (this.options.recordReplay) { + await this.startReplay(); + } + + return this.info; + } + + /** + * Start recording a replay of the browser session. + */ + private async startReplay(): Promise { + if (!this._sessionId) { + return; + } + + console.log('Starting replay recording...'); + const replay = await this.kernel.browsers.replays.start(this._sessionId); + this._replayId = replay.replay_id; + console.log(`Replay recording started: ${this._replayId}`); + } + + /** + * Stop recording and get the replay URL. + */ + private async stopReplay(): Promise { + if (!this._sessionId || !this._replayId) { + return; + } + + console.log('Stopping replay recording...'); + await this.kernel.browsers.replays.stop(this._replayId, { + id: this._sessionId, + }); + console.log('Replay recording stopped. 
Processing video...'); + + // Wait a moment for processing + await this.sleep(2000); + + // Poll for replay to be ready (with timeout) + const maxWait = 60000; // 60 seconds + const startTime = Date.now(); + let replayReady = false; + + while (Date.now() - startTime < maxWait) { + try { + const replays = await this.kernel.browsers.replays.list(this._sessionId); + for (const replay of replays) { + if (replay.replay_id === this._replayId) { + this._replayViewUrl = replay.replay_view_url; + replayReady = true; + break; + } + } + if (replayReady) { + break; + } + } catch { + // Ignore errors while polling + } + await this.sleep(1000); + } + + if (!replayReady) { + console.log('Warning: Replay may still be processing'); + } else if (this._replayViewUrl) { + console.log(`Replay view URL: ${this._replayViewUrl}`); + } + } + + /** + * Stop recording, and delete the browser session. + */ + async stop(): Promise { + const info = this.info; + + if (this._sessionId) { + // Stop replay if recording was enabled + if (this.options.recordReplay && this._replayId) { + // Wait grace period before stopping to capture final state + if (this.options.replayGracePeriod > 0) { + console.log(`Waiting ${this.options.replayGracePeriod}s grace period...`); + await this.sleep(this.options.replayGracePeriod * 1000); + } + await this.stopReplay(); + info.replayViewUrl = this._replayViewUrl || undefined; + } + + console.log(`Destroying browser session: ${this._sessionId}`); + await this.kernel.browsers.deleteByID(this._sessionId); + console.log('Browser session destroyed.'); + } + + // Reset state + this._sessionId = null; + this._replayId = null; + + return info; + } + + private sleep(ms: number): Promise { + return new Promise(resolve => setTimeout(resolve, ms)); + } +} diff --git a/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts b/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts index 333bcdf..75fbd07 100644 --- 
a/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts +++ b/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts @@ -13,6 +13,9 @@ export class ComputerTool implements BaseAnthropicTool { protected sessionId: string; protected _screenshotDelay = 2.0; protected version: '20241022' | '20250124'; + + // Track the last known mouse position for drag operations + private lastMousePosition: [number, number] = [0, 0]; private readonly mouseActions = new Set([ Action.LEFT_CLICK, @@ -52,8 +55,8 @@ export class ComputerTool implements BaseAnthropicTool { const params = { name: this.name, type: this.apiType, - display_width_px: 1280, - display_height_px: 720, + display_width_px: 1920, + display_height_px: 1080, display_number: null, }; return params; @@ -85,6 +88,8 @@ export class ComputerTool implements BaseAnthropicTool { x, y, }); + // Track mouse position for drag operations + this.lastMousePosition = [x, y]; } else if (action === Action.LEFT_MOUSE_DOWN) { await this.kernel.browsers.computer.clickMouse(this.sessionId, { x, @@ -92,6 +97,7 @@ export class ComputerTool implements BaseAnthropicTool { button: 'left', click_type: 'down', }); + this.lastMousePosition = [x, y]; } else if (action === Action.LEFT_MOUSE_UP) { await this.kernel.browsers.computer.clickMouse(this.sessionId, { x, @@ -99,6 +105,7 @@ export class ComputerTool implements BaseAnthropicTool { button: 'left', click_type: 'up', }); + this.lastMousePosition = [x, y]; } else { const button = this.getMouseButton(action); let numClicks = 1; @@ -115,6 +122,8 @@ export class ComputerTool implements BaseAnthropicTool { click_type: 'click', num_clicks: numClicks, }); + // Track mouse position for drag operations + this.lastMousePosition = [x, y]; } await new Promise(resolve => setTimeout(resolve, 500)); @@ -290,15 +299,32 @@ export class ComputerTool implements BaseAnthropicTool { if (!coordinate) { throw new ToolError(`coordinate is required for ${action}`); } - // For drag, we need a path - 
for now, we'll handle it as a simple click - // The drag action would need additional path information - const [x, y] = ActionValidator.validateAndGetCoordinates(coordinate); - await this.kernel.browsers.computer.clickMouse(this.sessionId, { - x, - y, + + // Get the destination coordinate + const [endX, endY] = ActionValidator.validateAndGetCoordinates(coordinate); + + // Check if start_coordinate is provided in kwargs (for newer API versions) + let startX: number, startY: number; + const startCoordinate = kwargs.start_coordinate as [number, number] | undefined; + + if (startCoordinate) { + [startX, startY] = ActionValidator.validateAndGetCoordinates(startCoordinate); + } else { + // Use last known mouse position as the start point + [startX, startY] = this.lastMousePosition; + } + + console.log(`Dragging from (${startX}, ${startY}) to (${endX}, ${endY})`); + + // Use Kernel's dragMouse API with a path from start to end + await this.kernel.browsers.computer.dragMouse(this.sessionId, { + path: [[startX, startY], [endX, endY]], button: 'left', - click_type: 'click', }); + + // Update tracked mouse position to the end of the drag + this.lastMousePosition = [endX, endY]; + await new Promise(resolve => setTimeout(resolve, 500)); return await this.screenshot(); } From 3df2243f8edc63c058819223bbde3c1081118e42 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Thu, 15 Jan 2026 08:26:09 -0500 Subject: [PATCH 3/8] feat(templates): update Anthropic Computer Use templates with 1024x768 viewport and Claude Sonnet 4.5 Updates both TypeScript and Python Anthropic Computer Use templates: - Set viewport to 1024x768@60Hz (Anthropic recommended size) - Update model to claude-sonnet-4-5-20250929 - Fix coordinate alignment between browser viewport and computer tool dimensions Changes: - pkg/templates/typescript/anthropic-computer-use/ - tools/computer.ts: display_width_px=1024, display_height_px=768 - session.ts: viewport 1024x768@60Hz - index.ts: model updated to 
claude-sonnet-4-5-20250929 - pkg/templates/python/anthropic-computer-use/ - tools/computer.py: width=1024, height=768 - session.py: viewport 1024x768@60Hz - main.py: model updated to claude-sonnet-4-5-20250929 Test replays (magnitasks.com Kanban drag test - moved 5 items to Done): - TypeScript: https://proxy.iad-awesome-blackwell.onkernel.com:8443/browser/replays?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE4MDAwMTgyNTYsInNlc3Npb24iOnsiaWQiOiJmZDA3NGRxZjY5bnNlcjk4aDliNGtrb3giLCJjZHBQb3J0Ijo5MjIyLCJjZHBXc1BhdGgiOiIiLCJpbnN0YW5jZU5hbWUiOiJicm93c2VyLXN0ZWFsdGgtcHJvZHVjdGlvbi01LWFsbG93ZWQtaGFtbWVyaGVhZC00MjcxIiwiZnFkbiI6InF1aWV0LXRyZWUtM3kybnd6c2EucHJvZC1pYWQtdWtwLWJyb3dzZXJzLTAub25rZXJuZWwuYXBwIiwibWV0cm8iOiJodHRwczovL2FwaS5wcm9kLWlhZC11a3AtYnJvd3NlcnMtMC5vbmtlcm5lbC5ydW4vdjEiLCJ1c2VySWQiOiJ3ODdoNHd1dTRoazNmeHFyZW5iNzFrMnAiLCJvcmdJZCI6ImlxMnRmMjUzbWlsOWptOWhmZjI3bDhyMiIsInN0ZWFsdGgiOnRydWUsImhlYWRsZXNzIjpmYWxzZSwicmVwbGF5UHJlZml4IjoiczM6Ly9rZXJuZWwtYXBpLXByb2Qvc2Vzc2lvbnJlcGxheXMvaXEydGYyNTNtaWw5am05aGZmMjdsOHIyL2ZkMDc0ZHFmNjluc2VyOThoOWI0a2tveCIsImtlcm5lbEh0dHBTZXJ2ZXJQb3J0Ijo0NDQsInRpbWVvdXRTZWNvbmRzIjozMDAsImNyZWF0ZWRBdCI6IjIwMjYtMDEtMTVUMTM6MDQ6MTYuNzc2OTEwOTc5WiIsImltYWdlIjoib25rZXJuZWwva2VybmVsLWN1LXYyNTo5NmYzOGU0Iiwic3RlYWx0aFByb3h5SWRlbnRpZmllciI6Ijg3NTY1X25YREZGQDIxNi4yNDcuMTAyLjE1MDo2MTIzMiIsImxpdmVTbHVnIjoia3c5b0lBc1VzRkxlIiwicHJpdmF0ZUlQIjoiMTcyLjE2LjIuMjAxIiwidmlld3BvcnRXaWR0aCI6MTAyNCwidmlld3BvcnRIZWlnaHQiOjc2OCwidmlld3BvcnRSZWZyZXNoUmF0ZSI6NjB9fQ.GHE2BXg6qrtNMoqO6NvuJ9fbHTW15igfmXl7W-ls3Qg&replay_id=wipxrn813lmajv7ukdkuykoa - Python: 
https://proxy.iad-awesome-blackwell.onkernel.com:8443/browser/replays?jwt=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJleHAiOjE4MDAwMTc4OTUsInNlc3Npb24iOnsiaWQiOiJseTVxOXQxa3F6YXR3NzE1N3lpYzl2M3IiLCJjZHBQb3J0Ijo5MjIyLCJjZHBXc1BhdGgiOiIiLCJpbnN0YW5jZU5hbWUiOiJicm93c2VyLXN0ZWFsdGgtcHJvZHVjdGlvbi01LXJlYWwtd2F0Y2htZW4tNTUxNCIsImZxZG4iOiJ0d2lsaWdodC1ib25vYm8tZGFvZTd5ZngucHJvZC1pYWQtdWtwLWJyb3dzZXJzLTAub25rZXJuZWwuYXBwIiwibWV0cm8iOiJodHRwczovL2FwaS5wcm9kLWlhZC11a3AtYnJvd3NlcnMtMC5vbmtlcm5lbC5ydW4vdjEiLCJ1c2VySWQiOiJ3ODdoNHd1dTRoazNmeHFyZW5iNzFrMnAiLCJvcmdJZCI6ImlxMnRmMjUzbWlsOWptOWhmZjI3bDhyMiIsInN0ZWFsdGgiOnRydWUsImhlYWRsZXNzIjpmYWxzZSwicmVwbGF5UHJlZml4IjoiczM6Ly9rZXJuZWwtYXBpLXByb2Qvc2Vzc2lvbnJlcGxheXMvaXEydGYyNTNtaWw5am05aGZmMjdsOHIyL2x5NXE5dDFrcXphdHc3MTU3eWljOXYzciIsImtlcm5lbEh0dHBTZXJ2ZXJQb3J0Ijo0NDQsInRpbWVvdXRTZWNvbmRzIjozMDAsImNyZWF0ZWRBdCI6IjIwMjYtMDEtMTVUMTI6NTg6MTUuMzk0MjQyNTc3WiIsImltYWdlIjoib25rZXJuZWwva2VybmVsLWN1LXYyNTo5NmYzOGU0Iiwic3RlYWx0aFByb3h5SWRlbnRpZmllciI6Ijg3NTY1X25YREZGQDE0MC4yMzMuMjQ5LjE3NDo2MTIzNCIsImxpdmVTbHVnIjoiak5DdGdpdHRreGtrIiwicHJpdmF0ZUlQIjoiMTcyLjE2LjcuMTMzIiwidmlld3BvcnRXaWR0aCI6MTAyNCwidmlld3BvcnRIZWlnaHQiOjc2OCwidmlld3BvcnRSZWZyZXNoUmF0ZSI6NjB9fQ._AhzTu1HwawrWwDgo66K3FZkEh4dpiOEVPmBTO4A21A&replay_id=pa0ha28zodehf1e1jyv1qibn Resolves KERNEL-725 --- .../python/anthropic-computer-use/loop.py | 3 -- .../python/anthropic-computer-use/main.py | 5 +-- .../python/anthropic-computer-use/session.py | 5 +++ .../anthropic-computer-use/tools/computer.py | 18 ++------- .../anthropic-computer-use/index.ts | 3 +- .../typescript/anthropic-computer-use/loop.ts | 2 +- .../anthropic-computer-use/session.ts | 5 +++ .../anthropic-computer-use/tools/computer.ts | 37 ++++--------------- 8 files changed, 24 insertions(+), 54 deletions(-) diff --git a/pkg/templates/python/anthropic-computer-use/loop.py b/pkg/templates/python/anthropic-computer-use/loop.py index 206c585..d5d9aa1 100644 --- a/pkg/templates/python/anthropic-computer-use/loop.py +++ 
b/pkg/templates/python/anthropic-computer-use/loop.py @@ -247,7 +247,6 @@ def _response_to_params( for block in response.content: block_type = getattr(block, "type", None) - # Handle thinking blocks if block_type == "thinking": thinking_block = { "type": "thinking", @@ -256,11 +255,9 @@ def _response_to_params( if hasattr(block, "signature"): thinking_block["signature"] = getattr(block, "signature", None) res.append(cast(BetaContentBlockParam, thinking_block)) - # Handle text blocks elif block_type == "text" or isinstance(block, BetaTextBlock): if getattr(block, "text", None): res.append(BetaTextBlockParam(type="text", text=block.text)) - # Handle tool use blocks elif block_type == "tool_use": tool_use_block: BetaToolUseBlockParam = { "type": "tool_use", diff --git a/pkg/templates/python/anthropic-computer-use/main.py b/pkg/templates/python/anthropic-computer-use/main.py index 654dda7..51b571d 100644 --- a/pkg/templates/python/anthropic-computer-use/main.py +++ b/pkg/templates/python/anthropic-computer-use/main.py @@ -53,9 +53,8 @@ async def cua_task( ) as session: print("Kernel browser live view url:", session.live_view_url) - # Run the sampling loop final_messages = await sampling_loop( - model="claude-sonnet-4-20250514", + model="claude-sonnet-4-5-20250929", messages=[ { "role": "user", @@ -68,7 +67,6 @@ async def cua_task( session_id=session.session_id, ) - # Extract the final result if not final_messages: raise ValueError("No messages were generated during the sampling loop") @@ -88,7 +86,6 @@ async def cua_task( if isinstance(block, Dict) and block.get("type") == "text" ) - # Session is cleaned up, replay_url is available if recording was enabled return { "result": result, "replay_url": session.replay_view_url, diff --git a/pkg/templates/python/anthropic-computer-use/session.py b/pkg/templates/python/anthropic-computer-use/session.py index c34ac01..179dda2 100644 --- a/pkg/templates/python/anthropic-computer-use/session.py +++ 
b/pkg/templates/python/anthropic-computer-use/session.py @@ -51,6 +51,11 @@ async def __aenter__(self) -> "KernelBrowserSession": browser = self._kernel.browsers.create( stealth=self.stealth, timeout_seconds=self.timeout_seconds, + viewport={ + "width": 1024, + "height": 768, + "refresh_rate": 60, + }, ) self.session_id = browser.session_id diff --git a/pkg/templates/python/anthropic-computer-use/tools/computer.py b/pkg/templates/python/anthropic-computer-use/tools/computer.py index 083f13f..d51d34a 100644 --- a/pkg/templates/python/anthropic-computer-use/tools/computer.py +++ b/pkg/templates/python/anthropic-computer-use/tools/computer.py @@ -107,8 +107,8 @@ class BaseComputerTool: """ name: Literal["computer"] = "computer" - width: int = 1920 - height: int = 1080 + width: int = 1024 + height: int = 768 display_num: int | None = None # Kernel client and session @@ -208,13 +208,8 @@ async def __call__( self._last_mouse_position = (x, y) return await self.screenshot() elif action == "left_click_drag": - # Get start position from kwargs or use last known position start_coord = kwargs.get("start_coordinate") - if start_coord: - start_coord = self.validate_coordinates(start_coord) - start_x, start_y = start_coord - else: - start_x, start_y = self._last_mouse_position + start_x, start_y = self.validate_coordinates(start_coord) if start_coord else self._last_mouse_position print(f"Dragging from ({start_x}, {start_y}) to ({x}, {y})") @@ -375,9 +370,7 @@ async def __call__( else: x, y = self._last_mouse_position - # Calculate scroll delta based on direction and amount - # Use a reasonable scroll factor - scroll_factor = scroll_amount * 10 # Adjust multiplier as needed + scroll_factor = scroll_amount * 10 delta_x = 0 delta_y = 0 @@ -452,10 +445,8 @@ async def __call__( elif action == "triple_click": num_clicks = 3 - # Handle modifier key if provided if key: mapped_key = self.map_key(key) - # Press modifier key down self.kernel.browsers.computer.press_key( 
id=self.session_id, keys=[mapped_key], @@ -471,7 +462,6 @@ async def __call__( ) if key: - # Release modifier key self.kernel.browsers.computer.press_key( id=self.session_id, keys=[mapped_key], diff --git a/pkg/templates/typescript/anthropic-computer-use/index.ts b/pkg/templates/typescript/anthropic-computer-use/index.ts index 015b69c..b126626 100644 --- a/pkg/templates/typescript/anthropic-computer-use/index.ts +++ b/pkg/templates/typescript/anthropic-computer-use/index.ts @@ -43,7 +43,7 @@ app.action( try { // Run the sampling loop const finalMessages = await samplingLoop({ - model: 'claude-sonnet-4-20250514', + model: 'claude-sonnet-4-5-20250929', messages: [{ role: 'user', content: payload.query, @@ -79,7 +79,6 @@ app.action( }; } catch (error) { console.error('Error in sampling loop:', error); - // Make sure to clean up the session even on error await session.stop(); throw error; } diff --git a/pkg/templates/typescript/anthropic-computer-use/loop.ts b/pkg/templates/typescript/anthropic-computer-use/loop.ts index 199e92c..fa775d9 100644 --- a/pkg/templates/typescript/anthropic-computer-use/loop.ts +++ b/pkg/templates/typescript/anthropic-computer-use/loop.ts @@ -118,7 +118,7 @@ export async function samplingLoop({ messages, model, system: [system], - tools: toolParams as any, // Type assertion needed due to ActionParams being used for both tool definition and input + tools: toolParams, betas, ...extraBody, }); diff --git a/pkg/templates/typescript/anthropic-computer-use/session.ts b/pkg/templates/typescript/anthropic-computer-use/session.ts index b61819d..c237259 100644 --- a/pkg/templates/typescript/anthropic-computer-use/session.ts +++ b/pkg/templates/typescript/anthropic-computer-use/session.ts @@ -93,6 +93,11 @@ export class KernelBrowserSession { const browser = await this.kernel.browsers.create({ stealth: this.options.stealth, timeout_seconds: this.options.timeoutSeconds, + viewport: { + width: 1024, + height: 768, + refresh_rate: 60, + }, }); 
this._sessionId = browser.session_id; diff --git a/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts b/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts index 75fbd07..b0de903 100644 --- a/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts +++ b/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts @@ -14,7 +14,6 @@ export class ComputerTool implements BaseAnthropicTool { protected _screenshotDelay = 2.0; protected version: '20241022' | '20250124'; - // Track the last known mouse position for drag operations private lastMousePosition: [number, number] = [0, 0]; private readonly mouseActions = new Set([ @@ -55,8 +54,8 @@ export class ComputerTool implements BaseAnthropicTool { const params = { name: this.name, type: this.apiType, - display_width_px: 1920, - display_height_px: 1080, + display_width_px: 1024, + display_height_px: 768, display_number: null, }; return params; @@ -88,7 +87,6 @@ export class ComputerTool implements BaseAnthropicTool { x, y, }); - // Track mouse position for drag operations this.lastMousePosition = [x, y]; } else if (action === Action.LEFT_MOUSE_DOWN) { await this.kernel.browsers.computer.clickMouse(this.sessionId, { @@ -122,7 +120,6 @@ export class ComputerTool implements BaseAnthropicTool { click_type: 'click', num_clicks: numClicks, }); - // Track mouse position for drag operations this.lastMousePosition = [x, y]; } @@ -132,21 +129,17 @@ export class ComputerTool implements BaseAnthropicTool { private async handleKeyboardAction(action: Action, text: string, duration?: number): Promise { if (action === Action.HOLD_KEY) { - // For HOLD_KEY, we need to press and hold for the duration - // OnKernel doesn't have a direct hold API, so we'll use pressKey with duration const key = this.convertToOnKernelKey(text); await this.kernel.browsers.computer.pressKey(this.sessionId, { keys: [key], duration: duration ? 
duration * 1000 : undefined, }); } else if (action === Action.KEY) { - // Convert key combination to OnKernel format (e.g., "Ctrl+t") const key = this.convertKeyCombinationToOnKernel(text); await this.kernel.browsers.computer.pressKey(this.sessionId, { keys: [key], }); } else { - // TYPE action - use typeText await this.kernel.browsers.computer.typeText(this.sessionId, { text, delay: TYPING_DELAY_MS, @@ -234,9 +227,6 @@ export class ComputerTool implements BaseAnthropicTool { } if (action === Action.CURSOR_POSITION) { - // OnKernel computer controls don't have a direct cursor position API - // This would need to be handled differently or removed - // For now, we'll return an error indicating this feature isn't available throw new ToolError('Cursor position is not available with OnKernel computer controls API'); } @@ -257,14 +247,11 @@ export class ComputerTool implements BaseAnthropicTool { const [x, y] = coordinate ? ActionValidator.validateAndGetCoordinates(coordinate) - : [0, 0]; // Default to top-left if no coordinate provided + : [0, 0]; - // Convert scroll direction and amount to delta_x and delta_y - // OnKernel uses positive delta_y for scrolling down, negative for up - // Positive delta_x for scrolling right, negative for left let delta_x = 0; let delta_y = 0; - const scrollDelta = scrollAmountValue || 120; // Default scroll amount + const scrollDelta = scrollAmountValue || 120; if (scrollDirection === 'down') { delta_y = scrollDelta; @@ -300,29 +287,19 @@ export class ComputerTool implements BaseAnthropicTool { throw new ToolError(`coordinate is required for ${action}`); } - // Get the destination coordinate const [endX, endY] = ActionValidator.validateAndGetCoordinates(coordinate); - - // Check if start_coordinate is provided in kwargs (for newer API versions) - let startX: number, startY: number; const startCoordinate = kwargs.start_coordinate as [number, number] | undefined; - - if (startCoordinate) { - [startX, startY] = 
ActionValidator.validateAndGetCoordinates(startCoordinate); - } else { - // Use last known mouse position as the start point - [startX, startY] = this.lastMousePosition; - } + const [startX, startY] = startCoordinate + ? ActionValidator.validateAndGetCoordinates(startCoordinate) + : this.lastMousePosition; console.log(`Dragging from (${startX}, ${startY}) to (${endX}, ${endY})`); - // Use Kernel's dragMouse API with a path from start to end await this.kernel.browsers.computer.dragMouse(this.sessionId, { path: [[startX, startY], [endX, endY]], button: 'left', }); - // Update tracked mouse position to the end of the drag this.lastMousePosition = [endX, endY]; await new Promise(resolve => setTimeout(resolve, 500)); From 6801fa2530d53a54a88346c7fdc8cb13d45a302b Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Thu, 15 Jan 2026 11:04:04 -0500 Subject: [PATCH 4/8] Update templates.go Updated invokecommand example for the anthropic templates --- pkg/create/templates.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/create/templates.go b/pkg/create/templates.go index ca9067c..f99c4e6 100644 --- a/pkg/create/templates.go +++ b/pkg/create/templates.go @@ -178,7 +178,7 @@ var Commands = map[string]map[string]DeployConfig{ TemplateAnthropicComputerUse: { EntryPoint: "index.ts", NeedsEnvFile: true, - InvokeCommand: `kernel invoke ts-anthropic-cua cua-task --payload '{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}'`, + InvokeCommand: `kernel invoke ts-anthropic-cua cua-task --payload '{"query": "Navigate to http://magnitasks.com and click on Tasks in the sidebar"}'`, }, TemplateMagnitude: { EntryPoint: "index.ts", @@ -220,7 +220,7 @@ var Commands = map[string]map[string]DeployConfig{ TemplateAnthropicComputerUse: { EntryPoint: "main.py", NeedsEnvFile: true, - InvokeCommand: `kernel invoke python-anthropic-cua cua-task --payload '{"query": "Return the first url of a search result for NYC restaurant reviews 
Pete Wells"}'`, + InvokeCommand: `kernel invoke python-anthropic-cua cua-task --payload '{"query": "Navigate to http://magnitasks.com and click on Tasks in the sidebar"}'`, }, TemplateOpenAIComputerUse: { EntryPoint: "main.py", From 45af9fb83ac0fd2191eeef726285378c181d4d87 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Thu, 15 Jan 2026 11:04:20 -0500 Subject: [PATCH 5/8] Update qa.md with new queries for anthropic templates --- .cursor/commands/qa.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.cursor/commands/qa.md b/.cursor/commands/qa.md index 459bf56..46fb874 100644 --- a/.cursor/commands/qa.md +++ b/.cursor/commands/qa.md @@ -230,7 +230,7 @@ Once all deployments are complete, present the human with these invoke commands kernel invoke ts-basic get-page-title --payload '{"url": "https://www.google.com"}' kernel invoke ts-captcha-solver test-captcha-solver kernel invoke ts-stagehand teamsize-task --payload '{"company": "Kernel"}' -kernel invoke ts-anthropic-cua cua-task --payload '{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}' +kernel invoke ts-anthropic-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. You are done successfully when the items are moved.", "record_replay": true}' kernel invoke ts-magnitude mag-url-extract --payload '{"url": "https://en.wikipedia.org/wiki/Special:Random"}' kernel invoke ts-openai-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}' kernel invoke ts-gemini-cua gemini-cua-task --payload '{"startingUrl": "https://www.magnitasks.com/", "instruction": "Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board? 
You are done successfully when the items are moved."}' @@ -240,7 +240,7 @@ kernel invoke ts-claude-agent-sdk agent-task --payload '{"task": "Go to https:// kernel invoke python-basic get-page-title --payload '{"url": "https://www.google.com"}' kernel invoke python-captcha-solver test-captcha-solver kernel invoke python-bu bu-task --payload '{"task": "Compare the price of gpt-4o and DeepSeek-V3"}' -kernel invoke python-anthropic-cua cua-task --payload '{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}' +kernel invoke python-anthropic-cua cua-task --payload '{"query": "Go to http://magnitasks.com, Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. You are done successfully when the items are moved.", "record_replay": true}' kernel invoke python-openai-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}' kernel invoke python-openagi-cua openagi-default-task -p '{"instruction": "Navigate to https://agiopen.org and click the What is Computer Use? button"}' kernel invoke py-claude-agent-sdk agent-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 3 stories"}' From af0c2330aae3df4a0cd6abfd36cb971b77892327 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Thu, 15 Jan 2026 17:57:47 -0500 Subject: [PATCH 6/8] fix(anthropic-cua): address PR review feedback for robustness and consistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TypeScript template: - Add xdotool-format key mappings for consistency with Python template - Rename methods from convertToOnKernelKey to convertToKernelKey - Fix scroll fallback to use lastMousePosition instead of [0, 0] - Fix scroll amount using ?? 
operator to handle zero correctly - Remove unused KeyboardUtils import - Fix error message: "OnKernel" → "Kernel" - Reset all state fields (liveViewUrl, replayViewUrl) on session stop - Handle replay recording failures gracefully with try/catch Python template: - Wrap cleanup in try/finally to ensure browser deletion on errors - Handle replay recording failures gracefully with try/except - Preserve unexpected Anthropic content block types in loop --- .../python/anthropic-computer-use/loop.py | 4 + .../python/anthropic-computer-use/session.py | 29 ++-- .../anthropic-computer-use/session.ts | 9 +- .../anthropic-computer-use/tools/computer.ts | 137 ++++++++++++------ 4 files changed, 124 insertions(+), 55 deletions(-) diff --git a/pkg/templates/python/anthropic-computer-use/loop.py b/pkg/templates/python/anthropic-computer-use/loop.py index d5d9aa1..4062088 100644 --- a/pkg/templates/python/anthropic-computer-use/loop.py +++ b/pkg/templates/python/anthropic-computer-use/loop.py @@ -266,6 +266,10 @@ def _response_to_params( "input": block.input, } res.append(tool_use_block) + else: + # Preserve unexpected block types to avoid silently dropping content + if hasattr(block, "model_dump"): + res.append(cast(BetaContentBlockParam, block.model_dump())) return res diff --git a/pkg/templates/python/anthropic-computer-use/session.py b/pkg/templates/python/anthropic-computer-use/session.py index 179dda2..3227b28 100644 --- a/pkg/templates/python/anthropic-computer-use/session.py +++ b/pkg/templates/python/anthropic-computer-use/session.py @@ -66,7 +66,11 @@ async def __aenter__(self) -> "KernelBrowserSession": # Start replay recording if enabled if self.record_replay: - await self._start_replay() + try: + await self._start_replay() + except Exception as e: + print(f"Warning: Failed to start replay recording: {e}") + print("Continuing without replay recording.") return self @@ -122,17 +126,18 @@ async def _stop_and_get_replay_url(self) -> None: async def __aexit__(self, exc_type, 
exc_val, exc_tb) -> None: """Stop recording and delete the browser session.""" if self._kernel and self.session_id: - # Stop replay if recording was enabled - if self.record_replay and self.replay_id: - # Wait grace period before stopping to capture final state - if self.replay_grace_period > 0: - print(f"Waiting {self.replay_grace_period}s grace period...") - await asyncio.sleep(self.replay_grace_period) - await self._stop_and_get_replay_url() - - print(f"Destroying browser session: {self.session_id}") - self._kernel.browsers.delete_by_id(self.session_id) - print("Browser session destroyed.") + try: + # Stop replay if recording was enabled + if self.record_replay and self.replay_id: + # Wait grace period before stopping to capture final state + if self.replay_grace_period > 0: + print(f"Waiting {self.replay_grace_period}s grace period...") + await asyncio.sleep(self.replay_grace_period) + await self._stop_and_get_replay_url() + finally: + print(f"Destroying browser session: {self.session_id}") + self._kernel.browsers.delete_by_id(self.session_id) + print("Browser session destroyed.") self._kernel = None diff --git a/pkg/templates/typescript/anthropic-computer-use/session.ts b/pkg/templates/typescript/anthropic-computer-use/session.ts index c237259..2367484 100644 --- a/pkg/templates/typescript/anthropic-computer-use/session.ts +++ b/pkg/templates/typescript/anthropic-computer-use/session.ts @@ -108,7 +108,12 @@ export class KernelBrowserSession { // Start replay recording if enabled if (this.options.recordReplay) { - await this.startReplay(); + try { + await this.startReplay(); + } catch (error) { + console.warn(`Warning: Failed to start replay recording: ${error}`); + console.warn('Continuing without replay recording.'); + } } return this.info; @@ -201,7 +206,9 @@ export class KernelBrowserSession { // Reset state this._sessionId = null; + this._liveViewUrl = null; this._replayId = null; + this._replayViewUrl = null; return info; } diff --git 
a/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts b/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts index b0de903..5abd983 100644 --- a/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts +++ b/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts @@ -2,7 +2,6 @@ import { Buffer } from 'buffer'; import type { Kernel } from '@onkernel/sdk'; import type { ActionParams, BaseAnthropicTool, ToolResult } from './types/computer'; import { Action, ToolError } from './types/computer'; -import { KeyboardUtils } from './utils/keyboard'; import { ActionValidator } from './utils/validator'; const TYPING_DELAY_MS = 12; @@ -129,13 +128,13 @@ export class ComputerTool implements BaseAnthropicTool { private async handleKeyboardAction(action: Action, text: string, duration?: number): Promise { if (action === Action.HOLD_KEY) { - const key = this.convertToOnKernelKey(text); + const key = this.convertToKernelKey(text); await this.kernel.browsers.computer.pressKey(this.sessionId, { keys: [key], duration: duration ? 
duration * 1000 : undefined, }); } else if (action === Action.KEY) { - const key = this.convertKeyCombinationToOnKernel(text); + const key = this.convertKeyCombinationToKernel(text); await this.kernel.browsers.computer.pressKey(this.sessionId, { keys: [key], }); @@ -150,44 +149,98 @@ export class ComputerTool implements BaseAnthropicTool { return await this.screenshot(); } - private convertToOnKernelKey(key: string): string { - // Convert Playwright key names to OnKernel format - const keyMap: Record = { - 'Control': 'Ctrl', - 'Meta': 'Meta', - 'Alt': 'Alt', - 'Shift': 'Shift', - 'Enter': 'Enter', - 'ArrowLeft': 'ArrowLeft', - 'ArrowRight': 'ArrowRight', - 'ArrowUp': 'ArrowUp', - 'ArrowDown': 'ArrowDown', - 'Home': 'Home', - 'End': 'End', - 'PageUp': 'PageUp', - 'PageDown': 'PageDown', - 'Delete': 'Delete', - 'Backspace': 'Backspace', - 'Tab': 'Tab', - 'Escape': 'Escape', - 'Insert': 'Insert', - }; - return keyMap[key] || key; + // Key mappings for Kernel Computer Controls API (xdotool format) + private static readonly KEY_MAP: Record = { + // Enter/Return + 'return': 'Return', + 'enter': 'Return', + 'Enter': 'Return', + // Arrow keys + 'left': 'Left', + 'right': 'Right', + 'up': 'Up', + 'down': 'Down', + 'ArrowLeft': 'Left', + 'ArrowRight': 'Right', + 'ArrowUp': 'Up', + 'ArrowDown': 'Down', + // Navigation + 'home': 'Home', + 'end': 'End', + 'pageup': 'Page_Up', + 'page_up': 'Page_Up', + 'PageUp': 'Page_Up', + 'pagedown': 'Page_Down', + 'page_down': 'Page_Down', + 'PageDown': 'Page_Down', + // Editing + 'delete': 'Delete', + 'backspace': 'BackSpace', + 'Backspace': 'BackSpace', + 'tab': 'Tab', + 'insert': 'Insert', + // Escape + 'esc': 'Escape', + 'escape': 'Escape', + // Function keys + 'f1': 'F1', + 'f2': 'F2', + 'f3': 'F3', + 'f4': 'F4', + 'f5': 'F5', + 'f6': 'F6', + 'f7': 'F7', + 'f8': 'F8', + 'f9': 'F9', + 'f10': 'F10', + 'f11': 'F11', + 'f12': 'F12', + // Misc + 'space': 'space', + 'minus': 'minus', + 'equal': 'equal', + 'plus': 'plus', + }; + + // Modifier 
key mappings (xdotool format) + private static readonly MODIFIER_MAP: Record = { + 'ctrl': 'ctrl', + 'control': 'ctrl', + 'Control': 'ctrl', + 'alt': 'alt', + 'Alt': 'alt', + 'shift': 'shift', + 'Shift': 'shift', + 'meta': 'super', + 'Meta': 'super', + 'cmd': 'super', + 'command': 'super', + 'win': 'super', + 'super': 'super', + }; + + private convertToKernelKey(key: string): string { + // Check modifier keys first + if (ComputerTool.MODIFIER_MAP[key]) { + return ComputerTool.MODIFIER_MAP[key]; + } + // Check special keys + if (ComputerTool.KEY_MAP[key]) { + return ComputerTool.KEY_MAP[key]; + } + // Return as-is if no mapping exists + return key; } - private convertKeyCombinationToOnKernel(combo: string): string { - // Convert key combinations like "Control+t" to "Ctrl+t" - const parts = combo.split('+').map(part => { - const trimmed = part.trim(); - if (trimmed.toLowerCase() === 'control' || trimmed.toLowerCase() === 'ctrl') { - return 'Ctrl'; - } - if (trimmed.toLowerCase() === 'meta' || trimmed.toLowerCase() === 'command' || trimmed.toLowerCase() === 'cmd') { - return 'Meta'; - } - return trimmed; - }); - return parts.join('+'); + private convertKeyCombinationToKernel(combo: string): string { + // Handle key combinations (e.g., "ctrl+a", "Control+t") + if (combo.includes('+')) { + const parts = combo.split('+'); + const mappedParts = parts.map(part => this.convertToKernelKey(part.trim())); + return mappedParts.join('+'); + } + // Single key - just convert it + return this.convertToKernelKey(combo); } async screenshot(): Promise { @@ -227,7 +280,7 @@ export class ComputerTool implements BaseAnthropicTool { } if (action === Action.CURSOR_POSITION) { - throw new ToolError('Cursor position is not available with OnKernel computer controls API'); + throw new ToolError('Cursor position is not available with Kernel Computer Controls API'); } if (action === Action.SCROLL) { @@ -247,11 +300,11 @@ export class ComputerTool implements BaseAnthropicTool { const [x, y] = 
coordinate ? ActionValidator.validateAndGetCoordinates(coordinate) - : [0, 0]; + : this.lastMousePosition; let delta_x = 0; let delta_y = 0; - const scrollDelta = scrollAmountValue || 120; + const scrollDelta = scrollAmountValue ?? 120; if (scrollDirection === 'down') { delta_y = scrollDelta; From 7df0d8abcc963e723a03c2da9f5b30d2831a0597 Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Fri, 16 Jan 2026 11:36:00 -0500 Subject: [PATCH 7/8] fix(anthropic-cua): use 120px scroll multiplier to match Anthropic xdotool behavior Anthropic's reference implementation uses xdotool where each scroll_amount unit equals one scroll wheel click (~120 pixels). Previously: - TypeScript used the value directly - Python used a 10x multiplier Both now use 120x to match Anthropic's expected behavior for AI agents. --- pkg/templates/python/anthropic-computer-use/tools/computer.py | 3 ++- .../typescript/anthropic-computer-use/tools/computer.ts | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/templates/python/anthropic-computer-use/tools/computer.py b/pkg/templates/python/anthropic-computer-use/tools/computer.py index d51d34a..654a289 100644 --- a/pkg/templates/python/anthropic-computer-use/tools/computer.py +++ b/pkg/templates/python/anthropic-computer-use/tools/computer.py @@ -370,7 +370,8 @@ async def __call__( else: x, y = self._last_mouse_position - scroll_factor = scroll_amount * 10 + # Each scroll_amount unit = 1 scroll wheel click ≈ 120 pixels (matches Anthropic's xdotool behavior) + scroll_factor = scroll_amount * 120 delta_x = 0 delta_y = 0 diff --git a/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts b/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts index 5abd983..8e415ad 100644 --- a/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts +++ b/pkg/templates/typescript/anthropic-computer-use/tools/computer.ts @@ -304,7 +304,8 @@ export class ComputerTool implements BaseAnthropicTool { let delta_x = 0; let 
delta_y = 0; - const scrollDelta = scrollAmountValue ?? 120; + // Each scroll_amount unit = 1 scroll wheel click ≈ 120 pixels (matches Anthropic's xdotool behavior) + const scrollDelta = (scrollAmountValue ?? 1) * 120; if (scrollDirection === 'down') { delta_y = scrollDelta; From ac3baaa88edd8f28150db60c2669e351bdf79f1e Mon Sep 17 00:00:00 2001 From: Daniel Prevoznik Date: Fri, 16 Jan 2026 11:36:08 -0500 Subject: [PATCH 8/8] fix(ts-anthropic-cua): add try/finally to ensure browser session cleanup Wrap replay stopping logic in try/finally to ensure browser session is always deleted even if stopReplay() fails. This prevents resource leaks on the Kernel platform when replay recording is enabled and stopping fails. Matches the existing Python implementation behavior. --- .../anthropic-computer-use/session.ts | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/pkg/templates/typescript/anthropic-computer-use/session.ts b/pkg/templates/typescript/anthropic-computer-use/session.ts index 2367484..06e30a6 100644 --- a/pkg/templates/typescript/anthropic-computer-use/session.ts +++ b/pkg/templates/typescript/anthropic-computer-use/session.ts @@ -188,20 +188,23 @@ export class KernelBrowserSession { const info = this.info; if (this._sessionId) { - // Stop replay if recording was enabled - if (this.options.recordReplay && this._replayId) { - // Wait grace period before stopping to capture final state - if (this.options.replayGracePeriod > 0) { - console.log(`Waiting ${this.options.replayGracePeriod}s grace period...`); - await this.sleep(this.options.replayGracePeriod * 1000); + try { + // Stop replay if recording was enabled + if (this.options.recordReplay && this._replayId) { + // Wait grace period before stopping to capture final state + if (this.options.replayGracePeriod > 0) { + console.log(`Waiting ${this.options.replayGracePeriod}s grace period...`); + await this.sleep(this.options.replayGracePeriod * 1000); + } + await 
this.stopReplay(); + info.replayViewUrl = this._replayViewUrl || undefined; } - await this.stopReplay(); - info.replayViewUrl = this._replayViewUrl || undefined; + } finally { + // Always clean up the browser session, even if replay stopping fails + console.log(`Destroying browser session: ${this._sessionId}`); + await this.kernel.browsers.deleteByID(this._sessionId); + console.log('Browser session destroyed.'); } - - console.log(`Destroying browser session: ${this._sessionId}`); - await this.kernel.browsers.deleteByID(this._sessionId); - console.log('Browser session destroyed.'); } // Reset state