Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .cursor/commands/qa.md
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ Once all deployments are complete, present the human with these invoke commands
kernel invoke ts-basic get-page-title --payload '{"url": "https://www.google.com"}'
kernel invoke ts-captcha-solver test-captcha-solver
kernel invoke ts-stagehand teamsize-task --payload '{"company": "Kernel"}'
kernel invoke ts-anthropic-cua cua-task --payload '{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}'
kernel invoke ts-anthropic-cua cua-task --payload '{"query": "Go to http://magnitasks.com, click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. You are done successfully when the items are moved.", "record_replay": true}'
kernel invoke ts-magnitude mag-url-extract --payload '{"url": "https://en.wikipedia.org/wiki/Special:Random"}'
kernel invoke ts-openai-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}'
kernel invoke ts-gemini-cua gemini-cua-task --payload '{"startingUrl": "https://www.magnitasks.com/", "instruction": "Click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. You are done successfully when the items are moved."}'
Expand All @@ -240,7 +240,7 @@ kernel invoke ts-claude-agent-sdk agent-task --payload '{"task": "Go to https://
kernel invoke python-basic get-page-title --payload '{"url": "https://www.google.com"}'
kernel invoke python-captcha-solver test-captcha-solver
kernel invoke python-bu bu-task --payload '{"task": "Compare the price of gpt-4o and DeepSeek-V3"}'
kernel invoke python-anthropic-cua cua-task --payload '{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}'
kernel invoke python-anthropic-cua cua-task --payload '{"query": "Go to http://magnitasks.com, click the Tasks option in the left-side bar, and move the 5 items in the To Do and In Progress items to the Done section of the Kanban board. You are done successfully when the items are moved.", "record_replay": true}'
kernel invoke python-openai-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}'
kernel invoke python-openagi-cua openagi-default-task -p '{"instruction": "Navigate to https://agiopen.org and click the What is Computer Use? button"}'
kernel invoke py-claude-agent-sdk agent-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 3 stories"}'
Expand Down
4 changes: 2 additions & 2 deletions pkg/create/templates.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ var Commands = map[string]map[string]DeployConfig{
TemplateAnthropicComputerUse: {
EntryPoint: "index.ts",
NeedsEnvFile: true,
InvokeCommand: `kernel invoke ts-anthropic-cua cua-task --payload '{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}'`,
InvokeCommand: `kernel invoke ts-anthropic-cua cua-task --payload '{"query": "Navigate to http://magnitasks.com and click on Tasks in the sidebar"}'`,
},
TemplateMagnitude: {
EntryPoint: "index.ts",
Expand Down Expand Up @@ -220,7 +220,7 @@ var Commands = map[string]map[string]DeployConfig{
TemplateAnthropicComputerUse: {
EntryPoint: "main.py",
NeedsEnvFile: true,
InvokeCommand: `kernel invoke python-anthropic-cua cua-task --payload '{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}'`,
InvokeCommand: `kernel invoke python-anthropic-cua cua-task --payload '{"query": "Navigate to http://magnitasks.com and click on Tasks in the sidebar"}'`,
},
TemplateOpenAIComputerUse: {
EntryPoint: "main.py",
Expand Down
46 changes: 43 additions & 3 deletions pkg/templates/python/anthropic-computer-use/README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,47 @@
# Kernel Python Sample App - Anthropic Computer Use

This is a simple Kernel application that implements a prompt loop using Anthropic Computer Use.
This is a Kernel application that implements a prompt loop using Anthropic Computer Use with Kernel's Computer Controls API.

It generally follows the [Anthropic Reference Implementation](https://github.com/anthropics/anthropic-quickstarts/tree/main/computer-use-demo) but replaces `xodotool` and `gnome-screenshot` with Playwright.
It generally follows the [Anthropic Reference Implementation](https://github.com/anthropics/anthropic-quickstarts/tree/main/computer-use-demo) but uses Kernel's Computer Controls API instead of `xdotool` and `gnome-screenshot`.

See the [docs](https://www.kernel.sh/docs/quickstart) for information.
## Setup

1. Get your API keys:
- **Kernel**: [dashboard.onkernel.com](https://dashboard.onkernel.com)
- **Anthropic**: [console.anthropic.com](https://console.anthropic.com)

2. Deploy the app:
```bash
kernel login
cp .env.example .env # Add your ANTHROPIC_API_KEY
kernel deploy main.py --env-file .env
```

## Usage

```bash
kernel invoke python-anthropic-cua cua-task --payload '{"query": "Navigate to https://example.com and describe the page"}'
```

## Recording Replays

> **Note:** Replay recording is only available to Kernel users on paid plans.

Add `"record_replay": true` to your payload to capture a video of the browser session:

```bash
kernel invoke python-anthropic-cua cua-task --payload '{"query": "Navigate to https://example.com", "record_replay": true}'
```

When enabled, the response will include a `replay_url` field with a link to view the recorded session.

## Known Limitations

### Cursor Position

The `cursor_position` action is not supported with Kernel's Computer Controls API. If the model attempts to use this action, an error will be returned. This is a known limitation that does not significantly impact most computer use workflows, as the model typically tracks cursor position through screenshots.

## Resources

- [Anthropic Computer Use Documentation](https://docs.anthropic.com/en/docs/build-with-claude/computer-use)
- [Kernel Documentation](https://www.kernel.sh/docs/quickstart)
63 changes: 33 additions & 30 deletions pkg/templates/python/anthropic-computer-use/loop.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,16 @@
"""
Agentic sampling loop that calls the Anthropic API and local implementation of anthropic-defined computer use tools.
From https://github.com/anthropics/anthropic-quickstarts/blob/main/computer-use-demo/computer_use_demo/loop.py
Modified to use Kernel Computer Controls API instead of Playwright.
"""

import os
import platform
from collections.abc import Callable
from datetime import datetime
from enum import StrEnum
from typing import Any, cast
from playwright.async_api import Page

import httpx
from anthropic import (
Anthropic,
AnthropicBedrock,
AnthropicVertex,
APIError,
APIResponseValidationError,
APIStatusError,
)

from kernel import Kernel
from anthropic import Anthropic
from anthropic.types.beta import (
BetaCacheControlEphemeralParam,
BetaContentBlockParam,
Expand Down Expand Up @@ -78,14 +69,15 @@ async def sampling_loop(
model: str,
messages: list[BetaMessageParam],
api_key: str,
kernel: Kernel,
session_id: str,
provider: APIProvider = APIProvider.ANTHROPIC,
system_prompt_suffix: str = "",
only_n_most_recent_images: int | None = None,
max_tokens: int = 4096,
tool_version: ToolVersion = "computer_use_20250124",
thinking_budget: int | None = None,
token_efficient_tools_beta: bool = False,
playwright_page: Page,
):
"""
Agentic sampling loop for the assistant/tool interaction of computer use.
Expand All @@ -94,19 +86,20 @@ async def sampling_loop(
model: The model to use for the API call
messages: The conversation history
api_key: The API key for authentication
kernel: The Kernel client instance
session_id: The Kernel browser session ID
provider: The API provider (defaults to ANTHROPIC)
system_prompt_suffix: Additional system prompt text (defaults to empty string)
only_n_most_recent_images: Optional limit on number of recent images to keep
max_tokens: Maximum tokens for the response (defaults to 4096)
tool_version: Version of tools to use (defaults to "computer_use_20250124")
thinking_budget: Optional token budget for thinking
token_efficient_tools_beta: Whether to use token efficient tools beta
playwright_page: The Playwright page instance for browser automation
"""
tool_group = TOOL_GROUPS_BY_VERSION[tool_version]
tool_collection = ToolCollection(
*(
ToolCls(page=playwright_page if ToolCls.__name__.startswith("ComputerTool") else None)
ToolCls(kernel=kernel, session_id=session_id) if ToolCls.__name__.startswith("ComputerTool") else ToolCls()
for ToolCls in tool_group.tools
)
)
Expand Down Expand Up @@ -252,21 +245,31 @@ def _response_to_params(
) -> list[BetaContentBlockParam]:
res: list[BetaContentBlockParam] = []
for block in response.content:
if isinstance(block, BetaTextBlock):
if block.text:
block_type = getattr(block, "type", None)

if block_type == "thinking":
thinking_block = {
"type": "thinking",
"thinking": getattr(block, "thinking", None),
}
if hasattr(block, "signature"):
thinking_block["signature"] = getattr(block, "signature", None)
res.append(cast(BetaContentBlockParam, thinking_block))
elif block_type == "text" or isinstance(block, BetaTextBlock):
if getattr(block, "text", None):
res.append(BetaTextBlockParam(type="text", text=block.text))
elif getattr(block, "type", None) == "thinking":
# Handle thinking blocks - include signature field
thinking_block = {
"type": "thinking",
"thinking": getattr(block, "thinking", None),
}
if hasattr(block, "signature"):
thinking_block["signature"] = getattr(block, "signature", None)
res.append(cast(BetaContentBlockParam, thinking_block))
elif block_type == "tool_use":
tool_use_block: BetaToolUseBlockParam = {
"type": "tool_use",
"id": block.id,
"name": block.name,
"input": block.input,
}
res.append(tool_use_block)
else:
# Handle tool use blocks normally
res.append(cast(BetaToolUseBlockParam, block.model_dump()))
# Preserve unexpected block types to avoid silently dropping content
if hasattr(block, "model_dump"):
res.append(cast(BetaContentBlockParam, block.model_dump()))
return res


Expand Down Expand Up @@ -334,4 +337,4 @@ def _make_api_tool_result(
def _maybe_prepend_system_tool_result(result: ToolResult, result_text: str):
if result.system:
result_text = f"<system>{result.system}</system>\n{result_text}"
return result_text
return result_text
127 changes: 61 additions & 66 deletions pkg/templates/python/anthropic-computer-use/main.py
Original file line number Diff line number Diff line change
@@ -1,97 +1,92 @@
import os
from typing import Dict, TypedDict
from typing import Dict, Optional, TypedDict

import kernel
from kernel import Kernel
from loop import sampling_loop
from playwright.async_api import async_playwright
from session import KernelBrowserSession


class QueryInput(TypedDict):
query: str
record_replay: Optional[bool]


class QueryOutput(TypedDict):
result: str
replay_url: Optional[str]


api_key = os.getenv("ANTHROPIC_API_KEY")
if not api_key:
raise ValueError("ANTHROPIC_API_KEY is not set")

client = Kernel()
app = kernel.App("python-anthropic-cua")


@app.action("cua-task")
async def cua_task(
ctx: kernel.KernelContext,
payload: QueryInput,
) -> QueryOutput:
# A function that processes a user query using a browser-based sampling loop

# Args:
# ctx: Kernel context containing invocation information
# payload: An object containing a query string to process

# Returns:
# A dictionary containing the result of the sampling loop as a string
"""
Process a user query using Anthropic Computer Use with Kernel's browser automation.

Args:
ctx: Kernel context containing invocation information
payload: An object containing:
- query: The task/query string to process
- record_replay: Optional boolean to enable video replay recording

Returns:
A dictionary containing:
- result: The result of the sampling loop as a string
- replay_url: URL to view the replay (if recording was enabled)
"""
if not payload or not payload.get("query"):
raise ValueError("Query is required")

kernel_browser = client.browsers.create(
invocation_id=ctx.invocation_id, stealth=True
)
print("Kernel browser live view url: ", kernel_browser.browser_live_view_url)

try:
async with async_playwright() as playwright:
browser = await playwright.chromium.connect_over_cdp(
kernel_browser.cdp_ws_url
)
context = (
browser.contexts[0] if browser.contexts else await browser.new_context()
record_replay = payload.get("record_replay", False)

async with KernelBrowserSession(
stealth=True,
record_replay=record_replay,
) as session:
print("Kernel browser live view url:", session.live_view_url)

final_messages = await sampling_loop(
model="claude-sonnet-4-5-20250929",
messages=[
{
"role": "user",
"content": payload["query"],
}
],
api_key=str(api_key),
thinking_budget=1024,
kernel=session.kernel,
session_id=session.session_id,
)

if not final_messages:
raise ValueError("No messages were generated during the sampling loop")

last_message = final_messages[-1]
if not last_message:
raise ValueError(
"Failed to get the last message from the sampling loop"
)
page = context.pages[0] if context.pages else await context.new_page()

# Run the sampling loop
final_messages = await sampling_loop(
model="claude-sonnet-4-20250514",
messages=[
{
"role": "user",
"content": payload["query"],
}
],
api_key=str(api_key),
thinking_budget=1024,
playwright_page=page,

result = ""
if isinstance(last_message.get("content"), str):
result = last_message["content"] # type: ignore[assignment]
else:
result = "".join(
block["text"]
for block in last_message["content"] # type: ignore[index]
if isinstance(block, Dict) and block.get("type") == "text"
)

# Extract the final result
if not final_messages:
raise ValueError("No messages were generated during the sampling loop")

last_message = final_messages[-1]
if not last_message:
raise ValueError(
"Failed to get the last message from the sampling loop"
)

result = ""
if isinstance(last_message.get("content"), str):
result = last_message["content"] # type: ignore[assignment]
else:
result = "".join(
block["text"]
for block in last_message["content"] # type: ignore[index]
if isinstance(block, Dict) and block.get("type") == "text"
)

return {"result": result}
except Exception as exc:
print(f"Error in sampling loop: {exc}")
raise
finally:
if browser is not None:
await browser.close()
client.browsers.delete_by_id(kernel_browser.session_id)
return {
"result": result,
"replay_url": session.replay_view_url,
}
4 changes: 1 addition & 3 deletions pkg/templates/python/anthropic-computer-use/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,9 @@ description = "Kernel reference app for Anthropic Computer Use"
requires-python = ">=3.9"
dependencies = [
"anthropic>=0.75.0",
"playwright>=1.56.0",
"python-dateutil>=2.9.0",
"pydantic>=2.12.5",
"typing-extensions>=4.15.0",
"kernel>=0.23.0",
"kernel>=0.24.0",
"python-dotenv>=1.2.1",
"httpx>=0.28.1",
]
Loading