diff --git a/eval_protocol/integrations/tinker_rollout_processor.py b/eval_protocol/integrations/tinker_rollout_processor.py index 5f2c1197..771f878e 100644 --- a/eval_protocol/integrations/tinker_rollout_processor.py +++ b/eval_protocol/integrations/tinker_rollout_processor.py @@ -2,6 +2,7 @@ import logging import os import time +from datetime import datetime, timezone import traceback from typing import Any, Dict, List, Optional, Union @@ -76,6 +77,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> """Generate rollout tasks using Tinker.""" async def process_row(row: EvaluationRow) -> EvaluationRow: + if row.execution_metadata.rollout_start_time is None: + row.execution_metadata.rollout_start_time = datetime.now(timezone.utc) start_time = time.perf_counter() if not row.messages: diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py index 3480d0f7..86677efc 100644 --- a/eval_protocol/mcp/execution/manager.py +++ b/eval_protocol/mcp/execution/manager.py @@ -11,6 +11,7 @@ import os import threading import time +from datetime import datetime, timezone from dataclasses import asdict from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union, cast @@ -97,6 +98,8 @@ def execute_rollouts( async def _execute_with_semaphore(idx): async with semaphore: evaluation_row: EvaluationRow = evaluation_rows[idx] + if evaluation_row.execution_metadata.rollout_start_time is None: + evaluation_row.execution_metadata.rollout_start_time = datetime.now(timezone.utc) row_start_time = time.perf_counter() trajectory = await self._execute_rollout( diff --git a/eval_protocol/models.py b/eval_protocol/models.py index b119c7b3..1351dd6b 100644 --- a/eval_protocol/models.py +++ b/eval_protocol/models.py @@ -793,6 +793,11 @@ class CostMetrics(BaseModel): class ExecutionMetadata(BaseModel): """Metadata about the execution of the evaluation.""" + rollout_start_time: Optional[datetime] = Field( + default=None, + description="UTC timestamp when the rollout started.", + ) + invocation_id: Optional[str] = Field( default_factory=generate_id, description="The ID of the invocation that this row belongs to.", diff --git a/eval_protocol/pytest/default_agent_rollout_processor.py b/eval_protocol/pytest/default_agent_rollout_processor.py index ec6f983b..b55c5838 100644 --- a/eval_protocol/pytest/default_agent_rollout_processor.py +++ b/eval_protocol/pytest/default_agent_rollout_processor.py @@ -3,6 +3,7 @@ import logging import os import time +from datetime import datetime, timezone from typing import Any, AsyncIterator, List, Optional, Union, Dict from mcp.types import CallToolResult, TextContent @@ -249,6 +250,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> async def process_row(row: EvaluationRow) -> EvaluationRow: """Process a single row with agent rollout.""" + if row.execution_metadata.rollout_start_time is None: + row.execution_metadata.rollout_start_time = datetime.now(timezone.utc) start_time = time.perf_counter() agent = Agent( diff --git a/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py b/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py index 27d44b80..39f8a4d8 100644 --- a/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py +++ b/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py @@ -4,6 +4,7 @@ import os import tempfile import time +from datetime import datetime, timezone from typing import Any, Callable, Dict, List, Optional from pydantic import BaseModel, Field @@ -66,7 +67,8 @@ def __call__( async def process_row(row: EvaluationRow) -> EvaluationRow: """Process a single row with complete sandbox lifecycle""" - + if row.execution_metadata.rollout_start_time is None: + row.execution_metadata.rollout_start_time = datetime.now(timezone.utc) start_time = time.perf_counter() agent: Agent | None = None temp_config_path: str | None = None diff --git a/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py b/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py index f9618799..0902934f 100644 --- a/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py +++ b/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py @@ -4,6 +4,7 @@ from collections.abc import Callable import logging import time +from datetime import datetime, timezone from pydantic_ai.toolsets import FunctionToolset from pydantic_ai.usage import UsageLimits from typing_extensions import override @@ -50,6 +51,8 @@ def __call__(self, rows: list[EvaluationRow], config: RolloutProcessorConfig) -> async def process_row(row: EvaluationRow) -> EvaluationRow: """Process a single row with agent rollout.""" + if row.execution_metadata.rollout_start_time is None: + row.execution_metadata.rollout_start_time = datetime.now(timezone.utc) start_time = time.perf_counter() tools = [] diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py index c3e09ba3..bde79b04 100644 --- a/eval_protocol/pytest/default_single_turn_rollout_process.py +++ b/eval_protocol/pytest/default_single_turn_rollout_process.py @@ -3,6 +3,7 @@ import logging import os import time +from datetime import datetime, timezone from dataclasses import asdict, is_dataclass from types import SimpleNamespace from typing import Any, List @@ -62,6 +63,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> async def process_row(row: EvaluationRow) -> EvaluationRow: """Process a single row asynchronously.""" + if row.execution_metadata.rollout_start_time is None: + row.execution_metadata.rollout_start_time = datetime.now(timezone.utc) start_time = time.perf_counter() if len(row.messages) == 0: diff --git a/eval_protocol/pytest/github_action_rollout_processor.py b/eval_protocol/pytest/github_action_rollout_processor.py index 3e4f9ec0..6c3489b9 100644 --- a/eval_protocol/pytest/github_action_rollout_processor.py +++ b/eval_protocol/pytest/github_action_rollout_processor.py @@ -67,6 +67,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> max_pages = (num_rows + 99) // 100 # Round up pages async def _process_row(row: EvaluationRow) -> EvaluationRow: + if row.execution_metadata.rollout_start_time is None: + row.execution_metadata.rollout_start_time = datetime.now(timezone.utc) start_time = time.perf_counter() if row.execution_metadata.invocation_id is None: diff --git a/eval_protocol/pytest/openenv_rollout_processor.py b/eval_protocol/pytest/openenv_rollout_processor.py index 0f662692..af57cb3c 100644 --- a/eval_protocol/pytest/openenv_rollout_processor.py +++ b/eval_protocol/pytest/openenv_rollout_processor.py @@ -15,6 +15,7 @@ import asyncio import logging import time +from datetime import datetime, timezone from itertools import count from typing import List, Any, Dict, Callable, Generic, TypeVar, Optional, Type @@ -167,6 +168,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> async def process_row(row: EvaluationRow) -> EvaluationRow: """Process a single row with OpenEnv rollout.""" + if row.execution_metadata.rollout_start_time is None: + row.execution_metadata.rollout_start_time = datetime.now(timezone.utc) start_time = time.perf_counter() logger.info("[OpenEnvRolloutProcessor] Starting rollout for row") diff --git a/eval_protocol/pytest/priority_scheduler.py b/eval_protocol/pytest/priority_scheduler.py index 605b973e..d773ee42 100644 --- a/eval_protocol/pytest/priority_scheduler.py +++ b/eval_protocol/pytest/priority_scheduler.py @@ -3,6 +3,7 @@ import logging import os import time +from datetime import datetime, timezone from collections import defaultdict from dataclasses import dataclass, field from typing import Any, List, Dict, Optional, Union @@ -293,6 +294,8 @@ async def _run_eval(rows_to_eval: Union[EvaluationRow, List[EvaluationRow]]): # 3. Execute the rollout result_row: Optional[EvaluationRow] = None + if row_copy.execution_metadata.rollout_start_time is None: + row_copy.execution_metadata.rollout_start_time = datetime.now(timezone.utc) start_time = time.perf_counter() try: async for result in rollout_processor_with_retry( diff --git a/eval_protocol/pytest/remote_rollout_processor.py b/eval_protocol/pytest/remote_rollout_processor.py index aa1c5d44..34371200 100644 --- a/eval_protocol/pytest/remote_rollout_processor.py +++ b/eval_protocol/pytest/remote_rollout_processor.py @@ -1,5 +1,6 @@ import asyncio import time +from datetime import datetime, timezone from typing import Any, Dict, List, Optional import requests @@ -72,6 +73,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) -> raise ValueError("remote_base_url is required in RolloutProcessorConfig.kwargs for RemoteRolloutProcessor") async def _process_row(row: EvaluationRow) -> EvaluationRow: + if row.execution_metadata.rollout_start_time is None: + row.execution_metadata.rollout_start_time = datetime.now(timezone.utc) start_time = time.perf_counter() if row.execution_metadata.invocation_id is None: diff --git a/vite-app/src/components/EvaluationRow.tsx b/vite-app/src/components/EvaluationRow.tsx index 87e89f86..34b21438 100644 --- a/vite-app/src/components/EvaluationRow.tsx +++ b/vite-app/src/components/EvaluationRow.tsx @@ -263,6 +263,21 @@ const RowScore = observer(({ score }: { score: number | undefined }) => { ); }); +const RowRolloutDuration = observer( + ({ durationSeconds }: { durationSeconds?: number | null }) => { + if (durationSeconds === null || durationSeconds === undefined) { + return N/A; + } + + const formatted = `${durationSeconds.toFixed(2)}s`; + return ( + + {formatted} + + ); + } +); + const RowCreated = observer(({ created_at }: { created_at: Date | string }) => { const date = created_at instanceof Date ? created_at : new Date(created_at); @@ -432,6 +447,13 @@ export const EvaluationRow = observer( /> + {/* Rollout Latency */} + + + + {/* Model */} diff --git a/vite-app/src/components/EvaluationTable.tsx b/vite-app/src/components/EvaluationTable.tsx index 56adc283..f30c07a9 100644 --- a/vite-app/src/components/EvaluationTable.tsx +++ b/vite-app/src/components/EvaluationTable.tsx @@ -226,6 +226,14 @@ export const EvaluationTable = observer(() => { > Rollout Status + + Rollout Latency + (typeof val === "string" ? new Date(val) : val), + z.date() + ) + .optional() + .describe("UTC timestamp when the rollout started."), invocation_id: z .string() .optional() @@ -346,6 +353,16 @@ export const ExecutionMetadataSchema = z.object({ .nullable() .optional() .describe("Processing duration in seconds for this evaluation row."), + rollout_duration_seconds: z + .number() + .nullable() + .optional() + .describe("Processing duration in seconds for the rollout of this row."), + eval_duration_seconds: z + .number() + .nullable() + .optional() + .describe("Processing duration in seconds for the evaluation of this row."), experiment_duration_seconds: z .number() .nullable()