diff --git a/eval_protocol/integrations/tinker_rollout_processor.py b/eval_protocol/integrations/tinker_rollout_processor.py
index 5f2c1197..771f878e 100644
--- a/eval_protocol/integrations/tinker_rollout_processor.py
+++ b/eval_protocol/integrations/tinker_rollout_processor.py
@@ -2,6 +2,7 @@
import logging
import os
import time
+from datetime import datetime, timezone
import traceback
from typing import Any, Dict, List, Optional, Union
@@ -76,6 +77,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
"""Generate rollout tasks using Tinker."""
async def process_row(row: EvaluationRow) -> EvaluationRow:
+ if row.execution_metadata.rollout_start_time is None:
+ row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
start_time = time.perf_counter()
if not row.messages:
diff --git a/eval_protocol/mcp/execution/manager.py b/eval_protocol/mcp/execution/manager.py
index 3480d0f7..86677efc 100644
--- a/eval_protocol/mcp/execution/manager.py
+++ b/eval_protocol/mcp/execution/manager.py
@@ -11,6 +11,7 @@
import os
import threading
import time
+from datetime import datetime, timezone
from dataclasses import asdict
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union, cast
@@ -97,6 +98,8 @@ def execute_rollouts(
async def _execute_with_semaphore(idx):
async with semaphore:
evaluation_row: EvaluationRow = evaluation_rows[idx]
+ if evaluation_row.execution_metadata.rollout_start_time is None:
+ evaluation_row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
row_start_time = time.perf_counter()
trajectory = await self._execute_rollout(
diff --git a/eval_protocol/models.py b/eval_protocol/models.py
index b119c7b3..1351dd6b 100644
--- a/eval_protocol/models.py
+++ b/eval_protocol/models.py
@@ -793,6 +793,11 @@ class CostMetrics(BaseModel):
class ExecutionMetadata(BaseModel):
"""Metadata about the execution of the evaluation."""
+ rollout_start_time: Optional[datetime] = Field(
+ default=None,
+ description="UTC timestamp when the rollout started.",
+ )
+
invocation_id: Optional[str] = Field(
default_factory=generate_id,
description="The ID of the invocation that this row belongs to.",
diff --git a/eval_protocol/pytest/default_agent_rollout_processor.py b/eval_protocol/pytest/default_agent_rollout_processor.py
index ec6f983b..b55c5838 100644
--- a/eval_protocol/pytest/default_agent_rollout_processor.py
+++ b/eval_protocol/pytest/default_agent_rollout_processor.py
@@ -3,6 +3,7 @@
import logging
import os
import time
+from datetime import datetime, timezone
from typing import Any, AsyncIterator, List, Optional, Union, Dict
from mcp.types import CallToolResult, TextContent
@@ -249,6 +250,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
async def process_row(row: EvaluationRow) -> EvaluationRow:
"""Process a single row with agent rollout."""
+ if row.execution_metadata.rollout_start_time is None:
+ row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
start_time = time.perf_counter()
agent = Agent(
diff --git a/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py b/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py
index 27d44b80..39f8a4d8 100644
--- a/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py
+++ b/eval_protocol/pytest/default_klavis_sandbox_rollout_processor.py
@@ -4,6 +4,7 @@
import os
import tempfile
import time
+from datetime import datetime, timezone
from typing import Any, Callable, Dict, List, Optional
from pydantic import BaseModel, Field
@@ -66,7 +67,8 @@ def __call__(
async def process_row(row: EvaluationRow) -> EvaluationRow:
"""Process a single row with complete sandbox lifecycle"""
-
+ if row.execution_metadata.rollout_start_time is None:
+ row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
start_time = time.perf_counter()
agent: Agent | None = None
temp_config_path: str | None = None
diff --git a/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py b/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py
index f9618799..0902934f 100644
--- a/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py
+++ b/eval_protocol/pytest/default_pydantic_ai_rollout_processor.py
@@ -4,6 +4,7 @@
from collections.abc import Callable
import logging
import time
+from datetime import datetime, timezone
from pydantic_ai.toolsets import FunctionToolset
from pydantic_ai.usage import UsageLimits
from typing_extensions import override
@@ -50,6 +51,8 @@ def __call__(self, rows: list[EvaluationRow], config: RolloutProcessorConfig) ->
async def process_row(row: EvaluationRow) -> EvaluationRow:
"""Process a single row with agent rollout."""
+ if row.execution_metadata.rollout_start_time is None:
+ row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
start_time = time.perf_counter()
tools = []
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
index c3e09ba3..bde79b04 100644
--- a/eval_protocol/pytest/default_single_turn_rollout_process.py
+++ b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -3,6 +3,7 @@
import logging
import os
import time
+from datetime import datetime, timezone
from dataclasses import asdict, is_dataclass
from types import SimpleNamespace
from typing import Any, List
@@ -62,6 +63,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
async def process_row(row: EvaluationRow) -> EvaluationRow:
"""Process a single row asynchronously."""
+ if row.execution_metadata.rollout_start_time is None:
+ row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
start_time = time.perf_counter()
if len(row.messages) == 0:
diff --git a/eval_protocol/pytest/github_action_rollout_processor.py b/eval_protocol/pytest/github_action_rollout_processor.py
index 3e4f9ec0..6c3489b9 100644
--- a/eval_protocol/pytest/github_action_rollout_processor.py
+++ b/eval_protocol/pytest/github_action_rollout_processor.py
@@ -67,6 +67,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
max_pages = (num_rows + 99) // 100 # Round up pages
async def _process_row(row: EvaluationRow) -> EvaluationRow:
+ if row.execution_metadata.rollout_start_time is None:
+ row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
start_time = time.perf_counter()
if row.execution_metadata.invocation_id is None:
diff --git a/eval_protocol/pytest/openenv_rollout_processor.py b/eval_protocol/pytest/openenv_rollout_processor.py
index 0f662692..af57cb3c 100644
--- a/eval_protocol/pytest/openenv_rollout_processor.py
+++ b/eval_protocol/pytest/openenv_rollout_processor.py
@@ -15,6 +15,7 @@
import asyncio
import logging
import time
+from datetime import datetime, timezone
from itertools import count
from typing import List, Any, Dict, Callable, Generic, TypeVar, Optional, Type
@@ -167,6 +168,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
async def process_row(row: EvaluationRow) -> EvaluationRow:
"""Process a single row with OpenEnv rollout."""
+ if row.execution_metadata.rollout_start_time is None:
+ row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
start_time = time.perf_counter()
logger.info("[OpenEnvRolloutProcessor] Starting rollout for row")
diff --git a/eval_protocol/pytest/priority_scheduler.py b/eval_protocol/pytest/priority_scheduler.py
index 605b973e..d773ee42 100644
--- a/eval_protocol/pytest/priority_scheduler.py
+++ b/eval_protocol/pytest/priority_scheduler.py
@@ -3,6 +3,7 @@
import logging
import os
import time
+from datetime import datetime, timezone
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Any, List, Dict, Optional, Union
@@ -293,6 +294,8 @@ async def _run_eval(rows_to_eval: Union[EvaluationRow, List[EvaluationRow]]):
# 3. Execute the rollout
result_row: Optional[EvaluationRow] = None
+ if row_copy.execution_metadata.rollout_start_time is None:
+ row_copy.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
start_time = time.perf_counter()
try:
async for result in rollout_processor_with_retry(
diff --git a/eval_protocol/pytest/remote_rollout_processor.py b/eval_protocol/pytest/remote_rollout_processor.py
index aa1c5d44..34371200 100644
--- a/eval_protocol/pytest/remote_rollout_processor.py
+++ b/eval_protocol/pytest/remote_rollout_processor.py
@@ -1,5 +1,6 @@
import asyncio
import time
+from datetime import datetime, timezone
from typing import Any, Dict, List, Optional
import requests
@@ -72,6 +73,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
raise ValueError("remote_base_url is required in RolloutProcessorConfig.kwargs for RemoteRolloutProcessor")
async def _process_row(row: EvaluationRow) -> EvaluationRow:
+ if row.execution_metadata.rollout_start_time is None:
+ row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
start_time = time.perf_counter()
if row.execution_metadata.invocation_id is None:
diff --git a/vite-app/src/components/EvaluationRow.tsx b/vite-app/src/components/EvaluationRow.tsx
index 87e89f86..34b21438 100644
--- a/vite-app/src/components/EvaluationRow.tsx
+++ b/vite-app/src/components/EvaluationRow.tsx
@@ -263,6 +263,21 @@ const RowScore = observer(({ score }: { score: number | undefined }) => {
);
});
+const RowRolloutDuration = observer(
+ ({ durationSeconds }: { durationSeconds?: number | null }) => {
+ if (durationSeconds === null || durationSeconds === undefined) {
+ return N/A;
+ }
+
+ const formatted = `${durationSeconds.toFixed(2)}s`;
+ return (
+
+ {formatted}
+
+ );
+ }
+);
+
const RowCreated = observer(({ created_at }: { created_at: Date | string }) => {
const date = created_at instanceof Date ? created_at : new Date(created_at);
@@ -432,6 +447,13 @@ export const EvaluationRow = observer(
/>
+ {/* Rollout Latency */}
+
+
+
+
{/* Model */}
diff --git a/vite-app/src/components/EvaluationTable.tsx b/vite-app/src/components/EvaluationTable.tsx
index 56adc283..f30c07a9 100644
--- a/vite-app/src/components/EvaluationTable.tsx
+++ b/vite-app/src/components/EvaluationTable.tsx
@@ -226,6 +226,14 @@ export const EvaluationTable = observer(() => {
>
Rollout Status
+
+ Rollout Latency
+
(typeof val === "string" ? new Date(val) : val),
+ z.date()
+ )
+ .optional()
+ .describe("UTC timestamp when the rollout started."),
invocation_id: z
.string()
.optional()
@@ -346,6 +353,16 @@ export const ExecutionMetadataSchema = z.object({
.nullable()
.optional()
.describe("Processing duration in seconds for this evaluation row."),
+ rollout_duration_seconds: z
+ .number()
+ .nullable()
+ .optional()
+ .describe("Processing duration in seconds for the rollout of this row."),
+ eval_duration_seconds: z
+ .number()
+ .nullable()
+ .optional()
+ .describe("Processing duration in seconds for the evaluation of this row."),
experiment_duration_seconds: z
.number()
.nullable()