Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions eval_protocol/integrations/tinker_rollout_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import os
import time
from datetime import datetime, timezone
import traceback
from typing import Any, Dict, List, Optional, Union

Expand Down Expand Up @@ -76,6 +77,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
"""Generate rollout tasks using Tinker."""

async def process_row(row: EvaluationRow) -> EvaluationRow:
if row.execution_metadata.rollout_start_time is None:
row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
start_time = time.perf_counter()

if not row.messages:
Expand Down
3 changes: 3 additions & 0 deletions eval_protocol/mcp/execution/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import os
import threading
import time
from datetime import datetime, timezone
from dataclasses import asdict
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union, cast

Expand Down Expand Up @@ -97,6 +98,8 @@ def execute_rollouts(
async def _execute_with_semaphore(idx):
async with semaphore:
evaluation_row: EvaluationRow = evaluation_rows[idx]
if evaluation_row.execution_metadata.rollout_start_time is None:
evaluation_row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
row_start_time = time.perf_counter()

trajectory = await self._execute_rollout(
Expand Down
5 changes: 5 additions & 0 deletions eval_protocol/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -793,6 +793,11 @@ class CostMetrics(BaseModel):
class ExecutionMetadata(BaseModel):
"""Metadata about the execution of the evaluation."""

rollout_start_time: Optional[datetime] = Field(
default=None,
description="UTC timestamp when the rollout started.",
)

invocation_id: Optional[str] = Field(
default_factory=generate_id,
description="The ID of the invocation that this row belongs to.",
Expand Down
3 changes: 3 additions & 0 deletions eval_protocol/pytest/default_agent_rollout_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
import os
import time
from datetime import datetime, timezone
from typing import Any, AsyncIterator, List, Optional, Union, Dict

from mcp.types import CallToolResult, TextContent
Expand Down Expand Up @@ -249,6 +250,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->

async def process_row(row: EvaluationRow) -> EvaluationRow:
"""Process a single row with agent rollout."""
if row.execution_metadata.rollout_start_time is None:
row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
start_time = time.perf_counter()

agent = Agent(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
import tempfile
import time
from datetime import datetime, timezone
from typing import Any, Callable, Dict, List, Optional

from pydantic import BaseModel, Field
Expand Down Expand Up @@ -66,7 +67,8 @@ def __call__(

async def process_row(row: EvaluationRow) -> EvaluationRow:
"""Process a single row with complete sandbox lifecycle"""

if row.execution_metadata.rollout_start_time is None:
row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
start_time = time.perf_counter()
agent: Agent | None = None
temp_config_path: str | None = None
Expand Down
3 changes: 3 additions & 0 deletions eval_protocol/pytest/default_pydantic_ai_rollout_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from collections.abc import Callable
import logging
import time
from datetime import datetime, timezone
from pydantic_ai.toolsets import FunctionToolset
from pydantic_ai.usage import UsageLimits
from typing_extensions import override
Expand Down Expand Up @@ -50,6 +51,8 @@ def __call__(self, rows: list[EvaluationRow], config: RolloutProcessorConfig) ->

async def process_row(row: EvaluationRow) -> EvaluationRow:
"""Process a single row with agent rollout."""
if row.execution_metadata.rollout_start_time is None:
row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
start_time = time.perf_counter()

tools = []
Expand Down
3 changes: 3 additions & 0 deletions eval_protocol/pytest/default_single_turn_rollout_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
import os
import time
from datetime import datetime, timezone
from dataclasses import asdict, is_dataclass
from types import SimpleNamespace
from typing import Any, List
Expand Down Expand Up @@ -62,6 +63,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->

async def process_row(row: EvaluationRow) -> EvaluationRow:
"""Process a single row asynchronously."""
if row.execution_metadata.rollout_start_time is None:
row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
Comment on lines +66 to +67

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[P2] Reset `rollout_start_time` on retries

In retry flows (`rollout_processor_with_retry` reuses the same `EvaluationRow`), this `if ... is None` guard means the timestamp is only set on the first attempt. If the first attempt fails and the row is retried, the successful attempt keeps the earlier `rollout_start_time` while `rollout_duration_seconds` reflects only the last attempt, so any latency calculation or trace alignment based on `rollout_start_time` will be too early by the time spent in prior attempts. Consider resetting `rollout_start_time` at the start of each attempt (or in the retry wrapper) to keep these timings consistent.

Useful? React with 👍 / 👎.

start_time = time.perf_counter()

if len(row.messages) == 0:
Expand Down
2 changes: 2 additions & 0 deletions eval_protocol/pytest/github_action_rollout_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
max_pages = (num_rows + 99) // 100 # Round up pages

async def _process_row(row: EvaluationRow) -> EvaluationRow:
if row.execution_metadata.rollout_start_time is None:
row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
start_time = time.perf_counter()

if row.execution_metadata.invocation_id is None:
Expand Down
3 changes: 3 additions & 0 deletions eval_protocol/pytest/openenv_rollout_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import asyncio
import logging
import time
from datetime import datetime, timezone
from itertools import count
from typing import List, Any, Dict, Callable, Generic, TypeVar, Optional, Type

Expand Down Expand Up @@ -167,6 +168,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->

async def process_row(row: EvaluationRow) -> EvaluationRow:
"""Process a single row with OpenEnv rollout."""
if row.execution_metadata.rollout_start_time is None:
row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
start_time = time.perf_counter()

logger.info("[OpenEnvRolloutProcessor] Starting rollout for row")
Expand Down
3 changes: 3 additions & 0 deletions eval_protocol/pytest/priority_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
import os
import time
from datetime import datetime, timezone
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Any, List, Dict, Optional, Union
Expand Down Expand Up @@ -293,6 +294,8 @@ async def _run_eval(rows_to_eval: Union[EvaluationRow, List[EvaluationRow]]):

# 3. Execute the rollout
result_row: Optional[EvaluationRow] = None
if row_copy.execution_metadata.rollout_start_time is None:
row_copy.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
start_time = time.perf_counter()
try:
async for result in rollout_processor_with_retry(
Expand Down
3 changes: 3 additions & 0 deletions eval_protocol/pytest/remote_rollout_processor.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import asyncio
import time
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

import requests
Expand Down Expand Up @@ -72,6 +73,8 @@ def __call__(self, rows: List[EvaluationRow], config: RolloutProcessorConfig) ->
raise ValueError("remote_base_url is required in RolloutProcessorConfig.kwargs for RemoteRolloutProcessor")

async def _process_row(row: EvaluationRow) -> EvaluationRow:
if row.execution_metadata.rollout_start_time is None:
row.execution_metadata.rollout_start_time = datetime.now(timezone.utc)
start_time = time.perf_counter()

if row.execution_metadata.invocation_id is None:
Expand Down
22 changes: 22 additions & 0 deletions vite-app/src/components/EvaluationRow.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,21 @@ const RowScore = observer(({ score }: { score: number | undefined }) => {
);
});

/**
 * Table-cell content showing how long a row's rollout took.
 *
 * Shows a muted "N/A" placeholder when `durationSeconds` is null or
 * undefined; otherwise shows the value with two decimal places and an
 * "s" suffix (e.g. `1.50s`) in a monospace, non-wrapping span.
 */
const RowRolloutDuration = observer(
  ({ durationSeconds }: { durationSeconds?: number | null }) => {
    // Loose comparison against null matches exactly null and undefined.
    if (durationSeconds == null) {
      return <span className="text-gray-500">N/A</span>;
    }

    return (
      <span className="font-mono text-gray-900 whitespace-nowrap">
        {`${durationSeconds.toFixed(2)}s`}
      </span>
    );
  }
);

const RowCreated = observer(({ created_at }: { created_at: Date | string }) => {
const date = created_at instanceof Date ? created_at : new Date(created_at);

Expand Down Expand Up @@ -432,6 +447,13 @@ export const EvaluationRow = observer(
/>
</TableCell>

{/* Rollout Latency */}
<TableCell className="py-3 text-xs">
<RowRolloutDuration
durationSeconds={row.execution_metadata?.rollout_duration_seconds}
/>
</TableCell>

{/* Model */}
<TableCell className="py-3 text-xs">
<RowModel model={row.input_metadata.completion_params.model} />
Expand Down
8 changes: 8 additions & 0 deletions vite-app/src/components/EvaluationTable.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,14 @@ export const EvaluationTable = observer(() => {
>
Rollout Status
</SortableTableHeader>
<SortableTableHeader
sortField="$.execution_metadata.rollout_duration_seconds"
currentSortField={state.sortField}
currentSortDirection={state.sortDirection}
onSort={handleSort}
>
Rollout Latency
</SortableTableHeader>
<SortableTableHeader
sortField="$.input_metadata.completion_params.model"
currentSortField={state.sortField}
Expand Down
17 changes: 17 additions & 0 deletions vite-app/src/types/eval-protocol.ts
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,13 @@ export const CostMetricsSchema = z.object({
});

export const ExecutionMetadataSchema = z.object({
rollout_start_time: z
.preprocess(
(val) => (typeof val === "string" ? new Date(val) : val),
z.date()
)
.optional()
.describe("UTC timestamp when the rollout started."),
invocation_id: z
.string()
.optional()
Expand Down Expand Up @@ -346,6 +353,16 @@ export const ExecutionMetadataSchema = z.object({
.nullable()
.optional()
.describe("Processing duration in seconds for this evaluation row."),
rollout_duration_seconds: z
.number()
.nullable()
.optional()
.describe("Processing duration in seconds for the rollout of this row."),
eval_duration_seconds: z
.number()
.nullable()
.optional()
.describe("Processing duration in seconds for the evaluation of this row."),
experiment_duration_seconds: z
.number()
.nullable()
Expand Down