Implement Evaluator Versions #402
Changes from all commits (45 commits):

```
0a00bcd d465a89 348bb58 acaa901 4b71ddb 3dbcd59 532e071 5e7a5fa 060d72c
bc31c9f ea08062 f246087 6b53ac1 ec0c8ca fc036f5 4566584 f103b69 9c3e417
ea673f4 26fbc2d 4702307 9d1bc74 3314bec 66f191a 165afe1 838c7a5 71599e6
0144c9f c8774a6 2076f0a 8acdc35 432a649 ab04086 3c2db59 17eb18f 1fd66f7
fc4f913 2f88428 c6a8c51 a2165fb 1445d75 7969a6e d4a445b b3adfee 37f4856
```
```diff
@@ -243,3 +243,5 @@ package.json
 tau2-bench
 *.err
 eval-protocol
+
+.vscode/launch.json
```
```diff
@@ -0,0 +1 @@
+!launch.json.backup
```
This file was deleted.
```diff
@@ -0,0 +1,60 @@
+{
+  "version": "0.2.0",
+  "configurations": [
+    {
+      "name": "EP: Upload",
+      "type": "python",
+      "request": "launch",
+      "module": "eval_protocol.cli",
+      "args": ["upload"],
+      "console": "integratedTerminal",
+      "justMyCode": false,
+      "cwd": "<REPLACE_WITH_YOUR_EVALUATOR_DIRECTORY>",
+      "env": {
+        "PYTHONPATH": "${workspaceFolder}",
+        "FIREWORKS_API_KEY": "${env:FIREWORKS_API_KEY}",
+        "FIREWORKS_BASE_URL": "${env:FIREWORKS_BASE_URL}",
+        "FIREWORKS_EXTRA_HEADERS": "{\"x-api-key\": \"${env:FIREWORKS_API_KEY}\", \"X-Fireworks-Gateway-Secret\": \"${env:FIREWORKS_GATEWAY_SECRET}\"}"
+      }
+    },
+    {
+      "name": "EP: Local Test",
+      "type": "python",
+      "request": "launch",
+      "module": "eval_protocol.cli",
+      "args": ["local-test", "--ignore-docker"],
+      "console": "integratedTerminal",
+      "justMyCode": false,
+      "cwd": "<REPLACE_WITH_YOUR_EVALUATOR_DIRECTORY>",
+      "env": {
+        "PYTHONPATH": "${workspaceFolder}",
+        "FIREWORKS_API_KEY": "${env:FIREWORKS_API_KEY}",
+        "FIREWORKS_BASE_URL": "${env:FIREWORKS_BASE_URL}",
+        "FIREWORKS_EXTRA_HEADERS": "{\"x-api-key\": \"${env:FIREWORKS_API_KEY}\", \"X-Fireworks-Gateway-Secret\": \"${env:FIREWORKS_GATEWAY_SECRET}\"}"
+      }
+    },
+    {
+      "name": "EP: Create RFT",
+      "type": "python",
+      "request": "launch",
+      "module": "eval_protocol.cli",
+      "args": [
+        "create",
+        "rft",
+        "--base-model",
+        "accounts/fireworks/models/qwen3-0p6b",
+        "--chunk-size",
+        "10"
+      ],
+      "console": "integratedTerminal",
+      "justMyCode": false,
+      "cwd": "<REPLACE_WITH_YOUR_EVALUATOR_DIRECTORY>",
+      "env": {
+        "PYTHONPATH": "${workspaceFolder}",
+        "FIREWORKS_API_KEY": "${env:FIREWORKS_API_KEY}",
+        "FIREWORKS_BASE_URL": "${env:FIREWORKS_BASE_URL}",
+        "FIREWORKS_EXTRA_HEADERS": "{\"x-api-key\": \"${env:FIREWORKS_API_KEY}\", \"X-Fireworks-Gateway-Secret\": \"${env:FIREWORKS_GATEWAY_SECRET}\"}"
+      }
+    }
+  ]
+}
```
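These configurations interpolate FIREWORKS_API_KEY, FIREWORKS_BASE_URL, and FIREWORKS_GATEWAY_SECRET from the developer's shell, and `cwd` must be edited by hand. A minimal sanity-check sketch (not part of this diff; treating FIREWORKS_GATEWAY_SECRET as optional is an assumption) that verifies those variables and previews the FIREWORKS_EXTRA_HEADERS value the configs build:

```python
# Sanity-check sketch for the environment these launch configs assume.
import json
import os

required = ["FIREWORKS_API_KEY", "FIREWORKS_BASE_URL"]
missing = [name for name in required if not os.environ.get(name)]
if missing:
    raise SystemExit(f"Set before launching: {', '.join(missing)}")

# Mirrors the FIREWORKS_EXTRA_HEADERS JSON the configs interpolate.
# FIREWORKS_GATEWAY_SECRET is treated as optional here (assumption).
extra_headers = {
    "x-api-key": os.environ["FIREWORKS_API_KEY"],
    "X-Fireworks-Gateway-Secret": os.environ.get("FIREWORKS_GATEWAY_SECRET", ""),
}
print(json.dumps(extra_headers))
```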
```diff
@@ -81,13 +81,12 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
         "--env-file",
         help="Path to .env file containing secrets to upload (default: .env in current directory)",
     )
-    upload_parser.add_argument(
-        "--force",
-        action="store_true",
-        help="Overwrite existing evaluator with the same ID",
-    )
 
     # Auto-generate flags from SDK Fireworks().evaluators.create() signature
+    # Note: We use Fireworks() directly here instead of create_fireworks_client()
+    # because we only need the method signature for introspection, not a fully
+    # authenticated client. create_fireworks_client() would trigger an HTTP request
+    # to verify the API key, causing delays even for --help invocations.
     create_evaluator_fn = Fireworks().evaluators.create
 
     upload_skip_fields = {
```
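The "auto-generate flags" comment above refers to deriving argparse flags from the SDK method's signature. A simplified, self-contained stand-in for that pattern — the repo's `add_args_from_callable_signature` handles nested fields, aliases, and help overrides omitted here, and `create_job` below is a hypothetical target, not the SDK method:

```python
# Illustration only: derive one --flag per keyword parameter of a callable.
import argparse
import inspect

def add_flags_from_signature(parser, fn, skip=()):
    """Add one --flag per parameter of fn (simplified stand-in)."""
    for name, param in inspect.signature(fn).parameters.items():
        if name in skip or param.kind in (
            inspect.Parameter.VAR_POSITIONAL,
            inspect.Parameter.VAR_KEYWORD,
        ):
            continue
        flag = "--" + name.replace("_", "-")
        if isinstance(param.default, bool):
            # Boolean defaults become store_true switches.
            parser.add_argument(flag, action="store_true", help=f"{fn.__name__}({name}=...)")
        else:
            default = None if param.default is inspect.Parameter.empty else param.default
            parser.add_argument(flag, default=default, help=f"{fn.__name__}({name}=...)")

# Hypothetical stand-in for the SDK method being introspected:
def create_job(display_name: str = "", chunk_size: int = 10, dry_run: bool = False):
    pass

parser = argparse.ArgumentParser()
add_flags_from_signature(parser, create_job)
# No type conversion here, so chunk_size parses as the string '5':
print(parser.parse_args(["--chunk-size", "5"]))
```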
```diff
@@ -137,7 +136,6 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
 
     rft_parser.add_argument("--yes", "-y", action="store_true", help="Non-interactive mode")
     rft_parser.add_argument("--dry-run", action="store_true", help="Print planned SDK call without sending")
-    rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID")
     rft_parser.add_argument("--skip-validation", action="store_true", help="Skip local dataset/evaluator validation")
     rft_parser.add_argument(
         "--ignore-docker",
```
```diff
@@ -198,6 +196,10 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
         "loss_config.method": "RL loss method for underlying trainers. One of {grpo,dapo}.",
     }
 
+    # Note: We use Fireworks() directly here instead of create_fireworks_client()
+    # because we only need the method signature for introspection, not a fully
+    # authenticated client. create_fireworks_client() would trigger an HTTP request
+    # to verify the API key, causing delays even for --help invocations.
     create_rft_job_fn = Fireworks().reinforcement_fine_tuning_jobs.create
 
     add_args_from_callable_signature(
```
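The repeated note above carries the rationale; a toy illustration with stand-in classes (not the Fireworks SDK) of why signature introspection alone keeps `--help` fast:

```python
# Stand-ins, not the Fireworks SDK: constructing the client and reading a
# bound method's signature involves no HTTP request or API-key check.
import inspect

class StubEvaluators:
    def create(self, evaluator_id: str, display_name: str = "", description: str = ""):
        """Stand-in for the SDK create() whose signature gets introspected."""

class StubFireworks:
    def __init__(self):  # note: no network call on construction
        self.evaluators = StubEvaluators()

params = inspect.signature(StubFireworks().evaluators.create).parameters
print(list(params))  # ['evaluator_id', 'display_name', 'description']
```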
```diff
@@ -208,6 +210,78 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
         help_overrides=help_overrides,
     )
 
+    # Create evj (Evaluation Job) subcommand
+    evj_parser = create_subparsers.add_parser(
+        "evj",
+        help="Create an Evaluation Job on Fireworks",
+    )
+
+    evj_parser.add_argument("--yes", "-y", action="store_true", help="Non-interactive mode")
+    evj_parser.add_argument("--dry-run", action="store_true", help="Print planned SDK call without sending")
+    evj_parser.add_argument("--skip-validation", action="store_true", help="Skip local dataset/evaluator validation")
+    evj_parser.add_argument(
+        "--ignore-docker",
+        action="store_true",
+        help="Ignore Dockerfile even if present; run pytest on host during evaluator validation",
+    )
+    evj_parser.add_argument(
+        "--docker-build-extra",
+        default="",
+        metavar="",
+        help="Extra flags to pass to 'docker build' when validating evaluator (quoted string, e.g. \"--no-cache --pull --progress=plain\")",
+    )
+    evj_parser.add_argument(
+        "--docker-run-extra",
+        default="",
+        metavar="",
+        help="Extra flags to pass to 'docker run' when validating evaluator (quoted string, e.g. \"--env-file .env --memory=8g\")",
+    )
+    evj_parser.add_argument(
+        "--quiet",
+        action="store_true",
+        help="If set, only errors will be printed.",
+    )
+
+    # Auto-generate flags from SDK Fireworks().evaluation_jobs.create() signature
+    create_evj_fn = Fireworks().evaluation_jobs.create
+
+    evj_skip_fields = {
+        "__top_level__": {
+            "account_id",  # auto-detected
+            "extra_headers",
+            "extra_query",
+            "extra_body",
+            "timeout",
+        },
+        "evaluation_job": {
+            "output_stats",  # read-only, set by server
+        },
+    }
+    evj_aliases = {
+        "evaluation_job_id": ["--job-id"],
+        "evaluation_job.evaluator": ["--evaluator"],
+        "evaluation_job.input_dataset": ["--dataset"],  # --input-dataset is auto-added
+        "evaluation_job.display_name": ["--name"],
+        # output_dataset, evaluator_version get their short forms auto-added
+    }
+    evj_help_overrides = {
+        "evaluation_job_id": "Evaluation Job ID to use",
+        "evaluation_job.evaluator": "Evaluator resource name (format: accounts/{account_id}/evaluators/{evaluator_id})",
+        "evaluation_job.input_dataset": "Input dataset resource name (format: accounts/{account_id}/datasets/{dataset_id})",
+        "evaluation_job.output_dataset": "Output dataset resource name where results will be written",
+        "evaluation_job.display_name": "Display name for the evaluation job",
+        "evaluation_job.evaluator_version": "Specific evaluator version to use (defaults to current version)",
+        "leaderboard_ids": "Optional leaderboard IDs to attach this job to upon creation",
+    }
+
+    add_args_from_callable_signature(
+        evj_parser,
+        create_evj_fn,
+        skip_fields=evj_skip_fields,
+        aliases=evj_aliases,
+        help_overrides=evj_help_overrides,
+    )
+
     # Local test command
     local_test_parser = subparsers.add_parser(
         "local-test",
```
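The dotted destinations above (`evaluation_job.evaluator` and friends) suggest the parsed flags are folded back into the nested payload that `evaluation_jobs.create()` expects. A hedged sketch of that folding step — `fold_dotted` is an invented name for illustration, not code from this PR:

```python
# Hypothetical helper (not from this PR): fold argparse results with dotted
# destinations like "evaluation_job.evaluator" into a nested dict payload.
def fold_dotted(flat: dict) -> dict:
    nested: dict = {}
    for key, value in flat.items():
        if value is None:
            continue  # leave unset flags out of the payload
        node = nested
        parts = key.split(".")
        for part in parts[:-1]:
            node = node.setdefault(part, {})
        node[parts[-1]] = value
    return nested

args = {
    "evaluation_job_id": "my-evj",
    "evaluation_job.evaluator": "accounts/acct/evaluators/my-eval",
    "evaluation_job.evaluator_version": None,  # default: current version
}
print(fold_dotted(args))
# {'evaluation_job_id': 'my-evj', 'evaluation_job': {'evaluator': 'accounts/acct/evaluators/my-eval'}}
```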
```diff
@@ -349,7 +423,11 @@ def _extract_flag_value(argv_list, flag_name):
             from .cli_commands.create_rft import create_rft_command
 
             return create_rft_command(args)
-        print("Error: missing subcommand for 'create'. Try: eval-protocol create rft")
+        elif args.create_command == "evj":
+            from .cli_commands.create_evj import create_evj_command
+
+            return create_evj_command(args)
+        print("Error: missing subcommand for 'create'. Try: eval-protocol create rft|evj")
         return 1
     elif args.command == "local-test":
         from .cli_commands.local_test import local_test_command
```
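Since the launch configurations above already run the CLI as the module `eval_protocol.cli`, the new subcommand can be smoke-tested the same way; a sketch, assuming the package is importable in the current interpreter:

```python
# Smoke-test sketch: the evj --help path should succeed without hitting the
# API, per the introspection notes above.
import subprocess
import sys

result = subprocess.run(
    [sys.executable, "-m", "eval_protocol.cli", "create", "evj", "--help"],
    capture_output=True,
    text=True,
)
print(result.stdout)
assert result.returncode == 0, result.stderr
```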