{
  "schema_version": "1.0",
  "service": "codex-reset-radar",
  "type": "model_iq_check",
  "updated_at": "2026-06-04T12:33:41.856085+08:00",
  "experiment": {
    "status": "experimental_development",
    "label": "实验性模型体检 · 开发中",
    "description": "使用固定 DeepSWE 12 题探针的通过率折算趋势；样本、口径和自动化仍在开发中。"
  },
  "subset": {
    "id": "deepswe-12-v1",
    "name": "DeepSWE 12-task probe v1",
    "size": 12,
    "selection_policy": "Fixed low-cost mixed pass/fail subset calibrated from the 2026-05-30 113-task Codex GPT-5.5 xhigh run; selected for language coverage, 8/4 pass-fail balance, and cost control.",
    "tasks": [
      {
        "task_name": "ytt-jsonpath-query-api",
        "language": "go",
        "baseline_passed": true,
        "baseline_cost_usd": 2.62,
        "role": "stable_pass",
        "title": "Add JSONPath query APIs to orderedmap and Starlark modules"
      },
      {
        "task_name": "participle-grammar-conflict-analysis",
        "language": "go",
        "baseline_passed": false,
        "baseline_cost_usd": 3.16,
        "role": "boundary_fail",
        "title": "Add build-time grammar conflict analysis to participle"
      },
      {
        "task_name": "abs-module-cache-flags",
        "language": "go",
        "baseline_passed": true,
        "baseline_cost_usd": 4.67,
        "role": "stable_pass",
        "title": "Harden module loading, cache introspection, and script flags"
      },
      {
        "task_name": "httpx-multipart-response-parsing",
        "language": "python",
        "baseline_passed": true,
        "baseline_cost_usd": 3.02,
        "role": "stable_pass",
        "title": "Add multipart response parsing to HTTPX"
      },
      {
        "task_name": "bandit-incremental-cache-control",
        "language": "python",
        "baseline_passed": false,
        "baseline_cost_usd": 4.12,
        "role": "boundary_fail",
        "title": "Add incremental cache controls to Bandit"
      },
      {
        "task_name": "ipython-session-bundle-replay",
        "language": "python",
        "baseline_passed": true,
        "baseline_cost_usd": 4.58,
        "role": "stable_pass",
        "title": "Add session bundle recording and replay to IPython"
      },
      {
        "task_name": "ofetch-per-origin-circuit-breaker",
        "language": "typescript",
        "baseline_passed": true,
        "baseline_cost_usd": 2.78,
        "role": "stable_pass",
        "title": "Add a per-origin circuit breaker to ofetch"
      },
      {
        "task_name": "obsidian-linter-link-format-conversion",
        "language": "typescript",
        "baseline_passed": false,
        "baseline_cost_usd": 2.79,
        "role": "boundary_fail",
        "title": "Add link format conversion between wiki and markdown syntax"
      },
      {
        "task_name": "kea-atomic-signal-selectors",
        "language": "typescript",
        "baseline_passed": true,
        "baseline_cost_usd": 5.59,
        "role": "stable_pass",
        "title": "Add atomic signal selectors to Kea"
      },
      {
        "task_name": "csstree-shorthand-expansion-compression",
        "language": "javascript",
        "baseline_passed": true,
        "baseline_cost_usd": 3.82,
        "role": "stable_pass",
        "title": "Add shorthand expansion and compression to the lexer"
      },
      {
        "task_name": "fd-deterministic-multi-key-sorting",
        "language": "rust",
        "baseline_passed": false,
        "baseline_cost_usd": 4.26,
        "role": "boundary_fail",
        "title": "Add deterministic multi-key sorting to fd"
      },
      {
        "task_name": "oxvg-structural-selector-preservation",
        "language": "rust",
        "baseline_passed": true,
        "baseline_cost_usd": 8.49,
        "role": "hard_pass",
        "title": "Preserve structure needed by stylesheet selectors"
      }
    ]
  },
  "baseline": {
    "full_run": {
      "date": "2026-05-30",
      "source": "combined-113-codex-gpt55-xhigh-20260530",
      "tasks": 113,
      "valid_tasks": 113,
      "passed": 75,
      "failed": 38,
      "pass_rate": 0.663717,
      "iq_score": 100.0,
      "cost_usd": 767.855315,
      "n_input_tokens": 1039855369,
      "n_cache_tokens": 1014737920,
      "n_output_tokens": 4496637,
      "n_agent_steps": 9160,
      "wall_seconds": 46195
    },
    "subset": {
      "date": "2026-05-30",
      "model": "gpt-5.5",
      "reasoning_effort": "xhigh",
      "tasks": 12,
      "passed": 8,
      "failed": 4,
      "pass_rate": 0.666667,
      "iq_score": 100.0,
      "estimated_cost_usd": 49.9
    }
  },
  "history": [
    {
      "date": "2026-05-30",
      "label": "DeepSWE 12-task probe v1 baseline",
      "source": "combined-113-codex-gpt55-xhigh-20260530:selected-12",
      "model": "gpt-5.5",
      "reasoning_effort": "xhigh",
      "subset_id": "deepswe-12-v1",
      "tasks": 12,
      "valid_tasks": 12,
      "passed": 8,
      "failed": 4,
      "invalid": 0,
      "pass_rate": 0.666667,
      "baseline_pass_rate": 0.666667,
      "iq_score": 100.0,
      "status": "green",
      "cost_usd": 49.911904,
      "n_input_tokens": 56031320,
      "n_cache_tokens": 53929728,
      "n_output_tokens": 414636,
      "n_agent_steps": 634,
      "wall_seconds": 3270,
      "wall_time_basis": "scheduled_task_durations",
      "completion_concurrency": 4,
      "serial_task_seconds": 11565,
      "source_span_seconds": 35142,
      "task_results": [
        {
          "task_name": "ytt-jsonpath-query-api",
          "language": "go",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1,
          "exception_type": null,
          "cost_usd": 2.623482,
          "n_agent_steps": 31,
          "duration_seconds": 764
        },
        {
          "task_name": "participle-grammar-conflict-analysis",
          "language": "go",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": false,
          "valid": true,
          "reward": 0,
          "exception_type": null,
          "cost_usd": 3.16346,
          "n_agent_steps": 47,
          "duration_seconds": 1074
        },
        {
          "task_name": "abs-module-cache-flags",
          "language": "go",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1,
          "exception_type": null,
          "cost_usd": 4.673595000000001,
          "n_agent_steps": 57,
          "duration_seconds": 1015
        },
        {
          "task_name": "httpx-multipart-response-parsing",
          "language": "python",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1,
          "exception_type": null,
          "cost_usd": 3.0185630000000003,
          "n_agent_steps": 39,
          "duration_seconds": 586
        },
        {
          "task_name": "bandit-incremental-cache-control",
          "language": "python",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": false,
          "valid": true,
          "reward": 0,
          "exception_type": null,
          "cost_usd": 4.117954,
          "n_agent_steps": 52,
          "duration_seconds": 920
        },
        {
          "task_name": "ipython-session-bundle-replay",
          "language": "python",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1,
          "exception_type": null,
          "cost_usd": 4.584818,
          "n_agent_steps": 45,
          "duration_seconds": 1014
        },
        {
          "task_name": "ofetch-per-origin-circuit-breaker",
          "language": "typescript",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1,
          "exception_type": null,
          "cost_usd": 2.7796270000000005,
          "n_agent_steps": 40,
          "duration_seconds": 837
        },
        {
          "task_name": "obsidian-linter-link-format-conversion",
          "language": "typescript",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": false,
          "valid": true,
          "reward": 0,
          "exception_type": null,
          "cost_usd": 2.7941079999999996,
          "n_agent_steps": 34,
          "duration_seconds": 672
        },
        {
          "task_name": "kea-atomic-signal-selectors",
          "language": "typescript",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1,
          "exception_type": null,
          "cost_usd": 5.586837,
          "n_agent_steps": 77,
          "duration_seconds": 1149
        },
        {
          "task_name": "csstree-shorthand-expansion-compression",
          "language": "javascript",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1,
          "exception_type": null,
          "cost_usd": 3.818159,
          "n_agent_steps": 50,
          "duration_seconds": 1212
        },
        {
          "task_name": "fd-deterministic-multi-key-sorting",
          "language": "rust",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": false,
          "valid": true,
          "reward": 0,
          "exception_type": null,
          "cost_usd": 4.2603670000000005,
          "n_agent_steps": 58,
          "duration_seconds": 905
        },
        {
          "task_name": "oxvg-structural-selector-preservation",
          "language": "rust",
          "role": "hard_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1,
          "exception_type": null,
          "cost_usd": 8.490934,
          "n_agent_steps": 104,
          "duration_seconds": 1419
        }
      ]
    },
    {
      "date": "2026-06-01",
      "label": "Daily DeepSWE 12-task probe 2026-06-01",
      "source": "codex-gpt55-xhigh-goals-full-4x-20260531205053:selected-12",
      "model": "gpt-5.5",
      "reasoning_effort": "xhigh",
      "subset_id": "deepswe-12-v1",
      "tasks": 12,
      "valid_tasks": 12,
      "passed": 6,
      "failed": 6,
      "invalid": 0,
      "pass_rate": 0.5,
      "baseline_pass_rate": 0.666667,
      "iq_score": 75.0,
      "status": "red",
      "cost_usd": 59.821461,
      "n_input_tokens": 72998601,
      "n_cache_tokens": 70534272,
      "n_output_tokens": 407756,
      "n_agent_steps": 783,
      "wall_seconds": 3859,
      "wall_time_basis": "scheduled_task_durations",
      "completion_concurrency": 4,
      "serial_task_seconds": 11323,
      "source_span_seconds": 27330,
      "task_results": [
        {
          "task_name": "ytt-jsonpath-query-api",
          "language": "go",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 3.44947,
          "n_agent_steps": 42,
          "duration_seconds": 767
        },
        {
          "task_name": "participle-grammar-conflict-analysis",
          "language": "go",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 3.6334400000000002,
          "n_agent_steps": 59,
          "duration_seconds": 1061
        },
        {
          "task_name": "abs-module-cache-flags",
          "language": "go",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 4.458967,
          "n_agent_steps": 70,
          "duration_seconds": 786
        },
        {
          "task_name": "httpx-multipart-response-parsing",
          "language": "python",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 3.1876219999999997,
          "n_agent_steps": 53,
          "duration_seconds": 584
        },
        {
          "task_name": "bandit-incremental-cache-control",
          "language": "python",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": false,
          "valid": true,
          "reward": 0.0,
          "exception_type": null,
          "cost_usd": 4.856619,
          "n_agent_steps": 72,
          "duration_seconds": 819
        },
        {
          "task_name": "ipython-session-bundle-replay",
          "language": "python",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": false,
          "valid": true,
          "reward": 0.0,
          "exception_type": null,
          "cost_usd": 4.538222,
          "n_agent_steps": 49,
          "duration_seconds": 809
        },
        {
          "task_name": "ofetch-per-origin-circuit-breaker",
          "language": "typescript",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 3.430294,
          "n_agent_steps": 53,
          "duration_seconds": 689
        },
        {
          "task_name": "obsidian-linter-link-format-conversion",
          "language": "typescript",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": false,
          "valid": true,
          "reward": 0.0,
          "exception_type": null,
          "cost_usd": 2.9640500000000003,
          "n_agent_steps": 33,
          "duration_seconds": 662
        },
        {
          "task_name": "kea-atomic-signal-selectors",
          "language": "typescript",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": false,
          "valid": true,
          "reward": 0.0,
          "exception_type": null,
          "cost_usd": 4.751042,
          "n_agent_steps": 71,
          "duration_seconds": 950
        },
        {
          "task_name": "csstree-shorthand-expansion-compression",
          "language": "javascript",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": false,
          "valid": true,
          "reward": 0.0,
          "exception_type": null,
          "cost_usd": 5.078142,
          "n_agent_steps": 68,
          "duration_seconds": 1373
        },
        {
          "task_name": "fd-deterministic-multi-key-sorting",
          "language": "rust",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": false,
          "valid": true,
          "reward": 0.0,
          "exception_type": null,
          "cost_usd": 3.747298,
          "n_agent_steps": 52,
          "duration_seconds": 688
        },
        {
          "task_name": "oxvg-structural-selector-preservation",
          "language": "rust",
          "role": "hard_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 15.726294999999999,
          "n_agent_steps": 161,
          "duration_seconds": 2135
        }
      ]
    },
    {
      "date": "2026-06-02",
      "label": "Daily DeepSWE 12-task probe 2026-06-02",
      "source": "model-iq-deepswe-12-v1-gpt-55-xhigh-20260602",
      "model": "gpt-5.5",
      "reasoning_effort": "xhigh",
      "subset_id": "deepswe-12-v1",
      "tasks": 12,
      "valid_tasks": 12,
      "passed": 8,
      "failed": 4,
      "invalid": 0,
      "pass_rate": 0.666667,
      "baseline_pass_rate": 0.666667,
      "iq_score": 100.0,
      "status": "green",
      "cost_usd": 43.663017,
      "n_input_tokens": 46125507,
      "n_cache_tokens": 44028544,
      "n_output_tokens": 372131,
      "n_agent_steps": 559,
      "wall_seconds": 2787,
      "wall_time_basis": "actual_subset_span",
      "completion_concurrency": 4,
      "serial_task_seconds": 9705,
      "source_span_seconds": 2787,
      "task_results": [
        {
          "task_name": "ytt-jsonpath-query-api",
          "language": "go",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 2.163265,
          "n_agent_steps": 27,
          "duration_seconds": 892
        },
        {
          "task_name": "participle-grammar-conflict-analysis",
          "language": "go",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": false,
          "valid": true,
          "reward": 0.0,
          "exception_type": null,
          "cost_usd": 2.549391,
          "n_agent_steps": 29,
          "duration_seconds": 679
        },
        {
          "task_name": "abs-module-cache-flags",
          "language": "go",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 3.1324750000000003,
          "n_agent_steps": 49,
          "duration_seconds": 699
        },
        {
          "task_name": "httpx-multipart-response-parsing",
          "language": "python",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 2.861939,
          "n_agent_steps": 43,
          "duration_seconds": 832
        },
        {
          "task_name": "bandit-incremental-cache-control",
          "language": "python",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 3.188758,
          "n_agent_steps": 50,
          "duration_seconds": 666
        },
        {
          "task_name": "ipython-session-bundle-replay",
          "language": "python",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 3.2738479999999996,
          "n_agent_steps": 36,
          "duration_seconds": 595
        },
        {
          "task_name": "ofetch-per-origin-circuit-breaker",
          "language": "typescript",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 4.105462,
          "n_agent_steps": 60,
          "duration_seconds": 847
        },
        {
          "task_name": "obsidian-linter-link-format-conversion",
          "language": "typescript",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": false,
          "valid": true,
          "reward": 0.0,
          "exception_type": null,
          "cost_usd": 3.0429129999999995,
          "n_agent_steps": 28,
          "duration_seconds": 554
        },
        {
          "task_name": "kea-atomic-signal-selectors",
          "language": "typescript",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": false,
          "valid": true,
          "reward": 0.0,
          "exception_type": null,
          "cost_usd": 5.0945160000000005,
          "n_agent_steps": 56,
          "duration_seconds": 1262
        },
        {
          "task_name": "csstree-shorthand-expansion-compression",
          "language": "javascript",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": false,
          "valid": true,
          "reward": 0.0,
          "exception_type": null,
          "cost_usd": 4.727847,
          "n_agent_steps": 56,
          "duration_seconds": 955
        },
        {
          "task_name": "fd-deterministic-multi-key-sorting",
          "language": "rust",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 3.519681,
          "n_agent_steps": 51,
          "duration_seconds": 713
        },
        {
          "task_name": "oxvg-structural-selector-preservation",
          "language": "rust",
          "role": "hard_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 6.002922,
          "n_agent_steps": 74,
          "duration_seconds": 1013
        }
      ]
    },
    {
      "date": "2026-06-03",
      "label": "Daily DeepSWE 12-task probe 2026-06-03",
      "source": "model-iq-deepswe-12-v1-gpt-55-xhigh-20260603",
      "model": "gpt-5.5",
      "reasoning_effort": "xhigh",
      "subset_id": "deepswe-12-v1",
      "tasks": 12,
      "valid_tasks": 12,
      "passed": 8,
      "failed": 4,
      "invalid": 0,
      "pass_rate": 0.666667,
      "baseline_pass_rate": 0.666667,
      "iq_score": 100.0,
      "status": "green",
      "cost_usd": 47.177695,
      "n_input_tokens": 48578393,
      "n_cache_tokens": 45937920,
      "n_output_tokens": 366879,
      "n_agent_steps": 562,
      "wall_seconds": 2495,
      "wall_time_basis": "actual_subset_span",
      "completion_concurrency": 4,
      "serial_task_seconds": 9045,
      "source_span_seconds": 2495,
      "task_results": [
        {
          "task_name": "ytt-jsonpath-query-api",
          "language": "go",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 2.629709,
          "n_agent_steps": 24,
          "duration_seconds": 601
        },
        {
          "task_name": "participle-grammar-conflict-analysis",
          "language": "go",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 2.371299,
          "n_agent_steps": 31,
          "duration_seconds": 593
        },
        {
          "task_name": "abs-module-cache-flags",
          "language": "go",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 2.728831,
          "n_agent_steps": 34,
          "duration_seconds": 585
        },
        {
          "task_name": "httpx-multipart-response-parsing",
          "language": "python",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 4.203313,
          "n_agent_steps": 56,
          "duration_seconds": 649
        },
        {
          "task_name": "bandit-incremental-cache-control",
          "language": "python",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": false,
          "valid": true,
          "reward": 0.0,
          "exception_type": null,
          "cost_usd": 3.749546,
          "n_agent_steps": 55,
          "duration_seconds": 748
        },
        {
          "task_name": "ipython-session-bundle-replay",
          "language": "python",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": false,
          "valid": true,
          "reward": 0.0,
          "exception_type": null,
          "cost_usd": 4.160657,
          "n_agent_steps": 57,
          "duration_seconds": 698
        },
        {
          "task_name": "ofetch-per-origin-circuit-breaker",
          "language": "typescript",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 3.753209,
          "n_agent_steps": 47,
          "duration_seconds": 731
        },
        {
          "task_name": "obsidian-linter-link-format-conversion",
          "language": "typescript",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": false,
          "valid": true,
          "reward": 0.0,
          "exception_type": null,
          "cost_usd": 3.245758,
          "n_agent_steps": 33,
          "duration_seconds": 676
        },
        {
          "task_name": "kea-atomic-signal-selectors",
          "language": "typescript",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 4.937765000000001,
          "n_agent_steps": 50,
          "duration_seconds": 774
        },
        {
          "task_name": "csstree-shorthand-expansion-compression",
          "language": "javascript",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": false,
          "valid": true,
          "reward": 0.0,
          "exception_type": null,
          "cost_usd": 3.874065,
          "n_agent_steps": 43,
          "duration_seconds": 1024
        },
        {
          "task_name": "fd-deterministic-multi-key-sorting",
          "language": "rust",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 3.3686659999999997,
          "n_agent_steps": 44,
          "duration_seconds": 652
        },
        {
          "task_name": "oxvg-structural-selector-preservation",
          "language": "rust",
          "role": "hard_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 8.154877,
          "n_agent_steps": 88,
          "duration_seconds": 1313
        }
      ]
    },
    {
      "date": "2026-06-04",
      "label": "Daily DeepSWE 12-task probe 2026-06-04",
      "source": "model-iq-deepswe-12-v1-gpt-55-xhigh-20260604",
      "model": "gpt-5.5",
      "reasoning_effort": "xhigh",
      "subset_id": "deepswe-12-v1",
      "tasks": 12,
      "valid_tasks": 12,
      "passed": 6,
      "failed": 6,
      "invalid": 0,
      "pass_rate": 0.5,
      "baseline_pass_rate": 0.666667,
      "iq_score": 75.0,
      "status": "red",
      "cost_usd": 43.83028,
      "n_input_tokens": 48075956,
      "n_cache_tokens": 46299520,
      "n_output_tokens": 393278,
      "n_agent_steps": 541,
      "wall_seconds": 2818,
      "wall_time_basis": "actual_subset_span",
      "completion_concurrency": 4,
      "serial_task_seconds": 10365,
      "source_span_seconds": 2818,
      "task_results": [
        {
          "task_name": "ytt-jsonpath-query-api",
          "language": "go",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 2.663008,
          "n_agent_steps": 35,
          "duration_seconds": 687
        },
        {
          "task_name": "participle-grammar-conflict-analysis",
          "language": "go",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": false,
          "valid": true,
          "reward": 0.0,
          "exception_type": null,
          "cost_usd": 2.562004,
          "n_agent_steps": 36,
          "duration_seconds": 759
        },
        {
          "task_name": "abs-module-cache-flags",
          "language": "go",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 2.9852450000000004,
          "n_agent_steps": 44,
          "duration_seconds": 614
        },
        {
          "task_name": "httpx-multipart-response-parsing",
          "language": "python",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 3.0165100000000002,
          "n_agent_steps": 40,
          "duration_seconds": 639
        },
        {
          "task_name": "bandit-incremental-cache-control",
          "language": "python",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": false,
          "valid": true,
          "reward": 0.0,
          "exception_type": null,
          "cost_usd": 3.183236,
          "n_agent_steps": 46,
          "duration_seconds": 746
        },
        {
          "task_name": "ipython-session-bundle-replay",
          "language": "python",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": false,
          "valid": true,
          "reward": 0.0,
          "exception_type": null,
          "cost_usd": 2.9527039999999998,
          "n_agent_steps": 43,
          "duration_seconds": 728
        },
        {
          "task_name": "ofetch-per-origin-circuit-breaker",
          "language": "typescript",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 3.1688530000000004,
          "n_agent_steps": 42,
          "duration_seconds": 1101
        },
        {
          "task_name": "obsidian-linter-link-format-conversion",
          "language": "typescript",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": false,
          "valid": true,
          "reward": 0.0,
          "exception_type": null,
          "cost_usd": 4.3412310000000005,
          "n_agent_steps": 44,
          "duration_seconds": 872
        },
        {
          "task_name": "kea-atomic-signal-selectors",
          "language": "typescript",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 4.182389000000001,
          "n_agent_steps": 46,
          "duration_seconds": 779
        },
        {
          "task_name": "csstree-shorthand-expansion-compression",
          "language": "javascript",
          "role": "stable_pass",
          "baseline_passed": true,
          "passed": false,
          "valid": true,
          "reward": 0.0,
          "exception_type": null,
          "cost_usd": 4.129477,
          "n_agent_steps": 50,
          "duration_seconds": 1115
        },
        {
          "task_name": "fd-deterministic-multi-key-sorting",
          "language": "rust",
          "role": "boundary_fail",
          "baseline_passed": false,
          "passed": false,
          "valid": true,
          "reward": 0.0,
          "exception_type": null,
          "cost_usd": 4.0393,
          "n_agent_steps": 49,
          "duration_seconds": 952
        },
        {
          "task_name": "oxvg-structural-selector-preservation",
          "language": "rust",
          "role": "hard_pass",
          "baseline_passed": true,
          "passed": true,
          "valid": true,
          "reward": 1.0,
          "exception_type": null,
          "cost_usd": 6.606323,
          "n_agent_steps": 66,
          "duration_seconds": 1372
        }
      ]
    }
  ],
  "latest": {
    "date": "2026-06-04",
    "label": "Daily DeepSWE 12-task probe 2026-06-04",
    "source": "model-iq-deepswe-12-v1-gpt-55-xhigh-20260604",
    "model": "gpt-5.5",
    "reasoning_effort": "xhigh",
    "subset_id": "deepswe-12-v1",
    "tasks": 12,
    "valid_tasks": 12,
    "passed": 6,
    "failed": 6,
    "invalid": 0,
    "pass_rate": 0.5,
    "baseline_pass_rate": 0.666667,
    "iq_score": 75.0,
    "status": "red",
    "cost_usd": 43.83028,
    "n_input_tokens": 48075956,
    "n_cache_tokens": 46299520,
    "n_output_tokens": 393278,
    "n_agent_steps": 541,
    "wall_seconds": 2818,
    "wall_time_basis": "actual_subset_span",
    "completion_concurrency": 4,
    "serial_task_seconds": 10365,
    "source_span_seconds": 2818,
    "task_results": [
      {
        "task_name": "ytt-jsonpath-query-api",
        "language": "go",
        "role": "stable_pass",
        "baseline_passed": true,
        "passed": true,
        "valid": true,
        "reward": 1.0,
        "exception_type": null,
        "cost_usd": 2.663008,
        "n_agent_steps": 35,
        "duration_seconds": 687
      },
      {
        "task_name": "participle-grammar-conflict-analysis",
        "language": "go",
        "role": "boundary_fail",
        "baseline_passed": false,
        "passed": false,
        "valid": true,
        "reward": 0.0,
        "exception_type": null,
        "cost_usd": 2.562004,
        "n_agent_steps": 36,
        "duration_seconds": 759
      },
      {
        "task_name": "abs-module-cache-flags",
        "language": "go",
        "role": "stable_pass",
        "baseline_passed": true,
        "passed": true,
        "valid": true,
        "reward": 1.0,
        "exception_type": null,
        "cost_usd": 2.9852450000000004,
        "n_agent_steps": 44,
        "duration_seconds": 614
      },
      {
        "task_name": "httpx-multipart-response-parsing",
        "language": "python",
        "role": "stable_pass",
        "baseline_passed": true,
        "passed": true,
        "valid": true,
        "reward": 1.0,
        "exception_type": null,
        "cost_usd": 3.0165100000000002,
        "n_agent_steps": 40,
        "duration_seconds": 639
      },
      {
        "task_name": "bandit-incremental-cache-control",
        "language": "python",
        "role": "boundary_fail",
        "baseline_passed": false,
        "passed": false,
        "valid": true,
        "reward": 0.0,
        "exception_type": null,
        "cost_usd": 3.183236,
        "n_agent_steps": 46,
        "duration_seconds": 746
      },
      {
        "task_name": "ipython-session-bundle-replay",
        "language": "python",
        "role": "stable_pass",
        "baseline_passed": true,
        "passed": false,
        "valid": true,
        "reward": 0.0,
        "exception_type": null,
        "cost_usd": 2.9527039999999998,
        "n_agent_steps": 43,
        "duration_seconds": 728
      },
      {
        "task_name": "ofetch-per-origin-circuit-breaker",
        "language": "typescript",
        "role": "stable_pass",
        "baseline_passed": true,
        "passed": true,
        "valid": true,
        "reward": 1.0,
        "exception_type": null,
        "cost_usd": 3.1688530000000004,
        "n_agent_steps": 42,
        "duration_seconds": 1101
      },
      {
        "task_name": "obsidian-linter-link-format-conversion",
        "language": "typescript",
        "role": "boundary_fail",
        "baseline_passed": false,
        "passed": false,
        "valid": true,
        "reward": 0.0,
        "exception_type": null,
        "cost_usd": 4.3412310000000005,
        "n_agent_steps": 44,
        "duration_seconds": 872
      },
      {
        "task_name": "kea-atomic-signal-selectors",
        "language": "typescript",
        "role": "stable_pass",
        "baseline_passed": true,
        "passed": true,
        "valid": true,
        "reward": 1.0,
        "exception_type": null,
        "cost_usd": 4.182389000000001,
        "n_agent_steps": 46,
        "duration_seconds": 779
      },
      {
        "task_name": "csstree-shorthand-expansion-compression",
        "language": "javascript",
        "role": "stable_pass",
        "baseline_passed": true,
        "passed": false,
        "valid": true,
        "reward": 0.0,
        "exception_type": null,
        "cost_usd": 4.129477,
        "n_agent_steps": 50,
        "duration_seconds": 1115
      },
      {
        "task_name": "fd-deterministic-multi-key-sorting",
        "language": "rust",
        "role": "boundary_fail",
        "baseline_passed": false,
        "passed": false,
        "valid": true,
        "reward": 0.0,
        "exception_type": null,
        "cost_usd": 4.0393,
        "n_agent_steps": 49,
        "duration_seconds": 952
      },
      {
        "task_name": "oxvg-structural-selector-preservation",
        "language": "rust",
        "role": "hard_pass",
        "baseline_passed": true,
        "passed": true,
        "valid": true,
        "reward": 1.0,
        "exception_type": null,
        "cost_usd": 6.606323,
        "n_agent_steps": 66,
        "duration_seconds": 1372
      }
    ]
  },
  "moving_average": {
    "iq_3d": 91.7,
    "iq_7d": 90.0
  }
}
