diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs index 7cc33384..15c302d7 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs @@ -3,6 +3,7 @@ #[path = "quantitative/freshness.rs"] mod freshness; #[path = "quantitative/metrics.rs"] mod metrics; #[path = "quantitative/product_manifest.rs"] mod product_manifest; +#[path = "quantitative/qmd_candidate_replay.rs"] mod qmd_candidate_replay; use std::{path::Path, process::Command}; diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/freshness.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/freshness.rs index 5330273e..62f4530d 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/freshness.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/freshness.rs @@ -141,6 +141,8 @@ fn quantitative_docker_task_routes_through_split_makefile_and_digest_runner() -> assert!(docker_script.contains("build_baseline_runner_with_digest")); assert!(aggregate_script.contains("require_runner_image_digest")); assert!(aggregate_script.contains("materialize-quantitative-artifact-freshness.py")); + assert!(aggregate_script.contains("materialize-qmd-candidate-replay-gate.py")); + assert!(aggregate_script.contains("qmd-candidate-replay-comparability-gate.json")); Ok(()) } diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/qmd_candidate_replay.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/qmd_candidate_replay.rs new file mode 100644 index 00000000..165de02a --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/qmd_candidate_replay.rs @@ -0,0 +1,390 @@ +use std::{ + env, fs, + path::PathBuf, + process::{self, Command}, + time::{SystemTime, UNIX_EPOCH}, +}; + +use color_eyre::{Result, eyre}; +use serde_json::Value; + +use crate::support; + +const 
QMD_COMMIT: &str = "0123456789abcdef0123456789abcdef01234567"; +const RUNNER_DIGEST: &str = + "sha256:cea965615ad701b8b772f4a5607b982f01c3177e29fc8dbcd2b76b19ba862751"; + +struct QmdCandidateReplayFixture { + temp_dir: PathBuf, + product_manifest_path: PathBuf, + freshness_manifest_path: PathBuf, + out_path: PathBuf, +} +impl QmdCandidateReplayFixture { + fn new(name: &str, product_manifest: &Value, freshness_manifest: &Value) -> Result<Self> { + let nonce = SystemTime::now().duration_since(UNIX_EPOCH)?.as_nanos(); + let temp_dir = env::temp_dir().join(format!("{name}-{}-{nonce}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let product_manifest_path = temp_dir.join("qmd-product-manifest.json"); + let freshness_manifest_path = temp_dir.join("freshness-manifest.json"); + let out_path = temp_dir.join("qmd-candidate-replay-gate.json"); + + fs::write( + &product_manifest_path, + format!("{}\n", serde_json::to_string_pretty(product_manifest)?), + )?; + fs::write( + &freshness_manifest_path, + format!("{}\n", serde_json::to_string_pretty(freshness_manifest)?), + )?; + + Ok(Self { temp_dir, product_manifest_path, freshness_manifest_path, out_path }) + } + + fn run_materializer(&self) -> Result<Value> { + let output = Command::new("python3") + .arg( + support::workspace_root()?.join("scripts/materialize-qmd-candidate-replay-gate.py"), + ) + .arg("--product-manifest") + .arg(&self.product_manifest_path) + .arg("--freshness-manifest") + .arg(&self.freshness_manifest_path) + .arg("--out") + .arg(&self.out_path) + .output()?; + + assert!( + output.status.success(), + "qmd candidate-replay gate materializer failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + + support::load_json(&self.out_path) + } +} + +impl Drop for QmdCandidateReplayFixture { + fn drop(&mut self) { + let _ = fs::remove_dir_all(&self.temp_dir); + } +} + +#[test] +fn qmd_candidate_replay_gate_passes_with_full_comparability_evidence() -> Result<()> { + let fixture = QmdCandidateReplayFixture::new( + 
"elf-qmd-candidate-replay-pass", + &qmd_product_manifest(true, true, Some("qmd-held-out-audit"), qmd_per_query_rows()), + &qmd_freshness_manifest(Some(RUNNER_DIGEST), Some(QMD_COMMIT)), + )?; + let manifest = fixture.run_materializer()?; + + assert_eq!( + manifest.pointer("/schema").and_then(Value::as_str), + Some("elf.qmd_candidate_replay_comparability_gate/v1") + ); + assert_eq!(manifest.pointer("/result_state").and_then(Value::as_str), Some("pass")); + assert_eq!(manifest.pointer("/comparable").and_then(Value::as_bool), Some(true)); + assert_eq!( + manifest.pointer("/unqualified_leaderboard_claim_allowed").and_then(Value::as_bool), + Some(false) + ); + assert_eq!(manifest.pointer("/replay_artifact_row_count").and_then(Value::as_u64), Some(2)); + assert_eq!(manifest.pointer("/per_query_row_count").and_then(Value::as_u64), Some(2)); + assert_eq!( + support::string_array_at(&manifest, "/source_manifest_corpus_ids")?, + vec!["qmd-corpus"] + ); + + let gates = manifest + .pointer("/gates") + .and_then(Value::as_object) + .ok_or_else(|| eyre::eyre!("missing gates object"))?; + + for gate_name in gates.keys() { + assert_eq!( + manifest.pointer(&format!("/gates/{gate_name}")).and_then(Value::as_bool), + Some(true), + "gate {gate_name} should pass" + ); + } + + Ok(()) +} + +#[test] +fn qmd_candidate_replay_gate_blocks_missing_digest_audit_and_replay_rows() -> Result<()> { + let fixture = QmdCandidateReplayFixture::new( + "elf-qmd-candidate-replay-blocked", + &qmd_product_manifest(false, false, None, vec![]), + &qmd_freshness_manifest(None, Some(QMD_COMMIT)), + )?; + let manifest = fixture.run_materializer()?; + + assert_eq!(manifest.pointer("/result_state").and_then(Value::as_str), Some("blocked")); + assert_eq!(manifest.pointer("/comparable").and_then(Value::as_bool), Some(false)); + assert_eq!( + manifest.pointer("/unqualified_leaderboard_claim_allowed").and_then(Value::as_bool), + Some(false) + ); + assert!(support::array_contains_str(&manifest, "/missing_gates", 
"qmd_held_out")?); + assert!(support::array_contains_str(&manifest, "/missing_gates", "qmd_leakage_audited")?); + assert!(support::array_contains_str( + &manifest, + "/missing_gates", + "qmd_audit_manifest_present" + )?); + assert!(support::array_contains_str( + &manifest, + "/missing_gates", + "qmd_replay_artifact_present" + )?); + assert!(support::array_contains_str( + &manifest, + "/missing_gates", + "qmd_candidate_replay_complete" + )?); + assert!(support::array_contains_str( + &manifest, + "/missing_gates", + "qmd_container_digest_present" + )?); + + Ok(()) +} + +#[test] +fn qmd_candidate_replay_gate_blocks_non_pass_product_state() -> Result<()> { + let mut product_manifest = + qmd_product_manifest(true, true, Some("qmd-held-out-audit"), qmd_per_query_rows()); + + support::set_json_pointer( + &mut product_manifest, + "/rows/0/result_state", + serde_json::json!("blocked"), + )?; + + let fixture = QmdCandidateReplayFixture::new( + "elf-qmd-candidate-replay-non-pass", + &product_manifest, + &qmd_freshness_manifest(Some(RUNNER_DIGEST), Some(QMD_COMMIT)), + )?; + let manifest = fixture.run_materializer()?; + + assert_eq!(manifest.pointer("/result_state").and_then(Value::as_str), Some("blocked")); + assert!(support::array_contains_str(&manifest, "/missing_gates", "qmd_result_state_pass")?); + assert_eq!( + manifest.pointer("/unqualified_leaderboard_claim_allowed").and_then(Value::as_bool), + Some(false) + ); + + Ok(()) +} + +#[test] +fn qmd_candidate_replay_gate_blocks_incomplete_source_id_mapping() -> Result<()> { + let mut product_manifest = + qmd_product_manifest(true, true, Some("qmd-held-out-audit"), qmd_per_query_rows()); + + product_manifest + .pointer_mut("/per_query_rows/0") + .and_then(Value::as_object_mut) + .ok_or_else(|| eyre::eyre!("missing qmd per-query row"))? 
+ .remove("source_manifest_corpus_id"); + + let fixture = QmdCandidateReplayFixture::new( + "elf-qmd-candidate-replay-missing-source-id", + &product_manifest, + &qmd_freshness_manifest(Some(RUNNER_DIGEST), Some(QMD_COMMIT)), + )?; + let manifest = fixture.run_materializer()?; + + assert_eq!(manifest.pointer("/result_state").and_then(Value::as_str), Some("blocked")); + assert!(support::array_contains_str(&manifest, "/missing_gates", "qmd_source_id_mapped")?); + + Ok(()) +} + +#[test] +fn qmd_candidate_replay_gate_blocks_non_pass_or_partial_replay_rows() -> Result<()> { + let mut product_manifest = + qmd_product_manifest(true, true, Some("qmd-held-out-audit"), qmd_per_query_rows()); + + support::set_json_pointer( + &mut product_manifest, + "/per_query_rows/0/result_state", + serde_json::json!("incomplete"), + )?; + support::set_json_pointer( + &mut product_manifest, + "/rows/0/ranking_coverage_state", + serde_json::json!("partial_coverage"), + )?; + + let fixture = QmdCandidateReplayFixture::new( + "elf-qmd-candidate-replay-partial-replay", + &product_manifest, + &qmd_freshness_manifest(Some(RUNNER_DIGEST), Some(QMD_COMMIT)), + )?; + let manifest = fixture.run_materializer()?; + + assert_eq!(manifest.pointer("/result_state").and_then(Value::as_str), Some("blocked")); + assert!(support::array_contains_str( + &manifest, + "/missing_gates", + "qmd_candidate_replay_complete" + )?); + assert!(support::array_contains_str( + &manifest, + "/missing_gates", + "qmd_aggregate_replay_fields_complete" + )?); + + Ok(()) +} + +#[test] +fn qmd_candidate_replay_gate_blocks_split_reproducibility_rows() -> Result<()> { + let fixture = QmdCandidateReplayFixture::new( + "elf-qmd-candidate-replay-split-repro", + &qmd_product_manifest(true, true, Some("qmd-held-out-audit"), qmd_per_query_rows()), + &qmd_split_freshness_manifest(), + )?; + let manifest = fixture.run_materializer()?; + + assert_eq!(manifest.pointer("/result_state").and_then(Value::as_str), Some("blocked")); + assert_eq!( + 
manifest.pointer("/gates/qmd_container_digest_present").and_then(Value::as_bool), + Some(true) + ); + assert_eq!( + manifest.pointer("/gates/qmd_product_commit_present").and_then(Value::as_bool), + Some(true) + ); + assert!(support::array_contains_str( + &manifest, + "/missing_gates", + "qmd_reproducibility_row_bound" + )?); + + Ok(()) +} + +fn qmd_product_manifest( + held_out: bool, + leakage_audited: bool, + audit_manifest_id: Option<&str>, + per_query_rows: Vec<Value>, +) -> Value { + serde_json::json!({ + "schema": "elf.agent_memory_quantitative_product_manifest/v1", + "manifest_id": "qmd-candidate-replay-product-manifest", + "corpus_id": "qmd-corpus", + "rows": [{ + "product": "qmd", + "adapter_id": "qmd_live_real_world", + "adapter_name": "qmd live real-world CLI adapter", + "suite": "retrieval", + "evidence_class": "live_real_world", + "source_manifest_corpus_id": "qmd-corpus", + "result_state": "pass", + "comparable": true, + "metric_comparable": true, + "leaderboard_eligible": false, + "held_out": held_out, + "leakage_audited": leakage_audited, + "audit_manifest_id": audit_manifest_id, + "fixture_regression_only": false, + "sample_size": 2, + "ranking_query_count": 2, + "ranking_coverage_state": "complete", + "ranked_candidate_source": "produced_evidence_order", + "qrel_source": "explicit_qrels", + "explicit_qrel_query_count": 2, + "metrics": {}, + "metric_states": {}, + "denominators": {}, + "confidence_intervals": {}, + "claim_boundary": "Qualified qmd candidate-replay comparability only; no unqualified leaderboard claim." 
+ }], + "per_query_rows": per_query_rows + }) +} + +fn qmd_per_query_rows() -> Vec<Value> { + vec![qmd_per_query_row("qmd-query-1", 4, 1), qmd_per_query_row("qmd-query-2", 5, 2)] +} + +fn qmd_per_query_row(query_id: &str, candidate_count: u64, expected_relevant_count: u64) -> Value { + serde_json::json!({ + "product": "qmd", + "adapter_id": "qmd_live_real_world", + "query_id": query_id, + "source_manifest_corpus_id": "qmd-corpus", + "result_state": "pass", + "candidate_count": candidate_count, + "expected_relevant_count": expected_relevant_count, + "qrel_source": "explicit_qrels", + "metrics": {}, + "metric_states": {}, + "denominators": {}, + "claim_boundary": "Per-query replay row emitted by product runtime candidates with explicit qrels." + }) +} + +fn qmd_freshness_manifest(container_digest: Option<&str>, product_commit: Option<&str>) -> Value { + serde_json::json!({ + "schema": "elf.quantitative_artifact_freshness_manifest/v1", + "combined_inputs": [{ + "label": "qmd-live-explicit-qrels", + "rows": [{ + "product": "qmd", + "adapter_id": "qmd_live_real_world", + "evidence_class": "live_real_world", + "result_state": "pass", + "leaderboard_eligible": false, + "metric_comparable": true, + "present_in_combined_manifest": true, + "reproducibility": { + "container_image_digest": container_digest, + "product_commit": product_commit, + "public_reproducible": container_digest.is_some() && product_commit.is_some(), + "missing_fields": [] + } + }] + }] + }) +} + +fn qmd_split_freshness_manifest() -> Value { + serde_json::json!({ + "schema": "elf.quantitative_artifact_freshness_manifest/v1", + "combined_inputs": [{ + "label": "qmd-live-explicit-qrels", + "rows": [ + { + "product": "qmd", + "adapter_id": "qmd_live_real_world", + "reproducibility": { + "container_image_digest": RUNNER_DIGEST, + "product_commit": null, + "public_reproducible": false, + "missing_fields": ["product_commit"] + } + }, + { + "product": "qmd", + "adapter_id": "qmd_live_real_world", + "reproducibility": { 
+ "container_image_digest": null, + "product_commit": QMD_COMMIT, + "public_reproducible": false, + "missing_fields": ["container_image_digest"] + } + } + ] + }] + }) +} diff --git a/docs/evidence/benchmarking/2026-07-03-qmd-candidate-replay-comparability-gate.md b/docs/evidence/benchmarking/2026-07-03-qmd-candidate-replay-comparability-gate.md new file mode 100644 index 00000000..bad7d43c --- /dev/null +++ b/docs/evidence/benchmarking/2026-07-03-qmd-candidate-replay-comparability-gate.md @@ -0,0 +1,85 @@ +# qmd Candidate-Replay Comparability Gate - July 3, 2026 + +## Purpose + +XY-1156 adds a qmd-specific comparability gate for the Docker-owned quantitative +benchmark path. The gate prevents qmd candidate replay evidence from becoming a +general leaderboard claim unless the artifact has the evidence needed for a +qualified same-corpus candidate-replay comparison. + +## Command Surface + +Primary command: + +```bash +cargo make real-world-memory-quantitative-docker +``` + +The command runs inside `docker-compose.baseline.yml` through the baseline runner. +It refuses to run the aggregate outside Docker and requires the baseline runner +image digest before benchmark work starts. + +The Docker aggregate now emits: + +| Artifact | Purpose | +| --- | --- | +| `tmp/real-world-memory/quantitative-docker/qmd-quantitative-product-manifest.json` | qmd product-runtime quantitative row and per-query replay rows exported from the live explicit-qrels adapter report. | +| `tmp/real-world-memory/quantitative-docker/quantitative-artifact-freshness-manifest.json` | Docker runner, repository, input manifest, artifact digest, container image digest, and product commit reproducibility evidence. | +| `tmp/real-world-memory/quantitative-docker/qmd-candidate-replay-comparability-gate.json` | qmd-specific typed pass/blocked gate for candidate replay comparability. 
| + +## Gate Contract + +`scripts/materialize-qmd-candidate-replay-gate.py` reads the qmd quantitative +product manifest and the freshness manifest, then checks: + +| Gate | Required Evidence | +| --- | --- | +| `product_manifest_schema` | Product manifest uses `elf.agent_memory_quantitative_product_manifest/v1`. | +| `freshness_manifest_schema` | Freshness manifest uses `elf.quantitative_artifact_freshness_manifest/v1`. | +| `qmd_row_present` | At least one qmd product row exists. | +| `qmd_typed_result_state_present` | The qmd row keeps a typed pass or non-pass state. | +| `qmd_result_state_pass` | The qmd product row is a measured `pass`, not only a typed blocker. | +| `qmd_source_id_mapped` | Every qmd product and per-query row maps to the manifest `corpus_id`. | +| `qmd_held_out` | The qmd row declares held-out status. | +| `qmd_leakage_audited` | The qmd row declares leakage audit status. | +| `qmd_audit_manifest_present` | The qmd row names the audit manifest. | +| `qmd_replay_artifact_present` | At least one qmd per-query row passes and contains a positive runtime candidate count, a positive expected relevance count, and explicit qrels. | +| `qmd_candidate_replay_complete` | Every qmd per-query row has a complete passing replay artifact. | +| `qmd_aggregate_replay_fields_complete` | Aggregate ranking count, sample size, explicit-qrel count, coverage state, ranked-candidate source, and qrel source match the per-query replay rows. | +| `qmd_container_digest_present` | Matching qmd freshness evidence contains a `sha256:`-prefixed 64-hex runner image digest. | +| `qmd_product_commit_present` | Matching qmd freshness evidence contains a structured 40-hex qmd product commit. | +| `qmd_reproducibility_row_bound` | The same matching qmd freshness row contains both the runner image digest and product commit. | + +If any gate fails, the output `result_state` is `blocked` and `comparable` is +`false`. The output always sets `unqualified_leaderboard_claim_allowed` to +`false`. 
+ +## Validation + +Targeted tests cover: + +- passing output when all source-id, held-out, leakage, replay, digest, and commit + evidence is present; +- blocked output when audit, replay, and digest evidence are missing; +- Docker aggregate wiring so the qmd gate is emitted alongside the freshness + manifest. + +The current change intentionally does not rerun the full Docker quantitative +benchmark because the aggregate is slow. The executable contract is covered by the +materializer tests and Docker script wiring; a later production run should publish +the generated JSON artifact from the container path. + +## Claim Boundary + +This gate can support only this narrow claim: + +> qmd candidate replay is comparable for the same corpus when all gate checks pass. + +It does not support: + +- an unqualified product leaderboard; +- a broad ELF-vs-qmd product win; +- qmd public reproducibility when the freshness manifest lacks image digest or + product commit evidence; +- candidate replay comparability when per-query runtime candidate rows or explicit + qrels are missing. diff --git a/docs/evidence/benchmarking/index.md b/docs/evidence/benchmarking/index.md index 5741dfde..66cb1d65 100644 --- a/docs/evidence/benchmarking/index.md +++ b/docs/evidence/benchmarking/index.md @@ -60,3 +60,4 @@ Routes to: Benchmarking evidence concepts under `docs/evidence/benchmarking/`. - `2026-06-23-p4-quality-hardening-productization-readiness-report.md`: P4 Quality Hardening and Productization Readiness Report - June 23, 2026; adds `cargo make real-world-memory-p4-quality-hardening-closeout`, reruns adversarial, source-library, knowledge, and production-readiness slices, preserves private/provider blockers, and keeps P5 queueing behind main-thread acceptance with a narrowed productization scope. 
- `2026-06-27-public-quantitative-competitor-scoreboard-report.md`: Public Quantitative Competitor Scoreboard Report - June 27, 2026; publishes `elf.quality_scoreboard/v1` rows for 20 tracked products, including VectifyAI PageIndex, VectifyAI OpenKB, and plastic-labs Honcho typed rows. Rows expose recall@5, precision@5, MRR, nDCG, lifecycle, source-ref, and latency metrics where measured, and typed blocker, source-provenance, and next-evidence metadata where comparable metrics are not yet available, while preserving zero comparable product-runtime pass claims until held-out, leakage-audited, digest-identified runtime evidence exists. - `2026-07-03-source-backed-quality-benchmark-harness.md`: Source-Backed Memory Quality Benchmark Harness - July 3, 2026; adds `cargo make source-backed-memory-quality` and the `elf.source_backed_memory_quality_benchmark/v1` report surface for expected evidence recall, precision@5, source-ref coverage, stale/correction/delete behavior, Context Pack activation, Recall Debug privacy, hard-fail leak counters, latency, and typed scenario coverage. +- `2026-07-03-qmd-candidate-replay-comparability-gate.md`: qmd Candidate-Replay Comparability Gate - July 3, 2026; adds the Docker-aggregate `qmd-candidate-replay-comparability-gate.json` artifact and the qmd-specific typed pass/blocked gate for complete source-id mapping, held-out/leakage audit evidence, passing per-query replay rows, aggregate replay consistency, row-bound runner image digest plus product commit provenance, and no unqualified leaderboard claim. diff --git a/docs/log.md b/docs/log.md index 1aaa3776..6f7dc6d5 100644 --- a/docs/log.md +++ b/docs/log.md @@ -184,3 +184,10 @@ logs. evidence recall, precision@5, source-ref coverage, stale/correction/delete behavior, Context Pack activation, Recall Debug privacy, hard-fail leak counters, latency, and required source-backed memory scenario coverage. 
+- Added the XY-1156 qmd candidate-replay comparability gate to the Docker-owned + quantitative aggregate so qmd source-id mapping, held-out/leakage audit evidence, + passing per-query replay rows, aggregate replay consistency, baseline runner image + digest, and product commit provenance are checked before any qualified qmd + candidate-replay comparison. Digest and product commit evidence must be bound to + the same matching freshness row. The gate remains typed pass/blocked and never + permits an unqualified product leaderboard claim. diff --git a/docs/spec/agent_memory_quantitative_benchmark_v1.md b/docs/spec/agent_memory_quantitative_benchmark_v1.md index c7396502..582eb75a 100644 --- a/docs/spec/agent_memory_quantitative_benchmark_v1.md +++ b/docs/spec/agent_memory_quantitative_benchmark_v1.md @@ -106,6 +106,17 @@ RAGFlow must not accept a product commit unless a matching runtime that emitted the row. A non-pass attestation remains a failure even when a commit-shaped value is present. +The same Docker aggregate materializes +`qmd-candidate-replay-comparability-gate.json` through +`scripts/materialize-qmd-candidate-replay-gate.py`. The gate is qmd-specific +because qmd's useful comparator strength is candidate replay: it can be compared +only when the qmd quantitative manifest carries source-id mapping, held-out and +leakage audit evidence, explicit qrels, passing per-query runtime candidate rows, +aggregate replay counts that match those rows, and a matching freshness row that +binds a valid container image digest to a structured product commit. The gate +emits typed `pass` or `blocked` state and never permits an unqualified product +leaderboard claim. 
+ ## Result States Every row must declare one result state: diff --git a/scripts/materialize-qmd-candidate-replay-gate.py b/scripts/materialize-qmd-candidate-replay-gate.py new file mode 100644 index 00000000..85a47e5a --- /dev/null +++ b/scripts/materialize-qmd-candidate-replay-gate.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python3 +"""Materialize qmd candidate-replay comparability gates.""" + +from __future__ import annotations + +import argparse +import json +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + + +SCHEMA = "elf.qmd_candidate_replay_comparability_gate/v1" +PRODUCT_MANIFEST_SCHEMA = "elf.agent_memory_quantitative_product_manifest/v1" +FRESHNESS_SCHEMA = "elf.quantitative_artifact_freshness_manifest/v1" +QMD_ADAPTER_IDS = {"qmd_live_real_world", "qmd_operator_debug_live"} +SUPPORTED_RESULT_STATES = { + "pass", + "wrong_result", + "incomplete", + "blocked", + "not_tested", + "not_encoded", + "not_comparable", + "unsupported_claim", +} + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--product-manifest", required=True, type=Path) + parser.add_argument("--freshness-manifest", required=True, type=Path) + parser.add_argument("--out", required=True, type=Path) + return parser.parse_args() + + +def read_json(path: Path) -> dict[str, Any]: + with path.open(encoding="utf-8") as handle: + value = json.load(handle) + if not isinstance(value, dict): + raise SystemExit(f"{path} must contain a JSON object") + return value + + +def write_json(path: Path, value: dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as handle: + json.dump(value, handle, indent=2, sort_keys=True) + handle.write("\n") + + +def qmd_rows(manifest: dict[str, Any]) -> list[dict[str, Any]]: + rows = manifest.get("rows", []) + if not isinstance(rows, list): + return [] + return [row for row in rows if is_qmd_mapping(row)] + + 
+def qmd_per_query_rows(manifest: dict[str, Any]) -> list[dict[str, Any]]: + rows = manifest.get("per_query_rows", []) + if not isinstance(rows, list): + return [] + return [row for row in rows if is_qmd_mapping(row)] + + +def is_qmd_mapping(mapping: Any) -> bool: + if not isinstance(mapping, dict): + return False + product = str(mapping.get("product", "")).strip().lower() + adapter_id = str(mapping.get("adapter_id", "")).strip() + return product == "qmd" or adapter_id in QMD_ADAPTER_IDS + + +def row_key(mapping: dict[str, Any]) -> tuple[str, str]: + candidate = mapping.get("row", mapping) + if not isinstance(candidate, dict): + candidate = mapping + return ( + str(candidate.get("product", "")).strip().lower(), + str(candidate.get("adapter_id", "")).strip(), + ) + + +def freshness_rows(manifest: dict[str, Any]) -> list[dict[str, Any]]: + rows = [] + top_level_rows = manifest.get("rows", []) + if isinstance(top_level_rows, list): + rows.extend(top_level_rows) + combined_inputs = manifest.get("combined_inputs", []) + if isinstance(combined_inputs, list): + for combined_input in combined_inputs: + if not isinstance(combined_input, dict): + continue + input_rows = combined_input.get("rows", []) + if isinstance(input_rows, list): + rows.extend(input_rows) + return [row for row in rows if isinstance(row, dict) and is_qmd_mapping(row.get("row", row))] + + +def has_container_digest(row: dict[str, Any]) -> bool: + reproducibility = row.get("reproducibility", row) + if not isinstance(reproducibility, dict): + return False + digest = reproducibility.get("container_image_digest") + return ( + isinstance(digest, str) + and digest.startswith("sha256:") + and len(digest) == 71 + and all(char in "0123456789abcdefABCDEF" for char in digest.removeprefix("sha256:")) + ) + + +def has_product_commit(row: dict[str, Any]) -> bool: + reproducibility = row.get("reproducibility", row) + if not isinstance(reproducibility, dict): + return False + commit = reproducibility.get("product_commit") + return ( + 
isinstance(commit, str) + and len(commit) == 40 + and all(char in "0123456789abcdefABCDEF" for char in commit) + ) + + +def positive_count(value: Any) -> bool: + return isinstance(value, int) and not isinstance(value, bool) and value > 0 + + +def replay_artifact_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]: + return [ + row + for row in rows + if row.get("result_state") == "pass" + and positive_count(row.get("candidate_count")) + and positive_count(row.get("expected_relevant_count")) + and row.get("qrel_source") == "explicit_qrels" + ] + + +def all_rows_same_corpus(rows: list[dict[str, Any]], corpus_id: Any) -> bool: + if not isinstance(corpus_id, str) or not corpus_id.strip() or not rows: + return False + return all(row.get("source_manifest_corpus_id") == corpus_id for row in rows) + + +def aggregate_replay_fields_complete(row: dict[str, Any], per_query_count: int) -> bool: + return ( + per_query_count > 0 + and row.get("ranking_query_count") == per_query_count + and row.get("explicit_qrel_query_count") == per_query_count + and row.get("sample_size") == per_query_count + and row.get("ranking_coverage_state") == "complete" + and row.get("ranked_candidate_source") == "produced_evidence_order" + and row.get("qrel_source") == "explicit_qrels" + ) + + +def gate_result(product_manifest: dict[str, Any], freshness_manifest: dict[str, Any]) -> dict[str, Any]: + qmd_product_rows = qmd_rows(product_manifest) + qmd_query_rows = qmd_per_query_rows(product_manifest) + qmd_freshness_rows = freshness_rows(freshness_manifest) + replay_rows = replay_artifact_rows(qmd_query_rows) + qmd_row = qmd_product_rows[0] if qmd_product_rows else {} + qmd_key = row_key(qmd_row) + matching_freshness_rows = [row for row in qmd_freshness_rows if row_key(row) == qmd_key] + source_corpus_ids = sorted( + { + row.get("source_manifest_corpus_id") + for row in [*qmd_product_rows, *qmd_query_rows] + if row.get("source_manifest_corpus_id") + } + ) + gates = { + "product_manifest_schema": 
product_manifest.get("schema") == PRODUCT_MANIFEST_SCHEMA, + "freshness_manifest_schema": freshness_manifest.get("schema") == FRESHNESS_SCHEMA, + "qmd_row_present": bool(qmd_product_rows), + "qmd_typed_result_state_present": qmd_row.get("result_state") in SUPPORTED_RESULT_STATES, + "qmd_result_state_pass": qmd_row.get("result_state") == "pass", + "qmd_source_id_mapped": all_rows_same_corpus( + [*qmd_product_rows, *qmd_query_rows], + product_manifest.get("corpus_id"), + ), + "qmd_held_out": qmd_row.get("held_out") is True, + "qmd_leakage_audited": qmd_row.get("leakage_audited") is True, + "qmd_audit_manifest_present": bool(qmd_row.get("audit_manifest_id")), + "qmd_replay_artifact_present": bool(replay_rows), + "qmd_candidate_replay_complete": len(replay_rows) == len(qmd_query_rows) and bool(qmd_query_rows), + "qmd_aggregate_replay_fields_complete": aggregate_replay_fields_complete( + qmd_row, + len(qmd_query_rows), + ), + "qmd_container_digest_present": any(has_container_digest(row) for row in matching_freshness_rows), + "qmd_product_commit_present": any(has_product_commit(row) for row in matching_freshness_rows), + "qmd_reproducibility_row_bound": any( + has_container_digest(row) and has_product_commit(row) for row in matching_freshness_rows + ), + } + missing = [name for name, passed in gates.items() if not passed] + result_state = "pass" if not missing else "blocked" + + return { + "schema": SCHEMA, + "generated_at": datetime.now(UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z"), + "product": "qmd", + "adapter_ids": sorted( + {str(row.get("adapter_id")) for row in [*qmd_product_rows, *qmd_query_rows] if row.get("adapter_id")} + ), + "result_state": result_state, + "comparable": result_state == "pass", + "unqualified_leaderboard_claim_allowed": False, + "claim_boundary": ( + "This gate only permits a qualified qmd candidate-replay comparability claim when " + "all gates pass. It never permits an unqualified product leaderboard claim." 
+ ), + "gates": gates, + "missing_gates": missing, + "source_manifest_corpus_ids": source_corpus_ids, + "row_count": len(qmd_product_rows), + "per_query_row_count": len(qmd_query_rows), + "replay_artifact_row_count": len(replay_rows), + "freshness_row_count": len(qmd_freshness_rows), + "matching_freshness_row_count": len(matching_freshness_rows), + "typed_result_state": qmd_row.get("result_state"), + "quantitative_row": qmd_row, + } + + +def main() -> None: + args = parse_args() + product_manifest = read_json(args.product_manifest) + freshness_manifest = read_json(args.freshness_manifest) + write_json(args.out, gate_result(product_manifest, freshness_manifest)) + + +if __name__ == "__main__": + main() diff --git a/scripts/real-world-quantitative-docker.sh b/scripts/real-world-quantitative-docker.sh index 547da752..e16782f0 100755 --- a/scripts/real-world-quantitative-docker.sh +++ b/scripts/real-world-quantitative-docker.sh @@ -8,6 +8,7 @@ LIVE_ADAPTER_DIR="${LIVE_QRELS_DIR}/live-adapters" SYNC_LOG="${REPORT_DIR}/synced-artifacts.tsv" QMD_PRODUCT_MANIFEST="${REPORT_DIR}/qmd-quantitative-product-manifest.json" FRESHNESS_MANIFEST="${REPORT_DIR}/quantitative-artifact-freshness-manifest.json" +QMD_CANDIDATE_REPLAY_GATE="${REPORT_DIR}/qmd-candidate-replay-comparability-gate.json" RUN_LIVE_EXPLICIT_QRELS="${ELF_REAL_WORLD_QUANTITATIVE_RUN_LIVE_EXPLICIT_QRELS:-1}" RUN_LANGGRAPH="${ELF_REAL_WORLD_QUANTITATIVE_RUN_LANGGRAPH:-0}" QMD_DIR="${ELF_REAL_WORLD_QMD_DIR:-/bench/repos/qmd}" @@ -96,6 +97,12 @@ python3 scripts/materialize-quantitative-artifact-freshness.py \ --run-live-explicit-qrels "${RUN_LIVE_EXPLICIT_QRELS}" \ --run-langgraph "${RUN_LANGGRAPH}" +python3 scripts/materialize-qmd-candidate-replay-gate.py \ + --product-manifest "${QMD_PRODUCT_MANIFEST}" \ + --freshness-manifest "${FRESHNESS_MANIFEST}" \ + --out "${QMD_CANDIDATE_REPLAY_GATE}" + echo "Quantitative Docker benchmark artifacts:" echo " ${QMD_PRODUCT_MANIFEST}" echo " ${FRESHNESS_MANIFEST}" +echo " 
${QMD_CANDIDATE_REPLAY_GATE}"