diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs index 9bcc07c8..7cc33384 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs @@ -1,5 +1,6 @@ #[path = "quantitative/audit_manifest.rs"] mod audit_manifest; #[path = "quantitative/contracts.rs"] mod contracts; +#[path = "quantitative/freshness.rs"] mod freshness; #[path = "quantitative/metrics.rs"] mod metrics; #[path = "quantitative/product_manifest.rs"] mod product_manifest; diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/freshness.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/freshness.rs new file mode 100644 index 00000000..5330273e --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/freshness.rs @@ -0,0 +1,287 @@ +use std::{ + env, fs, + path::PathBuf, + process::{self, Command}, + time::{SystemTime, UNIX_EPOCH}, +}; + +use color_eyre::{Result, eyre}; +use serde_json::Value; + +use crate::support; + +const HONCHO_COMMIT: &str = "60a15e664d7298eb790b788e95c6ca2e6bd30c80"; +const RUNNER_DIGEST: &str = + "sha256:cea965615ad701b8b772f4a5607b982f01c3177e29fc8dbcd2b76b19ba862751"; + +struct FreshnessFixture { + temp_dir: PathBuf, + product_manifest_path: PathBuf, + sync_log_path: PathBuf, + out_path: PathBuf, +} +impl FreshnessFixture { + fn new(name: &str, product_manifest: &Value) -> Result<Self> { + let nonce = SystemTime::now().duration_since(UNIX_EPOCH)?.as_nanos(); + let temp_dir = env::temp_dir().join(format!("{name}-{}-{nonce}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let product_manifest_path = temp_dir.join("product-manifest.json"); + let sync_log_path = temp_dir.join("synced-artifacts.tsv"); + let out_path = temp_dir.join("freshness.json"); + + fs::write( + &product_manifest_path, + format!("{}\n", serde_json::to_string_pretty(product_manifest)?), + )?; + 
fs::write( + &sync_log_path, + format!( + "combined-input\thoncho-live\t{}\tcurrent_docker_run\n", + product_manifest_path.display() + ), + )?; + + Ok(Self { temp_dir, product_manifest_path, sync_log_path, out_path }) + } + + fn run_materializer(&self) -> Result<Value> { + let output = Command::new("python3") + .arg( + support::workspace_root()? + .join("scripts/materialize-quantitative-artifact-freshness.py"), + ) + .arg("--sync-log") + .arg(&self.sync_log_path) + .arg("--combined-product-manifest") + .arg(&self.product_manifest_path) + .arg("--out") + .arg(&self.out_path) + .arg("--run-live-explicit-qrels") + .arg("1") + .arg("--run-langgraph") + .arg("1") + .env("ELF_BASELINE_RUNNER_IMAGE_DIGEST", RUNNER_DIGEST) + .output()?; + + assert!( + output.status.success(), + "freshness materializer failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + + support::load_json(&self.out_path) + } +} + +impl Drop for FreshnessFixture { + fn drop(&mut self) { + let _ = fs::remove_dir_all(&self.temp_dir); + } +} + +#[test] +fn quantitative_freshness_accepts_runner_image_digest_for_public_reproducibility() -> Result<()> { + let fixture = FreshnessFixture::new( + "elf-runner-digest-freshness", + &honcho_product_manifest(honcho_runtime_attestation( + "pass", + Some("runtime_source_checkout_verified"), + )), + )?; + let manifest = fixture.run_materializer()?; + + assert_eq!( + manifest.pointer("/reproducibility_summary/state").and_then(Value::as_str), + Some("public_reproducibility_ready") + ); + assert_eq!( + manifest + .pointer("/reproducibility_summary/public_reproducible_claim_allowed") + .and_then(Value::as_bool), + Some(true) + ); + assert!( + manifest + .pointer("/reproducibility_summary/missing_field_counts/container_image_digest") + .is_none() + ); + + let row = only_freshness_row(&manifest)?; + + assert_eq!( + row.pointer("/reproducibility/container_image_digest").and_then(Value::as_str), + Some(RUNNER_DIGEST) + ); + assert_eq!( + 
row.pointer("/reproducibility/container_image_digest_source").and_then(Value::as_str), + Some("env:ELF_BASELINE_RUNNER_IMAGE_DIGEST") + ); + assert_eq!( + row.pointer("/reproducibility/public_reproducible").and_then(Value::as_bool), + Some(true) + ); + assert_eq!(support::array_at(row, "/reproducibility/missing_fields")?.len(), 0); + + Ok(()) +} + +#[test] +fn quantitative_docker_task_routes_through_split_makefile_and_digest_runner() -> Result<()> { + let task_catalog = support::make_task_catalog()?; + let workspace = support::workspace_root()?; + let docker_script = fs::read_to_string(workspace.join("scripts/real-world-docker.sh"))?; + let aggregate_script = + fs::read_to_string(workspace.join("scripts/real-world-quantitative-docker.sh"))?; + + assert!(task_catalog.contains("[tasks.real-world-memory-quantitative-docker]")); + assert!(task_catalog.contains("\"memory-quantitative-docker\"")); + assert!(docker_script.contains("memory-quantitative-docker)")); + assert!(docker_script.contains("build_baseline_runner_with_digest")); + assert!(aggregate_script.contains("require_runner_image_digest")); + assert!(aggregate_script.contains("materialize-quantitative-artifact-freshness.py")); + + Ok(()) +} + +#[test] +fn quantitative_freshness_rejects_runtime_sensitive_commit_without_attestation() -> Result<()> { + let mut product_manifest = honcho_product_manifest(honcho_runtime_attestation( + "pass", + Some("runtime_source_checkout_verified"), + )); + + product_manifest + .pointer_mut("/rows/0/runtime_source_attestation") + .ok_or_else(|| eyre::eyre!("missing row attestation"))? + .take(); + product_manifest + .as_object_mut() + .ok_or_else(|| eyre::eyre!("product manifest is not an object"))? 
+ .remove("runtime_source_attestation"); + + let fixture = FreshnessFixture::new("elf-runtime-sensitive-commit-spoof", &product_manifest)?; + let manifest = fixture.run_materializer()?; + + assert_eq!( + manifest.pointer("/reproducibility_summary/ready_row_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + manifest + .pointer("/reproducibility_summary/missing_field_counts/product_commit") + .and_then(Value::as_u64), + Some(1) + ); + + let gap = only_product_commit_gap(&manifest)?; + + assert_eq!( + gap.pointer("/reproducibility/runtime_source_attestation/status").and_then(Value::as_str), + Some("missing") + ); + assert_eq!( + gap.pointer("/reproducibility/runtime_source_attestation/reason").and_then(Value::as_str), + Some("missing_runtime_source_attestation") + ); + assert!(support::array_at(gap, "/reproducibility/rejected_product_commit_values")?.iter().any( + |item| item.pointer("/value").and_then(Value::as_str) == Some(HONCHO_COMMIT) + && item.pointer("/reason").and_then(Value::as_str) + == Some("missing_runtime_source_attestation") + )); + + Ok(()) +} + +#[test] +fn quantitative_freshness_rejects_non_pass_attestation_without_reason() -> Result<()> { + let fixture = FreshnessFixture::new( + "elf-non-pass-attestation", + &honcho_product_manifest(honcho_runtime_attestation("fail", None)), + )?; + let manifest = fixture.run_materializer()?; + + assert_eq!( + manifest.pointer("/reproducibility_summary/ready_row_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + manifest + .pointer("/reproducibility_summary/missing_field_counts/product_commit") + .and_then(Value::as_u64), + Some(1) + ); + + let gap = only_product_commit_gap(&manifest)?; + + assert_eq!( + gap.pointer("/reproducibility/runtime_source_attestation/status").and_then(Value::as_str), + Some("fail") + ); + assert_eq!( + gap.pointer("/reproducibility/runtime_source_attestation/reason").and_then(Value::as_str), + Some("runtime_source_attestation_not_pass") + ); + assert!( + 
support::array_at(gap, "/reproducibility/rejected_product_commit_values")? + .iter() + .any(|item| item.pointer("/reason").and_then(Value::as_str) + == Some("runtime_source_attestation_not_pass")) + ); + + Ok(()) +} + +fn honcho_product_manifest(runtime_source_attestation: Value) -> Value { + serde_json::json!({ + "schema": "elf.agent_memory_quantitative_product_manifest/v1", + "manifest_id": "honcho-test-product-manifest", + "product": "Honcho", + "adapter_id": "honcho_live_real_world", + "corpus_id": "test-corpus", + "rows": [{ + "product": "Honcho", + "adapter_id": "honcho_live_real_world", + "adapter_name": "Honcho live adapter", + "suite": "retrieval", + "evidence_class": "live_real_world", + "result_state": "wrong_result", + "leaderboard_eligible": false, + "metric_comparable": false, + "product_commit": HONCHO_COMMIT, + "product_commit_source": "git.rev_parse_head:honcho_source_dir", + "runtime_source_attestation": runtime_source_attestation + }] + }) +} + +fn honcho_runtime_attestation(status: &str, reason: Option<&str>) -> Value { + let mut attestation = serde_json::json!({ + "status": status, + "product_commit": HONCHO_COMMIT, + "runtime_executed": true, + "source_checkout_used": true, + "runtime_artifact": "tmp/honcho/product-manifest.json" + }); + + if let Some(reason) = reason { + attestation["reason"] = serde_json::json!(reason); + } + + attestation +} + +fn only_freshness_row(manifest: &Value) -> Result<&Value> { + let inputs = support::array_at(manifest, "/combined_inputs")?; + let input = inputs.first().ok_or_else(|| eyre::eyre!("missing freshness input"))?; + let rows = support::array_at(input, "/rows")?; + + rows.first().ok_or_else(|| eyre::eyre!("missing freshness row")) +} + +fn only_product_commit_gap(manifest: &Value) -> Result<&Value> { + let gaps = support::array_at(manifest, "/product_commit_gap_rows")?; + + gaps.first().ok_or_else(|| eyre::eyre!("missing product commit gap row")) +} diff --git 
a/docs/spec/agent_memory_quantitative_benchmark_v1.md b/docs/spec/agent_memory_quantitative_benchmark_v1.md index 265a71c1..c7396502 100644 --- a/docs/spec/agent_memory_quantitative_benchmark_v1.md +++ b/docs/spec/agent_memory_quantitative_benchmark_v1.md @@ -18,8 +18,10 @@ code_refs: - makefiles/benchmark-memory-a.toml - makefiles/benchmark-memory-b.toml - scripts/materialize-explicit-qrels.py + - scripts/materialize-quantitative-artifact-freshness.py - scripts/real-world-explicit-qrels.sh - scripts/real-world-docker.sh + - scripts/real-world-quantitative-docker.sh - scripts/real-world-live-explicit-qrels.sh - apps/elf-eval/src/app.rs - apps/elf-eval/src/bin/real_world_job_benchmark/main.rs @@ -36,8 +38,10 @@ drift_watch: - makefiles/benchmark-memory-a.toml - makefiles/benchmark-memory-b.toml - scripts/materialize-explicit-qrels.py + - scripts/materialize-quantitative-artifact-freshness.py - scripts/real-world-explicit-qrels.sh - scripts/real-world-docker.sh + - scripts/real-world-quantitative-docker.sh - scripts/real-world-live-explicit-qrels.sh - docs/spec/agent_memory_knowledge_system_v1.md - docs/spec/real_world_agent_memory_benchmark_v1.md @@ -84,6 +88,24 @@ Every quantitative row must declare one evidence class: | `research_gate` | Research-only, blocked, or reference-only evidence. | No. | | `mixed_evidence` | Aggregate row blends multiple evidence classes. | No; split rows before leaderboard use. | +## Artifact Freshness And Public Reproducibility + +Public reproducibility is a separate gate from metric quality. A quantitative row +may support a public reproducibility claim only when its provenance carries the +aggregate command, Docker runner, Compose file, repository head, environment +profile, input product-manifest SHA-256, artifact digest, container image digest, +and structured 40-hex product source commit. 
+ +`cargo make real-world-memory-quantitative-docker` runs the Docker-owned aggregate +entrypoint and fails before aggregate work starts if the `baseline-runner` image +digest is absent or malformed. `scripts/materialize-quantitative-artifact-freshness.py` +then materializes the row-level freshness manifest for Docker-contained +quantitative product manifests. Runtime-sensitive rows such as Honcho, Letta, and +RAGFlow must not accept a product commit unless a matching +`runtime_source_attestation` proves the pinned checkout or image revision was the +runtime that emitted the row. A non-pass attestation remains a failure even when a +commit-shaped value is present. + ## Result States Every row must declare one result state: diff --git a/makefiles/benchmark-memory-b.toml b/makefiles/benchmark-memory-b.toml index 3b47da39..79819a43 100644 --- a/makefiles/benchmark-memory-b.toml +++ b/makefiles/benchmark-memory-b.toml @@ -414,6 +414,14 @@ args = [ "tmp/real-world-memory/project-decisions/report.md", ] +[tasks.real-world-memory-quantitative-docker] +workspace = false +command = "bash" +args = [ + "scripts/real-world-docker.sh", + "memory-quantitative-docker", +] + [tasks.real-world-memory-quantitative-scoreboard] workspace = false dependencies = [ diff --git a/scripts/materialize-quantitative-artifact-freshness.py b/scripts/materialize-quantitative-artifact-freshness.py new file mode 100644 index 00000000..8dc7b6b0 --- /dev/null +++ b/scripts/materialize-quantitative-artifact-freshness.py @@ -0,0 +1,518 @@ +#!/usr/bin/env python3 +"""Materialize provenance gates for a Docker-contained quantitative aggregate.""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import os +import subprocess +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + + +SCHEMA = "elf.quantitative_artifact_freshness_manifest/v1" +PRODUCT_MANIFEST_SCHEMA = "elf.agent_memory_quantitative_product_manifest/v1" +BENCHMARK_COMMAND = 
"cargo make real-world-memory-quantitative-docker" +BENCHMARK_RUNNER = "docker-compose.baseline.yml baseline-runner" +BENCHMARK_COMPOSE_FILE = "docker-compose.baseline.yml" +REPO_ROOT = Path(__file__).resolve().parents[1] + +REPRODUCIBILITY_REQUIRED_FIELDS = [ + "command", + "runner", + "compose_file", + "repository_head", + "env_profile", + "input_manifest_sha256", + "artifact_digests", + "container_image_digest", + "product_commit", +] + +PRODUCT_COMMIT_FIELDS = [ + "product_commit", + "product_revision", + "source_revision", + "repository_commit", + "git_commit", + "commit", + "revision", +] + +RUNTIME_SOURCE_ATTESTATION_REQUIREMENTS = { + "honcho_live_real_world": ( + "Honcho product_commit may satisfy reproducibility only when the artifact " + "attests that the pinned Honcho checkout was the runtime that emitted the row." + ), + "letta_research_gate": ( + "Letta product_commit may satisfy reproducibility only when the artifact attests " + "that the pinned Letta server/runtime checkout was the runtime that emitted the row." + ), + "ragflow_docker_evidence_smoke": ( + "RAGFlow product_commit may satisfy reproducibility only when the artifact attests " + "that the pinned RAGFlow checkout or image revision emitted retrieval outputs." + ), + "ragflow_research_gate": ( + "RAGFlow product_commit may satisfy reproducibility only when the artifact attests " + "that the pinned RAGFlow checkout or image revision emitted retrieval outputs." 
+ ), +} + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--sync-log", required=True, type=Path) + parser.add_argument("--combined-product-manifest", required=True, type=Path) + parser.add_argument("--out", required=True, type=Path) + parser.add_argument("--operational-evidence-manifest", type=Path) + parser.add_argument("--run-live-explicit-qrels", required=True) + parser.add_argument("--run-langgraph", required=True) + return parser.parse_args() + + +def read_json(path: Path) -> dict[str, Any]: + with path.open(encoding="utf-8") as handle: + return json.load(handle) + + +def write_json(path: Path, value: dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as handle: + json.dump(value, handle, indent=2, sort_keys=True) + handle.write("\n") + + +def resolve_artifact_path(path: str | Path) -> Path: + raw = Path(path) + if raw.is_absolute(): + return raw + return REPO_ROOT / raw + + +def sync_log_rows(path: Path) -> list[dict[str, str]]: + rows: list[dict[str, str]] = [] + with path.open(encoding="utf-8") as handle: + for line_number, line in enumerate(handle, start=1): + fields = line.rstrip("\n").split("\t") + if len(fields) < 4: + raise SystemExit(f"{path}:{line_number} has {len(fields)} fields; expected >=4") + rows.append( + { + "action": fields[0], + "label": fields[1], + "path": fields[2], + "source_kind": fields[3] or "unknown", + } + ) + return rows + + +def combined_inputs(rows: list[dict[str, str]]) -> list[dict[str, str]]: + return [row for row in rows if row["action"] == "combined-input"] + + +def sha256_file(path: Path) -> str | None: + if not path.is_file(): + return None + digest = hashlib.sha256() + with path.open("rb") as handle: + for chunk in iter(lambda: handle.read(1024 * 1024), b""): + digest.update(chunk) + return digest.hexdigest() + + +def valid_git_commit(value: Any) -> str | None: + if not 
isinstance(value, str): + return None + commit = value.strip() + if len(commit) == 40 and all(char in "0123456789abcdefABCDEF" for char in commit): + return commit.lower() + return None + + +def git_head() -> str | None: + env_head = valid_git_commit(os.environ.get("ELF_REAL_WORLD_QUANTITATIVE_REPOSITORY_HEAD")) + if env_head is not None: + return env_head + try: + output = subprocess.run( + ["git", "rev-parse", "HEAD"], + cwd=REPO_ROOT, + check=True, + capture_output=True, + text=True, + ) + except (OSError, subprocess.CalledProcessError): + return None + return valid_git_commit(output.stdout.strip()) + + +def env_profile(args: argparse.Namespace) -> dict[str, str]: + return { + "run_live_explicit_qrels": args.run_live_explicit_qrels, + "run_langgraph": args.run_langgraph, + "audit_profile": os.environ.get("ELF_REAL_WORLD_LIVE_AUDIT_PROFILE", "explicit-qrels-locked"), + "checked_in_sync_enabled": os.environ.get("ELF_REAL_WORLD_QUANTITATIVE_SYNC_CHECKED_IN", "1"), + } + + +def valid_sha256_digest(value: str | None) -> str | None: + if not value: + return None + digest = value.strip().removeprefix("sha256:") + if len(digest) == 64 and all(char in "0123456789abcdefABCDEF" for char in digest): + return f"sha256:{digest.lower()}" + return None + + +def runner_image_digest() -> tuple[str | None, str | None]: + for env_name in ( + "ELF_BASELINE_RUNNER_IMAGE_DIGEST", + "ELF_REAL_WORLD_QUANTITATIVE_RUNNER_IMAGE_DIGEST", + ): + digest = valid_sha256_digest(os.environ.get(env_name)) + if digest is not None: + return digest, f"env:{env_name}" + return None, None + + +def first_present(mapping: dict[str, Any], keys: list[str]) -> Any: + for key in keys: + value = mapping.get(key) + if value not in (None, "", []): + return value + return None + + +def structured_product_commit( + mapping: dict[str, Any] | None, + *, + scope: str, +) -> tuple[str | None, str | None, list[dict[str, str]]]: + if mapping is None: + return None, None, [] + + rejected = [] + for field in 
PRODUCT_COMMIT_FIELDS: + value = mapping.get(field) + if value in (None, "", []): + continue + commit = valid_git_commit(value) + if commit is not None: + source = first_present( + mapping, + [ + f"{field}_source", + "product_commit_source", + "product_revision_source", + "source_revision_source", + "repository_commit_source", + "git_commit_source", + "commit_source", + "revision_source", + ], + ) + if not isinstance(source, str) or not source.strip(): + source = f"{scope}.{field}" + return commit, source.strip(), rejected + rejected.append( + { + "scope": scope, + "field": field, + "value": str(value)[:200], + "reason": "not_40_hex_git_commit", + } + ) + return None, None, rejected + + +def runtime_source_attestation( + row: dict[str, Any] | None, + manifest: dict[str, Any] | None, +) -> tuple[dict[str, Any] | None, str | None]: + for scope, mapping in (("row", row), ("manifest", manifest)): + if isinstance(mapping, dict) and isinstance(mapping.get("runtime_source_attestation"), dict): + return mapping["runtime_source_attestation"], f"{scope}.runtime_source_attestation" + return None, None + + +def runtime_source_attestation_status( + *, + adapter_id: str | None, + product_commit: str | None, + row: dict[str, Any] | None, + manifest: dict[str, Any] | None, +) -> dict[str, Any] | None: + claim_boundary = RUNTIME_SOURCE_ATTESTATION_REQUIREMENTS.get(adapter_id or "") + if claim_boundary is None: + return None + + status: dict[str, Any] = { + "required": True, + "status": "missing", + "required_fields": [ + "runtime_executed=true", + "source_checkout_used=true or image_revision_label_verified=true", + "commit matches product_commit", + "runtime_artifact", + ], + "claim_boundary": claim_boundary, + } + attestation, source = runtime_source_attestation(row, manifest) + if attestation is None: + status["reason"] = "missing_runtime_source_attestation" + return status + + status["source"] = source + status["status"] = "fail" + if attestation.get("status") != "pass": + 
reason = attestation.get("reason") + status["reason"] = ( + reason if isinstance(reason, str) and reason.strip() else "runtime_source_attestation_not_pass" + ) + return status + + attested_commit = valid_git_commit( + attestation.get("product_commit") + or attestation.get("commit") + or attestation.get("repository_commit") + or attestation.get("source_revision") + ) + if attested_commit is None: + status["reason"] = "missing_attested_40_hex_commit" + return status + if product_commit is None or attested_commit != product_commit: + status["reason"] = "attested_commit_does_not_match_product_commit" + status["attested_commit"] = attested_commit + return status + if attestation.get("runtime_executed") is not True: + status["reason"] = "runtime_executed_not_true" + return status + if ( + attestation.get("source_checkout_used") is not True + and attestation.get("image_revision_label_verified") is not True + ): + status["reason"] = "runtime_source_binding_not_verified" + return status + if not isinstance(attestation.get("runtime_artifact"), str) or not attestation["runtime_artifact"].strip(): + status["reason"] = "missing_runtime_artifact" + return status + + status["status"] = "pass" + status["reason"] = "runtime_source_attestation_passed" + status["attested_commit"] = attested_commit + return status + + +def missing_reproducibility_fields(record: dict[str, Any]) -> list[str]: + return [ + field + for field in REPRODUCIBILITY_REQUIRED_FIELDS + if record.get(field) in (None, "", []) + ] + + +def build_reproducibility_record( + *, + args: argparse.Namespace, + repository_head: str | None, + entry: dict[str, str], + manifest: dict[str, Any] | None, + row: dict[str, Any] | None, + input_manifest_sha256: str | None, +) -> dict[str, Any]: + row_commit, row_commit_source, rejected = structured_product_commit(row, scope="row") + manifest_commit, manifest_commit_source, manifest_rejected = structured_product_commit( + manifest, + scope="manifest", + ) + 
rejected.extend(manifest_rejected) + + product_commit = row_commit or manifest_commit + product_commit_source = row_commit_source or manifest_commit_source + adapter_id = str((row or {}).get("adapter_id") or (manifest or {}).get("adapter_id") or "") + attestation_status = runtime_source_attestation_status( + adapter_id=adapter_id, + product_commit=product_commit, + row=row, + manifest=manifest, + ) + if product_commit is not None and attestation_status is not None and attestation_status["status"] != "pass": + rejected.append( + { + "scope": "runtime_source_attestation", + "field": "product_commit", + "value": product_commit, + "reason": str(attestation_status.get("reason")), + } + ) + product_commit = None + product_commit_source = None + + container_digest, container_digest_source = runner_image_digest() + artifact_digests = [] + if input_manifest_sha256 is not None: + artifact_digests.append( + { + "kind": "quantitative_product_manifest", + "path": entry["path"], + "sha256": input_manifest_sha256, + } + ) + + record: dict[str, Any] = { + "command": BENCHMARK_COMMAND, + "runner": BENCHMARK_RUNNER, + "compose_file": BENCHMARK_COMPOSE_FILE, + "repository_head": repository_head, + "env_profile": env_profile(args), + "source_kind": entry["source_kind"], + "input_manifest": entry["path"], + "input_manifest_sha256": input_manifest_sha256, + "artifact_digests": artifact_digests, + "container_image_digest": container_digest, + "product_commit": product_commit, + "product_commit_source": product_commit_source, + } + if container_digest_source is not None: + record["container_image_digest_source"] = container_digest_source + if rejected: + record["rejected_product_commit_values"] = rejected + if attestation_status is not None: + record["runtime_source_attestation"] = attestation_status + record["missing_fields"] = missing_reproducibility_fields(record) + record["public_reproducible"] = not record["missing_fields"] + return record + + +def load_product_manifest(path: Path) 
-> dict[str, Any]: + manifest = read_json(resolve_artifact_path(path)) + if manifest.get("schema") != PRODUCT_MANIFEST_SCHEMA: + raise SystemExit(f"{path} has unsupported schema {manifest.get('schema')!r}") + return manifest + + +def row_key(row: dict[str, Any]) -> tuple[Any, Any]: + return row.get("product"), row.get("adapter_id") + + +def main() -> None: + args = parse_args() + inputs = combined_inputs(sync_log_rows(args.sync_log)) + if not inputs: + raise SystemExit(f"{args.sync_log} has no combined-input rows") + + combined = load_product_manifest(args.combined_product_manifest) + combined_rows = combined.get("rows", []) + combined_keys = {row_key(row) for row in combined_rows} + repository_head = git_head() + missing_field_counts: dict[str, int] = {} + ready_row_count = 0 + input_row_count = 0 + missing_from_combined = [] + product_commit_gap_rows = [] + combined_input_records = [] + + for entry in inputs: + path = resolve_artifact_path(entry["path"]) + input_digest = sha256_file(path) + manifest = load_product_manifest(path) + rows = [] + for row in manifest.get("rows", []): + input_row_count += 1 + reproducibility = build_reproducibility_record( + args=args, + repository_head=repository_head, + entry=entry, + manifest=manifest, + row=row, + input_manifest_sha256=input_digest, + ) + for field in reproducibility["missing_fields"]: + missing_field_counts[field] = missing_field_counts.get(field, 0) + 1 + if reproducibility["public_reproducible"]: + ready_row_count += 1 + + row_record = { + "label": entry["label"], + "source_kind": entry["source_kind"], + "product": row.get("product"), + "adapter_id": row.get("adapter_id"), + "evidence_class": row.get("evidence_class"), + "result_state": row.get("result_state"), + "leaderboard_eligible": bool(row.get("leaderboard_eligible")), + "metric_comparable": bool(row.get("metric_comparable")), + "present_in_combined_manifest": row_key(row) in combined_keys, + "reproducibility": reproducibility, + } + if not 
row_record["present_in_combined_manifest"]: + missing_from_combined.append(row_record) + if "product_commit" in reproducibility["missing_fields"]: + product_commit_gap_rows.append(row_record) + rows.append(row_record) + + combined_input_records.append( + { + **entry, + "status": "loaded", + "row_count": len(rows), + "input_manifest_sha256": input_digest, + "rows": rows, + } + ) + + manifest = { + "schema": SCHEMA, + "generated_at": datetime.now(UTC).isoformat().replace("+00:00", "Z"), + "status": "pass" if not missing_from_combined else "fail", + "run_live_explicit_qrels": args.run_live_explicit_qrels, + "run_langgraph": args.run_langgraph, + "policy": { + "combined_manifest_rows_must_have_input_provenance": True, + "public_reproducibility_requires_runner_digest": True, + "runtime_sensitive_product_commits_require_runtime_attestation": True, + }, + "combined_product_manifest": args.combined_product_manifest.as_posix(), + "combined_input_count": len(inputs), + "input_row_count": input_row_count, + "combined_row_count": len(combined_rows), + "missing_from_combined_count": len(missing_from_combined), + "missing_from_combined": missing_from_combined, + "product_commit_gap_rows": product_commit_gap_rows, + "reproducibility_summary": { + "state": ( + "public_reproducibility_ready" + if input_row_count > 0 and ready_row_count == input_row_count and not missing_field_counts + else "public_reproducibility_not_ready" + ), + "row_count": input_row_count, + "ready_row_count": ready_row_count, + "required_fields": REPRODUCIBILITY_REQUIRED_FIELDS, + "missing_field_counts": missing_field_counts, + "product_commit_gap_count": len(product_commit_gap_rows), + "public_reproducible_claim_allowed": ( + input_row_count > 0 and ready_row_count == input_row_count and not missing_field_counts + ), + "claim_boundary": ( + "A row may support a public reproducibility claim only when it carries " + "the aggregate command, Docker runner, compose file, repository head, " + "environment profile, 
input product-manifest SHA-256, artifact digest, " + "container image digest, and structured 40-hex product source commit provenance." + ), + }, + "combined_inputs": combined_input_records, + } + + write_json(args.out, manifest) + if manifest["status"] != "pass": + raise SystemExit( + "quantitative artifact freshness gate failed: " + f"{len(missing_from_combined)} input rows missing from combined manifest" + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/real-world-docker.sh b/scripts/real-world-docker.sh index 8afc80d5..d4ed947f 100755 --- a/scripts/real-world-docker.sh +++ b/scripts/real-world-docker.sh @@ -6,6 +6,28 @@ if [ -z "$profile" ]; then echo "usage: scripts/real-world-docker.sh <profile>" >&2 exit 2 fi +repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +build_baseline_runner_with_digest() { + docker compose -f docker-compose.baseline.yml build baseline-runner + + local image_id + image_id="$(docker compose -f docker-compose.baseline.yml images -q baseline-runner | head -n 1)" + if [ -z "$image_id" ]; then + echo "Unable to resolve baseline-runner image id after Docker Compose build." >&2 + exit 1 + fi + + case "$image_id" in + sha256:*) export ELF_BASELINE_RUNNER_IMAGE_DIGEST="$image_id" ;; + *) export ELF_BASELINE_RUNNER_IMAGE_DIGEST="sha256:${image_id}" ;; + esac + + if [[ ! 
"$ELF_BASELINE_RUNNER_IMAGE_DIGEST" =~ ^sha256:[0-9a-fA-F]{64}$ ]]; then + echo "Invalid baseline-runner image digest: ${ELF_BASELINE_RUNNER_IMAGE_DIGEST}" >&2 + exit 1 + fi +} case "$profile" in job-operator-ux-live-adapters) @@ -137,6 +159,17 @@ memory-live-explicit-qrels) -e ELF_REAL_WORLD_QMD_DIR \ baseline-runner bash scripts/real-world-live-explicit-qrels.sh ;; +memory-quantitative-docker) + build_baseline_runner_with_digest + docker compose -f docker-compose.baseline.yml run --rm \ + -e ELF_BASELINE_RUNNER_IMAGE_DIGEST \ + -e ELF_REAL_WORLD_QUANTITATIVE_REPORT_DIR \ + -e ELF_REAL_WORLD_QUANTITATIVE_LIVE_EXPLICIT_QRELS_DIR \ + -e ELF_REAL_WORLD_QUANTITATIVE_RUN_LIVE_EXPLICIT_QRELS \ + -e ELF_REAL_WORLD_QUANTITATIVE_RUN_LANGGRAPH \ + -e ELF_REAL_WORLD_QMD_DIR \ + baseline-runner bash scripts/real-world-quantitative-docker.sh + ;; *) echo "unknown real-world Docker profile: $profile" >&2 exit 2 diff --git a/scripts/real-world-quantitative-docker.sh b/scripts/real-world-quantitative-docker.sh new file mode 100755 index 00000000..547da752 --- /dev/null +++ b/scripts/real-world-quantitative-docker.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +REPORT_DIR="${ELF_REAL_WORLD_QUANTITATIVE_REPORT_DIR:-${ROOT_DIR}/tmp/real-world-memory/quantitative-docker}" +LIVE_QRELS_DIR="${ELF_REAL_WORLD_QUANTITATIVE_LIVE_EXPLICIT_QRELS_DIR:-${ROOT_DIR}/tmp/real-world-memory/live-explicit-qrels}" +LIVE_ADAPTER_DIR="${LIVE_QRELS_DIR}/live-adapters" +SYNC_LOG="${REPORT_DIR}/synced-artifacts.tsv" +QMD_PRODUCT_MANIFEST="${REPORT_DIR}/qmd-quantitative-product-manifest.json" +FRESHNESS_MANIFEST="${REPORT_DIR}/quantitative-artifact-freshness-manifest.json" +RUN_LIVE_EXPLICIT_QRELS="${ELF_REAL_WORLD_QUANTITATIVE_RUN_LIVE_EXPLICIT_QRELS:-1}" +RUN_LANGGRAPH="${ELF_REAL_WORLD_QUANTITATIVE_RUN_LANGGRAPH:-0}" +QMD_DIR="${ELF_REAL_WORLD_QMD_DIR:-/bench/repos/qmd}" + +if [[ ! 
-f "/.dockerenv" ]]; then + echo "Refusing to run the quantitative benchmark aggregate outside Docker." >&2 + echo "Use cargo make real-world-memory-quantitative-docker." >&2 + exit 1 +fi + +for cmd in bash cargo git jq python3; do + if ! command -v "${cmd}" >/dev/null 2>&1; then + echo "Missing ${cmd} in quantitative Docker benchmark runner." >&2 + exit 1 + fi +done + +require_runner_image_digest() { + local digest="${ELF_BASELINE_RUNNER_IMAGE_DIGEST:-}" + if [[ -z "${digest}" ]]; then + digest="${ELF_REAL_WORLD_QUANTITATIVE_RUNNER_IMAGE_DIGEST:-}" + fi + if [[ -z "${digest}" ]]; then + echo "Missing baseline-runner image digest before quantitative aggregate work starts." >&2 + echo "Use cargo make real-world-memory-quantitative-docker so scripts/real-world-docker.sh can pass ELF_BASELINE_RUNNER_IMAGE_DIGEST." >&2 + exit 1 + fi + if [[ ! "${digest}" =~ ^sha256:[0-9a-fA-F]{64}$ ]]; then + echo "Invalid baseline-runner image digest: ${digest}" >&2 + exit 1 + fi + export ELF_BASELINE_RUNNER_IMAGE_DIGEST="${digest}" + export ELF_REAL_WORLD_QUANTITATIVE_RUNNER_IMAGE_DIGEST="${digest}" +} + +annotate_product_manifest_from_git() { + local manifest="$1" + local repo_dir="$2" + local source="$3" + + if [[ ! 
-d "${repo_dir}/.git" ]]; then + return 0 + fi + + local commit + commit="$(git -C "${repo_dir}" rev-parse HEAD)" + jq \ + --arg commit "${commit}" \ + --arg source "${source}" \ + '.product_commit = $commit + | .product_commit_source = $source + | .rows |= map(.product_commit = $commit | .product_commit_source = $source)' \ + "${manifest}" >"${manifest}.tmp" + mv "${manifest}.tmp" "${manifest}" +} + +require_runner_image_digest + +cd "${ROOT_DIR}" +rm -rf "${REPORT_DIR}" +mkdir -p "${REPORT_DIR}" +: >"${SYNC_LOG}" + +if [[ "${RUN_LIVE_EXPLICIT_QRELS}" == "1" ]]; then + ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_REPORT_DIR="${LIVE_QRELS_DIR}" \ + bash scripts/real-world-live-explicit-qrels.sh +fi + +cargo run -p elf-eval --bin real_world_job_benchmark -- export-quantitative-product-manifest \ + --report "${LIVE_ADAPTER_DIR}/qmd-report.json" \ + --out "${QMD_PRODUCT_MANIFEST}" \ + --product qmd \ + --adapter-id qmd_live_real_world \ + --adapter-name "qmd live real-world CLI adapter" + +annotate_product_manifest_from_git "${QMD_PRODUCT_MANIFEST}" "${QMD_DIR}" "git.rev_parse_head:qmd_dir" + +printf 'combined-input\tqmd-live-explicit-qrels\t%s\t%s\n' \ + "${QMD_PRODUCT_MANIFEST}" \ + "current_docker_run" >>"${SYNC_LOG}" + +python3 scripts/materialize-quantitative-artifact-freshness.py \ + --sync-log "${SYNC_LOG}" \ + --combined-product-manifest "${QMD_PRODUCT_MANIFEST}" \ + --out "${FRESHNESS_MANIFEST}" \ + --run-live-explicit-qrels "${RUN_LIVE_EXPLICIT_QRELS}" \ + --run-langgraph "${RUN_LANGGRAPH}" + +echo "Quantitative Docker benchmark artifacts:" +echo " ${QMD_PRODUCT_MANIFEST}" +echo " ${FRESHNESS_MANIFEST}"