From 3e854be1291698f8398f305ab5ce87bd24a16dde Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Fri, 3 Jul 2026 05:01:10 -0400 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Implement source-backed memory quality benchmark harness","authority":"XY-1155"} --- .../context_pack_activation_quality.json | 284 ++++++++++++ .../src/bin/real_world_job_benchmark/cli.rs | 9 + .../bin/real_world_job_benchmark/commands.rs | 17 +- .../bin/real_world_job_benchmark/fixtures.rs | 20 + .../src/bin/real_world_job_benchmark/main.rs | 11 +- .../bin/real_world_job_benchmark/markdown.rs | 2 + .../markdown/source_backed_quality.rs | 76 ++++ .../real_world_job_benchmark/report_root.rs | 6 +- .../scoring/reports.rs | 4 +- .../source_backed_quality.rs | 415 ++++++++++++++++++ .../source_backed_quality_reports.rs | 64 +++ .../tests/real_world_job_benchmark.rs | 1 + .../runner.rs | 2 +- .../core_archival_context.rs | 6 +- .../root_aggregate_suites.rs | 2 +- .../root_aggregate_summary_counts.rs | 12 +- .../root_aggregate_summary_scoreboard.rs | 4 +- .../source_backed_quality.rs | 167 +++++++ ...source-backed-quality-benchmark-harness.md | 115 +++++ docs/evidence/benchmarking/index.md | 1 + docs/log.md | 5 + makefiles/benchmark-memory-b.toml | 67 +++ 22 files changed, 1271 insertions(+), 19 deletions(-) create mode 100644 apps/elf-eval/fixtures/real_world_memory/context_trajectory/context_pack_activation_quality.json create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/markdown/source_backed_quality.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/source_backed_quality.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/source_backed_quality_reports.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/source_backed_quality.rs create mode 100644 docs/evidence/benchmarking/2026-07-03-source-backed-quality-benchmark-harness.md diff --git a/apps/elf-eval/fixtures/real_world_memory/context_trajectory/context_pack_activation_quality.json b/apps/elf-eval/fixtures/real_world_memory/context_trajectory/context_pack_activation_quality.json new file mode 100644 index 00000000..897b7b1b --- /dev/null +++ b/apps/elf-eval/fixtures/real_world_memory/context_trajectory/context_pack_activation_quality.json @@ -0,0 +1,284 @@ +{ + "schema": "elf.real_world_job/v1", + "job_id": "context-pack-activation-quality-001", + "suite": "context_trajectory", + "title": "Activate Context Pack layers automatically without widening authority", + "corpus": { + "corpus_id": "real-world-memory-context-pack-2026-07-03", + "profile": "synthetic", + "items": [ + { + "evidence_id": "context-pack-routing-trace", + "kind": "recall_debug_trace", + "text": "Context Pack routing trace: memory_notes and source_documents were enabled for the current project; stale knowledge_pages were suppressed; manual disabled graph_facts stayed disabled; a pinned private dreaming_proposals layer remained blocked by read_profile.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "context_pack_activation_quality", + "evidence_id": "context-pack-routing-trace" + } + }, + "created_at": "2026-07-03T07:00:00Z" + }, + { + "evidence_id": "context-pack-current-source", + "kind": "source_document", + "text": "Current readable source: Context Packs are read-time views and pinning may change priority only after scope, read_profile, freshness, deletion, redaction, and authority gates pass.", + "source_ref": { + "schema": "source_ref/v1", + "resolver": "real_world_job_fixture/v1", + "ref": { + "fixture": "context_pack_activation_quality", + "evidence_id": "context-pack-current-source" + } + }, + "created_at": "2026-07-03T07:01:00Z" + }, + { + "evidence_id": "context-pack-private-dreaming-decoy", + "kind": "dreaming_proposal", + "text": "Private proposal decoy: a pinned dreaming proposal should be included even when read_profile is public.", + "source_ref": {}, + "created_at": "2026-07-03T07:02:00Z" + }, + { + "evidence_id": "context-pack-stale-knowledge-decoy", + "kind": "knowledge_page", + "text": "Stale knowledge decoy: an old Context Pack should override current source freshness.", + "source_ref": {}, + "created_at": "2026-06-01T07:02:00Z" + } + ], + "adapter_response": { + "adapter_id": "fixture_context_pack", + "answer": { + "content": "The Context Pack should auto-enable current readable memory/source layers, suppress stale knowledge, keep disabled graph layers disabled, and keep a pinned private dreaming proposal blocked by read_profile. The routing trace records enabled, suppressed, disabled, blocked, and pinned-ineligible states.", + "claims": [ + { + "claim_id": "context_pack_activation", + "text": "Context Pack routing enabled current readable layers and suppressed stale, disabled, and private pinned layers without widening authority.", + "evidence_ids": ["context-pack-routing-trace", "context-pack-current-source"], + "confidence": "high" + }, + { + "claim_id": "context_pack_trace", + "text": "The activation trace records enabled, suppressed, disabled, blocked, and pinned-ineligible states.", + "evidence_ids": ["context-pack-routing-trace"], + "confidence": "high" + } + ], + "evidence_ids": ["context-pack-routing-trace", "context-pack-current-source"], + "latency_ms": 1.2, + "cost": { + "currency": "USD", + "amount": 0.0, + "input_tokens": 0, + "output_tokens": 0 + } + } + } + }, + "timeline": [ + { + "event_id": "context-pack-route-recorded", + "ts": "2026-07-03T07:00:00Z", + "actor": "elf", + "action": "recorded_context_pack_trace", + "evidence_ids": ["context-pack-routing-trace", "context-pack-current-source"], + "summary": "The trace recorded automatic activation and suppression decisions for Context Pack v1." + } + ], + "prompt": { + "role": "user", + "content": "Which Context Pack layers should be active for this task, and why?", + "job_mode": "answer", + "constraints": [ + "cite_evidence", + "explain_context_pack_activation", + "do_not_leak_private_sources", + "do_not_treat_pinning_as_authority" + ] + }, + "expected_answer": { + "must_include": [ + { + "claim_id": "context_pack_activation", + "text": "Context Pack routing enabled current readable layers and suppressed stale, disabled, and private pinned layers without widening authority." + }, + { + "claim_id": "context_pack_trace", + "text": "The activation trace records enabled, suppressed, disabled, blocked, and pinned-ineligible states." + } + ], + "must_not_include": [ + "Pinned private dreaming proposals bypass read_profile.", + "Stale knowledge should override current sources.", + "Disabled graph layers should still be returned." + ], + "evidence_links": { + "context_pack_activation": ["context-pack-routing-trace", "context-pack-current-source"], + "context_pack_trace": ["context-pack-routing-trace"] + }, + "answer_type": "direct_answer", + "accepted_alternates": [], + "requires_caveat": false, + "requires_refusal": false + }, + "required_evidence": [ + { + "evidence_id": "context-pack-routing-trace", + "claim_id": "context_pack_trace", + "requirement": "cite", + "quote": "memory_notes and source_documents were enabled" + }, + { + "evidence_id": "context-pack-current-source", + "claim_id": "context_pack_activation", + "requirement": "cite", + "quote": "pinning may change priority only after scope" + } + ], + "negative_traps": [ + { + "trap_id": "pinned-private-dreaming-read-profile-bypass", + "type": "private_scope_leak", + "evidence_ids": ["context-pack-private-dreaming-decoy"], + "failure_if_used": true + }, + { + "trap_id": "stale-context-pack-knowledge-current", + "type": "stale_fact", + "evidence_ids": ["context-pack-stale-knowledge-decoy"], + "failure_if_used": true + } + ], + "scoring_rubric": { + "dimensions": { + "answer_correctness": { + "weight": 0.35, + "max_points": 1.0, + "criteria": "States the automatic activation and suppression decisions." + }, + "evidence_grounding": { + "weight": 0.3, + "max_points": 1.0, + "criteria": "Cites routing trace and current source evidence." + }, + "trap_avoidance": { + "weight": 0.25, + "max_points": 1.0, + "criteria": "Does not use private pinned or stale knowledge decoys." + }, + "workflow_helpfulness": { + "weight": 0.1, + "max_points": 1.0, + "criteria": "Explains the activation trace in operator-facing terms." + } + }, + "pass_threshold": 0.8, + "hard_fail_rules": [ + "unsupported high-confidence claim about a required decision or fact", + "use of a negative trap marked failure_if_used = true" + ] + }, + "allowed_uncertainty": { + "can_answer_unknown": false, + "acceptable_phrases": [], + "fallback_action": "state_blocker" + }, + "operator_debug": { + "failure_mode": "none_context_pack_activation_trace", + "trace_id": "11551155-0000-4000-8000-115511551155", + "viewer_url": "/viewer?trace_id=11551155-0000-4000-8000-115511551155", + "admin_trace_bundle_url": "/v2/admin/traces/11551155-0000-4000-8000-115511551155/bundle?mode=full", + "root_cause": "Context Pack routing selected current readable layers and suppressed stale, disabled, and private pinned candidates.", + "steps_to_root_cause": 2, + "raw_sql_needed": false, + "dropped_candidate_visibility": "visible in context_pack.routing_trace entries", + "trace_completeness": "complete", + "repair_action_clarity": "clear", + "trace_available": true, + "replay_command_available": true, + "replay_command": "cargo make source-backed-memory-quality", + "replay_artifact": "tmp/source-backed-memory-quality/report.json", + "viewer_panels": ["Context Pack", "Recall Debug", "Source Library"], + "cli_steps": ["run source-backed memory quality benchmark", "inspect source_backed_quality metrics"], + "trace_evidence": ["context-pack-routing-trace", "context-pack-current-source"], + "ux_gaps": [] + }, + "context_pack": { + "decisions": [ + { + "decision_id": "memory-notes-enabled-current-project", + "layer": "memory_notes", + "expected_state": "enabled", + "observed_state": "enabled", + "reason_code": "current_project_readable_memory", + "source_refs": ["context-pack-routing-trace"] + }, + { + "decision_id": "source-documents-enabled-current-project", + "layer": "source_documents", + "expected_state": "enabled", + "observed_state": "enabled", + "reason_code": "current_project_source_authority", + "source_refs": ["context-pack-current-source"] + }, + { + "decision_id": "irrelevant-pack-suppressed", + "layer": "context_pack", + "expected_state": "suppressed", + "observed_state": "suppressed", + "reason_code": "irrelevant_to_prompt", + "source_refs": ["context-pack-routing-trace"] + }, + { + "decision_id": "disabled-graph-facts-remain-disabled", + "layer": "graph_facts", + "expected_state": "disabled", + "observed_state": "disabled", + "reason_code": "manual_disable_preserved", + "source_refs": ["context-pack-routing-trace"] + }, + { + "decision_id": "stale-knowledge-suppressed", + "layer": "knowledge_pages", + "expected_state": "stale_suppressed", + "observed_state": "stale_suppressed", + "reason_code": "freshness_gate_failed", + "source_refs": ["context-pack-routing-trace"] + }, + { + "decision_id": "private-dreaming-blocked", + "layer": "dreaming_proposals", + "expected_state": "blocked", + "observed_state": "blocked", + "reason_code": "read_profile_gate_failed", + "source_refs": ["context-pack-routing-trace"] + }, + { + "decision_id": "pinned-private-dreaming-ineligible", + "layer": "dreaming_proposals", + "expected_state": "pinned_ineligible", + "observed_state": "pinned_ineligible", + "reason_code": "pin_priority_cannot_widen_read_profile", + "source_refs": ["context-pack-routing-trace"], + "pinned": true + } + ] + }, + "tags": [ + "synthetic", + "context_trajectory", + "context_pack_relevant_activation", + "context_pack_irrelevant_suppression", + "context_pack_disabled_suppression", + "context_pack_stale_suppression", + "read_profile_downgrade", + "pinned_pack_trap", + "recall_debug_reason_codes", + "recall_debug_privacy", + "no_live_claim" + ] +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs index bae29a2e..bb00571f 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs @@ -69,6 +69,13 @@ pub(super) struct PublishArgs { pub(super) out: Option, } +#[derive(Debug, Parser)] +pub(super) struct ValidateSourceBackedQualityArgs { + /// Generated real_world_job JSON report to gate. + #[arg(long, value_name = "FILE", default_value = DEFAULT_REPORT_PATH)] + pub(super) report: PathBuf, +} + #[derive(Debug, Parser)] pub(super) struct ExportQuantitativeProductManifestArgs { /// Generated real_world_job JSON report to export. @@ -136,4 +143,6 @@ pub(super) enum Command { Run(RunArgs), /// Render Markdown from a generated real_world_job JSON report. Publish(PublishArgs), + /// Fail unless the generated source-backed quality benchmark gate passes. + ValidateSourceBackedQuality(ValidateSourceBackedQualityArgs), } diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs index a151e6da..c1927ec6 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs @@ -2,7 +2,8 @@ use crate::{ AdapterReport, BTreeSet, CaptureIntegrationReport, CorpusProfile, ExportQuantitativeAuditManifestArgs, ExportQuantitativeProductManifestArgs, OffsetDateTime, Path, PathBuf, PrivateCorpusRedaction, PublishArgs, QuantitativeReportInput, REPORT_SCHEMA, - RealWorldJob, RealWorldReport, Result, Rfc3339, RunArgs, TypedStatus, VERSION, eyre, fs, + RealWorldJob, RealWorldReport, Result, Rfc3339, RunArgs, TypedStatus, VERSION, + ValidateSourceBackedQualityArgs, eyre, fs, }; pub(super) fn run_command(args: RunArgs) -> Result<()> { @@ -21,6 +22,17 @@ pub(super) fn publish_command(args: PublishArgs) -> Result<()> { write_or_print(args.out.as_deref(), markdown.as_str()) } +pub(super) fn validate_source_backed_quality_command( + args: ValidateSourceBackedQualityArgs, +) -> Result<()> { + let raw = fs::read_to_string(&args.report)?; + let report = serde_json::from_str::(&raw)?; + + crate::validate_source_backed_quality_gate(&report.source_backed_quality).map_err(|failures| { + eyre::eyre!("source-backed quality gate failed: {}", failures.join(", ")) + }) +} + pub(super) fn export_quantitative_product_manifest_command( args: ExportQuantitativeProductManifestArgs, ) -> Result<()> { @@ -124,6 +136,8 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result Result, pub(super) scheduled_memory: Option, pub(super) work_continuity: Option, + pub(super) context_pack: Option, } #[derive(Debug, Deserialize)] @@ -127,6 +128,25 @@ pub(super) struct NegativeTrap { pub(super) failure_if_used: bool, } +#[derive(Debug, Deserialize)] +pub(super) struct ContextPackExpectation { + #[serde(default)] + pub(super) decisions: Vec, +} + +#[derive(Debug, Deserialize)] +pub(super) struct ContextPackRoutingDecision { + pub(super) decision_id: String, + pub(super) layer: String, + pub(super) expected_state: String, + pub(super) observed_state: String, + pub(super) reason_code: String, + #[serde(default)] + pub(super) source_refs: Vec, + #[serde(default)] + pub(super) pinned: bool, +} + #[derive(Debug, Default, Deserialize)] pub(super) struct JobEncoding { pub(super) status: Option, diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs index dc77d8f0..918c9988 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs @@ -23,6 +23,8 @@ mod report_root; mod scoreboard; mod scoreboard_reports; mod scoring; +mod source_backed_quality; +mod source_backed_quality_reports; mod summary; mod summary_reports; mod validation; @@ -53,7 +55,7 @@ use artifacts::{ }; use cli::{ Args, Command, ExportQuantitativeAuditManifestArgs, ExportQuantitativeProductManifestArgs, - PublishArgs, RunArgs, + PublishArgs, RunArgs, ValidateSourceBackedQualityArgs, }; use diagnostic_reports::{ OperatorDebugEvidence, OperatorUxGap, TraceExplainability, TraceStageExplainability, @@ -106,6 +108,11 @@ use scoreboard_reports::{ ScoreboardRow, }; use scoring::{job_report, score_job}; +use source_backed_quality::{source_backed_quality_report, validate_source_backed_quality_gate}; +use source_backed_quality_reports::{ + SourceBackedContextPackDecisionCounts, SourceBackedQualityMetrics, SourceBackedQualityReport, + SourceBackedScenarioCoverage, +}; use summary::{evolution_summary, follow_up_reports, report_summary, suite_reports}; use summary_reports::{ ConsolidationSummaryReport, KnowledgeSummary, MemorySummaryReport, ProactiveBriefSummaryReport, @@ -187,5 +194,7 @@ fn main() -> Result<()> { commands::export_quantitative_product_manifest_command(args), Command::Run(args) => commands::run_command(args), Command::Publish(args) => commands::publish_command(args), + Command::ValidateSourceBackedQuality(args) => + commands::validate_source_backed_quality_command(args), } } diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs index 68bcb12a..5bf557be 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs @@ -8,6 +8,7 @@ mod jobs; mod operational; mod quantitative; mod scoreboard; +mod source_backed_quality; mod trace; use std::path::Path; @@ -33,6 +34,7 @@ pub(super) fn render_markdown(report: &RealWorldReport, report_path: &Path) -> S self::header::render_markdown_header(&mut out, report, report_path.as_str()); self::scoreboard::render_markdown_scoreboard(&mut out, report); + self::source_backed_quality::render_markdown_source_backed_quality(&mut out, report); self::quantitative::render_markdown_quantitative_scoreboard(&mut out, report); self::operational::render_markdown_operational_evidence(&mut out, report); self::adapters::render_markdown_external_adapters(&mut out, report); diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/source_backed_quality.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/source_backed_quality.rs new file mode 100644 index 00000000..8ad693d5 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/source_backed_quality.rs @@ -0,0 +1,76 @@ +use crate::{RealWorldReport, markdown}; + +pub(super) fn render_markdown_source_backed_quality(out: &mut String, report: &RealWorldReport) { + let quality = &report.source_backed_quality; + + if quality.schema.is_empty() { + return; + } + + out.push_str("## Source-Backed Memory Quality\n\n"); + out.push_str(concat!( + "This gate reports the source-backed memory quality metrics required by XY-1155. ", + "Hard-fail leak counters must be zero, and scenario coverage preserves typed ", + "non-pass states instead of converting missing evidence into wins.\n\n" + )); + out.push_str(&format!("- Schema: `{}`\n", markdown::md_inline(quality.schema.as_str()))); + out.push_str(&format!( + "- Result state: `{}`; hard-fail passed: `{}`\n", + markdown::md_inline(quality.result_state.as_str()), + quality.hard_fail_passed + )); + out.push_str(&format!( + "- Expected evidence recall: `{}`; precision@5: {}; source-ref coverage: `{}`\n", + markdown::round3(quality.metrics.expected_evidence_recall), + optional_metric(quality.metrics.precision_at_5), + markdown::round3(quality.metrics.source_ref_coverage) + )); + out.push_str(&format!( + "- Stale suppression: {}; correction persistence: {}; delete/tombstone suppression: {}\n", + optional_metric(quality.metrics.stale_suppression_rate), + optional_metric(quality.metrics.correction_persistence_rate), + optional_metric(quality.metrics.delete_tombstone_suppression_rate) + )); + out.push_str(&format!( + "- Unsupported claim rate: `{}`; cross-scope leaks: `{}`; journal-only authority claims: `{}`\n", + markdown::round3(quality.metrics.unsupported_claim_rate), + quality.metrics.cross_scope_leak_count, + quality.metrics.journal_only_authority_claim_count + )); + out.push_str(&format!( + "- Context Pack activation precision: {}; recall: {}; trace coverage: {}\n\n", + optional_metric(quality.metrics.context_pack_activation_precision), + optional_metric(quality.metrics.context_pack_activation_recall), + optional_metric(quality.metrics.activation_trace_coverage) + )); + out.push_str(&format!( + "- Context Pack decisions: `{}` total, `{}` traced, `{}` incorrect, `{}` enabled expected, `{}` suppressed/disabled/stale/blocked/pinned-ineligible expected\n\n", + quality.context_pack_decisions.total_decisions, + quality.context_pack_decisions.traced_decisions, + quality.context_pack_decisions.incorrect_decisions, + quality.context_pack_decisions.expected_enabled, + quality.context_pack_decisions.expected_suppressed + + quality.context_pack_decisions.expected_disabled + + quality.context_pack_decisions.expected_stale_suppressed + + quality.context_pack_decisions.expected_blocked + + quality.context_pack_decisions.expected_pinned_ineligible + )); + out.push_str("| Scenario | State | Jobs | Pass |\n"); + out.push_str("| --- | --- | ---: | ---: |\n"); + + for scenario in &quality.scenario_coverage { + out.push_str(&format!( + "| {} | `{}` | `{}` | `{}` |\n", + markdown::md_cell(scenario.scenario.as_str()), + markdown::md_inline(scenario.status.as_str()), + scenario.covered_job_count, + scenario.pass_count + )); + } + + out.push('\n'); +} + +fn optional_metric(value: Option) -> String { + value.map_or_else(|| "`n/a`".to_string(), |value| format!("`{}`", markdown::round3(value))) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/report_root.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/report_root.rs index 797eb2ba..375d2128 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/report_root.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/report_root.rs @@ -1,8 +1,8 @@ use crate::{ AdapterReport, CaptureIntegrationReport, Deserialize, EvolutionSummary, ExternalAdapterSection, FollowUpReport, JobReport, OperationalEvidenceReport, PrivateCorpusRedaction, - QuantitativeBenchmarkReport, ReportSummary, ScoreboardReport, Serialize, SuiteReport, - UnsupportedClaimReport, + QuantitativeBenchmarkReport, ReportSummary, ScoreboardReport, Serialize, + SourceBackedQualityReport, SuiteReport, UnsupportedClaimReport, }; #[derive(Debug, Deserialize, Serialize)] @@ -20,6 +20,8 @@ pub(super) struct RealWorldReport { #[serde(default)] pub(super) quantitative_scoreboard: QuantitativeBenchmarkReport, #[serde(default)] + pub(super) source_backed_quality: SourceBackedQualityReport, + #[serde(default)] pub(super) external_adapters: ExternalAdapterSection, pub(super) capture_integration: CaptureIntegrationReport, pub(super) summary: ReportSummary, diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/scoring/reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/scoring/reports.rs index 9e01d093..94816348 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/scoring/reports.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/scoring/reports.rs @@ -111,7 +111,7 @@ fn job_metrics(job: &RealWorldJob, answer: &ProducedAnswer) -> JobMetrics { .filter(|evidence| produced_evidence.contains(&evidence.evidence_id)) .count(); let stale_retrieval_count = trap_use_count(job, &produced_evidence, "stale_fact", answer); - let scope_violation_count = ["near_duplicate", "scope_leak"] + let scope_violation_count = ["near_duplicate", "scope_leak", "private_scope_leak"] .into_iter() .map(|trap_type| trap_use_count(job, &produced_evidence, trap_type, answer)) .sum(); @@ -145,7 +145,7 @@ fn source_ref_by_evidence(job: &RealWorldJob) -> BTreeMap<&str, &Value> { } fn is_scope_trap_type(trap_type: &str) -> bool { - matches!(trap_type, "near_duplicate" | "scope_leak") + matches!(trap_type, "near_duplicate" | "scope_leak" | "private_scope_leak") } fn trap_use_count( diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/source_backed_quality.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/source_backed_quality.rs new file mode 100644 index 00000000..9af4cc20 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/source_backed_quality.rs @@ -0,0 +1,415 @@ +use crate::{ + BTreeSet, JobReport, RealWorldJob, ReportSummary, ScoreboardReport, + SourceBackedContextPackDecisionCounts, SourceBackedQualityMetrics, SourceBackedQualityReport, + SourceBackedScenarioCoverage, TypedStatus, formatting, +}; + +const SOURCE_BACKED_QUALITY_SCHEMA: &str = "elf.source_backed_memory_quality_benchmark/v1"; +const REQUIRED_METRICS: &[&str] = &[ + "expected_evidence_recall", + "precision_at_5", + "irrelevant_context_ratio", + "source_ref_coverage", + "stale_suppression_rate", + "correction_persistence_rate", + "delete_tombstone_suppression_rate", + "unsupported_claim_rate", + "cross_scope_leak_count", + "journal_only_authority_claim_count", + "context_pack_activation_precision", + "context_pack_activation_recall", + "activation_trace_coverage", + "mean_latency_ms", + "total_cost", +]; +const REQUIRED_SCENARIOS: &[(&str, &[&str])] = &[ + ("correct_source_backed_recall", &["source_backed_recall", "source_library"]), + ("source_backed_memory_promotion", &["memory_candidate", "approved_memory"]), + ("stale_memory_suppression", &["stale_fact", "stale_suppression"]), + ("correction_persistence", &["correction_persistence"]), + ("superseded_memory_not_current", &["superseded", "archival_supersession"]), + ("delete_tombstone_suppression", &["delete", "tombstone"]), + ("cross_project_private_scope_leak_trap", &["scope_leak_trap", "privacy_leak"]), + ("read_profile_downgrade_trap", &["read_profile_downgrade"]), + ("pinned_pack_trap", &["pinned_pack_trap"]), + ("journal_only_current_fact_trap", &["journal_only_authority_trap", "janitor"]), + ("where_stopped_resume", &["where_stopped", "reset_resume"]), + ("dreaming_no_silent_mutation", &["dreaming_no_silent_mutation", "consolidation"]), + ("context_pack_relevant_auto_activation", &["context_pack_relevant_activation"]), + ("context_pack_irrelevant_suppression", &["context_pack_irrelevant_suppression"]), + ("context_pack_disabled_suppression", &["context_pack_disabled_suppression"]), + ("context_pack_stale_suppression", &["context_pack_stale_suppression"]), + ( + "derived_knowledge_stale_on_source_change", + &["changed_source_watch_rebuild", "watch_rebuild"], + ), + ("authoritative_revalidation", &["authoritative_revalidation", "qdrant_rebuild"]), + ("recall_debug_reason_codes", &["recall_debug_reason_codes", "operator_debug"]), + ("recall_debug_privacy", &["recall_debug_privacy", "redaction"]), +]; + +pub(super) fn source_backed_quality_report( + raw_jobs: &[RealWorldJob], + job_reports: &[JobReport], + summary: &ReportSummary, + scoreboard: &ScoreboardReport, +) -> SourceBackedQualityReport { + let first_elf_row = + scoreboard.rows.iter().find(|row| row.product_id == "elf" || row.product_name == "ELF"); + let lifecycle = first_elf_row.map(|row| &row.metrics.lifecycle); + let retrieval = first_elf_row.map(|row| &row.metrics.retrieval); + let work = summary.work_continuity.as_ref(); + let correction_jobs = tagged_job_reports(raw_jobs, job_reports, |job| { + has_any_tag(job, &["correction_persistence", "correction"]) + }); + let delete_jobs = raw_jobs + .iter() + .zip(job_reports.iter()) + .filter(|(job, _)| { + has_any_tag(job, &["delete", "tombstone"]) + || job + .memory_evolution + .as_ref() + .is_some_and(|evolution| !evolution.tombstone_evidence_ids.is_empty()) + }) + .map(|(_, report)| report) + .collect::>(); + let scenario_coverage = REQUIRED_SCENARIOS + .iter() + .map(|(scenario, tags)| scenario_coverage(raw_jobs, job_reports, scenario, tags)) + .collect::>(); + let context_pack_decisions = context_pack_decision_counts(raw_jobs); + let typed_result_states_present = typed_result_states_present(job_reports); + let cross_scope_leak_count = summary.scope_violation_count + summary.redaction_leak_count; + let journal_only_authority_claim_count = + work.map_or(0, |work| work.journal_only_authority_claim_count); + let mut hard_failures = Vec::new(); + + if cross_scope_leak_count > 0 { + hard_failures.push("cross_scope_leak_count_nonzero".to_string()); + } + if journal_only_authority_claim_count > 0 { + hard_failures.push("journal_only_authority_claim_count_nonzero".to_string()); + } + if context_pack_decisions.total_decisions == 0 { + hard_failures.push("context_pack_decisions_not_encoded".to_string()); + } + if context_pack_decisions.incorrect_decisions > 0 { + hard_failures.push(format!( + "context_pack_incorrect_decision_count:{}", + context_pack_decisions.incorrect_decisions + )); + } + + for report in required_job_reports(raw_jobs, job_reports) { + if !report.hard_fail_hits.is_empty() { + hard_failures.push(format!("required_job_hard_fail:{}", report.job_id)); + } + if !report.trap_ids_used.is_empty() { + hard_failures.push(format!("required_job_trap_used:{}", report.job_id)); + } + } + for scenario in &scenario_coverage { + if scenario.status != "pass" { + hard_failures.push(format!( + "required_scenario_non_pass:{}:{}", + scenario.scenario, scenario.status + )); + } + } + + SourceBackedQualityReport { + schema: SOURCE_BACKED_QUALITY_SCHEMA.to_string(), + metric_basis: "real_world_job_benchmark_fixture_and_product_runtime_rows".to_string(), + result_state: if hard_failures.is_empty() { "pass" } else { "not_encoded" }.to_string(), + hard_fail_passed: hard_failures.is_empty(), + hard_failures, + required_metric_names: REQUIRED_METRICS + .iter() + .map(|metric| (*metric).to_string()) + .collect(), + metrics: SourceBackedQualityMetrics { + expected_evidence_recall: summary.expected_evidence_recall, + precision_at_5: retrieval.and_then(|retrieval| retrieval.precision_at_k), + irrelevant_context_ratio: summary.irrelevant_context_ratio, + source_ref_coverage: summary.source_ref_coverage, + stale_suppression_rate: lifecycle.and_then(|lifecycle| lifecycle.stale_suppression), + correction_persistence_rate: pass_rate(correction_jobs.as_slice()), + delete_tombstone_suppression_rate: pass_rate(delete_jobs.as_slice()), + unsupported_claim_rate: formatting::round3( + summary.unsupported_claim_count as f64 / summary.job_count.max(1) as f64, + ), + cross_scope_leak_count, + journal_only_authority_claim_count, + context_pack_activation_precision: context_pack_activation_precision( + &context_pack_decisions, + ), + context_pack_activation_recall: context_pack_activation_recall(&context_pack_decisions), + activation_trace_coverage: context_pack_trace_coverage(&context_pack_decisions), + mean_latency_ms: summary.mean_latency_ms, + total_cost: summary.total_cost.clone(), + }, + context_pack_decisions, + scenario_coverage, + typed_result_states_present, + artifact_policy: concat!( + "pass means executable fixture/product-runtime evidence exists; wrong_result, ", + "incomplete, blocked, not_tested, not_encoded, and unsupported_claim remain typed ", + "non-pass evidence and must not be collapsed into wins." + ) + .to_string(), + } +} + +pub(super) fn validate_source_backed_quality_gate( + report: &SourceBackedQualityReport, +) -> Result<(), Vec> { + let mut failures = Vec::new(); + + if report.schema != SOURCE_BACKED_QUALITY_SCHEMA { + failures.push(format!("unexpected_schema:{}", report.schema)); + } + if report.result_state != "pass" { + failures.push(format!("result_state_non_pass:{}", report.result_state)); + } + if !report.hard_fail_passed { + failures.push("hard_fail_passed_false".to_string()); + } + if !report.hard_failures.is_empty() { + failures + .extend(report.hard_failures.iter().map(|failure| format!("hard_failure:{failure}"))); + } + + for metric in REQUIRED_METRICS { + if !report.required_metric_names.iter().any(|name| name == metric) { + failures.push(format!("missing_required_metric:{metric}")); + } + } + for scenario in &report.scenario_coverage { + if scenario.required && scenario.status != "pass" { + failures.push(format!( + "required_scenario_non_pass:{}:{}", + scenario.scenario, scenario.status + )); + } + } + + if report.context_pack_decisions.total_decisions == 0 { + failures.push("context_pack_decisions_not_encoded".to_string()); + } + if report.context_pack_decisions.incorrect_decisions > 0 { + failures.push(format!( + "context_pack_incorrect_decision_count:{}", + report.context_pack_decisions.incorrect_decisions + )); + } + + if failures.is_empty() { Ok(()) } else { Err(failures) } +} + +fn scenario_coverage( + raw_jobs: &[RealWorldJob], + job_reports: &[JobReport], + scenario: &str, + tags: &[&str], +) -> SourceBackedScenarioCoverage { + let reports = raw_jobs + .iter() + .zip(job_reports.iter()) + .filter(|(job, _)| has_any_tag(job, tags)) + .map(|(_, report)| report) + .collect::>(); + let pass_count = reports.iter().filter(|report| report.status == TypedStatus::Pass).count(); + let status = if reports.is_empty() { + "not_encoded" + } else if pass_count == reports.len() { + "pass" + } else if reports.iter().any(|report| report.status == TypedStatus::WrongResult) { + "wrong_result" + } else if reports.iter().any(|report| report.status == TypedStatus::Blocked) { + "blocked" + } else if reports.iter().any(|report| report.status == TypedStatus::Incomplete) { + "incomplete" + } else if reports.iter().any(|report| report.status == TypedStatus::UnsupportedClaim) { + "unsupported_claim" + } else { + "not_encoded" + }; + + SourceBackedScenarioCoverage { + scenario: scenario.to_string(), + status: status.to_string(), + covered_job_count: reports.len(), + pass_count, + required: true, + } +} + +fn tagged_job_reports<'a>( + raw_jobs: &'a [RealWorldJob], + job_reports: &'a [JobReport], + predicate: impl Fn(&RealWorldJob) -> bool, +) -> Vec<&'a JobReport> { + raw_jobs + .iter() + .zip(job_reports.iter()) + .filter(|(job, _)| predicate(job)) + .map(|(_, report)| report) + .collect() +} + +fn has_any_tag(job: &RealWorldJob, tags: &[&str]) -> bool { + job.tags.iter().any(|tag| tags.iter().any(|expected| tag == expected)) +} + +fn pass_rate(reports: &[&JobReport]) -> Option { + if reports.is_empty() { + None + } else { + let pass_count = reports.iter().filter(|report| report.status == TypedStatus::Pass).count(); + + Some(formatting::round3(pass_count as f64 / reports.len() as f64)) + } +} + +fn required_job_reports<'a>( + raw_jobs: &'a [RealWorldJob], + job_reports: &'a [JobReport], +) -> Vec<&'a JobReport> { + raw_jobs + .iter() + .zip(job_reports.iter()) + .filter(|(job, _)| { + REQUIRED_SCENARIOS + .iter() + .any(|(_, tags)| tags.iter().any(|tag| has_any_tag(job, &[*tag]))) + }) + .map(|(_, report)| report) + .collect() +} + +fn context_pack_decision_counts( + raw_jobs: &[RealWorldJob], +) -> SourceBackedContextPackDecisionCounts { + let mut counts = SourceBackedContextPackDecisionCounts::default(); + + for job in raw_jobs.iter().filter(|job| { + job.tags.iter().any(|tag| tag.starts_with("context_pack_")) || job.context_pack.is_some() + }) { + let Some(context_pack) = job.context_pack.as_ref() else { + continue; + }; + + for decision in &context_pack.decisions { + let correct = decision.expected_state == decision.observed_state; + + counts.total_decisions += 1; + + if decision.decision_id.trim().is_empty() || decision.layer.trim().is_empty() { + counts.incorrect_decisions += 1; + } + if !correct { + counts.incorrect_decisions += 1; + } + if !decision.reason_code.trim().is_empty() && !decision.source_refs.is_empty() { + counts.traced_decisions += 1; + } + + match decision.expected_state.as_str() { + "enabled" => { + counts.expected_enabled += 1; + + if correct { + counts.correct_enabled += 1; + } + }, + "suppressed" => { + counts.expected_suppressed += 1; + + if correct { + counts.correct_suppressed += 1; + } + }, + "disabled" => { + counts.expected_disabled += 1; + + if correct { + counts.correct_disabled += 1; + } + }, + "stale_suppressed" => { + counts.expected_stale_suppressed += 1; + + if correct { + counts.correct_stale_suppressed += 1; + } + }, + "blocked" => { + counts.expected_blocked += 1; + + if correct { + counts.correct_blocked += 1; + } + }, + "pinned_ineligible" => { + counts.expected_pinned_ineligible += 1; + + if correct && decision.pinned { + counts.correct_pinned_ineligible += 1; + } + }, + _ => counts.incorrect_decisions += 1, + } + + if decision.observed_state == "enabled" { + counts.observed_enabled += 1; + } + } + } + + counts +} + +fn context_pack_activation_precision( + counts: &SourceBackedContextPackDecisionCounts, +) -> Option { + if counts.observed_enabled == 0 { + None + } else { + Some(formatting::round3(counts.correct_enabled as f64 / counts.observed_enabled as f64)) + } +} + +fn context_pack_activation_recall(counts: &SourceBackedContextPackDecisionCounts) -> Option { + if counts.expected_enabled == 0 { + None + } else { + Some(formatting::round3(counts.correct_enabled as f64 / counts.expected_enabled as f64)) + } +} + +fn context_pack_trace_coverage(counts: &SourceBackedContextPackDecisionCounts) -> Option { + if counts.total_decisions == 0 { + None + } else { + Some(formatting::round3(counts.traced_decisions as f64 / counts.total_decisions as f64)) + } +} + +fn typed_result_states_present(job_reports: &[JobReport]) -> Vec { + job_reports + .iter() + .map(|report| match report.status { + TypedStatus::Pass => "pass", + TypedStatus::WrongResult => "wrong_result", + TypedStatus::LifecycleFail => "lifecycle_fail", + TypedStatus::Incomplete => "incomplete", + TypedStatus::Blocked => "blocked", + TypedStatus::NotEncoded => "not_encoded", + TypedStatus::UnsupportedClaim => "unsupported_claim", + }) + .map(str::to_string) + .collect::>() + .into_iter() + .collect() +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/source_backed_quality_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/source_backed_quality_reports.rs new file mode 100644 index 00000000..b9995a69 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/source_backed_quality_reports.rs @@ -0,0 +1,64 @@ +use crate::{CostReport, Deserialize, Serialize}; + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct SourceBackedQualityReport { + pub(crate) schema: String, + pub(crate) metric_basis: String, + pub(crate) result_state: String, + pub(crate) hard_fail_passed: bool, + pub(crate) hard_failures: Vec, + pub(crate) required_metric_names: Vec, + pub(crate) metrics: SourceBackedQualityMetrics, + pub(crate) context_pack_decisions: SourceBackedContextPackDecisionCounts, + pub(crate) scenario_coverage: Vec, + pub(crate) typed_result_states_present: Vec, + pub(crate) artifact_policy: String, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct SourceBackedQualityMetrics { + pub(crate) expected_evidence_recall: f64, + pub(crate) precision_at_5: Option, + pub(crate) irrelevant_context_ratio: f64, + pub(crate) source_ref_coverage: f64, + pub(crate) stale_suppression_rate: Option, + pub(crate) correction_persistence_rate: Option, + pub(crate) delete_tombstone_suppression_rate: Option, + pub(crate) unsupported_claim_rate: f64, + pub(crate) cross_scope_leak_count: usize, + pub(crate) journal_only_authority_claim_count: usize, + pub(crate) context_pack_activation_precision: Option, + pub(crate) context_pack_activation_recall: Option, + pub(crate) activation_trace_coverage: Option, + pub(crate) mean_latency_ms: Option, + pub(crate) total_cost: Option, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct SourceBackedContextPackDecisionCounts { + pub(crate) total_decisions: usize, + pub(crate) traced_decisions: usize, + pub(crate) incorrect_decisions: usize, + pub(crate) expected_enabled: usize, + pub(crate) observed_enabled: usize, + pub(crate) correct_enabled: usize, + pub(crate) expected_suppressed: usize, + pub(crate) correct_suppressed: usize, + pub(crate) expected_disabled: usize, + pub(crate) correct_disabled: usize, + pub(crate) expected_stale_suppressed: usize, + pub(crate) correct_stale_suppressed: usize, + pub(crate) expected_blocked: usize, + pub(crate) correct_blocked: usize, + pub(crate) expected_pinned_ineligible: usize, + pub(crate) correct_pinned_ineligible: usize, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct SourceBackedScenarioCoverage { + pub(crate) scenario: String, + pub(crate) status: String, + pub(crate) covered_job_count: usize, + pub(crate) pass_count: usize, + pub(crate) required: bool, +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 6aa5cecb..fe347402 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -25,6 +25,7 @@ #[path = "real_world_job_benchmark/retrieval.rs"] mod retrieval; #[path = "real_world_job_benchmark/root_aggregate.rs"] mod root_aggregate; #[path = "real_world_job_benchmark/scheduled_memory.rs"] mod scheduled_memory; +#[path = "real_world_job_benchmark/source_backed_quality.rs"] mod source_backed_quality; #[path = "real_world_job_benchmark/support.rs"] mod support; #[path = "real_world_job_benchmark/trace_replay_reports.rs"] mod trace_replay_reports; #[path = "real_world_job_benchmark/work_continuity.rs"] mod work_continuity; diff --git a/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/runner.rs b/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/runner.rs index deebfd8e..e1d852f9 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/runner.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/runner.rs @@ -7,7 +7,7 @@ use crate::support; fn runner_discovers_nested_fixture_layout() -> Result<()> { let report = support::run_json_report_from(support::fixture_root())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(82)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(83)); Ok(()) } diff --git a/apps/elf-eval/tests/real_world_job_benchmark/core_archival_context.rs b/apps/elf-eval/tests/real_world_job_benchmark/core_archival_context.rs index b0e4a426..1239512e 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/core_archival_context.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/core_archival_context.rs @@ -7,8 +7,8 @@ use crate::support; fn context_trajectory_fixtures_report_blocked_openviking_gates() -> Result<()> { let report = support::run_json_report_from(support::context_trajectory_fixture_dir())?; - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(3)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(1)); assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(3)); assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); @@ -25,7 +25,7 @@ fn context_trajectory_fixtures_report_blocked_openviking_gates() -> Result<()> { let context = support::find_by_field(suites, "/suite_id", "context_trajectory")?; assert_eq!(context.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!(context.pointer("/encoded_job_count").and_then(Value::as_u64), Some(3)); + assert_eq!(context.pointer("/encoded_job_count").and_then(Value::as_u64), Some(4)); let jobs = support::array_at(&report, "/jobs")?; let staged = support::find_by_field( diff --git a/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_suites.rs b/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_suites.rs index 1c977423..3a1bc002 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_suites.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_suites.rs @@ -76,7 +76,7 @@ pub(crate) fn assert_root_aggregate_suites(report: &Value) -> Result<()> { let context_trajectory = support::find_by_field(suites, "/suite_id", "context_trajectory")?; assert_eq!(context_trajectory.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!(context_trajectory.pointer("/encoded_job_count").and_then(Value::as_u64), Some(3)); + assert_eq!(context_trajectory.pointer("/encoded_job_count").and_then(Value::as_u64), Some(4)); let work_continuity = support::find_by_field(suites, "/suite_id", "work_continuity")?; diff --git a/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary_counts.rs b/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary_counts.rs index afd26c4d..99a123db 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary_counts.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary_counts.rs @@ -1,9 +1,9 @@ use serde_json::Value; pub(crate) fn assert_root_summary_counts(report: &Value) { - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(82)); + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(83)); assert_eq!(report.pointer("/summary/encoded_suite_count").and_then(Value::as_u64), Some(19)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(75)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(76)); assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(7)); @@ -33,8 +33,8 @@ pub(crate) fn assert_root_summary_counts(report: &Value) { Some(0) ); assert_eq!(report.pointer("/summary/redaction_leak_count").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(3)); - assert_eq!(report.pointer("/summary/scope_correct_count").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/scope_check_count").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/scope_correct_count").and_then(Value::as_u64), Some(4)); assert_eq!(report.pointer("/summary/scope_violation_count").and_then(Value::as_u64), Some(0)); assert_eq!( report.pointer("/summary/qdrant_rebuild_case_count").and_then(Value::as_u64), @@ -46,11 +46,11 @@ pub(crate) fn assert_root_summary_counts(report: &Value) { ); assert_eq!( report.pointer("/summary/evidence_required_count").and_then(Value::as_u64), - Some(180) + Some(182) ); assert_eq!( report.pointer("/summary/evidence_covered_count").and_then(Value::as_u64), - Some(180) + Some(182) ); assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); assert_eq!(report.pointer("/summary/source_ref_coverage").and_then(Value::as_f64), Some(1.0)); diff --git a/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary_scoreboard.rs b/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary_scoreboard.rs index c3735856..1dbd15f7 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary_scoreboard.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/root_aggregate_summary_scoreboard.rs @@ -77,10 +77,10 @@ fn assert_root_scoreboard_rows(report: &Value) -> Result<()> { assert_eq!(elf.pointer("/same_corpus").and_then(Value::as_bool), Some(true)); assert_eq!(elf.pointer("/source_id_mapped").and_then(Value::as_bool), Some(true)); assert_eq!(elf.pointer("/product_runtime").and_then(Value::as_bool), Some(false)); - assert_eq!(elf.pointer("/metrics/retrieval/recall_at_k").and_then(Value::as_f64), Some(0.988)); + assert_eq!(elf.pointer("/metrics/retrieval/recall_at_k").and_then(Value::as_f64), Some(0.989)); assert_eq!( elf.pointer("/metrics/retrieval/precision_at_k").and_then(Value::as_f64), - Some(0.415) + Some(0.414) ); assert_eq!(elf.pointer("/metrics/retrieval/mrr").and_then(Value::as_f64), Some(0.988)); assert_eq!(elf.pointer("/metrics/retrieval/ndcg").and_then(Value::as_f64), Some(0.985)); diff --git a/apps/elf-eval/tests/real_world_job_benchmark/source_backed_quality.rs b/apps/elf-eval/tests/real_world_job_benchmark/source_backed_quality.rs new file mode 100644 index 00000000..31af1014 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/source_backed_quality.rs @@ -0,0 +1,167 @@ +use std::fs; + +use color_eyre::{Result, eyre}; +use serde_json::Value; + +use crate::support; + +#[test] +fn source_backed_quality_report_emits_xy1155_metrics_and_scenarios() -> Result<()> { + let report = support::run_json_report_from(support::real_world_memory_fixture_dir())?; + let quality = report + .pointer("/source_backed_quality") + .ok_or_else(|| eyre::eyre!("missing source_backed_quality report"))?; + + assert_eq!( + quality.pointer("/schema").and_then(Value::as_str), + Some("elf.source_backed_memory_quality_benchmark/v1") + ); + assert_eq!(quality.pointer("/hard_fail_passed").and_then(Value::as_bool), Some(true)); + assert_eq!(quality.pointer("/metrics/cross_scope_leak_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + quality.pointer("/metrics/journal_only_authority_claim_count").and_then(Value::as_u64), + Some(0) + ); + + let expected_metrics = [ + "expected_evidence_recall", + "precision_at_5", + "irrelevant_context_ratio", + "source_ref_coverage", + "stale_suppression_rate", + "correction_persistence_rate", + "delete_tombstone_suppression_rate", + "unsupported_claim_rate", + "cross_scope_leak_count", + "journal_only_authority_claim_count", + "context_pack_activation_precision", + "context_pack_activation_recall", + "activation_trace_coverage", + "mean_latency_ms", + "total_cost", + ]; + + assert_eq!( + quality.pointer("/required_metric_names"), + Some(&serde_json::json!(expected_metrics)) + ); + + for metric in expected_metrics { + assert!( + quality.pointer(&format!("/metrics/{metric}")).is_some(), + "missing source-backed quality metric {metric}" + ); + } + + assert_eq!( + quality.pointer("/metrics/context_pack_activation_precision").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + quality.pointer("/metrics/context_pack_activation_recall").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + quality.pointer("/metrics/activation_trace_coverage").and_then(Value::as_f64), + Some(1.0) + ); + + assert_context_pack_decisions(quality); + + let scenarios = support::array_at(quality, "/scenario_coverage")?; + + assert_required_scenarios_pass(quality, scenarios)?; + + Ok(()) +} + +#[test] +fn source_backed_quality_task_is_registered() -> Result<()> { + let makefile = + fs::read_to_string(support::workspace_root()?.join("makefiles/benchmark-memory-b.toml"))?; + + for task in [ + "[tasks.source-backed-memory-quality]", + "[tasks.source-backed-memory-quality-json]", + "[tasks.source-backed-memory-quality-validate]", + "[tasks.source-backed-memory-quality-report]", + ] { + assert!(makefile.contains(task), "missing cargo make task {task}"); + } + + Ok(()) +} + +fn assert_context_pack_decisions(quality: &Value) { + for (field, expected) in [ + ("expected_enabled", 2), + ("expected_suppressed", 1), + ("expected_disabled", 1), + ("expected_stale_suppressed", 1), + ("expected_blocked", 1), + ("expected_pinned_ineligible", 1), + ("incorrect_decisions", 0), + ] { + assert_eq!( + quality.pointer(&format!("/context_pack_decisions/{field}")).and_then(Value::as_u64), + Some(expected), + "context pack decision count mismatch for {field}" + ); + } +} + +fn assert_required_scenarios_pass(quality: &Value, scenarios: &[Value]) -> Result<()> { + let missing = scenarios + .iter() + .filter(|scenario| { + scenario.pointer("/status").and_then(Value::as_str) == Some("not_encoded") + }) + .map(|scenario| { + scenario + .pointer("/scenario") + .and_then(Value::as_str) + .unwrap_or("") + .to_string() + }) + .collect::>(); + + assert!(missing.is_empty(), "required scenarios are not encoded: {missing:?}"); + + for scenario in scenarios { + assert_eq!( + scenario.pointer("/status").and_then(Value::as_str), + Some("pass"), + "required scenario did not pass: {scenario:?}" + ); + } + for scenario in [ + "context_pack_relevant_auto_activation", + "context_pack_irrelevant_suppression", + "context_pack_disabled_suppression", + "context_pack_stale_suppression", + "journal_only_current_fact_trap", + "dreaming_no_silent_mutation", + "authoritative_revalidation", + "recall_debug_privacy", + ] { + assert_scenario_passes(quality, scenario)?; + } + + Ok(()) +} + +fn assert_scenario_passes(report: &Value, scenario_id: &str) -> Result<()> { + let scenarios = support::array_at(report, "/scenario_coverage")?; + let scenario = scenarios + .iter() + .find(|scenario| scenario.pointer("/scenario").and_then(Value::as_str) == Some(scenario_id)) + .ok_or_else(|| eyre::eyre!("missing scenario {scenario_id}"))?; + + assert_eq!( + scenario.pointer("/status").and_then(Value::as_str), + Some("pass"), + "scenario {scenario_id} did not pass" + ); + + Ok(()) +} diff --git a/docs/evidence/benchmarking/2026-07-03-source-backed-quality-benchmark-harness.md b/docs/evidence/benchmarking/2026-07-03-source-backed-quality-benchmark-harness.md new file mode 100644 index 00000000..1d4c7cba --- /dev/null +++ b/docs/evidence/benchmarking/2026-07-03-source-backed-quality-benchmark-harness.md @@ -0,0 +1,115 @@ +# Source-Backed Memory Quality Benchmark Harness - July 3, 2026 + +## Purpose + +This report records the XY-1155 executable benchmark harness update for ELF's +source-backed project memory quality gate. + +## Command + +Run: + +```sh +cargo make source-backed-memory-quality +``` + +The task generates JSON, validates the source-backed quality gate with +`validate-source-backed-quality`, and then emits: + +- `tmp/source-backed-memory-quality/report.json` +- `tmp/source-backed-memory-quality/report.md` + +## Added Report Surface + +The real-world job benchmark now emits +`elf.source_backed_memory_quality_benchmark/v1` at +`/source_backed_quality`. + +The surface aggregates the checked-in real-world memory fixtures into the +XY-1155 product-quality metrics: + +- expected evidence recall +- precision@5 +- irrelevant context ratio +- source-ref coverage +- stale suppression rate +- correction persistence rate +- delete/tombstone suppression rate +- unsupported claim rate +- cross-scope leak count +- journal-only authority claim count +- Context Pack activation precision/recall +- activation trace coverage +- latency and cost + +The two hard-fail counters are: + +- `cross_scope_leak_count` +- `journal_only_authority_claim_count` + +Both must be zero for the source-backed quality gate to pass. + +The gate also fails when any required scenario is non-pass, any required job +uses a hard-fail trap, or the Context Pack routing decisions are missing or +incorrect. + +## Scenario Coverage + +The report also emits required scenario coverage for: + +- source-backed recall +- source-backed memory promotion +- stale suppression +- correction persistence +- supersession +- delete/tombstone suppression +- cross-scope/private leak traps +- read-profile downgrade traps +- pinned Context Pack traps +- journal-only current-fact traps +- work-resume where-stopped readback +- Dreaming no-silent-mutation boundaries +- Context Pack activation/suppression/disabled/stale routing +- stale derived knowledge after source changes +- authoritative revalidation +- Recall Debug reason codes and privacy + +## Current Local Run + +The local XY-1155 run produced a passing +`source_backed_quality.result_state = "pass"` with: + +- expected evidence recall: `1.0` +- precision@5: `0.414` +- irrelevant context ratio: `0.0` +- source-ref coverage: `1.0` +- stale suppression rate: `1.0` +- correction persistence rate: `1.0` +- delete/tombstone suppression rate: `1.0` +- unsupported claim rate: `0.0` +- cross-scope leak count: `0` +- journal-only authority claim count: `0` +- Context Pack activation precision: `1.0` +- Context Pack activation recall: `1.0` +- activation trace coverage: `1.0` +- Context Pack routing decisions: `7/7` traced, `0` incorrect, covering enabled, + suppressed, disabled, stale-suppressed, blocked, and pinned-ineligible states +- mean latency: `2.864ms` + +The aggregate benchmark still preserves typed non-pass rows elsewhere in the +report. Those rows remain blocker evidence and do not support an unqualified +leaderboard or product-runtime superiority claim. + +## Review Result + +A read-only skeptic review first blocked the change because the benchmark task +generated reports without gating failures, hard-fail handling did not reject all +required non-pass scenarios, and Context Pack activation metrics were too +tag-derived. + +The follow-up implementation added a `validate-source-backed-quality` command, +wired it into `cargo make source-backed-memory-quality`, hard-fails all required +non-pass scenarios and required job trap usage, counts `private_scope_leak` in +scope leak metrics, and derives Context Pack metrics from structured routing +decisions. A second read-only skeptic review returned `pass` with no P0/P1 +findings. diff --git a/docs/evidence/benchmarking/index.md b/docs/evidence/benchmarking/index.md index 73dd453b..5741dfde 100644 --- a/docs/evidence/benchmarking/index.md +++ b/docs/evidence/benchmarking/index.md @@ -59,3 +59,4 @@ Routes to: Benchmarking evidence concepts under `docs/evidence/benchmarking/`. - `2026-06-23-p4-production-readiness-evidence-gates-report.md`: P4 Production-Readiness Evidence Gates Report - June 23, 2026; adds `cargo make real-world-memory-p4-production-readiness`, records latency, cost, resource, cold-start, restore, and Qdrant rebuild evidence, separates local fixture, public-proxy, private-corpus, and provider-backed tiers, and preserves private/provider inputs as typed blockers. - `2026-06-23-p4-quality-hardening-productization-readiness-report.md`: P4 Quality Hardening and Productization Readiness Report - June 23, 2026; adds `cargo make real-world-memory-p4-quality-hardening-closeout`, reruns adversarial, source-library, knowledge, and production-readiness slices, preserves private/provider blockers, and keeps P5 queueing behind main-thread acceptance with a narrowed productization scope. - `2026-06-27-public-quantitative-competitor-scoreboard-report.md`: Public Quantitative Competitor Scoreboard Report - June 27, 2026; publishes `elf.quality_scoreboard/v1` rows for 20 tracked products, including VectifyAI PageIndex, VectifyAI OpenKB, and plastic-labs Honcho typed rows. Rows expose recall@5, precision@5, MRR, nDCG, lifecycle, source-ref, and latency metrics where measured, and typed blocker, source-provenance, and next-evidence metadata where comparable metrics are not yet available, while preserving zero comparable product-runtime pass claims until held-out, leakage-audited, digest-identified runtime evidence exists. +- `2026-07-03-source-backed-quality-benchmark-harness.md`: Source-Backed Memory Quality Benchmark Harness - July 3, 2026; adds `cargo make source-backed-memory-quality` and the `elf.source_backed_memory_quality_benchmark/v1` report surface for expected evidence recall, precision@5, source-ref coverage, stale/correction/delete behavior, Context Pack activation, Recall Debug privacy, hard-fail leak counters, latency, and typed scenario coverage. diff --git a/docs/log.md b/docs/log.md index ffbcd932..1aaa3776 100644 --- a/docs/log.md +++ b/docs/log.md @@ -179,3 +179,8 @@ logs. `elf.context_pack.routing_trace/v1` privacy boundaries. - Linked Context Pack v1 from the spec index, version registry, source-backed product contract, ELF v2 HTTP endpoint map, and MCP tool map. +- Added the XY-1155 `cargo make source-backed-memory-quality` benchmark task and + `elf.source_backed_memory_quality_benchmark/v1` report surface, covering expected + evidence recall, precision@5, source-ref coverage, stale/correction/delete behavior, + Context Pack activation, Recall Debug privacy, hard-fail leak counters, latency, and + required source-backed memory scenario coverage. diff --git a/makefiles/benchmark-memory-b.toml b/makefiles/benchmark-memory-b.toml index 79819a43..283c2a1a 100644 --- a/makefiles/benchmark-memory-b.toml +++ b/makefiles/benchmark-memory-b.toml @@ -23,6 +23,73 @@ args = [ "ELF real-world memory fixture", ] +[tasks.source-backed-memory-quality] +workspace = false +dependencies = [ + "source-backed-memory-quality-report", +] + +[tasks.source-backed-memory-quality-json] +workspace = false +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "run", + "--fixtures", + "apps/elf-eval/fixtures/real_world_memory", + "--out", + "tmp/source-backed-memory-quality/report.json", + "--run-id", + "source-backed-memory-quality", + "--adapter-id", + "elf_source_backed_memory_quality", + "--adapter-name", + "ELF source-backed memory quality benchmark", +] + +[tasks.source-backed-memory-quality-validate] +workspace = false +dependencies = [ + "source-backed-memory-quality-json", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "validate-source-backed-quality", + "--report", + "tmp/source-backed-memory-quality/report.json", +] + +[tasks.source-backed-memory-quality-report] +workspace = false +dependencies = [ + "source-backed-memory-quality-validate", +] +command = "cargo" +args = [ + "run", + "-p", + "elf-eval", + "--bin", + "real_world_job_benchmark", + "--", + "publish", + "--report", + "tmp/source-backed-memory-quality/report.json", + "--out", + "tmp/source-backed-memory-quality/report.md", +] + [tasks.real-world-memory-knowledge] workspace = false dependencies = [