From 94300e9378f09c72519b1a40e107cc1586cf6f6c Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 02:46:41 -0400 Subject: [PATCH] {"schema":"decodex/commit/1","summary":"Modularize benchmark support and graph memory tests","authority":"manual"} --- .../consolidation_knowledge_tests_fixtures.rs | 351 +------------- .../consolidation.rs | 70 +++ .../knowledge.rs | 92 ++++ .../operator_debug.rs | 95 ++++ .../project_decisions.rs | 92 ++++ .../runner.rs | 13 + .../production_ops.rs | 336 +------------- .../production_ops_evidence.rs | 143 ++++++ .../production_ops_failure_cases.rs | 113 +++++ .../production_ops_jobs.rs | 57 +++ .../production_ops_summary.rs | 29 ++ .../support/report_paths.rs | 394 ++-------------- .../support/report_paths_markdown.rs | 127 ++++++ .../support/report_paths_project_files.rs | 9 + .../support/report_paths_snapshots.rs | 122 +++++ .../work_continuity.rs | 333 +------------- .../work_continuity_markdown.rs | 60 +++ .../work_continuity_mutations.rs | 198 ++++++++ .../work_continuity_summary.rs | 81 ++++ packages/elf-storage/tests/graph_memory.rs | 427 +----------------- .../elf-storage/tests/graph_memory/entity.rs | 42 ++ .../elf-storage/tests/graph_memory/facts.rs | 276 +++++++++++ .../elf-storage/tests/graph_memory/helpers.rs | 37 ++ .../tests/graph_memory/predicates.rs | 73 +++ 24 files changed, 1799 insertions(+), 1771 deletions(-) create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/consolidation.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/knowledge.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/operator_debug.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/project_decisions.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/runner.rs create 
mode 100644 apps/elf-eval/tests/real_world_job_benchmark/production_ops_evidence.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/production_ops_failure_cases.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/production_ops_jobs.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/production_ops_summary.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/support/report_paths_markdown.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/support/report_paths_project_files.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/support/report_paths_snapshots.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/work_continuity_markdown.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/work_continuity_mutations.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/work_continuity_summary.rs create mode 100644 packages/elf-storage/tests/graph_memory/entity.rs create mode 100644 packages/elf-storage/tests/graph_memory/facts.rs create mode 100644 packages/elf-storage/tests/graph_memory/helpers.rs create mode 100644 packages/elf-storage/tests/graph_memory/predicates.rs diff --git a/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures.rs b/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures.rs index 932c2332..1bd9a2a6 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures.rs @@ -1,346 +1,5 @@ -use std::fs; - -use color_eyre::Result; -use serde_json::Value; - -use crate::support; - -#[test] -fn runner_discovers_nested_fixture_layout() -> Result<()> { - let report = support::run_json_report_from(support::fixture_root())?; - - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(82)); - - Ok(()) -} - -#[test] -fn 
operator_debug_fixture_reports_trace_links_and_failure_details() -> Result<()> { - let report = support::run_json_report_from(support::operator_debug_fixture_dir())?; - - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(7)); - assert_eq!( - report.pointer("/summary/operator_debug_job_count").and_then(Value::as_u64), - Some(7) - ); - assert_eq!(report.pointer("/summary/raw_sql_needed_count").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/trace_incomplete_count").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/operator_ux_gap_count").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(7)); - assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); - assert_eq!( - report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), - Some(3) - ); - - let jobs = support::array_at(&report, "/jobs")?; - let dropped = support::find_by_field(jobs, "/job_id", "operator-debug-dropped-evidence-001")?; - let selected = - support::find_by_field(jobs, "/job_id", "operator-debug-selected-not-narrated-001")?; - let compact = - support::find_by_field(jobs, "/job_id", "operator-debug-qmd-style-compact-replay-001")?; - - assert_eq!(dropped.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - dropped.pointer("/operator_debug/raw_sql_needed").and_then(Value::as_bool), - Some(false) - ); - assert_eq!( - dropped.pointer("/operator_debug/dropped_candidate_visibility").and_then(Value::as_str), - Some("visible in Retrieval Funnel and Replay Candidates") - ); - assert_eq!( - dropped.pointer("/operator_debug/viewer_url").and_then(Value::as_str), - Some("/viewer?trace_id=11111111-1111-4111-8111-111111111111") - ); - assert_eq!( - 
dropped.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), - Some("filter.read_profile") - ); - assert!(support::array_contains_str( - dropped, - "/trace_explainability/stages/1/dropped_evidence", - "trace-dropped-expected" - )?); - assert!(support::array_contains_str( - dropped, - "/trace_explainability/stages/1/distractor_evidence", - "trace-dropped-decoy" - )?); - assert!(support::array_contains_str(dropped, "/produced_evidence", "trace-dropped-expected")?); - assert_eq!(selected.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - selected.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), - Some("selection.narration") - ); - assert_eq!( - selected.pointer("/operator_debug/failure_mode").and_then(Value::as_str), - Some("selected_but_not_narrated") - ); - assert_eq!(compact.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - compact.pointer("/operator_debug/failure_mode").and_then(Value::as_str), - Some("qmd_style_compact_replay") - ); - assert_eq!( - compact.pointer("/operator_debug/replay_command_available").and_then(Value::as_bool), - Some(true) - ); - assert_eq!( - compact.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), - Some("recall_debug.compact_replay") - ); - assert!(support::array_contains_str( - compact, - "/trace_explainability/stages/4/kept_evidence", - "compact-replay-artifact" - )?); - assert!(support::array_contains_str( - compact, - "/produced_evidence", - "qmd-short-replay-reference" - )?); - - Ok(()) -} - -#[test] -fn consolidation_fixtures_report_reviewable_proposal_metrics() -> Result<()> { - let report = support::run_json_report_from(support::consolidation_fixture_dir())?; - - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(4)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); - assert_eq!( - report.pointer("/summary/consolidation/proposal_count").and_then(Value::as_u64), 
- Some(4) - ); - assert_eq!( - report.pointer("/summary/consolidation/source_mutation_count").and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - report - .pointer("/summary/consolidation/proposal_unsupported_claim_count") - .and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report.pointer("/summary/consolidation/executable_gap_count").and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - report.pointer("/summary/consolidation/lineage_completeness").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report.pointer("/summary/consolidation/review_action_correctness").and_then(Value::as_f64), - Some(1.0) - ); - - let jobs = support::array_at(&report, "/jobs")?; - let project_summary = - support::find_by_field(jobs, "/job_id", "consolidation-project-summary-apply-001")?; - let contradiction = - support::find_by_field(jobs, "/job_id", "consolidation-contradiction-report-discard-001")?; - - assert_eq!( - project_summary - .pointer("/consolidation/proposals/0/actual_review_action") - .and_then(Value::as_str), - Some("apply") - ); - assert_eq!( - contradiction - .pointer("/consolidation/proposals/0/actual_review_action") - .and_then(Value::as_str), - Some("discard") - ); - assert_eq!( - contradiction - .pointer("/consolidation/proposals/0/unsupported_claim_count") - .and_then(Value::as_u64), - Some(1) - ); - - let suites = support::array_at(&report, "/suites")?; - let consolidation_suite = support::find_by_field(suites, "/suite_id", "consolidation")?; - - assert_eq!(consolidation_suite.pointer("/status").and_then(Value::as_str), Some("pass")); - - Ok(()) -} - -#[test] -fn knowledge_fixtures_report_page_metrics() -> Result<()> { - let report = support::run_json_report_from(support::knowledge_fixture_dir())?; - - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(3)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(3)); - 
assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/knowledge/page_count").and_then(Value::as_u64), Some(5)); - assert_eq!( - report.pointer("/summary/knowledge/section_count").and_then(Value::as_u64), - Some(13) - ); - assert_eq!( - report.pointer("/summary/knowledge/citation_coverage").and_then(Value::as_f64), - Some(0.923) - ); - assert_eq!( - report.pointer("/summary/knowledge/stale_claim_detection").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report.pointer("/summary/knowledge/rebuild_determinism").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report.pointer("/summary/knowledge/backlink_count").and_then(Value::as_u64), - Some(11) - ); - assert_eq!( - report.pointer("/summary/knowledge/pages_with_backlinks").and_then(Value::as_u64), - Some(5) - ); - assert_eq!( - report.pointer("/summary/knowledge/backlink_coverage").and_then(Value::as_f64), - Some(1.0) - ); - assert_eq!( - report.pointer("/summary/knowledge/page_usefulness").and_then(Value::as_f64), - Some(0.979) - ); - assert_eq!( - report.pointer("/summary/knowledge/pages_with_version_diff").and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report.pointer("/summary/knowledge/unsupported_summary_count").and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report.pointer("/summary/knowledge/allowed_variance_count").and_then(Value::as_u64), - Some(1) - ); - - let suites = support::array_at(&report, "/suites")?; - let knowledge_suite = support::find_by_field(suites, "/suite_id", "knowledge_compilation")?; - - assert_eq!(knowledge_suite.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(knowledge_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(3)); - - let jobs = support::array_at(&report, "/jobs")?; - let project_page_job = support::find_by_field(jobs, "/job_id", 
"knowledge-project-page-001")?; - let watch_rebuild_job = support::find_by_field(jobs, "/job_id", "knowledge-watch-rebuild-003")?; - - assert_eq!( - project_page_job.pointer("/knowledge/unsupported_summary_count").and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - project_page_job.pointer("/knowledge/untraced_section_count").and_then(Value::as_u64), - Some(0) - ); - assert_eq!( - watch_rebuild_job.pointer("/knowledge/pages_with_version_diff").and_then(Value::as_u64), - Some(1) - ); - assert!( - watch_rebuild_job - .pointer("/produced_answer") - .and_then(Value::as_str) - .is_some_and(|answer| answer - .contains("PageIndex/OpenKB adapter claim as lint evidence") - && answer.contains("leaves source documents plus Memory Notes unmodified")) - ); - - Ok(()) -} - -#[test] -fn project_decisions_fixtures_report_decision_policy_cases() -> Result<()> { - let report = support::run_json_report_from(support::project_decisions_fixture_dir())?; - - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(5)); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); - assert_eq!( - report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), - Some(2) - ); - assert_eq!( - report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64), - Some(5) - ); - assert_eq!( - report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), - Some(1.0) - ); - - let suites = support::array_at(&report, "/suites")?; - let project_decisions = support::find_by_field(suites, "/suite_id", "project_decisions")?; - - assert_eq!(project_decisions.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(project_decisions.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); - assert_eq!( - 
project_decisions.pointer("/update_rationale_available_count").and_then(Value::as_u64), - Some(5) - ); - - let jobs = support::array_at(&report, "/jobs")?; - let accepted = - support::find_by_field(jobs, "/job_id", "project-decision-accepted-typed-failures-001")?; - let reversal = - support::find_by_field(jobs, "/job_id", "project-decision-reversal-live-baseline-001")?; - let validation = - support::find_by_field(jobs, "/job_id", "project-decision-current-validation-gate-001")?; - let tradeoff = - support::find_by_field(jobs, "/job_id", "project-decision-tradeoff-fixture-backed-001")?; - let caveat = - support::find_by_field(jobs, "/job_id", "project-decision-private-manifest-caveat-001")?; - - assert_eq!(accepted.pointer("/answer_type").and_then(Value::as_str), Some("decision_record")); - assert_eq!( - accepted.pointer("/expected_evidence").and_then(Value::as_array).map(Vec::len), - Some(2) - ); - assert_eq!( - reversal.pointer("/evolution/historical_evidence/0").and_then(Value::as_str), - Some("live-baseline-suite-win-old") - ); - assert_eq!( - validation.pointer("/evolution/current_evidence/0").and_then(Value::as_str), - Some("validation-gate-current-decodex") - ); - assert_eq!(tradeoff.pointer("/requires_caveat").and_then(Value::as_bool), Some(true)); - assert_eq!(caveat.pointer("/can_answer_unknown").and_then(Value::as_bool), Some(true)); - - for job in jobs { - let expected_evidence = support::array_at(job, "/expected_evidence")?; - - assert!( - !expected_evidence.is_empty(), - "project decision job {} must declare required evidence", - job.pointer("/job_id").and_then(Value::as_str).unwrap_or("") - ); - } - for entry in fs::read_dir(support::project_decisions_fixture_dir())? 
{ - let path = entry?.path(); - - if path.extension().and_then(|ext| ext.to_str()) != Some("json") { - continue; - } - - let fixture = serde_json::from_str::<Value>(&fs::read_to_string(path)?)?; - let required_evidence = support::array_at(&fixture, "/required_evidence")?; - let negative_traps = support::array_at(&fixture, "/negative_traps")?; - - assert!(!required_evidence.is_empty()); - assert!(!negative_traps.is_empty()); - } - - Ok(()) -} +mod consolidation; +mod knowledge; +mod operator_debug; +mod project_decisions; +mod runner; diff --git a/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/consolidation.rs b/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/consolidation.rs new file mode 100644 index 00000000..5ba987a4 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/consolidation.rs @@ -0,0 +1,70 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn consolidation_fixtures_report_reviewable_proposal_metrics() -> Result<()> { + let report = support::run_json_report_from(support::consolidation_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(4)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(4)); + assert_eq!( + report.pointer("/summary/consolidation/proposal_count").and_then(Value::as_u64), + Some(4) + ); + assert_eq!( + report.pointer("/summary/consolidation/source_mutation_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report + .pointer("/summary/consolidation/proposal_unsupported_claim_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/consolidation/executable_gap_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + report.pointer("/summary/consolidation/lineage_completeness").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + 
report.pointer("/summary/consolidation/review_action_correctness").and_then(Value::as_f64), + Some(1.0) + ); + + let jobs = support::array_at(&report, "/jobs")?; + let project_summary = + support::find_by_field(jobs, "/job_id", "consolidation-project-summary-apply-001")?; + let contradiction = + support::find_by_field(jobs, "/job_id", "consolidation-contradiction-report-discard-001")?; + + assert_eq!( + project_summary + .pointer("/consolidation/proposals/0/actual_review_action") + .and_then(Value::as_str), + Some("apply") + ); + assert_eq!( + contradiction + .pointer("/consolidation/proposals/0/actual_review_action") + .and_then(Value::as_str), + Some("discard") + ); + assert_eq!( + contradiction + .pointer("/consolidation/proposals/0/unsupported_claim_count") + .and_then(Value::as_u64), + Some(1) + ); + + let suites = support::array_at(&report, "/suites")?; + let consolidation_suite = support::find_by_field(suites, "/suite_id", "consolidation")?; + + assert_eq!(consolidation_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/knowledge.rs b/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/knowledge.rs new file mode 100644 index 00000000..ee46af64 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/knowledge.rs @@ -0,0 +1,92 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn knowledge_fixtures_report_page_metrics() -> Result<()> { + let report = support::run_json_report_from(support::knowledge_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(3)); + assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); + 
assert_eq!(report.pointer("/summary/wrong_result_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/knowledge/page_count").and_then(Value::as_u64), Some(5)); + assert_eq!( + report.pointer("/summary/knowledge/section_count").and_then(Value::as_u64), + Some(13) + ); + assert_eq!( + report.pointer("/summary/knowledge/citation_coverage").and_then(Value::as_f64), + Some(0.923) + ); + assert_eq!( + report.pointer("/summary/knowledge/stale_claim_detection").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/knowledge/rebuild_determinism").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/knowledge/backlink_count").and_then(Value::as_u64), + Some(11) + ); + assert_eq!( + report.pointer("/summary/knowledge/pages_with_backlinks").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + report.pointer("/summary/knowledge/backlink_coverage").and_then(Value::as_f64), + Some(1.0) + ); + assert_eq!( + report.pointer("/summary/knowledge/page_usefulness").and_then(Value::as_f64), + Some(0.979) + ); + assert_eq!( + report.pointer("/summary/knowledge/pages_with_version_diff").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/knowledge/unsupported_summary_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report.pointer("/summary/knowledge/allowed_variance_count").and_then(Value::as_u64), + Some(1) + ); + + let suites = support::array_at(&report, "/suites")?; + let knowledge_suite = support::find_by_field(suites, "/suite_id", "knowledge_compilation")?; + + assert_eq!(knowledge_suite.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(knowledge_suite.pointer("/encoded_job_count").and_then(Value::as_u64), Some(3)); + + let jobs = support::array_at(&report, "/jobs")?; + let project_page_job = support::find_by_field(jobs, "/job_id", "knowledge-project-page-001")?; + let watch_rebuild_job = support::find_by_field(jobs, "/job_id", 
"knowledge-watch-rebuild-003")?; + + assert_eq!( + project_page_job.pointer("/knowledge/unsupported_summary_count").and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + project_page_job.pointer("/knowledge/untraced_section_count").and_then(Value::as_u64), + Some(0) + ); + assert_eq!( + watch_rebuild_job.pointer("/knowledge/pages_with_version_diff").and_then(Value::as_u64), + Some(1) + ); + assert!( + watch_rebuild_job + .pointer("/produced_answer") + .and_then(Value::as_str) + .is_some_and(|answer| answer + .contains("PageIndex/OpenKB adapter claim as lint evidence") + && answer.contains("leaves source documents plus Memory Notes unmodified")) + ); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/operator_debug.rs b/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/operator_debug.rs new file mode 100644 index 00000000..61b8d17b --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/operator_debug.rs @@ -0,0 +1,95 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn operator_debug_fixture_reports_trace_links_and_failure_details() -> Result<()> { + let report = support::run_json_report_from(support::operator_debug_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(7)); + assert_eq!( + report.pointer("/summary/operator_debug_job_count").and_then(Value::as_u64), + Some(7) + ); + assert_eq!(report.pointer("/summary/raw_sql_needed_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/trace_incomplete_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/operator_ux_gap_count").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(7)); + assert_eq!(report.pointer("/summary/unsupported_claim").and_then(Value::as_u64), Some(0)); + 
assert_eq!(report.pointer("/summary/unsupported_claim_count").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/trace_explainability_count").and_then(Value::as_u64), + Some(3) + ); + + let jobs = support::array_at(&report, "/jobs")?; + let dropped = support::find_by_field(jobs, "/job_id", "operator-debug-dropped-evidence-001")?; + let selected = + support::find_by_field(jobs, "/job_id", "operator-debug-selected-not-narrated-001")?; + let compact = + support::find_by_field(jobs, "/job_id", "operator-debug-qmd-style-compact-replay-001")?; + + assert_eq!(dropped.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + dropped.pointer("/operator_debug/raw_sql_needed").and_then(Value::as_bool), + Some(false) + ); + assert_eq!( + dropped.pointer("/operator_debug/dropped_candidate_visibility").and_then(Value::as_str), + Some("visible in Retrieval Funnel and Replay Candidates") + ); + assert_eq!( + dropped.pointer("/operator_debug/viewer_url").and_then(Value::as_str), + Some("/viewer?trace_id=11111111-1111-4111-8111-111111111111") + ); + assert_eq!( + dropped.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + Some("filter.read_profile") + ); + assert!(support::array_contains_str( + dropped, + "/trace_explainability/stages/1/dropped_evidence", + "trace-dropped-expected" + )?); + assert!(support::array_contains_str( + dropped, + "/trace_explainability/stages/1/distractor_evidence", + "trace-dropped-decoy" + )?); + assert!(support::array_contains_str(dropped, "/produced_evidence", "trace-dropped-expected")?); + assert_eq!(selected.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + selected.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + Some("selection.narration") + ); + assert_eq!( + selected.pointer("/operator_debug/failure_mode").and_then(Value::as_str), + Some("selected_but_not_narrated") + ); + assert_eq!(compact.pointer("/status").and_then(Value::as_str), 
Some("pass")); + assert_eq!( + compact.pointer("/operator_debug/failure_mode").and_then(Value::as_str), + Some("qmd_style_compact_replay") + ); + assert_eq!( + compact.pointer("/operator_debug/replay_command_available").and_then(Value::as_bool), + Some(true) + ); + assert_eq!( + compact.pointer("/trace_explainability/failure_stage").and_then(Value::as_str), + Some("recall_debug.compact_replay") + ); + assert!(support::array_contains_str( + compact, + "/trace_explainability/stages/4/kept_evidence", + "compact-replay-artifact" + )?); + assert!(support::array_contains_str( + compact, + "/produced_evidence", + "qmd-short-replay-reference" + )?); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/project_decisions.rs b/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/project_decisions.rs new file mode 100644 index 00000000..72ddb030 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/project_decisions.rs @@ -0,0 +1,92 @@ +use std::fs; + +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn project_decisions_fixtures_report_decision_policy_cases() -> Result<()> { + let report = support::run_json_report_from(support::project_decisions_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(5)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); + assert_eq!( + report.pointer("/summary/conflict_detection_count").and_then(Value::as_u64), + Some(2) + ); + assert_eq!( + report.pointer("/summary/update_rationale_available_count").and_then(Value::as_u64), + Some(5) + ); + assert_eq!( + 
report.pointer("/summary/expected_evidence_recall").and_then(Value::as_f64), + Some(1.0) + ); + + let suites = support::array_at(&report, "/suites")?; + let project_decisions = support::find_by_field(suites, "/suite_id", "project_decisions")?; + + assert_eq!(project_decisions.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(project_decisions.pointer("/encoded_job_count").and_then(Value::as_u64), Some(5)); + assert_eq!( + project_decisions.pointer("/update_rationale_available_count").and_then(Value::as_u64), + Some(5) + ); + + let jobs = support::array_at(&report, "/jobs")?; + let accepted = + support::find_by_field(jobs, "/job_id", "project-decision-accepted-typed-failures-001")?; + let reversal = + support::find_by_field(jobs, "/job_id", "project-decision-reversal-live-baseline-001")?; + let validation = + support::find_by_field(jobs, "/job_id", "project-decision-current-validation-gate-001")?; + let tradeoff = + support::find_by_field(jobs, "/job_id", "project-decision-tradeoff-fixture-backed-001")?; + let caveat = + support::find_by_field(jobs, "/job_id", "project-decision-private-manifest-caveat-001")?; + + assert_eq!(accepted.pointer("/answer_type").and_then(Value::as_str), Some("decision_record")); + assert_eq!( + accepted.pointer("/expected_evidence").and_then(Value::as_array).map(Vec::len), + Some(2) + ); + assert_eq!( + reversal.pointer("/evolution/historical_evidence/0").and_then(Value::as_str), + Some("live-baseline-suite-win-old") + ); + assert_eq!( + validation.pointer("/evolution/current_evidence/0").and_then(Value::as_str), + Some("validation-gate-current-decodex") + ); + assert_eq!(tradeoff.pointer("/requires_caveat").and_then(Value::as_bool), Some(true)); + assert_eq!(caveat.pointer("/can_answer_unknown").and_then(Value::as_bool), Some(true)); + + for job in jobs { + let expected_evidence = support::array_at(job, "/expected_evidence")?; + + assert!( + !expected_evidence.is_empty(), + "project decision job {} must declare 
required evidence", + job.pointer("/job_id").and_then(Value::as_str).unwrap_or("") + ); + } + for entry in fs::read_dir(support::project_decisions_fixture_dir())? { + let path = entry?.path(); + + if path.extension().and_then(|ext| ext.to_str()) != Some("json") { + continue; + } + + let fixture = serde_json::from_str::<Value>(&fs::read_to_string(path)?)?; + let required_evidence = support::array_at(&fixture, "/required_evidence")?; + let negative_traps = support::array_at(&fixture, "/negative_traps")?; + + assert!(!required_evidence.is_empty()); + assert!(!negative_traps.is_empty()); + } + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/runner.rs b/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/runner.rs new file mode 100644 index 00000000..deebfd8e --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge_tests_fixtures/runner.rs @@ -0,0 +1,13 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn runner_discovers_nested_fixture_layout() -> Result<()> { + let report = support::run_json_report_from(support::fixture_root())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(82)); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/production_ops.rs b/apps/elf-eval/tests/real_world_job_benchmark/production_ops.rs index cb53b97c..ba03c605 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/production_ops.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/production_ops.rs @@ -1,7 +1,9 @@ -use std::{env, fs, process}; +mod production_ops_evidence; +mod production_ops_failure_cases; +mod production_ops_jobs; +mod production_ops_summary; use color_eyre::Result; -use serde_json::Value; use crate::support; @@ -9,333 +11,9 @@ use crate::support; fn production_ops_fixtures_report_bounded_typed_states() -> Result<()> { let report = 
support::run_json_report_from(support::production_ops_fixture_dir())?; - assert_production_ops_summary(&report)?; - assert_production_ops_jobs(&report)?; - assert_production_ops_operational_evidence(&report)?; - - Ok(()) -} - -fn assert_production_ops_summary(report: &Value) -> Result<()> { - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(8)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(6)); - assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(2)); - assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); - assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); - assert_eq!( - report.pointer("/summary/qdrant_rebuild_case_count").and_then(Value::as_u64), - Some(2) - ); - assert_eq!( - report.pointer("/private_corpus_redaction/private_fixture_count").and_then(Value::as_u64), - Some(1) - ); - - let suites = support::array_at(report, "/suites")?; - let production_ops = support::find_by_field(suites, "/suite_id", "production_ops")?; - - assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!(production_ops.pointer("/encoded_job_count").and_then(Value::as_u64), Some(8)); - - Ok(()) -} - -fn assert_production_ops_jobs(report: &Value) -> Result<()> { - let jobs = support::array_at(report, "/jobs")?; - let authority_recovery = - support::find_by_field(jobs, "/job_id", "production-ops-authority-plane-recovery-001")?; - let backfill = support::find_by_field(jobs, "/job_id", "production-ops-backfill-resume-001")?; - let restore = support::find_by_field(jobs, "/job_id", "production-ops-restore-cold-start-001")?; - let public_proxy = - support::find_by_field(jobs, "/job_id", "production-ops-public-proxy-addendum-001")?; - let private_manifest = - support::find_by_field(jobs, "/job_id", 
"production-ops-private-manifest-blocked-001")?; - let credentials = - support::find_by_field(jobs, "/job_id", "production-ops-credential-boundary-001")?; - let dependency = - support::find_by_field(jobs, "/job_id", "production-ops-cold-start-dependency-001")?; - - assert_authority_recovery_job(authority_recovery)?; - - assert_eq!(authority_recovery.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(backfill.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(restore.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(restore.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), Some(true)); - assert_eq!(public_proxy.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!( - public_proxy.pointer("/operational_evidence_tier").and_then(Value::as_str), - Some("public_proxy") - ); - assert_eq!(private_manifest.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!( - private_manifest.pointer("/operational_evidence_tier").and_then(Value::as_str), - Some("private_corpus") - ); - assert_eq!(credentials.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!( - credentials.pointer("/operational_evidence_tier").and_then(Value::as_str), - Some("provider_backed") - ); - assert_eq!(dependency.pointer("/status").and_then(Value::as_str), Some("pass")); - - Ok(()) -} - -fn assert_authority_recovery_job(job: &Value) -> Result<()> { - assert_eq!(job.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), Some(true)); - assert_eq!(job.pointer("/requires_caveat").and_then(Value::as_bool), Some(true)); - assert_eq!( - job.pointer("/recovery_drills/0/contract_schema").and_then(Value::as_str), - Some("elf.authority_recovery_drill/v1") - ); - assert!(support::array_at(job, "/hard_fail_hits")?.is_empty()); - - Ok(()) -} - -fn assert_production_ops_operational_evidence(report: &Value) -> Result<()> { - assert_eq!( - 
report.pointer("/operational_evidence/schema").and_then(Value::as_str), - Some("elf.operational_evidence_gates/v1") - ); - assert_eq!( - report - .pointer("/operational_evidence/missing_private_provider_inputs_are_typed_blockers") - .and_then(Value::as_bool), - Some(true) - ); - assert_eq!( - report - .pointer("/operational_evidence/private_corpus_pass_claim_allowed") - .and_then(Value::as_bool), - Some(false) - ); - assert_eq!( - report - .pointer("/operational_evidence/provider_backed_pass_claim_allowed") - .and_then(Value::as_bool), - Some(false) - ); - assert_eq!( - report.pointer("/operational_evidence/latency/measured_job_count").and_then(Value::as_u64), - Some(8) - ); - assert_eq!( - report.pointer("/operational_evidence/cost/jobs_with_cost_report").and_then(Value::as_u64), - Some(8) - ); - assert_eq!( - report - .pointer("/operational_evidence/resource/resource_envelope_job_count") - .and_then(Value::as_u64), - Some(2) - ); - assert_eq!( - report - .pointer("/operational_evidence/cold_start_restore_rebuild/qdrant_rebuild_pass_count") - .and_then(Value::as_u64), - Some(2) - ); - - assert_authority_recovery_operational_evidence(report); - - let tiers = support::array_at(report, "/operational_evidence/tiers")?; - let local_fixture = support::find_by_field(tiers, "/tier", "local_fixture")?; - let public_proxy_tier = support::find_by_field(tiers, "/tier", "public_proxy")?; - let private_corpus = support::find_by_field(tiers, "/tier", "private_corpus")?; - let provider_backed = support::find_by_field(tiers, "/tier", "provider_backed")?; - - assert_eq!(local_fixture.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(local_fixture.pointer("/job_count").and_then(Value::as_u64), Some(5)); - assert_eq!(public_proxy_tier.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(public_proxy_tier.pointer("/job_count").and_then(Value::as_u64), Some(1)); - assert_eq!(private_corpus.pointer("/status").and_then(Value::as_str), 
Some("blocked")); - assert_eq!(private_corpus.pointer("/blocked").and_then(Value::as_u64), Some(1)); - assert_eq!(provider_backed.pointer("/status").and_then(Value::as_str), Some("blocked")); - assert_eq!(provider_backed.pointer("/blocked").and_then(Value::as_u64), Some(1)); - - Ok(()) -} - -fn assert_authority_recovery_operational_evidence(report: &Value) { - assert_eq!( - report - .pointer("/operational_evidence/authority_recovery/drill_count") - .and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report - .pointer("/operational_evidence/authority_recovery/authority_plane_count") - .and_then(Value::as_u64), - Some(7) - ); - assert_eq!( - report - .pointer("/operational_evidence/authority_recovery/backup_pitr_restored_count") - .and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report - .pointer("/operational_evidence/authority_recovery/record_count_preserved_count") - .and_then(Value::as_u64), - Some(7) - ); - assert_eq!( - report - .pointer("/operational_evidence/authority_recovery/source_ref_preserved_count") - .and_then(Value::as_u64), - Some(7) - ); - assert_eq!( - report - .pointer("/operational_evidence/authority_recovery/lifecycle_history_preserved_count") - .and_then(Value::as_u64), - Some(7) - ); - assert_eq!( - report - .pointer("/operational_evidence/authority_recovery/rpo_met_count") - .and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report - .pointer("/operational_evidence/authority_recovery/rto_met_count") - .and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report - .pointer("/operational_evidence/authority_recovery/idempotent_outbox_replay_count") - .and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report - .pointer("/operational_evidence/authority_recovery/qdrant_rebuild_complete_count") - .and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report - .pointer("/operational_evidence/authority_recovery/migration_repair_count") - .and_then(Value::as_u64), - Some(1) - ); - assert_eq!( - report - 
.pointer("/operational_evidence/authority_recovery/dead_letter_handled_count") - .and_then(Value::as_u64), - Some(1) - ); -} - -#[test] -fn authority_recovery_fixture_rejects_incomplete_recovery_predicates() -> Result<()> { - for (slug, pointer, replacement, expected_error) in authority_recovery_failure_cases() { - assert_authority_recovery_fixture_failure( - slug, - |fixture| support::set_json_pointer(fixture, pointer, replacement), - expected_error, - )?; - } - - Ok(()) -} - -fn authority_recovery_failure_cases() -> Vec<(&'static str, &'static str, Value, &'static str)> { - vec![ - ( - "unrestored-backup", - "/corpus/adapter_response/answer/recovery_drills/0/backup_pitr/restored", - serde_json::json!(false), - "incomplete backup/PITR drill evidence", - ), - ( - "record-count-loss", - "/corpus/adapter_response/answer/recovery_drills/0/authority_record_counts/0/after_count", - serde_json::json!(2), - "lost or gained source authority records", - ), - ( - "source-ref-loss", - "/corpus/adapter_response/answer/recovery_drills/0/authority_record_counts/0/source_refs_preserved", - serde_json::json!(false), - "did not preserve source authority source refs", - ), - ( - "lifecycle-history-loss", - "/corpus/adapter_response/answer/recovery_drills/0/authority_record_counts/0/lifecycle_history_preserved", - serde_json::json!(false), - "did not preserve source authority lifecycle history", - ), - ( - "hidden-source-of-truth", - "/corpus/adapter_response/answer/recovery_drills/0/degraded_read/source_of_truth_visible", - serde_json::json!(false), - "hidden source-of-truth records during degraded read", - ), - ( - "rpo-miss", - "/corpus/adapter_response/answer/recovery_drills/0/rpo/measured_seconds", - serde_json::json!(61.0), - "exceeded rpo recovery target", - ), - ( - "non-idempotent-outbox", - "/corpus/adapter_response/answer/recovery_drills/0/outbox_replay/duplicate_write_count", - serde_json::json!(1), - "incomplete outbox replay drill evidence", - ), - ( - 
"incomplete-qdrant-rebuild", - "/corpus/adapter_response/answer/recovery_drills/0/qdrant_rebuild/complete", - serde_json::json!(false), - "incomplete Qdrant rebuild drill evidence", - ), - ( - "missing-migration-repair", - "/corpus/adapter_response/answer/recovery_drills/0/migration_repair/applied", - serde_json::json!(false), - "incomplete migration repair drill evidence", - ), - ( - "dead-letter-underhandled", - "/corpus/adapter_response/answer/recovery_drills/0/dead_letter/handled_count", - serde_json::json!(1), - "incomplete dead-letter handling drill evidence", - ), - ] -} - -fn assert_authority_recovery_fixture_failure( - slug: &str, - mutate: F, - expected_error: &str, -) -> Result<()> -where - F: FnOnce(&mut Value) -> Result<()>, -{ - let fixture_path = - support::production_ops_fixture_dir().join("authority_plane_recovery_drill.json"); - let mut fixture = support::load_json(&fixture_path)?; - - mutate(&mut fixture)?; - - let temp_dir = env::temp_dir().join(format!("elf-authority-recovery-{slug}-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - fs::write(temp_dir.join("fixture.json"), serde_json::to_vec_pretty(&fixture)?)?; - - let stderr = support::run_json_report_from_failure(temp_dir)?; - - assert!( - stderr.contains(expected_error), - "missing expected error `{expected_error}` in stderr: {stderr}", - ); + production_ops_summary::assert_production_ops_summary(&report)?; + production_ops_jobs::assert_production_ops_jobs(&report)?; + production_ops_evidence::assert_production_ops_operational_evidence(&report)?; Ok(()) } diff --git a/apps/elf-eval/tests/real_world_job_benchmark/production_ops_evidence.rs b/apps/elf-eval/tests/real_world_job_benchmark/production_ops_evidence.rs new file mode 100644 index 00000000..863c2a14 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/production_ops_evidence.rs @@ -0,0 +1,143 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +pub(super) fn 
assert_production_ops_operational_evidence(report: &Value) -> Result<()> { + assert_eq!( + report.pointer("/operational_evidence/schema").and_then(Value::as_str), + Some("elf.operational_evidence_gates/v1") + ); + assert_eq!( + report + .pointer("/operational_evidence/missing_private_provider_inputs_are_typed_blockers") + .and_then(Value::as_bool), + Some(true) + ); + assert_eq!( + report + .pointer("/operational_evidence/private_corpus_pass_claim_allowed") + .and_then(Value::as_bool), + Some(false) + ); + assert_eq!( + report + .pointer("/operational_evidence/provider_backed_pass_claim_allowed") + .and_then(Value::as_bool), + Some(false) + ); + assert_eq!( + report.pointer("/operational_evidence/latency/measured_job_count").and_then(Value::as_u64), + Some(8) + ); + assert_eq!( + report.pointer("/operational_evidence/cost/jobs_with_cost_report").and_then(Value::as_u64), + Some(8) + ); + assert_eq!( + report + .pointer("/operational_evidence/resource/resource_envelope_job_count") + .and_then(Value::as_u64), + Some(2) + ); + assert_eq!( + report + .pointer("/operational_evidence/cold_start_restore_rebuild/qdrant_rebuild_pass_count") + .and_then(Value::as_u64), + Some(2) + ); + + assert_authority_recovery_operational_evidence(report); + + let tiers = support::array_at(report, "/operational_evidence/tiers")?; + let local_fixture = support::find_by_field(tiers, "/tier", "local_fixture")?; + let public_proxy_tier = support::find_by_field(tiers, "/tier", "public_proxy")?; + let private_corpus = support::find_by_field(tiers, "/tier", "private_corpus")?; + let provider_backed = support::find_by_field(tiers, "/tier", "provider_backed")?; + + assert_eq!(local_fixture.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(local_fixture.pointer("/job_count").and_then(Value::as_u64), Some(5)); + assert_eq!(public_proxy_tier.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(public_proxy_tier.pointer("/job_count").and_then(Value::as_u64), 
Some(1)); + assert_eq!(private_corpus.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(private_corpus.pointer("/blocked").and_then(Value::as_u64), Some(1)); + assert_eq!(provider_backed.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(provider_backed.pointer("/blocked").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +fn assert_authority_recovery_operational_evidence(report: &Value) { + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/drill_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/authority_plane_count") + .and_then(Value::as_u64), + Some(7) + ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/backup_pitr_restored_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/record_count_preserved_count") + .and_then(Value::as_u64), + Some(7) + ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/source_ref_preserved_count") + .and_then(Value::as_u64), + Some(7) + ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/lifecycle_history_preserved_count") + .and_then(Value::as_u64), + Some(7) + ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/rpo_met_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/rto_met_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/idempotent_outbox_replay_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/qdrant_rebuild_complete_count") + .and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/migration_repair_count") + 
.and_then(Value::as_u64), + Some(1) + ); + assert_eq!( + report + .pointer("/operational_evidence/authority_recovery/dead_letter_handled_count") + .and_then(Value::as_u64), + Some(1) + ); +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/production_ops_failure_cases.rs b/apps/elf-eval/tests/real_world_job_benchmark/production_ops_failure_cases.rs new file mode 100644 index 00000000..4a1fdcc7 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/production_ops_failure_cases.rs @@ -0,0 +1,113 @@ +use std::{env, fs, process}; + +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn authority_recovery_fixture_rejects_incomplete_recovery_predicates() -> Result<()> { + for (slug, pointer, replacement, expected_error) in authority_recovery_failure_cases() { + assert_authority_recovery_fixture_failure( + slug, + |fixture| support::set_json_pointer(fixture, pointer, replacement), + expected_error, + )?; + } + + Ok(()) +} + +fn authority_recovery_failure_cases() -> Vec<(&'static str, &'static str, Value, &'static str)> { + vec![ + ( + "unrestored-backup", + "/corpus/adapter_response/answer/recovery_drills/0/backup_pitr/restored", + serde_json::json!(false), + "incomplete backup/PITR drill evidence", + ), + ( + "record-count-loss", + "/corpus/adapter_response/answer/recovery_drills/0/authority_record_counts/0/after_count", + serde_json::json!(2), + "lost or gained source authority records", + ), + ( + "source-ref-loss", + "/corpus/adapter_response/answer/recovery_drills/0/authority_record_counts/0/source_refs_preserved", + serde_json::json!(false), + "did not preserve source authority source refs", + ), + ( + "lifecycle-history-loss", + "/corpus/adapter_response/answer/recovery_drills/0/authority_record_counts/0/lifecycle_history_preserved", + serde_json::json!(false), + "did not preserve source authority lifecycle history", + ), + ( + "hidden-source-of-truth", + 
"/corpus/adapter_response/answer/recovery_drills/0/degraded_read/source_of_truth_visible", + serde_json::json!(false), + "hidden source-of-truth records during degraded read", + ), + ( + "rpo-miss", + "/corpus/adapter_response/answer/recovery_drills/0/rpo/measured_seconds", + serde_json::json!(61.0), + "exceeded rpo recovery target", + ), + ( + "non-idempotent-outbox", + "/corpus/adapter_response/answer/recovery_drills/0/outbox_replay/duplicate_write_count", + serde_json::json!(1), + "incomplete outbox replay drill evidence", + ), + ( + "incomplete-qdrant-rebuild", + "/corpus/adapter_response/answer/recovery_drills/0/qdrant_rebuild/complete", + serde_json::json!(false), + "incomplete Qdrant rebuild drill evidence", + ), + ( + "missing-migration-repair", + "/corpus/adapter_response/answer/recovery_drills/0/migration_repair/applied", + serde_json::json!(false), + "incomplete migration repair drill evidence", + ), + ( + "dead-letter-underhandled", + "/corpus/adapter_response/answer/recovery_drills/0/dead_letter/handled_count", + serde_json::json!(1), + "incomplete dead-letter handling drill evidence", + ), + ] +} + +fn assert_authority_recovery_fixture_failure( + slug: &str, + mutate: F, + expected_error: &str, +) -> Result<()> +where + F: FnOnce(&mut Value) -> Result<()>, +{ + let fixture_path = + support::production_ops_fixture_dir().join("authority_plane_recovery_drill.json"); + let mut fixture = support::load_json(&fixture_path)?; + + mutate(&mut fixture)?; + + let temp_dir = env::temp_dir().join(format!("elf-authority-recovery-{slug}-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("fixture.json"), serde_json::to_vec_pretty(&fixture)?)?; + + let stderr = support::run_json_report_from_failure(temp_dir)?; + + assert!( + stderr.contains(expected_error), + "missing expected error `{expected_error}` in stderr: {stderr}", + ); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/production_ops_jobs.rs 
b/apps/elf-eval/tests/real_world_job_benchmark/production_ops_jobs.rs new file mode 100644 index 00000000..e860ce7e --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/production_ops_jobs.rs @@ -0,0 +1,57 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +pub(super) fn assert_production_ops_jobs(report: &Value) -> Result<()> { + let jobs = support::array_at(report, "/jobs")?; + let authority_recovery = + support::find_by_field(jobs, "/job_id", "production-ops-authority-plane-recovery-001")?; + let backfill = support::find_by_field(jobs, "/job_id", "production-ops-backfill-resume-001")?; + let restore = support::find_by_field(jobs, "/job_id", "production-ops-restore-cold-start-001")?; + let public_proxy = + support::find_by_field(jobs, "/job_id", "production-ops-public-proxy-addendum-001")?; + let private_manifest = + support::find_by_field(jobs, "/job_id", "production-ops-private-manifest-blocked-001")?; + let credentials = + support::find_by_field(jobs, "/job_id", "production-ops-credential-boundary-001")?; + let dependency = + support::find_by_field(jobs, "/job_id", "production-ops-cold-start-dependency-001")?; + + assert_authority_recovery_job(authority_recovery)?; + + assert_eq!(authority_recovery.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(backfill.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(restore.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(restore.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), Some(true)); + assert_eq!(public_proxy.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!( + public_proxy.pointer("/operational_evidence_tier").and_then(Value::as_str), + Some("public_proxy") + ); + assert_eq!(private_manifest.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + private_manifest.pointer("/operational_evidence_tier").and_then(Value::as_str), + Some("private_corpus") + ); + 
assert_eq!(credentials.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!( + credentials.pointer("/operational_evidence_tier").and_then(Value::as_str), + Some("provider_backed") + ); + assert_eq!(dependency.pointer("/status").and_then(Value::as_str), Some("pass")); + + Ok(()) +} + +fn assert_authority_recovery_job(job: &Value) -> Result<()> { + assert_eq!(job.pointer("/qdrant_rebuild_case").and_then(Value::as_bool), Some(true)); + assert_eq!(job.pointer("/requires_caveat").and_then(Value::as_bool), Some(true)); + assert_eq!( + job.pointer("/recovery_drills/0/contract_schema").and_then(Value::as_str), + Some("elf.authority_recovery_drill/v1") + ); + assert!(support::array_at(job, "/hard_fail_hits")?.is_empty()); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/production_ops_summary.rs b/apps/elf-eval/tests/real_world_job_benchmark/production_ops_summary.rs new file mode 100644 index 00000000..4581efdf --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/production_ops_summary.rs @@ -0,0 +1,29 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +pub(super) fn assert_production_ops_summary(report: &Value) -> Result<()> { + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(8)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(6)); + assert_eq!(report.pointer("/summary/incomplete").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/blocked").and_then(Value::as_u64), Some(2)); + assert_eq!(report.pointer("/summary/not_encoded").and_then(Value::as_u64), Some(0)); + assert_eq!(report.pointer("/summary/evidence_coverage").and_then(Value::as_f64), Some(1.0)); + assert_eq!( + report.pointer("/summary/qdrant_rebuild_case_count").and_then(Value::as_u64), + Some(2) + ); + assert_eq!( + report.pointer("/private_corpus_redaction/private_fixture_count").and_then(Value::as_u64), + Some(1) + ); + + let suites = 
support::array_at(report, "/suites")?; + let production_ops = support::find_by_field(suites, "/suite_id", "production_ops")?; + + assert_eq!(production_ops.pointer("/status").and_then(Value::as_str), Some("blocked")); + assert_eq!(production_ops.pointer("/encoded_job_count").and_then(Value::as_u64), Some(8)); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/support/report_paths.rs b/apps/elf-eval/tests/real_world_job_benchmark/support/report_paths.rs index 623da2da..5459f51d 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/support/report_paths.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/support/report_paths.rs @@ -1,344 +1,50 @@ -use std::path::PathBuf; - -use color_eyre::Result; - -use crate::support; - -pub(crate) fn report_snapshot_path(file_name: &str) -> Result { - Ok(support::workspace_root()? - .join("apps") - .join("elf-eval") - .join("fixtures") - .join("report_snapshots") - .join(file_name)) -} - -pub(crate) fn strength_profile_report_path() -> Result { - report_snapshot_path("2026-06-11-qmd-openviking-strength-profile-report.json") -} - -pub(crate) fn strength_profile_markdown_path() -> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-11-qmd-openviking-strength-profile-report.md")) -} - -pub(crate) fn measurement_coverage_audit_path() -> Result { - Ok(support::workspace_root()? 
- .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-11-measurement-coverage-audit.md")) -} - -pub(crate) fn measurement_coverage_audit_json_path() -> Result { - report_snapshot_path("2026-06-11-measurement-coverage-audit.json") -} - -pub(crate) fn retrieval_debug_profile_json_path() -> Result { - report_snapshot_path("2026-06-11-elf-qmd-retrieval-debug-profile.json") -} - -pub(crate) fn trace_replay_diagnostics_report_path() -> Result { - report_snapshot_path("2026-06-11-elf-qmd-trace-replay-diagnostics-report.json") -} - -pub(crate) fn trace_replay_diagnostics_markdown_path() -> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-11-elf-qmd-trace-replay-diagnostics-report.md")) -} - -pub(crate) fn competitor_strength_adoption_report_path() -> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-11-competitor-strength-adoption-report.md")) -} - -pub(crate) fn competitor_strength_adoption_report_json_path() -> Result { - report_snapshot_path("2026-06-11-competitor-strength-adoption-report.json") -} - -pub(crate) fn capture_write_policy_live_report_path() -> Result { - report_snapshot_path("2026-06-11-capture-write-policy-live-report.json") -} - -pub(crate) fn capture_write_policy_live_markdown_path() -> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-11-capture-write-policy-live-report.md")) -} - -pub(crate) fn live_consolidation_proposal_scoring_report_path() -> Result { - report_snapshot_path("2026-06-16-live-consolidation-proposal-scoring-report.json") -} - -pub(crate) fn live_consolidation_proposal_scoring_markdown_path() -> Result { - Ok(support::workspace_root()? 
- .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-16-live-consolidation-proposal-scoring-report.md")) -} - -pub(crate) fn temporal_history_competitor_gap_json_path() -> Result { - report_snapshot_path("2026-06-11-temporal-history-competitor-gap-report.json") -} - -pub(crate) fn dreaming_readiness_stage_ledger_json_path() -> Result { - report_snapshot_path("2026-06-16-dreaming-readiness-stage-ledger.json") -} - -pub(crate) fn dreaming_readiness_stage_ledger_markdown_path() -> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-16-dreaming-readiness-stage-ledger.md")) -} - -pub(crate) fn dreaming_competitor_strength_retest_report_json_path() -> Result { - report_snapshot_path("2026-06-17-dreaming-competitor-strength-retest-report.json") -} - -pub(crate) fn dreaming_competitor_strength_retest_report_markdown_path() -> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-17-dreaming-competitor-strength-retest-report.md")) -} - -pub(crate) fn qmd_debug_ergonomics_dreaming_retest_report_json_path() -> Result { - report_snapshot_path("2026-06-19-qmd-debug-ergonomics-dreaming-retest-report.json") -} - -pub(crate) fn qmd_debug_ergonomics_dreaming_retest_report_markdown_path() -> Result { - Ok(support::workspace_root()? 
- .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-19-qmd-debug-ergonomics-dreaming-retest-report.md")) -} - -pub(crate) fn openviking_trajectory_materialization_report_json_path() -> Result { - report_snapshot_path("2026-06-19-openviking-trajectory-materialization-report.json") -} - -pub(crate) fn letta_core_archive_export_readback_report_json_path() -> Result { - report_snapshot_path("2026-06-19-letta-core-archive-export-readback-report.json") -} - -pub(crate) fn service_native_dreaming_readback_report_json_path() -> Result { - report_snapshot_path("2026-06-19-service-native-dreaming-readback-report.json") -} - -pub(crate) fn service_native_dreaming_readback_materialization_json_path() -> Result { - report_snapshot_path("2026-06-19-service-native-dreaming-readback-materialization.json") -} - -pub(crate) fn dreaming_review_queue_report_json_path() -> Result { - report_snapshot_path("2026-06-20-dreaming-review-queue-report.json") -} - -pub(crate) fn recall_debug_panel_report_json_path() -> Result { - report_snapshot_path("2026-06-20-recall-debug-panel-report.json") -} - -pub(crate) fn agent_knowledge_os_closeout_benchmark_report_json_path() -> Result { - report_snapshot_path("2026-06-20-agent-knowledge-os-closeout-benchmark-report.json") -} - -pub(crate) fn p2_knowledge_workspace_pageindex_openkb_closeout_report_json_path() -> Result -{ - report_snapshot_path("2026-06-22-p2-knowledge-workspace-pageindex-openkb-closeout-report.json") -} - -pub(crate) fn openmemory_ui_export_product_readback_report_json_path() -> Result { - report_snapshot_path("2026-06-19-openmemory-ui-export-product-readback-report.json") -} - -pub(crate) fn graph_rag_citation_navigation_promotion_report_json_path() -> Result { - report_snapshot_path("2026-06-19-graph-rag-citation-navigation-promotion-report.json") -} - -pub(crate) fn graph_rag_adapter_matrix_report_json_path() -> Result { - report_snapshot_path("2026-06-23-graph-rag-adapter-matrix-report.json") -} - 
-pub(crate) fn p3_competitor_strength_absorption_report_json_path() -> Result { - report_snapshot_path("2026-06-23-p3-competitor-strength-absorption-report.json") -} - -pub(crate) fn operator_approved_public_proxy_private_addendum_report_json_path() -> Result -{ - report_snapshot_path( - "2026-06-19-operator-approved-public-proxy-production-private-addendum.json", - ) -} - -pub(crate) fn openviking_trajectory_materialization_report_markdown_path() -> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-19-openviking-trajectory-materialization-report.md")) -} - -pub(crate) fn letta_core_archive_export_readback_report_markdown_path() -> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-19-letta-core-archive-export-readback-report.md")) -} - -pub(crate) fn service_native_dreaming_readback_report_markdown_path() -> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-19-service-native-dreaming-readback-report.md")) -} - -pub(crate) fn dreaming_review_queue_report_markdown_path() -> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-20-dreaming-review-queue-report.md")) -} - -pub(crate) fn recall_debug_panel_report_markdown_path() -> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-20-recall-debug-panel-report.md")) -} - -pub(crate) fn agent_knowledge_os_closeout_benchmark_report_markdown_path() -> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-20-agent-knowledge-os-closeout-benchmark-report.md")) -} - -pub(crate) fn p2_knowledge_workspace_pageindex_openkb_closeout_report_markdown_path() --> Result { - Ok(support::workspace_root()? 
- .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-22-p2-knowledge-workspace-pageindex-openkb-closeout-report.md")) -} - -pub(crate) fn openmemory_ui_export_product_readback_report_markdown_path() -> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-19-openmemory-ui-export-product-readback-report.md")) -} - -pub(crate) fn graph_rag_citation_navigation_promotion_report_markdown_path() -> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-19-graph-rag-citation-navigation-promotion-report.md")) -} - -pub(crate) fn graph_rag_adapter_matrix_report_markdown_path() -> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-23-graph-rag-adapter-matrix-report.md")) -} - -pub(crate) fn p3_competitor_strength_absorption_report_markdown_path() -> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-23-p3-competitor-strength-absorption-report.md")) -} - -pub(crate) fn graph_topic_map_report_markdown_path() -> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-20-graph-topic-map-report.md")) -} - -pub(crate) fn operator_approved_public_proxy_private_addendum_report_markdown_path() --> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-19-operator-approved-public-proxy-production-private-addendum.md")) -} - -pub(crate) fn live_temporal_reconciliation_report_json_path() -> Result { - report_snapshot_path("2026-06-16-live-temporal-reconciliation-report.json") -} - -pub(crate) fn live_temporal_reconciliation_report_markdown_path() -> Result { - Ok(support::workspace_root()? 
- .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-16-live-temporal-reconciliation-report.md")) -} - -pub(crate) fn competitor_strength_matrix_path() -> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-11-competitor-strength-evidence-matrix.md")) -} - -pub(crate) fn competitor_strength_matrix_json_path() -> Result { - report_snapshot_path("2026-06-11-xy-897-competitor-strength-matrix.json") -} - -pub(crate) fn readme_path() -> Result { - Ok(support::workspace_root()?.join("README.md")) -} - -pub(crate) fn comparison_external_projects_path() -> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("external_memory") - .join("comparison_external_projects.md")) -} - -pub(crate) fn benchmarking_index_path() -> Result { - Ok(support::workspace_root()? - .join("docs") - .join("evidence") - .join("benchmarking") - .join("index.md")) -} - -pub(crate) fn iteration_direction_report_path() -> Result { - Ok(support::workspace_root()? 
- .join("docs") - .join("evidence") - .join("benchmarking") - .join("2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md")) -} +mod report_paths_markdown; +mod report_paths_project_files; +mod report_paths_snapshots; + +pub(crate) use self::{ + report_paths_markdown::{ + agent_knowledge_os_closeout_benchmark_report_markdown_path, benchmarking_index_path, + capture_write_policy_live_markdown_path, comparison_external_projects_path, + competitor_strength_adoption_report_path, competitor_strength_matrix_path, + dreaming_competitor_strength_retest_report_markdown_path, + dreaming_readiness_stage_ledger_markdown_path, dreaming_review_queue_report_markdown_path, + graph_rag_adapter_matrix_report_markdown_path, + graph_rag_citation_navigation_promotion_report_markdown_path, + graph_topic_map_report_markdown_path, iteration_direction_report_path, + letta_core_archive_export_readback_report_markdown_path, + live_consolidation_proposal_scoring_markdown_path, + live_temporal_reconciliation_report_markdown_path, measurement_coverage_audit_path, + openmemory_ui_export_product_readback_report_markdown_path, + openviking_trajectory_materialization_report_markdown_path, + operator_approved_public_proxy_private_addendum_report_markdown_path, + p2_knowledge_workspace_pageindex_openkb_closeout_report_markdown_path, + p3_competitor_strength_absorption_report_markdown_path, + qmd_debug_ergonomics_dreaming_retest_report_markdown_path, + recall_debug_panel_report_markdown_path, + service_native_dreaming_readback_report_markdown_path, strength_profile_markdown_path, + trace_replay_diagnostics_markdown_path, + }, + report_paths_project_files::readme_path, + report_paths_snapshots::{ + agent_knowledge_os_closeout_benchmark_report_json_path, + capture_write_policy_live_report_path, competitor_strength_adoption_report_json_path, + competitor_strength_matrix_json_path, dreaming_competitor_strength_retest_report_json_path, + dreaming_readiness_stage_ledger_json_path, 
dreaming_review_queue_report_json_path, + graph_rag_adapter_matrix_report_json_path, + graph_rag_citation_navigation_promotion_report_json_path, + letta_core_archive_export_readback_report_json_path, + live_consolidation_proposal_scoring_report_path, + live_temporal_reconciliation_report_json_path, measurement_coverage_audit_json_path, + openmemory_ui_export_product_readback_report_json_path, + openviking_trajectory_materialization_report_json_path, + operator_approved_public_proxy_private_addendum_report_json_path, + p2_knowledge_workspace_pageindex_openkb_closeout_report_json_path, + p3_competitor_strength_absorption_report_json_path, + qmd_debug_ergonomics_dreaming_retest_report_json_path, recall_debug_panel_report_json_path, + retrieval_debug_profile_json_path, + service_native_dreaming_readback_materialization_json_path, + service_native_dreaming_readback_report_json_path, strength_profile_report_path, + temporal_history_competitor_gap_json_path, trace_replay_diagnostics_report_path, + }, +}; diff --git a/apps/elf-eval/tests/real_world_job_benchmark/support/report_paths_markdown.rs b/apps/elf-eval/tests/real_world_job_benchmark/support/report_paths_markdown.rs new file mode 100644 index 00000000..d32e85ef --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/support/report_paths_markdown.rs @@ -0,0 +1,127 @@ +use std::path::PathBuf; + +use color_eyre::Result; + +use crate::support; + +pub(crate) fn strength_profile_markdown_path() -> Result { + benchmarking_path("2026-06-11-qmd-openviking-strength-profile-report.md") +} + +pub(crate) fn measurement_coverage_audit_path() -> Result { + benchmarking_path("2026-06-11-measurement-coverage-audit.md") +} + +pub(crate) fn trace_replay_diagnostics_markdown_path() -> Result { + benchmarking_path("2026-06-11-elf-qmd-trace-replay-diagnostics-report.md") +} + +pub(crate) fn competitor_strength_adoption_report_path() -> Result { + benchmarking_path("2026-06-11-competitor-strength-adoption-report.md") +} + 
+pub(crate) fn capture_write_policy_live_markdown_path() -> Result { + benchmarking_path("2026-06-11-capture-write-policy-live-report.md") +} + +pub(crate) fn live_consolidation_proposal_scoring_markdown_path() -> Result { + benchmarking_path("2026-06-16-live-consolidation-proposal-scoring-report.md") +} + +pub(crate) fn dreaming_readiness_stage_ledger_markdown_path() -> Result { + benchmarking_path("2026-06-16-dreaming-readiness-stage-ledger.md") +} + +pub(crate) fn dreaming_competitor_strength_retest_report_markdown_path() -> Result { + benchmarking_path("2026-06-17-dreaming-competitor-strength-retest-report.md") +} + +pub(crate) fn qmd_debug_ergonomics_dreaming_retest_report_markdown_path() -> Result { + benchmarking_path("2026-06-19-qmd-debug-ergonomics-dreaming-retest-report.md") +} + +pub(crate) fn openviking_trajectory_materialization_report_markdown_path() -> Result { + benchmarking_path("2026-06-19-openviking-trajectory-materialization-report.md") +} + +pub(crate) fn letta_core_archive_export_readback_report_markdown_path() -> Result { + benchmarking_path("2026-06-19-letta-core-archive-export-readback-report.md") +} + +pub(crate) fn service_native_dreaming_readback_report_markdown_path() -> Result { + benchmarking_path("2026-06-19-service-native-dreaming-readback-report.md") +} + +pub(crate) fn dreaming_review_queue_report_markdown_path() -> Result { + benchmarking_path("2026-06-20-dreaming-review-queue-report.md") +} + +pub(crate) fn recall_debug_panel_report_markdown_path() -> Result { + benchmarking_path("2026-06-20-recall-debug-panel-report.md") +} + +pub(crate) fn agent_knowledge_os_closeout_benchmark_report_markdown_path() -> Result { + benchmarking_path("2026-06-20-agent-knowledge-os-closeout-benchmark-report.md") +} + +pub(crate) fn p2_knowledge_workspace_pageindex_openkb_closeout_report_markdown_path() +-> Result { + benchmarking_path("2026-06-22-p2-knowledge-workspace-pageindex-openkb-closeout-report.md") +} + +pub(crate) fn 
openmemory_ui_export_product_readback_report_markdown_path() -> Result { + benchmarking_path("2026-06-19-openmemory-ui-export-product-readback-report.md") +} + +pub(crate) fn graph_rag_citation_navigation_promotion_report_markdown_path() -> Result { + benchmarking_path("2026-06-19-graph-rag-citation-navigation-promotion-report.md") +} + +pub(crate) fn graph_rag_adapter_matrix_report_markdown_path() -> Result { + benchmarking_path("2026-06-23-graph-rag-adapter-matrix-report.md") +} + +pub(crate) fn p3_competitor_strength_absorption_report_markdown_path() -> Result { + benchmarking_path("2026-06-23-p3-competitor-strength-absorption-report.md") +} + +pub(crate) fn graph_topic_map_report_markdown_path() -> Result { + benchmarking_path("2026-06-20-graph-topic-map-report.md") +} + +pub(crate) fn operator_approved_public_proxy_private_addendum_report_markdown_path() +-> Result { + benchmarking_path("2026-06-19-operator-approved-public-proxy-production-private-addendum.md") +} + +pub(crate) fn live_temporal_reconciliation_report_markdown_path() -> Result { + benchmarking_path("2026-06-16-live-temporal-reconciliation-report.md") +} + +pub(crate) fn competitor_strength_matrix_path() -> Result { + benchmarking_path("2026-06-11-competitor-strength-evidence-matrix.md") +} + +pub(crate) fn comparison_external_projects_path() -> Result { + Ok(support::workspace_root()? + .join("docs") + .join("evidence") + .join("external_memory") + .join("comparison_external_projects.md")) +} + +pub(crate) fn benchmarking_index_path() -> Result { + benchmarking_path("index.md") +} + +pub(crate) fn iteration_direction_report_path() -> Result { + benchmarking_path("2026-06-11-elf-iteration-direction-from-competitor-benchmarks.md") +} + +fn benchmarking_path(file_name: &str) -> Result { + Ok(support::workspace_root()? 
+ .join("docs") + .join("evidence") + .join("benchmarking") + .join(file_name)) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/support/report_paths_project_files.rs b/apps/elf-eval/tests/real_world_job_benchmark/support/report_paths_project_files.rs new file mode 100644 index 00000000..c899baab --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/support/report_paths_project_files.rs @@ -0,0 +1,9 @@ +use std::path::PathBuf; + +use color_eyre::Result; + +use crate::support; + +pub(crate) fn readme_path() -> Result { + Ok(support::workspace_root()?.join("README.md")) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/support/report_paths_snapshots.rs b/apps/elf-eval/tests/real_world_job_benchmark/support/report_paths_snapshots.rs new file mode 100644 index 00000000..1151e1bf --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/support/report_paths_snapshots.rs @@ -0,0 +1,122 @@ +use std::path::PathBuf; + +use color_eyre::Result; + +use crate::support; + +pub(crate) fn strength_profile_report_path() -> Result { + report_snapshot_path("2026-06-11-qmd-openviking-strength-profile-report.json") +} + +pub(crate) fn measurement_coverage_audit_json_path() -> Result { + report_snapshot_path("2026-06-11-measurement-coverage-audit.json") +} + +pub(crate) fn retrieval_debug_profile_json_path() -> Result { + report_snapshot_path("2026-06-11-elf-qmd-retrieval-debug-profile.json") +} + +pub(crate) fn trace_replay_diagnostics_report_path() -> Result { + report_snapshot_path("2026-06-11-elf-qmd-trace-replay-diagnostics-report.json") +} + +pub(crate) fn competitor_strength_adoption_report_json_path() -> Result { + report_snapshot_path("2026-06-11-competitor-strength-adoption-report.json") +} + +pub(crate) fn capture_write_policy_live_report_path() -> Result { + report_snapshot_path("2026-06-11-capture-write-policy-live-report.json") +} + +pub(crate) fn live_consolidation_proposal_scoring_report_path() -> Result { + 
report_snapshot_path("2026-06-16-live-consolidation-proposal-scoring-report.json") +} + +pub(crate) fn temporal_history_competitor_gap_json_path() -> Result { + report_snapshot_path("2026-06-11-temporal-history-competitor-gap-report.json") +} + +pub(crate) fn dreaming_readiness_stage_ledger_json_path() -> Result { + report_snapshot_path("2026-06-16-dreaming-readiness-stage-ledger.json") +} + +pub(crate) fn dreaming_competitor_strength_retest_report_json_path() -> Result { + report_snapshot_path("2026-06-17-dreaming-competitor-strength-retest-report.json") +} + +pub(crate) fn qmd_debug_ergonomics_dreaming_retest_report_json_path() -> Result { + report_snapshot_path("2026-06-19-qmd-debug-ergonomics-dreaming-retest-report.json") +} + +pub(crate) fn openviking_trajectory_materialization_report_json_path() -> Result { + report_snapshot_path("2026-06-19-openviking-trajectory-materialization-report.json") +} + +pub(crate) fn letta_core_archive_export_readback_report_json_path() -> Result { + report_snapshot_path("2026-06-19-letta-core-archive-export-readback-report.json") +} + +pub(crate) fn service_native_dreaming_readback_report_json_path() -> Result { + report_snapshot_path("2026-06-19-service-native-dreaming-readback-report.json") +} + +pub(crate) fn service_native_dreaming_readback_materialization_json_path() -> Result { + report_snapshot_path("2026-06-19-service-native-dreaming-readback-materialization.json") +} + +pub(crate) fn dreaming_review_queue_report_json_path() -> Result { + report_snapshot_path("2026-06-20-dreaming-review-queue-report.json") +} + +pub(crate) fn recall_debug_panel_report_json_path() -> Result { + report_snapshot_path("2026-06-20-recall-debug-panel-report.json") +} + +pub(crate) fn agent_knowledge_os_closeout_benchmark_report_json_path() -> Result { + report_snapshot_path("2026-06-20-agent-knowledge-os-closeout-benchmark-report.json") +} + +pub(crate) fn p2_knowledge_workspace_pageindex_openkb_closeout_report_json_path() -> Result +{ + 
report_snapshot_path("2026-06-22-p2-knowledge-workspace-pageindex-openkb-closeout-report.json") +} + +pub(crate) fn openmemory_ui_export_product_readback_report_json_path() -> Result { + report_snapshot_path("2026-06-19-openmemory-ui-export-product-readback-report.json") +} + +pub(crate) fn graph_rag_citation_navigation_promotion_report_json_path() -> Result { + report_snapshot_path("2026-06-19-graph-rag-citation-navigation-promotion-report.json") +} + +pub(crate) fn graph_rag_adapter_matrix_report_json_path() -> Result { + report_snapshot_path("2026-06-23-graph-rag-adapter-matrix-report.json") +} + +pub(crate) fn p3_competitor_strength_absorption_report_json_path() -> Result { + report_snapshot_path("2026-06-23-p3-competitor-strength-absorption-report.json") +} + +pub(crate) fn operator_approved_public_proxy_private_addendum_report_json_path() -> Result +{ + report_snapshot_path( + "2026-06-19-operator-approved-public-proxy-production-private-addendum.json", + ) +} + +pub(crate) fn live_temporal_reconciliation_report_json_path() -> Result { + report_snapshot_path("2026-06-16-live-temporal-reconciliation-report.json") +} + +pub(crate) fn competitor_strength_matrix_json_path() -> Result { + report_snapshot_path("2026-06-11-xy-897-competitor-strength-matrix.json") +} + +fn report_snapshot_path(file_name: &str) -> Result { + Ok(support::workspace_root()? 
+ .join("apps") + .join("elf-eval") + .join("fixtures") + .join("report_snapshots") + .join(file_name)) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/work_continuity.rs b/apps/elf-eval/tests/real_world_job_benchmark/work_continuity.rs index 8af1f48f..cd9557d8 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/work_continuity.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/work_continuity.rs @@ -1,330 +1,3 @@ -use std::{ - env, fs, - process::{self, Command}, -}; - -use color_eyre::Result; -use serde_json::Value; - -use crate::support; - -#[test] -fn work_continuity_fixtures_score_required_metrics() -> Result<()> { - let report = support::run_json_report_from(support::work_continuity_fixture_dir())?; - - assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(8)); - assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(8)); - assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); - - assert_work_continuity_summary_counts(&report); - - let suites = support::array_at(&report, "/suites")?; - let work_continuity = support::find_by_field(suites, "/suite_id", "work_continuity")?; - - assert_eq!(work_continuity.pointer("/status").and_then(Value::as_str), Some("pass")); - assert_eq!(work_continuity.pointer("/encoded_job_count").and_then(Value::as_u64), Some(8)); - - Ok(()) -} - -fn assert_work_continuity_summary_counts(report: &Value) { - for (field, expected) in [ - ("readback_count", 8), - ("entry_count", 8), - ("reset_resume_required_count", 1), - ("reset_resume_success_count", 1), - ("decision_rationale_required_count", 1), - ("decision_rationale_recalled_count", 1), - ("rejected_option_required_count", 1), - ("rejected_option_suppressed_count", 1), - ("rejected_option_resurrection_count", 0), - ("explicit_next_step_required_count", 1), - ("explicit_next_step_returned_count", 1), - ("explicit_next_step_correct_count", 1), - ("inferred_next_step_required_count", 1), - 
("inferred_next_step_labeled_count", 1), - ("inferred_step_instruction_count", 0), - ("handoff_source_ref_required_count", 1), - ("handoff_source_ref_covered_count", 1), - ("redaction_required_count", 1), - ("redaction_applied_count", 1), - ("sensitive_marker_persistence_count", 0), - ("janitor_candidate_count", 1), - ("janitor_false_promotion_count", 0), - ("journal_only_authority_claim_count", 0), - ] { - assert_work_continuity_summary_u64(report, field, expected); - } - for (field, expected) in [ - ("reset_resume_success_rate", 1.0), - ("decision_rationale_recall_rate", 1.0), - ("rejected_option_suppression_rate", 1.0), - ("explicit_next_step_precision", 1.0), - ("inferred_next_step_labeling_rate", 1.0), - ("handoff_source_ref_coverage", 1.0), - ("redaction_rate", 1.0), - ("janitor_false_promotion_rate", 0.0), - ] { - assert_work_continuity_summary_f64(report, field, expected); - } -} - -fn assert_work_continuity_summary_u64(report: &Value, field: &str, expected: u64) { - assert_eq!( - report.pointer(&format!("/summary/work_continuity/{field}")).and_then(Value::as_u64), - Some(expected), - "unexpected Work Continuity summary field {field}", - ); -} - -fn assert_work_continuity_summary_f64(report: &Value, field: &str, expected: f64) { - assert_eq!( - report.pointer(&format!("/summary/work_continuity/{field}")).and_then(Value::as_f64), - Some(expected), - "unexpected Work Continuity summary field {field}", - ); -} - -#[test] -fn work_continuity_markdown_renders_required_metrics() -> Result<()> { - let report = support::run_json_report_from(support::work_continuity_fixture_dir())?; - let temp_dir = - env::temp_dir().join(format!("elf-real-world-work-continuity-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - - let report_path = temp_dir.join("work-continuity-report.json"); - let markdown_path = temp_dir.join("work-continuity-report.md"); - - fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; - - let output = 
Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("publish") - .arg("--report") - .arg(&report_path) - .arg("--out") - .arg(&markdown_path) - .output()?; - - assert!( - output.status.success(), - "real_world_job publisher failed: {}", - String::from_utf8_lossy(&output.stderr), - ); - - let markdown = fs::read_to_string(markdown_path)?; - - assert!(markdown.contains("Work Continuity Metrics")); - assert!(markdown.contains("work-continuity-redaction-001")); - assert!(markdown.contains("work-continuity-janitor-false-promotion-001")); - assert!(markdown.contains("Janitor False Promotion")); - assert!(markdown.contains("Sensitive Persistence")); - assert!(markdown.contains("Journal Authority Claims")); - assert!(markdown.contains("| work-continuity-reset-resume-001 | 1 | 1 | `1/1` (`1.000`)")); - assert!(markdown.contains( - "| work-continuity-explicit-next-step-001 | 1 | 1 | `0/0` (`0.000`) | `0/0` (`0.000`) | `0/0` (`0.000`) | `1/1` (`1.000`)" - )); - assert!(markdown.contains( - "| work-continuity-handoff-source-ref-001 | 1 | 1 | `0/0` (`0.000`) | `0/0` (`0.000`) | `0/0` (`0.000`) | `0/0` (`1.000`) | `0/0` (`0.000`) | `1/1` (`1.000`)" - )); - assert!(markdown.contains( - "| work-continuity-redaction-001 | 1 | 1 | `0/0` (`0.000`) | `0/0` (`0.000`) | `0/0` (`0.000`) | `0/0` (`1.000`) | `0/0` (`0.000`) | `0/0` (`0.000`) | `1/1` (`1.000`)" - )); - assert!(markdown.contains( - "| work-continuity-janitor-false-promotion-001 | 1 | 1 | `0/0` (`0.000`) | `0/0` (`0.000`) | `0/0` (`0.000`) | `0/0` (`1.000`) | `0/0` (`0.000`) | `0/0` (`0.000`) | `0/0` (`0.000`) | `0/1` (`0.000`)" - )); - - Ok(()) -} - -#[test] -fn work_continuity_fixture_fails_sensitive_marker_persistence() -> Result<()> { - let report = run_work_continuity_mutation( - "redaction_sensitive_marker.json", - "sensitive_marker_persistence.json", - |fixture| { - fixture["corpus"]["adapter_response"]["answer"]["work_journal_readbacks"][0]["items"] - 
[0]["redaction_audit"]["persisted_sensitive_marker_ids"] = - serde_json::json!(["secret-demo-token"]); - }, - )?; - let job = single_work_continuity_job(&report, "work-continuity-redaction-001")?; - - assert_work_continuity_wrong_result(job, "sensitive_marker_persistence_count", 1); - - Ok(()) -} - -#[test] -fn work_continuity_fixture_fails_rejected_option_resurrection() -> Result<()> { - let report = run_work_continuity_mutation( - "rejected_option_suppression.json", - "rejected_option_resurrection.json", - |fixture| { - fixture["corpus"]["adapter_response"]["answer"]["work_journal_readbacks"][0]["items"] - [0]["rejected_options"][0]["resurrected_as_current"] = Value::Bool(true); - }, - )?; - let job = single_work_continuity_job(&report, "work-continuity-rejected-option-001")?; - - assert_work_continuity_wrong_result(job, "rejected_option_resurrection_count", 1); - - Ok(()) -} - -#[test] -fn work_continuity_fixture_fails_inferred_step_instruction() -> Result<()> { - let report = run_work_continuity_mutation( - "inferred_next_step_labeling.json", - "inferred_step_instruction.json", - |fixture| { - fixture["corpus"]["adapter_response"]["answer"]["work_journal_readbacks"][0]["items"] - [0]["inferred_next_steps"][0]["instruction"] = Value::Bool(true); - }, - )?; - let job = single_work_continuity_job(&report, "work-continuity-inferred-next-step-001")?; - - assert_work_continuity_wrong_result(job, "inferred_step_instruction_count", 1); - - Ok(()) -} - -#[test] -fn work_continuity_fixture_fails_journal_only_authority_claim() -> Result<()> { - let report = run_work_continuity_mutation( - "handoff_source_ref_coverage.json", - "journal_only_authority_claim.json", - |fixture| { - fixture["corpus"]["adapter_response"]["answer"]["work_journal_readbacks"][0]["where_stopped"] - ["journal_only_authority_claims"] = serde_json::json!(["wj-handoff-source-ref"]); - }, - )?; - let job = single_work_continuity_job(&report, "work-continuity-handoff-source-ref-001")?; - - 
assert_work_continuity_wrong_result(job, "journal_only_authority_claim_count", 1); - - Ok(()) -} - -#[test] -fn work_continuity_fixture_fails_janitor_promotion_or_missing_review() -> Result<()> { - let promoted = run_work_continuity_mutation( - "janitor_false_promotion_guard.json", - "janitor_promoted.json", - |fixture| { - fixture["corpus"]["adapter_response"]["answer"]["work_journal_readbacks"][0]["janitor_candidates"] - [0]["promoted_to_memory"] = Value::Bool(true); - }, - )?; - let promoted_job = - single_work_continuity_job(&promoted, "work-continuity-janitor-false-promotion-001")?; - - assert_work_continuity_wrong_result(promoted_job, "janitor_false_promotion_count", 1); - assert_hard_fail_hit(promoted_job, "janitor Work Journal candidate promoted without review"); - - let missing_review = run_work_continuity_mutation( - "janitor_false_promotion_guard.json", - "janitor_missing_review_required.json", - |fixture| { - fixture["corpus"]["adapter_response"]["answer"]["work_journal_readbacks"][0]["janitor_candidates"] - [0]["review_required"] = Value::Bool(false); - }, - )?; - let missing_review_job = - single_work_continuity_job(&missing_review, "work-continuity-janitor-false-promotion-001")?; - - assert_work_continuity_wrong_result(missing_review_job, "janitor_false_promotion_count", 1); - assert_hard_fail_hit( - missing_review_job, - "janitor Work Journal candidate promoted without review", - ); - - let extra_bad_candidate = run_work_continuity_mutation( - "janitor_false_promotion_guard.json", - "janitor_extra_bad_candidate.json", - |fixture| { - fixture["corpus"]["adapter_response"]["answer"]["work_journal_readbacks"][0]["janitor_candidates"] = serde_json::json!([ - { - "candidate_id": "wj-janitor-candidate", - "evidence_refs": ["wj-janitor-candidate-source"], - "review_required": true, - "promoted_to_memory": false - }, - { - "candidate_id": "wj-extra-janitor-candidate", - "evidence_refs": ["wj-janitor-candidate-source"], - "review_required": true, - 
"promoted_to_memory": true - } - ]); - }, - )?; - let extra_bad_candidate_job = single_work_continuity_job( - &extra_bad_candidate, - "work-continuity-janitor-false-promotion-001", - )?; - - assert_work_continuity_wrong_result( - extra_bad_candidate_job, - "janitor_false_promotion_count", - 1, - ); - assert_hard_fail_hit( - extra_bad_candidate_job, - "janitor Work Journal candidate promoted without review", - ); - - assert_eq!( - extra_bad_candidate_job - .pointer("/work_continuity/janitor_candidate_count") - .and_then(Value::as_u64), - Some(2) - ); - - Ok(()) -} - -fn run_work_continuity_mutation( - fixture_name: &str, - output_name: &str, - mutate: impl FnOnce(&mut Value), -) -> Result { - let fixture_path = support::work_continuity_fixture_dir().join(fixture_name); - let temp_dir = - env::temp_dir().join(format!("elf-work-continuity-{output_name}-{}", process::id())); - let mut fixture = support::load_json(&fixture_path)?; - - mutate(&mut fixture); - - if temp_dir.exists() { - fs::remove_dir_all(&temp_dir)?; - } - - fs::create_dir_all(&temp_dir)?; - fs::write(temp_dir.join(output_name), serde_json::to_vec_pretty(&fixture)?)?; - - support::run_json_report_from(temp_dir) -} - -fn single_work_continuity_job<'a>(report: &'a Value, job_id: &str) -> Result<&'a Value> { - let jobs = support::array_at(report, "/jobs")?; - - support::find_by_field(jobs, "/job_id", job_id) -} - -fn assert_work_continuity_wrong_result(job: &Value, metric_name: &str, expected: u64) { - assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); - assert_eq!( - job.pointer(&format!("/work_continuity/{metric_name}")).and_then(Value::as_u64), - Some(expected) - ); -} - -fn assert_hard_fail_hit(job: &Value, expected_hit: &str) { - let hits = job.pointer("/hard_fail_hits").and_then(Value::as_array).expect("hard_fail_hits"); - - assert!( - hits.iter().filter_map(Value::as_str).any(|hit| hit == expected_hit), - "missing hard_fail_hits marker: {expected_hit}" - ); -} +mod 
work_continuity_markdown; +mod work_continuity_mutations; +mod work_continuity_summary; diff --git a/apps/elf-eval/tests/real_world_job_benchmark/work_continuity_markdown.rs b/apps/elf-eval/tests/real_world_job_benchmark/work_continuity_markdown.rs new file mode 100644 index 00000000..e7aec6b6 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/work_continuity_markdown.rs @@ -0,0 +1,60 @@ +use std::{ + env, fs, + process::{self, Command}, +}; + +use color_eyre::Result; + +use crate::support; + +#[test] +fn work_continuity_markdown_renders_required_metrics() -> Result<()> { + let report = support::run_json_report_from(support::work_continuity_fixture_dir())?; + let temp_dir = + env::temp_dir().join(format!("elf-real-world-work-continuity-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + + let report_path = temp_dir.join("work-continuity-report.json"); + let markdown_path = temp_dir.join("work-continuity-report.md"); + + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("publish") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&markdown_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job publisher failed: {}", + String::from_utf8_lossy(&output.stderr), + ); + + let markdown = fs::read_to_string(markdown_path)?; + + assert!(markdown.contains("Work Continuity Metrics")); + assert!(markdown.contains("work-continuity-redaction-001")); + assert!(markdown.contains("work-continuity-janitor-false-promotion-001")); + assert!(markdown.contains("Janitor False Promotion")); + assert!(markdown.contains("Sensitive Persistence")); + assert!(markdown.contains("Journal Authority Claims")); + assert!(markdown.contains("| work-continuity-reset-resume-001 | 1 | 1 | `1/1` (`1.000`)")); + assert!(markdown.contains( + "| work-continuity-explicit-next-step-001 | 1 | 1 | `0/0` (`0.000`) | `0/0` (`0.000`) | `0/0` (`0.000`) | 
`1/1` (`1.000`)" + )); + assert!(markdown.contains( + "| work-continuity-handoff-source-ref-001 | 1 | 1 | `0/0` (`0.000`) | `0/0` (`0.000`) | `0/0` (`0.000`) | `0/0` (`1.000`) | `0/0` (`0.000`) | `1/1` (`1.000`)" + )); + assert!(markdown.contains( + "| work-continuity-redaction-001 | 1 | 1 | `0/0` (`0.000`) | `0/0` (`0.000`) | `0/0` (`0.000`) | `0/0` (`1.000`) | `0/0` (`0.000`) | `0/0` (`0.000`) | `1/1` (`1.000`)" + )); + assert!(markdown.contains( + "| work-continuity-janitor-false-promotion-001 | 1 | 1 | `0/0` (`0.000`) | `0/0` (`0.000`) | `0/0` (`0.000`) | `0/0` (`1.000`) | `0/0` (`0.000`) | `0/0` (`0.000`) | `0/0` (`0.000`) | `0/1` (`0.000`)" + )); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/work_continuity_mutations.rs b/apps/elf-eval/tests/real_world_job_benchmark/work_continuity_mutations.rs new file mode 100644 index 00000000..4e42c8dd --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/work_continuity_mutations.rs @@ -0,0 +1,198 @@ +use std::{env, fs, process}; + +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn work_continuity_fixture_fails_sensitive_marker_persistence() -> Result<()> { + let report = run_work_continuity_mutation( + "redaction_sensitive_marker.json", + "sensitive_marker_persistence.json", + |fixture| { + fixture["corpus"]["adapter_response"]["answer"]["work_journal_readbacks"][0]["items"] + [0]["redaction_audit"]["persisted_sensitive_marker_ids"] = + serde_json::json!(["secret-demo-token"]); + }, + )?; + let job = single_work_continuity_job(&report, "work-continuity-redaction-001")?; + + assert_work_continuity_wrong_result(job, "sensitive_marker_persistence_count", 1); + + Ok(()) +} + +#[test] +fn work_continuity_fixture_fails_rejected_option_resurrection() -> Result<()> { + let report = run_work_continuity_mutation( + "rejected_option_suppression.json", + "rejected_option_resurrection.json", + |fixture| { + 
fixture["corpus"]["adapter_response"]["answer"]["work_journal_readbacks"][0]["items"] + [0]["rejected_options"][0]["resurrected_as_current"] = Value::Bool(true); + }, + )?; + let job = single_work_continuity_job(&report, "work-continuity-rejected-option-001")?; + + assert_work_continuity_wrong_result(job, "rejected_option_resurrection_count", 1); + + Ok(()) +} + +#[test] +fn work_continuity_fixture_fails_inferred_step_instruction() -> Result<()> { + let report = run_work_continuity_mutation( + "inferred_next_step_labeling.json", + "inferred_step_instruction.json", + |fixture| { + fixture["corpus"]["adapter_response"]["answer"]["work_journal_readbacks"][0]["items"] + [0]["inferred_next_steps"][0]["instruction"] = Value::Bool(true); + }, + )?; + let job = single_work_continuity_job(&report, "work-continuity-inferred-next-step-001")?; + + assert_work_continuity_wrong_result(job, "inferred_step_instruction_count", 1); + + Ok(()) +} + +#[test] +fn work_continuity_fixture_fails_journal_only_authority_claim() -> Result<()> { + let report = run_work_continuity_mutation( + "handoff_source_ref_coverage.json", + "journal_only_authority_claim.json", + |fixture| { + fixture["corpus"]["adapter_response"]["answer"]["work_journal_readbacks"][0]["where_stopped"] + ["journal_only_authority_claims"] = serde_json::json!(["wj-handoff-source-ref"]); + }, + )?; + let job = single_work_continuity_job(&report, "work-continuity-handoff-source-ref-001")?; + + assert_work_continuity_wrong_result(job, "journal_only_authority_claim_count", 1); + + Ok(()) +} + +#[test] +fn work_continuity_fixture_fails_janitor_promotion_or_missing_review() -> Result<()> { + let promoted = run_work_continuity_mutation( + "janitor_false_promotion_guard.json", + "janitor_promoted.json", + |fixture| { + fixture["corpus"]["adapter_response"]["answer"]["work_journal_readbacks"][0]["janitor_candidates"] + [0]["promoted_to_memory"] = Value::Bool(true); + }, + )?; + let promoted_job = + 
single_work_continuity_job(&promoted, "work-continuity-janitor-false-promotion-001")?; + + assert_work_continuity_wrong_result(promoted_job, "janitor_false_promotion_count", 1); + assert_hard_fail_hit(promoted_job, "janitor Work Journal candidate promoted without review"); + + let missing_review = run_work_continuity_mutation( + "janitor_false_promotion_guard.json", + "janitor_missing_review_required.json", + |fixture| { + fixture["corpus"]["adapter_response"]["answer"]["work_journal_readbacks"][0]["janitor_candidates"] + [0]["review_required"] = Value::Bool(false); + }, + )?; + let missing_review_job = + single_work_continuity_job(&missing_review, "work-continuity-janitor-false-promotion-001")?; + + assert_work_continuity_wrong_result(missing_review_job, "janitor_false_promotion_count", 1); + assert_hard_fail_hit( + missing_review_job, + "janitor Work Journal candidate promoted without review", + ); + + let extra_bad_candidate = run_work_continuity_mutation( + "janitor_false_promotion_guard.json", + "janitor_extra_bad_candidate.json", + |fixture| { + fixture["corpus"]["adapter_response"]["answer"]["work_journal_readbacks"][0]["janitor_candidates"] = serde_json::json!([ + { + "candidate_id": "wj-janitor-candidate", + "evidence_refs": ["wj-janitor-candidate-source"], + "review_required": true, + "promoted_to_memory": false + }, + { + "candidate_id": "wj-extra-janitor-candidate", + "evidence_refs": ["wj-janitor-candidate-source"], + "review_required": true, + "promoted_to_memory": true + } + ]); + }, + )?; + let extra_bad_candidate_job = single_work_continuity_job( + &extra_bad_candidate, + "work-continuity-janitor-false-promotion-001", + )?; + + assert_work_continuity_wrong_result( + extra_bad_candidate_job, + "janitor_false_promotion_count", + 1, + ); + assert_hard_fail_hit( + extra_bad_candidate_job, + "janitor Work Journal candidate promoted without review", + ); + + assert_eq!( + extra_bad_candidate_job + .pointer("/work_continuity/janitor_candidate_count") + 
.and_then(Value::as_u64), + Some(2) + ); + + Ok(()) +} + +fn run_work_continuity_mutation( + fixture_name: &str, + output_name: &str, + mutate: impl FnOnce(&mut Value), +) -> Result { + let fixture_path = support::work_continuity_fixture_dir().join(fixture_name); + let temp_dir = + env::temp_dir().join(format!("elf-work-continuity-{output_name}-{}", process::id())); + let mut fixture = support::load_json(&fixture_path)?; + + mutate(&mut fixture); + + if temp_dir.exists() { + fs::remove_dir_all(&temp_dir)?; + } + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join(output_name), serde_json::to_vec_pretty(&fixture)?)?; + + support::run_json_report_from(temp_dir) +} + +fn single_work_continuity_job<'a>(report: &'a Value, job_id: &str) -> Result<&'a Value> { + let jobs = support::array_at(report, "/jobs")?; + + support::find_by_field(jobs, "/job_id", job_id) +} + +fn assert_work_continuity_wrong_result(job: &Value, metric_name: &str, expected: u64) { + assert_eq!(job.pointer("/status").and_then(Value::as_str), Some("wrong_result")); + assert_eq!( + job.pointer(&format!("/work_continuity/{metric_name}")).and_then(Value::as_u64), + Some(expected) + ); +} + +fn assert_hard_fail_hit(job: &Value, expected_hit: &str) { + let hits = job.pointer("/hard_fail_hits").and_then(Value::as_array).expect("hard_fail_hits"); + + assert!( + hits.iter().filter_map(Value::as_str).any(|hit| hit == expected_hit), + "missing hard_fail_hits marker: {expected_hit}" + ); +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/work_continuity_summary.rs b/apps/elf-eval/tests/real_world_job_benchmark/work_continuity_summary.rs new file mode 100644 index 00000000..2a5c398d --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/work_continuity_summary.rs @@ -0,0 +1,81 @@ +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn work_continuity_fixtures_score_required_metrics() -> Result<()> { + let report = 
support::run_json_report_from(support::work_continuity_fixture_dir())?; + + assert_eq!(report.pointer("/summary/job_count").and_then(Value::as_u64), Some(8)); + assert_eq!(report.pointer("/summary/pass").and_then(Value::as_u64), Some(8)); + assert_eq!(report.pointer("/summary/wrong_result").and_then(Value::as_u64), Some(0)); + + assert_work_continuity_summary_counts(&report); + + let suites = support::array_at(&report, "/suites")?; + let work_continuity = support::find_by_field(suites, "/suite_id", "work_continuity")?; + + assert_eq!(work_continuity.pointer("/status").and_then(Value::as_str), Some("pass")); + assert_eq!(work_continuity.pointer("/encoded_job_count").and_then(Value::as_u64), Some(8)); + + Ok(()) +} + +fn assert_work_continuity_summary_counts(report: &Value) { + for (field, expected) in [ + ("readback_count", 8), + ("entry_count", 8), + ("reset_resume_required_count", 1), + ("reset_resume_success_count", 1), + ("decision_rationale_required_count", 1), + ("decision_rationale_recalled_count", 1), + ("rejected_option_required_count", 1), + ("rejected_option_suppressed_count", 1), + ("rejected_option_resurrection_count", 0), + ("explicit_next_step_required_count", 1), + ("explicit_next_step_returned_count", 1), + ("explicit_next_step_correct_count", 1), + ("inferred_next_step_required_count", 1), + ("inferred_next_step_labeled_count", 1), + ("inferred_step_instruction_count", 0), + ("handoff_source_ref_required_count", 1), + ("handoff_source_ref_covered_count", 1), + ("redaction_required_count", 1), + ("redaction_applied_count", 1), + ("sensitive_marker_persistence_count", 0), + ("janitor_candidate_count", 1), + ("janitor_false_promotion_count", 0), + ("journal_only_authority_claim_count", 0), + ] { + assert_work_continuity_summary_u64(report, field, expected); + } + for (field, expected) in [ + ("reset_resume_success_rate", 1.0), + ("decision_rationale_recall_rate", 1.0), + ("rejected_option_suppression_rate", 1.0), + ("explicit_next_step_precision", 
1.0), + ("inferred_next_step_labeling_rate", 1.0), + ("handoff_source_ref_coverage", 1.0), + ("redaction_rate", 1.0), + ("janitor_false_promotion_rate", 0.0), + ] { + assert_work_continuity_summary_f64(report, field, expected); + } +} + +fn assert_work_continuity_summary_u64(report: &Value, field: &str, expected: u64) { + assert_eq!( + report.pointer(&format!("/summary/work_continuity/{field}")).and_then(Value::as_u64), + Some(expected), + "unexpected Work Continuity summary field {field}", + ); +} + +fn assert_work_continuity_summary_f64(report: &Value, field: &str, expected: f64) { + assert_eq!( + report.pointer(&format!("/summary/work_continuity/{field}")).and_then(Value::as_f64), + Some(expected), + "unexpected Work Continuity summary field {field}", + ); +} diff --git a/packages/elf-storage/tests/graph_memory.rs b/packages/elf-storage/tests/graph_memory.rs index c9e9fe57..7bb52d15 100644 --- a/packages/elf-storage/tests/graph_memory.rs +++ b/packages/elf-storage/tests/graph_memory.rs @@ -2,426 +2,9 @@ //! Integration tests for graph and memory storage helpers. -use sqlx::PgConnection; -use time::{Duration, OffsetDateTime}; -use uuid::Uuid; - -use elf_config::Postgres; -use elf_storage::{ - db::Db, - graph, - models::{GraphFact, MemoryNote}, - queries, -}; -use elf_testkit::TestDatabase; - -#[tokio::test] -#[ignore = "Requires external Postgres. Set ELF_PG_DSN to run."] -async fn graph_entity_upsert_is_idempotent_by_normalized_canonical() { - let Some(base_dsn) = elf_testkit::env_dsn() else { - eprintln!( - "Skipping graph_entity_upsert_is_idempotent_by_normalized_canonical; set ELF_PG_DSN to run." 
- ); - - return; - }; - let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); - let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; - let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); - - db.ensure_schema(4_096).await.expect("Failed to ensure schema."); - - let mut tx = db.pool.begin().await.expect("Failed to open transaction."); - let tenant_id = "tenant-a"; - let project_id = "project-a"; - let entity_id = - graph::upsert_entity(&mut tx, tenant_id, project_id, " Alice Doe ", Some("person")) - .await - .expect("Failed to upsert canonical entity."); - let canonical_norm = graph::normalize_entity_name("Alice doe"); - - assert_eq!(canonical_norm, "alice doe"); - - let entity_again = - graph::upsert_entity(&mut tx, tenant_id, project_id, "Alice\tDoe", Some("person")) - .await - .expect("Failed to upsert canonical alias."); - - assert_eq!(entity_id, entity_again); - - tx.commit().await.expect("Failed to commit transaction."); - - assert!(test_db.cleanup().await.is_ok(), "Failed to cleanup test database."); -} - -#[tokio::test] -#[ignore = "Requires external Postgres. 
Set ELF_PG_DSN to run."] -async fn graph_fact_with_empty_evidence_is_rejected() { - let Some(base_dsn) = elf_testkit::env_dsn() else { - eprintln!("Skipping graph_fact_with_empty_evidence_is_rejected; set ELF_PG_DSN to run."); - - return; - }; - let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); - let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; - let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); - - db.ensure_schema(4_096).await.expect("Failed to ensure schema."); - - let mut tx = db.pool.begin().await.expect("Failed to open transaction."); - let subject = graph::upsert_entity(&mut tx, "tenant-a", "project-a", "Entity A", None) - .await - .expect("Failed to upsert subject."); - let predicate = - graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "related_to") - .await - .expect("Failed to resolve predicate."); - let err = graph::insert_fact_with_evidence( - &mut tx, - "tenant-a", - "project-a", - "agent-a", - "scope-a", - subject, - "related_to", - predicate.predicate_id, - None, - Some("value"), - OffsetDateTime::now_utc(), - None, - &[], - ) - .await - .expect_err("Expected empty evidence to be rejected."); - - assert!(matches!(err, elf_storage::Error::InvalidArgument(_))); - - tx.rollback().await.expect("Failed to rollback transaction."); - test_db.cleanup().await.expect("Failed to cleanup test database."); -} - -#[tokio::test] -#[ignore = "Requires external Postgres. Set ELF_PG_DSN to run."] -async fn graph_fact_duplicates_with_active_window_fail_unique_constraint() { - let Some(base_dsn) = elf_testkit::env_dsn() else { - eprintln!( - "Skipping graph_fact_duplicates_with_active_window_fail_unique_constraint; set ELF_PG_DSN to run." 
- ); - - return; - }; - let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); - let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; - let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); - - db.ensure_schema(4_096).await.expect("Failed to ensure schema."); - - let mut tx = db.pool.begin().await.expect("Failed to open transaction."); - let note_id = insert_memory_note(&mut tx, "tenant-a", "project-a").await; - let subject = graph::upsert_entity(&mut tx, "tenant-a", "project-a", "Entity Subject", None) - .await - .expect("Failed to upsert subject."); - let object = graph::upsert_entity(&mut tx, "tenant-a", "project-a", "Entity Object", None) - .await - .expect("Failed to upsert object."); - let predicate = - graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "related_to") - .await - .expect("Failed to resolve predicate."); - let now = OffsetDateTime::now_utc(); - - graph::insert_fact_with_evidence( - &mut tx, - "tenant-a", - "project-a", - "agent-a", - "scope-a", - subject, - "related_to", - predicate.predicate_id, - Some(object), - None, - now, - None, - &[note_id], - ) - .await - .expect("Failed to insert graph fact."); - - let err = graph::insert_fact_with_evidence( - &mut tx, - "tenant-a", - "project-a", - "agent-a", - "scope-a", - subject, - "related_to", - predicate.predicate_id, - Some(object), - None, - now, - None, - &[note_id], - ) - .await; - - assert!(err.is_err()); - - tx.rollback().await.expect("Failed to rollback transaction."); - test_db.cleanup().await.expect("Failed to cleanup test database."); -} - -#[tokio::test] -#[ignore = "Requires external Postgres. 
Set ELF_PG_DSN to run."] -async fn graph_fact_rejects_invalid_valid_window() { - let Some(base_dsn) = elf_testkit::env_dsn() else { - eprintln!("Skipping graph_fact_rejects_invalid_valid_window; set ELF_PG_DSN to run."); - - return; - }; - let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); - let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; - let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); - - db.ensure_schema(4_096).await.expect("Failed to ensure schema."); - - let mut tx = db.pool.begin().await.expect("Failed to open transaction."); - let note_id = insert_memory_note(&mut tx, "tenant-a", "project-a").await; - let subject = graph::upsert_entity(&mut tx, "tenant-a", "project-a", "Entity Subject", None) - .await - .expect("Failed to upsert subject."); - let predicate = - graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "expires") - .await - .expect("Failed to resolve predicate."); - let now = OffsetDateTime::now_utc(); - let err = graph::insert_fact_with_evidence( - &mut tx, - "tenant-a", - "project-a", - "agent-a", - "scope-a", - subject, - "expires", - predicate.predicate_id, - None, - Some("value"), - now, - Some(now), - &[note_id], - ) - .await; - - assert!(err.is_err()); - - tx.rollback().await.expect("Failed to rollback transaction."); - test_db.cleanup().await.expect("Failed to cleanup test database."); -} - -#[tokio::test] -#[ignore = "Requires external Postgres. Set ELF_PG_DSN to run."] -async fn graph_fetch_active_facts_returns_active_window_only() { - let Some(base_dsn) = elf_testkit::env_dsn() else { - eprintln!( - "Skipping graph_fetch_active_facts_returns_active_window_only; set ELF_PG_DSN to run." 
- ); - - return; - }; - let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); - let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; - let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); - - db.ensure_schema(4_096).await.expect("Failed to ensure schema."); - - let mut tx = db.pool.begin().await.expect("Failed to open transaction."); - let note_id = insert_memory_note(&mut tx, "tenant-a", "project-a").await; - let subject = graph::upsert_entity(&mut tx, "tenant-a", "project-a", "Entity Subject", None) - .await - .expect("Failed to upsert subject."); - let active_predicate = - graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "active_fact") - .await - .expect("Failed to resolve predicate."); - let expired_predicate = - graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "expired_fact") - .await - .expect("Failed to resolve predicate."); - let future_predicate = - graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "future_fact") - .await - .expect("Failed to resolve predicate."); - let now = OffsetDateTime::now_utc(); - let active = graph::insert_fact_with_evidence( - &mut tx, - "tenant-a", - "project-a", - "agent-a", - "scope-a", - subject, - "active_fact", - active_predicate.predicate_id, - None, - Some("alpha"), - now - Duration::hours(1), - None, - &[note_id], - ) - .await - .expect("Failed to insert active graph fact."); - - graph::insert_fact_with_evidence( - &mut tx, - "tenant-a", - "project-a", - "agent-a", - "scope-a", - subject, - "expired_fact", - expired_predicate.predicate_id, - None, - Some("beta"), - now - Duration::hours(2), - Some(now - Duration::minutes(1)), - &[note_id], - ) - .await - .expect("Failed to insert expired graph fact."); - graph::insert_fact_with_evidence( - &mut tx, - "tenant-a", - "project-a", - "agent-a", - "scope-a", - subject, - "future_fact", - future_predicate.predicate_id, - None, - 
Some("gamma"), - now + Duration::hours(1), - None, - &[note_id], - ) - .await - .expect("Failed to insert future graph fact."); - - let facts: Vec = graph::fetch_active_facts_for_subject( - &mut tx, - "tenant-a", - "project-a", - "scope-a", - subject, - now, - ) - .await - .expect("Failed to fetch active graph facts."); - - assert_eq!(facts.len(), 1); - assert_eq!(facts[0].fact_id, active); - assert_eq!(facts[0].predicate, "active_fact"); - - tx.rollback().await.expect("Failed to rollback transaction."); - test_db.cleanup().await.expect("Failed to cleanup test database."); -} - -#[tokio::test] -#[ignore = "Requires external Postgres. Set ELF_PG_DSN to run."] -async fn graph_predicate_guarded_update_conflicts_after_deprecate() { - let Some(base_dsn) = elf_testkit::env_dsn() else { - eprintln!( - "Skipping graph_predicate_guarded_update_conflicts_after_deprecate; set ELF_PG_DSN to run." - ); - - return; - }; - let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); - let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; - let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); - - db.ensure_schema(4_096).await.expect("Failed to ensure schema."); - - let mut tx = db.pool.begin().await.expect("Failed to open transaction."); - let predicate = - graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "mentors") - .await - .expect("Failed to resolve predicate."); - let updated_active = graph::update_predicate_guarded( - &mut tx, - predicate.predicate_id, - predicate.status.as_str(), - predicate.cardinality.as_str(), - Some("active"), - None, - ) - .await - .expect("Failed to activate predicate."); - let stale_expected_status = updated_active.status.clone(); - let stale_expected_cardinality = updated_active.cardinality.clone(); - let updated_deprecated = graph::update_predicate_guarded( - &mut tx, - predicate.predicate_id, - updated_active.status.as_str(), - 
updated_active.cardinality.as_str(), - Some("deprecated"), - None, - ) - .await - .expect("Failed to deprecate predicate."); - - assert_eq!(updated_deprecated.status, "deprecated"); - - let err = graph::update_predicate_guarded( - &mut tx, - predicate.predicate_id, - stale_expected_status.as_str(), - stale_expected_cardinality.as_str(), - None, - Some("single"), - ) - .await - .expect_err("Expected guarded update to conflict after deprecate."); - - assert!(matches!(err, elf_storage::Error::Conflict(_))); - - let predicate_now = graph::get_predicate_by_id(&mut tx, predicate.predicate_id) - .await - .expect("Failed to load predicate.") - .expect("Expected predicate row."); - - assert_eq!(predicate_now.status, "deprecated"); - - tx.rollback().await.expect("Failed to rollback transaction."); - test_db.cleanup().await.expect("Failed to cleanup test database."); -} - -async fn insert_memory_note( - executor: &mut PgConnection, - tenant_id: &str, - project_id: &str, -) -> Uuid { - let note_id = Uuid::new_v4(); - let note = MemoryNote { - note_id, - tenant_id: tenant_id.to_string(), - project_id: project_id.to_string(), - agent_id: "agent-a".to_string(), - scope: "scope-a".to_string(), - r#type: "fact".to_string(), - key: None, - text: "graph note evidence".to_string(), - importance: 1.0, - confidence: 1.0, - status: "active".to_string(), - created_at: OffsetDateTime::now_utc(), - updated_at: OffsetDateTime::now_utc(), - expires_at: None, - embedding_version: "test:vec:1".to_string(), - source_ref: serde_json::json!({}), - hit_count: 0, - last_hit_at: None, - }; - - queries::insert_note(executor, ¬e).await.expect("Failed to insert evidence note."); - - note_id +mod graph_memory { + mod entity; + mod facts; + mod helpers; + mod predicates; } diff --git a/packages/elf-storage/tests/graph_memory/entity.rs b/packages/elf-storage/tests/graph_memory/entity.rs new file mode 100644 index 00000000..f398d27f --- /dev/null +++ b/packages/elf-storage/tests/graph_memory/entity.rs @@ 
-0,0 +1,42 @@ +use elf_config::Postgres; +use elf_storage::{db::Db, graph}; +use elf_testkit::TestDatabase; + +#[tokio::test] +#[ignore = "Requires external Postgres. Set ELF_PG_DSN to run."] +async fn graph_entity_upsert_is_idempotent_by_normalized_canonical() { + let Some(base_dsn) = elf_testkit::env_dsn() else { + eprintln!( + "Skipping graph_entity_upsert_is_idempotent_by_normalized_canonical; set ELF_PG_DSN to run." + ); + + return; + }; + let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); + let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; + let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); + + db.ensure_schema(4_096).await.expect("Failed to ensure schema."); + + let mut tx = db.pool.begin().await.expect("Failed to open transaction."); + let tenant_id = "tenant-a"; + let project_id = "project-a"; + let entity_id = + graph::upsert_entity(&mut tx, tenant_id, project_id, " Alice Doe ", Some("person")) + .await + .expect("Failed to upsert canonical entity."); + let canonical_norm = graph::normalize_entity_name("Alice doe"); + + assert_eq!(canonical_norm, "alice doe"); + + let entity_again = + graph::upsert_entity(&mut tx, tenant_id, project_id, "Alice\tDoe", Some("person")) + .await + .expect("Failed to upsert canonical alias."); + + assert_eq!(entity_id, entity_again); + + tx.commit().await.expect("Failed to commit transaction."); + + assert!(test_db.cleanup().await.is_ok(), "Failed to cleanup test database."); +} diff --git a/packages/elf-storage/tests/graph_memory/facts.rs b/packages/elf-storage/tests/graph_memory/facts.rs new file mode 100644 index 00000000..09113244 --- /dev/null +++ b/packages/elf-storage/tests/graph_memory/facts.rs @@ -0,0 +1,276 @@ +use time::{Duration, OffsetDateTime}; + +use crate::graph_memory::helpers; +use elf_config::Postgres; +use elf_storage::{db::Db, graph, models::GraphFact}; +use elf_testkit::TestDatabase; + +#[tokio::test] +#[ignore = 
"Requires external Postgres. Set ELF_PG_DSN to run."] +async fn graph_fact_with_empty_evidence_is_rejected() { + let Some(base_dsn) = elf_testkit::env_dsn() else { + eprintln!("Skipping graph_fact_with_empty_evidence_is_rejected; set ELF_PG_DSN to run."); + + return; + }; + let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); + let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; + let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); + + db.ensure_schema(4_096).await.expect("Failed to ensure schema."); + + let mut tx = db.pool.begin().await.expect("Failed to open transaction."); + let subject = graph::upsert_entity(&mut tx, "tenant-a", "project-a", "Entity A", None) + .await + .expect("Failed to upsert subject."); + let predicate = + graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "related_to") + .await + .expect("Failed to resolve predicate."); + let err = graph::insert_fact_with_evidence( + &mut tx, + "tenant-a", + "project-a", + "agent-a", + "scope-a", + subject, + "related_to", + predicate.predicate_id, + None, + Some("value"), + OffsetDateTime::now_utc(), + None, + &[], + ) + .await + .expect_err("Expected empty evidence to be rejected."); + + assert!(matches!(err, elf_storage::Error::InvalidArgument(_))); + + tx.rollback().await.expect("Failed to rollback transaction."); + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres. Set ELF_PG_DSN to run."] +async fn graph_fact_duplicates_with_active_window_fail_unique_constraint() { + let Some(base_dsn) = elf_testkit::env_dsn() else { + eprintln!( + "Skipping graph_fact_duplicates_with_active_window_fail_unique_constraint; set ELF_PG_DSN to run." 
+ ); + + return; + }; + let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); + let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; + let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); + + db.ensure_schema(4_096).await.expect("Failed to ensure schema."); + + let mut tx = db.pool.begin().await.expect("Failed to open transaction."); + let note_id = helpers::insert_memory_note(&mut tx, "tenant-a", "project-a").await; + let subject = graph::upsert_entity(&mut tx, "tenant-a", "project-a", "Entity Subject", None) + .await + .expect("Failed to upsert subject."); + let object = graph::upsert_entity(&mut tx, "tenant-a", "project-a", "Entity Object", None) + .await + .expect("Failed to upsert object."); + let predicate = + graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "related_to") + .await + .expect("Failed to resolve predicate."); + let now = OffsetDateTime::now_utc(); + + graph::insert_fact_with_evidence( + &mut tx, + "tenant-a", + "project-a", + "agent-a", + "scope-a", + subject, + "related_to", + predicate.predicate_id, + Some(object), + None, + now, + None, + &[note_id], + ) + .await + .expect("Failed to insert graph fact."); + + let err = graph::insert_fact_with_evidence( + &mut tx, + "tenant-a", + "project-a", + "agent-a", + "scope-a", + subject, + "related_to", + predicate.predicate_id, + Some(object), + None, + now, + None, + &[note_id], + ) + .await; + + assert!(err.is_err()); + + tx.rollback().await.expect("Failed to rollback transaction."); + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres. 
Set ELF_PG_DSN to run."] +async fn graph_fact_rejects_invalid_valid_window() { + let Some(base_dsn) = elf_testkit::env_dsn() else { + eprintln!("Skipping graph_fact_rejects_invalid_valid_window; set ELF_PG_DSN to run."); + + return; + }; + let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); + let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; + let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); + + db.ensure_schema(4_096).await.expect("Failed to ensure schema."); + + let mut tx = db.pool.begin().await.expect("Failed to open transaction."); + let note_id = helpers::insert_memory_note(&mut tx, "tenant-a", "project-a").await; + let subject = graph::upsert_entity(&mut tx, "tenant-a", "project-a", "Entity Subject", None) + .await + .expect("Failed to upsert subject."); + let predicate = + graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "expires") + .await + .expect("Failed to resolve predicate."); + let now = OffsetDateTime::now_utc(); + let err = graph::insert_fact_with_evidence( + &mut tx, + "tenant-a", + "project-a", + "agent-a", + "scope-a", + subject, + "expires", + predicate.predicate_id, + None, + Some("value"), + now, + Some(now), + &[note_id], + ) + .await; + + assert!(err.is_err()); + + tx.rollback().await.expect("Failed to rollback transaction."); + test_db.cleanup().await.expect("Failed to cleanup test database."); +} + +#[tokio::test] +#[ignore = "Requires external Postgres. Set ELF_PG_DSN to run."] +async fn graph_fetch_active_facts_returns_active_window_only() { + let Some(base_dsn) = elf_testkit::env_dsn() else { + eprintln!( + "Skipping graph_fetch_active_facts_returns_active_window_only; set ELF_PG_DSN to run." 
+ ); + + return; + }; + let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); + let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; + let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); + + db.ensure_schema(4_096).await.expect("Failed to ensure schema."); + + let mut tx = db.pool.begin().await.expect("Failed to open transaction."); + let note_id = helpers::insert_memory_note(&mut tx, "tenant-a", "project-a").await; + let subject = graph::upsert_entity(&mut tx, "tenant-a", "project-a", "Entity Subject", None) + .await + .expect("Failed to upsert subject."); + let active_predicate = + graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "active_fact") + .await + .expect("Failed to resolve predicate."); + let expired_predicate = + graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "expired_fact") + .await + .expect("Failed to resolve predicate."); + let future_predicate = + graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "future_fact") + .await + .expect("Failed to resolve predicate."); + let now = OffsetDateTime::now_utc(); + let active = graph::insert_fact_with_evidence( + &mut tx, + "tenant-a", + "project-a", + "agent-a", + "scope-a", + subject, + "active_fact", + active_predicate.predicate_id, + None, + Some("alpha"), + now - Duration::hours(1), + None, + &[note_id], + ) + .await + .expect("Failed to insert active graph fact."); + + graph::insert_fact_with_evidence( + &mut tx, + "tenant-a", + "project-a", + "agent-a", + "scope-a", + subject, + "expired_fact", + expired_predicate.predicate_id, + None, + Some("beta"), + now - Duration::hours(2), + Some(now - Duration::minutes(1)), + &[note_id], + ) + .await + .expect("Failed to insert expired graph fact."); + graph::insert_fact_with_evidence( + &mut tx, + "tenant-a", + "project-a", + "agent-a", + "scope-a", + subject, + "future_fact", + future_predicate.predicate_id, + None, 
+ Some("gamma"), + now + Duration::hours(1), + None, + &[note_id], + ) + .await + .expect("Failed to insert future graph fact."); + + let facts: Vec = graph::fetch_active_facts_for_subject( + &mut tx, + "tenant-a", + "project-a", + "scope-a", + subject, + now, + ) + .await + .expect("Failed to fetch active graph facts."); + + assert_eq!(facts.len(), 1); + assert_eq!(facts[0].fact_id, active); + assert_eq!(facts[0].predicate, "active_fact"); + + tx.rollback().await.expect("Failed to rollback transaction."); + test_db.cleanup().await.expect("Failed to cleanup test database."); +} diff --git a/packages/elf-storage/tests/graph_memory/helpers.rs b/packages/elf-storage/tests/graph_memory/helpers.rs new file mode 100644 index 00000000..5d15c19b --- /dev/null +++ b/packages/elf-storage/tests/graph_memory/helpers.rs @@ -0,0 +1,37 @@ +use sqlx::PgConnection; +use time::OffsetDateTime; +use uuid::Uuid; + +use elf_storage::{models::MemoryNote, queries}; + +pub(in crate::graph_memory) async fn insert_memory_note( + executor: &mut PgConnection, + tenant_id: &str, + project_id: &str, +) -> Uuid { + let note_id = Uuid::new_v4(); + let note = MemoryNote { + note_id, + tenant_id: tenant_id.to_string(), + project_id: project_id.to_string(), + agent_id: "agent-a".to_string(), + scope: "scope-a".to_string(), + r#type: "fact".to_string(), + key: None, + text: "graph note evidence".to_string(), + importance: 1.0, + confidence: 1.0, + status: "active".to_string(), + created_at: OffsetDateTime::now_utc(), + updated_at: OffsetDateTime::now_utc(), + expires_at: None, + embedding_version: "test:vec:1".to_string(), + source_ref: serde_json::json!({}), + hit_count: 0, + last_hit_at: None, + }; + + queries::insert_note(executor, ¬e).await.expect("Failed to insert evidence note."); + + note_id +} diff --git a/packages/elf-storage/tests/graph_memory/predicates.rs b/packages/elf-storage/tests/graph_memory/predicates.rs new file mode 100644 index 00000000..ed2382f7 --- /dev/null +++ 
b/packages/elf-storage/tests/graph_memory/predicates.rs @@ -0,0 +1,73 @@ +use elf_config::Postgres; +use elf_storage::{db::Db, graph}; +use elf_testkit::TestDatabase; + +#[tokio::test] +#[ignore = "Requires external Postgres. Set ELF_PG_DSN to run."] +async fn graph_predicate_guarded_update_conflicts_after_deprecate() { + let Some(base_dsn) = elf_testkit::env_dsn() else { + eprintln!( + "Skipping graph_predicate_guarded_update_conflicts_after_deprecate; set ELF_PG_DSN to run." + ); + + return; + }; + let test_db = TestDatabase::new(&base_dsn).await.expect("Failed to create test database."); + let cfg = Postgres { dsn: test_db.dsn().to_string(), pool_max_conns: 1 }; + let db = Db::connect(&cfg).await.expect("Failed to connect to Postgres."); + + db.ensure_schema(4_096).await.expect("Failed to ensure schema."); + + let mut tx = db.pool.begin().await.expect("Failed to open transaction."); + let predicate = + graph::resolve_or_register_predicate(&mut tx, "tenant-a", "project-a", "mentors") + .await + .expect("Failed to resolve predicate."); + let updated_active = graph::update_predicate_guarded( + &mut tx, + predicate.predicate_id, + predicate.status.as_str(), + predicate.cardinality.as_str(), + Some("active"), + None, + ) + .await + .expect("Failed to activate predicate."); + let stale_expected_status = updated_active.status.clone(); + let stale_expected_cardinality = updated_active.cardinality.clone(); + let updated_deprecated = graph::update_predicate_guarded( + &mut tx, + predicate.predicate_id, + updated_active.status.as_str(), + updated_active.cardinality.as_str(), + Some("deprecated"), + None, + ) + .await + .expect("Failed to deprecate predicate."); + + assert_eq!(updated_deprecated.status, "deprecated"); + + let err = graph::update_predicate_guarded( + &mut tx, + predicate.predicate_id, + stale_expected_status.as_str(), + stale_expected_cardinality.as_str(), + None, + Some("single"), + ) + .await + .expect_err("Expected guarded update to conflict after 
deprecate."); + + assert!(matches!(err, elf_storage::Error::Conflict(_))); + + let predicate_now = graph::get_predicate_by_id(&mut tx, predicate.predicate_id) + .await + .expect("Failed to load predicate.") + .expect("Expected predicate row."); + + assert_eq!(predicate_now.status, "deprecated"); + + tx.rollback().await.expect("Failed to rollback transaction."); + test_db.cleanup().await.expect("Failed to cleanup test database."); +}