diff --git a/CHANGELOG.md b/CHANGELOG.md index db4122fc..56124a7e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,10 @@ metadata digest mismatches, committed observation mismatches, and retained material unavailability while preserving the coarse replay posture for existing callers. +- `echo-dind-tests` now includes a process-kill WAL crashpoint witness that + kills child processes after committed WAL material and before transaction + commit, proving recovery preserves committed history and excludes uncommitted + tails. - `warp-core` can now materialize WAL projection records into deterministic WARP graph facts with root, writer epoch, segment, commit-anchor, and recovery certificate nodes plus typed graph edges suitable for WSC serialization. The @@ -100,6 +104,9 @@ - `cargo xtask test-slice durability-release` now includes the exact `materialization_outbox_recovery_returns_typed_posture` witness, locking typed materialization outbox recovery posture into the release gate. +- `cargo xtask test-slice durability-release` now includes the exact + `wal_process_crashpoints` witness, promoting the process-kill WAL crashpoint + runner from future descriptor to release-gate evidence. - `warp-core` trusted runtime hosts now configure runtime WAL through `TrustedRuntimeWalConfig`, including in-memory and filesystem-backed adapters. `TrustedRuntimeWalStoreKind` exposes the configured adapter kind as diff --git a/crates/echo-dind-tests/src/lib.rs b/crates/echo-dind-tests/src/lib.rs index 6f228a85..a2c1427c 100644 --- a/crates/echo-dind-tests/src/lib.rs +++ b/crates/echo-dind-tests/src/lib.rs @@ -141,3 +141,235 @@ impl EchoKernel { .canonical_state_hash() } } + +#[cfg(test)] +mod tests { + use std::{ + env, fs, + path::{Path, PathBuf}, + process::{self, Child, Command}, + thread, + time::Duration, + }; + + use warp_core::{ + causal_wal::{ + build_submission_acceptance_transaction, recover_filesystem_store, + recover_submission_index, AffectedFrontier, AffectedFrontierKind, FilesystemWalStore, + Lsn, PayloadCodecId, PayloadSchemaId, RecoveredSubmissionPosture, RecoveryAccessMode, + RecoveryTailPosture, SubmissionAcceptanceRecord, WalAppendAuthority, + WalCommittedTransaction, WalDurabilityMode, WalSegmentId, WalStorePort, + WalTransactionBuilder, WalTransactionId, WalTransactionKind, WriterEpochId, + WriterEpochRequest, + }, + Hash, + }; + + const CHILD_MODE_ENV: &str = "ECHO_DIND_WAL_CRASHPOINT_CHILD"; + const WAL_ROOT_ENV: &str = "ECHO_DIND_WAL_CRASHPOINT_ROOT"; + const READY_MARKER_ENV: &str = "ECHO_DIND_WAL_CRASHPOINT_READY"; + const AFTER_COMMIT_MODE: &str = "after_wal_commit"; + const BEFORE_COMMIT_MODE: &str = "before_wal_commit"; + + #[test] + fn wal_process_crashpoints() { + if let Ok(mode) = env::var(CHILD_MODE_ENV) { + run_wal_crashpoint_child(&mode); + } + + let root = crashpoint_root(); + let _ = fs::remove_dir_all(&root); + fs::create_dir_all(&root).expect("create crashpoint root"); + + let after_root = root.join("after-wal-commit"); + let after_acceptance = acceptance("after-wal-commit"); + run_and_kill_child(AFTER_COMMIT_MODE, &after_root); + let after_report = recover_filesystem_store(&after_root, RecoveryAccessMode::ReadOnly) + .expect("recover after-commit WAL root"); + let after_index = + recover_submission_index(&after_report).expect("recover after-commit index"); + let after_entry = after_index + .get(&after_acceptance.submission_id) + .expect("after-commit submission recovered"); + assert_eq!(after_report.tail_posture, RecoveryTailPosture::Clean); + assert_eq!(after_entry.acceptance, after_acceptance); + assert_eq!( + after_entry.posture, + RecoveredSubmissionPosture::AcceptedPending + ); + + let before_root = root.join("before-wal-commit"); + let before_acceptance = acceptance("before-wal-commit"); + run_and_kill_child(BEFORE_COMMIT_MODE, &before_root); + let before_report = recover_filesystem_store(&before_root, RecoveryAccessMode::ReadOnly) + .expect("recover before-commit WAL root"); + let before_index = + recover_submission_index(&before_report).expect("recover before-commit index"); + assert_eq!( + before_report.tail_posture, + RecoveryTailPosture::WouldTruncateAll + ); + assert!(before_index.get(&before_acceptance.submission_id).is_none()); + assert!(before_index.is_empty()); + + fs::remove_dir_all(&root).expect("remove crashpoint root"); + } + + fn run_wal_crashpoint_child(mode: &str) -> ! { + let root = PathBuf::from(env::var_os(WAL_ROOT_ENV).expect("WAL root env")); + let marker = PathBuf::from(env::var_os(READY_MARKER_ENV).expect("ready marker env")); + fs::create_dir_all(&root).expect("create child WAL root"); + let mut store = FilesystemWalStore::open(&root, WalSegmentId::from_raw(1)) + .expect("open child WAL store"); + store + .acquire_writer_epoch(writer_epoch_request()) + .expect("acquire writer epoch"); + match mode { + AFTER_COMMIT_MODE => { + store + .append_transaction(submission_transaction( + "after-wal-commit", + Lsn::from_raw(0), + )) + .expect("append committed child transaction"); + } + BEFORE_COMMIT_MODE => { + let transaction = submission_transaction("before-wal-commit", Lsn::from_raw(0)); + store + .append_uncommitted_frame(epoch_id(), transaction.frames[0].clone()) + .expect("append uncommitted child frame"); + } + other => panic!("unknown child crashpoint mode: {other}"), + } + fs::write(marker, b"ready").expect("write ready marker"); + loop { + thread::sleep(Duration::from_secs(60)); + } + } + + fn run_and_kill_child(mode: &str, wal_root: &Path) { + fs::create_dir_all(wal_root).expect("create WAL root"); + let marker = wal_root.join("ready"); + let mut child = Command::new(env::current_exe().expect("current test binary")) + .arg("tests::wal_process_crashpoints") + .arg("--exact") + .arg("--nocapture") + .env(CHILD_MODE_ENV, mode) + .env(WAL_ROOT_ENV, wal_root) + .env(READY_MARKER_ENV, &marker) + .spawn() + .expect("spawn WAL crashpoint child"); + wait_for_ready_marker(&mut child, &marker); + child.kill().expect("kill WAL crashpoint child"); + let status = child.wait().expect("wait for killed child"); + assert!(!status.success(), "child should have been killed"); + } + + fn wait_for_ready_marker(child: &mut Child, marker: &Path) { + for _ in 0..200 { + if marker.exists() { + return; + } + if let Some(status) = child.try_wait().expect("poll child") { + panic!("child exited before ready marker: {status}"); + } + thread::sleep(Duration::from_millis(50)); + } + let _ = child.kill(); + panic!("timed out waiting for ready marker at {}", marker.display()); + } + + fn crashpoint_root() -> PathBuf { + env::current_dir() + .expect("current dir") + .join("target") + .join("echo-dind-wal-crashpoints") + .join(process::id().to_string()) + } + + fn submission_transaction(label: &str, first_lsn: Lsn) -> WalCommittedTransaction { + build_submission_acceptance_transaction( + builder( + transaction_id(label), + first_lsn, + WalAppendAuthority::SubmissionIntake, + WalTransactionKind::SubmissionIntake, + ), + acceptance(label), + vec![frontier(label)], + ) + .expect("build submission transaction") + } + + fn acceptance(label: &str) -> SubmissionAcceptanceRecord { + SubmissionAcceptanceRecord { + submission_id: digest(&format!("submission:{label}")), + canonical_envelope_digest: digest(&format!("envelope:{label}")), + idempotency_key_digest: Some(digest(&format!("idempotency:{label}"))), + acceptance_evidence_digest: digest(&format!("accepted:{label}")), + } + } + + fn builder( + transaction_id: WalTransactionId, + first_lsn: Lsn, + authority: WalAppendAuthority, + transaction_kind: WalTransactionKind, + ) -> WalTransactionBuilder { + WalTransactionBuilder::new( + epoch_id(), + WalSegmentId::from_raw(1), + transaction_id, + transaction_kind, + authority, + first_lsn, + digest("genesis-frame"), + digest("genesis-commit"), + WalDurabilityMode::StrictFilesystem, + PayloadCodecId::from_hash(digest("codec:echo-dind-wal-crashpoint")), + PayloadSchemaId::from_hash(digest("schema:echo-dind-wal-crashpoint")), + 1, + 1, + digest("domain:echo-dind-wal-crashpoint"), + ) + } + + fn writer_epoch_request() -> WriterEpochRequest { + WriterEpochRequest { + epoch_id: epoch_id(), + storage_fencing_token: digest("fence:echo-dind-wal-crashpoint"), + process_identity: digest("process:echo-dind-wal-crashpoint"), + host_identity: digest("host:echo-dind-wal-crashpoint"), + started_at_lsn: Lsn::from_raw(0), + previous_epoch_id: None, + previous_epoch_final_commit_digest: None, + lease_or_lock_evidence: digest("lease:echo-dind-wal-crashpoint"), + } + } + + fn frontier(label: &str) -> AffectedFrontier { + AffectedFrontier { + kind: AffectedFrontierKind::SubmissionQueue, + before_digest: digest(&format!("{label}:submission:before")), + after_digest: digest(&format!("{label}:submission:after")), + } + } + + fn transaction_id(label: &str) -> WalTransactionId { + WalTransactionId::from_hash(digest(&format!("tx:{label}"))) + } + + fn epoch_id() -> WriterEpochId { + WriterEpochId::from_hash(digest("epoch:echo-dind-wal-crashpoint")) + } + + fn digest(label: &str) -> Hash { + let mut out = [0_u8; 32]; + for (index, byte) in label.as_bytes().iter().enumerate() { + out[index % 32] = out[index % 32] + .wrapping_add(*byte) + .wrapping_add(index as u8); + } + out + } +} diff --git a/crates/warp-core/src/causal_wal.rs b/crates/warp-core/src/causal_wal.rs index 6bb08d21..bb713a67 100644 --- a/crates/warp-core/src/causal_wal.rs +++ b/crates/warp-core/src/causal_wal.rs @@ -1305,6 +1305,8 @@ pub struct WalManifest { pub enum WalCrashpointExecution { /// Crashpoint is simulated in-process by Rust fixtures. SimulatedInProcess, + /// Crashpoint is exercised by killing a child process. + ProcessKill, /// Crashpoint is reserved for a future process-kill runner. ProcessKillFuture, } @@ -1386,7 +1388,7 @@ const WAL_CRASHPOINT_MANIFEST: &[WalCrashpointDescriptor] = &[ WalCrashpointDescriptor { name: "process.kill.after_wal_commit", boundary: WalCrashpointBoundary::Process, - execution: WalCrashpointExecution::ProcessKillFuture, + execution: WalCrashpointExecution::ProcessKill, }, ]; diff --git a/crates/warp-core/tests/causal_wal_hardening_tests.rs b/crates/warp-core/tests/causal_wal_hardening_tests.rs index 6c281ed2..531d0d6a 100644 --- a/crates/warp-core/tests/causal_wal_hardening_tests.rs +++ b/crates/warp-core/tests/causal_wal_hardening_tests.rs @@ -1868,7 +1868,7 @@ fn crashpoint_manifest_lists_checkpoint_boundaries() { } #[test] -fn crashpoint_manifest_marks_process_kill_as_future_until_runner_exists() { +fn crashpoint_manifest_marks_process_kill_as_available_runner() { let process_entries = wal_crashpoint_manifest() .iter() .filter(|entry| entry.boundary == WalCrashpointBoundary::Process) @@ -1877,7 +1877,7 @@ fn crashpoint_manifest_marks_process_kill_as_future_until_runner_exists() { assert!(!process_entries.is_empty()); assert!(process_entries .iter() - .all(|entry| entry.execution == WalCrashpointExecution::ProcessKillFuture)); + .all(|entry| entry.execution == WalCrashpointExecution::ProcessKill)); } #[test] diff --git a/docs/design/causal-wal-hardening-matrix.md b/docs/design/causal-wal-hardening-matrix.md index e91d0f49..780a1875 100644 --- a/docs/design/causal-wal-hardening-matrix.md +++ b/docs/design/causal-wal-hardening-matrix.md @@ -429,23 +429,23 @@ Test plan: User story: -As Echo, I need a future CLI/BATS crash runner contract that mirrors the Rust -fixture semantics before it shells out to real processes. +As Echo, I need a process-kill crash runner contract that mirrors the Rust +fixture semantics while exercising real parent/child process boundaries. Acceptance criteria: - Rust crash fixtures define canonical crashpoint names. - A test-visible crashpoint manifest lists supported boundaries. -- The manifest distinguishes simulated in-process cuts from future process-kill - cuts. -- No CLI runner claims more than the Rust fixture proves. +- The manifest distinguishes simulated in-process cuts from process-kill cuts. +- No process runner claims more than the Rust fixture proves. Test plan: - `crashpoint_manifest_lists_submission_boundaries` - `crashpoint_manifest_lists_tick_boundaries` - `crashpoint_manifest_lists_checkpoint_boundaries` -- `crashpoint_manifest_marks_process_kill_as_future_until_runner_exists` +- `crashpoint_manifest_marks_process_kill_as_available_runner` +- `wal_process_crashpoints` ## Slice 62: Filesystem Strict Sync Evidence diff --git a/docs/topics/WAL.md b/docs/topics/WAL.md index 05a0b25d..5d1aa699 100644 --- a/docs/topics/WAL.md +++ b/docs/topics/WAL.md @@ -133,6 +133,11 @@ artifact or metadata mismatches, committed observation mismatches, and retained material unavailability so restart logic can retry, repair, or obstruct without blindly replaying effects. +The process-kill crashpoint runner exercises the filesystem WAL across real +parent/child process boundaries. A killed child that already committed WAL +material recovers as committed history; a killed child with only uncommitted +frames recovers as tail posture and does not enter accepted or decided history. + ## Evidence The runtime ACK and recovery witnesses live in diff --git a/docs/workflows.md b/docs/workflows.md index 09730be8..a6135d58 100644 --- a/docs/workflows.md +++ b/docs/workflows.md @@ -164,7 +164,7 @@ The repo also exposes maintenance commands via `cargo xtask …`: - `cargo xtask test-slice contract-path-release` runs the v0.1 local contract-host release witness: installed contract pipeline replay, reference trusted host loop, and the serious external consumer fixture. - `cargo xtask test-slice runtime-wal-ack` runs the fast runtime WAL-backed ACK witness: app-facing acceptance rollback, scheduler tick receipt invariant checks, scheduler tick commit-before-publish, recovered indexes, CLI submission posture JSON, stale-claim guard, and generated man-page check. - `cargo xtask test-slice durable-runtime-wal` runs the release-grade filesystem runtime WAL durability witness: filesystem ACK recovery, filesystem failure atomicity, CLI submission posture JSON, stale-claim guard, and generated man-page check. -- `cargo xtask test-slice durability-release` runs the joined WAL/WSC release witness: filesystem runtime WAL durability, WSC retained evidence recovery, app-safe missing-retention posture, recovery plan bootstrap posture, committed-only durability index rebuilds, typed materialization outbox recovery, WSC topology recovery, topology WAL recovery, typed missing-material obstruction, stale-claim guards, doctrine checks, and generated man-page freshness. This is a release-gate slice, not the fastest local edit loop. +- `cargo xtask test-slice durability-release` runs the joined WAL/WSC release witness: filesystem runtime WAL durability, WSC retained evidence recovery, app-safe missing-retention posture, recovery plan bootstrap posture, committed-only durability index rebuilds, typed materialization outbox recovery, process-kill WAL crashpoints, WSC topology recovery, topology WAL recovery, typed missing-material obstruction, stale-claim guards, doctrine checks, and generated man-page freshness. This is a release-gate slice, not the fastest local edit loop. - `cargo xtask pr-preflight` runs the default changed-scope pre-PR gate against `origin/main`. - `cargo xtask pr-preflight --full` runs the broader explicit full pre-PR gate. - `cargo xtask dind` runs the DIND (Deterministic Ironclad Nightmare Drills) harness locally. diff --git a/xtask/src/main.rs b/xtask/src/main.rs index 1af63390..854467f0 100644 --- a/xtask/src/main.rs +++ b/xtask/src/main.rs @@ -825,6 +825,7 @@ fn build_test_slice_commands(slice: TestSlice) -> Vec { "causal_wal_tests", "materialization_outbox_recovery_returns_typed_posture", ]), + cargo_command(["test", "-p", "echo-dind-tests", "wal_process_crashpoints"]), cargo_command([ "test", "-p", @@ -6743,7 +6744,7 @@ mod tests { #[test] fn test_slice_durability_release_stays_explicit() { let commands = build_test_slice_commands(TestSlice::DurabilityRelease); - assert_eq!(commands.len(), 16); + assert_eq!(commands.len(), 17); let expected = [ ( @@ -6882,6 +6883,10 @@ mod tests { "materialization_outbox_recovery_returns_typed_posture", ], ), + ( + "cargo", + vec!["test", "-p", "echo-dind-tests", "wal_process_crashpoints"], + ), ( "cargo", vec![