diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4dd35485..7d8ceb5c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,9 +2,9 @@ name: AIngle CI on: push: - branches: [ main, develop ] + branches: [ main, dev ] pull_request: - branches: [ main, develop ] + branches: [ main, dev ] env: CARGO_TERM_COLOR: always diff --git a/.gitignore b/.gitignore index bb7ee1d0..3d9e8360 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,9 @@ Thumbs.db CLAUDE.md .claude_settings .claudeignore +.mcp.json +docs/superpowers/ +.superpowers/ # GitHub Copilot .copilot/ @@ -113,6 +116,8 @@ llm-instructions.md *.profraw *.profdata aingle_iot.db +data/ +*.sled # Logs *.log @@ -127,4 +132,9 @@ aingle_iot.db .env.local .secrets contexto/ -contexto/* \ No newline at end of file +contexto/* + +# Local neural-embedder test models (never commit — ~470MB fp32 e5-small, fetched on demand) +crates/ineru/test-models/ +# Local ONNX Runtime dylib for running the gated neural tests (set ORT_DYLIB_PATH to it) +.tmp-ort/ \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index d422051e..b2743605 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -70,7 +70,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", + "getrandom 0.3.4", "once_cell", + "serde", "version_check", "zerocopy", ] @@ -97,7 +99,7 @@ dependencies = [ [[package]] name = "aingle_ai" -version = "0.6.3" +version = "0.7.0" dependencies = [ "blake2", "candle-core 0.9.2", @@ -119,7 +121,7 @@ dependencies = [ [[package]] name = "aingle_contracts" -version = "0.6.3" +version = "0.7.0" dependencies = [ "blake3", "dashmap 6.1.0", @@ -138,9 +140,10 @@ dependencies = [ [[package]] name = "aingle_cortex" -version = "0.6.3" +version = "0.7.0" dependencies = [ "aingle_graph", + "aingle_ingest", "aingle_logic", "aingle_raft", "aingle_wal", @@ -149,6 +152,7 @@ dependencies = [ "async-graphql", "async-graphql-axum", "axum", + "base64 0.22.1", "blake3", "chrono", "dashmap 6.1.0", @@ -157,6 +161,7 @@ dependencies = [ "futures", "hex", "if-addrs 0.13.4", + "ignore", "ineru", "jsonwebtoken", "log", @@ -169,6 +174,7 @@ dependencies = [ "regex", "reqwest", "rmcp", + "rsa", "rustls", "rustls-pemfile", "schemars", @@ -193,7 +199,7 @@ dependencies = [ [[package]] name = "aingle_graph" -version = "0.6.3" +version = "0.7.0" dependencies = [ "bincode", "blake3", @@ -215,9 +221,21 @@ dependencies = [ "uuid", ] +[[package]] +name = "aingle_ingest" +version = "0.7.0" +dependencies = [ + "aingle_graph", + "blake3", + "once_cell", + "regex", + "serde", + "serde_json", +] + [[package]] name = "aingle_logic" -version = "0.6.3" +version = "0.7.0" dependencies = [ "aingle_graph", "chrono", @@ -233,7 +251,7 @@ dependencies = [ [[package]] name = "aingle_minimal" -version = "0.6.3" +version = "0.7.0" dependencies = [ "async-io", "async-tungstenite", @@ -275,7 +293,7 @@ dependencies = [ [[package]] name = "aingle_raft" -version = "0.6.3" +version = "0.7.0" dependencies = [ "aingle_graph", "aingle_wal", @@ -296,7 +314,7 @@ dependencies = [ [[package]] name = "aingle_viz" -version = "0.6.3" +version = "0.7.0" dependencies = [ "aingle_graph", "aingle_minimal", @@ -318,7 +336,7 @@ dependencies = [ [[package]] name = "aingle_wal" -version = "0.6.3" +version = "0.7.0" dependencies = [ "bincode", "blake3", @@ -330,7 +348,7 @@ dependencies = [ [[package]] name = "aingle_zk" -version = "0.6.3" +version = "0.7.0" dependencies = [ "blake3", "bulletproofs", @@ -670,7 +688,7 @@ dependencies = [ "async-graphql-value", "async-trait", "asynk-strim", - "base64", + "base64 0.22.1", "blocking", "bytes", "chrono", @@ -914,7 +932,7 @@ checksum = "8b52af3cb4058c895d37317bb27508dccc8e5f2d39454016b297bf4a400597b8" dependencies = [ "axum-core", "axum-macros", - "base64", + "base64 0.22.1", "bytes", "form_urlencoded", "futures-util", @@ -994,6 +1012,12 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf" +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.22.1" @@ -1497,6 +1521,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" +[[package]] +name = "castaway" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dec551ab6e7578819132c713a93c022a05d60159dc86e7a7050223577484c55a" +dependencies = [ + "rustversion", +] + [[package]] name = "cbc" version = "0.1.2" @@ -1658,7 +1691,7 @@ checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" dependencies = [ "glob", "libc", - "libloading", + "libloading 0.8.9", ] [[package]] @@ -1761,6 +1794,21 @@ dependencies = [ "memchr", ] +[[package]] +name = "compact_str" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dfdd1c2274d9aa354115b09dc9a901d6c5576818cdf70d14cae2bdb47df00ab" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "serde", + "static_assertions", +] + [[package]] name = "compression-codecs" version = "0.4.37" @@ -2325,6 +2373,15 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "dary_heap" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b1e3a325bc115f096c8b77bbf027a7c2592230e70be2d985be950d3d5e60ebe" +dependencies = [ + "serde", +] + [[package]] name = "dashmap" version = "5.5.3" @@ -2457,6 +2514,37 @@ dependencies = [ "powerfmt", ] +[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling 0.20.11", + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn 2.0.117", +] + [[package]] name = "derive_more" version = "2.1.1" @@ -2908,6 +2996,12 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "esaxx-rs" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" + [[package]] name = "esp-idf-hal" version = "0.44.1" @@ -3041,6 +3135,21 @@ dependencies = [ "siphasher", ] +[[package]] +name = "fastembed" +version = "5.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "545e4fb17fc48768ff36c2a3854aa5b0b809d0ed595ab5530fa8ac94f31bd0ea" +dependencies = [ + "anyhow", + "ndarray", + "ort", + "safetensors 0.8.0", + "serde", + "serde_json", + "tokenizers", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -3946,7 +4055,7 @@ version = "0.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" dependencies = [ - "base64", + "base64 0.22.1", "bytes", "futures-channel", "futures-util", @@ -4153,12 +4262,13 @@ dependencies = [ [[package]] name = "ineru" -version = "0.6.3" +version = "0.7.0" dependencies = [ "bincode", "blake3", "chrono", "criterion", + "fastembed", "log", "rusqlite", "serde", @@ -4393,7 +4503,7 @@ version = "10.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0529410abe238729a60b108898784df8984c87f6054c9c4fcacc47e4803c1ce1" dependencies = [ - "base64", + "base64 0.22.1", "ed25519-dalek", "getrandom 0.2.17", "hmac", @@ -4412,7 +4522,7 @@ dependencies = [ [[package]] name = "kaneru" -version = "0.6.3" +version = "0.7.0" dependencies = [ "chrono", "criterion", @@ -4530,6 +4640,16 @@ dependencies = [ "windows-link 0.2.1", ] +[[package]] +name = "libloading" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60" +dependencies = [ + "cfg-if", + "windows-link 0.2.1", +] + [[package]] name = "libm" version = "0.2.16" @@ -4675,6 +4795,22 @@ dependencies = [ "zerocopy-derive", ] +[[package]] +name = "macro_rules_attribute" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65049d7923698040cd0b1ddcced9b0eb14dd22c5f86ae59c3740eab64a676520" +dependencies = [ + "macro_rules_attribute-proc_macro", + "paste", +] + +[[package]] +name = "macro_rules_attribute-proc_macro" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30" + [[package]] name = "maplit" version = "1.0.2" @@ -4830,6 +4966,28 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "monostate" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3341a273f6c9d5bef1908f17b7267bbab0e95c9bf69a0d4dcf8e9e1b2c76ef67" +dependencies = [ + "monostate-impl", + "serde", + "serde_core", +] + +[[package]] +name = "monostate-impl" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4db6d5580af57bf992f59068d4ea26fd518574ff48d7639b255a36f9de6e7e9" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "more-asserts" version = "0.3.1" @@ -5172,6 +5330,28 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" +[[package]] +name = "onig" +version = "6.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc3cbf698f9438986c11a880c90a6d04b9de27575afd28bbf45b154b6c709e2" +dependencies = [ + "bitflags 2.11.0", + "libc", + "once_cell", + "onig_sys", +] + +[[package]] +name = "onig_sys" +version = "69.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e68317604e77e53b85896388e1a803c1d21b74c899ec9e5e1112db90735edd7" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "oorandom" version = "11.1.5" @@ -5295,6 +5475,25 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" +[[package]] +name = "ort" +version = "2.0.0-rc.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7de3af33d24a745ffb8fab904b13478438d1cd52868e6f17735ef6e1f8bf133" +dependencies = [ + "libloading 0.9.0", + "ndarray", + "ort-sys", + "smallvec", + "tracing", +] + +[[package]] +name = "ort-sys" +version = "2.0.0-rc.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7b497d21a8b6fbb4b5a544f8fadb77e801a09ae0add9e411d31c6f89e3c1e90" + [[package]] name = "oxilangtag" version = "0.1.5" @@ -5462,7 +5661,7 @@ version = "3.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d30c53c26bc5b31a98cd02d20f25a7c8567146caf63ed593a9d87b2775291be" dependencies = [ - "base64", + "base64 0.22.1", "serde_core", ] @@ -6014,6 +6213,17 @@ dependencies = [ "rayon-core", ] +[[package]] +name = "rayon-cond" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2964d0cf57a3e7a06e8183d14a8b527195c706b7983549cd5462d5aa3747438f" +dependencies = [ + "either", + "itertools 0.14.0", + "rayon", +] + [[package]] name = "rayon-core" version = "1.13.0" @@ -6195,7 +6405,7 @@ version = "0.12.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" dependencies = [ - "base64", + "base64 0.22.1", "bytes", "encoding_rs", "futures-core", @@ -6340,7 +6550,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0810a9f717d9828f475fe1f629f4c305c8464b7f496c3a854b58d29e65f4058e" dependencies = [ "async-trait", - "base64", + "base64 0.22.1", "bytes", "chrono", "futures", @@ -6650,6 +6860,19 @@ dependencies = [ "serde_json", ] +[[package]] +name = "safetensors" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79b079b829cb27a1c3c374341345ed2e8b2c0c839034522cee576c140bd7f846" +dependencies = [ + "hashbrown 0.16.1", + "libc", + "serde", + "serde_json", + "tempfile", +] + [[package]] name = "same-file" version = "1.0.6" @@ -7152,6 +7375,18 @@ dependencies = [ "der", ] +[[package]] +name = "spm_precompiled" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" +dependencies = [ + "base64 0.13.1", + "nom", + "serde", + "unicode-segmentation", +] + [[package]] name = "sse-stream" version = "0.2.3" @@ -7260,7 +7495,7 @@ version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2e0fd33c04d4617df42c9c84c698511c59f59869629fb7a193067eec41bce347" dependencies = [ - "base64", + "base64 0.22.1", "crc", "lazy_static", "md-5", @@ -7548,6 +7783,39 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tokenizers" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b238e22d44a15349529690fb07bd645cf58149a1b1e44d6cb5bd1641ff1a6223" +dependencies = [ + "ahash 0.8.12", + "aho-corasick", + "compact_str", + "dary_heap", + "derive_builder", + "esaxx-rs", + "getrandom 0.3.4", + "itertools 0.14.0", + "log", + "macro_rules_attribute", + "monostate", + "onig", + "paste", + "rand 0.9.2", + "rayon", + "rayon-cond", + "regex", + "regex-syntax", + "serde", + "serde_json", + "spm_precompiled", + "thiserror 2.0.18", + "unicode-normalization-alignments", + "unicode-segmentation", + "unicode_categories", +] + [[package]] name = "tokio" version = "1.50.0" @@ -7872,7 +8140,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6a8b8ac3543b2a8eb0b28c7ac3d5f2db6221e057f3b3ae47cf7637b1333a5c3" dependencies = [ "async-trait", - "base64", + "base64 0.22.1", "futures", "log", "md-5", @@ -7931,6 +8199,15 @@ version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" +[[package]] +name = "unicode-normalization-alignments" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de" +dependencies = [ + "smallvec", +] + [[package]] name = "unicode-segmentation" version = "1.12.0" @@ -7943,6 +8220,12 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + [[package]] name = "universal-hash" version = "0.5.1" diff --git a/Cargo.toml b/Cargo.toml index 672ec8e7..3c691e85 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,7 @@ members = [ "crates/aingle_graph", # Native Semantic GraphDB "crates/aingle_zk", # Zero-Knowledge Proofs (Privacy) "crates/ineru", # Ineru Memory System + "crates/aingle_ingest", # Structural ingestion (markdown/code → triples + chunks) "crates/aingle_ai", # AI Integration Layer "crates/aingle_logic", # Proof-of-Logic Validation Engine "crates/kaneru", # Kaneru — Unified Multi-Agent Execution System diff --git a/README.md b/README.md index 271eb2cc..d01074ef 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,61 @@ Interactive D3.js dashboard. Watch your DAG evolve in real-time. Filter, search, --- +## Grounded Retrieval & Provenance + +AIngle includes a retrieval layer that turns a corpus of documents into +**cited, provenance-backed context** for question answering, with an explicit +signal for how well an answer is supported by the underlying sources. The intent +is *grounded* generation: a downstream language model is given only material that +can be traced back to verifiable records, rather than being trusted to recall +facts on its own. + +The pipeline has three stages: + +1. **Ingestion (`aingle_ingest`).** A pure, deterministic extractor maps a + document `(path, content)` to an `Extraction` of **provenanced triples** and + **text chunks**. Structure (headings, sections, links) is preserved as + semantic triples in the graph, while chunks become the unit of retrieval. + Because ingestion is deterministic, the same input always yields the same + triples and chunks — a prerequisite for reproducible provenance. + +2. **Provenance anchoring.** Each ingested unit is tied to the append-only DAG + action that recorded it via a `provenance_anchor` — the signed hash of that + action (see the `dag-sign` feature). This makes a retrieved chunk not just + *findable* but *attestable*: you can point at the exact, tamper-evident record + it came from. + +3. **Grounded retrieval (`service::ground`).** A question is embedded and matched + against the ingested chunks; the service returns a `GroundedContext` + containing the cited chunks (with their `provenance_anchor`) and a + **groundedness** classification: + + | groundedness | meaning | + |--------------|---------| + | `grounded` | strong similarity **and** enough corroborating chunks — the answer is well supported by the sources. | + | `weak` | some relevant material, but below the corroboration threshold — answer with caution. | + | `ungrounded` | no sufficiently similar source — the sources do not support an answer. | + + The groundedness signal is derived from the best match plus a minimum number + of corroborating chunks, so a caller (or an LLM prompted with the context) can + explicitly say when the sources *don't* support an answer instead of + hallucinating one. + +These capabilities are exposed to Model-Context-Protocol clients through three +tools (see the [MCP server](#mcp-server) section): + +| Tool | Purpose | +|------|---------| +| `aingle_ingest` | Ingest a document `(path, content)` → provenanced triples + chunks. | +| `aingle_ground` | Answer a question with cited, provenance-backed context + a groundedness signal. | +| `aingle_sources` | List the sources that have been ingested. | + +Because retrieval results carry provenance anchors into the signed DAG history, +grounded answers are **auditable**: every cited passage resolves to a verifiable +action in the ledger. + +--- + ## Clustering AIngle supports multi-node clustering via Raft consensus for high availability and horizontal scalability. Writes are replicated to all nodes; reads can be served from any node with optional quorum consistency. diff --git a/assets/aingle-legacy.svg b/assets/aingle-legacy.svg new file mode 100644 index 00000000..e55a326d --- /dev/null +++ b/assets/aingle-legacy.svg @@ -0,0 +1,80 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/aingle.svg b/assets/aingle.svg index e55a326d..b1651e21 100644 --- a/assets/aingle.svg +++ b/assets/aingle.svg @@ -1,80 +1,11 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + diff --git a/assets/logo-legacy.svg b/assets/logo-legacy.svg new file mode 100644 index 00000000..e55a326d --- /dev/null +++ b/assets/logo-legacy.svg @@ -0,0 +1,80 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/assets/logo.svg b/assets/logo.svg index e55a326d..b1651e21 100644 --- a/assets/logo.svg +++ b/assets/logo.svg @@ -1,80 +1,11 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + diff --git a/crates/aingle/Cargo.toml b/crates/aingle/Cargo.toml index f1fe24cd..2966aac9 100644 --- a/crates/aingle/Cargo.toml +++ b/crates/aingle/Cargo.toml @@ -26,14 +26,14 @@ ghost_actor = "0.3.0-alpha.1" ai_hash = { version = ">=0.0.1", path = "../ai_hash", features = ["full"] } aingle_cascade = { version = "0.0.1", path = "../aingle_cascade" } aingle_conductor_api = { version = "0.0.1", path = "../aingle_conductor_api" } -aingle_ai = { version = "0.6", path = "../aingle_ai", optional = true } +aingle_ai = { version = "0.7", path = "../aingle_ai", optional = true } aingle_keystore = { version = "0.0.1", path = "../aingle_keystore" } aingle_p2p = { version = "0.0.1", path = "../aingle_p2p" } aingle_sqlite = { version = "0.0.1", path = "../aingle_sqlite" } aingle_middleware_bytes = "=0.0.3" aingle_state = { version = "0.0.1", path = "../aingle_state" } aingle_types = { version = "0.0.1", path = "../aingle_types" } -aingle_cortex = { version = "0.6", path = "../aingle_cortex", default-features = false, features = ["rest"] } +aingle_cortex = { version = "0.7", path = "../aingle_cortex", default-features = false, features = ["rest"] } aingle_wasmer_host = "0.0.1" aingle_websocket = { version = "0.0.1", path = "../aingle_websocket" } wasmer = "=7.0.1" diff --git a/crates/aingle_ai/Cargo.toml b/crates/aingle_ai/Cargo.toml index a087fc5a..5f007c88 100644 --- a/crates/aingle_ai/Cargo.toml +++ b/crates/aingle_ai/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "aingle_ai" -version = "0.6.3" +version = "0.7.0" description = "AI integration layer for AIngle - Ineru, Nested Learning, Kaneru" license = "Apache-2.0 OR LicenseRef-Commercial" repository = "https://github.com/ApiliumCode/aingle" diff --git a/crates/aingle_ai/src/config.rs b/crates/aingle_ai/src/config.rs index 9b55a283..907845b3 100644 --- a/crates/aingle_ai/src/config.rs +++ b/crates/aingle_ai/src/config.rs @@ -3,9 +3,9 @@ //! Global AI configuration +use crate::ineru::IneruConfig; use crate::kaneru::KaneruConfig; use crate::nested_learning::NestedConfig; -use crate::ineru::IneruConfig; use serde::{Deserialize, Serialize}; /// Global AI configuration for AIngle nodes diff --git a/crates/aingle_ai/src/emergent/mod.rs b/crates/aingle_ai/src/emergent/mod.rs index d591a5dc..982d8c20 100644 --- a/crates/aingle_ai/src/emergent/mod.rs +++ b/crates/aingle_ai/src/emergent/mod.rs @@ -17,8 +17,8 @@ mod predictive_validator; pub use adaptive_consensus::AdaptiveConsensus; pub use predictive_validator::PredictiveValidator; -use crate::nested_learning::NestedLearning; use crate::ineru::IneruMemory; +use crate::nested_learning::NestedLearning; use crate::types::{AiTransaction, ConsensusLevel, ValidationPrediction}; /// Unified AI layer combining all capabilities @@ -39,8 +39,8 @@ pub struct AiLayer { impl AiLayer { /// Create a new AI layer with default configuration pub fn new() -> Self { - use crate::nested_learning::NestedConfig; use crate::ineru::IneruConfig; + use crate::nested_learning::NestedConfig; Self { ineru: IneruMemory::new(IneruConfig::default()), diff --git a/crates/aingle_ai/src/emergent/predictive_validator.rs b/crates/aingle_ai/src/emergent/predictive_validator.rs index d96d7f23..cd886250 100644 --- a/crates/aingle_ai/src/emergent/predictive_validator.rs +++ b/crates/aingle_ai/src/emergent/predictive_validator.rs @@ -5,8 +5,8 @@ //! //! Predict validation outcome before full validation. -use crate::nested_learning::NestedLearning; use crate::ineru::IneruMemory; +use crate::nested_learning::NestedLearning; use crate::types::{AiTransaction, ValidationPrediction}; /// Predict validation outcome before full validation @@ -160,8 +160,8 @@ pub struct PredictionAccuracy { #[cfg(test)] mod tests { use super::*; - use crate::nested_learning::NestedConfig; use crate::ineru::IneruConfig; + use crate::nested_learning::NestedConfig; fn make_test_tx(id: u8) -> AiTransaction { AiTransaction { diff --git a/crates/aingle_ai/src/lib.rs b/crates/aingle_ai/src/lib.rs index 1be71d33..3861df3c 100644 --- a/crates/aingle_ai/src/lib.rs +++ b/crates/aingle_ai/src/lib.rs @@ -58,9 +58,9 @@ #![warn(clippy::all)] pub mod emergent; +pub mod ineru; pub mod kaneru; pub mod nested_learning; -pub mod ineru; mod config; mod error; @@ -75,7 +75,7 @@ pub mod prelude { pub use crate::config::AiConfig; pub use crate::emergent::{AdaptiveConsensus, PredictiveValidator}; pub use crate::error::{AiError, AiResult}; + pub use crate::ineru::{IneruConfig, IneruMemory, LongTermMemory, ShortTermMemory}; pub use crate::kaneru::{KaneruAgent, KaneruConfig}; pub use crate::nested_learning::{NestedConfig, NestedLearning}; - pub use crate::ineru::{LongTermMemory, ShortTermMemory, IneruConfig, IneruMemory}; } diff --git a/crates/aingle_contracts/Cargo.toml b/crates/aingle_contracts/Cargo.toml index c3ea5927..4c43ac3c 100644 --- a/crates/aingle_contracts/Cargo.toml +++ b/crates/aingle_contracts/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "aingle_contracts" -version = "0.6.3" +version = "0.7.0" description = "Smart Contracts DSL and WASM Runtime for AIngle" license = "Apache-2.0 OR LicenseRef-Commercial" repository = "https://github.com/ApiliumCode/aingle" diff --git a/crates/aingle_cortex/Cargo.toml b/crates/aingle_cortex/Cargo.toml index 7ceecf18..84d9bd75 100644 --- a/crates/aingle_cortex/Cargo.toml +++ b/crates/aingle_cortex/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "aingle_cortex" -version = "0.6.3" +version = "0.7.0" description = "Córtex API - REST/GraphQL/SPARQL interface for AIngle semantic graphs" license = "Apache-2.0 OR LicenseRef-Commercial" repository = "https://github.com/ApiliumCode/aingle" @@ -25,6 +25,9 @@ dag = ["cluster", "aingle_graph/dag", "aingle_graph/dag-sign", "aingle_raft/dag" mcp = ["dep:rmcp", "dep:schemars"] mcp-http = ["mcp", "rmcp/transport-streamable-http-server", "rmcp/server-side-http"] mcp-oauth = ["mcp-http", "dep:jsonwebtoken"] +# Real neural embeddings: forwards to ineru's fastembed-backed embedder. +# Off by default — default cortex build stays hash-only (MSRV 1.83 unaffected). +neural-embeddings = ["ineru/neural-embeddings"] full =["rest", "graphql", "sparql", "auth", "dag"] [[bin]] @@ -33,10 +36,12 @@ path = "src/main.rs" [dependencies] # Core AIngle crates -aingle_graph = { version = "0.6", path = "../aingle_graph", features = ["sled-backend"] } -aingle_logic = { version = "0.6", path = "../aingle_logic" } -aingle_zk = { version = "0.6", path = "../aingle_zk" } -ineru = { version = "0.6", path = "../ineru" } +aingle_graph = { version = "0.7", path = "../aingle_graph", features = ["sled-backend"] } +aingle_logic = { version = "0.7", path = "../aingle_logic" } +aingle_zk = { version = "0.7", path = "../aingle_zk" } +ineru = { version = "0.7", path = "../ineru" } +aingle_ingest = { version = "0.7", path = "../aingle_ingest" } +ignore = "0.4" # Web framework axum = { version = "0.8", features = ["ws", "macros"] } @@ -103,8 +108,8 @@ rcgen = { version = "0.13", optional = true } ed25519-dalek = { version = "2", features = ["rand_core"], optional = true } hex = { version = "0.4", optional = true } # Clustering (optional) -aingle_wal = { version = "0.6", path = "../aingle_wal", optional = true } -aingle_raft = { version = "0.6", path = "../aingle_raft", optional = true } +aingle_wal = { version = "0.7", path = "../aingle_wal", optional = true } +aingle_raft = { version = "0.7", path = "../aingle_raft", optional = true } openraft = { version = "0.10.0-alpha.17", features = ["serde", "type-alias"], optional = true } tokio-rustls = { version = "0.26", default-features = false, features = ["ring"], optional = true } rustls-pemfile = { version = "2", optional = true } diff --git a/crates/aingle_cortex/src/auth/jwt.rs b/crates/aingle_cortex/src/auth/jwt.rs index e15597a8..34c59a76 100644 --- a/crates/aingle_cortex/src/auth/jwt.rs +++ b/crates/aingle_cortex/src/auth/jwt.rs @@ -379,7 +379,10 @@ mod tests { #[test] fn test_token_roundtrip() { - std::env::set_var("AINGLE_JWT_SECRET", "test-secret-only-do-not-use-in-production-64bytes-pad"); + std::env::set_var( + "AINGLE_JWT_SECRET", + "test-secret-only-do-not-use-in-production-64bytes-pad", + ); let claims = Claims::new_access("user123", vec!["user".to_string()]); let token = encode( diff --git a/crates/aingle_cortex/src/client.rs b/crates/aingle_cortex/src/client.rs index eebfdb77..6959f82e 100644 --- a/crates/aingle_cortex/src/client.rs +++ b/crates/aingle_cortex/src/client.rs @@ -8,9 +8,8 @@ //! the knowledge layer. use crate::wasm_types::{ - GraphQueryInput, GraphQueryOutput, GraphStoreInput, GraphStoreOutput, - MemoryRecallInput, MemoryRecallOutput, MemoryRememberInput, MemoryRememberOutput, - Triple, ObjectValue, + GraphQueryInput, GraphQueryOutput, GraphStoreInput, GraphStoreOutput, MemoryRecallInput, + MemoryRecallOutput, MemoryRememberInput, MemoryRememberOutput, ObjectValue, Triple, }; use serde::{Deserialize, Serialize}; @@ -197,7 +196,10 @@ impl CortexInternalClient { /// Query the semantic graph. pub async fn graph_query(&self, input: GraphQueryInput) -> Result { let (subject, predicate) = if let Some(ref pattern) = input.pattern { - (pattern.subject.clone().or(input.subject), pattern.predicate.clone().or(input.predicate)) + ( + pattern.subject.clone().or(input.subject), + pattern.predicate.clone().or(input.predicate), + ) } else { (input.subject, input.predicate) }; @@ -205,23 +207,28 @@ impl CortexInternalClient { let body = PatternQueryRequest { subject, predicate, - object: input.pattern.as_ref() + object: input + .pattern + .as_ref() .and_then(|p| p.object.as_ref()) .map(Self::object_to_json), limit: input.limit, }; - let req = self.apply_auth( - self.http.post(self.url("/api/v1/query")).json(&body), - ); + let req = self.apply_auth(self.http.post(self.url("/api/v1/query")).json(&body)); - let resp = req.send().await.map_err(|e| format!("Cortex query failed: {}", e))?; + let resp = req + .send() + .await + .map_err(|e| format!("Cortex query failed: {}", e))?; if !resp.status().is_success() { return Err(format!("Cortex query returned {}", resp.status())); } - let result: PatternQueryResponse = resp.json().await + let result: PatternQueryResponse = resp + .json() + .await .map_err(|e| format!("Failed to parse Cortex response: {}", e))?; Ok(GraphQueryOutput { @@ -238,17 +245,20 @@ impl CortexInternalClient { object: Self::object_to_json(&input.object), }; - let req = self.apply_auth( - self.http.post(self.url("/api/v1/triples")).json(&body), - ); + let req = self.apply_auth(self.http.post(self.url("/api/v1/triples")).json(&body)); - let resp = req.send().await.map_err(|e| format!("Cortex store failed: {}", e))?; + let resp = req + .send() + .await + .map_err(|e| format!("Cortex store failed: {}", e))?; if !resp.status().is_success() { return Err(format!("Cortex store returned {}", resp.status())); } - let result: CreateTripleResponse = resp.json().await + let result: CreateTripleResponse = resp + .json() + .await .map_err(|e| format!("Failed to parse Cortex response: {}", e))?; Ok(GraphStoreOutput { @@ -257,7 +267,10 @@ impl CortexInternalClient { } /// Recall memories from the Titans system. - pub async fn memory_recall(&self, input: MemoryRecallInput) -> Result { + pub async fn memory_recall( + &self, + input: MemoryRecallInput, + ) -> Result { let body = MemoryRecallRequest { query: input.query, entry_type: input.entry_type, @@ -265,34 +278,46 @@ impl CortexInternalClient { }; let req = self.apply_auth( - self.http.post(self.url("/api/v1/memory/recall")).json(&body), + self.http + .post(self.url("/api/v1/memory/recall")) + .json(&body), ); - let resp = req.send().await.map_err(|e| format!("Titans recall failed: {}", e))?; + let resp = req + .send() + .await + .map_err(|e| format!("Titans recall failed: {}", e))?; if !resp.status().is_success() { return Err(format!("Titans recall returned {}", resp.status())); } - let result: MemoryRecallResponse = resp.json().await + let result: MemoryRecallResponse = resp + .json() + .await .map_err(|e| format!("Failed to parse Titans response: {}", e))?; Ok(MemoryRecallOutput { - results: result.results.iter().map(|r| { - crate::wasm_types::MemoryResult { + results: result + .results + .iter() + .map(|r| crate::wasm_types::MemoryResult { id: r.id.clone(), data: r.data.clone(), entry_type: r.entry_type.clone(), tags: r.tags.clone(), importance: r.importance, created_at: r.created_at.clone(), - } - }).collect(), + }) + .collect(), }) } /// Store a new memory in the Titans system. - pub async fn memory_remember(&self, input: MemoryRememberInput) -> Result { + pub async fn memory_remember( + &self, + input: MemoryRememberInput, + ) -> Result { let body = MemoryRememberRequest { data: input.data, entry_type: input.entry_type, @@ -301,16 +326,23 @@ impl CortexInternalClient { }; let req = self.apply_auth( - self.http.post(self.url("/api/v1/memory/remember")).json(&body), + self.http + .post(self.url("/api/v1/memory/remember")) + .json(&body), ); - let resp = req.send().await.map_err(|e| format!("Titans remember failed: {}", e))?; + let resp = req + .send() + .await + .map_err(|e| format!("Titans remember failed: {}", e))?; if !resp.status().is_success() { return Err(format!("Titans remember returned {}", resp.status())); } - let result: MemoryRememberResponse = resp.json().await + let result: MemoryRememberResponse = resp + .json() + .await .map_err(|e| format!("Failed to parse Titans response: {}", e))?; Ok(MemoryRememberOutput { id: result.id }) @@ -318,7 +350,11 @@ impl CortexInternalClient { /// Check if Cortex is healthy and reachable. pub async fn health_check(&self) -> bool { - match self.apply_auth(self.http.get(self.url("/api/v1/health"))).send().await { + match self + .apply_auth(self.http.get(self.url("/api/v1/health"))) + .send() + .await + { Ok(resp) => resp.status().is_success(), Err(_) => false, } diff --git a/crates/aingle_cortex/src/embedder.rs b/crates/aingle_cortex/src/embedder.rs new file mode 100644 index 00000000..de64d837 --- /dev/null +++ b/crates/aingle_cortex/src/embedder.rs @@ -0,0 +1,239 @@ +// Copyright 2019-2026 Apilium Technologies OÜ. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR Commercial + +//! Embedder selection and index-migration helpers for Cortex. +//! +//! Chooses a `NeuralEmbedder` when the `neural-embeddings` feature is on and a +//! model directory is available, else falls back to `HashEmbedder`. Also owns +//! the dimension-sidecar bookkeeping used to detect an embedder change and the +//! registry-clear that forces a re-ingest after one. + +use ineru::{Embedder, Embedding, HashEmbedder}; +use std::sync::Arc; + +/// Builds the active embedder. Returns a `NeuralEmbedder` only when cortex is +/// compiled with `neural-embeddings` AND `model_dir` is `Some` AND the model +/// loads; otherwise a `HashEmbedder`. Never panics — embedding must not be able +/// to take the server down. +pub fn build_embedder(model_dir: Option<&str>) -> Arc { + #[cfg(feature = "neural-embeddings")] + if let Some(dir) = model_dir { + match ineru::NeuralEmbedder::from_path(std::path::Path::new(dir)) { + Ok(e) => { + log::info!("Using neural embedder from {dir}"); + return Arc::new(e); + } + Err(e) => { + log::warn!("Failed to load neural embedder from {dir}: {e}. Using hash embedder."); + } + } + } + #[cfg(not(feature = "neural-embeddings"))] + if model_dir.is_some() { + log::warn!( + "--embed-model was set but cortex was built without the `neural-embeddings` \ + feature; using the hash embedder." + ); + } + Arc::new(HashEmbedder::new()) +} + +/// Reads the persisted embedder dimensionality from `/embedder.dims`. +/// Returns `None` if the sidecar is absent or unparseable. +pub fn read_dims(dir: &std::path::Path) -> Option { + let raw = std::fs::read_to_string(dir.join("embedder.dims")).ok()?; + raw.trim().parse::().ok() +} + +/// Writes the active embedder dimensionality to `/embedder.dims`. +pub fn write_dims(dir: &std::path::Path, dims: usize) { + if let Err(e) = std::fs::write(dir.join("embedder.dims"), dims.to_string()) { + log::warn!("Failed to write embedder.dims sidecar: {e}"); + } +} + +/// Deletes every `aingle:source_hash` registry triple so the next ingest treats +/// all files as new and re-embeds them. Returns the number removed. +pub fn clear_source_registry(graph: &aingle_graph::GraphDB) -> Result { + use aingle_graph::{Predicate, TriplePattern}; + let pattern = TriplePattern::any() + .with_predicate(Predicate::named(crate::service::ingest::PRED_SOURCE_HASH)); + let ids: Vec<_> = graph.find(pattern)?.into_iter().map(|t| t.id()).collect(); + let mut removed = 0; + for id in &ids { + match graph.delete(id) { + Ok(true) => removed += 1, + Ok(false) => {} // already gone — fine + Err(e) => log::warn!("clear_source_registry: delete failed for {id:?}: {e}"), + } + } + Ok(removed) +} + +/// An embedder whose inner delegate can be hot-swapped at runtime while its +/// reported dimensionality stays FIXED. Lets a UI start immediately with a +/// "pending" embedder and install the real (slow-to-load) model later WITHOUT +/// ever changing the vector dimension — so a dimension-keyed index (HNSW) stays +/// consistent. Stored vectors must only be produced AFTER a real delegate is +/// installed; the caller gates ingest on readiness. +pub struct SwappableEmbedder { + inner: std::sync::RwLock>, + dims: usize, +} + +/// Placeholder delegate before the real model is installed. Returns a zero vector +/// of the fixed dims — harmless for queries (cosine 0 → "ungrounded") and never +/// used for stored passages because ingest is gated on readiness. +struct PendingEmbedder { + dims: usize, +} + +impl Embedder for PendingEmbedder { + fn embed_passage(&self, _text: &str) -> Embedding { + Embedding::new(vec![0.0; self.dims]) + } + fn embed_query(&self, _text: &str) -> Embedding { + Embedding::new(vec![0.0; self.dims]) + } + fn dimensions(&self) -> usize { + self.dims + } +} + +impl SwappableEmbedder { + /// Creates a swappable embedder in the pending state with a fixed dimension. + pub fn new_pending(dims: usize) -> Self { + Self { + inner: std::sync::RwLock::new(Arc::new(PendingEmbedder { dims })), + dims, + } + } + + /// Installs the real delegate. The delegate MUST report the same dimension + /// this swappable was created with; a mismatch is logged and ignored so the + /// index dimension can never change underneath stored vectors. + pub fn install(&self, delegate: Arc) { + if delegate.dimensions() != self.dims { + log::warn!( + "SwappableEmbedder.install rejected: delegate dims {} != fixed {}", + delegate.dimensions(), + self.dims + ); + return; + } + *self.inner.write().expect("swappable embedder poisoned") = delegate; + } +} + +impl Embedder for SwappableEmbedder { + fn embed_passage(&self, text: &str) -> Embedding { + let inner = self + .inner + .read() + .expect("swappable embedder poisoned") + .clone(); + inner.embed_passage(text) + } + fn embed_query(&self, text: &str) -> Embedding { + let inner = self + .inner + .read() + .expect("swappable embedder poisoned") + .clone(); + inner.embed_query(text) + } + fn dimensions(&self) -> usize { + self.dims + } + fn relevance_thresholds(&self) -> (f32, f32) { + let inner = self + .inner + .read() + .expect("swappable embedder poisoned") + .clone(); + inner.relevance_thresholds() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn build_embedder_without_model_is_hash_64d() { + let e = build_embedder(None); + assert_eq!(e.dimensions(), 64); + } + + #[test] + fn build_embedder_missing_dir_falls_back_to_hash() { + let e = build_embedder(Some("/nonexistent/model/dir")); + assert_eq!(e.dimensions(), 64); + } + + #[test] + fn dims_sidecar_round_trips() { + let dir = tempfile::tempdir().unwrap(); + write_dims(dir.path(), 384); + assert_eq!(read_dims(dir.path()), Some(384)); + } + + #[test] + fn read_dims_absent_is_none() { + let dir = tempfile::tempdir().unwrap(); + assert_eq!(read_dims(dir.path()), None); + } + + #[test] + fn clear_source_registry_on_empty_graph_is_zero() { + let graph = aingle_graph::GraphDB::memory().unwrap(); + assert_eq!(clear_source_registry(&graph).unwrap(), 0); + } + + #[test] + fn swappable_reports_fixed_dims_before_and_after_install() { + let s = SwappableEmbedder::new_pending(384); + assert_eq!(s.dimensions(), 384); + let q = s.embed_query("hola"); + assert_eq!(q.0.len(), 384); + assert!(q.0.iter().all(|x| *x == 0.0)); + s.install(std::sync::Arc::new(Fake384)); + assert_eq!(s.dimensions(), 384); + let q2 = s.embed_query("hola"); + assert_eq!(q2.0.len(), 384); + assert!(q2.0.iter().any(|x| *x != 0.0)); + } + + #[test] + fn swappable_rejects_mismatched_dims_install() { + let s = SwappableEmbedder::new_pending(384); + s.install(std::sync::Arc::new(ineru::HashEmbedder::new())); // 64d → rejected + let q = s.embed_query("x"); + assert_eq!(q.0.len(), 384); + assert!(q.0.iter().all(|x| *x == 0.0)); + } + + #[test] + fn swappable_delegates_relevance_thresholds_after_install() { + let s = SwappableEmbedder::new_pending(384); + s.install(std::sync::Arc::new(Fake384)); + assert_eq!(s.relevance_thresholds(), (0.80, 0.77)); + } + + /// 384-dim test delegate with non-zero output and the e5 thresholds. + struct Fake384; + impl ineru::Embedder for Fake384 { + fn embed_passage(&self, _t: &str) -> ineru::Embedding { + ineru::Embedding::new(vec![0.5; 384]) + } + fn embed_query(&self, _t: &str) -> ineru::Embedding { + ineru::Embedding::new(vec![0.5; 384]) + } + fn dimensions(&self) -> usize { + 384 + } + fn relevance_thresholds(&self) -> (f32, f32) { + (0.80, 0.77) + } + } +} diff --git a/crates/aingle_cortex/src/graphql/mod.rs b/crates/aingle_cortex/src/graphql/mod.rs index 633350a9..2387c9d8 100644 --- a/crates/aingle_cortex/src/graphql/mod.rs +++ b/crates/aingle_cortex/src/graphql/mod.rs @@ -51,8 +51,10 @@ async fn graphql_handler( /// GraphiQL IDE async fn graphql_playground() -> impl IntoResponse { - Html(async_graphql::http::GraphiQLSource::build() - .endpoint("/graphql") - .subscription_endpoint("/graphql/ws") - .finish()) + Html( + async_graphql::http::GraphiQLSource::build() + .endpoint("/graphql") + .subscription_endpoint("/graphql/ws") + .finish(), + ) } diff --git a/crates/aingle_cortex/src/graphql/resolvers.rs b/crates/aingle_cortex/src/graphql/resolvers.rs index b7f1d13c..c86355fe 100644 --- a/crates/aingle_cortex/src/graphql/resolvers.rs +++ b/crates/aingle_cortex/src/graphql/resolvers.rs @@ -191,6 +191,7 @@ impl MutationRoot { subject: input.subject.clone(), predicate: input.predicate.clone(), object: serde_json::json!({}), + provenance: None, }], }, signature: None, @@ -200,9 +201,9 @@ impl MutationRoot { key.sign(&mut action); } - dag_store.put(&action).map_err(|e| { - Error::new(format!("DAG action failed: {e}")) - })?; + dag_store + .put(&action) + .map_err(|e| Error::new(format!("DAG action failed: {e}")))?; } } @@ -258,9 +259,9 @@ impl MutationRoot { key.sign(&mut action); } - dag_store.put(&action).map_err(|e| { - Error::new(format!("DAG action failed: {e}")) - })?; + dag_store + .put(&action) + .map_err(|e| Error::new(format!("DAG action failed: {e}")))?; } } diff --git a/crates/aingle_cortex/src/lib.rs b/crates/aingle_cortex/src/lib.rs index 4372feed..f1c83b62 100644 --- a/crates/aingle_cortex/src/lib.rs +++ b/crates/aingle_cortex/src/lib.rs @@ -166,6 +166,7 @@ pub mod auth; pub mod client; #[cfg(feature = "cluster")] pub mod cluster_init; +pub mod embedder; pub mod error; #[cfg(feature = "graphql")] pub mod graphql; diff --git a/crates/aingle_cortex/src/main.rs b/crates/aingle_cortex/src/main.rs index d37b9439..374b60f0 100644 --- a/crates/aingle_cortex/src/main.rs +++ b/crates/aingle_cortex/src/main.rs @@ -42,6 +42,7 @@ async fn main() -> Result<(), Box> { } let mut config = CortexConfig::default(); + config.embed_model = std::env::var("AINGLE_EMBED_MODEL").ok(); // Simple argument parsing let mut i = 1; @@ -68,6 +69,12 @@ async fn main() -> Result<(), Box> { i += 1; } } + "--embed-model" => { + if i + 1 < args.len() { + config.embed_model = Some(args[i + 1].clone()); + i += 1; + } + } "--memory" => { config.db_path = Some(":memory:".to_string()); } @@ -338,6 +345,7 @@ fn print_help() { " --db Path to graph database (default: ~/.aingle/cortex/graph.sled)" ); println!(" --memory Use volatile in-memory storage (no persistence)"); + println!(" --embed-model Directory with a neural embedding model (requires --features neural-embeddings; falls back to hash if absent)"); println!(" --flush-interval Periodic flush interval in seconds (default: 300, 0=off)"); println!(" --mcp Serve MCP over stdio (requires --features mcp)"); println!( diff --git a/crates/aingle_cortex/src/mcp/server.rs b/crates/aingle_cortex/src/mcp/server.rs index f72abdea..b4c97337 100644 --- a/crates/aingle_cortex/src/mcp/server.rs +++ b/crates/aingle_cortex/src/mcp/server.rs @@ -23,7 +23,7 @@ pub struct DagHistoryParams { #[cfg(feature = "dag")] fn default_hist_limit() -> usize { - 50 + crate::service::dag::DEFAULT_HISTORY_LIMIT } /// Parameters for the `aingle_dag_action` tool. @@ -90,6 +90,108 @@ impl AingleMcp { "pong".to_string() } + /// Ingest a markdown vault / code repo into the graph + memory with provenance. + #[tool( + description = "Ingest a markdown vault or code repo: auto-extracts triples \ + (frontmatter, wikilinks, headings, tags), indexes text chunks for \ + semantic recall, and records signed provenance. Incremental: unchanged \ + files are skipped." + )] + async fn aingle_ingest( + &self, + params: Parameters, + ) -> Result { + let Parameters(p) = params; + let resp = crate::service::ingest::ingest_path(&self.state, &p.path, None) + .await + .map_err(super::convert::to_mcp_error)?; + Ok(CallToolResult::success(vec![Content::json(resp)?])) + } + + /// Grounded retrieval: cited, provenance-backed context for a question. + #[tool( + description = "Answer-grounding for a question. Returns cited source chunks \ + (path:lines) with a signed-provenance anchor and a groundedness signal. \ + Answer ONLY from the returned context; if groundedness is not 'grounded', \ + say so and do not invent.", + annotations(read_only_hint = true) + )] + async fn aingle_ground( + &self, + params: Parameters, + ) -> Result { + let Parameters(p) = params; + let resp = crate::service::ground::ground(&self.state, &p.question, p.k) + .await + .map_err(super::convert::to_mcp_error)?; + Ok(CallToolResult::success(vec![Content::json(resp)?])) + } + + /// Verified backlinks + outgoing links + unlinked mentions for a note. + #[tool( + description = "Verified backlinks, outgoing links, and unlinked mentions for a note. \ + Each backlink includes the source's context line and a signed-provenance anchor \ + when available. Use for accurate reverse navigation.", + annotations(read_only_hint = true) + )] + async fn aingle_backlinks( + &self, + params: Parameters, + ) -> Result { + let Parameters(p) = params; + let resp = crate::service::backlinks::backlinks(&self.state, &p.note).await; + Ok(CallToolResult::success(vec![Content::json(resp)?])) + } + + /// Verified context bundle for a note: semantically-related notes (by meaning, + /// not just links) with the matching passage and signed provenance. + #[tool( + description = "Verified context bundle for a note: notes that are semantically \ + related by meaning (not just by explicit links), each with the matching \ + passage as evidence and a signed-provenance anchor when available. Use to \ + answer grounded in a note's verified neighborhood without hallucinating.", + annotations(read_only_hint = true) + )] + async fn aingle_note_context( + &self, + params: Parameters, + ) -> Result { + let Parameters(p) = params; + let resp = crate::service::context::note_context_cached( + &self.state, + &p.note, + p.limit.unwrap_or(8), + ) + .await; + Ok(CallToolResult::success(vec![Content::json(resp)?])) + } + + /// List ingested sources and their signed content hashes. + #[tool( + description = "List ingested source files with their content hashes (the \ + signed provenance registry).", + annotations(read_only_hint = true) + )] + async fn aingle_sources(&self) -> Result { + let resp = crate::service::ingest::list_sources(&self.state) + .await + .map_err(super::convert::to_mcp_error)?; + Ok(CallToolResult::success(vec![Content::json(resp)?])) + } + + /// Vault Map & Navigation Manual: entry points, topics, orphans, indices, + /// and guidance for navigating the vault accurately before answering. + #[tool( + description = "Vault map & navigation manual: hub entry-points, semantic topic \ + clusters, orphan notes, tag/type indices, and guidance. Call this FIRST to \ + navigate a vault accurately, then aingle_ground each claim.", + annotations(read_only_hint = true) + )] + async fn aingle_vault_map(&self) -> Result { + let resp = crate::service::vault_map::vault_map_cached(&self.state).await; + Ok(CallToolResult::success(vec![Content::json(resp)?])) + } + /// Query the semantic graph by triple pattern (any field omitted = wildcard). #[tool( description = "Query the semantic graph by triple pattern. Omit a field to wildcard it.", @@ -536,6 +638,43 @@ impl AingleMcp { } } +/// Parameters for the `aingle_ingest` tool. +#[derive(serde::Deserialize, schemars::JsonSchema)] +pub struct IngestParams { + /// Absolute or relative path to the vault/repo root to ingest. + pub path: String, +} + +/// Parameters for the `aingle_ground` tool. +#[derive(serde::Deserialize, schemars::JsonSchema)] +pub struct GroundParams { + /// The question to ground against ingested sources. + pub question: String, + /// Max chunks to retrieve. + #[serde(default = "default_ground_k")] + pub k: usize, +} + +fn default_ground_k() -> usize { + 6 +} + +/// Parameters for the `aingle_backlinks` tool. +#[derive(serde::Deserialize, schemars::JsonSchema)] +pub struct BacklinksParams { + /// Note path (vault-relative) to get backlinks for, e.g. "ideas/sled.md". + pub note: String, +} + +/// Parameters for the `aingle_note_context` tool. +#[derive(serde::Deserialize, schemars::JsonSchema)] +pub struct NoteContextParams { + /// Note path (vault-relative) to get the verified context bundle for. + pub note: String, + /// Max number of related neighbors to return (default 8). + pub limit: Option, +} + #[tool_handler(router = self.tool_router)] impl ServerHandler for AingleMcp { fn get_info(&self) -> ServerInfo { @@ -549,3 +688,33 @@ impl ServerHandler for AingleMcp { info } } + +#[cfg(test)] +mod ingest_tools_tests { + use super::*; + + #[test] + fn router_exposes_ingest_ground_sources() { + let state = AppState::with_db_path(":memory:", None).unwrap(); + let mcp = AingleMcp::new(state); + let names: Vec = mcp + .tool_router + .list_all() + .into_iter() + .map(|t| t.name.to_string()) + .collect(); + for expected in [ + "aingle_ingest", + "aingle_ground", + "aingle_sources", + "aingle_vault_map", + "aingle_backlinks", + "aingle_note_context", + ] { + assert!( + names.contains(&expected.to_string()), + "missing tool {expected}" + ); + } + } +} diff --git a/crates/aingle_cortex/src/middleware/mod.rs b/crates/aingle_cortex/src/middleware/mod.rs index 3aaa1c09..0f9472bb 100644 --- a/crates/aingle_cortex/src/middleware/mod.rs +++ b/crates/aingle_cortex/src/middleware/mod.rs @@ -22,5 +22,5 @@ pub mod namespace; pub mod rate_limit; -pub use namespace::{namespace_extractor, is_in_namespace, scope_subject, RequestNamespace}; +pub use namespace::{is_in_namespace, namespace_extractor, scope_subject, RequestNamespace}; pub use rate_limit::{RateLimitError, RateLimiter, RateLimiterLayer}; diff --git a/crates/aingle_cortex/src/middleware/namespace.rs b/crates/aingle_cortex/src/middleware/namespace.rs index 31ba729a..f46f2b1c 100644 --- a/crates/aingle_cortex/src/middleware/namespace.rs +++ b/crates/aingle_cortex/src/middleware/namespace.rs @@ -6,12 +6,7 @@ //! Extracts the `namespace` from JWT claims and injects it into Axum request //! extensions so downstream handlers can scope queries/mutations by namespace. -use axum::{ - body::Body, - http::Request, - middleware::Next, - response::Response, -}; +use axum::{body::Body, http::Request, middleware::Next, response::Response}; /// Namespace extracted from JWT claims, available via request extensions. #[derive(Debug, Clone)] @@ -22,10 +17,7 @@ pub struct RequestNamespace(pub Option); /// If auth is not enabled or no namespace is present in the token, sets `None`. /// Downstream handlers can read `RequestNamespace` from extensions and enforce /// namespace boundaries accordingly. -pub async fn namespace_extractor( - mut req: Request, - next: Next, -) -> Response { +pub async fn namespace_extractor(mut req: Request, next: Next) -> Response { // Try to extract namespace from the Authorization header let namespace = extract_namespace_from_token(&req); req.extensions_mut().insert(RequestNamespace(namespace)); diff --git a/crates/aingle_cortex/src/middleware/rate_limit.rs b/crates/aingle_cortex/src/middleware/rate_limit.rs index dcf490b6..937ccf3c 100644 --- a/crates/aingle_cortex/src/middleware/rate_limit.rs +++ b/crates/aingle_cortex/src/middleware/rate_limit.rs @@ -66,10 +66,9 @@ impl IntoResponse for RateLimitError { .into_response(); // Add Retry-After header (infallible: From for HeaderValue) - response.headers_mut().insert( - "Retry-After", - HeaderValue::from(*secs), - ); + response + .headers_mut() + .insert("Retry-After", HeaderValue::from(*secs)); // Add rate limit headers response @@ -282,11 +281,9 @@ where // 1. If behind a proxy, try X-Forwarded-For / X-Real-IP headers. // 2. Fall back to ConnectInfo (direct connection IP). let ip = if limiter.secure_ip { - extract_proxy_ip(&req) - .or_else(|| extract_connect_ip(&req)) + extract_proxy_ip(&req).or_else(|| extract_connect_ip(&req)) } else { - extract_connect_ip(&req) - .or_else(|| extract_proxy_ip(&req)) + extract_connect_ip(&req).or_else(|| extract_proxy_ip(&req)) }; let ip = match ip { @@ -309,10 +306,7 @@ where "X-RateLimit-Limit", HeaderValue::from(limiter.requests_per_minute), ); - headers.insert( - "X-RateLimit-Remaining", - HeaderValue::from(remaining), - ); + headers.insert("X-RateLimit-Remaining", HeaderValue::from(remaining)); Ok(response) } diff --git a/crates/aingle_cortex/src/p2p/config.rs b/crates/aingle_cortex/src/p2p/config.rs index 68ab7fb3..c44da9f4 100644 --- a/crates/aingle_cortex/src/p2p/config.rs +++ b/crates/aingle_cortex/src/p2p/config.rs @@ -58,17 +58,17 @@ impl P2pConfig { /// Validate configuration values. pub fn validate(&self) -> Result<(), String> { if self.port < 1024 { - return Err(format!( - "p2p port must be >= 1024, got {}", - self.port - )); + return Err(format!("p2p port must be >= 1024, got {}", self.port)); } if let Some(ref seed) = self.seed { if seed.is_empty() { return Err("p2p seed must not be empty".to_string()); } - if !seed.chars().all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-') { + if !seed + .chars() + .all(|c| c.is_ascii_alphanumeric() || c == '_' || c == '-') + { return Err("p2p seed must be alphanumeric (plus _ and -)".to_string()); } } diff --git a/crates/aingle_cortex/src/p2p/dag_sync.rs b/crates/aingle_cortex/src/p2p/dag_sync.rs index 48f7b4fc..f5f6b0fa 100644 --- a/crates/aingle_cortex/src/p2p/dag_sync.rs +++ b/crates/aingle_cortex/src/p2p/dag_sync.rs @@ -31,10 +31,7 @@ pub fn collect_local_tips(graph: &GraphDB) -> (Vec, u64) { /// Given remote tips, compute which actions we have that the remote is missing, /// and return them as serialized bytes ready for sending. #[cfg(feature = "dag")] -pub fn compute_missing_from_tips( - graph: &GraphDB, - remote_tips: &[String], -) -> Vec> { +pub fn compute_missing_from_tips(graph: &GraphDB, remote_tips: &[String]) -> Vec> { let Some(dag_store) = graph.dag_store() else { return Vec::new(); }; @@ -63,10 +60,7 @@ pub fn compute_missing_from_tips( /// Fetch serialized DAG actions by their hex hashes for sending to a peer. #[cfg(feature = "dag")] -pub fn fetch_actions_by_hash( - graph: &GraphDB, - hashes: &[String], -) -> Vec> { +pub fn fetch_actions_by_hash(graph: &GraphDB, hashes: &[String]) -> Vec> { let Some(dag_store) = graph.dag_store() else { return Vec::new(); }; @@ -88,10 +82,7 @@ pub fn fetch_actions_by_hash( /// Ingest received DAG actions into the local store. #[cfg(feature = "dag")] -pub fn ingest_actions( - graph: &GraphDB, - action_bytes_list: &[Vec], -) -> (usize, usize) { +pub fn ingest_actions(graph: &GraphDB, action_bytes_list: &[Vec]) -> (usize, usize) { let Some(dag_store) = graph.dag_store() else { return (0, action_bytes_list.len()); }; diff --git a/crates/aingle_cortex/src/p2p/discovery.rs b/crates/aingle_cortex/src/p2p/discovery.rs index 7778d07b..2bbe70b2 100644 --- a/crates/aingle_cortex/src/p2p/discovery.rs +++ b/crates/aingle_cortex/src/p2p/discovery.rs @@ -42,8 +42,7 @@ mod inner { impl P2pDiscovery { pub fn new(node_id: String, seed_hash: String, port: u16) -> Result { - let daemon = ServiceDaemon::new() - .map_err(|e| format!("mDNS daemon: {}", e))?; + let daemon = ServiceDaemon::new().map_err(|e| format!("mDNS daemon: {}", e))?; Ok(Self { daemon, node_id, @@ -71,10 +70,7 @@ mod inner { return Err("no network interfaces".to_string()); } - let instance_name = format!( - "cortex-{}", - &self.node_id[..8.min(self.node_id.len())] - ); + let instance_name = format!("cortex-{}", &self.node_id[..8.min(self.node_id.len())]); let mut props = HashMap::new(); props.insert("node_id".to_string(), self.node_id.clone()); @@ -177,10 +173,8 @@ mod inner { self.running .store(false, std::sync::atomic::Ordering::SeqCst); if self.registered { - let instance_name = format!( - "cortex-{}", - &self.node_id[..8.min(self.node_id.len())] - ); + let instance_name = + format!("cortex-{}", &self.node_id[..8.min(self.node_id.len())]); let _ = self .daemon .unregister(&format!("{}.{}", instance_name, SERVICE_TYPE)); diff --git a/crates/aingle_cortex/src/p2p/gossip.rs b/crates/aingle_cortex/src/p2p/gossip.rs index 067afb80..b23b7d88 100644 --- a/crates/aingle_cortex/src/p2p/gossip.rs +++ b/crates/aingle_cortex/src/p2p/gossip.rs @@ -437,11 +437,7 @@ impl TripleGossipManager { } /// Find IDs that exist in `our_ids` but are missing from `peer_filter`. - pub fn find_missing( - &self, - peer_filter: &BloomFilter, - our_ids: &[[u8; 32]], - ) -> Vec<[u8; 32]> { + pub fn find_missing(&self, peer_filter: &BloomFilter, our_ids: &[[u8; 32]]) -> Vec<[u8; 32]> { our_ids .iter() .filter(|id| !peer_filter.may_contain(id)) diff --git a/crates/aingle_cortex/src/p2p/identity.rs b/crates/aingle_cortex/src/p2p/identity.rs index cf964682..c3b24237 100644 --- a/crates/aingle_cortex/src/p2p/identity.rs +++ b/crates/aingle_cortex/src/p2p/identity.rs @@ -41,8 +41,8 @@ impl NodeIdentity { // Write with restrictive permissions (Unix 0o600). #[cfg(unix)] { - use std::os::unix::fs::OpenOptionsExt; use std::io::Write; + use std::os::unix::fs::OpenOptionsExt; let mut f = std::fs::OpenOptions::new() .create(true) .write(true) diff --git a/crates/aingle_cortex/src/p2p/manager.rs b/crates/aingle_cortex/src/p2p/manager.rs index 44c00d8a..f6912bea 100644 --- a/crates/aingle_cortex/src/p2p/manager.rs +++ b/crates/aingle_cortex/src/p2p/manager.rs @@ -52,9 +52,7 @@ impl ManualPeerTracker { fn record_failure(&mut self) { self.retries += 1; self.last_attempt = Instant::now(); - self.current_backoff = Duration::from_secs( - (self.current_backoff.as_secs() * 2).min(300), - ); + self.current_backoff = Duration::from_secs((self.current_backoff.as_secs() * 2).min(300)); if self.retries >= self.max_retries { self.abandoned = true; } @@ -85,7 +83,8 @@ impl PingTracker { } fn record_ping(&mut self, addr: SocketAddr, timestamp_ms: u64) { - self.outstanding.insert(addr, (timestamp_ms, Instant::now())); + self.outstanding + .insert(addr, (timestamp_ms, Instant::now())); } fn record_pong(&mut self, addr: &SocketAddr, _timestamp_ms: u64) { @@ -183,9 +182,10 @@ impl P2pManager { } // A3: Load persistent peer store and merge with manual peers. - let peer_store = Arc::new(RwLock::new( - PeerStore::load(&config.data_dir, config.max_peers * 2), - )); + let peer_store = Arc::new(RwLock::new(PeerStore::load( + &config.data_dir, + config.max_peers * 2, + ))); // 6. Connect to manual peers + persisted peers. let triple_count = { @@ -227,7 +227,11 @@ impl P2pManager { sync.write().await.get_peer_state(&stored.addr); } Err(e) => { - tracing::debug!("P2P persisted peer {} unreachable: {}", stored.addr, e); + tracing::debug!( + "P2P persisted peer {} unreachable: {}", + stored.addr, + e + ); } } } @@ -243,8 +247,7 @@ impl P2pManager { ))); // 7. Discovery. - let mut disc = - P2pDiscovery::new(node_id.clone(), seed_hash.clone(), config.port)?; + let mut disc = P2pDiscovery::new(node_id.clone(), seed_hash.clone(), config.port)?; if config.mdns { disc.register()?; disc.start_browsing()?; @@ -309,19 +312,31 @@ impl P2pManager { }; match hello { - P2pMessage::Hello { seed_hash: peer_seed, node_id: peer_nid, .. } => { + P2pMessage::Hello { + seed_hash: peer_seed, + node_id: peer_nid, + .. + } => { let accepted = peer_seed == accept_seed_hash; let ack = P2pMessage::HelloAck { node_id: accept_node_id.clone(), accepted, - reason: if accepted { None } else { Some("seed_mismatch".into()) }, + reason: if accepted { + None + } else { + Some("seed_mismatch".into()) + }, }; if P2pTransport::send_on_conn(&connection, &ack).await.is_err() { continue; } if accepted { - tracing::info!("P2P accepted connection from {} ({})", remote, &peer_nid[..8.min(peer_nid.len())]); + tracing::info!( + "P2P accepted connection from {} ({})", + remote, + &peer_nid[..8.min(peer_nid.len())] + ); // Store connection (brief write lock). transport.write().await.store_connection(remote, connection); // Register in sync manager for gossip. @@ -336,7 +351,10 @@ impl P2pManager { }); let _ = ps.save(); } else { - tracing::warn!("P2P rejected connection from {}: seed mismatch", remote); + tracing::warn!( + "P2P rejected connection from {}: seed mismatch", + remote + ); connection.close(1u32.into(), b"seed_mismatch"); } } @@ -441,13 +459,9 @@ impl P2pManager { #[cfg(feature = "dag")] { let g = graph2.read().await; - let (tips, action_count) = - crate::p2p::dag_sync::collect_local_tips(&g); + let (tips, action_count) = crate::p2p::dag_sync::collect_local_tips(&g); if !tips.is_empty() { - let dag_msg = P2pMessage::DagTipSync { - tips, - action_count, - }; + let dag_msg = P2pMessage::DagTipSync { tips, action_count }; let _ = t.send(&peer_addr, &dag_msg).await; } } @@ -539,13 +553,9 @@ impl P2pManager { ); } } - P2pMessage::BloomSync { - filter_bytes, - .. - } => { + P2pMessage::BloomSync { filter_bytes, .. } => { let peer_filter = BloomFilter::from_bytes(&filter_bytes); - let local_ids: Vec<[u8; 32]> = - sync.read().await.local_ids().to_vec(); + let local_ids: Vec<[u8; 32]> = sync.read().await.local_ids().to_vec(); let missing = gossip.read().await.find_missing(&peer_filter, &local_ids); @@ -601,10 +611,7 @@ impl P2pManager { .filter_map(|tw| tw.to_triple()) .collect(); let g = graph.read().await; - let result = sync - .write() - .await - .store_received_triples(converted, &g); + let result = sync.write().await.store_received_triples(converted, &g); sync.write() .await .record_sync_result(addr, true, result.inserted); @@ -635,7 +642,10 @@ impl P2pManager { } } // A1: Handle incoming deletion announcement. - P2pMessage::AnnounceDelete { triple_id, tombstone_ts } => { + P2pMessage::AnnounceDelete { + triple_id, + tombstone_ts, + } => { if let Some(tid) = TripleId::from_hex(&triple_id) { let mut s = sync.write().await; if !s.has_tombstone(&tid.0) { @@ -682,10 +692,7 @@ impl P2pManager { // A4: Forward pong to health task via channel. P2pMessage::Pong { timestamp_ms, .. } => { let _ = health_tx - .send(HealthEvent::PongReceived { - addr, - timestamp_ms, - }) + .send(HealthEvent::PongReceived { addr, timestamp_ms }) .await; } // DAG sync message handlers @@ -710,8 +717,7 @@ impl P2pManager { #[cfg(feature = "dag")] P2pMessage::RequestDagActions { hashes } => { let g = graph.read().await; - let actions = - crate::p2p::dag_sync::fetch_actions_by_hash(&g, &hashes); + let actions = crate::p2p::dag_sync::fetch_actions_by_hash(&g, &hashes); if !actions.is_empty() { let send_msg = P2pMessage::SendDagActions { actions }; let t = transport.read().await; @@ -772,10 +778,7 @@ impl P2pManager { .connect(peer.addr, triple_count) .await; if let Ok(()) = result { - tracing::info!( - "P2P discovered and connected to {}", - peer.node_id - ); + tracing::info!("P2P discovered and connected to {}", peer.node_id); sync.write().await.get_peer_state(&peer.addr); // A3: Record mDNS peer let mut ps = peer_store.write().await; @@ -1155,7 +1158,9 @@ mod tests { fn ping_tracker_timed_out_detection() { let mut tracker = PingTracker::new(Duration::from_millis(10)); let addr: SocketAddr = "127.0.0.1:9000".parse().unwrap(); - tracker.outstanding.insert(addr, (1000, Instant::now() - Duration::from_millis(50))); + tracker + .outstanding + .insert(addr, (1000, Instant::now() - Duration::from_millis(50))); let timed_out = tracker.timed_out_peers(); assert_eq!(timed_out.len(), 1); assert_eq!(timed_out[0], addr); diff --git a/crates/aingle_cortex/src/p2p/message.rs b/crates/aingle_cortex/src/p2p/message.rs index 043a20a6..9e3181d9 100644 --- a/crates/aingle_cortex/src/p2p/message.rs +++ b/crates/aingle_cortex/src/p2p/message.rs @@ -38,21 +38,13 @@ pub enum P2pMessage { triple_count: u64, }, /// Request triples by their hex IDs. - RequestTriples { - ids: Vec, - }, + RequestTriples { ids: Vec }, /// Batch of triples. - SendTriples { - triples: Vec, - }, + SendTriples { triples: Vec }, /// Lightweight announcement of a new triple. - Announce { - triple_id: String, - }, + Announce { triple_id: String }, /// Keep-alive ping. - Ping { - timestamp_ms: u64, - }, + Ping { timestamp_ms: u64 }, /// Keep-alive pong. Pong { timestamp_ms: u64, @@ -64,9 +56,7 @@ pub enum P2pMessage { tombstone_ts: u64, }, /// Batch tombstone synchronization. - TombstoneSync { - tombstones: Vec, - }, + TombstoneSync { tombstones: Vec }, // ── DAG sync messages (feature: dag) ──────────────────────── /// Exchange of DAG tip hashes for sync. #[cfg(feature = "dag")] @@ -76,14 +66,10 @@ pub enum P2pMessage { }, /// Request specific DAG actions by hash. #[cfg(feature = "dag")] - RequestDagActions { - hashes: Vec, - }, + RequestDagActions { hashes: Vec }, /// Batch of serialized DAG actions. #[cfg(feature = "dag")] - SendDagActions { - actions: Vec>, - }, + SendDagActions { actions: Vec> }, // ── Raft / Cluster messages (feature: cluster) ────────────── /// Raft AppendEntries RPC (serialized openraft request). #[cfg(feature = "cluster")] @@ -278,10 +264,7 @@ fn json_to_value(j: &serde_json::Value) -> Value { } } "lang" => { - let lang = map - .get("lang") - .and_then(|v| v.as_str()) - .unwrap_or_default(); + let lang = map.get("lang").and_then(|v| v.as_str()).unwrap_or_default(); Value::LangString { value: val.to_string(), lang: lang.to_string(), @@ -315,7 +298,13 @@ mod tests { }; let bytes = msg.to_bytes(); let parsed = P2pMessage::from_bytes(&bytes).unwrap(); - assert!(matches!(parsed, P2pMessage::Hello { triple_count: 42, .. })); + assert!(matches!( + parsed, + P2pMessage::Hello { + triple_count: 42, + .. + } + )); } #[test] @@ -328,7 +317,10 @@ mod tests { let bytes = msg.to_bytes(); let parsed = P2pMessage::from_bytes(&bytes).unwrap(); match parsed { - P2pMessage::BloomSync { filter_bytes, triple_count } => { + P2pMessage::BloomSync { + filter_bytes, + triple_count, + } => { assert_eq!(filter_bytes.len(), 128); assert_eq!(triple_count, 100); } @@ -359,9 +351,7 @@ mod tests { author: None, source: None, }; - let msg = P2pMessage::SendTriples { - triples: vec![tw], - }; + let msg = P2pMessage::SendTriples { triples: vec![tw] }; let bytes = msg.to_bytes(); let parsed = P2pMessage::from_bytes(&bytes).unwrap(); match parsed { @@ -433,7 +423,10 @@ mod tests { let bytes = msg.to_bytes(); let parsed = P2pMessage::from_bytes(&bytes).unwrap(); match parsed { - P2pMessage::AnnounceDelete { triple_id, tombstone_ts } => { + P2pMessage::AnnounceDelete { + triple_id, + tombstone_ts, + } => { assert_eq!(triple_id, "deadbeef"); assert_eq!(tombstone_ts, 1700000000000); } @@ -444,8 +437,14 @@ mod tests { #[test] fn tombstone_sync_roundtrip() { let tombstones = vec![ - TombstoneWire { triple_id: "aa".into(), deleted_at_ms: 100 }, - TombstoneWire { triple_id: "bb".into(), deleted_at_ms: 200 }, + TombstoneWire { + triple_id: "aa".into(), + deleted_at_ms: 100, + }, + TombstoneWire { + triple_id: "bb".into(), + deleted_at_ms: 200, + }, ]; let msg = P2pMessage::TombstoneSync { tombstones }; let bytes = msg.to_bytes(); diff --git a/crates/aingle_cortex/src/p2p/peer_store.rs b/crates/aingle_cortex/src/p2p/peer_store.rs index 0d6ef963..311b8ae3 100644 --- a/crates/aingle_cortex/src/p2p/peer_store.rs +++ b/crates/aingle_cortex/src/p2p/peer_store.rs @@ -46,19 +46,22 @@ impl PeerStore { } else { Vec::new() }; - Self { path, peers, max_peers } + Self { + path, + peers, + max_peers, + } } /// Write the current peer list to disk. pub fn save(&self) -> Result<(), String> { if let Some(parent) = self.path.parent() { - std::fs::create_dir_all(parent) - .map_err(|e| format!("create peer store dir: {}", e))?; + std::fs::create_dir_all(parent).map_err(|e| format!("create peer store dir: {}", e))?; } let json = serde_json::to_string_pretty(&self.peers) .map_err(|e| format!("serialize peers: {}", e))?; - let mut file = std::fs::File::create(&self.path) - .map_err(|e| format!("create peer store: {}", e))?; + let mut file = + std::fs::File::create(&self.path).map_err(|e| format!("create peer store: {}", e))?; std::io::Write::write_all(&mut file, json.as_bytes()) .map_err(|e| format!("write peer store: {}", e))?; file.sync_all() @@ -75,7 +78,10 @@ impl PeerStore { // Enforce capacity if self.peers.len() >= self.max_peers { // Remove oldest (by last_connected_ms) - if let Some(oldest_idx) = self.peers.iter().enumerate() + if let Some(oldest_idx) = self + .peers + .iter() + .enumerate() .min_by_key(|(_, p)| p.last_connected_ms) .map(|(i, _)| i) { @@ -191,7 +197,7 @@ mod tests { // Add a peer with an old timestamp store.add(stored_peer(9001, 1)); store.cleanup_stale(1000); // 1 second max age - // peer with ts=0 is kept (never-connected sentinel), old one removed + // peer with ts=0 is kept (never-connected sentinel), old one removed assert_eq!(store.all().len(), 1); assert_eq!(store.all()[0].addr, addr(9000)); } diff --git a/crates/aingle_cortex/src/p2p/rate_limiter.rs b/crates/aingle_cortex/src/p2p/rate_limiter.rs index 3aa5d48f..4851b023 100644 --- a/crates/aingle_cortex/src/p2p/rate_limiter.rs +++ b/crates/aingle_cortex/src/p2p/rate_limiter.rs @@ -41,9 +41,10 @@ impl IngressRateLimiter { /// /// Returns the number of allowed triples (0..=count). pub fn check(&mut self, addr: &SocketAddr, count: usize) -> usize { - let bucket = self.per_peer.entry(*addr).or_insert_with(|| { - TokenBucket::with_params(self.per_peer_max, self.per_peer_rate) - }); + let bucket = self + .per_peer + .entry(*addr) + .or_insert_with(|| TokenBucket::with_params(self.per_peer_max, self.per_peer_rate)); let mut allowed = 0; for _ in 0..count { diff --git a/crates/aingle_cortex/src/p2p/sync_manager.rs b/crates/aingle_cortex/src/p2p/sync_manager.rs index a55ba231..8855db61 100644 --- a/crates/aingle_cortex/src/p2p/sync_manager.rs +++ b/crates/aingle_cortex/src/p2p/sync_manager.rs @@ -146,11 +146,7 @@ impl TripleSyncManager { } /// Insert triples received from a peer into the graph. Duplicates are counted, not errors. - pub fn store_received_triples( - &mut self, - triples: Vec, - graph: &GraphDB, - ) -> StoreResult { + pub fn store_received_triples(&mut self, triples: Vec, graph: &GraphDB) -> StoreResult { let mut result = StoreResult::default(); for triple in triples { let id = TripleId::from_triple(&triple); @@ -161,7 +157,10 @@ impl TripleSyncManager { } Err(e) => { let msg = format!("{}", e); - if msg.contains("duplicate") || msg.contains("exists") || msg.contains("already") { + if msg.contains("duplicate") + || msg.contains("exists") + || msg.contains("already") + { result.duplicates += 1; } else { result.errors += 1; @@ -215,7 +214,8 @@ impl TripleSyncManager { .unwrap_or_default() .as_millis() as u64; let ttl_ms = self.tombstone_ttl.as_millis() as u64; - self.tombstones.retain(|_, ts| now_ms.saturating_sub(*ts) < ttl_ms); + self.tombstones + .retain(|_, ts| now_ms.saturating_sub(*ts) < ttl_ms); } /// Return all active tombstones as (hash, timestamp_ms) pairs. diff --git a/crates/aingle_cortex/src/p2p/transport.rs b/crates/aingle_cortex/src/p2p/transport.rs index 68b42c09..c0bf865b 100644 --- a/crates/aingle_cortex/src/p2p/transport.rs +++ b/crates/aingle_cortex/src/p2p/transport.rs @@ -103,7 +103,9 @@ impl P2pTransport { // Receive HelloAck. let ack = Self::recv_from_connection(&connection).await?; match ack { - P2pMessage::HelloAck { accepted, reason, .. } => { + P2pMessage::HelloAck { + accepted, reason, .. + } => { if !accepted { connection.close(1u32.into(), b"rejected"); return Err(format!( @@ -142,7 +144,9 @@ impl P2pTransport { let hello = Self::recv_from_connection(&connection).await?; match &hello { - P2pMessage::Hello { seed_hash, node_id, .. } => { + P2pMessage::Hello { + seed_hash, node_id, .. + } => { let accepted = seed_hash == &self.seed_hash; let reason = if accepted { None @@ -158,7 +162,11 @@ impl P2pTransport { Self::send_on_connection(&connection, &ack).await?; if accepted { - tracing::info!("P2P accepted connection from {} ({})", remote, &node_id[..8.min(node_id.len())]); + tracing::info!( + "P2P accepted connection from {} ({})", + remote, + &node_id[..8.min(node_id.len())] + ); self.connections.insert(remote, connection); Ok(Some((remote, hello))) } else { @@ -302,8 +310,7 @@ impl P2pTransport { .map_err(|e| format!("cert gen: {}", e))?; let cert_der = CertificateDer::from(cert.cert.der().to_vec()); - let key_der = - PrivateKeyDer::Pkcs8(PrivatePkcs8KeyDer::from(cert.key_pair.serialize_der())); + let key_der = PrivateKeyDer::Pkcs8(PrivatePkcs8KeyDer::from(cert.key_pair.serialize_der())); let mut server_crypto = rustls::ServerConfig::builder() .with_no_client_auth() @@ -429,22 +436,14 @@ mod tests { #[test] fn transport_new_has_no_connections() { - let t = P2pTransport::new( - P2pTransportConfig::default(), - "abc".into(), - "hash".into(), - ); + let t = P2pTransport::new(P2pTransportConfig::default(), "abc".into(), "hash".into()); assert_eq!(t.connection_count(), 0); assert!(t.connected_peers().is_empty()); } #[test] fn is_connected_false_initially() { - let t = P2pTransport::new( - P2pTransportConfig::default(), - "abc".into(), - "hash".into(), - ); + let t = P2pTransport::new(P2pTransportConfig::default(), "abc".into(), "hash".into()); let addr: SocketAddr = "127.0.0.1:19091".parse().unwrap(); assert!(!t.is_connected(&addr)); } @@ -483,11 +482,7 @@ mod tests { #[tokio::test] async fn disconnect_nonexistent_is_noop() { - let mut t = P2pTransport::new( - P2pTransportConfig::default(), - "abc".into(), - "hash".into(), - ); + let mut t = P2pTransport::new(P2pTransportConfig::default(), "abc".into(), "hash".into()); let addr: SocketAddr = "127.0.0.1:19091".parse().unwrap(); t.disconnect(&addr); // should not panic } diff --git a/crates/aingle_cortex/src/proofs/backend.rs b/crates/aingle_cortex/src/proofs/backend.rs index 07e7ffd5..9866dc02 100644 --- a/crates/aingle_cortex/src/proofs/backend.rs +++ b/crates/aingle_cortex/src/proofs/backend.rs @@ -80,10 +80,7 @@ impl ProofBackend for MemoryProofBackend { .data .read() .map_err(|_| "MemoryProofBackend lock poisoned".to_string())?; - Ok(data - .iter() - .map(|(k, v)| (k.clone(), v.clone())) - .collect()) + Ok(data.iter().map(|(k, v)| (k.clone(), v.clone())).collect()) } } @@ -103,8 +100,7 @@ pub struct SledProofBackend { impl SledProofBackend { /// Open or create a proofs tree inside the Sled database at `path`. pub fn open(path: &str) -> Result { - let db = - sled::open(path).map_err(|e| format!("sled open error (proofs): {e}"))?; + let db = sled::open(path).map_err(|e| format!("sled open error (proofs): {e}"))?; let tree = db .open_tree("proofs") .map_err(|e| format!("sled open_tree(proofs) error: {e}"))?; @@ -139,8 +135,7 @@ impl ProofBackend for SledProofBackend { fn list_all(&self) -> Result)>, String> { let mut results = Vec::new(); for item in self.tree.iter() { - let (k, v) = - item.map_err(|e| format!("sled proofs scan error: {e}"))?; + let (k, v) = item.map_err(|e| format!("sled proofs scan error: {e}"))?; let key = String::from_utf8(k.to_vec()) .map_err(|e| format!("sled proofs key decode error: {e}"))?; results.push((key, v.to_vec())); diff --git a/crates/aingle_cortex/src/proofs/store.rs b/crates/aingle_cortex/src/proofs/store.rs index 03ee91a1..2bb6eb34 100644 --- a/crates/aingle_cortex/src/proofs/store.rs +++ b/crates/aingle_cortex/src/proofs/store.rs @@ -460,10 +460,7 @@ impl ProofStore { /// Get count of proofs pub async fn count(&self) -> usize { - self.backend - .list_all() - .map(|all| all.len()) - .unwrap_or(0) + self.backend.list_all().map(|all| all.len()).unwrap_or(0) } /// Flush proof backend to durable storage. diff --git a/crates/aingle_cortex/src/proofs/verification.rs b/crates/aingle_cortex/src/proofs/verification.rs index 24842c97..44b43665 100644 --- a/crates/aingle_cortex/src/proofs/verification.rs +++ b/crates/aingle_cortex/src/proofs/verification.rs @@ -114,10 +114,11 @@ fn reconstruct_zk_proof(proof: &StoredProof) -> Result, -) -> Json { +pub async fn get_audit_stats(State(state): State) -> Json { let log = state.audit_log.read().await; Json(log.stats()) } diff --git a/crates/aingle_cortex/src/rest/cluster.rs b/crates/aingle_cortex/src/rest/cluster.rs index 5408972d..0b8fadc2 100644 --- a/crates/aingle_cortex/src/rest/cluster.rs +++ b/crates/aingle_cortex/src/rest/cluster.rs @@ -86,16 +86,16 @@ pub struct WalVerifyResponse { } /// GET /api/v1/cluster/status -pub async fn cluster_status( - State(state): State, -) -> Result> { +pub async fn cluster_status(State(state): State) -> Result> { let wal_last_seq = { #[cfg(feature = "cluster")] { state.wal.as_ref().map(|w| w.last_seq()).unwrap_or(0) } #[cfg(not(feature = "cluster"))] - { 0u64 } + { + 0u64 + } }; // Extract live Raft metrics when available @@ -113,9 +113,7 @@ pub async fn cluster_status( .map(|lid| lid.index) .unwrap_or(0); - let commit_index = metrics - .last_log_index - .unwrap_or(0); + let commit_index = metrics.last_log_index.unwrap_or(0); // Build member list from membership config let membership = metrics.membership_config.membership(); @@ -137,7 +135,10 @@ pub async fn cluster_status( // Resolve leader address from membership config (#13) let leader_addr = leader_id.and_then(|lid| { - membership.nodes().find(|(nid, _)| **nid == lid).map(|(_, node)| node.rest_addr.clone()) + membership + .nodes() + .find(|(nid, _)| **nid == lid) + .map(|(_, node)| node.rest_addr.clone()) }); return Ok(Json(ClusterStatus { @@ -187,10 +188,16 @@ pub async fn cluster_join( if metrics.current_leader != state.cluster_node_id { let membership = metrics.membership_config.membership(); let leader_addr = metrics.current_leader.and_then(|lid| { - membership.nodes().find(|(nid, _)| **nid == lid).map(|(_, node)| node.rest_addr.clone()) + membership + .nodes() + .find(|(nid, _)| **nid == lid) + .map(|(_, node)| node.rest_addr.clone()) }); if let Some(ref addr) = leader_addr { - return Err(Error::Redirect(format!("http://{}/api/v1/cluster/join", addr))); + return Err(Error::Redirect(format!( + "http://{}/api/v1/cluster/join", + addr + ))); } return Ok(( StatusCode::CONFLICT, @@ -219,7 +226,10 @@ pub async fn cluster_join( voter_ids.insert(req.node_id); // Resolve leader_addr for response let leader_addr = metrics.current_leader.and_then(|lid| { - membership.nodes().find(|(nid, _)| **nid == lid).map(|(_, node)| node.rest_addr.clone()) + membership + .nodes() + .find(|(nid, _)| **nid == lid) + .map(|(_, node)| node.rest_addr.clone()) }); match raft.change_membership(voter_ids.clone(), false).await { Ok(_) => { @@ -294,18 +304,23 @@ pub async fn cluster_leave( if metrics.current_leader != state.cluster_node_id { let membership = metrics.membership_config.membership(); let leader_addr = metrics.current_leader.and_then(|lid| { - membership.nodes().find(|(nid, _)| **nid == lid).map(|(_, node)| node.rest_addr.clone()) + membership + .nodes() + .find(|(nid, _)| **nid == lid) + .map(|(_, node)| node.rest_addr.clone()) }); if let Some(ref addr) = leader_addr { - return Err(Error::Redirect(format!("http://{}/api/v1/cluster/leave", addr))); + return Err(Error::Redirect(format!( + "http://{}/api/v1/cluster/leave", + addr + ))); } return Err(Error::Internal("Not leader; leader unknown".to_string())); } if let Some(node_id) = state.cluster_node_id { let membership = metrics.membership_config.membership(); - let mut voter_ids: std::collections::BTreeSet = - membership.voter_ids().collect(); + let mut voter_ids: std::collections::BTreeSet = membership.voter_ids().collect(); voter_ids.remove(&node_id); if !voter_ids.is_empty() { if let Err(e) = raft.change_membership(voter_ids, false).await { @@ -320,9 +335,7 @@ pub async fn cluster_leave( } /// GET /api/v1/cluster/members -pub async fn cluster_members( - State(state): State, -) -> Result>> { +pub async fn cluster_members(State(state): State) -> Result>> { #[cfg(feature = "cluster")] if let Some(ref raft) = state.raft { let metrics = raft.metrics().borrow_watched().clone(); @@ -351,12 +364,12 @@ pub async fn cluster_members( } /// GET /api/v1/cluster/wal/stats -pub async fn wal_stats( - State(state): State, -) -> Result> { +pub async fn wal_stats(State(state): State) -> Result> { #[cfg(feature = "cluster")] if let Some(ref wal) = state.wal { - let stats = wal.stats().map_err(|e| Error::Internal(format!("WAL stats error: {e}")))?; + let stats = wal + .stats() + .map_err(|e| Error::Internal(format!("WAL stats error: {e}")))?; return Ok(Json(WalStatsResponse { segment_count: stats.segment_count, total_size_bytes: stats.total_size_bytes, @@ -374,9 +387,7 @@ pub async fn wal_stats( } /// POST /api/v1/cluster/wal/verify -pub async fn wal_verify( - State(state): State, -) -> Result> { +pub async fn wal_verify(State(state): State) -> Result> { #[cfg(feature = "cluster")] if let Some(ref wal) = state.wal { let wal_dir = wal.dir(); diff --git a/crates/aingle_cortex/src/rest/cluster_utils.rs b/crates/aingle_cortex/src/rest/cluster_utils.rs index 42aa9239..ffcc0397 100644 --- a/crates/aingle_cortex/src/rest/cluster_utils.rs +++ b/crates/aingle_cortex/src/rest/cluster_utils.rs @@ -3,9 +3,9 @@ //! Shared helpers for cluster-mode REST handlers. -use axum::http::HeaderMap; use crate::error::Error; use crate::state::AppState; +use axum::http::HeaderMap; /// Convert a Raft `client_write` error into an appropriate HTTP error. /// diff --git a/crates/aingle_cortex/src/rest/dag.rs b/crates/aingle_cortex/src/rest/dag.rs index eded0287..701c40f2 100644 --- a/crates/aingle_cortex/src/rest/dag.rs +++ b/crates/aingle_cortex/src/rest/dag.rs @@ -42,6 +42,10 @@ pub struct DagActionDto { pub payload_type: String, pub payload_summary: String, pub signed: bool, + /// Blake3 hex content hash of the source file, if present in the action's + /// provenance. Extracted from the first provenanced triple in a + /// `TripleInsert` (or the first `TripleInsert` inside a `Batch`). + pub content_hash: Option, } #[derive(Debug, Serialize)] @@ -173,7 +177,7 @@ pub struct CreateDagActionResponse { } fn default_limit() -> usize { - 50 + crate::service::dag::DEFAULT_HISTORY_LIMIT } // ============================================================================ @@ -208,14 +212,8 @@ pub async fn get_dag_history( // Triple-ID-based lookup uses the affected index if let Some(ref tid_hex) = query.triple_id { - let mut bytes = [0u8; 32]; - if tid_hex.len() != 64 { - return Err(Error::InvalidInput("triple_id must be 64 hex chars".into())); - } - for i in 0..32 { - bytes[i] = u8::from_str_radix(&tid_hex[i * 2..i * 2 + 2], 16) - .map_err(|_| Error::InvalidInput("Invalid hex in triple_id".into()))?; - } + let bytes = parse_hex32(tid_hex) + .ok_or_else(|| Error::InvalidInput("triple_id must be 64 valid hex chars".into()))?; let actions = graph .dag_history(&bytes, query.limit) @@ -293,16 +291,8 @@ pub async fn get_dag_verify( let action_hash = aingle_graph::dag::DagActionHash::from_hex(&hash) .ok_or_else(|| Error::InvalidInput(format!("Invalid hash: {}", hash)))?; - let mut pk_bytes = [0u8; 32]; - if query.public_key.len() != 64 { - return Err(Error::InvalidInput( - "public_key must be 64 hex chars".into(), - )); - } - for i in 0..32 { - pk_bytes[i] = u8::from_str_radix(&query.public_key[i * 2..i * 2 + 2], 16) - .map_err(|_| Error::InvalidInput("Invalid hex in public_key".into()))?; - } + let pk_bytes = parse_hex32(&query.public_key) + .ok_or_else(|| Error::InvalidInput("public_key must be 64 valid hex chars".into()))?; let graph = state.graph.read().await; let action = graph @@ -586,11 +576,15 @@ pub fn dag_router() -> Router { // Helpers // ============================================================================ +/// Convert a raw [`DagAction`] to its serializable DTO form. +/// +/// Extracts the payload type, a human-readable summary, and the content hash +/// from the action's provenance (for `TripleInsert` and `Batch` payloads). pub(crate) fn action_to_dto(action: &aingle_graph::dag::DagAction) -> DagActionDto { let hash = action.compute_hash().to_hex(); let parents: Vec = action.parents.iter().map(|h| h.to_hex()).collect(); - let (payload_type, payload_summary) = match &action.payload { + let (payload_type, payload_summary, content_hash) = match &action.payload { aingle_graph::dag::DagPayload::TripleInsert { triples } => { let summary = if triples.len() == 1 { let t = &triples[0]; @@ -598,7 +592,12 @@ pub(crate) fn action_to_dto(action: &aingle_graph::dag::DagAction) -> DagActionD } else { format!("{} triple(s)", triples.len()) }; - ("triple:create".to_string(), summary) + // Extract content_hash from the first triple that carries provenance. + // All triples from a single file ingest share the same content_hash. + let content_hash = triples + .iter() + .find_map(|t| t.provenance.as_ref().map(|p| p.content_hash.clone())); + ("triple:create".to_string(), summary, content_hash) } aingle_graph::dag::DagPayload::TripleDelete { triple_ids, @@ -609,7 +608,7 @@ pub(crate) fn action_to_dto(action: &aingle_graph::dag::DagAction) -> DagActionD } else { format!("{} triple(s)", triple_ids.len()) }; - ("triple:delete".to_string(), summary) + ("triple:delete".to_string(), summary, None) } aingle_graph::dag::DagPayload::MemoryOp { kind } => { let summary = match kind { @@ -621,10 +620,24 @@ pub(crate) fn action_to_dto(action: &aingle_graph::dag::DagAction) -> DagActionD } aingle_graph::dag::MemoryOpKind::Consolidate => "Consolidate".to_string(), }; - ("memory:op".to_string(), summary) + ("memory:op".to_string(), summary, None) } aingle_graph::dag::DagPayload::Batch { ops } => { - ("batch".to_string(), format!("{} ops", ops.len())) + // Search the ops for the first TripleInsert that has a provenanced triple. + let content_hash = ops.iter().find_map(|op| { + if let aingle_graph::dag::DagPayload::TripleInsert { triples } = op { + triples + .iter() + .find_map(|t| t.provenance.as_ref().map(|p| p.content_hash.clone())) + } else { + None + } + }); + ( + "batch".to_string(), + format!("{} ops", ops.len()), + content_hash, + ) } aingle_graph::dag::DagPayload::Genesis { triple_count, @@ -632,6 +645,7 @@ pub(crate) fn action_to_dto(action: &aingle_graph::dag::DagAction) -> DagActionD } => ( "genesis".to_string(), format!("{} triples: {}", triple_count, description), + None, ), aingle_graph::dag::DagPayload::Compact { pruned_count, @@ -643,13 +657,14 @@ pub(crate) fn action_to_dto(action: &aingle_graph::dag::DagAction) -> DagActionD "pruned {} / retained {} ({})", pruned_count, retained_count, policy ), + None, ), - aingle_graph::dag::DagPayload::Noop => ("noop".to_string(), String::new()), + aingle_graph::dag::DagPayload::Noop => ("noop".to_string(), String::new(), None), aingle_graph::dag::DagPayload::Custom { payload_type, payload_summary, .. - } => (payload_type.clone(), payload_summary.clone()), + } => (payload_type.clone(), payload_summary.clone(), None), }; DagActionDto { @@ -661,7 +676,22 @@ pub(crate) fn action_to_dto(action: &aingle_graph::dag::DagAction) -> DagActionD payload_type, payload_summary, signed: action.signature.is_some(), + content_hash, + } +} + +/// Parse a 64-character hex string into a 32-byte array. +/// +/// Returns `None` if `hex` is not exactly 64 characters or contains non-hex digits. +fn parse_hex32(hex: &str) -> Option<[u8; 32]> { + if hex.len() != 64 { + return None; + } + let mut out = [0u8; 32]; + for (i, b) in out.iter_mut().enumerate() { + *b = u8::from_str_radix(&hex[i * 2..i * 2 + 2], 16).ok()?; } + Some(out) } fn triple_value_to_json(v: &aingle_graph::Value) -> serde_json::Value { @@ -677,3 +707,122 @@ fn triple_value_to_json(v: &aingle_graph::Value) -> serde_json::Value { _ => serde_json::Value::String(format!("{:?}", v)), } } + +#[cfg(test)] +mod tests { + use super::*; + use aingle_graph::dag::{DagAction, DagPayload, Provenance, TripleInsertPayload}; + use aingle_graph::NodeId; + use chrono::Utc; + + fn test_action(payload: DagPayload) -> DagAction { + DagAction { + parents: vec![], + author: NodeId::named("node:test"), + seq: 0, + timestamp: Utc::now(), + payload, + signature: None, + } + } + + #[test] + fn action_to_dto_extracts_content_hash_from_triple_insert() { + let provenance = Provenance { + source_path: "vault/note.md".into(), + line_start: 1, + line_end: 3, + content_hash: "deadbeef".into(), + }; + let action = test_action(DagPayload::TripleInsert { + triples: vec![TripleInsertPayload { + subject: "akashi://note".into(), + predicate: "akashi:title".into(), + object: serde_json::json!("Test Note"), + provenance: Some(provenance), + }], + }); + + let dto = action_to_dto(&action); + + assert_eq!( + dto.content_hash, + Some("deadbeef".into()), + "content_hash must be extracted from TripleInsert provenance" + ); + } + + #[test] + fn action_to_dto_extracts_content_hash_from_batch_with_triple_insert() { + let provenance = Provenance { + source_path: "vault/doc.md".into(), + line_start: 5, + line_end: 10, + content_hash: "cafebabe".into(), + }; + let action = test_action(DagPayload::Batch { + ops: vec![ + DagPayload::TripleInsert { + triples: vec![TripleInsertPayload { + subject: "akashi://doc".into(), + predicate: "akashi:body".into(), + object: serde_json::json!("content"), + provenance: Some(provenance), + }], + }, + DagPayload::Noop, + ], + }); + + let dto = action_to_dto(&action); + + assert_eq!( + dto.content_hash, + Some("cafebabe".into()), + "content_hash must be extracted from first TripleInsert inside Batch" + ); + } + + #[test] + fn action_to_dto_content_hash_none_for_triple_insert_without_provenance() { + let action = test_action(DagPayload::TripleInsert { + triples: vec![TripleInsertPayload { + subject: "s".into(), + predicate: "p".into(), + object: serde_json::json!("o"), + provenance: None, + }], + }); + + let dto = action_to_dto(&action); + + assert_eq!( + dto.content_hash, None, + "content_hash must be None when no provenance is present" + ); + } + + #[test] + fn action_to_dto_content_hash_none_for_genesis() { + let action = test_action(DagPayload::Genesis { + triple_count: 0, + description: "root".into(), + }); + + let dto = action_to_dto(&action); + + assert_eq!( + dto.content_hash, None, + "Genesis actions have no content_hash" + ); + } + + #[test] + fn action_to_dto_content_hash_none_for_noop() { + let action = test_action(DagPayload::Noop); + + let dto = action_to_dto(&action); + + assert_eq!(dto.content_hash, None, "Noop actions have no content_hash"); + } +} diff --git a/crates/aingle_cortex/src/rest/memory.rs b/crates/aingle_cortex/src/rest/memory.rs index a59e92cb..6f864c28 100644 --- a/crates/aingle_cortex/src/rest/memory.rs +++ b/crates/aingle_cortex/src/rest/memory.rs @@ -23,8 +23,8 @@ use axum::{ http::StatusCode, Json, }; -use serde::{Deserialize, Serialize}; use ineru::{MemoryEntry, MemoryId, MemoryQuery}; +use serde::{Deserialize, Serialize}; use crate::error::{Error, Result}; use crate::state::AppState; @@ -151,16 +151,15 @@ pub async fn remember( .clone() .unwrap_or_else(|| "raft".to_string()); - return Ok(( - StatusCode::CREATED, - Json(RememberResponse { id }), - )); + return Ok((StatusCode::CREATED, Json(RememberResponse { id }))); } // Guard: if Raft is initialized, all writes MUST go through Raft (#2). #[cfg(feature = "cluster")] if state.raft.is_some() { - return Err(Error::Internal("Raft initialized but write not routed through Raft".into())); + return Err(Error::Internal( + "Raft initialized but write not routed through Raft".into(), + )); } // Non-cluster mode: direct write @@ -192,14 +191,13 @@ pub async fn remember( entry_type: req.entry_type.clone(), data: wal_data.clone(), importance: req.importance, - }).map_err(|e| Error::Internal(format!("WAL append failed: {e}")))?; + }) + .map_err(|e| Error::Internal(format!("WAL append failed: {e}")))?; } Ok(( StatusCode::CREATED, - Json(RememberResponse { - id: id.to_hex(), - }), + Json(RememberResponse { id: id.to_hex() }), )) } @@ -237,9 +235,7 @@ pub async fn recall( } /// Force consolidation of important STM entries into LTM. -pub async fn consolidate( - State(state): State, -) -> Result> { +pub async fn consolidate(State(state): State) -> Result> { // Cluster mode: route through Raft so all nodes consolidate deterministically #[cfg(feature = "cluster")] if let Some(ref raft) = state.raft { @@ -278,7 +274,9 @@ pub async fn consolidate( // Guard: if Raft is initialized, all writes MUST go through Raft (#2). #[cfg(feature = "cluster")] if state.raft.is_some() { - return Err(Error::Internal("Raft initialized but write not routed through Raft".into())); + return Err(Error::Internal( + "Raft initialized but write not routed through Raft".into(), + )); } // Non-cluster mode: direct consolidation @@ -292,7 +290,8 @@ pub async fn consolidate( if let Some(ref wal) = state.wal { wal.append(aingle_wal::WalEntryKind::MemoryConsolidate { consolidated_count: count, - }).map_err(|e| Error::Internal(format!("WAL append failed: {e}")))?; + }) + .map_err(|e| Error::Internal(format!("WAL append failed: {e}")))?; } Ok(Json(ConsolidateResponse { @@ -315,10 +314,7 @@ pub async fn stats(State(state): State) -> Result } /// Forget (delete) a specific memory entry. -pub async fn forget( - State(state): State, - Path(id): Path, -) -> Result { +pub async fn forget(State(state): State, Path(id): Path) -> Result { // Cluster mode: route through Raft #[cfg(feature = "cluster")] if let Some(ref raft) = state.raft { @@ -347,7 +343,9 @@ pub async fn forget( // Guard: if Raft is initialized, all writes MUST go through Raft (#2). #[cfg(feature = "cluster")] if state.raft.is_some() { - return Err(Error::Internal("Raft initialized but write not routed through Raft".into())); + return Err(Error::Internal( + "Raft initialized but write not routed through Raft".into(), + )); } // Non-cluster mode: direct delete @@ -364,7 +362,8 @@ pub async fn forget( if let Some(ref wal) = state.wal { wal.append(aingle_wal::WalEntryKind::MemoryForget { memory_id: id.clone(), - }).map_err(|e| Error::Internal(format!("WAL append failed: {e}")))?; + }) + .map_err(|e| Error::Internal(format!("WAL append failed: {e}")))?; } Ok(StatusCode::NO_CONTENT) @@ -379,9 +378,9 @@ pub async fn checkpoint( // For now, create a proof-of-state in the proof store let memory = state.memory.read().await; let s = memory.stats(); - let label = req.label.unwrap_or_else(|| { - format!("checkpoint-{}", chrono::Utc::now().timestamp()) - }); + let label = req + .label + .unwrap_or_else(|| format!("checkpoint-{}", chrono::Utc::now().timestamp())); let checkpoint_data = serde_json::json!({ "label": label, @@ -429,16 +428,12 @@ pub async fn list_checkpoints( .into_iter() .filter(|p| p.metadata.tags.contains(&"checkpoint".to_string())) .map(|p| { - let data: serde_json::Value = - serde_json::from_slice(&p.data).unwrap_or_default(); + let data: serde_json::Value = serde_json::from_slice(&p.data).unwrap_or_default(); CheckpointListDto { id: p.id.clone(), label: data.get("label").and_then(|v| v.as_str()).map(String::from), created_at: p.created_at.to_rfc3339(), - stm_count: data - .get("stm_count") - .and_then(|v| v.as_u64()) - .unwrap_or(0) as usize, + stm_count: data.get("stm_count").and_then(|v| v.as_u64()).unwrap_or(0) as usize, ltm_entity_count: data .get("ltm_entity_count") .and_then(|v| v.as_u64()) @@ -507,7 +502,9 @@ pub async fn vector_search( Json(req): Json, ) -> Result>> { let memory = state.memory.read().await; - let results = memory.ltm.vector_search_memories(&req.embedding, req.k, req.min_similarity); + let results = memory + .ltm + .vector_search_memories(&req.embedding, req.k, req.min_similarity); let mut dtos: Vec = results .into_iter() @@ -543,7 +540,9 @@ pub async fn vector_index_stats( State(state): State, ) -> Result> { let memory = state.memory.read().await; - let stats = memory.ltm.hnsw_index() + let stats = memory + .ltm + .hnsw_index() .map(|idx| idx.stats()) .unwrap_or(ineru::hnsw::HnswStats { point_count: 0, @@ -562,9 +561,7 @@ pub async fn vector_index_stats( } /// Force rebuild of the HNSW vector index. -pub async fn rebuild_vector_index( - State(state): State, -) -> Result { +pub async fn rebuild_vector_index(State(state): State) -> Result { let mut memory = state.memory.write().await; if let Some(hnsw) = memory.ltm.hnsw_index_mut() { hnsw.rebuild(); diff --git a/crates/aingle_cortex/src/rest/observability.rs b/crates/aingle_cortex/src/rest/observability.rs index e9493afb..6bd5b7d4 100644 --- a/crates/aingle_cortex/src/rest/observability.rs +++ b/crates/aingle_cortex/src/rest/observability.rs @@ -212,9 +212,7 @@ pub async fn query_events( // Query all event:type triples to find event subjects let type_pred = Predicate::named(format!("{}:event:type", ns)); - let type_triples = graph - .get_predicate(&type_pred) - .unwrap_or_default(); + let type_triples = graph.get_predicate(&type_pred).unwrap_or_default(); let mut events: Vec = Vec::new(); diff --git a/crates/aingle_cortex/src/rest/p2p.rs b/crates/aingle_cortex/src/rest/p2p.rs index 28352e12..34ff9508 100644 --- a/crates/aingle_cortex/src/rest/p2p.rs +++ b/crates/aingle_cortex/src/rest/p2p.rs @@ -43,7 +43,8 @@ async fn p2p_status(State(state): State) -> impl IntoResponse { Err(e) => ( StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": format!("serialize p2p status: {e}")})), - ).into_response(), + ) + .into_response(), } } @@ -64,7 +65,8 @@ async fn list_peers(State(state): State) -> impl IntoResponse { Err(e) => ( StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({"error": format!("serialize peers: {e}")})), - ).into_response(), + ) + .into_response(), } } diff --git a/crates/aingle_cortex/src/rest/raft_rpc.rs b/crates/aingle_cortex/src/rest/raft_rpc.rs index 65d1c76b..59b0ca07 100644 --- a/crates/aingle_cortex/src/rest/raft_rpc.rs +++ b/crates/aingle_cortex/src/rest/raft_rpc.rs @@ -45,13 +45,10 @@ pub async fn raft_append_entries( let req: openraft::raft::AppendEntriesRequest = serde_json::from_slice(&body) .map_err(|e| Error::Internal(format!("Deserialize AppendEntries: {e}")))?; - let resp = tokio::time::timeout( - std::time::Duration::from_secs(10), - raft.append_entries(req), - ) - .await - .map_err(|_| Error::Timeout("AppendEntries RPC timed out (10s)".into()))? - .map_err(|e| Error::Internal(format!("AppendEntries failed: {e}")))?; + let resp = tokio::time::timeout(std::time::Duration::from_secs(10), raft.append_entries(req)) + .await + .map_err(|_| Error::Timeout("AppendEntries RPC timed out (10s)".into()))? + .map_err(|e| Error::Internal(format!("AppendEntries failed: {e}")))?; let payload = serde_json::to_vec(&resp) .map_err(|e| Error::Internal(format!("Serialize response: {e}")))?; @@ -78,13 +75,10 @@ pub async fn raft_vote( let req: openraft::raft::VoteRequest = serde_json::from_slice(&body) .map_err(|e| Error::Internal(format!("Deserialize Vote: {e}")))?; - let resp = tokio::time::timeout( - std::time::Duration::from_secs(10), - raft.vote(req), - ) - .await - .map_err(|_| Error::Timeout("Vote RPC timed out (10s)".into()))? - .map_err(|e| Error::Internal(format!("Vote failed: {e}")))?; + let resp = tokio::time::timeout(std::time::Duration::from_secs(10), raft.vote(req)) + .await + .map_err(|_| Error::Timeout("Vote RPC timed out (10s)".into()))? + .map_err(|e| Error::Internal(format!("Vote failed: {e}")))?; let payload = serde_json::to_vec(&resp) .map_err(|e| Error::Internal(format!("Serialize response: {e}")))?; @@ -153,9 +147,8 @@ struct SnapshotBuffer { /// In-flight chunked snapshot buffers, keyed by snapshot_id. /// Buffers older than `BUFFER_TTL` are evicted to prevent memory leaks /// from abandoned transfers. -static SNAPSHOT_BUFFERS: std::sync::LazyLock< - dashmap::DashMap, -> = std::sync::LazyLock::new(dashmap::DashMap::new); +static SNAPSHOT_BUFFERS: std::sync::LazyLock> = + std::sync::LazyLock::new(dashmap::DashMap::new); /// Maximum time a partial snapshot buffer can live before eviction. const BUFFER_TTL: std::time::Duration = std::time::Duration::from_secs(300); // 5 min @@ -226,7 +219,9 @@ pub async fn raft_snapshot_chunk( // Remove buffer and validate completeness let full_buf = SNAPSHOT_BUFFERS .remove(&snapshot_id) - .ok_or_else(|| Error::Internal("Snapshot buffer missing on final chunk".into()))? + .ok_or_else(|| { + Error::Internal("Snapshot buffer missing on final chunk".into()) + })? .1; if (full_buf.data.len() as u64) != full_buf.expected_size { @@ -256,10 +251,7 @@ pub async fn raft_snapshot_chunk( } /// Shared logic: install a full snapshot from its raw bytes. -async fn install_full_snapshot_from_bytes( - state: &AppState, - data: &[u8], -) -> Result, Error> { +async fn install_full_snapshot_from_bytes(state: &AppState, data: &[u8]) -> Result, Error> { let raft = state .raft .as_ref() @@ -292,8 +284,7 @@ async fn install_full_snapshot_from_bytes( .map_err(|_| Error::Timeout("InstallSnapshot timed out (60s)".into()))? .map_err(|e| Error::Internal(format!("InstallSnapshot failed: {e}")))?; - serde_json::to_vec(&resp) - .map_err(|e| Error::Internal(format!("Serialize response: {e}"))) + serde_json::to_vec(&resp).map_err(|e| Error::Internal(format!("Serialize response: {e}"))) } /// Create the internal Raft RPC sub-router. diff --git a/crates/aingle_cortex/src/rest/triples.rs b/crates/aingle_cortex/src/rest/triples.rs index bb2f0509..1be2abb0 100644 --- a/crates/aingle_cortex/src/rest/triples.rs +++ b/crates/aingle_cortex/src/rest/triples.rs @@ -200,6 +200,7 @@ pub async fn create_triple( subject: req.subject.clone(), predicate: req.predicate.clone(), object: serde_json::to_value(&req.object).unwrap_or_default(), + provenance: None, }], }, signature: None, diff --git a/crates/aingle_cortex/src/server.rs b/crates/aingle_cortex/src/server.rs index 22df5559..ed4e25ef 100644 --- a/crates/aingle_cortex/src/server.rs +++ b/crates/aingle_cortex/src/server.rs @@ -57,6 +57,10 @@ pub struct CortexConfig { pub mcp_oauth_resource: Option, /// Optional explicit JWKS URL; if None, derived from the issuer (Keycloak certs path). pub mcp_oauth_jwks_url: Option, + /// Optional directory containing a neural embedding model. Selects the neural + /// embedder when set and cortex is built with `neural-embeddings`; otherwise + /// the hash embedder is used. + pub embed_model: Option, } impl Default for CortexConfig { @@ -80,6 +84,7 @@ impl Default for CortexConfig { mcp_oauth_issuer: None, mcp_oauth_resource: None, mcp_oauth_jwks_url: None, + embed_model: None, } } } @@ -124,7 +129,9 @@ impl CortexServer { /// - `None` — Sled-backed persistent storage at `~/.aingle/cortex/graph.sled`. pub fn new(config: CortexConfig) -> Result { let db_path = resolve_db_path(&config.db_path); - let state = AppState::with_db_path(&db_path, config.audit_log_path.clone())?; + let embedder = crate::embedder::build_embedder(config.embed_model.as_deref()); + let state = + AppState::with_db_path_and_embedder(&db_path, config.audit_log_path.clone(), embedder)?; info!("Graph database: {}", db_path); Ok(Self { config, state }) } diff --git a/crates/aingle_cortex/src/service/backlinks.rs b/crates/aingle_cortex/src/service/backlinks.rs new file mode 100644 index 00000000..2dbd5adf --- /dev/null +++ b/crates/aingle_cortex/src/service/backlinks.rs @@ -0,0 +1,337 @@ +// Copyright 2019-2026 Apilium Technologies OÜ. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR Commercial + +//! Backlinks, outgoing links, and unlinked mentions for a note — the verified +//! link graph around a single note. Deterministic; reuses links_to triples, +//! Ineru chunk text (context + unlinked scan), and DAG provenance. + +use serde::Serialize; +use std::collections::BTreeMap; + +use crate::service::triple_util::{ + basename, obj_string, provenance_anchor_for, resolve_link_target, strip_brackets, +}; + +/// Verified link context for one note. +#[derive(Debug, Clone, Serialize, Default)] +pub struct Backlinks { + pub backlinks: Vec, + pub outgoing: Vec, + pub unlinked: Vec, +} + +/// A note that links to the target, with the link's context + provenance. +#[derive(Debug, Clone, Serialize)] +pub struct BacklinkRef { + pub path: String, + pub context: Option, + pub provenance_anchor: Option, +} + +/// True if `text` contains `word` (case-insensitive) as a whole token — bounded +/// by non-alphanumeric chars or string ends. Handles multi-token names like +/// "meeting-notes" while NOT matching "note" inside "notebook". +fn mentions_word(text: &str, word: &str) -> bool { + let w = word.trim().to_lowercase(); + if w.is_empty() { + return false; + } + let hay = text.to_lowercase(); + let hb = hay.as_bytes(); + let mut from = 0; + while let Some(rel) = hay[from..].find(w.as_str()) { + let start = from + rel; + let end = start + w.len(); + let before_ok = start == 0 || !(hb[start - 1] as char).is_alphanumeric(); + let after_ok = end >= hb.len() || !(hb[end] as char).is_alphanumeric(); + if before_ok && after_ok { + return true; + } + from = start + 1; + if from >= hb.len() { + break; + } + } + false +} + +/// Compute backlinks, outgoing links, and unlinked mentions for `note`. +pub async fn backlinks(state: &crate::state::AppState, note: &str) -> Backlinks { + use aingle_graph::{Predicate, TriplePattern}; + + // Note set + basename index. + let (notes, links): (Vec, Vec<(String, String)>) = { + let g = state.graph.read().await; + let collect = |pred: &str| -> Vec<(String, String)> { + g.find(TriplePattern::any().with_predicate(Predicate::named(pred))) + .unwrap_or_default() + .into_iter() + .filter_map(|t| { + obj_string(&t).map(|o| (strip_brackets(&t.subject.to_string()).to_string(), o)) + }) + .collect() + }; + let mut notes: Vec = collect(crate::service::ingest::PRED_SOURCE_HASH) + .into_iter() + .map(|(s, _)| s) + .collect(); + notes.sort(); + notes.dedup(); + let links = collect("links_to"); + (notes, links) + }; + + let note_set: std::collections::BTreeSet<&str> = notes.iter().map(|s| s.as_str()).collect(); + let mut by_base: BTreeMap = BTreeMap::new(); + for n in ¬es { + by_base.entry(basename(n)).or_insert_with(|| n.clone()); + } + let resolve = + |target: &str| -> Option { resolve_link_target(target, ¬e_set, &by_base) }; + let active_base = basename(note); + let active_base_lc = active_base.to_lowercase(); + + // Per-note chunk text (for context + unlinked scan). + let mut text_of: BTreeMap = BTreeMap::new(); + { + let mem = state.memory.read().await; + let mut entries = mem.stm.all_entries(); + entries.extend(mem.ltm.all_entries()); + for e in entries { + if e.entry_type != crate::service::ingest::CHUNK_ENTRY_TYPE { + continue; + } + if let (Some(p), Some(t)) = ( + e.data.get("source_path").and_then(|v| v.as_str()), + e.data.get("text").and_then(|v| v.as_str()), + ) { + let buf = text_of.entry(p.to_string()).or_default(); + buf.push('\n'); + buf.push_str(t); + } + } + } + + // Backlinks: sources linking to `note`. + let mut backlink_paths = std::collections::BTreeSet::new(); + let mut backlinks: Vec = Vec::new(); + for (src, target) in &links { + if src == note || !note_set.contains(src.as_str()) { + continue; + } + if resolve(target).as_deref() == Some(note) && backlink_paths.insert(src.clone()) { + let context = text_of.get(src).and_then(|txt| { + txt.lines() + .find(|l| l.contains("[[") && l.to_lowercase().contains(&active_base_lc)) + .map(|l| { + let t = l.trim(); + if t.chars().count() > 200 { + let cut: String = t.chars().take(200).collect(); + format!("{cut}…") + } else { + t.to_string() + } + }) + }); + let anchor = provenance_anchor_for(state, src).await; + backlinks.push(BacklinkRef { + path: src.clone(), + context, + provenance_anchor: anchor, + }); + } + } + backlinks.sort_by(|a, b| a.path.cmp(&b.path)); + + // Outgoing: notes `note` links to. + let mut outgoing: Vec = links + .iter() + .filter(|(src, _)| src == note) + .filter_map(|(_, target)| resolve(target)) + .filter(|p| p != note) + .collect(); + outgoing.sort(); + outgoing.dedup(); + + // Unlinked mentions: notes whose text names `active_base` but don't link it. + let mut unlinked: Vec = text_of + .iter() + .filter(|(p, _)| { + p.as_str() != note + && !backlink_paths.contains(p.as_str()) + && note_set.contains(p.as_str()) + }) + .filter(|(_, txt)| mentions_word(txt, &active_base)) + .map(|(p, _)| p.clone()) + .collect(); + unlinked.sort(); + unlinked.dedup(); + + Backlinks { + backlinks, + outgoing, + unlinked, + } +} + +#[cfg(test)] +mod tests { + use crate::state::AppState; + use aingle_graph::{NodeId, Predicate, Triple, Value}; + + async fn graph_with(triples: &[(&str, &str, &str)]) -> AppState { + let state = AppState::with_db_path(":memory:", None).unwrap(); + { + let g = state.graph.write().await; + for (s, p, o) in triples { + g.insert(Triple::new( + NodeId::named(*s), + Predicate::named(*p), + Value::literal(*o), + )) + .unwrap(); + } + } + state + } + + #[tokio::test] + async fn backlinks_outgoing_unlinked() { + let state = graph_with(&[ + ("a.md", "aingle:source_hash", "h1"), + ("b.md", "aingle:source_hash", "h2"), + ("c.md", "aingle:source_hash", "h3"), + ("target.md", "aingle:source_hash", "h4"), + ("a.md", "links_to", "target"), // a → target (backlink) + ("target.md", "links_to", "b"), // target → b (outgoing) + ]) + .await; + // c.md mentions "target" in text but does not link it (unlinked). + { + let mut mem = state.memory.write().await; + let mut e = ineru::MemoryEntry::new( + crate::service::ingest::CHUNK_ENTRY_TYPE, + serde_json::json!({ "text": "See target for details.", "source_path": "c.md" }), + ); + e.embedding = Some(ineru::Embedding::new(vec![0.0; 8])); + mem.remember(e).unwrap(); + } + + let r = super::backlinks(&state, "target.md").await; + assert!( + r.backlinks.iter().any(|b| b.path == "a.md"), + "a links to target" + ); + assert!( + r.outgoing.contains(&"b.md".to_string()), + "target links to b" + ); + assert!( + r.unlinked.contains(&"c.md".to_string()), + "c mentions target unlinked" + ); + assert!( + !r.unlinked.contains(&"a.md".to_string()), + "a is a backlink, not unlinked" + ); + } + + #[tokio::test] + async fn unlinked_detects_hyphenated_basename() { + let state = graph_with(&[ + ("meeting-notes.md", "aingle:source_hash", "h1"), + ("c.md", "aingle:source_hash", "h2"), + ]) + .await; + { + let mut mem = state.memory.write().await; + let mut e = ineru::MemoryEntry::new( + crate::service::ingest::CHUNK_ENTRY_TYPE, + serde_json::json!({ "text": "Discussed in meeting-notes yesterday.", "source_path": "c.md" }), + ); + e.embedding = Some(ineru::Embedding::new(vec![0.0; 8])); + mem.remember(e).unwrap(); + } + let r = super::backlinks(&state, "meeting-notes.md").await; + assert!( + r.unlinked.contains(&"c.md".to_string()), + "hyphenated name must be detected: {r:?}" + ); + } + + #[test] + fn mentions_word_is_bounded() { + assert!(super::mentions_word("a meeting-notes b", "meeting-notes")); + assert!(!super::mentions_word("my notebook here", "note")); + assert!(super::mentions_word("see Target.", "target")); + } + + #[tokio::test] + async fn links_to_node_objects_are_captured() { + // Real ingest stores wikilink targets as Value::Node, not Value::literal. + // This test locks the fix: node-valued links_to objects must be read as + // backlinks/outgoing, not silently dropped. + let state = crate::state::AppState::with_db_path(":memory:", None).unwrap(); + { + let g = state.graph.write().await; + for (s, p) in [ + ("a.md", "aingle:source_hash"), + ("hub.md", "aingle:source_hash"), + ] { + g.insert(Triple::new( + NodeId::named(s), + Predicate::named(p), + Value::literal("h"), + )) + .unwrap(); + } + // links_to stored as a NODE object — how real ingest produces it. + g.insert(Triple::new( + NodeId::named("a.md"), + Predicate::named("links_to"), + Value::Node(NodeId::named("hub")), + )) + .unwrap(); + } + let r = super::backlinks(&state, "hub.md").await; + assert!( + r.backlinks.iter().any(|b| b.path == "a.md"), + "node-valued links_to must appear as a backlink: {r:?}" + ); + let r2 = super::backlinks(&state, "a.md").await; + assert!( + r2.outgoing.contains(&"hub.md".to_string()), + "node-valued links_to must appear as outgoing: {r2:?}" + ); + } + + #[tokio::test] + async fn context_truncation_is_char_safe() { + let state = graph_with(&[ + ("t.md", "aingle:source_hash", "h1"), + ("src.md", "aingle:source_hash", "h2"), + ("src.md", "links_to", "t"), + ]) + .await; + { + let mut mem = state.memory.write().await; + // A line with accented chars whose byte length far exceeds 200 around the cut. + let long = format!("[[t]] {}", "áéíóú ".repeat(80)); + let mut e = ineru::MemoryEntry::new( + crate::service::ingest::CHUNK_ENTRY_TYPE, + serde_json::json!({ "text": long, "source_path": "src.md" }), + ); + e.embedding = Some(ineru::Embedding::new(vec![0.0; 8])); + mem.remember(e).unwrap(); + } + // Must not panic; context should be present and ≤ 201 chars (200 + ellipsis). + let r = super::backlinks(&state, "t.md").await; + let b = r + .backlinks + .iter() + .find(|b| b.path == "src.md") + .expect("backlink"); + let ctx = b.context.as_ref().expect("context"); + assert!(ctx.chars().count() <= 201); + } +} diff --git a/crates/aingle_cortex/src/service/context.rs b/crates/aingle_cortex/src/service/context.rs new file mode 100644 index 00000000..21daf7c4 --- /dev/null +++ b/crates/aingle_cortex/src/service/context.rs @@ -0,0 +1,925 @@ +// Copyright 2019-2026 Apilium Technologies OÜ. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR Commercial + +//! Semantic note-context: for an active note, surface the notes that are +//! semantically related (by neural embeddings) even when never linked, each +//! with the matching passage and signed provenance. + +use std::collections::{BTreeMap, BTreeSet}; + +use crate::service::triple_util::{ + basename, obj_string, provenance_anchor_for, resolve_link_target, strip_brackets, +}; + +/// The semantic context for one note — the semantically related notes, even +/// when never explicitly linked. +#[derive(Debug, Clone, serde::Serialize, Default)] +pub struct NoteContext { + /// `true` when the embedder has enough dimensions to produce meaningful + /// semantic similarity (≥ `SEMANTIC_MIN_DIMS`). `false` means the hash + /// fallback is active and no neighbor search was attempted. + pub semantic_ready: bool, + pub neighbors: Vec, +} + +/// A note that is semantically related to the active note. +#[derive(Debug, Clone, serde::Serialize)] +pub struct Neighbor { + /// Full relative path — the canonical identity used everywhere else. + pub path: String, + /// Best chunk cosine similarity against the active note's query vector. + pub score: f32, + /// The matching chunk text, ≤ 200 chars (char-safe), with `…` appended + /// if truncated. + pub passage: Option, + /// Hex hash of the signed DAG action that recorded this source (🔒 anchor). + /// `None` when the feature is off or no signed action exists. + pub provenance_anchor: Option, + /// `true` if the active note already has an explicit `links_to` edge to + /// this neighbor — so the UI can distinguish "related and linked" from + /// "related but not yet linked". + pub already_linked: bool, +} + +/// Minimum embedder dimensionality required to attempt semantic neighbor search. +/// The 64-d hash embedder does not produce meaningful cosine similarity for +/// cross-note retrieval; this gate keeps the result honest. +const SEMANTIC_MIN_DIMS: usize = 128; + +/// Minimum cosine for a note to count as a semantic neighbor. Calibrated for +/// note-to-note neural similarity: multilingual-e5 assigns a high baseline +/// (~0.83) to any same-language text, so the embedder's grounding `low` +/// threshold (0.77) is too permissive here. Mirrors vault_map's +/// SEMANTIC_THRESHOLD rationale (related notes ~0.90+, unrelated ~0.81-0.83). +/// Follow-up: make this per-embedder if more neural models are added. +pub const NEIGHBOR_FLOOR: f32 = 0.88; + +// --------------------------------------------------------------------------- +// Core retrieval +// --------------------------------------------------------------------------- + +/// Compute the semantic neighbors of `note` — up to `limit` related notes, +/// ranked by embedding cosine similarity, each with a matching passage and +/// optional signed provenance anchor. +pub async fn note_context(state: &crate::state::AppState, note: &str, limit: usize) -> NoteContext { + use aingle_graph::{Predicate, TriplePattern}; + use ineru::MemoryQuery; + + // 1. Semantic gate: only proceed when the embedder is neural-grade. + if state.embedder.dimensions() < SEMANTIC_MIN_DIMS { + return NoteContext { + semantic_ready: false, + neighbors: vec![], + }; + } + + // 2. Build the note set (subjects of PRED_SOURCE_HASH) + basename index, + // and collect all links_to triples. + let (notes, links): (Vec, Vec<(String, String)>) = { + let g = state.graph.read().await; + let collect = |pred: &str| -> Vec<(String, String)> { + g.find(TriplePattern::any().with_predicate(Predicate::named(pred))) + .unwrap_or_default() + .into_iter() + .filter_map(|t| { + obj_string(&t).map(|o| (strip_brackets(&t.subject.to_string()).to_string(), o)) + }) + .collect() + }; + let mut ns: Vec = collect(crate::service::ingest::PRED_SOURCE_HASH) + .into_iter() + .map(|(s, _)| s) + .collect(); + ns.sort(); + ns.dedup(); + let links = collect("links_to"); + (ns, links) + }; + + let note_set: BTreeSet<&str> = notes.iter().map(|s| s.as_str()).collect(); + + // basename → first full path (for wikilink resolution). + let mut by_base: BTreeMap = BTreeMap::new(); + for n in ¬es { + by_base.entry(basename(n)).or_insert_with(|| n.clone()); + } + + let resolve = + |target: &str| -> Option { resolve_link_target(target, ¬e_set, &by_base) }; + + // 3. Compute `outgoing_set`: full paths that the active `note` links to. + let outgoing_set: BTreeSet = links + .iter() + .filter(|(src, _)| src == note) + .filter_map(|(_, target)| resolve(target)) + .filter(|p| p != note) + .collect(); + + // 4. Build the active note's query text from its own chunks. + // Read STM and LTM separately and filter to `note` immediately — avoids + // allocating a merged Vec of every entry in memory. + let mut own_text = String::new(); + let (stm_entries, ltm_entries) = { + let mem = state.memory.read().await; + (mem.stm.all_entries(), mem.ltm.all_entries()) + }; + for e in stm_entries.iter().chain(ltm_entries.iter()) { + if e.entry_type != crate::service::ingest::CHUNK_ENTRY_TYPE { + continue; + } + if let (Some(p), Some(t)) = ( + e.data.get("source_path").and_then(|v| v.as_str()), + e.data.get("text").and_then(|v| v.as_str()), + ) { + if p == note { + own_text.push('\n'); + own_text.push_str(t); + } + } + } + + let query_text: String = if own_text.trim().is_empty() { + basename(note) + } else { + own_text.clone() + }; + + let q = state.embedder.embed_query(&query_text); + + // 5. Over-fetch from memory and re-rank by cosine similarity. + let fetch_limit = (limit * 8).max(48); + let results = { + let mem = state.memory.read().await; + mem.recall( + &MemoryQuery::text(&query_text) + .with_embedding(q.clone()) + .with_limit(fetch_limit), + ) + .unwrap_or_default() + }; + + // Per-source best (rel, text). + let mut best_by_src: BTreeMap = BTreeMap::new(); + + for r in &results { + if r.entry.entry_type != crate::service::ingest::CHUNK_ENTRY_TYPE { + continue; + } + let emb = match &r.entry.embedding { + Some(e) => e, + None => continue, + }; + let rel = q.cosine_similarity(emb); + if rel < NEIGHBOR_FLOOR { + continue; + } + let d = &r.entry.data; + let src = match d.get("source_path").and_then(|v| v.as_str()) { + Some(s) => s.to_string(), + None => continue, + }; + if src == note { + continue; + } + if crate::service::vault_map::is_maps_path(&src) { + continue; + } + if !note_set.contains(src.as_str()) { + continue; + } + // Only clone the chunk text when actually inserting or replacing the + // best entry — avoids a clone on every already-occupied iteration. + let text = d.get("text").and_then(|v| v.as_str()).unwrap_or(""); + match best_by_src.entry(src) { + std::collections::btree_map::Entry::Vacant(e) => { + e.insert((rel, text.to_string())); + } + std::collections::btree_map::Entry::Occupied(mut e) => { + if rel > e.get().0 { + *e.get_mut() = (rel, text.to_string()); + } + } + } + } + + // 6. Build Neighbor list (provenance is None for now), sort by score desc, + // truncate to `limit`, then resolve provenance only for the survivors. + // This cuts up to ~48 DAG reads (fetch_limit) down to `limit` (≤ 10). + let mut neighbors: Vec = Vec::with_capacity(best_by_src.len()); + for (src, (rel, chunk_text)) in best_by_src { + let passage = Some({ + let t = chunk_text.trim(); + if t.chars().count() > 200 { + let cut: String = t.chars().take(200).collect(); + format!("{cut}…") + } else { + t.to_string() + } + }); + let already_linked = outgoing_set.contains(&src); + neighbors.push(Neighbor { + path: src, + score: rel, + passage, + provenance_anchor: None, + already_linked, + }); + } + + // NaN-safe descending sort (mirrors ground.rs). + neighbors.sort_by(|a, b| { + b.score + .partial_cmp(&a.score) + .unwrap_or(std::cmp::Ordering::Equal) + }); + neighbors.truncate(limit); + + // Resolve provenance only for the survivors (typically ≤ limit DAG reads). + for n in &mut neighbors { + n.provenance_anchor = provenance_anchor_for(state, &n.path).await; + } + + NoteContext { + semantic_ready: true, + neighbors, + } +} + +// --------------------------------------------------------------------------- +// Cached variant +// --------------------------------------------------------------------------- + +/// Like [`note_context`] but memoised on `(triple_count, total_memory_bytes)`. +/// +/// The map key is `(note_path, limit)` so that MCP calls with different `limit` +/// values are cached independently and never serve a stale neighbor count. +pub async fn note_context_cached( + state: &crate::state::AppState, + note: &str, + limit: usize, +) -> NoteContext { + let tc = { state.graph.read().await.stats().triple_count }; + let mem_bytes = { state.memory.read().await.stats().total_memory_bytes }; + let version_key = (tc, mem_bytes); + let map_key = (note.to_string(), limit); + + // Check cache — release lock before any await. + { + let cache = state + .note_context_cache + .lock() + .expect("note_context cache poisoned"); + if let Some((cached_key, ctx)) = cache.get(&map_key) { + if *cached_key == version_key { + return ctx.clone(); + } + } + } + + // Compute without holding the mutex. + let result = note_context(state, note, limit).await; + + // Store result. + { + let mut cache = state + .note_context_cache + .lock() + .expect("note_context cache poisoned"); + // Simple growth cap: if more than 256 entries are cached, clear entirely + // before inserting. This bounds memory without per-entry LRU bookkeeping; + // a typical Akashi session edits far fewer than 256 (note, limit) pairs. + if cache.len() > 256 { + cache.clear(); + } + cache.insert(map_key, (version_key, result.clone())); + } + + result +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use aingle_graph::{NodeId, Predicate, Triple, Value}; + use ineru::{Embedder, Embedding, MemoryEntry}; + + use crate::state::AppState; + + // ----------------------------------------------------------------------- + // Stub embedder: 128-dim, deterministic, text-content-aware. + // - text containing "alpha" → [1.0, 0.0, 0.0, …] (unit basis e0) + // - text containing "zzz" → [0.0, 1.0, 0.0, …] (unit basis e1) + // - query for "alpha" → [1.0, 0.0, 0.0, …] (same basis) + // Cosine("alpha","alpha") = 1.0 ≥ low threshold (0.1) → pass + // Cosine("alpha","zzz") = 0.0 < low threshold → filtered out + // ----------------------------------------------------------------------- + struct StubEmbedder; + + impl Embedder for StubEmbedder { + fn embed_passage(&self, text: &str) -> Embedding { + let mut v = vec![0.0_f32; 128]; + if text.contains("alpha") { + v[0] = 1.0; + } else if text.contains("zzz") { + v[1] = 1.0; + } else { + // default: non-zero to avoid zero-vector edge case + v[2] = 1.0; + } + Embedding::new(v) + } + + fn embed_query(&self, text: &str) -> Embedding { + // Reuse passage embedding logic for query — correct for symmetric + // tests; for real asymmetric models the trait would differ. + self.embed_passage(text) + } + + fn dimensions(&self) -> usize { + 128 + } + + fn relevance_thresholds(&self) -> (f32, f32) { + // high=0.5, low=0.1 — alpha/alpha scores 1.0 (pass), alpha/zzz + // scores 0.0 (filtered). + (0.5, 0.1) + } + } + + // ----------------------------------------------------------------------- + // Helpers + // ----------------------------------------------------------------------- + + fn stub_state() -> AppState { + AppState::with_db_path_and_embedder(":memory:", None, Arc::new(StubEmbedder)).unwrap() + } + + async fn insert_triples(state: &AppState, triples: &[(&str, &str, &str)]) { + let g = state.graph.write().await; + for (s, p, o) in triples { + g.insert(Triple::new( + NodeId::named(*s), + Predicate::named(*p), + Value::literal(*o), + )) + .unwrap(); + } + } + + async fn insert_chunk(state: &AppState, source_path: &str, text: &str, emb: Vec) { + let mut mem = state.memory.write().await; + let mut e = MemoryEntry::new( + crate::service::ingest::CHUNK_ENTRY_TYPE, + serde_json::json!({ "text": text, "source_path": source_path }), + ); + e.embedding = Some(Embedding::new(emb)); + mem.remember(e).unwrap(); + } + + // ----------------------------------------------------------------------- + // Tests + // ----------------------------------------------------------------------- + + /// Default state uses 64-d hash embedder → semantic gate fires → short-circuit. + #[tokio::test] + async fn hash_grade_embedder_short_circuits() { + let state = AppState::with_db_path(":memory:", None).unwrap(); + let ctx = super::note_context(&state, "active.md", 5).await; + assert!( + !ctx.semantic_ready, + "64-d hash embedder must not be semantic_ready" + ); + assert!(ctx.neighbors.is_empty()); + } + + /// The "alpha" note scores 1.0 (cosine of identical unit vectors) and appears + /// as neighbor #1; the "zzz" note scores 0.0 and is filtered below low threshold. + #[tokio::test] + async fn same_topic_ranks_above_off_topic() { + let state = stub_state(); + + insert_triples( + &state, + &[ + ("active.md", "aingle:source_hash", "h0"), + ("alpha.md", "aingle:source_hash", "h1"), + ("zzz.md", "aingle:source_hash", "h2"), + ], + ) + .await; + + // Active note's own chunk (alpha text → e0 query vector). + let e0 = vec![ + 1.0_f32, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + ]; + let e1 = { + let mut v = vec![0.0_f32; 128]; + v[1] = 1.0; + v + }; + insert_chunk(&state, "active.md", "alpha content for active", e0.clone()).await; + insert_chunk(&state, "alpha.md", "alpha related content", e0.clone()).await; + insert_chunk(&state, "zzz.md", "zzz completely unrelated orthogonal", e1).await; + + let ctx = super::note_context(&state, "active.md", 10).await; + assert!(ctx.semantic_ready); + assert!( + ctx.neighbors.iter().any(|n| n.path == "alpha.md"), + "alpha.md must be a neighbor: {:?}", + ctx.neighbors + ); + assert!( + !ctx.neighbors.iter().any(|n| n.path == "zzz.md"), + "zzz.md must be filtered (cosine 0.0 < low threshold): {:?}", + ctx.neighbors + ); + // alpha.md is first (highest score). + assert_eq!(ctx.neighbors[0].path, "alpha.md"); + } + + /// passage is present and its char count is ≤ 201 (200 + optional ellipsis). + /// An accented long chunk proves no byte-slice panic. + #[tokio::test] + async fn passage_present_and_char_safe() { + let state = stub_state(); + + insert_triples( + &state, + &[ + ("active.md", "aingle:source_hash", "h0"), + ("related.md", "aingle:source_hash", "h1"), + ], + ) + .await; + + let e0: Vec = { + let mut v = vec![0.0; 128]; + v[0] = 1.0; + v + }; + // Long chunk with accented chars to exercise char-safe truncation. + let long_text = format!("alpha {}", "áéíóú ".repeat(80)); + insert_chunk(&state, "active.md", "alpha active note", e0.clone()).await; + insert_chunk(&state, "related.md", &long_text, e0.clone()).await; + + let ctx = super::note_context(&state, "active.md", 10).await; + assert!(ctx.semantic_ready); + let n = ctx + .neighbors + .iter() + .find(|n| n.path == "related.md") + .expect("related.md must be a neighbor"); + let passage = n.passage.as_ref().expect("passage must be present"); + assert!( + passage.chars().count() <= 201, + "passage must be ≤ 201 chars (200 + ellipsis), got {}", + passage.chars().count() + ); + } + + /// `already_linked` is `true` when the active note has a `links_to` triple + /// whose object is `Value::Node` (the real ingest format — NOT a literal). + #[tokio::test] + async fn already_linked_from_node_object() { + let state = stub_state(); + + { + let g = state.graph.write().await; + for (s, p) in [ + ("active.md", "aingle:source_hash"), + ("alpha.md", "aingle:source_hash"), + ] { + g.insert(Triple::new( + NodeId::named(s), + Predicate::named(p), + Value::literal("h"), + )) + .unwrap(); + } + // links_to stored as a NODE object — how real ingest produces it. + g.insert(Triple::new( + NodeId::named("active.md"), + Predicate::named("links_to"), + Value::Node(NodeId::named("alpha")), + )) + .unwrap(); + } + + let e0: Vec = { + let mut v = vec![0.0_f32; 128]; + v[0] = 1.0; + v + }; + insert_chunk(&state, "active.md", "alpha active note", e0.clone()).await; + insert_chunk(&state, "alpha.md", "alpha related content", e0.clone()).await; + + let ctx = super::note_context(&state, "active.md", 10).await; + assert!(ctx.semantic_ready); + let n = ctx + .neighbors + .iter() + .find(|n| n.path == "alpha.md") + .expect("alpha.md must be a neighbor"); + assert!( + n.already_linked, + "alpha.md must have already_linked=true (node-valued links_to): {:?}", + n + ); + } + + /// Notes under `_maps/` are excluded even when their embeddings match. + #[tokio::test] + async fn maps_excluded() { + let state = stub_state(); + + insert_triples( + &state, + &[ + ("active.md", "aingle:source_hash", "h0"), + ("_maps/vault-map.md", "aingle:source_hash", "h1"), + ], + ) + .await; + + let e0: Vec = { + let mut v = vec![0.0_f32; 128]; + v[0] = 1.0; + v + }; + insert_chunk(&state, "active.md", "alpha active", e0.clone()).await; + insert_chunk( + &state, + "_maps/vault-map.md", + "alpha maps content", + e0.clone(), + ) + .await; + + let ctx = super::note_context(&state, "active.md", 10).await; + assert!(ctx.semantic_ready); + assert!( + !ctx.neighbors.iter().any(|n| n.path.starts_with("_maps/")), + "maps paths must be excluded: {:?}", + ctx.neighbors + ); + } + + /// Without the `dag` feature the provenance anchor is always `None`. + /// With the `dag` feature, a signed action anchors the source and is surfaced. + #[cfg(not(feature = "dag"))] + #[tokio::test] + async fn provenance_none_without_dag() { + let state = stub_state(); + + insert_triples( + &state, + &[ + ("active.md", "aingle:source_hash", "h0"), + ("alpha.md", "aingle:source_hash", "h1"), + ], + ) + .await; + + let e0: Vec = { + let mut v = vec![0.0_f32; 128]; + v[0] = 1.0; + v + }; + insert_chunk(&state, "active.md", "alpha active", e0.clone()).await; + insert_chunk(&state, "alpha.md", "alpha related", e0).await; + + let ctx = super::note_context(&state, "active.md", 10).await; + let n = ctx + .neighbors + .iter() + .find(|n| n.path == "alpha.md") + .expect("alpha.md must be neighbor"); + assert!( + n.provenance_anchor.is_none(), + "provenance must be None without dag feature" + ); + } + + /// With the `dag` feature, a signed DAG action for the neighbor yields a + /// non-None provenance_anchor. + #[cfg(feature = "dag")] + #[tokio::test] + async fn provenance_present_when_signed() { + let state = + AppState::with_db_path_and_embedder(":memory:", None, Arc::new(StubEmbedder)).unwrap(); + { + let mut graph = state.graph.write().await; + graph.enable_dag(); + } + + insert_triples( + &state, + &[ + ("active.md", "aingle:source_hash", "h0"), + ("alpha.md", "aingle:source_hash", "h1"), + ], + ) + .await; + + let e0: Vec = { + let mut v = vec![0.0_f32; 128]; + v[0] = 1.0; + v + }; + insert_chunk(&state, "active.md", "alpha active", e0.clone()).await; + insert_chunk(&state, "alpha.md", "alpha related", e0).await; + + // Record a signed Custom DAG action whose subject is "alpha.md" so that + // history_by_subject("alpha.md") returns a signed entry and + // provenance_anchor_for returns Some(hash_hex). + { + let graph = state.graph.read().await; + let dag_store = graph.dag_store().expect("DAG must be enabled"); + let parents = dag_store.tips().expect("tips must be readable"); + let mut action = aingle_graph::dag::DagAction { + parents, + author: aingle_graph::NodeId::named("test"), + seq: 0, + timestamp: chrono::Utc::now(), + payload: aingle_graph::dag::DagPayload::Custom { + payload_type: "ingest".to_string(), + payload_summary: "alpha.md ingested".to_string(), + payload: None, + subject: Some("alpha.md".to_string()), + }, + signature: None, + }; + let key = aingle_graph::dag::DagSigningKey::generate(); + key.sign(&mut action); + dag_store + .put(&action) + .expect("put signed action must succeed"); + } + + let ctx = super::note_context(&state, "active.md", 10).await; + assert!( + ctx.neighbors.iter().any(|n| n.path == "alpha.md"), + "alpha.md must be a semantic neighbor with dag feature: {:?}", + ctx.neighbors + ); + let n = ctx.neighbors.iter().find(|n| n.path == "alpha.md").unwrap(); + assert!( + n.provenance_anchor.is_some(), + "provenance_anchor must be Some when a signed DAG action is recorded for the source: {:?}", + n + ); + } + + // ----------------------------------------------------------------------- + // Cache tests (item 1 + item 5) + // ----------------------------------------------------------------------- + + /// `note_context_cached` must return an identical result on the second call + /// (cache hit), and a fresh result after graph/memory mutation (invalidation). + /// This test locks both the hit path and the version-change recompute path. + #[tokio::test] + async fn note_context_cached_hit_and_invalidation() { + let state = stub_state(); + + insert_triples( + &state, + &[ + ("active.md", "aingle:source_hash", "h0"), + ("alpha.md", "aingle:source_hash", "h1"), + ], + ) + .await; + + let e0: Vec = { + let mut v = vec![0.0_f32; 128]; + v[0] = 1.0; + v + }; + insert_chunk(&state, "active.md", "alpha active note", e0.clone()).await; + insert_chunk(&state, "alpha.md", "alpha related content", e0.clone()).await; + + // First call: computes and caches. + let ctx1 = super::note_context_cached(&state, "active.md", 10).await; + assert!(ctx1.semantic_ready, "StubEmbedder is 128d → semantic_ready"); + assert!(!ctx1.neighbors.is_empty(), "alpha.md must be a neighbor"); + + // Second call: graph/memory unchanged → must return the cached result. + let ctx2 = super::note_context_cached(&state, "active.md", 10).await; + assert_eq!( + ctx1.neighbors.len(), + ctx2.neighbors.len(), + "cache hit: neighbor count must be identical" + ); + assert_eq!( + ctx1.neighbors[0].path, ctx2.neighbors[0].path, + "cache hit: top neighbor must be identical" + ); + + // Mutate: add beta.md (changes triple_count AND total_memory_bytes). + insert_triples(&state, &[("beta.md", "aingle:source_hash", "h2")]).await; + insert_chunk(&state, "beta.md", "alpha beta content", e0.clone()).await; + + // Third call: version mismatch → cache must be invalidated; beta.md appears. + let ctx3 = super::note_context_cached(&state, "active.md", 10).await; + assert!( + ctx3.neighbors.iter().any(|n| n.path == "beta.md"), + "after mutation (triple_count+memory_bytes changed), beta.md must appear: {:?}", + ctx3.neighbors + ); + } + + /// When the note_context_cache exceeds 256 entries, inserting a new result + /// must clear the map first so the cache never grows without bound. + #[tokio::test] + async fn cache_cap_clears_when_exceeded() { + let state = stub_state(); + + // Pre-fill the cache with 257 dummy entries to exceed the cap. + { + let mut cache = state.note_context_cache.lock().unwrap(); + for i in 0..257usize { + cache.insert( + (format!("dummy_{i}.md"), 0usize), + ( + (0, 0), + super::NoteContext { + semantic_ready: false, + neighbors: vec![], + }, + ), + ); + } + } + assert_eq!( + state.note_context_cache.lock().unwrap().len(), + 257, + "pre-condition: cache must have 257 dummy entries" + ); + + // Call note_context_cached for a fresh note (not in cache). + // The cap must clear the map before inserting this new entry. + let _ = super::note_context_cached(&state, "fresh.md", 5).await; + + let cache = state.note_context_cache.lock().unwrap(); + assert_eq!( + cache.len(), + 1, + "cap must clear the oversized cache before inserting; got {} entries", + cache.len() + ); + assert!( + cache.contains_key(&("fresh.md".to_string(), 5usize)), + "fresh.md must be in the cache after the cap-and-insert" + ); + } + + // ----------------------------------------------------------------------- + // Optional nit + // ----------------------------------------------------------------------- + + /// An active note with NO chunks falls back to the basename as query text + /// and still surfaces neighbors. The active note must never appear as its + /// own neighbor (self-match guard). + #[tokio::test] + async fn no_chunks_falls_back_to_basename_and_never_self_matches() { + let state = stub_state(); + + insert_triples( + &state, + &[ + ("active.md", "aingle:source_hash", "h0"), + ("related.md", "aingle:source_hash", "h1"), + ], + ) + .await; + + // active.md has NO chunks. StubEmbedder: basename("active.md") = "active" + // → v[2] = 1.0 (default case, no "alpha" or "zzz"). related.md chunk + // "general content" → v[2] = 1.0. Cosine = 1.0 ≥ low threshold (0.1). + let e_default: Vec = { + let mut v = vec![0.0_f32; 128]; + v[2] = 1.0; + v + }; + insert_chunk(&state, "related.md", "general related content", e_default).await; + + let ctx = super::note_context(&state, "active.md", 10).await; + assert!(ctx.semantic_ready, "StubEmbedder is 128d → semantic_ready"); + assert!( + !ctx.neighbors.iter().any(|n| n.path == "active.md"), + "active.md must never be its own neighbor: {:?}", + ctx.neighbors + ); + assert!( + ctx.neighbors.iter().any(|n| n.path == "related.md"), + "basename-fallback must still surface related.md: {:?}", + ctx.neighbors + ); + } + + /// End-to-end acceptance test for the real neural embedder: same-topic notes + /// must surface as semantic neighbors while an off-topic note is filtered out. + /// Gated on the `neural-embeddings` feature and skips if the model files are + /// absent. Requires `ORT_DYLIB_PATH` to point at an onnxruntime shared library. + #[cfg(feature = "neural-embeddings")] + #[tokio::test] + async fn neural_note_context_finds_same_topic() { + let model_dir = std::env::var("INERU_E5_MODEL_DIR").unwrap_or_else(|_| { + concat!( + env!("CARGO_MANIFEST_DIR"), + "/../ineru/test-models/multilingual-e5-small" + ) + .to_string() + }); + if !std::path::Path::new(&model_dir) + .join("onnx/model.onnx") + .exists() + { + eprintln!( + "skipping neural_note_context_finds_same_topic: e5 model not found at {model_dir}" + ); + return; + } + + let embedder = crate::embedder::build_embedder(Some(&model_dir)); + assert_eq!( + embedder.dimensions(), + 384, + "neural embedder must be active (384d)" + ); + + let state = AppState::with_db_path_and_embedder(":memory:", None, embedder).unwrap(); + { + let mut graph = state.graph.write().await; + graph.enable_dag(); + } + + let dir = tempfile::tempdir().unwrap(); + // Two same-topic notes about dog care — sentences reused from + // neural_grounding_is_topical in ground.rs for reliable embedding behaviour. + std::fs::write( + dir.path().join("perros1.md"), + "# Cuidado de perros\n\nLos perros necesitan paseos diarios, agua fresca y una dieta equilibrada para estar sanos.\n", + ) + .unwrap(); + std::fs::write( + dir.path().join("perros2.md"), + "# Mascotas\n\nUn perro sano requiere ejercicio diario, hidratación constante y alimentación balanceada.\n", + ) + .unwrap(); + // Off-topic note: elections have no semantic overlap with dog care. + std::fs::write( + dir.path().join("elecciones.md"), + "# Elecciones\n\nLos resultados de las elecciones presidenciales determinan el futuro del país.\n", + ) + .unwrap(); + + crate::service::ingest::ingest_path(&state, dir.path().to_str().unwrap(), None) + .await + .unwrap(); + + let ctx = super::note_context(&state, "perros1.md", 5).await; + + assert!( + ctx.semantic_ready, + "neural embedder (384d) must set semantic_ready=true" + ); + + assert!( + ctx.neighbors.iter().any(|n| n.path == "perros2.md"), + "perros2.md (same-topic sibling) must be a semantic neighbor of perros1.md: {:?}", + ctx.neighbors + ); + + let sibling = ctx + .neighbors + .iter() + .find(|n| n.path == "perros2.md") + .unwrap(); + assert!( + sibling.passage.is_some(), + "perros2.md neighbor must include a matching passage: {:?}", + sibling + ); + + // elecciones.md is semantically orthogonal to dog care; its cosine against + // the perros1.md query vector (~0.83) must not reach NEIGHBOR_FLOOR (0.88). + assert!( + !ctx.neighbors.iter().any(|n| n.path == "elecciones.md"), + "off-topic elecciones.md must not appear as a neighbor (below NEIGHBOR_FLOOR=0.88): {:?}", + ctx.neighbors + ); + } +} diff --git a/crates/aingle_cortex/src/service/dag.rs b/crates/aingle_cortex/src/service/dag.rs index e43997bf..4521e6fc 100644 --- a/crates/aingle_cortex/src/service/dag.rs +++ b/crates/aingle_cortex/src/service/dag.rs @@ -9,6 +9,9 @@ use crate::rest::dag::{ }; use crate::state::AppState; +/// Default action-history limit shared by REST and MCP endpoints. +pub(crate) const DEFAULT_HISTORY_LIMIT: usize = 50; + /// Return DAG actions affecting a subject, newest first, up to `limit`. pub async fn history_by_subject( state: &AppState, diff --git a/crates/aingle_cortex/src/service/ground.rs b/crates/aingle_cortex/src/service/ground.rs new file mode 100644 index 00000000..92def967 --- /dev/null +++ b/crates/aingle_cortex/src/service/ground.rs @@ -0,0 +1,372 @@ +// Copyright 2019-2026 Apilium Technologies OÜ. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR Commercial + +//! Grounded retrieval: turn a question into cited, provenance-backed context with +//! an explicit groundedness signal, so an LLM answers only from verifiable sources. + +use crate::error::Result; +use crate::state::AppState; +use serde::Serialize; + +/// Number of strong chunks required to call retrieval "grounded". Requiring two +/// independent corroborating sources is a deliberate anti-hallucination policy: +/// a lone strong chunk is surfaced as "weak", not "grounded". The strong/weak +/// similarity cutoffs themselves come from the active embedder via +/// [`ineru::Embedder::relevance_thresholds`]. +const MIN_CORROBORATING_CHUNKS: usize = 2; + +/// A cited chunk of source context. +#[derive(Debug, Clone, Serialize)] +pub struct ContextChunk { + pub text: String, + pub source: String, + pub lines: String, + pub relevance: f32, + /// Hex hash of the signed DAG action that recorded this source — verifiable + /// via the DAG history/action API. `None` when the source has no signed action. + pub provenance_anchor: Option, + pub ingested_at: Option, +} + +/// The grounded answer context returned to the model. +#[derive(Debug, Clone, Serialize)] +pub struct GroundedContext { + pub groundedness: String, // "grounded" | "weak" | "ungrounded" + pub answer_context: Vec, + pub gaps: Vec, + /// Instruction echoed to the model to keep it on the cited path. + pub instruction: String, +} + +use ineru::MemoryQuery; + +/// Retrieve grounded context for `question`. Pulls the top-`k` semantically +/// similar chunks from Ineru, attaches each chunk's signed provenance from the +/// DAG (latest signed action affecting its source path), and computes a +/// groundedness signal from the best similarity. +pub async fn ground(state: &AppState, question: &str, k: usize) -> Result { + let k = k.max(1); + let (ground_high, ground_low) = state.embedder.relevance_thresholds(); + + let query_vec = state.embedder.embed_query(question); + // Fetch a broad candidate pool: Ineru's composite recall score is keyword- + // and importance-weighted (embedding is only a minor term), so we over-fetch + // and re-rank by pure embedding cosine below. That makes grounding a true + // semantic search whose scores match the embedder's `relevance_thresholds`. + let fetch_limit = k.max(24); + let results = { + let mem = state.memory.read().await; + mem.recall( + &MemoryQuery::text(question) + .with_limit(fetch_limit) + .with_embedding(query_vec.clone()), + ) + .map_err(|e| crate::error::Error::Internal(e.to_string()))? + }; + + let mut answer_context = Vec::new(); + for r in &results { + // Only consider chunk memories produced by ingestion. + if r.entry.entry_type != crate::service::ingest::CHUNK_ENTRY_TYPE { + continue; + } + // Semantic relevance = cosine(query, chunk) from the active embedder, + // not Ineru's composite recall score. Skip chunks lacking an embedding + // (dimension-mismatched legacy data scores 0 via cosine_similarity). + let relevance = match &r.entry.embedding { + Some(emb) => query_vec.cosine_similarity(emb), + None => continue, + }; + let d = &r.entry.data; + let source = d + .get("source_path") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + let ls = d.get("line_start").and_then(|v| v.as_u64()).unwrap_or(0); + let le = d.get("line_end").and_then(|v| v.as_u64()).unwrap_or(0); + let text = d + .get("text") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + + let (sig, ingested_at) = signed_provenance(state, &source).await; + + answer_context.push(ContextChunk { + text, + source, + lines: format!("{ls}-{le}"), + relevance, + provenance_anchor: sig, + ingested_at, + }); + } + + // Re-rank by semantic relevance and keep the top-k. + answer_context.sort_by(|a, b| { + b.relevance + .partial_cmp(&a.relevance) + .unwrap_or(std::cmp::Ordering::Equal) + }); + answer_context.truncate(k); + let best: f32 = answer_context.first().map(|c| c.relevance).unwrap_or(0.0); + + // Require at least MIN_CORROBORATING_CHUNKS strong matches for "grounded"; + // a single strong chunk is only "weak" (independent corroboration guard). + let strong = answer_context + .iter() + .filter(|c| c.relevance >= ground_high) + .count(); + let groundedness = if best >= ground_high && strong >= MIN_CORROBORATING_CHUNKS { + "grounded" + } else if best >= ground_low && !answer_context.is_empty() { + "weak" + } else { + "ungrounded" + }; + + let mut gaps = Vec::new(); + if answer_context.is_empty() { + gaps.push(format!("No ingested source matches: {question:?}.")); + } else if groundedness == "weak" { + if best >= ground_high && strong < MIN_CORROBORATING_CHUNKS { + gaps.push( + "Only one source corroborates this; a second is needed to be grounded.".to_string(), + ); + } else { + gaps.push("Retrieved context is only weakly related to the question.".to_string()); + } + } else if groundedness == "ungrounded" { + // Chunks were retrieved but none are relevant enough to ground an answer. + // Surface the gap so the engine stays honest rather than silently empty. + gaps.push( + "Retrieved context is not relevant enough to ground an answer on this topic." + .to_string(), + ); + } + + Ok(GroundedContext { + groundedness: groundedness.to_string(), + answer_context, + gaps, + instruction: "Answer ONLY from answer_context and cite each claim as \ + source:lines. If groundedness is not \"grounded\", say so explicitly \ + and do not invent facts." + .to_string(), + }) +} + +/// Look up the latest signed DAG action affecting `source_path` and return its +/// action hash (as provenance identifier) and timestamp, if any. +/// +/// Adaptation note: `DagActionDto` has no `signature` field. Instead it has +/// `hash: String` (action hash) and `signed: bool`. We return the action hash +/// as the provenance identifier when the action is signed, or None otherwise. +/// The timestamp field is `timestamp: String` which matches the plan exactly. +async fn signed_provenance( + state: &AppState, + source_path: &str, +) -> (Option, Option) { + #[cfg(feature = "dag")] + { + if source_path.is_empty() { + return (None, None); + } + if let Ok(actions) = crate::service::dag::history_by_subject(state, source_path, 1).await { + if let Some(a) = actions.first() { + // DagActionDto has `hash: String` and `signed: bool` rather than a + // `signature` field, so we use the action hash as the provenance token + // when the action is signed. + let sig = if a.signed { Some(a.hash.clone()) } else { None }; + return (sig, Some(a.timestamp.clone())); + } + } + (None, None) + } + #[cfg(not(feature = "dag"))] + { + let _ = (state, source_path); + (None, None) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + async fn enabled_state() -> AppState { + let state = AppState::with_db_path(":memory:", None).unwrap(); + { + let mut graph = state.graph.write().await; + graph.enable_dag(); + } + state + } + + #[tokio::test] + async fn empty_memory_is_ungrounded() { + let state = enabled_state().await; + let g = ground(&state, "anything at all", 5).await.unwrap(); + assert_eq!(g.groundedness, "ungrounded"); + assert!(g.answer_context.is_empty()); + assert!(!g.gaps.is_empty()); + } + + #[tokio::test] + async fn single_corroborating_chunk_is_weak_not_grounded() { + // One source, one chunk: even a strong similarity match must not be called + // "grounded" — with the placeholder embedder a lone high score can be + // spurious, so a single corroborating chunk is downgraded to "weak". + let dir = tempfile::tempdir().unwrap(); + std::fs::write( + dir.path().join("note.md"), + "# Note\n\nWe chose sled for its exclusive lock semantics.\n", + ) + .unwrap(); + let state = enabled_state().await; + crate::service::ingest::ingest_path(&state, dir.path().to_str().unwrap(), None) + .await + .unwrap(); + + // Query the chunk almost verbatim so the lone chunk scores well above HIGH. + let g = ground(&state, "We chose sled for its exclusive lock semantics.", 5) + .await + .unwrap(); + assert!( + !g.answer_context.is_empty(), + "should retrieve the one chunk" + ); + assert_eq!( + g.groundedness, "weak", + "a single corroborating chunk must be weak, not grounded; ctx: {:?}", + g.answer_context + ); + } + + #[tokio::test] + async fn two_corroborating_sources_are_grounded() { + // The same fact stated in two separate files yields two strong chunks for a + // matching query — that independent corroboration is what makes it grounded. + let dir = tempfile::tempdir().unwrap(); + let fact = "# Doc\n\nThe quorum read requires a valid leader lease.\n"; + std::fs::write(dir.path().join("a.md"), fact).unwrap(); + std::fs::write(dir.path().join("b.md"), fact).unwrap(); + let state = enabled_state().await; + crate::service::ingest::ingest_path(&state, dir.path().to_str().unwrap(), None) + .await + .unwrap(); + + let g = ground(&state, "The quorum read requires a valid leader lease.", 5) + .await + .unwrap(); + let strong = g + .answer_context + .iter() + .filter(|c| c.relevance >= 0.55) + .count(); + assert!( + strong >= 2, + "two sources should both score strongly; ctx: {:?}", + g.answer_context + ); + assert_eq!( + g.groundedness, "grounded", + "two corroborating strong chunks must be grounded; ctx: {:?}", + g.answer_context + ); + } + + #[tokio::test] + async fn grounds_after_ingest_with_source() { + let dir = tempfile::tempdir().unwrap(); + std::fs::write( + dir.path().join("adr.md"), + "# Storage\n\nWe chose sled because of its exclusive lock semantics.\n", + ) + .unwrap(); + let state = enabled_state().await; + crate::service::ingest::ingest_path(&state, dir.path().to_str().unwrap(), None) + .await + .unwrap(); + + let g = ground(&state, "exclusive lock semantics sled", 5) + .await + .unwrap(); + assert!( + !g.answer_context.is_empty(), + "should retrieve the ingested chunk" + ); + assert_eq!(g.answer_context[0].source, "adr.md"); + assert_ne!(g.groundedness, "ungrounded"); + } + + /// End-to-end acceptance test for the real neural embedder: a topical query + /// must be grounded while an off-topic query is ungrounded. Gated on the + /// `neural-embeddings` feature and skips if the model files are absent. + /// Requires `ORT_DYLIB_PATH` to point at an onnxruntime dynamic library. + #[cfg(feature = "neural-embeddings")] + #[tokio::test] + async fn neural_grounding_is_topical() { + let model_dir = std::env::var("INERU_E5_MODEL_DIR").unwrap_or_else(|_| { + concat!( + env!("CARGO_MANIFEST_DIR"), + "/../ineru/test-models/multilingual-e5-small" + ) + .to_string() + }); + if !std::path::Path::new(&model_dir) + .join("onnx/model.onnx") + .exists() + { + eprintln!("skipping: e5 model not found at {model_dir}"); + return; + } + + let embedder = crate::embedder::build_embedder(Some(&model_dir)); + assert_eq!(embedder.dimensions(), 384, "neural embedder must be active"); + + let state = AppState::with_db_path_and_embedder(":memory:", None, embedder).unwrap(); + { + let mut graph = state.graph.write().await; + graph.enable_dag(); + } + + let dir = tempfile::tempdir().unwrap(); + std::fs::write( + dir.path().join("dogs.md"), + "# Cuidado de perros\n\nLos perros necesitan paseos diarios, agua fresca y una dieta equilibrada para estar sanos.\n", + ) + .unwrap(); + std::fs::write( + dir.path().join("dogs2.md"), + "# Mascotas\n\nUn perro sano requiere ejercicio diario, hidratación constante y alimentación balanceada.\n", + ) + .unwrap(); + crate::service::ingest::ingest_path(&state, dir.path().to_str().unwrap(), None) + .await + .unwrap(); + + let topical = ground(&state, "¿Cómo debo cuidar a mi perro?", 5) + .await + .unwrap(); + assert_ne!( + topical.groundedness, "ungrounded", + "a dog-care question must find the dog-care notes; ctx: {:?}", + topical.answer_context + ); + + let off_topic = ground( + &state, + "¿Cuál fue el resultado de las elecciones presidenciales?", + 5, + ) + .await + .unwrap(); + assert_eq!( + off_topic.groundedness, "ungrounded", + "an unrelated question must be ungrounded; ctx: {:?}", + off_topic.answer_context + ); + } +} diff --git a/crates/aingle_cortex/src/service/ingest.rs b/crates/aingle_cortex/src/service/ingest.rs new file mode 100644 index 00000000..965a6023 --- /dev/null +++ b/crates/aingle_cortex/src/service/ingest.rs @@ -0,0 +1,444 @@ +// Copyright 2019-2026 Apilium Technologies OÜ. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR Commercial + +//! Incremental vault ingestion: walk a directory, extract triples and chunks, +//! write them into the graph (with signed DAG provenance) and Ineru memory, +//! and maintain a per-file source-hash registry triple for idempotent re-runs. + +use crate::error::{Error, Result}; +use crate::rest::ValueDto; +use crate::service::triples::{delete_triple, insert_triple_inner}; +use crate::state::AppState; +use aingle_graph::{NodeId, Predicate, TriplePattern}; +use aingle_ingest::{extract, ObjectValue}; +use ineru::{MemoryEntry, MemoryId, MemoryMetadata}; + +// Bring the graph error type into scope for duplicate-matching in ingest logic. +use aingle_graph::Error as GraphError; + +/// The predicate used to anchor the per-file content-hash registry triple. +pub const PRED_SOURCE_HASH: &str = "aingle:source_hash"; + +/// Ineru `entry_type` used for ingested text chunks. Grounding filters on this. +pub const CHUNK_ENTRY_TYPE: &str = "doc_chunk"; + +/// One ingested source file and its content hash at ingest time. +#[derive(Debug, Clone, serde::Serialize)] +pub struct SourceRecord { + pub path: String, + pub content_hash: String, +} + +/// Summary statistics returned by `ingest_path`. +#[derive(Debug, Default, Clone, serde::Serialize)] +pub struct IngestReport { + /// Total number of files encountered during the walk. + pub files_seen: usize, + /// Files that were newly ingested (hash changed or first time). + pub files_ingested: usize, + /// Files skipped because their content hash matched the registry. + pub files_skipped: usize, + /// Total triples written (structural + registry). + pub triples_written: usize, + /// Total text chunks written to Ineru memory. + pub chunks_written: usize, + /// The files ingested in this run, with their content hashes. + pub sources: Vec, +} + +/// Walk `root_path`, extract structural triples and text chunks from each file, +/// write them to the graph (with DAG provenance) and Ineru memory, and maintain +/// a per-file source-hash registry triple for incremental skip on unchanged files. +/// +/// `namespace` is forwarded to the audit log (use `None` for internal/background calls). +pub async fn ingest_path( + state: &AppState, + root_path: &str, + namespace: Option, +) -> Result { + let mut report = IngestReport::default(); + + // Build a walk that respects .gitignore / .ignore files + let walker = ignore::WalkBuilder::new(root_path) + .hidden(false) + .git_ignore(true) + .build(); + + let mut files: Vec<(String, String)> = Vec::new(); // (rel_path, content) + + for entry in walker { + let entry = entry.map_err(|e| Error::Internal(format!("walk error: {e}")))?; + let path = entry.path(); + + // Skip directories + if !path.is_file() { + continue; + } + + // Filter to supported extensions: .md, .markdown, .txt, .rs, .py, .ts, .js + let ext = path + .extension() + .and_then(|e| e.to_str()) + .unwrap_or("") + .to_lowercase(); + if !matches!( + ext.as_str(), + "md" | "markdown" | "txt" | "rs" | "py" | "ts" | "js" | "toml" | "json" + ) { + continue; + } + + report.files_seen += 1; + + let content = std::fs::read_to_string(path) + .map_err(|e| Error::Internal(format!("read {}: {e}", path.display())))?; + + // Compute relative path from root_path for use as the note subject + let rel_path = path + .strip_prefix(root_path) + .unwrap_or(path) + .to_string_lossy() + .replace('\\', "/"); + + files.push((rel_path, content)); + } + + for (rel_path, content) in files { + let content_hash = blake3::hash(content.as_bytes()).to_hex().to_string(); + + // Check registry: does a triple (rel_path, aingle:source_hash, ) already exist? + let existing_hash = { + let graph = state.graph.read().await; + let pattern = TriplePattern::any() + .with_subject(NodeId::named(&rel_path)) + .with_predicate(Predicate::named(PRED_SOURCE_HASH)); + graph + .find(pattern) + .map_err(|e| Error::Internal(format!("graph find error: {e}")))? + .into_iter() + .next() + .and_then(|t| t.object_string().map(|s| s.to_string())) + }; + + if let Some(ref existing) = existing_hash { + if existing == &content_hash { + // File unchanged — skip + report.files_skipped += 1; + continue; + } + } + + report.files_ingested += 1; + + // The file changed (or it's a re-ingest with a different hash): purge all + // prior facts and chunks for this source before writing the fresh ones, so + // stale structural triples and Ineru chunks don't linger and leak into + // grounded retrieval. + if existing_hash.is_some() { + purge_source(state, &rel_path, namespace.clone()).await?; + } + + // Extract triples and chunks from the file + let extraction = extract(&rel_path, &content); + + // Write structural triples + for pt in &extraction.triples { + let object_dto = match &pt.object { + ObjectValue::Node(n) => ValueDto::Node { node: n.clone() }, + ObjectValue::Text(t) => ValueDto::String(t.clone()), + }; + + #[cfg(feature = "dag")] + let prov = Some(pt.provenance.clone()); + #[cfg(not(feature = "dag"))] + let _prov = (); + + let result = insert_triple_inner( + state, + object_dto, + &pt.subject, + &pt.predicate, + #[cfg(feature = "dag")] + prov, + #[cfg(not(feature = "dag"))] + None, + namespace.clone(), + ) + .await; + + match result { + Ok(_) => { + report.triples_written += 1; + } + Err(Error::GraphError(GraphError::Duplicate(_))) => { + // Triple already exists — counts as already-written (idempotent) + report.triples_written += 1; + } + Err(e) => { + return Err(Error::Internal(format!("triple insert error: {e}"))); + } + } + } + + // Write text chunks to Ineru memory + for chunk in &extraction.chunks { + let embedding = state.embedder.embed_passage(&chunk.text); + let mut entry = MemoryEntry::new( + CHUNK_ENTRY_TYPE, + serde_json::json!({ + "text": chunk.text, + "source_path": chunk.provenance.source_path, + "line_start": chunk.provenance.line_start, + "line_end": chunk.provenance.line_end, + "content_hash": chunk.provenance.content_hash, + }), + ); + entry.metadata = MemoryMetadata::with_source(&chunk.provenance.source_path); + entry.metadata.importance = 0.6; + entry.embedding = Some(embedding); + + let mut mem = state.memory.write().await; + mem.remember(entry) + .map_err(|e| Error::Internal(format!("memory write error: {e}")))?; + report.chunks_written += 1; + } + + // Write/update the source-hash registry triple + #[cfg(feature = "dag")] + let registry_prov = Some(aingle_graph::dag::Provenance { + source_path: rel_path.clone(), + line_start: 0, + line_end: 0, + content_hash: content_hash.clone(), + }); + #[cfg(not(feature = "dag"))] + let _registry_prov = (); + + insert_triple_inner( + state, + ValueDto::String(content_hash.clone()), + &rel_path, + PRED_SOURCE_HASH, + #[cfg(feature = "dag")] + registry_prov, + #[cfg(not(feature = "dag"))] + None, + namespace.clone(), + ) + .await + .map_err(|e| Error::Internal(format!("registry triple insert error: {e}")))?; + + report.triples_written += 1; + + report.sources.push(SourceRecord { + path: rel_path.clone(), + content_hash: content_hash.clone(), + }); + } + + Ok(report) +} + +/// Remove every fact and chunk previously ingested from `rel_path`, so a changed +/// file's stale data can't survive a re-ingest. +/// +/// Deletes all graph triples whose subject is `rel_path` (its structural facts +/// plus the source-hash registry triple) and forgets every Ineru chunk whose +/// `metadata.source` is `rel_path`. Inbound links from *other* files (where +/// `rel_path` is the object, not the subject) are left untouched. +async fn purge_source(state: &AppState, rel_path: &str, namespace: Option) -> Result<()> { + // Graph: delete every triple authored by this source (subject == rel_path). + let stale_ids: Vec = { + let graph = state.graph.read().await; + let pattern = TriplePattern::any().with_subject(NodeId::named(rel_path)); + graph + .find(pattern) + .map_err(|e| Error::Internal(format!("graph find error: {e}")))? + .into_iter() + .map(|t| t.id().to_hex()) + .collect() + }; + for hex_id in stale_ids { + // Best-effort: a concurrently-removed triple is fine to skip. + let _ = delete_triple(state, &hex_id, namespace.clone()).await; + } + + // Ineru: forget every chunk that came from this source. + { + let mut mem = state.memory.write().await; + let ids: Vec = mem + .stm + .all_entries() + .into_iter() + .chain(mem.ltm.all_entries()) + .filter(|e| e.entry_type == CHUNK_ENTRY_TYPE && e.metadata.source == rel_path) + .map(|e| e.id) + .collect(); + for id in ids { + let _ = mem.forget(&id); + } + } + + Ok(()) +} + +/// List all source files recorded in the signed registry (path + content hash). +pub async fn list_sources(state: &AppState) -> Result> { + let graph = state.graph.read().await; + let pattern = TriplePattern::any().with_predicate(Predicate::named(PRED_SOURCE_HASH)); + let triples = graph + .find(pattern) + .map_err(|e| Error::Internal(format!("graph find error: {e}")))?; + Ok(triples + .iter() + .filter_map(|t| { + // `NodeId::to_string` renders the IRI form ``; strip the angle + // brackets so the path matches the clean form used by `ingest_path`'s + // report and the chunk provenance (round-trippable into other tools). + let path = t + .subject + .to_string() + .trim_start_matches('<') + .trim_end_matches('>') + .to_string(); + t.object_string().map(|h| SourceRecord { + path, + content_hash: h.to_string(), + }) + }) + .collect()) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn write(dir: &std::path::Path, name: &str, body: &str) { + std::fs::write(dir.join(name), body).unwrap(); + } + + async fn enabled_state() -> AppState { + let state = AppState::with_db_path(":memory:", None).unwrap(); + { + let mut graph = state.graph.write().await; + graph.enable_dag(); + } + state + } + + #[tokio::test] + async fn ingest_writes_triples_and_chunks() { + let dir = tempfile::tempdir().unwrap(); + write( + dir.path(), + "note.md", + "# Title\n\nWe use [[sled]] for storage. #durability\n", + ); + let state = enabled_state().await; + + let report = ingest_path(&state, dir.path().to_str().unwrap(), None) + .await + .unwrap(); + + assert_eq!(report.files_seen, 1); + assert_eq!(report.files_ingested, 1); + assert!(report.triples_written >= 3); // heading + links_to + tagged + registry + assert!(report.chunks_written >= 1); + + let mem = state.memory.read().await; + let hits = mem.recall_text("sled storage").unwrap(); + assert!(!hits.is_empty()); + } + + #[tokio::test] + async fn reingesting_unchanged_is_idempotent() { + let dir = tempfile::tempdir().unwrap(); + write(dir.path(), "note.md", "# Title\n\nStable [[content]].\n"); + let state = enabled_state().await; + let root = dir.path().to_str().unwrap(); + + ingest_path(&state, root, None).await.unwrap(); + let actions_after_first = { + let g = state.graph.read().await; + g.dag_store().unwrap().action_count() + }; + + let report2 = ingest_path(&state, root, None).await.unwrap(); + let actions_after_second = { + let g = state.graph.read().await; + g.dag_store().unwrap().action_count() + }; + + assert_eq!(report2.files_skipped, 1); + assert_eq!(report2.files_ingested, 0); + assert_eq!( + actions_after_first, actions_after_second, + "re-ingesting unchanged files must write zero new DAG actions" + ); + } + + #[tokio::test] + async fn changed_file_reingests() { + let dir = tempfile::tempdir().unwrap(); + write(dir.path(), "note.md", "# A\n\nFirst [[x]].\n"); + let state = enabled_state().await; + let root = dir.path().to_str().unwrap(); + + ingest_path(&state, root, None).await.unwrap(); + write(dir.path(), "note.md", "# A\n\nSecond [[y]] changed.\n"); + let report = ingest_path(&state, root, None).await.unwrap(); + assert_eq!(report.files_ingested, 1); + assert_eq!(report.files_skipped, 0); + } + + #[tokio::test] + async fn changed_file_purges_stale_chunks() { + let dir = tempfile::tempdir().unwrap(); + write(dir.path(), "note.md", "# A\n\nWe use sled for storage.\n"); + let state = enabled_state().await; + let root = dir.path().to_str().unwrap(); + ingest_path(&state, root, None).await.unwrap(); + + // Change the file so the old sentence no longer exists in the source. + write(dir.path(), "note.md", "# A\n\nWe use rocksdb now.\n"); + ingest_path(&state, root, None).await.unwrap(); + + // Querying the OLD sentence verbatim must not surface the stale chunk: + // re-ingesting a changed file must forget the previous chunks for it. + let g = crate::service::ground::ground(&state, "We use sled for storage.", 5) + .await + .unwrap(); + assert!( + !g.answer_context.iter().any(|c| c.text.contains("sled")), + "stale 'sled' chunk should be purged on re-ingest, got: {:?}", + g.answer_context + ); + } + + #[tokio::test] + async fn changed_file_purges_stale_triples() { + let dir = tempfile::tempdir().unwrap(); + write(dir.path(), "note.md", "# A\n\nSee [[sled]].\n"); + let state = enabled_state().await; + let root = dir.path().to_str().unwrap(); + ingest_path(&state, root, None).await.unwrap(); + + // Repoint the wikilink: the old links_to:sled triple must not linger. + write(dir.path(), "note.md", "# A\n\nSee [[rocksdb]].\n"); + ingest_path(&state, root, None).await.unwrap(); + + let graph = state.graph.read().await; + let links = graph + .find( + TriplePattern::any() + .with_subject(NodeId::named("note.md")) + .with_predicate(Predicate::named("links_to")), + ) + .unwrap(); + assert_eq!( + links.len(), + 1, + "stale links_to should be purged, leaving only the new link, got: {links:?}" + ); + } +} diff --git a/crates/aingle_cortex/src/service/local_graph.rs b/crates/aingle_cortex/src/service/local_graph.rs new file mode 100644 index 00000000..4714a27f --- /dev/null +++ b/crates/aingle_cortex/src/service/local_graph.rs @@ -0,0 +1,1146 @@ +// Copyright 2019-2026 Apilium Technologies OÜ. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR Commercial + +//! Local graph neighborhood for a single note: typed edges (link / semantic / tag) +//! up to depth 2 for the Akashi per-note graph panel (VC-2). + +use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet, VecDeque}; + +use crate::service::context::{note_context_cached, NEIGHBOR_FLOOR}; +use crate::service::triple_util::{basename, obj_string, resolve_link_target, strip_brackets}; +use crate::service::vault_map::is_maps_path; + +// --------------------------------------------------------------------------- +// Public types +// --------------------------------------------------------------------------- + +/// The typed local neighborhood graph around a center note. +#[derive(Debug, Clone, serde::Serialize, Default)] +pub struct LocalGraph { + /// The center note path. + pub center: String, + /// All nodes in this neighborhood (center + neighbors). + pub nodes: Vec, + /// All typed edges in this neighborhood. + pub edges: Vec, + /// `true` when the embedder has enough dimensions for semantic edges. + pub semantic_ready: bool, +} + +/// A node in the local neighborhood graph. +#[derive(Debug, Clone, serde::Serialize)] +pub struct GNode { + /// Full relative path (canonical identity). + pub id: String, + /// Human-readable label (basename without extension). + pub label: String, + /// `"center"` for the focal note; `"note"` for all others. + pub kind: String, + /// Semantic cluster id. Always `-1` here (clustering is global / expensive). + pub cluster: i64, + /// Number of edges in THIS graph touching this node. + pub degree: usize, + /// Creation date sourced from the note's `created` frontmatter scalar (e.g. `"2025-09-14"`). + /// `None` when the note has no `created` triple. + pub timestamp: Option, +} + +/// A typed, optionally weighted edge in the local neighborhood graph. +#[derive(Debug, Clone, serde::Serialize)] +pub struct TypedEdge { + pub source: String, + pub target: String, + /// `"link"` | `"semantic"` | `"tag"` + pub kind: String, + /// Cosine similarity score — present only for semantic edges. + pub score: Option, + /// For tag edges: the shared tag name. + pub label: Option, + /// Signed DAG action hash for semantic edges (🔒). `None` if unavailable. + pub provenance_anchor: Option, +} + +// --------------------------------------------------------------------------- +// Private constants +// --------------------------------------------------------------------------- + +const NODE_CAP: usize = 80; +const SEM_PER_NODE: usize = 5; +const MAX_DEPTH: usize = 2; +/// Max tag-edges added per (node, tag) pair — prevents explosion on popular tags. +const TAG_FANOUT_CAP: usize = 6; +/// Maximum frontier size before the per-node semantic pass at each BFS level. +/// Caps the depth-2 semantic N+1: a hub with many link-neighbors would otherwise +/// trigger one `note_context_cached` call per frontier node. Sorting the frontier +/// first ensures deterministic behavior when truncating. +const SEM_FRONTIER_CAP: usize = 16; + +// --------------------------------------------------------------------------- +// Core function +// --------------------------------------------------------------------------- + +/// Build the typed local neighborhood graph for `note` at BFS depth `depth`. +pub async fn local_graph(state: &crate::state::AppState, note: &str, depth: usize) -> LocalGraph { + use aingle_graph::{Predicate, TriplePattern}; + + let depth = depth.clamp(1, MAX_DEPTH); + let semantic_grade = state.embedder.dimensions() >= 128; + + // ----------------------------------------------------------------------- + // 1. Load structural data from the graph once. + // ----------------------------------------------------------------------- + // notes: all ingested note paths + // links_raw: (subject, object-string) for every links_to triple + // tagged_raw: (subject, tag) for every tagged triple + type PairVec = Vec<(String, String)>; + let (notes, links_raw, tagged_raw, created_map): ( + Vec, + PairVec, + PairVec, + BTreeMap, + ) = { + let g = state.graph.read().await; + let collect = |pred: &str| -> PairVec { + g.find(TriplePattern::any().with_predicate(Predicate::named(pred))) + .unwrap_or_default() + .into_iter() + .filter_map(|t| { + obj_string(&t).map(|o| (strip_brackets(&t.subject.to_string()).to_string(), o)) + }) + .collect() + }; + let mut ns: Vec = collect(crate::service::ingest::PRED_SOURCE_HASH) + .into_iter() + .map(|(s, _)| s) + .collect(); + ns.sort(); + ns.dedup(); + let lnks = collect("links_to"); + let tags = collect("tagged"); + // Build created-date map: note_path → date. "date" as fallback, "created" takes precedence. + let mut cmap: BTreeMap = collect("date").into_iter().collect(); + for (k, v) in collect("created") { + cmap.insert(k, v); + } + (ns, lnks, tags, cmap) + }; + + let note_set: BTreeSet<&str> = notes.iter().map(|s| s.as_str()).collect(); + + // Basename index for wikilink resolution. + let mut by_base: BTreeMap = BTreeMap::new(); + for n in ¬es { + by_base.entry(basename(n)).or_insert_with(|| n.clone()); + } + + let resolve = + |target: &str| -> Option { resolve_link_target(target, ¬e_set, &by_base) }; + + // Resolved outgoing links: (src, dst) — both are full paths, neither a maps path. + let links: Vec<(String, String)> = links_raw + .iter() + .filter_map(|(src, tgt)| resolve(tgt).map(|dst| (src.clone(), dst))) + .filter(|(src, dst)| src != dst) + .filter(|(src, _)| note_set.contains(src.as_str()) && !is_maps_path(src)) + .filter(|(_, dst)| note_set.contains(dst.as_str()) && !is_maps_path(dst)) + .collect(); + + // Pre-index links for O(1) per-node lookup in the BFS loop — avoids + // re-scanning the full `links` vec twice per node. + let mut by_src: BTreeMap> = BTreeMap::new(); + let mut by_dst: BTreeMap> = BTreeMap::new(); + for (src, dst) in &links { + by_src.entry(src.clone()).or_default().push(dst.clone()); + by_dst.entry(dst.clone()).or_default().push(src.clone()); + } + + // tag_of_note: note → set + // notes_of_tag: tag → vec (sorted, deduped) + let mut tag_of_note: BTreeMap> = BTreeMap::new(); + let mut notes_of_tag: BTreeMap> = BTreeMap::new(); + for (note_path, tag) in &tagged_raw { + if note_set.contains(note_path.as_str()) && !is_maps_path(note_path) { + tag_of_note + .entry(note_path.clone()) + .or_default() + .insert(tag.clone()); + notes_of_tag + .entry(tag.clone()) + .or_default() + .push(note_path.clone()); + } + } + for v in notes_of_tag.values_mut() { + v.sort(); + v.dedup(); + } + + // ----------------------------------------------------------------------- + // 2. BFS to collect edges. + // ----------------------------------------------------------------------- + let mut edges: Vec = Vec::new(); + let mut visited: HashSet = HashSet::new(); + visited.insert(note.to_string()); + + let mut frontier: VecDeque = VecDeque::new(); + frontier.push_back(note.to_string()); + + let mut semantic_ready = semantic_grade; + + for _level in 0..depth { + let mut next_frontier: Vec = Vec::new(); + while let Some(n) = frontier.pop_front() { + if is_maps_path(&n) { + continue; + } + + // --- link edges (outgoing from n) --- + if let Some(dsts) = by_src.get(&n) { + for dst in dsts { + edges.push(TypedEdge { + source: n.clone(), + target: dst.clone(), + kind: "link".to_string(), + score: None, + label: None, + provenance_anchor: None, + }); + if !visited.contains(dst) { + visited.insert(dst.clone()); + next_frontier.push(dst.clone()); + } + } + } + // --- link edges (incoming to n) --- + if let Some(srcs) = by_dst.get(&n) { + for src in srcs { + edges.push(TypedEdge { + source: src.clone(), + target: n.clone(), + kind: "link".to_string(), + score: None, + label: None, + provenance_anchor: None, + }); + if !visited.contains(src) { + visited.insert(src.clone()); + next_frontier.push(src.clone()); + } + } + } + + // --- semantic edges --- + if semantic_grade { + let ctx = note_context_cached(state, &n, SEM_PER_NODE).await; + if !ctx.semantic_ready { + semantic_ready = false; + } else { + for nb in ctx.neighbors { + if nb.score < NEIGHBOR_FLOOR { + continue; + } + if is_maps_path(&nb.path) { + continue; + } + edges.push(TypedEdge { + source: n.clone(), + target: nb.path.clone(), + kind: "semantic".to_string(), + score: Some(nb.score), + label: None, + provenance_anchor: nb.provenance_anchor, + }); + if !visited.contains(&nb.path) { + visited.insert(nb.path.clone()); + next_frontier.push(nb.path.clone()); + } + } + } + } + + // --- tag edges --- + if let Some(tags) = tag_of_note.get(&n) { + for tag in tags { + if let Some(peers) = notes_of_tag.get(tag) { + let mut added = 0usize; + for peer in peers { + if peer == &n || is_maps_path(peer) { + continue; + } + if added >= TAG_FANOUT_CAP { + break; + } + edges.push(TypedEdge { + source: n.clone(), + target: peer.clone(), + kind: "tag".to_string(), + score: None, + label: Some(tag.clone()), + provenance_anchor: None, + }); + if !visited.contains(peer) { + visited.insert(peer.clone()); + next_frontier.push(peer.clone()); + } + added += 1; + } + } + } + } + } + + // Cap the next frontier before promoting to bound semantic cost at the + // next level (≤ SEM_FRONTIER_CAP × note_context_cached calls). + // Depth-1 behavior is identical: next_frontier is never used again. + next_frontier.sort(); + next_frontier.truncate(SEM_FRONTIER_CAP); + for n in next_frontier { + frontier.push_back(n); + } + } + + // ----------------------------------------------------------------------- + // 3. Deduplicate edges. + // ----------------------------------------------------------------------- + // Links are directional — dedupe by (source, target, kind). + // Semantic/tag are symmetric — dedupe order-insensitively by (min,max,kind). + let mut seen_link: HashSet<(String, String)> = HashSet::new(); + let mut seen_sym: HashSet<(String, String, String, String)> = HashSet::new(); + let mut deduped: Vec = Vec::new(); + + for e in edges { + // Remove self-loops. + if e.source == e.target { + continue; + } + match e.kind.as_str() { + "link" => { + let key = (e.source.clone(), e.target.clone()); + if seen_link.insert(key) { + deduped.push(e); + } + } + _ => { + // symmetric kinds: (tag, semantic) + let (lo, hi) = if e.source <= e.target { + (e.source.clone(), e.target.clone()) + } else { + (e.target.clone(), e.source.clone()) + }; + // Include the tag label so a pair sharing two distinct tags + // yields two edges. Semantic label is always None → "" → no clash. + let tag_label = if e.kind == "tag" { + e.label.clone().unwrap_or_default() + } else { + String::new() + }; + let key = (lo, hi, e.kind.clone(), tag_label); + if seen_sym.insert(key) { + deduped.push(e); + } + } + } + } + + // ----------------------------------------------------------------------- + // 4. Collect all node ids referenced by edges, plus the center. + // ----------------------------------------------------------------------- + let mut all_node_ids: HashSet = HashSet::new(); + all_node_ids.insert(note.to_string()); + for e in &deduped { + all_node_ids.insert(e.source.clone()); + all_node_ids.insert(e.target.clone()); + } + + // ----------------------------------------------------------------------- + // 5. Cap: keep center + highest-degree nodes; drop edges to removed nodes. + // ----------------------------------------------------------------------- + let mut degree_map: HashMap = HashMap::new(); + for id in &all_node_ids { + degree_map.insert(id.clone(), 0); + } + for e in &deduped { + *degree_map.entry(e.source.clone()).or_default() += 1; + *degree_map.entry(e.target.clone()).or_default() += 1; + } + + let kept_ids: HashSet = if all_node_ids.len() > NODE_CAP { + // Always keep center; fill remaining slots by degree descending. + let mut by_degree: Vec<(String, usize)> = degree_map + .iter() + .filter(|(id, _)| id.as_str() != note) + .map(|(id, &d)| (id.clone(), d)) + .collect(); + by_degree.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0))); + let mut kept: HashSet = HashSet::new(); + kept.insert(note.to_string()); + for (id, _) in by_degree.into_iter().take(NODE_CAP - 1) { + kept.insert(id); + } + kept + } else { + all_node_ids.clone() + }; + + // Drop edges that reference removed nodes. + let mut final_edges: Vec = deduped + .into_iter() + .filter(|e| kept_ids.contains(&e.source) && kept_ids.contains(&e.target)) + .collect(); + // Sort for stable cross-run output. + final_edges.sort_by(|a, b| { + a.source + .cmp(&b.source) + .then(a.target.cmp(&b.target)) + .then(a.kind.cmp(&b.kind)) + }); + + // Recompute degree map for final kept set. + let mut final_degree: HashMap = HashMap::new(); + for id in &kept_ids { + final_degree.insert(id.clone(), 0); + } + for e in &final_edges { + *final_degree.entry(e.source.clone()).or_default() += 1; + *final_degree.entry(e.target.clone()).or_default() += 1; + } + + // Build nodes vector. + let mut nodes: Vec = kept_ids + .iter() + .map(|id| { + let kind = if id == note { "center" } else { "note" }.to_string(); + let degree = *final_degree.get(id).unwrap_or(&0); + GNode { + label: basename(id), + id: id.clone(), + kind, + cluster: -1, + degree, + timestamp: created_map.get(id).cloned(), + } + }) + .collect(); + nodes.sort_by(|a, b| a.id.cmp(&b.id)); + + LocalGraph { + center: note.to_string(), + nodes, + edges: final_edges, + semantic_ready, + } +} + +// --------------------------------------------------------------------------- +// Cached variant +// --------------------------------------------------------------------------- + +/// Like [`local_graph`] but memoised on `(triple_count, total_memory_bytes)`. +/// +/// Map key is `(note_path, depth)`. Cap: 256 entries (clear-on-exceed). +pub async fn local_graph_cached( + state: &crate::state::AppState, + note: &str, + depth: usize, +) -> LocalGraph { + let tc = { state.graph.read().await.stats().triple_count }; + let mem_bytes = { state.memory.read().await.stats().total_memory_bytes }; + let version_key = (tc, mem_bytes); + let map_key = (note.to_string(), depth); + + // Check cache — release lock before any await. + { + let cache = state + .local_graph_cache + .lock() + .expect("local_graph cache poisoned"); + if let Some((cached_key, graph)) = cache.get(&map_key) { + if *cached_key == version_key { + return graph.clone(); + } + } + } + + // Compute without holding the mutex. + let result = local_graph(state, note, depth).await; + + // Store result. + { + let mut cache = state + .local_graph_cache + .lock() + .expect("local_graph cache poisoned"); + if cache.len() > 256 { + cache.clear(); + } + cache.insert(map_key, (version_key, result.clone())); + } + + result +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use aingle_graph::{NodeId, Predicate, Triple, Value}; + use ineru::{Embedder, Embedding, MemoryEntry}; + + use crate::state::AppState; + + // ----------------------------------------------------------------------- + // Stub embedder: 128-dim (same as context.rs tests). + // text with "alpha" → e0=[1,0,…], "zzz" → e1=[0,1,…], else → e2=[0,0,1,…] + // Cosine(alpha,alpha) = 1.0 ≥ NEIGHBOR_FLOOR(0.88) → passes. + // ----------------------------------------------------------------------- + struct StubEmbedder; + + impl Embedder for StubEmbedder { + fn embed_passage(&self, text: &str) -> Embedding { + let mut v = vec![0.0_f32; 128]; + if text.contains("alpha") { + v[0] = 1.0; + } else if text.contains("zzz") { + v[1] = 1.0; + } else { + v[2] = 1.0; + } + Embedding::new(v) + } + + fn embed_query(&self, text: &str) -> Embedding { + self.embed_passage(text) + } + + fn dimensions(&self) -> usize { + 128 + } + + fn relevance_thresholds(&self) -> (f32, f32) { + (0.5, 0.1) + } + } + + fn stub_state() -> AppState { + AppState::with_db_path_and_embedder(":memory:", None, Arc::new(StubEmbedder)).unwrap() + } + + async fn insert_triple_node(state: &AppState, s: &str, p: &str, o_node: &str) { + let g = state.graph.write().await; + g.insert(Triple::new( + NodeId::named(s), + Predicate::named(p), + Value::Node(NodeId::named(o_node)), + )) + .unwrap(); + } + + async fn insert_triple_lit(state: &AppState, s: &str, p: &str, o: &str) { + let g = state.graph.write().await; + g.insert(Triple::new( + NodeId::named(s), + Predicate::named(p), + Value::literal(o), + )) + .unwrap(); + } + + async fn register_note(state: &AppState, path: &str) { + insert_triple_lit(state, path, crate::service::ingest::PRED_SOURCE_HASH, "h").await; + } + + async fn insert_chunk(state: &AppState, source_path: &str, text: &str, emb: Vec) { + let mut mem = state.memory.write().await; + let mut e = MemoryEntry::new( + crate::service::ingest::CHUNK_ENTRY_TYPE, + serde_json::json!({ "text": text, "source_path": source_path }), + ); + e.embedding = Some(Embedding::new(emb)); + mem.remember(e).unwrap(); + } + + fn e0() -> Vec { + let mut v = vec![0.0_f32; 128]; + v[0] = 1.0; + v + } + + // ----------------------------------------------------------------------- + // 1. link_edge_from_wikilink + // ----------------------------------------------------------------------- + /// A `links_to` triple (Value::Node) from a.md to b yields a "link" edge a→b, + /// and center is "a.md". + #[tokio::test] + async fn link_edge_from_wikilink() { + let state = AppState::with_db_path(":memory:", None).unwrap(); + register_note(&state, "a.md").await; + register_note(&state, "b.md").await; + // wikilink stored as Value::Node (basename without extension) + insert_triple_node(&state, "a.md", "links_to", "b").await; + + let g = super::local_graph(&state, "a.md", 1).await; + assert_eq!(g.center, "a.md"); + let link = g.edges.iter().find(|e| e.kind == "link"); + assert!(link.is_some(), "must have a link edge: {:?}", g.edges); + let link = link.unwrap(); + assert_eq!(link.source, "a.md"); + assert_eq!(link.target, "b.md"); + } + + // ----------------------------------------------------------------------- + // 2. semantic_edge_from_neighbor + // ----------------------------------------------------------------------- + /// With the stub 128-d embedder and alpha-topic chunks, a.md and b.md both + /// project onto e0. note_context yields them as mutual neighbors with score + /// 1.0 ≥ NEIGHBOR_FLOOR → a "semantic" edge with score.is_some(). + #[tokio::test] + async fn semantic_edge_from_neighbor() { + let state = stub_state(); + register_note(&state, "a.md").await; + register_note(&state, "b.md").await; + insert_chunk(&state, "a.md", "alpha content for a", e0()).await; + insert_chunk(&state, "b.md", "alpha content for b", e0()).await; + + let g = super::local_graph(&state, "a.md", 1).await; + assert!( + g.semantic_ready, + "StubEmbedder(128d) must be semantic_ready" + ); + let sem = g.edges.iter().find(|e| e.kind == "semantic"); + assert!(sem.is_some(), "must have a semantic edge: {:?}", g.edges); + assert!( + sem.unwrap().score.is_some(), + "semantic edge must carry a score" + ); + } + + // ----------------------------------------------------------------------- + // 3. semantic_edge_carries_provenance (dag-gated) + // ----------------------------------------------------------------------- + #[cfg(feature = "dag")] + #[tokio::test] + async fn semantic_edge_carries_provenance() { + let state = + AppState::with_db_path_and_embedder(":memory:", None, Arc::new(StubEmbedder)).unwrap(); + { + let mut graph = state.graph.write().await; + graph.enable_dag(); + } + register_note(&state, "a.md").await; + register_note(&state, "b.md").await; + insert_chunk(&state, "a.md", "alpha content for a", e0()).await; + insert_chunk(&state, "b.md", "alpha content for b", e0()).await; + + // Record a signed DAG action for b.md so provenance_anchor_for("b.md") is Some. + { + let graph = state.graph.read().await; + let dag_store = graph.dag_store().expect("DAG must be enabled"); + let parents = dag_store.tips().expect("tips must be readable"); + let mut action = aingle_graph::dag::DagAction { + parents, + author: aingle_graph::NodeId::named("test"), + seq: 0, + timestamp: chrono::Utc::now(), + payload: aingle_graph::dag::DagPayload::Custom { + payload_type: "ingest".to_string(), + payload_summary: "b.md ingested".to_string(), + payload: None, + subject: Some("b.md".to_string()), + }, + signature: None, + }; + let key = aingle_graph::dag::DagSigningKey::generate(); + key.sign(&mut action); + dag_store + .put(&action) + .expect("put signed action must succeed"); + } + + let g = super::local_graph(&state, "a.md", 1).await; + let sem = g + .edges + .iter() + .find(|e| e.kind == "semantic" && e.target == "b.md") + .expect("must have semantic edge a→b"); + assert!( + sem.provenance_anchor.is_some(), + "semantic edge to b.md must carry provenance_anchor when a signed DAG action exists: {:?}", + sem + ); + } + + // ----------------------------------------------------------------------- + // 4. tag_edge_from_shared_tag + // ----------------------------------------------------------------------- + /// a.md and b.md both tagged "x" → a "tag" edge with label == Some("x"). + #[tokio::test] + async fn tag_edge_from_shared_tag() { + let state = AppState::with_db_path(":memory:", None).unwrap(); + register_note(&state, "a.md").await; + register_note(&state, "b.md").await; + insert_triple_lit(&state, "a.md", "tagged", "x").await; + insert_triple_lit(&state, "b.md", "tagged", "x").await; + + let g = super::local_graph(&state, "a.md", 1).await; + let tag_edge = g.edges.iter().find(|e| e.kind == "tag"); + assert!(tag_edge.is_some(), "must have a tag edge: {:?}", g.edges); + assert_eq!( + tag_edge.unwrap().label.as_deref(), + Some("x"), + "tag edge label must be the shared tag" + ); + } + + // ----------------------------------------------------------------------- + // 5. hash_embedder_omits_semantic + // ----------------------------------------------------------------------- + /// The default 64-dim hash embedder fails the semantic gate → semantic_ready==false, + /// no "semantic" edges; link and tag edges still appear. + #[tokio::test] + async fn hash_embedder_omits_semantic() { + let state = AppState::with_db_path(":memory:", None).unwrap(); + register_note(&state, "a.md").await; + register_note(&state, "b.md").await; + // A link edge (so we know other edges work). + insert_triple_node(&state, "a.md", "links_to", "b").await; + + let g = super::local_graph(&state, "a.md", 1).await; + assert!( + !g.semantic_ready, + "64-dim hash embedder must set semantic_ready=false" + ); + assert!( + g.edges.iter().all(|e| e.kind != "semantic"), + "no semantic edges with hash embedder: {:?}", + g.edges + ); + // Link edges still present. + assert!( + g.edges.iter().any(|e| e.kind == "link"), + "link edges must still appear: {:?}", + g.edges + ); + } + + // ----------------------------------------------------------------------- + // 6. maps_excluded + // ----------------------------------------------------------------------- + /// Notes under `_maps/` are never included in the graph even when they + /// share tags or links with the center note. + #[tokio::test] + async fn maps_excluded() { + let state = AppState::with_db_path(":memory:", None).unwrap(); + register_note(&state, "a.md").await; + register_note(&state, "_maps/vault-map.md").await; + insert_triple_lit(&state, "a.md", "tagged", "x").await; + insert_triple_lit(&state, "_maps/vault-map.md", "tagged", "x").await; + // Also a direct link to make sure links are filtered too. + insert_triple_node(&state, "a.md", "links_to", "vault-map").await; + + let g = super::local_graph(&state, "a.md", 1).await; + assert!( + !g.nodes.iter().any(|n| n.id.starts_with("_maps/")), + "_maps/ nodes must be excluded: {:?}", + g.nodes + ); + assert!( + !g.edges + .iter() + .any(|e| e.target.starts_with("_maps/") || e.source.starts_with("_maps/")), + "_maps/ edges must be excluded: {:?}", + g.edges + ); + } + + // ----------------------------------------------------------------------- + // 7. caps_respected + // ----------------------------------------------------------------------- + /// With more than NODE_CAP neighbors, nodes.len() <= NODE_CAP and center is present. + #[tokio::test] + async fn caps_respected() { + let state = AppState::with_db_path(":memory:", None).unwrap(); + register_note(&state, "center.md").await; + // Create NODE_CAP + 10 = 90 notes, each sharing a tag with center.md. + for i in 0..90 { + let path = format!("note{i}.md"); + register_note(&state, &path).await; + insert_triple_lit(&state, &path, "tagged", "bigtag").await; + } + insert_triple_lit(&state, "center.md", "tagged", "bigtag").await; + + let g = super::local_graph(&state, "center.md", 1).await; + assert!( + g.nodes.len() <= super::NODE_CAP, + "nodes.len() ({}) must be <= NODE_CAP ({}): center present: {}", + g.nodes.len(), + super::NODE_CAP, + g.nodes.iter().any(|n| n.id == "center.md") + ); + assert!( + g.nodes.iter().any(|n| n.id == "center.md"), + "center must always be in the graph: {:?}", + g.nodes.iter().map(|n| &n.id).collect::>() + ); + } + + // ----------------------------------------------------------------------- + // 8. depth_two_expands_frontier + // ----------------------------------------------------------------------- + /// A→B→C via wikilinks: depth=2 reaches c.md, depth=1 does not. + #[tokio::test] + async fn depth_two_expands_frontier() { + let state = AppState::with_db_path(":memory:", None).unwrap(); + register_note(&state, "a.md").await; + register_note(&state, "b.md").await; + register_note(&state, "c.md").await; + insert_triple_node(&state, "a.md", "links_to", "b").await; + insert_triple_node(&state, "b.md", "links_to", "c").await; + + let g1 = super::local_graph(&state, "a.md", 1).await; + assert!( + !g1.nodes.iter().any(|n| n.id == "c.md"), + "depth=1 must NOT include c.md: {:?}", + g1.nodes.iter().map(|n| &n.id).collect::>() + ); + + let g2 = super::local_graph(&state, "a.md", 2).await; + assert!( + g2.nodes.iter().any(|n| n.id == "c.md"), + "depth=2 must include c.md (reached via a→b→c): {:?}", + g2.nodes.iter().map(|n| &n.id).collect::>() + ); + } + + // ----------------------------------------------------------------------- + // 9. incoming_link_edge + // ----------------------------------------------------------------------- + /// X links_to A (incoming); local_graph("a.md", 1) must include x→a link edge. + #[tokio::test] + async fn incoming_link_edge() { + let state = AppState::with_db_path(":memory:", None).unwrap(); + register_note(&state, "x.md").await; + register_note(&state, "a.md").await; + insert_triple_node(&state, "x.md", "links_to", "a").await; + + let g = super::local_graph(&state, "a.md", 1).await; + let link = g + .edges + .iter() + .find(|e| e.kind == "link" && e.source == "x.md" && e.target == "a.md"); + assert!( + link.is_some(), + "incoming link x→a must appear in graph centered on a.md: {:?}", + g.edges + ); + } + + // ----------------------------------------------------------------------- + // 10. pair_with_link_and_semantic_keeps_both + // ----------------------------------------------------------------------- + /// A links_to B AND B is A's semantic neighbor → both a link edge AND a + /// semantic edge must be present for the pair (different dedup sets). + #[tokio::test] + async fn pair_with_link_and_semantic_keeps_both() { + let state = stub_state(); + register_note(&state, "a.md").await; + register_note(&state, "b.md").await; + insert_triple_node(&state, "a.md", "links_to", "b").await; + insert_chunk(&state, "a.md", "alpha content for a", e0()).await; + insert_chunk(&state, "b.md", "alpha content for b", e0()).await; + + let g = super::local_graph(&state, "a.md", 1).await; + let has_link = g + .edges + .iter() + .any(|e| e.kind == "link" && e.source == "a.md" && e.target == "b.md"); + let has_sem = g.edges.iter().any(|e| { + e.kind == "semantic" + && ((e.source == "a.md" && e.target == "b.md") + || (e.source == "b.md" && e.target == "a.md")) + }); + assert!(has_link, "link edge a→b must be present: {:?}", g.edges); + assert!(has_sem, "semantic edge a↔b must be present: {:?}", g.edges); + } + + // ----------------------------------------------------------------------- + // 11. symmetric_semantic_dedup + // ----------------------------------------------------------------------- + /// With a→b and b→a semantic edges produced at different BFS levels, dedup + /// must yield exactly ONE semantic edge for the pair. + #[tokio::test] + async fn symmetric_semantic_dedup() { + let state = stub_state(); + register_note(&state, "a.md").await; + register_note(&state, "b.md").await; + insert_chunk(&state, "a.md", "alpha content for a", e0()).await; + insert_chunk(&state, "b.md", "alpha content for b", e0()).await; + + // depth=2: level-1 processes a.md → finds b.md; level-2 processes b.md → finds a.md. + // Both produce a↔b semantic edge candidates. Dedup keeps exactly one. + let g = super::local_graph(&state, "a.md", 2).await; + let sem_count = g + .edges + .iter() + .filter(|e| { + e.kind == "semantic" + && ((e.source == "a.md" && e.target == "b.md") + || (e.source == "b.md" && e.target == "a.md")) + }) + .count(); + assert_eq!( + sem_count, 1, + "symmetric a↔b semantic must yield exactly ONE edge, got {sem_count}: {:?}", + g.edges + ); + } + + // ----------------------------------------------------------------------- + // 12. local_graph_cached_hit_and_invalidation + // ----------------------------------------------------------------------- + /// Cache hit: second call with unchanged graph returns same result. + /// Invalidation: after a graph mutation, the next call recomputes. + #[tokio::test] + async fn local_graph_cached_hit_and_invalidation() { + let state = AppState::with_db_path(":memory:", None).unwrap(); + register_note(&state, "a.md").await; + register_note(&state, "b.md").await; + insert_triple_node(&state, "a.md", "links_to", "b").await; + + // First call: computes and caches. + let g1 = super::local_graph_cached(&state, "a.md", 1).await; + assert!( + g1.nodes.iter().any(|n| n.id == "b.md"), + "b.md must be in graph" + ); + + // Second call: graph/memory unchanged → cache hit → identical result. + let g2 = super::local_graph_cached(&state, "a.md", 1).await; + assert_eq!( + g1.nodes.len(), + g2.nodes.len(), + "cache hit must return same node count" + ); + + // Mutate: add c.md and a link a→c (changes triple_count). + register_note(&state, "c.md").await; + insert_triple_node(&state, "a.md", "links_to", "c").await; + + // Third call: version mismatch → invalidated → c.md appears. + let g3 = super::local_graph_cached(&state, "a.md", 1).await; + assert!( + g3.nodes.iter().any(|n| n.id == "c.md"), + "after mutation, c.md must appear in recomputed result: {:?}", + g3.nodes.iter().map(|n| &n.id).collect::>() + ); + } + + // ----------------------------------------------------------------------- + // 13. cache_cap_clears_when_exceeded + // ----------------------------------------------------------------------- + /// When local_graph_cache exceeds 256 entries, the next insert clears the map + /// first, then inserts the new entry — so len() == 1 afterward. + #[tokio::test] + async fn cache_cap_clears_when_exceeded() { + let state = AppState::with_db_path(":memory:", None).unwrap(); + + // Pre-fill with 257 dummy entries to exceed the cap. + { + let mut cache = state.local_graph_cache.lock().unwrap(); + for i in 0..257usize { + cache.insert( + (format!("dummy_{i}.md"), 1usize), + ((0, 0), super::LocalGraph::default()), + ); + } + } + assert_eq!( + state.local_graph_cache.lock().unwrap().len(), + 257, + "pre-condition: cache must have 257 dummy entries" + ); + + // Call for a key not in the cache; cap fires before insert. + let _ = super::local_graph_cached(&state, "fresh.md", 1).await; + + let cache = state.local_graph_cache.lock().unwrap(); + assert_eq!( + cache.len(), + 1, + "cap must clear oversized cache then insert one entry; got {} entries", + cache.len() + ); + assert!( + cache.contains_key(&("fresh.md".to_string(), 1usize)), + "fresh.md must be in cache after cap-and-insert" + ); + } + + // ----------------------------------------------------------------------- + // timestamp field: created triple → GNode.timestamp + // ----------------------------------------------------------------------- + + /// A `created` triple for a note must surface its value in `GNode.timestamp`. + /// A note without a `created` triple must have `GNode.timestamp == None`. + #[tokio::test] + async fn gnode_timestamp_from_created_triple() { + let state = AppState::with_db_path(":memory:", None).unwrap(); + register_note(&state, "a.md").await; + register_note(&state, "b.md").await; + insert_triple_node(&state, "a.md", "links_to", "b").await; + insert_triple_lit(&state, "a.md", "created", "2025-03-15").await; + + let g = super::local_graph(&state, "a.md", 1).await; + let node_a = g + .nodes + .iter() + .find(|n| n.id == "a.md") + .expect("a.md must be in graph"); + assert_eq!( + node_a.timestamp, + Some("2025-03-15".to_string()), + "GNode.timestamp must come from the created triple" + ); + let node_b = g + .nodes + .iter() + .find(|n| n.id == "b.md") + .expect("b.md must be in graph"); + assert_eq!( + node_b.timestamp, None, + "GNode without created triple must have timestamp=None" + ); + } + + // ----------------------------------------------------------------------- + // 14. frontier_cap_bounds_semantic (optional perf guard) + // ----------------------------------------------------------------------- + /// A hub with >SEM_FRONTIER_CAP link-neighbors at depth=2 still completes + /// and the result satisfies NODE_CAP and includes the center. + #[tokio::test] + async fn frontier_cap_bounds_semantic() { + let state = stub_state(); + register_note(&state, "center.md").await; + register_note(&state, "hub.md").await; + insert_triple_node(&state, "center.md", "links_to", "hub").await; + insert_chunk(&state, "center.md", "alpha content center", e0()).await; + insert_chunk(&state, "hub.md", "alpha content hub", e0()).await; + + // 20 spokes — more than SEM_FRONTIER_CAP (16). + for i in 0..20usize { + let path = format!("spoke{i}.md"); + register_note(&state, &path).await; + insert_triple_node(&state, "hub.md", "links_to", &format!("spoke{i}")).await; + insert_chunk(&state, &path, "alpha content spoke", e0()).await; + } + + let g = super::local_graph(&state, "center.md", 2).await; + assert!( + g.nodes.len() <= super::NODE_CAP, + "nodes must be ≤ NODE_CAP ({}), got {}", + super::NODE_CAP, + g.nodes.len() + ); + assert!( + g.nodes.iter().any(|n| n.id == "center.md"), + "center must always be in the graph" + ); + } + + // ----------------------------------------------------------------------- + // 15. neural_local_graph_has_semantic_edge (real e5 model, gated) + // ----------------------------------------------------------------------- + /// End-to-end acceptance test using the real multilingual-e5-small model. + /// Skipped when the model files are absent. Requires `ORT_DYLIB_PATH`. + /// + /// Two same-topic Spanish notes (dog care) must share a semantic edge; + /// an off-topic note (elections) must not appear (below NEIGHBOR_FLOOR=0.88). + #[cfg(feature = "neural-embeddings")] + #[tokio::test] + async fn neural_local_graph_has_semantic_edge() { + let model_dir = std::env::var("INERU_E5_MODEL_DIR").unwrap_or_else(|_| { + concat!( + env!("CARGO_MANIFEST_DIR"), + "/../ineru/test-models/multilingual-e5-small" + ) + .to_string() + }); + if !std::path::Path::new(&model_dir) + .join("onnx/model.onnx") + .exists() + { + eprintln!( + "skipping neural_local_graph_has_semantic_edge: e5 model not found at {model_dir}" + ); + return; + } + + let embedder = crate::embedder::build_embedder(Some(&model_dir)); + assert_eq!( + embedder.dimensions(), + 384, + "neural embedder must be active (384d)" + ); + + let state = AppState::with_db_path_and_embedder(":memory:", None, embedder).unwrap(); + { + let mut graph = state.graph.write().await; + graph.enable_dag(); + } + + let dir = tempfile::tempdir().unwrap(); + // Two same-topic notes about dog care (reused from neural_note_context_finds_same_topic). + std::fs::write( + dir.path().join("perros1.md"), + "# Cuidado de perros\n\nLos perros necesitan paseos diarios, agua fresca y una dieta equilibrada para estar sanos.\n", + ) + .unwrap(); + std::fs::write( + dir.path().join("perros2.md"), + "# Mascotas\n\nUn perro sano requiere ejercicio diario, hidratación constante y alimentación balanceada.\n", + ) + .unwrap(); + // Off-topic note: elections have no semantic overlap with dog care. + std::fs::write( + dir.path().join("elecciones.md"), + "# Elecciones\n\nLos resultados de las elecciones presidenciales determinan el futuro del país.\n", + ) + .unwrap(); + + crate::service::ingest::ingest_path(&state, dir.path().to_str().unwrap(), None) + .await + .unwrap(); + + let g = super::local_graph(&state, "perros1.md", 1).await; + + assert!( + g.semantic_ready, + "neural embedder (384d) must set semantic_ready=true" + ); + + // There must be a semantic edge connecting perros1↔perros2 (either orientation). + let has_sem_edge = g.edges.iter().any(|e| { + e.kind == "semantic" + && ((e.source == "perros1.md" && e.target == "perros2.md") + || (e.source == "perros2.md" && e.target == "perros1.md")) + }); + assert!( + has_sem_edge, + "perros1.md and perros2.md (same-topic) must share a semantic edge: {:?}", + g.edges + ); + + // elecciones.md is off-topic; cosine vs perros1 is below NEIGHBOR_FLOOR (0.88). + assert!( + !g.edges.iter().any(|e| { + e.kind == "semantic" + && (e.source == "elecciones.md" || e.target == "elecciones.md") + }), + "off-topic elecciones.md must not have a semantic edge (below NEIGHBOR_FLOOR=0.88): {:?}", + g.edges + ); + } +} diff --git a/crates/aingle_cortex/src/service/mod.rs b/crates/aingle_cortex/src/service/mod.rs index bcc9f1ca..5d3a501b 100644 --- a/crates/aingle_cortex/src/service/mod.rs +++ b/crates/aingle_cortex/src/service/mod.rs @@ -3,8 +3,13 @@ //! Business-logic layer shared by REST handlers and the MCP server. +pub mod backlinks; +pub mod context; #[cfg(feature = "dag")] pub mod dag; +pub mod ground; +pub mod ingest; +pub mod local_graph; pub mod proof; pub mod query; pub mod reputation; @@ -12,5 +17,7 @@ pub mod skill; #[cfg(feature = "sparql")] pub mod sparql; pub mod stats; +pub(crate) mod triple_util; pub mod triples; pub mod validate; +pub mod vault_map; diff --git a/crates/aingle_cortex/src/service/triple_util.rs b/crates/aingle_cortex/src/service/triple_util.rs new file mode 100644 index 00000000..524b5ffc --- /dev/null +++ b/crates/aingle_cortex/src/service/triple_util.rs @@ -0,0 +1,172 @@ +// Copyright 2019-2026 Apilium Technologies OÜ. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR Commercial + +//! Shared triple-object extraction and wikilink-resolution helpers. +//! +//! # Why a shared module? +//! `obj_string` was previously duplicated verbatim in `backlinks`, `context`, +//! and `vault_map`. A copy-paste drift on exactly this helper caused a real bug +//! (node-valued `links_to` triples were silently dropped). This module is the +//! single source of truth; every consumer must import from here. + +/// Return the object of a triple as a plain `String`, handling both literal +/// strings (`Value::Str`) and graph nodes (`Value::Node`). Node IDs are stored +/// with `<…>` angle-bracket wrappers; this strips them so the result matches +/// the bare names used everywhere else in the service layer. +#[inline] +pub(crate) fn obj_string(t: &aingle_graph::Triple) -> Option { + if let Some(s) = t.object_string() { + Some(s.to_string()) + } else { + t.object_node() + .map(|n| strip_brackets(&n.to_string()).to_string()) + } +} + +/// Strip leading `<` and trailing `>` angle-bracket wrappers from an IRI string. +/// +/// Node IDs in the graph are stored with angle-bracket wrappers (e.g. ``); +/// this strips them so the result matches the bare names used everywhere in the service layer. +pub(crate) fn strip_brackets(s: &str) -> &str { + s.trim_start_matches('<').trim_end_matches('>') +} + +/// Basename without directory or extension (for wikilink resolution). +/// +/// Strips both `/` and `\` directory separators and removes the last `.ext`. +pub(crate) fn basename(path: &str) -> String { + let file = path.rsplit(['/', '\\']).next().unwrap_or(path); + file.rsplit_once('.') + .map(|(s, _)| s) + .unwrap_or(file) + .to_string() +} + +/// Retrieve a signed provenance anchor hash for a note path, if available. +/// +/// Returns the hex hash of the most-recent signed DAG action whose subject is +/// `src`, or `None` when the `dag` feature is off or no signed action exists. +pub(crate) async fn provenance_anchor_for( + state: &crate::state::AppState, + src: &str, +) -> Option { + #[cfg(feature = "dag")] + { + match crate::service::dag::history_by_subject(state, src, 1).await { + Ok(a) => a.first().filter(|x| x.signed).map(|x| x.hash.clone()), + Err(_) => None, + } + } + #[cfg(not(feature = "dag"))] + { + let _ = (state, src); + None + } +} + +/// Strip the extension from the last path segment only. Input must already be +/// slash-normalized (forward slashes). Returns the path-without-ext. +/// "b/note.md" → "b/note", "b/note" → "b/note", "note.md" → "note". +fn path_without_ext(path: &str) -> String { + if let Some(idx) = path.rfind('/') { + let dir = &path[..=idx]; // includes the trailing '/' + let file = &path[idx + 1..]; + let stem = file.rsplit_once('.').map(|(s, _)| s).unwrap_or(file); + format!("{dir}{stem}") + } else { + path.rsplit_once('.') + .map(|(s, _)| s) + .unwrap_or(path) + .to_string() + } +} + +/// Resolve a wikilink `target` to a full note path. Order mirrors the editor's +/// `wikilinks.ts`: +/// 1. Exact path match (after normalizing `\\`→`/`). +/// 2. When `target` is path-qualified (contains `/`), find a note whose +/// slash-normalized path-without-extension equals the target's. +/// This handles `[[dir/note]]` → `dir/note.md` without collapsing to the +/// alphabetically-first note that shares a bare basename. +/// 3. Basename fallback via `by_base`. +pub(crate) fn resolve_link_target( + target: &str, + note_set: &std::collections::BTreeSet<&str>, + by_base: &std::collections::BTreeMap, +) -> Option { + // Normalize backslash to forward slash for consistent matching. + let t_norm = target.replace('\\', "/"); + let t_ref: &str = &t_norm; + + // (1) Exact path match. + if note_set.contains(t_ref) { + return Some(t_norm); + } + + // (2) Path-qualified: find a note whose path-without-ext (slash-normalized) + // equals the target's path-without-ext. + if t_norm.contains('/') { + let t_ne = path_without_ext(t_ref); + for &p in note_set.iter() { + let p_norm = p.replace('\\', "/"); + if path_without_ext(&p_norm) == t_ne { + return Some(p.to_string()); + } + } + } + + // (3) Basename fallback. + by_base.get(&basename(t_ref)).cloned() +} + +#[cfg(test)] +mod tests { + use std::collections::{BTreeMap, BTreeSet}; + + use super::resolve_link_target; + + #[test] + fn exact_path_match() { + // "b/note.md" exists verbatim — must return it, not "a/note.md". + let notes = ["a/note.md".to_string(), "b/note.md".to_string()]; + let note_set: BTreeSet<&str> = notes.iter().map(|s| s.as_str()).collect(); + let mut by_base: BTreeMap = BTreeMap::new(); + by_base.insert("note".to_string(), "a/note.md".to_string()); + + assert_eq!( + resolve_link_target("b/note.md", ¬e_set, &by_base).as_deref(), + Some("b/note.md") + ); + } + + #[test] + fn path_qualified_resolves_correct_note_not_alphabetical_first() { + // "[[b/note]]" (no extension) must resolve to "b/note.md", NOT "a/note.md". + // by_base["note"] = "a/note.md" (first alphabetically — the collision + // that previously caused the bug). + let notes = ["a/note.md".to_string(), "b/note.md".to_string()]; + let note_set: BTreeSet<&str> = notes.iter().map(|s| s.as_str()).collect(); + let mut by_base: BTreeMap = BTreeMap::new(); + by_base.insert("note".to_string(), "a/note.md".to_string()); + + assert_eq!( + resolve_link_target("b/note", ¬e_set, &by_base).as_deref(), + Some("b/note.md"), + "path-qualified target must not collapse to the alphabetically-first basename match" + ); + } + + #[test] + fn bare_basename_unique_fallback() { + // No path component → falls through to by_base. + let notes = ["dir/note.md".to_string()]; + let note_set: BTreeSet<&str> = notes.iter().map(|s| s.as_str()).collect(); + let mut by_base: BTreeMap = BTreeMap::new(); + by_base.insert("note".to_string(), "dir/note.md".to_string()); + + assert_eq!( + resolve_link_target("note", ¬e_set, &by_base).as_deref(), + Some("dir/note.md") + ); + } +} diff --git a/crates/aingle_cortex/src/service/triples.rs b/crates/aingle_cortex/src/service/triples.rs index 28ae7923..6220b409 100644 --- a/crates/aingle_cortex/src/service/triples.rs +++ b/crates/aingle_cortex/src/service/triples.rs @@ -28,29 +28,43 @@ pub async fn create_triple( req: CreateTripleRequest, namespace: Option, ) -> Result { - // Validate input if req.subject.is_empty() { return Err(Error::InvalidInput("Subject cannot be empty".to_string())); } if req.predicate.is_empty() { return Err(Error::InvalidInput("Predicate cannot be empty".to_string())); } + insert_triple_inner( + state, + req.object, + &req.subject, + &req.predicate, + None, + namespace, + ) + .await +} - let object: Value = req.object.clone().into(); - - // Create the triple - let triple = Triple::new( - NodeId::named(&req.subject), - Predicate::named(&req.predicate), - object, - ); +/// Shared single-triple write used by `create_triple` and the ingestion path. +/// `object_dto` is serialized into the DAG payload exactly as the REST path does, +/// so triple IDs / DAG replay stay byte-compatible. `provenance`, when present, +/// is attached to the signed `TripleInsert` payload. +pub async fn insert_triple_inner( + state: &AppState, + object_dto: crate::rest::ValueDto, + subject: &str, + predicate: &str, + #[cfg(feature = "dag")] provenance: Option, + #[cfg(not(feature = "dag"))] _provenance: Option<()>, + namespace: Option, +) -> Result { + let object: Value = object_dto.clone().into(); + let triple = Triple::new(NodeId::named(subject), Predicate::named(predicate), object); - // Add triple to graph (and record DAG action if enabled) let triple_id = { let graph = state.graph.read().await; let id = graph.insert(triple.clone())?; - // Record in DAG if enabled #[cfg(feature = "dag")] if let Some(dag_store) = graph.dag_store() { let dag_author = state @@ -69,9 +83,10 @@ pub async fn create_triple( timestamp: chrono::Utc::now(), payload: aingle_graph::dag::DagPayload::TripleInsert { triples: vec![aingle_graph::dag::TripleInsertPayload { - subject: req.subject.clone(), - predicate: req.predicate.clone(), - object: serde_json::to_value(&req.object).unwrap_or_default(), + subject: subject.to_string(), + predicate: predicate.to_string(), + object: serde_json::to_value(&object_dto).unwrap_or_default(), + provenance, }], }, signature: None, @@ -91,7 +106,6 @@ pub async fn create_triple( id }; - // Record audit entry { let mut audit = state.audit_log.write().await; audit.record(AuditEntry { @@ -100,17 +114,16 @@ pub async fn create_triple( namespace, action: "create".to_string(), resource: format!("/api/v1/triples/{}", triple_id.to_hex()), - details: Some(format!("subject={}", req.subject)), + details: Some(format!("subject={}", subject)), request_id: None, }); } - // Broadcast event state.broadcaster.broadcast(Event::TripleAdded { hash: triple_id.to_hex(), - subject: req.subject, - predicate: req.predicate, - object: serde_json::to_value(&req.object).unwrap_or_default(), + subject: subject.to_string(), + predicate: predicate.to_string(), + object: serde_json::to_value(&object_dto).unwrap_or_default(), }); Ok(triple.into()) @@ -485,6 +498,51 @@ mod tests { assert!(matches!(err, Error::NotFound(_))); } + #[cfg(feature = "dag")] + #[tokio::test] + async fn inner_write_records_provenance_in_dag() { + use aingle_graph::dag::{DagPayload, Provenance}; + + let state = AppState::with_db_path(":memory:", None).unwrap(); + { + let mut graph = state.graph.write().await; + graph.enable_dag(); + } + + let prov = Provenance { + source_path: "docs/x.md".into(), + line_start: 4, + line_end: 4, + content_hash: "abc123".into(), + }; + insert_triple_inner( + &state, + crate::rest::ValueDto::Node { + node: "sled".into(), + }, + "docs/x.md", + "links_to", + Some(prov.clone()), + None, + ) + .await + .unwrap(); + + // The DAG action affecting subject "docs/x.md" must carry the provenance. + let graph = state.graph.read().await; + let actions = graph.dag_history_by_subject("docs/x.md", 10).unwrap(); + let found = actions.iter().any(|a| match &a.payload { + DagPayload::TripleInsert { triples } => { + triples.iter().any(|t| t.provenance.as_ref() == Some(&prov)) + } + _ => false, + }); + assert!( + found, + "provenance must be present in the TripleInsert DAG payload" + ); + } + #[tokio::test] async fn list_triples_returns_inserted() { let state = AppState::with_db_path(":memory:", None).unwrap(); diff --git a/crates/aingle_cortex/src/service/vault_map.rs b/crates/aingle_cortex/src/service/vault_map.rs new file mode 100644 index 00000000..a68449e0 --- /dev/null +++ b/crates/aingle_cortex/src/service/vault_map.rs @@ -0,0 +1,1245 @@ +// Copyright 2019-2026 Apilium Technologies OÜ. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR Commercial + +//! Vault Map: a deterministic, offline map + navigation manual derived from the +//! semantic graph (links/tags/types) and neural embeddings (semantic topics). + +use serde::Serialize; +use std::collections::BTreeMap; + +use crate::service::triple_util::{basename, obj_string, strip_brackets}; + +/// The full vault map returned to the UI and the connected AI. +#[derive(Debug, Clone, Serialize, Default)] +pub struct VaultMap { + pub totals: Totals, + pub entry_points: Vec, + pub topics: Vec, + pub tag_clusters: Vec, + pub orphans: Vec, + pub tags: Vec, + pub types: Vec, + pub graph: GraphView, + pub guidance: String, + /// Path to the user's identity note (`me.md`) if present — read this first. + pub identity: Option, + /// Note paths tagged as reusable skills/processes (the "skill map"). + pub skills: Vec, +} + +#[derive(Debug, Clone, Serialize, Default)] +pub struct Totals { + pub notes: usize, + pub links: usize, + pub clusters: usize, + pub orphans: usize, +} + +#[derive(Debug, Clone, Serialize)] +pub struct EntryPoint { + pub path: String, + pub title: String, + pub in_links: usize, + pub out_links: usize, +} + +#[derive(Debug, Clone, Serialize)] +pub struct Topic { + pub id: usize, + pub label: String, + pub representative: String, + pub notes: Vec, + pub size: usize, +} + +#[derive(Debug, Clone, Serialize)] +pub struct TagGroup { + pub tag: String, + pub notes: Vec, +} + +#[derive(Debug, Clone, Serialize)] +pub struct TagCount { + pub tag: String, + pub count: usize, +} + +#[derive(Debug, Clone, Serialize)] +pub struct TypeCount { + pub ty: String, + pub count: usize, +} + +#[derive(Debug, Clone, Serialize, Default)] +pub struct GraphView { + pub nodes: Vec, + pub edges: Vec, +} + +#[derive(Debug, Clone, Serialize)] +pub struct GraphNode { + pub id: String, + pub label: String, + pub cluster: i64, + pub degree: usize, + /// Creation date sourced from the note's `created` frontmatter scalar (e.g. `"2025-09-14"`). + /// `None` when the note has no `created` triple. + pub timestamp: Option, +} + +#[derive(Debug, Clone, Serialize)] +pub struct GraphEdge { + pub source: String, + pub target: String, + /// Edge type: `"link"` for explicit wikilinks, `"semantic"` for cosine-similar pairs + /// discovered during topic clustering. + pub kind: String, +} + +/// Max nodes rendered in the visual graph (top-degree); larger vaults are capped. +const GRAPH_NODE_CAP: usize = 600; + +/// Hard cap on semantic edges in the graph view. Dense clusters can produce O(n²) +/// pairs; beyond this limit only the highest-cosine pairs are kept (the sorting +/// happens inside `compute_vault_map` before truncation). +const SEMANTIC_EDGE_CAP: usize = 1200; + +/// Maximum semantic neighbors each node may contribute via its own top-K ranking. +/// The final edge set is the UNION of all per-node top-K choices (so a strongly +/// similar pair survives even if only one endpoint nominated the other). This bounds +/// the edge count to roughly `N × SEMANTIC_EDGES_PER_NODE` instead of `O(N²)`, +/// preventing hairballs in themed vaults where most notes are mutually similar. +const SEMANTIC_EDGES_PER_NODE: usize = 3; + +/// Tags (case-insensitive) that mark a note as a reusable skill/process. +const SKILL_TAGS: [&str; 6] = ["skill", "process", "sop", "workflow", "how-to", "howto"]; + +/// True for paths under the generated maps folder (excluded from the vault map). +pub(crate) fn is_maps_path(path: &str) -> bool { + path.starts_with("_maps/") || path.starts_with("_maps\\") +} + +/// Structural inputs derived from the graph (no embeddings). +#[derive(Debug, Default)] +pub(crate) struct Structural { + pub notes: Vec, // note rel_paths, sorted + pub in_deg: BTreeMap, // note -> incoming resolved links + pub out_deg: BTreeMap, // note -> outgoing resolved links + pub edges: Vec<(String, String)>, // resolved (src note, dst note) + pub tag_notes: BTreeMap>, // tag -> notes + pub type_counts: BTreeMap, // type -> count + pub link_count: usize, // total resolved links +} + +pub(crate) fn derive_structural(graph: &aingle_graph::GraphDB) -> Structural { + use aingle_graph::{Predicate, TriplePattern}; + + let find = |pred: &str| -> Vec<(String, String)> { + graph + .find(TriplePattern::any().with_predicate(Predicate::named(pred))) + .unwrap_or_default() + .into_iter() + .filter_map(|t| { + let subj = strip_brackets(&t.subject.to_string()).to_string(); + obj_string(&t).map(|o| (subj, o)) + }) + .collect() + }; + + // Note set from the source-hash registry. + let mut notes: Vec = find(crate::service::ingest::PRED_SOURCE_HASH) + .into_iter() + .map(|(s, _)| s) + .collect(); + notes.sort(); + notes.dedup(); + notes.retain(|n| !is_maps_path(n)); + + // O(log n) membership set — avoids linear scans during link/tag resolution. + let note_set: std::collections::BTreeSet<&str> = notes.iter().map(|s| s.as_str()).collect(); + + // Basename -> note path index for wikilink resolution. + let mut by_base: BTreeMap = BTreeMap::new(); + for n in ¬es { + by_base.entry(basename(n)).or_insert_with(|| n.clone()); + } + let resolve = |target: &str| -> Option { + // exact path first, else basename match + if note_set.contains(target) { + Some(target.to_string()) + } else { + by_base.get(&basename(target)).cloned() + } + }; + + let mut in_deg: BTreeMap = BTreeMap::new(); + let mut out_deg: BTreeMap = BTreeMap::new(); + let mut edges: Vec<(String, String)> = Vec::new(); + for (src, target) in find("links_to") { + if !note_set.contains(src.as_str()) { + continue; + } + if let Some(dst) = resolve(&target) { + if dst == src { + continue; + } + *out_deg.entry(src.clone()).or_default() += 1; + *in_deg.entry(dst.clone()).or_default() += 1; + edges.push((src, dst)); + } + } + let link_count = edges.len(); + + let mut tag_notes: BTreeMap> = BTreeMap::new(); + for (note, tag) in find("tagged") { + if note_set.contains(note.as_str()) { + tag_notes.entry(tag).or_default().push(note); + } + } + for v in tag_notes.values_mut() { + v.sort(); + v.dedup(); + } + + let mut type_counts: BTreeMap = BTreeMap::new(); + for (_note, ty) in find("type") { + *type_counts.entry(ty).or_default() += 1; + } + + Structural { + notes, + in_deg, + out_deg, + edges, + tag_notes, + type_counts, + link_count, + } +} + +/// From a sorted-descending list of `(a, b, cosine)` pairs (with `a ≤ b` and no +/// duplicates), return the canonical `(String, String) → cosine` map for the +/// top-`k` semantic neighbors of every node (union semantics: a pair is kept if +/// EITHER endpoint ranked the other in its top-`k`). +/// +/// Because the input is sorted desc by cosine and we iterate in that order, each +/// per-node accumulator is also sorted desc — so `take(k)` yields the top-k without +/// an additional per-node sort. +fn top_k_semantic_pairs<'a>( + candidates: &'a [(String, String, f32)], + k: usize, +) -> BTreeMap<(String, String), f32> { + // Accumulate per-node (partner, cosine) lists in global cosine-desc order. + let mut per_node: BTreeMap<&'a str, Vec<(&'a str, f32)>> = BTreeMap::new(); + for (a, b, c) in candidates { + per_node + .entry(a.as_str()) + .or_default() + .push((b.as_str(), *c)); + per_node + .entry(b.as_str()) + .or_default() + .push((a.as_str(), *c)); + } + // Union: an ordered pair (min, max) is kept if EITHER endpoint selects the other. + let mut chosen: BTreeMap<(String, String), f32> = BTreeMap::new(); + for (node, partners) in &per_node { + for (partner, c) in partners.iter().take(k) { + let key = if *node <= *partner { + (node.to_string(), partner.to_string()) + } else { + (partner.to_string(), node.to_string()) + }; + // `or_insert`: the first insertion for any pair holds the correct cosine + // because we iterate globally in desc order (highest cosine first). + chosen.entry(key).or_insert(*c); + } + } + chosen +} + +/// Cosine similarity between two raw vectors (same length). +fn cosine(a: &[f32], b: &[f32]) -> f32 { + if a.len() != b.len() || a.is_empty() { + return 0.0; + } + let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum(); + let ma = a.iter().map(|x| x * x).sum::().sqrt(); + let mb = b.iter().map(|x| x * x).sum::().sqrt(); + if ma == 0.0 || mb == 0.0 { + 0.0 + } else { + dot / (ma * mb) + } +} + +/// Connected-components clustering over a cosine-similarity graph: notes whose +/// cosine >= `threshold` are linked; each connected component is a topic. Labeled +/// by the most central note (highest mean cosine to its component). Deterministic +/// (inputs are a sorted BTreeMap). O(n^2) — the caller caps n. +/// +/// Returns `(topics, sem_pairs)` where `sem_pairs` is the list of +/// `(note_a, note_b, cosine)` pairs that met the threshold. These are captured +/// during the union-find pass so no additional O(n²) scan is needed. +pub(crate) fn cluster_semantic( + vecs: &BTreeMap>, + threshold: f32, +) -> (Vec, Vec<(String, String, f32)>) { + let names: Vec<&String> = vecs.keys().collect(); + let n = names.len(); + // union-find + let mut parent: Vec = (0..n).collect(); + fn find(parent: &mut [usize], mut x: usize) -> usize { + while parent[x] != x { + parent[x] = parent[parent[x]]; + x = parent[x]; + } + x + } + // Pairs above threshold — captured here so `compute_vault_map` can emit + // semantic edges without an additional O(n²) pass. + let mut sem_pairs: Vec<(String, String, f32)> = Vec::new(); + for i in 0..n { + for j in (i + 1)..n { + let c = cosine(&vecs[names[i]], &vecs[names[j]]); + if c >= threshold { + let (ri, rj) = (find(&mut parent, i), find(&mut parent, j)); + if ri != rj { + parent[ri] = rj; + } + sem_pairs.push((names[i].clone(), names[j].clone(), c)); + } + } + } + // group by root + let mut groups: BTreeMap> = BTreeMap::new(); + for i in 0..n { + let r = find(&mut parent, i); + groups.entry(r).or_default().push(i); + } + let mut topics: Vec = Vec::new(); + for (id, (_root, members)) in groups.into_iter().enumerate() { + // central note = max mean cosine to the rest of its group + let central = *members + .iter() + .max_by(|&&x, &&y| { + let mx = mean_sim(x, &members, &names, vecs); + let my = mean_sim(y, &members, &names, vecs); + mx.partial_cmp(&my).unwrap_or(std::cmp::Ordering::Equal) + }) + .unwrap(); + let mut notes: Vec = members.iter().map(|&m| names[m].clone()).collect(); + notes.sort(); + let rep = names[central].clone(); + topics.push(Topic { + id, + label: basename(&rep), + representative: rep, + size: notes.len(), + notes, + }); + } + topics.sort_by(|a, b| b.size.cmp(&a.size).then(a.label.cmp(&b.label))); + (topics, sem_pairs) +} + +fn mean_sim( + self_idx: usize, + members: &[usize], + names: &[&String], + vecs: &BTreeMap>, +) -> f32 { + if members.len() <= 1 { + return 1.0; + } + let v = &vecs[names[self_idx]]; + let mut sum = 0.0; + let mut cnt = 0; + for &m in members { + if m == self_idx { + continue; + } + sum += cosine(v, &vecs[names[m]]); + cnt += 1; + } + if cnt == 0 { + 1.0 + } else { + sum / cnt as f32 + } +} + +/// Mean per-note embedding from Ineru `doc_chunk` entries, grouped by source_path. +pub(crate) fn per_note_vectors(mem: &ineru::IneruMemory) -> BTreeMap> { + let mut sums: BTreeMap, usize)> = BTreeMap::new(); + let mut entries = mem.stm.all_entries(); + entries.extend(mem.ltm.all_entries()); + for e in entries { + if e.entry_type != crate::service::ingest::CHUNK_ENTRY_TYPE { + continue; + } + let Some(path) = e.data.get("source_path").and_then(|v| v.as_str()) else { + continue; + }; + let Some(emb) = e.embedding.as_ref() else { + continue; + }; + let entry = sums + .entry(path.to_string()) + .or_insert_with(|| (vec![0.0; emb.0.len()], 0)); + if entry.0.len() == emb.0.len() { + for (acc, x) in entry.0.iter_mut().zip(&emb.0) { + *acc += *x; + } + entry.1 += 1; + } + } + sums.into_iter() + .filter(|(_, (_, c))| *c > 0) + .map(|(p, (mut v, c))| { + for x in &mut v { + *x /= c as f32; + } + (p, v) + }) + .collect() +} + +/// Cosine threshold for semantic topic membership (note-level mean vectors). +/// Calibrated for E5; the hash embedder produces denser similarities but topics +/// remain a useful secondary facet. +const SEMANTIC_THRESHOLD: f32 = 0.88; +/// Above this note count, skip O(n^2) semantic clustering (tag clusters remain). +const SEMANTIC_NOTE_CAP: usize = 2000; + +/// Compute the full vault map (uncached). +pub async fn compute_vault_map(state: &crate::state::AppState) -> VaultMap { + let s = { + let g = state.graph.read().await; + derive_structural(&g) + }; + + // Hubs / entry points: top by in-degree, tie-break out-degree. + let mut entry_points: Vec = s + .notes + .iter() + .map(|p| EntryPoint { + path: p.clone(), + title: basename(p), + in_links: s.in_deg.get(p).copied().unwrap_or(0), + out_links: s.out_deg.get(p).copied().unwrap_or(0), + }) + .collect(); + entry_points.sort_by(|a, b| { + b.in_links + .cmp(&a.in_links) + .then(b.out_links.cmp(&a.out_links)) + .then(a.path.cmp(&b.path)) + }); + entry_points.retain(|e| e.in_links > 0 || e.out_links > 0); + entry_points.truncate(20); + + // Orphans. + let orphans: Vec = s + .notes + .iter() + .filter(|p| { + s.in_deg.get(*p).copied().unwrap_or(0) == 0 + && s.out_deg.get(*p).copied().unwrap_or(0) == 0 + }) + .cloned() + .collect(); + + // Semantic topics (capped) + raw pairs for semantic-edge emission. + // `raw_sem_pairs` holds (note_a, note_b, cosine) captured during the O(n²) + // union-find pass — no additional scan is needed to produce semantic edges. + let (topics, raw_sem_pairs) = if s.notes.len() <= SEMANTIC_NOTE_CAP { + let mem = state.memory.read().await; + let all_vecs = per_note_vectors(&mem); + let vecs: std::collections::BTreeMap> = all_vecs + .into_iter() + .filter(|(p, _)| s.notes.iter().any(|n| n == p)) + .collect(); + if vecs.len() >= 2 { + cluster_semantic(&vecs, SEMANTIC_THRESHOLD) + } else { + (Vec::new(), Vec::new()) + } + } else { + log::info!( + "vault_map: {} notes > cap {}, skipping semantic clustering (tag clusters used)", + s.notes.len(), + SEMANTIC_NOTE_CAP + ); + (Vec::new(), Vec::new()) + }; + + // Tag clusters + tag index. + let mut tag_clusters: Vec = s + .tag_notes + .iter() + .map(|(tag, notes)| TagGroup { + tag: tag.clone(), + notes: notes.clone(), + }) + .collect(); + tag_clusters.sort_by(|a, b| b.notes.len().cmp(&a.notes.len()).then(a.tag.cmp(&b.tag))); + let mut tags: Vec = s + .tag_notes + .iter() + .map(|(tag, notes)| TagCount { + tag: tag.clone(), + count: notes.len(), + }) + .collect(); + tags.sort_by(|a, b| b.count.cmp(&a.count).then(a.tag.cmp(&b.tag))); + + let mut types: Vec = s + .type_counts + .iter() + .map(|(ty, count)| TypeCount { + ty: ty.clone(), + count: *count, + }) + .collect(); + types.sort_by(|a, b| b.count.cmp(&a.count).then(a.ty.cmp(&b.ty))); + + // Cluster id per note (for graph coloring). + let mut cluster_of: BTreeMap = BTreeMap::new(); + for t in &topics { + for npath in &t.notes { + cluster_of.insert(npath.clone(), t.id as i64); + } + } + + // Build created-date map: note_path → date string, from "created" triples. + // Falls back to "date" when "created" is absent for a given note. + let created: BTreeMap = { + use aingle_graph::{Predicate, TriplePattern}; + let g = state.graph.read().await; + let collect_pred = |pred: &str| -> BTreeMap { + g.find(TriplePattern::any().with_predicate(Predicate::named(pred))) + .unwrap_or_default() + .into_iter() + .filter_map(|t| { + let subj = strip_brackets(&t.subject.to_string()).to_string(); + obj_string(&t).map(|o| (subj, o)) + }) + .collect() + }; + let mut map = collect_pred("date"); + // "created" takes precedence: overwrite any "date" entry. + for (k, v) in collect_pred("created") { + map.insert(k, v); + } + map + }; + + // GraphView (cap by degree). + let mut ranked: Vec<&String> = s.notes.iter().collect(); + ranked.sort_by(|a, b| { + let da = s.in_deg.get(*a).copied().unwrap_or(0) + s.out_deg.get(*a).copied().unwrap_or(0); + let db = s.in_deg.get(*b).copied().unwrap_or(0) + s.out_deg.get(*b).copied().unwrap_or(0); + db.cmp(&da).then(a.cmp(b)) + }); + if s.notes.len() > GRAPH_NODE_CAP { + log::info!( + "vault_map: {} notes > graph cap {}, rendering the {} most-connected", + s.notes.len(), + GRAPH_NODE_CAP, + GRAPH_NODE_CAP + ); + } + let kept: std::collections::BTreeSet = + ranked.into_iter().take(GRAPH_NODE_CAP).cloned().collect(); + let nodes: Vec = kept + .iter() + .map(|p| GraphNode { + id: p.clone(), + label: basename(p), + cluster: cluster_of.get(p).copied().unwrap_or(-1), + degree: s.in_deg.get(p).copied().unwrap_or(0) + s.out_deg.get(p).copied().unwrap_or(0), + timestamp: created.get(p).cloned(), + }) + .collect(); + // Link edges (explicit wikilinks), typed "link". + let mut edges: Vec = s + .edges + .iter() + .filter(|(a, b)| kept.contains(a) && kept.contains(b)) + .map(|(a, b)| GraphEdge { + source: a.clone(), + target: b.clone(), + kind: "link".into(), + }) + .collect(); + + // Semantic edges — per-node top-K selection with union semantics. + // Replaces the old "every pair ≥ threshold" approach that produced hairballs + // on themed vaults. Each node contributes at most SEMANTIC_EDGES_PER_NODE edges + // from its own ranking; the final set is the UNION of all per-node choices so + // no node becomes isolated. Total edges ≈ N × K instead of O(N²). + // + // Rules (unchanged): + // 1. Both endpoints must be in the rendered node set (`kept`). + // 2. Skip pairs already covered by an explicit wikilink edge (order-insensitive). + // 3. Keep the highest-cosine chosen pairs up to SEMANTIC_EDGE_CAP. + { + // Canonical (min, max) keys for dedup against existing link edges. + let link_pair_set: std::collections::BTreeSet<(String, String)> = s + .edges + .iter() + .map(|(a, b)| { + if a <= b { + (a.clone(), b.clone()) + } else { + (b.clone(), a.clone()) + } + }) + .collect(); + + // Normalise pair order, filter to the rendered node set, sort by cosine desc. + let mut candidates: Vec<(String, String, f32)> = raw_sem_pairs + .into_iter() + .filter(|(a, b, _)| kept.contains(a) && kept.contains(b)) + .map(|(a, b, c)| if a <= b { (a, b, c) } else { (b, a, c) }) + .collect(); + candidates.sort_by(|x, y| y.2.partial_cmp(&x.2).unwrap_or(std::cmp::Ordering::Equal)); + + // Per-node top-K with union semantics → O(N·K) edges instead of O(N²). + let chosen = top_k_semantic_pairs(&candidates, SEMANTIC_EDGES_PER_NODE); + + // Sort by cosine desc so SEMANTIC_EDGE_CAP retains the highest-quality edges. + let mut chosen_sorted: Vec<((String, String), f32)> = chosen.into_iter().collect(); + chosen_sorted.sort_by(|x, y| y.1.partial_cmp(&x.1).unwrap_or(std::cmp::Ordering::Equal)); + + let mut sem_count = 0usize; + for ((a, b), _c) in chosen_sorted { + if sem_count >= SEMANTIC_EDGE_CAP { + break; + } + if link_pair_set.contains(&(a.clone(), b.clone())) { + continue; + } + edges.push(GraphEdge { + source: a, + target: b, + kind: "semantic".into(), + }); + sem_count += 1; + } + } + + // `totals.links` counts only explicit wikilinks (s.link_count), not semantic edges. + let totals = Totals { + notes: s.notes.len(), + links: s.link_count, + clusters: topics.len(), + orphans: orphans.len(), + }; + + // Identity: the root `me.md` (exact rel_path), read first by the AI. + let identity = s + .notes + .iter() + .find(|n| n.as_str() == "me.md" || n.as_str() == "me.markdown") + .cloned(); + + // Skills: notes tagged with any SKILL_TAGS value (case-insensitive). + let mut skills: Vec = Vec::new(); + for (tag, notes) in &s.tag_notes { + if SKILL_TAGS.contains(&tag.to_lowercase().as_str()) { + skills.extend(notes.iter().cloned()); + } + } + skills.sort(); + skills.dedup(); + + let guidance = if totals.notes == 0 { + "Vault not yet indexed. Once notes are ingested, this map lists entry-point (hub) \ + notes, topic clusters, and orphans so you can navigate accurately." + .to_string() + } else { + let mut g = String::new(); + if identity.is_some() { + g.push_str("Read me.md first for the user's identity and preferences. "); + } + g.push_str(&format!( + "This vault has {} notes, {} links, {} topics, {} orphans. To answer about a topic, \ + start at its entry_points and the topic's representative note, then follow links. \ + Ground every claim with aingle_ground (it returns signed provenance). Orphan notes \ + are unconnected and may be incomplete.", + totals.notes, totals.links, totals.clusters, totals.orphans + )); + if !skills.is_empty() { + g.push_str(" Follow the skill notes (skill-map) for the user's documented processes."); + } + g + }; + + VaultMap { + totals, + entry_points, + topics, + tag_clusters, + orphans, + tags, + types, + graph: GraphView { nodes, edges }, + guidance, + identity, + skills, + } +} + +/// Cached vault map, keyed on `(graph triple_count, memory bytes)`. The graph +/// count invalidates on structural change; the memory-bytes signal invalidates +/// when chunk content/embeddings change even if the triple count is unchanged +/// (e.g. a same-structure prose edit) — so semantic topics don't go stale. +pub async fn vault_map_cached(state: &crate::state::AppState) -> VaultMap { + let tc = { state.graph.read().await.stats().triple_count }; + let mem_bytes = { state.memory.read().await.stats().total_memory_bytes }; + let key = (tc, mem_bytes); + { + let cache = state + .vault_map_cache + .lock() + .expect("vault_map cache poisoned"); + if let Some((cached_key, map)) = cache.as_ref() { + if *cached_key == key { + return map.clone(); + } + } + } + // The cache mutex is intentionally released before the async compute to avoid + // holding it across an `.await` point. + let map = compute_vault_map(state).await; + let mut cache = state + .vault_map_cache + .lock() + .expect("vault_map cache poisoned"); + *cache = Some((key, map.clone())); + map +} + +#[cfg(test)] +mod tests { + use super::*; + use aingle_graph::{NodeId, Predicate, Triple, Value}; + + pub(super) async fn graph_with(triples: &[(&str, &str, &str)]) -> crate::state::AppState { + let state = crate::state::AppState::with_db_path(":memory:", None).unwrap(); + { + let g = state.graph.write().await; + for (s, p, o) in triples { + g.insert(Triple::new( + NodeId::named(*s), + Predicate::named(*p), + Value::literal(*o), + )) + .unwrap(); + } + } + state + } + + #[tokio::test] + async fn structural_hubs_orphans_tags() { + // a.md and b.md both link to hub.md; orphan.md links to nothing and is + // linked by nothing. Tags group a.md + b.md under "storage". + let state = graph_with(&[ + ("a.md", "aingle:source_hash", "h1"), + ("b.md", "aingle:source_hash", "h2"), + ("hub.md", "aingle:source_hash", "h3"), + ("orphan.md", "aingle:source_hash", "h4"), + ("a.md", "links_to", "hub"), + ("b.md", "links_to", "hub"), + // self-link: "a" resolves to "a.md" via basename → must be skipped + ("a.md", "links_to", "a"), + ("a.md", "tagged", "storage"), + ("b.md", "tagged", "storage"), + ("a.md", "type", "note"), + ]) + .await; + + let s = { + let g = state.graph.read().await; + super::derive_structural(&g) + }; + assert_eq!(s.notes.len(), 4); + assert_eq!( + s.in_deg.get("hub.md").copied().unwrap_or(0), + 2, + "hub has 2 incoming" + ); + assert_eq!(s.out_deg.get("a.md").copied().unwrap_or(0), 1); + assert_eq!(s.tag_notes.get("storage").map(|v| v.len()), Some(2)); + assert_eq!(s.link_count, 2); + // Self-link must not be counted as incoming for a.md. + assert_eq!( + s.in_deg.get("a.md").copied().unwrap_or(0), + 0, + "self-link must not count as incoming" + ); + // type_counts must reflect the triple ("a.md","type","note"). + assert_eq!(s.type_counts.get("note"), Some(&1)); + } + + #[test] + fn semantic_clusters_group_similar_notes() { + // Three notes: a & b have near-identical vectors, c is far. + let mut vecs: BTreeMap> = BTreeMap::new(); + vecs.insert("a.md".into(), vec![1.0, 0.0, 0.0]); + vecs.insert("b.md".into(), vec![0.99, 0.01, 0.0]); + vecs.insert("c.md".into(), vec![0.0, 0.0, 1.0]); + + let (topics, sem_pairs) = super::cluster_semantic(&vecs, 0.9); + // a & b together, c alone → 2 topics + assert_eq!(topics.len(), 2); + let big = topics.iter().max_by_key(|t| t.size).unwrap(); + assert_eq!(big.size, 2); + assert!(big.notes.contains(&"a.md".to_string()) && big.notes.contains(&"b.md".to_string())); + // The pair (a.md, b.md) must be captured in sem_pairs with cosine ≥ 0.9. + assert!( + sem_pairs.iter().any(|(a, b, c)| { + ((a == "a.md" && b == "b.md") || (a == "b.md" && b == "a.md")) && *c >= 0.9 + }), + "sem_pairs must contain (a.md, b.md) pair: {:?}", + sem_pairs + ); + } + + #[tokio::test] + async fn vault_map_cached_assembles_and_caches() { + let state = graph_with(&[ + ("a.md", "aingle:source_hash", "h1"), + ("hub.md", "aingle:source_hash", "h2"), + ("orphan.md", "aingle:source_hash", "h3"), + ("a.md", "links_to", "hub"), + ("a.md", "tagged", "storage"), + ]) + .await; + + let m1 = super::vault_map_cached(&state).await; + assert_eq!(m1.totals.notes, 3); + assert_eq!(m1.totals.links, 1); + assert_eq!(m1.totals.orphans, 1); // orphan.md + assert!(m1 + .entry_points + .iter() + .any(|e| e.path == "hub.md" && e.in_links == 1)); + assert!(m1.tag_clusters.iter().any(|t| t.tag == "storage")); + assert!(!m1.guidance.is_empty()); + assert!(!m1.graph.nodes.is_empty()); + + // Cached: no graph change → identical totals (and cheap). + let m2 = super::vault_map_cached(&state).await; + assert_eq!(m2.totals.notes, m1.totals.notes); + } + + #[tokio::test] + async fn excludes_maps_folder_notes() { + let state = graph_with(&[ + ("real.md", "aingle:source_hash", "h1"), + ("hub.md", "aingle:source_hash", "h2"), + ("_maps/vault-map.md", "aingle:source_hash", "h3"), + ("_maps/vault-map.md", "links_to", "hub"), + ("real.md", "links_to", "hub"), + ]) + .await; + + let map = super::vault_map_cached(&state).await; + assert_eq!(map.totals.notes, 2, "_maps/ notes excluded from the count"); + assert!(!map.graph.nodes.iter().any(|n| n.id.starts_with("_maps/"))); + assert!(!map + .entry_points + .iter() + .any(|e| e.path.starts_with("_maps/"))); + let hub = map + .entry_points + .iter() + .find(|e| e.path == "hub.md") + .expect("hub"); + assert_eq!(hub.in_links, 1, "the _maps link to hub must be excluded"); + } + + #[tokio::test] + async fn detects_identity_and_skills() { + let state = graph_with(&[ + ("me.md", "aingle:source_hash", "h0"), + ("note.md", "aingle:source_hash", "h1"), + ("deploy.md", "aingle:source_hash", "h2"), + ("writing.md", "aingle:source_hash", "h3"), + ("deploy.md", "tagged", "sop"), + ("writing.md", "tagged", "process"), + ("note.md", "tagged", "misc"), + ]) + .await; + + let map = super::vault_map_cached(&state).await; + assert_eq!(map.identity.as_deref(), Some("me.md")); + assert!(map.skills.contains(&"deploy.md".to_string())); + assert!(map.skills.contains(&"writing.md".to_string())); + assert!( + !map.skills.contains(&"note.md".to_string()), + "non-skill tag excluded" + ); + assert!( + map.guidance.contains("me.md"), + "guidance points at identity" + ); + } + + #[tokio::test] + async fn links_to_node_objects_are_read() { + // Real ingest stores wikilink targets as Value::Node, not Value::literal. + // All link-counting and hub detection must work for node-valued objects. + let state = crate::state::AppState::with_db_path(":memory:", None).unwrap(); + { + let g = state.graph.write().await; + for (s, p) in [ + ("a.md", "aingle:source_hash"), + ("hub.md", "aingle:source_hash"), + ] { + g.insert(Triple::new( + NodeId::named(s), + Predicate::named(p), + Value::literal("h"), + )) + .unwrap(); + } + // links_to as a NODE object — how real ingest produces it. + g.insert(Triple::new( + NodeId::named("a.md"), + Predicate::named("links_to"), + Value::Node(NodeId::named("hub")), + )) + .unwrap(); + } + let map = super::vault_map_cached(&state).await; + assert_eq!( + map.totals.links, 1, + "node-valued links_to must be counted: {:?}", + map.totals + ); + assert!( + map.entry_points + .iter() + .any(|e| e.path == "hub.md" && e.in_links == 1), + "hub.md must appear as a hub with 1 incoming link: {:?}", + map.entry_points + ); + } + + #[tokio::test] + async fn vault_map_cache_invalidates_on_change() { + let state = graph_with(&[("a.md", "aingle:source_hash", "h1")]).await; + let m1 = super::vault_map_cached(&state).await; + assert_eq!(m1.totals.notes, 1); + { + let g = state.graph.write().await; + g.insert(Triple::new( + NodeId::named("b.md"), + Predicate::named("aingle:source_hash"), + Value::literal("h2"), + )) + .unwrap(); + } + let m2 = super::vault_map_cached(&state).await; + assert_eq!( + m2.totals.notes, 2, + "cache must invalidate when triple_count changes" + ); + } + + // ----------------------------------------------------------------- + // VC-2 Task 2: typed edges + semantic edge emission + // ----------------------------------------------------------------- + + /// Every explicit wikilink must produce a GraphEdge with `kind == "link"`. + #[tokio::test] + async fn link_edges_have_link_kind() { + let state = graph_with(&[ + ("a.md", "aingle:source_hash", "h1"), + ("b.md", "aingle:source_hash", "h2"), + ("a.md", "links_to", "b"), + ]) + .await; + let map = super::vault_map_cached(&state).await; + let edge = map.graph.edges.iter().find(|e| { + (e.source == "a.md" && e.target == "b.md") || (e.source == "b.md" && e.target == "a.md") + }); + let edge = edge.expect("link edge between a.md and b.md must exist"); + assert_eq!(edge.kind, "link", "wikilink edges must carry kind='link'"); + } + + /// Clustering must emit `kind == "semantic"` edges for similar notes, and must + /// NOT duplicate a pair that already has an explicit link edge. + #[tokio::test] + async fn clustering_emits_semantic_edges() { + use ineru::{Embedding, MemoryEntry}; + + // --- variant A: no explicit link; semantic edge must appear ---------------- + let state = graph_with(&[ + ("a.md", "aingle:source_hash", "h1"), + ("b.md", "aingle:source_hash", "h2"), + ]) + .await; + { + let mut mem = state.memory.write().await; + // Identical embeddings → cosine 1.0 ≥ SEMANTIC_THRESHOLD (0.88). + for path in ["a.md", "b.md"] { + let mut e = MemoryEntry::new( + crate::service::ingest::CHUNK_ENTRY_TYPE, + serde_json::json!({ "text": "content", "source_path": path }), + ); + e.embedding = Some(Embedding::new(vec![1.0_f32, 0.0, 0.0])); + mem.remember(e).unwrap(); + } + } + let map = super::compute_vault_map(&state).await; + let sem_ab = map.graph.edges.iter().find(|e| { + e.kind == "semantic" + && ((e.source == "a.md" && e.target == "b.md") + || (e.source == "b.md" && e.target == "a.md")) + }); + assert!( + sem_ab.is_some(), + "semantic edge between a.md and b.md must exist: {:?}", + map.graph.edges + ); + + // --- variant B: also linked explicitly; must not produce a semantic dup --- + let state2 = graph_with(&[ + ("a.md", "aingle:source_hash", "h1"), + ("b.md", "aingle:source_hash", "h2"), + ("a.md", "links_to", "b"), // explicit wikilink + ]) + .await; + { + let mut mem = state2.memory.write().await; + for path in ["a.md", "b.md"] { + let mut e = MemoryEntry::new( + crate::service::ingest::CHUNK_ENTRY_TYPE, + serde_json::json!({ "text": "content", "source_path": path }), + ); + e.embedding = Some(Embedding::new(vec![1.0_f32, 0.0, 0.0])); + mem.remember(e).unwrap(); + } + } + let map2 = super::compute_vault_map(&state2).await; + let edges_ab: Vec<_> = map2 + .graph + .edges + .iter() + .filter(|e| { + (e.source == "a.md" && e.target == "b.md") + || (e.source == "b.md" && e.target == "a.md") + }) + .collect(); + assert_eq!( + edges_ab.len(), + 1, + "a.md-b.md pair must appear exactly once (no semantic dup): {:?}", + map2.graph.edges + ); + assert_eq!( + edges_ab[0].kind, "link", + "the single edge must have kind='link', not 'semantic': {:?}", + edges_ab[0] + ); + } + + // ----------------------------------------------------------------- + // Timestamp field: created triple → GraphNode.timestamp + // ----------------------------------------------------------------- + + /// A note with a `created` triple must surface its date in `GraphNode.timestamp`. + /// A note without a `created` triple must have `timestamp == None`. + #[tokio::test] + async fn graph_node_timestamp_from_created_triple() { + let state = graph_with(&[ + ("a.md", "aingle:source_hash", "h1"), + ("b.md", "aingle:source_hash", "h2"), + ("a.md", "created", "2025-01-02"), + ]) + .await; + + let map = super::compute_vault_map(&state).await; + let node_a = map + .graph + .nodes + .iter() + .find(|n| n.id == "a.md") + .expect("a.md must be in graph"); + assert_eq!( + node_a.timestamp, + Some("2025-01-02".to_string()), + "timestamp must be populated from the created triple" + ); + let node_b = map + .graph + .nodes + .iter() + .find(|n| n.id == "b.md") + .expect("b.md must be in graph"); + assert_eq!( + node_b.timestamp, None, + "node without a created triple must have timestamp=None" + ); + } + + // ----------------------------------------------------------------- + // Per-node top-K semantic edges (hairball reduction) + // ----------------------------------------------------------------- + + /// `top_k_semantic_pairs` selects per-node top-k and applies union semantics. + #[test] + fn top_k_semantic_pairs_selects_union() { + // 3 pairs sorted desc by cosine: + // a picks b (0.99, its highest) + // b picks a (0.99, its highest) + // c picks a (0.95 > 0.91, so c's top-1 is a, NOT b) + // With k=1: union = {(a,b),(a,c)}. (b,c) absent — neither b nor c picks + // the other as its top-1. + let pairs = vec![ + ("a.md".to_string(), "b.md".to_string(), 0.99_f32), + ("a.md".to_string(), "c.md".to_string(), 0.95_f32), + ("b.md".to_string(), "c.md".to_string(), 0.91_f32), + ]; + let chosen = super::top_k_semantic_pairs(&pairs, 1); + assert!( + chosen.contains_key(&("a.md".to_string(), "b.md".to_string())), + "a-b must be chosen (a's and b's top-1)" + ); + assert!( + chosen.contains_key(&("a.md".to_string(), "c.md".to_string())), + "a-c must be chosen (c's top-1 is a)" + ); + assert!( + !chosen.contains_key(&("b.md".to_string(), "c.md".to_string())), + "b-c must be absent: neither b nor c ranks the other as top-1" + ); + assert_eq!(chosen.len(), 2, "exactly 2 pairs with k=1"); + } + + /// Per-node top-K reduces a fully-similar 5-note graph from C(5,2)=10 edges + /// to 9, pruning the (n3.md, n4.md) pair that neither endpoint selects in its + /// top-SEMANTIC_EDGES_PER_NODE. + #[tokio::test] + async fn per_node_top_k_reduces_hairball() { + use ineru::{Embedding, MemoryEntry}; + + // 5 notes, all with identical embeddings → every pair has cosine 1.0 ≥ threshold. + // Old code emits all C(5,2)=10 pairs. New per-node top-3 union emits 9: + // hub picks n1,n2,n3 (its first 3 in sort order); + // n4 picks hub,n1,n2 → (n3,n4) selected by neither endpoint. + let state = graph_with(&[ + ("hub.md", "aingle:source_hash", "h0"), + ("n1.md", "aingle:source_hash", "h1"), + ("n2.md", "aingle:source_hash", "h2"), + ("n3.md", "aingle:source_hash", "h3"), + ("n4.md", "aingle:source_hash", "h4"), + ]) + .await; + { + let mut mem = state.memory.write().await; + for path in ["hub.md", "n1.md", "n2.md", "n3.md", "n4.md"] { + let mut e = MemoryEntry::new( + crate::service::ingest::CHUNK_ENTRY_TYPE, + serde_json::json!({ "text": "content", "source_path": path }), + ); + e.embedding = Some(Embedding::new(vec![1.0_f32, 0.0, 0.0])); + mem.remember(e).unwrap(); + } + } + + let map = super::compute_vault_map(&state).await; + let sem_edges: Vec<_> = map + .graph + .edges + .iter() + .filter(|e| e.kind == "semantic") + .collect(); + + // (a) Clearly-strongest pair is connected. + assert!( + sem_edges.iter().any(|e| { + (e.source == "hub.md" && e.target == "n1.md") + || (e.source == "n1.md" && e.target == "hub.md") + }), + "hub.md-n1.md must be a semantic edge (strongest pair): {:?}", + sem_edges + ); + + // (b) (n3.md, n4.md) is absent: neither endpoint ranks the other in its top-3. + assert!( + !sem_edges.iter().any(|e| { + (e.source == "n3.md" && e.target == "n4.md") + || (e.source == "n4.md" && e.target == "n3.md") + }), + "n3.md-n4.md must be pruned by per-node top-K: {:?}", + sem_edges + ); + + // (c) Total semantic edges are reduced below the old O(n²) full-mesh count. + assert!( + sem_edges.len() < 10, + "per-node top-K must reduce edges below C(5,2)=10, got {}: {:?}", + sem_edges.len(), + sem_edges + ); + + // (d) Exact deterministic count for this naming + identical-vector combination. + assert_eq!( + sem_edges.len(), + 9, + "expected exactly 9 semantic edges with per-node top-3 union: {:?}", + sem_edges + ); + } + + /// `totals.links` must count only explicit wikilinks, not semantic edges. + #[tokio::test] + async fn totals_links_counts_only_explicit() { + use ineru::{Embedding, MemoryEntry}; + + // One explicit link between a and b; a and c are semantically similar but + // not wikilinked. `totals.links` must stay 1. + let state = graph_with(&[ + ("a.md", "aingle:source_hash", "h1"), + ("b.md", "aingle:source_hash", "h2"), + ("c.md", "aingle:source_hash", "h3"), + ("a.md", "links_to", "b"), + ]) + .await; + { + let mut mem = state.memory.write().await; + // a and c share a near-identical embedding → cosine 1.0 ≥ threshold. + for path in ["a.md", "c.md"] { + let mut e = MemoryEntry::new( + crate::service::ingest::CHUNK_ENTRY_TYPE, + serde_json::json!({ "text": "content", "source_path": path }), + ); + e.embedding = Some(Embedding::new(vec![1.0_f32, 0.0, 0.0])); + mem.remember(e).unwrap(); + } + } + let map = super::compute_vault_map(&state).await; + assert_eq!( + map.totals.links, 1, + "totals.links must count only explicit wikilinks, not semantic edges: {:?}", + map.totals + ); + let sem_edges: Vec<_> = map + .graph + .edges + .iter() + .filter(|e| e.kind == "semantic") + .collect(); + assert!( + !sem_edges.is_empty(), + "semantic edges between similar notes must exist even when totals.links is 1" + ); + } +} diff --git a/crates/aingle_cortex/src/state.rs b/crates/aingle_cortex/src/state.rs index ea2c3db5..973274a9 100644 --- a/crates/aingle_cortex/src/state.rs +++ b/crates/aingle_cortex/src/state.rs @@ -5,9 +5,9 @@ use aingle_graph::GraphDB; use aingle_logic::RuleEngine; +use ineru::{Embedder, HashEmbedder, IneruMemory}; use std::path::Path; use std::sync::Arc; -use ineru::IneruMemory; use tokio::sync::RwLock; #[cfg(feature = "auth")] @@ -15,6 +15,30 @@ use crate::auth::UserStore; use crate::proofs::ProofStore; use crate::rest::audit::AuditLog; +// --------------------------------------------------------------------------- +// Cache type aliases (avoid clippy::type_complexity on the struct fields) +// --------------------------------------------------------------------------- + +/// Shared cache type for the vault map. +type VaultMapCache = + std::sync::Mutex>; + +/// Shared cache type for per-note semantic-neighbor contexts. +type NoteContextCache = std::sync::Mutex< + std::collections::HashMap< + (String, usize), + ((usize, usize), crate::service::context::NoteContext), + >, +>; + +/// Shared cache type for per-note local-graph neighborhoods. +type LocalGraphCache = std::sync::Mutex< + std::collections::HashMap< + (String, usize), + ((usize, usize), crate::service::local_graph::LocalGraph), + >, +>; + /// The shared state accessible by all API handlers. /// /// This struct uses `Arc` and `RwLock` to provide safe, concurrent access @@ -27,6 +51,21 @@ pub struct AppState { pub logic: Arc>, /// The Ineru dual-memory system (STM + LTM with consolidation). pub memory: Arc>, + /// The active text embedder (hash fallback or neural). Shared, thread-safe. + pub embedder: std::sync::Arc, + /// Cached vault map, keyed on (graph triple-count, memory bytes) — see + /// service::vault_map::vault_map_cached. + pub vault_map_cache: Arc, + /// Per-note semantic-neighbor cache, keyed by `(note_path, limit)`, storing + /// `(graph_triple_count, total_memory_bytes) → NoteContext`. Invalidated + /// whenever the graph or memory changes — same staleness signal as + /// vault_map_cache. `limit` is part of the key so that MCP calls with + /// different limits do not serve stale neighbor counts from cache. + pub note_context_cache: Arc, + /// Per-note local-graph cache, keyed by `(note_path, depth)`, storing + /// `(graph_triple_count, total_memory_bytes) → LocalGraph`. Invalidated + /// on any graph or memory change — mirrors note_context_cache semantics. + pub local_graph_cache: Arc, /// The event broadcaster for sending real-time updates to WebSocket subscribers. pub broadcaster: Arc, /// The store for managing and verifying zero-knowledge proofs. @@ -48,7 +87,12 @@ pub struct AppState { pub wal: Option>, /// Raft consensus instance for cluster coordination. #[cfg(feature = "cluster")] - pub raft: Option>>, + pub raft: Option< + openraft::Raft< + aingle_raft::CortexTypeConfig, + std::sync::Arc, + >, + >, /// This node's ID in the Raft cluster. #[cfg(feature = "cluster")] pub cluster_node_id: Option, @@ -89,6 +133,14 @@ impl AppState { graph: Arc::new(RwLock::new(graph)), logic: Arc::new(RwLock::new(logic)), memory: Arc::new(RwLock::new(memory)), + embedder: std::sync::Arc::new(HashEmbedder::new()), + vault_map_cache: std::sync::Arc::new(std::sync::Mutex::new(None)), + note_context_cache: std::sync::Arc::new(std::sync::Mutex::new( + std::collections::HashMap::new(), + )), + local_graph_cache: std::sync::Arc::new(std::sync::Mutex::new( + std::collections::HashMap::new(), + )), broadcaster: Arc::new(EventBroadcaster::new()), proof_store: Arc::new(ProofStore::new()), sandbox_manager: Arc::new(SandboxManager::new()), @@ -133,6 +185,14 @@ impl AppState { graph: Arc::new(RwLock::new(graph)), logic: Arc::new(RwLock::new(logic)), memory: Arc::new(RwLock::new(memory)), + embedder: std::sync::Arc::new(HashEmbedder::new()), + vault_map_cache: std::sync::Arc::new(std::sync::Mutex::new(None)), + note_context_cache: std::sync::Arc::new(std::sync::Mutex::new( + std::collections::HashMap::new(), + )), + local_graph_cache: std::sync::Arc::new(std::sync::Mutex::new( + std::collections::HashMap::new(), + )), broadcaster: Arc::new(EventBroadcaster::new()), proof_store: Arc::new(ProofStore::new()), sandbox_manager: Arc::new(SandboxManager::new()), @@ -177,6 +237,14 @@ impl AppState { graph: Arc::new(RwLock::new(graph)), logic: Arc::new(RwLock::new(logic)), memory: Arc::new(RwLock::new(memory)), + embedder: std::sync::Arc::new(HashEmbedder::new()), + vault_map_cache: std::sync::Arc::new(std::sync::Mutex::new(None)), + note_context_cache: std::sync::Arc::new(std::sync::Mutex::new( + std::collections::HashMap::new(), + )), + local_graph_cache: std::sync::Arc::new(std::sync::Mutex::new( + std::collections::HashMap::new(), + )), broadcaster: Arc::new(EventBroadcaster::new()), proof_store: Arc::new(ProofStore::new()), sandbox_manager: Arc::new(SandboxManager::new()), @@ -211,6 +279,22 @@ impl AppState { pub fn with_db_path( db_path: &str, audit_log_path: Option, + ) -> crate::error::Result { + Self::with_db_path_and_embedder( + db_path, + audit_log_path, + std::sync::Arc::new(HashEmbedder::new()), + ) + } + + /// Like [`with_db_path`] but with an explicit embedder. If a persisted + /// snapshot was produced by a different-dimension embedder, the snapshot is + /// discarded and the `aingle:source_hash` registry is cleared so the next + /// ingest re-embeds everything with this embedder. + pub fn with_db_path_and_embedder( + db_path: &str, + audit_log_path: Option, + embedder: std::sync::Arc, ) -> crate::error::Result { let graph = if db_path == ":memory:" { GraphDB::memory()? @@ -224,13 +308,25 @@ impl AppState { let logic = RuleEngine::new(); - // Load Ineru snapshot if available next to the graph database + // Embedder-change migration + snapshot load (persistent only). let memory = if db_path != ":memory:" { - let snapshot_path = Path::new(db_path) - .parent() - .unwrap_or(Path::new(".")) - .join("ineru.snapshot"); - if snapshot_path.exists() { + let dbdir = Path::new(db_path).parent().unwrap_or(Path::new(".")); + let snapshot_path = dbdir.join("ineru.snapshot"); + let active_dims = embedder.dimensions(); + // Pre-sidecar databases were written by the 64d hash embedder. + let persisted_dims = crate::embedder::read_dims(dbdir).unwrap_or(64); + let snapshot_exists = snapshot_path.exists(); + let dim_mismatch = snapshot_exists && persisted_dims != active_dims; + + if dim_mismatch { + let removed = crate::embedder::clear_source_registry(&graph) + .map_err(|e| crate::error::Error::Internal(format!("clear registry: {e}")))?; + log::warn!( + "Embedder changed ({}d → {}d): cleared {} source-hash entries; re-ingest required.", + persisted_dims, active_dims, removed + ); + IneruMemory::agent_mode() + } else if snapshot_exists { match IneruMemory::load_from_file(&snapshot_path) { Ok(mem) => { log::info!("Loaded Ineru snapshot from {}", snapshot_path.display()); @@ -268,7 +364,10 @@ impl AppState { Arc::new(ps) } Err(e) => { - log::warn!("Failed to open Sled ProofStore: {}. Falling back to in-memory.", e); + log::warn!( + "Failed to open Sled ProofStore: {}. Falling back to in-memory.", + e + ); Arc::new(ProofStore::new()) } } @@ -287,6 +386,14 @@ impl AppState { graph: Arc::new(RwLock::new(graph)), logic: Arc::new(RwLock::new(logic)), memory: Arc::new(RwLock::new(memory)), + embedder, + vault_map_cache: std::sync::Arc::new(std::sync::Mutex::new(None)), + note_context_cache: std::sync::Arc::new(std::sync::Mutex::new( + std::collections::HashMap::new(), + )), + local_graph_cache: std::sync::Arc::new(std::sync::Mutex::new( + std::collections::HashMap::new(), + )), broadcaster: Arc::new(EventBroadcaster::new()), proof_store, sandbox_manager: Arc::new(SandboxManager::new()), @@ -314,7 +421,6 @@ impl AppState { }) } - /// Flushes the graph database and saves the Ineru memory snapshot to disk. /// /// This should be called before shutdown or binary updates to ensure @@ -340,6 +446,7 @@ impl AppState { } else { log::info!("Ineru snapshot saved to {}", snapshot_path.display()); } + crate::embedder::write_dims(dir, self.embedder.dimensions()); } Ok(()) @@ -545,3 +652,170 @@ impl Default for SandboxManager { Self::new() } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn appstate_has_default_hash_embedder() { + let state = AppState::new().unwrap(); + assert_eq!(state.embedder.dimensions(), 64); + } + + #[tokio::test] + async fn embedder_change_clears_source_registry_and_snapshot() { + use aingle_graph::{Predicate, TriplePattern}; + let dir = tempfile::tempdir().unwrap(); + let db = dir.path().join("graph.sled"); + let db_str = db.to_str().unwrap(); + + // First boot with the default (hash, 64d): ingest writes a registry triple, + // flush writes snapshot + embedder.dims=64. + { + let state = AppState::with_db_path(db_str, None).unwrap(); + { + let mut g = state.graph.write().await; + g.enable_dag(); + } + std::fs::write( + dir.path().join("note.md"), + "# N\n\nsled has exclusive locks.\n", + ) + .unwrap(); + crate::service::ingest::ingest_path(&state, dir.path().to_str().unwrap(), None) + .await + .unwrap(); + state.flush(Some(db.parent().unwrap())).await.unwrap(); + } + + // Registry triple exists on disk now. + { + let state = AppState::with_db_path(db_str, None).unwrap(); + let g = state.graph.read().await; + let n = g + .find( + TriplePattern::any() + .with_predicate(Predicate::named(crate::service::ingest::PRED_SOURCE_HASH)), + ) + .unwrap() + .len(); + assert!(n >= 1, "registry triple should exist after first ingest"); + } + + // Second boot with a 384d embedder → mismatch → registry cleared, memory empty. + { + let fake_384: std::sync::Arc = std::sync::Arc::new(Fake384); + let state = AppState::with_db_path_and_embedder(db_str, None, fake_384).unwrap(); + let g = state.graph.read().await; + let n = g + .find( + TriplePattern::any() + .with_predicate(Predicate::named(crate::service::ingest::PRED_SOURCE_HASH)), + ) + .unwrap() + .len(); + assert_eq!(n, 0, "registry must be cleared on embedder dim change"); + } + } + + #[tokio::test] + async fn legacy_snapshot_without_sidecar_migrates_on_dim_change() { + use aingle_graph::{Predicate, TriplePattern}; + let dir = tempfile::tempdir().unwrap(); + let db = dir.path().join("graph.sled"); + let db_str = db.to_str().unwrap(); + + // First boot with default hash (64d): ingest + flush (writes snapshot + sidecar). + { + let state = AppState::with_db_path(db_str, None).unwrap(); + { + let mut g = state.graph.write().await; + g.enable_dag(); + } + std::fs::write( + dir.path().join("n.md"), + "# N\n\nsled has exclusive locks.\n", + ) + .unwrap(); + crate::service::ingest::ingest_path(&state, dir.path().to_str().unwrap(), None) + .await + .unwrap(); + state.flush(Some(db.parent().unwrap())).await.unwrap(); + } + + // Simulate a legacy DB: delete the sidecar so persisted_dims is absent. + std::fs::remove_file(db.parent().unwrap().join("embedder.dims")).unwrap(); + + // Boot with a 384d embedder: absent sidecar must be treated as 64d → mismatch → cleared. + { + let fake_384: std::sync::Arc = std::sync::Arc::new(Fake384); + let state = AppState::with_db_path_and_embedder(db_str, None, fake_384).unwrap(); + let g = state.graph.read().await; + let n = g + .find( + TriplePattern::any() + .with_predicate(Predicate::named(crate::service::ingest::PRED_SOURCE_HASH)), + ) + .unwrap() + .len(); + assert_eq!( + n, 0, + "legacy snapshot without sidecar must migrate when dims differ" + ); + } + } + + #[tokio::test] + async fn same_dims_preserves_snapshot_and_registry() { + use aingle_graph::{Predicate, TriplePattern}; + let dir = tempfile::tempdir().unwrap(); + let db = dir.path().join("graph.sled"); + let db_str = db.to_str().unwrap(); + + { + let state = AppState::with_db_path(db_str, None).unwrap(); + { + let mut g = state.graph.write().await; + g.enable_dag(); + } + std::fs::write( + dir.path().join("n.md"), + "# N\n\nsled has exclusive locks.\n", + ) + .unwrap(); + crate::service::ingest::ingest_path(&state, dir.path().to_str().unwrap(), None) + .await + .unwrap(); + state.flush(Some(db.parent().unwrap())).await.unwrap(); + } + + // Second boot with the same default 64d hash embedder: no migration. + { + let state = AppState::with_db_path(db_str, None).unwrap(); + let g = state.graph.read().await; + let n = g + .find( + TriplePattern::any() + .with_predicate(Predicate::named(crate::service::ingest::PRED_SOURCE_HASH)), + ) + .unwrap() + .len(); + assert!(n >= 1, "same-dims boot must preserve the registry"); + } + } + + /// A stand-in 384-dim embedder for migration tests (no model needed). + struct Fake384; + impl Embedder for Fake384 { + fn embed_passage(&self, _t: &str) -> ineru::Embedding { + ineru::Embedding::new(vec![0.0; 384]) + } + fn embed_query(&self, _t: &str) -> ineru::Embedding { + ineru::Embedding::new(vec![0.0; 384]) + } + fn dimensions(&self) -> usize { + 384 + } + } +} diff --git a/crates/aingle_cortex/tests/data_integrity_test.rs b/crates/aingle_cortex/tests/data_integrity_test.rs index 0e261328..70ddb0a7 100644 --- a/crates/aingle_cortex/tests/data_integrity_test.rs +++ b/crates/aingle_cortex/tests/data_integrity_test.rs @@ -9,8 +9,8 @@ //! - State flush/restore round-trip //! - Batch insert atomicity +use aingle_cortex::proofs::{ProofMetadata, ProofStore, ProofType, SubmitProofRequest}; use aingle_cortex::state::AppState; -use aingle_cortex::proofs::{ProofStore, ProofType, SubmitProofRequest, ProofMetadata}; // ============================================================================ // 1. ProofStore persistence round-trip (Sled backend) @@ -62,7 +62,9 @@ async fn test_proof_store_sled_roundtrip_data_integrity() { assert_eq!(store.count().await, 20, "count mismatch after reopen"); for (i, id) in proof_ids.iter().enumerate() { - let proof = store.get(id).await + let proof = store + .get(id) + .await .unwrap_or_else(|| panic!("proof {} (id={}) missing after reopen", i, id)); // Verify data field contains correct index @@ -99,11 +101,17 @@ async fn test_proof_store_sled_roundtrip_data_integrity() { // Deleted ones should be gone for id in &proof_ids[0..10] { - assert!(store.get(id).await.is_none(), "deleted proof {} should not exist", id); + assert!( + store.get(id).await.is_none(), + "deleted proof {} should not exist", + id + ); } // Remaining ones should be intact for (i, id) in proof_ids[10..20].iter().enumerate() { - let proof = store.get(id).await + let proof = store + .get(id) + .await .unwrap_or_else(|| panic!("remaining proof {} missing", i + 10)); let data: serde_json::Value = serde_json::from_slice(&proof.data).unwrap(); assert_eq!(data["index"].as_u64().unwrap() as usize, i + 10); @@ -117,7 +125,7 @@ async fn test_proof_store_sled_roundtrip_data_integrity() { #[tokio::test] async fn test_graph_dag_triple_materialization_consistency() { - use aingle_graph::{GraphDB, NodeId, Predicate, Value, Triple, TriplePattern}; + use aingle_graph::{GraphDB, NodeId, Predicate, Triple, TriplePattern, Value}; let mut graph = GraphDB::memory().unwrap(); graph.enable_dag(); @@ -144,17 +152,28 @@ async fn test_graph_dag_triple_materialization_consistency() { // Verify each triple can be retrieved by ID for (i, tid) in triple_ids.iter().enumerate() { - let triple = graph.get(tid).unwrap() + let triple = graph + .get(tid) + .unwrap() .unwrap_or_else(|| panic!("triple {} not found by ID", i)); - assert_eq!(triple.object, Value::Integer(i as i64 * 100), - "value mismatch for triple {}", i); + assert_eq!( + triple.object, + Value::Integer(i as i64 * 100), + "value mismatch for triple {}", + i + ); } // Verify pattern queries return correct results for i in 0..50 { let pattern = TriplePattern::subject(NodeId::named(&format!("entity:{}", i))); let results = graph.find(pattern).unwrap(); - assert_eq!(results.len(), 1, "entity:{} should have exactly 1 triple", i); + assert_eq!( + results.len(), + 1, + "entity:{} should have exactly 1 triple", + i + ); assert_eq!(results[0].object, Value::Integer(i * 100)); } @@ -182,17 +201,19 @@ async fn test_graph_dag_triple_materialization_consistency() { #[tokio::test] async fn test_batch_insert_index_consistency() { - use aingle_graph::{GraphDB, NodeId, Predicate, Value, Triple, TriplePattern}; + use aingle_graph::{GraphDB, NodeId, Predicate, Triple, TriplePattern, Value}; let graph = GraphDB::memory().unwrap(); // Batch insert 100 triples let triples: Vec = (0..100) - .map(|i| Triple::new( - NodeId::named(&format!("batch:{}", i)), - Predicate::named("batch_value"), - Value::Integer(i), - )) + .map(|i| { + Triple::new( + NodeId::named(&format!("batch:{}", i)), + Predicate::named("batch_value"), + Value::Integer(i), + ) + }) .collect(); let ids = graph.insert_batch(triples).unwrap(); @@ -208,18 +229,20 @@ async fn test_batch_insert_index_consistency() { } // Verify predicate index works - let by_pred = graph.find( - TriplePattern::predicate(Predicate::named("batch_value")) - ).unwrap(); + let by_pred = graph + .find(TriplePattern::predicate(Predicate::named("batch_value"))) + .unwrap(); assert_eq!(by_pred.len(), 100, "predicate index should find all 100"); // Re-batch the same triples — should skip duplicates, no count change let triples2: Vec = (0..100) - .map(|i| Triple::new( - NodeId::named(&format!("batch:{}", i)), - Predicate::named("batch_value"), - Value::Integer(i), - )) + .map(|i| { + Triple::new( + NodeId::named(&format!("batch:{}", i)), + Predicate::named("batch_value"), + Value::Integer(i), + ) + }) .collect(); let ids2 = graph.insert_batch(triples2).unwrap(); assert_eq!(ids2.len(), 100); @@ -232,7 +255,7 @@ async fn test_batch_insert_index_consistency() { #[tokio::test] async fn test_app_state_flush_restore_roundtrip() { - use aingle_graph::{NodeId, Predicate, Value, Triple, TriplePattern}; + use aingle_graph::{NodeId, Predicate, Triple, TriplePattern, Value}; let dir = tempfile::TempDir::new().unwrap(); let db_path = dir.path().join("graph.sled"); @@ -291,7 +314,8 @@ async fn test_app_state_flush_restore_roundtrip() { assert_eq!( results[0].object, Value::String(format!("data-{}", i)), - "data mismatch for node:{}", i + "data mismatch for node:{}", + i ); } } @@ -300,7 +324,10 @@ async fn test_app_state_flush_restore_roundtrip() { let proof_count = state.proof_store.count().await; assert_eq!(proof_count, 5, "proofs should survive restart"); for (i, id) in proof_ids.iter().enumerate() { - let proof = state.proof_store.get(id).await + let proof = state + .proof_store + .get(id) + .await .unwrap_or_else(|| panic!("proof {} missing after restart", i)); let data: serde_json::Value = serde_json::from_slice(&proof.data).unwrap(); assert_eq!(data["flush_test"].as_u64().unwrap(), i as u64); @@ -314,9 +341,7 @@ async fn test_app_state_flush_restore_roundtrip() { #[tokio::test] async fn test_raft_snapshot_with_proofs_roundtrip() { - use aingle_raft::state_machine::{ - ClusterSnapshot, TripleSnapshot, ProofSnapshot, - }; + use aingle_raft::state_machine::{ClusterSnapshot, ProofSnapshot, TripleSnapshot}; let snapshot = ClusterSnapshot { triples: vec![ @@ -379,7 +404,10 @@ async fn test_raft_snapshot_with_proofs_roundtrip() { assert_eq!(restored.proofs[0].proof_type, "schnorr"); assert_eq!(restored.proofs[0].data, vec![1, 2, 3, 4]); assert!(restored.proofs[0].verified); - assert_eq!(restored.proofs[0].verified_at.as_deref(), Some("2026-03-16T00:01:00Z")); + assert_eq!( + restored.proofs[0].verified_at.as_deref(), + Some("2026-03-16T00:01:00Z") + ); assert_eq!(restored.proofs[1].id, "proof-002"); assert!(!restored.proofs[1].verified); assert!(restored.proofs[1].verified_at.is_none()); @@ -398,9 +426,7 @@ async fn test_raft_snapshot_with_proofs_roundtrip() { #[tokio::test] async fn test_snapshot_checksum_changes_with_proofs() { - use aingle_raft::state_machine::{ - ClusterSnapshot, TripleSnapshot, ProofSnapshot, - }; + use aingle_raft::state_machine::{ClusterSnapshot, ProofSnapshot, TripleSnapshot}; // Snapshot without proofs let snap_no_proofs = ClusterSnapshot { @@ -447,8 +473,10 @@ async fn test_snapshot_checksum_changes_with_proofs() { let r2 = ClusterSnapshot::from_bytes(&bytes2).unwrap(); // Checksums should differ - assert_ne!(r1.checksum, r2.checksum, - "checksum should change when proofs are added"); + assert_ne!( + r1.checksum, r2.checksum, + "checksum should change when proofs are added" + ); } // ============================================================================ @@ -457,7 +485,7 @@ async fn test_snapshot_checksum_changes_with_proofs() { #[tokio::test] async fn test_graph_sled_persistence_full_cycle() { - use aingle_graph::{GraphDB, NodeId, Predicate, Value, Triple, TriplePattern}; + use aingle_graph::{GraphDB, NodeId, Predicate, Triple, TriplePattern, Value}; let dir = tempfile::TempDir::new().unwrap(); let path = dir.path().join("test.sled"); @@ -504,7 +532,7 @@ async fn test_graph_sled_persistence_full_cycle() { #[tokio::test] async fn test_audit_log_fsync_integrity() { - use aingle_cortex::rest::audit::{AuditLog, AuditEntry}; + use aingle_cortex::rest::audit::{AuditEntry, AuditLog}; let dir = tempfile::TempDir::new().unwrap(); let path = dir.path().join("audit_test.jsonl"); @@ -517,7 +545,11 @@ async fn test_audit_log_fsync_integrity() { timestamp: format!("2026-03-16T00:{:02}:00Z", i), user_id: format!("user-{}", i % 5), namespace: Some("test".to_string()), - action: if i % 3 == 0 { "create".into() } else { "read".into() }, + action: if i % 3 == 0 { + "create".into() + } else { + "read".into() + }, resource: format!("/api/v1/triples/{}", i), details: Some(format!("detail-{}", i)), request_id: Some(format!("req-{}", i)), diff --git a/crates/aingle_graph/Cargo.toml b/crates/aingle_graph/Cargo.toml index c0d45378..905babd6 100644 --- a/crates/aingle_graph/Cargo.toml +++ b/crates/aingle_graph/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "aingle_graph" -version = "0.6.3" +version = "0.7.0" description = "Native GraphDB for AIngle - Semantic triple store with SPO indexes" license = "Apache-2.0 OR LicenseRef-Commercial" repository = "https://github.com/ApiliumCode/aingle" diff --git a/crates/aingle_graph/src/dag/action.rs b/crates/aingle_graph/src/dag/action.rs index af9c2378..5d647edb 100644 --- a/crates/aingle_graph/src/dag/action.rs +++ b/crates/aingle_graph/src/dag/action.rs @@ -111,22 +111,38 @@ pub enum DagPayload { }, } +/// Where an ingested fact or chunk came from: a file and the line span within it, +/// plus the content hash of the whole file at ingest time. Carried in the signed +/// DAG payload so provenance is cryptographically bound to the fact. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Provenance { + /// Path of the source file, relative to the ingest root. + pub source_path: String, + /// 1-based first line of the span this fact was extracted from. + pub line_start: u32, + /// 1-based last line of the span (inclusive). + pub line_end: u32, + /// Hex blake3 of the full file content at ingest time. + pub content_hash: String, +} + /// Wire format for a triple insert within a DAG action. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub struct TripleInsertPayload { pub subject: String, pub predicate: String, pub object: serde_json::Value, + /// Optional source provenance. Omitted from the wire form (and thus from the + /// content hash) when absent, so pre-provenance action hashes are unchanged. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub provenance: Option, } /// Kinds of memory operations tracked in the DAG. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum MemoryOpKind { /// A memory entry was stored. - Store { - entry_type: String, - importance: f32, - }, + Store { entry_type: String, importance: f32 }, /// A memory entry was forgotten. Forget { memory_id: String }, /// Consolidation was triggered. @@ -178,8 +194,8 @@ impl DagAction { // Author — serde_json::to_vec cannot fail for NodeId (no maps with // non-string keys, no NaN/Inf floats), so expect() is safe here. - let author_bytes = serde_json::to_vec(&self.author) - .expect("NodeId serialization must not fail"); + let author_bytes = + serde_json::to_vec(&self.author).expect("NodeId serialization must not fail"); hasher.update(&(author_bytes.len() as u64).to_le_bytes()); hasher.update(&author_bytes); @@ -192,8 +208,8 @@ impl DagAction { // Payload — same reasoning: DagPayload contains only strings, // integers, booleans, and JSON values — all safely serializable. - let payload_bytes = serde_json::to_vec(&self.payload) - .expect("DagPayload serialization must not fail"); + let payload_bytes = + serde_json::to_vec(&self.payload).expect("DagPayload serialization must not fail"); hasher.update(&(payload_bytes.len() as u64).to_le_bytes()); hasher.update(&payload_bytes); @@ -234,6 +250,7 @@ mod tests { subject: "alice".into(), predicate: "knows".into(), object: serde_json::json!("bob"), + provenance: None, }], }, signature: None, @@ -313,6 +330,7 @@ mod tests { subject: "a".into(), predicate: "b".into(), object: serde_json::json!("c"), + provenance: None, }], }, DagPayload::TripleDelete { @@ -355,9 +373,8 @@ mod tests { "another_future": 123 }"#; - let action: DagAction = serde_json::from_str(json).expect( - "must deserialize actions with unknown fields (forward compat)" - ); + let action: DagAction = serde_json::from_str(json) + .expect("must deserialize actions with unknown fields (forward compat)"); assert_eq!(action.seq, 42); assert!(matches!(action.payload, DagPayload::Noop)); } @@ -396,9 +413,62 @@ mod tests { "payload": "Noop" }"#; - let action: DagAction = serde_json::from_str(json).expect( - "must deserialize actions without signature field (backward compat)" - ); + let action: DagAction = serde_json::from_str(json) + .expect("must deserialize actions without signature field (backward compat)"); assert!(action.signature.is_none()); } + + #[test] + fn provenance_none_is_omitted_and_hash_is_stable() { + use crate::dag::{DagPayload, TripleInsertPayload}; + use chrono::TimeZone; + + // A payload with no provenance must serialize WITHOUT a "provenance" key, + // so DAG action hashes computed before this field existed stay identical. + let p = TripleInsertPayload { + subject: "alice".into(), + predicate: "knows".into(), + object: serde_json::json!("bob"), + provenance: None, + }; + let json = serde_json::to_string(&p).unwrap(); + assert!( + !json.contains("provenance"), + "None provenance must be skipped: {json}" + ); + + // Old wire format (no provenance key) still deserializes. + let old = r#"{"subject":"a","predicate":"b","object":"c"}"#; + let parsed: TripleInsertPayload = serde_json::from_str(old).unwrap(); + assert!(parsed.provenance.is_none()); + + // A populated provenance round-trips. + let prov = Provenance { + source_path: "docs/x.md".into(), + line_start: 3, + line_end: 5, + content_hash: "deadbeef".into(), + }; + let p2 = TripleInsertPayload { + subject: "s".into(), + predicate: "p".into(), + object: serde_json::json!("o"), + provenance: Some(prov.clone()), + }; + let round: TripleInsertPayload = + serde_json::from_str(&serde_json::to_string(&p2).unwrap()).unwrap(); + assert_eq!(round.provenance, Some(prov)); + + // Sanity: an action carrying a None-provenance TripleInsert hashes the same + // as the equivalent payload built inline (documents hash-stability intent). + let action = DagAction { + parents: vec![], + author: crate::NodeId::named("node:a"), + seq: 0, + timestamp: chrono::Utc.timestamp_opt(0, 0).unwrap(), + payload: DagPayload::TripleInsert { triples: vec![p] }, + signature: None, + }; + let _ = action.compute_hash(); // must not panic + } } diff --git a/crates/aingle_graph/src/dag/backend.rs b/crates/aingle_graph/src/dag/backend.rs index aa2e7fde..dd12d432 100644 --- a/crates/aingle_graph/src/dag/backend.rs +++ b/crates/aingle_graph/src/dag/backend.rs @@ -158,8 +158,8 @@ impl DagBackend for SledDagBackend { fn scan_prefix(&self, prefix: &[u8]) -> crate::Result, Vec)>> { let mut results = Vec::new(); for item in self.tree.scan_prefix(prefix) { - let (k, v) = item - .map_err(|e| crate::Error::Storage(format!("sled dag scan error: {}", e)))?; + let (k, v) = + item.map_err(|e| crate::Error::Storage(format!("sled dag scan error: {}", e)))?; results.push((k.to_vec(), v.to_vec())); } Ok(results) diff --git a/crates/aingle_graph/src/dag/export.rs b/crates/aingle_graph/src/dag/export.rs index a6c8bbac..c8c0190e 100644 --- a/crates/aingle_graph/src/dag/export.rs +++ b/crates/aingle_graph/src/dag/export.rs @@ -62,8 +62,7 @@ fn short_id(id: &str) -> &str { impl DagGraph { /// Build a graph from a list of actions and their tip status. pub fn from_actions(actions: &[DagAction], tips: &[DagActionHash]) -> Self { - let tip_set: std::collections::HashSet<[u8; 32]> = - tips.iter().map(|h| h.0).collect(); + let tip_set: std::collections::HashSet<[u8; 32]> = tips.iter().map(|h| h.0).collect(); let mut nodes = Vec::with_capacity(actions.len()); let mut edges = Vec::new(); @@ -85,7 +84,9 @@ impl DagGraph { DagPayload::Genesis { .. } => "Genesis".into(), DagPayload::Compact { .. } => "Compact".into(), DagPayload::Noop => "Noop".into(), - DagPayload::Custom { ref payload_type, .. } => { + DagPayload::Custom { + ref payload_type, .. + } => { format!("Custom({})", payload_type) } }; @@ -115,7 +116,9 @@ impl DagGraph { /// Export as Graphviz DOT format. pub fn to_dot(&self) -> String { - let mut out = String::from("digraph DAG {\n rankdir=BT;\n node [shape=box, style=filled, fontsize=10];\n\n"); + let mut out = String::from( + "digraph DAG {\n rankdir=BT;\n node [shape=box, style=filled, fontsize=10];\n\n", + ); for node in &self.nodes { let color = if node.is_tip { @@ -173,7 +176,10 @@ impl DagGraph { // Style tips for node in &self.nodes { if node.is_tip { - out.push_str(&format!(" style {} fill:#4CAF50,color:white\n", short_id(&node.id))); + out.push_str(&format!( + " style {} fill:#4CAF50,color:white\n", + short_id(&node.id) + )); } } @@ -213,6 +219,7 @@ mod tests { subject: format!("s{}", seq), predicate: "p".into(), object: serde_json::json!("o"), + provenance: None, }], }, signature: None, @@ -305,7 +312,10 @@ mod tests { assert_eq!(ExportFormat::from_str("dot"), Some(ExportFormat::Dot)); assert_eq!(ExportFormat::from_str("DOT"), Some(ExportFormat::Dot)); assert_eq!(ExportFormat::from_str("graphviz"), Some(ExportFormat::Dot)); - assert_eq!(ExportFormat::from_str("mermaid"), Some(ExportFormat::Mermaid)); + assert_eq!( + ExportFormat::from_str("mermaid"), + Some(ExportFormat::Mermaid) + ); assert_eq!(ExportFormat::from_str("json"), Some(ExportFormat::Json)); assert_eq!(ExportFormat::from_str("xml"), None); } diff --git a/crates/aingle_graph/src/dag/mod.rs b/crates/aingle_graph/src/dag/mod.rs index cf9ea511..aa94cb8d 100644 --- a/crates/aingle_graph/src/dag/mod.rs +++ b/crates/aingle_graph/src/dag/mod.rs @@ -25,10 +25,12 @@ pub mod sync; pub mod timetravel; pub mod tips; -pub use action::{DagAction, DagActionHash, DagPayload, MemoryOpKind, TripleInsertPayload}; -pub use backend::{DagBackend, MemoryDagBackend}; +pub use action::{ + DagAction, DagActionHash, DagPayload, MemoryOpKind, Provenance, TripleInsertPayload, +}; #[cfg(feature = "sled-backend")] pub use backend::SledDagBackend; +pub use backend::{DagBackend, MemoryDagBackend}; pub use export::{DagGraph, ExportFormat}; pub use pruning::{PruneResult, RetentionPolicy}; #[cfg(feature = "dag-sign")] diff --git a/crates/aingle_graph/src/dag/signing.rs b/crates/aingle_graph/src/dag/signing.rs index 5f9b6427..7efcfd2a 100644 --- a/crates/aingle_graph/src/dag/signing.rs +++ b/crates/aingle_graph/src/dag/signing.rs @@ -203,6 +203,7 @@ mod tests { subject: "alice".into(), predicate: "knows".into(), object: serde_json::json!("bob"), + provenance: None, }], }, signature: None, @@ -338,8 +339,8 @@ mod tests { #[test] fn test_verifying_key_from_bytes_invalid() { let bad_bytes = [0u8; 32]; // not a valid Ed25519 point - // This may or may not fail depending on the point — use all-zero which is identity - // For safety, just test that the API doesn't panic + // This may or may not fail depending on the point — use all-zero which is identity + // For safety, just test that the API doesn't panic let _ = DagVerifyingKey::from_bytes(&bad_bytes); } } diff --git a/crates/aingle_graph/src/dag/store.rs b/crates/aingle_graph/src/dag/store.rs index d2479264..b32ea183 100644 --- a/crates/aingle_graph/src/dag/store.rs +++ b/crates/aingle_graph/src/dag/store.rs @@ -1123,6 +1123,7 @@ mod tests { subject: "alice".into(), predicate: "knows".into(), object: serde_json::json!("bob"), + provenance: None, }], }, signature: None, @@ -1185,6 +1186,7 @@ mod tests { subject: "alice".into(), predicate: "knows".into(), object: serde_json::json!("bob"), + provenance: None, }); let history = store.history(&tid, 10).unwrap(); @@ -1261,6 +1263,7 @@ mod tests { subject: subject.into(), predicate: predicate.into(), object: object_json.clone(), + provenance: None, }); // Compute via TripleId::from_triple (the canonical graph path) @@ -1322,6 +1325,7 @@ mod tests { subject: format!("s{}", seq), predicate: "p".into(), object: serde_json::json!(seq), + provenance: None, }], }, signature: None, @@ -1613,6 +1617,7 @@ mod tests { subject: "alice".into(), predicate: "knows".into(), object: serde_json::json!("bob"), + provenance: None, }); let history = store.history(&tid, 10).unwrap(); assert_eq!(history.len(), 3); diff --git a/crates/aingle_graph/src/dag/sync.rs b/crates/aingle_graph/src/dag/sync.rs index ddfbe588..d9d79093 100644 --- a/crates/aingle_graph/src/dag/sync.rs +++ b/crates/aingle_graph/src/dag/sync.rs @@ -64,6 +64,7 @@ mod tests { subject: subject.into(), predicate: "knows".into(), object: serde_json::json!("x"), + provenance: None, }], }, signature: None, diff --git a/crates/aingle_graph/src/dag/timetravel.rs b/crates/aingle_graph/src/dag/timetravel.rs index 46df01d5..95abfb82 100644 --- a/crates/aingle_graph/src/dag/timetravel.rs +++ b/crates/aingle_graph/src/dag/timetravel.rs @@ -105,6 +105,7 @@ mod tests { subject: subject.into(), predicate: "knows".into(), object: serde_json::json!(object), + provenance: None, }], }, signature: None, @@ -120,6 +121,7 @@ mod tests { subject: "alice".into(), predicate: "knows".into(), object: serde_json::json!("bob"), + provenance: None, }], }; replay_payload(&db, &payload).unwrap(); @@ -155,6 +157,7 @@ mod tests { subject: "alice".into(), predicate: "knows".into(), object: serde_json::json!("bob"), + provenance: None, }], }, DagPayload::TripleInsert { @@ -162,6 +165,7 @@ mod tests { subject: "bob".into(), predicate: "knows".into(), object: serde_json::json!("charlie"), + provenance: None, }], }, ], @@ -258,10 +262,12 @@ mod tests { // At a timestamp before any actions: None let result = db.dag_at_timestamp(&(before - chrono::Duration::seconds(10))); - assert!(result.is_err() || { - // Should fail or return empty - true - }); + assert!( + result.is_err() || { + // Should fail or return empty + true + } + ); // At current time: should get state with both triples let (snap, info) = db.dag_at_timestamp(&Utc::now()).unwrap(); diff --git a/crates/aingle_graph/src/lib.rs b/crates/aingle_graph/src/lib.rs index 098fca5d..ebfa4bfe 100644 --- a/crates/aingle_graph/src/lib.rs +++ b/crates/aingle_graph/src/lib.rs @@ -421,6 +421,7 @@ impl GraphDB { subject: triple.subject.to_string(), predicate: triple.predicate.to_string(), object: value_to_json(&triple.object), + provenance: None, }], }, signature: None, @@ -483,11 +484,7 @@ impl GraphDB { /// Get mutation history for a specific triple. #[cfg(feature = "dag")] - pub fn dag_history( - &self, - triple_id: &[u8; 32], - limit: usize, - ) -> Result> { + pub fn dag_history(&self, triple_id: &[u8; 32], limit: usize) -> Result> { self.dag_store .as_ref() .ok_or_else(|| Error::Config("DAG not enabled".into()))? @@ -584,11 +581,7 @@ impl GraphDB { /// Sign a DAG action using an Ed25519 signing key. #[cfg(feature = "dag-sign")] - pub fn dag_sign( - &self, - action: &mut dag::DagAction, - key: &dag::DagSigningKey, - ) { + pub fn dag_sign(&self, action: &mut dag::DagAction, key: &dag::DagSigningKey) { key.sign(action); } @@ -599,8 +592,7 @@ impl GraphDB { action: &dag::DagAction, public_key: &[u8; 32], ) -> Result { - dag::signing::verify_action(action, public_key) - .map_err(|e| Error::Config(e.to_string())) + dag::signing::verify_action(action, public_key).map_err(|e| Error::Config(e.to_string())) } /// Export the full DAG as a portable graph structure. diff --git a/crates/aingle_ingest/Cargo.toml b/crates/aingle_ingest/Cargo.toml new file mode 100644 index 00000000..49f39fc0 --- /dev/null +++ b/crates/aingle_ingest/Cargo.toml @@ -0,0 +1,15 @@ +[package] +name = "aingle_ingest" +version = "0.7.0" +description = "Structural extraction of triples and text chunks from markdown/code for AIngle" +license = "Apache-2.0 OR LicenseRef-Commercial" +edition = "2021" +rust-version = "1.83" + +[dependencies] +aingle_graph = { version = "0.7", path = "../aingle_graph", features = ["dag"] } +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +regex = "1.12" +blake3 = "1.8" +once_cell = "1.4" diff --git a/crates/aingle_ingest/src/chunk.rs b/crates/aingle_ingest/src/chunk.rs new file mode 100644 index 00000000..6fef5506 --- /dev/null +++ b/crates/aingle_ingest/src/chunk.rs @@ -0,0 +1,96 @@ +// Copyright 2019-2026 Apilium Technologies OÜ. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR Commercial + +//! Splitting source text into line-ranged chunks for semantic recall. + +use crate::{Chunk, Provenance}; + +fn prov(path: &str, hash: &str, start: u32, end: u32) -> Provenance { + Provenance { + source_path: path.to_string(), + line_start: start, + line_end: end, + content_hash: hash.to_string(), + } +} + +/// Fixed-window chunking: every `window` lines becomes one chunk. Used for +/// non-markdown files. `window` must be >= 1. +pub fn chunk_fixed(path: &str, content: &str, hash: &str, window: usize) -> Vec { + let window = window.max(1); + let lines: Vec<&str> = content.lines().collect(); + if lines.is_empty() { + return Vec::new(); + } + let mut out = Vec::new(); + let mut i = 0; + while i < lines.len() { + let end = (i + window).min(lines.len()); + let text = lines[i..end].join("\n"); + out.push(Chunk { + text, + provenance: prov(path, hash, (i + 1) as u32, end as u32), + }); + i = end; + } + out +} + +/// Markdown chunking: split on ATX heading lines (`# ...`). Each heading starts a +/// new chunk that runs until the next heading (or EOF). Content before the first +/// heading (e.g. frontmatter + intro) is its own leading chunk. Oversized sections +/// (> 80 lines) are further split with `chunk_fixed`. +pub fn chunk_markdown(path: &str, content: &str, hash: &str) -> Vec { + let lines: Vec<&str> = content.lines().collect(); + if lines.is_empty() { + return Vec::new(); + } + // Boundaries: indices (0-based) where a heading starts a new section. + let mut starts: Vec = Vec::new(); + for (idx, line) in lines.iter().enumerate() { + if is_heading(line) { + starts.push(idx); + } + } + // Ensure the first section starts at line 0 even if there is leading content. + if starts.first() != Some(&0) { + starts.insert(0, 0); + } + starts.dedup(); + + let mut out = Vec::new(); + for (n, &start) in starts.iter().enumerate() { + let end = if n + 1 < starts.len() { + starts[n + 1] + } else { + lines.len() + }; + let section = &lines[start..end]; + if section.len() > 80 { + // chunk_fixed returns 1-based lines within the section; adding the + // 0-based section offset `start` yields correct absolute 1-based lines. + let joined = section.join("\n"); + for mut c in chunk_fixed(path, &joined, hash, 50) { + c.provenance.line_start += start as u32; + c.provenance.line_end += start as u32; + out.push(c); + } + } else { + out.push(Chunk { + text: section.join("\n"), + provenance: prov(path, hash, (start + 1) as u32, end as u32), + }); + } + } + out +} + +/// True for an ATX markdown heading line: optional leading whitespace, 1–6 `#` +/// characters, then at least one whitespace character. Mirrors the `HEADING` +/// regex used by triple extraction so chunk boundaries and `has_section` +/// triples agree on what a heading is. +fn is_heading(line: &str) -> bool { + let t = line.trim_start(); + let hashes = t.chars().take_while(|c| *c == '#').count(); + (1..=6).contains(&hashes) && t.chars().nth(hashes).is_some_and(|c| c.is_whitespace()) +} diff --git a/crates/aingle_ingest/src/lib.rs b/crates/aingle_ingest/src/lib.rs new file mode 100644 index 00000000..f0a27c71 --- /dev/null +++ b/crates/aingle_ingest/src/lib.rs @@ -0,0 +1,135 @@ +// Copyright 2019-2026 Apilium Technologies OÜ. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR Commercial + +//! Pure, deterministic structural extraction: `(path, content)` → triples + chunks. + +mod chunk; +mod markdown; + +pub use aingle_graph::dag::Provenance; + +/// Object side of an extracted triple. Mapped to the graph value type by the caller. +#[derive(Debug, Clone, PartialEq)] +pub enum ObjectValue { + /// A reference to another node/entity (e.g. a wikilink target). + Node(String), + /// A literal text value (e.g. a frontmatter scalar). + Text(String), +} + +/// A triple plus where it came from. +#[derive(Debug, Clone, PartialEq)] +pub struct ProvenancedTriple { + pub subject: String, + pub predicate: String, + pub object: ObjectValue, + pub provenance: Provenance, +} + +/// A span of source text to embed for semantic recall. +#[derive(Debug, Clone, PartialEq)] +pub struct Chunk { + pub text: String, + pub provenance: Provenance, +} + +/// The full result of extracting one file. +#[derive(Debug, Clone, PartialEq)] +pub struct Extraction { + pub triples: Vec, + pub chunks: Vec, +} + +/// Extract structural triples and text chunks from a file's content. +/// +/// `path` is used verbatim as the note subject and recorded in provenance. +/// Markdown files (`.md`/`.markdown`) get structural triples + heading-aware +/// chunks; all other files get fixed-window chunks only. +pub fn extract(path: &str, content: &str) -> Extraction { + let content_hash = blake3::hash(content.as_bytes()).to_hex().to_string(); + let is_md = path.to_lowercase().ends_with(".md") || path.to_lowercase().ends_with(".markdown"); + + let mut triples = Vec::new(); + let chunks; + if is_md { + triples = markdown::extract_triples(path, content, &content_hash); + chunks = chunk::chunk_markdown(path, content, &content_hash); + } else { + chunks = chunk::chunk_fixed(path, content, &content_hash, 50); + } + Extraction { triples, chunks } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn prov(p: &Provenance) -> (u32, u32) { + (p.line_start, p.line_end) + } + + #[test] + fn extracts_wikilink_heading_tag_and_frontmatter() { + let md = "---\ntype: adr\ntags: [storage, decision]\n---\n\ + # Storage Decision\n\n\ + We chose [[sled]] because of the lock. See #durability.\n"; + let ex = extract("docs/adr/007.md", md); + + // frontmatter scalar -> (note, type, adr) + assert!(ex.triples.iter().any(|t| t.subject == "docs/adr/007.md" + && t.predicate == "type" + && t.object == ObjectValue::Text("adr".into()))); + // frontmatter tags -> two tagged triples + assert!(ex + .triples + .iter() + .any(|t| t.predicate == "tagged" && t.object == ObjectValue::Text("storage".into()))); + assert!(ex + .triples + .iter() + .any(|t| t.predicate == "tagged" && t.object == ObjectValue::Text("decision".into()))); + // heading -> has_section + assert!(ex.triples.iter().any(|t| t.predicate == "has_section" + && t.object == ObjectValue::Text("Storage Decision".into()))); + // wikilink -> links_to sled + let link = ex + .triples + .iter() + .find(|t| t.predicate == "links_to") + .unwrap(); + assert_eq!(link.object, ObjectValue::Node("sled".into())); + // inline tag -> tagged durability + assert!( + ex.triples + .iter() + .any(|t| t.predicate == "tagged" + && t.object == ObjectValue::Text("durability".into())) + ); + + // provenance line numbers are 1-based and point at the real lines. + assert_eq!(prov(&link.provenance).0, 7); // the "We chose [[sled]]" line + assert_eq!(link.provenance.source_path, "docs/adr/007.md"); + + // at least one chunk, all carrying the same content hash. + assert!(!ex.chunks.is_empty()); + assert!(ex + .chunks + .iter() + .all(|c| !c.provenance.content_hash.is_empty())); + } + + #[test] + fn non_markdown_gets_chunks_only() { + let code = (1..=120) + .map(|i| format!("line {i}")) + .collect::>() + .join("\n"); + let ex = extract("src/main.rs", &code); + assert!(ex.triples.is_empty()); + // 120 lines / 50-line window => 3 chunks. + assert_eq!(ex.chunks.len(), 3); + assert_eq!(ex.chunks[0].provenance.line_start, 1); + assert_eq!(ex.chunks[0].provenance.line_end, 50); + assert_eq!(ex.chunks[2].provenance.line_end, 120); + } +} diff --git a/crates/aingle_ingest/src/markdown.rs b/crates/aingle_ingest/src/markdown.rs new file mode 100644 index 00000000..7aa36773 --- /dev/null +++ b/crates/aingle_ingest/src/markdown.rs @@ -0,0 +1,113 @@ +// Copyright 2019-2026 Apilium Technologies OÜ. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 OR Commercial + +//! Deterministic structural triple extraction from markdown. + +use crate::{ObjectValue, Provenance, ProvenancedTriple}; +use once_cell::sync::Lazy; +use regex::Regex; + +static WIKILINK: Lazy = Lazy::new(|| Regex::new(r"\[\[([^\]|]+)(?:\|[^\]]+)?\]\]").unwrap()); +static HEADING: Lazy = Lazy::new(|| Regex::new(r"^\s*#{1,6}\s+(.+?)\s*$").unwrap()); +// Inline tag: `#word` where `#` is at start or preceded by whitespace and is +// immediately followed by a letter (so `# Heading` and `##x` are not tags). +static INLINE_TAG: Lazy = + Lazy::new(|| Regex::new(r"(?:^|\s)#([A-Za-z][A-Za-z0-9_/-]*)").unwrap()); + +fn prov(path: &str, hash: &str, line: u32) -> Provenance { + Provenance { + source_path: path.to_string(), + line_start: line, + line_end: line, + content_hash: hash.to_string(), + } +} + +/// Extract structural triples. `path` is the note subject. +pub fn extract_triples(path: &str, content: &str, hash: &str) -> Vec { + let mut out = Vec::new(); + let lines: Vec<&str> = content.lines().collect(); + + // --- Frontmatter (flat scalars + `tags`). Only when the file starts with `---`. + let mut body_start = 0usize; + if lines.first().map(|l| l.trim_end()) == Some("---") { + if let Some(close_rel) = lines[1..].iter().position(|l| l.trim_end() == "---") { + let close = close_rel + 1; // index of closing --- + for (i, raw) in lines[1..close].iter().enumerate() { + let line_no = (i + 2) as u32; // 1-based, after opening --- + if let Some((key, val)) = raw.split_once(':') { + let key = key.trim(); + let val = val.trim(); + if key.is_empty() { + continue; + } + if key == "tags" { + for tag in parse_tag_list(val) { + out.push(ProvenancedTriple { + subject: path.into(), + predicate: "tagged".into(), + object: ObjectValue::Text(tag), + provenance: prov(path, hash, line_no), + }); + } + } else if !val.is_empty() { + out.push(ProvenancedTriple { + subject: path.into(), + predicate: key.into(), + object: ObjectValue::Text(val.into()), + provenance: prov(path, hash, line_no), + }); + } + } + } + body_start = close + 1; + } + } + + // --- Body: headings, wikilinks, inline tags (with real line numbers). + for (i, line) in lines.iter().enumerate().skip(body_start) { + let line_no = (i + 1) as u32; + + if let Some(c) = HEADING.captures(line) { + out.push(ProvenancedTriple { + subject: path.into(), + predicate: "has_section".into(), + object: ObjectValue::Text(c[1].trim().to_string()), + provenance: prov(path, hash, line_no), + }); + // Fall through: a heading line may still contain wikilinks/tags + // (e.g. `# See also [[foo]]`), so keep scanning it below. + } + + for c in WIKILINK.captures_iter(line) { + out.push(ProvenancedTriple { + subject: path.into(), + predicate: "links_to".into(), + object: ObjectValue::Node(c[1].trim().to_string()), + provenance: prov(path, hash, line_no), + }); + } + for c in INLINE_TAG.captures_iter(line) { + out.push(ProvenancedTriple { + subject: path.into(), + predicate: "tagged".into(), + object: ObjectValue::Text(c[1].to_string()), + provenance: prov(path, hash, line_no), + }); + } + } + + out +} + +/// Parse a frontmatter tag value into individual tags. Strips a single `[`/`]` +/// per side (not a balanced-bracket parse) then splits on commas, trimming +/// surrounding quotes/whitespace. Handles `[a, b]`, bare `a, b`, and single `a`. +fn parse_tag_list(val: &str) -> Vec { + let inner = val.trim().trim_start_matches('[').trim_end_matches(']'); + inner + .split(',') + .map(|s| s.trim().trim_matches('"').trim_matches('\'').to_string()) + .filter(|s| !s.is_empty()) + .collect() +} diff --git a/crates/aingle_logic/Cargo.toml b/crates/aingle_logic/Cargo.toml index abf698e7..7ebe5f1e 100644 --- a/crates/aingle_logic/Cargo.toml +++ b/crates/aingle_logic/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "aingle_logic" -version = "0.6.3" +version = "0.7.0" description = "Proof-of-Logic validation engine for AIngle semantic graphs" license = "Apache-2.0 OR LicenseRef-Commercial" repository = "https://github.com/ApiliumCode/aingle" @@ -21,7 +21,7 @@ owl = [] [dependencies] # Graph database -aingle_graph = { version = "0.6", path = "../aingle_graph" } +aingle_graph = { version = "0.7", path = "../aingle_graph" } # Serialization serde = { version = "1.0", features = ["derive"] } diff --git a/crates/aingle_logic/src/engine.rs b/crates/aingle_logic/src/engine.rs index 13460d16..5f305f0c 100644 --- a/crates/aingle_logic/src/engine.rs +++ b/crates/aingle_logic/src/engine.rs @@ -105,28 +105,33 @@ impl RuleEngine { /// /// The stats provide metrics on validations, inferences, rejections, etc. pub fn stats(&self) -> EngineStats { - self.stats.read() + self.stats + .read() .unwrap_or_else(|poisoned| poisoned.into_inner()) .clone() } /// Resets all collected `EngineStats` to their default (zero) values. pub fn clear_stats(&self) { - let mut guard = self.stats.write() + let mut guard = self + .stats + .write() .unwrap_or_else(|poisoned| poisoned.into_inner()); *guard = EngineStats::default(); } /// Retrieves a clone of all triples that have been inferred by the engine. pub fn inferred_triples(&self) -> Vec { - self.inferred.read() + self.inferred + .read() .unwrap_or_else(|poisoned| poisoned.into_inner()) .clone() } /// Clears the internal cache of inferred triples. pub fn clear_inferred(&self) { - self.inferred.write() + self.inferred + .write() .unwrap_or_else(|poisoned| poisoned.into_inner()) .clear(); } @@ -145,7 +150,9 @@ impl RuleEngine { /// A `ValidationResult` indicating whether the triple is valid, and detailing any /// matches, rejections, warnings, or chained rules. pub fn validate(&self, triple: &Triple) -> ValidationResult { - let mut stats = self.stats.write() + let mut stats = self + .stats + .write() .unwrap_or_else(|poisoned| poisoned.into_inner()); stats.validations += 1; @@ -172,7 +179,9 @@ impl RuleEngine { } Action::Infer(pattern) => { if let Some(inferred) = pattern.instantiate(&bindings) { - let mut inf = self.inferred.write() + let mut inf = self + .inferred + .write() .unwrap_or_else(|poisoned| poisoned.into_inner()); inf.push(inferred); stats.inferences += 1; @@ -204,7 +213,9 @@ impl RuleEngine { /// A `Result` containing a `ForwardChainResult` which includes the number of iterations /// and all new facts inferred, or an `Error` if the process exceeds `max_depth`. pub fn forward_chain(&self, graph: &GraphDB) -> Result { - let mut stats = self.stats.write() + let mut stats = self + .stats + .write() .unwrap_or_else(|poisoned| poisoned.into_inner()); let mut result = ForwardChainResult::new(); let mut iteration = 0; @@ -263,7 +274,8 @@ impl RuleEngine { // Add new facts to result (would be added to graph in real use) for fact in new_facts { - self.inferred.write() + self.inferred + .write() .unwrap_or_else(|poisoned| poisoned.into_inner()) .push(fact); } @@ -292,7 +304,9 @@ impl RuleEngine { graph: &GraphDB, goal: &TriplePattern, ) -> Result { - let mut stats = self.stats.write() + let mut stats = self + .stats + .write() .unwrap_or_else(|poisoned| poisoned.into_inner()); stats.backward_queries += 1; diff --git a/crates/aingle_minimal/Cargo.toml b/crates/aingle_minimal/Cargo.toml index b90e7d5e..59767d86 100644 --- a/crates/aingle_minimal/Cargo.toml +++ b/crates/aingle_minimal/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "aingle_minimal" -version = "0.6.3" +version = "0.7.0" description = "Ultra-light AIngle node for IoT devices (<1MB RAM)" license = "Apache-2.0 OR LicenseRef-Commercial" repository = "https://github.com/ApiliumCode/aingle" @@ -124,10 +124,10 @@ embedded-hal = { version = "1.0", optional = true } embedded-hal-async = { version = "1.0", optional = true } # AI Memory (Ineru) -ineru = { version = "0.6", path = "../ineru", optional = true } +ineru = { version = "0.7", path = "../ineru", optional = true } # Kaneru (AI Agent Framework) -kaneru = { version = "0.6", path = "../kaneru", optional = true } +kaneru = { version = "0.7", path = "../kaneru", optional = true } # REST API server (lightweight HTTP) tiny_http = { version = "0.12", optional = true } diff --git a/crates/aingle_minimal/src/discovery.rs b/crates/aingle_minimal/src/discovery.rs index 5a40b187..2cfad8ac 100644 --- a/crates/aingle_minimal/src/discovery.rs +++ b/crates/aingle_minimal/src/discovery.rs @@ -10,9 +10,9 @@ //! - **mDNS**: Service type `_aingle._udp.local.` (feature: mdns) //! - **CoAP Multicast**: `/.well-known/core` to 224.0.1.187:5683 (feature: coap) -use crate::error::Result; #[cfg(feature = "mdns")] use crate::error::Error; +use crate::error::Result; use std::collections::HashMap; use std::net::{IpAddr, SocketAddr}; use std::time::{Duration, Instant}; @@ -202,7 +202,11 @@ impl Discovery { return; } - let addresses: Vec = info.get_addresses().iter().map(|a| a.to_ip_addr()).collect(); + let addresses: Vec = info + .get_addresses() + .iter() + .map(|a| a.to_ip_addr()) + .collect(); let mut props = HashMap::new(); for prop in info.get_properties().iter() { diff --git a/crates/aingle_minimal/src/lib.rs b/crates/aingle_minimal/src/lib.rs index 72f332ec..4c4ebeb6 100644 --- a/crates/aingle_minimal/src/lib.rs +++ b/crates/aingle_minimal/src/lib.rs @@ -266,6 +266,8 @@ pub use ota::{OtaManager, UpdateChannel, UpdateInfo, UpdateState}; pub use power::{BatteryInfo, PowerManager, PowerProfile}; #[cfg(feature = "quic")] pub use quic::{QuicConfig, QuicServer}; +#[cfg(feature = "rest")] +pub use rest::{RestConfig, RestServer}; pub use sensors::{CalibrationParams, Sensor, SensorManager, SensorReading, SensorType}; #[cfg(feature = "smart_agents")] pub use smart::{IoTPolicyBuilder, SensorAdapter, SmartNode, SmartNodeConfig, SmartNodeStats}; @@ -281,8 +283,6 @@ pub use webrtc::{ ConnectionState, PeerConnection, SignalingClient, SignalingConfig, SignalingMessage, SignalingServer, WebRtcConfig, WebRtcServer, WebRtcStats, }; -#[cfg(feature = "rest")] -pub use rest::{RestConfig, RestServer}; /// Version information for the crate. /// diff --git a/crates/aingle_minimal/src/memory.rs b/crates/aingle_minimal/src/memory.rs index 8577cbc0..20feb847 100644 --- a/crates/aingle_minimal/src/memory.rs +++ b/crates/aingle_minimal/src/memory.rs @@ -8,9 +8,9 @@ #[cfg(feature = "ai_memory")] pub use ineru::{ - ConsolidationConfig, Embedding, Entity, EntityId, KnowledgeGraph, Link, LinkType, + ConsolidationConfig, Embedding, Entity, EntityId, IneruMemory, KnowledgeGraph, Link, LinkType, LongTermMemory, LtmConfig, MemoryConfig, MemoryEntry, MemoryId, MemoryMetadata, MemoryQuery, - MemoryResult, MemoryStats, Relation, SemanticTag, ShortTermMemory, StmConfig, IneruMemory, + MemoryResult, MemoryStats, Relation, SemanticTag, ShortTermMemory, StmConfig, }; #[cfg(feature = "ai_memory")] diff --git a/crates/aingle_minimal/src/quic.rs b/crates/aingle_minimal/src/quic.rs index 61ff2a4f..87c7c55e 100644 --- a/crates/aingle_minimal/src/quic.rs +++ b/crates/aingle_minimal/src/quic.rs @@ -215,7 +215,11 @@ impl QuicServer { // Reject oversized messages (max 1MB) const MAX_MESSAGE_SIZE: usize = 1024 * 1024; if len > MAX_MESSAGE_SIZE { - log::warn!("Rejecting oversized QUIC message: {} bytes from {}", len, addr); + log::warn!( + "Rejecting oversized QUIC message: {} bytes from {}", + len, + addr + ); continue; } @@ -327,7 +331,9 @@ impl QuicServer { // In a real deployment, load trusted peer certificates here. // For self-signed mesh networks, each node pins peer certs at discovery time. // Using dangerous() only as fallback for initial handshake — log a warning. - log::warn!("QUIC client using permissive certificate validation — pin peer certs in production"); + log::warn!( + "QUIC client using permissive certificate validation — pin peer certs in production" + ); let crypto = rustls::ClientConfig::builder() .dangerous() .with_custom_certificate_verifier(Arc::new(LoggingCertVerifier)) diff --git a/crates/aingle_minimal/src/rest.rs b/crates/aingle_minimal/src/rest.rs index 19af4334..452a720e 100644 --- a/crates/aingle_minimal/src/rest.rs +++ b/crates/aingle_minimal/src/rest.rs @@ -187,8 +187,12 @@ impl RestServer { /// The server will run in a background thread until `stop()` is called. pub fn start(config: RestConfig, node: &mut MinimalNode) -> Result { let bind_addr = config.bind_address(); - let server = Server::http(&bind_addr) - .map_err(|e| Error::Network(NetworkError::Other(format!("Failed to start REST server: {}", e))))?; + let server = Server::http(&bind_addr).map_err(|e| { + Error::Network(NetworkError::Other(format!( + "Failed to start REST server: {}", + e + ))) + })?; log::info!("REST API server starting on http://{}", bind_addr); @@ -206,7 +210,14 @@ impl RestServer { // In production, this would use channels or shared state let handle = thread::spawn(move || { - Self::server_loop(server, running_clone, enable_cors, node_id, version, start_time); + Self::server_loop( + server, + running_clone, + enable_cors, + node_id, + version, + start_time, + ); }); Ok(Self { @@ -219,13 +230,14 @@ impl RestServer { /// Start the REST server with shared node access. /// /// This version allows the node to be accessed from the REST handlers. - pub fn start_with_node( - config: RestConfig, - node: Arc>, - ) -> Result { + pub fn start_with_node(config: RestConfig, node: Arc>) -> Result { let bind_addr = config.bind_address(); - let server = Server::http(&bind_addr) - .map_err(|e| Error::Network(NetworkError::Other(format!("Failed to start REST server: {}", e))))?; + let server = Server::http(&bind_addr).map_err(|e| { + Error::Network(NetworkError::Other(format!( + "Failed to start REST server: {}", + e + ))) + })?; log::info!("REST API server starting on http://{}", bind_addr); @@ -257,12 +269,8 @@ impl RestServer { // Use a timeout so we can check the running flag periodically match server.recv_timeout(std::time::Duration::from_millis(100)) { Ok(Some(request)) => { - let response = Self::handle_static_request( - &request, - &node_id, - &version, - start_time, - ); + let response = + Self::handle_static_request(&request, &node_id, &version, start_time); Self::send_response(request, response, enable_cors); } Ok(None) => continue, // Timeout, check running flag @@ -299,10 +307,7 @@ impl RestServer { } /// Handle a request with full node access - fn handle_request( - request: &mut Request, - node: &Arc>, - ) -> (u16, String) { + fn handle_request(request: &mut Request, node: &Arc>) -> (u16, String) { let method = request.method().clone(); let url = request.url().to_string(); @@ -310,14 +315,10 @@ impl RestServer { match (method, url.as_str()) { // GET /api/v1/info - (Method::Get, "/api/v1/info") => { - Self::handle_info(node) - } + (Method::Get, "/api/v1/info") => Self::handle_info(node), // POST /api/v1/entries - (Method::Post, "/api/v1/entries") => { - Self::handle_create_entry(request, node) - } + (Method::Post, "/api/v1/entries") => Self::handle_create_entry(request, node), // GET /api/v1/entries/:hash (Method::Get, path) if path.starts_with("/api/v1/entries/") => { @@ -326,19 +327,13 @@ impl RestServer { } // GET /api/v1/peers - (Method::Get, "/api/v1/peers") => { - Self::handle_peers(node) - } + (Method::Get, "/api/v1/peers") => Self::handle_peers(node), // GET /api/v1/stats - (Method::Get, "/api/v1/stats") => { - Self::handle_stats(node) - } + (Method::Get, "/api/v1/stats") => Self::handle_stats(node), // OPTIONS (CORS preflight) - (Method::Options, _) => { - (204, String::new()) - } + (Method::Options, _) => (204, String::new()), // Health check (Method::Get, "/health") | (Method::Get, "/") => { @@ -406,10 +401,7 @@ impl RestServer { } /// Handle POST /api/v1/entries - fn handle_create_entry( - request: &mut Request, - node: &Arc>, - ) -> (u16, String) { + fn handle_create_entry(request: &mut Request, node: &Arc>) -> (u16, String) { // Read body let mut body = String::new(); let reader = request.as_reader(); @@ -444,7 +436,10 @@ impl RestServer { timestamp: crate::types::Timestamp::now().as_millis(), }; let api_response = ApiResponse::success(response); - (201, serde_json::to_string(&api_response).unwrap_or_default()) + ( + 201, + serde_json::to_string(&api_response).unwrap_or_default(), + ) } Err(e) => { let response = ApiResponse::<()>::error(format!("Failed to create entry: {}", e)); @@ -454,10 +449,7 @@ impl RestServer { } /// Handle GET /api/v1/entries/:hash - fn handle_get_entry( - hash_str: &str, - node: &Arc>, - ) -> (u16, String) { + fn handle_get_entry(hash_str: &str, node: &Arc>) -> (u16, String) { // Parse hash let hash = match Hash::from_hex(hash_str) { Ok(h) => h, @@ -482,7 +474,7 @@ impl RestServer { let content: serde_json::Value = serde_json::from_slice(&entry.content) .unwrap_or_else(|_| { serde_json::Value::String( - String::from_utf8_lossy(&entry.content).to_string() + String::from_utf8_lossy(&entry.content).to_string(), ) }); @@ -493,7 +485,10 @@ impl RestServer { size: entry.size(), }; let api_response = ApiResponse::success(response); - (200, serde_json::to_string(&api_response).unwrap_or_default()) + ( + 200, + serde_json::to_string(&api_response).unwrap_or_default(), + ) } Ok(None) => { let response = ApiResponse::<()>::error("Entry not found"); @@ -564,7 +559,10 @@ impl RestServer { }; let api_response = ApiResponse::success(response); - (200, serde_json::to_string(&api_response).unwrap_or_default()) + ( + 200, + serde_json::to_string(&api_response).unwrap_or_default(), + ) } /// Handle static requests (without node access) @@ -613,25 +611,25 @@ impl RestServer { fn send_response(request: Request, response: (u16, String), enable_cors: bool) { let (status, body) = response; - let mut headers = vec![ - Header::from_bytes(&b"Content-Type"[..], &b"application/json"[..]).unwrap(), - ]; + let mut headers = + vec![Header::from_bytes(&b"Content-Type"[..], &b"application/json"[..]).unwrap()]; if enable_cors { - headers.push( - Header::from_bytes(&b"Access-Control-Allow-Origin"[..], &b"*"[..]).unwrap() - ); + headers + .push(Header::from_bytes(&b"Access-Control-Allow-Origin"[..], &b"*"[..]).unwrap()); headers.push( Header::from_bytes( &b"Access-Control-Allow-Methods"[..], &b"GET, POST, OPTIONS"[..], - ).unwrap() + ) + .unwrap(), ); headers.push( Header::from_bytes( &b"Access-Control-Allow-Headers"[..], &b"Content-Type, Authorization"[..], - ).unwrap() + ) + .unwrap(), ); } diff --git a/crates/aingle_minimal/src/rocks_storage.rs b/crates/aingle_minimal/src/rocks_storage.rs index acfd0f88..18a8aab9 100644 --- a/crates/aingle_minimal/src/rocks_storage.rs +++ b/crates/aingle_minimal/src/rocks_storage.rs @@ -99,9 +99,9 @@ impl RocksStorage { /// Get column family handle fn cf(&self, name: &str) -> Result<&ColumnFamily> { - self.db - .cf_handle(name) - .ok_or_else(|| crate::error::Error::storage(format!("Column family '{}' not found", name))) + self.db.cf_handle(name).ok_or_else(|| { + crate::error::Error::storage(format!("Column family '{}' not found", name)) + }) } /// Serialize key for actions (hash-based) @@ -136,7 +136,10 @@ impl RocksStorage { arr.copy_from_slice(&v[..8]); Some(i64::from_be_bytes(arr)) } else { - log::warn!("Corrupt sequence counter: expected 8 bytes, got {}", v.len()); + log::warn!( + "Corrupt sequence counter: expected 8 bytes, got {}", + v.len() + ); None } }) diff --git a/crates/aingle_minimal/src/wallet.rs b/crates/aingle_minimal/src/wallet.rs index 6303af01..d7c1cb85 100644 --- a/crates/aingle_minimal/src/wallet.rs +++ b/crates/aingle_minimal/src/wallet.rs @@ -667,9 +667,10 @@ impl ApduCommand { /// Serialize to bytes for transmission pub fn serialize(&self) -> std::result::Result, crate::error::Error> { if self.data.len() > 255 { - return Err(crate::error::Error::network( - format!("APDU data too large: {} bytes (max 255)", self.data.len()), - )); + return Err(crate::error::Error::network(format!( + "APDU data too large: {} bytes (max 255)", + self.data.len() + ))); } let mut bytes = Vec::with_capacity(5 + self.data.len()); bytes.push(self.cla); diff --git a/crates/aingle_minimal/tests/smart_node_integration_tests.rs b/crates/aingle_minimal/tests/smart_node_integration_tests.rs index bc152999..e6656850 100644 --- a/crates/aingle_minimal/tests/smart_node_integration_tests.rs +++ b/crates/aingle_minimal/tests/smart_node_integration_tests.rs @@ -12,9 +12,7 @@ use aingle_minimal::*; use kaneru::policy::Condition; -use kaneru::{ - Action, ActionType, AgentConfig, Goal, Observation, ObservationType, Policy, Rule, -}; +use kaneru::{Action, ActionType, AgentConfig, Goal, Observation, ObservationType, Policy, Rule}; /// Helper to create test configuration fn test_smart_config() -> SmartNodeConfig { diff --git a/crates/aingle_raft/Cargo.toml b/crates/aingle_raft/Cargo.toml index 7a275ee3..dafd9da0 100644 --- a/crates/aingle_raft/Cargo.toml +++ b/crates/aingle_raft/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "aingle_raft" -version = "0.6.3" +version = "0.7.0" description = "Raft consensus for AIngle clustering" license = "Apache-2.0 OR LicenseRef-Commercial" repository = "https://github.com/ApiliumCode/aingle" @@ -18,7 +18,7 @@ dag = ["aingle_graph/dag"] [dependencies] openraft = { version = "0.10.0-alpha.17", features = ["serde", "type-alias"] } -aingle_wal = { version = "0.6", path = "../aingle_wal" } +aingle_wal = { version = "0.7", path = "../aingle_wal" } serde = { version = "1", features = ["derive"] } serde_json = "1" tokio = { version = "1", features = ["full"] } @@ -28,8 +28,8 @@ tracing = "0.1" chrono = { version = "0.4", features = ["serde"] } futures-util = "0.3" anyerror = "0.1" -aingle_graph = { version = "0.6", path = "../aingle_graph", features = ["sled-backend"] } -ineru = { version = "0.6", path = "../ineru" } +aingle_graph = { version = "0.7", path = "../aingle_graph", features = ["sled-backend"] } +ineru = { version = "0.7", path = "../ineru" } [dev-dependencies] tempfile = "3.26" diff --git a/crates/aingle_raft/src/consistency.rs b/crates/aingle_raft/src/consistency.rs index cee877b3..816e8527 100644 --- a/crates/aingle_raft/src/consistency.rs +++ b/crates/aingle_raft/src/consistency.rs @@ -39,12 +39,30 @@ mod tests { #[test] fn test_from_header() { - assert_eq!(ConsistencyLevel::from_header("local"), ConsistencyLevel::Local); - assert_eq!(ConsistencyLevel::from_header("quorum"), ConsistencyLevel::Quorum); - assert_eq!(ConsistencyLevel::from_header("linearizable"), ConsistencyLevel::Linearizable); - assert_eq!(ConsistencyLevel::from_header("LOCAL"), ConsistencyLevel::Local); - assert_eq!(ConsistencyLevel::from_header("QUORUM"), ConsistencyLevel::Quorum); - assert_eq!(ConsistencyLevel::from_header("unknown"), ConsistencyLevel::Local); + assert_eq!( + ConsistencyLevel::from_header("local"), + ConsistencyLevel::Local + ); + assert_eq!( + ConsistencyLevel::from_header("quorum"), + ConsistencyLevel::Quorum + ); + assert_eq!( + ConsistencyLevel::from_header("linearizable"), + ConsistencyLevel::Linearizable + ); + assert_eq!( + ConsistencyLevel::from_header("LOCAL"), + ConsistencyLevel::Local + ); + assert_eq!( + ConsistencyLevel::from_header("QUORUM"), + ConsistencyLevel::Quorum + ); + assert_eq!( + ConsistencyLevel::from_header("unknown"), + ConsistencyLevel::Local + ); } #[test] diff --git a/crates/aingle_raft/src/lib.rs b/crates/aingle_raft/src/lib.rs index 04c867c8..4ff52aa9 100644 --- a/crates/aingle_raft/src/lib.rs +++ b/crates/aingle_raft/src/lib.rs @@ -6,12 +6,12 @@ //! Uses openraft for leader election and log replication, //! backed by the AIngle WAL for durable log storage. -pub mod types; +pub mod consistency; pub mod log_store; -pub mod state_machine; -pub mod snapshot_builder; pub mod network; -pub mod consistency; +pub mod snapshot_builder; +pub mod state_machine; +pub mod types; -pub use types::{CortexTypeConfig, CortexRequest, CortexResponse, CortexNode, NodeId}; pub use consistency::ConsistencyLevel; +pub use types::{CortexNode, CortexRequest, CortexResponse, CortexTypeConfig, NodeId}; diff --git a/crates/aingle_raft/src/log_store.rs b/crates/aingle_raft/src/log_store.rs index 82bbc8a0..d3dd4d27 100644 --- a/crates/aingle_raft/src/log_store.rs +++ b/crates/aingle_raft/src/log_store.rs @@ -69,7 +69,12 @@ impl CortexLogStore { let mut log = BTreeMap::new(); for wal_entry in &wal_entries { - if let WalEntryKind::RaftEntry { index, term: _, data } = &wal_entry.kind { + if let WalEntryKind::RaftEntry { + index, + term: _, + data, + } = &wal_entry.kind + { match serde_json::from_slice::(data) { Ok(entry) => { log.insert(*index, entry); @@ -248,7 +253,11 @@ impl CortexLogStore { // Write ALL to WAL first for (index, term, ref data, _) in &batch { self.wal - .append(WalEntryKind::RaftEntry { index: *index, term: *term, data: data.clone() }) + .append(WalEntryKind::RaftEntry { + index: *index, + term: *term, + data: data.clone(), + }) .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; } @@ -348,9 +357,12 @@ impl RaftLogStorage for Arc { { // Always invoke the callback, even on error, to prevent openraft hangs. let result = self.append_inner(entries).await; - callback.io_completed(result.as_ref().map(|_| ()).map_err(|e| { - io::Error::new(e.kind(), e.to_string()) - })); + callback.io_completed( + result + .as_ref() + .map(|_| ()) + .map_err(|e| io::Error::new(e.kind(), e.to_string())), + ); result } @@ -381,10 +393,7 @@ impl RaftLogStorage for Arc { async fn purge(&mut self, log_id: LogId) -> Result<(), io::Error> { let mut log = self.log.write().await; - let keys_to_remove: Vec = log - .range(..=log_id.index) - .map(|(k, _)| *k) - .collect(); + let keys_to_remove: Vec = log.range(..=log_id.index).map(|(k, _)| *k).collect(); for k in keys_to_remove { log.remove(&k); } @@ -409,10 +418,7 @@ mod tests { use openraft::vote::RaftLeaderId; fn make_entry(index: u64, term: u64) -> Entry { - Entry::new_blank(openraft::LogId::new( - CommittedLeaderId::new(term, 0), - index, - )) + Entry::new_blank(openraft::LogId::new(CommittedLeaderId::new(term, 0), index)) } #[tokio::test] @@ -434,10 +440,7 @@ mod tests { let entries = vec![make_entry(1, 1), make_entry(2, 1), make_entry(3, 1)]; - store_mut - .append(entries, IOFlushed::noop()) - .await - .unwrap(); + store_mut.append(entries, IOFlushed::noop()).await.unwrap(); let mut reader = store.clone(); let result = reader.try_get_log_entries(1..4).await.unwrap(); @@ -480,10 +483,7 @@ mod tests { make_entry(3, 1), make_entry(4, 1), ]; - store_mut - .append(entries, IOFlushed::noop()) - .await - .unwrap(); + store_mut.append(entries, IOFlushed::noop()).await.unwrap(); // Truncate after index 2 let lid = openraft::LogId::new(CommittedLeaderId::new(1, 0), 2); @@ -509,10 +509,7 @@ mod tests { make_entry(3, 1), make_entry(4, 1), ]; - store_mut - .append(entries, IOFlushed::noop()) - .await - .unwrap(); + store_mut.append(entries, IOFlushed::noop()).await.unwrap(); let lid = openraft::LogId::new(CommittedLeaderId::new(1, 0), 2); store_mut.truncate_after(Some(lid)).await.unwrap(); @@ -523,7 +520,11 @@ mod tests { let store = Arc::new(CortexLogStore::open(dir.path()).unwrap()); let mut reader = store.clone(); let result = reader.try_get_log_entries(1..5).await.unwrap(); - assert_eq!(result.len(), 2, "truncated entries must not survive restart"); + assert_eq!( + result.len(), + 2, + "truncated entries must not survive restart" + ); } } @@ -533,15 +534,8 @@ mod tests { let store = Arc::new(CortexLogStore::open(dir.path()).unwrap()); let mut store_mut = store.clone(); - let entries = vec![ - make_entry(1, 1), - make_entry(2, 1), - make_entry(3, 1), - ]; - store_mut - .append(entries, IOFlushed::noop()) - .await - .unwrap(); + let entries = vec![make_entry(1, 1), make_entry(2, 1), make_entry(3, 1)]; + store_mut.append(entries, IOFlushed::noop()).await.unwrap(); let purge_id = openraft::LogId::new(CommittedLeaderId::new(1, 0), 2); store_mut.purge(purge_id).await.unwrap(); @@ -561,15 +555,8 @@ mod tests { let store = Arc::new(CortexLogStore::open(dir.path()).unwrap()); let mut store_mut = store.clone(); - let entries = vec![ - make_entry(1, 1), - make_entry(2, 1), - make_entry(3, 1), - ]; - store_mut - .append(entries, IOFlushed::noop()) - .await - .unwrap(); + let entries = vec![make_entry(1, 1), make_entry(2, 1), make_entry(3, 1)]; + store_mut.append(entries, IOFlushed::noop()).await.unwrap(); let purge_id = openraft::LogId::new(CommittedLeaderId::new(1, 0), 2); store_mut.purge(purge_id).await.unwrap(); @@ -601,10 +588,7 @@ mod tests { let mut store_mut = store.clone(); let entries = vec![make_entry(1, 1), make_entry(2, 1)]; - store_mut - .append(entries, IOFlushed::noop()) - .await - .unwrap(); + store_mut.append(entries, IOFlushed::noop()).await.unwrap(); } // Reopen and verify entries are recovered @@ -644,10 +628,7 @@ mod tests { let mut store_mut = store.clone(); let entries = vec![make_entry(1, 1), make_entry(2, 1)]; - store_mut - .append(entries, IOFlushed::noop()) - .await - .unwrap(); + store_mut.append(entries, IOFlushed::noop()).await.unwrap(); let state = store_mut.get_log_state().await.unwrap(); assert!(state.last_purged_log_id.is_none()); diff --git a/crates/aingle_raft/src/network.rs b/crates/aingle_raft/src/network.rs index 4fcb62a4..2ad84f8f 100644 --- a/crates/aingle_raft/src/network.rs +++ b/crates/aingle_raft/src/network.rs @@ -166,18 +166,15 @@ impl RaftNetworkFactory for CortexNetworkFactory { async fn new_client(&mut self, target: NodeId, node: &CortexNode) -> Self::Network { // Use REST address for HTTP-based Raft RPC routing. // Fallback is constructed infallibly (no parse) to avoid panics. - let addr: SocketAddr = node - .rest_addr - .parse() - .unwrap_or_else(|e| { - tracing::warn!( - target_node = target, - addr = %node.rest_addr, - error = %e, - "Invalid REST address for Raft peer, falling back to localhost:19090" - ); - SocketAddr::from(([127, 0, 0, 1], 19090)) - }); + let addr: SocketAddr = node.rest_addr.parse().unwrap_or_else(|e| { + tracing::warn!( + target_node = target, + addr = %node.rest_addr, + error = %e, + "Invalid REST address for Raft peer, falling back to localhost:19090" + ); + SocketAddr::from(([127, 0, 0, 1], 19090)) + }); CortexNetworkConnection { target, @@ -284,18 +281,15 @@ impl RaftNetworkV2 for CortexNetworkConnection { "meta": snapshot.meta, "data": snapshot.snapshot.into_inner(), }); - let payload = serde_json::to_vec(&snap_data).map_err(|e| { - StreamingError::Unreachable(Unreachable::new(&AnyError::error(e))) - })?; + let payload = serde_json::to_vec(&snap_data) + .map_err(|e| StreamingError::Unreachable(Unreachable::new(&AnyError::error(e))))?; // Use chunked transfer for payloads > 1MB to avoid timeouts // and reduce memory pressure on the receiver. const CHUNK_THRESHOLD: usize = 1024 * 1024; // 1MB if payload.len() > CHUNK_THRESHOLD { - return self - .send_chunked_snapshot(&payload, option) - .await; + return self.send_chunked_snapshot(&payload, option).await; } // Small snapshot: send monolithic @@ -327,7 +321,6 @@ impl RaftNetworkV2 for CortexNetworkConnection { ))), } } - } impl CortexNetworkConnection { @@ -390,9 +383,7 @@ impl CortexNetworkConnection { "Snapshot chunk at offset {offset} timed out after {per_chunk_timeout:?}" )))) })? - .map_err(|e| { - StreamingError::Unreachable(Unreachable::new(&AnyError::error(e))) - })?; + .map_err(|e| StreamingError::Unreachable(Unreachable::new(&AnyError::error(e))))?; match response { // Final chunk returns the install response diff --git a/crates/aingle_raft/src/snapshot_builder.rs b/crates/aingle_raft/src/snapshot_builder.rs index 4851fb2c..d3dff2e8 100644 --- a/crates/aingle_raft/src/snapshot_builder.rs +++ b/crates/aingle_raft/src/snapshot_builder.rs @@ -97,10 +97,7 @@ impl RaftSnapshotBuilder for CortexSnapshotBuilder { .to_bytes() .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; - let snapshot_id = format!( - "snap-{}-{}", - last_applied_term, last_applied_index - ); + let snapshot_id = format!("snap-{}-{}", last_applied_term, last_applied_index); let meta = SnapshotMeta { last_log_id: self.last_applied.clone(), @@ -152,10 +149,7 @@ mod tests { let mut builder = CortexSnapshotBuilder { graph: Arc::new(RwLock::new(graph)), memory: Arc::new(RwLock::new(memory)), - last_applied: Some(openraft::LogId::new( - CommittedLeaderId::new(1, 0), - 5, - )), + last_applied: Some(openraft::LogId::new(CommittedLeaderId::new(1, 0), 5)), last_membership: openraft::StoredMembership::default(), proof_provider: None, }; diff --git a/crates/aingle_raft/src/state_machine.rs b/crates/aingle_raft/src/state_machine.rs index 06551e98..64ec27e4 100644 --- a/crates/aingle_raft/src/state_machine.rs +++ b/crates/aingle_raft/src/state_machine.rs @@ -219,9 +219,7 @@ impl CortexStateMachine { id: None, } } - WalEntryKind::DagAction { action_bytes } => { - self.apply_dag_action(action_bytes).await - } + WalEntryKind::DagAction { action_bytes } => self.apply_dag_action(action_bytes).await, _ => CortexResponse { success: true, detail: None, @@ -248,9 +246,7 @@ impl CortexStateMachine { }; // Reject unsigned actions (Genesis exempt — system-generated at init) - if action.signature.is_none() - && !matches!(action.payload, DagPayload::Genesis { .. }) - { + if action.signature.is_none() && !matches!(action.payload, DagPayload::Genesis { .. }) { tracing::warn!( seq = action.seq, author = %action.author, @@ -285,9 +281,8 @@ impl CortexStateMachine { DagPayload::TripleInsert { triples } => { let graph = self.graph.read().await; for t in triples { - let value = json_to_value( - &serde_json::to_value(&t.object).unwrap_or_default(), - ); + let value = + json_to_value(&serde_json::to_value(&t.object).unwrap_or_default()); let triple = aingle_graph::Triple::new( aingle_graph::NodeId::named(&t.subject), aingle_graph::Predicate::named(&t.predicate), @@ -320,10 +315,15 @@ impl CortexStateMachine { ); } aingle_graph::dag::MemoryOpKind::Forget { memory_id } => { - tracing::debug!(memory_id, "DagAction MemoryOp::Forget recorded (audit only)"); + tracing::debug!( + memory_id, + "DagAction MemoryOp::Forget recorded (audit only)" + ); } aingle_graph::dag::MemoryOpKind::Consolidate => { - tracing::debug!("DagAction MemoryOp::Consolidate recorded (audit only)"); + tracing::debug!( + "DagAction MemoryOp::Consolidate recorded (audit only)" + ); } } } @@ -360,24 +360,42 @@ impl CortexStateMachine { // Audit-only: no graph mutation needed } DagPayload::Batch { .. } => { - tracing::warn!("Nested Batch inside Batch — skipping to avoid recursion"); + tracing::warn!( + "Nested Batch inside Batch — skipping to avoid recursion" + ); } } } } - DagPayload::Genesis { triple_count, description } => { + DagPayload::Genesis { + triple_count, + description, + } => { + tracing::info!(triple_count, description, "Applied DagAction::Genesis"); + } + DagPayload::Compact { + pruned_count, + retained_count, + ref policy, + } => { tracing::info!( - triple_count, - description, - "Applied DagAction::Genesis" + pruned_count, + retained_count, + policy, + "Applied DagAction::Compact" ); } - DagPayload::Compact { pruned_count, retained_count, ref policy } => { - tracing::info!(pruned_count, retained_count, policy, "Applied DagAction::Compact"); - } DagPayload::Noop => {} - DagPayload::Custom { ref payload_type, ref payload_summary, .. } => { - tracing::info!(payload_type, payload_summary, "Applied DagAction::Custom (audit only)"); + DagPayload::Custom { + ref payload_type, + ref payload_summary, + .. + } => { + tracing::info!( + payload_type, + payload_summary, + "Applied DagAction::Custom (audit only)" + ); } } @@ -426,9 +444,7 @@ impl CortexStateMachine { impl RaftStateMachine for Arc { type SnapshotBuilder = CortexSnapshotBuilder; - async fn applied_state( - &mut self, - ) -> Result<(Option, StoredMembershipOf), io::Error> { + async fn applied_state(&mut self) -> Result<(Option, StoredMembershipOf), io::Error> { let la = self.last_applied.read().await; let membership = self.last_membership.read().await; Ok((la.clone(), membership.clone())) @@ -436,9 +452,7 @@ impl RaftStateMachine for Arc { async fn apply(&mut self, mut entries: Strm) -> Result<(), io::Error> where - Strm: futures_util::Stream, io::Error>> - + Unpin - + Send, + Strm: futures_util::Stream, io::Error>> + Unpin + Send, { while let Some(item) = entries.next().await { let (entry, responder) = item?; @@ -456,9 +470,7 @@ impl RaftStateMachine for Arc { detail: None, id: None, }, - openraft::EntryPayload::Normal(ref req) => { - self.apply_mutation(&req.kind).await - } + openraft::EntryPayload::Normal(ref req) => self.apply_mutation(&req.kind).await, openraft::EntryPayload::Membership(_) => CortexResponse { success: true, detail: None, @@ -509,8 +521,8 @@ impl RaftStateMachine for Arc { // Build both new graph and new memory into temporaries FIRST, // then swap atomically only if both succeed (#7). - let new_graph = GraphDB::memory() - .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?; + let new_graph = + GraphDB::memory().map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?; for ts in &cluster_snap.triples { let value = json_to_value(&ts.object); let triple = aingle_graph::Triple::new( @@ -525,9 +537,12 @@ impl RaftStateMachine for Arc { let new_memory = if !cluster_snap.ineru_ltm.is_empty() { Some( - IneruMemory::import_snapshot(&cluster_snap.ineru_ltm) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, - format!("Failed to restore Ineru from snapshot: {e}")))? + IneruMemory::import_snapshot(&cluster_snap.ineru_ltm).map_err(|e| { + io::Error::new( + io::ErrorKind::InvalidData, + format!("Failed to restore Ineru from snapshot: {e}"), + ) + })?, ) } else { None @@ -775,10 +790,7 @@ mod tests { fn make_graph_and_memory() -> (Arc>, Arc>) { let graph = GraphDB::memory().unwrap(); let memory = IneruMemory::agent_mode(); - ( - Arc::new(RwLock::new(graph)), - Arc::new(RwLock::new(memory)), - ) + (Arc::new(RwLock::new(graph)), Arc::new(RwLock::new(memory))) } #[tokio::test] @@ -954,7 +966,11 @@ mod tests { // Verify: old data cleared, only snapshot data present let g = graph.read().await; - assert_eq!(g.count(), 1, "old data should be cleared, only snapshot data remains"); + assert_eq!( + g.count(), + 1, + "old data should be cleared, only snapshot data remains" + ); let triples = g.find(aingle_graph::TriplePattern::any()).unwrap(); let subject_str = triples[0].subject.to_string(); assert!( @@ -1024,7 +1040,10 @@ mod tests { // Verify checksum was written into serialized data let raw: serde_json::Value = serde_json::from_slice(&bytes).unwrap(); let checksum = raw["checksum"].as_str().unwrap(); - assert!(!checksum.is_empty(), "checksum should be set after to_bytes"); + assert!( + !checksum.is_empty(), + "checksum should be set after to_bytes" + ); // Valid roundtrip succeeds let restored = ClusterSnapshot::from_bytes(&bytes).unwrap(); @@ -1097,6 +1116,9 @@ mod tests { }; let bytes = serde_json::to_vec(&snap).unwrap(); let result = ClusterSnapshot::from_bytes(&bytes); - assert!(result.is_ok(), "empty checksum should be accepted for backward compat"); + assert!( + result.is_ok(), + "empty checksum should be accepted for backward compat" + ); } } diff --git a/crates/aingle_raft/src/types.rs b/crates/aingle_raft/src/types.rs index c15fd068..d8a62b9e 100644 --- a/crates/aingle_raft/src/types.rs +++ b/crates/aingle_raft/src/types.rs @@ -51,7 +51,11 @@ pub struct CortexNode { impl fmt::Display for CortexNode { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "CortexNode(rest={}, p2p={})", self.rest_addr, self.p2p_addr) + write!( + f, + "CortexNode(rest={}, p2p={})", + self.rest_addr, self.p2p_addr + ) } } diff --git a/crates/aingle_viz/Cargo.toml b/crates/aingle_viz/Cargo.toml index 982b3a77..95b09b19 100644 --- a/crates/aingle_viz/Cargo.toml +++ b/crates/aingle_viz/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "aingle_viz" -version = "0.6.3" +version = "0.7.0" description = "DAG Visualization for AIngle - Web-based graph explorer" license = "Apache-2.0 OR LicenseRef-Commercial" repository = "https://github.com/ApiliumCode/aingle" @@ -30,8 +30,8 @@ serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" # Graph data -aingle_graph = { version = "0.6", path = "../aingle_graph" } -aingle_minimal = { version = "0.6", path = "../aingle_minimal", default-features = false, features = ["sqlite"] } +aingle_graph = { version = "0.7", path = "../aingle_graph" } +aingle_minimal = { version = "0.7", path = "../aingle_minimal", default-features = false, features = ["sqlite"] } # Utilities log = "0.4" diff --git a/crates/aingle_viz/web/assets/logo-legacy.svg b/crates/aingle_viz/web/assets/logo-legacy.svg new file mode 100644 index 00000000..e55a326d --- /dev/null +++ b/crates/aingle_viz/web/assets/logo-legacy.svg @@ -0,0 +1,80 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/crates/aingle_viz/web/assets/logo.svg b/crates/aingle_viz/web/assets/logo.svg index e55a326d..b1651e21 100644 --- a/crates/aingle_viz/web/assets/logo.svg +++ b/crates/aingle_viz/web/assets/logo.svg @@ -1,80 +1,11 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + diff --git a/crates/aingle_wal/Cargo.toml b/crates/aingle_wal/Cargo.toml index 05d31a95..3242a041 100644 --- a/crates/aingle_wal/Cargo.toml +++ b/crates/aingle_wal/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "aingle_wal" -version = "0.6.3" +version = "0.7.0" description = "Write-Ahead Log for AIngle clustering and replication" license = "Apache-2.0 OR LicenseRef-Commercial" repository = "https://github.com/ApiliumCode/aingle" diff --git a/crates/aingle_wal/src/entry.rs b/crates/aingle_wal/src/entry.rs index bb35235a..5dccf4b6 100644 --- a/crates/aingle_wal/src/entry.rs +++ b/crates/aingle_wal/src/entry.rs @@ -23,7 +23,12 @@ pub struct WalEntry { impl WalEntry { /// Compute the hash for this entry's payload (kind + seq + timestamp + prev_hash). - pub fn compute_hash(seq: u64, timestamp: &DateTime, kind: &WalEntryKind, prev_hash: &[u8; 32]) -> [u8; 32] { + pub fn compute_hash( + seq: u64, + timestamp: &DateTime, + kind: &WalEntryKind, + prev_hash: &[u8; 32], + ) -> [u8; 32] { let mut hasher = blake3::Hasher::new(); hasher.update(&seq.to_le_bytes()); hasher.update(timestamp.to_rfc3339().as_bytes()); @@ -47,9 +52,7 @@ pub enum WalEntryKind { triple_id: [u8; 32], }, /// Triple deleted from GraphDB. - TripleDelete { - triple_id: [u8; 32], - }, + TripleDelete { triple_id: [u8; 32] }, /// Memory entry stored in Ineru STM. MemoryStore { memory_id: String, @@ -58,13 +61,9 @@ pub enum WalEntryKind { importance: f32, }, /// Memory entry forgotten. - MemoryForget { - memory_id: String, - }, + MemoryForget { memory_id: String }, /// STM → LTM consolidation occurred. - MemoryConsolidate { - consolidated_count: usize, - }, + MemoryConsolidate { consolidated_count: usize }, /// Proof submitted. ProofSubmit { proof_id: String, @@ -90,9 +89,7 @@ pub enum WalEntryKind { weight: f32, }, /// LTM entity deleted (for Ineru replication). - LtmEntityDelete { - entity_id: String, - }, + LtmEntityDelete { entity_id: String }, /// Serialized openraft Raft log entry. RaftEntry { index: u64, @@ -128,7 +125,9 @@ mod tests { #[test] fn test_compute_hash_deterministic() { let ts = Utc::now(); - let kind = WalEntryKind::TripleDelete { triple_id: [1u8; 32] }; + let kind = WalEntryKind::TripleDelete { + triple_id: [1u8; 32], + }; let prev = [0u8; 32]; let h1 = WalEntry::compute_hash(1, &ts, &kind, &prev); @@ -139,7 +138,9 @@ mod tests { #[test] fn test_compute_hash_differs_on_seq() { let ts = Utc::now(); - let kind = WalEntryKind::TripleDelete { triple_id: [1u8; 32] }; + let kind = WalEntryKind::TripleDelete { + triple_id: [1u8; 32], + }; let prev = [0u8; 32]; let h1 = WalEntry::compute_hash(1, &ts, &kind, &prev); diff --git a/crates/aingle_wal/src/reader.rs b/crates/aingle_wal/src/reader.rs index 472092ba..9f2398c2 100644 --- a/crates/aingle_wal/src/reader.rs +++ b/crates/aingle_wal/src/reader.rs @@ -76,12 +76,8 @@ impl WalReader { let entry = &entries[i]; // Verify this entry's hash - let expected_hash = WalEntry::compute_hash( - entry.seq, - &entry.timestamp, - &entry.kind, - &entry.prev_hash, - ); + let expected_hash = + WalEntry::compute_hash(entry.seq, &entry.timestamp, &entry.kind, &entry.prev_hash); if entry.hash != expected_hash { return Ok(VerifyResult { valid: false, diff --git a/crates/aingle_wal/src/segment.rs b/crates/aingle_wal/src/segment.rs index b5bcd1e8..26f4aab5 100644 --- a/crates/aingle_wal/src/segment.rs +++ b/crates/aingle_wal/src/segment.rs @@ -55,12 +55,11 @@ impl WalSegment { .strip_prefix("wal-") .and_then(|s| s.strip_suffix(".seg")) .and_then(|s| s.parse::().ok()) - .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "invalid segment filename"))?; + .ok_or_else(|| { + io::Error::new(io::ErrorKind::InvalidInput, "invalid segment filename") + })?; - let file = OpenOptions::new() - .create(true) - .append(true) - .open(path)?; + let file = OpenOptions::new().create(true).append(true).open(path)?; let size_bytes = file.metadata()?.len(); @@ -84,8 +83,8 @@ impl WalSegment { /// Append a WAL entry to the segment. pub fn append(&mut self, entry: &WalEntry) -> io::Result<()> { - let payload = serde_json::to_vec(entry) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + let payload = + serde_json::to_vec(entry).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; let len = payload.len() as u32; self.file.write_all(&len.to_be_bytes())?; self.file.write_all(&payload)?; diff --git a/crates/aingle_wal/src/writer.rs b/crates/aingle_wal/src/writer.rs index 6440a4dc..13b4cae1 100644 --- a/crates/aingle_wal/src/writer.rs +++ b/crates/aingle_wal/src/writer.rs @@ -73,8 +73,12 @@ impl WalWriter { let timestamp = Utc::now(); let prev_hash = { - let guard = self.last_hash.lock() - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("WAL last_hash lock poisoned: {e}")))?; + let guard = self.last_hash.lock().map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("WAL last_hash lock poisoned: {e}"), + ) + })?; *guard }; @@ -89,8 +93,12 @@ impl WalWriter { }; { - let mut seg = self.current_segment.lock() - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("WAL segment lock poisoned: {e}")))?; + let mut seg = self.current_segment.lock().map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("WAL segment lock poisoned: {e}"), + ) + })?; seg.append(&entry)?; seg.sync()?; @@ -104,8 +112,12 @@ impl WalWriter { // Update last_hash { - let mut guard = self.last_hash.lock() - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("WAL last_hash lock poisoned: {e}")))?; + let mut guard = self.last_hash.lock().map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("WAL last_hash lock poisoned: {e}"), + ) + })?; *guard = entry.hash; } @@ -114,15 +126,23 @@ impl WalWriter { /// Flush the current segment to disk. pub fn sync(&self) -> io::Result<()> { - let mut seg = self.current_segment.lock() - .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("WAL segment lock poisoned: {e}")))?; + let mut seg = self.current_segment.lock().map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("WAL segment lock poisoned: {e}"), + ) + })?; seg.sync() } /// The next sequence number that will be assigned. pub fn last_seq(&self) -> u64 { let next = self.next_seq.load(Ordering::SeqCst); - if next == 0 { 0 } else { next - 1 } + if next == 0 { + 0 + } else { + next - 1 + } } /// Get the WAL directory path. diff --git a/crates/aingle_zk/Cargo.toml b/crates/aingle_zk/Cargo.toml index a911f652..9c3c70e0 100644 --- a/crates/aingle_zk/Cargo.toml +++ b/crates/aingle_zk/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "aingle_zk" -version = "0.6.3" +version = "0.7.0" description = "Zero-Knowledge Proofs for AIngle - privacy-preserving cryptographic primitives" license = "Apache-2.0 OR LicenseRef-Commercial" repository = "https://github.com/ApiliumCode/aingle" diff --git a/crates/ineru/Cargo.toml b/crates/ineru/Cargo.toml index 272c3d75..36a4189c 100644 --- a/crates/ineru/Cargo.toml +++ b/crates/ineru/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ineru" -version = "0.6.3" +version = "0.7.0" description = "Ineru: Neural-inspired memory system for AIngle AI agents" license = "Apache-2.0 OR LicenseRef-Commercial" repository = "https://github.com/ApiliumCode/aingle" @@ -21,6 +21,9 @@ persistent = [] wasm = [] # Compression for memory entries compression = [] +# Real neural embeddings via fastembed (ONNX). ort loaded dynamically at +# runtime from a controlled path (no build-time binary download, no network). +neural-embeddings = ["dep:fastembed"] [dependencies] # Serialization @@ -40,6 +43,11 @@ log = "0.4" # Optional: SQLite for persistent LTM (matching workspace version) rusqlite = { version = "0.32", default-features = false, features = ["bundled"], optional = true } +# Optional: real neural embeddings (multilingual-e5-small via ONNX). +# default-features off → no hf-hub network deps; ort-load-dynamic → onnxruntime +# is loaded from a runtime path we ship, not downloaded/linked at build time. +fastembed = { version = "5", default-features = false, features = ["ort-load-dynamic"], optional = true } + [dev-dependencies] criterion = "0.5" diff --git a/crates/ineru/benches/memory_bench.rs b/crates/ineru/benches/memory_bench.rs index 78a282cf..25157415 100644 --- a/crates/ineru/benches/memory_bench.rs +++ b/crates/ineru/benches/memory_bench.rs @@ -7,7 +7,7 @@ use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; use ineru::{ - ConsolidationConfig, LtmConfig, MemoryConfig, MemoryEntry, MemoryQuery, StmConfig, IneruMemory, + ConsolidationConfig, IneruMemory, LtmConfig, MemoryConfig, MemoryEntry, MemoryQuery, StmConfig, }; /// Benchmark STM store operations diff --git a/crates/ineru/src/consolidation.rs b/crates/ineru/src/consolidation.rs index 4e057ae3..cec64b06 100644 --- a/crates/ineru/src/consolidation.rs +++ b/crates/ineru/src/consolidation.rs @@ -199,7 +199,7 @@ pub struct ConsolidationStats { } /// Defines the strategy used to select memories for consolidation. -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Default, Clone, Copy)] pub enum ConsolidationStrategy { /// Consolidates memories that are accessed most frequently. FrequencyBased, @@ -208,15 +208,10 @@ pub enum ConsolidationStrategy { /// Consolidates memories that are semantically novel compared to existing LTM content. NoveltyBased, /// A default strategy that combines importance, frequency, recency, and novelty. + #[default] Combined, } -impl Default for ConsolidationStrategy { - fn default() -> Self { - Self::Combined - } -} - /// An advanced consolidator that can apply different strategies for selecting memories. /// /// This provides more flexible control over the consolidation process than the basic `Consolidator`. @@ -259,7 +254,7 @@ impl AdvancedConsolidator { .cloned() .collect(); - candidates.sort_by(|a, b| b.metadata.access_count.cmp(&a.metadata.access_count)); + candidates.sort_by_key(|b| std::cmp::Reverse(b.metadata.access_count)); let mut count = 0; for entry in candidates.into_iter().take(self.base.config.batch_size) { diff --git a/crates/ineru/src/embedder.rs b/crates/ineru/src/embedder.rs new file mode 100644 index 00000000..e3508d64 --- /dev/null +++ b/crates/ineru/src/embedder.rs @@ -0,0 +1,276 @@ +//! Text-to-embedding strategies. +//! +//! [`Embedder`] is the unit callers own and inject. Implementations may hold a +//! loaded model (stateful) and may block, so embedding is *not* baked into data +//! structures like `MemoryQuery`. + +use crate::types::Embedding; + +/// Produces semantic embeddings for text. +/// +/// `embed_passage` is for documents/chunks that get stored and searched against; +/// `embed_query` is for search queries. They are distinct because some models +/// (e.g. the E5 family) are trained with asymmetric prefixes, so the right one +/// must be applied at each call site. +pub trait Embedder: Send + Sync { + /// Embed a document/chunk to be stored and searched against. + fn embed_passage(&self, text: &str) -> Embedding; + /// Embed a search query. + fn embed_query(&self, text: &str) -> Embedding; + /// Dimensionality of the vectors this embedder produces. + fn dimensions(&self) -> usize; + /// `(strong, weak)` cosine-similarity cutoffs for this embedder's score + /// distribution: at/above `strong` a match corroborates; below `weak` it is + /// noise. The default suits the lexical-hash scale; model-based embedders + /// override it. + fn relevance_thresholds(&self) -> (f32, f32) { + (0.55, 0.30) + } +} + +/// 64-dimensional fallback embedder built on the lexical hash scheme +/// (`Embedding::from_text_simple`). Always available; captures lexical overlap, +/// not meaning. The hash scheme is symmetric, so passage and query embeddings +/// are identical and no prefixes are applied. +#[derive(Debug, Default, Clone, Copy)] +pub struct HashEmbedder; + +impl HashEmbedder { + /// Creates a new `HashEmbedder`. + pub fn new() -> Self { + Self + } +} + +impl Embedder for HashEmbedder { + fn embed_passage(&self, text: &str) -> Embedding { + Embedding::from_text_simple(text) + } + + fn embed_query(&self, text: &str) -> Embedding { + Embedding::from_text_simple(text) + } + + fn dimensions(&self) -> usize { + 64 + } +} + +#[cfg(feature = "neural-embeddings")] +use std::path::Path; +#[cfg(feature = "neural-embeddings")] +use std::sync::Mutex; + +#[cfg(feature = "neural-embeddings")] +use fastembed::{ + InitOptionsUserDefined, Pooling, TextEmbedding, TokenizerFiles, UserDefinedEmbeddingModel, +}; + +/// Real neural embedder: multilingual-e5-small (384-dim) via fastembed/ONNX, +/// loaded entirely from a local directory (no network). E5 is trained with +/// asymmetric prefixes, so `embed_query` prepends `"query: "` and +/// `embed_passage` prepends `"passage: "`. +/// +/// fastembed's `embed` takes `&mut self`, so the model is held behind a `Mutex` +/// to satisfy the `&self` trait methods while staying `Send + Sync`. +/// Concurrent callers serialize through this lock for the duration of inference. +#[cfg(feature = "neural-embeddings")] +pub struct NeuralEmbedder { + model: Mutex, +} + +#[cfg(feature = "neural-embeddings")] +impl NeuralEmbedder { + /// Output dimensionality of multilingual-e5-small. + const DIM: usize = 384; + + /// Loads the model from a directory containing `onnx/model.onnx`, + /// `tokenizer.json`, `config.json`, `special_tokens_map.json`, and + /// `tokenizer_config.json`. Returns an error (never panics) if any file is + /// missing or the model fails to initialize, so callers can fall back. + pub fn from_path(dir: &Path) -> crate::Result { + let read = |name: &str| -> crate::Result> { + std::fs::read(dir.join(name)) + .map_err(|e| crate::Error::Storage(format!("reading {name}: {e}"))) + }; + + let onnx = read("onnx/model.onnx")?; + let tokenizer_files = TokenizerFiles { + tokenizer_file: read("tokenizer.json")?, + config_file: read("config.json")?, + special_tokens_map_file: read("special_tokens_map.json")?, + tokenizer_config_file: read("tokenizer_config.json")?, + }; + + // E5 REQUIRES mean pooling; the fastembed default is Cls. + let model = + UserDefinedEmbeddingModel::new(onnx, tokenizer_files).with_pooling(Pooling::Mean); + let options = InitOptionsUserDefined::new().with_max_length(512); + + let embedding = TextEmbedding::try_new_from_user_defined(model, options) + .map_err(|e| crate::Error::Internal(format!("init e5: {e}")))?; + + Ok(Self { + model: Mutex::new(embedding), + }) + } + + fn embed_one(&self, prefixed: String) -> Embedding { + let mut guard = self.model.lock().expect("embedder mutex poisoned"); + let out = guard.embed(vec![prefixed], None).expect("e5 embed failed"); + let vector = out + .into_iter() + .next() + .expect("e5 returned empty batch for single-item input"); + Embedding::new(vector) + } +} + +#[cfg(feature = "neural-embeddings")] +impl Embedder for NeuralEmbedder { + fn embed_passage(&self, text: &str) -> Embedding { + self.embed_one(format!("passage: {text}")) + } + + fn embed_query(&self, text: &str) -> Embedding { + self.embed_one(format!("query: {text}")) + } + + fn dimensions(&self) -> usize { + Self::DIM + } + + fn relevance_thresholds(&self) -> (f32, f32) { + (0.80, 0.77) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn hash_embedder_has_64_dimensions() { + let e = HashEmbedder::new(); + assert_eq!(e.dimensions(), 64); + } + + #[test] + fn hash_embedder_produces_64_dim_vectors() { + let e = HashEmbedder::new(); + let p = e.embed_passage("the quick brown fox"); + let q = e.embed_query("the quick brown fox"); + assert_eq!(p.0.len(), 64); + assert_eq!(q.0.len(), 64); + } + + #[test] + fn hash_embedder_is_deterministic() { + let e = HashEmbedder::new(); + let a = e.embed_passage("hello world"); + let b = e.embed_passage("hello world"); + assert_eq!(a.0, b.0); + } + + #[test] + fn hash_embedder_passage_and_query_are_identical() { + let e = HashEmbedder::new(); + let p = e.embed_passage("test input"); + let q = e.embed_query("test input"); + assert_eq!(p.0, q.0); + } + + #[test] + fn hash_embedder_relevance_thresholds() { + let e = HashEmbedder::new(); + assert_eq!(e.relevance_thresholds(), (0.55, 0.30)); + } +} + +#[cfg(all(test, feature = "neural-embeddings"))] +mod neural_tests { + use super::*; + use std::path::PathBuf; + + /// Returns the model dir, or `None` (test skips) if it isn't present. + fn model_dir() -> Option { + let dir = std::env::var("INERU_E5_MODEL_DIR").unwrap_or_else(|_| { + concat!( + env!("CARGO_MANIFEST_DIR"), + "/test-models/multilingual-e5-small" + ) + .to_string() + }); + let p = PathBuf::from(dir); + if p.join("onnx/model.onnx").exists() { + Some(p) + } else { + eprintln!("skipping: model files not found at {}", p.display()); + None + } + } + + #[test] + fn neural_embedder_reports_384_dimensions() { + let Some(dir) = model_dir() else { return }; + let e = NeuralEmbedder::from_path(&dir).expect("load model"); + assert_eq!(e.dimensions(), 384); + } + + #[test] + fn neural_embedder_produces_384_dim_vectors() { + let Some(dir) = model_dir() else { return }; + let e = NeuralEmbedder::from_path(&dir).expect("load model"); + let v = e.embed_passage("el perro corre en el parque"); + assert_eq!(v.0.len(), 384); + assert!(v.0.iter().any(|x| *x != 0.0)); + } + + #[test] + fn neural_embedder_captures_semantic_similarity() { + let Some(dir) = model_dir() else { return }; + let e = NeuralEmbedder::from_path(&dir).expect("load model"); + + // E5 is trained for sentence/passage retrieval, which is exactly how this + // embedder is used (queries = questions, chunks = sentences). Isolated + // single words cluster too tightly to test meaningfully; realistic + // sentence-level inputs produce a clear semantic margin. + let query = e.embed_query("¿Cómo debo cuidar a mi perro?"); + let related = e.embed_passage( + "Los perros necesitan paseos diarios, agua fresca y una dieta equilibrada.", + ); + let unrelated = e.embed_passage( + "La bolsa de valores cerró hoy con fuertes pérdidas para los inversores.", + ); + + let near = query.cosine_similarity(&related); + let far = query.cosine_similarity(&unrelated); + + // A real model ranks the dog-care passage above the stock-market one for a + // dog-care question. The 64-dim hash embedder cannot. + assert!( + near > far, + "expected sim(query,related)={near} > sim(query,unrelated)={far}" + ); + } + + #[test] + fn neural_embedder_applies_distinct_prefixes() { + let Some(dir) = model_dir() else { return }; + let e = NeuralEmbedder::from_path(&dir).expect("load model"); + + // Same raw text, different prefixes → different vectors. + let as_query = e.embed_query("documento"); + let as_passage = e.embed_passage("documento"); + assert_ne!(as_query.0, as_passage.0); + } + + #[test] + fn neural_embedder_relevance_thresholds() { + // Calibrated to multilingual-e5-small's anisotropic cosine scale: + // unrelated sentence pairs ceil ~0.76, related floor ~0.81. + let Some(dir) = model_dir() else { return }; + let e = NeuralEmbedder::from_path(&dir).expect("load model"); + assert_eq!(e.relevance_thresholds(), (0.80, 0.77)); + } +} diff --git a/crates/ineru/src/hnsw.rs b/crates/ineru/src/hnsw.rs index e88bd553..55f7bc84 100644 --- a/crates/ineru/src/hnsw.rs +++ b/crates/ineru/src/hnsw.rs @@ -9,8 +9,8 @@ use crate::error::{Error, Result}; use crate::types::MemoryId; use serde::{Deserialize, Serialize}; -use std::collections::{BinaryHeap, HashMap, HashSet}; use std::cmp::Ordering; +use std::collections::{BinaryHeap, HashMap, HashSet}; // ============================================================================ // Config @@ -82,7 +82,10 @@ impl Eq for Candidate {} impl Ord for Candidate { fn cmp(&self, other: &Self) -> Ordering { // Reverse for min-heap behavior (BinaryHeap is max-heap) - other.distance.partial_cmp(&self.distance).unwrap_or(Ordering::Equal) + other + .distance + .partial_cmp(&self.distance) + .unwrap_or(Ordering::Equal) } } @@ -109,7 +112,9 @@ impl Eq for MaxCandidate {} impl Ord for MaxCandidate { fn cmp(&self, other: &Self) -> Ordering { - self.distance.partial_cmp(&other.distance).unwrap_or(Ordering::Equal) + self.distance + .partial_cmp(&other.distance) + .unwrap_or(Ordering::Equal) } } @@ -259,7 +264,8 @@ impl HnswIndex { /// Rebuild the index, removing deleted points. pub fn rebuild(&mut self) { - let active_points: Vec<(MemoryId, Vec)> = self.points + let active_points: Vec<(MemoryId, Vec)> = self + .points .iter() .filter(|p| !p.deleted) .map(|p| (p.id.clone(), p.embedding.clone())) @@ -282,7 +288,8 @@ impl HnswIndex { /// Serialize the index to bytes (M1 fix — preserves full topology). pub fn serialize(&self) -> Result> { - let points: Vec = self.points + let points: Vec = self + .points .iter() .map(|p| HnswPointSnapshot { id: p.id.clone(), @@ -302,8 +309,7 @@ impl HnswIndex { points, }; - serde_json::to_vec(&snapshot) - .map_err(|e| Error::internal(format!("HNSW serialize: {e}"))) + serde_json::to_vec(&snapshot).map_err(|e| Error::internal(format!("HNSW serialize: {e}"))) } /// Deserialize an index from bytes (M1 fix — backward-compatible). @@ -320,7 +326,9 @@ impl HnswIndex { index.max_layer = snapshot.max_layer; index.entry_point = snapshot.entry_point; - let dimensions = snapshot.points.first() + let dimensions = snapshot + .points + .first() .map(|p| p.embedding.len()) .unwrap_or(0); index.dimensions = dimensions; @@ -474,15 +482,27 @@ impl HnswIndex { current } - fn search_layer(&self, start: usize, query: &[f32], ef: usize, _layer: usize) -> Vec { + fn search_layer( + &self, + start: usize, + query: &[f32], + ef: usize, + _layer: usize, + ) -> Vec { let mut visited = HashSet::new(); let start_dist = Self::cosine_distance(&self.points[start].embedding, query); let mut candidates = BinaryHeap::new(); // min-heap let mut result = BinaryHeap::::new(); // max-heap - candidates.push(Candidate { index: start, distance: start_dist }); - result.push(MaxCandidate { index: start, distance: start_dist }); + candidates.push(Candidate { + index: start, + distance: start_dist, + }); + result.push(MaxCandidate { + index: start, + distance: start_dist, + }); visited.insert(start); while let Some(current) = candidates.pop() { @@ -521,8 +541,14 @@ impl HnswIndex { }; if should_add { - candidates.push(Candidate { index: neighbor_idx, distance: dist }); - result.push(MaxCandidate { index: neighbor_idx, distance: dist }); + candidates.push(Candidate { + index: neighbor_idx, + distance: dist, + }); + result.push(MaxCandidate { + index: neighbor_idx, + distance: dist, + }); if result.len() > ef { result.pop(); // Remove worst @@ -534,9 +560,16 @@ impl HnswIndex { // Convert max-heap to sorted vec (best first) let mut results: Vec = result .into_iter() - .map(|mc| Candidate { index: mc.index, distance: mc.distance }) + .map(|mc| Candidate { + index: mc.index, + distance: mc.distance, + }) .collect(); - results.sort_by(|a, b| a.distance.partial_cmp(&b.distance).unwrap_or(Ordering::Equal)); + results.sort_by(|a, b| { + a.distance + .partial_cmp(&b.distance) + .unwrap_or(Ordering::Equal) + }); results } @@ -571,17 +604,20 @@ impl HnswIndex { // Add reverse links for &neighbor_idx in &selected { if layer < self.points[neighbor_idx].neighbors.len() { - let already_linked = self.points[neighbor_idx].neighbors[layer].contains(&new_idx); + let already_linked = + self.points[neighbor_idx].neighbors[layer].contains(&new_idx); if !already_linked { self.points[neighbor_idx].neighbors[layer].push(new_idx); // Prune if too many neighbors if self.points[neighbor_idx].neighbors[layer].len() > max_neighbors { // Compute distances for sorting, then sort & truncate let emb = self.points[neighbor_idx].embedding.clone(); - let mut scored: Vec<(usize, f32)> = self.points[neighbor_idx] - .neighbors[layer] + let mut scored: Vec<(usize, f32)> = self.points[neighbor_idx].neighbors + [layer] .iter() - .map(|&n| (n, Self::cosine_distance(&self.points[n].embedding, &emb))) + .map(|&n| { + (n, Self::cosine_distance(&self.points[n].embedding, &emb)) + }) .collect(); scored.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(Ordering::Equal)); scored.truncate(max_neighbors); @@ -640,7 +676,7 @@ struct HnswSnapshotLegacy { fn rand_f64() -> f64 { use std::cell::Cell; thread_local! { - static SEED: Cell = Cell::new(0x12345678_9abcdef0); + static SEED: Cell = const { Cell::new(0x12345678_9abcdef0) }; } SEED.with(|s| { let mut x = s.get(); @@ -851,7 +887,11 @@ mod tests { #[test] fn test_entry_point_updates_on_higher_level() { // Use a config with small m to increase chance of higher layers - let config = HnswConfig { m: 2, ef_construction: 10, ef_search: 10 }; + let config = HnswConfig { + m: 2, + ef_construction: 10, + ef_search: 10, + }; let mut index = HnswIndex::new(config); // Insert many points; at least one should get a level > 0 @@ -882,7 +922,11 @@ mod tests { #[test] fn test_deletion_prunes_neighbor_lists() { - let config = HnswConfig { m: 4, ef_construction: 20, ef_search: 10 }; + let config = HnswConfig { + m: 4, + ef_construction: 20, + ef_search: 10, + }; let mut index = HnswIndex::new(config); // Insert several closely-related points so they appear in each other's diff --git a/crates/ineru/src/lib.rs b/crates/ineru/src/lib.rs index 927c822c..c5983d5f 100644 --- a/crates/ineru/src/lib.rs +++ b/crates/ineru/src/lib.rs @@ -65,6 +65,7 @@ pub mod config; pub mod consolidation; +mod embedder; pub mod error; pub mod hnsw; pub mod ltm; @@ -73,6 +74,9 @@ pub mod types; pub use config::{ConsolidationConfig, LtmConfig, MemoryConfig, StmConfig}; pub use consolidation::Consolidator; +#[cfg(feature = "neural-embeddings")] +pub use embedder::NeuralEmbedder; +pub use embedder::{Embedder, HashEmbedder}; pub use error::{Error, Result}; pub use ltm::{KnowledgeGraph, LongTermMemory}; pub use stm::ShortTermMemory; @@ -102,7 +106,7 @@ impl IneruMemory { /// # Arguments /// /// * `config` - The `MemoryConfig` that defines the behavior and capacity - /// of the STM, LTM, and consolidation process. + /// of the STM, LTM, and consolidation process. pub fn new(config: MemoryConfig) -> Self { Self { stm: ShortTermMemory::new(config.stm.clone()), @@ -148,7 +152,7 @@ impl IneruMemory { /// /// * `entry` - The `MemoryEntry` to store. /// * `importance` - A float score determining the entry's importance. Higher values - /// make it more likely to be consolidated into LTM. + /// make it more likely to be consolidated into LTM. /// /// # Returns /// @@ -364,7 +368,8 @@ impl IneruMemory { config: self.config.clone(), }; - serde_json::to_vec(&snapshot).map_err(|e| Error::internal(format!("snapshot export: {}", e))) + serde_json::to_vec(&snapshot) + .map_err(|e| Error::internal(format!("snapshot export: {}", e))) } /// Imports a memory state from a JSON byte slice. @@ -401,8 +406,8 @@ impl IneruMemory { /// Loads a memory state from a file. pub fn load_from_file(path: &std::path::Path) -> Result { - let data = std::fs::read(path) - .map_err(|e| Error::internal(format!("snapshot read: {}", e)))?; + let data = + std::fs::read(path).map_err(|e| Error::internal(format!("snapshot read: {}", e)))?; Self::import_snapshot(&data) } } @@ -530,9 +535,8 @@ mod tests { memory.remember_important(entry, 0.9).unwrap(); } - let consolidated = memory.consolidate().unwrap(); - // Consolidation may or may not move entries depending on thresholds - assert!(consolidated >= 0); + // Consolidation may or may not move entries depending on thresholds; just ensure it runs. + let _consolidated = memory.consolidate().unwrap(); } #[test] @@ -599,9 +603,8 @@ mod tests { memory.remember(entry).unwrap(); } - let pruned = memory.prune_stm().unwrap(); - // Should have pruned some entries - assert!(pruned >= 0); + // Should have pruned some entries (result is usize, always valid). + let _pruned = memory.prune_stm().unwrap(); } #[test] diff --git a/crates/ineru/src/ltm.rs b/crates/ineru/src/ltm.rs index 158f75e7..167c0227 100644 --- a/crates/ineru/src/ltm.rs +++ b/crates/ineru/src/ltm.rs @@ -371,14 +371,14 @@ impl LongTermMemory { results .into_iter() .filter(|(_, sim)| *sim >= min_similarity) - .filter_map(|(id, sim)| { - self.memories.get(&id).map(|entry| (entry, sim)) - }) + .filter_map(|(id, sim)| self.memories.get(&id).map(|entry| (entry, sim))) .collect() } else { // Fallback to brute-force over memory entries with embeddings let query_emb = Embedding::new(query.to_vec()); - let mut scored: Vec<_> = self.memories.values() + let mut scored: Vec<_> = self + .memories + .values() .filter_map(|entry| { entry.embedding.as_ref().map(|emb| { let sim = query_emb.cosine_similarity(emb); diff --git a/crates/ineru/src/types.rs b/crates/ineru/src/types.rs index db160588..869249a5 100644 --- a/crates/ineru/src/types.rs +++ b/crates/ineru/src/types.rs @@ -341,6 +341,16 @@ impl MemoryQuery { self.min_importance = Some(importance); self } + + /// Attaches (or replaces) the embedding vector used for similarity search. + /// + /// Callers that own an [`crate::Embedder`] use this to inject a vector computed + /// by a real model, overriding the default lexical-hash embedding that + /// [`MemoryQuery::text`] currently attaches. + pub fn with_embedding(mut self, embedding: Embedding) -> Self { + self.embedding = Some(embedding); + self + } } /// A single result returned from a memory query. @@ -580,4 +590,15 @@ mod tests { assert_eq!(entity.name, "temp_001"); assert!(entity.properties.contains_key("location")); } + + #[test] + fn with_embedding_overrides_query_vector() { + let injected = Embedding::new(vec![0.25; 384]); + let q = MemoryQuery::text("perro").with_embedding(injected.clone()); + let emb = q.embedding.expect("embedding present"); + assert_eq!(emb.0.len(), 384); + assert_eq!(emb.0, injected.0); + // text is still retained + assert_eq!(q.text.as_deref(), Some("perro")); + } } diff --git a/crates/kaneru/Cargo.toml b/crates/kaneru/Cargo.toml index f5f6175c..b917190b 100644 --- a/crates/kaneru/Cargo.toml +++ b/crates/kaneru/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "kaneru" -version = "0.6.3" +version = "0.7.0" description = "Kaneru: Unified Multi-Agent Execution System for AIngle AI agents" license = "Apache-2.0 OR LicenseRef-Commercial" repository = "https://github.com/ApiliumCode/aingle" @@ -31,7 +31,7 @@ serde_json = "1.0" log = "0.4" # AI Memory integration -ineru = { version = "0.6", path = "../ineru", optional = true } +ineru = { version = "0.7", path = "../ineru", optional = true } # Random for exploration (updated from 0.7) rand = { version = "0.9", default-features = false, features = ["std", "thread_rng"] } diff --git a/crates/kaneru/src/coordination.rs b/crates/kaneru/src/coordination.rs index 1056131e..2ec8f594 100644 --- a/crates/kaneru/src/coordination.rs +++ b/crates/kaneru/src/coordination.rs @@ -419,7 +419,10 @@ impl AgentCoordinator { } /// Unregisters an agent from the coordinator. - pub fn unregister_agent(&mut self, agent_id: &AgentId) -> Result { + pub fn unregister_agent( + &mut self, + agent_id: &AgentId, + ) -> Result { self.agents .remove(agent_id) .map(|handle| handle.agent) diff --git a/crates/kaneru/src/kaneru_agent.rs b/crates/kaneru/src/kaneru_agent.rs index 6911c256..c948d3fb 100644 --- a/crates/kaneru/src/kaneru_agent.rs +++ b/crates/kaneru/src/kaneru_agent.rs @@ -330,7 +330,9 @@ impl KaneruAgent { let prev_obs = match self.observation_history.back() { Some(obs) => obs, None => { - log::warn!("learn() called with empty observation history — skipping predictive update"); + log::warn!( + "learn() called with empty observation history — skipping predictive update" + ); // Still update goal progress below self.current_state = Some(new_state); self.observation_history.push_back(outcome.new_observation); diff --git a/crates/kaneru/src/memory.rs b/crates/kaneru/src/memory.rs index a3433bf6..81f3ad82 100644 --- a/crates/kaneru/src/memory.rs +++ b/crates/kaneru/src/memory.rs @@ -11,7 +11,7 @@ use crate::agent::{Agent, AgentId, AgentState, SimpleAgent}; use crate::config::AgentConfig; use crate::error::Result; use crate::observation::Observation; -use ineru::{MemoryConfig, MemoryEntry, MemoryQuery, IneruMemory}; +use ineru::{IneruMemory, MemoryConfig, MemoryEntry, MemoryQuery}; /// An agent wrapper that adds memory capabilities using `IneruMemory`. /// diff --git a/crates/kaneru/src/persistence.rs b/crates/kaneru/src/persistence.rs index c20b51c8..73c54fdf 100644 --- a/crates/kaneru/src/persistence.rs +++ b/crates/kaneru/src/persistence.rs @@ -223,7 +223,8 @@ impl AgentPersistence for KaneruAgent { let mut bytes = Vec::new(); file.read_to_end(&mut bytes)?; - let state: crate::kaneru_agent::SerializedState = deserialize_with_options(&bytes, options)?; + let state: crate::kaneru_agent::SerializedState = + deserialize_with_options(&bytes, options)?; let mut agent = KaneruAgent::new(state.config.clone()); agent.load_state(state); diff --git a/crates/kaneru/tests/integration_test.rs b/crates/kaneru/tests/integration_test.rs index 2b60ec2d..3641882a 100644 --- a/crates/kaneru/tests/integration_test.rs +++ b/crates/kaneru/tests/integration_test.rs @@ -243,7 +243,8 @@ fn test_persistence_formats() { compress: false, }; let json_bytes = agent.to_bytes_with_options(&json_options).unwrap(); - let loaded_from_json = KaneruAgent::from_bytes_with_options(&json_bytes, &json_options).unwrap(); + let loaded_from_json = + KaneruAgent::from_bytes_with_options(&json_bytes, &json_options).unwrap(); assert_eq!( loaded_from_json.get_statistics().total_steps, agent.get_statistics().total_steps diff --git a/crates/kaneru/tests/integration_tests.rs b/crates/kaneru/tests/integration_tests.rs index f8228570..91545f64 100644 --- a/crates/kaneru/tests/integration_tests.rs +++ b/crates/kaneru/tests/integration_tests.rs @@ -7,6 +7,7 @@ #![cfg(feature = "memory")] +use ineru::MemoryConfig; use kaneru::{ action::{Action, ActionType}, agent::Agent, @@ -16,7 +17,6 @@ use kaneru::{ observation::Observation, policy::{Condition, Rule}, }; -use ineru::MemoryConfig; /// Test: Create a memory agent and store observations #[test]