Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions config/client_text.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
huri_url: ws://localhost:8000/session

topic_list: [question, rag_response]
topic_list: [question, token]

senders:
text:
Expand All @@ -12,4 +12,7 @@ modules:
args:
language: en
tone: formal
logging: INFO
max_history_turns: 6
temperature: 0.7
response_format: short
persona: ""
24 changes: 17 additions & 7 deletions config/huri.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,15 @@ applications:
import_path: src.app:app
runtime_env:
env_vars:
OLLAMA_BASE_URL: "http://localhost:11434"
OLLAMA_API_KEY: "ollama"

RAY_COLOR_PREFIX: "1"

# Make the CosyVoice repo (and its Matcha-TTS submodule) importable
# inside every replica, regardless of cwd.
PYTHONPATH: "/home/fifster/Tek/Eip/HuRI/assets/cosyvoice:/home/fifster/Tek/Eip/HuRI/assets/cosyvoice/third_party/Matcha-TTS"

# --- Gesture sliding-window defaults (run in the HuRI CPU actor) ---
HURI_GESTURE_CONTEXT_SEC: "2.0"
HURI_GESTURE_MIN_CHUNK_SEC: "0.5"
Expand All @@ -56,19 +63,19 @@ applications:
# it as an instruction and intermittently speaks it (prompt leakage).
HURI_VOICE_TRANSCRIPT: "You are a helpful assistant.<|endofprompt|>Instinct creates its own oppressors and bids us rise up against them."
# From .Values.models.cosytts.env (mountPath/modelId) — edit for local layout.
HURI_MODEL_PATH: /models/cosytts/FunAudioLLM/Fun-CosyVoice3-0.5B-2512
HURI_MODEL_PATH: assets/cosyvoice_model
# Path to the CosyVoice repo root containing third_party/Matcha-TTS.
HURI_COSY_DIR: /app/cosyvoice
HURI_COSY_DIR: assets/cosyvoice
# From .Values.voiceAssets.env — the reference voice sample.
HURI_VOICE_SAMPLE_PATH: /assets/voice.wav
HURI_VOICE_SAMPLE_PATH: assets/voice.wav

# --- STT (faster-whisper) ---
# From .Values.models.whisper.env (mountPath/repoId) — edit for local layout.
HURI_STT_MODEL_PATH: /models/whisper/Systran/faster-whisper-base
# HURI_STT_MODEL_PATH: /models/whisper/Systran/faster-whisper-base

# --- Gesture (EMAGE) ---
# From .Values.models.emage.env (mountPath/repoId) — edit for local layout.
HURI_EMAGE_REPO: /models/emage/H-Liu1997/emage_audio
HURI_EMAGE_REPO: assets/emage_audio

# --- GPU-vendor runtime env (Helm puts these on the worker containers) ---
NVIDIA_VISIBLE_DEVICES: "all"
Expand All @@ -88,7 +95,7 @@ applications:
num_replicas: 1
ray_actor_options:
num_cpus: 1
num_gpus: 0.5
num_gpus: 0

# RAG: embeddings (API) + LLM client. No GPU needed.
- name: RAGHandle
Expand All @@ -98,7 +105,10 @@ applications:
num_gpus: 0
user_config:
embedding_model: "bge-large-en-v1.5-gguf-Q4_K_M"
llm_model: "Qwen3.5-4B-GGUF"
llm_model: "mistral:7b"
llm_base_url: "http://localhost:11434/v1" # Ollama's OpenAI-compatible endpoint
llm_api_key: "ollama"
memory_maintenance_check_hours: 0.5

# GPU split (manual override knob): num_gpus are Ray *scheduling*
# fractions that let replicas pack onto the same device and bias the
Expand Down
16 changes: 14 additions & 2 deletions src/core/huri.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,5 +125,17 @@ async def receive_loop(session: Session, ws: WebSocket):
finally:
print(f"Client {user_id} disconnected")

await receive_loop(self.clients[session_id], ws)
del self.clients[session_id]
try:
await receive_loop(self.clients[session_id], ws)
finally:
# Persist per-session state (e.g. conversation memory) on disconnect.
for module in modules:
fin = getattr(module, "finalize", None)
if fin is None:
continue
try:
await fin()
except Exception:
import traceback
print(f"[HuRI] finalize failed for {type(module).__name__}:\n{traceback.format_exc()}")
self.clients.pop(session_id, None)
90 changes: 90 additions & 0 deletions src/modules/rag/memory_inspect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""Inspect conversation memories: current strength, decay projection, fate.

Usage:
python -m src.modules.rag.memory.memory_inspect
python -m src.modules.rag.memory.memory_inspect --user-id <id>
"""
import argparse
from datetime import datetime, timedelta

try:
from .qdrant_utils import make_qdrant_client
except ImportError:
from qdrant_utils import make_qdrant_client

HALF_LIFE_DAYS = 5.0
DELETE_BELOW, CONSOLIDATE_BELOW = 0.05, 0.30


def strength(payload: dict, at: datetime | None = None) -> float:
"""Query-independent strength: recency * importance (matches maintenance)."""
at = at or datetime.now()
imp = payload.get("importance", 3)
half = max(HALF_LIFE_DAYS * (imp / 5.0), 0.5)
try:
last = datetime.fromisoformat(payload.get("last_accessed") or payload["created_at"])
age = (at - last).total_seconds() / 86400.0
except Exception:
age = 0.0
return (0.5 ** (max(age, 0) / half)) * (imp / 10.0)


def fate(s: float) -> str:
if s < DELETE_BELOW:
return "DELETE"
if s < CONSOLIDATE_BELOW:
return "CONSOLIDATE"
return "KEEP"


def main():
ap = argparse.ArgumentParser()
ap.add_argument("--qdrant-url", default="http://localhost:6333")
ap.add_argument("--collection", default="conversations")
ap.add_argument("--user-id", default=None)
args = ap.parse_args()

qdrant = make_qdrant_client(args.qdrant_url)
points, offset = [], None
while True:
batch, offset = qdrant.scroll(collection_name=args.collection, limit=200,
offset=offset, with_payload=True, with_vectors=False)
points.extend(batch)
if offset is None:
break

now = datetime.now()
rows = []
for p in points:
pl = p.payload
if pl.get("type") == "maintenance_marker":
print(f"[marker] last maintenance run: {pl.get('last_run')}\n")
continue
if args.user_id and pl.get("_user_id") != args.user_id:
continue
s_now = strength(pl, now)
rows.append({
"text": pl.get("text", "")[:60].replace("\n", " "),
"type": pl.get("type", "?"),
"imp": pl.get("importance", "?"),
"acc": pl.get("access_count", 0),
"age_d": round((now - datetime.fromisoformat(
pl.get("last_accessed") or pl["created_at"])).total_seconds() / 86400, 1),
"now": round(s_now, 3),
"+5d": round(strength(pl, now + timedelta(days=5)), 3),
"+15d": round(strength(pl, now + timedelta(days=15)), 3),
"fate": fate(s_now),
})

rows.sort(key=lambda r: r["now"], reverse=True)
hdr = f"{'strength':>8} {'+5d':>6} {'+15d':>6} {'imp':>3} {'acc':>3} {'age':>5} {'fate':<12} text"
print(hdr)
print("-" * len(hdr))
for r in rows:
print(f"{r['now']:>8} {r['+5d']:>6} {r['+15d']:>6} {r['imp']:>3} "
f"{r['acc']:>3} {r['age_d']:>5} {r['fate']:<12} {r['text']}")
print(f"\n{len(rows)} memories")


if __name__ == "__main__":
main()
119 changes: 119 additions & 0 deletions src/modules/rag/memory_maintenance.py

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

du coup on doit lancer le cron en parralele ? je pense qu'il faudrait voir avec mister pommier pour pouvoir le lancer avec huri

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Non on ne le lance pas a coté il travaille tout seul dans son coin

Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
"""Periodic memory maintenance: decay-based pruning + consolidation.

Run every N days (cron/systemd timer):
python -m src.modules.rag.memory_maintenance
Strong memories are kept, weak ones are merged into a consolidated memory,
dead ones are deleted. Decay itself is computed lazily at query time in
rag.py; this job only prunes and compresses.
"""
import argparse
import json
import uuid
from collections import defaultdict
from datetime import datetime

import httpx
from qdrant_client.models import Distance, PointIdsList, PointStruct, VectorParams

try:
from .qdrant_utils import make_qdrant_client
except ImportError:
from qdrant_utils import make_qdrant_client

DELETE_BELOW = 0.05
CONSOLIDATE_BELOW = 0.30
HALF_LIFE_DAYS = 5.0


def strength(payload: dict) -> float:
"""Query-independent strength: recency * importance (no relevance term)."""
importance = payload.get("importance", 3)
half_life = max(HALF_LIFE_DAYS * (importance / 5.0), 0.5)
try:
last = datetime.fromisoformat(payload.get("last_accessed") or payload["created_at"])
age_days = (datetime.now() - last).total_seconds() / 86400.0
except Exception:
age_days = 0.0
recency = 0.5 ** (age_days / half_life)
return recency * (importance / 10.0)


def embed(client: httpx.Client, url: str, model: str, text: str) -> list[float]:
r = client.post(f"{url}/v1/embeddings", json={"model": model, "input": text})
r.raise_for_status()
return r.json()["data"][0]["embedding"]


def llm(client: httpx.Client, url: str, model: str, prompt: str) -> str:
r = client.post(f"{url}/api/chat", json={
"model": model, "stream": False,
"messages": [{"role": "user", "content": prompt}],
"options": {"num_predict": 300},
})
r.raise_for_status()
return r.json()["message"]["content"]


def main():
ap = argparse.ArgumentParser()
ap.add_argument("--qdrant-url", default="http://localhost:6333")
ap.add_argument("--collection", default="conversations")
ap.add_argument("--ollama-url", default="http://localhost:11434")
ap.add_argument("--embedding-model", default="bge-large-en-v1.5-gguf-Q4_K_M")
ap.add_argument("--llm-model", default="mistral:7b")
ap.add_argument("--dry-run", action="store_true")
args = ap.parse_args()

qdrant = make_qdrant_client(args.qdrant_url)
http = httpx.Client(timeout=180.0)

points, offset = [], None
while True:
batch, offset = qdrant.scroll(collection_name=args.collection, limit=200,
offset=offset, with_payload=True, with_vectors=False)
points.extend(batch)
if offset is None:
break
print(f"{len(points)} memories in '{args.collection}'")

to_delete, weak_by_user = [], defaultdict(list)
for p in points:
s = strength(p.payload)
if s < DELETE_BELOW:
to_delete.append(p)
elif s < CONSOLIDATE_BELOW:
weak_by_user[p.payload.get("_user_id", "anonymous")].append(p)
print(f"delete: {len(to_delete)}, consolidate candidates: {sum(map(len, weak_by_user.values()))}")

if args.dry_run:
return

for user, weak in weak_by_user.items():
if len(weak) < 3:
continue # not worth merging yet; keep decaying
texts = [p.payload["text"] for p in weak]
merged = llm(http, args.ollama_url, args.llm_model,
"These are old memories about conversations with the same person. "
"Merge them into a single 3-5 sentence memory keeping only durable "
"facts, preferences and recurring themes. Drop one-off small talk.\n\n"
+ "\n---\n".join(texts)).strip()
vec = embed(http, args.ollama_url, args.embedding_model, merged)
now = datetime.now().isoformat()
imp = min(max(p.payload.get("importance", 3) for p in weak) + 1, 10)
qdrant.upsert(collection_name=args.collection, points=[PointStruct(
id=str(uuid.uuid4()), vector=vec, payload={
"text": merged, "_user_id": user, "type": "conversation_consolidated",
"created_at": now, "last_accessed": now, "access_count": 0,
"importance": imp,
})])
to_delete.extend(weak)
print(f"[{user}] consolidated {len(weak)} → 1 (importance={imp})")

if to_delete:
qdrant.delete(collection_name=args.collection,
points_selector=PointIdsList(points=[p.id for p in to_delete]))
print(f"deleted {len(to_delete)} points")


if __name__ == "__main__":
main()
Loading