Sentience-Robotics · Popochounet · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026
diff --git a/config/client_aux.yaml b/config/client_aux.yaml
@@ -1,13 +1,19 @@
 huri_url: ws://localhost:8000/session
 
-topic_list: [question]
+topic_list: [transcript, question, token, motion]
 
 senders:
   audio:
     name: audio
     args:
       sample_rate: 16000
       frame_duration: 0.030
+  text:
+    name: text
+    topic: question
+    args:
+      sample_rate: 16000
+      frame_duration: 0.030
 
 modules:
   mic:
@@ -26,3 +32,27 @@ modules:
   tag:
     name: tag
     logging: INFO
+  emo:
+    name: emo
+    args:
+      block_duration: ${senders.audio.args.frame_duration}
+  eag:
+    name: eag
+  qag:
+    name: qag
+  rag:
+    name: rag
+    args:
+      language: en
+      tone: formal
+      response_format: paragraph
+      max_length: 1024
+    logging: INFO
+  tts:
+    name: tts
+    args:
+      min_clause_chars: 20
+    logging: INFO
+  gesture:
+    name: gesture
+    logging: INFO
diff --git a/config/client_full.yaml b/config/client_full.yaml
@@ -8,6 +8,20 @@ senders:
     args:
       sample_rate: 16000
       frame_duration: 0.030
+  text:
+    name: text
+    topic: question
+    args:
+      sample_rate: 16000
+      frame_duration: 0.030
+
+hooks:
+  audio:
+    name: audio
+    topics: [audio]
+    args:
+      incoming_sample_rate: ${senders.audio.args.sample_rate}
+      sample_rate: 44100
 
 modules:
   mic:
@@ -26,6 +40,14 @@ modules:
   tag:
     name: tag
     logging: INFO
+  emo:
+    name: emo
+    args:
+      block_duration: ${senders.audio.args.frame_duration}
+  eag:
+    name: eag
+  qag:
+    name: qag
   rag:
     name: rag
     args:

diff --git a/src/core/client.py b/src/core/client.py
@@ -93,8 +93,8 @@ async def _receive_loop(self, ws: websockets.ClientConnection):
                         print(f"<< bytes ({len(msg)}B, no topic)")
                         continue
                     (topic_len,) = struct.unpack(">H", msg[:2])
-                    topic = msg[2:2 + topic_len].decode()
-                    payload = msg[2 + topic_len:]
+                    topic = msg[2 : 2 + topic_len].decode()
+                    payload = msg[2 + topic_len :]
 
                     if topic == "audio" and len(payload) >= 13:
                         sample_rate, end_flag, pts = struct.unpack(">IBd", payload[:13])

diff --git a/src/core/events.py b/src/core/events.py
@@ -52,7 +52,8 @@ async def publish(self, event_topic, data):
         if event_topic not in ("audio_in",):  # skip mic-frame spam
             logger.info(
                 "[GRAPH] publish topic=%r subscribers=%s",
-                event_topic, [type(m).__name__ for m in subs],
+                event_topic,
+                [type(m).__name__ for m in subs],
             )
         for module in subs:
             asyncio.create_task(self._run(module, data))
@@ -68,26 +69,36 @@ async def _run(self, module: Module, data):
                             continue
                         logger.info(
                             "[GRAPH] %s -> %r: %s",
-                            type(module).__name__, module.output_type, _summarize(item),
+                            type(module).__name__,
+                            module.output_type,
+                            _summarize(item),
                         )
                         await self.publish(module.output_type, item)
                 except Exception:
-                    logger.exception("[GRAPH] async generator failed in %s", type(module).__name__)
+                    logger.exception(
+                        "[GRAPH] async generator failed in %s", type(module).__name__
+                    )
 
             else:
                 try:
                     value = await result
                     if value is not None:
                         logger.info(
                             "[GRAPH] %s -> %r: %s",
-                            type(module).__name__, module.output_type, _summarize(value),
+                            type(module).__name__,
+                            module.output_type,
+                            _summarize(value),
                         )
                         await self.publish(module.output_type, value)
                 except Exception:
-                    logger.exception("[GRAPH] coroutine failed in %s", type(module).__name__)
+                    logger.exception(
+                        "[GRAPH] coroutine failed in %s", type(module).__name__
+                    )
 
         except Exception:
-            logger.exception("[GRAPH] process() call failed in %s", type(module).__name__)
+            logger.exception(
+                "[GRAPH] process() call failed in %s", type(module).__name__
+            )
 
 
 def _summarize(item) -> str:

diff --git a/src/modules/__init__.py b/src/modules/__init__.py
diff --git a/src/modules/emotion/__init__.py b/src/modules/emotion/__init__.py
diff --git a/src/modules/emotion/emotion_aggregator.py b/src/modules/emotion/emotion_aggregator.py
@@ -0,0 +1,73 @@
+from collections import defaultdict
+from typing import Dict, Optional
+
+from src.core.module import Module
+from src.modules.rag.events import PartialQuestion
+
+from .events import Emotion
+
+
+class EAG(Module):
+    """EAG Module
+
+    Aggregate all emotions and send when voice end.
+
+    input: emotion,
+    output: partial_question
+
+    :ema_alpha: if not None, the aggragation will use ema computation instead
+    of average. Recents emotion will have stronger impact on the final score.
+    Lower alpha will make impact lower, and higher alpha will make it higher. \
+    Default alpha would be ~0.3.
+    """
+
+    input_type = "emotion"
+    output_type = "partial_question"
+
+    def __init__(self, ema_alpha: Optional[float] = None):
+        super().__init__()
+
+        self.scores: Dict[str, float] = defaultdict(float)
+        self.count: int = 0
+
+        self.ema_alpha = ema_alpha
+
+    def _finalize(self) -> Emotion:
+        avg_scores = (
+            {label: score / self.count for label, score in self.scores.items()}
+            if self.ema_alpha is None
+            else self.scores
+        )
+
+        best_label = max(avg_scores, key=lambda label: avg_scores[label])
+
+        result = Emotion(
+            label=best_label,
+            confidence=avg_scores[best_label],
+            scores=avg_scores,
+            end=True,
+        )
+
+        self.scores.clear()
+        self.count = 0
+
+        return result
+
+    async def process(self, emotion: Emotion) -> Optional[PartialQuestion]:
+        if self.ema_alpha is not None:
+            for label, score in emotion.scores.items():
+                self.scores[label] = (
+                    self.ema_alpha * score + (1 - self.ema_alpha) * self.scores[label]
+                )
+        else:
+            for label, score in emotion.scores.items():
+                self.scores[label] += score
+
+        self.count += 1
+
+        if emotion.end:
+            emotion = self._finalize()
+
+            return PartialQuestion(transcript=None, emotion=emotion)
+
+        return None
diff --git a/src/modules/emotion/events.py b/src/modules/emotion/events.py
@@ -0,0 +1,12 @@
+from dataclasses import dataclass
+from typing import Dict
+
+from src.core.events import EventData
+
+
+@dataclass
+class Emotion(EventData):
+    label: str
+    confidence: float
+    scores: Dict[str, float]
+    end: bool
diff --git a/src/modules/emotion/prosody_analysis.py b/src/modules/emotion/prosody_analysis.py
@@ -0,0 +1,109 @@
+import asyncio
+from typing import List, Optional
+
+import numpy as np
+import torch
+from transformers import AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
+
+from src.core.module import Module
+from src.modules.speech_to_text.events import Voice
+
+from .events import Emotion
+
+
+class EMO(Module):
+    """EMO Module
+
+    Prosody Analysis of user voice speech.
+
+    input: voice,
+    output: emotion
+
+    :model_name: name of the Emotion Analysis model.
+    :sample_rate: size of received voice audio. Usually 8000, 16000 or 48000.
+    :block_duration: size of received voice audio (in s).
+    :analysis_window: duration of audio per analysis (in s).
+    """
+
+    input_type = "voice"
+    output_type = "emotion"
+
+    def __init__(
+        self,
+        model_name: str = "superb/hubert-large-superb-er",
+        sample_rate: int = 16000,
+        block_duration: float = 0.020,  # s
+        analysis_window: float = 4.0,  # s
+    ):
+        super().__init__()
+
+        self.model = AutoModelForAudioClassification.from_pretrained(model_name)
+        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
+
+        self.sample_rate = sample_rate
+        self.window_size = int(analysis_window / block_duration)
+
+        self.buffer: List[np.ndarray] = []
+
+        self.silence: bool = True
+
+        self.running = False
+        self.lock: asyncio.Lock = asyncio.Lock()
+
+    def _predict_emotion(self, audio_np: np.ndarray):
+        inputs = self.feature_extractor(
+            audio_np, sampling_rate=self.sample_rate, return_tensors="pt", padding=True
+        )
+
+        with torch.no_grad():
+            logits = self.model(**inputs).logits
+            probs = torch.softmax(logits, dim=-1)[0]
+
+        predicted_id = int(torch.argmax(probs).item())
+
+        labels = self.model.config.id2label
+
+        return {
+            "label": labels[predicted_id],
+            "confidence": float(probs[predicted_id]),
+            "scores": {labels[i]: float(probs[i]) for i in range(len(labels))},
+        }
+
+    async def process(self, voice: Voice) -> Optional[Emotion]:
+        if voice.data is None:
+            self.silence = True
+        else:
+            self.silence = False
+            async with self.lock:
+                self.buffer.append(voice.data)
+
+        async with self.lock:
+            if self.running:
+                return None
+            self.running = True
+
+        async with self.lock:
+            buffer_size = len(self.buffer)
+            if buffer_size == 0 or (
+                self.silence is False and buffer_size < self.window_size
+            ):
+                self.running = False
+                return None
+            processing_chunks = self.buffer[: self.window_size]
+
+        processing_audio = np.concatenate(processing_chunks, axis=0)
+
+        emotion_result = await asyncio.to_thread(
+            self._predict_emotion, audio_np=processing_audio
+        )
+
+        async with self.lock:
+            self.buffer = self.buffer[self.window_size :]
+            self.running = False
+
+        return Emotion(
+            emotion_result["label"],
+            emotion_result["confidence"],
+            emotion_result["scores"],
+            self.silence,
+        )
diff --git a/src/modules/events.py b/src/modules/events.py
@@ -1,9 +1,11 @@
 from typing import Dict, Type
 
 from src.core.events import EventData
-from src.modules.speech_to_text.events import Sentence, Transcript, Voice
-from src.modules.text_to_speech.events import Audio, Token
+from src.modules.emotion.events import Emotion
 from src.modules.gesture.events import Motion
+from src.modules.rag.events import PartialQuestion, RAGQuestion
+from src.modules.speech_to_text.events import Transcript, Voice
+from src.modules.text_to_speech.events import Token
 
 
 def get_events() -> Dict[str, Type[EventData | bytes]]:
@@ -12,9 +14,11 @@ def get_events() -> Dict[str, Type[EventData | bytes]]:
         "audio": bytes,
         "voice": Voice,
         "transcript": Transcript,
-        "question": Sentence,
+        "emotion": Emotion,
+        "partial_question": PartialQuestion,
+        "question": RAGQuestion,
         "token": Token,
-        "motion": Motion
+        "motion": Motion,
     }
 
     return events