Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 31 additions & 1 deletion config/client_aux.yaml
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
huri_url: ws://localhost:8000/session

topic_list: [question]
topic_list: [transcript, question, token, motion]

senders:
audio:
name: audio
args:
sample_rate: 16000
frame_duration: 0.030
text:
name: text
topic: question
args:
sample_rate: 16000
frame_duration: 0.030

modules:
mic:
Expand All @@ -26,3 +32,27 @@ modules:
tag:
name: tag
logging: INFO
emo:
name: emo
args:
block_duration: ${senders.audio.args.frame_duration}
eag:
name: eag
qag:
name: qag
rag:
name: rag
args:
language: en
tone: formal
response_format: paragraph
max_length: 1024
logging: INFO
tts:
name: tts
args:
min_clause_chars: 20
logging: INFO
gesture:
name: gesture
logging: INFO
22 changes: 22 additions & 0 deletions config/client_full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,20 @@ senders:
args:
sample_rate: 16000
frame_duration: 0.030
text:
name: text
topic: question
args:
sample_rate: 16000
frame_duration: 0.030

hooks:
audio:
name: audio
topics: [audio]
args:
incoming_sample_rate: ${senders.audio.args.sample_rate}
sample_rate: 44100

modules:
mic:
Expand All @@ -26,6 +40,14 @@ modules:
tag:
name: tag
logging: INFO
emo:
name: emo
args:
block_duration: ${senders.audio.args.frame_duration}
eag:
name: eag
qag:
name: qag
rag:
name: rag
args:
Expand Down
4 changes: 2 additions & 2 deletions src/core/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,8 @@ async def _receive_loop(self, ws: websockets.ClientConnection):
print(f"<< bytes ({len(msg)}B, no topic)")
continue
(topic_len,) = struct.unpack(">H", msg[:2])
topic = msg[2:2 + topic_len].decode()
payload = msg[2 + topic_len:]
topic = msg[2 : 2 + topic_len].decode()
payload = msg[2 + topic_len :]

if topic == "audio" and len(payload) >= 13:
sample_rate, end_flag, pts = struct.unpack(">IBd", payload[:13])
Expand Down
23 changes: 17 additions & 6 deletions src/core/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ async def publish(self, event_topic, data):
if event_topic not in ("audio_in",): # skip mic-frame spam
logger.info(
"[GRAPH] publish topic=%r subscribers=%s",
event_topic, [type(m).__name__ for m in subs],
event_topic,
[type(m).__name__ for m in subs],
)
for module in subs:
asyncio.create_task(self._run(module, data))
Expand All @@ -68,26 +69,36 @@ async def _run(self, module: Module, data):
continue
logger.info(
"[GRAPH] %s -> %r: %s",
type(module).__name__, module.output_type, _summarize(item),
type(module).__name__,
module.output_type,
_summarize(item),
)
await self.publish(module.output_type, item)
except Exception:
logger.exception("[GRAPH] async generator failed in %s", type(module).__name__)
logger.exception(
"[GRAPH] async generator failed in %s", type(module).__name__
)

else:
try:
value = await result
if value is not None:
logger.info(
"[GRAPH] %s -> %r: %s",
type(module).__name__, module.output_type, _summarize(value),
type(module).__name__,
module.output_type,
_summarize(value),
)
await self.publish(module.output_type, value)
except Exception:
logger.exception("[GRAPH] coroutine failed in %s", type(module).__name__)
logger.exception(
"[GRAPH] coroutine failed in %s", type(module).__name__
)

except Exception:
logger.exception("[GRAPH] process() call failed in %s", type(module).__name__)
logger.exception(
"[GRAPH] process() call failed in %s", type(module).__name__
)


def _summarize(item) -> str:
Expand Down
Empty file added src/modules/__init__.py
Empty file.
Empty file added src/modules/emotion/__init__.py
Empty file.
73 changes: 73 additions & 0 deletions src/modules/emotion/emotion_aggregator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from collections import defaultdict
from typing import Dict, Optional

from src.core.module import Module
from src.modules.rag.events import PartialQuestion

from .events import Emotion


class EAG(Module):
"""EAG Module

Aggregate all emotions and send when voice end.

input: emotion,
output: partial_question

:ema_alpha: if not None, the aggragation will use ema computation instead
of average. Recents emotion will have stronger impact on the final score.
Lower alpha will make impact lower, and higher alpha will make it higher. \
Default alpha would be ~0.3.
"""

input_type = "emotion"
output_type = "partial_question"

def __init__(self, ema_alpha: Optional[float] = None):
super().__init__()

self.scores: Dict[str, float] = defaultdict(float)
self.count: int = 0

self.ema_alpha = ema_alpha

def _finalize(self) -> Emotion:
avg_scores = (
{label: score / self.count for label, score in self.scores.items()}
if self.ema_alpha is None
else self.scores
)

best_label = max(avg_scores, key=lambda label: avg_scores[label])

result = Emotion(
label=best_label,
confidence=avg_scores[best_label],
scores=avg_scores,
end=True,
)

self.scores.clear()
self.count = 0

return result

async def process(self, emotion: Emotion) -> Optional[PartialQuestion]:
if self.ema_alpha is not None:
for label, score in emotion.scores.items():
self.scores[label] = (
self.ema_alpha * score + (1 - self.ema_alpha) * self.scores[label]
)
else:
for label, score in emotion.scores.items():
self.scores[label] += score

self.count += 1

if emotion.end:
emotion = self._finalize()

return PartialQuestion(transcript=None, emotion=emotion)

return None
12 changes: 12 additions & 0 deletions src/modules/emotion/events.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from dataclasses import dataclass
from typing import Dict

from src.core.events import EventData


@dataclass
class Emotion(EventData):
label: str
confidence: float
scores: Dict[str, float]
end: bool
109 changes: 109 additions & 0 deletions src/modules/emotion/prosody_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import asyncio
from typing import List, Optional

import numpy as np
import torch
from transformers import AutoModelForAudioClassification, Wav2Vec2FeatureExtractor

from src.core.module import Module
from src.modules.speech_to_text.events import Voice

from .events import Emotion


class EMO(Module):
"""EMO Module

Prosody Analysis of user voice speech.

input: voice,
output: emotion

:model_name: name of the Emotion Analysis model.
:sample_rate: size of received voice audio. Usually 8000, 16000 or 48000.
:block_duration: size of received voice audio (in s).
:analysis_window: duration of audio per analysis (in s).
"""

input_type = "voice"
output_type = "emotion"

def __init__(
self,
model_name: str = "superb/hubert-large-superb-er",
sample_rate: int = 16000,
block_duration: float = 0.020, # s
analysis_window: float = 4.0, # s
):
super().__init__()

self.model = AutoModelForAudioClassification.from_pretrained(model_name)
self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)

self.sample_rate = sample_rate
self.window_size = int(analysis_window / block_duration)

self.buffer: List[np.ndarray] = []

self.silence: bool = True

self.running = False
self.lock: asyncio.Lock = asyncio.Lock()

def _predict_emotion(self, audio_np: np.ndarray):
inputs = self.feature_extractor(
audio_np, sampling_rate=self.sample_rate, return_tensors="pt", padding=True
)

with torch.no_grad():
logits = self.model(**inputs).logits
probs = torch.softmax(logits, dim=-1)[0]

predicted_id = int(torch.argmax(probs).item())

labels = self.model.config.id2label

return {
"label": labels[predicted_id],
"confidence": float(probs[predicted_id]),
"scores": {labels[i]: float(probs[i]) for i in range(len(labels))},
}

async def process(self, voice: Voice) -> Optional[Emotion]:
if voice.data is None:
self.silence = True
else:
self.silence = False
async with self.lock:
self.buffer.append(voice.data)

async with self.lock:
if self.running:
return None
self.running = True

async with self.lock:
buffer_size = len(self.buffer)
if buffer_size == 0 or (
self.silence is False and buffer_size < self.window_size
):
self.running = False
return None
processing_chunks = self.buffer[: self.window_size]

processing_audio = np.concatenate(processing_chunks, axis=0)

emotion_result = await asyncio.to_thread(
self._predict_emotion, audio_np=processing_audio
)

async with self.lock:
self.buffer = self.buffer[self.window_size :]
self.running = False

return Emotion(
emotion_result["label"],
emotion_result["confidence"],
emotion_result["scores"],
self.silence,
)
12 changes: 8 additions & 4 deletions src/modules/events.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from typing import Dict, Type

from src.core.events import EventData
from src.modules.speech_to_text.events import Sentence, Transcript, Voice
from src.modules.text_to_speech.events import Audio, Token
from src.modules.emotion.events import Emotion
from src.modules.gesture.events import Motion
from src.modules.rag.events import PartialQuestion, RAGQuestion
from src.modules.speech_to_text.events import Transcript, Voice
from src.modules.text_to_speech.events import Token


def get_events() -> Dict[str, Type[EventData | bytes]]:
Expand All @@ -12,9 +14,11 @@ def get_events() -> Dict[str, Type[EventData | bytes]]:
"audio": bytes,
"voice": Voice,
"transcript": Transcript,
"question": Sentence,
"emotion": Emotion,
"partial_question": PartialQuestion,
"question": RAGQuestion,
"token": Token,
"motion": Motion
"motion": Motion,
}

return events
Loading