From 2a3d9af810969ead8a4e598fcfc44596b2db0646 Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Wed, 27 May 2026 01:38:25 +0200
Subject: [PATCH 01/31] feat(gesture): all gesture ray module from EMAGE

---
 src/modules/gesture/__init__.py            |   1 +
 src/modules/gesture/emage/__init__.py      |  12 +
 src/modules/gesture/emage/configuration.py |  32 ++
 src/modules/gesture/emage/modeling.py      | 449 +++++++++++++++++++++
 src/modules/gesture/emage/processing.py    | 380 +++++++++++++++++
 src/modules/gesture/gesture.py             | 142 +++++++
 6 files changed, 1016 insertions(+)
 create mode 100644 src/modules/gesture/__init__.py
 create mode 100644 src/modules/gesture/emage/__init__.py
 create mode 100644 src/modules/gesture/emage/configuration.py
 create mode 100644 src/modules/gesture/emage/modeling.py
 create mode 100644 src/modules/gesture/emage/processing.py
 create mode 100644 src/modules/gesture/gesture.py

diff --git a/src/modules/gesture/__init__.py b/src/modules/gesture/__init__.py
new file mode 100644
index 0000000..8d8e6c2
--- /dev/null
+++ b/src/modules/gesture/__init__.py
@@ -0,0 +1 @@
+from .gesture import Gesture, Motion
diff --git a/src/modules/gesture/emage/__init__.py b/src/modules/gesture/emage/__init__.py
new file mode 100644
index 0000000..df5a071
--- /dev/null
+++ b/src/modules/gesture/emage/__init__.py
@@ -0,0 +1,12 @@
+from .configuration import EmageAudioConfig, EmageVAEConvConfig, EmageVQVAEConvConfig
+from .modeling import EmageAudioModel, EmageVAEConv, EmageVQModel, EmageVQVAEConv
+
+__all__ = [
+    "EmageAudioConfig",
+    "EmageAudioModel",
+    "EmageVAEConvConfig",
+    "EmageVAEConv",
+    "EmageVQVAEConvConfig",
+    "EmageVQVAEConv",
+    "EmageVQModel",
+]
diff --git a/src/modules/gesture/emage/configuration.py b/src/modules/gesture/emage/configuration.py
new file mode 100644
index 0000000..ad97076
--- /dev/null
+++ b/src/modules/gesture/emage/configuration.py
@@ -0,0 +1,32 @@
+from omegaconf import OmegaConf
+from transformers import PretrainedConfig
+
+
+class EmageAudioConfig(PretrainedConfig):
+    model_type = "emage_audio"
+
+    def __init__(self, config_obj=None, **kwargs):
+        if config_obj is not None:
+            cfg_dict = OmegaConf.to_container(config_obj, resolve=True)
+            kwargs.update(cfg_dict)
+        super().__init__(**kwargs)
+
+
+class EmageVQVAEConvConfig(PretrainedConfig):
+    model_type = "emage_vqvaeconv"
+
+    def __init__(self, config_obj=None, **kwargs):
+        if config_obj is not None:
+            cfg_dict = OmegaConf.to_container(config_obj, resolve=True)
+            kwargs.update(cfg_dict)
+        super().__init__(**kwargs)
+
+
+class EmageVAEConvConfig(PretrainedConfig):
+    model_type = "emage_vaeconv"
+
+    def __init__(self, config_obj=None, **kwargs):
+        if config_obj is not None:
+            cfg_dict = OmegaConf.to_container(config_obj, resolve=True)
+            kwargs.update(cfg_dict)
+        super().__init__(**kwargs)
diff --git a/src/modules/gesture/emage/modeling.py b/src/modules/gesture/emage/modeling.py
new file mode 100644
index 0000000..b4d71ab
--- /dev/null
+++ b/src/modules/gesture/emage/modeling.py
@@ -0,0 +1,449 @@
+import copy
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel
+
+from .configuration import EmageAudioConfig, EmageVAEConvConfig, EmageVQVAEConvConfig
+from .processing import (
+    MLP,
+    PeriodicPositionalEncoding,
+    Quantizer,
+    VQDecoderV5,
+    VQEncoderV5,
+    VQEncoderV6,
+    WavEncoder,
+    axis_angle_to_rotation_6d,
+    matrix_to_axis_angle,
+    matrix_to_rotation_6d,
+    recover_from_mask_ts,
+    rotation_6d_to_axis_angle,
+    rotation_6d_to_matrix,
+    velocity2position,
+    axis_angle_to_matrix,
+)
+
+
+class EmageVAEConv(PreTrainedModel):
+    config_class = EmageVAEConvConfig
+    base_model_prefix = "emage_vaeconv"
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.encoder = VQEncoderV5(config)
+        self.decoder = VQDecoderV5(config)
+
+    def forward(self, inputs):
+        pre_latent = self.encoder(inputs)
+        rec_pose = self.decoder(pre_latent)
+        return {"rec_pose": rec_pose}
+
+
+class EmageVQVAEConv(PreTrainedModel):
+    config_class = EmageVQVAEConvConfig
+    base_model_prefix = "emage_vqvaeconv"
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.encoder = VQEncoderV5(config)
+        self.quantizer = Quantizer(config.vae_codebook_size, config.vae_length, config.vae_quantizer_lambda)
+        self.decoder = VQDecoderV5(config)
+
+    def forward(self, inputs):
+        pre_latent = self.encoder(inputs)
+        embedding_loss, vq_latent, _, perplexity = self.quantizer(pre_latent)
+        rec_pose = self.decoder(vq_latent)
+        return {"poses_feat": vq_latent, "embedding_loss": embedding_loss, "perplexity": perplexity, "rec_pose": rec_pose}
+
+    def map2index(self, inputs):
+        pre_latent = self.encoder(inputs)
+        return self.quantizer.map2index(pre_latent)
+
+    def map2latent(self, inputs):
+        pre_latent = self.encoder(inputs)
+        index = self.quantizer.map2index(pre_latent)
+        return self.quantizer.get_codebook_entry(index)
+
+    def decode(self, index):
+        z_q = self.quantizer.get_codebook_entry(index)
+        return self.decoder(z_q)
+
+    def decode_from_latent(self, latent):
+        z_flattened = latent.contiguous().view(-1, self.quantizer.e_dim)
+        d = (
+            torch.sum(z_flattened ** 2, dim=1, keepdim=True)
+            + torch.sum(self.quantizer.embedding.weight ** 2, dim=1)
+            - 2 * torch.matmul(z_flattened, self.quantizer.embedding.weight.t())
+        )
+        min_encoding_indices = torch.argmin(d, dim=1)
+        indices = min_encoding_indices.view(latent.shape[0], latent.shape[1])
+        z_q = self.quantizer.get_codebook_entry(indices)
+        return self.decoder(z_q)
+
+
+class EmageVQModel(nn.Module):
+    def __init__(self, face_model, upper_model, hands_model, lower_model, global_model):
+        super().__init__()
+        self.joint_mask_upper = [
+            False, False, False, True, False, False, True, False, False, True,
+            False, False, True, True, True, True, True, True, True, True,
+            True, True, False, False, False, False, False, False, False, False,
+            False, False, False, False, False, False, False, False, False, False,
+            False, False, False, False, False, False, False, False, False, False,
+            False, False, False, False, False,
+        ]
+        self.joint_mask_lower = [
+            True, True, True, False, True, True, False, True, True, False,
+            True, True, False, False, False, False, False, False, False, False,
+            False, False, False, False, False, False, False, False, False, False,
+            False, False, False, False, False, False, False, False, False, False,
+            False, False, False, False, False, False, False, False, False, False,
+            False, False, False, False, False,
+        ]
+        self.vq_model_face = face_model
+        self.vq_model_upper = upper_model
+        self.vq_model_hands = hands_model
+        self.vq_model_lower = lower_model
+        self.global_motion = global_model
+
+    def spilt_inputs(self, smplx_body_rot6d, expression, tar_contact=None, tar_trans=None):
+        bs, t, j6 = smplx_body_rot6d.shape
+        smplx_body_rot6d = smplx_body_rot6d.reshape(bs, t, j6 // 6, 6)
+        jaw_rot6d = smplx_body_rot6d[:, :, 22:23, :].reshape(bs, t, 6)
+        face = torch.cat([jaw_rot6d, expression], dim=2)
+        upper_rot6d = smplx_body_rot6d[:, :, self.joint_mask_upper, :].reshape(bs, t, 78)
+        hands_rot6d = smplx_body_rot6d[:, :, 25:55, :].reshape(bs, t, 180)
+        lower_rot6d = smplx_body_rot6d[:, :, self.joint_mask_lower, :].reshape(bs, t, 54)
+        tar_contact = torch.zeros(bs, t, 4, device=smplx_body_rot6d.device) if tar_contact is None else tar_contact
+        tar_trans = torch.zeros(bs, t, 3, device=smplx_body_rot6d.device) if tar_trans is None else tar_trans
+        lower = torch.cat([lower_rot6d, tar_trans, tar_contact], dim=2)
+        return dict(face=face, upper=upper_rot6d, hands=hands_rot6d, lower=lower)
+
+    def map2index(self, smplx_body_rot6d, expression, tar_contact=None, tar_trans=None):
+        inputs = self.spilt_inputs(smplx_body_rot6d, expression, tar_contact=tar_contact, tar_trans=tar_trans)
+        return dict(
+            face=self.vq_model_face.map2index(inputs["face"]),
+            upper=self.vq_model_upper.map2index(inputs["upper"]),
+            hands=self.vq_model_hands.map2index(inputs["hands"]),
+            lower=self.vq_model_lower.map2index(inputs["lower"]),
+        )
+
+    def map2latent(self, smplx_body_rot6d, expression, tar_contact=None, tar_trans=None):
+        inputs = self.spilt_inputs(smplx_body_rot6d, expression, tar_contact=tar_contact, tar_trans=tar_trans)
+        return dict(
+            face=self.vq_model_face.map2latent(inputs["face"]),
+            upper=self.vq_model_upper.map2latent(inputs["upper"]),
+            hands=self.vq_model_hands.map2latent(inputs["hands"]),
+            lower=self.vq_model_lower.map2latent(inputs["lower"]),
+        )
+
+    def decode(
+        self,
+        face_index=None, upper_index=None, hands_index=None, lower_index=None,
+        face_latent=None, upper_latent=None, hands_latent=None, lower_latent=None,
+        get_global_motion=False, ref_trans=None,
+    ):
+        for t in [face_index, upper_index, hands_index, lower_index, face_latent, upper_latent, hands_latent, lower_latent]:
+            if t is not None:
+                bs, seq = t.shape[:2]
+                break
+
+        if face_index is not None:
+            face_mix = self.vq_model_face.decode(face_index)
+            face_jaw_6d, expression = face_mix[:, :, :6], face_mix[:, :, 6:]
+            face_jaw = rotation_6d_to_axis_angle(face_jaw_6d)
+        elif face_latent is not None:
+            face_mix = self.vq_model_face.decode_from_latent(face_latent)
+            face_jaw_6d, expression = face_mix[:, :, :6], face_mix[:, :, 6:]
+            face_jaw = rotation_6d_to_axis_angle(face_jaw_6d)
+        else:
+            face_jaw = torch.zeros(bs, seq, 3, device=self.vq_model_face.device)
+            expression = torch.zeros(bs, seq, 100, device=self.vq_model_face.device)
+
+        if upper_index is not None:
+            upper_6d = self.vq_model_upper.decode(upper_index)
+            upper = rotation_6d_to_axis_angle(upper_6d.reshape(bs, seq, -1, 6)).reshape(bs, seq, -1)
+        elif upper_latent is not None:
+            upper_6d = self.vq_model_upper.decode_from_latent(upper_latent)
+            upper = rotation_6d_to_axis_angle(upper_6d.reshape(bs, seq, -1, 6)).reshape(bs, seq, -1)
+        else:
+            upper = torch.zeros(bs, seq, 39, device=self.vq_model_upper.device)
+
+        if hands_index is not None:
+            hands_6d = self.vq_model_hands.decode(hands_index)
+            hands = rotation_6d_to_axis_angle(hands_6d.reshape(bs, seq, -1, 6)).reshape(bs, seq, -1)
+        elif hands_latent is not None:
+            hands_6d = self.vq_model_hands.decode_from_latent(hands_latent)
+            hands = rotation_6d_to_axis_angle(hands_6d.reshape(bs, seq, -1, 6)).reshape(bs, seq, -1)
+        else:
+            hands = torch.zeros(bs, seq, 90, device=self.vq_model_hands.device)
+
+        if lower_index is not None:
+            lower_mix = self.vq_model_lower.decode(lower_index)
+            lower_6d, transfoot = lower_mix[:, :, :-7], lower_mix[:, :, -7:]
+            lower = rotation_6d_to_axis_angle(lower_6d.reshape(bs, seq, -1, 6)).reshape(bs, seq, -1)
+        elif lower_latent is not None:
+            lower_mix = self.vq_model_lower.decode_from_latent(lower_latent)
+            lower_6d, transfoot = lower_mix[:, :, :-7], lower_mix[:, :, -7:]
+            lower = rotation_6d_to_axis_angle(lower_6d.reshape(bs, seq, -1, 6)).reshape(bs, seq, -1)
+        else:
+            lower = torch.zeros(bs, seq, 27, device=self.vq_model_lower.device)
+            transfoot = torch.zeros(bs, seq, 7, device=self.vq_model_lower.device)
+            lower_6d = axis_angle_to_rotation_6d(lower.reshape(bs, seq, -1, 3)).reshape(bs, seq, -1)
+            lower_mix = torch.cat([lower_6d, transfoot], dim=-1)
+
+        upper2all = recover_from_mask_ts(upper, self.joint_mask_upper)
+        hands2all = recover_from_mask_ts(hands, [False] * 25 + [True] * 30)
+        lower2all = recover_from_mask_ts(lower, self.joint_mask_lower)
+
+        all_motion_axis_angle = upper2all + hands2all + lower2all
+        all_motion_axis_angle[:, :, 22 * 3:22 * 3 + 3] = face_jaw
+        all_motion_rot6d = axis_angle_to_rotation_6d(all_motion_axis_angle.reshape(bs, seq, 55, 3)).reshape(bs, seq, 55 * 6)
+        all_motion4inference = torch.cat([all_motion_rot6d, transfoot], dim=2)
+
+        global_motion = None
+        if get_global_motion:
+            global_motion = self._get_global_motion(lower_mix, ref_trans)
+
+        return dict(
+            expression=expression,
+            all_motion4inference=all_motion4inference,
+            motion_axis_angle=all_motion_axis_angle,
+            trans=global_motion,
+        )
+
+    def _get_global_motion(self, lower_body, ref_trans):
+        global_motion = self.global_motion(lower_body)
+        rec_trans_v_s = global_motion["rec_pose"][:, :, 54:57]
+        if len(ref_trans.shape) == 2:
+            ref_trans = ref_trans.unsqueeze(0).repeat(rec_trans_v_s.shape[0], 1, 1)
+        rec_x_trans = velocity2position(rec_trans_v_s[:, :, 0:1], 1 / 30, ref_trans[:, 0, 0:1])
+        rec_z_trans = velocity2position(rec_trans_v_s[:, :, 2:3], 1 / 30, ref_trans[:, 0, 2:3])
+        rec_y_trans = rec_trans_v_s[:, :, 1:2]
+        return torch.cat([rec_x_trans, rec_y_trans, rec_z_trans], dim=-1)
+
+
+class EmageAudioModel(PreTrainedModel):
+    config_class = EmageAudioConfig
+    base_model_prefix = "emage_audio"
+
+    def __init__(self, config: EmageAudioConfig):
+        super().__init__(config)
+        self.cfg = config
+        self.audio_encoder_face = WavEncoder(self.cfg.audio_f)
+        self.audio_encoder_body = WavEncoder(self.cfg.audio_f)
+        self.speaker_embedding_body = nn.Embedding(self.cfg.speaker_dims, self.cfg.hidden_size)
+        self.speaker_embedding_face = nn.Embedding(self.cfg.speaker_dims, self.cfg.hidden_size)
+        self.mask_embedding = nn.Parameter(torch.zeros(1, 1, self.cfg.pose_dims + 3 + 4))
+        nn.init.normal_(self.mask_embedding, 0, self.cfg.hidden_size ** -0.5)
+
+        args_top = copy.deepcopy(self.cfg)
+        args_top.vae_layer = 3
+        args_top.vae_length = self.cfg.motion_f
+        args_top.vae_test_dim = self.cfg.pose_dims + 3 + 4
+        self.motion_encoder = VQEncoderV6(args_top)
+        self.bodyhints_face = MLP(self.cfg.motion_f, self.cfg.hidden_size, self.cfg.motion_f)
+        self.bodyhints_body = MLP(self.cfg.motion_f, self.cfg.hidden_size, self.cfg.motion_f)
+        self.audio_body_motion_proj = nn.Linear(self.cfg.audio_f, self.cfg.hidden_size)
+        self.moton_proj = nn.Linear(self.cfg.motion_f, self.cfg.hidden_size)
+        self.position_embeddings = PeriodicPositionalEncoding(self.cfg.hidden_size, period=self.cfg.pose_length, max_seq_len=self.cfg.pose_length)
+        self.transformer_en_layer = nn.TransformerEncoderLayer(d_model=self.cfg.hidden_size, nhead=4, dim_feedforward=self.cfg.hidden_size * 2)
+        self.motion_self_encoder = nn.TransformerEncoder(self.transformer_en_layer, num_layers=1)
+        self.audio_motion_cross_attn_layer = nn.TransformerDecoderLayer(d_model=self.cfg.hidden_size, nhead=4, dim_feedforward=self.cfg.hidden_size * 2)
+        self.audio_motion_cross_attn = nn.TransformerDecoder(self.audio_motion_cross_attn_layer, num_layers=8)
+        self.motion2latent_upper = MLP(self.cfg.hidden_size, self.cfg.hidden_size, self.cfg.hidden_size)
+        self.motion2latent_hands = MLP(self.cfg.hidden_size, self.cfg.hidden_size, self.cfg.hidden_size)
+        self.motion2latent_lower = MLP(self.cfg.hidden_size, self.cfg.hidden_size, self.cfg.hidden_size)
+        self.body_motion_decoder_upper = nn.TransformerDecoder(self.audio_motion_cross_attn_layer, num_layers=1)
+        self.body_motion_decoder_hands = nn.TransformerDecoder(self.audio_motion_cross_attn_layer, num_layers=1)
+        self.body_motion_decoder_lower = nn.TransformerDecoder(self.audio_motion_cross_attn_layer, num_layers=1)
+        self.motion_out_proj_upper = nn.Linear(self.cfg.hidden_size, self.cfg.vae_codebook_size)
+        self.motion_out_proj_hands = nn.Linear(self.cfg.hidden_size, self.cfg.vae_codebook_size)
+        self.motion_out_proj_lower = nn.Linear(self.cfg.hidden_size, self.cfg.vae_codebook_size)
+        self.motion_cls_upper = MLP(self.cfg.vae_codebook_size, self.cfg.hidden_size, self.cfg.vae_codebook_size)
+        self.motion_cls_hands = MLP(self.cfg.vae_codebook_size, self.cfg.hidden_size, self.cfg.vae_codebook_size)
+        self.motion_cls_lower = MLP(self.cfg.vae_codebook_size, self.cfg.hidden_size, self.cfg.vae_codebook_size)
+        self.audio_face_motion_proj = nn.Linear(self.cfg.audio_f + self.cfg.motion_f, self.cfg.hidden_size)
+        self.face_motion_decoder = nn.TransformerDecoder(self.audio_motion_cross_attn_layer, num_layers=4)
+        self.face_out_proj = nn.Linear(self.cfg.hidden_size, self.cfg.vae_codebook_size)
+        self.face_cls = MLP(self.cfg.vae_codebook_size, self.cfg.hidden_size, self.cfg.vae_codebook_size)
+
+    def forward(self, audio, speaker_id, masked_motion, mask, use_audio=True):
+        masked_embeddings = self.mask_embedding.expand_as(masked_motion)
+        masked_motion = torch.where(mask == 1, masked_embeddings, masked_motion)
+
+        body_hint = self.motion_encoder(masked_motion)
+        body_hint_body = self.bodyhints_body(body_hint)
+        body_hint_face = self.bodyhints_face(body_hint)
+
+        audio2face_fea = self.audio_encoder_face(audio)
+        audio2body_fea = self.audio_encoder_body(audio)
+
+        if audio2face_fea.shape[1] > body_hint_face.shape[1]:
+            audio2face_fea = audio2face_fea[:, :body_hint_face.shape[1]]
+        if audio2body_fea.shape[1] > body_hint_face.shape[1]:
+            audio2face_fea = audio2face_fea[:, :body_hint_face.shape[1]]
+
+        bs, t, _ = audio2face_fea.shape
+
+        speaker_motion_fea_proj = self.speaker_embedding_body(speaker_id).repeat(1, t, 1)
+        speaker_face_fea_proj = self.speaker_embedding_face(speaker_id).repeat(1, t, 1)
+
+        audio2face_fea_proj = self.audio_face_motion_proj(torch.cat([audio2face_fea, body_hint_face], dim=2))
+        face_proj = self.position_embeddings(speaker_face_fea_proj)
+        decode_face = self.face_motion_decoder(tgt=face_proj.permute(1, 0, 2), memory=audio2face_fea_proj.permute(1, 0, 2)).permute(1, 0, 2)
+        face_latent = self.face_out_proj(decode_face)
+        classify_face = self.face_cls(face_latent)
+
+        masked_motion_proj = self.moton_proj(body_hint_body)
+        masked_motion_proj = self.position_embeddings(masked_motion_proj)
+        masked_motion_proj = speaker_motion_fea_proj + masked_motion_proj
+        motion_fea = self.motion_self_encoder(masked_motion_proj.permute(1, 0, 2)).permute(1, 0, 2)
+
+        audio2body_fea_proj = self.audio_body_motion_proj(audio2body_fea)
+        motion_fea = motion_fea + speaker_motion_fea_proj
+        motion_fea = self.position_embeddings(motion_fea)
+        audio2body_fea_cross = self.audio_motion_cross_attn(tgt=motion_fea.permute(1, 0, 2), memory=audio2body_fea_proj.permute(1, 0, 2)).permute(1, 0, 2)
+        if not use_audio:
+            audio2body_fea_cross = audio2body_fea_cross * 0.0
+        motion_fea = motion_fea + audio2body_fea_cross
+
+        upper_latent = self.motion2latent_upper(motion_fea)
+        hands_latent = self.motion2latent_hands(motion_fea)
+        lower_latent = self.motion2latent_lower(motion_fea)
+
+        motion_upper_refine = self.body_motion_decoder_upper(tgt=upper_latent.permute(1, 0, 2) + speaker_motion_fea_proj.permute(1, 0, 2), memory=(hands_latent + lower_latent).permute(1, 0, 2)).permute(1, 0, 2)
+        motion_hands_refine = self.body_motion_decoder_hands(tgt=hands_latent.permute(1, 0, 2) + speaker_motion_fea_proj.permute(1, 0, 2), memory=(upper_latent + lower_latent).permute(1, 0, 2)).permute(1, 0, 2)
+        motion_lower_refine = self.body_motion_decoder_lower(tgt=lower_latent.permute(1, 0, 2) + speaker_motion_fea_proj.permute(1, 0, 2), memory=(upper_latent + hands_latent).permute(1, 0, 2)).permute(1, 0, 2)
+        upper_latent = self.motion_out_proj_upper(upper_latent + motion_upper_refine)
+        hands_latent = self.motion_out_proj_hands(hands_latent + motion_hands_refine)
+        lower_latent = self.motion_out_proj_lower(lower_latent + motion_lower_refine)
+
+        classify_upper = self.motion_cls_upper(upper_latent)
+        classify_hands = self.motion_cls_hands(hands_latent)
+        classify_lower = self.motion_cls_lower(lower_latent)
+
+        return {
+            "rec_face": face_latent,
+            "rec_upper": upper_latent,
+            "rec_hands": hands_latent,
+            "rec_lower": lower_latent,
+            "cls_face": classify_face,
+            "cls_upper": classify_upper,
+            "cls_hands": classify_hands,
+            "cls_lower": classify_lower,
+        }
+
+    def inference(self, audio, speaker_id, vq_model, masked_motion=None, mask=None):
+        length = audio.shape[1] * 30 // 16000
+        bs = audio.shape[0]
+
+        fake_axis_angle = torch.zeros(bs, length, 55, 3).to(audio.device)
+        fake_motion = axis_angle_to_rotation_6d(fake_axis_angle).reshape(bs, length, -1)
+        fake_foot_and_trans = torch.zeros(bs, length, 7).to(audio.device)
+        fake_motion = torch.cat([fake_motion, fake_foot_and_trans], dim=-1)
+        if masked_motion is not None:
+            fake_motion[:, :masked_motion.shape[1]] = masked_motion
+        masked_motion = fake_motion
+
+        fake_mask = torch.ones_like(masked_motion)
+        if mask is not None:
+            fake_mask[:, :mask.shape[1]] = mask
+        mask = fake_mask
+
+        bs, total_len, c = masked_motion.shape
+        window = self.cfg.pose_length
+        pre_frames = self.cfg.seed_frames
+        rounds = (total_len - pre_frames) // (window - pre_frames)
+        remain = (total_len - pre_frames) % (window - pre_frames)
+
+        rec_all_face, rec_all_lower, rec_all_upper, rec_all_hands = [], [], [], []
+        cls_all_face, cls_all_lower, cls_all_upper, cls_all_hands = [], [], [], []
+
+        last_motion = masked_motion[:, :pre_frames, :]
+
+        for i in range(rounds):
+            start_idx = i * (window - pre_frames)
+            end_idx = start_idx + window
+
+            window_mask = mask[:, start_idx:end_idx, :].clone()
+            window_motion = masked_motion[:, start_idx:end_idx, :].clone()
+            window_motion[:, :pre_frames, :] = torch.where(
+                (window_mask[:, :pre_frames, :] == 0),
+                masked_motion[:, start_idx:start_idx + pre_frames, :],
+                last_motion,
+            )
+            window_mask[:, :pre_frames, :] = 0
+
+            audio_slice_len = (end_idx - start_idx) * (16000 // 30)
+            audio_slice = audio[:, start_idx * (16000 // 30):start_idx * (16000 // 30) + audio_slice_len]
+            net_out_val = self.forward(audio_slice, speaker_id, masked_motion=window_motion, mask=window_mask, use_audio=True)
+
+            _, cls_face = torch.max(F.log_softmax(net_out_val["cls_face"], dim=2), dim=2)
+            _, cls_upper = torch.max(F.log_softmax(net_out_val["cls_upper"], dim=2), dim=2)
+            _, cls_hands = torch.max(F.log_softmax(net_out_val["cls_hands"], dim=2), dim=2)
+            _, cls_lower = torch.max(F.log_softmax(net_out_val["cls_lower"], dim=2), dim=2)
+
+            face_latent = net_out_val["rec_face"] if self.cfg.lf > 0 and self.cfg.cf == 0 else None
+            upper_latent = net_out_val["rec_upper"] if self.cfg.lu > 0 and self.cfg.cu == 0 else None
+            hands_latent = net_out_val["rec_hands"] if self.cfg.lh > 0 and self.cfg.ch == 0 else None
+            lower_latent = net_out_val["rec_lower"] if self.cfg.ll > 0 and self.cfg.cl == 0 else None
+            face_index = cls_face if self.cfg.cf > 0 else None
+            upper_index = cls_upper if self.cfg.cu > 0 else None
+            hands_index = cls_hands if self.cfg.ch > 0 else None
+            lower_index = cls_lower if self.cfg.cl > 0 else None
+
+            decode_dict = vq_model.decode(
+                face_latent=face_latent, upper_latent=upper_latent,
+                lower_latent=lower_latent, hands_latent=hands_latent,
+                face_index=face_index, upper_index=upper_index,
+                lower_index=lower_index, hands_index=hands_index,
+            )
+
+            last_motion = decode_dict["all_motion4inference"][:, -pre_frames:, :]
+            rec_all_face.append(net_out_val["rec_face"][:, :-pre_frames, :])
+            rec_all_upper.append(net_out_val["rec_upper"][:, :-pre_frames, :])
+            rec_all_hands.append(net_out_val["rec_hands"][:, :-pre_frames, :])
+            rec_all_lower.append(net_out_val["rec_lower"][:, :-pre_frames, :])
+            cls_all_face.append(net_out_val["cls_face"][:, :-pre_frames])
+            cls_all_upper.append(net_out_val["cls_upper"][:, :-pre_frames])
+            cls_all_hands.append(net_out_val["cls_hands"][:, :-pre_frames])
+            cls_all_lower.append(net_out_val["cls_lower"][:, :-pre_frames])
+
+        if remain > pre_frames:
+            final_start = rounds * (window - pre_frames)
+            final_end = final_start + pre_frames + remain
+
+            final_mask = mask[:, final_start:final_end, :].clone()
+            final_motion = masked_motion[:, final_start:final_end, :].clone()
+            final_motion[:, :pre_frames, :] = torch.where(
+                (final_mask[:, :pre_frames, :] == 0),
+                masked_motion[:, final_start:final_start + pre_frames, :],
+                last_motion,
+            )
+            final_mask[:, :pre_frames, :] = 0
+
+            audio_slice_len = (final_end - final_start) * (16000 // 30)
+            audio_slice = audio[:, final_start * (16000 // 30):final_start * (16000 // 30) + audio_slice_len]
+            net_out_val = self.forward(audio_slice, speaker_id, masked_motion=final_motion, mask=final_mask, use_audio=True)
+
+            rec_all_face.append(net_out_val["rec_face"])
+            rec_all_upper.append(net_out_val["rec_upper"])
+            rec_all_hands.append(net_out_val["rec_hands"])
+            rec_all_lower.append(net_out_val["rec_lower"])
+            cls_all_face.append(net_out_val["cls_face"])
+            cls_all_upper.append(net_out_val["cls_upper"])
+            cls_all_hands.append(net_out_val["cls_hands"])
+            cls_all_lower.append(net_out_val["cls_lower"])
+
+        return {
+            "rec_face": torch.cat(rec_all_face, dim=1),
+            "rec_upper": torch.cat(rec_all_upper, dim=1),
+            "rec_hands": torch.cat(rec_all_hands, dim=1),
+            "rec_lower": torch.cat(rec_all_lower, dim=1),
+            "cls_face": torch.cat(cls_all_face, dim=1),
+            "cls_upper": torch.cat(cls_all_upper, dim=1),
+            "cls_hands": torch.cat(cls_all_hands, dim=1),
+            "cls_lower": torch.cat(cls_all_lower, dim=1),
+        }
diff --git a/src/modules/gesture/emage/processing.py b/src/modules/gesture/emage/processing.py
new file mode 100644
index 0000000..2d128b1
--- /dev/null
+++ b/src/modules/gesture/emage/processing.py
@@ -0,0 +1,380 @@
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+def _copysign(a, b):
+    signs_differ = (a < 0) != (b < 0)
+    return torch.where(signs_differ, -a, a)
+
+
+def _sqrt_positive_part(x):
+    ret = torch.zeros_like(x)
+    positive_mask = x > 0
+    ret[positive_mask] = torch.sqrt(x[positive_mask])
+    return ret
+
+
+def matrix_to_quaternion(matrix):
+    if matrix.size(-1) != 3 or matrix.size(-2) != 3:
+        raise ValueError
+    m00 = matrix[..., 0, 0]
+    m11 = matrix[..., 1, 1]
+    m22 = matrix[..., 2, 2]
+    o0 = 0.5 * _sqrt_positive_part(1 + m00 + m11 + m22)
+    x = 0.5 * _sqrt_positive_part(1 + m00 - m11 - m22)
+    y = 0.5 * _sqrt_positive_part(1 - m00 + m11 - m22)
+    z = 0.5 * _sqrt_positive_part(1 - m00 - m11 + m22)
+    o1 = _copysign(x, matrix[..., 2, 1] - matrix[..., 1, 2])
+    o2 = _copysign(y, matrix[..., 0, 2] - matrix[..., 2, 0])
+    o3 = _copysign(z, matrix[..., 1, 0] - matrix[..., 0, 1])
+    return torch.stack((o0, o1, o2, o3), -1)
+
+
+def quaternion_to_axis_angle(quaternions):
+    norms = torch.norm(quaternions[..., 1:], p=2, dim=-1, keepdim=True)
+    half_angles = torch.atan2(norms, quaternions[..., :1])
+    angles = 2 * half_angles
+    eps = 1e-6
+    small_angles = angles.abs() < eps
+    sin_half_angles_over_angles = torch.empty_like(angles)
+    sin_half_angles_over_angles[~small_angles] = (
+        torch.sin(half_angles[~small_angles]) / angles[~small_angles]
+    )
+    sin_half_angles_over_angles[small_angles] = (
+        0.5 - (angles[small_angles] * angles[small_angles]) / 48
+    )
+    return quaternions[..., 1:] / sin_half_angles_over_angles
+
+
+def matrix_to_axis_angle(matrix):
+    return quaternion_to_axis_angle(matrix_to_quaternion(matrix))
+
+
+def rotation_6d_to_matrix(d6: torch.Tensor) -> torch.Tensor:
+    a1, a2 = d6[..., :3], d6[..., 3:]
+    b1 = F.normalize(a1, dim=-1)
+    b2 = a2 - (b1 * a2).sum(-1, keepdim=True) * b1
+    b2 = F.normalize(b2, dim=-1)
+    b3 = torch.cross(b1, b2, dim=-1)
+    return torch.stack((b1, b2, b3), dim=-2)
+
+
+def rotation_6d_to_axis_angle(rot6d):
+    return matrix_to_axis_angle(rotation_6d_to_matrix(rot6d))
+
+
+def matrix_to_rotation_6d(matrix: torch.Tensor) -> torch.Tensor:
+    return matrix[..., :2, :].clone().reshape(*matrix.size()[:-2], 6)
+
+
+def axis_angle_to_quaternion(axis_angle):
+    angles = torch.norm(axis_angle, p=2, dim=-1, keepdim=True)
+    half_angles = 0.5 * angles
+    eps = 1e-6
+    small_angles = angles.abs() < eps
+    sin_half_angles_over_angles = torch.empty_like(angles)
+    sin_half_angles_over_angles[~small_angles] = (
+        torch.sin(half_angles[~small_angles]) / angles[~small_angles]
+    )
+    sin_half_angles_over_angles[small_angles] = (
+        0.5 - (angles[small_angles] * angles[small_angles]) / 48
+    )
+    quaternions = torch.cat(
+        [torch.cos(half_angles), axis_angle * sin_half_angles_over_angles], dim=-1
+    )
+    return quaternions
+
+
+def quaternion_to_matrix(quaternions):
+    r, i, j, k = torch.unbind(quaternions, -1)
+    two_s = 2.0 / (quaternions * quaternions).sum(-1)
+    o = torch.stack(
+        (
+            1 - two_s * (j * j + k * k),
+            two_s * (i * j - k * r),
+            two_s * (i * k + j * r),
+            two_s * (i * j + k * r),
+            1 - two_s * (i * i + k * k),
+            two_s * (j * k - i * r),
+            two_s * (i * k - j * r),
+            two_s * (j * k + i * r),
+            1 - two_s * (i * i + j * j),
+        ),
+        -1,
+    )
+    return o.reshape(quaternions.shape[:-1] + (3, 3))
+
+
+def axis_angle_to_matrix(axis_angle):
+    return quaternion_to_matrix(axis_angle_to_quaternion(axis_angle))
+
+
+def axis_angle_to_rotation_6d(axis_angle):
+    return matrix_to_rotation_6d(axis_angle_to_matrix(axis_angle))
+
+
+def velocity2position(data_seq, dt, init_pos):
+    res_trans = []
+    for i in range(data_seq.shape[1]):
+        if i == 0:
+            res_trans.append(init_pos.unsqueeze(1))
+        else:
+            res = data_seq[:, i - 1:i] * dt + res_trans[-1]
+            res_trans.append(res)
+    return torch.cat(res_trans, dim=1)
+
+
+def recover_from_mask_ts(selected_motion: torch.Tensor, mask: list) -> torch.Tensor:
+    device = selected_motion.device
+    dtype = selected_motion.dtype
+    mask_arr = torch.tensor(mask, dtype=torch.bool, device=device)
+    j = len(mask_arr)
+    sum_mask = mask_arr.sum().item()
+    c_channels = selected_motion.shape[-1] // sum_mask
+    new_shape = selected_motion.shape[:-1] + (sum_mask, c_channels)
+    selected_motion = selected_motion.reshape(new_shape)
+    out_shape = list(selected_motion.shape[:-2]) + [j, c_channels]
+    recovered = torch.zeros(out_shape, dtype=dtype, device=device)
+    recovered[..., mask_arr, :] = selected_motion
+    final_shape = list(recovered.shape[:-2]) + [j * c_channels]
+    recovered = recovered.reshape(final_shape)
+    return recovered
+
+
+class Quantizer(nn.Module):
+    def __init__(self, n_e, e_dim, beta):
+        super().__init__()
+        self.e_dim = e_dim
+        self.n_e = n_e
+        self.beta = beta
+        self.embedding = nn.Embedding(self.n_e, self.e_dim)
+        self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
+
+    def forward(self, z):
+        assert z.shape[-1] == self.e_dim
+        z_flattened = z.contiguous().view(-1, self.e_dim)
+        d = (
+            torch.sum(z_flattened ** 2, dim=1, keepdim=True)
+            + torch.sum(self.embedding.weight ** 2, dim=1)
+            - 2 * torch.matmul(z_flattened, self.embedding.weight.t())
+        )
+        min_encoding_indices = torch.argmin(d, dim=1)
+        z_q = self.embedding(min_encoding_indices).view(z.shape)
+        loss = torch.mean((z_q - z.detach()) ** 2) + self.beta * torch.mean((z_q.detach() - z) ** 2)
+        z_q = z + (z_q - z).detach()
+        min_encodings = F.one_hot(min_encoding_indices, self.n_e).type(z.dtype)
+        e_mean = torch.mean(min_encodings, dim=0)
+        perplexity = torch.exp(-torch.sum(e_mean * torch.log(e_mean + 1e-10)))
+        return loss, z_q, min_encoding_indices, perplexity
+
+    def map2index(self, z):
+        assert z.shape[-1] == self.e_dim
+        z_flattened = z.contiguous().view(-1, self.e_dim)
+        d = (
+            torch.sum(z_flattened ** 2, dim=1, keepdim=True)
+            + torch.sum(self.embedding.weight ** 2, dim=1)
+            - 2 * torch.matmul(z_flattened, self.embedding.weight.t())
+        )
+        min_encoding_indices = torch.argmin(d, dim=1)
+        return min_encoding_indices.reshape(z.shape[0], -1)
+
+    def get_codebook_entry(self, indices):
+        index_flattened = indices.view(-1)
+        z_q = self.embedding(index_flattened)
+        z_q = z_q.view(indices.shape + (self.e_dim,)).contiguous()
+        return z_q
+
+
+def init_weight(m):
+    if isinstance(m, (nn.Conv1d, nn.Linear, nn.ConvTranspose1d)):
+        nn.init.xavier_normal_(m.weight)
+        if m.bias is not None:
+            nn.init.constant_(m.bias, 0)
+
+
+class ResBlock(nn.Module):
+    def __init__(self, channel):
+        super().__init__()
+        self.model = nn.Sequential(
+            nn.Conv1d(channel, channel, 3, 1, 1),
+            nn.LeakyReLU(0.2, True),
+            nn.Conv1d(channel, channel, 3, 1, 1),
+        )
+
+    def forward(self, x):
+        return self.model(x) + x
+
+
+class VQEncoderV5(nn.Module):
+    def __init__(self, args):
+        super().__init__()
+        n_down = args.vae_layer
+        channels = [args.vae_length] * n_down
+        input_size = args.vae_test_dim
+        layers = [
+            nn.Conv1d(input_size, channels[0], 3, 1, 1),
+            nn.LeakyReLU(0.2, True),
+            ResBlock(channels[0]),
+        ]
+        for i in range(1, n_down):
+            layers += [
+                nn.Conv1d(channels[i - 1], channels[i], 3, 1, 1),
+                nn.LeakyReLU(0.2, True),
+                ResBlock(channels[i]),
+            ]
+        self.main = nn.Sequential(*layers)
+        self.main.apply(init_weight)
+
+    def forward(self, inputs):
+        inputs = inputs.permute(0, 2, 1)
+        outputs = self.main(inputs).permute(0, 2, 1)
+        return outputs
+
+
+class VQEncoderV6(nn.Module):
+    def __init__(self, args):
+        super().__init__()
+        n_down = args.vae_layer
+        channels = [args.vae_length] * n_down
+        input_size = args.vae_test_dim
+        layers = [
+            nn.Conv1d(input_size, channels[0], 3, 1, 1),
+            nn.LeakyReLU(0.2, True),
+            ResBlock(channels[0]),
+        ]
+        for i in range(1, n_down):
+            layers += [
+                nn.Conv1d(channels[i - 1], channels[i], 3, 1, 1),
+                nn.LeakyReLU(0.2, True),
+                ResBlock(channels[i]),
+            ]
+        self.main = nn.Sequential(*layers)
+        self.main.apply(init_weight)
+
+    def forward(self, inputs):
+        inputs = inputs.permute(0, 2, 1)
+        outputs = self.main(inputs).permute(0, 2, 1)
+        return outputs
+
+
+class VQDecoderV5(nn.Module):
+    def __init__(self, args):
+        super().__init__()
+        n_up = args.vae_layer
+        channels = [args.vae_length] * n_up + [args.vae_test_dim]
+        input_size = args.vae_length
+        n_resblk = 2
+        if input_size == channels[0]:
+            layers = []
+        else:
+            layers = [nn.Conv1d(input_size, channels[0], 3, 1, 1)]
+        for i in range(n_resblk):
+            layers += [ResBlock(channels[0])]
+        for i in range(n_up):
+            layers += [
+                nn.Conv1d(channels[i], channels[i + 1], 3, 1, 1),
+                nn.LeakyReLU(0.2, True),
+            ]
+        layers += [nn.Conv1d(channels[-1], channels[-1], 3, 1, 1)]
+        self.main = nn.Sequential(*layers)
+        self.main.apply(init_weight)
+
+    def forward(self, inputs):
+        inputs = inputs.permute(0, 2, 1)
+        outputs = self.main(inputs).permute(0, 2, 1)
+        return outputs
+
+
+class BasicBlock(nn.Module):
+    def __init__(self, inplanes, planes, ker_size, stride=1, downsample=None, dilation=1, first_dilation=None, act_layer=nn.LeakyReLU, norm_layer=nn.BatchNorm1d):
+        super().__init__()
+        self.conv1 = nn.Conv1d(
+            inplanes, planes, kernel_size=ker_size, stride=stride,
+            padding=first_dilation, dilation=dilation, bias=True,
+        )
+        self.bn1 = norm_layer(planes)
+        self.act1 = act_layer(inplace=True)
+        self.conv2 = nn.Conv1d(
+            planes, planes, kernel_size=ker_size, padding=ker_size // 2,
+            dilation=dilation, bias=True,
+        )
+        self.bn2 = norm_layer(planes)
+        self.act2 = act_layer(inplace=True)
+        if downsample is not None:
+            self.downsample = nn.Sequential(
+                nn.Conv1d(inplanes, planes, stride=stride, kernel_size=ker_size,
+                          padding=first_dilation, dilation=dilation, bias=True),
+                norm_layer(planes),
+            )
+        else:
+            self.downsample = None
+
+    def forward(self, x):
+        shortcut = x
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.act1(x)
+        x = self.conv2(x)
+        x = self.bn2(x)
+        if self.downsample is not None:
+            shortcut = self.downsample(shortcut)
+        x += shortcut
+        x = self.act2(x)
+        return x
+
+
+class WavEncoder(nn.Module):
+    def __init__(self, out_dim, audio_in=1):
+        super().__init__()
+        self.out_dim = out_dim
+        self.feat_extractor = nn.Sequential(
+            BasicBlock(audio_in, out_dim // 4, 15, 5, first_dilation=1600, downsample=True),
+            BasicBlock(out_dim // 4, out_dim // 4, 15, 6, first_dilation=0, downsample=True),
+            BasicBlock(out_dim // 4, out_dim // 4, 15, 1, first_dilation=7),
+            BasicBlock(out_dim // 4, out_dim // 2, 15, 6, first_dilation=0, downsample=True),
+            BasicBlock(out_dim // 2, out_dim // 2, 15, 1, first_dilation=7),
+            BasicBlock(out_dim // 2, out_dim, 15, 3, first_dilation=0, downsample=True),
+        )
+
+    def forward(self, wav_data):
+        if wav_data.dim() == 2:
+            wav_data = wav_data.unsqueeze(1)
+        else:
+            wav_data = wav_data.transpose(1, 2)
+        out = self.feat_extractor(wav_data)
+        return out.transpose(1, 2)
+
+
+class MLP(nn.Module):
+    def __init__(self, in_dim, middle_dim, out_dim):
+        super().__init__()
+        self.fc1 = nn.Linear(in_dim, middle_dim)
+        self.fc2 = nn.Linear(middle_dim, out_dim)
+        self.act = nn.LeakyReLU(0.1, True)
+
+    def forward(self, x):
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.fc2(x)
+        return x
+
+
+class PeriodicPositionalEncoding(nn.Module):
+    def __init__(self, d_model, dropout=0.1, period=15, max_seq_len=60):
+        super().__init__()
+        self.dropout = nn.Dropout(p=dropout)
+        pe = torch.zeros(period, d_model)
+        position = torch.arange(0, period, dtype=torch.float).unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
+        pe = pe.unsqueeze(0)
+        repeat_num = (max_seq_len // period) + 1
+        pe = pe.repeat(1, repeat_num, 1)
+        self.register_buffer('pe', pe)
+
+    def forward(self, x):
+        x = x + self.pe[:, :x.size(1), :]
+        return self.dropout(x)
diff --git a/src/modules/gesture/gesture.py b/src/modules/gesture/gesture.py
new file mode 100644
index 0000000..4e2a380
--- /dev/null
+++ b/src/modules/gesture/gesture.py
@@ -0,0 +1,142 @@
+import asyncio
+import os
+from dataclasses import dataclass
+from typing import AsyncGenerator, Optional
+
+import numpy as np
+from ray import serve
+from ray.serve import handle
+
+from src.core.module import Module, ModuleWithHandle
+from src.modules.text_to_speech.text_to_speech import Audio
+
+
+_HF_REPO = os.environ.get("HURI_EMAGE_REPO", "H-Liu1997/emage_audio")
+_EMAGE_SR = 16000  # EMAGE expects 16 kHz mono audio
+
+
+@dataclass
+class Motion:
+    poses: np.ndarray        # (t, 165)  SMPL-X axis-angle, 55 joints × 3
+    expressions: np.ndarray  # (t, 100)  facial expression coefficients
+    trans: np.ndarray        # (t, 3)    global root translation
+    fps: int = 30
+
+
+@serve.deployment(name="GestureGeneration")
+class GestureDeployment:
+    def __init__(
+        self,
+        hf_repo: str = _HF_REPO,
+        device: Optional[str] = None,
+    ):
+        import torch
+        from .emage import EmageAudioModel, EmageVAEConv, EmageVQModel, EmageVQVAEConv
+
+        self.device = torch.device(
+            device if device else ("cuda" if torch.cuda.is_available() else "cpu")
+        )
+
+        face_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/face").to(self.device)
+        upper_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/upper").to(self.device)
+        lower_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/lower").to(self.device)
+        hands_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/hands").to(self.device)
+        global_ae = EmageVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/global").to(self.device)
+
+        self.motion_vq = EmageVQModel(
+            face_model=face_vq,
+            upper_model=upper_vq,
+            lower_model=lower_vq,
+            hands_model=hands_vq,
+            global_model=global_ae,
+        )
+        self.motion_vq.eval()
+
+        self.model = EmageAudioModel.from_pretrained(hf_repo).to(self.device)
+        self.model.eval()
+
+    def infer(self, audio_np: np.ndarray) -> Motion:
+        import torch
+        import torch.nn.functional as F
+
+        audio_ts = torch.from_numpy(audio_np).to(self.device).unsqueeze(0)
+        speaker_id = torch.zeros(1, 1, dtype=torch.long, device=self.device)
+
+        with torch.no_grad():
+            ref_trans = torch.zeros(1, 1, 3, device=self.device)
+            latent_dict = self.model.inference(audio_ts, speaker_id, self.motion_vq)
+
+            cfg = self.model.cfg
+            face_latent = latent_dict["rec_face"] if cfg.lf > 0 and cfg.cf == 0 else None
+            upper_latent = latent_dict["rec_upper"] if cfg.lu > 0 and cfg.cu == 0 else None
+            hands_latent = latent_dict["rec_hands"] if cfg.lh > 0 and cfg.ch == 0 else None
+            lower_latent = latent_dict["rec_lower"] if cfg.ll > 0 and cfg.cl == 0 else None
+            face_index = torch.max(F.log_softmax(latent_dict["cls_face"], dim=2), dim=2)[1] if cfg.cf > 0 else None
+            upper_index = torch.max(F.log_softmax(latent_dict["cls_upper"], dim=2), dim=2)[1] if cfg.cu > 0 else None
+            hands_index = torch.max(F.log_softmax(latent_dict["cls_hands"], dim=2), dim=2)[1] if cfg.ch > 0 else None
+            lower_index = torch.max(F.log_softmax(latent_dict["cls_lower"], dim=2), dim=2)[1] if cfg.cl > 0 else None
+
+            all_pred = self.motion_vq.decode(
+                face_latent=face_latent, upper_latent=upper_latent,
+                lower_latent=lower_latent, hands_latent=hands_latent,
+                face_index=face_index, upper_index=upper_index,
+                lower_index=lower_index, hands_index=hands_index,
+                get_global_motion=True, ref_trans=ref_trans[:, 0],
+            )
+
+        t = all_pred["motion_axis_angle"].shape[1]
+        return Motion(
+            poses=all_pred["motion_axis_angle"].cpu().numpy().reshape(t, -1),
+            expressions=all_pred["expression"].cpu().numpy().reshape(t, -1),
+            trans=all_pred["trans"].cpu().numpy().reshape(t, -1),
+        )
+
+
+class Gesture(ModuleWithHandle):
+    """Gesture Module
+
+    Consumes streaming Audio chunks produced by TTS and generates whole-body
+    SMPL-X motion using the EMAGE audio-to-gesture model.
+
+    Audio chunks are buffered until TTS signals the end of an utterance
+    (Audio.end == True). At that point the full waveform is passed to EMAGE
+    and a single Motion object is yielded.
+
+    input:  audio (Audio)
+    output: motion (Motion)
+
+    :hf_repo: HuggingFace repository to load EMAGE weights from.
+    :device:  PyTorch device string; defaults to CUDA when available.
+    """
+
+    _handle_cls = GestureDeployment
+    input_type = "audio"
+    output_type = "motion"
+
+    def __init__(
+        self,
+        handle: handle.DeploymentHandle,
+    ):
+        super().__init__(handle)
+        self._chunks: list[np.ndarray] = []
+
+    async def process(self, audio: Audio) -> AsyncGenerator[Motion, None]:  # type: ignore[override]
+        import librosa
+
+        if audio.data.size > 0:
+            chunk = audio.data
+            if audio.sample_rate != _EMAGE_SR:
+                chunk = librosa.resample(chunk, orig_sr=audio.sample_rate, target_sr=_EMAGE_SR)
+            self._chunks.append(chunk.astype(np.float32))
+
+        if not audio.end:
+            return
+
+        if not self._chunks:
+            return
+
+        full_audio = np.concatenate(self._chunks)
+        self._chunks = []
+
+        motion = await self.handle.infer.remote(full_audio)
+        yield motion

From 17098acf07fc5c35775a25248cea06c1ea2b99ab Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Wed, 27 May 2026 01:39:23 +0200
Subject: [PATCH 02/31] feat(tts): all files related to CosyTTS

---
 src/modules/text_to_speech/__init__.py       |   0
 src/modules/text_to_speech/text_to_speech.py | 156 +++++++++++++++++++
 2 files changed, 156 insertions(+)
 create mode 100644 src/modules/text_to_speech/__init__.py
 create mode 100644 src/modules/text_to_speech/text_to_speech.py

diff --git a/src/modules/text_to_speech/__init__.py b/src/modules/text_to_speech/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/modules/text_to_speech/text_to_speech.py b/src/modules/text_to_speech/text_to_speech.py
new file mode 100644
index 0000000..07c7af5
--- /dev/null
+++ b/src/modules/text_to_speech/text_to_speech.py
@@ -0,0 +1,156 @@
+import asyncio
+import os
+import re
+from dataclasses import dataclass
+from typing import AsyncGenerator, Optional
+
+import numpy as np
+from ray import serve
+from ray.serve import handle
+
+from src.core.module import Module, ModuleWithHandle
+
+
+# Defaults — overridden by env vars in production (see README.md)
+_MODEL_PATH = os.environ.get("HURI_MODEL_PATH", "/models/cosytts/iic/CosyVoice2-0.5B")
+_VOICE_SAMPLE_PATH = os.environ.get("HURI_VOICE_SAMPLE_PATH", "/assets/voice.wav")
+_VOICE_SAMPLE_TRANSCRIPT = os.environ.get(
+    "HURI_VOICE_TRANSCRIPT", "Hello, this is my voice sample for cloning."
+)
+
+# Hard endings (.!?) trigger synthesis immediately; soft endings (,;:) only after
+# min_clause_chars are buffered, to avoid synthesizing very short fragments.
+_HARD_END_RE = re.compile(r'[.!?]["\']?\s+')
+_SOFT_END_RE = re.compile(r'[,;:]\s+')
+
+_DONE = object()  # sentinel for exhausted sync generator
+
+
+@dataclass
+class Token:
+    text: str
+    end: bool  # True on the last token of an LLM stream
+
+
+@dataclass
+class Audio:
+    data: np.ndarray  # float32, values in [-1.0, 1.0]
+    sample_rate: int
+    end: bool = False  # True on the last chunk of an utterance
+
+
+@serve.deployment(name="TTS")
+class TTSDeployment:
+    def __init__(
+        self,
+        model_path: str = _MODEL_PATH,
+        voice_sample_path: str = _VOICE_SAMPLE_PATH,
+        voice_sample_transcript: str = _VOICE_SAMPLE_TRANSCRIPT,
+    ):
+        from cosyvoice.cli.cosyvoice import CosyVoice2
+        from cosyvoice.utils.file_utils import load_wav
+
+        self.model = CosyVoice2(model_path, load_jit=False, load_trt=False)
+        self.sample_rate: int = self.model.sample_rate
+
+        self.prompt_speech = load_wav(voice_sample_path, 16000)
+        self.prompt_text: str = voice_sample_transcript
+
+    async def synthesize(self, text: str) -> AsyncGenerator[Audio, None]:
+        """Run CosyVoice2 streaming inference and yield Audio chunks.
+
+        The synchronous CosyVoice2 generator runs in a thread-pool executor so
+        it does not block the asyncio event loop between chunks.
+        """
+        loop = asyncio.get_running_loop()
+        gen = self.model.inference_zero_shot(
+            text,
+            self.prompt_text,
+            self.prompt_speech,
+            stream=True,
+        )
+        while True:
+            result = await loop.run_in_executor(None, next, gen, _DONE)
+            if result is _DONE:
+                break
+            yield Audio(
+                data=result["tts_speech"].squeeze(0).numpy().astype(np.float32),
+                sample_rate=self.sample_rate,
+            )
+
+    async def get_sample_rate(self) -> int:
+        return self.sample_rate
+
+
+class TTS(ModuleWithHandle):
+    """TTS Module
+
+    Stream text tokens in, stream audio chunks out using CosyVoice2 zero-shot
+    voice cloning.
+
+    Buffers incoming tokens and synthesizes as soon as a sentence or clause
+    boundary is detected. Audio chunks are yielded immediately as CosyVoice2
+    produces them, so playback can start before synthesis is complete.
+
+    Compatible with both the Ray Serve event graph (async generator support in
+    EventGraph._run) and direct client streaming.
+
+    input: token (Token),
+    output: audio (Audio)
+
+    :min_clause_chars: minimum buffer length before a soft boundary (,;:)
+        triggers synthesis. Hard endings (.!?) always trigger immediately.
+        Raise this value to produce longer, more natural-sounding segments.
+    """
+
+    _handle_cls = TTSDeployment
+    input_type = "token"
+    output_type = "audio"
+
+    def __init__(
+        self,
+        handle: handle.DeploymentHandle,
+        min_clause_chars: int = 20,
+    ):
+        super().__init__(handle)
+        self.min_clause_chars: int = min_clause_chars
+        self._buffer: str = ""
+
+    async def process(self, token: Token) -> AsyncGenerator[Audio, None]:  # type: ignore[override]
+        self._buffer += token.text
+
+        # Drain all complete clauses from the buffer before waiting for more tokens
+        while True:
+            clause, remainder = self._split(self._buffer)
+            if not clause:
+                break
+            self._buffer = remainder
+            async for chunk in self.handle.synthesize.remote(clause):
+                yield chunk
+
+        # Flush the remaining buffer when the LLM stream ends
+        if token.end and self._buffer.strip():
+            async for chunk in self.handle.synthesize.remote(self._buffer.strip()):
+                yield chunk
+            self._buffer = ""
+        if token.end:
+            sample_rate = await self.handle.get_sample_rate.remote()
+            yield Audio(data=np.array([], dtype=np.float32), sample_rate=sample_rate, end=True)
+
+    def _split(self, text: str) -> tuple[str, str]:
+        """Return (clause_to_synthesize, remaining_buffer).
+
+        Splits on the first hard sentence ending (.!?) unconditionally, or on
+        the first soft clause ending (,;:) once the buffer is long enough.
+        Returns ("", text) when no boundary is found.
+        """
+        m = _HARD_END_RE.search(text)
+        if m:
+            return text[: m.end()].strip(), text[m.end() :]
+
+        if len(text) >= self.min_clause_chars:
+            m = _SOFT_END_RE.search(text)
+            if m:
+                return text[: m.end()].strip(), text[m.end() :]
+
+        return "", text

From 2847f72163e1f5e4b8e62e4ca2c69956f3d28a72 Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Wed, 27 May 2026 01:44:25 +0200
Subject: [PATCH 03/31] feat(helm): local cluster featuring AMD and NVIDIA

---
 deploy/Dockerfile.amd                         |  73 +++++
 deploy/Dockerfile.base                        |  16 +
 deploy/Dockerfile.nvidia                      |  28 ++
 deploy/examples/local_nvidia_amd/Chart.lock   |   6 +
 deploy/examples/local_nvidia_amd/Chart.yaml   |  20 ++
 .../templates/cosytts-model-init-job.yaml     |  86 ++++++
 .../templates/emage-model-init-job.yaml       |  86 ++++++
 .../templates/ingress-dashboard.yaml          |  32 ++
 .../local_nvidia_amd/templates/ingress.yaml   |  32 ++
 .../templates/rayservice.yaml                 | 169 +++++++++++
 .../templates/voice-assets-pvc.yaml           |  28 ++
 deploy/examples/local_nvidia_amd/values.yaml  | 281 ++++++++++++++++++
 requirements-nvidia.txt                       |  37 +++
 13 files changed, 894 insertions(+)
 create mode 100644 deploy/Dockerfile.amd
 create mode 100644 deploy/Dockerfile.base
 create mode 100644 deploy/Dockerfile.nvidia
 create mode 100644 deploy/examples/local_nvidia_amd/Chart.lock
 create mode 100644 deploy/examples/local_nvidia_amd/Chart.yaml
 create mode 100644 deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml
 create mode 100644 deploy/examples/local_nvidia_amd/templates/emage-model-init-job.yaml
 create mode 100644 deploy/examples/local_nvidia_amd/templates/ingress-dashboard.yaml
 create mode 100644 deploy/examples/local_nvidia_amd/templates/ingress.yaml
 create mode 100644 deploy/examples/local_nvidia_amd/templates/rayservice.yaml
 create mode 100644 deploy/examples/local_nvidia_amd/templates/voice-assets-pvc.yaml
 create mode 100644 deploy/examples/local_nvidia_amd/values.yaml
 create mode 100644 requirements-nvidia.txt

diff --git a/deploy/Dockerfile.amd b/deploy/Dockerfile.amd
new file mode 100644
index 0000000..24c7b45
--- /dev/null
+++ b/deploy/Dockerfile.amd
@@ -0,0 +1,73 @@
+FROM rayproject/ray:2.55.1-py312
+WORKDIR /app
+USER root
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    gnupg2 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Add ROCm 7.2.3 repository (Ubuntu 22.04 Jammy)
+RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/keyrings/rocm.gpg \
+    && printf 'deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/7.2.3 jammy main\ndeb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/7.2 jammy main\n' \
+        > /etc/apt/sources.list.d/rocm.list \
+    && printf 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600\n' \
+        > /etc/apt/preferences.d/rocm-pin-600
+
+# Install ROCm runtime + libraries
+RUN apt-get update && apt-get install -y \
+    rocm-hip-runtime \
+    rocm-hip-libraries \
+    rocm-device-libs \
+    rocm-smi-lib \
+    rocblas \
+    hipblas \
+    miopen-hip \
+    rccl \
+    rocsolver \
+    rocfft \
+    rocrand \
+    hipsparse \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set ROCm environment
+ENV ROCM_PATH=/opt/rocm
+ENV PATH="${ROCM_PATH}/bin:${PATH}"
+ENV LD_LIBRARY_PATH="${ROCM_PATH}/lib:${LD_LIBRARY_PATH}"
+
+USER ray
+
+
+COPY serve_requirements.txt /app
+RUN pip install --no-cache-dir -r serve_requirements.txt
+
+# 1. AMD's PyTorch built for ROCm (NOT the PyPI one — it's built for ROCm 6.2 and will silently break)
+ARG ROCM_VERSION=7.2
+ARG PYTHON_VERSION=cp312
+ARG TRITON_VERSION=3.4.0+rocm7.2.0.git0cace8d2
+
+RUN pip install --no-cache-dir \
+    "https://repo.radeon.com/rocm/manylinux/rocm-rel-${ROCM_VERSION}/triton-${TRITON_VERSION}-${PYTHON_VERSION}-${PYTHON_VERSION}-linux_x86_64.whl"
+
+RUN pip install --no-cache-dir \
+    --extra-index-url https://repo.radeon.com/rocm/pypi/ \
+    "https://repo.radeon.com/rocm/manylinux/rocm-rel-${ROCM_VERSION}/torch-2.8.0+rocm${ROCM_VERSION}.0.lw.gitbf943426-${PYTHON_VERSION}-${PYTHON_VERSION}-linux_x86_64.whl" \
+    "https://repo.radeon.com/rocm/manylinux/rocm-rel-${ROCM_VERSION}/torchaudio-2.8.0+rocm${ROCM_VERSION}.0.git6e1c7fe9-${PYTHON_VERSION}-${PYTHON_VERSION}-linux_x86_64.whl"
+
+RUN pip install --no-cache-dir filelock sympy networkx jinja2 fsspec numpy
+
+USER root
+
+# 3. Official CTranslate2 ROCm wheel (it's inside a zip on the releases page)
+RUN apt-get update && apt-get install -y unzip curl \
+ && curl -L https://github.com/OpenNMT/CTranslate2/releases/download/v4.7.1/rocm-python-wheels-Linux.zip \
+    -o /tmp/ct2-rocm.zip \
+ && unzip -j /tmp/ct2-rocm.zip 'temp-linux/ctranslate2-4.7.1-cp312-*manylinux*x86_64.whl' -d /tmp/ct2 \
+ && pip install --no-cache-dir /tmp/ct2/ctranslate2-4.7.1-cp312-*.whl \
+ && rm -rf /tmp/ct2 /tmp/ct2-rocm.zip
+
+USER ray
+
+# 4. faster-whisper
+RUN pip install --no-cache-dir faster-whisper
+COPY src /app/src
diff --git a/deploy/Dockerfile.base b/deploy/Dockerfile.base
new file mode 100644
index 0000000..f46ac35
--- /dev/null
+++ b/deploy/Dockerfile.base
@@ -0,0 +1,16 @@
+FROM rayproject/ray:2.55.1-py312
+
+WORKDIR /app
+
+
+USER root
+RUN apt-get update && apt-get install -y \
+build-essential \
+&& rm -rf /var/lib/apt/lists/*
+
+USER ray
+
+COPY serve_requirements.txt /app
+RUN pip install --no-cache-dir -r serve_requirements.txt
+
+COPY src /app/src
diff --git a/deploy/Dockerfile.nvidia b/deploy/Dockerfile.nvidia
new file mode 100644
index 0000000..f9e704f
--- /dev/null
+++ b/deploy/Dockerfile.nvidia
@@ -0,0 +1,28 @@
+FROM rayproject/ray:2.55.1-py312-gpu
+
+WORKDIR /app
+
+# Full CUDA 12.1 dependency stack (CosyVoice2, faster-whisper, TensorRT, …).
+# PyTorch cu121 wheels live on the PyTorch index; TensorRT wheels on the NGC index.
+COPY requirements-nvidia.txt /app
+RUN pip install --no-cache-dir \
+    --extra-index-url https://download.pytorch.org/whl/cu121 \
+    --extra-index-url https://pypi.ngc.nvidia.com \
+    -r requirements-nvidia.txt
+
+USER root
+
+RUN apt-get update && apt-get install -y --no-install-recommends git \
+    && rm -rf /var/lib/apt/lists/*
+
+USER ray
+
+# CosyVoice2 has no setup.py/pyproject.toml so it cannot be pip-installed.
+# Clone at a pinned commit for supply-chain integrity and expose it via PYTHONPATH.
+RUN git clone https://github.com/FunAudioLLM/CosyVoice.git /app/cosyvoice \
+    && git -C /app/cosyvoice checkout 074ca6dc9e80a2f424f1f74b48bdd7d3fea531cc \
+    && git -C /app/cosyvoice submodule update --init --recursive
+
+ENV PYTHONPATH="/app/cosyvoice:${PYTHONPATH:-}"
+
+COPY src /app/src
diff --git a/deploy/examples/local_nvidia_amd/Chart.lock b/deploy/examples/local_nvidia_amd/Chart.lock
new file mode 100644
index 0000000..b0e0b95
--- /dev/null
+++ b/deploy/examples/local_nvidia_amd/Chart.lock
@@ -0,0 +1,6 @@
+dependencies:
+- name: kuberay-operator
+  repository: https://ray-project.github.io/kuberay-helm/
+  version: 1.6.0
+digest: sha256:b9057481d9a5e2d8b8798488b0b321bbd3f6e43dcb5a9dea18b181641a63b400
+generated: "2026-05-22T17:28:37.5934885+02:00"
diff --git a/deploy/examples/local_nvidia_amd/Chart.yaml b/deploy/examples/local_nvidia_amd/Chart.yaml
new file mode 100644
index 0000000..c51091a
--- /dev/null
+++ b/deploy/examples/local_nvidia_amd/Chart.yaml
@@ -0,0 +1,20 @@
+apiVersion: v2
+name: huri
+description: HuRI service powered by Ray Serve on KubeRay
+type: application
+version: 0.1.0
+appVersion: "2.52.0"
+keywords:
+  - ray
+  - kuberay
+  - ray-serve
+  - robotics
+  - hri
+maintainers:
+  - name: Sentience Robotics
+
+dependencies:
+  - name: kuberay-operator
+    version: "1.6.0"
+    repository: "https://ray-project.github.io/kuberay-helm/"
+    condition: kuberay.install
diff --git a/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml b/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml
new file mode 100644
index 0000000..0d84298
--- /dev/null
+++ b/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml
@@ -0,0 +1,86 @@
+{{- if .Values.models.cosytts.enabled }}
+{{- $model := .Values.models.cosytts }}
+{{- $pvcName := printf "%s-cosytts-models" (include "huri.fullname" .) }}
+{{- if not (lookup "v1" "PersistentVolumeClaim" .Release.Namespace $pvcName) }}
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: {{ $pvcName }}
+  labels:
+    {{- include "huri.labels" . | nindent 4 }}
+  annotations:
+    "helm.sh/hook": pre-install,pre-upgrade
+    "helm.sh/hook-weight": "-10"
+    "helm.sh/resource-policy": keep
+spec:
+  accessModes:
+    {{- toYaml $model.pvc.accessModes | nindent 4 }}
+  resources:
+    requests:
+      storage: {{ $model.pvc.size }}
+  {{- if $model.pvc.storageClassName }}
+  storageClassName: {{ $model.pvc.storageClassName }}
+  {{- end }}
+{{- end }}
+---
+# Runs only on first install (not on upgrade) — models are already on the PVC.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: {{ include "huri.fullname" . }}-cosytts-init
+  labels:
+    {{- include "huri.labels" . | nindent 4 }}
+  annotations:
+    "helm.sh/hook": pre-install,pre-upgrade
+    "helm.sh/hook-weight": "-5"
+    "helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation
+spec:
+  backoffLimit: 3
+  template:
+    metadata:
+      labels:
+        {{- include "huri.selectorLabels" . | nindent 8 }}
+    spec:
+      restartPolicy: OnFailure
+      {{- with $model.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      volumes:
+        - name: models
+          persistentVolumeClaim:
+            claimName: {{ include "huri.fullname" . }}-cosytts-models
+      containers:
+        - name: cosytts-downloader
+          image: python:3.11-slim
+          command: ["/bin/sh", "-c"]
+          args:
+            - |
+              set -e
+              MODEL_DIR="{{ $model.mountPath }}/{{ $model.modelSource.modelId }}"
+              if [ -f "$MODEL_DIR/cosyvoice2.yaml" ]; then
+                echo "Model already present at $MODEL_DIR — skipping download."
+                exit 0
+              fi
+              echo "Downloading {{ $model.modelSource.modelId }} into $MODEL_DIR …"
+              pip install --quiet modelscope
+              python - <<'PYEOF'
+              from modelscope import snapshot_download
+              snapshot_download(
+                  "{{ $model.modelSource.modelId }}",
+                  local_dir="{{ $model.mountPath }}/{{ $model.modelSource.modelId }}",
+              )
+              PYEOF
+              echo "Download complete."
+          volumeMounts:
+            - name: models
+              mountPath: {{ $model.mountPath }}
+          resources:
+            requests:
+              cpu: "500m"
+              memory: "512Mi"
+            limits:
+              cpu: "2"
+              memory: "2Gi"
+{{- end }}
diff --git a/deploy/examples/local_nvidia_amd/templates/emage-model-init-job.yaml b/deploy/examples/local_nvidia_amd/templates/emage-model-init-job.yaml
new file mode 100644
index 0000000..c4c4a08
--- /dev/null
+++ b/deploy/examples/local_nvidia_amd/templates/emage-model-init-job.yaml
@@ -0,0 +1,86 @@
+{{- if .Values.models.emage.enabled }}
+{{- $model := .Values.models.emage }}
+{{- $pvcName := printf "%s-emage-models" (include "huri.fullname" .) }}
+{{- if not (lookup "v1" "PersistentVolumeClaim" .Release.Namespace $pvcName) }}
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: {{ $pvcName }}
+  labels:
+    {{- include "huri.labels" . | nindent 4 }}
+  annotations:
+    "helm.sh/hook": pre-install,pre-upgrade
+    "helm.sh/hook-weight": "-10"
+    "helm.sh/resource-policy": keep
+spec:
+  accessModes:
+    {{- toYaml $model.pvc.accessModes | nindent 4 }}
+  resources:
+    requests:
+      storage: {{ $model.pvc.size }}
+  {{- if $model.pvc.storageClassName }}
+  storageClassName: {{ $model.pvc.storageClassName }}
+  {{- end }}
+{{- end }}
+---
+# Runs only on first install (not on upgrade) — models are already on the PVC.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: {{ include "huri.fullname" . }}-emage-init
+  labels:
+    {{- include "huri.labels" . | nindent 4 }}
+  annotations:
+    "helm.sh/hook": pre-install,pre-upgrade
+    "helm.sh/hook-weight": "-5"
+    "helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation
+spec:
+  backoffLimit: 3
+  template:
+    metadata:
+      labels:
+        {{- include "huri.selectorLabels" . | nindent 8 }}
+    spec:
+      restartPolicy: OnFailure
+      {{- with $model.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      volumes:
+        - name: models
+          persistentVolumeClaim:
+            claimName: {{ include "huri.fullname" . }}-emage-models
+      containers:
+        - name: emage-downloader
+          image: python:3.11-slim
+          command: ["/bin/sh", "-c"]
+          args:
+            - |
+              set -e
+              MODEL_DIR="{{ $model.mountPath }}/{{ $model.modelSource.repoId }}"
+              if [ -d "$MODEL_DIR" ] && [ "$(ls -A $MODEL_DIR 2>/dev/null)" ]; then
+                echo "Model already present at $MODEL_DIR — skipping download."
+                exit 0
+              fi
+              echo "Downloading {{ $model.modelSource.repoId }} into $MODEL_DIR …"
+              pip install --quiet huggingface_hub
+              python - <<'PYEOF'
+              from huggingface_hub import snapshot_download
+              snapshot_download(
+                  "{{ $model.modelSource.repoId }}",
+                  local_dir="{{ $model.mountPath }}/{{ $model.modelSource.repoId }}",
+              )
+              PYEOF
+              echo "Download complete."
+          volumeMounts:
+            - name: models
+              mountPath: {{ $model.mountPath }}
+          resources:
+            requests:
+              cpu: "500m"
+              memory: "512Mi"
+            limits:
+              cpu: "2"
+              memory: "2Gi"
+{{- end }}
diff --git a/deploy/examples/local_nvidia_amd/templates/ingress-dashboard.yaml b/deploy/examples/local_nvidia_amd/templates/ingress-dashboard.yaml
new file mode 100644
index 0000000..c8153df
--- /dev/null
+++ b/deploy/examples/local_nvidia_amd/templates/ingress-dashboard.yaml
@@ -0,0 +1,32 @@
+{{- if .Values.dashboard.ingress.enabled }}
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: {{ include "huri.fullname" . }}-dashboard
+  labels:
+    {{- include "huri.labels" . | nindent 4 }}
+  {{- with .Values.dashboard.ingress.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  {{- if .Values.dashboard.ingress.className }}
+  ingressClassName: {{ .Values.dashboard.ingress.className }}
+  {{- end }}
+  {{- with .Values.dashboard.ingress.tls }}
+  tls:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+  rules:
+    - host: {{ .Values.dashboard.ingress.host | quote }}
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                # KubeRay creates <rayservice-name>-head-svc for the head node.
+                name: {{ include "huri.headSvcName" . }}
+                port:
+                  number: 8265
+{{- end }}
diff --git a/deploy/examples/local_nvidia_amd/templates/ingress.yaml b/deploy/examples/local_nvidia_amd/templates/ingress.yaml
new file mode 100644
index 0000000..85f94de
--- /dev/null
+++ b/deploy/examples/local_nvidia_amd/templates/ingress.yaml
@@ -0,0 +1,32 @@
+{{- if .Values.ingress.enabled }}
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: {{ include "huri.fullname" . }}
+  labels:
+    {{- include "huri.labels" . | nindent 4 }}
+  {{- with .Values.ingress.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  {{- if .Values.ingress.className }}
+  ingressClassName: {{ .Values.ingress.className }}
+  {{- end }}
+  {{- with .Values.ingress.tls }}
+  tls:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+  rules:
+    - host: {{ .Values.ingress.host | quote }}
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                # KubeRay creates <rayservice-name>-serve-svc for the Serve endpoint.
+                name: {{ include "huri.serveSvcName" . }}
+                port:
+                  number: 8000
+{{- end }}
diff --git a/deploy/examples/local_nvidia_amd/templates/rayservice.yaml b/deploy/examples/local_nvidia_amd/templates/rayservice.yaml
new file mode 100644
index 0000000..31d7f00
--- /dev/null
+++ b/deploy/examples/local_nvidia_amd/templates/rayservice.yaml
@@ -0,0 +1,169 @@
+apiVersion: ray.io/v1
+kind: RayService
+metadata:
+  name: {{ include "huri.fullname" . }}
+  labels:
+    {{- include "huri.labels" . | nindent 4 }}
+  annotations:
+    ray.io/initializing-timeout: "10m"
+spec:
+  serveConfigV2: |
+{{ .Values.ray.serveConfig | indent 4 }}
+  rayClusterConfig:
+    rayVersion: {{ .Values.ray.version | quote }}
+
+    headGroupSpec:
+      serviceType: {{ .Values.head.serviceType }}
+      rayStartParams:
+        {{- toYaml .Values.head.rayStartParams | nindent 8 }}
+
+      template:
+        metadata:
+          labels:
+            {{- include "huri.selectorLabels" . | nindent 12 }}
+        spec:
+          {{- with .Values.head.nodeSelector }}
+          nodeSelector:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- with .Values.head.affinity }}
+          affinity:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          {{- with .Values.head.tolerations }}
+          tolerations:
+            {{- toYaml . | nindent 12 }}
+          {{- end }}
+          containers:
+            - name: ray-head
+              image: {{ .Values.image.repository }}:{{ .Values.image.tag }}
+              imagePullPolicy: {{ .Values.image.pullPolicy }}
+              ports:
+                - containerPort: 6379
+                  name: gcs-server
+                - containerPort: 8265
+                  name: dashboard
+                - containerPort: 10001
+                  name: client
+                - containerPort: 8000
+                  name: serve
+              resources:
+                {{- toYaml .Values.head.resources | nindent 16 }}
+
+    workerGroupSpecs:
+      {{- range .Values.workerGroups }}
+      {{- $group := . }}
+      - replicas: {{ .replicas }}
+        minReplicas: {{ .minReplicas }}
+        maxReplicas: {{ .maxReplicas }}
+        groupName: {{ .groupName | quote }}
+        rayStartParams:
+          {{- toYaml .rayStartParams | nindent 10 }}
+          {{- if .customResources }}
+          resources: {{ .customResources | squote }}
+          {{- end }}
+
+        template:
+          metadata:
+            labels:
+              {{- include "huri.selectorLabels" $ | nindent 14 }}
+          spec:
+            {{- if .hostIPC }}
+            hostIPC: {{ .hostIPC }}
+            {{- end }}
+            {{- if .runtimeClassName }}
+            # Tells containerd to invoke the nvidia/amd runtime, which mounts
+            # the GPU devices into the pod. Required on WSL2 + k3s.
+            runtimeClassName: {{ .runtimeClassName }}
+            {{- end }}
+            {{- if .podSecurityContext }}
+            # Pod-level security: supplementalGroups, fsGroup, etc.
+            securityContext:
+              {{- toYaml .podSecurityContext | nindent 14 }}
+            {{- end }}
+            {{- with .nodeSelector }}
+            nodeSelector:
+              {{- toYaml . | nindent 14 }}
+            {{- end }}
+            {{- with .affinity }}
+            affinity:
+              {{- toYaml . | nindent 14 }}
+            {{- end }}
+            {{- with .tolerations }}
+            tolerations:
+              {{- toYaml . | nindent 14 }}
+            {{- end }}
+            volumes:
+              - name: dshm
+                emptyDir:
+                  medium: Memory
+                  sizeLimit: {{ .shmSize | default "1Gi" }}
+              {{- range .mountedModels }}
+              {{- $model := index $.Values.models . }}
+              {{- if $model.enabled }}
+              - name: model-{{ . }}
+                persistentVolumeClaim:
+                  claimName: {{ include "huri.fullname" $ }}-{{ . }}-models
+              {{- end }}
+              {{- end }}
+              {{- if and $.Values.voiceAssets.enabled .mountVoiceAssets }}
+              - name: voice-assets
+                persistentVolumeClaim:
+                  claimName: {{ include "huri.fullname" $ }}-voice-assets
+              {{- end }}
+            containers:
+              - name: ray-worker
+                {{- if .image }}
+                image: {{ .image }}
+                {{- else }}
+                image: {{ $.Values.image.repository }}:{{ $.Values.image.tag }}
+                {{- end }}
+                imagePullPolicy: {{ $.Values.image.pullPolicy }}
+                env:
+                  {{- $hasEnv := false }}
+                  {{- if .containerEnv }}
+                  {{- $hasEnv = true }}
+                  {{- toYaml .containerEnv | nindent 18 }}
+                  {{- end }}
+                  {{- range .mountedModels }}
+                  {{- $model := index $.Values.models . }}
+                  {{- if and $model.enabled $model.env }}
+                  {{- $hasEnv = true }}
+                  {{- range $envKey, $envVal := $model.env }}
+                  - name: {{ $envKey }}
+                    value: {{ $envVal | quote }}
+                  {{- end }}
+                  {{- end }}
+                  {{- end }}
+                  {{- if and $.Values.voiceAssets.enabled $group.mountVoiceAssets }}
+                  {{- range $envKey, $envVal := $.Values.voiceAssets.env }}
+                  {{- $hasEnv = true }}
+                  - name: {{ $envKey }}
+                    value: {{ $envVal | quote }}
+                  {{- end }}
+                  {{- end }}
+                  {{- if not $hasEnv }}
+                  []
+                  {{- end }}
+                {{- if .securityContext }}
+                # Container-level security: seLinuxOptions, capabilities, etc.
+                securityContext:
+                  {{- toYaml .securityContext | nindent 18 }}
+                {{- end }}
+                resources:
+                  {{- toYaml .resources | nindent 18 }}
+                volumeMounts:
+                  - name: dshm
+                    mountPath: /dev/shm
+                  {{- range .mountedModels }}
+                  {{- $model := index $.Values.models . }}
+                  {{- if $model.enabled }}
+                  - name: model-{{ . }}
+                    mountPath: {{ $model.mountPath }}
+                  {{- end }}
+                  {{- end }}
+                  {{- if and $.Values.voiceAssets.enabled .mountVoiceAssets }}
+                  - name: voice-assets
+                    mountPath: {{ $.Values.voiceAssets.mountPath }}
+                  {{- end }}
+      {{- end }}
diff --git a/deploy/examples/local_nvidia_amd/templates/voice-assets-pvc.yaml b/deploy/examples/local_nvidia_amd/templates/voice-assets-pvc.yaml
new file mode 100644
index 0000000..6456e8e
--- /dev/null
+++ b/deploy/examples/local_nvidia_amd/templates/voice-assets-pvc.yaml
@@ -0,0 +1,28 @@
+{{- if .Values.voiceAssets.enabled }}
+{{- $pvcName := printf "%s-voice-assets" (include "huri.fullname" .) }}
+{{- if not (lookup "v1" "PersistentVolumeClaim" .Release.Namespace $pvcName) }}
+---
+# PVC for the voice sample used by the TTS module (HURI_VOICE_SAMPLE_PATH).
+# Populate after first install with:
+#   kubectl cp voice.wav <pod>:{{ .Values.voiceAssets.mountPath }}/voice.wav
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: {{ $pvcName }}
+  labels:
+    {{- include "huri.labels" . | nindent 4 }}
+  annotations:
+    "helm.sh/hook": pre-install,pre-upgrade
+    "helm.sh/hook-weight": "-10"
+    "helm.sh/resource-policy": keep
+spec:
+  accessModes:
+    {{- toYaml .Values.voiceAssets.pvc.accessModes | nindent 4 }}
+  resources:
+    requests:
+      storage: {{ .Values.voiceAssets.pvc.size }}
+  {{- if .Values.voiceAssets.pvc.storageClassName }}
+  storageClassName: {{ .Values.voiceAssets.pvc.storageClassName }}
+  {{- end }}
+{{- end }}
+{{- end }}
diff --git a/deploy/examples/local_nvidia_amd/values.yaml b/deploy/examples/local_nvidia_amd/values.yaml
new file mode 100644
index 0000000..3e90893
--- /dev/null
+++ b/deploy/examples/local_nvidia_amd/values.yaml
@@ -0,0 +1,281 @@
+nameOverride: ""
+fullnameOverride: ""
+
+image:
+  repository: docker.pommier.dev/huri
+  tag: base-2.55.1
+  pullPolicy: Always
+
+ray:
+  version: "2.55.1"
+  # Inline Ray Serve config (equivalent to config/huri.yaml).
+  # Change import_path or add deployments here without rebuilding the image.
+  #
+  # To pin a deployment to a specific GPU vendor, set resources in
+  # ray_actor_options matching the resources string in the desired workerGroup's
+  # rayStartParams:
+  #
+  #   ray_actor_options:
+  #     num_gpus: 1
+  #     resources: {"GPU_TYPE_NVIDIA": 1}   # → runs only on gpu-nvidia workers
+  #     resources: {"GPU_TYPE_AMD": 1}      # → runs only on gpu-amd workers
+  serveConfig: |
+    proxy_location: EveryNode
+    http_options:
+      host: 0.0.0.0
+      port: 8000
+    applications:
+      - name: huri-app
+        route_prefix: /
+        import_path: src.app:app
+        runtime_env:
+          env_vars:
+            RAY_COLOR_PREFIX: "1"
+        deployments:
+          - name: HuRI
+            ray_actor_options:
+              num_cpus: 1
+              num_gpus: 0
+          - name: TTS
+            ray_actor_options:
+              num_cpus: 1
+              num_gpus: 0.5
+              resources: {"GPU_TYPE_NVIDIA": 0.5}
+          - name: GestureGeneration
+            ray_actor_options:
+              num_cpus: 1
+              num_gpus: 0.5
+              resources: {"GPU_TYPE_NVIDIA": 0.5}
+
+head:
+  # ClusterIP is preferred on real clusters; use NodePort for kind/minikube/k3s.
+  serviceType: ClusterIP
+  # Pin the head to the control-plane node so it does not consume GPU memory.
+  # Set to {} to let the scheduler decide.
+  nodeSelector:
+    node-role.kubernetes.io/control-plane: "true"
+  # affinity: {}  # optional Kubernetes affinity rules (nodeAffinity, podAffinity…)
+  tolerations:
+    # Required to schedule on the control-plane node (tainted by default in k3s).
+    - key: node-role.kubernetes.io/control-plane
+      operator: Exists
+      effect: NoSchedule
+  rayStartParams:
+    num-cpus: "2"
+    num-gpus: "0"
+  resources:
+    limits:
+      cpu: "2"
+      memory: "8Gi"
+    requests:
+      cpu: "2"
+      memory: "4Gi"
+
+# Worker groups: one logical "gpu" group per vendor and one "cpu" group.
+#
+# Node placement (Kubernetes level)
+# ──────────────────────────────────
+#   nodeSelector / affinity / tolerations pin the *pod* to specific physical nodes.
+#
+# Ray deployment routing (Ray level)
+# ────────────────────────────────────
+#   rayStartParams.resources advertises named custom resources to the Ray
+#   scheduler. Ray Serve deployments request them via ray_actor_options.resources
+#   to run exclusively on a given worker group regardless of GPU vendor.
+#
+#   Example — to route a deployment to NVIDIA workers only:
+#     workerGroups[gpu-nvidia].customResources: '{"GPU_TYPE_NVIDIA":1}'
+#     serveConfig deployment ray_actor_options.resources: {"GPU_TYPE_NVIDIA": 1}
+#
+# Model volumes
+# ──────────────
+#   mountedModels lists keys from .Values.models. For each enabled model the
+#   template adds a PVC volume + volumeMount + env vars to the worker pod.
+workerGroups:
+  # --- GPU workers (Nvidia) ---
+  - groupName: gpu-nvidia
+    image: docker.pommier.dev/huri:nvidia-2.55.1
+    replicas: 1
+    minReplicas: 1
+    maxReplicas: 1
+    mountVoiceAssets: false
+    nodeSelector:
+      gpu: nvidia
+    # affinity: {}  # optional
+    tolerations: []
+    runtimeClassName: nvidia
+    customResources: '{\"GPU_TYPE_NVIDIA\":1}'
+    rayStartParams:
+      num-gpus: "1"
+    containerEnv:
+      - name: NVIDIA_VISIBLE_DEVICES
+        value: "all"
+      - name: NVIDIA_DRIVER_CAPABILITIES
+        value: "compute,utility"
+    # Models whose PVC will be mounted in this worker group.
+    # Keys must match entries under .Values.models.
+    mountedModels:
+      - cosytts
+      - emage
+    resources:
+      limits:
+        cpu: "4"
+        memory: "8Gi"
+        nvidia.com/gpu: "1"
+      requests:
+        cpu: "2"
+        memory: "4Gi"
+        nvidia.com/gpu: "1"
+    shmSize: 2Gi
+
+  # --- GPU workers (AMD) ---
+  - groupName: gpu-amd
+    image: docker.pommier.dev/huri:amd-2.55.1
+    replicas: 1
+    minReplicas: 1
+    maxReplicas: 1
+    mountVoiceAssets: true
+    nodeSelector:
+      gpu: amd
+    # affinity: {}  # optional
+    tolerations: []
+    hostIPC: true
+    customResources: '{\"GPU_TYPE_AMD\":1}'
+    podSecurityContext:
+      supplementalGroups: [39, 107]
+    rayStartParams:
+      num-gpus: "1"
+    containerEnv:
+      - name: HSA_OVERRIDE_GFX_VERSION
+        value: "11.5.1"
+    securityContext:
+      seLinuxOptions:
+        type: "spc_t"
+      # privileged: true  # Uncomment if spc_t still gets blocked by Fedora
+    mountedModels: []
+    resources:
+      limits:
+        cpu: "4"
+        memory: "8Gi"
+        amd.com/gpu: "1"
+      requests:
+        cpu: "2"
+        memory: "4Gi"
+        amd.com/gpu: "1"
+
+  # --- CPU-only workers ---
+  # Handles tasks that do not need a GPU (pre/post-processing, routing, etc.).
+  # Set replicas: 0 to disable this group entirely.
+  - groupName: cpu-workers
+    replicas: 0
+    minReplicas: 0
+    maxReplicas: 0
+    nodeSelector: {}
+    tolerations: []
+    rayStartParams:
+      num-cpus: "2"
+    mountedModels: []
+    resources:
+      limits:
+        cpu: "2"
+        memory: "4Gi"
+      requests:
+        cpu: "1"
+        memory: "2Gi"
+    shmSize: 256Mi
+
+# AI model definitions.
+# Each key corresponds to a model that can be mounted into worker groups via
+# mountedModels. The chart creates one PVC per enabled model and a pre-install
+# Job that downloads the weights on first deploy (idempotent).
+models:
+  cosytts:
+    enabled: true
+    nodeSelector:
+      gpu: nvidia
+    pvc:
+      # Leave storageClassName empty to use the cluster default StorageClass.
+      storageClassName: ""
+      size: 20Gi
+      # Use ReadWriteMany if the PVC must be shared across multiple worker pods
+      # (requires a CSI driver that supports RWX, e.g. NFS or Longhorn).
+      # Use ReadWriteOnce for single-node clusters.
+      accessModes:
+        - ReadWriteOnce
+    # Where the model weights PVC is mounted inside the worker container.
+    mountPath: /models/cosytts
+    modelSource:
+      # type: modelscope | huggingface  (only modelscope is implemented)
+      type: modelscope
+      # ModelScope model ID — snapshot_download uses this as the sub-path
+      # inside mountPath, so the final path is mountPath/modelId.
+      modelId: iic/CosyVoice2-0.5B
+    # Env vars injected into every worker that mounts this model.
+    # HURI_MODEL_PATH must match mountPath/modelId (see text_to_speech.py).
+    env:
+      HURI_MODEL_PATH: /models/cosytts/iic/CosyVoice2-0.5B
+
+  emage:
+    enabled: true
+    nodeSelector:
+      gpu: nvidia
+    pvc:
+      storageClassName: ""
+      size: 10Gi
+      accessModes:
+        - ReadWriteOnce
+    # Where the EMAGE weights PVC is mounted inside the worker container.
+    mountPath: /models/emage
+    modelSource:
+      # type: huggingface — snapshot_download uses repoId as the sub-path
+      # inside mountPath, so the final path is mountPath/repoId.
+      type: huggingface
+      repoId: H-Liu1997/emage_audio
+    # HURI_EMAGE_REPO must match mountPath/repoId (see gesture.py).
+    env:
+      HURI_EMAGE_REPO: /models/emage/H-Liu1997/emage_audio
+
+# Voice sample asset volume.
+# Populate the PVC after first install:
+#   kubectl cp voice.wav <pod>:/assets/voice.wav
+voiceAssets:
+  enabled: true
+  pvc:
+    storageClassName: ""
+    size: 100Mi
+    accessModes:
+      - ReadWriteOnce
+  mountPath: /assets
+  env:
+    HURI_VOICE_SAMPLE_PATH: /assets/voice.wav
+    HURI_VOICE_TRANSCRIPT: "Hello, this is my voice sample for cloning."
+
+# Ingress for the Ray Serve endpoint (port 8000).
+ingress:
+  enabled: false
+  className: nginx
+  annotations: {}
+    # nginx.ingress.kubernetes.io/proxy-read-timeout: "3600"
+    # nginx.ingress.kubernetes.io/proxy-send-timeout: "3600"
+    # nginx.ingress.kubernetes.io/proxy-buffering: "off"
+  host: huri.example.com
+  tls: []
+  # - secretName: huri-tls
+  #   hosts:
+  #     - huri.example.com
+
+# Ingress for the Ray Dashboard (port 8265). Disabled by default – not safe to
+# expose publicly without additional auth.
+dashboard:
+  ingress:
+    enabled: false
+    className: nginx
+    annotations: {}
+    host: huri-dashboard.example.com
+    tls: []
+
+# Set to true to let this chart manage the KubeRay operator as a sub-chart.
+# Set to false when the operator is already installed cluster-wide (typical for
+# shared clusters).
+kuberay:
+  install: false
diff --git a/requirements-nvidia.txt b/requirements-nvidia.txt
new file mode 100644
index 0000000..a1a78db
--- /dev/null
+++ b/requirements-nvidia.txt
@@ -0,0 +1,37 @@
+# --- Shared / pinned by CosyVoice (keep these versions) ---
+torch==2.3.1
+torchaudio==2.3.1
+numpy==1.26.4
+transformers==4.51.3
+diffusers==0.29.0
+omegaconf==2.3.0
+librosa==0.10.2
+soundfile==0.12.1
+hydra-core==1.3.2          # only because HyperPyYAML configs may resolve hydra refs; safe to keep
+HyperPyYAML==1.2.3
+modelscope==1.20.0
+onnx==1.16.0
+onnxruntime-gpu==1.18.0    # Linux; campplus + speech_tokenizer
+openai-whisper==20250625   # frontend.py: import whisper
+inflect==7.3.1
+wetext==0.0.4              # text normalization fallback (ttsfrd not installed)
+conformer==0.3.2
+x-transformers==2.11.24
+einops==0.8.2
+tiktoken==0.13.0           # cosyvoice/tokenizer
+pyarrow==18.1.0            # imported by cli paths via dataset utils? actually only dataset/processor — can drop
+protobuf==4.25
+pydantic==2.7.0            # transitive (transformers/fastapi), but pinning avoids drift
+regex==2025.11.3
+tqdm==4.67.3
+
+# --- EMAGE extras ---
+huggingface_hub==0.36.2    # from_pretrained
+smplx==0.1.28
+pyrender==0.1.45           # fast_render top-level import
+trimesh==4.12.2
+imageio==2.33.0
+# Visualization-only (skip if --visualization off):
+# opencv-python==4.8.1.78
+# pytorch3d                # has to be built from source for torch 2.3 / py3.12
+# torchvision

From 5e9eb3b47622a1d50cbf9f1d26bdd97dafe409f8 Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Wed, 27 May 2026 01:52:57 +0200
Subject: [PATCH 04/31] Merge branches

---
 .gitignore                                   |  3 +++
 serve_requirements.txt                       |  5 +++++
 src/modules/modules.py                       | 23 +++++++++++++++++++-
 src/modules/speech_to_text/speech_to_text.py |  8 ++++++-
 4 files changed, 37 insertions(+), 2 deletions(-)
 create mode 100644 serve_requirements.txt

diff --git a/.gitignore b/.gitignore
index 7892d64..990d633 100644
--- a/.gitignore
+++ b/.gitignore
@@ -176,6 +176,9 @@ cython_debug/
 # PyPI configuration file
 .pypirc
 
+# Helm
+**/charts/*.tgz
+
 # Others
 .trash
 docs
\ No newline at end of file
diff --git a/serve_requirements.txt b/serve_requirements.txt
new file mode 100644
index 0000000..242576e
--- /dev/null
+++ b/serve_requirements.txt
@@ -0,0 +1,5 @@
+# server
+numpy
+click<8.2
+webrtcvad
+faster-whisper
diff --git a/src/modules/modules.py b/src/modules/modules.py
index 8fbc53c..04a020c 100644
--- a/src/modules/modules.py
+++ b/src/modules/modules.py
@@ -1,3 +1,4 @@
+import logging
 from typing import Dict, Type
 
 from src.modules.rag.rag import RAG
@@ -7,6 +8,26 @@
 
 from .factory import Module
 
+_LOG = logging.getLogger(__name__)
+
 
 def get_modules() -> Dict[str, Type[Module]]:
-    return {"mic": MIC, "stt": STT, "tag": TAG, "rag": RAG}
+    modules: Dict[str, Type[Module]] = {"mic": MIC, "stt": STT, "tag": TAG, "rag": RAG}
+
+    # The following imports may contain modules with custom dependencies, depending on the Dockerfile
+    # CPU doesn't need some dependencies, nor the AMD that isn't compatible
+    try:
+        from src.modules.text_to_speech.text_to_speech import TTS
+    except Exception as exc:  # noqa: BLE001
+        _LOG.info("Skipping TTS module: %s", exc)
+    else:
+        modules["tts"] = TTS
+
+    try:
+        from src.modules.gesture.gesture import Gesture
+    except Exception as exc:  # noqa: BLE001
+        _LOG.info("Skipping Gesture module: %s", exc)
+    else:
+        modules["gesture"] = Gesture
+
+    return modules
diff --git a/src/modules/speech_to_text/speech_to_text.py b/src/modules/speech_to_text/speech_to_text.py
index 1300dd3..04afa00 100644
--- a/src/modules/speech_to_text/speech_to_text.py
+++ b/src/modules/speech_to_text/speech_to_text.py
@@ -34,6 +34,8 @@ def __init__(
         self,
         model: str = "base",
         language: str = "en",
+        device: str = "auto",
+        compute_type: str = "auto",
         sample_rate: int = 16000,
         block_duration: float = 0.020,  # s
         transcribe_window: float = 2.0,  # s
@@ -41,7 +43,11 @@ def __init__(
     ):
         super().__init__()
 
-        self.model_faster = WhisperModel(model)
+        self.model_faster = WhisperModel(
+            model,
+            device=device,
+            compute_type=compute_type,
+        )
         self.language = language
 
         self.sample_rate = sample_rate

From a237d26a26f20cb2084a6ff1ec2939495902a075 Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Wed, 27 May 2026 06:31:53 +0200
Subject: [PATCH 05/31] fixed(tts): module deplyment and init

---
 deploy/Dockerfile.nvidia                      |  3 +
 .../templates/cosytts-model-init-job.yaml     | 22 ++++++
 requirements-nvidia.txt                       |  7 ++
 src/modules/gesture/gesture.py                |  8 +-
 src/modules/speech_to_text/speech_to_text.py  | 77 +++++++++++--------
 src/modules/text_to_speech/events.py          | 18 +++++
 src/modules/text_to_speech/text_to_speech.py  | 37 ++++-----
 7 files changed, 117 insertions(+), 55 deletions(-)
 create mode 100644 src/modules/text_to_speech/events.py

diff --git a/deploy/Dockerfile.nvidia b/deploy/Dockerfile.nvidia
index f9e704f..1cb0c8d 100644
--- a/deploy/Dockerfile.nvidia
+++ b/deploy/Dockerfile.nvidia
@@ -23,6 +23,9 @@ RUN git clone https://github.com/FunAudioLLM/CosyVoice.git /app/cosyvoice \
     && git -C /app/cosyvoice checkout 074ca6dc9e80a2f424f1f74b48bdd7d3fea531cc \
     && git -C /app/cosyvoice submodule update --init --recursive
 
+
+RUN pip install --no-cache-dir lightning==2.2.4 gdown==5.1.0 matplotlib==3.7.5 wget==3.2 pyworld==0.3.4
+
 ENV PYTHONPATH="/app/cosyvoice:${PYTHONPATH:-}"
 
 COPY src /app/src
diff --git a/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml b/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml
index 0d84298..0f62dd1 100644
--- a/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml
+++ b/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml
@@ -59,10 +59,26 @@ spec:
             - |
               set -e
               MODEL_DIR="{{ $model.mountPath }}/{{ $model.modelSource.modelId }}"
+              BLANK_EN_DIR="$MODEL_DIR/CosyVoice-BlankEN"
+              HAS_MAIN_CONFIG="no"
+              HAS_QWEN_WEIGHTS="no"
               if [ -f "$MODEL_DIR/cosyvoice2.yaml" ]; then
+                HAS_MAIN_CONFIG="yes"
+              fi
+              if [ -f "$BLANK_EN_DIR/model.safetensors" ] || \
+                 [ -f "$BLANK_EN_DIR/pytorch_model.bin" ] || \
+                 [ -f "$BLANK_EN_DIR/model.safetensors.index.json" ] || \
+                 [ -f "$BLANK_EN_DIR/pytorch_model.bin.index.json" ]; then
+                HAS_QWEN_WEIGHTS="yes"
+              fi
+              if [ "$HAS_MAIN_CONFIG" = "yes" ] && [ "$HAS_QWEN_WEIGHTS" = "yes" ]; then
                 echo "Model already present at $MODEL_DIR — skipping download."
                 exit 0
               fi
+              if [ "$HAS_QWEN_WEIGHTS" = "no" ] && [ -d "$BLANK_EN_DIR" ]; then
+                echo "Partial Qwen weights detected; clearing $BLANK_EN_DIR before re-download."
+                rm -rf "$BLANK_EN_DIR"
+              fi
               echo "Downloading {{ $model.modelSource.modelId }} into $MODEL_DIR …"
               pip install --quiet modelscope
               python - <<'PYEOF'
@@ -71,6 +87,12 @@ spec:
                   "{{ $model.modelSource.modelId }}",
                   local_dir="{{ $model.mountPath }}/{{ $model.modelSource.modelId }}",
               )
+              # CosyVoice2 loads this sub-model at runtime; pre-download it so
+              # the worker pod does not need outbound internet access.
+              snapshot_download(
+                  "iic/CosyVoice-BlankEN",
+                  local_dir="{{ $model.mountPath }}/{{ $model.modelSource.modelId }}/CosyVoice-BlankEN",
+              )
               PYEOF
               echo "Download complete."
           volumeMounts:
diff --git a/requirements-nvidia.txt b/requirements-nvidia.txt
index a1a78db..441b738 100644
--- a/requirements-nvidia.txt
+++ b/requirements-nvidia.txt
@@ -25,6 +25,13 @@ pydantic==2.7.0            # transitive (transformers/fastapi), but pinning avoi
 regex==2025.11.3
 tqdm==4.67.3
 
+# --- RAG / LLM extras ---
+httpx==0.27.2
+qdrant-client==1.12.1
+sentence-transformers==3.2.1
+pypdf==5.1.0
+semantic_chunker==0.2.0
+
 # --- EMAGE extras ---
 huggingface_hub==0.36.2    # from_pretrained
 smplx==0.1.28
diff --git a/src/modules/gesture/gesture.py b/src/modules/gesture/gesture.py
index 4e2a380..427806b 100644
--- a/src/modules/gesture/gesture.py
+++ b/src/modules/gesture/gesture.py
@@ -8,7 +8,7 @@
 from ray.serve import handle
 
 from src.core.module import Module, ModuleWithHandle
-from src.modules.text_to_speech.text_to_speech import Audio
+from src.modules.text_to_speech.events import Audio
 
 
 _HF_REPO = os.environ.get("HURI_EMAGE_REPO", "H-Liu1997/emage_audio")
@@ -115,9 +115,9 @@ class Gesture(ModuleWithHandle):
 
     def __init__(
         self,
-        handle: handle.DeploymentHandle,
+        _handle: handle.DeploymentHandle,
     ):
-        super().__init__(handle)
+        super().__init__(_handle)
         self._chunks: list[np.ndarray] = []
 
     async def process(self, audio: Audio) -> AsyncGenerator[Motion, None]:  # type: ignore[override]
@@ -138,5 +138,5 @@ async def process(self, audio: Audio) -> AsyncGenerator[Motion, None]:  # type:
         full_audio = np.concatenate(self._chunks)
         self._chunks = []
 
-        motion = await self.handle.infer.remote(full_audio)
+        motion = await self._handle.infer.remote(full_audio)
         yield motion
diff --git a/src/modules/speech_to_text/speech_to_text.py b/src/modules/speech_to_text/speech_to_text.py
index 04afa00..fa3b790 100644
--- a/src/modules/speech_to_text/speech_to_text.py
+++ b/src/modules/speech_to_text/speech_to_text.py
@@ -3,53 +3,80 @@
 
 import numpy as np
 from faster_whisper import WhisperModel
+from ray import serve
+from ray.serve import handle
 
-from src.core.module import Module
+from src.core.module import ModuleWithHandle
 
 from .events import Transcript, Voice
 
 
-class STT(Module):
+@serve.deployment(name="STT")
+class STTDeployment:
+    """Stateless Whisper inference actor.
+
+    Holds the faster-whisper model in a single Ray actor (pinned to the AMD
+    worker in deployment configs). Exposes a single transcribe() call so
+    per-session STT clients can offload the heavy work without owning a GPU.
+    """
+
+    def __init__(
+        self,
+        model: str = "base",
+        device: str = "auto",
+        compute_type: str = "auto",
+    ):
+        self.model_faster = WhisperModel(
+            model,
+            device=device,
+            compute_type=compute_type,
+        )
+        self.language = "en"
+
+    async def transcribe(self, audio: np.ndarray) -> str:
+        loop = asyncio.get_running_loop()
+        segments, _ = await loop.run_in_executor(
+            None,
+            lambda: self.model_faster.transcribe(
+                audio,
+                language=self.language,
+                beam_size=1,
+            ),
+        )
+        return " ".join(seg.text for seg in segments).strip()
+
+
+class STT(ModuleWithHandle):
     """STT Module
 
-    Transcribe voice using Faster_Whisper.
+    Per-session client: keeps the rolling window / silence state, offloads each
+    transcription window to the shared STTDeployment actor.
 
     input: voice,
     output: transcript
 
-    :model: size of the model to use (tiny, tiny.en, base, base.en, small,
-        small.en, distil-small.en, medium, medium.en, distil-medium.en,
-        large-v1, large-v2, large-v3, large, distil-large-v2, distil-large-v3,
-        large-v3-turbo, or turbo).
-    :language: language spoken in the audio. It should be a language code such
-        as "en" or "fr".
     :sample_rate: size of received voice audio. Usually 8000, 16000 or 48000.
     :block_duration: size of received voice audio (in s).
+    :transcribe_window: rolling window length (s) handed to Whisper.
+    :transcribe_step: stride (s) between successive windows.
     """
 
+    _handle_cls = STTDeployment
     input_type = "voice"
     output_type = "transcript"
 
     def __init__(
         self,
-        model: str = "base",
+        _handle: handle.DeploymentHandle,
         language: str = "en",
-        device: str = "auto",
-        compute_type: str = "auto",
         sample_rate: int = 16000,
         block_duration: float = 0.020,  # s
         transcribe_window: float = 2.0,  # s
         transcribe_step: float = 1.0,  # s
     ):
-        super().__init__()
+        super().__init__(_handle=_handle)
 
-        self.model_faster = WhisperModel(
-            model,
-            device=device,
-            compute_type=compute_type,
-        )
         self.language = language
-
         self.sample_rate = sample_rate
         self.window_size: int = int(transcribe_window / block_duration)
         self.step_size: int = int(transcribe_step / block_duration)
@@ -58,9 +85,6 @@ def __init__(
 
         self.silence: bool = True
 
-        self.prev_text: str = ""
-        self.stable_text: str = ""
-
         self.running = False
         self.lock: asyncio.Lock = asyncio.Lock()
 
@@ -86,16 +110,9 @@ async def process(self, voice: Voice) -> Optional[Transcript]:
                 return None
             processing_chunks = self.buffer[: self.window_size]
 
-        self.pending_silence = False
         processing_audio = np.concatenate(processing_chunks, axis=0)
 
-        segments, _ = self.model_faster.transcribe(
-            processing_audio,
-            language=self.language,
-            beam_size=1,  # faster for realtime
-        )
-
-        current_text = " ".join([seg.text for seg in segments]).strip()
+        current_text: str = await self._handle.transcribe.remote(processing_audio)
 
         processed_size = self.window_size - self.step_size
         async with self.lock:
diff --git a/src/modules/text_to_speech/events.py b/src/modules/text_to_speech/events.py
new file mode 100644
index 0000000..9dd6fd4
--- /dev/null
+++ b/src/modules/text_to_speech/events.py
@@ -0,0 +1,18 @@
+from dataclasses import dataclass
+
+import numpy as np
+
+from src.core.events import EventData
+
+
+@dataclass
+class Token(EventData):
+    text: str
+    end: bool
+
+
+@dataclass
+class Audio(EventData):
+    data: np.ndarray
+    sample_rate: int
+    end: bool = False
diff --git a/src/modules/text_to_speech/text_to_speech.py b/src/modules/text_to_speech/text_to_speech.py
index 07c7af5..8c25eb8 100644
--- a/src/modules/text_to_speech/text_to_speech.py
+++ b/src/modules/text_to_speech/text_to_speech.py
@@ -1,14 +1,16 @@
 import asyncio
 import os
 import re
-from dataclasses import dataclass
-from typing import AsyncGenerator, Optional
+import sys
+from typing import AsyncGenerator
 
 import numpy as np
 from ray import serve
 from ray.serve import handle
 
-from src.core.module import Module, ModuleWithHandle
+from src.core.module import ModuleWithHandle
+
+from .events import Audio, Token
 
 
 # Defaults — overridden by env vars in production (see README.md)
@@ -26,19 +28,6 @@
 _DONE = object()  # sentinel for exhausted sync generator
 
 
-@dataclass
-class Token:
-    text: str
-    end: bool  # True on the last token of an LLM stream
-
-
-@dataclass
-class Audio:
-    data: np.ndarray  # float32, values in [-1.0, 1.0]
-    sample_rate: int
-    end: bool = False  # True on the last chunk of an utterance
-
-
 @serve.deployment(name="TTS")
 class TTSDeployment:
     def __init__(
@@ -47,6 +36,12 @@ def __init__(
         voice_sample_path: str = _VOICE_SAMPLE_PATH,
         voice_sample_transcript: str = _VOICE_SAMPLE_TRANSCRIPT,
     ):
+        cosy_dir = os.environ.get("HURI_COSY_DIR")
+        if cosy_dir:
+            matcha_path = os.path.join(cosy_dir, "third_party", "Matcha-TTS")
+            if os.path.isdir(matcha_path) and matcha_path not in sys.path:
+                sys.path.insert(0, matcha_path)
+
         from cosyvoice.cli.cosyvoice import CosyVoice2
         from cosyvoice.utils.file_utils import load_wav
 
@@ -109,10 +104,10 @@ class TTS(ModuleWithHandle):
 
     def __init__(
         self,
-        handle: handle.DeploymentHandle,
+        _handle: handle.DeploymentHandle,
         min_clause_chars: int = 20,
     ):
-        super().__init__(handle)
+        super().__init__(_handle)
         self.min_clause_chars: int = min_clause_chars
         self._buffer: str = ""
 
@@ -125,16 +120,16 @@ async def process(self, token: Token) -> AsyncGenerator[Audio, None]:  # type: i
             if not clause:
                 break
             self._buffer = remainder
-            async for chunk in self.handle.synthesize.remote(clause):
+            async for chunk in self._handle.synthesize.remote(clause):
                 yield chunk
 
         # Flush the remaining buffer when the LLM stream ends
         if token.end and self._buffer.strip():
-            async for chunk in self.handle.synthesize.remote(self._buffer.strip()):
+            async for chunk in self._handle.synthesize.remote(self._buffer.strip()):
                 yield chunk
             self._buffer = ""
         if token.end:
-            sample_rate = await self.handle.get_sample_rate.remote()
+            sample_rate = await self._handle.get_sample_rate.remote()
             yield Audio(data=np.array([], dtype=np.float32), sample_rate=sample_rate, end=True)
 
     def _split(self, text: str) -> tuple[str, str]:

From d104b7bc3d08a552c65c38fafd9ec10f675813bf Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Wed, 27 May 2026 06:34:03 +0200
Subject: [PATCH 06/31] fixed(rag): streaming tokens + removing deprecated
 docker_services since that's the job of Helm

---
 config/client_full.yaml            |  44 +++++
 config/huri.yaml                   |  35 ----
 deploy/Dockerfile.amd              |   6 +
 requirements-amd.txt               |  16 ++
 src/app.py                         |  38 +----
 src/modules/events.py              |  21 ++-
 src/modules/factory.py             |   9 +-
 src/modules/rag/docker_services.py | 240 ---------------------------
 src/modules/rag/rag.py             | 252 +++++++++++++++--------------
 9 files changed, 213 insertions(+), 448 deletions(-)
 create mode 100644 config/client_full.yaml
 delete mode 100644 config/huri.yaml
 create mode 100644 requirements-amd.txt
 delete mode 100644 src/modules/rag/docker_services.py

diff --git a/config/client_full.yaml b/config/client_full.yaml
new file mode 100644
index 0000000..1481430
--- /dev/null
+++ b/config/client_full.yaml
@@ -0,0 +1,44 @@
+huri_url: ws://localhost:8000/session
+
+topic_list: [transcript, question, token, motion]
+
+senders:
+  audio:
+    name: audio
+    args:
+      sample_rate: 16000
+      frame_duration: 0.030
+
+modules:
+  mic:
+    name: mic
+    args:
+      vad_agressiveness: 3
+      silence_duration: 1.5
+      block_duration: ${senders.audio.args.frame_duration}
+    logging: INFO
+  stt:
+    name: stt
+    args:
+      language: en
+      block_duration: ${senders.audio.args.frame_duration}
+    logging: INFO
+  tag:
+    name: tag
+    logging: INFO
+  rag:
+    name: rag
+    args:
+      language: en
+      tone: formal
+      response_format: paragraph
+      max_length: 1024
+    logging: INFO
+  tts:
+    name: tts
+    args:
+      min_clause_chars: 20
+    logging: INFO
+  gesture:
+    name: gesture
+    logging: INFO
diff --git a/config/huri.yaml b/config/huri.yaml
deleted file mode 100644
index 70d2cc7..0000000
--- a/config/huri.yaml
+++ /dev/null
@@ -1,35 +0,0 @@
-proxy_location: EveryNode
-
-http_options:
-  host: 0.0.0.0
-  port: 8000
-
-logging_config:
-  encoding: TEXT
-  log_level: INFO
-  logs_dir: null
-  enable_access_log: true
-  additional_log_standard_attrs: []
-
-services:
-  qdrant:
-    port: 6333
-    image: "qdrant/qdrant:latest"
-    storage_volume: "qdrant_data"
-  ollama:
-    model: "mistral:7b"
-    image: "ollama/ollama:rocm"
-    gpu_devices: true
-    num_replicas: 1
-
-applications:
-  - name: huri-app
-    route_prefix: /
-    import_path: src.app:app
-    runtime_env: { RAY_COLOR_PREFIX=1 }
-    deployments:
-      - name: HuRI
-      - name: RAGHandle
-        num_replicas: 2
-      - name: OllamaService
-      - name: QdrantService
diff --git a/deploy/Dockerfile.amd b/deploy/Dockerfile.amd
index 24c7b45..786094b 100644
--- a/deploy/Dockerfile.amd
+++ b/deploy/Dockerfile.amd
@@ -70,4 +70,10 @@ USER ray
 
 # 4. faster-whisper
 RUN pip install --no-cache-dir faster-whisper
+
+# 5. RAG / LLM extras (httpx, qdrant-client, sentence-transformers, …)
+#    Installed last so the ROCm torch wheel installed above is the resolved one.
+COPY requirements-amd.txt /app
+RUN pip install --no-cache-dir -r requirements-amd.txt
+
 COPY src /app/src
diff --git a/requirements-amd.txt b/requirements-amd.txt
new file mode 100644
index 0000000..2fefb8d
--- /dev/null
+++ b/requirements-amd.txt
@@ -0,0 +1,16 @@
+# AMD/ROCm worker extras (on top of serve_requirements.txt installed by Dockerfile.base layer
+# and the ROCm torch/torchaudio wheels installed directly in Dockerfile.amd).
+#
+# Hosts: STT (faster-whisper, already in serve_requirements.txt) and RAG/LLM.
+# Does NOT include CosyVoice2 or EMAGE — those run on the NVIDIA worker.
+
+# --- RAG / LLM ---
+httpx==0.27.2
+qdrant-client==1.12.1
+sentence-transformers==3.2.1
+pypdf==5.1.0
+semantic_chunker==0.2.0
+
+# transformers is pulled in by sentence-transformers; pin to a version compatible
+# with the ROCm torch 2.8 wheel.
+transformers==4.46.3
diff --git a/src/app.py b/src/app.py
index 79f58db..1b314de 100644
--- a/src/app.py
+++ b/src/app.py
@@ -1,51 +1,15 @@
-from pathlib import Path
-from typing import Any
-
-import yaml
 from ray.serve import Application
 
 from src.core.huri import HuRI
 from src.modules.events import get_events
 from src.modules.factory import bind_deployment_handles
 from src.modules.modules import get_modules
-from src.modules.rag.docker_services import OllamaService, QdrantService
-
-
-def load_services_config() -> Any:
-    config_path = Path(__file__).resolve().parents[1] / "config" / "huri.yaml"
-    with open(config_path) as f:
-        config = yaml.safe_load(f)
-    return config.get("services", {})
-
-
-def build_qdrant(config: dict) -> Any:
-    return QdrantService.bind(  # type: ignore[attr-defined]
-        port=config.get("port", 6333),
-        image=config.get("image", "qdrant/qdrant:latest"),
-        storage_volume=config.get("storage_volume", "qdrant_data"),
-    )
-
-
-def build_ollama(config: dict) -> Any:
-    return OllamaService.options(  # type: ignore[attr-defined]
-        num_replicas=config.get("num_replicas", 1),
-    ).bind(
-        model=config.get("model", "mistral:7b"),
-        image=config.get("image", "ollama/ollama:latest"),
-        gpu_devices=config.get("gpu_devices", False),
-    )
 
 
 def build_app() -> Application:
     modules = get_modules()
     events = get_events()
-
-    services_config = load_services_config()
-
-    qdrant = build_qdrant(services_config.get("qdrant", {}))
-    ollama = build_ollama(services_config.get("ollama", {}))
-
-    handles = bind_deployment_handles(modules, ollama=ollama, qdrant=qdrant)
+    handles = bind_deployment_handles(modules)
     app: Application = HuRI.bind(modules, handles, events)  # type: ignore[attr-defined]
     return app
 
diff --git a/src/modules/events.py b/src/modules/events.py
index 43f6c71..b731c21 100644
--- a/src/modules/events.py
+++ b/src/modules/events.py
@@ -1,15 +1,30 @@
 from typing import Dict, Type
 
 from src.core.events import EventData
-from src.modules.rag.events import RAGResult
 from src.modules.speech_to_text.events import Sentence, Transcript, Voice
+from src.modules.text_to_speech.events import Audio, Token
 
 
 def get_events() -> Dict[str, Type[EventData | bytes]]:
-    return {
+    events: Dict[str, Type[EventData | bytes]] = {
         "audio": bytes,
         "voice": Voice,
         "transcript": Transcript,
         "question": Sentence,
-        "rag_response": RAGResult,
+        "token": Token,
     }
+
+    # Motion lives in the gesture module — only available when EMAGE deps installed.
+    try:
+        from src.modules.gesture.gesture import Motion
+    except Exception:
+        pass
+    else:
+        events["motion"] = Motion
+
+    # TTS output "audio" is an Audio dataclass internally; the websocket boundary
+    # only ever decodes raw bytes for the "audio" topic (mic input), so the
+    # registry keeps bytes there. Keep Audio importable for type completeness.
+    _ = Audio
+
+    return events
diff --git a/src/modules/factory.py b/src/modules/factory.py
index 728aba3..14acd99 100644
--- a/src/modules/factory.py
+++ b/src/modules/factory.py
@@ -94,7 +94,6 @@ def create_from_config(
 
 def bind_deployment_handles(
     modules: Dict[str, Type[Module]],
-    **service_handles,
 ) -> Dict[str, handle.DeploymentHandle]:
     handles: Dict[str, handle.DeploymentHandle] = {}
     for name, module_cls in modules.items():
@@ -106,12 +105,6 @@ def bind_deployment_handles(
 
         handle_cls = module_cls._handle_cls
 
-        if name == "rag" and service_handles:
-            handles[name] = handle_cls.bind(
-                ollama_handle=service_handles.get("ollama"),
-                qdrant_handle=service_handles.get("qdrant"),
-            )
-        else:
-            handles[name] = handle_cls.bind()
+        handles[name] = handle_cls.bind()
 
     return handles
diff --git a/src/modules/rag/docker_services.py b/src/modules/rag/docker_services.py
deleted file mode 100644
index cc9f4b5..0000000
--- a/src/modules/rag/docker_services.py
+++ /dev/null
@@ -1,240 +0,0 @@
-import socket
-import subprocess
-import time
-from typing import Any
-
-import httpx
-from ray import serve
-
-
-def find_free_port() -> Any:
-    """
-    Ask the OS for a random free port.
-    We need this because if we run multiple Ollama containers,
-    they can't all use port 11434 — each needs its own.
-    """
-    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-        s.bind(("", 0))
-        return s.getsockname()[1]
-
-
-def wait_for_service(url: str, timeout: int = 120) -> bool:
-    """
-    Returns True if ready, False if timeout.
-    """
-    start = time.time()
-    while time.time() - start < timeout:
-        try:
-            resp = httpx.get(url, timeout=5)
-            if resp.status_code == 200:
-                return True
-        except Exception:
-            pass
-        time.sleep(2)
-    return False
-
-
-def is_container_running(name: str) -> bool:
-    """Check if a Docker container with this name is already running."""
-    result = subprocess.run(
-        ["docker", "ps", "-q", "-f", f"name=^{name}$"],
-        capture_output=True,
-        text=True,
-    )
-    return bool(result.stdout.strip())
-
-
-def remove_container(name: str):
-    """Force remove a container by name (ignores errors if it doesn't exist)."""
-    subprocess.run(["docker", "rm", "-f", name], capture_output=True)
-
-
-@serve.deployment
-class OllamaService:
-    """
-    Manages one Ollama Docker container.
-
-    LIFECYCLE:
-        __init__:  starts container -> waits for it -> pulls model
-        generate:  sends a prompt to the container, returns the answer
-        __del__:   stops and removes the container
-    """
-
-    def __init__(
-        self,
-        model: str = "mistral:7b",
-        image: str = "ollama/ollama:latest",
-        gpu_devices: bool = False,
-    ):
-        self.model = model
-        self.port = find_free_port()
-        self.container_name = f"ollama-ray-{self.port}"
-        self.base_url = f"http://localhost:{self.port}"
-
-        remove_container(self.container_name)
-
-        cmd = [
-            "docker",
-            "run",
-            "-d",
-            "--name",
-            self.container_name,
-            "-p",
-            f"{self.port}:11434",
-            "-v",
-            "ollama_shared:/root/.ollama",
-        ]
-
-        if gpu_devices:
-            cmd.extend(
-                [
-                    "--device=/dev/kfd",
-                    "--device=/dev/dri",
-                    "--group-add=video",
-                ]
-            )
-
-        cmd.append(image)
-
-        print(f"[OllamaService] Starting container \
-'{self.container_name}' on port {self.port}...")
-        result = subprocess.run(cmd, capture_output=True, text=True)
-        if result.returncode != 0:
-            raise RuntimeError(f"Docker failed: {result.stderr}")
-
-        print("[OllamaService] Waiting for Ollama to be ready...")
-        if not wait_for_service(f"{self.base_url}/api/tags"):
-            raise RuntimeError(f"Ollama didn't start within \
-timeout on port {self.port}")
-
-        print(f"[OllamaService] Pulling model '{model}'...")
-        pull_result = subprocess.run(
-            ["docker", "exec", self.container_name, "ollama", "pull", model],
-            capture_output=True,
-            text=True,
-        )
-        if pull_result.returncode != 0:
-            raise RuntimeError(f"Failed to pull model: {pull_result.stderr}")
-
-        print(f"[OllamaService] Ready! \
-container='{self.container_name}', port={self.port}, model='{model}'")
-
-    async def generate(
-        self,
-        messages: list,
-        max_tokens: int = 1024,
-        temperature: float = 0.1,
-    ) -> Any:
-        """
-        Send messages to Ollama and return the response.
-        This is what RAGHandle calls to get LLM answers.
-        """
-        async with httpx.AsyncClient(timeout=60.0) as client:
-            resp = await client.post(
-                f"{self.base_url}/api/chat",
-                json={
-                    "model": self.model,
-                    "messages": messages,
-                    "stream": False,
-                    "options": {
-                        "num_predict": max_tokens,
-                        "temperature": temperature,
-                    },
-                },
-            )
-            resp.raise_for_status()
-            return resp.json()["message"]["content"]
-
-    async def health(self) -> dict:
-        """Check if this Ollama instance is alive."""
-        try:
-            async with httpx.AsyncClient(timeout=5.0) as client:
-                await client.get(f"{self.base_url}/api/tags")
-                return {
-                    "status": "ok",
-                    "port": self.port,
-                    "container": self.container_name,
-                }
-        except Exception as e:
-            return {"status": "error", "error": str(e)}
-
-    def __del__(self):
-        """Cleanup when Ray destroys this replica."""
-        print(f"[OllamaService] Removing container '{self.container_name}'")
-        remove_container(self.container_name)
-
-
-@serve.deployment(num_replicas=1)
-class QdrantService:
-    """
-    Manages a Qdrant Docker container.
-
-    LIFECYCLE:
-        __init__:  starts container (or reuses if already running)
-        get_url:   returns the URL other services should connect to
-        __del__:   leaves the container running (it has data!)
-    """
-
-    def __init__(
-        self,
-        port: int = 6333,
-        image: str = "qdrant/qdrant:latest",
-        storage_volume: str = "qdrant_data",
-    ):
-        self.port = port
-        self.container_name = "qdrant-ray"
-        self.url = f"http://localhost:{self.port}"
-
-        if self._is_healthy():
-            print(f"[QdrantService] Qdrant already running on port {self.port}")
-            return
-
-        remove_container(self.container_name)
-
-        cmd = [
-            "docker",
-            "run",
-            "-d",
-            "--name",
-            self.container_name,
-            "-p",
-            f"{self.port}:6333",
-            "-v",
-            f"{storage_volume}:/qdrant/storage",
-            image,
-        ]
-
-        print(f"[QdrantService] Starting Qdrant on port {self.port}...")
-        result = subprocess.run(cmd, capture_output=True, text=True)
-        if result.returncode != 0:
-            raise RuntimeError(f"Docker failed: {result.stderr}")
-
-        if not wait_for_service(f"{self.url}/healthz"):
-            raise RuntimeError(
-                f"Qdrant didn't start within timeout on port {self.port}"
-            )
-
-        print(f"[QdrantService] Ready on port {self.port}")
-
-    def _is_healthy(self) -> bool:
-        try:
-            resp = httpx.get(f"{self.url}/healthz", timeout=3)
-            return resp.status_code == 200
-        except Exception:
-            return False
-
-    async def get_url(self) -> str:
-        """Return the URL. Called by RAGHandle to know where Qdrant is."""
-        return self.url
-
-    async def health(self) -> dict:
-        try:
-            async with httpx.AsyncClient(timeout=5.0) as client:
-                await client.get(f"{self.url}/healthz")
-                return {"status": "ok", "port": self.port, "url": self.url}
-        except Exception as e:
-            return {"status": "error", "error": str(e)}
-
-    def __del__(self):
-        print(f"[QdrantService] Actor destroyed. \
-Container '{self.container_name}' left running.")
diff --git a/src/modules/rag/rag.py b/src/modules/rag/rag.py
index 6b9744d..2372334 100644
--- a/src/modules/rag/rag.py
+++ b/src/modules/rag/rag.py
@@ -1,17 +1,14 @@
+import json
+import os
 from dataclasses import dataclass, field
-from typing import Any, Optional
+from typing import Any, AsyncGenerator
 
-import httpx
-from qdrant_client import QdrantClient
-from qdrant_client.models import FieldCondition, Filter, MatchValue
 from ray import serve
 from ray.serve import handle
-from sentence_transformers import SentenceTransformer
 
 from src.core.module import ModuleWithHandle, ModuleWithId
 from src.modules.speech_to_text.events import Sentence
-
-from .events import RAGResult
+from src.modules.text_to_speech.events import Token
 
 
 @dataclass
@@ -21,25 +18,14 @@ class RAGQuery:
     _user_id: str
     question: str
     preferences: dict = field(default_factory=dict)
-    # preferences can include: language, tone,
-    # response_format, max_length, system_prompt, extra_instructions, etc.
 
 
-@serve.deployment(
-    num_replicas=2,
-    ray_actor_options={"num_cpus": 1},
-)
+@serve.deployment(name="RAGHandle")
 class RAGHandle:
-    """
-    Stateless RAG processor. Knows nothing about sessions.
-    Receives a _user_id + question, uses _user_id to find the right
-    collection/data in the vector DB, runs embed -> search -> LLM.
-    """
+    """Stateless RAG processor. Streams LLM tokens to the caller."""
 
     def __init__(
         self,
-        ollama_handle=None,
-        qdrant_handle=None,
         qdrant_url: str = "http://localhost:6333",
         default_collection: str = "documents",
         embedding_model: str = "BAAI/bge-large-en-v1.5",
@@ -50,6 +36,8 @@ def __init__(
         top_k: int = 5,
         score_threshold: float = 0.5,
     ):
+        from sentence_transformers import SentenceTransformer
+
         self.embed_model = SentenceTransformer(embedding_model)
         self.default_collection = default_collection
         self.top_k = top_k
@@ -60,35 +48,21 @@ def __init__(
         self.llm_model = llm_model
         self.llm_api_key = llm_api_key
 
-        self.ollama_handle = ollama_handle
-        self.qdrant_handle = qdrant_handle
-
         self._qdrant_url = qdrant_url
-        self._qdrant: QdrantClient | None = None
+        self._qdrant: Any = None
+        self._verify_ssl = os.environ.get("HURI_RAG_VERIFY_SSL", "true").lower() != "false"
 
     async def _get_qdrant(self):
-        """Connect to Qdrant on first use. Solves the async-in-init problem."""
         if self._qdrant is None:
-            if self.qdrant_handle:
-                self._qdrant_url = await self.qdrant_handle.get_url.remote()
-            self._qdrant = QdrantClient(url=self._qdrant_url)
+            from qdrant_client import QdrantClient
+
+            self._qdrant = QdrantClient(url=self._qdrant_url, verify=self._verify_ssl)
             print(f"[RAGHandle] Connected to Qdrant at {self._qdrant_url}")
         return self._qdrant
 
     def _resolve_user_context(self, _user_id: str) -> tuple[str, dict | None]:
-        """
-        Given a _user_id, decide which collection to search
-        and which filters to apply.
-
-        Options (pick what fits your data model):
-          A) One collection per user:  collection = f"user_{_user_id}"
-          B) Shared collection, filter by _user_id in payload
-          C) Lookup in a DB to find the user's config
-        """
-
         collection = self.default_collection
         filters = {"_user_id": _user_id}
-
         return collection, filters
 
     def _embed(self, text) -> list[float] | Any:
@@ -101,9 +75,10 @@ def _search(
         collection: str,
         filters: dict | None = None,
     ) -> list[dict]:
-
         qdrant_filter: Any = None
         if filters:
+            from qdrant_client.models import FieldCondition, Filter, MatchValue
+
             conditions: Any = [
                 FieldCondition(key=k, match=MatchValue(value=v))
                 for k, v in filters.items()
@@ -135,7 +110,6 @@ def _build_prompt(
         chunks: list[dict],
         preferences: dict,
     ) -> tuple[str, str]:
-
         parts = [
             "You are a robot speaking to a user. Answer based on the provided context.",
             "If the context is insufficient, say so clearly.",
@@ -162,126 +136,159 @@ def _build_prompt(
             context_parts = []
             for i, chunk in enumerate(chunks, 1):
                 source = chunk["metadata"].get("source", "unknown")
-                context_parts.append(f"[{i}] (source: {source}, score: \
-{chunk['score']:.2f})\n{chunk['text']}")
+                context_parts.append(
+                    f"[{i}] (source: {source}, score: {chunk['score']:.2f})\n"
+                    f"{chunk['text']}"
+                )
             context_block = "\n\n".join(context_parts)
             user_prompt = (
                 f"Context:\n{context_block}\n\n"
                 f"Question: {question}\n\n"
-                "Answer based on the context above.\
-Don't speak about the sources, just use them to answer the question."
+                "Answer based on the context above. "
+                "Don't speak about the sources, just use them to answer."
             )
 
         return system_prompt, user_prompt
 
-    async def _llm_generate(
+    async def _stream_ollama(
+        self, messages: list, max_tokens: int
+    ) -> AsyncGenerator[str, None]:
+        import httpx
+
+        async with httpx.AsyncClient(timeout=120.0, verify=self._verify_ssl) as client:
+            async with client.stream(
+                "POST",
+                f"{self.llm_url}/api/chat",
+                json={
+                    "model": self.llm_model,
+                    "messages": messages,
+                    "stream": True,
+                    "options": {"num_predict": max_tokens, "temperature": 0.1},
+                },
+            ) as resp:
+                resp.raise_for_status()
+                async for line in resp.aiter_lines():
+                    if not line:
+                        continue
+                    try:
+                        chunk = json.loads(line)
+                    except json.JSONDecodeError:
+                        continue
+                    delta = chunk.get("message", {}).get("content", "")
+                    if delta:
+                        yield delta
+                    if chunk.get("done"):
+                        return
+
+    async def _stream_openai_compatible(
+        self,
+        url: str,
+        messages: list,
+        max_tokens: int,
+        api_key: str = "",
+    ) -> AsyncGenerator[str, None]:
+        import httpx
+
+        headers = {"Content-Type": "application/json"}
+        if api_key:
+            headers["Authorization"] = f"Bearer {api_key}"
+        async with httpx.AsyncClient(timeout=120.0, verify=self._verify_ssl) as client:
+            async with client.stream(
+                "POST",
+                url,
+                headers=headers,
+                json={
+                    "model": self.llm_model,
+                    "messages": messages,
+                    "max_tokens": max_tokens,
+                    "temperature": 0.1,
+                    "stream": True,
+                },
+            ) as resp:
+                resp.raise_for_status()
+                async for line in resp.aiter_lines():
+                    if not line or not line.startswith("data:"):
+                        continue
+                    payload = line[len("data:"):].strip()
+                    if payload == "[DONE]":
+                        return
+                    try:
+                        chunk = json.loads(payload)
+                    except json.JSONDecodeError:
+                        continue
+                    delta = (
+                        chunk.get("choices", [{}])[0]
+                        .get("delta", {})
+                        .get("content", "")
+                    )
+                    if delta:
+                        yield delta
+
+    async def _llm_stream(
         self,
         system_prompt: str,
         user_prompt: str,
         preferences: dict,
-    ) -> Any:
+    ) -> AsyncGenerator[str, None]:
         max_tokens = preferences.get("max_length", 1024)
         messages = [
             {"role": "system", "content": system_prompt},
             {"role": "user", "content": user_prompt},
         ]
 
-        if self.ollama_handle:
-            return await self.ollama_handle.generate.remote(messages, max_tokens)
-
         if self.llm_provider == "vllm":
-            return await self._call_openai_compatible(
+            async for d in self._stream_openai_compatible(
                 f"{self.llm_url}/v1/chat/completions", messages, max_tokens
-            )
-        elif self.llm_provider == "ollama":
-            return await self._call_ollama(messages, max_tokens)
-
+            ):
+                yield d
         elif self.llm_provider == "api":
-            return await self._call_openai_compatible(
+            async for d in self._stream_openai_compatible(
                 f"{self.llm_url}/v1/chat/completions",
                 messages,
                 max_tokens,
                 self.llm_api_key,
-            )
+            ):
+                yield d
+        elif self.llm_provider == "ollama":
+            async for d in self._stream_ollama(messages, max_tokens):
+                yield d
         else:
             raise ValueError(f"Unknown llm_provider: {self.llm_provider}")
 
-    async def _call_openai_compatible(
-        self, url: str, messages: list, max_tokens: int, api_key: str = ""
-    ) -> Any:
-        headers = {"Content-Type": "application/json"}
-        if api_key:
-            headers["Authorization"] = f"Bearer {api_key}"
-        async with httpx.AsyncClient(timeout=60.0) as client:
-            resp = await client.post(
-                url,
-                headers=headers,
-                json={
-                    "model": self.llm_model,
-                    "messages": messages,
-                    "max_tokens": max_tokens,
-                    "temperature": 0.1,
-                },
-            )
-            resp.raise_for_status()
-            return resp.json()["choices"][0]["message"]["content"]
-
-    async def _call_ollama(self, messages: list, max_tokens: int) -> Any:
-        async with httpx.AsyncClient(timeout=60.0) as client:
-            resp = await client.post(
-                f"{self.llm_url}/api/chat",
-                json={
-                    "model": self.llm_model,
-                    "messages": messages,
-                    "stream": False,
-                    "options": {"num_predict": max_tokens, "temperature": 0.1},
-                },
-            )
-            resp.raise_for_status()
-            return resp.json()["message"]["content"]
-
-    async def process(self, query: RAGQuery) -> RAGResult:
-        """
-        Main entry point. Called by the RAG module.
-        Uses _user_id to determine which collection / filters to use.
-        """
-
+    async def stream(self, query: RAGQuery) -> AsyncGenerator[str, None]:
+        """Main streaming entry point — yields LLM text deltas."""
         print(f"[RAG] Question: {query.question}")
 
         qdrant = await self._get_qdrant()
-
         collection, filters = self._resolve_user_context(query._user_id)
         query_vector = self._embed(query.question)
         chunks = self._search(qdrant, query_vector, collection, filters)
 
         print(f"[RAG] Found {len(chunks)} chunks")
-        for c in chunks:
-            print(f"  - score: {c['score']:.2f} | {c['text'][:100]}...")
-
         system_prompt, user_prompt = self._build_prompt(
             query.question, chunks, query.preferences
         )
-        print(f"[RAG] System prompt: {system_prompt[:200]}...")
-        answer = await self._llm_generate(system_prompt, user_prompt, query.preferences)
-        print(f"[RAG] Answer: {answer}")
-
-        return RAGResult(
-            answer=answer,
-            sources=[
-                {"text": c["text"], "score": c["score"], "metadata": c["metadata"]}
-                for c in chunks
-            ],
-        )
+
+        async for delta in self._llm_stream(
+            system_prompt, user_prompt, query.preferences
+        ):
+            yield delta
 
 
 class RAG(ModuleWithHandle, ModuleWithId):
+    """RAG Module — streams LLM tokens.
+
+    input:  question (Sentence)
+    output: token    (Token)
+    """
+
     _handle_cls = RAGHandle
     input_type = "question"
-    output_type = "rag_response"
+    output_type = "token"
 
     def __init__(
         self,
-        _handle: handle.DeploymentHandle[RAGHandle],
+        _handle: handle.DeploymentHandle,
         _user_id: str,
         language="en",
         tone="formal",
@@ -300,22 +307,17 @@ def __init__(
             "extra_instructions": extra_instructions,
         }
 
-    async def process(self, data: Sentence) -> Optional[RAGResult]:
-        """
-        Called when a "question" event arrives through the event bus.
-        Packages _user_id + question, sends to the stateless RAGHandle.
-        """
-        question_text = data.text
-
+    async def process(self, data: Sentence) -> AsyncGenerator[Token, None]:  # type: ignore[override]
         query = RAGQuery(
             _user_id=self._user_id if self._user_id else "anonymous",
-            question=question_text,
+            question=data.text,
             preferences=self.preferences,
         )
 
-        result: RAGResult = await self._handle.process.remote(query)
-        return result
+        stream = self._handle.options(stream=True).stream.remote(query)
+        async for delta in stream:
+            yield Token(text=delta, end=False)
+        yield Token(text="", end=True)
 
     def update_preferences(self, new_preferences: dict):
-        """Client can update preferences mid-session via the event bus."""
         self.preferences.update(new_preferences)

From 616087a8cde9adf74dff022af99d16019da23263 Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Wed, 27 May 2026 06:35:45 +0200
Subject: [PATCH 07/31] feated(helm): modules loading between local nvidia and
 amd example helm + custom health probe

---
 .../local_nvidia_amd/templates/_helpers.tpl   | 64 +++++++++++++++++++
 .../templates/rayservice.yaml                 | 11 ++++
 deploy/examples/local_nvidia_amd/values.yaml  | 29 ++++++++-
 3 files changed, 101 insertions(+), 3 deletions(-)
 create mode 100644 deploy/examples/local_nvidia_amd/templates/_helpers.tpl

diff --git a/deploy/examples/local_nvidia_amd/templates/_helpers.tpl b/deploy/examples/local_nvidia_amd/templates/_helpers.tpl
new file mode 100644
index 0000000..e15a832
--- /dev/null
+++ b/deploy/examples/local_nvidia_amd/templates/_helpers.tpl
@@ -0,0 +1,64 @@
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "huri.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Create a default fully-qualified app name.
+Truncated at 63 chars because some Kubernetes name fields have this limit.
+*/}}
+{{- define "huri.fullname" -}}
+{{- if .Values.fullnameOverride }}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- $name := default .Chart.Name .Values.nameOverride }}
+{{- if contains $name .Release.Name }}
+{{- .Release.Name | trunc 63 | trimSuffix "-" }}
+{{- else }}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
+{{- end }}
+{{- end }}
+{{- end }}
+
+{{/*
+Chart label: <name>-<version>.
+*/}}
+{{- define "huri.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Common labels applied to every resource.
+*/}}
+{{- define "huri.labels" -}}
+helm.sh/chart: {{ include "huri.chart" . }}
+{{ include "huri.selectorLabels" . }}
+app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
+
+{{/*
+Selector labels (used in matchLabels / ingress backends).
+*/}}
+{{- define "huri.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "huri.name" . }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end }}
+
+{{/*
+Name of the KubeRay-managed serve service.
+KubeRay appends "-serve-svc" to the RayService name.
+*/}}
+{{- define "huri.serveSvcName" -}}
+{{- printf "%s-serve-svc" (include "huri.fullname" .) }}
+{{- end }}
+
+{{/*
+Name of the KubeRay-managed head service.
+KubeRay appends "-head-svc" to the RayService name.
+*/}}
+{{- define "huri.headSvcName" -}}
+{{- printf "%s-head-svc" (include "huri.fullname" .) }}
+{{- end }}
diff --git a/deploy/examples/local_nvidia_amd/templates/rayservice.yaml b/deploy/examples/local_nvidia_amd/templates/rayservice.yaml
index 31d7f00..318dac8 100644
--- a/deploy/examples/local_nvidia_amd/templates/rayservice.yaml
+++ b/deploy/examples/local_nvidia_amd/templates/rayservice.yaml
@@ -119,6 +119,17 @@ spec:
                 image: {{ $.Values.image.repository }}:{{ $.Values.image.tag }}
                 {{- end }}
                 imagePullPolicy: {{ $.Values.image.pullPolicy }}
+                readinessProbe:
+                  exec:
+                    command:
+                      - bash
+                      - -c
+                      - wget --tries 1 -T 2 -q -O- http://localhost:52365/api/local_raylet_healthz
+                        | grep success
+                  initialDelaySeconds: 10
+                  periodSeconds: 5
+                  failureThreshold: 10
+                  timeoutSeconds: 5
                 env:
                   {{- $hasEnv := false }}
                   {{- if .containerEnv }}
diff --git a/deploy/examples/local_nvidia_amd/values.yaml b/deploy/examples/local_nvidia_amd/values.yaml
index 3e90893..8f7a3e5 100644
--- a/deploy/examples/local_nvidia_amd/values.yaml
+++ b/deploy/examples/local_nvidia_amd/values.yaml
@@ -31,11 +31,32 @@ ray:
         runtime_env:
           env_vars:
             RAY_COLOR_PREFIX: "1"
+            HURI_RAG_VERIFY_SSL: "false"
         deployments:
+          # HuRI: FastAPI/WebSocket ingress + per-session router. CPU only —
+          # all GPU work is offloaded to handle-backed deployments below.
           - name: HuRI
             ray_actor_options:
               num_cpus: 1
               num_gpus: 0
+          # STT: shared faster-whisper actor, pinned to AMD.
+          - name: STT
+            num_replicas: 1
+            ray_actor_options:
+              num_cpus: 1
+              num_gpus: 0.5
+              resources: {"GPU_TYPE_AMD": 0.5}
+          # RAG: embeddings (sentence-transformers) + LLM client. Pinned to AMD.
+          - name: RAGHandle
+            num_replicas: 1
+            ray_actor_options:
+              num_cpus: 1
+              num_gpus: 0.5
+              resources: {"GPU_TYPE_AMD": 0.5}
+            init_kwargs:
+              qdrant_url: "https://qdrant.pommier.lan"
+              llm_url: "https://llm.pommier.lan"
+              embedding_model: "bge-large-en-v1.5-gguf-Q4_K_M"
           - name: TTS
             ray_actor_options:
               num_cpus: 1
@@ -98,7 +119,7 @@ workerGroups:
     replicas: 1
     minReplicas: 1
     maxReplicas: 1
-    mountVoiceAssets: false
+    mountVoiceAssets: true
     nodeSelector:
       gpu: nvidia
     # affinity: {}  # optional
@@ -134,7 +155,7 @@ workerGroups:
     replicas: 1
     minReplicas: 1
     maxReplicas: 1
-    mountVoiceAssets: true
+    mountVoiceAssets: false
     nodeSelector:
       gpu: amd
     # affinity: {}  # optional
@@ -214,6 +235,8 @@ models:
     # HURI_MODEL_PATH must match mountPath/modelId (see text_to_speech.py).
     env:
       HURI_MODEL_PATH: /models/cosytts/iic/CosyVoice2-0.5B
+      # Path to the CosyVoice repo root containing third_party/Matcha-TTS.
+      HURI_COSY_DIR: /app/cosyvoice
 
   emage:
     enabled: true
@@ -248,7 +271,7 @@ voiceAssets:
   mountPath: /assets
   env:
     HURI_VOICE_SAMPLE_PATH: /assets/voice.wav
-    HURI_VOICE_TRANSCRIPT: "Hello, this is my voice sample for cloning."
+    HURI_VOICE_TRANSCRIPT: "Instinct creates its own oppressors and bids us rise up against them."
 
 # Ingress for the Ray Serve endpoint (port 8000).
 ingress:

From fe939592b955f7c061e46663ed59f7cdd1c569cc Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Fri, 29 May 2026 18:09:43 +0200
Subject: [PATCH 08/31] fixed(tts): gesture and tts connection using pts +
 summarizing data fetching on client

---
 src/core/client.py                   | 27 ++++++++++++++--
 src/core/events.py                   | 41 ++++++++++++++++++++----
 src/modules/gesture/gesture.py       | 48 ++++++++++++++--------------
 src/modules/text_to_speech/events.py |  1 +
 src/modules/utils/sender.py          | 42 ++++++++++++++++++++++--
 5 files changed, 122 insertions(+), 37 deletions(-)

diff --git a/src/core/client.py b/src/core/client.py
index 085a0b8..b68f4bb 100644
--- a/src/core/client.py
+++ b/src/core/client.py
@@ -35,11 +35,32 @@ def _save_user_id(self, _user_id: str):
             f.write(_user_id)
 
     async def _receive_loop(self, ws: websockets.ClientConnection):
+        import struct
         try:
             while True:
-                text = await ws.recv()
-                print("<<", text)
-                await asyncio.sleep(0.1)
+                msg = await ws.recv()
+                if isinstance(msg, bytes):
+                    if len(msg) < 2:
+                        print(f"<< bytes ({len(msg)}B, no topic)")
+                        continue
+                    (topic_len,) = struct.unpack(">H", msg[:2])
+                    topic = msg[2:2 + topic_len].decode()
+                    payload = msg[2 + topic_len:]
+
+                    if topic == "audio" and len(payload) >= 13:
+                        sample_rate, end_flag, pts = struct.unpack(">IBd", payload[:13])
+                        n_samples = (len(payload) - 13) // 4
+                        print(
+                            f"<< audio: pts={pts:.3f}s samples={n_samples} @ {sample_rate}Hz "
+                            f"end={bool(end_flag)}"
+                        )
+                    elif topic == "motion" and len(payload) >= 16:
+                        pts, fps, n_frames = struct.unpack(">dII", payload[:16])
+                        print(f"<< motion: pts={pts:.3f}s frames={n_frames} @ {fps}fps")
+                    else:
+                        print(f"<< {topic}: bytes ({len(payload)}B)")
+                else:
+                    print("<<", msg)
 
         except (asyncio.CancelledError, websockets.ConnectionClosedOK):
             pass
diff --git a/src/core/events.py b/src/core/events.py
index b764258..1d116bc 100644
--- a/src/core/events.py
+++ b/src/core/events.py
@@ -1,9 +1,12 @@
 import asyncio
+import logging
 from collections import defaultdict
 from dataclasses import dataclass
 
 from .module import Module
 
+logger = logging.getLogger("ray.serve")
+
 
 @dataclass
 class EventData:
@@ -43,7 +46,13 @@ def register(self, module: Module):
         self.subscribers[module.input_type].append(module)
 
     async def publish(self, event_topic, data):
-        for module in self.subscribers[event_topic]:
+        subs = self.subscribers[event_topic]
+        if event_topic not in ("audio",):  # skip mic-frame spam
+            logger.info(
+                "[GRAPH] publish topic=%r subscribers=%s",
+                event_topic, [type(m).__name__ for m in subs],
+            )
+        for module in subs:
             asyncio.create_task(self._run(module, data))
 
     async def _run(self, module: Module, data):
@@ -55,17 +64,35 @@ async def _run(self, module: Module, data):
                     async for item in result:
                         if item is None:
                             continue
+                        logger.info(
+                            "[GRAPH] %s -> %r: %s",
+                            type(module).__name__, module.output_type, _summarize(item),
+                        )
                         await self.publish(module.output_type, item)
-                except Exception as e:
-                    print(f"[ERROR] async generator in {module}: {e}")
+                except Exception:
+                    logger.exception("[GRAPH] async generator failed in %s", type(module).__name__)
 
             else:
                 try:
                     value = await result
                     if value is not None:
+                        logger.info(
+                            "[GRAPH] %s -> %r: %s",
+                            type(module).__name__, module.output_type, _summarize(value),
+                        )
                         await self.publish(module.output_type, value)
-                except Exception as e:
-                    print(f"[ERROR] coroutine in {module}: {e}")
+                except Exception:
+                    logger.exception("[GRAPH] coroutine failed in %s", type(module).__name__)
+
+        except Exception:
+            logger.exception("[GRAPH] process() call failed in %s", type(module).__name__)
+
 
-        except Exception as e:
-            print(f"[ERROR] process() call failed in {module}: {e}")
+def _summarize(item) -> str:
+    """Short repr that avoids dumping full numpy arrays into the log."""
+    cls = type(item).__name__
+    import numpy as np
+    data = getattr(item, "data", None)
+    if isinstance(data, np.ndarray):
+        return f"{cls}(shape={data.shape}, dtype={data.dtype})"
+    return f"{cls}({item!r})"
diff --git a/src/modules/gesture/gesture.py b/src/modules/gesture/gesture.py
index 427806b..39a2560 100644
--- a/src/modules/gesture/gesture.py
+++ b/src/modules/gesture/gesture.py
@@ -21,6 +21,7 @@ class Motion:
     expressions: np.ndarray  # (t, 100)  facial expression coefficients
     trans: np.ndarray        # (t, 3)    global root translation
     fps: int = 30
+    pts: float = 0.0         # presentation timestamp in seconds, paired with Audio.pts
 
 
 @serve.deployment(name="GestureGeneration")
@@ -30,17 +31,25 @@ def __init__(
         hf_repo: str = _HF_REPO,
         device: Optional[str] = None,
     ):
+        print(f"[Gesture] importing torch...", flush=True)
         import torch
+        print(f"[Gesture] importing emage...", flush=True)
         from .emage import EmageAudioModel, EmageVAEConv, EmageVQModel, EmageVQVAEConv
 
         self.device = torch.device(
             device if device else ("cuda" if torch.cuda.is_available() else "cpu")
         )
+        print(f"[Gesture] device={self.device} hf_repo={hf_repo!r}", flush=True)
 
+        print("[Gesture] loading face_vq...", flush=True)
         face_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/face").to(self.device)
+        print("[Gesture] loading upper_vq...", flush=True)
         upper_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/upper").to(self.device)
+        print("[Gesture] loading lower_vq...", flush=True)
         lower_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/lower").to(self.device)
+        print("[Gesture] loading hands_vq...", flush=True)
         hands_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/hands").to(self.device)
+        print("[Gesture] loading global_ae...", flush=True)
         global_ae = EmageVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/global").to(self.device)
 
         self.motion_vq = EmageVQModel(
@@ -52,13 +61,19 @@ def __init__(
         )
         self.motion_vq.eval()
 
+        print("[Gesture] loading EmageAudioModel...", flush=True)
         self.model = EmageAudioModel.from_pretrained(hf_repo).to(self.device)
         self.model.eval()
+        print(f"[Gesture] ready", flush=True)
 
-    def infer(self, audio_np: np.ndarray) -> Motion:
+    def infer(self, audio_np: np.ndarray, source_sr: int = _EMAGE_SR) -> Motion:
         import torch
         import torch.nn.functional as F
 
+        if source_sr != _EMAGE_SR:
+            import librosa
+            audio_np = librosa.resample(audio_np, orig_sr=source_sr, target_sr=_EMAGE_SR)
+
         audio_ts = torch.from_numpy(audio_np).to(self.device).unsqueeze(0)
         speaker_id = torch.zeros(1, 1, dtype=torch.long, device=self.device)
 
@@ -96,11 +111,9 @@ class Gesture(ModuleWithHandle):
     """Gesture Module
 
     Consumes streaming Audio chunks produced by TTS and generates whole-body
-    SMPL-X motion using the EMAGE audio-to-gesture model.
-
-    Audio chunks are buffered until TTS signals the end of an utterance
-    (Audio.end == True). At that point the full waveform is passed to EMAGE
-    and a single Motion object is yielded.
+    SMPL-X motion using the EMAGE audio-to-gesture model. Inference runs once
+    per chunk so Motion events interleave with audio playback instead of all
+    arriving at the end of the utterance.
 
     input:  audio (Audio)
     output: motion (Motion)
@@ -118,25 +131,12 @@ def __init__(
         _handle: handle.DeploymentHandle,
     ):
         super().__init__(_handle)
-        self._chunks: list[np.ndarray] = []
 
     async def process(self, audio: Audio) -> AsyncGenerator[Motion, None]:  # type: ignore[override]
-        import librosa
-
-        if audio.data.size > 0:
-            chunk = audio.data
-            if audio.sample_rate != _EMAGE_SR:
-                chunk = librosa.resample(chunk, orig_sr=audio.sample_rate, target_sr=_EMAGE_SR)
-            self._chunks.append(chunk.astype(np.float32))
-
-        if not audio.end:
+        if audio.data.size == 0:
             return
-
-        if not self._chunks:
-            return
-
-        full_audio = np.concatenate(self._chunks)
-        self._chunks = []
-
-        motion = await self._handle.infer.remote(full_audio)
+        motion = await self._handle.infer.remote(
+            audio.data.astype(np.float32), audio.sample_rate
+        )
+        motion.pts = audio.pts
         yield motion
diff --git a/src/modules/text_to_speech/events.py b/src/modules/text_to_speech/events.py
index 9dd6fd4..dceb269 100644
--- a/src/modules/text_to_speech/events.py
+++ b/src/modules/text_to_speech/events.py
@@ -16,3 +16,4 @@ class Audio(EventData):
     data: np.ndarray
     sample_rate: int
     end: bool = False
+    pts: float = 0.0  # presentation timestamp in seconds from utterance start
diff --git a/src/modules/utils/sender.py b/src/modules/utils/sender.py
index f09b0ba..1a1d7eb 100644
--- a/src/modules/utils/sender.py
+++ b/src/modules/utils/sender.py
@@ -1,9 +1,16 @@
+import logging
+import struct
 from dataclasses import asdict
 
+import numpy as np
 from fastapi import WebSocket
 
 from src.core.events import EventData
 from src.core.module import Module
+from src.modules.gesture.gesture import Motion
+from src.modules.text_to_speech.events import Audio
+
+logger = logging.getLogger("ray.serve")
 
 
 class Sender(Module):
@@ -11,6 +18,9 @@ class Sender(Module):
 
     Send output data to the client.
     This data must be JSON serialisable, like a dataclass.
+    Audio wire format:  [4B sample_rate uint32][1B end][8B pts float64][float32 PCM].
+    Motion wire format: [8B pts float64][4B fps uint32][4B n_frames uint32]
+                        [poses float32 n*165][expressions float32 n*100][trans float32 n*3].
 
     input: auto, output: None"""
 
@@ -21,10 +31,36 @@ def __init__(self, ws: WebSocket, type: str):
         self.ws: WebSocket = ws
         self.input_type = type
 
-    async def process(self, data: EventData | bytes):
+    async def process(self, _):
+        data = _
+        logger.info("[Sender:%s] received %s", self.input_type, type(data).__name__)
         if isinstance(data, bytes):
-            await self.ws.send_bytes(data)
+            await self.ws.send_bytes(self._prefix(data))
+        elif isinstance(data, Audio):
+            logger.info(
+                "[Sender:%s] Audio samples=%d sr=%d end=%s pts=%.3fs",
+                self.input_type, data.data.shape[0], data.sample_rate, data.end, data.pts,
+            )
+            header = struct.pack(">IBd", data.sample_rate, int(data.end), data.pts)
+            await self.ws.send_bytes(self._prefix(header + data.data.tobytes()))
+        elif isinstance(data, Motion):
+            n_frames = data.poses.shape[0]
+            logger.info(
+                "[Sender:%s] Motion frames=%d fps=%d pts=%.3fs",
+                self.input_type, n_frames, data.fps, data.pts,
+            )
+            header = struct.pack(">dII", data.pts, data.fps, n_frames)
+            body = (
+                data.poses.astype(np.float32).tobytes()
+                + data.expressions.astype(np.float32).tobytes()
+                + data.trans.astype(np.float32).tobytes()
+            )
+            await self.ws.send_bytes(self._prefix(header + body))
         elif isinstance(data, EventData):
             await self.ws.send_json(asdict(data))
         else:
-            await self.ws.send_text(data)
+            await self.ws.send_text(str(data))
+
+    def _prefix(self, payload: bytes) -> bytes:
+        topic_bytes = self.input_type.encode()
+        return struct.pack(">H", len(topic_bytes)) + topic_bytes + payload

From b728cd8c3dcbdff43795091f5cc2c8f180e9f440 Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Fri, 29 May 2026 18:10:53 +0200
Subject: [PATCH 09/31] fixed(rag): init arguments not being taken in kube
 values + hot values swap capability

---
 src/modules/rag/rag.py | 148 +++++++++++++++++++++++------------------
 1 file changed, 85 insertions(+), 63 deletions(-)

diff --git a/src/modules/rag/rag.py b/src/modules/rag/rag.py
index 2372334..0195e48 100644
--- a/src/modules/rag/rag.py
+++ b/src/modules/rag/rag.py
@@ -1,8 +1,8 @@
 import json
-import os
 from dataclasses import dataclass, field
 from typing import Any, AsyncGenerator
 
+from pydantic import BaseModel
 from ray import serve
 from ray.serve import handle
 
@@ -11,6 +11,20 @@
 from src.modules.text_to_speech.events import Token
 
 
+class RAGDeploymentConfig(BaseModel):
+    qdrant_url: str = "http://localhost:6333"
+    default_collection: str = "documents"
+    embedding_model: str = "BAAI/bge-large-en-v1.5"
+    embedding_url: str = ""
+    llm_provider: str = "ollama"  # "vllm", "ollama", "api"
+    llm_url: str = "http://localhost:11434"
+    llm_model: str = "mistral:7b"
+    llm_api_key: str = ""
+    verify_ssl: bool = True
+    top_k: int = 5
+    score_threshold: float = 0.5
+
+
 @dataclass
 class RAGQuery:
     """What flows from RAG module to RAGHandle."""
@@ -24,49 +38,52 @@ class RAGQuery:
 class RAGHandle:
     """Stateless RAG processor. Streams LLM tokens to the caller."""
 
-    def __init__(
-        self,
-        qdrant_url: str = "http://localhost:6333",
-        default_collection: str = "documents",
-        embedding_model: str = "BAAI/bge-large-en-v1.5",
-        llm_provider: str = "ollama",  # "vllm", "ollama", "api"
-        llm_url: str = "http://localhost:11434",
-        llm_model: str = "mistral:7b",
-        llm_api_key: str = "",
-        top_k: int = 5,
-        score_threshold: float = 0.5,
-    ):
-        from sentence_transformers import SentenceTransformer
-
-        self.embed_model = SentenceTransformer(embedding_model)
-        self.default_collection = default_collection
-        self.top_k = top_k
-        self.score_threshold = score_threshold
+    def __init__(self, **kwargs):
+        self._cfg = RAGDeploymentConfig(**kwargs)
+        self._apply_config()
 
-        self.llm_provider = llm_provider
-        self.llm_url = llm_url
-        self.llm_model = llm_model
-        self.llm_api_key = llm_api_key
+    def reconfigure(self, config: dict) -> None:
+        self._cfg = RAGDeploymentConfig(**{**self._cfg.model_dump(), **config})
+        self._apply_config()
 
-        self._qdrant_url = qdrant_url
-        self._qdrant: Any = None
-        self._verify_ssl = os.environ.get("HURI_RAG_VERIFY_SSL", "true").lower() != "false"
-
-    async def _get_qdrant(self):
-        if self._qdrant is None:
-            from qdrant_client import QdrantClient
+    def _apply_config(self) -> None:
+        import httpx
+        from qdrant_client import QdrantClient
 
-            self._qdrant = QdrantClient(url=self._qdrant_url, verify=self._verify_ssl)
-            print(f"[RAGHandle] Connected to Qdrant at {self._qdrant_url}")
-        return self._qdrant
+        cfg = self._cfg
+        self.embedding_url = cfg.embedding_url or cfg.llm_url
+        self._qdrant = QdrantClient(url=cfg.qdrant_url, verify=cfg.verify_ssl)
+        print(f"[RAGHandle] Connected to Qdrant at {cfg.qdrant_url}")
+        self._embed_client = httpx.AsyncClient(timeout=30.0, verify=cfg.verify_ssl)
+        self._llm_client = httpx.AsyncClient(timeout=120.0, verify=cfg.verify_ssl)
 
     def _resolve_user_context(self, _user_id: str) -> tuple[str, dict | None]:
-        collection = self.default_collection
+        collection = self._cfg.default_collection
         filters = {"_user_id": _user_id}
         return collection, filters
 
-    def _embed(self, text) -> list[float] | Any:
-        return self.embed_model.encode(str(text), normalize_embeddings=True).tolist()
+    async def _embed(self, text: str) -> list[float]:
+        url = f"{self.embedding_url}/v1/embeddings"
+        resp = await self._embed_client.post(
+            url,
+            json={"model": self._cfg.embedding_model, "input": str(text)},
+        )
+        if resp.status_code != 200:
+            raise RuntimeError(
+                f"Embedding HTTP {resp.status_code} from {url}: {resp.text[:1000]}"
+            )
+        try:
+            payload = resp.json()
+        except Exception as e:
+            raise RuntimeError(
+                f"Embedding non-JSON response from {url}: {resp.text[:1000]}"
+            ) from e
+        try:
+            return payload["data"][0]["embedding"]
+        except (KeyError, IndexError, TypeError) as e:
+            raise RuntimeError(
+                f"Embedding unexpected schema from {url}: {str(payload)[:1000]}"
+            ) from e
 
     def _search(
         self,
@@ -90,8 +107,8 @@ def _search(
                 collection_name=collection,
                 query=query_vector,
                 query_filter=qdrant_filter,
-                limit=self.top_k,
-                score_threshold=self.score_threshold,
+                limit=self._cfg.top_k,
+                score_threshold=self._cfg.score_threshold,
             ).points
         except Exception:
             results = []
@@ -153,14 +170,11 @@ def _build_prompt(
     async def _stream_ollama(
         self, messages: list, max_tokens: int
     ) -> AsyncGenerator[str, None]:
-        import httpx
-
-        async with httpx.AsyncClient(timeout=120.0, verify=self._verify_ssl) as client:
-            async with client.stream(
+        async with self._llm_client.stream(
                 "POST",
-                f"{self.llm_url}/api/chat",
+                f"{self._cfg.llm_url}/api/chat",
                 json={
-                    "model": self.llm_model,
+                    "model": self._cfg.llm_model,
                     "messages": messages,
                     "stream": True,
                     "options": {"num_predict": max_tokens, "temperature": 0.1},
@@ -187,18 +201,15 @@ async def _stream_openai_compatible(
         max_tokens: int,
         api_key: str = "",
     ) -> AsyncGenerator[str, None]:
-        import httpx
-
         headers = {"Content-Type": "application/json"}
         if api_key:
             headers["Authorization"] = f"Bearer {api_key}"
-        async with httpx.AsyncClient(timeout=120.0, verify=self._verify_ssl) as client:
-            async with client.stream(
+        async with self._llm_client.stream(
                 "POST",
                 url,
                 headers=headers,
                 json={
-                    "model": self.llm_model,
+                    "model": self._cfg.llm_model,
                     "messages": messages,
                     "max_tokens": max_tokens,
                     "temperature": 0.1,
@@ -236,43 +247,54 @@ async def _llm_stream(
             {"role": "user", "content": user_prompt},
         ]
 
-        if self.llm_provider == "vllm":
+        if self._cfg.llm_provider == "vllm":
             async for d in self._stream_openai_compatible(
-                f"{self.llm_url}/v1/chat/completions", messages, max_tokens
+                f"{self._cfg.llm_url}/v1/chat/completions", messages, max_tokens
             ):
                 yield d
-        elif self.llm_provider == "api":
+        elif self._cfg.llm_provider == "api":
             async for d in self._stream_openai_compatible(
-                f"{self.llm_url}/v1/chat/completions",
+                f"{self._cfg.llm_url}/v1/chat/completions",
                 messages,
                 max_tokens,
-                self.llm_api_key,
+                self._cfg.llm_api_key,
             ):
                 yield d
-        elif self.llm_provider == "ollama":
+        elif self._cfg.llm_provider == "ollama":
             async for d in self._stream_ollama(messages, max_tokens):
                 yield d
         else:
-            raise ValueError(f"Unknown llm_provider: {self.llm_provider}")
+            raise ValueError(f"Unknown llm_provider: {self._cfg.llm_provider}")
 
     async def stream(self, query: RAGQuery) -> AsyncGenerator[str, None]:
         """Main streaming entry point — yields LLM text deltas."""
+        import traceback
+
         print(f"[RAG] Question: {query.question}")
 
-        qdrant = await self._get_qdrant()
         collection, filters = self._resolve_user_context(query._user_id)
-        query_vector = self._embed(query.question)
-        chunks = self._search(qdrant, query_vector, collection, filters)
+        query_vector = await self._embed(query.question)
+
+        try:
+            chunks = self._search(self._qdrant, query_vector, collection, filters)
+        except Exception:
+            print(f"[RAG] FAILED during Qdrant search:\n{traceback.format_exc()}")
+            raise
 
         print(f"[RAG] Found {len(chunks)} chunks")
         system_prompt, user_prompt = self._build_prompt(
             query.question, chunks, query.preferences
         )
 
-        async for delta in self._llm_stream(
-            system_prompt, user_prompt, query.preferences
-        ):
-            yield delta
+        print(f"[RAG] Streaming from LLM at {self._cfg.llm_url} (provider={self._cfg.llm_provider}, model={self._cfg.llm_model})")
+        try:
+            async for delta in self._llm_stream(
+                system_prompt, user_prompt, query.preferences
+            ):
+                yield delta
+        except Exception:
+            print(f"[RAG] FAILED during LLM stream:\n{traceback.format_exc()}")
+            raise
 
 
 class RAG(ModuleWithHandle, ModuleWithId):

From ad7f65835c3be3b0d0c80a6a36e07a8cd3be4d0a Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Fri, 29 May 2026 18:11:56 +0200
Subject: [PATCH 10/31] fixed(tts): fixed bi stream capacity + debug logs
 support directly in dashboard

---
 src/modules/text_to_speech/text_to_speech.py | 257 ++++++++++++-------
 1 file changed, 169 insertions(+), 88 deletions(-)

diff --git a/src/modules/text_to_speech/text_to_speech.py b/src/modules/text_to_speech/text_to_speech.py
index 8c25eb8..9bd579b 100644
--- a/src/modules/text_to_speech/text_to_speech.py
+++ b/src/modules/text_to_speech/text_to_speech.py
@@ -1,7 +1,9 @@
 import asyncio
+import logging
 import os
-import re
+import queue
 import sys
+import uuid
 from typing import AsyncGenerator
 
 import numpy as np
@@ -12,6 +14,19 @@
 
 from .events import Audio, Token
 
+logger = logging.getLogger("ray.serve")
+logger.setLevel(os.environ.get("HURI_TTS_LOG_LEVEL", "INFO").upper())
+
+
+def _trace(msg: str) -> None:
+    """Belt-and-braces log: hits both the ray.serve logger AND stdout.
+
+    Ray Serve captures stdout per replica and surfaces it in the dashboard's
+    Logs tab — that's the path that survives any logger misconfiguration.
+    """
+    logger.info(msg)
+    print(f"[TTS] {msg}", flush=True)
+
 
 # Defaults — overridden by env vars in production (see README.md)
 _MODEL_PATH = os.environ.get("HURI_MODEL_PATH", "/models/cosytts/iic/CosyVoice2-0.5B")
@@ -20,132 +35,198 @@
     "HURI_VOICE_TRANSCRIPT", "Hello, this is my voice sample for cloning."
 )
 
-# Hard endings (.!?) trigger synthesis immediately; soft endings (,;:) only after
-# min_clause_chars are buffered, to avoid synthesizing very short fragments.
-_HARD_END_RE = re.compile(r'[.!?]["\']?\s+')
-_SOFT_END_RE = re.compile(r'[,;:]\s+')
-
-_DONE = object()  # sentinel for exhausted sync generator
+_END_TEXT = object()   # sentinel pushed into the text queue to close synth
+_END_AUDIO = object()  # sentinel pushed into the audio queue when synth completes
+_DONE = object()       # sentinel for exhausted sync generator
 
 
-@serve.deployment(name="TTS")
+@serve.deployment(name="TTS", max_ongoing_requests=200)
 class TTSDeployment:
+    """CosyVoice2 wrapper with per-session bistream synthesis.
+
+    The model's `inference_zero_shot` accepts a Python generator as `tts_text`
+    and yields audio chunks as text arrives — that's the "bistream" mode.
+    Because the model call is fully synchronous, each session runs in a thread
+    via `run_in_executor` and is fed by a thread-safe `queue.Queue` that the
+    asyncio side pushes text into.
+    """
+
     def __init__(
         self,
         model_path: str = _MODEL_PATH,
         voice_sample_path: str = _VOICE_SAMPLE_PATH,
         voice_sample_transcript: str = _VOICE_SAMPLE_TRANSCRIPT,
     ):
+        _trace(f"TTSDeployment init: model_path={model_path} voice={voice_sample_path}")
+
         cosy_dir = os.environ.get("HURI_COSY_DIR")
         if cosy_dir:
             matcha_path = os.path.join(cosy_dir, "third_party", "Matcha-TTS")
             if os.path.isdir(matcha_path) and matcha_path not in sys.path:
                 sys.path.insert(0, matcha_path)
+                logger.debug("Added Matcha-TTS path to sys.path: %s", matcha_path)
 
         from cosyvoice.cli.cosyvoice import CosyVoice2
-        from cosyvoice.utils.file_utils import load_wav
 
         self.model = CosyVoice2(model_path, load_jit=False, load_trt=False)
         self.sample_rate: int = self.model.sample_rate
+        _trace(f"CosyVoice2 loaded (sample_rate={self.sample_rate})")
 
-        self.prompt_speech = load_wav(voice_sample_path, 16000)
+        self.prompt_speech = voice_sample_path
         self.prompt_text: str = voice_sample_transcript
 
-    async def synthesize(self, text: str) -> AsyncGenerator[Audio, None]:
-        """Run CosyVoice2 streaming inference and yield Audio chunks.
-
-        The synchronous CosyVoice2 generator runs in a thread-pool executor so
-        it does not block the asyncio event loop between chunks.
-        """
-        loop = asyncio.get_running_loop()
-        gen = self.model.inference_zero_shot(
-            text,
-            self.prompt_text,
-            self.prompt_speech,
-            stream=True,
-        )
-        while True:
-            result = await loop.run_in_executor(None, next, gen, _DONE)
-            if result is _DONE:
-                break
-            yield Audio(
-                data=result["tts_speech"].squeeze(0).numpy().astype(np.float32),
-                sample_rate=self.sample_rate,
-            )
+        self._text_queues: dict[str, queue.Queue] = {}
 
     async def get_sample_rate(self) -> int:
         return self.sample_rate
 
+    async def start_session(self, session_id: str) -> None:
+        self._text_queues[session_id] = queue.Queue()
+        _trace(f"[{session_id}] session started (active={len(self._text_queues)})")
+
+    async def push_text(self, session_id: str, text: str, end: bool) -> None:
+        q = self._text_queues.get(session_id)
+        if q is None:
+            _trace(f"[{session_id}] WARNING push_text on unknown session (text={text!r} end={end})")
+            return
+        if text:
+            q.put(text)
+            _trace(f"[{session_id}] push_text {text!r} (qsize={q.qsize()})")
+        if end:
+            q.put(_END_TEXT)
+            _trace(f"[{session_id}] push_text: end-of-stream sentinel")
+
+    async def stream_audio(self, session_id: str) -> AsyncGenerator[Audio, None]:
+        text_q = self._text_queues[session_id]
+        loop = asyncio.get_running_loop()
+        chunk_count = 0
+        _trace(f"[{session_id}] stream_audio: starting CosyVoice inference")
+
+        def text_gen():
+            while True:
+                item = text_q.get()
+                if item is _END_TEXT:
+                    _trace(f"[{session_id}] text_gen: received end sentinel")
+                    return
+                _trace(f"[{session_id}] text_gen yielding: {item!r}")
+                yield item
+
+        try:
+            audio_iter = self.model.inference_zero_shot(
+                text_gen(),
+                self.prompt_text,
+                self.prompt_speech,
+                stream=True,
+            )
+            while True:
+                result = await loop.run_in_executor(None, next, audio_iter, _DONE)
+                if result is _DONE:
+                    break
+                assert isinstance(result, dict)
+                chunk_count += 1
+                speech = result["tts_speech"].squeeze(0).numpy().astype(np.float32)
+                _trace(
+                    f"[{session_id}] audio chunk #{chunk_count}: "
+                    f"{speech.shape[0]} samples (~{speech.shape[0] / self.sample_rate:.2f}s)"
+                )
+                yield Audio(data=speech, sample_rate=self.sample_rate)
+        except Exception as e:
+            _trace(f"[{session_id}] stream_audio FAILED: {e!r}")
+            logger.exception("[%s] stream_audio failed", session_id)
+            raise
+        finally:
+            self._text_queues.pop(session_id, None)
+            _trace(f"[{session_id}] stream_audio finished (chunks={chunk_count})")
 
-class TTS(ModuleWithHandle):
-    """TTS Module
-
-    Stream text tokens in, stream audio chunks out using CosyVoice2 zero-shot
-    voice cloning.
 
-    Buffers incoming tokens and synthesizes as soon as a sentence or clause
-    boundary is detected. Audio chunks are yielded immediately as CosyVoice2
-    produces them, so playback can start before synthesis is complete.
+class TTS(ModuleWithHandle):
+    """TTS Module — bistream tokens-in / audio-out via CosyVoice2.
 
-    Compatible with both the Ray Serve event graph (async generator support in
-    EventGraph._run) and direct client streaming.
+    Opens one synthesis session per utterance (delimited by `token.end`). Each
+    incoming token is pushed straight into the model's text generator so audio
+    starts coming back before the LLM has finished producing the response.
+    No clause buffering on our side — CosyVoice's frontend handles segmentation
+    and stitches LM calls together across the whole utterance.
 
-    input: token (Token),
+    input: token (Token)
     output: audio (Audio)
-
-    :min_clause_chars: minimum buffer length before a soft boundary (,;:)
-        triggers synthesis. Hard endings (.!?) always trigger immediately.
-        Raise this value to produce longer, more natural-sounding segments.
     """
 
     _handle_cls = TTSDeployment
     input_type = "token"
     output_type = "audio"
 
-    def __init__(
-        self,
-        _handle: handle.DeploymentHandle,
-        min_clause_chars: int = 20,
-    ):
+    def __init__(self, _handle: handle.DeploymentHandle):
         super().__init__(_handle)
-        self.min_clause_chars: int = min_clause_chars
-        self._buffer: str = ""
+        self._session_id: str | None = None
+        self._audio_q: asyncio.Queue | None = None
+        self._stream_task: asyncio.Task | None = None
+        self._session_ready: asyncio.Event | None = None
 
     async def process(self, token: Token) -> AsyncGenerator[Audio, None]:  # type: ignore[override]
-        self._buffer += token.text
-
-        # Drain all complete clauses from the buffer before waiting for more tokens
-        while True:
-            clause, remainder = self._split(self._buffer)
-            if not clause:
-                break
-            self._buffer = remainder
-            async for chunk in self._handle.synthesize.remote(clause):
-                yield chunk
-
-        # Flush the remaining buffer when the LLM stream ends
-        if token.end and self._buffer.strip():
-            async for chunk in self._handle.synthesize.remote(self._buffer.strip()):
-                yield chunk
-            self._buffer = ""
-        if token.end:
+        # Subsequent tokens within an utterance just push text — the first
+        # token's invocation is the long-running yielder that emits chunks as
+        # soon as CosyVoice produces them, decoupled from token arrival.
+        if self._session_id is not None:
+            sid = self._session_id
+            ready = self._session_ready
+            if ready is not None:
+                await ready.wait()
+            print(f"[TTS-client] [{sid}] push token: {token.text!r} (end={token.end})", flush=True)
+            await self._handle.push_text.remote(sid, token.text, token.end)
+            return
+
+        self._session_id = str(uuid.uuid4())
+        self._session_ready = asyncio.Event()
+        sid = self._session_id
+        audio_q: asyncio.Queue = asyncio.Queue()
+        self._audio_q = audio_q
+        print(f"[TTS-client] [{sid}] opening new utterance session", flush=True)
+        await self._handle.start_session.remote(sid)
+        self._stream_task = asyncio.create_task(self._drain_audio(sid, audio_q))
+        self._session_ready.set()
+
+        print(f"[TTS-client] [{sid}] push token: {token.text!r} (end={token.end})", flush=True)
+        await self._handle.push_text.remote(sid, token.text, token.end)
+
+        try:
+            count = 0
+            while True:
+                item = await audio_q.get()
+                if item is _END_AUDIO:
+                    break
+                count += 1
+                print(f"[TTS-client] [{sid}] yield chunk #{count}", flush=True)
+                yield item
+            await self._stream_task
+            print(f"[TTS-client] [{sid}] utterance complete ({count} chunks)", flush=True)
+
             sample_rate = await self._handle.get_sample_rate.remote()
             yield Audio(data=np.array([], dtype=np.float32), sample_rate=sample_rate, end=True)
-
-    def _split(self, text: str) -> tuple[str, str]:
-        """Return (clause_to_synthesize, remaining_buffer).
-
-        Splits on the first hard sentence ending (.!?) unconditionally, or on
-        the first soft clause ending (,;:) once the buffer is long enough.
-        Returns ("", text) when no boundary is found.
-        """
-        m = _HARD_END_RE.search(text)
-        if m:
-            return text[: m.end()].strip(), text[m.end() :]
-
-        if len(text) >= self.min_clause_chars:
-            m = _SOFT_END_RE.search(text)
-            if m:
-                return text[: m.end()].strip(), text[m.end() :]
-
-        return "", text
+        finally:
+            self._session_id = None
+            self._audio_q = None
+            self._stream_task = None
+            self._session_ready = None
+
+    async def _drain_audio(self, session_id: str, audio_q: asyncio.Queue) -> None:
+        try:
+            response = self._handle.options(stream=True).stream_audio.remote(session_id)
+            count = 0
+            pts = 0.0
+            async for audio in response:  # type: ignore[attr-defined]
+                count += 1
+                audio.pts = pts
+                pts += audio.data.shape[0] / audio.sample_rate
+                print(
+                    f"[TTS-client] [{session_id}] drain received chunk #{count} "
+                    f"pts={audio.pts:.3f}s next={pts:.3f}s",
+                    flush=True,
+                )
+                await audio_q.put(audio)
+        except Exception as e:
+            print(f"[TTS-client] [{session_id}] drain task FAILED: {e!r}", flush=True)
+            raise
+        finally:
+            await audio_q.put(_END_AUDIO)
+            print(f"[TTS-client] [{session_id}] drain task finished", flush=True)

From 080f3422ab4662e955020ba92693d05e63c1096c Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Fri, 29 May 2026 18:13:18 +0200
Subject: [PATCH 11/31] feated(stt): using kube PVC to store whisper model

---
 .../templates/rayservice.yaml                 | 12 ++-
 .../templates/whisper-model-init-job.yaml     | 84 +++++++++++++++++++
 src/modules/speech_to_text/speech_to_text.py  | 14 +++-
 3 files changed, 107 insertions(+), 3 deletions(-)
 create mode 100644 deploy/examples/local_nvidia_amd/templates/whisper-model-init-job.yaml

diff --git a/deploy/examples/local_nvidia_amd/templates/rayservice.yaml b/deploy/examples/local_nvidia_amd/templates/rayservice.yaml
index 318dac8..1922d2e 100644
--- a/deploy/examples/local_nvidia_amd/templates/rayservice.yaml
+++ b/deploy/examples/local_nvidia_amd/templates/rayservice.yaml
@@ -5,7 +5,7 @@ metadata:
   labels:
     {{- include "huri.labels" . | nindent 4 }}
   annotations:
-    ray.io/initializing-timeout: "10m"
+    ray.io/initializing-timeout: "20m"
 spec:
   serveConfigV2: |
 {{ .Values.ray.serveConfig | indent 4 }}
@@ -98,6 +98,12 @@ spec:
                 emptyDir:
                   medium: Memory
                   sizeLimit: {{ .shmSize | default "1Gi" }}
+              {{- if .cudaCacheHostPath }}
+              - name: cuda-cache
+                hostPath:
+                  path: {{ .cudaCacheHostPath }}
+                  type: DirectoryOrCreate
+              {{- end }}
               {{- range .mountedModels }}
               {{- $model := index $.Values.models . }}
               {{- if $model.enabled }}
@@ -166,6 +172,10 @@ spec:
                 volumeMounts:
                   - name: dshm
                     mountPath: /dev/shm
+                  {{- if .cudaCacheHostPath }}
+                  - name: cuda-cache
+                    mountPath: /home/ray/.nv
+                  {{- end }}
                   {{- range .mountedModels }}
                   {{- $model := index $.Values.models . }}
                   {{- if $model.enabled }}
diff --git a/deploy/examples/local_nvidia_amd/templates/whisper-model-init-job.yaml b/deploy/examples/local_nvidia_amd/templates/whisper-model-init-job.yaml
new file mode 100644
index 0000000..098f7f7
--- /dev/null
+++ b/deploy/examples/local_nvidia_amd/templates/whisper-model-init-job.yaml
@@ -0,0 +1,84 @@
+{{- if .Values.models.whisper.enabled }}
+{{- $model := .Values.models.whisper }}
+{{- $pvcName := printf "%s-whisper-models" (include "huri.fullname" .) }}
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: {{ $pvcName }}
+  labels:
+    {{- include "huri.labels" . | nindent 4 }}
+  annotations:
+    "helm.sh/hook": pre-install,pre-upgrade
+    "helm.sh/hook-weight": "-10"
+    "helm.sh/resource-policy": keep
+spec:
+  accessModes:
+    {{- toYaml $model.pvc.accessModes | nindent 4 }}
+  resources:
+    requests:
+      storage: {{ $model.pvc.size }}
+  {{- if $model.pvc.storageClassName }}
+  storageClassName: {{ $model.pvc.storageClassName }}
+  {{- end }}
+---
+# Runs only on first install (not on upgrade) — model is already on the PVC.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: {{ include "huri.fullname" . }}-whisper-init
+  labels:
+    {{- include "huri.labels" . | nindent 4 }}
+  annotations:
+    "helm.sh/hook": pre-install,pre-upgrade
+    "helm.sh/hook-weight": "-5"
+    "helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation
+spec:
+  backoffLimit: 3
+  template:
+    metadata:
+      labels:
+        {{- include "huri.selectorLabels" . | nindent 8 }}
+    spec:
+      restartPolicy: OnFailure
+      {{- with $model.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      volumes:
+        - name: models
+          persistentVolumeClaim:
+            claimName: {{ include "huri.fullname" . }}-whisper-models
+      containers:
+        - name: whisper-downloader
+          image: python:3.11-slim
+          command: ["/bin/sh", "-c"]
+          args:
+            - |
+              set -e
+              MODEL_DIR="{{ $model.mountPath }}/{{ $model.modelSource.repoId }}"
+              if [ -f "$MODEL_DIR/model.bin" ]; then
+                echo "Model already present at $MODEL_DIR — skipping download."
+                exit 0
+              fi
+              echo "Downloading {{ $model.modelSource.repoId }} into $MODEL_DIR …"
+              pip install --quiet huggingface_hub
+              python - <<'PYEOF'
+              from huggingface_hub import snapshot_download
+              snapshot_download(
+                  "{{ $model.modelSource.repoId }}",
+                  local_dir="{{ $model.mountPath }}/{{ $model.modelSource.repoId }}",
+              )
+              PYEOF
+              echo "Download complete."
+          volumeMounts:
+            - name: models
+              mountPath: {{ $model.mountPath }}
+          resources:
+            requests:
+              cpu: "500m"
+              memory: "512Mi"
+            limits:
+              cpu: "2"
+              memory: "1Gi"
+{{- end }}
diff --git a/src/modules/speech_to_text/speech_to_text.py b/src/modules/speech_to_text/speech_to_text.py
index fa3b790..29c3ad8 100644
--- a/src/modules/speech_to_text/speech_to_text.py
+++ b/src/modules/speech_to_text/speech_to_text.py
@@ -1,8 +1,8 @@
 import asyncio
+import os
 from typing import List, Optional
 
 import numpy as np
-from faster_whisper import WhisperModel
 from ray import serve
 from ray.serve import handle
 
@@ -10,6 +10,8 @@
 
 from .events import Transcript, Voice
 
+_MODEL_PATH = os.environ.get("HURI_STT_MODEL_PATH", "base")
+
 
 @serve.deployment(name="STT")
 class STTDeployment:
@@ -18,19 +20,27 @@ class STTDeployment:
     Holds the faster-whisper model in a single Ray actor (pinned to the AMD
     worker in deployment configs). Exposes a single transcribe() call so
     per-session STT clients can offload the heavy work without owning a GPU.
+
+    HURI_STT_MODEL_PATH: path to a local faster-whisper model directory (from
+    the whisper PVC). Falls back to "base" which triggers a HuggingFace
+    download — only acceptable for local dev without a PVC.
     """
 
     def __init__(
         self,
-        model: str = "base",
+        model: str = _MODEL_PATH,
         device: str = "auto",
         compute_type: str = "auto",
     ):
+        print(f"[STT] loading model from {model!r} (device={device} compute_type={compute_type})", flush=True)
+        from faster_whisper import WhisperModel
+
         self.model_faster = WhisperModel(
             model,
             device=device,
             compute_type=compute_type,
         )
+        print(f"[STT] model loaded", flush=True)
         self.language = "en"
 
     async def transcribe(self, audio: np.ndarray) -> str:

From 5146a0efd9fb1b650cf521f89fbda1e0555ee4d8 Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Fri, 29 May 2026 18:14:28 +0200
Subject: [PATCH 12/31] feat(kube): ingress template to avoid port forwarding
 huri for the socket and the dashboard

---
 .../local_nvidia_amd/templates/_helpers.tpl      |  8 ++++++++
 .../templates/head-dashboard-svc.yaml            | 16 ++++++++++++++++
 .../templates/ingress-dashboard.yaml             |  3 +--
 3 files changed, 25 insertions(+), 2 deletions(-)
 create mode 100644 deploy/examples/local_nvidia_amd/templates/head-dashboard-svc.yaml

diff --git a/deploy/examples/local_nvidia_amd/templates/_helpers.tpl b/deploy/examples/local_nvidia_amd/templates/_helpers.tpl
index e15a832..cb431d4 100644
--- a/deploy/examples/local_nvidia_amd/templates/_helpers.tpl
+++ b/deploy/examples/local_nvidia_amd/templates/_helpers.tpl
@@ -62,3 +62,11 @@ KubeRay appends "-head-svc" to the RayService name.
 {{- define "huri.headSvcName" -}}
 {{- printf "%s-head-svc" (include "huri.fullname" .) }}
 {{- end }}
+
+{{/*
+Name of the stable dashboard service managed by this chart.
+Selects the head pod via stable labels, avoiding KubeRay's random-suffix service.
+*/}}
+{{- define "huri.headDashboardSvcName" -}}
+{{- printf "%s-head-dashboard-svc" (include "huri.fullname" .) }}
+{{- end }}
diff --git a/deploy/examples/local_nvidia_amd/templates/head-dashboard-svc.yaml b/deploy/examples/local_nvidia_amd/templates/head-dashboard-svc.yaml
new file mode 100644
index 0000000..9b47431
--- /dev/null
+++ b/deploy/examples/local_nvidia_amd/templates/head-dashboard-svc.yaml
@@ -0,0 +1,16 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "huri.fullname" . }}-head-dashboard-svc
+  labels:
+    {{- include "huri.labels" . | nindent 4 }}
+spec:
+  type: ClusterIP
+  selector:
+    ray.io/node-type: head
+    ray.io/group: headgroup
+    app.kubernetes.io/instance: {{ .Release.Name }}
+  ports:
+    - name: dashboard
+      port: 8265
+      targetPort: 8265
diff --git a/deploy/examples/local_nvidia_amd/templates/ingress-dashboard.yaml b/deploy/examples/local_nvidia_amd/templates/ingress-dashboard.yaml
index c8153df..dc91299 100644
--- a/deploy/examples/local_nvidia_amd/templates/ingress-dashboard.yaml
+++ b/deploy/examples/local_nvidia_amd/templates/ingress-dashboard.yaml
@@ -25,8 +25,7 @@ spec:
             pathType: Prefix
             backend:
               service:
-                # KubeRay creates <rayservice-name>-head-svc for the head node.
-                name: {{ include "huri.headSvcName" . }}
+                name: {{ include "huri.headDashboardSvcName" . }}
                 port:
                   number: 8265
 {{- end }}

From d17f6bf15527c9c82c620517011393be66cd3d35 Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Fri, 29 May 2026 18:15:32 +0200
Subject: [PATCH 13/31] feat(kube): new values to support new ingress and stt
 cache system (+ avoid HF to redownload everytime models)

---
 deploy/examples/local_nvidia_amd/values.yaml | 55 ++++++++++++++------
 1 file changed, 40 insertions(+), 15 deletions(-)

diff --git a/deploy/examples/local_nvidia_amd/values.yaml b/deploy/examples/local_nvidia_amd/values.yaml
index 8f7a3e5..3e4d0a1 100644
--- a/deploy/examples/local_nvidia_amd/values.yaml
+++ b/deploy/examples/local_nvidia_amd/values.yaml
@@ -31,7 +31,6 @@ ray:
         runtime_env:
           env_vars:
             RAY_COLOR_PREFIX: "1"
-            HURI_RAG_VERIFY_SSL: "false"
         deployments:
           # HuRI: FastAPI/WebSocket ingress + per-session router. CPU only —
           # all GPU work is offloaded to handle-backed deployments below.
@@ -46,17 +45,21 @@ ray:
               num_cpus: 1
               num_gpus: 0.5
               resources: {"GPU_TYPE_AMD": 0.5}
-          # RAG: embeddings (sentence-transformers) + LLM client. Pinned to AMD.
+          # RAG: embeddings (API) + LLM client. Pinned to AMD for its dependencies, no GPU needed.
           - name: RAGHandle
             num_replicas: 1
             ray_actor_options:
               num_cpus: 1
-              num_gpus: 0.5
-              resources: {"GPU_TYPE_AMD": 0.5}
-            init_kwargs:
+              num_gpus: 0
+              resources: {"GPU_TYPE_AMD": 0.001}
+            user_config:
               qdrant_url: "https://qdrant.pommier.lan"
-              llm_url: "https://llm.pommier.lan"
+              llm_url: "https://llm.huri.lan"
+              embedding_url: "https://embedding.huri.lan"
               embedding_model: "bge-large-en-v1.5-gguf-Q4_K_M"
+              llm_provider: "vllm"
+              llm_model: "Qwen3.5-4B-GGUF"
+              verify_ssl: false
           - name: TTS
             ray_actor_options:
               num_cpus: 1
@@ -120,6 +123,7 @@ workerGroups:
     minReplicas: 1
     maxReplicas: 1
     mountVoiceAssets: true
+    cudaCacheHostPath: /var/cache/huri/cuda-nvidia
     nodeSelector:
       gpu: nvidia
     # affinity: {}  # optional
@@ -133,6 +137,8 @@ workerGroups:
         value: "all"
       - name: NVIDIA_DRIVER_CAPABILITIES
         value: "compute,utility"
+      - name: HF_HUB_DOWNLOAD_TIMEOUT
+        value: "10"
     # Models whose PVC will be mounted in this worker group.
     # Keys must match entries under .Values.models.
     mountedModels:
@@ -169,11 +175,14 @@ workerGroups:
     containerEnv:
       - name: HSA_OVERRIDE_GFX_VERSION
         value: "11.5.1"
+      - name: HF_HUB_DOWNLOAD_TIMEOUT
+        value: "10"
     securityContext:
       seLinuxOptions:
         type: "spc_t"
       # privileged: true  # Uncomment if spc_t still gets blocked by Fedora
-    mountedModels: []
+    mountedModels:
+      - whisper
     resources:
       limits:
         cpu: "4"
@@ -238,6 +247,22 @@ models:
       # Path to the CosyVoice repo root containing third_party/Matcha-TTS.
       HURI_COSY_DIR: /app/cosyvoice
 
+  whisper:
+    enabled: true
+    nodeSelector:
+      gpu: amd
+    pvc:
+      storageClassName: ""
+      size: 2Gi
+      accessModes:
+        - ReadWriteOnce
+    mountPath: /models/whisper
+    modelSource:
+      type: huggingface
+      repoId: Systran/faster-whisper-base
+    env:
+      HURI_STT_MODEL_PATH: /models/whisper/Systran/faster-whisper-base
+
   emage:
     enabled: true
     nodeSelector:
@@ -275,13 +300,13 @@ voiceAssets:
 
 # Ingress for the Ray Serve endpoint (port 8000).
 ingress:
-  enabled: false
+  enabled: true
   className: nginx
-  annotations: {}
-    # nginx.ingress.kubernetes.io/proxy-read-timeout: "3600"
-    # nginx.ingress.kubernetes.io/proxy-send-timeout: "3600"
-    # nginx.ingress.kubernetes.io/proxy-buffering: "off"
-  host: huri.example.com
+  annotations:
+    nginx.ingress.kubernetes.io/proxy-read-timeout: "3600"
+    nginx.ingress.kubernetes.io/proxy-send-timeout: "3600"
+    nginx.ingress.kubernetes.io/proxy-buffering: "off"
+  host: huri.lan
   tls: []
   # - secretName: huri-tls
   #   hosts:
@@ -291,10 +316,10 @@ ingress:
 # expose publicly without additional auth.
 dashboard:
   ingress:
-    enabled: false
+    enabled: true
     className: nginx
     annotations: {}
-    host: huri-dashboard.example.com
+    host: dashboard.huri.lan
     tls: []
 
 # Set to true to let this chart manage the KubeRay operator as a sub-chart.

From 69c614bde15e1a8946590483d132c3149544aebb Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Sun, 31 May 2026 17:56:33 +0200
Subject: [PATCH 14/31] fix(tts): text to speech missing tokens in audio

---
 deploy/examples/local_nvidia_amd/values.yaml |  30 +++-
 src/modules/gesture/gesture.py               | 168 +++++++++++++++++--
 src/modules/text_to_speech/text_to_speech.py |  59 ++++---
 3 files changed, 215 insertions(+), 42 deletions(-)

diff --git a/deploy/examples/local_nvidia_amd/values.yaml b/deploy/examples/local_nvidia_amd/values.yaml
index 3e4d0a1..8f67376 100644
--- a/deploy/examples/local_nvidia_amd/values.yaml
+++ b/deploy/examples/local_nvidia_amd/values.yaml
@@ -31,6 +31,15 @@ ray:
         runtime_env:
           env_vars:
             RAY_COLOR_PREFIX: "1"
+            # Gesture sliding-window defaults. The Gesture *module* runs in the
+            # HuRI (CPU) actor, so these live app-wide here rather than on the
+            # nvidia worker. context_sec primes EMAGE for continuity across
+            # audio chunks; min_chunk_sec coalesces tiny TTS chunks so fewer,
+            # bounded inferences run (lower latency, smoother motion). Can be
+            # overridden per session via the gesture module `args` in the
+            # client config.
+            HURI_GESTURE_CONTEXT_SEC: "2.0"
+            HURI_GESTURE_MIN_CHUNK_SEC: "0.5"
         deployments:
           # HuRI: FastAPI/WebSocket ingress + per-session router. CPU only —
           # all GPU work is offloaded to handle-backed deployments below.
@@ -60,16 +69,23 @@ ray:
               llm_provider: "vllm"
               llm_model: "Qwen3.5-4B-GGUF"
               verify_ssl: false
+          # GPU split (manual override knob): TTS and Gesture share one NVIDIA
+          # GPU. num_gpus/resources are Ray *scheduling* fractions — they let
+          # both replicas pack onto the same device and bias the split. Audio
+          # (TTS) gets the lion's share so streamed speech stays low-latency;
+          # gesture is given the remainder. Tune these two pairs together so
+          # they sum to <= 1.0. To also cap gesture's actual VRAM allocation,
+          # set HURI_GESTURE_GPU_MEM_FRACTION on the nvidia worker (see below).
           - name: TTS
             ray_actor_options:
               num_cpus: 1
-              num_gpus: 0.5
-              resources: {"GPU_TYPE_NVIDIA": 0.5}
+              num_gpus: 0.8
+              resources: {"GPU_TYPE_NVIDIA": 0.8}
           - name: GestureGeneration
             ray_actor_options:
               num_cpus: 1
-              num_gpus: 0.5
-              resources: {"GPU_TYPE_NVIDIA": 0.5}
+              num_gpus: 0.2
+              resources: {"GPU_TYPE_NVIDIA": 0.2}
 
 head:
   # ClusterIP is preferred on real clusters; use NodePort for kind/minikube/k3s.
@@ -139,6 +155,12 @@ workerGroups:
         value: "compute,utility"
       - name: HF_HUB_DOWNLOAD_TIMEOUT
         value: "10"
+      # Manual GPU split for gesture (see GestureGeneration above). Caps the
+      # EMAGE process to a fraction of GPU memory so TTS keeps the rest. "0"
+      # disables the cap. Keep roughly in line with its num_gpus fraction.
+      # Read by GestureGeneration, which runs in this worker group.
+      - name: HURI_GESTURE_GPU_MEM_FRACTION
+        value: "0.2"
     # Models whose PVC will be mounted in this worker group.
     # Keys must match entries under .Values.models.
     mountedModels:
diff --git a/src/modules/gesture/gesture.py b/src/modules/gesture/gesture.py
index 39a2560..f939d38 100644
--- a/src/modules/gesture/gesture.py
+++ b/src/modules/gesture/gesture.py
@@ -13,6 +13,16 @@
 
 _HF_REPO = os.environ.get("HURI_EMAGE_REPO", "H-Liu1997/emage_audio")
 _EMAGE_SR = 16000  # EMAGE expects 16 kHz mono audio
+_EMAGE_FPS = 30    # EMAGE emits motion at 30 fps
+
+# Sliding-window defaults. Overridable per-deployment via the module `args`
+# block in the client config, or globally via the env vars below.
+_CONTEXT_SEC = float(os.environ.get("HURI_GESTURE_CONTEXT_SEC", "2.0"))
+_MIN_CHUNK_SEC = float(os.environ.get("HURI_GESTURE_MIN_CHUNK_SEC", "0.5"))
+
+# Optional manual GPU split: cap the gesture process to a fraction of the GPU so
+# TTS keeps the lion's share. Only applied on CUDA when the value is set (>0).
+_GPU_MEM_FRACTION = float(os.environ.get("HURI_GESTURE_GPU_MEM_FRACTION", "0.0"))
 
 
 @dataclass
@@ -20,7 +30,7 @@ class Motion:
     poses: np.ndarray        # (t, 165)  SMPL-X axis-angle, 55 joints × 3
     expressions: np.ndarray  # (t, 100)  facial expression coefficients
     trans: np.ndarray        # (t, 3)    global root translation
-    fps: int = 30
+    fps: int = _EMAGE_FPS
     pts: float = 0.0         # presentation timestamp in seconds, paired with Audio.pts
 
 
@@ -30,6 +40,7 @@ def __init__(
         self,
         hf_repo: str = _HF_REPO,
         device: Optional[str] = None,
+        gpu_mem_fraction: float = _GPU_MEM_FRACTION,
     ):
         print(f"[Gesture] importing torch...", flush=True)
         import torch
@@ -41,6 +52,21 @@ def __init__(
         )
         print(f"[Gesture] device={self.device} hf_repo={hf_repo!r}", flush=True)
 
+        # Manual GPU split: cap this process' share of GPU memory so the audio
+        # (TTS) path keeps the rest. num_gpus in the Ray serveConfig handles
+        # scheduling/packing; this caps actual allocation on the device.
+        if self.device.type == "cuda" and gpu_mem_fraction > 0:
+            try:
+                torch.cuda.set_per_process_memory_fraction(
+                    gpu_mem_fraction, self.device.index or 0
+                )
+                print(
+                    f"[Gesture] GPU memory fraction capped at {gpu_mem_fraction:.2f}",
+                    flush=True,
+                )
+            except Exception as e:  # noqa: BLE001 — best-effort knob, never fatal
+                print(f"[Gesture] WARNING could not cap GPU memory: {e!r}", flush=True)
+
         print("[Gesture] loading face_vq...", flush=True)
         face_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/face").to(self.device)
         print("[Gesture] loading upper_vq...", flush=True)
@@ -111,15 +137,32 @@ class Gesture(ModuleWithHandle):
     """Gesture Module
 
     Consumes streaming Audio chunks produced by TTS and generates whole-body
-    SMPL-X motion using the EMAGE audio-to-gesture model. Inference runs once
-    per chunk so Motion events interleave with audio playback instead of all
-    arriving at the end of the utterance.
+    SMPL-X motion using the EMAGE audio-to-gesture model.
+
+    Sliding window
+    ──────────────
+    TTS emits short, uneven audio chunks. Running EMAGE on each chunk in
+    isolation produces motion that is jerky at chunk seams (the model has no
+    context across boundaries) and is slow because the per-chunk overhead is
+    re-paid for tiny inputs — and gets worse the longer the utterance runs if
+    naively re-fed the whole buffer.
+
+    Instead we keep a rolling buffer and, each time at least ``min_chunk_sec``
+    of fresh audio has arrived, run inference over a window of
+    ``[context_sec of already-spoken audio] + [the fresh audio]``. The context
+    primes the model so the seam is continuous; only the motion frames for the
+    fresh audio are emitted. The window length is bounded by
+    ``context_sec + chunk size`` so inference cost stays flat regardless of
+    utterance length. Global root translation is rebased onto the previously
+    emitted frame to avoid a jump every window.
 
     input:  audio (Audio)
     output: motion (Motion)
 
-    :hf_repo: HuggingFace repository to load EMAGE weights from.
-    :device:  PyTorch device string; defaults to CUDA when available.
+    :hf_repo:      HuggingFace repository to load EMAGE weights from.
+    :device:       PyTorch device string; defaults to CUDA when available.
+    :context_sec:  Seconds of prior audio prepended to each window for continuity.
+    :min_chunk_sec: Minimum seconds of fresh audio to accumulate before inferring.
     """
 
     _handle_cls = GestureDeployment
@@ -129,14 +172,115 @@ class Gesture(ModuleWithHandle):
     def __init__(
         self,
         _handle: handle.DeploymentHandle,
+        context_sec: float = _CONTEXT_SEC,
+        min_chunk_sec: float = _MIN_CHUNK_SEC,
     ):
         super().__init__(_handle)
+        self._context_sec = float(context_sec)
+        self._min_chunk_sec = float(min_chunk_sec)
+
+        # Per-utterance sliding-window state. All sample counts are in the
+        # source sample rate; resampling to 16 kHz happens once inside infer().
+        self._lock = asyncio.Lock()
+        self._sr: Optional[int] = None
+        self._buffer = np.empty(0, dtype=np.float32)  # trailing audio (ctx + unprocessed)
+        self._buf_start = 0       # source-sr sample index of buffer[0] in utterance timeline
+        self._emitted = 0         # source-sr samples whose motion has been emitted
+        self._trans_anchor: Optional[np.ndarray] = None  # last emitted trans, for continuity
+
+    def _reset(self) -> None:
+        self._sr = None
+        self._buffer = np.empty(0, dtype=np.float32)
+        self._buf_start = 0
+        self._emitted = 0
+        self._trans_anchor = None
 
     async def process(self, audio: Audio) -> AsyncGenerator[Motion, None]:  # type: ignore[override]
-        if audio.data.size == 0:
-            return
-        motion = await self._handle.infer.remote(
-            audio.data.astype(np.float32), audio.sample_rate
+        # Each chunk arrives as its own process() task on the shared per-session
+        # instance, so serialise under a lock to keep the buffer ordered.
+        async with self._lock:
+            if audio.data.size > 0:
+                if self._sr is None:
+                    self._sr = audio.sample_rate
+                self._buffer = np.concatenate(
+                    [self._buffer, audio.data.astype(np.float32)]
+                )
+
+            sr = self._sr
+            end_of_utterance = audio.end
+
+            if sr is None:
+                # Nothing buffered yet (e.g. a lone end marker). Reset and bail.
+                if end_of_utterance:
+                    self._reset()
+                return
+
+            ctx_samples = int(self._context_sec * sr)
+            min_new_samples = int(self._min_chunk_sec * sr)
+
+            global_end = self._buf_start + len(self._buffer)
+            new_samples = global_end - self._emitted
+
+            # Wait for more audio unless this is the final flush of the utterance.
+            if new_samples <= 0 or (not end_of_utterance and new_samples < min_new_samples):
+                if end_of_utterance:
+                    self._reset()
+                return
+
+            motion = await self._infer_window(sr, ctx_samples, global_end)
+            if motion is not None:
+                yield motion
+
+            if end_of_utterance:
+                self._reset()
+
+    async def _infer_window(
+        self, sr: int, ctx_samples: int, global_end: int
+    ) -> Optional[Motion]:
+        # Window = [context of already-emitted audio] + [fresh audio].
+        win_start = max(self._buf_start, self._emitted - ctx_samples)
+        window = self._buffer[win_start - self._buf_start :]
+        if window.size == 0:
+            return None
+
+        motion: Motion = await self._handle.infer.remote(window, sr)
+        total_frames = motion.poses.shape[0]
+
+        # Drop the leading frames that correspond to the context (already emitted).
+        skip_sec = (self._emitted - win_start) / sr
+        skip_frames = int(round(skip_sec * motion.fps))
+        skip_frames = max(0, min(skip_frames, total_frames))
+
+        poses = motion.poses[skip_frames:]
+        expressions = motion.expressions[skip_frames:]
+        trans = motion.trans[skip_frames:].copy()
+
+        # Advance the timeline even if rounding left no new frames to emit.
+        self._emitted = global_end
+        self._trim_buffer(ctx_samples, global_end)
+
+        if poses.shape[0] == 0:
+            return None
+
+        # Rebase global translation onto the last emitted frame: every window
+        # restarts root motion near the origin, so without this the avatar would
+        # teleport back at each seam.
+        if self._trans_anchor is not None:
+            trans += self._trans_anchor - trans[0]
+        self._trans_anchor = trans[-1].copy()
+
+        out = Motion(
+            poses=poses,
+            expressions=expressions,
+            trans=trans,
+            fps=motion.fps,
+            pts=win_start / sr + skip_sec,  # == self._emitted_before / sr
         )
-        motion.pts = audio.pts
-        yield motion
+        return out
+
+    def _trim_buffer(self, ctx_samples: int, global_end: int) -> None:
+        # Keep only the trailing context so the next window stays bounded.
+        keep_from = global_end - ctx_samples
+        if keep_from > self._buf_start:
+            self._buffer = self._buffer[keep_from - self._buf_start :]
+            self._buf_start = keep_from
diff --git a/src/modules/text_to_speech/text_to_speech.py b/src/modules/text_to_speech/text_to_speech.py
index 9bd579b..9daa384 100644
--- a/src/modules/text_to_speech/text_to_speech.py
+++ b/src/modules/text_to_speech/text_to_speech.py
@@ -161,34 +161,41 @@ def __init__(self, _handle: handle.DeploymentHandle):
         self._session_id: str | None = None
         self._audio_q: asyncio.Queue | None = None
         self._stream_task: asyncio.Task | None = None
-        self._session_ready: asyncio.Event | None = None
+        # The EventGraph fans each token out as its own concurrent process()
+        # task on this shared instance. This lock serialises session setup and
+        # text pushes so tokens reach CosyVoice's text queue in arrival order,
+        # exactly once. asyncio.Lock wakes waiters FIFO and tokens are created
+        # in order, so order is preserved — crucially the end-of-utterance token
+        # can no longer overtake a content token (which would truncate synthesis
+        # and silently drop trailing words).
+        self._push_lock = asyncio.Lock()
 
     async def process(self, token: Token) -> AsyncGenerator[Audio, None]:  # type: ignore[override]
-        # Subsequent tokens within an utterance just push text — the first
-        # token's invocation is the long-running yielder that emits chunks as
-        # soon as CosyVoice produces them, decoupled from token arrival.
-        if self._session_id is not None:
+        # Acquire BEFORE any await so lock-acquisition order matches token order.
+        # Setup + push happen under the lock; only the first token of an
+        # utterance goes on to drain/yield audio (outside the lock, so pushes of
+        # later tokens are never blocked by the long-running drain).
+        async with self._push_lock:
+            is_first = self._session_id is None
+            if is_first:
+                self._session_id = str(uuid.uuid4())
+                self._audio_q = asyncio.Queue()
+                print(f"[TTS-client] [{self._session_id}] opening new utterance session", flush=True)
+                await self._handle.start_session.remote(self._session_id)
+                self._stream_task = asyncio.create_task(
+                    self._drain_audio(self._session_id, self._audio_q)
+                )
+
             sid = self._session_id
-            ready = self._session_ready
-            if ready is not None:
-                await ready.wait()
+            audio_q = self._audio_q
+            stream_task = self._stream_task
             print(f"[TTS-client] [{sid}] push token: {token.text!r} (end={token.end})", flush=True)
             await self._handle.push_text.remote(sid, token.text, token.end)
-            return
 
-        self._session_id = str(uuid.uuid4())
-        self._session_ready = asyncio.Event()
-        sid = self._session_id
-        audio_q: asyncio.Queue = asyncio.Queue()
-        self._audio_q = audio_q
-        print(f"[TTS-client] [{sid}] opening new utterance session", flush=True)
-        await self._handle.start_session.remote(sid)
-        self._stream_task = asyncio.create_task(self._drain_audio(sid, audio_q))
-        self._session_ready.set()
-
-        print(f"[TTS-client] [{sid}] push token: {token.text!r} (end={token.end})", flush=True)
-        await self._handle.push_text.remote(sid, token.text, token.end)
+        if not is_first:
+            return
 
+        assert audio_q is not None and stream_task is not None
         try:
             count = 0
             while True:
@@ -198,16 +205,16 @@ async def process(self, token: Token) -> AsyncGenerator[Audio, None]:  # type: i
                 count += 1
                 print(f"[TTS-client] [{sid}] yield chunk #{count}", flush=True)
                 yield item
-            await self._stream_task
+            await stream_task
             print(f"[TTS-client] [{sid}] utterance complete ({count} chunks)", flush=True)
 
             sample_rate = await self._handle.get_sample_rate.remote()
             yield Audio(data=np.array([], dtype=np.float32), sample_rate=sample_rate, end=True)
         finally:
-            self._session_id = None
-            self._audio_q = None
-            self._stream_task = None
-            self._session_ready = None
+            async with self._push_lock:
+                self._session_id = None
+                self._audio_q = None
+                self._stream_task = None
 
     async def _drain_audio(self, session_id: str, audio_q: asyncio.Queue) -> None:
         try:

From 01ac7ef24902ae61927573d4a0753562eec83e82 Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Sun, 31 May 2026 17:58:38 +0200
Subject: [PATCH 15/31] feat(client): option to save streamed TTS audio to .wav
 files

---
 src/client.py      | 12 ++++++++-
 src/core/client.py | 61 +++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 69 insertions(+), 4 deletions(-)

diff --git a/src/client.py b/src/client.py
index 466cc05..d6bba63 100644
--- a/src/client.py
+++ b/src/client.py
@@ -26,11 +26,21 @@ async def launch_client():
         required=True,
         help="Path to Client config file (YAML)",
     )
+    parser.add_argument(
+        "--save-audio",
+        nargs="?",
+        const="audio_dumps",
+        default=None,
+        metavar="DIR",
+        help="Save streamed TTS audio to .wav files (one per utterance) in DIR "
+        "for quality-checking. Defaults to ./audio_dumps when the flag is given "
+        "without a value.",
+    )
 
     args = parser.parse_args()
     config = load_client_config(args.config)
 
-    await Client(config=config).run()
+    await Client(config=config, save_audio_dir=args.save_audio).run()
 
 
 if __name__ == "__main__":
diff --git a/src/core/client.py b/src/core/client.py
index b68f4bb..2af97f2 100644
--- a/src/core/client.py
+++ b/src/core/client.py
@@ -1,9 +1,13 @@
 import asyncio
 import json
 import os
+import struct
+import wave
 from dataclasses import asdict
+from datetime import datetime
 from typing import Dict, List, Optional, Type
 
+import numpy as np
 import websockets
 
 from src.core.dataclasses.config import ClientConfig
@@ -19,11 +23,22 @@ def __init__(
         config: ClientConfig,
         user_id_file: str = os.path.expanduser("~/.huri_user_id"),
         senders_dict: Dict[str, Type[ClientSender]] = get_senders(),
+        save_audio_dir: Optional[str] = None,
     ):
         self.config = config
         self.user_id_file = user_id_file
         self.senders_dict = senders_dict
 
+        # When set, incoming audio chunks are buffered per utterance and written
+        # to a .wav under this directory each time an end-of-utterance marker
+        # arrives — handy for ear-checking what the TTS actually streamed.
+        self.save_audio_dir = save_audio_dir
+        self._audio_buf: List[np.ndarray] = []
+        self._audio_sr: Optional[int] = None
+        self._audio_idx = 0
+        if save_audio_dir:
+            os.makedirs(save_audio_dir, exist_ok=True)
+
     def _load_user_id(self) -> Optional[str]:
         if os.path.exists(self.user_id_file):
             with open(self.user_id_file) as f:
@@ -34,8 +49,42 @@ def _save_user_id(self, _user_id: str):
         with open(self.user_id_file, "w") as f:
             f.write(_user_id)
 
+    def _collect_audio(self, samples: np.ndarray, sample_rate: int, end: bool) -> None:
+        if samples.size:
+            self._audio_buf.append(samples)
+            self._audio_sr = sample_rate
+        if end:
+            self._flush_audio()
+
+    def _flush_audio(self) -> None:
+        if not self._audio_buf or self._audio_sr is None:
+            self._audio_buf = []
+            return
+        audio = np.concatenate(self._audio_buf)
+        stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+        path = os.path.join(
+            self.save_audio_dir, f"utt-{self._audio_idx:03d}-{stamp}.wav"
+        )
+        self._write_wav(path, audio, self._audio_sr)
+        print(
+            f"** saved audio: {path} ({audio.size} samples, "
+            f"~{audio.size / self._audio_sr:.2f}s @ {self._audio_sr}Hz)"
+        )
+        self._audio_idx += 1
+        self._audio_buf = []
+
+    @staticmethod
+    def _write_wav(path: str, audio: np.ndarray, sample_rate: int) -> None:
+        # float32 [-1, 1] -> 16-bit PCM, clipped to avoid wraparound on overshoot.
+        pcm = np.clip(audio, -1.0, 1.0)
+        pcm = (pcm * 32767.0).astype("<i2")
+        with wave.open(path, "wb") as wav:
+            wav.setnchannels(1)
+            wav.setsampwidth(2)
+            wav.setframerate(sample_rate)
+            wav.writeframes(pcm.tobytes())
+
     async def _receive_loop(self, ws: websockets.ClientConnection):
-        import struct
         try:
             while True:
                 msg = await ws.recv()
@@ -49,11 +98,14 @@ async def _receive_loop(self, ws: websockets.ClientConnection):
 
                     if topic == "audio" and len(payload) >= 13:
                         sample_rate, end_flag, pts = struct.unpack(">IBd", payload[:13])
-                        n_samples = (len(payload) - 13) // 4
+                        # Samples are native-endian float32 (Sender uses ndarray.tobytes()).
+                        samples = np.frombuffer(payload[13:], dtype=np.float32)
                         print(
-                            f"<< audio: pts={pts:.3f}s samples={n_samples} @ {sample_rate}Hz "
+                            f"<< audio: pts={pts:.3f}s samples={samples.size} @ {sample_rate}Hz "
                             f"end={bool(end_flag)}"
                         )
+                        if self.save_audio_dir:
+                            self._collect_audio(samples, sample_rate, bool(end_flag))
                     elif topic == "motion" and len(payload) >= 16:
                         pts, fps, n_frames = struct.unpack(">dII", payload[:16])
                         print(f"<< motion: pts={pts:.3f}s frames={n_frames} @ {fps}fps")
@@ -64,6 +116,9 @@ async def _receive_loop(self, ws: websockets.ClientConnection):
 
         except (asyncio.CancelledError, websockets.ConnectionClosedOK):
             pass
+        finally:
+            if self.save_audio_dir:
+                self._flush_audio()  # save anything left if the stream ended mid-utterance
 
     async def run(self):
         async with websockets.connect(self.config.huri_url) as ws:

From 28e279fb32cbd7ea2e48fefe23b81c08f867604d Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Sun, 31 May 2026 17:59:20 +0200
Subject: [PATCH 16/31] fix(stt): missing lookup for PVC on kube init

---
 .../local_nvidia_amd/templates/whisper-model-init-job.yaml      | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/deploy/examples/local_nvidia_amd/templates/whisper-model-init-job.yaml b/deploy/examples/local_nvidia_amd/templates/whisper-model-init-job.yaml
index 098f7f7..879d908 100644
--- a/deploy/examples/local_nvidia_amd/templates/whisper-model-init-job.yaml
+++ b/deploy/examples/local_nvidia_amd/templates/whisper-model-init-job.yaml
@@ -1,6 +1,7 @@
 {{- if .Values.models.whisper.enabled }}
 {{- $model := .Values.models.whisper }}
 {{- $pvcName := printf "%s-whisper-models" (include "huri.fullname" .) }}
+{{- if not (lookup "v1" "PersistentVolumeClaim" .Release.Namespace $pvcName) }}
 ---
 apiVersion: v1
 kind: PersistentVolumeClaim
@@ -21,6 +22,7 @@ spec:
   {{- if $model.pvc.storageClassName }}
   storageClassName: {{ $model.pvc.storageClassName }}
   {{- end }}
+{{- end }}
 ---
 # Runs only on first install (not on upgrade) — model is already on the PVC.
 apiVersion: batch/v1

From 734aaf95d323759182b780a42f1f4fa0345a3d11 Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Mon, 1 Jun 2026 14:04:31 +0200
Subject: [PATCH 17/31] feat(gesture): warmup to generate output faster on
 first time

---
 src/modules/gesture/gesture.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/modules/gesture/gesture.py b/src/modules/gesture/gesture.py
index f939d38..e4fb01b 100644
--- a/src/modules/gesture/gesture.py
+++ b/src/modules/gesture/gesture.py
@@ -90,8 +90,29 @@ def __init__(
         print("[Gesture] loading EmageAudioModel...", flush=True)
         self.model = EmageAudioModel.from_pretrained(hf_repo).to(self.device)
         self.model.eval()
+
+        self._warmup()
         print(f"[Gesture] ready", flush=True)
 
+    def _warmup(self) -> None:
+        # The first inference pays a one-time cold-start cost (CUDA context
+        # init, kernel JIT/load, cuDNN autotuning, caching-allocator warmup)
+        # that can take several seconds. Run a throwaway pass here so that cost
+        # is paid at startup — where we're already blocking on weight loads —
+        # rather than on the first user-facing gesture. Best-effort: a failure
+        # here must never prevent the deployment from coming up.
+        import time
+
+        try:
+            # ~3 s of silence at 16 kHz exercises the full sliding-window path
+            # (multiple rounds + remainder) the way a real utterance would.
+            dummy = np.zeros(_EMAGE_SR * 3, dtype=np.float32)
+            t0 = time.time()
+            self.infer(dummy, source_sr=_EMAGE_SR)
+            print(f"[Gesture] warmup done in {time.time() - t0:.2f}s", flush=True)
+        except Exception as e:  # noqa: BLE001 — warmup is an optimisation, never fatal
+            print(f"[Gesture] WARNING warmup failed: {e!r}", flush=True)
+
     def infer(self, audio_np: np.ndarray, source_sr: int = _EMAGE_SR) -> Motion:
         import torch
         import torch.nn.functional as F

From 191b6b0693000df819ecc75bf2994dc8d38b4b9b Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Mon, 1 Jun 2026 14:05:15 +0200
Subject: [PATCH 18/31] feat(tts): using CosyVoice3 model

---
 .../templates/cosytts-model-init-job.yaml     | 39 ++++++++-----------
 deploy/examples/local_nvidia_amd/values.yaml  | 18 ++++++---
 src/modules/text_to_speech/text_to_speech.py  | 27 ++++++++-----
 3 files changed, 48 insertions(+), 36 deletions(-)

diff --git a/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml b/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml
index 0f62dd1..b0e3bc7 100644
--- a/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml
+++ b/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml
@@ -24,7 +24,9 @@ spec:
   {{- end }}
 {{- end }}
 ---
-# Runs only on first install (not on upgrade) — models are already on the PVC.
+# Runs on every install/upgrade (pre-* hook) but exits early once the model dir
+# already looks complete — so a no-op upgrade costs only a pod spin-up. Set
+# $model.forceDownload=true to wipe and re-fetch (e.g. when changing versions).
 apiVersion: batch/v1
 kind: Job
 metadata:
@@ -59,26 +61,25 @@ spec:
             - |
               set -e
               MODEL_DIR="{{ $model.mountPath }}/{{ $model.modelSource.modelId }}"
-              BLANK_EN_DIR="$MODEL_DIR/CosyVoice-BlankEN"
-              HAS_MAIN_CONFIG="no"
-              HAS_QWEN_WEIGHTS="no"
-              if [ -f "$MODEL_DIR/cosyvoice2.yaml" ]; then
-                HAS_MAIN_CONFIG="yes"
+              {{- if $model.forceDownload }}
+              echo "forceDownload=true — wiping $MODEL_DIR for a fresh download."
+              rm -rf "$MODEL_DIR"
+              {{- end }}
+              # CosyVoice3 ships cosyvoice3.yaml + llm.pt (the Qwen LM) plus a
+              # bundled CosyVoice-BlankEN/ dir, so a single snapshot grabs
+              # everything the worker needs — no separate sub-model download.
+              HAS_CONFIG="no"
+              HAS_LLM="no"
+              if [ -f "$MODEL_DIR/cosyvoice3.yaml" ]; then
+                HAS_CONFIG="yes"
               fi
-              if [ -f "$BLANK_EN_DIR/model.safetensors" ] || \
-                 [ -f "$BLANK_EN_DIR/pytorch_model.bin" ] || \
-                 [ -f "$BLANK_EN_DIR/model.safetensors.index.json" ] || \
-                 [ -f "$BLANK_EN_DIR/pytorch_model.bin.index.json" ]; then
-                HAS_QWEN_WEIGHTS="yes"
+              if [ -f "$MODEL_DIR/llm.pt" ]; then
+                HAS_LLM="yes"
               fi
-              if [ "$HAS_MAIN_CONFIG" = "yes" ] && [ "$HAS_QWEN_WEIGHTS" = "yes" ]; then
+              if [ "$HAS_CONFIG" = "yes" ] && [ "$HAS_LLM" = "yes" ]; then
                 echo "Model already present at $MODEL_DIR — skipping download."
                 exit 0
               fi
-              if [ "$HAS_QWEN_WEIGHTS" = "no" ] && [ -d "$BLANK_EN_DIR" ]; then
-                echo "Partial Qwen weights detected; clearing $BLANK_EN_DIR before re-download."
-                rm -rf "$BLANK_EN_DIR"
-              fi
               echo "Downloading {{ $model.modelSource.modelId }} into $MODEL_DIR …"
               pip install --quiet modelscope
               python - <<'PYEOF'
@@ -87,12 +88,6 @@ spec:
                   "{{ $model.modelSource.modelId }}",
                   local_dir="{{ $model.mountPath }}/{{ $model.modelSource.modelId }}",
               )
-              # CosyVoice2 loads this sub-model at runtime; pre-download it so
-              # the worker pod does not need outbound internet access.
-              snapshot_download(
-                  "iic/CosyVoice-BlankEN",
-                  local_dir="{{ $model.mountPath }}/{{ $model.modelSource.modelId }}/CosyVoice-BlankEN",
-              )
               PYEOF
               echo "Download complete."
           volumeMounts:
diff --git a/deploy/examples/local_nvidia_amd/values.yaml b/deploy/examples/local_nvidia_amd/values.yaml
index 8f67376..e771758 100644
--- a/deploy/examples/local_nvidia_amd/values.yaml
+++ b/deploy/examples/local_nvidia_amd/values.yaml
@@ -40,6 +40,10 @@ ray:
             # client config.
             HURI_GESTURE_CONTEXT_SEC: "2.0"
             HURI_GESTURE_MIN_CHUNK_SEC: "0.5"
+            # CosyVoice3 contract: "<instruction><|endofprompt|><transcript-of-voice.wav>".
+            # The reference transcript MUST come AFTER the marker, or the LM treats it as
+            # an instruction and intermittently speaks it (prompt leakage).
+            HURI_VOICE_TRANSCRIPT: "You are a helpful assistant.<|endofprompt|>Instinct creates its own oppressors and bids us rise up against them."
         deployments:
           # HuRI: FastAPI/WebSocket ingress + per-session router. CPU only —
           # all GPU work is offloaded to handle-backed deployments below.
@@ -169,11 +173,11 @@ workerGroups:
     resources:
       limits:
         cpu: "4"
-        memory: "8Gi"
+        memory: "14Gi"
         nvidia.com/gpu: "1"
       requests:
         cpu: "2"
-        memory: "4Gi"
+        memory: "8Gi"
         nvidia.com/gpu: "1"
     shmSize: 2Gi
 
@@ -256,16 +260,21 @@ models:
         - ReadWriteOnce
     # Where the model weights PVC is mounted inside the worker container.
     mountPath: /models/cosytts
+    # Set to true to wipe the existing model dir and re-download on the next
+    # helm upgrade. Needed when switching model versions on a PVC that already
+    # holds weights (the init Job otherwise skips download when the model dir
+    # already looks complete). Set back to false after the redownload runs.
+    forceDownload: false
     modelSource:
       # type: modelscope | huggingface  (only modelscope is implemented)
       type: modelscope
       # ModelScope model ID — snapshot_download uses this as the sub-path
       # inside mountPath, so the final path is mountPath/modelId.
-      modelId: iic/CosyVoice2-0.5B
+      modelId: FunAudioLLM/Fun-CosyVoice3-0.5B-2512
     # Env vars injected into every worker that mounts this model.
     # HURI_MODEL_PATH must match mountPath/modelId (see text_to_speech.py).
     env:
-      HURI_MODEL_PATH: /models/cosytts/iic/CosyVoice2-0.5B
+      HURI_MODEL_PATH: /models/cosytts/FunAudioLLM/Fun-CosyVoice3-0.5B-2512
       # Path to the CosyVoice repo root containing third_party/Matcha-TTS.
       HURI_COSY_DIR: /app/cosyvoice
 
@@ -318,7 +327,6 @@ voiceAssets:
   mountPath: /assets
   env:
     HURI_VOICE_SAMPLE_PATH: /assets/voice.wav
-    HURI_VOICE_TRANSCRIPT: "Instinct creates its own oppressors and bids us rise up against them."
 
 # Ingress for the Ray Serve endpoint (port 8000).
 ingress:
diff --git a/src/modules/text_to_speech/text_to_speech.py b/src/modules/text_to_speech/text_to_speech.py
index 9daa384..4b9baf2 100644
--- a/src/modules/text_to_speech/text_to_speech.py
+++ b/src/modules/text_to_speech/text_to_speech.py
@@ -29,10 +29,19 @@ def _trace(msg: str) -> None:
 
 
 # Defaults — overridden by env vars in production (see README.md)
-_MODEL_PATH = os.environ.get("HURI_MODEL_PATH", "/models/cosytts/iic/CosyVoice2-0.5B")
+_MODEL_PATH = os.environ.get(
+    "HURI_MODEL_PATH", "/models/cosytts/FunAudioLLM/Fun-CosyVoice3-0.5B-2512"
+)
 _VOICE_SAMPLE_PATH = os.environ.get("HURI_VOICE_SAMPLE_PATH", "/assets/voice.wav")
-_VOICE_SAMPLE_TRANSCRIPT = os.environ.get(
-    "HURI_VOICE_TRANSCRIPT", "Hello, this is my voice sample for cloning."
+# CosyVoice3 expects "<instruction><|endofprompt|><transcript-of-voice.wav>". If the
+# config supplies a bare transcript (no marker), prepend the default instruction so the
+# transcript lands AFTER <|endofprompt|> — otherwise the LM treats it as an instruction
+# and intermittently renders it as speech (prompt leakage). The transcription is a must.
+_raw_transcript = os.environ["HURI_VOICE_TRANSCRIPT"]
+_VOICE_SAMPLE_TRANSCRIPT = (
+    _raw_transcript
+    if "<|endofprompt|>" in _raw_transcript
+    else f"You are a helpful assistant.<|endofprompt|>{_raw_transcript}"
 )
 
 _END_TEXT = object()   # sentinel pushed into the text queue to close synth
@@ -42,7 +51,7 @@ def _trace(msg: str) -> None:
 
 @serve.deployment(name="TTS", max_ongoing_requests=200)
 class TTSDeployment:
-    """CosyVoice2 wrapper with per-session bistream synthesis.
+    """CosyVoice3 wrapper with per-session bistream synthesis.
 
     The model's `inference_zero_shot` accepts a Python generator as `tts_text`
     and yields audio chunks as text arrives — that's the "bistream" mode.
@@ -57,7 +66,7 @@ def __init__(
         voice_sample_path: str = _VOICE_SAMPLE_PATH,
         voice_sample_transcript: str = _VOICE_SAMPLE_TRANSCRIPT,
     ):
-        _trace(f"TTSDeployment init: model_path={model_path} voice={voice_sample_path}")
+        _trace(f"TTSDeployment init: model_path={model_path} voice={voice_sample_path} transcript={voice_sample_transcript}")
 
         cosy_dir = os.environ.get("HURI_COSY_DIR")
         if cosy_dir:
@@ -66,11 +75,11 @@ def __init__(
                 sys.path.insert(0, matcha_path)
                 logger.debug("Added Matcha-TTS path to sys.path: %s", matcha_path)
 
-        from cosyvoice.cli.cosyvoice import CosyVoice2
+        from cosyvoice.cli.cosyvoice import CosyVoice3
 
-        self.model = CosyVoice2(model_path, load_jit=False, load_trt=False)
+        self.model = CosyVoice3(model_dir=model_path, load_trt=False)
         self.sample_rate: int = self.model.sample_rate
-        _trace(f"CosyVoice2 loaded (sample_rate={self.sample_rate})")
+        _trace(f"CosyVoice3 loaded (sample_rate={self.sample_rate})")
 
         self.prompt_speech = voice_sample_path
         self.prompt_text: str = voice_sample_transcript
@@ -140,7 +149,7 @@ def text_gen():
 
 
 class TTS(ModuleWithHandle):
-    """TTS Module — bistream tokens-in / audio-out via CosyVoice2.
+    """TTS Module — bistream tokens-in / audio-out via CosyVoice3.
 
     Opens one synthesis session per utterance (delimited by `token.end`). Each
     incoming token is pushed straight into the model's text generator so audio

From 5bc82be8712c56f2faf4c558a316104290635256 Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Mon, 1 Jun 2026 14:40:22 +0200
Subject: [PATCH 19/31] fixed(stt): audio_in event for the pipeline, so that
 text is read by RAG

---
 src/core/client_senders.py                   |  4 +-
 src/core/events.py                           |  2 +-
 src/modules/events.py                        |  8 +-
 src/modules/speech_to_text/microphone_vad.py |  7 +-
 src/modules/speech_to_text/speech_to_text.py | 82 +++++++++++++-------
 5 files changed, 68 insertions(+), 35 deletions(-)

diff --git a/src/core/client_senders.py b/src/core/client_senders.py
index 03301a6..c1ae31f 100644
--- a/src/core/client_senders.py
+++ b/src/core/client_senders.py
@@ -45,7 +45,9 @@ async def send(self, topic: str, data: EventData | bytes):
 
 
 class AudioSender(ClientSender):
-    output_type = "audio"
+    # Mic frames go out on "audio_in"; the server's "audio" topic is reserved
+    # for TTS output streamed back to us (see MIC.input_type).
+    output_type = "audio_in"
 
     def __init__(
         self, sample_rate: int = 16000, frame_duration: float = 0.030, **kwargs
diff --git a/src/core/events.py b/src/core/events.py
index 1d116bc..5df1bf8 100644
--- a/src/core/events.py
+++ b/src/core/events.py
@@ -47,7 +47,7 @@ def register(self, module: Module):
 
     async def publish(self, event_topic, data):
         subs = self.subscribers[event_topic]
-        if event_topic not in ("audio",):  # skip mic-frame spam
+        if event_topic not in ("audio_in",):  # skip mic-frame spam
             logger.info(
                 "[GRAPH] publish topic=%r subscribers=%s",
                 event_topic, [type(m).__name__ for m in subs],
diff --git a/src/modules/events.py b/src/modules/events.py
index b731c21..cb7bd61 100644
--- a/src/modules/events.py
+++ b/src/modules/events.py
@@ -7,6 +7,7 @@
 
 def get_events() -> Dict[str, Type[EventData | bytes]]:
     events: Dict[str, Type[EventData | bytes]] = {
+        "audio_in": bytes,  # inbound mic frames (raw int16 PCM)
         "audio": bytes,
         "voice": Voice,
         "transcript": Transcript,
@@ -22,9 +23,10 @@ def get_events() -> Dict[str, Type[EventData | bytes]]:
     else:
         events["motion"] = Motion
 
-    # TTS output "audio" is an Audio dataclass internally; the websocket boundary
-    # only ever decodes raw bytes for the "audio" topic (mic input), so the
-    # registry keeps bytes there. Keep Audio importable for type completeness.
+    # TTS output "audio" is an Audio dataclass internally, sent to the client by
+    # the Sender's Audio branch (never decoded inbound). Inbound mic frames use
+    # the separate "audio_in" topic above. The registry keeps bytes for "audio"
+    # only for output-type registration. Keep Audio importable for completeness.
     _ = Audio
 
     return events
diff --git a/src/modules/speech_to_text/microphone_vad.py b/src/modules/speech_to_text/microphone_vad.py
index ffb82f7..eb52f74 100644
--- a/src/modules/speech_to_text/microphone_vad.py
+++ b/src/modules/speech_to_text/microphone_vad.py
@@ -13,7 +13,7 @@ class MIC(Module):
 
     Detect voice and silence using WebRTC VAD.
 
-    input: audio,
+    input: audio_in,
     output: voice
 
     :vad_agressiveness: from 0 (low) to 3 (high, can distord audio).
@@ -23,7 +23,10 @@ class MIC(Module):
         Can only be 0.010, 0.020 and 0.030.
     """
 
-    input_type = "audio"
+    # Inbound microphone frames travel on their own topic so the TTS-output
+    # "audio" topic (consumed by Gesture and the client Sender) never collides
+    # with mic input — otherwise raw mic bytes get echoed back to the client.
+    input_type = "audio_in"
     output_type = "voice"
 
     def __init__(
diff --git a/src/modules/speech_to_text/speech_to_text.py b/src/modules/speech_to_text/speech_to_text.py
index 29c3ad8..7a9b91d 100644
--- a/src/modules/speech_to_text/speech_to_text.py
+++ b/src/modules/speech_to_text/speech_to_text.py
@@ -1,6 +1,6 @@
 import asyncio
 import os
-from typing import List, Optional
+from typing import AsyncGenerator, List
 
 import numpy as np
 from ray import serve
@@ -93,40 +93,66 @@ def __init__(
 
         self.buffer: List[np.ndarray] = []
 
-        self.silence: bool = True
+        # Set when the VAD emits its single end-of-utterance marker (Voice(None)).
+        # Remembered rather than acted on immediately so it survives an in-flight
+        # transcribe — otherwise the terminal window (and thus the question that
+        # drives the RAG) is silently dropped.
+        self.pending_end: bool = False
 
         self.running = False
         self.lock: asyncio.Lock = asyncio.Lock()
 
-    async def process(self, voice: Voice) -> Optional[Transcript]:
-        if voice.data is None:
-            self.silence = True
-        else:
-            self.silence = False
-            async with self.lock:
+    async def process(self, voice: Voice) -> AsyncGenerator[Transcript, None]:  # type: ignore[override]
+        async with self.lock:
+            if voice.data is None:
+                self.pending_end = True
+            else:
                 self.buffer.append(voice.data)
 
-        async with self.lock:
             if self.running:
-                return None
+                # Another invocation owns the drain loop below; it will pick up
+                # the frame we just buffered (and any pending end-of-utterance).
+                return
             self.running = True
 
-        async with self.lock:
-            buffer_size = len(self.buffer)
-            if buffer_size == 0 or (
-                self.silence is False and buffer_size < self.window_size
-            ):
+        try:
+            while True:
+                async with self.lock:
+                    end = self.pending_end
+                    buffer_size = len(self.buffer)
+
+                    if buffer_size == 0:
+                        # Nothing left to transcribe. If the utterance just
+                        # ended, still emit a terminal transcript so the
+                        # aggregator finalises the question.
+                        if end:
+                            self.pending_end = False
+                            yield Transcript("", True)
+                        return
+
+                    # Mid-speech: hold until a full window has accumulated.
+                    if not end and buffer_size < self.window_size:
+                        return
+
+                    processing_chunks = self.buffer[: self.window_size]
+                    # On end-of-utterance, the last window drains the buffer.
+                    final = end and buffer_size <= self.window_size
+
+                processing_audio = np.concatenate(processing_chunks, axis=0)
+                current_text: str = await self._handle.transcribe.remote(
+                    processing_audio
+                )
+
+                async with self.lock:
+                    if final:
+                        self.buffer = []
+                        self.pending_end = False
+                    else:
+                        self.buffer = self.buffer[self.window_size - self.step_size :]
+
+                yield Transcript(current_text, final)
+                if final:
+                    return
+        finally:
+            async with self.lock:
                 self.running = False
-                return None
-            processing_chunks = self.buffer[: self.window_size]
-
-        processing_audio = np.concatenate(processing_chunks, axis=0)
-
-        current_text: str = await self._handle.transcribe.remote(processing_audio)
-
-        processed_size = self.window_size - self.step_size
-        async with self.lock:
-            self.buffer = self.buffer[processed_size:]
-            self.running = False
-
-        return Transcript(current_text, self.silence)

From 14a06bf775ea0def4b8ce19cb85cbc7ca0e25a74 Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Mon, 1 Jun 2026 15:47:48 +0200
Subject: [PATCH 20/31] fix(stt): going back to previous STT impelmentation,
 with a simpler handle and module

---
 deploy/examples/local_nvidia_amd/values.yaml |   1 +
 src/modules/speech_to_text/speech_to_text.py | 143 ++++++++-----------
 2 files changed, 64 insertions(+), 80 deletions(-)

diff --git a/deploy/examples/local_nvidia_amd/values.yaml b/deploy/examples/local_nvidia_amd/values.yaml
index e771758..9033384 100644
--- a/deploy/examples/local_nvidia_amd/values.yaml
+++ b/deploy/examples/local_nvidia_amd/values.yaml
@@ -44,6 +44,7 @@ ray:
             # The reference transcript MUST come AFTER the marker, or the LM treats it as
             # an instruction and intermittently speaks it (prompt leakage).
             HURI_VOICE_TRANSCRIPT: "You are a helpful assistant.<|endofprompt|>Instinct creates its own oppressors and bids us rise up against them."
+            HURI_STT_MODEL_PATH: /models/whisper/Systran/faster-whisper-base
         deployments:
           # HuRI: FastAPI/WebSocket ingress + per-session router. CPU only —
           # all GPU work is offloaded to handle-backed deployments below.
diff --git a/src/modules/speech_to_text/speech_to_text.py b/src/modules/speech_to_text/speech_to_text.py
index 7a9b91d..9e48a62 100644
--- a/src/modules/speech_to_text/speech_to_text.py
+++ b/src/modules/speech_to_text/speech_to_text.py
@@ -1,6 +1,6 @@
 import asyncio
 import os
-from typing import AsyncGenerator, List
+from typing import List, Optional
 
 import numpy as np
 from ray import serve
@@ -15,15 +15,20 @@
 
 @serve.deployment(name="STT")
 class STTDeployment:
-    """Stateless Whisper inference actor.
+    """faster-whisper model wrapper.
 
-    Holds the faster-whisper model in a single Ray actor (pinned to the AMD
-    worker in deployment configs). Exposes a single transcribe() call so
-    per-session STT clients can offload the heavy work without owning a GPU.
+    Holds the WhisperModel and runs transcription on its own Ray Serve actor,
+    off the HuRI master actor — model load and GPU inference no longer block the
+    websocket ingress / per-session router. Pinned to a GPU worker via
+    ray_actor_options in the Serve config (see deploy values.yaml).
 
-    HURI_STT_MODEL_PATH: path to a local faster-whisper model directory (from
-    the whisper PVC). Falls back to "base" which triggers a HuggingFace
-    download — only acceptable for local dev without a PVC.
+    Stateless across calls: the per-session sliding-window buffering lives in the
+    STT module, so this deployment is shared across all sessions.
+
+    :model: path to (or size name of) the faster-whisper model. Defaults to the
+        HURI_STT_MODEL_PATH env var, falling back to "base".
+    :device: "cpu", "cuda", or "auto".
+    :compute_type: e.g. "int8", "float16", or "auto".
     """
 
     def __init__(
@@ -32,7 +37,6 @@ def __init__(
         device: str = "auto",
         compute_type: str = "auto",
     ):
-        print(f"[STT] loading model from {model!r} (device={device} compute_type={compute_type})", flush=True)
         from faster_whisper import WhisperModel
 
         self.model_faster = WhisperModel(
@@ -40,35 +44,32 @@ def __init__(
             device=device,
             compute_type=compute_type,
         )
-        print(f"[STT] model loaded", flush=True)
-        self.language = "en"
-
-    async def transcribe(self, audio: np.ndarray) -> str:
-        loop = asyncio.get_running_loop()
-        segments, _ = await loop.run_in_executor(
-            None,
-            lambda: self.model_faster.transcribe(
-                audio,
-                language=self.language,
-                beam_size=1,
-            ),
+
+    async def transcribe(self, audio: np.ndarray, language: str = "en") -> str:
+        segments, _ = self.model_faster.transcribe(
+            audio,
+            language=language,
+            beam_size=1,  # faster for realtime
         )
-        return " ".join(seg.text for seg in segments).strip()
+        return " ".join([seg.text for seg in segments]).strip()
 
 
 class STT(ModuleWithHandle):
     """STT Module
 
-    Per-session client: keeps the rolling window / silence state, offloads each
-    transcription window to the shared STTDeployment actor.
+    Transcribe voice using Faster_Whisper.
+
+    Holds the per-session sliding-window buffer and delegates the actual
+    transcription to a handle-backed STTDeployment, so the Whisper model runs
+    off the HuRI master node.
 
     input: voice,
     output: transcript
 
+    :language: language spoken in the audio. It should be a language code such
+        as "en" or "fr".
     :sample_rate: size of received voice audio. Usually 8000, 16000 or 48000.
     :block_duration: size of received voice audio (in s).
-    :transcribe_window: rolling window length (s) handed to Whisper.
-    :transcribe_step: stride (s) between successive windows.
     """
 
     _handle_cls = STTDeployment
@@ -83,76 +84,58 @@ def __init__(
         block_duration: float = 0.020,  # s
         transcribe_window: float = 2.0,  # s
         transcribe_step: float = 1.0,  # s
+        **kwargs,
     ):
-        super().__init__(_handle=_handle)
+        super().__init__(_handle=_handle, **kwargs)
 
         self.language = language
+
         self.sample_rate = sample_rate
         self.window_size: int = int(transcribe_window / block_duration)
         self.step_size: int = int(transcribe_step / block_duration)
 
         self.buffer: List[np.ndarray] = []
 
-        # Set when the VAD emits its single end-of-utterance marker (Voice(None)).
-        # Remembered rather than acted on immediately so it survives an in-flight
-        # transcribe — otherwise the terminal window (and thus the question that
-        # drives the RAG) is silently dropped.
-        self.pending_end: bool = False
+        self.silence: bool = True
+
+        self.prev_text: str = ""
+        self.stable_text: str = ""
 
         self.running = False
         self.lock: asyncio.Lock = asyncio.Lock()
 
-    async def process(self, voice: Voice) -> AsyncGenerator[Transcript, None]:  # type: ignore[override]
-        async with self.lock:
-            if voice.data is None:
-                self.pending_end = True
-            else:
+    async def process(self, voice: Voice) -> Optional[Transcript]:  # type: ignore[override]
+        if voice.data is None:
+            self.silence = True
+        else:
+            self.silence = False
+            async with self.lock:
                 self.buffer.append(voice.data)
 
+        async with self.lock:
             if self.running:
-                # Another invocation owns the drain loop below; it will pick up
-                # the frame we just buffered (and any pending end-of-utterance).
-                return
+                return None
             self.running = True
 
-        try:
-            while True:
-                async with self.lock:
-                    end = self.pending_end
-                    buffer_size = len(self.buffer)
-
-                    if buffer_size == 0:
-                        # Nothing left to transcribe. If the utterance just
-                        # ended, still emit a terminal transcript so the
-                        # aggregator finalises the question.
-                        if end:
-                            self.pending_end = False
-                            yield Transcript("", True)
-                        return
-
-                    # Mid-speech: hold until a full window has accumulated.
-                    if not end and buffer_size < self.window_size:
-                        return
-
-                    processing_chunks = self.buffer[: self.window_size]
-                    # On end-of-utterance, the last window drains the buffer.
-                    final = end and buffer_size <= self.window_size
-
-                processing_audio = np.concatenate(processing_chunks, axis=0)
-                current_text: str = await self._handle.transcribe.remote(
-                    processing_audio
-                )
-
-                async with self.lock:
-                    if final:
-                        self.buffer = []
-                        self.pending_end = False
-                    else:
-                        self.buffer = self.buffer[self.window_size - self.step_size :]
-
-                yield Transcript(current_text, final)
-                if final:
-                    return
-        finally:
-            async with self.lock:
+        async with self.lock:
+            buffer_size = len(self.buffer)
+            if buffer_size == 0 or (
+                self.silence is False and buffer_size < self.window_size
+            ):
                 self.running = False
+                return None
+            processing_chunks = self.buffer[: self.window_size]
+
+        self.pending_silence = False
+        processing_audio = np.concatenate(processing_chunks, axis=0)
+
+        current_text = await self._handle.transcribe.remote(
+            processing_audio, self.language
+        )
+
+        processed_size = self.window_size - self.step_size
+        async with self.lock:
+            self.buffer = self.buffer[processed_size:]
+            self.running = False
+
+        return Transcript(current_text, self.silence)

From a1f5e6df070ae7141ab40d08b4f5ac734fcd2031 Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Mon, 1 Jun 2026 18:52:14 +0200
Subject: [PATCH 21/31] feat(rag): profile data from RAG based of uid

---
 src/modules/rag/ingestion.py | 154 +++++++++++++++++++++++++++++++++--
 src/modules/rag/rag.py       |  93 ++++++++++++++++++---
 2 files changed, 228 insertions(+), 19 deletions(-)

diff --git a/src/modules/rag/ingestion.py b/src/modules/rag/ingestion.py
index f4e4dae..88d85a8 100644
--- a/src/modules/rag/ingestion.py
+++ b/src/modules/rag/ingestion.py
@@ -7,7 +7,8 @@
 from pathlib import Path
 from typing import Any, List
 
-from pypdf import PdfReader
+import httpx
+import numpy as np
 from qdrant_client import QdrantClient
 from qdrant_client.models import (
     Distance,
@@ -17,12 +18,37 @@
     PointStruct,
     VectorParams,
 )
-from semantic_chunker import SemanticChunker
-from sentence_transformers import SentenceTransformer
 
 USER_ID_FILE = os.path.expanduser("~/.huri_user_id")
 
 
+class RemoteEmbedder:
+    """Embed via an OpenAI-compatible ``/v1/embeddings`` endpoint (e.g. llama.cpp).
+
+    Drop-in for the subset of ``SentenceTransformer`` this tool uses: a single
+    ``.encode(text, normalize_embeddings=...)`` returning a 1-D numpy array, so
+    the existing ``.tolist()`` / ``len(...)`` call sites keep working unchanged.
+    """
+
+    def __init__(self, url: str, model_name: str):
+        self.url = url.rstrip("/")
+        self.model_name = model_name
+        self._client = httpx.Client(timeout=60.0, verify=False)
+
+    def encode(self, text: str, normalize_embeddings: bool = True) -> np.ndarray:
+        resp = self._client.post(
+            f"{self.url}/v1/embeddings",
+            json={"model": self.model_name, "input": str(text)},
+        )
+        resp.raise_for_status()
+        vec = np.asarray(resp.json()["data"][0]["embedding"], dtype=np.float32)
+        if normalize_embeddings:
+            norm = np.linalg.norm(vec)
+            if norm > 0:
+                vec = vec / norm
+        return vec
+
+
 def _split_sentences(text: str) -> list[str]:
     """Simple sentence splitter."""
     result: List = []
@@ -77,6 +103,8 @@ def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]
 def extract_text_from_pdf(pdf_path: str) -> str:
     """Extract text from a PDF file."""
     try:
+        from pypdf import PdfReader
+
         reader = PdfReader(pdf_path)
         text = ""
         for page in reader.pages:
@@ -116,7 +144,7 @@ def ensure_collection(client: QdrantClient, collection: str, vector_size: int):
 
 def ingest_chunks(
     client: QdrantClient,
-    model: SentenceTransformer,
+    model: Any,
     collection: str,
     chunks: list[str],
     _user_id: str,
@@ -154,9 +182,11 @@ def ingest_chunks(
     return len(points)
 
 
-def chunk_strat(text: str, args, model: SentenceTransformer) -> list[str] | Any:
+def chunk_strat(text: str, args, model: Any) -> list[str] | Any:
     """Pick the right chunking strategy based on args."""
     if args.chunking == "semantic":
+        from semantic_chunker import SemanticChunker
+
         chunker = SemanticChunker(
             model=model,
             strategy=args.semantic_strategy,
@@ -287,6 +317,50 @@ def cmd_write(args, client, model, _user_id):
     print(f"Done. Ingested {count} chunks as '{title}'")
 
 
+def cmd_profile(args, client, model, _user_id):
+    """Store always-on profile facts about the user (name, etc.).
+
+    Unlike regular documents, profile facts are NOT retrieved by vector
+    similarity. The RAG handle pulls them by filter (_user_id + type=profile)
+    on every query and injects them into the system prompt, so the character
+    always knows them.
+    """
+    sample = model.encode("test", normalize_embeddings=True)
+    ensure_collection(client, args.collection, len(sample))
+
+    facts: List[str] = []
+    if args.name:
+        facts.append(f"The user's name is {args.name}.")
+    for fact in args.fact or []:
+        facts.append(fact)
+
+    if not facts:
+        print("Nothing to store. Use --name and/or --fact 'some fact'.")
+        return
+
+    # Replace the existing profile so facts don't pile up across runs.
+    client.delete(
+        collection_name=args.collection,
+        points_selector=Filter(
+            must=[
+                FieldCondition(key="_user_id", match=MatchValue(value=_user_id)),
+                FieldCondition(key="type", match=MatchValue(value="profile")),
+            ]
+        ),
+    )
+
+    count = ingest_chunks(
+        client,
+        model,
+        args.collection,
+        facts,
+        _user_id,
+        source="profile",
+        doc_type="profile",
+    )
+    print(f"Stored {count} profile fact(s) for user {_user_id}")
+
+
 def cmd_list(args, client, model, _user_id):
     """List what's in the database for this user."""
 
@@ -355,7 +429,23 @@ def main():
     parser.add_argument("--user-id", type=str, default=None)
     parser.add_argument("--collection", type=str, default="documents")
     parser.add_argument("--qdrant-url", type=str, default="http://localhost:6333")
+    parser.add_argument(
+        "--no-verify-ssl",
+        action="store_true",
+        default=False,
+        help="Disable SSL certificate verification (needed for self-signed LAN certs).",
+    )
     parser.add_argument("--embedding-model", type=str, default="BAAI/bge-large-en-v1.5")
+    parser.add_argument(
+        "--embedding-url",
+        type=str,
+        default="",
+        help=(
+            "OpenAI-compatible embedding endpoint (e.g. llama.cpp at "
+            "http://localhost:8080). When set, embeddings are computed remotely "
+            "instead of with a local SentenceTransformer. Requires --chunking fixed."
+        ),
+    )
     parser.add_argument(
         "--chunk-size",
         type=int,
@@ -394,6 +484,16 @@ def main():
     p_write = subparsers.add_parser("write", help="Write text interactively")
     p_write.add_argument("--title", type=str, default=None, help="Title/source name")
 
+    p_profile = subparsers.add_parser(
+        "profile", help="Store always-on profile facts (name, etc.)"
+    )
+    p_profile.add_argument("--name", type=str, default=None, help="User's name")
+    p_profile.add_argument(
+        "--fact",
+        action="append",
+        help="A fact about the user, e.g. --fact 'Likes cheese' (repeatable)",
+    )
+
     subparsers.add_parser("list", help="List ingested documents")
 
     p_delete = subparsers.add_parser("delete", help="Delete documents by source")
@@ -403,16 +503,54 @@ def main():
 
     args = parser.parse_args()
 
-    _user_id = get_user_id(args._user_id)
+    if args.embedding_url and args.chunking == "semantic":
+        parser.error(
+            "--chunking semantic needs a local SentenceTransformer model and "
+            "cannot run over --embedding-url. Use --chunking fixed."
+        )
+
+    _user_id = get_user_id(args.user_id)
     print(f"User: {_user_id}")
 
-    client = QdrantClient(url=args.qdrant_url)
-    model = SentenceTransformer(args.embedding_model)
+    verify_ssl = not args.no_verify_ssl
+    # Parse the URL explicitly so QdrantClient gets the correct host/port/https.
+    # When given just "https://host" with no port, some qdrant-client versions
+    # silently fall back to their default port (6333) instead of 443, causing
+    # a timeout that looks like an SSL issue.
+    from urllib.parse import urlparse
+
+    _parsed = urlparse(args.qdrant_url)
+    _is_https = _parsed.scheme == "https"
+    _host = _parsed.hostname
+    _port = _parsed.port or (443 if _is_https else 6333)
+    client = QdrantClient(
+        host=_host,
+        port=_port,
+        https=_is_https,
+        verify=verify_ssl,
+        check_compatibility=verify_ssl,
+    )
+
+    # Lazy-load the model only if the command needs embeddings.
+    # Commands that don't need it: list, delete, profile (doesn't use embeddings).
+    needs_embeddings = args.command in ("pdf", "text", "write", "profile")
+
+    if needs_embeddings:
+        if args.embedding_url:
+            print(f"Embedding remotely via {args.embedding_url} (model={args.embedding_model})")
+            model = RemoteEmbedder(args.embedding_url, args.embedding_model)
+        else:
+            from sentence_transformers import SentenceTransformer
+
+            model = SentenceTransformer(args.embedding_model)
+    else:
+        model = None
 
     commands = {
         "pdf": cmd_pdf,
         "text": cmd_text,
         "write": cmd_write,
+        "profile": cmd_profile,
         "list": cmd_list,
         "delete": cmd_delete,
     }
diff --git a/src/modules/rag/rag.py b/src/modules/rag/rag.py
index 0195e48..7ffc6da 100644
--- a/src/modules/rag/rag.py
+++ b/src/modules/rag/rag.py
@@ -51,8 +51,18 @@ def _apply_config(self) -> None:
         from qdrant_client import QdrantClient
 
         cfg = self._cfg
+        from urllib.parse import urlparse
+
         self.embedding_url = cfg.embedding_url or cfg.llm_url
-        self._qdrant = QdrantClient(url=cfg.qdrant_url, verify=cfg.verify_ssl)
+        _p = urlparse(cfg.qdrant_url)
+        _is_https = _p.scheme == "https"
+        self._qdrant = QdrantClient(
+            host=_p.hostname,
+            port=_p.port or (443 if _is_https else 6333),
+            https=_is_https,
+            verify=cfg.verify_ssl,
+            check_compatibility=cfg.verify_ssl,
+        )
         print(f"[RAGHandle] Connected to Qdrant at {cfg.qdrant_url}")
         self._embed_client = httpx.AsyncClient(timeout=30.0, verify=cfg.verify_ssl)
         self._llm_client = httpx.AsyncClient(timeout=120.0, verify=cfg.verify_ssl)
@@ -85,6 +95,32 @@ async def _embed(self, text: str) -> list[float]:
                 f"Embedding unexpected schema from {url}: {str(payload)[:1000]}"
             ) from e
 
+    def _get_profile(self, collection: str, _user_id: str) -> list[str]:
+        """Always-on facts about the user (name, etc.).
+
+        Retrieved deterministically by filter — NOT by vector similarity —
+        so they are always available to the prompt regardless of the question.
+        Populated via `ingestion.py profile`.
+        """
+        from qdrant_client.models import FieldCondition, Filter, MatchValue
+
+        try:
+            points, _ = self._qdrant.scroll(
+                collection_name=collection,
+                scroll_filter=Filter(
+                    must=[
+                        FieldCondition(key="_user_id", match=MatchValue(value=_user_id)),
+                        FieldCondition(key="type", match=MatchValue(value="profile")),
+                    ]
+                ),
+                limit=50,
+                with_payload=True,
+                with_vectors=False,
+            )
+        except Exception:
+            return []
+        return [p.payload.get("text", "") for p in points if p.payload.get("text")]
+
     def _search(
         self,
         qdrant,
@@ -126,11 +162,24 @@ def _build_prompt(
         question: str,
         chunks: list[dict],
         preferences: dict,
+        profile_facts: list[str] | None = None,
     ) -> tuple[str, str]:
-        parts = [
-            "You are a robot speaking to a user. Answer based on the provided context.",
-            "If the context is insufficient, say so clearly.",
-        ]
+        # TEMP TEMP TEMP: this persona should be defined in the configuration
+        persona = preferences.get(
+            "persona",
+            "You are Rat-Man, a witty, charismatic animated rat character. "
+            "You are NOT an AI, an assistant, or a language model, and you must "
+            "never say that you are one. Stay fully in character at all times, "
+            "speak naturally and with personality, and never break the illusion.",
+        )
+        parts = [persona]
+
+        if profile_facts:
+            parts.append(
+                "Here is what you know about the person you're talking to: "
+                + " ".join(profile_facts)
+            )
+
         if preferences.get("language"):
             parts.append(f"Always respond in {preferences['language']}.")
         if preferences.get("tone"):
@@ -141,6 +190,14 @@ def _build_prompt(
             parts.append("Keep your answer to 2-3 sentences maximum.")
         if preferences.get("extra_instructions"):
             parts.append(preferences["extra_instructions"])
+
+        parts.append(
+            "Use the context in the user's message to inform your answers when "
+            "it is relevant, but always answer in character. If you don't know "
+            "something, improvise in character rather than admitting you lack "
+            "information or breaking character."
+            "Make small sentences, and no emojis"
+        )
         system_prompt = " ".join(parts)
 
         if not chunks:
@@ -168,7 +225,7 @@ def _build_prompt(
         return system_prompt, user_prompt
 
     async def _stream_ollama(
-        self, messages: list, max_tokens: int
+        self, messages: list, max_tokens: int, temperature: float = 0.7
     ) -> AsyncGenerator[str, None]:
         async with self._llm_client.stream(
                 "POST",
@@ -177,7 +234,7 @@ async def _stream_ollama(
                     "model": self._cfg.llm_model,
                     "messages": messages,
                     "stream": True,
-                    "options": {"num_predict": max_tokens, "temperature": 0.1},
+                    "options": {"num_predict": max_tokens, "temperature": temperature},
                 },
             ) as resp:
                 resp.raise_for_status()
@@ -200,6 +257,7 @@ async def _stream_openai_compatible(
         messages: list,
         max_tokens: int,
         api_key: str = "",
+        temperature: float = 0.7,
     ) -> AsyncGenerator[str, None]:
         headers = {"Content-Type": "application/json"}
         if api_key:
@@ -212,7 +270,7 @@ async def _stream_openai_compatible(
                     "model": self._cfg.llm_model,
                     "messages": messages,
                     "max_tokens": max_tokens,
-                    "temperature": 0.1,
+                    "temperature": temperature,
                     "stream": True,
                 },
             ) as resp:
@@ -242,6 +300,7 @@ async def _llm_stream(
         preferences: dict,
     ) -> AsyncGenerator[str, None]:
         max_tokens = preferences.get("max_length", 1024)
+        temperature = preferences.get("temperature", 0.7)
         messages = [
             {"role": "system", "content": system_prompt},
             {"role": "user", "content": user_prompt},
@@ -249,7 +308,10 @@ async def _llm_stream(
 
         if self._cfg.llm_provider == "vllm":
             async for d in self._stream_openai_compatible(
-                f"{self._cfg.llm_url}/v1/chat/completions", messages, max_tokens
+                f"{self._cfg.llm_url}/v1/chat/completions",
+                messages,
+                max_tokens,
+                temperature=temperature,
             ):
                 yield d
         elif self._cfg.llm_provider == "api":
@@ -258,10 +320,11 @@ async def _llm_stream(
                 messages,
                 max_tokens,
                 self._cfg.llm_api_key,
+                temperature=temperature,
             ):
                 yield d
         elif self._cfg.llm_provider == "ollama":
-            async for d in self._stream_ollama(messages, max_tokens):
+            async for d in self._stream_ollama(messages, max_tokens, temperature):
                 yield d
         else:
             raise ValueError(f"Unknown llm_provider: {self._cfg.llm_provider}")
@@ -282,8 +345,11 @@ async def stream(self, query: RAGQuery) -> AsyncGenerator[str, None]:
             raise
 
         print(f"[RAG] Found {len(chunks)} chunks")
+        profile_facts = self._get_profile(collection, query._user_id)
+        if profile_facts:
+            print(f"[RAG] Loaded {len(profile_facts)} profile fact(s)")
         system_prompt, user_prompt = self._build_prompt(
-            query.question, chunks, query.preferences
+            query.question, chunks, query.preferences, profile_facts
         )
 
         print(f"[RAG] Streaming from LLM at {self._cfg.llm_url} (provider={self._cfg.llm_provider}, model={self._cfg.llm_model})")
@@ -317,6 +383,8 @@ def __init__(
         response_format="paragraph",
         max_length=1024,
         extra_instructions="",
+        persona="",
+        temperature=0.7,
         **kwargs,
     ):
         super().__init__(_handle=_handle, _user_id=_user_id, **kwargs)
@@ -327,7 +395,10 @@ def __init__(
             "response_format": response_format,
             "max_length": max_length,
             "extra_instructions": extra_instructions,
+            "temperature": temperature,
         }
+        if persona:
+            self.preferences["persona"] = persona
 
     async def process(self, data: Sentence) -> AsyncGenerator[Token, None]:  # type: ignore[override]
         query = RAGQuery(

From 2e74004fd0d1ee01cbb392c2b507999d7ff48785 Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Mon, 1 Jun 2026 18:53:29 +0200
Subject: [PATCH 22/31] fixed(huri):  Qdrant too old dependancy + new nvidia
 dependancies

---
 deploy/Dockerfile.nvidia | 3 ---
 requirements-amd.txt     | 2 +-
 requirements-nvidia.txt  | 7 ++++++-
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/deploy/Dockerfile.nvidia b/deploy/Dockerfile.nvidia
index 1cb0c8d..f9e704f 100644
--- a/deploy/Dockerfile.nvidia
+++ b/deploy/Dockerfile.nvidia
@@ -23,9 +23,6 @@ RUN git clone https://github.com/FunAudioLLM/CosyVoice.git /app/cosyvoice \
     && git -C /app/cosyvoice checkout 074ca6dc9e80a2f424f1f74b48bdd7d3fea531cc \
     && git -C /app/cosyvoice submodule update --init --recursive
 
-
-RUN pip install --no-cache-dir lightning==2.2.4 gdown==5.1.0 matplotlib==3.7.5 wget==3.2 pyworld==0.3.4
-
 ENV PYTHONPATH="/app/cosyvoice:${PYTHONPATH:-}"
 
 COPY src /app/src
diff --git a/requirements-amd.txt b/requirements-amd.txt
index 2fefb8d..1c99594 100644
--- a/requirements-amd.txt
+++ b/requirements-amd.txt
@@ -6,7 +6,7 @@
 
 # --- RAG / LLM ---
 httpx==0.27.2
-qdrant-client==1.12.1
+qdrant-client==1.18.0
 sentence-transformers==3.2.1
 pypdf==5.1.0
 semantic_chunker==0.2.0
diff --git a/requirements-nvidia.txt b/requirements-nvidia.txt
index 441b738..98a8950 100644
--- a/requirements-nvidia.txt
+++ b/requirements-nvidia.txt
@@ -27,7 +27,7 @@ tqdm==4.67.3
 
 # --- RAG / LLM extras ---
 httpx==0.27.2
-qdrant-client==1.12.1
+qdrant-client==1.18.0
 sentence-transformers==3.2.1
 pypdf==5.1.0
 semantic_chunker==0.2.0
@@ -42,3 +42,8 @@ imageio==2.33.0
 # opencv-python==4.8.1.78
 # pytorch3d                # has to be built from source for torch 2.3 / py3.12
 # torchvision
+lightning==2.2.4
+gdown==5.1.0
+matplotlib==3.7.5 # ridiculous but necessary for init modules
+wget==3.2
+pyworld==0.3.4

From 11642138d8cf100594c688eaa6410b8a0a9c75c6 Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Mon, 1 Jun 2026 18:54:36 +0200
Subject: [PATCH 23/31] feat(huri): core config uid fetching

---
 src/core/dataclasses/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/dataclasses/config.py b/src/core/dataclasses/config.py
index aea111f..8cccb03 100644
--- a/src/core/dataclasses/config.py
+++ b/src/core/dataclasses/config.py
@@ -47,7 +47,7 @@ def from_dict(cls, raw: Dict) -> "ClientConfig":
             for module_id, mod_raw in raw.get("modules", {}).items()
         }
         return cls(
-            user_id=None,
+            user_id=raw.get("user_id"),
             huri_url=raw["huri_url"],
             topic_list=raw["topic_list"],
             senders=senders,

From 109491aaf3e13db7fcb2c17beee4b3a36469606d Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Mon, 1 Jun 2026 18:54:59 +0200
Subject: [PATCH 24/31] feat(huri): automatically exclude outputed .wav files

---
 .gitignore | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 990d633..92a7c35 100644
--- a/.gitignore
+++ b/.gitignore
@@ -181,4 +181,7 @@ cython_debug/
 
 # Others
 .trash
-docs
\ No newline at end of file
+docs
+
+# HuRI client outputs
+*.wav

From f9d3a936c1d3929af79a4e435f257d4d0c59852c Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Mon, 1 Jun 2026 23:03:55 +0200
Subject: [PATCH 25/31] feat(rag): improved prompt to avoid too long sentences

---
 src/modules/rag/rag.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/modules/rag/rag.py b/src/modules/rag/rag.py
index 7ffc6da..20a833b 100644
--- a/src/modules/rag/rag.py
+++ b/src/modules/rag/rag.py
@@ -195,8 +195,8 @@ def _build_prompt(
             "Use the context in the user's message to inform your answers when "
             "it is relevant, but always answer in character. If you don't know "
             "something, improvise in character rather than admitting you lack "
-            "information or breaking character."
-            "Make small sentences, and no emojis"
+            "information or breaking character. "
+            "IMPORTANT: Reply in 1-3 short sentences maximum. Be extremely concise. No lists, no emojis, no long explanations."
         )
         system_prompt = " ".join(parts)
 
@@ -381,7 +381,7 @@ def __init__(
         language="en",
         tone="formal",
         response_format="paragraph",
-        max_length=1024,
+        max_length=220,
         extra_instructions="",
         persona="",
         temperature=0.7,
@@ -389,6 +389,8 @@ def __init__(
     ):
         super().__init__(_handle=_handle, _user_id=_user_id, **kwargs)
 
+        print(f"[RAG] Initialized with user_id={_user_id}, language={language}, tone={tone}, response_format={response_format}, max_length={max_length}, temperature={temperature}")
+
         self.preferences = {
             "language": language,
             "tone": tone,

From a048d9a3bc80dedb3aa77ca8f7cb17d26a627b91 Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Mon, 1 Jun 2026 23:05:17 +0200
Subject: [PATCH 26/31] feat(sender): sending topic type to clients

---
 deploy/examples/local_nvidia_amd/values.yaml | 4 ++--
 src/modules/utils/sender.py                  | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/deploy/examples/local_nvidia_amd/values.yaml b/deploy/examples/local_nvidia_amd/values.yaml
index 9033384..e980c57 100644
--- a/deploy/examples/local_nvidia_amd/values.yaml
+++ b/deploy/examples/local_nvidia_amd/values.yaml
@@ -111,10 +111,10 @@ head:
   resources:
     limits:
       cpu: "2"
-      memory: "8Gi"
+      memory: "3Gi"
     requests:
       cpu: "2"
-      memory: "4Gi"
+      memory: "2Gi"
 
 # Worker groups: one logical "gpu" group per vendor and one "cpu" group.
 #
diff --git a/src/modules/utils/sender.py b/src/modules/utils/sender.py
index 1a1d7eb..9261662 100644
--- a/src/modules/utils/sender.py
+++ b/src/modules/utils/sender.py
@@ -57,7 +57,7 @@ async def process(self, _):
             )
             await self.ws.send_bytes(self._prefix(header + body))
         elif isinstance(data, EventData):
-            await self.ws.send_json(asdict(data))
+            await self.ws.send_json({"topic": self.input_type, **asdict(data)})
         else:
             await self.ws.send_text(str(data))
 

From 2e404000caedfa89e27368c3685c035b942efea9 Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Tue, 2 Jun 2026 03:45:43 +0200
Subject: [PATCH 27/31] fix(rag): missing previous prompt context

---
 src/modules/rag/rag.py | 47 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 40 insertions(+), 7 deletions(-)

diff --git a/src/modules/rag/rag.py b/src/modules/rag/rag.py
index 20a833b..b7e21f8 100644
--- a/src/modules/rag/rag.py
+++ b/src/modules/rag/rag.py
@@ -32,6 +32,10 @@ class RAGQuery:
     _user_id: str
     question: str
     preferences: dict = field(default_factory=dict)
+    # Prior conversation turns as OpenAI-style messages
+    # ([{"role": "user"|"assistant", "content": str}, ...]). The handle is
+    # stateless, so the per-session RAG module owns and supplies this.
+    history: list = field(default_factory=list)
 
 
 @serve.deployment(name="RAGHandle")
@@ -298,13 +302,14 @@ async def _llm_stream(
         system_prompt: str,
         user_prompt: str,
         preferences: dict,
+        history: list | None = None,
     ) -> AsyncGenerator[str, None]:
         max_tokens = preferences.get("max_length", 1024)
         temperature = preferences.get("temperature", 0.7)
-        messages = [
-            {"role": "system", "content": system_prompt},
-            {"role": "user", "content": user_prompt},
-        ]
+        messages = [{"role": "system", "content": system_prompt}]
+        if history:
+            messages.extend(history)
+        messages.append({"role": "user", "content": user_prompt})
 
         if self._cfg.llm_provider == "vllm":
             async for d in self._stream_openai_compatible(
@@ -352,10 +357,14 @@ async def stream(self, query: RAGQuery) -> AsyncGenerator[str, None]:
             query.question, chunks, query.preferences, profile_facts
         )
 
-        print(f"[RAG] Streaming from LLM at {self._cfg.llm_url} (provider={self._cfg.llm_provider}, model={self._cfg.llm_model})")
+        print(
+            f"[RAG] Streaming from LLM at {self._cfg.llm_url} "
+            f"(provider={self._cfg.llm_provider}, model={self._cfg.llm_model}, "
+            f"history_msgs={len(query.history)})"
+        )
         try:
             async for delta in self._llm_stream(
-                system_prompt, user_prompt, query.preferences
+                system_prompt, user_prompt, query.preferences, query.history
             ):
                 yield delta
         except Exception:
@@ -385,11 +394,12 @@ def __init__(
         extra_instructions="",
         persona="",
         temperature=0.7,
+        max_history_turns=6,
         **kwargs,
     ):
         super().__init__(_handle=_handle, _user_id=_user_id, **kwargs)
 
-        print(f"[RAG] Initialized with user_id={_user_id}, language={language}, tone={tone}, response_format={response_format}, max_length={max_length}, temperature={temperature}")
+        print(f"[RAG] Initialized with user_id={_user_id}, language={language}, tone={tone}, response_format={response_format}, max_length={max_length}, temperature={temperature}, max_history_turns={max_history_turns}")
 
         self.preferences = {
             "language": language,
@@ -402,17 +412,40 @@ def __init__(
         if persona:
             self.preferences["persona"] = persona
 
+        # Per-session conversation memory, kept on the (per-WebSocket) module
+        # instance because the RAGHandle deployment is stateless/shared.
+        # Stored as OpenAI-style messages; trimmed to the last N turns.
+        self._max_history_turns = max_history_turns
+        self.history: list[dict] = []
+
     async def process(self, data: Sentence) -> AsyncGenerator[Token, None]:  # type: ignore[override]
         query = RAGQuery(
             _user_id=self._user_id if self._user_id else "anonymous",
             question=data.text,
             preferences=self.preferences,
+            history=list(self.history),  # snapshot of prior turns
         )
 
+        parts: list[str] = []
         stream = self._handle.options(stream=True).stream.remote(query)
         async for delta in stream:
+            parts.append(delta)
             yield Token(text=delta, end=False)
         yield Token(text="", end=True)
 
+        self._record_turn(data.text, "".join(parts))
+
+    def _record_turn(self, question: str, answer: str) -> None:
+        """Append this turn to the session history (raw Q/A, no RAG context)
+        and trim to the most recent `max_history_turns` exchanges."""
+        answer = answer.strip()
+        if not answer:
+            return
+        self.history.append({"role": "user", "content": question})
+        self.history.append({"role": "assistant", "content": answer})
+        max_msgs = self._max_history_turns * 2
+        if len(self.history) > max_msgs:
+            del self.history[:-max_msgs]
+
     def update_preferences(self, new_preferences: dict):
         self.preferences.update(new_preferences)

From 0b39013cc6b795cdb18e316078b2e517b851e145 Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Tue, 2 Jun 2026 03:47:16 +0200
Subject: [PATCH 28/31] fix(gesture): smoother transition from sliding window +
 small desync in inference

---
 src/modules/gesture/gesture.py | 119 +++++++++++++++++++++++++++------
 1 file changed, 98 insertions(+), 21 deletions(-)

diff --git a/src/modules/gesture/gesture.py b/src/modules/gesture/gesture.py
index e4fb01b..33bde05 100644
--- a/src/modules/gesture/gesture.py
+++ b/src/modules/gesture/gesture.py
@@ -20,6 +20,10 @@
 _CONTEXT_SEC = float(os.environ.get("HURI_GESTURE_CONTEXT_SEC", "2.0"))
 _MIN_CHUNK_SEC = float(os.environ.get("HURI_GESTURE_MIN_CHUNK_SEC", "0.5"))
 
+# Seconds over which a fresh window's first frames are eased onto the last
+# emitted pose, killing the seam snap between windows and between utterances.
+_BLEND_SEC = float(os.environ.get("HURI_GESTURE_BLEND_SEC", "0.2"))
+
 # Optional manual GPU split: cap the gesture process to a fraction of the GPU so
 # TTS keeps the lion's share. Only applied on CUDA when the value is set (>0).
 _GPU_MEM_FRACTION = float(os.environ.get("HURI_GESTURE_GPU_MEM_FRACTION", "0.0"))
@@ -174,8 +178,21 @@ class Gesture(ModuleWithHandle):
     primes the model so the seam is continuous; only the motion frames for the
     fresh audio are emitted. The window length is bounded by
     ``context_sec + chunk size`` so inference cost stays flat regardless of
-    utterance length. Global root translation is rebased onto the previously
-    emitted frame to avoid a jump every window.
+    utterance length.
+
+    Seam blending
+    ─────────────
+    Priming with context keeps the *audio* continuous across a window, but the
+    motion still snaps at seams: EMAGE has no future context at a window's right
+    edge, so its last frames wind down differently from how the next window —
+    fully primed — opens, and at utterance boundaries it cold-starts from a rest
+    pose entirely. So each fresh segment is eased onto the previously emitted
+    frame: poses, expressions and root translation all start exactly continuous
+    and a cosine-decaying offset fades to zero over ``blend_sec``, restoring the
+    model's intended motion (and avoiding the cumulative drift a constant rebase
+    would cause). The anchors survive the end-of-utterance reset, so the first
+    window of the next utterance blends out of the pose still on screen instead
+    of teleporting.
 
     input:  audio (Audio)
     output: motion (Motion)
@@ -184,6 +201,7 @@ class Gesture(ModuleWithHandle):
     :device:       PyTorch device string; defaults to CUDA when available.
     :context_sec:  Seconds of prior audio prepended to each window for continuity.
     :min_chunk_sec: Minimum seconds of fresh audio to accumulate before inferring.
+    :blend_sec:    Seconds over which each window's seam is eased onto the prior frame.
     """
 
     _handle_cls = GestureDeployment
@@ -195,10 +213,12 @@ def __init__(
         _handle: handle.DeploymentHandle,
         context_sec: float = _CONTEXT_SEC,
         min_chunk_sec: float = _MIN_CHUNK_SEC,
+        blend_sec: float = _BLEND_SEC,
     ):
         super().__init__(_handle)
         self._context_sec = float(context_sec)
         self._min_chunk_sec = float(min_chunk_sec)
+        self._blend_sec = float(blend_sec)
 
         # Per-utterance sliding-window state. All sample counts are in the
         # source sample rate; resampling to 16 kHz happens once inside infer().
@@ -207,14 +227,23 @@ def __init__(
         self._buffer = np.empty(0, dtype=np.float32)  # trailing audio (ctx + unprocessed)
         self._buf_start = 0       # source-sr sample index of buffer[0] in utterance timeline
         self._emitted = 0         # source-sr samples whose motion has been emitted
-        self._trans_anchor: Optional[np.ndarray] = None  # last emitted trans, for continuity
 
-    def _reset(self) -> None:
+        # Last emitted frame per channel, used to ease the next segment's seam.
+        # These persist across the end-of-utterance reset (see _end_utterance) so
+        # gestures stay continuous when a new utterance starts.
+        self._trans_anchor: Optional[np.ndarray] = None
+        self._pose_anchor: Optional[np.ndarray] = None
+        self._expr_anchor: Optional[np.ndarray] = None
+
+    def _end_utterance(self) -> None:
+        # Reset only per-utterance buffering/timeline state. The seam anchors
+        # deliberately survive so the first window of the next utterance eases
+        # out of the pose currently on screen instead of snapping to EMAGE's
+        # cold-start rest pose.
         self._sr = None
         self._buffer = np.empty(0, dtype=np.float32)
         self._buf_start = 0
         self._emitted = 0
-        self._trans_anchor = None
 
     async def process(self, audio: Audio) -> AsyncGenerator[Motion, None]:  # type: ignore[override]
         # Each chunk arrives as its own process() task on the shared per-session
@@ -233,7 +262,7 @@ async def process(self, audio: Audio) -> AsyncGenerator[Motion, None]:  # type:
             if sr is None:
                 # Nothing buffered yet (e.g. a lone end marker). Reset and bail.
                 if end_of_utterance:
-                    self._reset()
+                    self._end_utterance()
                 return
 
             ctx_samples = int(self._context_sec * sr)
@@ -245,7 +274,7 @@ async def process(self, audio: Audio) -> AsyncGenerator[Motion, None]:  # type:
             # Wait for more audio unless this is the final flush of the utterance.
             if new_samples <= 0 or (not end_of_utterance and new_samples < min_new_samples):
                 if end_of_utterance:
-                    self._reset()
+                    self._end_utterance()
                 return
 
             motion = await self._infer_window(sr, ctx_samples, global_end)
@@ -253,7 +282,7 @@ async def process(self, audio: Audio) -> AsyncGenerator[Motion, None]:  # type:
                 yield motion
 
             if end_of_utterance:
-                self._reset()
+                self._end_utterance()
 
     async def _infer_window(
         self, sr: int, ctx_samples: int, global_end: int
@@ -267,27 +296,50 @@ async def _infer_window(
         motion: Motion = await self._handle.infer.remote(window, sr)
         total_frames = motion.poses.shape[0]
 
+        # EMAGE's internal windowing (EmageAudioModel.inference) emits a
+        # contiguous *prefix* of the requested window and silently drops up to
+        # ~2*seed_frames frames off the END whenever the trailing partial window
+        # is shorter than its motion seed. So the returned frames cover only
+        # [win_start, win_start + total_frames] — not necessarily the whole
+        # window. Map emission off the actual frame count, not the requested
+        # length: otherwise the freshest motion is dropped while _emitted skips
+        # over it, tearing a hole in the timeline that reads as a freeze-then-
+        # jump (and drifts gesture out of sync with speech).
+        covered_end = win_start + int(round(total_frames * sr / motion.fps))
+
         # Drop the leading frames that correspond to the context (already emitted).
         skip_sec = (self._emitted - win_start) / sr
         skip_frames = int(round(skip_sec * motion.fps))
         skip_frames = max(0, min(skip_frames, total_frames))
 
-        poses = motion.poses[skip_frames:]
-        expressions = motion.expressions[skip_frames:]
+        poses = motion.poses[skip_frames:].copy()
+        expressions = motion.expressions[skip_frames:].copy()
         trans = motion.trans[skip_frames:].copy()
 
-        # Advance the timeline even if rounding left no new frames to emit.
-        self._emitted = global_end
-        self._trim_buffer(ctx_samples, global_end)
+        # Advance only past audio the model actually turned into motion; any
+        # dropped tail stays buffered and is re-inferred next window, this time
+        # with real right-context. Cap at global_end so rounding can't overrun
+        # the buffer, and never move backwards.
+        self._emitted = min(global_end, max(self._emitted, covered_end))
+        self._trim_buffer(ctx_samples)
 
         if poses.shape[0] == 0:
             return None
 
-        # Rebase global translation onto the last emitted frame: every window
-        # restarts root motion near the origin, so without this the avatar would
-        # teleport back at each seam.
-        if self._trans_anchor is not None:
-            trans += self._trans_anchor - trans[0]
+        # Ease this segment's seam onto the last emitted frame. Poses and
+        # expressions snap because EMAGE regenerates the boundary without the
+        # right-context the next window will have (and cold-starts across
+        # utterances); root translation snaps because every window restarts near
+        # the origin. Blending all three keeps the seam continuous, and the
+        # decaying (vs. constant) offset returns to the model's intended motion
+        # so root translation doesn't accumulate drift across windows.
+        blend_frames = int(round(self._blend_sec * motion.fps))
+        self._blend_into(poses, self._pose_anchor, blend_frames)
+        self._blend_into(expressions, self._expr_anchor, blend_frames)
+        self._blend_into(trans, self._trans_anchor, blend_frames)
+
+        self._pose_anchor = poses[-1].copy()
+        self._expr_anchor = expressions[-1].copy()
         self._trans_anchor = trans[-1].copy()
 
         out = Motion(
@@ -299,9 +351,34 @@ async def _infer_window(
         )
         return out
 
-    def _trim_buffer(self, ctx_samples: int, global_end: int) -> None:
-        # Keep only the trailing context so the next window stays bounded.
-        keep_from = global_end - ctx_samples
+    @staticmethod
+    def _blend_into(
+        arr: np.ndarray, anchor: Optional[np.ndarray], blend_frames: int
+    ) -> None:
+        """Ease the start of a fresh segment onto ``anchor`` in place.
+
+        Frame 0 is shifted to equal ``anchor`` (a continuous seam) and the
+        offset fades to zero over ``blend_frames`` with a cosine ease — zero
+        slope at both ends, so neither the value nor its velocity jumps — after
+        which the segment is the model's untouched output.
+
+        Poses are SMPL-X axis-angle, so this is a linear blend in axis-angle
+        space: exact only for small seam offsets, which is the regime here since
+        consecutive frames are already close. A quaternion slerp would be needed
+        for large discontinuities but is overkill for seam clean-up.
+        """
+        if anchor is None or blend_frames <= 0 or arr.shape[0] == 0:
+            return
+        n = min(blend_frames, arr.shape[0])
+        w = 0.5 * (1.0 + np.cos(np.pi * np.linspace(0.0, 1.0, n, dtype=arr.dtype)))
+        arr[:n] += w[:, None] * (anchor - arr[0])
+
+    def _trim_buffer(self, ctx_samples: int) -> None:
+        # Keep one context window of audio before the last *emitted* sample so
+        # the next window stays bounded — but never discard audio whose motion
+        # hasn't been emitted yet (the dropped tail above lives between
+        # _emitted and the buffer end).
+        keep_from = self._emitted - ctx_samples
         if keep_from > self._buf_start:
             self._buffer = self._buffer[keep_from - self._buf_start :]
             self._buf_start = keep_from

From 818bad86c7704ae19fde16c30a0bfb78dd071a8b Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Tue, 2 Jun 2026 04:18:33 +0200
Subject: [PATCH 29/31] fixed(gesture): warmup now works when cold starting

---
 src/modules/gesture/gesture.py | 81 +++++++++++++++++++++++++++++-----
 1 file changed, 70 insertions(+), 11 deletions(-)

diff --git a/src/modules/gesture/gesture.py b/src/modules/gesture/gesture.py
index 33bde05..291f3ad 100644
--- a/src/modules/gesture/gesture.py
+++ b/src/modules/gesture/gesture.py
@@ -28,6 +28,15 @@
 # TTS keeps the lion's share. Only applied on CUDA when the value is set (>0).
 _GPU_MEM_FRACTION = float(os.environ.get("HURI_GESTURE_GPU_MEM_FRACTION", "0.0"))
 
+# Source sample rate used to warm the inference path. Real audio arrives from
+# the TTS (CosyVoice ≈ 24 kHz), so every real infer() call resamples to 16 kHz.
+# Warming at 16 kHz — as the old warmup did — skips that resample entirely,
+# leaving librosa's first-call cost to land on the first user-facing gesture.
+# Default to the TTS rate so the resample path is warmed too. Override if your
+# TTS uses a different rate (the exact value only affects which resampler
+# filter is pre-built; the model shapes follow the 16 kHz duration regardless).
+_WARMUP_SRC_SR = int(os.environ.get("HURI_GESTURE_WARMUP_SR", "24000"))
+
 
 @dataclass
 class Motion:
@@ -48,6 +57,19 @@ def __init__(
     ):
         print(f"[Gesture] importing torch...", flush=True)
         import torch
+
+        # Pin algorithm selection so the kernels warmed below are the same ones
+        # used at serve time. With cudnn.benchmark enabled, cuDNN re-autotunes
+        # for every new input length — and the sliding window feeds a different
+        # length almost every call — so the first inference at each new shape
+        # would stall on autotuning, defeating the warmup. Keep it off (also the
+        # default) and pin it explicitly. TF32 just speeds matmul/conv on
+        # Ampere+ with no meaningful quality impact for gesture.
+        torch.backends.cudnn.benchmark = False
+        if torch.cuda.is_available():
+            torch.backends.cuda.matmul.allow_tf32 = True
+            torch.backends.cudnn.allow_tf32 = True
+
         print(f"[Gesture] importing emage...", flush=True)
         from .emage import EmageAudioModel, EmageVAEConv, EmageVQModel, EmageVQVAEConv
 
@@ -99,21 +121,58 @@ def __init__(
         print(f"[Gesture] ready", flush=True)
 
     def _warmup(self) -> None:
-        # The first inference pays a one-time cold-start cost (CUDA context
-        # init, kernel JIT/load, cuDNN autotuning, caching-allocator warmup)
-        # that can take several seconds. Run a throwaway pass here so that cost
-        # is paid at startup — where we're already blocking on weight loads —
-        # rather than on the first user-facing gesture. Best-effort: a failure
-        # here must never prevent the deployment from coming up.
+        # The first inference pays one-time costs that are *shape- and
+        # path-dependent*: per-input-length kernel/primitive selection (cuDNN
+        # algo pick on GPU, oneDNN primitive build on CPU), librosa's first-call
+        # resampler build, the caching allocator's first growth, and CUDA
+        # context/kernel load. The old warmup ran a single 16 kHz, fixed-length,
+        # no-resample pass — so it warmed exactly one shape on a path real calls
+        # never take. The first real gesture (a different length, arriving at the
+        # TTS rate and therefore resampled) re-paid almost all of it, which is
+        # why the warmup "did nothing".
+        #
+        # Instead, sweep the window lengths the sliding window actually feeds
+        # infer() — from the small first-chunk window up to a full context+chunk
+        # steady-state window — on the *real* resample path, twice (the first
+        # pass pays the costs, the second confirms the path is hot), and
+        # synchronize so the GPU work is finished before we report ready.
+        # Best-effort: a failure here must never prevent the deployment coming up.
         import time
+        import torch
+
+        # Representative window lengths (seconds). The dominant per-window
+        # transformer forward is a fixed shape warmed by any window, but the
+        # trailing remainder forward varies with total length, so warm a spread.
+        secs = sorted({
+            round(s, 3)
+            for s in (
+                _MIN_CHUNK_SEC,                    # first tiny window of an utterance
+                _CONTEXT_SEC,                      # context-only sized window
+                _CONTEXT_SEC + _MIN_CHUNK_SEC,     # steady-state window
+                _CONTEXT_SEC + 2 * _MIN_CHUNK_SEC, # a larger fresh chunk
+            )
+            if s and s > 0
+        }) or [3.0]
 
         try:
-            # ~3 s of silence at 16 kHz exercises the full sliding-window path
-            # (multiple rounds + remainder) the way a real utterance would.
-            dummy = np.zeros(_EMAGE_SR * 3, dtype=np.float32)
             t0 = time.time()
-            self.infer(dummy, source_sr=_EMAGE_SR)
-            print(f"[Gesture] warmup done in {time.time() - t0:.2f}s", flush=True)
+            for pass_idx in range(2):
+                for s in secs:
+                    n = max(1, int(_WARMUP_SRC_SR * s))
+                    dummy = np.zeros(n, dtype=np.float32)
+                    ts = time.time()
+                    self.infer(dummy, source_sr=_WARMUP_SRC_SR)
+                    if self.device.type == "cuda":
+                        torch.cuda.synchronize(self.device)
+                    print(
+                        f"[Gesture] warmup pass {pass_idx} {s:.2f}s "
+                        f"({n} samples @ {_WARMUP_SRC_SR} Hz) in {time.time() - ts:.2f}s",
+                        flush=True,
+                    )
+            print(
+                f"[Gesture] warmup done ({len(secs)} shapes x2) in {time.time() - t0:.2f}s",
+                flush=True,
+            )
         except Exception as e:  # noqa: BLE001 — warmup is an optimisation, never fatal
             print(f"[Gesture] WARNING warmup failed: {e!r}", flush=True)
 

From fa4bb76f55b8314cad7118da082a37f9050345e6 Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Wed, 10 Jun 2026 03:53:47 +0200
Subject: [PATCH 30/31] feat(huri): refactorisation on modules and removing
 temp features

---
 src/core/events.py                           |  3 +-
 src/modules/factory.py                       |  4 +-
 src/modules/gesture/gesture.py               | 27 +++---
 src/modules/modules.py                       | 24 +----
 src/modules/rag/ingestion.py                 | 25 ++----
 src/modules/rag/qdrant_utils.py              | 31 +++++++
 src/modules/rag/rag.py                       | 48 +++++-----
 src/modules/text_to_speech/text_to_speech.py | 95 +++++++++-----------
 8 files changed, 121 insertions(+), 136 deletions(-)
 create mode 100644 src/modules/rag/qdrant_utils.py

diff --git a/src/core/events.py b/src/core/events.py
index 5df1bf8..dfaf66b 100644
--- a/src/core/events.py
+++ b/src/core/events.py
@@ -3,6 +3,8 @@
 from collections import defaultdict
 from dataclasses import dataclass
 
+import numpy as np
+
 from .module import Module
 
 logger = logging.getLogger("ray.serve")
@@ -91,7 +93,6 @@ async def _run(self, module: Module, data):
 def _summarize(item) -> str:
     """Short repr that avoids dumping full numpy arrays into the log."""
     cls = type(item).__name__
-    import numpy as np
     data = getattr(item, "data", None)
     if isinstance(data, np.ndarray):
         return f"{cls}(shape={data.shape}, dtype={data.dtype})"
diff --git a/src/modules/factory.py b/src/modules/factory.py
index 14acd99..fb0f58c 100644
--- a/src/modules/factory.py
+++ b/src/modules/factory.py
@@ -1,8 +1,10 @@
 from typing import Any, Dict, List, Mapping, Type
 
+from ray.serve import handle
+
 from src.core.dataclasses.config import ModuleConfig
 from src.core.events import EventData
-from src.core.module import Module, ModuleWithHandle, ModuleWithId, handle
+from src.core.module import Module, ModuleWithHandle, ModuleWithId
 
 
 class EventDataFactory:
diff --git a/src/modules/gesture/gesture.py b/src/modules/gesture/gesture.py
index 291f3ad..ecd50f1 100644
--- a/src/modules/gesture/gesture.py
+++ b/src/modules/gesture/gesture.py
@@ -55,7 +55,7 @@ def __init__(
         device: Optional[str] = None,
         gpu_mem_fraction: float = _GPU_MEM_FRACTION,
     ):
-        print(f"[Gesture] importing torch...", flush=True)
+        print(f"[Gesture] importing torch...")
         import torch
 
         # Pin algorithm selection so the kernels warmed below are the same ones
@@ -70,13 +70,13 @@ def __init__(
             torch.backends.cuda.matmul.allow_tf32 = True
             torch.backends.cudnn.allow_tf32 = True
 
-        print(f"[Gesture] importing emage...", flush=True)
+        print(f"[Gesture] importing emage...")
         from .emage import EmageAudioModel, EmageVAEConv, EmageVQModel, EmageVQVAEConv
 
         self.device = torch.device(
             device if device else ("cuda" if torch.cuda.is_available() else "cpu")
         )
-        print(f"[Gesture] device={self.device} hf_repo={hf_repo!r}", flush=True)
+        print(f"[Gesture] device={self.device} hf_repo={hf_repo!r}")
 
         # Manual GPU split: cap this process' share of GPU memory so the audio
         # (TTS) path keeps the rest. num_gpus in the Ray serveConfig handles
@@ -88,20 +88,19 @@ def __init__(
                 )
                 print(
                     f"[Gesture] GPU memory fraction capped at {gpu_mem_fraction:.2f}",
-                    flush=True,
                 )
             except Exception as e:  # noqa: BLE001 — best-effort knob, never fatal
-                print(f"[Gesture] WARNING could not cap GPU memory: {e!r}", flush=True)
+                print(f"[Gesture] WARNING could not cap GPU memory: {e!r}")
 
-        print("[Gesture] loading face_vq...", flush=True)
+        print("[Gesture] loading face_vq...")
         face_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/face").to(self.device)
-        print("[Gesture] loading upper_vq...", flush=True)
+        print("[Gesture] loading upper_vq...")
         upper_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/upper").to(self.device)
-        print("[Gesture] loading lower_vq...", flush=True)
+        print("[Gesture] loading lower_vq...")
         lower_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/lower").to(self.device)
-        print("[Gesture] loading hands_vq...", flush=True)
+        print("[Gesture] loading hands_vq...")
         hands_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/hands").to(self.device)
-        print("[Gesture] loading global_ae...", flush=True)
+        print("[Gesture] loading global_ae...")
         global_ae = EmageVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/global").to(self.device)
 
         self.motion_vq = EmageVQModel(
@@ -113,12 +112,12 @@ def __init__(
         )
         self.motion_vq.eval()
 
-        print("[Gesture] loading EmageAudioModel...", flush=True)
+        print("[Gesture] loading EmageAudioModel...")
         self.model = EmageAudioModel.from_pretrained(hf_repo).to(self.device)
         self.model.eval()
 
         self._warmup()
-        print(f"[Gesture] ready", flush=True)
+        print(f"[Gesture] ready")
 
     def _warmup(self) -> None:
         # The first inference pays one-time costs that are *shape- and
@@ -167,14 +166,12 @@ def _warmup(self) -> None:
                     print(
                         f"[Gesture] warmup pass {pass_idx} {s:.2f}s "
                         f"({n} samples @ {_WARMUP_SRC_SR} Hz) in {time.time() - ts:.2f}s",
-                        flush=True,
                     )
             print(
                 f"[Gesture] warmup done ({len(secs)} shapes x2) in {time.time() - t0:.2f}s",
-                flush=True,
             )
         except Exception as e:  # noqa: BLE001 — warmup is an optimisation, never fatal
-            print(f"[Gesture] WARNING warmup failed: {e!r}", flush=True)
+            print(f"[Gesture] WARNING warmup failed: {e!r}")
 
     def infer(self, audio_np: np.ndarray, source_sr: int = _EMAGE_SR) -> Motion:
         import torch
diff --git a/src/modules/modules.py b/src/modules/modules.py
index 04a020c..d8fefb9 100644
--- a/src/modules/modules.py
+++ b/src/modules/modules.py
@@ -1,33 +1,15 @@
-import logging
 from typing import Dict, Type
 
 from src.modules.rag.rag import RAG
 from src.modules.speech_to_text.microphone_vad import MIC
 from src.modules.speech_to_text.speech_to_text import STT
 from src.modules.speech_to_text.text_aggregator import TAG
+from src.modules.text_to_speech.text_to_speech import TTS
+from src.modules.gesture.gesture import Gesture
 
 from .factory import Module
 
-_LOG = logging.getLogger(__name__)
-
 
 def get_modules() -> Dict[str, Type[Module]]:
-    modules: Dict[str, Type[Module]] = {"mic": MIC, "stt": STT, "tag": TAG, "rag": RAG}
-
-    # The following imports may contain modules with custom dependencies, depending on the Dockerfile
-    # CPU doesn't need some dependencies, nor the AMD that isn't compatible
-    try:
-        from src.modules.text_to_speech.text_to_speech import TTS
-    except Exception as exc:  # noqa: BLE001
-        _LOG.info("Skipping TTS module: %s", exc)
-    else:
-        modules["tts"] = TTS
-
-    try:
-        from src.modules.gesture.gesture import Gesture
-    except Exception as exc:  # noqa: BLE001
-        _LOG.info("Skipping Gesture module: %s", exc)
-    else:
-        modules["gesture"] = Gesture
-
+    modules: Dict[str, Type[Module]] = {"mic": MIC, "stt": STT, "tag": TAG, "rag": RAG, "tts": TTS, "gesture": Gesture}
     return modules
diff --git a/src/modules/rag/ingestion.py b/src/modules/rag/ingestion.py
index 88d85a8..529c7ff 100644
--- a/src/modules/rag/ingestion.py
+++ b/src/modules/rag/ingestion.py
@@ -185,7 +185,8 @@ def ingest_chunks(
 def chunk_strat(text: str, args, model: Any) -> list[str] | Any:
     """Pick the right chunking strategy based on args."""
     if args.chunking == "semantic":
-        from semantic_chunker import SemanticChunker
+        # Thomas: I need to import here, bceause it takes too much time earlier, or use a jupyter notebook to do it instead
+        from .semantic_chunker import SemanticChunker
 
         chunker = SemanticChunker(
             model=model,
@@ -513,23 +514,11 @@ def main():
     print(f"User: {_user_id}")
 
     verify_ssl = not args.no_verify_ssl
-    # Parse the URL explicitly so QdrantClient gets the correct host/port/https.
-    # When given just "https://host" with no port, some qdrant-client versions
-    # silently fall back to their default port (6333) instead of 443, causing
-    # a timeout that looks like an SSL issue.
-    from urllib.parse import urlparse
-
-    _parsed = urlparse(args.qdrant_url)
-    _is_https = _parsed.scheme == "https"
-    _host = _parsed.hostname
-    _port = _parsed.port or (443 if _is_https else 6333)
-    client = QdrantClient(
-        host=_host,
-        port=_port,
-        https=_is_https,
-        verify=verify_ssl,
-        check_compatibility=verify_ssl,
-    )
+    try:
+        from .qdrant_utils import make_qdrant_client
+    except ImportError:
+        from qdrant_utils import make_qdrant_client
+    client = make_qdrant_client(args.qdrant_url, verify_ssl)
 
     # Lazy-load the model only if the command needs embeddings.
     # Commands that don't need it: list, delete, profile (doesn't use embeddings).
diff --git a/src/modules/rag/qdrant_utils.py b/src/modules/rag/qdrant_utils.py
new file mode 100644
index 0000000..080980e
--- /dev/null
+++ b/src/modules/rag/qdrant_utils.py
@@ -0,0 +1,31 @@
+"""Shared Qdrant client construction.
+
+Centralises the URL→client parsing used by both the RAGHandle deployment
+(``rag.py``) and the offline ingestion CLI (``ingestion.py``), so the port/SSL
+handling lives in exactly one place instead of being copy-pasted.
+"""
+
+from urllib.parse import urlparse
+
+from qdrant_client import QdrantClient
+
+with open("portal.tmp", 'w') as f:
+    f.write("WTFF SWORKS ,??,,\n")
+
+def make_qdrant_client(qdrant_url: str, verify_ssl: bool = True) -> QdrantClient:
+    """Build a :class:`QdrantClient` from a URL.
+
+    Parses the URL explicitly so the client gets the correct host/port/https.
+    When given just ``https://host`` with no port, some qdrant-client versions
+    silently fall back to their default port (6333) instead of 443, causing a
+    timeout that looks like an SSL issue — so derive the port from the scheme.
+    """
+    parsed = urlparse(qdrant_url)
+    is_https = parsed.scheme == "https"
+    return QdrantClient(
+        host=parsed.hostname,
+        port=parsed.port or (443 if is_https else 6333),
+        https=is_https,
+        verify=verify_ssl,
+        check_compatibility=verify_ssl,
+    )
diff --git a/src/modules/rag/rag.py b/src/modules/rag/rag.py
index b7e21f8..0e65bbe 100644
--- a/src/modules/rag/rag.py
+++ b/src/modules/rag/rag.py
@@ -1,4 +1,6 @@
 import json
+import os
+import traceback
 from dataclasses import dataclass, field
 from typing import Any, AsyncGenerator
 
@@ -10,6 +12,22 @@
 from src.modules.speech_to_text.events import Sentence
 from src.modules.text_to_speech.events import Token
 
+import httpx
+
+from qdrant_client.models import FieldCondition, Filter, MatchValue
+from .qdrant_utils import make_qdrant_client
+
+# Default character persona. Overridable per session via the `persona` key in the
+# client config's module args, or globally via HURI_RAG_DEFAULT_PERSONA in the
+# Serve app runtime_env.env_vars (see deploy values.yaml) — no rebuild needed.
+_DEFAULT_PERSONA = os.environ.get(
+    "HURI_RAG_DEFAULT_PERSONA",
+    "You are Mouse-Man, a witty, charismatic animated mouse character. "
+    "You are NOT an AI, an assistant, or a language model, and you must "
+    "never say that you are one. Stay fully in character at all times, "
+    "speak naturally and with personality, and never break the illusion.",
+)
+
 
 class RAGDeploymentConfig(BaseModel):
     qdrant_url: str = "http://localhost:6333"
@@ -51,22 +69,9 @@ def reconfigure(self, config: dict) -> None:
         self._apply_config()
 
     def _apply_config(self) -> None:
-        import httpx
-        from qdrant_client import QdrantClient
-
         cfg = self._cfg
-        from urllib.parse import urlparse
-
         self.embedding_url = cfg.embedding_url or cfg.llm_url
-        _p = urlparse(cfg.qdrant_url)
-        _is_https = _p.scheme == "https"
-        self._qdrant = QdrantClient(
-            host=_p.hostname,
-            port=_p.port or (443 if _is_https else 6333),
-            https=_is_https,
-            verify=cfg.verify_ssl,
-            check_compatibility=cfg.verify_ssl,
-        )
+        self._qdrant = make_qdrant_client(cfg.qdrant_url, cfg.verify_ssl)
         print(f"[RAGHandle] Connected to Qdrant at {cfg.qdrant_url}")
         self._embed_client = httpx.AsyncClient(timeout=30.0, verify=cfg.verify_ssl)
         self._llm_client = httpx.AsyncClient(timeout=120.0, verify=cfg.verify_ssl)
@@ -106,8 +111,6 @@ def _get_profile(self, collection: str, _user_id: str) -> list[str]:
         so they are always available to the prompt regardless of the question.
         Populated via `ingestion.py profile`.
         """
-        from qdrant_client.models import FieldCondition, Filter, MatchValue
-
         try:
             points, _ = self._qdrant.scroll(
                 collection_name=collection,
@@ -134,8 +137,6 @@ def _search(
     ) -> list[dict]:
         qdrant_filter: Any = None
         if filters:
-            from qdrant_client.models import FieldCondition, Filter, MatchValue
-
             conditions: Any = [
                 FieldCondition(key=k, match=MatchValue(value=v))
                 for k, v in filters.items()
@@ -168,14 +169,7 @@ def _build_prompt(
         preferences: dict,
         profile_facts: list[str] | None = None,
     ) -> tuple[str, str]:
-        # TEMP TEMP TEMP: this persona should be defined in the configuration
-        persona = preferences.get(
-            "persona",
-            "You are Rat-Man, a witty, charismatic animated rat character. "
-            "You are NOT an AI, an assistant, or a language model, and you must "
-            "never say that you are one. Stay fully in character at all times, "
-            "speak naturally and with personality, and never break the illusion.",
-        )
+        persona = preferences.get("persona", _DEFAULT_PERSONA)
         parts = [persona]
 
         if profile_facts:
@@ -336,8 +330,6 @@ async def _llm_stream(
 
     async def stream(self, query: RAGQuery) -> AsyncGenerator[str, None]:
         """Main streaming entry point — yields LLM text deltas."""
-        import traceback
-
         print(f"[RAG] Question: {query.question}")
 
         collection, filters = self._resolve_user_context(query._user_id)
diff --git a/src/modules/text_to_speech/text_to_speech.py b/src/modules/text_to_speech/text_to_speech.py
index 4b9baf2..26a4b52 100644
--- a/src/modules/text_to_speech/text_to_speech.py
+++ b/src/modules/text_to_speech/text_to_speech.py
@@ -1,10 +1,10 @@
 import asyncio
-import logging
 import os
 import queue
 import sys
+import traceback
 import uuid
-from typing import AsyncGenerator
+from typing import AsyncGenerator, Optional
 
 import numpy as np
 from ray import serve
@@ -14,35 +14,28 @@
 
 from .events import Audio, Token
 
-logger = logging.getLogger("ray.serve")
-logger.setLevel(os.environ.get("HURI_TTS_LOG_LEVEL", "INFO").upper())
-
-
-def _trace(msg: str) -> None:
-    """Belt-and-braces log: hits both the ray.serve logger AND stdout.
-
-    Ray Serve captures stdout per replica and surfaces it in the dashboard's
-    Logs tab — that's the path that survives any logger misconfiguration.
-    """
-    logger.info(msg)
-    print(f"[TTS] {msg}", flush=True)
-
-
 # Defaults — overridden by env vars in production (see README.md)
 _MODEL_PATH = os.environ.get(
     "HURI_MODEL_PATH", "/models/cosytts/FunAudioLLM/Fun-CosyVoice3-0.5B-2512"
 )
 _VOICE_SAMPLE_PATH = os.environ.get("HURI_VOICE_SAMPLE_PATH", "/assets/voice.wav")
-# CosyVoice3 expects "<instruction><|endofprompt|><transcript-of-voice.wav>". If the
-# config supplies a bare transcript (no marker), prepend the default instruction so the
-# transcript lands AFTER <|endofprompt|> — otherwise the LM treats it as an instruction
-# and intermittently renders it as speech (prompt leakage). The transcription is a must.
-_raw_transcript = os.environ["HURI_VOICE_TRANSCRIPT"]
-_VOICE_SAMPLE_TRANSCRIPT = (
-    _raw_transcript
-    if "<|endofprompt|>" in _raw_transcript
-    else f"You are a helpful assistant.<|endofprompt|>{_raw_transcript}"
-)
+_DEFAULT_INSTRUCTION = "You are a helpful assistant."
+
+
+def _normalize_transcript(raw: str) -> str:
+    """Make a reference transcript safe for CosyVoice3.
+
+    CosyVoice3 expects "<instruction><|endofprompt|><transcript-of-voice.wav>".
+    If the configured transcript supplies a bare transcript (no marker), prepend
+    the default instruction so the transcript lands AFTER <|endofprompt|> —
+    otherwise the LM treats it as an instruction and intermittently renders it as
+    speech (prompt leakage).
+    """
+    return (
+        raw
+        if "<|endofprompt|>" in raw
+        else f"{_DEFAULT_INSTRUCTION}<|endofprompt|>{raw}"
+    )
 
 _END_TEXT = object()   # sentinel pushed into the text queue to close synth
 _END_AUDIO = object()  # sentinel pushed into the audio queue when synth completes
@@ -64,22 +57,34 @@ def __init__(
         self,
         model_path: str = _MODEL_PATH,
         voice_sample_path: str = _VOICE_SAMPLE_PATH,
-        voice_sample_transcript: str = _VOICE_SAMPLE_TRANSCRIPT,
+        voice_sample_transcript: Optional[str] = None,
     ):
-        _trace(f"TTSDeployment init: model_path={model_path} voice={voice_sample_path} transcript={voice_sample_transcript}")
+        # Resolve the reference transcript here (deploy time on the GPU worker)
+        # rather than at module import: importing this module must not require
+        # HURI_VOICE_TRANSCRIPT, since modules.py imports it inside a broad
+        # try/except that would otherwise make TTS silently vanish from the
+        # pipeline when the var is unset. Fail loudly and locally instead.
+        if voice_sample_transcript is None:
+            raw = os.environ.get("HURI_VOICE_TRANSCRIPT")
+            if not raw:
+                raise RuntimeError(
+                    "HURI_VOICE_TRANSCRIPT is not set. The TTS deployment needs the "
+                    "transcript of the reference voice sample (voice.wav). Set it in "
+                    "the Serve app runtime_env.env_vars (see deploy values.yaml)."
+                )
+            voice_sample_transcript = raw
+        voice_sample_transcript = _normalize_transcript(voice_sample_transcript)
 
         cosy_dir = os.environ.get("HURI_COSY_DIR")
         if cosy_dir:
             matcha_path = os.path.join(cosy_dir, "third_party", "Matcha-TTS")
             if os.path.isdir(matcha_path) and matcha_path not in sys.path:
                 sys.path.insert(0, matcha_path)
-                logger.debug("Added Matcha-TTS path to sys.path: %s", matcha_path)
 
         from cosyvoice.cli.cosyvoice import CosyVoice3
 
         self.model = CosyVoice3(model_dir=model_path, load_trt=False)
         self.sample_rate: int = self.model.sample_rate
-        _trace(f"CosyVoice3 loaded (sample_rate={self.sample_rate})")
 
         self.prompt_speech = voice_sample_path
         self.prompt_text: str = voice_sample_transcript
@@ -91,33 +96,26 @@ async def get_sample_rate(self) -> int:
 
     async def start_session(self, session_id: str) -> None:
         self._text_queues[session_id] = queue.Queue()
-        _trace(f"[{session_id}] session started (active={len(self._text_queues)})")
 
     async def push_text(self, session_id: str, text: str, end: bool) -> None:
         q = self._text_queues.get(session_id)
         if q is None:
-            _trace(f"[{session_id}] WARNING push_text on unknown session (text={text!r} end={end})")
             return
         if text:
             q.put(text)
-            _trace(f"[{session_id}] push_text {text!r} (qsize={q.qsize()})")
         if end:
             q.put(_END_TEXT)
-            _trace(f"[{session_id}] push_text: end-of-stream sentinel")
 
     async def stream_audio(self, session_id: str) -> AsyncGenerator[Audio, None]:
         text_q = self._text_queues[session_id]
         loop = asyncio.get_running_loop()
         chunk_count = 0
-        _trace(f"[{session_id}] stream_audio: starting CosyVoice inference")
 
         def text_gen():
             while True:
                 item = text_q.get()
                 if item is _END_TEXT:
-                    _trace(f"[{session_id}] text_gen: received end sentinel")
                     return
-                _trace(f"[{session_id}] text_gen yielding: {item!r}")
                 yield item
 
         try:
@@ -134,18 +132,12 @@ def text_gen():
                 assert isinstance(result, dict)
                 chunk_count += 1
                 speech = result["tts_speech"].squeeze(0).numpy().astype(np.float32)
-                _trace(
-                    f"[{session_id}] audio chunk #{chunk_count}: "
-                    f"{speech.shape[0]} samples (~{speech.shape[0] / self.sample_rate:.2f}s)"
-                )
                 yield Audio(data=speech, sample_rate=self.sample_rate)
-        except Exception as e:
-            _trace(f"[{session_id}] stream_audio FAILED: {e!r}")
-            logger.exception("[%s] stream_audio failed", session_id)
+        except Exception:
+            traceback.print_exc()
             raise
         finally:
             self._text_queues.pop(session_id, None)
-            _trace(f"[{session_id}] stream_audio finished (chunks={chunk_count})")
 
 
 class TTS(ModuleWithHandle):
@@ -189,7 +181,7 @@ async def process(self, token: Token) -> AsyncGenerator[Audio, None]:  # type: i
             if is_first:
                 self._session_id = str(uuid.uuid4())
                 self._audio_q = asyncio.Queue()
-                print(f"[TTS-client] [{self._session_id}] opening new utterance session", flush=True)
+                print(f"[TTS-client] [{self._session_id}] opening new utterance session")
                 await self._handle.start_session.remote(self._session_id)
                 self._stream_task = asyncio.create_task(
                     self._drain_audio(self._session_id, self._audio_q)
@@ -198,7 +190,7 @@ async def process(self, token: Token) -> AsyncGenerator[Audio, None]:  # type: i
             sid = self._session_id
             audio_q = self._audio_q
             stream_task = self._stream_task
-            print(f"[TTS-client] [{sid}] push token: {token.text!r} (end={token.end})", flush=True)
+            print(f"[TTS-client] [{sid}] push token: {token.text!r} (end={token.end})")
             await self._handle.push_text.remote(sid, token.text, token.end)
 
         if not is_first:
@@ -212,10 +204,10 @@ async def process(self, token: Token) -> AsyncGenerator[Audio, None]:  # type: i
                 if item is _END_AUDIO:
                     break
                 count += 1
-                print(f"[TTS-client] [{sid}] yield chunk #{count}", flush=True)
+                print(f"[TTS-client] [{sid}] yield chunk #{count}")
                 yield item
             await stream_task
-            print(f"[TTS-client] [{sid}] utterance complete ({count} chunks)", flush=True)
+            print(f"[TTS-client] [{sid}] utterance complete ({count} chunks)")
 
             sample_rate = await self._handle.get_sample_rate.remote()
             yield Audio(data=np.array([], dtype=np.float32), sample_rate=sample_rate, end=True)
@@ -237,12 +229,11 @@ async def _drain_audio(self, session_id: str, audio_q: asyncio.Queue) -> None:
                 print(
                     f"[TTS-client] [{session_id}] drain received chunk #{count} "
                     f"pts={audio.pts:.3f}s next={pts:.3f}s",
-                    flush=True,
                 )
                 await audio_q.put(audio)
         except Exception as e:
-            print(f"[TTS-client] [{session_id}] drain task FAILED: {e!r}", flush=True)
+            print(f"[TTS-client] [{session_id}] drain task FAILED: {e!r}")
             raise
         finally:
             await audio_q.put(_END_AUDIO)
-            print(f"[TTS-client] [{session_id}] drain task finished", flush=True)
+            print(f"[TTS-client] [{session_id}] drain task finished")

From 93237f7264429e7edad21416d862e4bfa1c06c8c Mon Sep 17 00:00:00 2001
From: "thomas.pommier" <thomas.pommier04@hotmail.fr>
Date: Wed, 10 Jun 2026 03:54:35 +0200
Subject: [PATCH 31/31] fixed(rag): qdrant is installed on all dockerfiles

---
 deploy/Dockerfile.nvidia | 3 +++
 requirements-amd.txt     | 4 ++--
 serve_requirements.txt   | 7 +++++++
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/deploy/Dockerfile.nvidia b/deploy/Dockerfile.nvidia
index f9e704f..f0e6e4d 100644
--- a/deploy/Dockerfile.nvidia
+++ b/deploy/Dockerfile.nvidia
@@ -10,6 +10,9 @@ RUN pip install --no-cache-dir \
     --extra-index-url https://pypi.ngc.nvidia.com \
     -r requirements-nvidia.txt
 
+COPY serve_requirements.txt /app
+RUN pip install --no-cache-dir -r serve_requirements.txt
+
 USER root
 
 RUN apt-get update && apt-get install -y --no-install-recommends git \
diff --git a/requirements-amd.txt b/requirements-amd.txt
index 1c99594..7fa9358 100644
--- a/requirements-amd.txt
+++ b/requirements-amd.txt
@@ -5,8 +5,8 @@
 # Does NOT include CosyVoice2 or EMAGE — those run on the NVIDIA worker.
 
 # --- RAG / LLM ---
-httpx==0.27.2
-qdrant-client==1.18.0
+# httpx + qdrant-client moved to serve_requirements.txt so the head/base
+# controller can import rag.py without falling back to None (see that file).
 sentence-transformers==3.2.1
 pypdf==5.1.0
 semantic_chunker==0.2.0
diff --git a/serve_requirements.txt b/serve_requirements.txt
index 242576e..1ebe583 100644
--- a/serve_requirements.txt
+++ b/serve_requirements.txt
@@ -3,3 +3,10 @@ numpy
 click<8.2
 webrtcvad
 faster-whisper
+
+# RAG/LLM client deps. Kept in the *base* image (not just the AMD worker)
+# because the Serve controller on the head node imports rag.py to build the app;
+# if these are missing there, the module-level imports fall back to None and that
+# None is baked into the cloudpickled RAGHandle shipped to the AMD replica.
+httpx==0.27.2
+qdrant-client==1.18.0