From 2a3d9af810969ead8a4e598fcfc44596b2db0646 Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Wed, 27 May 2026 01:38:25 +0200 Subject: [PATCH 01/31] feat(gesture): all gesture ray module from EMAGE --- src/modules/gesture/__init__.py | 1 + src/modules/gesture/emage/__init__.py | 12 + src/modules/gesture/emage/configuration.py | 32 ++ src/modules/gesture/emage/modeling.py | 449 +++++++++++++++++++++ src/modules/gesture/emage/processing.py | 380 +++++++++++++++++ src/modules/gesture/gesture.py | 142 +++++++ 6 files changed, 1016 insertions(+) create mode 100644 src/modules/gesture/__init__.py create mode 100644 src/modules/gesture/emage/__init__.py create mode 100644 src/modules/gesture/emage/configuration.py create mode 100644 src/modules/gesture/emage/modeling.py create mode 100644 src/modules/gesture/emage/processing.py create mode 100644 src/modules/gesture/gesture.py diff --git a/src/modules/gesture/__init__.py b/src/modules/gesture/__init__.py new file mode 100644 index 0000000..8d8e6c2 --- /dev/null +++ b/src/modules/gesture/__init__.py @@ -0,0 +1 @@ +from .gesture import Gesture, Motion diff --git a/src/modules/gesture/emage/__init__.py b/src/modules/gesture/emage/__init__.py new file mode 100644 index 0000000..df5a071 --- /dev/null +++ b/src/modules/gesture/emage/__init__.py @@ -0,0 +1,12 @@ +from .configuration import EmageAudioConfig, EmageVAEConvConfig, EmageVQVAEConvConfig +from .modeling import EmageAudioModel, EmageVAEConv, EmageVQModel, EmageVQVAEConv + +__all__ = [ + "EmageAudioConfig", + "EmageAudioModel", + "EmageVAEConvConfig", + "EmageVAEConv", + "EmageVQVAEConvConfig", + "EmageVQVAEConv", + "EmageVQModel", +] diff --git a/src/modules/gesture/emage/configuration.py b/src/modules/gesture/emage/configuration.py new file mode 100644 index 0000000..ad97076 --- /dev/null +++ b/src/modules/gesture/emage/configuration.py @@ -0,0 +1,32 @@ +from omegaconf import OmegaConf +from transformers import PretrainedConfig + + +class 
EmageAudioConfig(PretrainedConfig): + model_type = "emage_audio" + + def __init__(self, config_obj=None, **kwargs): + if config_obj is not None: + cfg_dict = OmegaConf.to_container(config_obj, resolve=True) + kwargs.update(cfg_dict) + super().__init__(**kwargs) + + +class EmageVQVAEConvConfig(PretrainedConfig): + model_type = "emage_vqvaeconv" + + def __init__(self, config_obj=None, **kwargs): + if config_obj is not None: + cfg_dict = OmegaConf.to_container(config_obj, resolve=True) + kwargs.update(cfg_dict) + super().__init__(**kwargs) + + +class EmageVAEConvConfig(PretrainedConfig): + model_type = "emage_vaeconv" + + def __init__(self, config_obj=None, **kwargs): + if config_obj is not None: + cfg_dict = OmegaConf.to_container(config_obj, resolve=True) + kwargs.update(cfg_dict) + super().__init__(**kwargs) diff --git a/src/modules/gesture/emage/modeling.py b/src/modules/gesture/emage/modeling.py new file mode 100644 index 0000000..b4d71ab --- /dev/null +++ b/src/modules/gesture/emage/modeling.py @@ -0,0 +1,449 @@ +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import PreTrainedModel + +from .configuration import EmageAudioConfig, EmageVAEConvConfig, EmageVQVAEConvConfig +from .processing import ( + MLP, + PeriodicPositionalEncoding, + Quantizer, + VQDecoderV5, + VQEncoderV5, + VQEncoderV6, + WavEncoder, + axis_angle_to_rotation_6d, + matrix_to_axis_angle, + matrix_to_rotation_6d, + recover_from_mask_ts, + rotation_6d_to_axis_angle, + rotation_6d_to_matrix, + velocity2position, + axis_angle_to_matrix, +) + + +class EmageVAEConv(PreTrainedModel): + config_class = EmageVAEConvConfig + base_model_prefix = "emage_vaeconv" + + def __init__(self, config): + super().__init__(config) + self.encoder = VQEncoderV5(config) + self.decoder = VQDecoderV5(config) + + def forward(self, inputs): + pre_latent = self.encoder(inputs) + rec_pose = self.decoder(pre_latent) + return {"rec_pose": rec_pose} + + +class 
EmageVQVAEConv(PreTrainedModel): + config_class = EmageVQVAEConvConfig + base_model_prefix = "emage_vqvaeconv" + + def __init__(self, config): + super().__init__(config) + self.encoder = VQEncoderV5(config) + self.quantizer = Quantizer(config.vae_codebook_size, config.vae_length, config.vae_quantizer_lambda) + self.decoder = VQDecoderV5(config) + + def forward(self, inputs): + pre_latent = self.encoder(inputs) + embedding_loss, vq_latent, _, perplexity = self.quantizer(pre_latent) + rec_pose = self.decoder(vq_latent) + return {"poses_feat": vq_latent, "embedding_loss": embedding_loss, "perplexity": perplexity, "rec_pose": rec_pose} + + def map2index(self, inputs): + pre_latent = self.encoder(inputs) + return self.quantizer.map2index(pre_latent) + + def map2latent(self, inputs): + pre_latent = self.encoder(inputs) + index = self.quantizer.map2index(pre_latent) + return self.quantizer.get_codebook_entry(index) + + def decode(self, index): + z_q = self.quantizer.get_codebook_entry(index) + return self.decoder(z_q) + + def decode_from_latent(self, latent): + z_flattened = latent.contiguous().view(-1, self.quantizer.e_dim) + d = ( + torch.sum(z_flattened ** 2, dim=1, keepdim=True) + + torch.sum(self.quantizer.embedding.weight ** 2, dim=1) + - 2 * torch.matmul(z_flattened, self.quantizer.embedding.weight.t()) + ) + min_encoding_indices = torch.argmin(d, dim=1) + indices = min_encoding_indices.view(latent.shape[0], latent.shape[1]) + z_q = self.quantizer.get_codebook_entry(indices) + return self.decoder(z_q) + + +class EmageVQModel(nn.Module): + def __init__(self, face_model, upper_model, hands_model, lower_model, global_model): + super().__init__() + self.joint_mask_upper = [ + False, False, False, True, False, False, True, False, False, True, + False, False, True, True, True, True, True, True, True, True, + True, True, False, False, False, False, False, False, False, False, + False, False, False, False, False, False, False, False, False, False, + False, False, False, 
False, False, False, False, False, False, False, + False, False, False, False, False, + ] + self.joint_mask_lower = [ + True, True, True, False, True, True, False, True, True, False, + True, True, False, False, False, False, False, False, False, False, + False, False, False, False, False, False, False, False, False, False, + False, False, False, False, False, False, False, False, False, False, + False, False, False, False, False, False, False, False, False, False, + False, False, False, False, False, + ] + self.vq_model_face = face_model + self.vq_model_upper = upper_model + self.vq_model_hands = hands_model + self.vq_model_lower = lower_model + self.global_motion = global_model + + def spilt_inputs(self, smplx_body_rot6d, expression, tar_contact=None, tar_trans=None): + bs, t, j6 = smplx_body_rot6d.shape + smplx_body_rot6d = smplx_body_rot6d.reshape(bs, t, j6 // 6, 6) + jaw_rot6d = smplx_body_rot6d[:, :, 22:23, :].reshape(bs, t, 6) + face = torch.cat([jaw_rot6d, expression], dim=2) + upper_rot6d = smplx_body_rot6d[:, :, self.joint_mask_upper, :].reshape(bs, t, 78) + hands_rot6d = smplx_body_rot6d[:, :, 25:55, :].reshape(bs, t, 180) + lower_rot6d = smplx_body_rot6d[:, :, self.joint_mask_lower, :].reshape(bs, t, 54) + tar_contact = torch.zeros(bs, t, 4, device=smplx_body_rot6d.device) if tar_contact is None else tar_contact + tar_trans = torch.zeros(bs, t, 3, device=smplx_body_rot6d.device) if tar_trans is None else tar_trans + lower = torch.cat([lower_rot6d, tar_trans, tar_contact], dim=2) + return dict(face=face, upper=upper_rot6d, hands=hands_rot6d, lower=lower) + + def map2index(self, smplx_body_rot6d, expression, tar_contact=None, tar_trans=None): + inputs = self.spilt_inputs(smplx_body_rot6d, expression, tar_contact=tar_contact, tar_trans=tar_trans) + return dict( + face=self.vq_model_face.map2index(inputs["face"]), + upper=self.vq_model_upper.map2index(inputs["upper"]), + hands=self.vq_model_hands.map2index(inputs["hands"]), + 
lower=self.vq_model_lower.map2index(inputs["lower"]), + ) + + def map2latent(self, smplx_body_rot6d, expression, tar_contact=None, tar_trans=None): + inputs = self.spilt_inputs(smplx_body_rot6d, expression, tar_contact=tar_contact, tar_trans=tar_trans) + return dict( + face=self.vq_model_face.map2latent(inputs["face"]), + upper=self.vq_model_upper.map2latent(inputs["upper"]), + hands=self.vq_model_hands.map2latent(inputs["hands"]), + lower=self.vq_model_lower.map2latent(inputs["lower"]), + ) + + def decode( + self, + face_index=None, upper_index=None, hands_index=None, lower_index=None, + face_latent=None, upper_latent=None, hands_latent=None, lower_latent=None, + get_global_motion=False, ref_trans=None, + ): + for t in [face_index, upper_index, hands_index, lower_index, face_latent, upper_latent, hands_latent, lower_latent]: + if t is not None: + bs, seq = t.shape[:2] + break + + if face_index is not None: + face_mix = self.vq_model_face.decode(face_index) + face_jaw_6d, expression = face_mix[:, :, :6], face_mix[:, :, 6:] + face_jaw = rotation_6d_to_axis_angle(face_jaw_6d) + elif face_latent is not None: + face_mix = self.vq_model_face.decode_from_latent(face_latent) + face_jaw_6d, expression = face_mix[:, :, :6], face_mix[:, :, 6:] + face_jaw = rotation_6d_to_axis_angle(face_jaw_6d) + else: + face_jaw = torch.zeros(bs, seq, 3, device=self.vq_model_face.device) + expression = torch.zeros(bs, seq, 100, device=self.vq_model_face.device) + + if upper_index is not None: + upper_6d = self.vq_model_upper.decode(upper_index) + upper = rotation_6d_to_axis_angle(upper_6d.reshape(bs, seq, -1, 6)).reshape(bs, seq, -1) + elif upper_latent is not None: + upper_6d = self.vq_model_upper.decode_from_latent(upper_latent) + upper = rotation_6d_to_axis_angle(upper_6d.reshape(bs, seq, -1, 6)).reshape(bs, seq, -1) + else: + upper = torch.zeros(bs, seq, 39, device=self.vq_model_upper.device) + + if hands_index is not None: + hands_6d = self.vq_model_hands.decode(hands_index) + hands = 
rotation_6d_to_axis_angle(hands_6d.reshape(bs, seq, -1, 6)).reshape(bs, seq, -1) + elif hands_latent is not None: + hands_6d = self.vq_model_hands.decode_from_latent(hands_latent) + hands = rotation_6d_to_axis_angle(hands_6d.reshape(bs, seq, -1, 6)).reshape(bs, seq, -1) + else: + hands = torch.zeros(bs, seq, 90, device=self.vq_model_hands.device) + + if lower_index is not None: + lower_mix = self.vq_model_lower.decode(lower_index) + lower_6d, transfoot = lower_mix[:, :, :-7], lower_mix[:, :, -7:] + lower = rotation_6d_to_axis_angle(lower_6d.reshape(bs, seq, -1, 6)).reshape(bs, seq, -1) + elif lower_latent is not None: + lower_mix = self.vq_model_lower.decode_from_latent(lower_latent) + lower_6d, transfoot = lower_mix[:, :, :-7], lower_mix[:, :, -7:] + lower = rotation_6d_to_axis_angle(lower_6d.reshape(bs, seq, -1, 6)).reshape(bs, seq, -1) + else: + lower = torch.zeros(bs, seq, 27, device=self.vq_model_lower.device) + transfoot = torch.zeros(bs, seq, 7, device=self.vq_model_lower.device) + lower_6d = axis_angle_to_rotation_6d(lower.reshape(bs, seq, -1, 3)).reshape(bs, seq, -1) + lower_mix = torch.cat([lower_6d, transfoot], dim=-1) + + upper2all = recover_from_mask_ts(upper, self.joint_mask_upper) + hands2all = recover_from_mask_ts(hands, [False] * 25 + [True] * 30) + lower2all = recover_from_mask_ts(lower, self.joint_mask_lower) + + all_motion_axis_angle = upper2all + hands2all + lower2all + all_motion_axis_angle[:, :, 22 * 3:22 * 3 + 3] = face_jaw + all_motion_rot6d = axis_angle_to_rotation_6d(all_motion_axis_angle.reshape(bs, seq, 55, 3)).reshape(bs, seq, 55 * 6) + all_motion4inference = torch.cat([all_motion_rot6d, transfoot], dim=2) + + global_motion = None + if get_global_motion: + global_motion = self._get_global_motion(lower_mix, ref_trans) + + return dict( + expression=expression, + all_motion4inference=all_motion4inference, + motion_axis_angle=all_motion_axis_angle, + trans=global_motion, + ) + + def _get_global_motion(self, lower_body, ref_trans): + 
global_motion = self.global_motion(lower_body) + rec_trans_v_s = global_motion["rec_pose"][:, :, 54:57] + if len(ref_trans.shape) == 2: + ref_trans = ref_trans.unsqueeze(0).repeat(rec_trans_v_s.shape[0], 1, 1) + rec_x_trans = velocity2position(rec_trans_v_s[:, :, 0:1], 1 / 30, ref_trans[:, 0, 0:1]) + rec_z_trans = velocity2position(rec_trans_v_s[:, :, 2:3], 1 / 30, ref_trans[:, 0, 2:3]) + rec_y_trans = rec_trans_v_s[:, :, 1:2] + return torch.cat([rec_x_trans, rec_y_trans, rec_z_trans], dim=-1) + + +class EmageAudioModel(PreTrainedModel): + config_class = EmageAudioConfig + base_model_prefix = "emage_audio" + + def __init__(self, config: EmageAudioConfig): + super().__init__(config) + self.cfg = config + self.audio_encoder_face = WavEncoder(self.cfg.audio_f) + self.audio_encoder_body = WavEncoder(self.cfg.audio_f) + self.speaker_embedding_body = nn.Embedding(self.cfg.speaker_dims, self.cfg.hidden_size) + self.speaker_embedding_face = nn.Embedding(self.cfg.speaker_dims, self.cfg.hidden_size) + self.mask_embedding = nn.Parameter(torch.zeros(1, 1, self.cfg.pose_dims + 3 + 4)) + nn.init.normal_(self.mask_embedding, 0, self.cfg.hidden_size ** -0.5) + + args_top = copy.deepcopy(self.cfg) + args_top.vae_layer = 3 + args_top.vae_length = self.cfg.motion_f + args_top.vae_test_dim = self.cfg.pose_dims + 3 + 4 + self.motion_encoder = VQEncoderV6(args_top) + self.bodyhints_face = MLP(self.cfg.motion_f, self.cfg.hidden_size, self.cfg.motion_f) + self.bodyhints_body = MLP(self.cfg.motion_f, self.cfg.hidden_size, self.cfg.motion_f) + self.audio_body_motion_proj = nn.Linear(self.cfg.audio_f, self.cfg.hidden_size) + self.moton_proj = nn.Linear(self.cfg.motion_f, self.cfg.hidden_size) + self.position_embeddings = PeriodicPositionalEncoding(self.cfg.hidden_size, period=self.cfg.pose_length, max_seq_len=self.cfg.pose_length) + self.transformer_en_layer = nn.TransformerEncoderLayer(d_model=self.cfg.hidden_size, nhead=4, dim_feedforward=self.cfg.hidden_size * 2) + 
self.motion_self_encoder = nn.TransformerEncoder(self.transformer_en_layer, num_layers=1) + self.audio_motion_cross_attn_layer = nn.TransformerDecoderLayer(d_model=self.cfg.hidden_size, nhead=4, dim_feedforward=self.cfg.hidden_size * 2) + self.audio_motion_cross_attn = nn.TransformerDecoder(self.audio_motion_cross_attn_layer, num_layers=8) + self.motion2latent_upper = MLP(self.cfg.hidden_size, self.cfg.hidden_size, self.cfg.hidden_size) + self.motion2latent_hands = MLP(self.cfg.hidden_size, self.cfg.hidden_size, self.cfg.hidden_size) + self.motion2latent_lower = MLP(self.cfg.hidden_size, self.cfg.hidden_size, self.cfg.hidden_size) + self.body_motion_decoder_upper = nn.TransformerDecoder(self.audio_motion_cross_attn_layer, num_layers=1) + self.body_motion_decoder_hands = nn.TransformerDecoder(self.audio_motion_cross_attn_layer, num_layers=1) + self.body_motion_decoder_lower = nn.TransformerDecoder(self.audio_motion_cross_attn_layer, num_layers=1) + self.motion_out_proj_upper = nn.Linear(self.cfg.hidden_size, self.cfg.vae_codebook_size) + self.motion_out_proj_hands = nn.Linear(self.cfg.hidden_size, self.cfg.vae_codebook_size) + self.motion_out_proj_lower = nn.Linear(self.cfg.hidden_size, self.cfg.vae_codebook_size) + self.motion_cls_upper = MLP(self.cfg.vae_codebook_size, self.cfg.hidden_size, self.cfg.vae_codebook_size) + self.motion_cls_hands = MLP(self.cfg.vae_codebook_size, self.cfg.hidden_size, self.cfg.vae_codebook_size) + self.motion_cls_lower = MLP(self.cfg.vae_codebook_size, self.cfg.hidden_size, self.cfg.vae_codebook_size) + self.audio_face_motion_proj = nn.Linear(self.cfg.audio_f + self.cfg.motion_f, self.cfg.hidden_size) + self.face_motion_decoder = nn.TransformerDecoder(self.audio_motion_cross_attn_layer, num_layers=4) + self.face_out_proj = nn.Linear(self.cfg.hidden_size, self.cfg.vae_codebook_size) + self.face_cls = MLP(self.cfg.vae_codebook_size, self.cfg.hidden_size, self.cfg.vae_codebook_size) + + def forward(self, audio, speaker_id, masked_motion, 
mask, use_audio=True): + masked_embeddings = self.mask_embedding.expand_as(masked_motion) + masked_motion = torch.where(mask == 1, masked_embeddings, masked_motion) + + body_hint = self.motion_encoder(masked_motion) + body_hint_body = self.bodyhints_body(body_hint) + body_hint_face = self.bodyhints_face(body_hint) + + audio2face_fea = self.audio_encoder_face(audio) + audio2body_fea = self.audio_encoder_body(audio) + + if audio2face_fea.shape[1] > body_hint_face.shape[1]: + audio2face_fea = audio2face_fea[:, :body_hint_face.shape[1]] + if audio2body_fea.shape[1] > body_hint_face.shape[1]: + audio2body_fea = audio2body_fea[:, :body_hint_face.shape[1]] # trim the body audio stream itself so its cross-attention memory is no longer than the motion hints + + bs, t, _ = audio2face_fea.shape + + speaker_motion_fea_proj = self.speaker_embedding_body(speaker_id).repeat(1, t, 1) + speaker_face_fea_proj = self.speaker_embedding_face(speaker_id).repeat(1, t, 1) + + audio2face_fea_proj = self.audio_face_motion_proj(torch.cat([audio2face_fea, body_hint_face], dim=2)) + face_proj = self.position_embeddings(speaker_face_fea_proj) + decode_face = self.face_motion_decoder(tgt=face_proj.permute(1, 0, 2), memory=audio2face_fea_proj.permute(1, 0, 2)).permute(1, 0, 2) + face_latent = self.face_out_proj(decode_face) + classify_face = self.face_cls(face_latent) + + masked_motion_proj = self.moton_proj(body_hint_body) + masked_motion_proj = self.position_embeddings(masked_motion_proj) + masked_motion_proj = speaker_motion_fea_proj + masked_motion_proj + motion_fea = self.motion_self_encoder(masked_motion_proj.permute(1, 0, 2)).permute(1, 0, 2) + + audio2body_fea_proj = self.audio_body_motion_proj(audio2body_fea) + motion_fea = motion_fea + speaker_motion_fea_proj + motion_fea = self.position_embeddings(motion_fea) + audio2body_fea_cross = self.audio_motion_cross_attn(tgt=motion_fea.permute(1, 0, 2), memory=audio2body_fea_proj.permute(1, 0, 2)).permute(1, 0, 2) + if not use_audio: + audio2body_fea_cross = audio2body_fea_cross * 0.0 + motion_fea = motion_fea + audio2body_fea_cross + + 
upper_latent = self.motion2latent_upper(motion_fea) + hands_latent = self.motion2latent_hands(motion_fea) + lower_latent = self.motion2latent_lower(motion_fea) + + motion_upper_refine = self.body_motion_decoder_upper(tgt=upper_latent.permute(1, 0, 2) + speaker_motion_fea_proj.permute(1, 0, 2), memory=(hands_latent + lower_latent).permute(1, 0, 2)).permute(1, 0, 2) + motion_hands_refine = self.body_motion_decoder_hands(tgt=hands_latent.permute(1, 0, 2) + speaker_motion_fea_proj.permute(1, 0, 2), memory=(upper_latent + lower_latent).permute(1, 0, 2)).permute(1, 0, 2) + motion_lower_refine = self.body_motion_decoder_lower(tgt=lower_latent.permute(1, 0, 2) + speaker_motion_fea_proj.permute(1, 0, 2), memory=(upper_latent + hands_latent).permute(1, 0, 2)).permute(1, 0, 2) + upper_latent = self.motion_out_proj_upper(upper_latent + motion_upper_refine) + hands_latent = self.motion_out_proj_hands(hands_latent + motion_hands_refine) + lower_latent = self.motion_out_proj_lower(lower_latent + motion_lower_refine) + + classify_upper = self.motion_cls_upper(upper_latent) + classify_hands = self.motion_cls_hands(hands_latent) + classify_lower = self.motion_cls_lower(lower_latent) + + return { + "rec_face": face_latent, + "rec_upper": upper_latent, + "rec_hands": hands_latent, + "rec_lower": lower_latent, + "cls_face": classify_face, + "cls_upper": classify_upper, + "cls_hands": classify_hands, + "cls_lower": classify_lower, + } + + def inference(self, audio, speaker_id, vq_model, masked_motion=None, mask=None): + length = audio.shape[1] * 30 // 16000 + bs = audio.shape[0] + + fake_axis_angle = torch.zeros(bs, length, 55, 3).to(audio.device) + fake_motion = axis_angle_to_rotation_6d(fake_axis_angle).reshape(bs, length, -1) + fake_foot_and_trans = torch.zeros(bs, length, 7).to(audio.device) + fake_motion = torch.cat([fake_motion, fake_foot_and_trans], dim=-1) + if masked_motion is not None: + fake_motion[:, :masked_motion.shape[1]] = masked_motion + masked_motion = fake_motion + + 
fake_mask = torch.ones_like(masked_motion) + if mask is not None: + fake_mask[:, :mask.shape[1]] = mask + mask = fake_mask + + bs, total_len, c = masked_motion.shape + window = self.cfg.pose_length + pre_frames = self.cfg.seed_frames + rounds = (total_len - pre_frames) // (window - pre_frames) + remain = (total_len - pre_frames) % (window - pre_frames) + + rec_all_face, rec_all_lower, rec_all_upper, rec_all_hands = [], [], [], [] + cls_all_face, cls_all_lower, cls_all_upper, cls_all_hands = [], [], [], [] + + last_motion = masked_motion[:, :pre_frames, :] + + for i in range(rounds): + start_idx = i * (window - pre_frames) + end_idx = start_idx + window + + window_mask = mask[:, start_idx:end_idx, :].clone() + window_motion = masked_motion[:, start_idx:end_idx, :].clone() + window_motion[:, :pre_frames, :] = torch.where( + (window_mask[:, :pre_frames, :] == 0), + masked_motion[:, start_idx:start_idx + pre_frames, :], + last_motion, + ) + window_mask[:, :pre_frames, :] = 0 + + audio_slice_len = (end_idx - start_idx) * (16000 // 30) + audio_slice = audio[:, start_idx * (16000 // 30):start_idx * (16000 // 30) + audio_slice_len] + net_out_val = self.forward(audio_slice, speaker_id, masked_motion=window_motion, mask=window_mask, use_audio=True) + + _, cls_face = torch.max(F.log_softmax(net_out_val["cls_face"], dim=2), dim=2) + _, cls_upper = torch.max(F.log_softmax(net_out_val["cls_upper"], dim=2), dim=2) + _, cls_hands = torch.max(F.log_softmax(net_out_val["cls_hands"], dim=2), dim=2) + _, cls_lower = torch.max(F.log_softmax(net_out_val["cls_lower"], dim=2), dim=2) + + face_latent = net_out_val["rec_face"] if self.cfg.lf > 0 and self.cfg.cf == 0 else None + upper_latent = net_out_val["rec_upper"] if self.cfg.lu > 0 and self.cfg.cu == 0 else None + hands_latent = net_out_val["rec_hands"] if self.cfg.lh > 0 and self.cfg.ch == 0 else None + lower_latent = net_out_val["rec_lower"] if self.cfg.ll > 0 and self.cfg.cl == 0 else None + face_index = cls_face if self.cfg.cf > 0 
else None + upper_index = cls_upper if self.cfg.cu > 0 else None + hands_index = cls_hands if self.cfg.ch > 0 else None + lower_index = cls_lower if self.cfg.cl > 0 else None + + decode_dict = vq_model.decode( + face_latent=face_latent, upper_latent=upper_latent, + lower_latent=lower_latent, hands_latent=hands_latent, + face_index=face_index, upper_index=upper_index, + lower_index=lower_index, hands_index=hands_index, + ) + + last_motion = decode_dict["all_motion4inference"][:, -pre_frames:, :] + rec_all_face.append(net_out_val["rec_face"][:, :-pre_frames, :]) + rec_all_upper.append(net_out_val["rec_upper"][:, :-pre_frames, :]) + rec_all_hands.append(net_out_val["rec_hands"][:, :-pre_frames, :]) + rec_all_lower.append(net_out_val["rec_lower"][:, :-pre_frames, :]) + cls_all_face.append(net_out_val["cls_face"][:, :-pre_frames]) + cls_all_upper.append(net_out_val["cls_upper"][:, :-pre_frames]) + cls_all_hands.append(net_out_val["cls_hands"][:, :-pre_frames]) + cls_all_lower.append(net_out_val["cls_lower"][:, :-pre_frames]) + + if remain > pre_frames: + final_start = rounds * (window - pre_frames) + final_end = final_start + pre_frames + remain + + final_mask = mask[:, final_start:final_end, :].clone() + final_motion = masked_motion[:, final_start:final_end, :].clone() + final_motion[:, :pre_frames, :] = torch.where( + (final_mask[:, :pre_frames, :] == 0), + masked_motion[:, final_start:final_start + pre_frames, :], + last_motion, + ) + final_mask[:, :pre_frames, :] = 0 + + audio_slice_len = (final_end - final_start) * (16000 // 30) + audio_slice = audio[:, final_start * (16000 // 30):final_start * (16000 // 30) + audio_slice_len] + net_out_val = self.forward(audio_slice, speaker_id, masked_motion=final_motion, mask=final_mask, use_audio=True) + + rec_all_face.append(net_out_val["rec_face"]) + rec_all_upper.append(net_out_val["rec_upper"]) + rec_all_hands.append(net_out_val["rec_hands"]) + rec_all_lower.append(net_out_val["rec_lower"]) + 
cls_all_face.append(net_out_val["cls_face"]) + cls_all_upper.append(net_out_val["cls_upper"]) + cls_all_hands.append(net_out_val["cls_hands"]) + cls_all_lower.append(net_out_val["cls_lower"]) + + return { + "rec_face": torch.cat(rec_all_face, dim=1), + "rec_upper": torch.cat(rec_all_upper, dim=1), + "rec_hands": torch.cat(rec_all_hands, dim=1), + "rec_lower": torch.cat(rec_all_lower, dim=1), + "cls_face": torch.cat(cls_all_face, dim=1), + "cls_upper": torch.cat(cls_all_upper, dim=1), + "cls_hands": torch.cat(cls_all_hands, dim=1), + "cls_lower": torch.cat(cls_all_lower, dim=1), + } diff --git a/src/modules/gesture/emage/processing.py b/src/modules/gesture/emage/processing.py new file mode 100644 index 0000000..2d128b1 --- /dev/null +++ b/src/modules/gesture/emage/processing.py @@ -0,0 +1,380 @@ +import math +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def _copysign(a, b): + signs_differ = (a < 0) != (b < 0) + return torch.where(signs_differ, -a, a) + + +def _sqrt_positive_part(x): + ret = torch.zeros_like(x) + positive_mask = x > 0 + ret[positive_mask] = torch.sqrt(x[positive_mask]) + return ret + + +def matrix_to_quaternion(matrix): + if matrix.size(-1) != 3 or matrix.size(-2) != 3: + raise ValueError + m00 = matrix[..., 0, 0] + m11 = matrix[..., 1, 1] + m22 = matrix[..., 2, 2] + o0 = 0.5 * _sqrt_positive_part(1 + m00 + m11 + m22) + x = 0.5 * _sqrt_positive_part(1 + m00 - m11 - m22) + y = 0.5 * _sqrt_positive_part(1 - m00 + m11 - m22) + z = 0.5 * _sqrt_positive_part(1 - m00 - m11 + m22) + o1 = _copysign(x, matrix[..., 2, 1] - matrix[..., 1, 2]) + o2 = _copysign(y, matrix[..., 0, 2] - matrix[..., 2, 0]) + o3 = _copysign(z, matrix[..., 1, 0] - matrix[..., 0, 1]) + return torch.stack((o0, o1, o2, o3), -1) + + +def quaternion_to_axis_angle(quaternions): + norms = torch.norm(quaternions[..., 1:], p=2, dim=-1, keepdim=True) + half_angles = torch.atan2(norms, quaternions[..., :1]) + angles = 2 * half_angles + eps = 1e-6 + small_angles = 
angles.abs() < eps + sin_half_angles_over_angles = torch.empty_like(angles) + sin_half_angles_over_angles[~small_angles] = ( + torch.sin(half_angles[~small_angles]) / angles[~small_angles] + ) + sin_half_angles_over_angles[small_angles] = ( + 0.5 - (angles[small_angles] * angles[small_angles]) / 48 + ) + return quaternions[..., 1:] / sin_half_angles_over_angles + + +def matrix_to_axis_angle(matrix): + return quaternion_to_axis_angle(matrix_to_quaternion(matrix)) + + +def rotation_6d_to_matrix(d6: torch.Tensor) -> torch.Tensor: + a1, a2 = d6[..., :3], d6[..., 3:] + b1 = F.normalize(a1, dim=-1) + b2 = a2 - (b1 * a2).sum(-1, keepdim=True) * b1 + b2 = F.normalize(b2, dim=-1) + b3 = torch.cross(b1, b2, dim=-1) + return torch.stack((b1, b2, b3), dim=-2) + + +def rotation_6d_to_axis_angle(rot6d): + return matrix_to_axis_angle(rotation_6d_to_matrix(rot6d)) + + +def matrix_to_rotation_6d(matrix: torch.Tensor) -> torch.Tensor: + return matrix[..., :2, :].clone().reshape(*matrix.size()[:-2], 6) + + +def axis_angle_to_quaternion(axis_angle): + angles = torch.norm(axis_angle, p=2, dim=-1, keepdim=True) + half_angles = 0.5 * angles + eps = 1e-6 + small_angles = angles.abs() < eps + sin_half_angles_over_angles = torch.empty_like(angles) + sin_half_angles_over_angles[~small_angles] = ( + torch.sin(half_angles[~small_angles]) / angles[~small_angles] + ) + sin_half_angles_over_angles[small_angles] = ( + 0.5 - (angles[small_angles] * angles[small_angles]) / 48 + ) + quaternions = torch.cat( + [torch.cos(half_angles), axis_angle * sin_half_angles_over_angles], dim=-1 + ) + return quaternions + + +def quaternion_to_matrix(quaternions): + r, i, j, k = torch.unbind(quaternions, -1) + two_s = 2.0 / (quaternions * quaternions).sum(-1) + o = torch.stack( + ( + 1 - two_s * (j * j + k * k), + two_s * (i * j - k * r), + two_s * (i * k + j * r), + two_s * (i * j + k * r), + 1 - two_s * (i * i + k * k), + two_s * (j * k - i * r), + two_s * (i * k - j * r), + two_s * (j * k + i * r), + 1 - two_s 
* (i * i + j * j), + ), + -1, + ) + return o.reshape(quaternions.shape[:-1] + (3, 3)) + + +def axis_angle_to_matrix(axis_angle): + return quaternion_to_matrix(axis_angle_to_quaternion(axis_angle)) + + +def axis_angle_to_rotation_6d(axis_angle): + return matrix_to_rotation_6d(axis_angle_to_matrix(axis_angle)) + + +def velocity2position(data_seq, dt, init_pos): + res_trans = [] + for i in range(data_seq.shape[1]): + if i == 0: + res_trans.append(init_pos.unsqueeze(1)) + else: + res = data_seq[:, i - 1:i] * dt + res_trans[-1] + res_trans.append(res) + return torch.cat(res_trans, dim=1) + + +def recover_from_mask_ts(selected_motion: torch.Tensor, mask: list) -> torch.Tensor: + device = selected_motion.device + dtype = selected_motion.dtype + mask_arr = torch.tensor(mask, dtype=torch.bool, device=device) + j = len(mask_arr) + sum_mask = mask_arr.sum().item() + c_channels = selected_motion.shape[-1] // sum_mask + new_shape = selected_motion.shape[:-1] + (sum_mask, c_channels) + selected_motion = selected_motion.reshape(new_shape) + out_shape = list(selected_motion.shape[:-2]) + [j, c_channels] + recovered = torch.zeros(out_shape, dtype=dtype, device=device) + recovered[..., mask_arr, :] = selected_motion + final_shape = list(recovered.shape[:-2]) + [j * c_channels] + recovered = recovered.reshape(final_shape) + return recovered + + +class Quantizer(nn.Module): + def __init__(self, n_e, e_dim, beta): + super().__init__() + self.e_dim = e_dim + self.n_e = n_e + self.beta = beta + self.embedding = nn.Embedding(self.n_e, self.e_dim) + self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e) + + def forward(self, z): + assert z.shape[-1] == self.e_dim + z_flattened = z.contiguous().view(-1, self.e_dim) + d = ( + torch.sum(z_flattened ** 2, dim=1, keepdim=True) + + torch.sum(self.embedding.weight ** 2, dim=1) + - 2 * torch.matmul(z_flattened, self.embedding.weight.t()) + ) + min_encoding_indices = torch.argmin(d, dim=1) + z_q = 
self.embedding(min_encoding_indices).view(z.shape) + loss = torch.mean((z_q - z.detach()) ** 2) + self.beta * torch.mean((z_q.detach() - z) ** 2) + z_q = z + (z_q - z).detach() + min_encodings = F.one_hot(min_encoding_indices, self.n_e).type(z.dtype) + e_mean = torch.mean(min_encodings, dim=0) + perplexity = torch.exp(-torch.sum(e_mean * torch.log(e_mean + 1e-10))) + return loss, z_q, min_encoding_indices, perplexity + + def map2index(self, z): + assert z.shape[-1] == self.e_dim + z_flattened = z.contiguous().view(-1, self.e_dim) + d = ( + torch.sum(z_flattened ** 2, dim=1, keepdim=True) + + torch.sum(self.embedding.weight ** 2, dim=1) + - 2 * torch.matmul(z_flattened, self.embedding.weight.t()) + ) + min_encoding_indices = torch.argmin(d, dim=1) + return min_encoding_indices.reshape(z.shape[0], -1) + + def get_codebook_entry(self, indices): + index_flattened = indices.view(-1) + z_q = self.embedding(index_flattened) + z_q = z_q.view(indices.shape + (self.e_dim,)).contiguous() + return z_q + + +def init_weight(m): + if isinstance(m, (nn.Conv1d, nn.Linear, nn.ConvTranspose1d)): + nn.init.xavier_normal_(m.weight) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + +class ResBlock(nn.Module): + def __init__(self, channel): + super().__init__() + self.model = nn.Sequential( + nn.Conv1d(channel, channel, 3, 1, 1), + nn.LeakyReLU(0.2, True), + nn.Conv1d(channel, channel, 3, 1, 1), + ) + + def forward(self, x): + return self.model(x) + x + + +class VQEncoderV5(nn.Module): + def __init__(self, args): + super().__init__() + n_down = args.vae_layer + channels = [args.vae_length] * n_down + input_size = args.vae_test_dim + layers = [ + nn.Conv1d(input_size, channels[0], 3, 1, 1), + nn.LeakyReLU(0.2, True), + ResBlock(channels[0]), + ] + for i in range(1, n_down): + layers += [ + nn.Conv1d(channels[i - 1], channels[i], 3, 1, 1), + nn.LeakyReLU(0.2, True), + ResBlock(channels[i]), + ] + self.main = nn.Sequential(*layers) + self.main.apply(init_weight) + + def 
forward(self, inputs): + inputs = inputs.permute(0, 2, 1) + outputs = self.main(inputs).permute(0, 2, 1) + return outputs + + +class VQEncoderV6(nn.Module): + def __init__(self, args): + super().__init__() + n_down = args.vae_layer + channels = [args.vae_length] * n_down + input_size = args.vae_test_dim + layers = [ + nn.Conv1d(input_size, channels[0], 3, 1, 1), + nn.LeakyReLU(0.2, True), + ResBlock(channels[0]), + ] + for i in range(1, n_down): + layers += [ + nn.Conv1d(channels[i - 1], channels[i], 3, 1, 1), + nn.LeakyReLU(0.2, True), + ResBlock(channels[i]), + ] + self.main = nn.Sequential(*layers) + self.main.apply(init_weight) + + def forward(self, inputs): + inputs = inputs.permute(0, 2, 1) + outputs = self.main(inputs).permute(0, 2, 1) + return outputs + + +class VQDecoderV5(nn.Module): + def __init__(self, args): + super().__init__() + n_up = args.vae_layer + channels = [args.vae_length] * n_up + [args.vae_test_dim] + input_size = args.vae_length + n_resblk = 2 + if input_size == channels[0]: + layers = [] + else: + layers = [nn.Conv1d(input_size, channels[0], 3, 1, 1)] + for i in range(n_resblk): + layers += [ResBlock(channels[0])] + for i in range(n_up): + layers += [ + nn.Conv1d(channels[i], channels[i + 1], 3, 1, 1), + nn.LeakyReLU(0.2, True), + ] + layers += [nn.Conv1d(channels[-1], channels[-1], 3, 1, 1)] + self.main = nn.Sequential(*layers) + self.main.apply(init_weight) + + def forward(self, inputs): + inputs = inputs.permute(0, 2, 1) + outputs = self.main(inputs).permute(0, 2, 1) + return outputs + + +class BasicBlock(nn.Module): + def __init__(self, inplanes, planes, ker_size, stride=1, downsample=None, dilation=1, first_dilation=None, act_layer=nn.LeakyReLU, norm_layer=nn.BatchNorm1d): + super().__init__() + self.conv1 = nn.Conv1d( + inplanes, planes, kernel_size=ker_size, stride=stride, + padding=first_dilation, dilation=dilation, bias=True, + ) + self.bn1 = norm_layer(planes) + self.act1 = act_layer(inplace=True) + self.conv2 = nn.Conv1d( + 
planes, planes, kernel_size=ker_size, padding=ker_size // 2, + dilation=dilation, bias=True, + ) + self.bn2 = norm_layer(planes) + self.act2 = act_layer(inplace=True) + if downsample is not None: + self.downsample = nn.Sequential( + nn.Conv1d(inplanes, planes, stride=stride, kernel_size=ker_size, + padding=first_dilation, dilation=dilation, bias=True), + norm_layer(planes), + ) + else: + self.downsample = None + + def forward(self, x): + shortcut = x + x = self.conv1(x) + x = self.bn1(x) + x = self.act1(x) + x = self.conv2(x) + x = self.bn2(x) + if self.downsample is not None: + shortcut = self.downsample(shortcut) + x += shortcut + x = self.act2(x) + return x + + +class WavEncoder(nn.Module): + def __init__(self, out_dim, audio_in=1): + super().__init__() + self.out_dim = out_dim + self.feat_extractor = nn.Sequential( + BasicBlock(audio_in, out_dim // 4, 15, 5, first_dilation=1600, downsample=True), + BasicBlock(out_dim // 4, out_dim // 4, 15, 6, first_dilation=0, downsample=True), + BasicBlock(out_dim // 4, out_dim // 4, 15, 1, first_dilation=7), + BasicBlock(out_dim // 4, out_dim // 2, 15, 6, first_dilation=0, downsample=True), + BasicBlock(out_dim // 2, out_dim // 2, 15, 1, first_dilation=7), + BasicBlock(out_dim // 2, out_dim, 15, 3, first_dilation=0, downsample=True), + ) + + def forward(self, wav_data): + if wav_data.dim() == 2: + wav_data = wav_data.unsqueeze(1) + else: + wav_data = wav_data.transpose(1, 2) + out = self.feat_extractor(wav_data) + return out.transpose(1, 2) + + +class MLP(nn.Module): + def __init__(self, in_dim, middle_dim, out_dim): + super().__init__() + self.fc1 = nn.Linear(in_dim, middle_dim) + self.fc2 = nn.Linear(middle_dim, out_dim) + self.act = nn.LeakyReLU(0.1, True) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.fc2(x) + return x + + +class PeriodicPositionalEncoding(nn.Module): + def __init__(self, d_model, dropout=0.1, period=15, max_seq_len=60): + super().__init__() + self.dropout = 
nn.Dropout(p=dropout) + pe = torch.zeros(period, d_model) + position = torch.arange(0, period, dtype=torch.float).unsqueeze(1) + div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) + pe[:, 0::2] = torch.sin(position * div_term) + pe[:, 1::2] = torch.cos(position * div_term) + pe = pe.unsqueeze(0) + repeat_num = (max_seq_len // period) + 1 + pe = pe.repeat(1, repeat_num, 1) + self.register_buffer('pe', pe) + + def forward(self, x): + x = x + self.pe[:, :x.size(1), :] + return self.dropout(x) diff --git a/src/modules/gesture/gesture.py b/src/modules/gesture/gesture.py new file mode 100644 index 0000000..4e2a380 --- /dev/null +++ b/src/modules/gesture/gesture.py @@ -0,0 +1,142 @@ +import asyncio +import os +from dataclasses import dataclass +from typing import AsyncGenerator, Optional + +import numpy as np +from ray import serve +from ray.serve import handle + +from src.core.module import Module, ModuleWithHandle +from src.modules.text_to_speech.text_to_speech import Audio + + +_HF_REPO = os.environ.get("HURI_EMAGE_REPO", "H-Liu1997/emage_audio") +_EMAGE_SR = 16000 # EMAGE expects 16 kHz mono audio + + +@dataclass +class Motion: + poses: np.ndarray # (t, 165) SMPL-X axis-angle, 55 joints × 3 + expressions: np.ndarray # (t, 100) facial expression coefficients + trans: np.ndarray # (t, 3) global root translation + fps: int = 30 + + +@serve.deployment(name="GestureGeneration") +class GestureDeployment: + def __init__( + self, + hf_repo: str = _HF_REPO, + device: Optional[str] = None, + ): + import torch + from .emage import EmageAudioModel, EmageVAEConv, EmageVQModel, EmageVQVAEConv + + self.device = torch.device( + device if device else ("cuda" if torch.cuda.is_available() else "cpu") + ) + + face_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/face").to(self.device) + upper_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/upper").to(self.device) + lower_vq = EmageVQVAEConv.from_pretrained(hf_repo, 
subfolder="emage_vq/lower").to(self.device) + hands_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/hands").to(self.device) + global_ae = EmageVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/global").to(self.device) + + self.motion_vq = EmageVQModel( + face_model=face_vq, + upper_model=upper_vq, + lower_model=lower_vq, + hands_model=hands_vq, + global_model=global_ae, + ) + self.motion_vq.eval() + + self.model = EmageAudioModel.from_pretrained(hf_repo).to(self.device) + self.model.eval() + + def infer(self, audio_np: np.ndarray) -> Motion: + import torch + import torch.nn.functional as F + + audio_ts = torch.from_numpy(audio_np).to(self.device).unsqueeze(0) + speaker_id = torch.zeros(1, 1, dtype=torch.long, device=self.device) + + with torch.no_grad(): + ref_trans = torch.zeros(1, 1, 3, device=self.device) + latent_dict = self.model.inference(audio_ts, speaker_id, self.motion_vq) + + cfg = self.model.cfg + face_latent = latent_dict["rec_face"] if cfg.lf > 0 and cfg.cf == 0 else None + upper_latent = latent_dict["rec_upper"] if cfg.lu > 0 and cfg.cu == 0 else None + hands_latent = latent_dict["rec_hands"] if cfg.lh > 0 and cfg.ch == 0 else None + lower_latent = latent_dict["rec_lower"] if cfg.ll > 0 and cfg.cl == 0 else None + face_index = torch.max(F.log_softmax(latent_dict["cls_face"], dim=2), dim=2)[1] if cfg.cf > 0 else None + upper_index = torch.max(F.log_softmax(latent_dict["cls_upper"], dim=2), dim=2)[1] if cfg.cu > 0 else None + hands_index = torch.max(F.log_softmax(latent_dict["cls_hands"], dim=2), dim=2)[1] if cfg.ch > 0 else None + lower_index = torch.max(F.log_softmax(latent_dict["cls_lower"], dim=2), dim=2)[1] if cfg.cl > 0 else None + + all_pred = self.motion_vq.decode( + face_latent=face_latent, upper_latent=upper_latent, + lower_latent=lower_latent, hands_latent=hands_latent, + face_index=face_index, upper_index=upper_index, + lower_index=lower_index, hands_index=hands_index, + get_global_motion=True, ref_trans=ref_trans[:, 0], 
+ ) + + t = all_pred["motion_axis_angle"].shape[1] + return Motion( + poses=all_pred["motion_axis_angle"].cpu().numpy().reshape(t, -1), + expressions=all_pred["expression"].cpu().numpy().reshape(t, -1), + trans=all_pred["trans"].cpu().numpy().reshape(t, -1), + ) + + +class Gesture(ModuleWithHandle): + """Gesture Module + + Consumes streaming Audio chunks produced by TTS and generates whole-body + SMPL-X motion using the EMAGE audio-to-gesture model. + + Audio chunks are buffered until TTS signals the end of an utterance + (Audio.end == True). At that point the full waveform is passed to EMAGE + and a single Motion object is yielded. + + input: audio (Audio) + output: motion (Motion) + + :hf_repo: HuggingFace repository to load EMAGE weights from. + :device: PyTorch device string; defaults to CUDA when available. + """ + + _handle_cls = GestureDeployment + input_type = "audio" + output_type = "motion" + + def __init__( + self, + handle: handle.DeploymentHandle, + ): + super().__init__(handle) + self._chunks: list[np.ndarray] = [] + + async def process(self, audio: Audio) -> AsyncGenerator[Motion, None]: # type: ignore[override] + import librosa + + if audio.data.size > 0: + chunk = audio.data + if audio.sample_rate != _EMAGE_SR: + chunk = librosa.resample(chunk, orig_sr=audio.sample_rate, target_sr=_EMAGE_SR) + self._chunks.append(chunk.astype(np.float32)) + + if not audio.end: + return + + if not self._chunks: + return + + full_audio = np.concatenate(self._chunks) + self._chunks = [] + + motion = await self.handle.infer.remote(full_audio) + yield motion From 17098acf07fc5c35775a25248cea06c1ea2b99ab Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Wed, 27 May 2026 01:39:23 +0200 Subject: [PATCH 02/31] feat(tts): all files related to CosyTTS --- src/modules/text_to_speech/__init__.py | 0 src/modules/text_to_speech/text_to_speech.py | 156 +++++++++++++++++++ 2 files changed, 156 insertions(+) create mode 100644 src/modules/text_to_speech/__init__.py create mode 
100644 src/modules/text_to_speech/text_to_speech.py diff --git a/src/modules/text_to_speech/__init__.py b/src/modules/text_to_speech/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/modules/text_to_speech/text_to_speech.py b/src/modules/text_to_speech/text_to_speech.py new file mode 100644 index 0000000..07c7af5 --- /dev/null +++ b/src/modules/text_to_speech/text_to_speech.py @@ -0,0 +1,156 @@ +import asyncio +import os +import re +from dataclasses import dataclass +from typing import AsyncGenerator, Optional + +import numpy as np +from ray import serve +from ray.serve import handle + +from src.core.module import Module, ModuleWithHandle + + +# Defaults — overridden by env vars in production (see README.md) +_MODEL_PATH = os.environ.get("HURI_MODEL_PATH", "/models/cosytts/iic/CosyVoice2-0.5B") +_VOICE_SAMPLE_PATH = os.environ.get("HURI_VOICE_SAMPLE_PATH", "/assets/voice.wav") +_VOICE_SAMPLE_TRANSCRIPT = os.environ.get( + "HURI_VOICE_TRANSCRIPT", "Hello, this is my voice sample for cloning." +) + +# Hard endings (.!?) trigger synthesis immediately; soft endings (,;:) only after +# min_clause_chars are buffered, to avoid synthesizing very short fragments. 
+_HARD_END_RE = re.compile(r'[.!?]["\']?\s+') +_SOFT_END_RE = re.compile(r'[,;:]\s+') + +_DONE = object() # sentinel for exhausted sync generator + + +@dataclass +class Token: + text: str + end: bool # True on the last token of an LLM stream + + +@dataclass +class Audio: + data: np.ndarray # float32, values in [-1.0, 1.0] + sample_rate: int + end: bool = False # True on the last chunk of an utterance + + +@serve.deployment(name="TTS") +class TTSDeployment: + def __init__( + self, + model_path: str = _MODEL_PATH, + voice_sample_path: str = _VOICE_SAMPLE_PATH, + voice_sample_transcript: str = _VOICE_SAMPLE_TRANSCRIPT, + ): + from cosyvoice.cli.cosyvoice import CosyVoice2 + from cosyvoice.utils.file_utils import load_wav + + self.model = CosyVoice2(model_path, load_jit=False, load_trt=False) + self.sample_rate: int = self.model.sample_rate + + self.prompt_speech = load_wav(voice_sample_path, 16000) + self.prompt_text: str = voice_sample_transcript + + async def synthesize(self, text: str) -> AsyncGenerator[Audio, None]: + """Run CosyVoice2 streaming inference and yield Audio chunks. + + The synchronous CosyVoice2 generator runs in a thread-pool executor so + it does not block the asyncio event loop between chunks. + """ + loop = asyncio.get_running_loop() + gen = self.model.inference_zero_shot( + text, + self.prompt_text, + self.prompt_speech, + stream=True, + ) + while True: + result = await loop.run_in_executor(None, next, gen, _DONE) + if result is _DONE: + break + yield Audio( + data=result["tts_speech"].squeeze(0).numpy().astype(np.float32), + sample_rate=self.sample_rate, + ) + + async def get_sample_rate(self) -> int: + return self.sample_rate + + +class TTS(ModuleWithHandle): + """TTS Module + + Stream text tokens in, stream audio chunks out using CosyVoice2 zero-shot + voice cloning. + + Buffers incoming tokens and synthesizes as soon as a sentence or clause + boundary is detected. 
Audio chunks are yielded immediately as CosyVoice2 + produces them, so playback can start before synthesis is complete. + + Compatible with both the Ray Serve event graph (async generator support in + EventGraph._run) and direct client streaming. + + input: token (Token), + output: audio (Audio) + + :min_clause_chars: minimum buffer length before a soft boundary (,;:) + triggers synthesis. Hard endings (.!?) always trigger immediately. + Raise this value to produce longer, more natural-sounding segments. + """ + + _handle_cls = TTSDeployment + input_type = "token" + output_type = "audio" + + def __init__( + self, + handle: handle.DeploymentHandle, + min_clause_chars: int = 20, + ): + super().__init__(handle) + self.min_clause_chars: int = min_clause_chars + self._buffer: str = "" + + async def process(self, token: Token) -> AsyncGenerator[Audio, None]: # type: ignore[override] + self._buffer += token.text + + # Drain all complete clauses from the buffer before waiting for more tokens + while True: + clause, remainder = self._split(self._buffer) + if not clause: + break + self._buffer = remainder + async for chunk in self.handle.synthesize.remote(clause): + yield chunk + + # Flush the remaining buffer when the LLM stream ends + if token.end and self._buffer.strip(): + async for chunk in self.handle.synthesize.remote(self._buffer.strip()): + yield chunk + self._buffer = "" + if token.end: + sample_rate = await self.handle.get_sample_rate.remote() + yield Audio(data=np.array([], dtype=np.float32), sample_rate=sample_rate, end=True) + + def _split(self, text: str) -> tuple[str, str]: + """Return (clause_to_synthesize, remaining_buffer). + + Splits on the first hard sentence ending (.!?) unconditionally, or on + the first soft clause ending (,;:) once the buffer is long enough. + Returns ("", text) when no boundary is found. 
+ """ + m = _HARD_END_RE.search(text) + if m: + return text[: m.end()].strip(), text[m.end() :] + + if len(text) >= self.min_clause_chars: + m = _SOFT_END_RE.search(text) + if m: + return text[: m.end()].strip(), text[m.end() :] + + return "", text From 2847f72163e1f5e4b8e62e4ca2c69956f3d28a72 Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Wed, 27 May 2026 01:44:25 +0200 Subject: [PATCH 03/31] feat(helm): local cluster featuring AMD and NVIDIA --- deploy/Dockerfile.amd | 73 +++++ deploy/Dockerfile.base | 16 + deploy/Dockerfile.nvidia | 28 ++ deploy/examples/local_nvidia_amd/Chart.lock | 6 + deploy/examples/local_nvidia_amd/Chart.yaml | 20 ++ .../templates/cosytts-model-init-job.yaml | 86 ++++++ .../templates/emage-model-init-job.yaml | 86 ++++++ .../templates/ingress-dashboard.yaml | 32 ++ .../local_nvidia_amd/templates/ingress.yaml | 32 ++ .../templates/rayservice.yaml | 169 +++++++++++ .../templates/voice-assets-pvc.yaml | 28 ++ deploy/examples/local_nvidia_amd/values.yaml | 281 ++++++++++++++++++ requirements-nvidia.txt | 37 +++ 13 files changed, 894 insertions(+) create mode 100644 deploy/Dockerfile.amd create mode 100644 deploy/Dockerfile.base create mode 100644 deploy/Dockerfile.nvidia create mode 100644 deploy/examples/local_nvidia_amd/Chart.lock create mode 100644 deploy/examples/local_nvidia_amd/Chart.yaml create mode 100644 deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml create mode 100644 deploy/examples/local_nvidia_amd/templates/emage-model-init-job.yaml create mode 100644 deploy/examples/local_nvidia_amd/templates/ingress-dashboard.yaml create mode 100644 deploy/examples/local_nvidia_amd/templates/ingress.yaml create mode 100644 deploy/examples/local_nvidia_amd/templates/rayservice.yaml create mode 100644 deploy/examples/local_nvidia_amd/templates/voice-assets-pvc.yaml create mode 100644 deploy/examples/local_nvidia_amd/values.yaml create mode 100644 requirements-nvidia.txt diff --git a/deploy/Dockerfile.amd 
b/deploy/Dockerfile.amd new file mode 100644 index 0000000..24c7b45 --- /dev/null +++ b/deploy/Dockerfile.amd @@ -0,0 +1,73 @@ +FROM rayproject/ray:2.55.1-py312 +WORKDIR /app +USER root +RUN apt-get update && apt-get install -y \ + build-essential \ + curl \ + gnupg2 \ + && rm -rf /var/lib/apt/lists/* + +# Add ROCm 7.2.3 repository (Ubuntu 22.04 Jammy) +RUN curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor -o /etc/apt/keyrings/rocm.gpg \ + && printf 'deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/7.2.3 jammy main\ndeb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/7.2 jammy main\n' \ + > /etc/apt/sources.list.d/rocm.list \ + && printf 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600\n' \ + > /etc/apt/preferences.d/rocm-pin-600 + +# Install ROCm runtime + libraries +RUN apt-get update && apt-get install -y \ + rocm-hip-runtime \ + rocm-hip-libraries \ + rocm-device-libs \ + rocm-smi-lib \ + rocblas \ + hipblas \ + miopen-hip \ + rccl \ + rocsolver \ + rocfft \ + rocrand \ + hipsparse \ + && rm -rf /var/lib/apt/lists/* + +# Set ROCm environment +ENV ROCM_PATH=/opt/rocm +ENV PATH="${ROCM_PATH}/bin:${PATH}" +ENV LD_LIBRARY_PATH="${ROCM_PATH}/lib:${LD_LIBRARY_PATH}" + +USER ray + + +COPY serve_requirements.txt /app +RUN pip install --no-cache-dir -r serve_requirements.txt + +# 1. 
AMD's PyTorch built for ROCm (NOT the PyPI one — it's built for ROCm 6.2 and will silently break) +ARG ROCM_VERSION=7.2 +ARG PYTHON_VERSION=cp312 +ARG TRITON_VERSION=3.4.0+rocm7.2.0.git0cace8d2 + +RUN pip install --no-cache-dir \ + "https://repo.radeon.com/rocm/manylinux/rocm-rel-${ROCM_VERSION}/triton-${TRITON_VERSION}-${PYTHON_VERSION}-${PYTHON_VERSION}-linux_x86_64.whl" + +RUN pip install --no-cache-dir \ + --extra-index-url https://repo.radeon.com/rocm/pypi/ \ + "https://repo.radeon.com/rocm/manylinux/rocm-rel-${ROCM_VERSION}/torch-2.8.0+rocm${ROCM_VERSION}.0.lw.gitbf943426-${PYTHON_VERSION}-${PYTHON_VERSION}-linux_x86_64.whl" \ + "https://repo.radeon.com/rocm/manylinux/rocm-rel-${ROCM_VERSION}/torchaudio-2.8.0+rocm${ROCM_VERSION}.0.git6e1c7fe9-${PYTHON_VERSION}-${PYTHON_VERSION}-linux_x86_64.whl" + +RUN pip install --no-cache-dir filelock sympy networkx jinja2 fsspec numpy + +USER root + +# 3. Official CTranslate2 ROCm wheel (it's inside a zip on the releases page) +RUN apt-get update && apt-get install -y unzip curl \ + && curl -L https://github.com/OpenNMT/CTranslate2/releases/download/v4.7.1/rocm-python-wheels-Linux.zip \ + -o /tmp/ct2-rocm.zip \ + && unzip -j /tmp/ct2-rocm.zip 'temp-linux/ctranslate2-4.7.1-cp312-*manylinux*x86_64.whl' -d /tmp/ct2 \ + && pip install --no-cache-dir /tmp/ct2/ctranslate2-4.7.1-cp312-*.whl \ + && rm -rf /tmp/ct2 /tmp/ct2-rocm.zip + +USER ray + +# 4. 
faster-whisper +RUN pip install --no-cache-dir faster-whisper +COPY src /app/src diff --git a/deploy/Dockerfile.base b/deploy/Dockerfile.base new file mode 100644 index 0000000..f46ac35 --- /dev/null +++ b/deploy/Dockerfile.base @@ -0,0 +1,16 @@ +FROM rayproject/ray:2.55.1-py312 + +WORKDIR /app + + +USER root +RUN apt-get update && apt-get install -y \ +build-essential \ +&& rm -rf /var/lib/apt/lists/* + +USER ray + +COPY serve_requirements.txt /app +RUN pip install --no-cache-dir -r serve_requirements.txt + +COPY src /app/src diff --git a/deploy/Dockerfile.nvidia b/deploy/Dockerfile.nvidia new file mode 100644 index 0000000..f9e704f --- /dev/null +++ b/deploy/Dockerfile.nvidia @@ -0,0 +1,28 @@ +FROM rayproject/ray:2.55.1-py312-gpu + +WORKDIR /app + +# Full CUDA 12.1 dependency stack (CosyVoice2, faster-whisper, TensorRT, …). +# PyTorch cu121 wheels live on the PyTorch index; TensorRT wheels on the NGC index. +COPY requirements-nvidia.txt /app +RUN pip install --no-cache-dir \ + --extra-index-url https://download.pytorch.org/whl/cu121 \ + --extra-index-url https://pypi.ngc.nvidia.com \ + -r requirements-nvidia.txt + +USER root + +RUN apt-get update && apt-get install -y --no-install-recommends git \ + && rm -rf /var/lib/apt/lists/* + +USER ray + +# CosyVoice2 has no setup.py/pyproject.toml so it cannot be pip-installed. +# Clone at a pinned commit for supply-chain integrity and expose it via PYTHONPATH. 
+RUN git clone https://github.com/FunAudioLLM/CosyVoice.git /app/cosyvoice \ + && git -C /app/cosyvoice checkout 074ca6dc9e80a2f424f1f74b48bdd7d3fea531cc \ + && git -C /app/cosyvoice submodule update --init --recursive + +ENV PYTHONPATH="/app/cosyvoice:${PYTHONPATH:-}" + +COPY src /app/src diff --git a/deploy/examples/local_nvidia_amd/Chart.lock b/deploy/examples/local_nvidia_amd/Chart.lock new file mode 100644 index 0000000..b0e0b95 --- /dev/null +++ b/deploy/examples/local_nvidia_amd/Chart.lock @@ -0,0 +1,6 @@ +dependencies: +- name: kuberay-operator + repository: https://ray-project.github.io/kuberay-helm/ + version: 1.6.0 +digest: sha256:b9057481d9a5e2d8b8798488b0b321bbd3f6e43dcb5a9dea18b181641a63b400 +generated: "2026-05-22T17:28:37.5934885+02:00" diff --git a/deploy/examples/local_nvidia_amd/Chart.yaml b/deploy/examples/local_nvidia_amd/Chart.yaml new file mode 100644 index 0000000..c51091a --- /dev/null +++ b/deploy/examples/local_nvidia_amd/Chart.yaml @@ -0,0 +1,20 @@ +apiVersion: v2 +name: huri +description: HuRI service powered by Ray Serve on KubeRay +type: application +version: 0.1.0 +appVersion: "2.52.0" +keywords: + - ray + - kuberay + - ray-serve + - robotics + - hri +maintainers: + - name: Sentience Robotics + +dependencies: + - name: kuberay-operator + version: "1.6.0" + repository: "https://ray-project.github.io/kuberay-helm/" + condition: kuberay.install diff --git a/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml b/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml new file mode 100644 index 0000000..0d84298 --- /dev/null +++ b/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml @@ -0,0 +1,86 @@ +{{- if .Values.models.cosytts.enabled }} +{{- $model := .Values.models.cosytts }} +{{- $pvcName := printf "%s-cosytts-models" (include "huri.fullname" .) 
}} +{{- if not (lookup "v1" "PersistentVolumeClaim" .Release.Namespace $pvcName) }} +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ $pvcName }} + labels: + {{- include "huri.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-10" + "helm.sh/resource-policy": keep +spec: + accessModes: + {{- toYaml $model.pvc.accessModes | nindent 4 }} + resources: + requests: + storage: {{ $model.pvc.size }} + {{- if $model.pvc.storageClassName }} + storageClassName: {{ $model.pvc.storageClassName }} + {{- end }} +{{- end }} +--- +# Runs on every install and upgrade (see the hook annotation below); the script exits early when the model is already on the PVC. +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "huri.fullname" . }}-cosytts-init + labels: + {{- include "huri.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-5" + "helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation +spec: + backoffLimit: 3 + template: + metadata: + labels: + {{- include "huri.selectorLabels" . | nindent 8 }} + spec: + restartPolicy: OnFailure + {{- with $model.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + volumes: + - name: models + persistentVolumeClaim: + claimName: {{ include "huri.fullname" . }}-cosytts-models + containers: + - name: cosytts-downloader + image: python:3.11-slim + command: ["/bin/sh", "-c"] + args: + - | + set -e + MODEL_DIR="{{ $model.mountPath }}/{{ $model.modelSource.modelId }}" + if [ -f "$MODEL_DIR/cosyvoice2.yaml" ]; then + echo "Model already present at $MODEL_DIR — skipping download." + exit 0 + fi + echo "Downloading {{ $model.modelSource.modelId }} into $MODEL_DIR …" + pip install --quiet modelscope + python - <<'PYEOF' + from modelscope import snapshot_download + snapshot_download( + "{{ $model.modelSource.modelId }}", + local_dir="{{ $model.mountPath }}/{{ $model.modelSource.modelId }}", + ) + PYEOF + echo "Download complete."
+ volumeMounts: + - name: models + mountPath: {{ $model.mountPath }} + resources: + requests: + cpu: "500m" + memory: "512Mi" + limits: + cpu: "2" + memory: "2Gi" +{{- end }} diff --git a/deploy/examples/local_nvidia_amd/templates/emage-model-init-job.yaml b/deploy/examples/local_nvidia_amd/templates/emage-model-init-job.yaml new file mode 100644 index 0000000..c4c4a08 --- /dev/null +++ b/deploy/examples/local_nvidia_amd/templates/emage-model-init-job.yaml @@ -0,0 +1,86 @@ +{{- if .Values.models.emage.enabled }} +{{- $model := .Values.models.emage }} +{{- $pvcName := printf "%s-emage-models" (include "huri.fullname" .) }} +{{- if not (lookup "v1" "PersistentVolumeClaim" .Release.Namespace $pvcName) }} +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ $pvcName }} + labels: + {{- include "huri.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-10" + "helm.sh/resource-policy": keep +spec: + accessModes: + {{- toYaml $model.pvc.accessModes | nindent 4 }} + resources: + requests: + storage: {{ $model.pvc.size }} + {{- if $model.pvc.storageClassName }} + storageClassName: {{ $model.pvc.storageClassName }} + {{- end }} +{{- end }} +--- +# Runs on every install and upgrade (see the hook annotation below); the download is skipped when the model is already on the PVC. +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "huri.fullname" . }}-emage-init + labels: + {{- include "huri.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-5" + "helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation +spec: + backoffLimit: 3 + template: + metadata: + labels: + {{- include "huri.selectorLabels" . | nindent 8 }} + spec: + restartPolicy: OnFailure + {{- with $model.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + volumes: + - name: models + persistentVolumeClaim: + claimName: {{ include "huri.fullname" .
}}-emage-models + containers: + - name: emage-downloader + image: python:3.11-slim + command: ["/bin/sh", "-c"] + args: + - | + set -e + MODEL_DIR="{{ $model.mountPath }}/{{ $model.modelSource.repoId }}" + if [ -d "$MODEL_DIR" ] && [ "$(ls -A $MODEL_DIR 2>/dev/null)" ]; then + echo "Model already present at $MODEL_DIR — skipping download." + exit 0 + fi + echo "Downloading {{ $model.modelSource.repoId }} into $MODEL_DIR …" + pip install --quiet huggingface_hub + python - <<'PYEOF' + from huggingface_hub import snapshot_download + snapshot_download( + "{{ $model.modelSource.repoId }}", + local_dir="{{ $model.mountPath }}/{{ $model.modelSource.repoId }}", + ) + PYEOF + echo "Download complete." + volumeMounts: + - name: models + mountPath: {{ $model.mountPath }} + resources: + requests: + cpu: "500m" + memory: "512Mi" + limits: + cpu: "2" + memory: "2Gi" +{{- end }} diff --git a/deploy/examples/local_nvidia_amd/templates/ingress-dashboard.yaml b/deploy/examples/local_nvidia_amd/templates/ingress-dashboard.yaml new file mode 100644 index 0000000..c8153df --- /dev/null +++ b/deploy/examples/local_nvidia_amd/templates/ingress-dashboard.yaml @@ -0,0 +1,32 @@ +{{- if .Values.dashboard.ingress.enabled }} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ include "huri.fullname" . }}-dashboard + labels: + {{- include "huri.labels" . | nindent 4 }} + {{- with .Values.dashboard.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.dashboard.ingress.className }} + ingressClassName: {{ .Values.dashboard.ingress.className }} + {{- end }} + {{- with .Values.dashboard.ingress.tls }} + tls: + {{- toYaml . | nindent 4 }} + {{- end }} + rules: + - host: {{ .Values.dashboard.ingress.host | quote }} + http: + paths: + - path: / + pathType: Prefix + backend: + service: + # KubeRay creates -head-svc for the head node. + name: {{ include "huri.headSvcName" . 
}} + port: + number: 8265 +{{- end }} diff --git a/deploy/examples/local_nvidia_amd/templates/ingress.yaml b/deploy/examples/local_nvidia_amd/templates/ingress.yaml new file mode 100644 index 0000000..85f94de --- /dev/null +++ b/deploy/examples/local_nvidia_amd/templates/ingress.yaml @@ -0,0 +1,32 @@ +{{- if .Values.ingress.enabled }} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ include "huri.fullname" . }} + labels: + {{- include "huri.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.ingress.className }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- with .Values.ingress.tls }} + tls: + {{- toYaml . | nindent 4 }} + {{- end }} + rules: + - host: {{ .Values.ingress.host | quote }} + http: + paths: + - path: / + pathType: Prefix + backend: + service: + # KubeRay creates -serve-svc for the Serve endpoint. + name: {{ include "huri.serveSvcName" . }} + port: + number: 8000 +{{- end }} diff --git a/deploy/examples/local_nvidia_amd/templates/rayservice.yaml b/deploy/examples/local_nvidia_amd/templates/rayservice.yaml new file mode 100644 index 0000000..31d7f00 --- /dev/null +++ b/deploy/examples/local_nvidia_amd/templates/rayservice.yaml @@ -0,0 +1,169 @@ +apiVersion: ray.io/v1 +kind: RayService +metadata: + name: {{ include "huri.fullname" . }} + labels: + {{- include "huri.labels" . | nindent 4 }} + annotations: + ray.io/initializing-timeout: "10m" +spec: + serveConfigV2: | +{{ .Values.ray.serveConfig | indent 4 }} + rayClusterConfig: + rayVersion: {{ .Values.ray.version | quote }} + + headGroupSpec: + serviceType: {{ .Values.head.serviceType }} + rayStartParams: + {{- toYaml .Values.head.rayStartParams | nindent 8 }} + + template: + metadata: + labels: + {{- include "huri.selectorLabels" . | nindent 12 }} + spec: + {{- with .Values.head.nodeSelector }} + nodeSelector: + {{- toYaml . 
| nindent 12 }} + {{- end }} + {{- with .Values.head.affinity }} + affinity: + {{- toYaml . | nindent 12 }} + {{- end }} + {{- with .Values.head.tolerations }} + tolerations: + {{- toYaml . | nindent 12 }} + {{- end }} + containers: + - name: ray-head + image: {{ .Values.image.repository }}:{{ .Values.image.tag }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - containerPort: 6379 + name: gcs-server + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + - containerPort: 8000 + name: serve + resources: + {{- toYaml .Values.head.resources | nindent 16 }} + + workerGroupSpecs: + {{- range .Values.workerGroups }} + {{- $group := . }} + - replicas: {{ .replicas }} + minReplicas: {{ .minReplicas }} + maxReplicas: {{ .maxReplicas }} + groupName: {{ .groupName | quote }} + rayStartParams: + {{- toYaml .rayStartParams | nindent 10 }} + {{- if .customResources }} + resources: {{ .customResources | squote }} + {{- end }} + + template: + metadata: + labels: + {{- include "huri.selectorLabels" $ | nindent 14 }} + spec: + {{- if .hostIPC }} + hostIPC: {{ .hostIPC }} + {{- end }} + {{- if .runtimeClassName }} + # Tells containerd to invoke the nvidia/amd runtime, which mounts + # the GPU devices into the pod. Required on WSL2 + k3s. + runtimeClassName: {{ .runtimeClassName }} + {{- end }} + {{- if .podSecurityContext }} + # Pod-level security: supplementalGroups, fsGroup, etc. + securityContext: + {{- toYaml .podSecurityContext | nindent 14 }} + {{- end }} + {{- with .nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 14 }} + {{- end }} + {{- with .affinity }} + affinity: + {{- toYaml . | nindent 14 }} + {{- end }} + {{- with .tolerations }} + tolerations: + {{- toYaml . | nindent 14 }} + {{- end }} + volumes: + - name: dshm + emptyDir: + medium: Memory + sizeLimit: {{ .shmSize | default "1Gi" }} + {{- range .mountedModels }} + {{- $model := index $.Values.models . }} + {{- if $model.enabled }} + - name: model-{{ . 
}} + persistentVolumeClaim: + claimName: {{ include "huri.fullname" $ }}-{{ . }}-models + {{- end }} + {{- end }} + {{- if and $.Values.voiceAssets.enabled .mountVoiceAssets }} + - name: voice-assets + persistentVolumeClaim: + claimName: {{ include "huri.fullname" $ }}-voice-assets + {{- end }} + containers: + - name: ray-worker + {{- if .image }} + image: {{ .image }} + {{- else }} + image: {{ $.Values.image.repository }}:{{ $.Values.image.tag }} + {{- end }} + imagePullPolicy: {{ $.Values.image.pullPolicy }} + env: + {{- $hasEnv := false }} + {{- if .containerEnv }} + {{- $hasEnv = true }} + {{- toYaml .containerEnv | nindent 18 }} + {{- end }} + {{- range .mountedModels }} + {{- $model := index $.Values.models . }} + {{- if and $model.enabled $model.env }} + {{- $hasEnv = true }} + {{- range $envKey, $envVal := $model.env }} + - name: {{ $envKey }} + value: {{ $envVal | quote }} + {{- end }} + {{- end }} + {{- end }} + {{- if and $.Values.voiceAssets.enabled $group.mountVoiceAssets }} + {{- range $envKey, $envVal := $.Values.voiceAssets.env }} + {{- $hasEnv = true }} + - name: {{ $envKey }} + value: {{ $envVal | quote }} + {{- end }} + {{- end }} + {{- if not $hasEnv }} + [] + {{- end }} + {{- if .securityContext }} + # Container-level security: seLinuxOptions, capabilities, etc. + securityContext: + {{- toYaml .securityContext | nindent 18 }} + {{- end }} + resources: + {{- toYaml .resources | nindent 18 }} + volumeMounts: + - name: dshm + mountPath: /dev/shm + {{- range .mountedModels }} + {{- $model := index $.Values.models . }} + {{- if $model.enabled }} + - name: model-{{ . 
}} + mountPath: {{ $model.mountPath }} + {{- end }} + {{- end }} + {{- if and $.Values.voiceAssets.enabled .mountVoiceAssets }} + - name: voice-assets + mountPath: {{ $.Values.voiceAssets.mountPath }} + {{- end }} + {{- end }} diff --git a/deploy/examples/local_nvidia_amd/templates/voice-assets-pvc.yaml b/deploy/examples/local_nvidia_amd/templates/voice-assets-pvc.yaml new file mode 100644 index 0000000..6456e8e --- /dev/null +++ b/deploy/examples/local_nvidia_amd/templates/voice-assets-pvc.yaml @@ -0,0 +1,28 @@ +{{- if .Values.voiceAssets.enabled }} +{{- $pvcName := printf "%s-voice-assets" (include "huri.fullname" .) }} +{{- if not (lookup "v1" "PersistentVolumeClaim" .Release.Namespace $pvcName) }} +--- +# PVC for the voice sample used by the TTS module (HURI_VOICE_SAMPLE_PATH). +# Populate after first install with: +# kubectl cp voice.wav POD_NAME:{{ .Values.voiceAssets.mountPath }}/voice.wav +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ $pvcName }} + labels: + {{- include "huri.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-10" + "helm.sh/resource-policy": keep +spec: + accessModes: + {{- toYaml .Values.voiceAssets.pvc.accessModes | nindent 4 }} + resources: + requests: + storage: {{ .Values.voiceAssets.pvc.size }} + {{- if .Values.voiceAssets.pvc.storageClassName }} + storageClassName: {{ .Values.voiceAssets.pvc.storageClassName }} + {{- end }} +{{- end }} +{{- end }} diff --git a/deploy/examples/local_nvidia_amd/values.yaml b/deploy/examples/local_nvidia_amd/values.yaml new file mode 100644 index 0000000..3e90893 --- /dev/null +++ b/deploy/examples/local_nvidia_amd/values.yaml @@ -0,0 +1,281 @@ +nameOverride: "" +fullnameOverride: "" + +image: + repository: docker.pommier.dev/huri + tag: base-2.55.1 + pullPolicy: Always + +ray: + version: "2.55.1" + # Inline Ray Serve config (equivalent to config/huri.yaml). + # Change import_path or add deployments here without rebuilding the image.
+ # + # To pin a deployment to a specific GPU vendor, set resources in + # ray_actor_options matching the resources string in the desired workerGroup's + # rayStartParams: + # + # ray_actor_options: + # num_gpus: 1 + # resources: {"GPU_TYPE_NVIDIA": 1} # → runs only on gpu-nvidia workers + # resources: {"GPU_TYPE_AMD": 1} # → runs only on gpu-amd workers + serveConfig: | + proxy_location: EveryNode + http_options: + host: 0.0.0.0 + port: 8000 + applications: + - name: huri-app + route_prefix: / + import_path: src.app:app + runtime_env: + env_vars: + RAY_COLOR_PREFIX: "1" + deployments: + - name: HuRI + ray_actor_options: + num_cpus: 1 + num_gpus: 0 + - name: TTS + ray_actor_options: + num_cpus: 1 + num_gpus: 0.5 + resources: {"GPU_TYPE_NVIDIA": 0.5} + - name: GestureGeneration + ray_actor_options: + num_cpus: 1 + num_gpus: 0.5 + resources: {"GPU_TYPE_NVIDIA": 0.5} + +head: + # ClusterIP is preferred on real clusters; use NodePort for kind/minikube/k3s. + serviceType: ClusterIP + # Pin the head to the control-plane node so it does not consume GPU memory. + # Set to {} to let the scheduler decide. + nodeSelector: + node-role.kubernetes.io/control-plane: "true" + # affinity: {} # optional Kubernetes affinity rules (nodeAffinity, podAffinity…) + tolerations: + # Required to schedule on the control-plane node (tainted by default in k3s). + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + rayStartParams: + num-cpus: "2" + num-gpus: "0" + resources: + limits: + cpu: "2" + memory: "8Gi" + requests: + cpu: "2" + memory: "4Gi" + +# Worker groups: one logical "gpu" group per vendor and one "cpu" group. +# +# Node placement (Kubernetes level) +# ────────────────────────────────── +# nodeSelector / affinity / tolerations pin the *pod* to specific physical nodes. +# +# Ray deployment routing (Ray level) +# ──────────────────────────────────── +# rayStartParams.resources advertises named custom resources to the Ray +# scheduler. 
Ray Serve deployments request them via ray_actor_options.resources +# to run exclusively on a given worker group regardless of GPU vendor. +# +# Example — to route a deployment to NVIDIA workers only: +# workerGroups[gpu-nvidia].customResources: '{"GPU_TYPE_NVIDIA":1}' +# serveConfig deployment ray_actor_options.resources: {"GPU_TYPE_NVIDIA": 1} +# +# Model volumes +# ────────────── +# mountedModels lists keys from .Values.models. For each enabled model the +# template adds a PVC volume + volumeMount + env vars to the worker pod. +workerGroups: + # --- GPU workers (Nvidia) --- + - groupName: gpu-nvidia + image: docker.pommier.dev/huri:nvidia-2.55.1 + replicas: 1 + minReplicas: 1 + maxReplicas: 1 + mountVoiceAssets: false + nodeSelector: + gpu: nvidia + # affinity: {} # optional + tolerations: [] + runtimeClassName: nvidia + customResources: '{\"GPU_TYPE_NVIDIA\":1}' + rayStartParams: + num-gpus: "1" + containerEnv: + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + - name: NVIDIA_DRIVER_CAPABILITIES + value: "compute,utility" + # Models whose PVC will be mounted in this worker group. + # Keys must match entries under .Values.models. 
+ mountedModels: + - cosytts + - emage + resources: + limits: + cpu: "4" + memory: "8Gi" + nvidia.com/gpu: "1" + requests: + cpu: "2" + memory: "4Gi" + nvidia.com/gpu: "1" + shmSize: 2Gi + + # --- GPU workers (AMD) --- + - groupName: gpu-amd + image: docker.pommier.dev/huri:amd-2.55.1 + replicas: 1 + minReplicas: 1 + maxReplicas: 1 + mountVoiceAssets: true + nodeSelector: + gpu: amd + # affinity: {} # optional + tolerations: [] + hostIPC: true + customResources: '{\"GPU_TYPE_AMD\":1}' + podSecurityContext: + supplementalGroups: [39, 107] + rayStartParams: + num-gpus: "1" + containerEnv: + - name: HSA_OVERRIDE_GFX_VERSION + value: "11.5.1" + securityContext: + seLinuxOptions: + type: "spc_t" + # privileged: true # Uncomment if spc_t still gets blocked by Fedora + mountedModels: [] + resources: + limits: + cpu: "4" + memory: "8Gi" + amd.com/gpu: "1" + requests: + cpu: "2" + memory: "4Gi" + amd.com/gpu: "1" + + # --- CPU-only workers --- + # Handles tasks that do not need a GPU (pre/post-processing, routing, etc.). + # Set replicas: 0 to disable this group entirely. + - groupName: cpu-workers + replicas: 0 + minReplicas: 0 + maxReplicas: 0 + nodeSelector: {} + tolerations: [] + rayStartParams: + num-cpus: "2" + mountedModels: [] + resources: + limits: + cpu: "2" + memory: "4Gi" + requests: + cpu: "1" + memory: "2Gi" + shmSize: 256Mi + +# AI model definitions. +# Each key corresponds to a model that can be mounted into worker groups via +# mountedModels. The chart creates one PVC per enabled model and a pre-install +# Job that downloads the weights on first deploy (idempotent). +models: + cosytts: + enabled: true + nodeSelector: + gpu: nvidia + pvc: + # Leave storageClassName empty to use the cluster default StorageClass. + storageClassName: "" + size: 20Gi + # Use ReadWriteMany if the PVC must be shared across multiple worker pods + # (requires a CSI driver that supports RWX, e.g. NFS or Longhorn). + # Use ReadWriteOnce for single-node clusters. 
+ accessModes: + - ReadWriteOnce + # Where the model weights PVC is mounted inside the worker container. + mountPath: /models/cosytts + modelSource: + # type: modelscope | huggingface (only modelscope is implemented) + type: modelscope + # ModelScope model ID — snapshot_download uses this as the sub-path + # inside mountPath, so the final path is mountPath/modelId. + modelId: iic/CosyVoice2-0.5B + # Env vars injected into every worker that mounts this model. + # HURI_MODEL_PATH must match mountPath/modelId (see text_to_speech.py). + env: + HURI_MODEL_PATH: /models/cosytts/iic/CosyVoice2-0.5B + + emage: + enabled: true + nodeSelector: + gpu: nvidia + pvc: + storageClassName: "" + size: 10Gi + accessModes: + - ReadWriteOnce + # Where the EMAGE weights PVC is mounted inside the worker container. + mountPath: /models/emage + modelSource: + # type: huggingface — snapshot_download uses repoId as the sub-path + # inside mountPath, so the final path is mountPath/repoId. + type: huggingface + repoId: H-Liu1997/emage_audio + # HURI_EMAGE_REPO must match mountPath/repoId (see gesture.py). + env: + HURI_EMAGE_REPO: /models/emage/H-Liu1997/emage_audio + +# Voice sample asset volume. +# Populate the PVC after first install: +# kubectl cp voice.wav POD_NAME:/assets/voice.wav +voiceAssets: + enabled: true + pvc: + storageClassName: "" + size: 100Mi + accessModes: + - ReadWriteOnce + mountPath: /assets + env: + HURI_VOICE_SAMPLE_PATH: /assets/voice.wav + HURI_VOICE_TRANSCRIPT: "Hello, this is my voice sample for cloning." + +# Ingress for the Ray Serve endpoint (port 8000). +ingress: + enabled: false + className: nginx + annotations: {} + # nginx.ingress.kubernetes.io/proxy-read-timeout: "3600" + # nginx.ingress.kubernetes.io/proxy-send-timeout: "3600" + # nginx.ingress.kubernetes.io/proxy-buffering: "off" + host: huri.example.com + tls: [] + # - secretName: huri-tls + # hosts: + # - huri.example.com + +# Ingress for the Ray Dashboard (port 8265).
Disabled by default – not safe to +# expose publicly without additional auth. +dashboard: + ingress: + enabled: false + className: nginx + annotations: {} + host: huri-dashboard.example.com + tls: [] + +# Set to true to let this chart manage the KubeRay operator as a sub-chart. +# Set to false when the operator is already installed cluster-wide (typical for +# shared clusters). +kuberay: + install: false diff --git a/requirements-nvidia.txt b/requirements-nvidia.txt new file mode 100644 index 0000000..a1a78db --- /dev/null +++ b/requirements-nvidia.txt @@ -0,0 +1,37 @@ +# --- Shared / pinned by CosyVoice (keep these versions) --- +torch==2.3.1 +torchaudio==2.3.1 +numpy==1.26.4 +transformers==4.51.3 +diffusers==0.29.0 +omegaconf==2.3.0 +librosa==0.10.2 +soundfile==0.12.1 +hydra-core==1.3.2 # only because HyperPyYAML configs may resolve hydra refs; safe to keep +HyperPyYAML==1.2.3 +modelscope==1.20.0 +onnx==1.16.0 +onnxruntime-gpu==1.18.0 # Linux; campplus + speech_tokenizer +openai-whisper==20250625 # frontend.py: import whisper +inflect==7.3.1 +wetext==0.0.4 # text normalization fallback (ttsfrd not installed) +conformer==0.3.2 +x-transformers==2.11.24 +einops==0.8.2 +tiktoken==0.13.0 # cosyvoice/tokenizer +pyarrow==18.1.0 # imported by cli paths via dataset utils? 
actually only dataset/processor — can drop +protobuf==4.25 +pydantic==2.7.0 # transitive (transformers/fastapi), but pinning avoids drift +regex==2025.11.3 +tqdm==4.67.3 + +# --- EMAGE extras --- +huggingface_hub==0.36.2 # from_pretrained +smplx==0.1.28 +pyrender==0.1.45 # fast_render top-level import +trimesh==4.12.2 +imageio==2.33.0 +# Visualization-only (skip if --visualization off): +# opencv-python==4.8.1.78 +# pytorch3d # has to be built from source for torch 2.3 / py3.12 +# torchvision From 5e9eb3b47622a1d50cbf9f1d26bdd97dafe409f8 Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Wed, 27 May 2026 01:52:57 +0200 Subject: [PATCH 04/31] Merge branches --- .gitignore | 3 +++ serve_requirements.txt | 5 +++++ src/modules/modules.py | 23 +++++++++++++++++++- src/modules/speech_to_text/speech_to_text.py | 8 ++++++- 4 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 serve_requirements.txt diff --git a/.gitignore b/.gitignore index 7892d64..990d633 100644 --- a/.gitignore +++ b/.gitignore @@ -176,6 +176,9 @@ cython_debug/ # PyPI configuration file .pypirc +# Helm +**/charts/*.tgz + # Others .trash docs \ No newline at end of file diff --git a/serve_requirements.txt b/serve_requirements.txt new file mode 100644 index 0000000..242576e --- /dev/null +++ b/serve_requirements.txt @@ -0,0 +1,5 @@ +# server +numpy +click<8.2 +webrtcvad +faster-whisper diff --git a/src/modules/modules.py b/src/modules/modules.py index 8fbc53c..04a020c 100644 --- a/src/modules/modules.py +++ b/src/modules/modules.py @@ -1,3 +1,4 @@ +import logging from typing import Dict, Type from src.modules.rag.rag import RAG @@ -7,6 +8,26 @@ from .factory import Module +_LOG = logging.getLogger(__name__) + def get_modules() -> Dict[str, Type[Module]]: - return {"mic": MIC, "stt": STT, "tag": TAG, "rag": RAG} + modules: Dict[str, Type[Module]] = {"mic": MIC, "stt": STT, "tag": TAG, "rag": RAG} + + # The following imports may contain modules with custom dependencies, depending on the 
Dockerfile + # (the CPU image does not need these dependencies, and the AMD image is not compatible with them) + try: + from src.modules.text_to_speech.text_to_speech import TTS + except Exception as exc: # noqa: BLE001 + _LOG.info("Skipping TTS module: %s", exc) + else: + modules["tts"] = TTS + + try: + from src.modules.gesture.gesture import Gesture + except Exception as exc: # noqa: BLE001 + _LOG.info("Skipping Gesture module: %s", exc) + else: + modules["gesture"] = Gesture + + return modules diff --git a/src/modules/speech_to_text/speech_to_text.py b/src/modules/speech_to_text/speech_to_text.py index 1300dd3..04afa00 100644 --- a/src/modules/speech_to_text/speech_to_text.py +++ b/src/modules/speech_to_text/speech_to_text.py @@ -34,6 +34,8 @@ def __init__( self, model: str = "base", language: str = "en", + device: str = "auto", + compute_type: str = "auto", sample_rate: int = 16000, block_duration: float = 0.020, # s transcribe_window: float = 2.0, # s @@ -41,7 +43,11 @@ def __init__( ): super().__init__() - self.model_faster = WhisperModel(model) + self.model_faster = WhisperModel( + model, + device=device, + compute_type=compute_type, + ) self.language = language self.sample_rate = sample_rate From a237d26a26f20cb2084a6ff1ec2939495902a075 Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Wed, 27 May 2026 06:31:53 +0200 Subject: [PATCH 05/31] fixed(tts): module deployment and init --- deploy/Dockerfile.nvidia | 3 + .../templates/cosytts-model-init-job.yaml | 22 ++++++ requirements-nvidia.txt | 7 ++ src/modules/gesture/gesture.py | 8 +- src/modules/speech_to_text/speech_to_text.py | 77 +++++++++++-------- src/modules/text_to_speech/events.py | 18 +++++ src/modules/text_to_speech/text_to_speech.py | 37 ++++----- 7 files changed, 117 insertions(+), 55 deletions(-) create mode 100644 src/modules/text_to_speech/events.py diff --git a/deploy/Dockerfile.nvidia b/deploy/Dockerfile.nvidia index f9e704f..1cb0c8d 100644 --- a/deploy/Dockerfile.nvidia +++ b/deploy/Dockerfile.nvidia @@ -23,6
+23,9 @@ RUN git clone https://github.com/FunAudioLLM/CosyVoice.git /app/cosyvoice \ && git -C /app/cosyvoice checkout 074ca6dc9e80a2f424f1f74b48bdd7d3fea531cc \ && git -C /app/cosyvoice submodule update --init --recursive + +RUN pip install --no-cache-dir lightning==2.2.4 gdown==5.1.0 matplotlib==3.7.5 wget==3.2 pyworld==0.3.4 + ENV PYTHONPATH="/app/cosyvoice:${PYTHONPATH:-}" COPY src /app/src diff --git a/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml b/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml index 0d84298..0f62dd1 100644 --- a/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml +++ b/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml @@ -59,10 +59,26 @@ spec: - | set -e MODEL_DIR="{{ $model.mountPath }}/{{ $model.modelSource.modelId }}" + BLANK_EN_DIR="$MODEL_DIR/CosyVoice-BlankEN" + HAS_MAIN_CONFIG="no" + HAS_QWEN_WEIGHTS="no" if [ -f "$MODEL_DIR/cosyvoice2.yaml" ]; then + HAS_MAIN_CONFIG="yes" + fi + if [ -f "$BLANK_EN_DIR/model.safetensors" ] || \ + [ -f "$BLANK_EN_DIR/pytorch_model.bin" ] || \ + [ -f "$BLANK_EN_DIR/model.safetensors.index.json" ] || \ + [ -f "$BLANK_EN_DIR/pytorch_model.bin.index.json" ]; then + HAS_QWEN_WEIGHTS="yes" + fi + if [ "$HAS_MAIN_CONFIG" = "yes" ] && [ "$HAS_QWEN_WEIGHTS" = "yes" ]; then echo "Model already present at $MODEL_DIR — skipping download." exit 0 fi + if [ "$HAS_QWEN_WEIGHTS" = "no" ] && [ -d "$BLANK_EN_DIR" ]; then + echo "Partial Qwen weights detected; clearing $BLANK_EN_DIR before re-download." + rm -rf "$BLANK_EN_DIR" + fi echo "Downloading {{ $model.modelSource.modelId }} into $MODEL_DIR …" pip install --quiet modelscope python - <<'PYEOF' @@ -71,6 +87,12 @@ spec: "{{ $model.modelSource.modelId }}", local_dir="{{ $model.mountPath }}/{{ $model.modelSource.modelId }}", ) + # CosyVoice2 loads this sub-model at runtime; pre-download it so + # the worker pod does not need outbound internet access. 
+ snapshot_download( + "iic/CosyVoice-BlankEN", + local_dir="{{ $model.mountPath }}/{{ $model.modelSource.modelId }}/CosyVoice-BlankEN", + ) PYEOF echo "Download complete." volumeMounts: diff --git a/requirements-nvidia.txt b/requirements-nvidia.txt index a1a78db..441b738 100644 --- a/requirements-nvidia.txt +++ b/requirements-nvidia.txt @@ -25,6 +25,13 @@ pydantic==2.7.0 # transitive (transformers/fastapi), but pinning avoi regex==2025.11.3 tqdm==4.67.3 +# --- RAG / LLM extras --- +httpx==0.27.2 +qdrant-client==1.12.1 +sentence-transformers==3.2.1 +pypdf==5.1.0 +semantic_chunker==0.2.0 + # --- EMAGE extras --- huggingface_hub==0.36.2 # from_pretrained smplx==0.1.28 diff --git a/src/modules/gesture/gesture.py b/src/modules/gesture/gesture.py index 4e2a380..427806b 100644 --- a/src/modules/gesture/gesture.py +++ b/src/modules/gesture/gesture.py @@ -8,7 +8,7 @@ from ray.serve import handle from src.core.module import Module, ModuleWithHandle -from src.modules.text_to_speech.text_to_speech import Audio +from src.modules.text_to_speech.events import Audio _HF_REPO = os.environ.get("HURI_EMAGE_REPO", "H-Liu1997/emage_audio") @@ -115,9 +115,9 @@ class Gesture(ModuleWithHandle): def __init__( self, - handle: handle.DeploymentHandle, + _handle: handle.DeploymentHandle, ): - super().__init__(handle) + super().__init__(_handle) self._chunks: list[np.ndarray] = [] async def process(self, audio: Audio) -> AsyncGenerator[Motion, None]: # type: ignore[override] @@ -138,5 +138,5 @@ async def process(self, audio: Audio) -> AsyncGenerator[Motion, None]: # type: full_audio = np.concatenate(self._chunks) self._chunks = [] - motion = await self.handle.infer.remote(full_audio) + motion = await self._handle.infer.remote(full_audio) yield motion diff --git a/src/modules/speech_to_text/speech_to_text.py b/src/modules/speech_to_text/speech_to_text.py index 04afa00..fa3b790 100644 --- a/src/modules/speech_to_text/speech_to_text.py +++ b/src/modules/speech_to_text/speech_to_text.py @@ -3,53 
+3,80 @@ import numpy as np from faster_whisper import WhisperModel +from ray import serve +from ray.serve import handle -from src.core.module import Module +from src.core.module import ModuleWithHandle from .events import Transcript, Voice -class STT(Module): +@serve.deployment(name="STT") +class STTDeployment: + """Stateless Whisper inference actor. + + Holds the faster-whisper model in a single Ray actor (pinned to the AMD + worker in deployment configs). Exposes a single transcribe() call so + per-session STT clients can offload the heavy work without owning a GPU. + """ + + def __init__( + self, + model: str = "base", + device: str = "auto", + compute_type: str = "auto", + ): + self.model_faster = WhisperModel( + model, + device=device, + compute_type=compute_type, + ) + self.language = "en" + + async def transcribe(self, audio: np.ndarray) -> str: + loop = asyncio.get_running_loop() + segments, _ = await loop.run_in_executor( + None, + lambda: self.model_faster.transcribe( + audio, + language=self.language, + beam_size=1, + ), + ) + return " ".join(seg.text for seg in segments).strip() + + +class STT(ModuleWithHandle): """STT Module - Transcribe voice using Faster_Whisper. + Per-session client: keeps the rolling window / silence state, offloads each + transcription window to the shared STTDeployment actor. input: voice, output: transcript - :model: size of the model to use (tiny, tiny.en, base, base.en, small, - small.en, distil-small.en, medium, medium.en, distil-medium.en, - large-v1, large-v2, large-v3, large, distil-large-v2, distil-large-v3, - large-v3-turbo, or turbo). - :language: language spoken in the audio. It should be a language code such - as "en" or "fr". :sample_rate: size of received voice audio. Usually 8000, 16000 or 48000. :block_duration: size of received voice audio (in s). + :transcribe_window: rolling window length (s) handed to Whisper. + :transcribe_step: stride (s) between successive windows. 
""" + _handle_cls = STTDeployment input_type = "voice" output_type = "transcript" def __init__( self, - model: str = "base", + _handle: handle.DeploymentHandle, language: str = "en", - device: str = "auto", - compute_type: str = "auto", sample_rate: int = 16000, block_duration: float = 0.020, # s transcribe_window: float = 2.0, # s transcribe_step: float = 1.0, # s ): - super().__init__() + super().__init__(_handle=_handle) - self.model_faster = WhisperModel( - model, - device=device, - compute_type=compute_type, - ) self.language = language - self.sample_rate = sample_rate self.window_size: int = int(transcribe_window / block_duration) self.step_size: int = int(transcribe_step / block_duration) @@ -58,9 +85,6 @@ def __init__( self.silence: bool = True - self.prev_text: str = "" - self.stable_text: str = "" - self.running = False self.lock: asyncio.Lock = asyncio.Lock() @@ -86,16 +110,9 @@ async def process(self, voice: Voice) -> Optional[Transcript]: return None processing_chunks = self.buffer[: self.window_size] - self.pending_silence = False processing_audio = np.concatenate(processing_chunks, axis=0) - segments, _ = self.model_faster.transcribe( - processing_audio, - language=self.language, - beam_size=1, # faster for realtime - ) - - current_text = " ".join([seg.text for seg in segments]).strip() + current_text: str = await self._handle.transcribe.remote(processing_audio) processed_size = self.window_size - self.step_size async with self.lock: diff --git a/src/modules/text_to_speech/events.py b/src/modules/text_to_speech/events.py new file mode 100644 index 0000000..9dd6fd4 --- /dev/null +++ b/src/modules/text_to_speech/events.py @@ -0,0 +1,18 @@ +from dataclasses import dataclass + +import numpy as np + +from src.core.events import EventData + + +@dataclass +class Token(EventData): + text: str + end: bool + + +@dataclass +class Audio(EventData): + data: np.ndarray + sample_rate: int + end: bool = False diff --git a/src/modules/text_to_speech/text_to_speech.py 
b/src/modules/text_to_speech/text_to_speech.py index 07c7af5..8c25eb8 100644 --- a/src/modules/text_to_speech/text_to_speech.py +++ b/src/modules/text_to_speech/text_to_speech.py @@ -1,14 +1,16 @@ import asyncio import os import re -from dataclasses import dataclass -from typing import AsyncGenerator, Optional +import sys +from typing import AsyncGenerator import numpy as np from ray import serve from ray.serve import handle -from src.core.module import Module, ModuleWithHandle +from src.core.module import ModuleWithHandle + +from .events import Audio, Token # Defaults — overridden by env vars in production (see README.md) @@ -26,19 +28,6 @@ _DONE = object() # sentinel for exhausted sync generator -@dataclass -class Token: - text: str - end: bool # True on the last token of an LLM stream - - -@dataclass -class Audio: - data: np.ndarray # float32, values in [-1.0, 1.0] - sample_rate: int - end: bool = False # True on the last chunk of an utterance - - @serve.deployment(name="TTS") class TTSDeployment: def __init__( @@ -47,6 +36,12 @@ def __init__( voice_sample_path: str = _VOICE_SAMPLE_PATH, voice_sample_transcript: str = _VOICE_SAMPLE_TRANSCRIPT, ): + cosy_dir = os.environ.get("HURI_COSY_DIR") + if cosy_dir: + matcha_path = os.path.join(cosy_dir, "third_party", "Matcha-TTS") + if os.path.isdir(matcha_path) and matcha_path not in sys.path: + sys.path.insert(0, matcha_path) + from cosyvoice.cli.cosyvoice import CosyVoice2 from cosyvoice.utils.file_utils import load_wav @@ -109,10 +104,10 @@ class TTS(ModuleWithHandle): def __init__( self, - handle: handle.DeploymentHandle, + _handle: handle.DeploymentHandle, min_clause_chars: int = 20, ): - super().__init__(handle) + super().__init__(_handle) self.min_clause_chars: int = min_clause_chars self._buffer: str = "" @@ -125,16 +120,16 @@ async def process(self, token: Token) -> AsyncGenerator[Audio, None]: # type: i if not clause: break self._buffer = remainder - async for chunk in self.handle.synthesize.remote(clause): + 
async for chunk in self._handle.synthesize.remote(clause): yield chunk # Flush the remaining buffer when the LLM stream ends if token.end and self._buffer.strip(): - async for chunk in self.handle.synthesize.remote(self._buffer.strip()): + async for chunk in self._handle.synthesize.remote(self._buffer.strip()): yield chunk self._buffer = "" if token.end: - sample_rate = await self.handle.get_sample_rate.remote() + sample_rate = await self._handle.get_sample_rate.remote() yield Audio(data=np.array([], dtype=np.float32), sample_rate=sample_rate, end=True) def _split(self, text: str) -> tuple[str, str]: From d104b7bc3d08a552c65c38fafd9ec10f675813bf Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Wed, 27 May 2026 06:34:03 +0200 Subject: [PATCH 06/31] fixed(rag): streaming tokens + removing deprecated docker_services since that's the job of Helm --- config/client_full.yaml | 44 +++++ config/huri.yaml | 35 ---- deploy/Dockerfile.amd | 6 + requirements-amd.txt | 16 ++ src/app.py | 38 +---- src/modules/events.py | 21 ++- src/modules/factory.py | 9 +- src/modules/rag/docker_services.py | 240 --------------------------- src/modules/rag/rag.py | 252 +++++++++++++++-------------- 9 files changed, 213 insertions(+), 448 deletions(-) create mode 100644 config/client_full.yaml delete mode 100644 config/huri.yaml create mode 100644 requirements-amd.txt delete mode 100644 src/modules/rag/docker_services.py diff --git a/config/client_full.yaml b/config/client_full.yaml new file mode 100644 index 0000000..1481430 --- /dev/null +++ b/config/client_full.yaml @@ -0,0 +1,44 @@ +huri_url: ws://localhost:8000/session + +topic_list: [transcript, question, token, motion] + +senders: + audio: + name: audio + args: + sample_rate: 16000 + frame_duration: 0.030 + +modules: + mic: + name: mic + args: + vad_agressiveness: 3 + silence_duration: 1.5 + block_duration: ${senders.audio.args.frame_duration} + logging: INFO + stt: + name: stt + args: + language: en + block_duration: 
${senders.audio.args.frame_duration} + logging: INFO + tag: + name: tag + logging: INFO + rag: + name: rag + args: + language: en + tone: formal + response_format: paragraph + max_length: 1024 + logging: INFO + tts: + name: tts + args: + min_clause_chars: 20 + logging: INFO + gesture: + name: gesture + logging: INFO diff --git a/config/huri.yaml b/config/huri.yaml deleted file mode 100644 index 70d2cc7..0000000 --- a/config/huri.yaml +++ /dev/null @@ -1,35 +0,0 @@ -proxy_location: EveryNode - -http_options: - host: 0.0.0.0 - port: 8000 - -logging_config: - encoding: TEXT - log_level: INFO - logs_dir: null - enable_access_log: true - additional_log_standard_attrs: [] - -services: - qdrant: - port: 6333 - image: "qdrant/qdrant:latest" - storage_volume: "qdrant_data" - ollama: - model: "mistral:7b" - image: "ollama/ollama:rocm" - gpu_devices: true - num_replicas: 1 - -applications: - - name: huri-app - route_prefix: / - import_path: src.app:app - runtime_env: { RAY_COLOR_PREFIX=1 } - deployments: - - name: HuRI - - name: RAGHandle - num_replicas: 2 - - name: OllamaService - - name: QdrantService diff --git a/deploy/Dockerfile.amd b/deploy/Dockerfile.amd index 24c7b45..786094b 100644 --- a/deploy/Dockerfile.amd +++ b/deploy/Dockerfile.amd @@ -70,4 +70,10 @@ USER ray # 4. faster-whisper RUN pip install --no-cache-dir faster-whisper + +# 5. RAG / LLM extras (httpx, qdrant-client, sentence-transformers, …) +# Installed last so the ROCm torch wheel installed above is the resolved one. +COPY requirements-amd.txt /app +RUN pip install --no-cache-dir -r requirements-amd.txt + COPY src /app/src diff --git a/requirements-amd.txt b/requirements-amd.txt new file mode 100644 index 0000000..2fefb8d --- /dev/null +++ b/requirements-amd.txt @@ -0,0 +1,16 @@ +# AMD/ROCm worker extras (on top of serve_requirements.txt installed by Dockerfile.base layer +# and the ROCm torch/torchaudio wheels installed directly in Dockerfile.amd). 
+# +# Hosts: STT (faster-whisper, already in serve_requirements.txt) and RAG/LLM. +# Does NOT include CosyVoice2 or EMAGE — those run on the NVIDIA worker. + +# --- RAG / LLM --- +httpx==0.27.2 +qdrant-client==1.12.1 +sentence-transformers==3.2.1 +pypdf==5.1.0 +semantic_chunker==0.2.0 + +# transformers is pulled in by sentence-transformers; pin to a version compatible +# with the ROCm torch 2.8 wheel. +transformers==4.46.3 diff --git a/src/app.py b/src/app.py index 79f58db..1b314de 100644 --- a/src/app.py +++ b/src/app.py @@ -1,51 +1,15 @@ -from pathlib import Path -from typing import Any - -import yaml from ray.serve import Application from src.core.huri import HuRI from src.modules.events import get_events from src.modules.factory import bind_deployment_handles from src.modules.modules import get_modules -from src.modules.rag.docker_services import OllamaService, QdrantService - - -def load_services_config() -> Any: - config_path = Path(__file__).resolve().parents[1] / "config" / "huri.yaml" - with open(config_path) as f: - config = yaml.safe_load(f) - return config.get("services", {}) - - -def build_qdrant(config: dict) -> Any: - return QdrantService.bind( # type: ignore[attr-defined] - port=config.get("port", 6333), - image=config.get("image", "qdrant/qdrant:latest"), - storage_volume=config.get("storage_volume", "qdrant_data"), - ) - - -def build_ollama(config: dict) -> Any: - return OllamaService.options( # type: ignore[attr-defined] - num_replicas=config.get("num_replicas", 1), - ).bind( - model=config.get("model", "mistral:7b"), - image=config.get("image", "ollama/ollama:latest"), - gpu_devices=config.get("gpu_devices", False), - ) def build_app() -> Application: modules = get_modules() events = get_events() - - services_config = load_services_config() - - qdrant = build_qdrant(services_config.get("qdrant", {})) - ollama = build_ollama(services_config.get("ollama", {})) - - handles = bind_deployment_handles(modules, ollama=ollama, qdrant=qdrant) + handles = 
bind_deployment_handles(modules) app: Application = HuRI.bind(modules, handles, events) # type: ignore[attr-defined] return app diff --git a/src/modules/events.py b/src/modules/events.py index 43f6c71..b731c21 100644 --- a/src/modules/events.py +++ b/src/modules/events.py @@ -1,15 +1,30 @@ from typing import Dict, Type from src.core.events import EventData -from src.modules.rag.events import RAGResult from src.modules.speech_to_text.events import Sentence, Transcript, Voice +from src.modules.text_to_speech.events import Audio, Token def get_events() -> Dict[str, Type[EventData | bytes]]: - return { + events: Dict[str, Type[EventData | bytes]] = { "audio": bytes, "voice": Voice, "transcript": Transcript, "question": Sentence, - "rag_response": RAGResult, + "token": Token, } + + # Motion lives in the gesture module — only available when EMAGE deps installed. + try: + from src.modules.gesture.gesture import Motion + except Exception: + pass + else: + events["motion"] = Motion + + # TTS output "audio" is an Audio dataclass internally; the websocket boundary + # only ever decodes raw bytes for the "audio" topic (mic input), so the + # registry keeps bytes there. Keep Audio importable for type completeness. 
+ _ = Audio + + return events diff --git a/src/modules/factory.py b/src/modules/factory.py index 728aba3..14acd99 100644 --- a/src/modules/factory.py +++ b/src/modules/factory.py @@ -94,7 +94,6 @@ def create_from_config( def bind_deployment_handles( modules: Dict[str, Type[Module]], - **service_handles, ) -> Dict[str, handle.DeploymentHandle]: handles: Dict[str, handle.DeploymentHandle] = {} for name, module_cls in modules.items(): @@ -106,12 +105,6 @@ def bind_deployment_handles( handle_cls = module_cls._handle_cls - if name == "rag" and service_handles: - handles[name] = handle_cls.bind( - ollama_handle=service_handles.get("ollama"), - qdrant_handle=service_handles.get("qdrant"), - ) - else: - handles[name] = handle_cls.bind() + handles[name] = handle_cls.bind() return handles diff --git a/src/modules/rag/docker_services.py b/src/modules/rag/docker_services.py deleted file mode 100644 index cc9f4b5..0000000 --- a/src/modules/rag/docker_services.py +++ /dev/null @@ -1,240 +0,0 @@ -import socket -import subprocess -import time -from typing import Any - -import httpx -from ray import serve - - -def find_free_port() -> Any: - """ - Ask the OS for a random free port. - We need this because if we run multiple Ollama containers, - they can't all use port 11434 — each needs its own. - """ - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - s.bind(("", 0)) - return s.getsockname()[1] - - -def wait_for_service(url: str, timeout: int = 120) -> bool: - """ - Returns True if ready, False if timeout. 
- """ - start = time.time() - while time.time() - start < timeout: - try: - resp = httpx.get(url, timeout=5) - if resp.status_code == 200: - return True - except Exception: - pass - time.sleep(2) - return False - - -def is_container_running(name: str) -> bool: - """Check if a Docker container with this name is already running.""" - result = subprocess.run( - ["docker", "ps", "-q", "-f", f"name=^{name}$"], - capture_output=True, - text=True, - ) - return bool(result.stdout.strip()) - - -def remove_container(name: str): - """Force remove a container by name (ignores errors if it doesn't exist).""" - subprocess.run(["docker", "rm", "-f", name], capture_output=True) - - -@serve.deployment -class OllamaService: - """ - Manages one Ollama Docker container. - - LIFECYCLE: - __init__: starts container -> waits for it -> pulls model - generate: sends a prompt to the container, returns the answer - __del__: stops and removes the container - """ - - def __init__( - self, - model: str = "mistral:7b", - image: str = "ollama/ollama:latest", - gpu_devices: bool = False, - ): - self.model = model - self.port = find_free_port() - self.container_name = f"ollama-ray-{self.port}" - self.base_url = f"http://localhost:{self.port}" - - remove_container(self.container_name) - - cmd = [ - "docker", - "run", - "-d", - "--name", - self.container_name, - "-p", - f"{self.port}:11434", - "-v", - "ollama_shared:/root/.ollama", - ] - - if gpu_devices: - cmd.extend( - [ - "--device=/dev/kfd", - "--device=/dev/dri", - "--group-add=video", - ] - ) - - cmd.append(image) - - print(f"[OllamaService] Starting container \ -'{self.container_name}' on port {self.port}...") - result = subprocess.run(cmd, capture_output=True, text=True) - if result.returncode != 0: - raise RuntimeError(f"Docker failed: {result.stderr}") - - print("[OllamaService] Waiting for Ollama to be ready...") - if not wait_for_service(f"{self.base_url}/api/tags"): - raise RuntimeError(f"Ollama didn't start within \ -timeout on port 
{self.port}") - - print(f"[OllamaService] Pulling model '{model}'...") - pull_result = subprocess.run( - ["docker", "exec", self.container_name, "ollama", "pull", model], - capture_output=True, - text=True, - ) - if pull_result.returncode != 0: - raise RuntimeError(f"Failed to pull model: {pull_result.stderr}") - - print(f"[OllamaService] Ready! \ -container='{self.container_name}', port={self.port}, model='{model}'") - - async def generate( - self, - messages: list, - max_tokens: int = 1024, - temperature: float = 0.1, - ) -> Any: - """ - Send messages to Ollama and return the response. - This is what RAGHandle calls to get LLM answers. - """ - async with httpx.AsyncClient(timeout=60.0) as client: - resp = await client.post( - f"{self.base_url}/api/chat", - json={ - "model": self.model, - "messages": messages, - "stream": False, - "options": { - "num_predict": max_tokens, - "temperature": temperature, - }, - }, - ) - resp.raise_for_status() - return resp.json()["message"]["content"] - - async def health(self) -> dict: - """Check if this Ollama instance is alive.""" - try: - async with httpx.AsyncClient(timeout=5.0) as client: - await client.get(f"{self.base_url}/api/tags") - return { - "status": "ok", - "port": self.port, - "container": self.container_name, - } - except Exception as e: - return {"status": "error", "error": str(e)} - - def __del__(self): - """Cleanup when Ray destroys this replica.""" - print(f"[OllamaService] Removing container '{self.container_name}'") - remove_container(self.container_name) - - -@serve.deployment(num_replicas=1) -class QdrantService: - """ - Manages a Qdrant Docker container. - - LIFECYCLE: - __init__: starts container (or reuses if already running) - get_url: returns the URL other services should connect to - __del__: leaves the container running (it has data!) 
- """ - - def __init__( - self, - port: int = 6333, - image: str = "qdrant/qdrant:latest", - storage_volume: str = "qdrant_data", - ): - self.port = port - self.container_name = "qdrant-ray" - self.url = f"http://localhost:{self.port}" - - if self._is_healthy(): - print(f"[QdrantService] Qdrant already running on port {self.port}") - return - - remove_container(self.container_name) - - cmd = [ - "docker", - "run", - "-d", - "--name", - self.container_name, - "-p", - f"{self.port}:6333", - "-v", - f"{storage_volume}:/qdrant/storage", - image, - ] - - print(f"[QdrantService] Starting Qdrant on port {self.port}...") - result = subprocess.run(cmd, capture_output=True, text=True) - if result.returncode != 0: - raise RuntimeError(f"Docker failed: {result.stderr}") - - if not wait_for_service(f"{self.url}/healthz"): - raise RuntimeError( - f"Qdrant didn't start within timeout on port {self.port}" - ) - - print(f"[QdrantService] Ready on port {self.port}") - - def _is_healthy(self) -> bool: - try: - resp = httpx.get(f"{self.url}/healthz", timeout=3) - return resp.status_code == 200 - except Exception: - return False - - async def get_url(self) -> str: - """Return the URL. Called by RAGHandle to know where Qdrant is.""" - return self.url - - async def health(self) -> dict: - try: - async with httpx.AsyncClient(timeout=5.0) as client: - await client.get(f"{self.url}/healthz") - return {"status": "ok", "port": self.port, "url": self.url} - except Exception as e: - return {"status": "error", "error": str(e)} - - def __del__(self): - print(f"[QdrantService] Actor destroyed. 
\ -Container '{self.container_name}' left running.") diff --git a/src/modules/rag/rag.py b/src/modules/rag/rag.py index 6b9744d..2372334 100644 --- a/src/modules/rag/rag.py +++ b/src/modules/rag/rag.py @@ -1,17 +1,14 @@ +import json +import os from dataclasses import dataclass, field -from typing import Any, Optional +from typing import Any, AsyncGenerator -import httpx -from qdrant_client import QdrantClient -from qdrant_client.models import FieldCondition, Filter, MatchValue from ray import serve from ray.serve import handle -from sentence_transformers import SentenceTransformer from src.core.module import ModuleWithHandle, ModuleWithId from src.modules.speech_to_text.events import Sentence - -from .events import RAGResult +from src.modules.text_to_speech.events import Token @dataclass @@ -21,25 +18,14 @@ class RAGQuery: _user_id: str question: str preferences: dict = field(default_factory=dict) - # preferences can include: language, tone, - # response_format, max_length, system_prompt, extra_instructions, etc. -@serve.deployment( - num_replicas=2, - ray_actor_options={"num_cpus": 1}, -) +@serve.deployment(name="RAGHandle") class RAGHandle: - """ - Stateless RAG processor. Knows nothing about sessions. - Receives a _user_id + question, uses _user_id to find the right - collection/data in the vector DB, runs embed -> search -> LLM. - """ + """Stateless RAG processor. 
Streams LLM tokens to the caller.""" def __init__( self, - ollama_handle=None, - qdrant_handle=None, qdrant_url: str = "http://localhost:6333", default_collection: str = "documents", embedding_model: str = "BAAI/bge-large-en-v1.5", @@ -50,6 +36,8 @@ def __init__( top_k: int = 5, score_threshold: float = 0.5, ): + from sentence_transformers import SentenceTransformer + self.embed_model = SentenceTransformer(embedding_model) self.default_collection = default_collection self.top_k = top_k @@ -60,35 +48,21 @@ def __init__( self.llm_model = llm_model self.llm_api_key = llm_api_key - self.ollama_handle = ollama_handle - self.qdrant_handle = qdrant_handle - self._qdrant_url = qdrant_url - self._qdrant: QdrantClient | None = None + self._qdrant: Any = None + self._verify_ssl = os.environ.get("HURI_RAG_VERIFY_SSL", "true").lower() != "false" async def _get_qdrant(self): - """Connect to Qdrant on first use. Solves the async-in-init problem.""" if self._qdrant is None: - if self.qdrant_handle: - self._qdrant_url = await self.qdrant_handle.get_url.remote() - self._qdrant = QdrantClient(url=self._qdrant_url) + from qdrant_client import QdrantClient + + self._qdrant = QdrantClient(url=self._qdrant_url, verify=self._verify_ssl) print(f"[RAGHandle] Connected to Qdrant at {self._qdrant_url}") return self._qdrant def _resolve_user_context(self, _user_id: str) -> tuple[str, dict | None]: - """ - Given a _user_id, decide which collection to search - and which filters to apply. 
- - Options (pick what fits your data model): - A) One collection per user: collection = f"user_{_user_id}" - B) Shared collection, filter by _user_id in payload - C) Lookup in a DB to find the user's config - """ - collection = self.default_collection filters = {"_user_id": _user_id} - return collection, filters def _embed(self, text) -> list[float] | Any: @@ -101,9 +75,10 @@ def _search( collection: str, filters: dict | None = None, ) -> list[dict]: - qdrant_filter: Any = None if filters: + from qdrant_client.models import FieldCondition, Filter, MatchValue + conditions: Any = [ FieldCondition(key=k, match=MatchValue(value=v)) for k, v in filters.items() @@ -135,7 +110,6 @@ def _build_prompt( chunks: list[dict], preferences: dict, ) -> tuple[str, str]: - parts = [ "You are a robot speaking to a user. Answer based on the provided context.", "If the context is insufficient, say so clearly.", @@ -162,126 +136,159 @@ def _build_prompt( context_parts = [] for i, chunk in enumerate(chunks, 1): source = chunk["metadata"].get("source", "unknown") - context_parts.append(f"[{i}] (source: {source}, score: \ -{chunk['score']:.2f})\n{chunk['text']}") + context_parts.append( + f"[{i}] (source: {source}, score: {chunk['score']:.2f})\n" + f"{chunk['text']}" + ) context_block = "\n\n".join(context_parts) user_prompt = ( f"Context:\n{context_block}\n\n" f"Question: {question}\n\n" - "Answer based on the context above.\ -Don't speak about the sources, just use them to answer the question." + "Answer based on the context above. " + "Don't speak about the sources, just use them to answer." 
) return system_prompt, user_prompt - async def _llm_generate( + async def _stream_ollama( + self, messages: list, max_tokens: int + ) -> AsyncGenerator[str, None]: + import httpx + + async with httpx.AsyncClient(timeout=120.0, verify=self._verify_ssl) as client: + async with client.stream( + "POST", + f"{self.llm_url}/api/chat", + json={ + "model": self.llm_model, + "messages": messages, + "stream": True, + "options": {"num_predict": max_tokens, "temperature": 0.1}, + }, + ) as resp: + resp.raise_for_status() + async for line in resp.aiter_lines(): + if not line: + continue + try: + chunk = json.loads(line) + except json.JSONDecodeError: + continue + delta = chunk.get("message", {}).get("content", "") + if delta: + yield delta + if chunk.get("done"): + return + + async def _stream_openai_compatible( + self, + url: str, + messages: list, + max_tokens: int, + api_key: str = "", + ) -> AsyncGenerator[str, None]: + import httpx + + headers = {"Content-Type": "application/json"} + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + async with httpx.AsyncClient(timeout=120.0, verify=self._verify_ssl) as client: + async with client.stream( + "POST", + url, + headers=headers, + json={ + "model": self.llm_model, + "messages": messages, + "max_tokens": max_tokens, + "temperature": 0.1, + "stream": True, + }, + ) as resp: + resp.raise_for_status() + async for line in resp.aiter_lines(): + if not line or not line.startswith("data:"): + continue + payload = line[len("data:"):].strip() + if payload == "[DONE]": + return + try: + chunk = json.loads(payload) + except json.JSONDecodeError: + continue + delta = ( + chunk.get("choices", [{}])[0] + .get("delta", {}) + .get("content", "") + ) + if delta: + yield delta + + async def _llm_stream( self, system_prompt: str, user_prompt: str, preferences: dict, - ) -> Any: + ) -> AsyncGenerator[str, None]: max_tokens = preferences.get("max_length", 1024) messages = [ {"role": "system", "content": system_prompt}, {"role": "user", 
"content": user_prompt}, ] - if self.ollama_handle: - return await self.ollama_handle.generate.remote(messages, max_tokens) - if self.llm_provider == "vllm": - return await self._call_openai_compatible( + async for d in self._stream_openai_compatible( f"{self.llm_url}/v1/chat/completions", messages, max_tokens - ) - elif self.llm_provider == "ollama": - return await self._call_ollama(messages, max_tokens) - + ): + yield d elif self.llm_provider == "api": - return await self._call_openai_compatible( + async for d in self._stream_openai_compatible( f"{self.llm_url}/v1/chat/completions", messages, max_tokens, self.llm_api_key, - ) + ): + yield d + elif self.llm_provider == "ollama": + async for d in self._stream_ollama(messages, max_tokens): + yield d else: raise ValueError(f"Unknown llm_provider: {self.llm_provider}") - async def _call_openai_compatible( - self, url: str, messages: list, max_tokens: int, api_key: str = "" - ) -> Any: - headers = {"Content-Type": "application/json"} - if api_key: - headers["Authorization"] = f"Bearer {api_key}" - async with httpx.AsyncClient(timeout=60.0) as client: - resp = await client.post( - url, - headers=headers, - json={ - "model": self.llm_model, - "messages": messages, - "max_tokens": max_tokens, - "temperature": 0.1, - }, - ) - resp.raise_for_status() - return resp.json()["choices"][0]["message"]["content"] - - async def _call_ollama(self, messages: list, max_tokens: int) -> Any: - async with httpx.AsyncClient(timeout=60.0) as client: - resp = await client.post( - f"{self.llm_url}/api/chat", - json={ - "model": self.llm_model, - "messages": messages, - "stream": False, - "options": {"num_predict": max_tokens, "temperature": 0.1}, - }, - ) - resp.raise_for_status() - return resp.json()["message"]["content"] - - async def process(self, query: RAGQuery) -> RAGResult: - """ - Main entry point. Called by the RAG module. - Uses _user_id to determine which collection / filters to use. 
- """ - + async def stream(self, query: RAGQuery) -> AsyncGenerator[str, None]: + """Main streaming entry point — yields LLM text deltas.""" print(f"[RAG] Question: {query.question}") qdrant = await self._get_qdrant() - collection, filters = self._resolve_user_context(query._user_id) query_vector = self._embed(query.question) chunks = self._search(qdrant, query_vector, collection, filters) print(f"[RAG] Found {len(chunks)} chunks") - for c in chunks: - print(f" - score: {c['score']:.2f} | {c['text'][:100]}...") - system_prompt, user_prompt = self._build_prompt( query.question, chunks, query.preferences ) - print(f"[RAG] System prompt: {system_prompt[:200]}...") - answer = await self._llm_generate(system_prompt, user_prompt, query.preferences) - print(f"[RAG] Answer: {answer}") - - return RAGResult( - answer=answer, - sources=[ - {"text": c["text"], "score": c["score"], "metadata": c["metadata"]} - for c in chunks - ], - ) + + async for delta in self._llm_stream( + system_prompt, user_prompt, query.preferences + ): + yield delta class RAG(ModuleWithHandle, ModuleWithId): + """RAG Module — streams LLM tokens. + + input: question (Sentence) + output: token (Token) + """ + _handle_cls = RAGHandle input_type = "question" - output_type = "rag_response" + output_type = "token" def __init__( self, - _handle: handle.DeploymentHandle[RAGHandle], + _handle: handle.DeploymentHandle, _user_id: str, language="en", tone="formal", @@ -300,22 +307,17 @@ def __init__( "extra_instructions": extra_instructions, } - async def process(self, data: Sentence) -> Optional[RAGResult]: - """ - Called when a "question" event arrives through the event bus. - Packages _user_id + question, sends to the stateless RAGHandle. 
- """ - question_text = data.text - + async def process(self, data: Sentence) -> AsyncGenerator[Token, None]: # type: ignore[override] query = RAGQuery( _user_id=self._user_id if self._user_id else "anonymous", - question=question_text, + question=data.text, preferences=self.preferences, ) - result: RAGResult = await self._handle.process.remote(query) - return result + stream = self._handle.options(stream=True).stream.remote(query) + async for delta in stream: + yield Token(text=delta, end=False) + yield Token(text="", end=True) def update_preferences(self, new_preferences: dict): - """Client can update preferences mid-session via the event bus.""" self.preferences.update(new_preferences) From 616087a8cde9adf74dff022af99d16019da23263 Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Wed, 27 May 2026 06:35:45 +0200 Subject: [PATCH 07/31] feated(helm): modules loading between local nvidia and amd example helm + custom health probe --- .../local_nvidia_amd/templates/_helpers.tpl | 64 +++++++++++++++++++ .../templates/rayservice.yaml | 11 ++++ deploy/examples/local_nvidia_amd/values.yaml | 29 ++++++++- 3 files changed, 101 insertions(+), 3 deletions(-) create mode 100644 deploy/examples/local_nvidia_amd/templates/_helpers.tpl diff --git a/deploy/examples/local_nvidia_amd/templates/_helpers.tpl b/deploy/examples/local_nvidia_amd/templates/_helpers.tpl new file mode 100644 index 0000000..e15a832 --- /dev/null +++ b/deploy/examples/local_nvidia_amd/templates/_helpers.tpl @@ -0,0 +1,64 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "huri.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully-qualified app name. +Truncated at 63 chars because some Kubernetes name fields have this limit. 
+*/}} +{{- define "huri.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Chart label: -. +*/}} +{{- define "huri.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels applied to every resource. +*/}} +{{- define "huri.labels" -}} +helm.sh/chart: {{ include "huri.chart" . }} +{{ include "huri.selectorLabels" . }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels (used in matchLabels / ingress backends). +*/}} +{{- define "huri.selectorLabels" -}} +app.kubernetes.io/name: {{ include "huri.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Name of the KubeRay-managed serve service. +KubeRay appends "-serve-svc" to the RayService name. +*/}} +{{- define "huri.serveSvcName" -}} +{{- printf "%s-serve-svc" (include "huri.fullname" .) }} +{{- end }} + +{{/* +Name of the KubeRay-managed head service. +KubeRay appends "-head-svc" to the RayService name. +*/}} +{{- define "huri.headSvcName" -}} +{{- printf "%s-head-svc" (include "huri.fullname" .) 
}} +{{- end }} diff --git a/deploy/examples/local_nvidia_amd/templates/rayservice.yaml b/deploy/examples/local_nvidia_amd/templates/rayservice.yaml index 31d7f00..318dac8 100644 --- a/deploy/examples/local_nvidia_amd/templates/rayservice.yaml +++ b/deploy/examples/local_nvidia_amd/templates/rayservice.yaml @@ -119,6 +119,17 @@ spec: image: {{ $.Values.image.repository }}:{{ $.Values.image.tag }} {{- end }} imagePullPolicy: {{ $.Values.image.pullPolicy }} + readinessProbe: + exec: + command: + - bash + - -c + - wget --tries 1 -T 2 -q -O- http://localhost:52365/api/local_raylet_healthz + | grep success + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 10 + timeoutSeconds: 5 env: {{- $hasEnv := false }} {{- if .containerEnv }} diff --git a/deploy/examples/local_nvidia_amd/values.yaml b/deploy/examples/local_nvidia_amd/values.yaml index 3e90893..8f7a3e5 100644 --- a/deploy/examples/local_nvidia_amd/values.yaml +++ b/deploy/examples/local_nvidia_amd/values.yaml @@ -31,11 +31,32 @@ ray: runtime_env: env_vars: RAY_COLOR_PREFIX: "1" + HURI_RAG_VERIFY_SSL: "false" deployments: + # HuRI: FastAPI/WebSocket ingress + per-session router. CPU only — + # all GPU work is offloaded to handle-backed deployments below. - name: HuRI ray_actor_options: num_cpus: 1 num_gpus: 0 + # STT: shared faster-whisper actor, pinned to AMD. + - name: STT + num_replicas: 1 + ray_actor_options: + num_cpus: 1 + num_gpus: 0.5 + resources: {"GPU_TYPE_AMD": 0.5} + # RAG: embeddings (sentence-transformers) + LLM client. Pinned to AMD. 
+ - name: RAGHandle + num_replicas: 1 + ray_actor_options: + num_cpus: 1 + num_gpus: 0.5 + resources: {"GPU_TYPE_AMD": 0.5} + init_kwargs: + qdrant_url: "https://qdrant.pommier.lan" + llm_url: "https://llm.pommier.lan" + embedding_model: "bge-large-en-v1.5-gguf-Q4_K_M" - name: TTS ray_actor_options: num_cpus: 1 @@ -98,7 +119,7 @@ workerGroups: replicas: 1 minReplicas: 1 maxReplicas: 1 - mountVoiceAssets: false + mountVoiceAssets: true nodeSelector: gpu: nvidia # affinity: {} # optional @@ -134,7 +155,7 @@ workerGroups: replicas: 1 minReplicas: 1 maxReplicas: 1 - mountVoiceAssets: true + mountVoiceAssets: false nodeSelector: gpu: amd # affinity: {} # optional @@ -214,6 +235,8 @@ models: # HURI_MODEL_PATH must match mountPath/modelId (see text_to_speech.py). env: HURI_MODEL_PATH: /models/cosytts/iic/CosyVoice2-0.5B + # Path to the CosyVoice repo root containing third_party/Matcha-TTS. + HURI_COSY_DIR: /app/cosyvoice emage: enabled: true @@ -248,7 +271,7 @@ voiceAssets: mountPath: /assets env: HURI_VOICE_SAMPLE_PATH: /assets/voice.wav - HURI_VOICE_TRANSCRIPT: "Hello, this is my voice sample for cloning." + HURI_VOICE_TRANSCRIPT: "Instinct creates its own oppressors and bids us rise up against them." # Ingress for the Ray Serve endpoint (port 8000). 
ingress: From fe939592b955f7c061e46663ed59f7cdd1c569cc Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Fri, 29 May 2026 18:09:43 +0200 Subject: [PATCH 08/31] fixed(tts): gesture and tts connection using pts + summarizing data fetching on client --- src/core/client.py | 27 ++++++++++++++-- src/core/events.py | 41 ++++++++++++++++++++---- src/modules/gesture/gesture.py | 48 ++++++++++++++-------------- src/modules/text_to_speech/events.py | 1 + src/modules/utils/sender.py | 42 ++++++++++++++++++++++-- 5 files changed, 122 insertions(+), 37 deletions(-) diff --git a/src/core/client.py b/src/core/client.py index 085a0b8..b68f4bb 100644 --- a/src/core/client.py +++ b/src/core/client.py @@ -35,11 +35,32 @@ def _save_user_id(self, _user_id: str): f.write(_user_id) async def _receive_loop(self, ws: websockets.ClientConnection): + import struct try: while True: - text = await ws.recv() - print("<<", text) - await asyncio.sleep(0.1) + msg = await ws.recv() + if isinstance(msg, bytes): + if len(msg) < 2: + print(f"<< bytes ({len(msg)}B, no topic)") + continue + (topic_len,) = struct.unpack(">H", msg[:2]) + topic = msg[2:2 + topic_len].decode() + payload = msg[2 + topic_len:] + + if topic == "audio" and len(payload) >= 13: + sample_rate, end_flag, pts = struct.unpack(">IBd", payload[:13]) + n_samples = (len(payload) - 13) // 4 + print( + f"<< audio: pts={pts:.3f}s samples={n_samples} @ {sample_rate}Hz " + f"end={bool(end_flag)}" + ) + elif topic == "motion" and len(payload) >= 16: + pts, fps, n_frames = struct.unpack(">dII", payload[:16]) + print(f"<< motion: pts={pts:.3f}s frames={n_frames} @ {fps}fps") + else: + print(f"<< {topic}: bytes ({len(payload)}B)") + else: + print("<<", msg) except (asyncio.CancelledError, websockets.ConnectionClosedOK): pass diff --git a/src/core/events.py b/src/core/events.py index b764258..1d116bc 100644 --- a/src/core/events.py +++ b/src/core/events.py @@ -1,9 +1,12 @@ import asyncio +import logging from collections import defaultdict from 
dataclasses import dataclass from .module import Module +logger = logging.getLogger("ray.serve") + @dataclass class EventData: @@ -43,7 +46,13 @@ def register(self, module: Module): self.subscribers[module.input_type].append(module) async def publish(self, event_topic, data): - for module in self.subscribers[event_topic]: + subs = self.subscribers[event_topic] + if event_topic not in ("audio",): # skip mic-frame spam + logger.info( + "[GRAPH] publish topic=%r subscribers=%s", + event_topic, [type(m).__name__ for m in subs], + ) + for module in subs: asyncio.create_task(self._run(module, data)) async def _run(self, module: Module, data): @@ -55,17 +64,35 @@ async def _run(self, module: Module, data): async for item in result: if item is None: continue + logger.info( + "[GRAPH] %s -> %r: %s", + type(module).__name__, module.output_type, _summarize(item), + ) await self.publish(module.output_type, item) - except Exception as e: - print(f"[ERROR] async generator in {module}: {e}") + except Exception: + logger.exception("[GRAPH] async generator failed in %s", type(module).__name__) else: try: value = await result if value is not None: + logger.info( + "[GRAPH] %s -> %r: %s", + type(module).__name__, module.output_type, _summarize(value), + ) await self.publish(module.output_type, value) - except Exception as e: - print(f"[ERROR] coroutine in {module}: {e}") + except Exception: + logger.exception("[GRAPH] coroutine failed in %s", type(module).__name__) + + except Exception: + logger.exception("[GRAPH] process() call failed in %s", type(module).__name__) + - except Exception as e: - print(f"[ERROR] process() call failed in {module}: {e}") +def _summarize(item) -> str: + """Short repr that avoids dumping full numpy arrays into the log.""" + cls = type(item).__name__ + import numpy as np + data = getattr(item, "data", None) + if isinstance(data, np.ndarray): + return f"{cls}(shape={data.shape}, dtype={data.dtype})" + return f"{cls}({item!r})" diff --git 
a/src/modules/gesture/gesture.py b/src/modules/gesture/gesture.py index 427806b..39a2560 100644 --- a/src/modules/gesture/gesture.py +++ b/src/modules/gesture/gesture.py @@ -21,6 +21,7 @@ class Motion: expressions: np.ndarray # (t, 100) facial expression coefficients trans: np.ndarray # (t, 3) global root translation fps: int = 30 + pts: float = 0.0 # presentation timestamp in seconds, paired with Audio.pts @serve.deployment(name="GestureGeneration") @@ -30,17 +31,25 @@ def __init__( hf_repo: str = _HF_REPO, device: Optional[str] = None, ): + print(f"[Gesture] importing torch...", flush=True) import torch + print(f"[Gesture] importing emage...", flush=True) from .emage import EmageAudioModel, EmageVAEConv, EmageVQModel, EmageVQVAEConv self.device = torch.device( device if device else ("cuda" if torch.cuda.is_available() else "cpu") ) + print(f"[Gesture] device={self.device} hf_repo={hf_repo!r}", flush=True) + print("[Gesture] loading face_vq...", flush=True) face_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/face").to(self.device) + print("[Gesture] loading upper_vq...", flush=True) upper_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/upper").to(self.device) + print("[Gesture] loading lower_vq...", flush=True) lower_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/lower").to(self.device) + print("[Gesture] loading hands_vq...", flush=True) hands_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/hands").to(self.device) + print("[Gesture] loading global_ae...", flush=True) global_ae = EmageVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/global").to(self.device) self.motion_vq = EmageVQModel( @@ -52,13 +61,19 @@ def __init__( ) self.motion_vq.eval() + print("[Gesture] loading EmageAudioModel...", flush=True) self.model = EmageAudioModel.from_pretrained(hf_repo).to(self.device) self.model.eval() + print(f"[Gesture] ready", flush=True) - def infer(self, audio_np: np.ndarray) -> Motion: + def 
infer(self, audio_np: np.ndarray, source_sr: int = _EMAGE_SR) -> Motion: import torch import torch.nn.functional as F + if source_sr != _EMAGE_SR: + import librosa + audio_np = librosa.resample(audio_np, orig_sr=source_sr, target_sr=_EMAGE_SR) + audio_ts = torch.from_numpy(audio_np).to(self.device).unsqueeze(0) speaker_id = torch.zeros(1, 1, dtype=torch.long, device=self.device) @@ -96,11 +111,9 @@ class Gesture(ModuleWithHandle): """Gesture Module Consumes streaming Audio chunks produced by TTS and generates whole-body - SMPL-X motion using the EMAGE audio-to-gesture model. - - Audio chunks are buffered until TTS signals the end of an utterance - (Audio.end == True). At that point the full waveform is passed to EMAGE - and a single Motion object is yielded. + SMPL-X motion using the EMAGE audio-to-gesture model. Inference runs once + per chunk so Motion events interleave with audio playback instead of all + arriving at the end of the utterance. input: audio (Audio) output: motion (Motion) @@ -118,25 +131,12 @@ def __init__( _handle: handle.DeploymentHandle, ): super().__init__(_handle) - self._chunks: list[np.ndarray] = [] async def process(self, audio: Audio) -> AsyncGenerator[Motion, None]: # type: ignore[override] - import librosa - - if audio.data.size > 0: - chunk = audio.data - if audio.sample_rate != _EMAGE_SR: - chunk = librosa.resample(chunk, orig_sr=audio.sample_rate, target_sr=_EMAGE_SR) - self._chunks.append(chunk.astype(np.float32)) - - if not audio.end: + if audio.data.size == 0: return - - if not self._chunks: - return - - full_audio = np.concatenate(self._chunks) - self._chunks = [] - - motion = await self._handle.infer.remote(full_audio) + motion = await self._handle.infer.remote( + audio.data.astype(np.float32), audio.sample_rate + ) + motion.pts = audio.pts yield motion diff --git a/src/modules/text_to_speech/events.py b/src/modules/text_to_speech/events.py index 9dd6fd4..dceb269 100644 --- a/src/modules/text_to_speech/events.py +++ 
b/src/modules/text_to_speech/events.py @@ -16,3 +16,4 @@ class Audio(EventData): data: np.ndarray sample_rate: int end: bool = False + pts: float = 0.0 # presentation timestamp in seconds from utterance start diff --git a/src/modules/utils/sender.py b/src/modules/utils/sender.py index f09b0ba..1a1d7eb 100644 --- a/src/modules/utils/sender.py +++ b/src/modules/utils/sender.py @@ -1,9 +1,16 @@ +import logging +import struct from dataclasses import asdict +import numpy as np from fastapi import WebSocket from src.core.events import EventData from src.core.module import Module +from src.modules.gesture.gesture import Motion +from src.modules.text_to_speech.events import Audio + +logger = logging.getLogger("ray.serve") class Sender(Module): @@ -11,6 +18,9 @@ class Sender(Module): Send output data to the client. This data must be JSON serialisable, like a dataclass. + Audio wire format: [4B sample_rate uint32][1B end][8B pts float64][float32 PCM]. + Motion wire format: [8B pts float64][4B fps uint32][4B n_frames uint32] + [poses float32 n*165][expressions float32 n*100][trans float32 n*3]. 
input: auto, output: None""" @@ -21,10 +31,36 @@ def __init__(self, ws: WebSocket, type: str): self.ws: WebSocket = ws self.input_type = type - async def process(self, data: EventData | bytes): + async def process(self, _): + data = _ + logger.info("[Sender:%s] received %s", self.input_type, type(data).__name__) if isinstance(data, bytes): - await self.ws.send_bytes(data) + await self.ws.send_bytes(self._prefix(data)) + elif isinstance(data, Audio): + logger.info( + "[Sender:%s] Audio samples=%d sr=%d end=%s pts=%.3fs", + self.input_type, data.data.shape[0], data.sample_rate, data.end, data.pts, + ) + header = struct.pack(">IBd", data.sample_rate, int(data.end), data.pts) + await self.ws.send_bytes(self._prefix(header + data.data.tobytes())) + elif isinstance(data, Motion): + n_frames = data.poses.shape[0] + logger.info( + "[Sender:%s] Motion frames=%d fps=%d pts=%.3fs", + self.input_type, n_frames, data.fps, data.pts, + ) + header = struct.pack(">dII", data.pts, data.fps, n_frames) + body = ( + data.poses.astype(np.float32).tobytes() + + data.expressions.astype(np.float32).tobytes() + + data.trans.astype(np.float32).tobytes() + ) + await self.ws.send_bytes(self._prefix(header + body)) elif isinstance(data, EventData): await self.ws.send_json(asdict(data)) else: - await self.ws.send_text(data) + await self.ws.send_text(str(data)) + + def _prefix(self, payload: bytes) -> bytes: + topic_bytes = self.input_type.encode() + return struct.pack(">H", len(topic_bytes)) + topic_bytes + payload From b728cd8c3dcbdff43795091f5cc2c8f180e9f440 Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Fri, 29 May 2026 18:10:53 +0200 Subject: [PATCH 09/31] fixed(rag): init arguments not being taken in kube values + hot values swap capability --- src/modules/rag/rag.py | 148 +++++++++++++++++++++++------------------ 1 file changed, 85 insertions(+), 63 deletions(-) diff --git a/src/modules/rag/rag.py b/src/modules/rag/rag.py index 2372334..0195e48 100644 --- a/src/modules/rag/rag.py 
+++ b/src/modules/rag/rag.py @@ -1,8 +1,8 @@ import json -import os from dataclasses import dataclass, field from typing import Any, AsyncGenerator +from pydantic import BaseModel from ray import serve from ray.serve import handle @@ -11,6 +11,20 @@ from src.modules.text_to_speech.events import Token +class RAGDeploymentConfig(BaseModel): + qdrant_url: str = "http://localhost:6333" + default_collection: str = "documents" + embedding_model: str = "BAAI/bge-large-en-v1.5" + embedding_url: str = "" + llm_provider: str = "ollama" # "vllm", "ollama", "api" + llm_url: str = "http://localhost:11434" + llm_model: str = "mistral:7b" + llm_api_key: str = "" + verify_ssl: bool = True + top_k: int = 5 + score_threshold: float = 0.5 + + @dataclass class RAGQuery: """What flows from RAG module to RAGHandle.""" @@ -24,49 +38,52 @@ class RAGQuery: class RAGHandle: """Stateless RAG processor. Streams LLM tokens to the caller.""" - def __init__( - self, - qdrant_url: str = "http://localhost:6333", - default_collection: str = "documents", - embedding_model: str = "BAAI/bge-large-en-v1.5", - llm_provider: str = "ollama", # "vllm", "ollama", "api" - llm_url: str = "http://localhost:11434", - llm_model: str = "mistral:7b", - llm_api_key: str = "", - top_k: int = 5, - score_threshold: float = 0.5, - ): - from sentence_transformers import SentenceTransformer - - self.embed_model = SentenceTransformer(embedding_model) - self.default_collection = default_collection - self.top_k = top_k - self.score_threshold = score_threshold + def __init__(self, **kwargs): + self._cfg = RAGDeploymentConfig(**kwargs) + self._apply_config() - self.llm_provider = llm_provider - self.llm_url = llm_url - self.llm_model = llm_model - self.llm_api_key = llm_api_key + def reconfigure(self, config: dict) -> None: + self._cfg = RAGDeploymentConfig(**{**self._cfg.model_dump(), **config}) + self._apply_config() - self._qdrant_url = qdrant_url - self._qdrant: Any = None - self._verify_ssl = 
os.environ.get("HURI_RAG_VERIFY_SSL", "true").lower() != "false" - - async def _get_qdrant(self): - if self._qdrant is None: - from qdrant_client import QdrantClient + def _apply_config(self) -> None: + import httpx + from qdrant_client import QdrantClient - self._qdrant = QdrantClient(url=self._qdrant_url, verify=self._verify_ssl) - print(f"[RAGHandle] Connected to Qdrant at {self._qdrant_url}") - return self._qdrant + cfg = self._cfg + self.embedding_url = cfg.embedding_url or cfg.llm_url + self._qdrant = QdrantClient(url=cfg.qdrant_url, verify=cfg.verify_ssl) + print(f"[RAGHandle] Connected to Qdrant at {cfg.qdrant_url}") + self._embed_client = httpx.AsyncClient(timeout=30.0, verify=cfg.verify_ssl) + self._llm_client = httpx.AsyncClient(timeout=120.0, verify=cfg.verify_ssl) def _resolve_user_context(self, _user_id: str) -> tuple[str, dict | None]: - collection = self.default_collection + collection = self._cfg.default_collection filters = {"_user_id": _user_id} return collection, filters - def _embed(self, text) -> list[float] | Any: - return self.embed_model.encode(str(text), normalize_embeddings=True).tolist() + async def _embed(self, text: str) -> list[float]: + url = f"{self.embedding_url}/v1/embeddings" + resp = await self._embed_client.post( + url, + json={"model": self._cfg.embedding_model, "input": str(text)}, + ) + if resp.status_code != 200: + raise RuntimeError( + f"Embedding HTTP {resp.status_code} from {url}: {resp.text[:1000]}" + ) + try: + payload = resp.json() + except Exception as e: + raise RuntimeError( + f"Embedding non-JSON response from {url}: {resp.text[:1000]}" + ) from e + try: + return payload["data"][0]["embedding"] + except (KeyError, IndexError, TypeError) as e: + raise RuntimeError( + f"Embedding unexpected schema from {url}: {str(payload)[:1000]}" + ) from e def _search( self, @@ -90,8 +107,8 @@ def _search( collection_name=collection, query=query_vector, query_filter=qdrant_filter, - limit=self.top_k, - 
score_threshold=self.score_threshold, + limit=self._cfg.top_k, + score_threshold=self._cfg.score_threshold, ).points except Exception: results = [] @@ -153,14 +170,11 @@ def _build_prompt( async def _stream_ollama( self, messages: list, max_tokens: int ) -> AsyncGenerator[str, None]: - import httpx - - async with httpx.AsyncClient(timeout=120.0, verify=self._verify_ssl) as client: - async with client.stream( + async with self._llm_client.stream( "POST", - f"{self.llm_url}/api/chat", + f"{self._cfg.llm_url}/api/chat", json={ - "model": self.llm_model, + "model": self._cfg.llm_model, "messages": messages, "stream": True, "options": {"num_predict": max_tokens, "temperature": 0.1}, @@ -187,18 +201,15 @@ async def _stream_openai_compatible( max_tokens: int, api_key: str = "", ) -> AsyncGenerator[str, None]: - import httpx - headers = {"Content-Type": "application/json"} if api_key: headers["Authorization"] = f"Bearer {api_key}" - async with httpx.AsyncClient(timeout=120.0, verify=self._verify_ssl) as client: - async with client.stream( + async with self._llm_client.stream( "POST", url, headers=headers, json={ - "model": self.llm_model, + "model": self._cfg.llm_model, "messages": messages, "max_tokens": max_tokens, "temperature": 0.1, @@ -236,43 +247,54 @@ async def _llm_stream( {"role": "user", "content": user_prompt}, ] - if self.llm_provider == "vllm": + if self._cfg.llm_provider == "vllm": async for d in self._stream_openai_compatible( - f"{self.llm_url}/v1/chat/completions", messages, max_tokens + f"{self._cfg.llm_url}/v1/chat/completions", messages, max_tokens ): yield d - elif self.llm_provider == "api": + elif self._cfg.llm_provider == "api": async for d in self._stream_openai_compatible( - f"{self.llm_url}/v1/chat/completions", + f"{self._cfg.llm_url}/v1/chat/completions", messages, max_tokens, - self.llm_api_key, + self._cfg.llm_api_key, ): yield d - elif self.llm_provider == "ollama": + elif self._cfg.llm_provider == "ollama": async for d in 
self._stream_ollama(messages, max_tokens): yield d else: - raise ValueError(f"Unknown llm_provider: {self.llm_provider}") + raise ValueError(f"Unknown llm_provider: {self._cfg.llm_provider}") async def stream(self, query: RAGQuery) -> AsyncGenerator[str, None]: """Main streaming entry point — yields LLM text deltas.""" + import traceback + print(f"[RAG] Question: {query.question}") - qdrant = await self._get_qdrant() collection, filters = self._resolve_user_context(query._user_id) - query_vector = self._embed(query.question) - chunks = self._search(qdrant, query_vector, collection, filters) + query_vector = await self._embed(query.question) + + try: + chunks = self._search(self._qdrant, query_vector, collection, filters) + except Exception: + print(f"[RAG] FAILED during Qdrant search:\n{traceback.format_exc()}") + raise print(f"[RAG] Found {len(chunks)} chunks") system_prompt, user_prompt = self._build_prompt( query.question, chunks, query.preferences ) - async for delta in self._llm_stream( - system_prompt, user_prompt, query.preferences - ): - yield delta + print(f"[RAG] Streaming from LLM at {self._cfg.llm_url} (provider={self._cfg.llm_provider}, model={self._cfg.llm_model})") + try: + async for delta in self._llm_stream( + system_prompt, user_prompt, query.preferences + ): + yield delta + except Exception: + print(f"[RAG] FAILED during LLM stream:\n{traceback.format_exc()}") + raise class RAG(ModuleWithHandle, ModuleWithId): From ad7f65835c3be3b0d0c80a6a36e07a8cd3be4d0a Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Fri, 29 May 2026 18:11:56 +0200 Subject: [PATCH 10/31] fixed(tts): fixed bi stream capacity + debug logs support directly in dashboard --- src/modules/text_to_speech/text_to_speech.py | 257 ++++++++++++------- 1 file changed, 169 insertions(+), 88 deletions(-) diff --git a/src/modules/text_to_speech/text_to_speech.py b/src/modules/text_to_speech/text_to_speech.py index 8c25eb8..9bd579b 100644 --- a/src/modules/text_to_speech/text_to_speech.py 
+++ b/src/modules/text_to_speech/text_to_speech.py @@ -1,7 +1,9 @@ import asyncio +import logging import os -import re +import queue import sys +import uuid from typing import AsyncGenerator import numpy as np @@ -12,6 +14,19 @@ from .events import Audio, Token +logger = logging.getLogger("ray.serve") +logger.setLevel(os.environ.get("HURI_TTS_LOG_LEVEL", "INFO").upper()) + + +def _trace(msg: str) -> None: + """Belt-and-braces log: hits both the ray.serve logger AND stdout. + + Ray Serve captures stdout per replica and surfaces it in the dashboard's + Logs tab — that's the path that survives any logger misconfiguration. + """ + logger.info(msg) + print(f"[TTS] {msg}", flush=True) + # Defaults — overridden by env vars in production (see README.md) _MODEL_PATH = os.environ.get("HURI_MODEL_PATH", "/models/cosytts/iic/CosyVoice2-0.5B") @@ -20,132 +35,198 @@ "HURI_VOICE_TRANSCRIPT", "Hello, this is my voice sample for cloning." ) -# Hard endings (.!?) trigger synthesis immediately; soft endings (,;:) only after -# min_clause_chars are buffered, to avoid synthesizing very short fragments. -_HARD_END_RE = re.compile(r'[.!?]["\']?\s+') -_SOFT_END_RE = re.compile(r'[,;:]\s+') - -_DONE = object() # sentinel for exhausted sync generator +_END_TEXT = object() # sentinel pushed into the text queue to close synth +_END_AUDIO = object() # sentinel pushed into the audio queue when synth completes +_DONE = object() # sentinel for exhausted sync generator -@serve.deployment(name="TTS") +@serve.deployment(name="TTS", max_ongoing_requests=200) class TTSDeployment: + """CosyVoice2 wrapper with per-session bistream synthesis. + + The model's `inference_zero_shot` accepts a Python generator as `tts_text` + and yields audio chunks as text arrives — that's the "bistream" mode. + Because the model call is fully synchronous, each session runs in a thread + via `run_in_executor` and is fed by a thread-safe `queue.Queue` that the + asyncio side pushes text into. 
+ """ + def __init__( self, model_path: str = _MODEL_PATH, voice_sample_path: str = _VOICE_SAMPLE_PATH, voice_sample_transcript: str = _VOICE_SAMPLE_TRANSCRIPT, ): + _trace(f"TTSDeployment init: model_path={model_path} voice={voice_sample_path}") + cosy_dir = os.environ.get("HURI_COSY_DIR") if cosy_dir: matcha_path = os.path.join(cosy_dir, "third_party", "Matcha-TTS") if os.path.isdir(matcha_path) and matcha_path not in sys.path: sys.path.insert(0, matcha_path) + logger.debug("Added Matcha-TTS path to sys.path: %s", matcha_path) from cosyvoice.cli.cosyvoice import CosyVoice2 - from cosyvoice.utils.file_utils import load_wav self.model = CosyVoice2(model_path, load_jit=False, load_trt=False) self.sample_rate: int = self.model.sample_rate + _trace(f"CosyVoice2 loaded (sample_rate={self.sample_rate})") - self.prompt_speech = load_wav(voice_sample_path, 16000) + self.prompt_speech = voice_sample_path self.prompt_text: str = voice_sample_transcript - async def synthesize(self, text: str) -> AsyncGenerator[Audio, None]: - """Run CosyVoice2 streaming inference and yield Audio chunks. - - The synchronous CosyVoice2 generator runs in a thread-pool executor so - it does not block the asyncio event loop between chunks. 
- """ - loop = asyncio.get_running_loop() - gen = self.model.inference_zero_shot( - text, - self.prompt_text, - self.prompt_speech, - stream=True, - ) - while True: - result = await loop.run_in_executor(None, next, gen, _DONE) - if result is _DONE: - break - yield Audio( - data=result["tts_speech"].squeeze(0).numpy().astype(np.float32), - sample_rate=self.sample_rate, - ) + self._text_queues: dict[str, queue.Queue] = {} async def get_sample_rate(self) -> int: return self.sample_rate + async def start_session(self, session_id: str) -> None: + self._text_queues[session_id] = queue.Queue() + _trace(f"[{session_id}] session started (active={len(self._text_queues)})") + + async def push_text(self, session_id: str, text: str, end: bool) -> None: + q = self._text_queues.get(session_id) + if q is None: + _trace(f"[{session_id}] WARNING push_text on unknown session (text={text!r} end={end})") + return + if text: + q.put(text) + _trace(f"[{session_id}] push_text {text!r} (qsize={q.qsize()})") + if end: + q.put(_END_TEXT) + _trace(f"[{session_id}] push_text: end-of-stream sentinel") + + async def stream_audio(self, session_id: str) -> AsyncGenerator[Audio, None]: + text_q = self._text_queues[session_id] + loop = asyncio.get_running_loop() + chunk_count = 0 + _trace(f"[{session_id}] stream_audio: starting CosyVoice inference") + + def text_gen(): + while True: + item = text_q.get() + if item is _END_TEXT: + _trace(f"[{session_id}] text_gen: received end sentinel") + return + _trace(f"[{session_id}] text_gen yielding: {item!r}") + yield item + + try: + audio_iter = self.model.inference_zero_shot( + text_gen(), + self.prompt_text, + self.prompt_speech, + stream=True, + ) + while True: + result = await loop.run_in_executor(None, next, audio_iter, _DONE) + if result is _DONE: + break + assert isinstance(result, dict) + chunk_count += 1 + speech = result["tts_speech"].squeeze(0).numpy().astype(np.float32) + _trace( + f"[{session_id}] audio chunk #{chunk_count}: " + 
f"{speech.shape[0]} samples (~{speech.shape[0] / self.sample_rate:.2f}s)" + ) + yield Audio(data=speech, sample_rate=self.sample_rate) + except Exception as e: + _trace(f"[{session_id}] stream_audio FAILED: {e!r}") + logger.exception("[%s] stream_audio failed", session_id) + raise + finally: + self._text_queues.pop(session_id, None) + _trace(f"[{session_id}] stream_audio finished (chunks={chunk_count})") -class TTS(ModuleWithHandle): - """TTS Module - - Stream text tokens in, stream audio chunks out using CosyVoice2 zero-shot - voice cloning. - Buffers incoming tokens and synthesizes as soon as a sentence or clause - boundary is detected. Audio chunks are yielded immediately as CosyVoice2 - produces them, so playback can start before synthesis is complete. +class TTS(ModuleWithHandle): + """TTS Module — bistream tokens-in / audio-out via CosyVoice2. - Compatible with both the Ray Serve event graph (async generator support in - EventGraph._run) and direct client streaming. + Opens one synthesis session per utterance (delimited by `token.end`). Each + incoming token is pushed straight into the model's text generator so audio + starts coming back before the LLM has finished producing the response. + No clause buffering on our side — CosyVoice's frontend handles segmentation + and stitches LM calls together across the whole utterance. - input: token (Token), + input: token (Token) output: audio (Audio) - - :min_clause_chars: minimum buffer length before a soft boundary (,;:) - triggers synthesis. Hard endings (.!?) always trigger immediately. - Raise this value to produce longer, more natural-sounding segments. 
""" _handle_cls = TTSDeployment input_type = "token" output_type = "audio" - def __init__( - self, - _handle: handle.DeploymentHandle, - min_clause_chars: int = 20, - ): + def __init__(self, _handle: handle.DeploymentHandle): super().__init__(_handle) - self.min_clause_chars: int = min_clause_chars - self._buffer: str = "" + self._session_id: str | None = None + self._audio_q: asyncio.Queue | None = None + self._stream_task: asyncio.Task | None = None + self._session_ready: asyncio.Event | None = None async def process(self, token: Token) -> AsyncGenerator[Audio, None]: # type: ignore[override] - self._buffer += token.text - - # Drain all complete clauses from the buffer before waiting for more tokens - while True: - clause, remainder = self._split(self._buffer) - if not clause: - break - self._buffer = remainder - async for chunk in self._handle.synthesize.remote(clause): - yield chunk - - # Flush the remaining buffer when the LLM stream ends - if token.end and self._buffer.strip(): - async for chunk in self._handle.synthesize.remote(self._buffer.strip()): - yield chunk - self._buffer = "" - if token.end: + # Subsequent tokens within an utterance just push text — the first + # token's invocation is the long-running yielder that emits chunks as + # soon as CosyVoice produces them, decoupled from token arrival. 
+ if self._session_id is not None: + sid = self._session_id + ready = self._session_ready + if ready is not None: + await ready.wait() + print(f"[TTS-client] [{sid}] push token: {token.text!r} (end={token.end})", flush=True) + await self._handle.push_text.remote(sid, token.text, token.end) + return + + self._session_id = str(uuid.uuid4()) + self._session_ready = asyncio.Event() + sid = self._session_id + audio_q: asyncio.Queue = asyncio.Queue() + self._audio_q = audio_q + print(f"[TTS-client] [{sid}] opening new utterance session", flush=True) + await self._handle.start_session.remote(sid) + self._stream_task = asyncio.create_task(self._drain_audio(sid, audio_q)) + self._session_ready.set() + + print(f"[TTS-client] [{sid}] push token: {token.text!r} (end={token.end})", flush=True) + await self._handle.push_text.remote(sid, token.text, token.end) + + try: + count = 0 + while True: + item = await audio_q.get() + if item is _END_AUDIO: + break + count += 1 + print(f"[TTS-client] [{sid}] yield chunk #{count}", flush=True) + yield item + await self._stream_task + print(f"[TTS-client] [{sid}] utterance complete ({count} chunks)", flush=True) + sample_rate = await self._handle.get_sample_rate.remote() yield Audio(data=np.array([], dtype=np.float32), sample_rate=sample_rate, end=True) - - def _split(self, text: str) -> tuple[str, str]: - """Return (clause_to_synthesize, remaining_buffer). - - Splits on the first hard sentence ending (.!?) unconditionally, or on - the first soft clause ending (,;:) once the buffer is long enough. - Returns ("", text) when no boundary is found. 
- """ - m = _HARD_END_RE.search(text) - if m: - return text[: m.end()].strip(), text[m.end() :] - - if len(text) >= self.min_clause_chars: - m = _SOFT_END_RE.search(text) - if m: - return text[: m.end()].strip(), text[m.end() :] - - return "", text + finally: + self._session_id = None + self._audio_q = None + self._stream_task = None + self._session_ready = None + + async def _drain_audio(self, session_id: str, audio_q: asyncio.Queue) -> None: + try: + response = self._handle.options(stream=True).stream_audio.remote(session_id) + count = 0 + pts = 0.0 + async for audio in response: # type: ignore[attr-defined] + count += 1 + audio.pts = pts + pts += audio.data.shape[0] / audio.sample_rate + print( + f"[TTS-client] [{session_id}] drain received chunk #{count} " + f"pts={audio.pts:.3f}s next={pts:.3f}s", + flush=True, + ) + await audio_q.put(audio) + except Exception as e: + print(f"[TTS-client] [{session_id}] drain task FAILED: {e!r}", flush=True) + raise + finally: + await audio_q.put(_END_AUDIO) + print(f"[TTS-client] [{session_id}] drain task finished", flush=True) From 080f3422ab4662e955020ba92693d05e63c1096c Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Fri, 29 May 2026 18:13:18 +0200 Subject: [PATCH 11/31] feated(stt): using kube PVC to store whisper model --- .../templates/rayservice.yaml | 12 ++- .../templates/whisper-model-init-job.yaml | 84 +++++++++++++++++++ src/modules/speech_to_text/speech_to_text.py | 14 +++- 3 files changed, 107 insertions(+), 3 deletions(-) create mode 100644 deploy/examples/local_nvidia_amd/templates/whisper-model-init-job.yaml diff --git a/deploy/examples/local_nvidia_amd/templates/rayservice.yaml b/deploy/examples/local_nvidia_amd/templates/rayservice.yaml index 318dac8..1922d2e 100644 --- a/deploy/examples/local_nvidia_amd/templates/rayservice.yaml +++ b/deploy/examples/local_nvidia_amd/templates/rayservice.yaml @@ -5,7 +5,7 @@ metadata: labels: {{- include "huri.labels" . 
| nindent 4 }} annotations: - ray.io/initializing-timeout: "10m" + ray.io/initializing-timeout: "20m" spec: serveConfigV2: | {{ .Values.ray.serveConfig | indent 4 }} @@ -98,6 +98,12 @@ spec: emptyDir: medium: Memory sizeLimit: {{ .shmSize | default "1Gi" }} + {{- if .cudaCacheHostPath }} + - name: cuda-cache + hostPath: + path: {{ .cudaCacheHostPath }} + type: DirectoryOrCreate + {{- end }} {{- range .mountedModels }} {{- $model := index $.Values.models . }} {{- if $model.enabled }} @@ -166,6 +172,10 @@ spec: volumeMounts: - name: dshm mountPath: /dev/shm + {{- if .cudaCacheHostPath }} + - name: cuda-cache + mountPath: /home/ray/.nv + {{- end }} {{- range .mountedModels }} {{- $model := index $.Values.models . }} {{- if $model.enabled }} diff --git a/deploy/examples/local_nvidia_amd/templates/whisper-model-init-job.yaml b/deploy/examples/local_nvidia_amd/templates/whisper-model-init-job.yaml new file mode 100644 index 0000000..098f7f7 --- /dev/null +++ b/deploy/examples/local_nvidia_amd/templates/whisper-model-init-job.yaml @@ -0,0 +1,84 @@ +{{- if .Values.models.whisper.enabled }} +{{- $model := .Values.models.whisper }} +{{- $pvcName := printf "%s-whisper-models" (include "huri.fullname" .) }} +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ $pvcName }} + labels: + {{- include "huri.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-10" + "helm.sh/resource-policy": keep +spec: + accessModes: + {{- toYaml $model.pvc.accessModes | nindent 4 }} + resources: + requests: + storage: {{ $model.pvc.size }} + {{- if $model.pvc.storageClassName }} + storageClassName: {{ $model.pvc.storageClassName }} + {{- end }} +--- +# Runs on every install and upgrade (pre-install,pre-upgrade hook); the script exits early when the model is already on the PVC. +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "huri.fullname" . }}-whisper-init + labels: + {{- include "huri.labels" . 
| nindent 4 }} + annotations: + "helm.sh/hook": pre-install,pre-upgrade + "helm.sh/hook-weight": "-5" + "helm.sh/hook-delete-policy": hook-succeeded,before-hook-creation +spec: + backoffLimit: 3 + template: + metadata: + labels: + {{- include "huri.selectorLabels" . | nindent 8 }} + spec: + restartPolicy: OnFailure + {{- with $model.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + volumes: + - name: models + persistentVolumeClaim: + claimName: {{ include "huri.fullname" . }}-whisper-models + containers: + - name: whisper-downloader + image: python:3.11-slim + command: ["/bin/sh", "-c"] + args: + - | + set -e + MODEL_DIR="{{ $model.mountPath }}/{{ $model.modelSource.repoId }}" + if [ -f "$MODEL_DIR/model.bin" ]; then + echo "Model already present at $MODEL_DIR — skipping download." + exit 0 + fi + echo "Downloading {{ $model.modelSource.repoId }} into $MODEL_DIR …" + pip install --quiet huggingface_hub + python - <<'PYEOF' + from huggingface_hub import snapshot_download + snapshot_download( + "{{ $model.modelSource.repoId }}", + local_dir="{{ $model.mountPath }}/{{ $model.modelSource.repoId }}", + ) + PYEOF + echo "Download complete." 
+ volumeMounts: + - name: models + mountPath: {{ $model.mountPath }} + resources: + requests: + cpu: "500m" + memory: "512Mi" + limits: + cpu: "2" + memory: "1Gi" +{{- end }} diff --git a/src/modules/speech_to_text/speech_to_text.py b/src/modules/speech_to_text/speech_to_text.py index fa3b790..29c3ad8 100644 --- a/src/modules/speech_to_text/speech_to_text.py +++ b/src/modules/speech_to_text/speech_to_text.py @@ -1,8 +1,8 @@ import asyncio +import os from typing import List, Optional import numpy as np -from faster_whisper import WhisperModel from ray import serve from ray.serve import handle @@ -10,6 +10,8 @@ from .events import Transcript, Voice +_MODEL_PATH = os.environ.get("HURI_STT_MODEL_PATH", "base") + @serve.deployment(name="STT") class STTDeployment: @@ -18,19 +20,27 @@ class STTDeployment: Holds the faster-whisper model in a single Ray actor (pinned to the AMD worker in deployment configs). Exposes a single transcribe() call so per-session STT clients can offload the heavy work without owning a GPU. + + HURI_STT_MODEL_PATH: path to a local faster-whisper model directory (from + the whisper PVC). Falls back to "base" which triggers a HuggingFace + download — only acceptable for local dev without a PVC. 
""" def __init__( self, - model: str = "base", + model: str = _MODEL_PATH, device: str = "auto", compute_type: str = "auto", ): + print(f"[STT] loading model from {model!r} (device={device} compute_type={compute_type})", flush=True) + from faster_whisper import WhisperModel + self.model_faster = WhisperModel( model, device=device, compute_type=compute_type, ) + print(f"[STT] model loaded", flush=True) self.language = "en" async def transcribe(self, audio: np.ndarray) -> str: From 5146a0efd9fb1b650cf521f89fbda1e0555ee4d8 Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Fri, 29 May 2026 18:14:28 +0200 Subject: [PATCH 12/31] feat(kube): ingress template to avoid port forwarding huri for the socket and the dashboard --- .../local_nvidia_amd/templates/_helpers.tpl | 8 ++++++++ .../templates/head-dashboard-svc.yaml | 16 ++++++++++++++++ .../templates/ingress-dashboard.yaml | 3 +-- 3 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 deploy/examples/local_nvidia_amd/templates/head-dashboard-svc.yaml diff --git a/deploy/examples/local_nvidia_amd/templates/_helpers.tpl b/deploy/examples/local_nvidia_amd/templates/_helpers.tpl index e15a832..cb431d4 100644 --- a/deploy/examples/local_nvidia_amd/templates/_helpers.tpl +++ b/deploy/examples/local_nvidia_amd/templates/_helpers.tpl @@ -62,3 +62,11 @@ KubeRay appends "-head-svc" to the RayService name. {{- define "huri.headSvcName" -}} {{- printf "%s-head-svc" (include "huri.fullname" .) }} {{- end }} + +{{/* +Name of the stable dashboard service managed by this chart. +Selects the head pod via stable labels, avoiding KubeRay's random-suffix service. +*/}} +{{- define "huri.headDashboardSvcName" -}} +{{- printf "%s-head-dashboard-svc" (include "huri.fullname" .) 
}} +{{- end }} diff --git a/deploy/examples/local_nvidia_amd/templates/head-dashboard-svc.yaml b/deploy/examples/local_nvidia_amd/templates/head-dashboard-svc.yaml new file mode 100644 index 0000000..9b47431 --- /dev/null +++ b/deploy/examples/local_nvidia_amd/templates/head-dashboard-svc.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "huri.fullname" . }}-head-dashboard-svc + labels: + {{- include "huri.labels" . | nindent 4 }} +spec: + type: ClusterIP + selector: + ray.io/node-type: head + ray.io/group: headgroup + app.kubernetes.io/instance: {{ .Release.Name }} + ports: + - name: dashboard + port: 8265 + targetPort: 8265 diff --git a/deploy/examples/local_nvidia_amd/templates/ingress-dashboard.yaml b/deploy/examples/local_nvidia_amd/templates/ingress-dashboard.yaml index c8153df..dc91299 100644 --- a/deploy/examples/local_nvidia_amd/templates/ingress-dashboard.yaml +++ b/deploy/examples/local_nvidia_amd/templates/ingress-dashboard.yaml @@ -25,8 +25,7 @@ spec: pathType: Prefix backend: service: - # KubeRay creates -head-svc for the head node. - name: {{ include "huri.headSvcName" . }} + name: {{ include "huri.headDashboardSvcName" . 
}} port: number: 8265 {{- end }} From d17f6bf15527c9c82c620517011393be66cd3d35 Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Fri, 29 May 2026 18:15:32 +0200 Subject: [PATCH 13/31] feat(kube): new values to support new ingress and stt cache system (+ avoid HF to redownload everytime models) --- deploy/examples/local_nvidia_amd/values.yaml | 55 ++++++++++++++------ 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/deploy/examples/local_nvidia_amd/values.yaml b/deploy/examples/local_nvidia_amd/values.yaml index 8f7a3e5..3e4d0a1 100644 --- a/deploy/examples/local_nvidia_amd/values.yaml +++ b/deploy/examples/local_nvidia_amd/values.yaml @@ -31,7 +31,6 @@ ray: runtime_env: env_vars: RAY_COLOR_PREFIX: "1" - HURI_RAG_VERIFY_SSL: "false" deployments: # HuRI: FastAPI/WebSocket ingress + per-session router. CPU only — # all GPU work is offloaded to handle-backed deployments below. @@ -46,17 +45,21 @@ ray: num_cpus: 1 num_gpus: 0.5 resources: {"GPU_TYPE_AMD": 0.5} - # RAG: embeddings (sentence-transformers) + LLM client. Pinned to AMD. + # RAG: embeddings (API) + LLM client. Pinned to AMD for its dependencies, no GPU needed. 
- name: RAGHandle num_replicas: 1 ray_actor_options: num_cpus: 1 - num_gpus: 0.5 - resources: {"GPU_TYPE_AMD": 0.5} - init_kwargs: + num_gpus: 0 + resources: {"GPU_TYPE_AMD": 0.001} + user_config: qdrant_url: "https://qdrant.pommier.lan" - llm_url: "https://llm.pommier.lan" + llm_url: "https://llm.huri.lan" + embedding_url: "https://embedding.huri.lan" embedding_model: "bge-large-en-v1.5-gguf-Q4_K_M" + llm_provider: "vllm" + llm_model: "Qwen3.5-4B-GGUF" + verify_ssl: false - name: TTS ray_actor_options: num_cpus: 1 @@ -120,6 +123,7 @@ workerGroups: minReplicas: 1 maxReplicas: 1 mountVoiceAssets: true + cudaCacheHostPath: /var/cache/huri/cuda-nvidia nodeSelector: gpu: nvidia # affinity: {} # optional @@ -133,6 +137,8 @@ workerGroups: value: "all" - name: NVIDIA_DRIVER_CAPABILITIES value: "compute,utility" + - name: HF_HUB_DOWNLOAD_TIMEOUT + value: "10" # Models whose PVC will be mounted in this worker group. # Keys must match entries under .Values.models. mountedModels: @@ -169,11 +175,14 @@ workerGroups: containerEnv: - name: HSA_OVERRIDE_GFX_VERSION value: "11.5.1" + - name: HF_HUB_DOWNLOAD_TIMEOUT + value: "10" securityContext: seLinuxOptions: type: "spc_t" # privileged: true # Uncomment if spc_t still gets blocked by Fedora - mountedModels: [] + mountedModels: + - whisper resources: limits: cpu: "4" @@ -238,6 +247,22 @@ models: # Path to the CosyVoice repo root containing third_party/Matcha-TTS. HURI_COSY_DIR: /app/cosyvoice + whisper: + enabled: true + nodeSelector: + gpu: amd + pvc: + storageClassName: "" + size: 2Gi + accessModes: + - ReadWriteOnce + mountPath: /models/whisper + modelSource: + type: huggingface + repoId: Systran/faster-whisper-base + env: + HURI_STT_MODEL_PATH: /models/whisper/Systran/faster-whisper-base + emage: enabled: true nodeSelector: @@ -275,13 +300,13 @@ voiceAssets: # Ingress for the Ray Serve endpoint (port 8000). 
ingress: - enabled: false + enabled: true className: nginx - annotations: {} - # nginx.ingress.kubernetes.io/proxy-read-timeout: "3600" - # nginx.ingress.kubernetes.io/proxy-send-timeout: "3600" - # nginx.ingress.kubernetes.io/proxy-buffering: "off" - host: huri.example.com + annotations: + nginx.ingress.kubernetes.io/proxy-read-timeout: "3600" + nginx.ingress.kubernetes.io/proxy-send-timeout: "3600" + nginx.ingress.kubernetes.io/proxy-buffering: "off" + host: huri.lan tls: [] # - secretName: huri-tls # hosts: @@ -291,10 +316,10 @@ ingress: # expose publicly without additional auth. dashboard: ingress: - enabled: false + enabled: true className: nginx annotations: {} - host: huri-dashboard.example.com + host: dashboard.huri.lan tls: [] # Set to true to let this chart manage the KubeRay operator as a sub-chart. From 69c614bde15e1a8946590483d132c3149544aebb Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Sun, 31 May 2026 17:56:33 +0200 Subject: [PATCH 14/31] fix(tts): text to speech missing tokens in audio --- deploy/examples/local_nvidia_amd/values.yaml | 30 +++- src/modules/gesture/gesture.py | 168 +++++++++++++++++-- src/modules/text_to_speech/text_to_speech.py | 59 ++++--- 3 files changed, 215 insertions(+), 42 deletions(-) diff --git a/deploy/examples/local_nvidia_amd/values.yaml b/deploy/examples/local_nvidia_amd/values.yaml index 3e4d0a1..8f67376 100644 --- a/deploy/examples/local_nvidia_amd/values.yaml +++ b/deploy/examples/local_nvidia_amd/values.yaml @@ -31,6 +31,15 @@ ray: runtime_env: env_vars: RAY_COLOR_PREFIX: "1" + # Gesture sliding-window defaults. The Gesture *module* runs in the + # HuRI (CPU) actor, so these live app-wide here rather than on the + # nvidia worker. context_sec primes EMAGE for continuity across + # audio chunks; min_chunk_sec coalesces tiny TTS chunks so fewer, + # bounded inferences run (lower latency, smoother motion). Can be + # overridden per session via the gesture module `args` in the + # client config. 
+ HURI_GESTURE_CONTEXT_SEC: "2.0" + HURI_GESTURE_MIN_CHUNK_SEC: "0.5" deployments: # HuRI: FastAPI/WebSocket ingress + per-session router. CPU only — # all GPU work is offloaded to handle-backed deployments below. @@ -60,16 +69,23 @@ ray: llm_provider: "vllm" llm_model: "Qwen3.5-4B-GGUF" verify_ssl: false + # GPU split (manual override knob): TTS and Gesture share one NVIDIA + # GPU. num_gpus/resources are Ray *scheduling* fractions — they let + # both replicas pack onto the same device and bias the split. Audio + # (TTS) gets the lion's share so streamed speech stays low-latency; + # gesture is given the remainder. Tune these two pairs together so + # they sum to <= 1.0. To also cap gesture's actual VRAM allocation, + # set HURI_GESTURE_GPU_MEM_FRACTION on the nvidia worker (see below). - name: TTS ray_actor_options: num_cpus: 1 - num_gpus: 0.5 - resources: {"GPU_TYPE_NVIDIA": 0.5} + num_gpus: 0.8 + resources: {"GPU_TYPE_NVIDIA": 0.8} - name: GestureGeneration ray_actor_options: num_cpus: 1 - num_gpus: 0.5 - resources: {"GPU_TYPE_NVIDIA": 0.5} + num_gpus: 0.2 + resources: {"GPU_TYPE_NVIDIA": 0.2} head: # ClusterIP is preferred on real clusters; use NodePort for kind/minikube/k3s. @@ -139,6 +155,12 @@ workerGroups: value: "compute,utility" - name: HF_HUB_DOWNLOAD_TIMEOUT value: "10" + # Manual GPU split for gesture (see GestureGeneration above). Caps the + # EMAGE process to a fraction of GPU memory so TTS keeps the rest. "0" + # disables the cap. Keep roughly in line with its num_gpus fraction. + # Read by GestureGeneration, which runs in this worker group. + - name: HURI_GESTURE_GPU_MEM_FRACTION + value: "0.2" # Models whose PVC will be mounted in this worker group. # Keys must match entries under .Values.models. 
mountedModels: diff --git a/src/modules/gesture/gesture.py b/src/modules/gesture/gesture.py index 39a2560..f939d38 100644 --- a/src/modules/gesture/gesture.py +++ b/src/modules/gesture/gesture.py @@ -13,6 +13,16 @@ _HF_REPO = os.environ.get("HURI_EMAGE_REPO", "H-Liu1997/emage_audio") _EMAGE_SR = 16000 # EMAGE expects 16 kHz mono audio +_EMAGE_FPS = 30 # EMAGE emits motion at 30 fps + +# Sliding-window defaults. Overridable per-deployment via the module `args` +# block in the client config, or globally via the env vars below. +_CONTEXT_SEC = float(os.environ.get("HURI_GESTURE_CONTEXT_SEC", "2.0")) +_MIN_CHUNK_SEC = float(os.environ.get("HURI_GESTURE_MIN_CHUNK_SEC", "0.5")) + +# Optional manual GPU split: cap the gesture process to a fraction of the GPU so +# TTS keeps the lion's share. Only applied on CUDA when the value is set (>0). +_GPU_MEM_FRACTION = float(os.environ.get("HURI_GESTURE_GPU_MEM_FRACTION", "0.0")) @dataclass @@ -20,7 +30,7 @@ class Motion: poses: np.ndarray # (t, 165) SMPL-X axis-angle, 55 joints × 3 expressions: np.ndarray # (t, 100) facial expression coefficients trans: np.ndarray # (t, 3) global root translation - fps: int = 30 + fps: int = _EMAGE_FPS pts: float = 0.0 # presentation timestamp in seconds, paired with Audio.pts @@ -30,6 +40,7 @@ def __init__( self, hf_repo: str = _HF_REPO, device: Optional[str] = None, + gpu_mem_fraction: float = _GPU_MEM_FRACTION, ): print(f"[Gesture] importing torch...", flush=True) import torch @@ -41,6 +52,21 @@ def __init__( ) print(f"[Gesture] device={self.device} hf_repo={hf_repo!r}", flush=True) + # Manual GPU split: cap this process' share of GPU memory so the audio + # (TTS) path keeps the rest. num_gpus in the Ray serveConfig handles + # scheduling/packing; this caps actual allocation on the device. 
+ if self.device.type == "cuda" and gpu_mem_fraction > 0: + try: + torch.cuda.set_per_process_memory_fraction( + gpu_mem_fraction, self.device.index or 0 + ) + print( + f"[Gesture] GPU memory fraction capped at {gpu_mem_fraction:.2f}", + flush=True, + ) + except Exception as e: # noqa: BLE001 — best-effort knob, never fatal + print(f"[Gesture] WARNING could not cap GPU memory: {e!r}", flush=True) + print("[Gesture] loading face_vq...", flush=True) face_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/face").to(self.device) print("[Gesture] loading upper_vq...", flush=True) @@ -111,15 +137,32 @@ class Gesture(ModuleWithHandle): """Gesture Module Consumes streaming Audio chunks produced by TTS and generates whole-body - SMPL-X motion using the EMAGE audio-to-gesture model. Inference runs once - per chunk so Motion events interleave with audio playback instead of all - arriving at the end of the utterance. + SMPL-X motion using the EMAGE audio-to-gesture model. + + Sliding window + ────────────── + TTS emits short, uneven audio chunks. Running EMAGE on each chunk in + isolation produces motion that is jerky at chunk seams (the model has no + context across boundaries) and is slow because the per-chunk overhead is + re-paid for tiny inputs — and gets worse the longer the utterance runs if + naively re-fed the whole buffer. + + Instead we keep a rolling buffer and, each time at least ``min_chunk_sec`` + of fresh audio has arrived, run inference over a window of + ``[context_sec of already-spoken audio] + [the fresh audio]``. The context + primes the model so the seam is continuous; only the motion frames for the + fresh audio are emitted. The window length is bounded by + ``context_sec + chunk size`` so inference cost stays flat regardless of + utterance length. Global root translation is rebased onto the previously + emitted frame to avoid a jump every window. 
input: audio (Audio) output: motion (Motion) - :hf_repo: HuggingFace repository to load EMAGE weights from. - :device: PyTorch device string; defaults to CUDA when available. + :hf_repo: HuggingFace repository to load EMAGE weights from. + :device: PyTorch device string; defaults to CUDA when available. + :context_sec: Seconds of prior audio prepended to each window for continuity. + :min_chunk_sec: Minimum seconds of fresh audio to accumulate before inferring. """ _handle_cls = GestureDeployment @@ -129,14 +172,115 @@ class Gesture(ModuleWithHandle): def __init__( self, _handle: handle.DeploymentHandle, + context_sec: float = _CONTEXT_SEC, + min_chunk_sec: float = _MIN_CHUNK_SEC, ): super().__init__(_handle) + self._context_sec = float(context_sec) + self._min_chunk_sec = float(min_chunk_sec) + + # Per-utterance sliding-window state. All sample counts are in the + # source sample rate; resampling to 16 kHz happens once inside infer(). + self._lock = asyncio.Lock() + self._sr: Optional[int] = None + self._buffer = np.empty(0, dtype=np.float32) # trailing audio (ctx + unprocessed) + self._buf_start = 0 # source-sr sample index of buffer[0] in utterance timeline + self._emitted = 0 # source-sr samples whose motion has been emitted + self._trans_anchor: Optional[np.ndarray] = None # last emitted trans, for continuity + + def _reset(self) -> None: + self._sr = None + self._buffer = np.empty(0, dtype=np.float32) + self._buf_start = 0 + self._emitted = 0 + self._trans_anchor = None async def process(self, audio: Audio) -> AsyncGenerator[Motion, None]: # type: ignore[override] - if audio.data.size == 0: - return - motion = await self._handle.infer.remote( - audio.data.astype(np.float32), audio.sample_rate + # Each chunk arrives as its own process() task on the shared per-session + # instance, so serialise under a lock to keep the buffer ordered. 
+ async with self._lock: + if audio.data.size > 0: + if self._sr is None: + self._sr = audio.sample_rate + self._buffer = np.concatenate( + [self._buffer, audio.data.astype(np.float32)] + ) + + sr = self._sr + end_of_utterance = audio.end + + if sr is None: + # Nothing buffered yet (e.g. a lone end marker). Reset and bail. + if end_of_utterance: + self._reset() + return + + ctx_samples = int(self._context_sec * sr) + min_new_samples = int(self._min_chunk_sec * sr) + + global_end = self._buf_start + len(self._buffer) + new_samples = global_end - self._emitted + + # Wait for more audio unless this is the final flush of the utterance. + if new_samples <= 0 or (not end_of_utterance and new_samples < min_new_samples): + if end_of_utterance: + self._reset() + return + + motion = await self._infer_window(sr, ctx_samples, global_end) + if motion is not None: + yield motion + + if end_of_utterance: + self._reset() + + async def _infer_window( + self, sr: int, ctx_samples: int, global_end: int + ) -> Optional[Motion]: + # Window = [context of already-emitted audio] + [fresh audio]. + win_start = max(self._buf_start, self._emitted - ctx_samples) + window = self._buffer[win_start - self._buf_start :] + if window.size == 0: + return None + + motion: Motion = await self._handle.infer.remote(window, sr) + total_frames = motion.poses.shape[0] + + # Drop the leading frames that correspond to the context (already emitted). + skip_sec = (self._emitted - win_start) / sr + skip_frames = int(round(skip_sec * motion.fps)) + skip_frames = max(0, min(skip_frames, total_frames)) + + poses = motion.poses[skip_frames:] + expressions = motion.expressions[skip_frames:] + trans = motion.trans[skip_frames:].copy() + + # Advance the timeline even if rounding left no new frames to emit. 
+ self._emitted = global_end + self._trim_buffer(ctx_samples, global_end) + + if poses.shape[0] == 0: + return None + + # Rebase global translation onto the last emitted frame: every window + # restarts root motion near the origin, so without this the avatar would + # teleport back at each seam. + if self._trans_anchor is not None: + trans += self._trans_anchor - trans[0] + self._trans_anchor = trans[-1].copy() + + out = Motion( + poses=poses, + expressions=expressions, + trans=trans, + fps=motion.fps, + pts=win_start / sr + skip_sec, # == self._emitted_before / sr ) - motion.pts = audio.pts - yield motion + return out + + def _trim_buffer(self, ctx_samples: int, global_end: int) -> None: + # Keep only the trailing context so the next window stays bounded. + keep_from = global_end - ctx_samples + if keep_from > self._buf_start: + self._buffer = self._buffer[keep_from - self._buf_start :] + self._buf_start = keep_from diff --git a/src/modules/text_to_speech/text_to_speech.py b/src/modules/text_to_speech/text_to_speech.py index 9bd579b..9daa384 100644 --- a/src/modules/text_to_speech/text_to_speech.py +++ b/src/modules/text_to_speech/text_to_speech.py @@ -161,34 +161,41 @@ def __init__(self, _handle: handle.DeploymentHandle): self._session_id: str | None = None self._audio_q: asyncio.Queue | None = None self._stream_task: asyncio.Task | None = None - self._session_ready: asyncio.Event | None = None + # The EventGraph fans each token out as its own concurrent process() + # task on this shared instance. This lock serialises session setup and + # text pushes so tokens reach CosyVoice's text queue in arrival order, + # exactly once. asyncio.Lock wakes waiters FIFO and tokens are created + # in order, so order is preserved — crucially the end-of-utterance token + # can no longer overtake a content token (which would truncate synthesis + # and silently drop trailing words). 
+ self._push_lock = asyncio.Lock() async def process(self, token: Token) -> AsyncGenerator[Audio, None]: # type: ignore[override] - # Subsequent tokens within an utterance just push text — the first - # token's invocation is the long-running yielder that emits chunks as - # soon as CosyVoice produces them, decoupled from token arrival. - if self._session_id is not None: + # Acquire BEFORE any await so lock-acquisition order matches token order. + # Setup + push happen under the lock; only the first token of an + # utterance goes on to drain/yield audio (outside the lock, so pushes of + # later tokens are never blocked by the long-running drain). + async with self._push_lock: + is_first = self._session_id is None + if is_first: + self._session_id = str(uuid.uuid4()) + self._audio_q = asyncio.Queue() + print(f"[TTS-client] [{self._session_id}] opening new utterance session", flush=True) + await self._handle.start_session.remote(self._session_id) + self._stream_task = asyncio.create_task( + self._drain_audio(self._session_id, self._audio_q) + ) + sid = self._session_id - ready = self._session_ready - if ready is not None: - await ready.wait() + audio_q = self._audio_q + stream_task = self._stream_task print(f"[TTS-client] [{sid}] push token: {token.text!r} (end={token.end})", flush=True) await self._handle.push_text.remote(sid, token.text, token.end) - return - self._session_id = str(uuid.uuid4()) - self._session_ready = asyncio.Event() - sid = self._session_id - audio_q: asyncio.Queue = asyncio.Queue() - self._audio_q = audio_q - print(f"[TTS-client] [{sid}] opening new utterance session", flush=True) - await self._handle.start_session.remote(sid) - self._stream_task = asyncio.create_task(self._drain_audio(sid, audio_q)) - self._session_ready.set() - - print(f"[TTS-client] [{sid}] push token: {token.text!r} (end={token.end})", flush=True) - await self._handle.push_text.remote(sid, token.text, token.end) + if not is_first: + return + assert audio_q is not None and 
stream_task is not None try: count = 0 while True: @@ -198,16 +205,16 @@ async def process(self, token: Token) -> AsyncGenerator[Audio, None]: # type: i count += 1 print(f"[TTS-client] [{sid}] yield chunk #{count}", flush=True) yield item - await self._stream_task + await stream_task print(f"[TTS-client] [{sid}] utterance complete ({count} chunks)", flush=True) sample_rate = await self._handle.get_sample_rate.remote() yield Audio(data=np.array([], dtype=np.float32), sample_rate=sample_rate, end=True) finally: - self._session_id = None - self._audio_q = None - self._stream_task = None - self._session_ready = None + async with self._push_lock: + self._session_id = None + self._audio_q = None + self._stream_task = None async def _drain_audio(self, session_id: str, audio_q: asyncio.Queue) -> None: try: From 01ac7ef24902ae61927573d4a0753562eec83e82 Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Sun, 31 May 2026 17:58:38 +0200 Subject: [PATCH 15/31] feat(client): option to save streamed TTS audio to .wav files --- src/client.py | 12 ++++++++- src/core/client.py | 61 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/src/client.py b/src/client.py index 466cc05..d6bba63 100644 --- a/src/client.py +++ b/src/client.py @@ -26,11 +26,21 @@ async def launch_client(): required=True, help="Path to Client config file (YAML)", ) + parser.add_argument( + "--save-audio", + nargs="?", + const="audio_dumps", + default=None, + metavar="DIR", + help="Save streamed TTS audio to .wav files (one per utterance) in DIR " + "for quality-checking. 
Defaults to ./audio_dumps when the flag is given " + "without a value.", + ) args = parser.parse_args() config = load_client_config(args.config) - await Client(config=config).run() + await Client(config=config, save_audio_dir=args.save_audio).run() if __name__ == "__main__": diff --git a/src/core/client.py b/src/core/client.py index b68f4bb..2af97f2 100644 --- a/src/core/client.py +++ b/src/core/client.py @@ -1,9 +1,13 @@ import asyncio import json import os +import struct +import wave from dataclasses import asdict +from datetime import datetime from typing import Dict, List, Optional, Type +import numpy as np import websockets from src.core.dataclasses.config import ClientConfig @@ -19,11 +23,22 @@ def __init__( config: ClientConfig, user_id_file: str = os.path.expanduser("~/.huri_user_id"), senders_dict: Dict[str, Type[ClientSender]] = get_senders(), + save_audio_dir: Optional[str] = None, ): self.config = config self.user_id_file = user_id_file self.senders_dict = senders_dict + # When set, incoming audio chunks are buffered per utterance and written + # to a .wav under this directory each time an end-of-utterance marker + # arrives — handy for ear-checking what the TTS actually streamed. 
+ self.save_audio_dir = save_audio_dir + self._audio_buf: List[np.ndarray] = [] + self._audio_sr: Optional[int] = None + self._audio_idx = 0 + if save_audio_dir: + os.makedirs(save_audio_dir, exist_ok=True) + def _load_user_id(self) -> Optional[str]: if os.path.exists(self.user_id_file): with open(self.user_id_file) as f: @@ -34,8 +49,42 @@ def _save_user_id(self, _user_id: str): with open(self.user_id_file, "w") as f: f.write(_user_id) + def _collect_audio(self, samples: np.ndarray, sample_rate: int, end: bool) -> None: + if samples.size: + self._audio_buf.append(samples) + self._audio_sr = sample_rate + if end: + self._flush_audio() + + def _flush_audio(self) -> None: + if not self._audio_buf or self._audio_sr is None: + self._audio_buf = [] + return + audio = np.concatenate(self._audio_buf) + stamp = datetime.now().strftime("%Y%m%d-%H%M%S") + path = os.path.join( + self.save_audio_dir, f"utt-{self._audio_idx:03d}-{stamp}.wav" + ) + self._write_wav(path, audio, self._audio_sr) + print( + f"** saved audio: {path} ({audio.size} samples, " + f"~{audio.size / self._audio_sr:.2f}s @ {self._audio_sr}Hz)" + ) + self._audio_idx += 1 + self._audio_buf = [] + + @staticmethod + def _write_wav(path: str, audio: np.ndarray, sample_rate: int) -> None: + # float32 [-1, 1] -> 16-bit PCM, clipped to avoid wraparound on overshoot. + pcm = np.clip(audio, -1.0, 1.0) + pcm = (pcm * 32767.0).astype("= 13: sample_rate, end_flag, pts = struct.unpack(">IBd", payload[:13]) - n_samples = (len(payload) - 13) // 4 + # Samples are native-endian float32 (Sender uses ndarray.tobytes()). 
+ samples = np.frombuffer(payload[13:], dtype=np.float32) print( - f"<< audio: pts={pts:.3f}s samples={n_samples} @ {sample_rate}Hz " + f"<< audio: pts={pts:.3f}s samples={samples.size} @ {sample_rate}Hz " f"end={bool(end_flag)}" ) + if self.save_audio_dir: + self._collect_audio(samples, sample_rate, bool(end_flag)) elif topic == "motion" and len(payload) >= 16: pts, fps, n_frames = struct.unpack(">dII", payload[:16]) print(f"<< motion: pts={pts:.3f}s frames={n_frames} @ {fps}fps") @@ -64,6 +116,9 @@ async def _receive_loop(self, ws: websockets.ClientConnection): except (asyncio.CancelledError, websockets.ConnectionClosedOK): pass + finally: + if self.save_audio_dir: + self._flush_audio() # save anything left if the stream ended mid-utterance async def run(self): async with websockets.connect(self.config.huri_url) as ws: From 28e279fb32cbd7ea2e48fefe23b81c08f867604d Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Sun, 31 May 2026 17:59:20 +0200 Subject: [PATCH 16/31] fix(stt): missing lookup for PVC on kube init --- .../local_nvidia_amd/templates/whisper-model-init-job.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deploy/examples/local_nvidia_amd/templates/whisper-model-init-job.yaml b/deploy/examples/local_nvidia_amd/templates/whisper-model-init-job.yaml index 098f7f7..879d908 100644 --- a/deploy/examples/local_nvidia_amd/templates/whisper-model-init-job.yaml +++ b/deploy/examples/local_nvidia_amd/templates/whisper-model-init-job.yaml @@ -1,6 +1,7 @@ {{- if .Values.models.whisper.enabled }} {{- $model := .Values.models.whisper }} {{- $pvcName := printf "%s-whisper-models" (include "huri.fullname" .) }} +{{- if not (lookup "v1" "PersistentVolumeClaim" .Release.Namespace $pvcName) }} --- apiVersion: v1 kind: PersistentVolumeClaim @@ -21,6 +22,7 @@ spec: {{- if $model.pvc.storageClassName }} storageClassName: {{ $model.pvc.storageClassName }} {{- end }} +{{- end }} --- # Runs only on first install (not on upgrade) — model is already on the PVC. 
apiVersion: batch/v1 From 734aaf95d323759182b780a42f1f4fa0345a3d11 Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Mon, 1 Jun 2026 14:04:31 +0200 Subject: [PATCH 17/31] feat(gesture): warmup to generate output faster on first time --- src/modules/gesture/gesture.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/modules/gesture/gesture.py b/src/modules/gesture/gesture.py index f939d38..e4fb01b 100644 --- a/src/modules/gesture/gesture.py +++ b/src/modules/gesture/gesture.py @@ -90,8 +90,29 @@ def __init__( print("[Gesture] loading EmageAudioModel...", flush=True) self.model = EmageAudioModel.from_pretrained(hf_repo).to(self.device) self.model.eval() + + self._warmup() print(f"[Gesture] ready", flush=True) + def _warmup(self) -> None: + # The first inference pays a one-time cold-start cost (CUDA context + # init, kernel JIT/load, cuDNN autotuning, caching-allocator warmup) + # that can take several seconds. Run a throwaway pass here so that cost + # is paid at startup — where we're already blocking on weight loads — + # rather than on the first user-facing gesture. Best-effort: a failure + # here must never prevent the deployment from coming up. + import time + + try: + # ~3 s of silence at 16 kHz exercises the full sliding-window path + # (multiple rounds + remainder) the way a real utterance would. 
+ dummy = np.zeros(_EMAGE_SR * 3, dtype=np.float32) + t0 = time.time() + self.infer(dummy, source_sr=_EMAGE_SR) + print(f"[Gesture] warmup done in {time.time() - t0:.2f}s", flush=True) + except Exception as e: # noqa: BLE001 — warmup is an optimisation, never fatal + print(f"[Gesture] WARNING warmup failed: {e!r}", flush=True) + def infer(self, audio_np: np.ndarray, source_sr: int = _EMAGE_SR) -> Motion: import torch import torch.nn.functional as F From 191b6b0693000df819ecc75bf2994dc8d38b4b9b Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Mon, 1 Jun 2026 14:05:15 +0200 Subject: [PATCH 18/31] feat(tts): using CosyVoice3 model --- .../templates/cosytts-model-init-job.yaml | 39 ++++++++----------- deploy/examples/local_nvidia_amd/values.yaml | 18 ++++++--- src/modules/text_to_speech/text_to_speech.py | 27 ++++++++----- 3 files changed, 48 insertions(+), 36 deletions(-) diff --git a/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml b/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml index 0f62dd1..b0e3bc7 100644 --- a/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml +++ b/deploy/examples/local_nvidia_amd/templates/cosytts-model-init-job.yaml @@ -24,7 +24,9 @@ spec: {{- end }} {{- end }} --- -# Runs only on first install (not on upgrade) — models are already on the PVC. +# Runs on every install/upgrade (pre-* hook) but exits early once the model dir +# already looks complete — so a no-op upgrade costs only a pod spin-up. Set +# $model.forceDownload=true to wipe and re-fetch (e.g. when changing versions). 
apiVersion: batch/v1 kind: Job metadata: @@ -59,26 +61,25 @@ spec: - | set -e MODEL_DIR="{{ $model.mountPath }}/{{ $model.modelSource.modelId }}" - BLANK_EN_DIR="$MODEL_DIR/CosyVoice-BlankEN" - HAS_MAIN_CONFIG="no" - HAS_QWEN_WEIGHTS="no" - if [ -f "$MODEL_DIR/cosyvoice2.yaml" ]; then - HAS_MAIN_CONFIG="yes" + {{- if $model.forceDownload }} + echo "forceDownload=true — wiping $MODEL_DIR for a fresh download." + rm -rf "$MODEL_DIR" + {{- end }} + # CosyVoice3 ships cosyvoice3.yaml + llm.pt (the Qwen LM) plus a + # bundled CosyVoice-BlankEN/ dir, so a single snapshot grabs + # everything the worker needs — no separate sub-model download. + HAS_CONFIG="no" + HAS_LLM="no" + if [ -f "$MODEL_DIR/cosyvoice3.yaml" ]; then + HAS_CONFIG="yes" fi - if [ -f "$BLANK_EN_DIR/model.safetensors" ] || \ - [ -f "$BLANK_EN_DIR/pytorch_model.bin" ] || \ - [ -f "$BLANK_EN_DIR/model.safetensors.index.json" ] || \ - [ -f "$BLANK_EN_DIR/pytorch_model.bin.index.json" ]; then - HAS_QWEN_WEIGHTS="yes" + if [ -f "$MODEL_DIR/llm.pt" ]; then + HAS_LLM="yes" fi - if [ "$HAS_MAIN_CONFIG" = "yes" ] && [ "$HAS_QWEN_WEIGHTS" = "yes" ]; then + if [ "$HAS_CONFIG" = "yes" ] && [ "$HAS_LLM" = "yes" ]; then echo "Model already present at $MODEL_DIR — skipping download." exit 0 fi - if [ "$HAS_QWEN_WEIGHTS" = "no" ] && [ -d "$BLANK_EN_DIR" ]; then - echo "Partial Qwen weights detected; clearing $BLANK_EN_DIR before re-download." - rm -rf "$BLANK_EN_DIR" - fi echo "Downloading {{ $model.modelSource.modelId }} into $MODEL_DIR …" pip install --quiet modelscope python - <<'PYEOF' @@ -87,12 +88,6 @@ spec: "{{ $model.modelSource.modelId }}", local_dir="{{ $model.mountPath }}/{{ $model.modelSource.modelId }}", ) - # CosyVoice2 loads this sub-model at runtime; pre-download it so - # the worker pod does not need outbound internet access. - snapshot_download( - "iic/CosyVoice-BlankEN", - local_dir="{{ $model.mountPath }}/{{ $model.modelSource.modelId }}/CosyVoice-BlankEN", - ) PYEOF echo "Download complete." 
volumeMounts: diff --git a/deploy/examples/local_nvidia_amd/values.yaml b/deploy/examples/local_nvidia_amd/values.yaml index 8f67376..e771758 100644 --- a/deploy/examples/local_nvidia_amd/values.yaml +++ b/deploy/examples/local_nvidia_amd/values.yaml @@ -40,6 +40,10 @@ ray: # client config. HURI_GESTURE_CONTEXT_SEC: "2.0" HURI_GESTURE_MIN_CHUNK_SEC: "0.5" + # CosyVoice3 contract: "{instruction}<|endofprompt|>{reference transcript}". + # The reference transcript MUST come AFTER the marker, or the LM treats it as + # an instruction and intermittently speaks it (prompt leakage). + HURI_VOICE_TRANSCRIPT: "You are a helpful assistant.<|endofprompt|>Instinct creates its own oppressors and bids us rise up against them." deployments: # HuRI: FastAPI/WebSocket ingress + per-session router. CPU only — # all GPU work is offloaded to handle-backed deployments below. @@ -169,11 +173,11 @@ workerGroups: resources: limits: cpu: "4" - memory: "8Gi" + memory: "14Gi" nvidia.com/gpu: "1" requests: cpu: "2" - memory: "4Gi" + memory: "8Gi" nvidia.com/gpu: "1" shmSize: 2Gi @@ -256,16 +260,21 @@ models: - ReadWriteOnce # Where the model weights PVC is mounted inside the worker container. mountPath: /models/cosytts + # Set to true to wipe the existing model dir and re-download on the next + # helm upgrade. Needed when switching model versions on a PVC that already + # holds weights (the init Job otherwise skips download when the model dir + # already looks complete). Set back to false after the redownload runs. + forceDownload: false modelSource: # type: modelscope | huggingface (only modelscope is implemented) type: modelscope # ModelScope model ID — snapshot_download uses this as the sub-path # inside mountPath, so the final path is mountPath/modelId. - modelId: iic/CosyVoice2-0.5B + modelId: FunAudioLLM/Fun-CosyVoice3-0.5B-2512 # Env vars injected into every worker that mounts this model. # HURI_MODEL_PATH must match mountPath/modelId (see text_to_speech.py).
env: - HURI_MODEL_PATH: /models/cosytts/iic/CosyVoice2-0.5B + HURI_MODEL_PATH: /models/cosytts/FunAudioLLM/Fun-CosyVoice3-0.5B-2512 # Path to the CosyVoice repo root containing third_party/Matcha-TTS. HURI_COSY_DIR: /app/cosyvoice @@ -318,7 +327,6 @@ voiceAssets: mountPath: /assets env: HURI_VOICE_SAMPLE_PATH: /assets/voice.wav - HURI_VOICE_TRANSCRIPT: "Instinct creates its own oppressors and bids us rise up against them." # Ingress for the Ray Serve endpoint (port 8000). ingress: diff --git a/src/modules/text_to_speech/text_to_speech.py b/src/modules/text_to_speech/text_to_speech.py index 9daa384..4b9baf2 100644 --- a/src/modules/text_to_speech/text_to_speech.py +++ b/src/modules/text_to_speech/text_to_speech.py @@ -29,10 +29,19 @@ def _trace(msg: str) -> None: # Defaults — overridden by env vars in production (see README.md) -_MODEL_PATH = os.environ.get("HURI_MODEL_PATH", "/models/cosytts/iic/CosyVoice2-0.5B") +_MODEL_PATH = os.environ.get( + "HURI_MODEL_PATH", "/models/cosytts/FunAudioLLM/Fun-CosyVoice3-0.5B-2512" +) _VOICE_SAMPLE_PATH = os.environ.get("HURI_VOICE_SAMPLE_PATH", "/assets/voice.wav") -_VOICE_SAMPLE_TRANSCRIPT = os.environ.get( - "HURI_VOICE_TRANSCRIPT", "Hello, this is my voice sample for cloning." +# CosyVoice3 expects "{instruction}<|endofprompt|>{reference transcript}". If the +# config supplies a bare transcript (no marker), prepend the default instruction so the +# transcript lands AFTER <|endofprompt|> — otherwise the LM treats it as an instruction +# and intermittently renders it as speech (prompt leakage). The transcript is mandatory.
+_raw_transcript = os.environ["HURI_VOICE_TRANSCRIPT"] +_VOICE_SAMPLE_TRANSCRIPT = ( + _raw_transcript + if "<|endofprompt|>" in _raw_transcript + else f"You are a helpful assistant.<|endofprompt|>{_raw_transcript}" ) _END_TEXT = object() # sentinel pushed into the text queue to close synth @@ -42,7 +51,7 @@ def _trace(msg: str) -> None: @serve.deployment(name="TTS", max_ongoing_requests=200) class TTSDeployment: - """CosyVoice2 wrapper with per-session bistream synthesis. + """CosyVoice3 wrapper with per-session bistream synthesis. The model's `inference_zero_shot` accepts a Python generator as `tts_text` and yields audio chunks as text arrives — that's the "bistream" mode. @@ -57,7 +66,7 @@ def __init__( voice_sample_path: str = _VOICE_SAMPLE_PATH, voice_sample_transcript: str = _VOICE_SAMPLE_TRANSCRIPT, ): - _trace(f"TTSDeployment init: model_path={model_path} voice={voice_sample_path}") + _trace(f"TTSDeployment init: model_path={model_path} voice={voice_sample_path} transcript={voice_sample_transcript}") cosy_dir = os.environ.get("HURI_COSY_DIR") if cosy_dir: @@ -66,11 +75,11 @@ def __init__( sys.path.insert(0, matcha_path) logger.debug("Added Matcha-TTS path to sys.path: %s", matcha_path) - from cosyvoice.cli.cosyvoice import CosyVoice2 + from cosyvoice.cli.cosyvoice import CosyVoice3 - self.model = CosyVoice2(model_path, load_jit=False, load_trt=False) + self.model = CosyVoice3(model_dir=model_path, load_trt=False) self.sample_rate: int = self.model.sample_rate - _trace(f"CosyVoice2 loaded (sample_rate={self.sample_rate})") + _trace(f"CosyVoice3 loaded (sample_rate={self.sample_rate})") self.prompt_speech = voice_sample_path self.prompt_text: str = voice_sample_transcript @@ -140,7 +149,7 @@ def text_gen(): class TTS(ModuleWithHandle): - """TTS Module — bistream tokens-in / audio-out via CosyVoice2. + """TTS Module — bistream tokens-in / audio-out via CosyVoice3. Opens one synthesis session per utterance (delimited by `token.end`). 
Each incoming token is pushed straight into the model's text generator so audio From 5bc82be8712c56f2faf4c558a316104290635256 Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Mon, 1 Jun 2026 14:40:22 +0200 Subject: [PATCH 19/31] fixed(stt): audio_in event for the pipeline, so that text is read by RAG --- src/core/client_senders.py | 4 +- src/core/events.py | 2 +- src/modules/events.py | 8 +- src/modules/speech_to_text/microphone_vad.py | 7 +- src/modules/speech_to_text/speech_to_text.py | 82 +++++++++++++------- 5 files changed, 68 insertions(+), 35 deletions(-) diff --git a/src/core/client_senders.py b/src/core/client_senders.py index 03301a6..c1ae31f 100644 --- a/src/core/client_senders.py +++ b/src/core/client_senders.py @@ -45,7 +45,9 @@ async def send(self, topic: str, data: EventData | bytes): class AudioSender(ClientSender): - output_type = "audio" + # Mic frames go out on "audio_in"; the server's "audio" topic is reserved + # for TTS output streamed back to us (see MIC.input_type). 
+ output_type = "audio_in" def __init__( self, sample_rate: int = 16000, frame_duration: float = 0.030, **kwargs diff --git a/src/core/events.py b/src/core/events.py index 1d116bc..5df1bf8 100644 --- a/src/core/events.py +++ b/src/core/events.py @@ -47,7 +47,7 @@ def register(self, module: Module): async def publish(self, event_topic, data): subs = self.subscribers[event_topic] - if event_topic not in ("audio",): # skip mic-frame spam + if event_topic not in ("audio_in",): # skip mic-frame spam logger.info( "[GRAPH] publish topic=%r subscribers=%s", event_topic, [type(m).__name__ for m in subs], diff --git a/src/modules/events.py b/src/modules/events.py index b731c21..cb7bd61 100644 --- a/src/modules/events.py +++ b/src/modules/events.py @@ -7,6 +7,7 @@ def get_events() -> Dict[str, Type[EventData | bytes]]: events: Dict[str, Type[EventData | bytes]] = { + "audio_in": bytes, # inbound mic frames (raw int16 PCM) "audio": bytes, "voice": Voice, "transcript": Transcript, @@ -22,9 +23,10 @@ def get_events() -> Dict[str, Type[EventData | bytes]]: else: events["motion"] = Motion - # TTS output "audio" is an Audio dataclass internally; the websocket boundary - # only ever decodes raw bytes for the "audio" topic (mic input), so the - # registry keeps bytes there. Keep Audio importable for type completeness. + # TTS output "audio" is an Audio dataclass internally, sent to the client by + # the Sender's Audio branch (never decoded inbound). Inbound mic frames use + # the separate "audio_in" topic above. The registry keeps bytes for "audio" + # only for output-type registration. Keep Audio importable for completeness. _ = Audio return events diff --git a/src/modules/speech_to_text/microphone_vad.py b/src/modules/speech_to_text/microphone_vad.py index ffb82f7..eb52f74 100644 --- a/src/modules/speech_to_text/microphone_vad.py +++ b/src/modules/speech_to_text/microphone_vad.py @@ -13,7 +13,7 @@ class MIC(Module): Detect voice and silence using WebRTC VAD. 
- input: audio, + input: audio_in, output: voice :vad_agressiveness: from 0 (low) to 3 (high, can distord audio). @@ -23,7 +23,10 @@ class MIC(Module): Can only be 0.010, 0.020 and 0.030. """ - input_type = "audio" + # Inbound microphone frames travel on their own topic so the TTS-output + # "audio" topic (consumed by Gesture and the client Sender) never collides + # with mic input — otherwise raw mic bytes get echoed back to the client. + input_type = "audio_in" output_type = "voice" def __init__( diff --git a/src/modules/speech_to_text/speech_to_text.py b/src/modules/speech_to_text/speech_to_text.py index 29c3ad8..7a9b91d 100644 --- a/src/modules/speech_to_text/speech_to_text.py +++ b/src/modules/speech_to_text/speech_to_text.py @@ -1,6 +1,6 @@ import asyncio import os -from typing import List, Optional +from typing import AsyncGenerator, List import numpy as np from ray import serve @@ -93,40 +93,66 @@ def __init__( self.buffer: List[np.ndarray] = [] - self.silence: bool = True + # Set when the VAD emits its single end-of-utterance marker (Voice(None)). + # Remembered rather than acted on immediately so it survives an in-flight + # transcribe — otherwise the terminal window (and thus the question that + # drives the RAG) is silently dropped. + self.pending_end: bool = False self.running = False self.lock: asyncio.Lock = asyncio.Lock() - async def process(self, voice: Voice) -> Optional[Transcript]: - if voice.data is None: - self.silence = True - else: - self.silence = False - async with self.lock: + async def process(self, voice: Voice) -> AsyncGenerator[Transcript, None]: # type: ignore[override] + async with self.lock: + if voice.data is None: + self.pending_end = True + else: self.buffer.append(voice.data) - async with self.lock: if self.running: - return None + # Another invocation owns the drain loop below; it will pick up + # the frame we just buffered (and any pending end-of-utterance). 
+ return self.running = True - async with self.lock: - buffer_size = len(self.buffer) - if buffer_size == 0 or ( - self.silence is False and buffer_size < self.window_size - ): + try: + while True: + async with self.lock: + end = self.pending_end + buffer_size = len(self.buffer) + + if buffer_size == 0: + # Nothing left to transcribe. If the utterance just + # ended, still emit a terminal transcript so the + # aggregator finalises the question. + if end: + self.pending_end = False + yield Transcript("", True) + return + + # Mid-speech: hold until a full window has accumulated. + if not end and buffer_size < self.window_size: + return + + processing_chunks = self.buffer[: self.window_size] + # On end-of-utterance, the last window drains the buffer. + final = end and buffer_size <= self.window_size + + processing_audio = np.concatenate(processing_chunks, axis=0) + current_text: str = await self._handle.transcribe.remote( + processing_audio + ) + + async with self.lock: + if final: + self.buffer = [] + self.pending_end = False + else: + self.buffer = self.buffer[self.window_size - self.step_size :] + + yield Transcript(current_text, final) + if final: + return + finally: + async with self.lock: self.running = False - return None - processing_chunks = self.buffer[: self.window_size] - - processing_audio = np.concatenate(processing_chunks, axis=0) - - current_text: str = await self._handle.transcribe.remote(processing_audio) - - processed_size = self.window_size - self.step_size - async with self.lock: - self.buffer = self.buffer[processed_size:] - self.running = False - - return Transcript(current_text, self.silence) From 14a06bf775ea0def4b8ce19cb85cbc7ca0e25a74 Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Mon, 1 Jun 2026 15:47:48 +0200 Subject: [PATCH 20/31] fix(stt): going back to previous STT impelmentation, with a simpler handle and module --- deploy/examples/local_nvidia_amd/values.yaml | 1 + src/modules/speech_to_text/speech_to_text.py | 143 
++++++++----------- 2 files changed, 64 insertions(+), 80 deletions(-) diff --git a/deploy/examples/local_nvidia_amd/values.yaml b/deploy/examples/local_nvidia_amd/values.yaml index e771758..9033384 100644 --- a/deploy/examples/local_nvidia_amd/values.yaml +++ b/deploy/examples/local_nvidia_amd/values.yaml @@ -44,6 +44,7 @@ ray: # The reference transcript MUST come AFTER the marker, or the LM treats it as # an instruction and intermittently speaks it (prompt leakage). HURI_VOICE_TRANSCRIPT: "You are a helpful assistant.<|endofprompt|>Instinct creates its own oppressors and bids us rise up against them." + HURI_STT_MODEL_PATH: /models/whisper/Systran/faster-whisper-base deployments: # HuRI: FastAPI/WebSocket ingress + per-session router. CPU only — # all GPU work is offloaded to handle-backed deployments below. diff --git a/src/modules/speech_to_text/speech_to_text.py b/src/modules/speech_to_text/speech_to_text.py index 7a9b91d..9e48a62 100644 --- a/src/modules/speech_to_text/speech_to_text.py +++ b/src/modules/speech_to_text/speech_to_text.py @@ -1,6 +1,6 @@ import asyncio import os -from typing import AsyncGenerator, List +from typing import List, Optional import numpy as np from ray import serve @@ -15,15 +15,20 @@ @serve.deployment(name="STT") class STTDeployment: - """Stateless Whisper inference actor. + """faster-whisper model wrapper. - Holds the faster-whisper model in a single Ray actor (pinned to the AMD - worker in deployment configs). Exposes a single transcribe() call so - per-session STT clients can offload the heavy work without owning a GPU. + Holds the WhisperModel and runs transcription on its own Ray Serve actor, + off the HuRI master actor — model load and GPU inference no longer block the + websocket ingress / per-session router. Pinned to a GPU worker via + ray_actor_options in the Serve config (see deploy values.yaml). - HURI_STT_MODEL_PATH: path to a local faster-whisper model directory (from - the whisper PVC). 
Falls back to "base" which triggers a HuggingFace - download — only acceptable for local dev without a PVC. + Stateless across calls: the per-session sliding-window buffering lives in the + STT module, so this deployment is shared across all sessions. + + :model: path to (or size name of) the faster-whisper model. Defaults to the + HURI_STT_MODEL_PATH env var, falling back to "base". + :device: "cpu", "cuda", or "auto". + :compute_type: e.g. "int8", "float16", or "auto". """ def __init__( @@ -32,7 +37,6 @@ def __init__( device: str = "auto", compute_type: str = "auto", ): - print(f"[STT] loading model from {model!r} (device={device} compute_type={compute_type})", flush=True) from faster_whisper import WhisperModel self.model_faster = WhisperModel( @@ -40,35 +44,32 @@ def __init__( device=device, compute_type=compute_type, ) - print(f"[STT] model loaded", flush=True) - self.language = "en" - - async def transcribe(self, audio: np.ndarray) -> str: - loop = asyncio.get_running_loop() - segments, _ = await loop.run_in_executor( - None, - lambda: self.model_faster.transcribe( - audio, - language=self.language, - beam_size=1, - ), + + async def transcribe(self, audio: np.ndarray, language: str = "en") -> str: + segments, _ = self.model_faster.transcribe( + audio, + language=language, + beam_size=1, # faster for realtime ) - return " ".join(seg.text for seg in segments).strip() + return " ".join([seg.text for seg in segments]).strip() class STT(ModuleWithHandle): """STT Module - Per-session client: keeps the rolling window / silence state, offloads each - transcription window to the shared STTDeployment actor. + Transcribe voice using Faster_Whisper. + + Holds the per-session sliding-window buffer and delegates the actual + transcription to a handle-backed STTDeployment, so the Whisper model runs + off the HuRI master node. input: voice, output: transcript + :language: language spoken in the audio. It should be a language code such + as "en" or "fr". 
:sample_rate: size of received voice audio. Usually 8000, 16000 or 48000. :block_duration: size of received voice audio (in s). - :transcribe_window: rolling window length (s) handed to Whisper. - :transcribe_step: stride (s) between successive windows. """ _handle_cls = STTDeployment @@ -83,76 +84,58 @@ def __init__( block_duration: float = 0.020, # s transcribe_window: float = 2.0, # s transcribe_step: float = 1.0, # s + **kwargs, ): - super().__init__(_handle=_handle) + super().__init__(_handle=_handle, **kwargs) self.language = language + self.sample_rate = sample_rate self.window_size: int = int(transcribe_window / block_duration) self.step_size: int = int(transcribe_step / block_duration) self.buffer: List[np.ndarray] = [] - # Set when the VAD emits its single end-of-utterance marker (Voice(None)). - # Remembered rather than acted on immediately so it survives an in-flight - # transcribe — otherwise the terminal window (and thus the question that - # drives the RAG) is silently dropped. - self.pending_end: bool = False + self.silence: bool = True + + self.prev_text: str = "" + self.stable_text: str = "" self.running = False self.lock: asyncio.Lock = asyncio.Lock() - async def process(self, voice: Voice) -> AsyncGenerator[Transcript, None]: # type: ignore[override] - async with self.lock: - if voice.data is None: - self.pending_end = True - else: + async def process(self, voice: Voice) -> Optional[Transcript]: # type: ignore[override] + if voice.data is None: + self.silence = True + else: + self.silence = False + async with self.lock: self.buffer.append(voice.data) + async with self.lock: if self.running: - # Another invocation owns the drain loop below; it will pick up - # the frame we just buffered (and any pending end-of-utterance). - return + return None self.running = True - try: - while True: - async with self.lock: - end = self.pending_end - buffer_size = len(self.buffer) - - if buffer_size == 0: - # Nothing left to transcribe. 
If the utterance just - # ended, still emit a terminal transcript so the - # aggregator finalises the question. - if end: - self.pending_end = False - yield Transcript("", True) - return - - # Mid-speech: hold until a full window has accumulated. - if not end and buffer_size < self.window_size: - return - - processing_chunks = self.buffer[: self.window_size] - # On end-of-utterance, the last window drains the buffer. - final = end and buffer_size <= self.window_size - - processing_audio = np.concatenate(processing_chunks, axis=0) - current_text: str = await self._handle.transcribe.remote( - processing_audio - ) - - async with self.lock: - if final: - self.buffer = [] - self.pending_end = False - else: - self.buffer = self.buffer[self.window_size - self.step_size :] - - yield Transcript(current_text, final) - if final: - return - finally: - async with self.lock: + async with self.lock: + buffer_size = len(self.buffer) + if buffer_size == 0 or ( + self.silence is False and buffer_size < self.window_size + ): self.running = False + return None + processing_chunks = self.buffer[: self.window_size] + + self.pending_silence = False + processing_audio = np.concatenate(processing_chunks, axis=0) + + current_text = await self._handle.transcribe.remote( + processing_audio, self.language + ) + + processed_size = self.window_size - self.step_size + async with self.lock: + self.buffer = self.buffer[processed_size:] + self.running = False + + return Transcript(current_text, self.silence) From a1f5e6df070ae7141ab40d08b4f5ac734fcd2031 Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Mon, 1 Jun 2026 18:52:14 +0200 Subject: [PATCH 21/31] feat(rag): profile data from RAG based of uid --- src/modules/rag/ingestion.py | 154 +++++++++++++++++++++++++++++++++-- src/modules/rag/rag.py | 93 ++++++++++++++++++--- 2 files changed, 228 insertions(+), 19 deletions(-) diff --git a/src/modules/rag/ingestion.py b/src/modules/rag/ingestion.py index f4e4dae..88d85a8 100644 --- 
a/src/modules/rag/ingestion.py +++ b/src/modules/rag/ingestion.py @@ -7,7 +7,8 @@ from pathlib import Path from typing import Any, List -from pypdf import PdfReader +import httpx +import numpy as np from qdrant_client import QdrantClient from qdrant_client.models import ( Distance, @@ -17,12 +18,37 @@ PointStruct, VectorParams, ) -from semantic_chunker import SemanticChunker -from sentence_transformers import SentenceTransformer USER_ID_FILE = os.path.expanduser("~/.huri_user_id") +class RemoteEmbedder: + """Embed via an OpenAI-compatible ``/v1/embeddings`` endpoint (e.g. llama.cpp). + + Drop-in for the subset of ``SentenceTransformer`` this tool uses: a single + ``.encode(text, normalize_embeddings=...)`` returning a 1-D numpy array, so + the existing ``.tolist()`` / ``len(...)`` call sites keep working unchanged. + """ + + def __init__(self, url: str, model_name: str): + self.url = url.rstrip("/") + self.model_name = model_name + self._client = httpx.Client(timeout=60.0, verify=False) + + def encode(self, text: str, normalize_embeddings: bool = True) -> np.ndarray: + resp = self._client.post( + f"{self.url}/v1/embeddings", + json={"model": self.model_name, "input": str(text)}, + ) + resp.raise_for_status() + vec = np.asarray(resp.json()["data"][0]["embedding"], dtype=np.float32) + if normalize_embeddings: + norm = np.linalg.norm(vec) + if norm > 0: + vec = vec / norm + return vec + + def _split_sentences(text: str) -> list[str]: """Simple sentence splitter.""" result: List = [] @@ -77,6 +103,8 @@ def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str] def extract_text_from_pdf(pdf_path: str) -> str: """Extract text from a PDF file.""" try: + from pypdf import PdfReader + reader = PdfReader(pdf_path) text = "" for page in reader.pages: @@ -116,7 +144,7 @@ def ensure_collection(client: QdrantClient, collection: str, vector_size: int): def ingest_chunks( client: QdrantClient, - model: SentenceTransformer, + model: Any, collection: str, 
chunks: list[str], _user_id: str, @@ -154,9 +182,11 @@ def ingest_chunks( return len(points) -def chunk_strat(text: str, args, model: SentenceTransformer) -> list[str] | Any: +def chunk_strat(text: str, args, model: Any) -> list[str] | Any: """Pick the right chunking strategy based on args.""" if args.chunking == "semantic": + from semantic_chunker import SemanticChunker + chunker = SemanticChunker( model=model, strategy=args.semantic_strategy, @@ -287,6 +317,50 @@ def cmd_write(args, client, model, _user_id): print(f"Done. Ingested {count} chunks as '{title}'") +def cmd_profile(args, client, model, _user_id): + """Store always-on profile facts about the user (name, etc.). + + Unlike regular documents, profile facts are NOT retrieved by vector + similarity. The RAG handle pulls them by filter (_user_id + type=profile) + on every query and injects them into the system prompt, so the character + always knows them. + """ + sample = model.encode("test", normalize_embeddings=True) + ensure_collection(client, args.collection, len(sample)) + + facts: List[str] = [] + if args.name: + facts.append(f"The user's name is {args.name}.") + for fact in args.fact or []: + facts.append(fact) + + if not facts: + print("Nothing to store. Use --name and/or --fact 'some fact'.") + return + + # Replace the existing profile so facts don't pile up across runs. 
+ client.delete( + collection_name=args.collection, + points_selector=Filter( + must=[ + FieldCondition(key="_user_id", match=MatchValue(value=_user_id)), + FieldCondition(key="type", match=MatchValue(value="profile")), + ] + ), + ) + + count = ingest_chunks( + client, + model, + args.collection, + facts, + _user_id, + source="profile", + doc_type="profile", + ) + print(f"Stored {count} profile fact(s) for user {_user_id}") + + def cmd_list(args, client, model, _user_id): """List what's in the database for this user.""" @@ -355,7 +429,23 @@ def main(): parser.add_argument("--user-id", type=str, default=None) parser.add_argument("--collection", type=str, default="documents") parser.add_argument("--qdrant-url", type=str, default="http://localhost:6333") + parser.add_argument( + "--no-verify-ssl", + action="store_true", + default=False, + help="Disable SSL certificate verification (needed for self-signed LAN certs).", + ) parser.add_argument("--embedding-model", type=str, default="BAAI/bge-large-en-v1.5") + parser.add_argument( + "--embedding-url", + type=str, + default="", + help=( + "OpenAI-compatible embedding endpoint (e.g. llama.cpp at " + "http://localhost:8080). When set, embeddings are computed remotely " + "instead of with a local SentenceTransformer. Requires --chunking fixed." + ), + ) parser.add_argument( "--chunk-size", type=int, @@ -394,6 +484,16 @@ def main(): p_write = subparsers.add_parser("write", help="Write text interactively") p_write.add_argument("--title", type=str, default=None, help="Title/source name") + p_profile = subparsers.add_parser( + "profile", help="Store always-on profile facts (name, etc.)" + ) + p_profile.add_argument("--name", type=str, default=None, help="User's name") + p_profile.add_argument( + "--fact", + action="append", + help="A fact about the user, e.g. 
--fact 'Likes cheese' (repeatable)", + ) + subparsers.add_parser("list", help="List ingested documents") p_delete = subparsers.add_parser("delete", help="Delete documents by source") @@ -403,16 +503,54 @@ def main(): args = parser.parse_args() - _user_id = get_user_id(args._user_id) + if args.embedding_url and args.chunking == "semantic": + parser.error( + "--chunking semantic needs a local SentenceTransformer model and " + "cannot run over --embedding-url. Use --chunking fixed." + ) + + _user_id = get_user_id(args.user_id) print(f"User: {_user_id}") - client = QdrantClient(url=args.qdrant_url) - model = SentenceTransformer(args.embedding_model) + verify_ssl = not args.no_verify_ssl + # Parse the URL explicitly so QdrantClient gets the correct host/port/https. + # When given just "https://host" with no port, some qdrant-client versions + # silently fall back to their default port (6333) instead of 443, causing + # a timeout that looks like an SSL issue. + from urllib.parse import urlparse + + _parsed = urlparse(args.qdrant_url) + _is_https = _parsed.scheme == "https" + _host = _parsed.hostname + _port = _parsed.port or (443 if _is_https else 6333) + client = QdrantClient( + host=_host, + port=_port, + https=_is_https, + verify=verify_ssl, + check_compatibility=verify_ssl, + ) + + # Lazy-load the model only if the command needs embeddings. + # Commands that don't need it: list, delete, profile (doesn't use embeddings). 
+ needs_embeddings = args.command in ("pdf", "text", "write", "profile") + + if needs_embeddings: + if args.embedding_url: + print(f"Embedding remotely via {args.embedding_url} (model={args.embedding_model})") + model = RemoteEmbedder(args.embedding_url, args.embedding_model) + else: + from sentence_transformers import SentenceTransformer + + model = SentenceTransformer(args.embedding_model) + else: + model = None commands = { "pdf": cmd_pdf, "text": cmd_text, "write": cmd_write, + "profile": cmd_profile, "list": cmd_list, "delete": cmd_delete, } diff --git a/src/modules/rag/rag.py b/src/modules/rag/rag.py index 0195e48..7ffc6da 100644 --- a/src/modules/rag/rag.py +++ b/src/modules/rag/rag.py @@ -51,8 +51,18 @@ def _apply_config(self) -> None: from qdrant_client import QdrantClient cfg = self._cfg + from urllib.parse import urlparse + self.embedding_url = cfg.embedding_url or cfg.llm_url - self._qdrant = QdrantClient(url=cfg.qdrant_url, verify=cfg.verify_ssl) + _p = urlparse(cfg.qdrant_url) + _is_https = _p.scheme == "https" + self._qdrant = QdrantClient( + host=_p.hostname, + port=_p.port or (443 if _is_https else 6333), + https=_is_https, + verify=cfg.verify_ssl, + check_compatibility=cfg.verify_ssl, + ) print(f"[RAGHandle] Connected to Qdrant at {cfg.qdrant_url}") self._embed_client = httpx.AsyncClient(timeout=30.0, verify=cfg.verify_ssl) self._llm_client = httpx.AsyncClient(timeout=120.0, verify=cfg.verify_ssl) @@ -85,6 +95,32 @@ async def _embed(self, text: str) -> list[float]: f"Embedding unexpected schema from {url}: {str(payload)[:1000]}" ) from e + def _get_profile(self, collection: str, _user_id: str) -> list[str]: + """Always-on facts about the user (name, etc.). + + Retrieved deterministically by filter — NOT by vector similarity — + so they are always available to the prompt regardless of the question. + Populated via `ingestion.py profile`. 
+ """ + from qdrant_client.models import FieldCondition, Filter, MatchValue + + try: + points, _ = self._qdrant.scroll( + collection_name=collection, + scroll_filter=Filter( + must=[ + FieldCondition(key="_user_id", match=MatchValue(value=_user_id)), + FieldCondition(key="type", match=MatchValue(value="profile")), + ] + ), + limit=50, + with_payload=True, + with_vectors=False, + ) + except Exception: + return [] + return [p.payload.get("text", "") for p in points if p.payload.get("text")] + def _search( self, qdrant, @@ -126,11 +162,24 @@ def _build_prompt( question: str, chunks: list[dict], preferences: dict, + profile_facts: list[str] | None = None, ) -> tuple[str, str]: - parts = [ - "You are a robot speaking to a user. Answer based on the provided context.", - "If the context is insufficient, say so clearly.", - ] + # TEMP TEMP TEMP: this persona should be defined in the configuration + persona = preferences.get( + "persona", + "You are Rat-Man, a witty, charismatic animated rat character. " + "You are NOT an AI, an assistant, or a language model, and you must " + "never say that you are one. Stay fully in character at all times, " + "speak naturally and with personality, and never break the illusion.", + ) + parts = [persona] + + if profile_facts: + parts.append( + "Here is what you know about the person you're talking to: " + + " ".join(profile_facts) + ) + if preferences.get("language"): parts.append(f"Always respond in {preferences['language']}.") if preferences.get("tone"): @@ -141,6 +190,14 @@ def _build_prompt( parts.append("Keep your answer to 2-3 sentences maximum.") if preferences.get("extra_instructions"): parts.append(preferences["extra_instructions"]) + + parts.append( + "Use the context in the user's message to inform your answers when " + "it is relevant, but always answer in character. If you don't know " + "something, improvise in character rather than admitting you lack " + "information or breaking character." 
+ "Make small sentences, and no emojis" + ) system_prompt = " ".join(parts) if not chunks: @@ -168,7 +225,7 @@ def _build_prompt( return system_prompt, user_prompt async def _stream_ollama( - self, messages: list, max_tokens: int + self, messages: list, max_tokens: int, temperature: float = 0.7 ) -> AsyncGenerator[str, None]: async with self._llm_client.stream( "POST", @@ -177,7 +234,7 @@ async def _stream_ollama( "model": self._cfg.llm_model, "messages": messages, "stream": True, - "options": {"num_predict": max_tokens, "temperature": 0.1}, + "options": {"num_predict": max_tokens, "temperature": temperature}, }, ) as resp: resp.raise_for_status() @@ -200,6 +257,7 @@ async def _stream_openai_compatible( messages: list, max_tokens: int, api_key: str = "", + temperature: float = 0.7, ) -> AsyncGenerator[str, None]: headers = {"Content-Type": "application/json"} if api_key: @@ -212,7 +270,7 @@ async def _stream_openai_compatible( "model": self._cfg.llm_model, "messages": messages, "max_tokens": max_tokens, - "temperature": 0.1, + "temperature": temperature, "stream": True, }, ) as resp: @@ -242,6 +300,7 @@ async def _llm_stream( preferences: dict, ) -> AsyncGenerator[str, None]: max_tokens = preferences.get("max_length", 1024) + temperature = preferences.get("temperature", 0.7) messages = [ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}, @@ -249,7 +308,10 @@ async def _llm_stream( if self._cfg.llm_provider == "vllm": async for d in self._stream_openai_compatible( - f"{self._cfg.llm_url}/v1/chat/completions", messages, max_tokens + f"{self._cfg.llm_url}/v1/chat/completions", + messages, + max_tokens, + temperature=temperature, ): yield d elif self._cfg.llm_provider == "api": @@ -258,10 +320,11 @@ async def _llm_stream( messages, max_tokens, self._cfg.llm_api_key, + temperature=temperature, ): yield d elif self._cfg.llm_provider == "ollama": - async for d in self._stream_ollama(messages, max_tokens): + async for d in 
self._stream_ollama(messages, max_tokens, temperature): yield d else: raise ValueError(f"Unknown llm_provider: {self._cfg.llm_provider}") @@ -282,8 +345,11 @@ async def stream(self, query: RAGQuery) -> AsyncGenerator[str, None]: raise print(f"[RAG] Found {len(chunks)} chunks") + profile_facts = self._get_profile(collection, query._user_id) + if profile_facts: + print(f"[RAG] Loaded {len(profile_facts)} profile fact(s)") system_prompt, user_prompt = self._build_prompt( - query.question, chunks, query.preferences + query.question, chunks, query.preferences, profile_facts ) print(f"[RAG] Streaming from LLM at {self._cfg.llm_url} (provider={self._cfg.llm_provider}, model={self._cfg.llm_model})") @@ -317,6 +383,8 @@ def __init__( response_format="paragraph", max_length=1024, extra_instructions="", + persona="", + temperature=0.7, **kwargs, ): super().__init__(_handle=_handle, _user_id=_user_id, **kwargs) @@ -327,7 +395,10 @@ def __init__( "response_format": response_format, "max_length": max_length, "extra_instructions": extra_instructions, + "temperature": temperature, } + if persona: + self.preferences["persona"] = persona async def process(self, data: Sentence) -> AsyncGenerator[Token, None]: # type: ignore[override] query = RAGQuery( From 2e74004fd0d1ee01cbb392c2b507999d7ff48785 Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Mon, 1 Jun 2026 18:53:29 +0200 Subject: [PATCH 22/31] fixed(huri): Qdrant too old dependancy + new nvidia dependancies --- deploy/Dockerfile.nvidia | 3 --- requirements-amd.txt | 2 +- requirements-nvidia.txt | 7 ++++++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/deploy/Dockerfile.nvidia b/deploy/Dockerfile.nvidia index 1cb0c8d..f9e704f 100644 --- a/deploy/Dockerfile.nvidia +++ b/deploy/Dockerfile.nvidia @@ -23,9 +23,6 @@ RUN git clone https://github.com/FunAudioLLM/CosyVoice.git /app/cosyvoice \ && git -C /app/cosyvoice checkout 074ca6dc9e80a2f424f1f74b48bdd7d3fea531cc \ && git -C /app/cosyvoice submodule update --init 
--recursive - -RUN pip install --no-cache-dir lightning==2.2.4 gdown==5.1.0 matplotlib==3.7.5 wget==3.2 pyworld==0.3.4 - ENV PYTHONPATH="/app/cosyvoice:${PYTHONPATH:-}" COPY src /app/src diff --git a/requirements-amd.txt b/requirements-amd.txt index 2fefb8d..1c99594 100644 --- a/requirements-amd.txt +++ b/requirements-amd.txt @@ -6,7 +6,7 @@ # --- RAG / LLM --- httpx==0.27.2 -qdrant-client==1.12.1 +qdrant-client==1.18.0 sentence-transformers==3.2.1 pypdf==5.1.0 semantic_chunker==0.2.0 diff --git a/requirements-nvidia.txt b/requirements-nvidia.txt index 441b738..98a8950 100644 --- a/requirements-nvidia.txt +++ b/requirements-nvidia.txt @@ -27,7 +27,7 @@ tqdm==4.67.3 # --- RAG / LLM extras --- httpx==0.27.2 -qdrant-client==1.12.1 +qdrant-client==1.18.0 sentence-transformers==3.2.1 pypdf==5.1.0 semantic_chunker==0.2.0 @@ -42,3 +42,8 @@ imageio==2.33.0 # opencv-python==4.8.1.78 # pytorch3d # has to be built from source for torch 2.3 / py3.12 # torchvision +lightning==2.2.4 +gdown==5.1.0 +matplotlib==3.7.5 # ridiculous but necessary for init modules +wget==3.2 +pyworld==0.3.4 From 11642138d8cf100594c688eaa6410b8a0a9c75c6 Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Mon, 1 Jun 2026 18:54:36 +0200 Subject: [PATCH 23/31] feat(huri): core config uid fetching --- src/core/dataclasses/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/dataclasses/config.py b/src/core/dataclasses/config.py index aea111f..8cccb03 100644 --- a/src/core/dataclasses/config.py +++ b/src/core/dataclasses/config.py @@ -47,7 +47,7 @@ def from_dict(cls, raw: Dict) -> "ClientConfig": for module_id, mod_raw in raw.get("modules", {}).items() } return cls( - user_id=None, + user_id=raw.get("user_id"), huri_url=raw["huri_url"], topic_list=raw["topic_list"], senders=senders, From 109491aaf3e13db7fcb2c17beee4b3a36469606d Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Mon, 1 Jun 2026 18:54:59 +0200 Subject: [PATCH 24/31] feat(huri): automatically exclude 
output .wav files --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 990d633..92a7c35 100644 --- a/.gitignore +++ b/.gitignore @@ -181,4 +181,7 @@ cython_debug/ # Others .trash -docs \ No newline at end of file +docs + +# HuRI client outputs +*.wav From f9d3a936c1d3929af79a4e435f257d4d0c59852c Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Mon, 1 Jun 2026 23:03:55 +0200 Subject: [PATCH 25/31] feat(rag): improved prompt to avoid too long sentences --- src/modules/rag/rag.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/modules/rag/rag.py b/src/modules/rag/rag.py index 7ffc6da..20a833b 100644 --- a/src/modules/rag/rag.py +++ b/src/modules/rag/rag.py @@ -195,8 +195,8 @@ def _build_prompt( "Use the context in the user's message to inform your answers when " "it is relevant, but always answer in character. If you don't know " "something, improvise in character rather than admitting you lack " - "information or breaking character." - "Make small sentences, and no emojis" + "information or breaking character. " + "IMPORTANT: Reply in 1-3 short sentences maximum. Be extremely concise. No lists, no emojis, no long explanations."
) system_prompt = " ".join(parts) @@ -381,7 +381,7 @@ def __init__( language="en", tone="formal", response_format="paragraph", - max_length=1024, + max_length=220, extra_instructions="", persona="", temperature=0.7, @@ -389,6 +389,8 @@ def __init__( ): super().__init__(_handle=_handle, _user_id=_user_id, **kwargs) + print(f"[RAG] Initialized with user_id={_user_id}, language={language}, tone={tone}, response_format={response_format}, max_length={max_length}, temperature={temperature}") + self.preferences = { "language": language, "tone": tone, From a048d9a3bc80dedb3aa77ca8f7cb17d26a627b91 Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Mon, 1 Jun 2026 23:05:17 +0200 Subject: [PATCH 26/31] feat(sender): sending topic type to clients --- deploy/examples/local_nvidia_amd/values.yaml | 4 ++-- src/modules/utils/sender.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/deploy/examples/local_nvidia_amd/values.yaml b/deploy/examples/local_nvidia_amd/values.yaml index 9033384..e980c57 100644 --- a/deploy/examples/local_nvidia_amd/values.yaml +++ b/deploy/examples/local_nvidia_amd/values.yaml @@ -111,10 +111,10 @@ head: resources: limits: cpu: "2" - memory: "8Gi" + memory: "3Gi" requests: cpu: "2" - memory: "4Gi" + memory: "2Gi" # Worker groups: one logical "gpu" group per vendor and one "cpu" group. 
# diff --git a/src/modules/utils/sender.py b/src/modules/utils/sender.py index 1a1d7eb..9261662 100644 --- a/src/modules/utils/sender.py +++ b/src/modules/utils/sender.py @@ -57,7 +57,7 @@ async def process(self, _): ) await self.ws.send_bytes(self._prefix(header + body)) elif isinstance(data, EventData): - await self.ws.send_json(asdict(data)) + await self.ws.send_json({"topic": self.input_type, **asdict(data)}) else: await self.ws.send_text(str(data)) From 2e404000caedfa89e27368c3685c035b942efea9 Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Tue, 2 Jun 2026 03:45:43 +0200 Subject: [PATCH 27/31] fix(rag): missing previous prompt context --- src/modules/rag/rag.py | 47 +++++++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/src/modules/rag/rag.py b/src/modules/rag/rag.py index 20a833b..b7e21f8 100644 --- a/src/modules/rag/rag.py +++ b/src/modules/rag/rag.py @@ -32,6 +32,10 @@ class RAGQuery: _user_id: str question: str preferences: dict = field(default_factory=dict) + # Prior conversation turns as OpenAI-style messages + # ([{"role": "user"|"assistant", "content": str}, ...]). The handle is + # stateless, so the per-session RAG module owns and supplies this. 
+ history: list = field(default_factory=list) @serve.deployment(name="RAGHandle") @@ -298,13 +302,14 @@ async def _llm_stream( system_prompt: str, user_prompt: str, preferences: dict, + history: list | None = None, ) -> AsyncGenerator[str, None]: max_tokens = preferences.get("max_length", 1024) temperature = preferences.get("temperature", 0.7) - messages = [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": user_prompt}, - ] + messages = [{"role": "system", "content": system_prompt}] + if history: + messages.extend(history) + messages.append({"role": "user", "content": user_prompt}) if self._cfg.llm_provider == "vllm": async for d in self._stream_openai_compatible( @@ -352,10 +357,14 @@ async def stream(self, query: RAGQuery) -> AsyncGenerator[str, None]: query.question, chunks, query.preferences, profile_facts ) - print(f"[RAG] Streaming from LLM at {self._cfg.llm_url} (provider={self._cfg.llm_provider}, model={self._cfg.llm_model})") + print( + f"[RAG] Streaming from LLM at {self._cfg.llm_url} " + f"(provider={self._cfg.llm_provider}, model={self._cfg.llm_model}, " + f"history_msgs={len(query.history)})" + ) try: async for delta in self._llm_stream( - system_prompt, user_prompt, query.preferences + system_prompt, user_prompt, query.preferences, query.history ): yield delta except Exception: @@ -385,11 +394,12 @@ def __init__( extra_instructions="", persona="", temperature=0.7, + max_history_turns=6, **kwargs, ): super().__init__(_handle=_handle, _user_id=_user_id, **kwargs) - print(f"[RAG] Initialized with user_id={_user_id}, language={language}, tone={tone}, response_format={response_format}, max_length={max_length}, temperature={temperature}") + print(f"[RAG] Initialized with user_id={_user_id}, language={language}, tone={tone}, response_format={response_format}, max_length={max_length}, temperature={temperature}, max_history_turns={max_history_turns}") self.preferences = { "language": language, @@ -402,17 +412,40 @@ def __init__( if 
persona: self.preferences["persona"] = persona + # Per-session conversation memory, kept on the (per-WebSocket) module + # instance because the RAGHandle deployment is stateless/shared. + # Stored as OpenAI-style messages; trimmed to the last N turns. + self._max_history_turns = max_history_turns + self.history: list[dict] = [] + async def process(self, data: Sentence) -> AsyncGenerator[Token, None]: # type: ignore[override] query = RAGQuery( _user_id=self._user_id if self._user_id else "anonymous", question=data.text, preferences=self.preferences, + history=list(self.history), # snapshot of prior turns ) + parts: list[str] = [] stream = self._handle.options(stream=True).stream.remote(query) async for delta in stream: + parts.append(delta) yield Token(text=delta, end=False) yield Token(text="", end=True) + self._record_turn(data.text, "".join(parts)) + + def _record_turn(self, question: str, answer: str) -> None: + """Append this turn to the session history (raw Q/A, no RAG context) + and trim to the most recent `max_history_turns` exchanges.""" + answer = answer.strip() + if not answer: + return + self.history.append({"role": "user", "content": question}) + self.history.append({"role": "assistant", "content": answer}) + max_msgs = self._max_history_turns * 2 + if len(self.history) > max_msgs: + del self.history[:-max_msgs] + def update_preferences(self, new_preferences: dict): self.preferences.update(new_preferences) From 0b39013cc6b795cdb18e316078b2e517b851e145 Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Tue, 2 Jun 2026 03:47:16 +0200 Subject: [PATCH 28/31] fix(gesture): smoother transition from sliding window + small desync in inference --- src/modules/gesture/gesture.py | 119 +++++++++++++++++++++++++++------ 1 file changed, 98 insertions(+), 21 deletions(-) diff --git a/src/modules/gesture/gesture.py b/src/modules/gesture/gesture.py index e4fb01b..33bde05 100644 --- a/src/modules/gesture/gesture.py +++ b/src/modules/gesture/gesture.py @@ -20,6 +20,10 
@@ _CONTEXT_SEC = float(os.environ.get("HURI_GESTURE_CONTEXT_SEC", "2.0")) _MIN_CHUNK_SEC = float(os.environ.get("HURI_GESTURE_MIN_CHUNK_SEC", "0.5")) +# Seconds over which a fresh window's first frames are eased onto the last +# emitted pose, killing the seam snap between windows and between utterances. +_BLEND_SEC = float(os.environ.get("HURI_GESTURE_BLEND_SEC", "0.2")) + # Optional manual GPU split: cap the gesture process to a fraction of the GPU so # TTS keeps the lion's share. Only applied on CUDA when the value is set (>0). _GPU_MEM_FRACTION = float(os.environ.get("HURI_GESTURE_GPU_MEM_FRACTION", "0.0")) @@ -174,8 +178,21 @@ class Gesture(ModuleWithHandle): primes the model so the seam is continuous; only the motion frames for the fresh audio are emitted. The window length is bounded by ``context_sec + chunk size`` so inference cost stays flat regardless of - utterance length. Global root translation is rebased onto the previously - emitted frame to avoid a jump every window. + utterance length. + + Seam blending + ───────────── + Priming with context keeps the *audio* continuous across a window, but the + motion still snaps at seams: EMAGE has no future context at a window's right + edge, so its last frames wind down differently from how the next window — + fully primed — opens, and at utterance boundaries it cold-starts from a rest + pose entirely. So each fresh segment is eased onto the previously emitted + frame: poses, expressions and root translation all start exactly continuous + and a cosine-decaying offset fades to zero over ``blend_sec``, restoring the + model's intended motion (and avoiding the cumulative drift a constant rebase + would cause). The anchors survive the end-of-utterance reset, so the first + window of the next utterance blends out of the pose still on screen instead + of teleporting. 
input: audio (Audio) output: motion (Motion) @@ -184,6 +201,7 @@ class Gesture(ModuleWithHandle): :device: PyTorch device string; defaults to CUDA when available. :context_sec: Seconds of prior audio prepended to each window for continuity. :min_chunk_sec: Minimum seconds of fresh audio to accumulate before inferring. + :blend_sec: Seconds over which each window's seam is eased onto the prior frame. """ _handle_cls = GestureDeployment @@ -195,10 +213,12 @@ def __init__( _handle: handle.DeploymentHandle, context_sec: float = _CONTEXT_SEC, min_chunk_sec: float = _MIN_CHUNK_SEC, + blend_sec: float = _BLEND_SEC, ): super().__init__(_handle) self._context_sec = float(context_sec) self._min_chunk_sec = float(min_chunk_sec) + self._blend_sec = float(blend_sec) # Per-utterance sliding-window state. All sample counts are in the # source sample rate; resampling to 16 kHz happens once inside infer(). @@ -207,14 +227,23 @@ def __init__( self._buffer = np.empty(0, dtype=np.float32) # trailing audio (ctx + unprocessed) self._buf_start = 0 # source-sr sample index of buffer[0] in utterance timeline self._emitted = 0 # source-sr samples whose motion has been emitted - self._trans_anchor: Optional[np.ndarray] = None # last emitted trans, for continuity - def _reset(self) -> None: + # Last emitted frame per channel, used to ease the next segment's seam. + # These persist across the end-of-utterance reset (see _end_utterance) so + # gestures stay continuous when a new utterance starts. + self._trans_anchor: Optional[np.ndarray] = None + self._pose_anchor: Optional[np.ndarray] = None + self._expr_anchor: Optional[np.ndarray] = None + + def _end_utterance(self) -> None: + # Reset only per-utterance buffering/timeline state. The seam anchors + # deliberately survive so the first window of the next utterance eases + # out of the pose currently on screen instead of snapping to EMAGE's + # cold-start rest pose. 
self._sr = None self._buffer = np.empty(0, dtype=np.float32) self._buf_start = 0 self._emitted = 0 - self._trans_anchor = None async def process(self, audio: Audio) -> AsyncGenerator[Motion, None]: # type: ignore[override] # Each chunk arrives as its own process() task on the shared per-session @@ -233,7 +262,7 @@ async def process(self, audio: Audio) -> AsyncGenerator[Motion, None]: # type: if sr is None: # Nothing buffered yet (e.g. a lone end marker). Reset and bail. if end_of_utterance: - self._reset() + self._end_utterance() return ctx_samples = int(self._context_sec * sr) @@ -245,7 +274,7 @@ async def process(self, audio: Audio) -> AsyncGenerator[Motion, None]: # type: # Wait for more audio unless this is the final flush of the utterance. if new_samples <= 0 or (not end_of_utterance and new_samples < min_new_samples): if end_of_utterance: - self._reset() + self._end_utterance() return motion = await self._infer_window(sr, ctx_samples, global_end) @@ -253,7 +282,7 @@ async def process(self, audio: Audio) -> AsyncGenerator[Motion, None]: # type: yield motion if end_of_utterance: - self._reset() + self._end_utterance() async def _infer_window( self, sr: int, ctx_samples: int, global_end: int @@ -267,27 +296,50 @@ async def _infer_window( motion: Motion = await self._handle.infer.remote(window, sr) total_frames = motion.poses.shape[0] + # EMAGE's internal windowing (EmageAudioModel.inference) emits a + # contiguous *prefix* of the requested window and silently drops up to + # ~2*seed_frames frames off the END whenever the trailing partial window + # is shorter than its motion seed. So the returned frames cover only + # [win_start, win_start + total_frames] — not necessarily the whole + # window. Map emission off the actual frame count, not the requested + # length: otherwise the freshest motion is dropped while _emitted skips + # over it, tearing a hole in the timeline that reads as a freeze-then- + # jump (and drifts gesture out of sync with speech). 
+ covered_end = win_start + int(round(total_frames * sr / motion.fps)) + # Drop the leading frames that correspond to the context (already emitted). skip_sec = (self._emitted - win_start) / sr skip_frames = int(round(skip_sec * motion.fps)) skip_frames = max(0, min(skip_frames, total_frames)) - poses = motion.poses[skip_frames:] - expressions = motion.expressions[skip_frames:] + poses = motion.poses[skip_frames:].copy() + expressions = motion.expressions[skip_frames:].copy() trans = motion.trans[skip_frames:].copy() - # Advance the timeline even if rounding left no new frames to emit. - self._emitted = global_end - self._trim_buffer(ctx_samples, global_end) + # Advance only past audio the model actually turned into motion; any + # dropped tail stays buffered and is re-inferred next window, this time + # with real right-context. Cap at global_end so rounding can't overrun + # the buffer, and never move backwards. + self._emitted = min(global_end, max(self._emitted, covered_end)) + self._trim_buffer(ctx_samples) if poses.shape[0] == 0: return None - # Rebase global translation onto the last emitted frame: every window - # restarts root motion near the origin, so without this the avatar would - # teleport back at each seam. - if self._trans_anchor is not None: - trans += self._trans_anchor - trans[0] + # Ease this segment's seam onto the last emitted frame. Poses and + # expressions snap because EMAGE regenerates the boundary without the + # right-context the next window will have (and cold-starts across + # utterances); root translation snaps because every window restarts near + # the origin. Blending all three keeps the seam continuous, and the + # decaying (vs. constant) offset returns to the model's intended motion + # so root translation doesn't accumulate drift across windows. 
+ blend_frames = int(round(self._blend_sec * motion.fps)) + self._blend_into(poses, self._pose_anchor, blend_frames) + self._blend_into(expressions, self._expr_anchor, blend_frames) + self._blend_into(trans, self._trans_anchor, blend_frames) + + self._pose_anchor = poses[-1].copy() + self._expr_anchor = expressions[-1].copy() self._trans_anchor = trans[-1].copy() out = Motion( @@ -299,9 +351,34 @@ async def _infer_window( ) return out - def _trim_buffer(self, ctx_samples: int, global_end: int) -> None: - # Keep only the trailing context so the next window stays bounded. - keep_from = global_end - ctx_samples + @staticmethod + def _blend_into( + arr: np.ndarray, anchor: Optional[np.ndarray], blend_frames: int + ) -> None: + """Ease the start of a fresh segment onto ``anchor`` in place. + + Frame 0 is shifted to equal ``anchor`` (a continuous seam) and the + offset fades to zero over ``blend_frames`` with a cosine ease — zero + slope at both ends, so neither the value nor its velocity jumps — after + which the segment is the model's untouched output. + + Poses are SMPL-X axis-angle, so this is a linear blend in axis-angle + space: exact only for small seam offsets, which is the regime here since + consecutive frames are already close. A quaternion slerp would be needed + for large discontinuities but is overkill for seam clean-up. + """ + if anchor is None or blend_frames <= 0 or arr.shape[0] == 0: + return + n = min(blend_frames, arr.shape[0]) + w = 0.5 * (1.0 + np.cos(np.pi * np.linspace(0.0, 1.0, n, dtype=arr.dtype))) + arr[:n] += w[:, None] * (anchor - arr[0]) + + def _trim_buffer(self, ctx_samples: int) -> None: + # Keep one context window of audio before the last *emitted* sample so + # the next window stays bounded — but never discard audio whose motion + # hasn't been emitted yet (the dropped tail above lives between + # _emitted and the buffer end). 
+ keep_from = self._emitted - ctx_samples if keep_from > self._buf_start: self._buffer = self._buffer[keep_from - self._buf_start :] self._buf_start = keep_from From 818bad86c7704ae19fde16c30a0bfb78dd071a8b Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Tue, 2 Jun 2026 04:18:33 +0200 Subject: [PATCH 29/31] fixed(gesture): warmup now works when cold starting --- src/modules/gesture/gesture.py | 81 +++++++++++++++++++++++++++++----- 1 file changed, 70 insertions(+), 11 deletions(-) diff --git a/src/modules/gesture/gesture.py b/src/modules/gesture/gesture.py index 33bde05..291f3ad 100644 --- a/src/modules/gesture/gesture.py +++ b/src/modules/gesture/gesture.py @@ -28,6 +28,15 @@ # TTS keeps the lion's share. Only applied on CUDA when the value is set (>0). _GPU_MEM_FRACTION = float(os.environ.get("HURI_GESTURE_GPU_MEM_FRACTION", "0.0")) +# Source sample rate used to warm the inference path. Real audio arrives from +# the TTS (CosyVoice ≈ 24 kHz), so every real infer() call resamples to 16 kHz. +# Warming at 16 kHz — as the old warmup did — skips that resample entirely, +# leaving librosa's first-call cost to land on the first user-facing gesture. +# Default to the TTS rate so the resample path is warmed too. Override if your +# TTS uses a different rate (the exact value only affects which resampler +# filter is pre-built; the model shapes follow the 16 kHz duration regardless). +_WARMUP_SRC_SR = int(os.environ.get("HURI_GESTURE_WARMUP_SR", "24000")) + @dataclass class Motion: @@ -48,6 +57,19 @@ def __init__( ): print(f"[Gesture] importing torch...", flush=True) import torch + + # Pin algorithm selection so the kernels warmed below are the same ones + # used at serve time. With cudnn.benchmark enabled, cuDNN re-autotunes + # for every new input length — and the sliding window feeds a different + # length almost every call — so the first inference at each new shape + # would stall on autotuning, defeating the warmup. 
Keep it off (also the + # default) and pin it explicitly. TF32 just speeds matmul/conv on + # Ampere+ with no meaningful quality impact for gesture. + torch.backends.cudnn.benchmark = False + if torch.cuda.is_available(): + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + print(f"[Gesture] importing emage...", flush=True) from .emage import EmageAudioModel, EmageVAEConv, EmageVQModel, EmageVQVAEConv @@ -99,21 +121,58 @@ def __init__( print(f"[Gesture] ready", flush=True) def _warmup(self) -> None: - # The first inference pays a one-time cold-start cost (CUDA context - # init, kernel JIT/load, cuDNN autotuning, caching-allocator warmup) - # that can take several seconds. Run a throwaway pass here so that cost - # is paid at startup — where we're already blocking on weight loads — - # rather than on the first user-facing gesture. Best-effort: a failure - # here must never prevent the deployment from coming up. + # The first inference pays one-time costs that are *shape- and + # path-dependent*: per-input-length kernel/primitive selection (cuDNN + # algo pick on GPU, oneDNN primitive build on CPU), librosa's first-call + # resampler build, the caching allocator's first growth, and CUDA + # context/kernel load. The old warmup ran a single 16 kHz, fixed-length, + # no-resample pass — so it warmed exactly one shape on a path real calls + # never take. The first real gesture (a different length, arriving at the + # TTS rate and therefore resampled) re-paid almost all of it, which is + # why the warmup "did nothing". + # + # Instead, sweep the window lengths the sliding window actually feeds + # infer() — from the small first-chunk window up to a full context+chunk + # steady-state window — on the *real* resample path, twice (the first + # pass pays the costs, the second confirms the path is hot), and + # synchronize so the GPU work is finished before we report ready. 
+ # Best-effort: a failure here must never prevent the deployment coming up. import time + import torch + + # Representative window lengths (seconds). The dominant per-window + # transformer forward is a fixed shape warmed by any window, but the + # trailing remainder forward varies with total length, so warm a spread. + secs = sorted({ + round(s, 3) + for s in ( + _MIN_CHUNK_SEC, # first tiny window of an utterance + _CONTEXT_SEC, # context-only sized window + _CONTEXT_SEC + _MIN_CHUNK_SEC, # steady-state window + _CONTEXT_SEC + 2 * _MIN_CHUNK_SEC, # a larger fresh chunk + ) + if s and s > 0 + }) or [3.0] try: - # ~3 s of silence at 16 kHz exercises the full sliding-window path - # (multiple rounds + remainder) the way a real utterance would. - dummy = np.zeros(_EMAGE_SR * 3, dtype=np.float32) t0 = time.time() - self.infer(dummy, source_sr=_EMAGE_SR) - print(f"[Gesture] warmup done in {time.time() - t0:.2f}s", flush=True) + for pass_idx in range(2): + for s in secs: + n = max(1, int(_WARMUP_SRC_SR * s)) + dummy = np.zeros(n, dtype=np.float32) + ts = time.time() + self.infer(dummy, source_sr=_WARMUP_SRC_SR) + if self.device.type == "cuda": + torch.cuda.synchronize(self.device) + print( + f"[Gesture] warmup pass {pass_idx} {s:.2f}s " + f"({n} samples @ {_WARMUP_SRC_SR} Hz) in {time.time() - ts:.2f}s", + flush=True, + ) + print( + f"[Gesture] warmup done ({len(secs)} shapes x2) in {time.time() - t0:.2f}s", + flush=True, + ) except Exception as e: # noqa: BLE001 — warmup is an optimisation, never fatal print(f"[Gesture] WARNING warmup failed: {e!r}", flush=True) From fa4bb76f55b8314cad7118da082a37f9050345e6 Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Wed, 10 Jun 2026 03:53:47 +0200 Subject: [PATCH 30/31] feat(huri): refactorisation on modules and removing temp features --- src/core/events.py | 3 +- src/modules/factory.py | 4 +- src/modules/gesture/gesture.py | 27 +++--- src/modules/modules.py | 24 +---- src/modules/rag/ingestion.py | 25 ++---- 
src/modules/rag/qdrant_utils.py | 31 +++++++ src/modules/rag/rag.py | 48 +++++----- src/modules/text_to_speech/text_to_speech.py | 95 +++++++++----------- 8 files changed, 121 insertions(+), 136 deletions(-) create mode 100644 src/modules/rag/qdrant_utils.py diff --git a/src/core/events.py b/src/core/events.py index 5df1bf8..dfaf66b 100644 --- a/src/core/events.py +++ b/src/core/events.py @@ -3,6 +3,8 @@ from collections import defaultdict from dataclasses import dataclass +import numpy as np + from .module import Module logger = logging.getLogger("ray.serve") @@ -91,7 +93,6 @@ async def _run(self, module: Module, data): def _summarize(item) -> str: """Short repr that avoids dumping full numpy arrays into the log.""" cls = type(item).__name__ - import numpy as np data = getattr(item, "data", None) if isinstance(data, np.ndarray): return f"{cls}(shape={data.shape}, dtype={data.dtype})" diff --git a/src/modules/factory.py b/src/modules/factory.py index 14acd99..fb0f58c 100644 --- a/src/modules/factory.py +++ b/src/modules/factory.py @@ -1,8 +1,10 @@ from typing import Any, Dict, List, Mapping, Type +from ray.serve import handle + from src.core.dataclasses.config import ModuleConfig from src.core.events import EventData -from src.core.module import Module, ModuleWithHandle, ModuleWithId, handle +from src.core.module import Module, ModuleWithHandle, ModuleWithId class EventDataFactory: diff --git a/src/modules/gesture/gesture.py b/src/modules/gesture/gesture.py index 291f3ad..ecd50f1 100644 --- a/src/modules/gesture/gesture.py +++ b/src/modules/gesture/gesture.py @@ -55,7 +55,7 @@ def __init__( device: Optional[str] = None, gpu_mem_fraction: float = _GPU_MEM_FRACTION, ): - print(f"[Gesture] importing torch...", flush=True) + print(f"[Gesture] importing torch...") import torch # Pin algorithm selection so the kernels warmed below are the same ones @@ -70,13 +70,13 @@ def __init__( torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True - 
print(f"[Gesture] importing emage...", flush=True) + print(f"[Gesture] importing emage...") from .emage import EmageAudioModel, EmageVAEConv, EmageVQModel, EmageVQVAEConv self.device = torch.device( device if device else ("cuda" if torch.cuda.is_available() else "cpu") ) - print(f"[Gesture] device={self.device} hf_repo={hf_repo!r}", flush=True) + print(f"[Gesture] device={self.device} hf_repo={hf_repo!r}") # Manual GPU split: cap this process' share of GPU memory so the audio # (TTS) path keeps the rest. num_gpus in the Ray serveConfig handles @@ -88,20 +88,19 @@ def __init__( ) print( f"[Gesture] GPU memory fraction capped at {gpu_mem_fraction:.2f}", - flush=True, ) except Exception as e: # noqa: BLE001 — best-effort knob, never fatal - print(f"[Gesture] WARNING could not cap GPU memory: {e!r}", flush=True) + print(f"[Gesture] WARNING could not cap GPU memory: {e!r}") - print("[Gesture] loading face_vq...", flush=True) + print("[Gesture] loading face_vq...") face_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/face").to(self.device) - print("[Gesture] loading upper_vq...", flush=True) + print("[Gesture] loading upper_vq...") upper_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/upper").to(self.device) - print("[Gesture] loading lower_vq...", flush=True) + print("[Gesture] loading lower_vq...") lower_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/lower").to(self.device) - print("[Gesture] loading hands_vq...", flush=True) + print("[Gesture] loading hands_vq...") hands_vq = EmageVQVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/hands").to(self.device) - print("[Gesture] loading global_ae...", flush=True) + print("[Gesture] loading global_ae...") global_ae = EmageVAEConv.from_pretrained(hf_repo, subfolder="emage_vq/global").to(self.device) self.motion_vq = EmageVQModel( @@ -113,12 +112,12 @@ def __init__( ) self.motion_vq.eval() - print("[Gesture] loading EmageAudioModel...", flush=True) + print("[Gesture] 
loading EmageAudioModel...") self.model = EmageAudioModel.from_pretrained(hf_repo).to(self.device) self.model.eval() self._warmup() - print(f"[Gesture] ready", flush=True) + print(f"[Gesture] ready") def _warmup(self) -> None: # The first inference pays one-time costs that are *shape- and @@ -167,14 +166,12 @@ def _warmup(self) -> None: print( f"[Gesture] warmup pass {pass_idx} {s:.2f}s " f"({n} samples @ {_WARMUP_SRC_SR} Hz) in {time.time() - ts:.2f}s", - flush=True, ) print( f"[Gesture] warmup done ({len(secs)} shapes x2) in {time.time() - t0:.2f}s", - flush=True, ) except Exception as e: # noqa: BLE001 — warmup is an optimisation, never fatal - print(f"[Gesture] WARNING warmup failed: {e!r}", flush=True) + print(f"[Gesture] WARNING warmup failed: {e!r}") def infer(self, audio_np: np.ndarray, source_sr: int = _EMAGE_SR) -> Motion: import torch diff --git a/src/modules/modules.py b/src/modules/modules.py index 04a020c..d8fefb9 100644 --- a/src/modules/modules.py +++ b/src/modules/modules.py @@ -1,33 +1,15 @@ -import logging from typing import Dict, Type from src.modules.rag.rag import RAG from src.modules.speech_to_text.microphone_vad import MIC from src.modules.speech_to_text.speech_to_text import STT from src.modules.speech_to_text.text_aggregator import TAG +from src.modules.text_to_speech.text_to_speech import TTS +from src.modules.gesture.gesture import Gesture from .factory import Module -_LOG = logging.getLogger(__name__) - def get_modules() -> Dict[str, Type[Module]]: - modules: Dict[str, Type[Module]] = {"mic": MIC, "stt": STT, "tag": TAG, "rag": RAG} - - # The following imports may contain modules with custom dependencies, depending on the Dockerfile - # CPU doesn't need some dependencies, nor the AMD that isn't compatible - try: - from src.modules.text_to_speech.text_to_speech import TTS - except Exception as exc: # noqa: BLE001 - _LOG.info("Skipping TTS module: %s", exc) - else: - modules["tts"] = TTS - - try: - from src.modules.gesture.gesture import 
Gesture - except Exception as exc: # noqa: BLE001 - _LOG.info("Skipping Gesture module: %s", exc) - else: - modules["gesture"] = Gesture - + modules: Dict[str, Type[Module]] = {"mic": MIC, "stt": STT, "tag": TAG, "rag": RAG, "tts": TTS, "gesture": Gesture} return modules diff --git a/src/modules/rag/ingestion.py b/src/modules/rag/ingestion.py index 88d85a8..529c7ff 100644 --- a/src/modules/rag/ingestion.py +++ b/src/modules/rag/ingestion.py @@ -185,7 +185,8 @@ def ingest_chunks( def chunk_strat(text: str, args, model: Any) -> list[str] | Any: """Pick the right chunking strategy based on args.""" if args.chunking == "semantic": - from semantic_chunker import SemanticChunker + # Thomas: I need to import here, because it takes too much time earlier, or use a jupyter notebook to do it instead + from .semantic_chunker import SemanticChunker chunker = SemanticChunker( model=model, @@ -513,23 +514,11 @@ def main(): print(f"User: {_user_id}") verify_ssl = not args.no_verify_ssl - # Parse the URL explicitly so QdrantClient gets the correct host/port/https. - # When given just "https://host" with no port, some qdrant-client versions - # silently fall back to their default port (6333) instead of 443, causing - # a timeout that looks like an SSL issue. - from urllib.parse import urlparse - - _parsed = urlparse(args.qdrant_url) - _is_https = _parsed.scheme == "https" - _host = _parsed.hostname - _port = _parsed.port or (443 if _is_https else 6333) - client = QdrantClient( - host=_host, - port=_port, - https=_is_https, - verify=verify_ssl, - check_compatibility=verify_ssl, - ) + try: + from .qdrant_utils import make_qdrant_client + except ImportError: + from qdrant_utils import make_qdrant_client + client = make_qdrant_client(args.qdrant_url, verify_ssl) # Lazy-load the model only if the command needs embeddings. # Commands that don't need it: list, delete, profile (doesn't use embeddings).
diff --git a/src/modules/rag/qdrant_utils.py b/src/modules/rag/qdrant_utils.py new file mode 100644 index 0000000..080980e --- /dev/null +++ b/src/modules/rag/qdrant_utils.py @@ -0,0 +1,31 @@ +"""Shared Qdrant client construction. + +Centralises the URL→client parsing used by both the RAGHandle deployment +(``rag.py``) and the offline ingestion CLI (``ingestion.py``), so the port/SSL +handling lives in exactly one place instead of being copy-pasted. +""" + +from urllib.parse import urlparse + +from qdrant_client import QdrantClient + +with open("portal.tmp", 'w') as f: + f.write("WTFF SWORKS ,??,,\n") + +def make_qdrant_client(qdrant_url: str, verify_ssl: bool = True) -> QdrantClient: + """Build a :class:`QdrantClient` from a URL. + + Parses the URL explicitly so the client gets the correct host/port/https. + When given just ``https://host`` with no port, some qdrant-client versions + silently fall back to their default port (6333) instead of 443, causing a + timeout that looks like an SSL issue — so derive the port from the scheme. + """ + parsed = urlparse(qdrant_url) + is_https = parsed.scheme == "https" + return QdrantClient( + host=parsed.hostname, + port=parsed.port or (443 if is_https else 6333), + https=is_https, + verify=verify_ssl, + check_compatibility=verify_ssl, + ) diff --git a/src/modules/rag/rag.py b/src/modules/rag/rag.py index b7e21f8..0e65bbe 100644 --- a/src/modules/rag/rag.py +++ b/src/modules/rag/rag.py @@ -1,4 +1,6 @@ import json +import os +import traceback from dataclasses import dataclass, field from typing import Any, AsyncGenerator @@ -10,6 +12,22 @@ from src.modules.speech_to_text.events import Sentence from src.modules.text_to_speech.events import Token +import httpx + +from qdrant_client.models import FieldCondition, Filter, MatchValue +from .qdrant_utils import make_qdrant_client + +# Default character persona. 
Overridable per session via the `persona` key in the +# client config's module args, or globally via HURI_RAG_DEFAULT_PERSONA in the +# Serve app runtime_env.env_vars (see deploy values.yaml) — no rebuild needed. +_DEFAULT_PERSONA = os.environ.get( + "HURI_RAG_DEFAULT_PERSONA", + "You are Mouse-Man, a witty, charismatic animated mouse character. " + "You are NOT an AI, an assistant, or a language model, and you must " + "never say that you are one. Stay fully in character at all times, " + "speak naturally and with personality, and never break the illusion.", +) + class RAGDeploymentConfig(BaseModel): qdrant_url: str = "http://localhost:6333" @@ -51,22 +69,9 @@ def reconfigure(self, config: dict) -> None: self._apply_config() def _apply_config(self) -> None: - import httpx - from qdrant_client import QdrantClient - cfg = self._cfg - from urllib.parse import urlparse - self.embedding_url = cfg.embedding_url or cfg.llm_url - _p = urlparse(cfg.qdrant_url) - _is_https = _p.scheme == "https" - self._qdrant = QdrantClient( - host=_p.hostname, - port=_p.port or (443 if _is_https else 6333), - https=_is_https, - verify=cfg.verify_ssl, - check_compatibility=cfg.verify_ssl, - ) + self._qdrant = make_qdrant_client(cfg.qdrant_url, cfg.verify_ssl) print(f"[RAGHandle] Connected to Qdrant at {cfg.qdrant_url}") self._embed_client = httpx.AsyncClient(timeout=30.0, verify=cfg.verify_ssl) self._llm_client = httpx.AsyncClient(timeout=120.0, verify=cfg.verify_ssl) @@ -106,8 +111,6 @@ def _get_profile(self, collection: str, _user_id: str) -> list[str]: so they are always available to the prompt regardless of the question. Populated via `ingestion.py profile`. 
""" - from qdrant_client.models import FieldCondition, Filter, MatchValue - try: points, _ = self._qdrant.scroll( collection_name=collection, @@ -134,8 +137,6 @@ def _search( ) -> list[dict]: qdrant_filter: Any = None if filters: - from qdrant_client.models import FieldCondition, Filter, MatchValue - conditions: Any = [ FieldCondition(key=k, match=MatchValue(value=v)) for k, v in filters.items() @@ -168,14 +169,7 @@ def _build_prompt( preferences: dict, profile_facts: list[str] | None = None, ) -> tuple[str, str]: - # TEMP TEMP TEMP: this persona should be defined in the configuration - persona = preferences.get( - "persona", - "You are Rat-Man, a witty, charismatic animated rat character. " - "You are NOT an AI, an assistant, or a language model, and you must " - "never say that you are one. Stay fully in character at all times, " - "speak naturally and with personality, and never break the illusion.", - ) + persona = preferences.get("persona", _DEFAULT_PERSONA) parts = [persona] if profile_facts: @@ -336,8 +330,6 @@ async def _llm_stream( async def stream(self, query: RAGQuery) -> AsyncGenerator[str, None]: """Main streaming entry point — yields LLM text deltas.""" - import traceback - print(f"[RAG] Question: {query.question}") collection, filters = self._resolve_user_context(query._user_id) diff --git a/src/modules/text_to_speech/text_to_speech.py b/src/modules/text_to_speech/text_to_speech.py index 4b9baf2..26a4b52 100644 --- a/src/modules/text_to_speech/text_to_speech.py +++ b/src/modules/text_to_speech/text_to_speech.py @@ -1,10 +1,10 @@ import asyncio -import logging import os import queue import sys +import traceback import uuid -from typing import AsyncGenerator +from typing import AsyncGenerator, Optional import numpy as np from ray import serve @@ -14,35 +14,28 @@ from .events import Audio, Token -logger = logging.getLogger("ray.serve") -logger.setLevel(os.environ.get("HURI_TTS_LOG_LEVEL", "INFO").upper()) - - -def _trace(msg: str) -> None: - 
"""Belt-and-braces log: hits both the ray.serve logger AND stdout. - - Ray Serve captures stdout per replica and surfaces it in the dashboard's - Logs tab — that's the path that survives any logger misconfiguration. - """ - logger.info(msg) - print(f"[TTS] {msg}", flush=True) - - # Defaults — overridden by env vars in production (see README.md) _MODEL_PATH = os.environ.get( "HURI_MODEL_PATH", "/models/cosytts/FunAudioLLM/Fun-CosyVoice3-0.5B-2512" ) _VOICE_SAMPLE_PATH = os.environ.get("HURI_VOICE_SAMPLE_PATH", "/assets/voice.wav") -# CosyVoice3 expects "<|endofprompt|>". If the -# config supplies a bare transcript (no marker), prepend the default instruction so the -# transcript lands AFTER <|endofprompt|> — otherwise the LM treats it as an instruction -# and intermittently renders it as speech (prompt leakage). The transcription is a must. -_raw_transcript = os.environ["HURI_VOICE_TRANSCRIPT"] -_VOICE_SAMPLE_TRANSCRIPT = ( - _raw_transcript - if "<|endofprompt|>" in _raw_transcript - else f"You are a helpful assistant.<|endofprompt|>{_raw_transcript}" -) +_DEFAULT_INSTRUCTION = "You are a helpful assistant." + + +def _normalize_transcript(raw: str) -> str: + """Make a reference transcript safe for CosyVoice3. + + CosyVoice3 expects "<|endofprompt|>". + If the configured transcript supplies a bare transcript (no marker), prepend + the default instruction so the transcript lands AFTER <|endofprompt|> — + otherwise the LM treats it as an instruction and intermittently renders it as + speech (prompt leakage). 
+ """ + return ( + raw + if "<|endofprompt|>" in raw + else f"{_DEFAULT_INSTRUCTION}<|endofprompt|>{raw}" + ) _END_TEXT = object() # sentinel pushed into the text queue to close synth _END_AUDIO = object() # sentinel pushed into the audio queue when synth completes @@ -64,22 +57,34 @@ def __init__( self, model_path: str = _MODEL_PATH, voice_sample_path: str = _VOICE_SAMPLE_PATH, - voice_sample_transcript: str = _VOICE_SAMPLE_TRANSCRIPT, + voice_sample_transcript: Optional[str] = None, ): - _trace(f"TTSDeployment init: model_path={model_path} voice={voice_sample_path} transcript={voice_sample_transcript}") + # Resolve the reference transcript here (deploy time on the GPU worker) + # rather than at module import: importing this module must not require + # HURI_VOICE_TRANSCRIPT, since modules.py imports it inside a broad + # try/except that would otherwise make TTS silently vanish from the + # pipeline when the var is unset. Fail loudly and locally instead. + if voice_sample_transcript is None: + raw = os.environ.get("HURI_VOICE_TRANSCRIPT") + if not raw: + raise RuntimeError( + "HURI_VOICE_TRANSCRIPT is not set. The TTS deployment needs the " + "transcript of the reference voice sample (voice.wav). Set it in " + "the Serve app runtime_env.env_vars (see deploy values.yaml)." 
+ ) + voice_sample_transcript = raw + voice_sample_transcript = _normalize_transcript(voice_sample_transcript) cosy_dir = os.environ.get("HURI_COSY_DIR") if cosy_dir: matcha_path = os.path.join(cosy_dir, "third_party", "Matcha-TTS") if os.path.isdir(matcha_path) and matcha_path not in sys.path: sys.path.insert(0, matcha_path) - logger.debug("Added Matcha-TTS path to sys.path: %s", matcha_path) from cosyvoice.cli.cosyvoice import CosyVoice3 self.model = CosyVoice3(model_dir=model_path, load_trt=False) self.sample_rate: int = self.model.sample_rate - _trace(f"CosyVoice3 loaded (sample_rate={self.sample_rate})") self.prompt_speech = voice_sample_path self.prompt_text: str = voice_sample_transcript @@ -91,33 +96,26 @@ async def get_sample_rate(self) -> int: async def start_session(self, session_id: str) -> None: self._text_queues[session_id] = queue.Queue() - _trace(f"[{session_id}] session started (active={len(self._text_queues)})") async def push_text(self, session_id: str, text: str, end: bool) -> None: q = self._text_queues.get(session_id) if q is None: - _trace(f"[{session_id}] WARNING push_text on unknown session (text={text!r} end={end})") return if text: q.put(text) - _trace(f"[{session_id}] push_text {text!r} (qsize={q.qsize()})") if end: q.put(_END_TEXT) - _trace(f"[{session_id}] push_text: end-of-stream sentinel") async def stream_audio(self, session_id: str) -> AsyncGenerator[Audio, None]: text_q = self._text_queues[session_id] loop = asyncio.get_running_loop() chunk_count = 0 - _trace(f"[{session_id}] stream_audio: starting CosyVoice inference") def text_gen(): while True: item = text_q.get() if item is _END_TEXT: - _trace(f"[{session_id}] text_gen: received end sentinel") return - _trace(f"[{session_id}] text_gen yielding: {item!r}") yield item try: @@ -134,18 +132,12 @@ def text_gen(): assert isinstance(result, dict) chunk_count += 1 speech = result["tts_speech"].squeeze(0).numpy().astype(np.float32) - _trace( - f"[{session_id}] audio chunk 
#{chunk_count}: " - f"{speech.shape[0]} samples (~{speech.shape[0] / self.sample_rate:.2f}s)" - ) yield Audio(data=speech, sample_rate=self.sample_rate) - except Exception as e: - _trace(f"[{session_id}] stream_audio FAILED: {e!r}") - logger.exception("[%s] stream_audio failed", session_id) + except Exception: + traceback.print_exc() raise finally: self._text_queues.pop(session_id, None) - _trace(f"[{session_id}] stream_audio finished (chunks={chunk_count})") class TTS(ModuleWithHandle): @@ -189,7 +181,7 @@ async def process(self, token: Token) -> AsyncGenerator[Audio, None]: # type: i if is_first: self._session_id = str(uuid.uuid4()) self._audio_q = asyncio.Queue() - print(f"[TTS-client] [{self._session_id}] opening new utterance session", flush=True) + print(f"[TTS-client] [{self._session_id}] opening new utterance session") await self._handle.start_session.remote(self._session_id) self._stream_task = asyncio.create_task( self._drain_audio(self._session_id, self._audio_q) @@ -198,7 +190,7 @@ async def process(self, token: Token) -> AsyncGenerator[Audio, None]: # type: i sid = self._session_id audio_q = self._audio_q stream_task = self._stream_task - print(f"[TTS-client] [{sid}] push token: {token.text!r} (end={token.end})", flush=True) + print(f"[TTS-client] [{sid}] push token: {token.text!r} (end={token.end})") await self._handle.push_text.remote(sid, token.text, token.end) if not is_first: @@ -212,10 +204,10 @@ async def process(self, token: Token) -> AsyncGenerator[Audio, None]: # type: i if item is _END_AUDIO: break count += 1 - print(f"[TTS-client] [{sid}] yield chunk #{count}", flush=True) + print(f"[TTS-client] [{sid}] yield chunk #{count}") yield item await stream_task - print(f"[TTS-client] [{sid}] utterance complete ({count} chunks)", flush=True) + print(f"[TTS-client] [{sid}] utterance complete ({count} chunks)") sample_rate = await self._handle.get_sample_rate.remote() yield Audio(data=np.array([], dtype=np.float32), sample_rate=sample_rate, end=True) 
@@ -237,12 +229,11 @@ async def _drain_audio(self, session_id: str, audio_q: asyncio.Queue) -> None: print( f"[TTS-client] [{session_id}] drain received chunk #{count} " f"pts={audio.pts:.3f}s next={pts:.3f}s", - flush=True, ) await audio_q.put(audio) except Exception as e: - print(f"[TTS-client] [{session_id}] drain task FAILED: {e!r}", flush=True) + print(f"[TTS-client] [{session_id}] drain task FAILED: {e!r}") raise finally: await audio_q.put(_END_AUDIO) - print(f"[TTS-client] [{session_id}] drain task finished", flush=True) + print(f"[TTS-client] [{session_id}] drain task finished") From 93237f7264429e7edad21416d862e4bfa1c06c8c Mon Sep 17 00:00:00 2001 From: "thomas.pommier" Date: Wed, 10 Jun 2026 03:54:35 +0200 Subject: [PATCH 31/31] fixed(rag): qdrant is installed on all dockerfiles --- deploy/Dockerfile.nvidia | 3 +++ requirements-amd.txt | 4 ++-- serve_requirements.txt | 7 +++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/deploy/Dockerfile.nvidia b/deploy/Dockerfile.nvidia index f9e704f..f0e6e4d 100644 --- a/deploy/Dockerfile.nvidia +++ b/deploy/Dockerfile.nvidia @@ -10,6 +10,9 @@ RUN pip install --no-cache-dir \ --extra-index-url https://pypi.ngc.nvidia.com \ -r requirements-nvidia.txt +COPY serve_requirements.txt /app +RUN pip install --no-cache-dir -r serve_requirements.txt + USER root RUN apt-get update && apt-get install -y --no-install-recommends git \ diff --git a/requirements-amd.txt b/requirements-amd.txt index 1c99594..7fa9358 100644 --- a/requirements-amd.txt +++ b/requirements-amd.txt @@ -5,8 +5,8 @@ # Does NOT include CosyVoice2 or EMAGE — those run on the NVIDIA worker. # --- RAG / LLM --- -httpx==0.27.2 -qdrant-client==1.18.0 +# httpx + qdrant-client moved to serve_requirements.txt so the head/base +# controller can import rag.py without falling back to None (see that file). 
sentence-transformers==3.2.1 pypdf==5.1.0 semantic_chunker==0.2.0 diff --git a/serve_requirements.txt b/serve_requirements.txt index 242576e..1ebe583 100644 --- a/serve_requirements.txt +++ b/serve_requirements.txt @@ -3,3 +3,10 @@ numpy click<8.2 webrtcvad faster-whisper + +# RAG/LLM client deps. Kept in the *base* image (not just the AMD worker) +# because the Serve controller on the head node imports rag.py to build the app; +# if these are missing there, the module-level imports fall back to None and that +# None is baked into the cloudpickled RAGHandle shipped to the AMD replica. +httpx==0.27.2 +qdrant-client==1.18.0