From fbcca235a9847750cdbb1aeceb1f915ef9bba149 Mon Sep 17 00:00:00 2001
From: makchunhin
Date: Fri, 8 May 2026 15:14:07 -0600
Subject: [PATCH] feat: swap inference abstraction for direct provider
 plugins, drop ai-coustics

Replace inference.STT/LLM/TTS with direct plugin calls:

- STT: deepgram.STT(model="nova-3", language="multi")
- LLM: openai.LLM(model="gpt-5.4")
- TTS: elevenlabs.TTS()

Update pyproject.toml to add openai/deepgram/elevenlabs extras and
remove livekit-plugins-ai-coustics (paid plugin).

Add turn_handling={"interruption": {"mode": "vad"}} to AgentSession to
use VAD-based interruption and avoid the paid adaptive endpoint.

Co-Authored-By: Claude Sonnet 4.6
---
 pyproject.toml |  3 +--
 src/agent.py   | 21 ++++++---------------
 2 files changed, 7 insertions(+), 17 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index ce3f602..6d4539f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,8 +9,7 @@
 description = "Simple voice AI assistant built with LiveKit Agents for Python"
 requires-python = ">=3.10, <3.15"
 dependencies = [
-    "livekit-agents[silero,turn-detector]~=1.5",
-    "livekit-plugins-ai-coustics~=0.2",
+    "livekit-agents[openai,deepgram,elevenlabs,silero,turn-detector]~=1.5",
     "python-dotenv",
 ]

diff --git a/src/agent.py b/src/agent.py
index 1076905..f0f068b 100644
--- a/src/agent.py
+++ b/src/agent.py
@@ -9,10 +9,9 @@
     JobContext,
     JobProcess,
     cli,
-    inference,
     room_io,
 )
-from livekit.plugins import ai_coustics, silero
+from livekit.plugins import deepgram, elevenlabs, openai, silero
 from livekit.plugins.turn_detector.multilingual import MultilingualModel
 
 logger = logging.getLogger("agent")
@@ -25,7 +24,7 @@ def __init__(self) -> None:
         super().__init__(
             # A Large Language Model (LLM) is your agent's brain, processing user input and generating a response
             # See all available models at https://docs.livekit.io/agents/models/llm/
-            llm=inference.LLM(model="openai/gpt-5.2-chat-latest"),
+            llm=openai.LLM(model="gpt-5.4"),
             # To use a realtime model instead of a voice pipeline, replace the LLM
             # with a RealtimeModel and remove the STT/TTS from the AgentSession
             # (Note: This is for the OpenAI Realtime API. For other providers, see https://docs.livekit.io/agents/models/realtime/)
@@ -107,16 +106,14 @@ async def my_agent(ctx: JobContext):
         "room": ctx.room.name,
     }
 
-    # Set up a voice AI pipeline using OpenAI, Cartesia, Deepgram, and the LiveKit turn detector
+    # Set up a voice AI pipeline using OpenAI, ElevenLabs, Deepgram, and the LiveKit turn detector
     session = AgentSession(
         # Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand
         # See all available models at https://docs.livekit.io/agents/models/stt/
-        stt=inference.STT(model="deepgram/nova-3", language="multi"),
+        stt=deepgram.STT(model="nova-3", language="multi"),
         # Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear
         # See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/
-        tts=inference.TTS(
-            model="cartesia/sonic-3", voice="9626c31c-bec5-4cca-baa8-f8ba9e84c8bc"
-        ),
+        tts=elevenlabs.TTS(),
         # VAD and turn detection are used to determine when the user is speaking and when the agent should respond
         # See more at https://docs.livekit.io/agents/build/turns
         turn_detection=MultilingualModel(),
@@ -124,19 +121,13 @@
         # allow the LLM to generate a response while waiting for the end of turn
         # See more at https://docs.livekit.io/agents/build/audio/#preemptive-generation
         preemptive_generation=True,
+        turn_handling={"interruption": {"mode": "vad"}},
     )
 
     # Start the session, which initializes the voice pipeline and warms up the models
     await session.start(
         agent=Assistant(),
         room=ctx.room,
-        room_options=room_io.RoomOptions(
-            audio_input=room_io.AudioInputOptions(
-                noise_cancellation=ai_coustics.audio_enhancement(
-                    model=ai_coustics.EnhancerModel.QUAIL_VF_S
-                ),
-            ),
-        ),
     )
 
     # # Add a virtual avatar to the session, if desired
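
-- 
Reviewer note: the direct deepgram/openai/elevenlabs plugins authenticate
with per-provider API keys rather than the LiveKit credentials that the
inference.* wrappers used, so the agent worker now needs those keys in its
environment. A minimal .env sketch for local testing, assuming the standard
variable names from each plugin's docs (names unverified here; please
double-check against the plugin docs before merging):

    # Provider keys for the new direct plugins
    DEEPGRAM_API_KEY=<deepgram key>      # used by deepgram.STT
    OPENAI_API_KEY=<openai key>          # used by openai.LLM
    ELEVEN_API_KEY=<elevenlabs key>      # used by elevenlabs.TTS

    # Still required to connect the worker to LiveKit
    LIVEKIT_URL=<livekit server url>
    LIVEKIT_API_KEY=<livekit api key>
    LIVEKIT_API_SECRET=<livekit api secret>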