From fbcca235a9847750cdbb1aeceb1f915ef9bba149 Mon Sep 17 00:00:00 2001
From: makchunhin
Date: Fri, 8 May 2026 15:14:07 -0600
Subject: [PATCH] feat: swap inference abstraction for direct provider
 plugins, drop ai-coustics

Replace inference.STT/LLM/TTS with direct plugin calls:

- STT: deepgram.STT(model="nova-3", language="multi")
- LLM: openai.LLM(model="gpt-5.4")
- TTS: elevenlabs.TTS()

Update pyproject.toml to add openai/deepgram/elevenlabs extras and
remove livekit-plugins-ai-coustics (paid plugin).

Add turn_handling={"interruption": {"mode": "vad"}} to AgentSession to
use VAD-based interruption and avoid the paid adaptive endpoint.

Co-Authored-By: Claude Sonnet 4.6
---
 pyproject.toml |  3 +--
 src/agent.py   | 21 ++++++---------------
 2 files changed, 7 insertions(+), 17 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index ce3f602..6d4539f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,8 +9,7 @@
 description = "Simple voice AI assistant built with LiveKit Agents for Python"
 requires-python = ">=3.10, <3.15"
 dependencies = [
-    "livekit-agents[silero,turn-detector]~=1.5",
-    "livekit-plugins-ai-coustics~=0.2",
+    "livekit-agents[openai,deepgram,elevenlabs,silero,turn-detector]~=1.5",
     "python-dotenv",
 ]

diff --git a/src/agent.py b/src/agent.py
index 1076905..f0f068b 100644
--- a/src/agent.py
+++ b/src/agent.py
@@ -9,10 +9,9 @@
     JobContext,
     JobProcess,
     cli,
-    inference,
     room_io,
 )
-from livekit.plugins import ai_coustics, silero
+from livekit.plugins import deepgram, elevenlabs, openai, silero
 from livekit.plugins.turn_detector.multilingual import MultilingualModel
 
 logger = logging.getLogger("agent")
@@ -25,7 +24,7 @@ def __init__(self) -> None:
         super().__init__(
             # A Large Language Model (LLM) is your agent's brain, processing user input and generating a response
             # See all available models at https://docs.livekit.io/agents/models/llm/
-            llm=inference.LLM(model="openai/gpt-5.2-chat-latest"),
+            llm=openai.LLM(model="gpt-5.4"),
             # To use a realtime model instead of a voice pipeline, replace the LLM
             # with a RealtimeModel and remove the STT/TTS from the AgentSession
             # (Note: This is for the OpenAI Realtime API. For other providers, see https://docs.livekit.io/agents/models/realtime/)
@@ -107,16 +106,14 @@ async def my_agent(ctx: JobContext):
         "room": ctx.room.name,
     }
 
-    # Set up a voice AI pipeline using OpenAI, Cartesia, Deepgram, and the LiveKit turn detector
+    # Set up a voice AI pipeline using OpenAI, ElevenLabs, Deepgram, and the LiveKit turn detector
     session = AgentSession(
         # Speech-to-text (STT) is your agent's ears, turning the user's speech into text that the LLM can understand
         # See all available models at https://docs.livekit.io/agents/models/stt/
-        stt=inference.STT(model="deepgram/nova-3", language="multi"),
+        stt=deepgram.STT(model="nova-3", language="multi"),
         # Text-to-speech (TTS) is your agent's voice, turning the LLM's text into speech that the user can hear
         # See all available models as well as voice selections at https://docs.livekit.io/agents/models/tts/
-        tts=inference.TTS(
-            model="cartesia/sonic-3", voice="9626c31c-bec5-4cca-baa8-f8ba9e84c8bc"
-        ),
+        tts=elevenlabs.TTS(),
         # VAD and turn detection are used to determine when the user is speaking and when the agent should respond
         # See more at https://docs.livekit.io/agents/build/turns
         turn_detection=MultilingualModel(),
@@ -124,19 +121,13 @@
         # allow the LLM to generate a response while waiting for the end of turn
         # See more at https://docs.livekit.io/agents/build/audio/#preemptive-generation
         preemptive_generation=True,
+        turn_handling={"interruption": {"mode": "vad"}},
     )
 
     # Start the session, which initializes the voice pipeline and warms up the models
     await session.start(
         agent=Assistant(),
         room=ctx.room,
-        room_options=room_io.RoomOptions(
-            audio_input=room_io.AudioInputOptions(
-                noise_cancellation=ai_coustics.audio_enhancement(
-                    model=ai_coustics.EnhancerModel.QUAIL_VF_S
-                ),
-            ),
-        ),
     )
 
     # # Add a virtual avatar to the session, if desired
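
-- 
Reviewer note: the direct deepgram/openai/elevenlabs plugins authenticate
with per-provider API keys rather than the LiveKit credentials that the
inference.* wrappers used, so the agent worker now needs those keys in its
environment. A minimal .env sketch for local testing, assuming the standard
variable names from each plugin's docs (names unverified here; please
double-check against the plugin docs before merging):

    # Provider keys for the new direct plugins
    DEEPGRAM_API_KEY=<deepgram key>      # used by deepgram.STT
    OPENAI_API_KEY=<openai key>          # used by openai.LLM
    ELEVEN_API_KEY=<elevenlabs key>      # used by elevenlabs.TTS

    # Still required to connect the worker to LiveKit
    LIVEKIT_URL=<livekit server url>
    LIVEKIT_API_KEY=<livekit api key>
    LIVEKIT_API_SECRET=<livekit api secret>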