diff --git a/CLAUDE.md b/CLAUDE.md index 33c5c5a..fa3ac3d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -21,7 +21,7 @@ Both share the same knowledge base at `.podcli/knowledge/`. | `/generate-descriptions` | Copywriter | Creates descriptions + hashtags + SEO keywords | | `/plan-thumbnails` | Art Director | Plans thumbnail text + layout briefs for both formats | | `/review-content` | Brand Guardian | Reviews output against brand voice, quality gates, banned words | -| `/prep-episode` | Producer | Full pipeline: transcript → publish-ready package | +| `/produce-shorts` | Producer | Full pipeline: transcript → publish-ready package | | `/publish-checklist` | Launch Manager | Pre/post-publish optimization checklist | | `/retro-episode` | Analyst | Episode performance review + learnings | @@ -50,7 +50,7 @@ Both share the same knowledge base at `.podcli/knowledge/`. → /plan-thumbnails → /review-content → /publish-checklist ``` -Or run everything at once: `/prep-episode` +Or run everything at once: `/produce-shorts` After publishing: `/retro-episode` @@ -99,7 +99,7 @@ When input is provided without a specific command: - **Asks for titles** → Run `/generate-titles` - **Asks for thumbnails** → Run `/plan-thumbnails` - **Asks for descriptions** → Run `/generate-descriptions` -- **Says "process episode"** → Run `/prep-episode` +- **Says "process episode"** → Run `/produce-shorts` - **Asks to review content** → Run `/review-content` --- diff --git a/README.md b/README.md index 124efa9..7b21d0a 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ Clips come out as **upload-ready Shorts**: 1080x1920, 9:16 vertical, with burned Open the project in **Claude Code** and run: ``` -/prep-episode +/produce-shorts ``` This runs the [PodStack](https://github.com/nmbrthirteen/podstack) pipeline — a gstack-style workflow that gives you: @@ -135,7 +135,7 @@ Both halves share the same **knowledge base** (`.podcli/knowledge/`) — your sh - **`/generate-descriptions`** — descriptions + hashtags + SEO keywords - **`/plan-thumbnails`** — thumbnail text + designer briefs for both formats - **`/review-content`** — paranoid brand check (banned words, voice, title rules) -- **`/prep-episode`** — full pipeline: transcript → publish-ready package +- **`/produce-shorts`** — full pipeline: transcript → publish-ready package - **`/publish-checklist`** — pre/post-publish optimization - **`/retro-episode`** — performance analysis after publishing @@ -240,7 +240,7 @@ Open the project in Claude Code, then use slash commands: ```bash # Full pipeline — transcript to publish-ready package -/prep-episode +/produce-shorts # Individual steps /process-transcript # extract moments from a transcript @@ -356,7 +356,7 @@ podcli/ │ ├── generate-descriptions.md │ ├── plan-thumbnails.md │ ├── review-content.md -│ ├── prep-episode.md +│ ├── produce-shorts.md │ ├── publish-checklist.md │ └── retro-episode.md │ diff --git a/backend/cli.py b/backend/cli.py index bb93103..2ec78b1 100644 --- a/backend/cli.py +++ b/backend/cli.py @@ -261,11 +261,18 @@ def _ensure_ssl_certs(): except Exception: pass - # Method 4: Last resort — disable SSL verification for this session - # This is safe because we're only downloading Whisper models from known URLs - ssl._create_default_https_context = ssl._create_unverified_context - os.environ["PYTHONHTTPSVERIFY"] = "0" - print(" ⚠ SSL verification disabled for this session (model downloads only)") + # Method 4: disable SSL verification only on explicit opt-in — silently + # turning off TLS verification is a downgrade risk. + if os.environ.get("PODCLI_INSECURE_SSL", "").strip().lower() in ("1", "true", "yes", "on"): + ssl._create_default_https_context = ssl._create_unverified_context + os.environ["PYTHONHTTPSVERIFY"] = "0" + print(" ⚠ SSL verification DISABLED (PODCLI_INSECURE_SSL set) — model downloads only") + else: + print( + " ✗ Could not configure SSL certificates. Install certifi " + "(pip install certifi) or, to download models without verification, " + "re-run with PODCLI_INSECURE_SSL=1." + ) def _sanitize_path_component(value: str) -> str: @@ -3299,6 +3306,9 @@ def main(): studio.add_argument("--save-brand", action="store_true", help="Save handle/platforms/outro-title/accent/bg as the default brand and exit") + # ── ui (Studio web dashboard) ── + sub.add_parser("ui", aliases=["webui"], help="Open the Studio web UI (http://localhost:3847)") + # ── presets ── pre = sub.add_parser("presets", help="Manage presets") pre_sub = pre.add_subparsers(dest="presets_action") @@ -3562,10 +3572,66 @@ def main(): cmd_info(args) elif args.command == "init-thumbnail": cmd_init_thumbnail(args) + elif args.command in ("ui", "webui"): + launch_webui() else: interactive_menu() +def launch_webui(): + """Launch the Studio web UI server (http://localhost:3847).""" + import subprocess as sp + import shutil as _shutil + + accent = "\033[38;2;212;135;74m" + gray = "\033[38;5;245m" + yellow = "\033[38;2;250;204;21m" + dim = "\033[2m" + reset = "\033[0m" + + backend_dir = os.path.dirname(os.path.abspath(__file__)) + port = os.environ.get("PORT", "3847") + node = os.environ.get("PODCLI_NODE") or _shutil.which("node") + studio = os.environ.get("PODCLI_STUDIO") or os.path.join(backend_dir, "..", "studio") + server = os.path.join(studio, "web-server.mjs") + repo = os.path.join(backend_dir, "..") + + if node and os.path.exists(server): + # Bundled studio: hermetic Node serves it, rendering delegated to this + # same Python backend + ffmpeg via the env below. + env = { + **os.environ, + "PORT": str(port), + "PODCLI_BACKEND": backend_dir, + "PYTHON_PATH": sys.executable, + "PODCLI_HOME": paths["home"], + # data_dir is the cache's parent — output is now decoupled + # (clips render to the working dir), so don't derive it from output. + "PODCLI_DATA": os.path.dirname(paths["cache"]), + "PODCLI_OUTPUT": paths["output"], + "FFMPEG_PATH": os.environ.get("PODCLI_FFMPEG", "ffmpeg"), + "FFPROBE_PATH": os.environ.get("PODCLI_FFPROBE", "ffprobe"), + } + print(f"\n {gray}Studio:{reset} {accent}http://localhost:{port}{reset} {dim}(Ctrl+C to stop){reset}\n") + sp.run([node, server], env=env) + elif os.path.exists(os.path.join(repo, "package.json")) and _shutil.which("npm"): + # Source checkout (dev): build + serve via npm. + _npm_shell = sys.platform == "win32" + spa = os.path.join(repo, "dist", "ui", "public", "index.html") + ok = True + if not os.path.exists(spa): + print(f"\n {gray}Building the studio (first run)…{reset}\n") + ok = sp.run(["npm", "run", "build"], cwd=repo, shell=_npm_shell).returncode == 0 + if not ok: + print(f"\n {yellow}Build failed — run 'npm install' then try again.{reset}\n") + if ok: + print(f"\n {gray}Studio:{reset} {accent}http://localhost:{port}{reset} {dim}(Ctrl+C to stop){reset}\n") + sp.run(["npm", "run", "ui:prod"], cwd=repo, shell=_npm_shell) + else: + print(f"\n {yellow}Studio isn't provisioned yet.{reset}") + print(f" {dim}Run{reset} {accent}podcli setup{reset} {dim}to fetch the bundled studio + Node.{reset}\n") + + def interactive_menu(): """Interactive startup — show banner then let user pick what to do.""" @@ -3631,48 +3697,7 @@ def interactive_menu(): _interactive_process() return elif choice == "webui": - import subprocess as sp - import shutil as _shutil - backend_dir = os.path.dirname(os.path.abspath(__file__)) - port = os.environ.get("PORT", "3847") - node = os.environ.get("PODCLI_NODE") or _shutil.which("node") - studio = os.environ.get("PODCLI_STUDIO") or os.path.join(backend_dir, "..", "studio") - server = os.path.join(studio, "web-server.mjs") - repo = os.path.join(backend_dir, "..") - if node and os.path.exists(server): - # Bundled studio: hermetic Node serves it, rendering delegated to - # this same Python backend + ffmpeg via the env below. - env = { - **os.environ, - "PORT": str(port), - "PODCLI_BACKEND": backend_dir, - "PYTHON_PATH": sys.executable, - "PODCLI_HOME": paths["home"], - # data_dir is the cache's parent — output is now decoupled - # (clips render to the working dir), so don't derive it from output. - "PODCLI_DATA": os.path.dirname(paths["cache"]), - "PODCLI_OUTPUT": paths["output"], - "FFMPEG_PATH": os.environ.get("PODCLI_FFMPEG", "ffmpeg"), - "FFPROBE_PATH": os.environ.get("PODCLI_FFPROBE", "ffprobe"), - } - print(f"\n {gray}Studio:{reset} {accent}http://localhost:{port}{reset} {dim}(Ctrl+C to stop){reset}\n") - sp.run([node, server], env=env) - elif os.path.exists(os.path.join(repo, "package.json")) and _shutil.which("npm"): - # Source checkout (dev): build + serve via npm. - _npm_shell = sys.platform == "win32" - spa = os.path.join(repo, "dist", "ui", "public", "index.html") - ok = True - if not os.path.exists(spa): - print(f"\n {gray}Building the studio (first run)…{reset}\n") - ok = sp.run(["npm", "run", "build"], cwd=repo, shell=_npm_shell).returncode == 0 - if not ok: - print(f"\n {yellow}Build failed — run 'npm install' then try again.{reset}\n") - if ok: - print(f"\n {gray}Studio:{reset} {accent}http://localhost:{port}{reset} {dim}(Ctrl+C to stop){reset}\n") - sp.run(["npm", "run", "ui:prod"], cwd=repo, shell=_npm_shell) - else: - print(f"\n {yellow}Studio isn't provisioned yet.{reset}") - print(f" {dim}Run{reset} {accent}podcli setup{reset} {dim}to fetch the bundled studio + Node.{reset}\n") + launch_webui() elif choice == "assets": _interactive_assets() elif choice == "presets": diff --git a/backend/services/caption_renderer.py b/backend/services/caption_renderer.py index 940ed5a..1342971 100644 --- a/backend/services/caption_renderer.py +++ b/backend/services/caption_renderer.py @@ -132,6 +132,21 @@ def render_captions( return output_path +CAPTION_GAP_FILL_MAX = 0.4 # seconds + + +def _hold_through_gap(chunks: list[list[dict]], idx: int, end: float, offset: float) -> float: + """Extend a chunk's end toward the next chunk's start so a pause on a chunk + boundary doesn't blank the screen. Capped, and never overlaps the next chunk.""" + for j in range(idx + 1, len(chunks)): + if chunks[j]: + next_start = max(0, chunks[j][0]["start"] - offset) + if next_start > end: + return min(next_start, end + CAPTION_GAP_FILL_MAX) + return end + return end + + def _render_hormozi(words: list[dict], style: dict, offset: float) -> str: """ Hormozi style: Show 2-3 words at a time, smooth karaoke-fill highlight. @@ -147,12 +162,12 @@ def _render_hormozi(words: list[dict], style: dict, offset: float) -> str: chunk = words[i : i + chunk_size] chunks.append(chunk) - for chunk in chunks: + for idx, chunk in enumerate(chunks): if not chunk: continue chunk_start = max(0, chunk[0]["start"] - offset) - chunk_end = max(0, chunk[-1]["end"] - offset) + chunk_end = _hold_through_gap(chunks, idx, max(0, chunk[-1]["end"] - offset), offset) # Build \kf karaoke-fill parts: each word fills progressively parts = [] @@ -189,12 +204,12 @@ def _render_karaoke(words: list[dict], style: dict, offset: float) -> str: for i in range(0, len(words), sentence_size): sentences.append(words[i : i + sentence_size]) - for sentence in sentences: + for idx, sentence in enumerate(sentences): if not sentence: continue sent_start = max(0, sentence[0]["start"] - offset) - sent_end = max(0, sentence[-1]["end"] - offset) + sent_end = _hold_through_gap(sentences, idx, max(0, sentence[-1]["end"] - offset), offset) parts = [] for w in sentence: @@ -227,12 +242,12 @@ def _render_subtle(words: list[dict], style: dict, offset: float) -> str: for i in range(0, len(words), line_size): lines.append(words[i : i + line_size]) - for line_words in lines: + for idx, line_words in enumerate(lines): if not line_words: continue line_start = max(0, line_words[0]["start"] - offset) - line_end = max(0, line_words[-1]["end"] - offset) + line_end = _hold_through_gap(lines, idx, max(0, line_words[-1]["end"] - offset), offset) line_text = " ".join(w["word"] for w in line_words) @@ -493,12 +508,12 @@ def _render_branded(words: list[dict], style: dict, offset: float) -> str: chunk = words[i : i + chunk_size] chunks.append(chunk) - for chunk in chunks: + for chunk_idx, chunk in enumerate(chunks): if not chunk: continue chunk_start = max(0, chunk[0]["start"] - offset) - chunk_end = max(0, chunk[-1]["end"] - offset) + chunk_end = _hold_through_gap(chunks, chunk_idx, max(0, chunk[-1]["end"] - offset), offset) # Normalize casing normalized = [] diff --git a/backend/services/claude_suggest.py b/backend/services/claude_suggest.py index e180477..e400efc 100644 --- a/backend/services/claude_suggest.py +++ b/backend/services/claude_suggest.py @@ -360,19 +360,37 @@ def _should_bucket_initial_selection(segments: list[dict]) -> bool: def _dedupe_clips_by_range(clips: list[dict]) -> list[dict]: - """Drop duplicate clip suggestions that share the same rounded range.""" - deduped = [] - seen_ranges = set() - for clip in sorted(clips, key=lambda c: c.get("start_second", 0)): - key = ( - round(float(clip.get("start_second", 0)), 1), - round(float(clip.get("end_second", 0)), 1), - ) - if key in seen_ranges: - continue - seen_ranges.add(key) - deduped.append(clip) - return deduped + """Collapse overlapping clip suggestions (>50% of the shorter clip), keeping + the higher-scored one, sorted by start time. Exact-range matching would miss + near-duplicates like 100.0-140.0 vs 102.5-141.5.""" + kept: list[dict] = [] + # Highest-scored first so the survivor of an overlap is the better clip. + for clip in sorted(clips, key=lambda c: c.get("score", 0), reverse=True): + start = float(clip.get("start_second", 0)) + end = float(clip.get("end_second", 0)) + dur = max(0.0, end - start) + duplicate = False + for k in kept: + k_start = float(k.get("start_second", 0)) + k_end = float(k.get("end_second", 0)) + overlap = max(0.0, min(end, k_end) - max(start, k_start)) + shorter = min(dur, max(0.0, k_end - k_start)) or 1.0 + if overlap / shorter > 0.5: + duplicate = True + break + if not duplicate: + kept.append(clip) + return sorted(kept, key=lambda c: c.get("start_second", 0)) + + +def _select_top_by_score(clips: list[dict], top_n: int) -> list[dict]: + """Keep the highest-scored `top_n` clips, then order them by start time. + Ranking by score must come before truncation — otherwise the earliest clips + ship, not the best ones.""" + if len(clips) <= top_n: + return sorted(clips, key=lambda c: c.get("start_second", 0)) + ranked = sorted(clips, key=lambda c: c.get("score", 0), reverse=True)[:top_n] + return sorted(ranked, key=lambda c: c.get("start_second", 0)) def find_moments_from_text( @@ -695,12 +713,12 @@ def _parse_seconds(val) -> float: "_ai_engine": engine, }) - normalized.sort(key=lambda x: x["start_second"]) + selected = _select_top_by_score(normalized, top_n) - if normalized: + if selected: if progress_callback: - progress_callback(100, f"{label} suggested {len(normalized)} clips") - return normalized + progress_callback(100, f"{label} suggested {len(selected)} clips") + return selected if progress_callback: progress_callback(0, f"{label} returned no usable clips") @@ -798,7 +816,7 @@ def suggest_initial_with_claude( deduped = _dedupe_clips_by_range(aggregated) if len(deduped) >= top_n: - return deduped[:top_n] + return _select_top_by_score(deduped, top_n) fallback_clips = suggest_with_claude( segments=segments, @@ -816,7 +834,7 @@ def suggest_initial_with_claude( if fallback_clips: deduped = _dedupe_clips_by_range(deduped + fallback_clips) - return deduped[:top_n] if deduped else None + return _select_top_by_score(deduped, top_n) if deduped else None def _bucket_coverage_seconds(existing_clips: list[dict], start: float, end: float) -> float: @@ -959,13 +977,5 @@ def suggest_more_with_claude( if fallback_clips: aggregated.extend(fallback_clips) - deduped = [] - seen_ranges = set() - for clip in sorted(aggregated, key=lambda c: c.get("start_second", 0)): - key = (round(float(clip.get("start_second", 0)), 1), round(float(clip.get("end_second", 0)), 1)) - if key in seen_ranges: - continue - seen_ranges.add(key) - deduped.append(clip) - - return deduped[:top_n] if deduped else None + deduped = _dedupe_clips_by_range(aggregated) + return _select_top_by_score(deduped, top_n) if deduped else None diff --git a/backend/services/face_analysis.py b/backend/services/face_analysis.py index 17fe762..f9501b5 100644 --- a/backend/services/face_analysis.py +++ b/backend/services/face_analysis.py @@ -101,12 +101,27 @@ def _mouth_roi_gray(frame, cx: int, cy: int, fw: int, fh: int): small = cv2.resize(roi, (16, 16)) return cv2.cvtColor(small, cv2.COLOR_BGR2GRAY).astype(np.int16) - for i in range(sample_count): - t = i * duration / sample_count - cap.set(cv2.CAP_PROP_POS_MSEC, t * 1000) - ret, frame = cap.read() + # Some containers under-report CAP_PROP_FRAME_COUNT, which would confine + # sampling to the front of the video; fall back to duration*fps. + reported_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0) + est_frames = int(duration * fps) if duration and fps else 0 + total_frames = max(reported_frames, est_frames, 1) + # grab() advances cheaply, retrieve() decodes only the sampled frames — far + # cheaper than a cap.set() seek (keyframe re-decode) per sample. + step = max(1, total_frames // sample_count) + frame_idx = -1 + sampled = 0 + while sampled < sample_count: + if not cap.grab(): + break + frame_idx += 1 + if frame_idx % step != 0: + continue + ret, frame = cap.retrieve() if not ret: continue + pos_ms = cap.get(cv2.CAP_PROP_POS_MSEC) + t = pos_ms / 1000.0 if pos_ms and pos_ms > 0 else frame_idx / fps faces = detect_faces(detector, frame, width, height) current_mouth_gray: dict[int, "np.ndarray"] = {} @@ -139,9 +154,10 @@ def _mouth_roi_gray(frame, cx: int, cy: int, fw: int, fh: int): prev_mouth_gray = current_mouth_gray faces_per_frame.append(frame_faces) - if progress_callback and i % 20 == 0: - pct = 10 + int(60 * i / sample_count) - progress_callback(pct, f"Analyzing faces... {i}/{sample_count}") + if progress_callback and sampled % 20 == 0: + pct = 10 + int(60 * sampled / sample_count) + progress_callback(pct, f"Analyzing faces... {sampled}/{sample_count}") + sampled += 1 cap.release() diff --git a/backend/services/transcription.py b/backend/services/transcription.py index 9ca581e..41cb9d0 100644 --- a/backend/services/transcription.py +++ b/backend/services/transcription.py @@ -75,10 +75,120 @@ def _transcribe_with_whispercpp(file_path, model_size, language, progress_callba vad_model=os.environ.get("PODCLI_WHISPERCPP_VAD_MODEL") or None, ) if progress_callback: - progress_callback(100, "Transcription complete") + progress_callback(50, "Transcription complete") return result +def _attach_speakers_and_faces( + file_path, + base, + enable_diarization, + num_speakers, + progress_callback, +): + """Merge speaker diarization + face analysis into a transcribed result. + Shared by both engines; face analysis (OpenCV) runs even when diarization + is unavailable.""" + segments = base.get("segments") or [] + words = base.get("words") or [] + duration = base.get("duration") or (segments[-1]["end"] if segments else 0.0) + + speaker_segments = [] + speaker_summary = {"num_speakers": 0, "speakers": {}} + diarization_warning = None + + if enable_diarization: + try: + from services.speaker_detection import ( + extract_audio_wav, + run_diarization, + assign_speakers_to_segments, + assign_speakers_to_words, + create_speaker_summary, + ) + + if progress_callback: + progress_callback(55, "Extracting audio for speaker detection...") + + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp: + wav_path = tmp.name + + try: + extract_audio_wav(file_path, wav_path) + + if progress_callback: + progress_callback(60, "Running speaker diarization...") + + speaker_segments = run_diarization( + wav_path, + num_speakers=num_speakers, + progress_callback=lambda pct, msg: ( + progress_callback(60 + int(pct * 0.3), msg) if progress_callback else None + ), + ) + + if speaker_segments: + if progress_callback: + progress_callback(92, "Assigning speakers to transcript...") + + segments = assign_speakers_to_segments(segments, speaker_segments) + words = assign_speakers_to_words(words, speaker_segments) + speaker_summary = create_speaker_summary(speaker_segments) + + if progress_callback: + progress_callback( + 95, + f"Found {speaker_summary['num_speakers']} speakers", + ) + + finally: + if os.path.exists(wav_path): + os.unlink(wav_path) + + except ImportError as e: + diarization_warning = f"Speaker detection unavailable: {e}" + if progress_callback: + progress_callback(90, diarization_warning) + except PermissionError as e: + diarization_warning = str(e) + if progress_callback: + progress_callback(90, diarization_warning) + except Exception as e: + diarization_warning = f"Speaker detection failed: {e}" + if progress_callback: + progress_callback(90, diarization_warning) + else: + diarization_warning = "Speaker detection disabled" + + face_map = None + try: + if progress_callback: + progress_callback(95, "Analyzing face positions...") + from services.face_analysis import analyze_faces + + face_map = analyze_faces( + video_path=file_path, + speaker_segments=speaker_segments, + duration=duration, + ) + except Exception as e: + print(f"Warning: face analysis failed: {e}", file=sys.stderr) + + if progress_callback: + progress_callback(100, "Complete") + + base["segments"] = segments + base["words"] = words + base["duration"] = round(duration, 3) + base["speakers"] = speaker_summary + base["speaker_segments"] = speaker_segments + if face_map: + base["face_map"] = face_map + if diarization_warning: + base["diarization_warning"] = diarization_warning + return base + + def transcribe_file( file_path: str, model_size: str = "base", @@ -106,30 +216,38 @@ def transcribe_file( requested = os.environ.get("PODCLI_ENGINE", "").strip().lower() engine = requested or "whisper-py" - if engine in ("whispercpp", "whisper-cpp", "whisper.cpp", "cpp"): - return _transcribe_with_whispercpp(file_path, model_size, language, progress_callback) - - # ================================================================ - # Step 1: Whisper transcription - # ================================================================ - if progress_callback: - progress_callback(5, "Loading Whisper model...") + use_cpp = engine in ("whispercpp", "whisper-cpp", "whisper.cpp", "cpp") # Native installs ship whisper.cpp, not openai-whisper. Fall back to it # automatically — whether whisper is missing OR a broken install fails to # load/run — unless the user explicitly asked for the whisper-py engine. - try: - import whisper + if not use_cpp: + if progress_callback: + progress_callback(5, "Loading Whisper model...") + try: + import whisper - model = whisper.load_model(model_size) - except Exception as e: - if not requested and _whispercpp_ready(model_size): - return _transcribe_with_whispercpp(file_path, model_size, language, progress_callback) - raise RuntimeError( - "The whisper-py engine needs the full source install (openai-whisper + torch). " - "This native install ships whisper.cpp — rerun with --engine whispercpp." - ) from e + model = whisper.load_model(model_size) + except Exception as e: + if not requested and _whispercpp_ready(model_size): + use_cpp = True + else: + raise RuntimeError( + "The whisper-py engine needs the full source install (openai-whisper + torch). " + "This native install ships whisper.cpp — rerun with --engine whispercpp." + ) from e + + if use_cpp: + base = _transcribe_with_whispercpp(file_path, model_size, language, progress_callback) + # whisper.cpp is the no-torch path: importing torch for diarization can + # hard-crash native runtimes. Skip diarization, keep face analysis (OpenCV). + return _attach_speakers_and_faces( + file_path, base, False, num_speakers, progress_callback + ) + # ================================================================ + # Step 1: Whisper transcription + # ================================================================ if progress_callback: progress_callback(10, f"Transcribing with Whisper ({model_size})...") @@ -202,110 +320,13 @@ def transcribe_file( detected_lang = result.get("language", language or "en") - # ================================================================ - # Step 2: Speaker diarization (if enabled) - # ================================================================ - speaker_segments = [] - speaker_summary = {"num_speakers": 0, "speakers": {}} - diarization_warning = None - - if enable_diarization: - try: - from services.speaker_detection import ( - extract_audio_wav, - run_diarization, - assign_speakers_to_segments, - assign_speakers_to_words, - create_speaker_summary, - ) - - if progress_callback: - progress_callback(55, "Extracting audio for speaker detection...") - - # Extract audio as WAV for pyannote - with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp: - wav_path = tmp.name - - try: - extract_audio_wav(file_path, wav_path) - - if progress_callback: - progress_callback(60, "Running speaker diarization...") - - speaker_segments = run_diarization( - wav_path, - num_speakers=num_speakers, - progress_callback=lambda pct, msg: ( - progress_callback(60 + int(pct * 0.3), msg) if progress_callback else None - ), - ) - - if speaker_segments: - if progress_callback: - progress_callback(92, "Assigning speakers to transcript...") - - # Merge speaker labels into segments and words - segments = assign_speakers_to_segments(segments, speaker_segments) - words = assign_speakers_to_words(words, speaker_segments) - speaker_summary = create_speaker_summary(speaker_segments) - - if progress_callback: - progress_callback( - 95, - f"Found {speaker_summary['num_speakers']} speakers", - ) - - finally: - if os.path.exists(wav_path): - os.unlink(wav_path) - - except ImportError as e: - diarization_warning = f"Speaker detection unavailable: {e}" - if progress_callback: - progress_callback(90, diarization_warning) - except PermissionError as e: - diarization_warning = str(e) - if progress_callback: - progress_callback(90, diarization_warning) - except Exception as e: - diarization_warning = f"Speaker detection failed: {e}" - if progress_callback: - progress_callback(90, diarization_warning) - else: - diarization_warning = "Speaker detection disabled" - - # Run face analysis on the video (maps speakers to face positions) - # Run silently — no progress callbacks to avoid interfering with CLI output - face_map = None - try: - if progress_callback: - progress_callback(95, "Analyzing face positions...") - from services.face_analysis import analyze_faces - face_map = analyze_faces( - video_path=file_path, - speaker_segments=speaker_segments, - duration=duration, - ) - except Exception as e: - print(f"Warning: face analysis failed: {e}", file=sys.stderr) - - if progress_callback: - progress_callback(100, "Complete") - - result_data = { + base = { "transcript": result.get("text", "").strip(), "segments": segments, "words": words, - "duration": round(duration, 3), + "duration": duration, "language": detected_lang, - "speakers": speaker_summary, - "speaker_segments": speaker_segments, } - - if face_map: - result_data["face_map"] = face_map - - if diarization_warning: - result_data["diarization_warning"] = diarization_warning - - return result_data + return _attach_speakers_and_faces( + file_path, base, enable_diarization, num_speakers, progress_callback + ) diff --git a/backend/services/video_processor.py b/backend/services/video_processor.py index d953be9..cbf00d7 100644 --- a/backend/services/video_processor.py +++ b/backend/services/video_processor.py @@ -1106,8 +1106,6 @@ def _track_and_crop( # breaks on every layout transition. For mixed layouts, use # simple per-frame largest-face following instead. is_mixed = face_map.get("is_mixed_layout", False) if face_map else False - if is_mixed and not is_mixed: # disabled — checking below - pass if is_mixed: # Build a time→speaker lookup from segments def _speaker_at(t_sec: float) -> str | None: diff --git a/cli/main.go b/cli/main.go index 1308444..d3afbd5 100644 --- a/cli/main.go +++ b/cli/main.go @@ -67,7 +67,7 @@ func wantsRuntime(args []string) bool { return true } switch args[0] { - case "process", "transcribe", "studio", "auto": + case "process", "transcribe", "studio", "auto", "ui", "webui": return true } return false @@ -398,6 +398,7 @@ Usage: Engine commands (routed to the processing backend): process