#!/usr/bin/env python3
"""YouTube Auto Dub command-line entrypoint."""

from __future__ import annotations

import argparse
import asyncio
import os
import shutil
import time

from src.core_utils import ConfigurationError
from src.translation import TranslationConfig


def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser.

    Returns:
        An ``ArgumentParser`` exposing the positional ``url`` plus the
        language, cookie, GPU, Whisper-model and LM Studio options.
    """
    parser = argparse.ArgumentParser(
        description="YouTube Auto Dub - Automated Video Subtitling",
        # Raw formatter keeps the line breaks of the examples in the epilog.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""\
Examples:
  python main.py "https://youtube.com/watch?v=VIDEO_ID" --lang es
  python main.py "https://youtube.com/watch?v=VIDEO_ID" --lang fr --gpu
  python main.py "https://youtube.com/watch?v=VIDEO_ID" --lang ja --browser chrome
  python main.py "https://youtube.com/watch?v=VIDEO_ID" --whisper_model large-v3
  python main.py "https://youtube.com/watch?v=VIDEO_ID" --lmstudio-model gemma-3-4b-it
""",
    )

    parser.add_argument("url", help="YouTube video URL to subtitle")
    parser.add_argument(
        "--lang",
        "-l",
        default="es",
        help="Target language ISO code (e.g., es, fr, ja, vi).",
    )
    parser.add_argument(
        "--browser",
        "-b",
        help="Browser to extract cookies from (chrome, edge, firefox). Close browser first!",
    )
    parser.add_argument(
        "--cookies",
        "-c",
        help="Path to cookies.txt file (Netscape format) for YouTube authentication",
    )
    parser.add_argument(
        "--gpu",
        action="store_true",
        help="Use GPU acceleration for Whisper when CUDA is available.",
    )
    parser.add_argument(
        "--whisper_model",
        "-wm",
        help="Whisper model to use (tiny, base, small, medium, large-v3). Default: auto-select based on VRAM",
    )
    parser.add_argument(
        "--translation-backend",
        default="lmstudio",
        choices=["lmstudio"],
        help="Translation backend to use. Currently only 'lmstudio' is supported.",
    )
    parser.add_argument(
        "--lmstudio-base-url",
        help="Override the LM Studio OpenAI-compatible base URL (default: env or http://127.0.0.1:1234/v1).",
    )
    parser.add_argument(
        "--lmstudio-model",
        help="Override the LM Studio model name (default: env or gemma-3-4b-it).",
    )
    return parser


def _check_deps() -> None:
    """Verify critical runtime dependencies.

    Checks that ``ffmpeg`` and ``ffprobe`` are on PATH and that PyTorch can
    be imported.

    Raises:
        SystemExit: with status 1 when a dependency is missing.
    """
    missing = [tool for tool in ("ffmpeg", "ffprobe") if not shutil.which(tool)]
    if missing:
        print(f"[!] CRITICAL: Missing dependencies: {', '.join(missing)}")
        print(" Please install FFmpeg and add it to your System PATH.")
        print(" Download: https://ffmpeg.org/download.html")
        raise SystemExit(1)

    try:
        import torch

        print(f"[*] PyTorch {torch.__version__} | CUDA Available: {torch.cuda.is_available()}")
    except ImportError:
        print("[!] CRITICAL: PyTorch not installed.")
        print(" Install with your UV env, for example:")
        print(" uv pip install --python .venv\\Scripts\\python.exe -r requirements.txt")
        raise SystemExit(1)


def _cleanup() -> None:
    """Clean up the temp directory with retries for Windows file locks.

    Removes and recreates ``src.engines.TEMP_DIR``. A ``PermissionError`` is
    retried with exponential backoff (0.5s, 1s, 2s, 4s); if every attempt
    fails a warning is printed and the function returns without raising.
    """
    import src.engines

    temp_dir = src.engines.TEMP_DIR
    max_retries = 5

    for attempt in range(max_retries):
        try:
            if temp_dir.exists():
                shutil.rmtree(temp_dir)
            temp_dir.mkdir(parents=True, exist_ok=True)
            return
        except PermissionError:
            # No retry follows the last attempt, so do not announce one or
            # sleep for it; fall through to the warning below instead.
            if attempt + 1 == max_retries:
                break
            wait_time = 0.5 * (2 ** attempt)
            print(f"[-] File locked (attempt {attempt + 1}/{max_retries}). Retrying in {wait_time}s...")
            time.sleep(wait_time)

    print(f"[!] WARNING: Could not fully clean temp directory after {max_retries} attempts.")
    print(f" Files may persist in: {temp_dir}")


def _detect_device() -> str:
    """Detect the best available inference device.

    Returns:
        ``"mps"`` if the MPS backend reports availability, otherwise
        ``"cuda"`` if CUDA is available, otherwise ``"cpu"``.
    """
    import torch

    # Guard the attribute lookup: a PyTorch build without an MPS backend
    # module would otherwise raise AttributeError here.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"


def _build_translation_config(args: argparse.Namespace) -> TranslationConfig:
    """Resolve translation configuration from env vars plus CLI overrides.

    Args:
        args: Parsed CLI namespace; reads ``translation_backend``,
            ``lmstudio_base_url`` and ``lmstudio_model``.

    Returns:
        The configuration produced by ``TranslationConfig.from_env``.
    """
    return TranslationConfig.from_env(
        backend=args.translation_backend,
        base_url=args.lmstudio_base_url,
        model=args.lmstudio_model,
    )


def _get_source_language_hint() -> str:
    """Read an optional source language override from the environment.

    Returns:
        The stripped value of ``SOURCE_LANGUAGE_HINT``, or an empty string
        when the variable is unset or blank.
    """
    return (os.getenv("SOURCE_LANGUAGE_HINT") or "").strip()


async def _synthesize_dub_audio(engine, chunks, target_lang: str, media_module, temp_dir) -> None:
    """Generate and fit dubbed audio clips for each translated chunk.

    Each chunk is updated in place: ``chunk["processed_audio"]`` is set to
    the result of ``media_module.fit_audio`` or to ``None`` when the chunk
    has no translated text or a non-positive duration.

    Args:
        engine: Object providing ``calcRate(...)`` and the awaitable
            ``synthesize(...)``.
        chunks: Sequence of dicts with ``start``/``end`` and optionally
            ``text``/``trans_text`` keys.
        target_lang: Language code forwarded to ``engine.synthesize``.
        media_module: Object providing ``fit_audio(path, duration)``.
        temp_dir: Directory (supports the ``/`` operator) for raw TTS files.
    """
    total = len(chunks)

    for index, chunk in enumerate(chunks, start=1):
        # ``or ""`` also covers an explicit None stored under "trans_text".
        translated_text = (chunk.get("trans_text") or "").strip()
        target_duration = max(0.0, chunk["end"] - chunk["start"])

        if not translated_text or target_duration <= 0:
            chunk["processed_audio"] = None
            continue

        raw_audio_path = temp_dir / f"tts_{index:04d}.mp3"
        rate = engine.calcRate(
            text=translated_text,
            target_dur=target_duration,
            original_text=chunk.get("text", ""),
        )
        await engine.synthesize(
            text=translated_text,
            target_lang=target_lang,
            out_path=raw_audio_path,
            rate=rate,
        )
        chunk["processed_audio"] = media_module.fit_audio(raw_audio_path, target_duration)

        # Report on the first, every tenth, and the last chunk only.
        if index == 1 or index % 10 == 0 or index == total:
            print(f"[-] Dub synthesis progress: {index}/{total}")


def main() -> None:
    """Run the full YouTube Auto Dub pipeline.

    Steps: download, transcription, chunking, translation, dub synthesis,
    subtitle generation and final rendering.

    Raises:
        SystemExit: with status 1 on missing dependencies, invalid
            translation config, download failure, an empty transcription,
            or rendering failure, so the shell sees a non-zero exit code.
    """
    parser = build_parser()
    args = parser.parse_args()

    # Imported after argument parsing (presumably so --help stays fast).
    import src.engines
    import src.media
    import src.youtube

    print("\n" + "=" * 60)
    print("YOUTUBE AUTO SUB - INITIALIZING")
    print("=" * 60)

    _check_deps()

    try:
        translation_config = _build_translation_config(args)
    except ConfigurationError as exc:
        print(f"[!] INVALID TRANSLATION CONFIG: {exc}")
        raise SystemExit(1) from exc

    _cleanup()

    # NOTE(review): args.gpu is parsed but never consulted in this file; the
    # device is always auto-detected. Confirm whether --gpu should gate it.
    device = _detect_device()

    print(f"[*] Using device: {device.upper()}")
    print(f"[*] Translation backend: {translation_config.backend}")
    print(f"[*] LM Studio endpoint: {translation_config.base_url}")
    print(f"[*] LM Studio model: {translation_config.model}")

    if args.whisper_model:
        src.engines.ASR_MODEL = args.whisper_model
        print(f"[*] Using specified Whisper model: {args.whisper_model}")
    else:
        print(f"[*] Auto-selected Whisper model: {src.engines.ASR_MODEL} (based on VRAM)")

    # Sentinel so the finally-clause can tell whether construction succeeded.
    engine = None
    try:
        source_language_hint = _get_source_language_hint()
        if source_language_hint:
            print(f"[*] Source language hint: {source_language_hint}")

        engine = src.engines.Engine(
            device,
            translation_config=translation_config,
            source_language_hint=source_language_hint,
        )

        print(f"\n{'=' * 60}")
        print("STEP 1: DOWNLOADING CONTENT")
        print(f"{'=' * 60}")
        print(f"[*] Target URL: {args.url}")
        print(f"[*] Target Language: {args.lang.upper()}")

        try:
            video_path = src.youtube.downloadVideo(
                args.url,
                browser=args.browser,
                cookies_file=args.cookies,
            )
            audio_path = src.youtube.downloadAudio(
                args.url,
                browser=args.browser,
                cookies_file=args.cookies,
            )
            print(f"[+] Video downloaded: {video_path}")
            print(f"[+] Audio extracted: {audio_path}")
        except Exception as exc:
            print(f"\n[!] DOWNLOAD FAILED: {exc}")
            print("\n[-] TROUBLESHOOTING TIPS:")
            print(" 1. Close all browser windows if using --browser")
            print(" 2. Export fresh cookies.txt and use --cookies")
            print(" 3. Check if video is private/region-restricted")
            print(" 4. Verify YouTube URL is correct")
            # A failed run must not report success (exit status 0).
            raise SystemExit(1) from exc

        print(f"\n{'=' * 60}")
        print("STEP 2: SPEECH TRANSCRIPTION")
        print(f"{'=' * 60}")
        print(f"[*] Transcribing audio with Whisper ({src.engines.ASR_MODEL})...")

        raw_segments = engine.transcribeSafe(audio_path)
        print(f"[+] Transcription complete: {len(raw_segments)} segments")
        if raw_segments:
            print(f"[*] Sample segment: '{raw_segments[0]['text'][:50]}...'")

        print(f"\n{'=' * 60}")
        print("STEP 3: INTELLIGENT CHUNKING")
        print(f"{'=' * 60}")

        chunks = src.engines.smartChunk(raw_segments)
        print(f"[+] Optimized {len(raw_segments)} raw segments into {len(chunks)} chunks")

        if not chunks:
            # Without this guard the average below divides by zero and the
            # user only sees an opaque "division by zero" error.
            print("\n[!] No speech segments were produced; nothing to translate or dub.")
            raise SystemExit(1)

        total_duration = sum(c["end"] - c["start"] for c in chunks)
        print(f"[*] Average chunk duration: {total_duration / len(chunks):.2f}s")

        print(f"\n{'=' * 60}")
        print(f"STEP 4: TRANSLATION ({args.lang.upper()})")
        print(f"{'=' * 60}")

        texts = [chunk["text"] for chunk in chunks]
        print(f"[*] Translating {len(texts)} text segments...")
        translated_texts = engine.translateSafe(texts, args.lang)

        for index, chunk in enumerate(chunks):
            chunk["trans_text"] = translated_texts[index]

        print("[+] Translation complete")
        original = chunks[0]["text"][:50]
        translated = chunks[0]["trans_text"][:50]
        print(f"[*] Sample: '{original}' -> '{translated}'")

        print(f"\n{'=' * 60}")
        print("STEP 5: DUB AUDIO SYNTHESIS")
        print(f"{'=' * 60}")
        print(f"[*] Synthesizing dubbed speech for {len(chunks)} translated chunks...")
        asyncio.run(
            _synthesize_dub_audio(engine, chunks, args.lang, src.media, src.engines.TEMP_DIR)
        )

        concat_manifest_path = src.engines.TEMP_DIR / "dub_audio_manifest.txt"
        silence_ref_path = src.engines.TEMP_DIR / "silence_ref.wav"
        src.media.create_concat_file(chunks, silence_ref_path, concat_manifest_path)
        print(f"[+] Dub audio manifest generated: {concat_manifest_path}")

        print(f"\n{'=' * 60}")
        print("STEP 6: SUBTITLE GENERATION")
        print(f"{'=' * 60}")

        subtitle_path = src.engines.TEMP_DIR / "subtitles.srt"
        src.media.generate_srt(chunks, subtitle_path)
        print(f"[+] Subtitles generated: {subtitle_path}")

        print(f"\n{'=' * 60}")
        print("STEP 7: FINAL VIDEO RENDERING")
        print(f"{'=' * 60}")

        try:
            video_name = video_path.stem
            output_name = f"dubbed_{args.lang}_{video_name}.mp4"
            final_output = src.engines.OUTPUT_DIR / output_name

            print("[*] Rendering final video with dubbed audio and subtitles...")
            print(f" Source: {video_path}")
            print(f" Output: {final_output}")
            print(f" Dub audio manifest: {concat_manifest_path}")
            print(f" Subtitles: {subtitle_path}")

            src.media.render_video(
                video_path,
                concat_manifest_path,
                final_output,
                subtitle_path=subtitle_path,
            )

            if final_output.exists():
                file_size = final_output.stat().st_size / (1024 * 1024)
                print("\n[+] SUCCESS! Video rendered successfully.")
                print(f" Output: {final_output}")
                print(f" Size: {file_size:.1f} MB")
            else:
                print(f"\n[!] ERROR: Output file not created at {final_output}")
                # SystemExit is not an Exception subclass, so it bypasses
                # the handler below and reaches the caller unchanged.
                raise SystemExit(1)
        except Exception as exc:
            print(f"\n[!] RENDERING FAILED: {exc}")
            print("[-] This may be due to:")
            print(" 1. Corrupted audio chunks")
            print(" 2. FFmpeg compatibility issues")
            print(" 3. Insufficient disk space")
            raise SystemExit(1) from exc
    finally:
        # Release the translator even when a step above aborts the run.
        if engine is not None:
            engine.translator.close()

    print(f"\n{'=' * 60}")
    print("YOUTUBE AUTO SUB - PIPELINE COMPLETE")
    print(f"{'=' * 60}")


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\n[!] Process interrupted by user")
        raise SystemExit(1)
    except Exception as exc:
        print(f"\n[!] UNEXPECTED ERROR: {exc}")
        print("[-] Please report this issue with the full error message")
        raise SystemExit(1) from exc