baseline: initial working version

This commit is contained in:
2026-03-30 18:18:41 +01:00
commit 27cfe2a3f5
19 changed files with 3878 additions and 0 deletions

4
src/__init__.py Normal file
View File

@@ -0,0 +1,4 @@
"""YouTube Auto Dub - Automated Video Translation and Dubbing"""
__version__ = "1.0.0"
__author__ = "Nguyen Cong Thuan Huy (mangodxd)"

181
src/core_utils.py Normal file
View File

@@ -0,0 +1,181 @@
"""Core utilities and exceptions for YouTube Auto Sub.
This module consolidates shared utilities, exceptions, and helper functions
used across the entire pipeline to reduce code duplication.
Author: Nguyen Cong Thuan Huy (mangodxd)
Version: 1.0.0
"""
import subprocess
import time
import traceback
from pathlib import Path
from typing import Dict, List, Optional, Union
class YouTubeAutoSubError(Exception):
    """Common ancestor for every error raised by this project; catch this for any pipeline failure."""


class ModelLoadError(YouTubeAutoSubError):
    """An AI/ML model could not be loaded."""


class AudioProcessingError(YouTubeAutoSubError):
    """An audio processing operation failed."""


class TranscriptionError(YouTubeAutoSubError):
    """Speech-to-text transcription failed."""


class TranslationError(YouTubeAutoSubError):
    """Text translation failed."""


class TTSError(YouTubeAutoSubError):
    """Text-to-speech synthesis failed."""


class VideoProcessingError(YouTubeAutoSubError):
    """A video processing operation failed."""


class ConfigurationError(YouTubeAutoSubError):
    """Configuration is invalid or missing."""


class DependencyError(YouTubeAutoSubError):
    """A required dependency is unavailable."""


class ValidationError(YouTubeAutoSubError):
    """Input validation failed."""


class ResourceError(YouTubeAutoSubError):
    """System resources are insufficient for the operation."""
def _handleError(error: Exception, context: str = "") -> None:
"""Centralized error handling with context.
Args:
error: The exception that occurred.
context: Additional context about where the error occurred.
Returns:
None
"""
if context:
print(f"[!] ERROR in {context}: {error}")
else:
print(f"[!] ERROR: {error}")
print(f" Full traceback: {traceback.format_exc()}")
def _runFFmpegCmd(cmd: List[str], timeout: int = 300, description: str = "FFmpeg operation") -> None:
"""Run FFmpeg command with consistent error handling.
Args:
cmd: FFmpeg command to run.
timeout: Command timeout in seconds.
description: Description for error messages.
Raises:
RuntimeError: If FFmpeg command fails.
"""
try:
subprocess.run(cmd, check=True, timeout=timeout)
except subprocess.TimeoutExpired:
raise RuntimeError(f"{description} timed out")
except subprocess.CalledProcessError as e:
raise RuntimeError(f"{description} failed: {e}")
except Exception as e:
raise RuntimeError(f"Unexpected error during {description}: {e}")
def _validateAudioFile(file_path: Path, min_size: int = 1024) -> bool:
"""Validate that audio file exists and has minimum size.
Args:
file_path: Path to audio file.
min_size: Minimum file size in bytes.
Returns:
True if file is valid, False otherwise.
"""
if not file_path.exists():
return False
if file_path.stat().st_size < min_size:
return False
return True
def _safeFileDelete(file_path: Path) -> None:
"""Safely delete file with error handling.
Args:
file_path: Path to file to delete.
Returns:
None
"""
try:
if file_path.exists():
file_path.unlink()
except Exception as e:
print(f"[!] WARNING: Could not delete file {file_path}: {e}")
class ProgressTracker:
    """Console progress reporter for long-running batch operations."""

    def __init__(self, total: int, description: str = "Processing", update_interval: int = 10):
        """Set up the tracker.

        Args:
            total: Expected number of items to process.
            description: Label shown in progress lines.
            update_interval: Print a progress line every N processed items.
        """
        self.total = total
        self.description = description
        self.update_interval = update_interval
        self.current = 0

    def update(self, increment: int = 1) -> None:
        """Advance the counter and print progress at interval boundaries.

        Args:
            increment: Number of items just processed.

        Returns:
            None
        """
        self.current += increment
        finished = self.current >= self.total
        if finished or self.current % self.update_interval == 0:
            pct = (self.current / self.total) * 100
            print(f"[-] {self.description}: {self.current}/{self.total} ({pct:.1f}%)", end='\r')
            if finished:
                print()

547
src/engines.py Normal file
View File

@@ -0,0 +1,547 @@
"""
AI/ML Engines Module for YouTube Auto Dub.
This module provides the core AI/ML functionality including:
- Device and configuration management
- Whisper-based speech transcription
- LM Studio translation integration
- Edge TTS synthesis
- Pipeline orchestration and chunking
Author: Nguyen Cong Thuan Huy (mangodxd)
Version: 1.0.0
"""
import torch
import asyncio
import edge_tts
import gc
import json
import os
from abc import ABC
import numpy as np
from pathlib import Path
from typing import List, Dict, Optional, Union, Any
# Local imports
from src.core_utils import (
ModelLoadError, TranscriptionError, TranslationError, TTSError,
AudioProcessingError, _handleError, _runFFmpegCmd, ProgressTracker,
_validateAudioFile, _safeFileDelete
)
from src.translation import LMStudioTranslator, TranslationConfig
# =============================================================================
# CONFIGURATION
# =============================================================================
# Base directory of the project (two levels above this file: src/ -> repo root)
BASE_DIR = Path(__file__).resolve().parent.parent
# Working directories
CACHE_DIR = BASE_DIR / ".cache"
OUTPUT_DIR = BASE_DIR / "output"
TEMP_DIR = BASE_DIR / "temp"
# Configuration files
LANG_MAP_FILE = BASE_DIR / "language_map.json"
# Ensure directories exist (created eagerly at import time)
for directory_path in [CACHE_DIR, OUTPUT_DIR, TEMP_DIR]:
    directory_path.mkdir(parents=True, exist_ok=True)
# Audio processing settings
# NOTE(review): 24 kHz mono — presumably chosen to match the TTS output format; confirm.
SAMPLE_RATE = 24000
AUDIO_CHANNELS = 1
def _select_optimal_whisper_model(device: str = "cpu") -> str:
"""Select optimal Whisper model based on available VRAM and device.
Args:
device: Device type ('cuda' or 'cpu').
Returns:
Optimal Whisper model name.
"""
if device == "cpu":
return "base" # CPU works best with base model
try:
import torch
if not torch.cuda.is_available():
return "base"
# Get VRAM information
gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3) # GB
if gpu_memory < 4:
return "tiny" # < 4GB VRAM
elif gpu_memory < 8:
return "base" # 4-8GB VRAM
elif gpu_memory < 12:
return "small" # 8-12GB VRAM
elif gpu_memory < 16:
return "medium" # 12-16GB VRAM
else:
return "large-v3" # > 16GB VRAM - use latest large model
except Exception:
return "base" # Fallback to base if detection fails
# Model and voice defaults, resolved once at import time.
ASR_MODEL = _select_optimal_whisper_model(device="cuda" if torch.cuda.is_available() else "cpu")
DEFAULT_VOICE = "en-US-AriaNeural"  # fallback Edge TTS voice when no pool matches
# Load language configuration (mapping of language code -> settings/voices).
try:
    with open(LANG_MAP_FILE, "r", encoding="utf-8") as f:
        LANG_DATA = json.load(f)
    print(f"[*] Loaded language configuration for {len(LANG_DATA)} languages")
except (FileNotFoundError, json.JSONDecodeError) as e:
    # NOTE(review): 'e' is captured but not included in the warning — consider adding it.
    print(f"[!] WARNING: Could not load language map from {LANG_MAP_FILE}")
    LANG_DATA = {}
class DeviceManager:
    """Detects the best available compute device and manages its memory."""

    def __init__(self, device: Optional[str] = None):
        """Resolve the target device.

        Args:
            device: Explicit device name ('cuda', 'mps', 'cpu'); when None,
                auto-detects preferring MPS (macOS), then CUDA, then CPU.
        """
        if device is None:
            device = (
                "mps" if torch.backends.mps.is_available()
                else "cuda" if torch.cuda.is_available()
                else "cpu"
            )
        self.device = device
        self._logDeviceInfo()

    def _logDeviceInfo(self) -> None:
        """Print a one-line summary of the selected device (plus GPU details on CUDA)."""
        print(f"[*] Device initialized: {self.device.upper()}")
        if self.device != "cuda":
            return
        gpu_name = torch.cuda.get_device_name(0)
        gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        print(f" GPU: {gpu_name} | VRAM: {gpu_memory:.1f} GB")

    def getMemoryInfo(self) -> Dict[str, float]:
        """Report GPU memory usage in GB.

        Returns:
            Dict with 'allocated' and 'reserved' memory in GB (zeros on
            non-CUDA devices).
        """
        if self.device != "cuda":
            return {"allocated": 0.0, "reserved": 0.0}
        gib = 1024 ** 3
        return {
            "allocated": torch.cuda.memory_allocated(0) / gib,
            "reserved": torch.cuda.memory_reserved(0) / gib,
        }

    def clearCache(self) -> None:
        """Free the CUDA cache (when applicable) and run Python garbage collection."""
        if self.device == "cuda":
            torch.cuda.empty_cache()
        gc.collect()
class ConfigManager:
    """Read-only access to the language/voice configuration table."""

    def getLanguageConfig(self, lang_code: str) -> Dict[str, Any]:
        """Look up the configuration for a language.

        Args:
            lang_code: ISO language code.

        Returns:
            Configuration dict for the language (empty dict when unknown).
        """
        return LANG_DATA.get(lang_code, {})

    def extractVoice(self, voice_data, fallback_gender: str = "female") -> str:
        """Coerce a voice entry of any supported shape to a single voice id.

        Args:
            voice_data: Voice entry as a string, list of strings, or anything else.
            fallback_gender: Kept for interface compatibility (unused here).

        Returns:
            A single voice identifier suitable for TTS.
        """
        if isinstance(voice_data, str):
            return voice_data
        if isinstance(voice_data, list) and voice_data:
            return voice_data[0]
        return DEFAULT_VOICE

    def getVoicePool(self, lang_code: str, gender: str) -> list:
        """List candidate voices for a language and gender.

        Args:
            lang_code: ISO language code.
            gender: Voice gender ('male'/'female').

        Returns:
            List of voice identifiers (defaults to [DEFAULT_VOICE]).
        """
        pool = self.getLanguageConfig(lang_code).get('voices', {}).get(gender, [DEFAULT_VOICE])
        return [pool] if isinstance(pool, str) else pool
class PipelineComponent(ABC):
    """Common plumbing (device/config access, path checks) for pipeline stages."""

    def __init__(self, device_manager: DeviceManager, config_manager: ConfigManager):
        """Store the shared managers and cache the active device name.

        Args:
            device_manager: Device management instance.
            config_manager: Configuration management instance.
        """
        self.device_manager = device_manager
        self.config_manager = config_manager
        self.device = device_manager.device

    def _validateFileExists(self, file_path: Path, description: str = "File") -> None:
        """Raise when a required file is missing.

        Args:
            file_path: Path to check.
            description: Label used in the error message.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        if file_path.exists():
            return
        raise FileNotFoundError(f"{description} not found: {file_path}")

    def _ensureDirectory(self, directory: Path) -> None:
        """Create a directory (and parents) if it does not already exist.

        Args:
            directory: Directory path to ensure.
        """
        directory.mkdir(parents=True, exist_ok=True)
# =============================================================================
# MAIN AI/ML ENGINE
# =============================================================================
class Engine(PipelineComponent):
    """Central AI/ML engine for YouTube Auto Dub pipeline.

    Facade over the three AI stages: Whisper transcription (lazy-loaded),
    LM Studio translation, and Edge TTS synthesis.
    """

    def __init__(
        self,
        device: Optional[str] = None,
        translation_config: Optional[TranslationConfig] = None,
        source_language_hint: Optional[str] = None,
    ):
        """Initialize the engine.

        Args:
            device: Compute device ('cuda', 'mps', 'cpu'); auto-detected when None.
            translation_config: Translator settings; read from env when omitted.
            source_language_hint: Optional ISO code forcing the source language
                (also read from the SOURCE_LANGUAGE_HINT environment variable).
        """
        device_manager = DeviceManager(device)
        config_manager = ConfigManager()
        super().__init__(device_manager, config_manager)
        self._asr = None  # lazily created by the asrModel property
        self.source_language_hint = (source_language_hint or os.getenv("SOURCE_LANGUAGE_HINT") or "").strip()
        self.detected_source_lang = self.source_language_hint or "auto"
        self.translation_config = translation_config or TranslationConfig.from_env()
        self.translator = LMStudioTranslator(self.translation_config)
        print("[+] AI Engine initialized successfully")

    @property
    def asrModel(self):
        """Lazy-load the Whisper ASR model.

        Returns:
            Loaded faster-whisper WhisperModel instance.

        Raises:
            ModelLoadError: If the model fails to load.
        """
        if self._asr is None:
            print(f"[*] Loading Whisper model ({ASR_MODEL}) on {self.device}...")
            try:
                from faster_whisper import WhisperModel
                # float16 requires CUDA; int8 keeps CPU/MPS memory usage modest.
                compute_type = "float16" if self.device == "cuda" else "int8"
                self._asr = WhisperModel(ASR_MODEL, device=self.device, compute_type=compute_type)
                print("[+] Whisper model loaded successfully")
            except Exception as e:
                raise ModelLoadError(f"Failed to load Whisper model: {e}") from e
        return self._asr

    def _getLangConfig(self, lang: str) -> Dict:
        """Return the configuration dict for language code `lang`."""
        return self.config_manager.getLanguageConfig(lang)

    def _extractVoiceString(self, voice_data: Union[str, List[str], None]) -> str:
        """Coerce a voice entry of any supported shape to a single voice id."""
        return self.config_manager.extractVoice(voice_data)

    def releaseMemory(self, component: Optional[str] = None) -> None:
        """Release VRAM and clean up GPU memory.

        Args:
            component: Specific component to release ('asr').
                If None, releases all components.

        Returns:
            None
        """
        if component in (None, 'asr') and self._asr:
            del self._asr
            self._asr = None
            print("[*] ASR VRAM cleared")
        self.device_manager.clearCache()

    def transcribeSafe(self, audio_path: Path) -> List[Dict]:
        """Transcribe audio, releasing ASR memory after a successful run.

        Args:
            audio_path: Path to audio file.

        Returns:
            List of transcription segments with timing.

        Raises:
            TranscriptionError: If transcription fails.
        """
        try:
            res = self.transcribe(audio_path)
            self.releaseMemory('asr')
            return res
        except Exception as e:
            _handleError(e, "transcription")
            raise TranscriptionError(f"Transcription failed: {e}") from e

    def translateSafe(self, texts: List[str], target_lang: str) -> List[str]:
        """Translate texts after freeing GPU memory held by other components.

        Args:
            texts: List of text strings to translate.
            target_lang: Target language code.

        Returns:
            List of translated text strings.
        """
        self.releaseMemory()
        return self.translate(texts, target_lang)

    def transcribe(self, audio_path: Path) -> List[Dict]:
        """Transcribe audio using the Whisper model.

        Args:
            audio_path: Path to audio file.

        Returns:
            List of dicts with 'start'/'end' seconds and stripped 'text'.
        """
        # BUG FIX: honor the explicit source-language hint during decoding.
        # Previously the hint only relabeled detected_source_lang while Whisper
        # still auto-detected (language=None was always passed).
        forced_lang = self.source_language_hint or None
        segments, info = self.asrModel.transcribe(str(audio_path), word_timestamps=False, language=forced_lang)
        detected = getattr(info, "language", "auto") or "auto"
        self.detected_source_lang = self.source_language_hint or detected
        print(f"[*] Detected source language: {self.detected_source_lang}")
        return [{'start': s.start, 'end': s.end, 'text': s.text.strip()} for s in segments]

    def translate(self, texts: List[str], target_lang: str) -> List[str]:
        """Translate texts to the target language via LM Studio.

        Args:
            texts: List of text strings to translate.
            target_lang: Target language code.

        Returns:
            List of translated text strings (empty list for empty input).

        Raises:
            TranslationError: If translation fails.
        """
        if not texts:
            return []
        print(f"[*] Translating {len(texts)} segments to '{target_lang}'...")
        source_lang = self.detected_source_lang or "auto"
        try:
            return self.translator.translate_segments(
                texts=texts,
                target_language=target_lang,
                source_language=source_lang,
            )
        except Exception as e:
            _handleError(e, "translation")
            raise TranslationError(f"Translation failed: {e}") from e

    def calcRate(self, text: str, target_dur: float, original_text: str = "") -> str:
        """Calculate speech rate adjustment for TTS with dynamic limits.

        Args:
            text: Text to be synthesized (translated text).
            target_dur: Target duration in seconds.
            original_text: Original text for length comparison (optional).

        Returns:
            Rate adjustment string (e.g., '+10%', '-5%').
        """
        words = len(text.split())
        if words == 0 or target_dur <= 0:
            return "+0%"
        # BUG FIX: the previous code derived the rate from the target duration
        # itself (wps = words / target_dur; estimated = words / wps), which
        # made the estimate always equal target_dur, so this method always
        # returned "+0%". Estimate from a nominal speaking rate instead.
        # NOTE(review): 2.5 words/sec approximates a neutral TTS pace — tune
        # per-voice if needed.
        nominal_wps = 2.5
        estimated_time = words / nominal_wps
        if estimated_time <= target_dur:
            return "+0%"
        ratio = estimated_time / target_dur
        speed_percent = int((ratio - 1) * 100)
        # Dynamic speed limits based on text length comparison
        if original_text:
            orig_len = len(original_text.split())
            trans_len = words
            if trans_len > orig_len * 1.5:
                # Translation much longer: allow up to -25% slowdown
                speed_percent = max(-25, min(speed_percent, 90))
            elif trans_len < orig_len * 0.7:
                # Translation much shorter: be conservative with speedup
                speed_percent = max(-15, min(speed_percent, 50))
            else:
                # Normal case: clamp to -10% .. 90%
                speed_percent = max(-10, min(speed_percent, 90))
        else:
            # No reference text: original clamp range
            speed_percent = max(-10, min(speed_percent, 90))
        return f"{speed_percent:+d}%"

    async def synthesize(
        self,
        text: str,
        target_lang: str,
        out_path: Path,
        gender: str = "female",
        rate: str = "+0%"
    ) -> None:
        """Synthesize speech for `text` into `out_path` using Edge TTS.

        Args:
            text: Text to speak (must be non-empty).
            target_lang: Target language code used to pick a voice.
            out_path: Destination audio file.
            gender: Preferred voice gender ('male'/'female').
            rate: Edge TTS rate adjustment string (e.g. '+10%').

        Raises:
            ValueError: If `text` is empty.
            TTSError: If synthesis fails or the output file is invalid.
        """
        if not text.strip():
            raise ValueError("Text empty")
        out_path.parent.mkdir(parents=True, exist_ok=True)
        try:
            # (Removed unused `lang_cfg` lookup present in the original.)
            voice_pool = self.config_manager.getVoicePool(target_lang, gender)
            voice = voice_pool[0] if voice_pool else DEFAULT_VOICE
            communicate = edge_tts.Communicate(text, voice=voice, rate=rate)
            await communicate.save(str(out_path))
            # Reject suspiciously small files (failed/truncated synthesis).
            if not out_path.exists() or out_path.stat().st_size < 1024:
                raise RuntimeError("TTS file invalid")
        except Exception as e:
            if out_path.exists():
                out_path.unlink(missing_ok=True)
            _handleError(e, "TTS synthesis")
            raise TTSError(f"TTS failed: {e}") from e
def smartChunk(segments: List[Dict]) -> List[Dict]:
    """Merge raw ASR segments into TTS-friendly chunks.

    Splits at natural pauses (gaps above 1.5x the average gap, floored at
    0.4 s) or when a chunk would exceed a content-derived maximum duration.

    Args:
        segments: Dicts with 'start', 'end' (seconds) and 'text'.

    Returns:
        Merged chunk dicts with combined text; empty list for empty input.
    """
    count = len(segments)
    if count == 0:
        return []
    durations = [seg['end'] - seg['start'] for seg in segments]
    gaps = [segments[idx]['start'] - segments[idx - 1]['end'] for idx in range(1, count)]
    avg_seg_dur = sum(durations) / count
    avg_gap = sum(gaps) / len(gaps) if gaps else 0.5
    # Content-derived duration bounds.
    min_dur = max(1.0, avg_seg_dur * 0.5)
    max_dur = np.percentile(durations, 90) if count > 5 else min(15.0, avg_seg_dur * 3)
    max_dur = max(5.0, min(30.0, max_dur))
    # Split on pauses clearly longer than typical (floored at 0.4 s).
    gap_threshold = max(0.4, avg_gap * 1.5)

    def _close(group):
        # Collapse a run of segments into one chunk.
        return {
            'start': group[0]['start'],
            'end': group[-1]['end'],
            'text': " ".join(seg['text'] for seg in group).strip()
        }

    chunks = []
    pending = [segments[0]]
    for idx in range(1, count):
        curr = segments[idx]
        pause = curr['start'] - segments[idx - 1]['end']
        running_dur = curr['end'] - pending[0]['start']
        if pause > gap_threshold or running_dur > max_dur:
            chunks.append(_close(pending))
            pending = [curr]
        else:
            pending.append(curr)
    if pending:
        chunks.append(_close(pending))
    print(f"[+] Smart chunking: {len(chunks)} chunks (Dynamic: min={min_dur:.1f}s, max={max_dur:.1f}s, gap_thr={gap_threshold:.2f}s)")
    return chunks

410
src/media.py Normal file
View File

@@ -0,0 +1,410 @@
"""Media Processing Module for YouTube Auto Dub.
This module handles all audio/video processing operations using FFmpeg.
It provides functionality for:
- Audio duration detection and analysis
- Silence generation for gap filling
- Audio time-stretching and duration fitting (PADDING logic added)
- Video concatenation and rendering (Volume Mixing fixed)
- Audio synchronization and mixing
Author: Nguyen Cong Thuan Huy (mangodxd)
Version: 1.1.0 (Patched)
"""
import subprocess
from pathlib import Path
from typing import List, Dict, Optional
from src.engines import SAMPLE_RATE, AUDIO_CHANNELS
def _build_subtitle_filter(subtitle_path: Path) -> str:
"""Build a Windows-safe FFmpeg subtitles filter expression."""
escaped_path = str(subtitle_path.resolve()).replace("\\", "/").replace(":", "\\:")
return f"subtitles=filename='{escaped_path}'"
def _render_with_soft_subtitles(video_path: Path, output_path: Path, subtitle_path: Path) -> None:
    """Fallback render path that muxes subtitles instead of hard-burning them.

    Streams are copied untouched; the subtitle file is added as a mov_text track.
    """
    cmd = ['ffmpeg', '-y', '-v', 'error', '-i', str(video_path), '-i', str(subtitle_path)]
    cmd += ['-map', '0:v', '-map', '0:a?', '-map', '1:0']
    cmd += ['-c:v', 'copy', '-c:a', 'copy', '-c:s', 'mov_text']
    cmd.append(str(output_path))
    subprocess.run(cmd, check=True, timeout=None)
def _render_mixed_with_soft_subtitles(
    video_path: Path,
    concat_file: Path,
    output_path: Path,
    subtitle_path: Path,
    filter_complex: str,
) -> None:
    """Fallback render path that muxes subtitles while preserving mixed dubbed audio.

    Re-encodes the mixed audio ([outa] from `filter_complex`) to AAC and adds
    the subtitle file as a mov_text track; video is stream-copied.
    """
    cmd = ['ffmpeg', '-y', '-v', 'error', '-i', str(video_path)]
    cmd += ['-f', 'concat', '-safe', '0', '-i', str(concat_file)]
    cmd += ['-i', str(subtitle_path)]
    cmd += ['-filter_complex', filter_complex]
    cmd += ['-map', '0:v', '-map', '[outa]', '-map', '2:0']
    cmd += ['-c:v', 'copy']
    cmd += ['-c:a', 'aac', '-b:a', '192k', '-ar', str(SAMPLE_RATE), '-ac', str(AUDIO_CHANNELS)]
    cmd += ['-c:s', 'mov_text', '-shortest']
    cmd.append(str(output_path))
    subprocess.run(cmd, check=True, timeout=None)
def _get_duration(path: Path) -> float:
"""Get the duration of an audio/video file using FFprobe."""
if not path.exists():
print(f"[!] ERROR: Media file not found: {path}")
return 0.0
try:
cmd = [
'ffprobe', '-v', 'error',
'-show_entries', 'format=duration',
'-of', 'default=noprint_wrappers=1:nokey=1',
str(path)
]
result = subprocess.run(
cmd,
capture_output=True,
text=True,
check=True,
timeout=60 # Increased from 30s to 60s for better reliability
)
duration_str = result.stdout.strip()
if duration_str:
return float(duration_str)
else:
return 0.0
except Exception as e:
print(f"[!] ERROR: Getting duration failed for {path}: {e}")
return 0.0
def _generate_silence_segment(duration: float, silence_ref: Path) -> Optional[Path]:
"""Generate a small silence segment for the concat list."""
if duration <= 0:
return None
# Use the parent folder of the reference silence file
output_path = silence_ref.parent / f"gap_{duration:.4f}.wav"
if output_path.exists():
return output_path
try:
cmd = [
'ffmpeg', '-y', '-v', 'error',
'-f', 'lavfi', '-i', f'anullsrc=r={SAMPLE_RATE}:cl=mono',
'-t', f"{duration:.4f}",
'-c:a', 'pcm_s16le',
str(output_path)
]
subprocess.run(cmd, check=True)
return output_path
except Exception:
return None
def _analyze_audio_loudness(audio_path: Path) -> Optional[float]:
"""Analyze audio loudness using FFmpeg volumedetect filter.
Args:
audio_path: Path to audio file to analyze.
Returns:
Mean volume in dB, or None if analysis fails.
"""
if not audio_path.exists():
return None
try:
cmd = [
'ffmpeg', '-y', '-v', 'error',
'-i', str(audio_path),
'-filter:a', 'volumedetect',
'-f', 'null', '-'
]
result = subprocess.run(cmd, capture_output=True, text=True, check=True, timeout=30)
# Parse mean volume from output
for line in result.stderr.split('\n'):
if 'mean_volume:' in line:
# Extract dB value from line like: "mean_volume: -15.2 dB"
parts = line.split()
if len(parts) >= 2:
try:
return float(parts[1])
except ValueError:
continue
return None
except Exception:
return None
def fit_audio(audio_path: Path, target_dur: float) -> Path:
    """Stretch or pad an audio file so its duration matches `target_dur`.

    Audio longer than the target (beyond a 0.15 s tolerance) is sped up with
    chained `atempo` filters (capped at 1.5x per stage to avoid artifacts);
    shorter or near-target audio is padded with silence. The output is always
    truncated to the target duration.

    Args:
        audio_path: Source audio file.
        target_dur: Desired duration in seconds.

    Returns:
        Path to the fitted '*_fit.wav' file, or the original path when fitting
        is impossible or fails (missing file, zero duration, FFmpeg error).
    """
    if not audio_path.exists() or target_dur <= 0:
        return audio_path
    actual_dur = _get_duration(audio_path)
    if actual_dur == 0.0:
        return audio_path
    out_path = audio_path.parent / f"{audio_path.stem}_fit.wav"
    # 0.15 s tolerance keeps audio natural instead of micro-adjusting speed.
    if actual_dur > target_dur + 0.15:
        ratio = actual_dur / target_dur
        filter_chain = []
        current_ratio = ratio
        # Cap each atempo stage at 1.5x to avoid a "chipmunk" voice.
        max_speed_ratio = 1.5
        while current_ratio > max_speed_ratio:
            filter_chain.append(f"atempo={max_speed_ratio}")
            current_ratio /= max_speed_ratio
        if current_ratio > 1.0:
            filter_chain.append(f"atempo={current_ratio:.4f}")
        filter_complex = ",".join(filter_chain)
        # CONSISTENCY FIX: resample with the shared SAMPLE_RATE constant
        # instead of a hard-coded 24000 so this stays in sync with the pipeline.
        audio_filter = f"{filter_complex},aresample={SAMPLE_RATE}"
    else:
        # Shorter (or near-target) audio: pad with silence, then truncate.
        audio_filter = f"apad,aresample={SAMPLE_RATE}"
    cmd = [
        'ffmpeg', '-y', '-v', 'error',
        '-i', str(audio_path),
        '-filter:a', audio_filter,
        '-t', f"{target_dur:.4f}",
        '-c:a', 'pcm_s16le',
        str(out_path)
    ]
    print(f"Fitting {actual_dur:.4f}s to {target_dur:.4f}s")  # typo fix: was "Fiting"
    try:
        subprocess.run(cmd, check=True, timeout=120)
        return out_path
    except Exception:
        return audio_path
def create_concat_file(segments: List[Dict], silence_ref: Path, output_txt: Path) -> None:
    """Write an FFmpeg concat manifest covering `segments`, gap-filled with silence.

    Gaps between the running timeline and the next segment (over 10 ms) get a
    generated silence entry; segments without a usable 'processed_audio' file
    are replaced by silence of the same span.

    Args:
        segments: Dicts with 'start', 'end' and optional 'processed_audio' Path.
        silence_ref: Reference path whose folder holds generated silence files.
        output_txt: Destination manifest file.

    Raises:
        RuntimeError: If the manifest cannot be written.
    """
    if not segments:
        return
    try:
        with open(output_txt, 'w', encoding='utf-8') as manifest:
            timeline = 0.0
            for seg in segments:
                seg_start, seg_end = seg['start'], seg['end']
                clip = seg.get('processed_audio')
                lead_gap = seg_start - timeline
                if lead_gap > 0.01:
                    filler = _generate_silence_segment(lead_gap, silence_ref)
                    if filler:
                        manifest.write(f"file '{filler.resolve().as_posix()}'\n")
                    timeline += lead_gap
                if clip and clip.exists():
                    manifest.write(f"file '{clip.resolve().as_posix()}'\n")
                    timeline += (seg_end - seg_start)
                else:
                    span = seg_end - seg_start
                    filler = _generate_silence_segment(span, silence_ref)
                    if filler:
                        manifest.write(f"file '{filler.resolve().as_posix()}'\n")
                    timeline += span
    except Exception as exc:
        raise RuntimeError(f"Failed to create concat manifest: {exc}")
def render_video(
    video_path: Path,
    concat_file: Optional[Path],
    output_path: Path,
    subtitle_path: Optional[Path] = None,
) -> None:
    """Render final video with Dynamic Volume Mixing.

    Two paths: when `concat_file` is None the original audio is kept (streams
    copied, subtitles optionally hard-burned); otherwise the dubbed concat
    audio is mixed over an attenuated original track. If hard-burning fails
    because FFmpeg lacks the 'subtitles' filter, falls back to soft (muxed)
    subtitles via the helper renderers.

    Args:
        video_path: Source video file.
        concat_file: FFmpeg concat manifest of dubbed audio, or None to keep
            the original audio untouched.
        output_path: Destination file (parent directories are created).
        subtitle_path: Optional subtitle file to hard-burn.

    Raises:
        FileNotFoundError: If the source video or concat manifest is missing.
        RuntimeError: If FFmpeg fails or no output file is produced.
    """
    if not video_path.exists():
        raise FileNotFoundError("Source video for rendering is missing")
    if concat_file is not None and not concat_file.exists():
        raise FileNotFoundError("Concat audio manifest for rendering is missing")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        print(f"[*] Rendering final video...")
        if concat_file is None:
            # No dubbed audio: copy streams; '0:a?' maps audio only if present.
            video_codec = 'copy'
            cmd = [
                'ffmpeg', '-y', '-v', 'error',
                '-i', str(video_path),
                '-map', '0:v',
                '-map', '0:a?',
            ]
            if subtitle_path:
                # Hard subtitles force a video re-encode.
                video_codec = 'libx264'
                cmd.extend(['-vf', _build_subtitle_filter(subtitle_path)])
            cmd.extend([
                '-c:v', video_codec,
                '-c:a', 'copy',
            ])
            cmd.append(str(output_path))
            try:
                subprocess.run(cmd, check=True, timeout=None, capture_output=True, text=True)
            except subprocess.CalledProcessError as exc:
                # Builds without libass report exactly this filter error.
                if subtitle_path and "No such filter: 'subtitles'" in (exc.stderr or ""):
                    print("[!] FFmpeg subtitles filter is unavailable. Falling back to soft subtitles.")
                    _render_with_soft_subtitles(video_path, output_path, subtitle_path)
                else:
                    raise
            if not output_path.exists():
                raise RuntimeError("Output file not created")
            print(f"[+] Video rendered successfully: {output_path}")
            return
        # DYNAMIC VOLUME MIXING STRATEGY:
        # Analyze original audio loudness to determine optimal background volume
        original_loudness = _analyze_audio_loudness(video_path)
        if original_loudness is not None:
            # Calculate background volume based on loudness analysis
            # Target: voice should be 10-15dB louder than background
            if original_loudness > -10:  # Very loud audio
                bg_volume = 0.08  # 8% - reduce more for loud content
            elif original_loudness > -20:  # Normal audio
                bg_volume = 0.15  # 15% - standard reduction
            else:  # Quiet audio
                bg_volume = 0.25  # 25% - reduce less for quiet content
            print(f"[*] Dynamic volume mixing: original={original_loudness:.1f}dB, bg_volume={bg_volume*100:.0f}%")
        else:
            # Fallback to default if analysis fails
            bg_volume = 0.15
            print(f"[*] Using default volume mixing: bg_volume={bg_volume*100:.0f}%")
        # Attenuated original ([bg]) mixed with dubbed track (input 1) -> [outa].
        filter_complex = (
            f"[0:a]volume={bg_volume}[bg]; "
            "[bg][1:a]amix=inputs=2:duration=first:dropout_transition=0[outa]"
        )
        video_codec = 'copy'
        cmd = [
            'ffmpeg', '-y', '-v', 'error',
            '-i', str(video_path),
            '-f', 'concat', '-safe', '0', '-i', str(concat_file),
            '-filter_complex', filter_complex,
        ]
        # Handle Hard Subtitles (Requires re-encoding)
        if subtitle_path:
            video_codec = 'libx264'
            cmd.extend(['-vf', _build_subtitle_filter(subtitle_path)])
        cmd.extend([
            '-map', '0:v',
            '-map', '[outa]',
            '-c:v', video_codec,
            '-c:a', 'aac', '-b:a', '192k',
            '-ar', str(SAMPLE_RATE),
            '-ac', str(AUDIO_CHANNELS),
            '-shortest'
        ])
        cmd.append(str(output_path))
        # Run rendering
        try:
            subprocess.run(cmd, check=True, timeout=None, capture_output=True, text=True)
        except subprocess.CalledProcessError as exc:
            if subtitle_path and "No such filter: 'subtitles'" in (exc.stderr or ""):
                print("[!] FFmpeg subtitles filter is unavailable. Falling back to soft subtitles.")
                _render_mixed_with_soft_subtitles(
                    video_path=video_path,
                    concat_file=concat_file,
                    output_path=output_path,
                    subtitle_path=subtitle_path,
                    filter_complex=filter_complex,
                )
            else:
                raise
        if not output_path.exists():
            raise RuntimeError("Output file not created")
        print(f"[+] Video rendered successfully: {output_path}")
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"FFmpeg rendering failed: {e}")
    except Exception as e:
        # NOTE(review): this also re-wraps the RuntimeErrors raised above,
        # e.g. "Output file not created" becomes "Rendering error: ..." — confirm intended.
        raise RuntimeError(f"Rendering error: {e}")
def generate_srt(segments: List[Dict], output_path: Path) -> None:
    """Write segments (using their 'trans_text') to an SRT subtitle file.

    Failures are logged as warnings rather than raised; empty input is a no-op.

    Args:
        segments: Dicts with 'start', 'end' and optional 'trans_text'.
        output_path: Destination .srt file (parents are created).
    """
    if not segments:
        return
    output_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        with open(output_path, 'w', encoding='utf-8') as srt:
            for idx, seg in enumerate(segments, 1):
                window = f"{_format_timestamp_srt(seg['start'])} --> {_format_timestamp_srt(seg['end'])}"
                body = seg.get('trans_text', '').strip()
                srt.write(f"{idx}\n{window}\n{body}\n\n")
        print(f"[+] SRT subtitles generated")
    except Exception as e:
        print(f"[!] Warning: SRT generation failed: {e}")
def _format_timestamp_srt(seconds: float) -> str:
"""Convert seconds to HH:MM:SS,mmm."""
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
secs = int(seconds % 60)
millis = int((seconds % 1) * 1000)
return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

358
src/translation.py Normal file
View File

@@ -0,0 +1,358 @@
"""LM Studio translation client for YouTube Auto Dub."""
from __future__ import annotations
import os
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse
import httpx
from src.core_utils import ConfigurationError, TranslationError
# LM Studio defaults; each is overridable via environment variables
# (see TranslationConfig.from_env).
DEFAULT_LM_STUDIO_BASE_URL = "http://127.0.0.1:1234/v1"
# NOTE(review): presumably a placeholder — LM Studio typically does not enforce keys; confirm.
DEFAULT_LM_STUDIO_API_KEY = "lm-studio"
DEFAULT_LM_STUDIO_MODEL = "gemma-3-4b-it"
DEFAULT_TRANSLATION_BACKEND = "lmstudio"
def _normalize_base_url(base_url: str) -> str:
"""Normalize LM Studio base URLs to the OpenAI-compatible /v1 root."""
if not base_url or not isinstance(base_url, str):
raise ConfigurationError("LM Studio base URL must be a non-empty string.")
normalized = base_url.strip().rstrip("/")
if normalized.endswith("/chat/completions"):
normalized = normalized[: -len("/chat/completions")]
if not normalized.endswith("/v1"):
normalized = f"{normalized}/v1"
parsed = urlparse(normalized)
if parsed.scheme not in {"http", "https"} or not parsed.netloc:
raise ConfigurationError(
"LM Studio base URL must be a valid http(s) URL, for example "
"'http://127.0.0.1:1234/v1'."
)
return normalized
@dataclass(frozen=True)
class TranslationConfig:
    """Runtime configuration for the translation backend.

    Immutable (frozen dataclass). Prefer building instances via ``from_env``
    so values can be supplied through environment variables.
    """
    # Backend identifier; only 'lmstudio' is accepted (see validate()).
    backend: str = DEFAULT_TRANSLATION_BACKEND
    # OpenAI-compatible /v1 root of the LM Studio server.
    base_url: str = DEFAULT_LM_STUDIO_BASE_URL
    api_key: str = DEFAULT_LM_STUDIO_API_KEY
    model: str = DEFAULT_LM_STUDIO_MODEL
    timeout_seconds: float = 45.0
    max_retries: int = 3
    retry_backoff_seconds: float = 1.0

    @classmethod
    def from_env(
        cls,
        backend: Optional[str] = None,
        base_url: Optional[str] = None,
        model: Optional[str] = None,
        api_key: Optional[str] = None,
    ) -> "TranslationConfig":
        """Build config from environment variables plus optional overrides.

        Precedence per field: explicit argument, then environment variable
        (TRANSLATION_BACKEND / LM_STUDIO_BASE_URL / LM_STUDIO_API_KEY /
        LM_STUDIO_MODEL), then the module default. The result is validated
        before being returned.
        """
        config = cls(
            backend=(backend or os.getenv("TRANSLATION_BACKEND") or DEFAULT_TRANSLATION_BACKEND).strip().lower(),
            base_url=_normalize_base_url(base_url or os.getenv("LM_STUDIO_BASE_URL") or DEFAULT_LM_STUDIO_BASE_URL),
            api_key=api_key or os.getenv("LM_STUDIO_API_KEY") or DEFAULT_LM_STUDIO_API_KEY,
            model=model or os.getenv("LM_STUDIO_MODEL") or DEFAULT_LM_STUDIO_MODEL,
        )
        config.validate()
        return config

    @property
    def chat_completions_url(self) -> str:
        # Re-normalizes on every access so hand-constructed instances with a
        # raw base_url still produce a correct endpoint.
        return f"{_normalize_base_url(self.base_url)}/chat/completions"

    def validate(self) -> None:
        """Validate the translation configuration.

        Raises:
            ConfigurationError: If any field is unsupported or out of range.
        """
        if self.backend != DEFAULT_TRANSLATION_BACKEND:
            raise ConfigurationError(
                f"Unsupported translation backend '{self.backend}'. "
                f"Only '{DEFAULT_TRANSLATION_BACKEND}' is supported."
            )
        if not self.model or not isinstance(self.model, str):
            raise ConfigurationError("LM Studio model must be a non-empty string.")
        if not self.api_key or not isinstance(self.api_key, str):
            raise ConfigurationError("LM Studio API key must be a non-empty string.")
        if self.timeout_seconds <= 0:
            raise ConfigurationError("LM Studio timeout must be greater than zero.")
        if self.max_retries < 1:
            raise ConfigurationError("LM Studio max retries must be at least 1.")
        if self.retry_backoff_seconds < 0:
            raise ConfigurationError("LM Studio retry backoff cannot be negative.")
        # Raises ConfigurationError when the URL itself is malformed.
        _normalize_base_url(self.base_url)
def _build_system_prompt(source_language: str, target_language: str) -> str:
source_descriptor = source_language or "auto"
return (
"You are a professional audiovisual translator.\n"
f"Translate the user-provided text from {source_descriptor} to {target_language}.\n"
"Preserve meaning, tone, style, and intent as closely as possible.\n"
"Keep punctuation natural and keep subtitle-like lines concise when the source is concise.\n"
"Return only the translation.\n"
"Do not explain anything.\n"
"Do not add notes, headings, metadata, or commentary.\n"
"Do not add quotation marks unless they are part of the source.\n"
"Preserve line breaks and segment boundaries exactly.\n"
"Keep names, brands, URLs, emails, code, and proper nouns unchanged unless transliteration "
"is clearly appropriate.\n"
"Expand abbreviations only when needed for a natural translation.\n"
"Do not censor, summarize, or omit content."
)
class LMStudioTranslator:
    """OpenAI-style chat completions client for LM Studio.

    Sends translation requests over an httpx.Client with bounded retries and
    two fallback payload shapes for servers/models that reject the default
    system+user prompt layout.
    """

    def __init__(
        self,
        config: TranslationConfig,
        client: Optional[httpx.Client] = None,
        sleeper=time.sleep,
    ) -> None:
        """Initialize the translator.

        Args:
            config: Translation configuration; re-validated here.
            client: Optional pre-built httpx client (injectable for tests).
            sleeper: Callable used to wait between retries (injectable for tests).
        """
        self.config = config
        self.config.validate()
        # Build our own client only when the caller did not supply one.
        self._client = client or httpx.Client(timeout=httpx.Timeout(self.config.timeout_seconds))
        # close() only closes clients created here, never caller-owned ones.
        self._owns_client = client is None
        self._sleeper = sleeper

    def build_payload(self, text: str, source_language: str, target_language: str) -> Dict[str, Any]:
        """Build the OpenAI-compatible chat completions payload.

        System message carries the translation instructions; the raw text is
        the user turn. Low temperature keeps output near-deterministic.
        """
        return {
            "model": self.config.model,
            "messages": [
                {"role": "system", "content": _build_system_prompt(source_language, target_language)},
                {"role": "user", "content": text},
            ],
            "temperature": 0.1,
            "top_p": 1,
            "stream": False,
        }

    def build_user_only_payload(
        self,
        text: str,
        source_language: str,
        target_language: str,
    ) -> Dict[str, Any]:
        """Build a fallback payload for models that require the first turn to be user.

        The system instructions are folded into a single user message.
        """
        instructions = _build_system_prompt(source_language, target_language)
        merged_prompt = f"{instructions}\n\nText to translate:\n{text}"
        return {
            "model": self.config.model,
            "messages": [
                {"role": "user", "content": merged_prompt},
            ],
            "temperature": 0.1,
            "top_p": 1,
            "stream": False,
        }

    def build_structured_translation_payload(
        self,
        text: str,
        source_language: str,
        target_language: str,
    ) -> Dict[str, Any]:
        """Build a payload for custom translation models with structured user content.

        Some translation-specific models expect source_lang_code /
        target_lang_code fields inside a structured content part instead of
        prose instructions (see _should_retry_with_structured_translation_prompt).
        """
        return {
            "model": self.config.model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "source_lang_code": source_language or "auto",
                            "target_lang_code": target_language,
                            "text": text,
                            "image": None,
                        }
                    ],
                }
            ],
            "temperature": 0.1,
            "top_p": 1,
            "stream": False,
        }

    @staticmethod
    def parse_response_content(payload: Dict[str, Any]) -> str:
        """Extract translated text from an OpenAI-compatible response payload.

        Args:
            payload: Decoded JSON body of the chat-completions response.

        Returns:
            The stripped translation text.

        Raises:
            TranslationError: If the payload has no message, the content is
                not textual, or the translation is empty.
        """
        try:
            content = payload["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError) as exc:
            raise TranslationError("LM Studio response did not contain a chat completion message.") from exc
        # Some servers return content as a list of parts; keep plain strings
        # and the "text" field of dict parts, concatenated in order.
        if isinstance(content, list):
            parts = []
            for item in content:
                if isinstance(item, str):
                    parts.append(item)
                elif isinstance(item, dict) and item.get("type") == "text":
                    parts.append(str(item.get("text", "")))
            content = "".join(parts)
        if not isinstance(content, str):
            raise TranslationError("LM Studio response content was not a text string.")
        translated = content.strip()
        if not translated:
            raise TranslationError("LM Studio returned an empty translation.")
        return translated

    def _headers(self) -> Dict[str, str]:
        """Standard request headers (bearer auth + JSON body)."""
        return {
            "Authorization": f"Bearer {self.config.api_key}",
            "Content-Type": "application/json",
        }

    def _should_retry(self, exc: Exception) -> bool:
        """Return True for transient failures worth another attempt.

        Connection/timeout errors are always retryable; HTTP errors only for
        the usual transient status codes.
        """
        if isinstance(exc, (httpx.ConnectError, httpx.ReadTimeout, httpx.WriteTimeout, httpx.PoolTimeout)):
            return True
        if isinstance(exc, httpx.HTTPStatusError):
            return exc.response.status_code in {408, 409, 429, 500, 502, 503, 504}
        return False

    @staticmethod
    def _should_retry_with_user_only_prompt(exc: Exception) -> bool:
        """Detect the 400 error some models return for a leading system message."""
        if not isinstance(exc, httpx.HTTPStatusError):
            return False
        if exc.response.status_code != 400:
            return False
        response_text = exc.response.text.lower()
        return "conversations must start with a user prompt" in response_text

    @staticmethod
    def _should_retry_with_structured_translation_prompt(exc: Exception) -> bool:
        """Detect the 400 error from models that want structured lang-code content."""
        if not isinstance(exc, httpx.HTTPStatusError):
            return False
        if exc.response.status_code != 400:
            return False
        response_text = exc.response.text.lower()
        return "source_lang_code" in response_text and "target_lang_code" in response_text

    def _post_chat_completion(self, payload: Dict[str, Any]) -> str:
        """POST one payload and return the parsed translation.

        Raises httpx.HTTPStatusError on non-2xx, ValueError on non-JSON
        bodies, TranslationError on malformed/empty completions.
        """
        response = self._client.post(
            self.config.chat_completions_url,
            headers=self._headers(),
            json=payload,
        )
        response.raise_for_status()
        return self.parse_response_content(response.json())

    def translate_text(
        self,
        text: str,
        target_language: str,
        source_language: str = "auto",
    ) -> str:
        """Translate a single text segment.

        Tries the standard payload up to max_retries times with linear
        backoff; on specific 400 responses it additionally tries the
        user-only and structured fallback payloads within the same attempt.

        Raises:
            TranslationError: If every attempt and fallback fails.
        """
        # Whitespace-only input needs no round-trip.
        if not text.strip():
            return ""
        payload = self.build_payload(text, source_language, target_language)
        last_error: Optional[Exception] = None
        for attempt in range(1, self.config.max_retries + 1):
            try:
                return self._post_chat_completion(payload)
            except (httpx.HTTPError, ValueError, TranslationError) as exc:
                last_error = exc
                # Fallback 1: merge instructions into a single user turn for
                # models that reject a leading system message.
                if self._should_retry_with_user_only_prompt(exc):
                    try:
                        fallback_payload = self.build_user_only_payload(text, source_language, target_language)
                        return self._post_chat_completion(fallback_payload)
                    except (httpx.HTTPError, ValueError, TranslationError) as fallback_exc:
                        last_error = fallback_exc
                # Fallback 2: structured lang-code content. Checked against
                # last_error (not exc) so it also fires when fallback 1 failed
                # with this signature.
                if self._should_retry_with_structured_translation_prompt(last_error):
                    try:
                        structured_payload = self.build_structured_translation_payload(
                            text,
                            source_language,
                            target_language,
                        )
                        return self._post_chat_completion(structured_payload)
                    except (httpx.HTTPError, ValueError, TranslationError) as structured_exc:
                        last_error = structured_exc
                # Retry decision is based on the attempt's ORIGINAL error.
                if attempt >= self.config.max_retries or not self._should_retry(exc):
                    break
                # Linear backoff: base * attempt number.
                self._sleeper(self.config.retry_backoff_seconds * attempt)
        # Re-raise/wrap the most recent failure as a TranslationError.
        if isinstance(last_error, TranslationError):
            raise last_error
        if isinstance(last_error, ValueError):
            raise TranslationError("LM Studio returned a non-JSON response.") from last_error
        raise TranslationError(f"LM Studio request failed: {last_error}") from last_error

    def translate_segments(
        self,
        texts: List[str],
        target_language: str,
        source_language: str = "auto",
    ) -> List[str]:
        """Translate an ordered list of subtitle-like segments.

        Segments are translated one at a time; output order matches input.
        A failure on any segment aborts the whole batch.
        """
        results: List[str] = []
        for text in texts:
            results.append(
                self.translate_text(
                    text=text,
                    target_language=target_language,
                    source_language=source_language,
                )
            )
        return results

    def close(self) -> None:
        """Release the underlying HTTP client if this instance created it."""
        if self._owns_client:
            self._client.close()
def translate_text(
    text: str,
    target_language: str,
    source_language: str = "auto",
    config: Optional[TranslationConfig] = None,
    client: Optional[httpx.Client] = None,
) -> str:
    """Translate a single text string using LM Studio.

    Convenience wrapper that builds a short-lived LMStudioTranslator,
    performs one translation, and always releases the translator afterwards.

    Args:
        text: The text to translate.
        target_language: Target language code.
        source_language: Source language code, "auto" by default.
        config: Optional pre-built configuration; env-derived if omitted.
        client: Optional httpx client to reuse.

    Returns:
        The translated text.
    """
    worker = LMStudioTranslator(config or TranslationConfig.from_env(), client=client)
    try:
        result = worker.translate_text(text, target_language, source_language)
    finally:
        worker.close()
    return result
def translate_segments(
    texts: List[str],
    target_language: str,
    source_language: str = "auto",
    config: Optional[TranslationConfig] = None,
    client: Optional[httpx.Client] = None,
) -> List[str]:
    """Translate a list of text strings using LM Studio.

    Convenience wrapper around LMStudioTranslator.translate_segments that
    guarantees the translator is closed even when translation fails.

    Args:
        texts: Ordered segments to translate.
        target_language: Target language code.
        source_language: Source language code, "auto" by default.
        config: Optional pre-built configuration; env-derived if omitted.
        client: Optional httpx client to reuse.

    Returns:
        Translations in the same order as the input segments.
    """
    worker = LMStudioTranslator(config or TranslationConfig.from_env(), client=client)
    try:
        translated = worker.translate_segments(texts, target_language, source_language)
    finally:
        worker.close()
    return translated

329
src/youtube.py Normal file
View File

@@ -0,0 +1,329 @@
"""YouTube Content Download Module for YouTube Auto Dub.
This module provides a robust interface for downloading YouTube content
using yt-dlp. It handles:
- Video and audio extraction from YouTube URLs
- Authentication via cookies or browser integration
- Format selection and quality optimization
- Error handling and retry logic
- Metadata extraction and validation
Author: Nguyen Cong Thuan Huy (mangodxd)
Version: 1.0.0
"""
import yt_dlp
from pathlib import Path
from typing import Optional, Dict, Any
from src.engines import CACHE_DIR
def _format_minutes_seconds(total_seconds: float) -> str:
"""Format seconds as M:SS for logging."""
seconds = int(round(total_seconds))
minutes, remaining_seconds = divmod(seconds, 60)
return f"{minutes}:{remaining_seconds:02d}"
def _getOpts(browser: Optional[str] = None,
cookies_file: Optional[str] = None,
quiet: bool = True) -> Dict[str, Any]:
"""Generate common yt-dlp options with authentication configuration.
Args:
browser: Browser name for cookie extraction (chrome, edge, firefox).
If provided, cookies will be extracted from this browser.
cookies_file: Path to cookies.txt file in Netscape format.
Takes priority over browser extraction if both provided.
quiet: Whether to suppress yt-dlp output messages.
Returns:
Dictionary of yt-dlp options.
Raises:
ValueError: If invalid browser name is provided.
Note:
Priority order: cookies_file > browser > no authentication.
"""
opts = {
'quiet': quiet,
'no_warnings': True,
'extract_flat': False,
}
if cookies_file:
cookies_path = Path(cookies_file)
if not cookies_path.exists():
raise FileNotFoundError(f"Cookies file not found: {cookies_file}")
opts['cookiefile'] = str(cookies_path)
print(f"[*] Using cookies file: {cookies_file}")
elif browser:
valid_browsers = ['chrome', 'firefox', 'edge', 'safari', 'opera', 'brave']
browser_lower = browser.lower()
if browser_lower not in valid_browsers:
raise ValueError(f"Invalid browser '{browser}'. Supported: {', '.join(valid_browsers)}")
opts['cookiesfrombrowser'] = (browser_lower,)
print(f"[*] Extracting cookies from browser: {browser}")
else:
print(f"[*] No authentication configured (public videos only)")
return opts
def getId(url: str,
          browser: Optional[str] = None,
          cookies_file: Optional[str] = None) -> str:
    """Resolve a YouTube URL to its video ID without downloading media.

    Args:
        url: YouTube video URL to extract ID from.
        browser: Browser name for cookie extraction.
        cookies_file: Path to cookies.txt file.

    Returns:
        YouTube video ID as string.

    Raises:
        ValueError: If URL is invalid or video ID cannot be extracted.
        RuntimeError: If yt-dlp fails to extract information.

    Note:
        Metadata is fetched with download=False; nothing is written to disk.
    """
    if not url or not isinstance(url, str):
        raise ValueError("URL must be a non-empty string")
    lowered = url.lower()
    if 'youtube.com' not in lowered and 'youtu.be' not in lowered:
        raise ValueError(f"Invalid YouTube URL: {url}")
    try:
        print(f"[*] Extracting video ID from: {url[:50]}...")
        options = _getOpts(browser=browser, cookies_file=cookies_file)
        with yt_dlp.YoutubeDL(options) as downloader:
            try:
                metadata = downloader.extract_info(url, download=False)
                video_id = metadata.get('id')
                if not video_id:
                    raise RuntimeError("No video ID found in extracted information")
                title = metadata.get('title', 'Unknown')
                duration = metadata.get('duration', 0)
                uploader = metadata.get('uploader', 'Unknown')
                print(f"[+] Video ID extracted: {video_id}")
                print(f"    Title: {title[:50]}{'...' if len(title) > 50 else ''}")
                print(f"    Duration: {duration}s ({_format_minutes_seconds(duration)})")
                print(f"    Uploader: {uploader}")
                return video_id
            except yt_dlp.DownloadError as e:
                message = str(e)
                if "Sign in to confirm" in message or "private video" in message.lower():
                    raise ValueError(f"Authentication required for this video. Please use --browser or --cookies. Original error: {e}")
                raise RuntimeError(f"yt-dlp extraction failed: {e}")
    except Exception as e:
        # Our own ValueError/RuntimeError pass through untouched; anything
        # else is wrapped so callers see a single failure type.
        if isinstance(e, (ValueError, RuntimeError)):
            raise
        raise RuntimeError(f"Failed to extract video ID: {e}") from e
def downloadVideo(url: str,
                  browser: Optional[str] = None,
                  cookies_file: Optional[str] = None) -> Path:
    """Download the best quality video with audio from YouTube.

    Args:
        url: YouTube video URL to download.
        browser: Browser name for cookie extraction.
        cookies_file: Path to cookies.txt file.

    Returns:
        Path to the downloaded video file.

    Raises:
        ValueError: If URL is invalid or authentication is required.
        RuntimeError: If download fails or file is corrupted.

    Note:
        This function downloads both video and audio in a single file.
        If the video already exists in cache, it returns the existing file.
    """
    # Validate the URL/auth up front; any failure is surfaced as ValueError.
    try:
        video_id = getId(url, browser=browser, cookies_file=cookies_file)
    except Exception as e:
        raise ValueError(f"Failed to validate video URL: {e}") from e
    out_path = CACHE_DIR / f"{video_id}.mp4"
    if out_path.exists():
        file_size = out_path.stat().st_size
        # Anything under 1 MiB is treated as a truncated/failed download.
        if file_size > 1024 * 1024:
            print(f"[*] Video already cached: {out_path}")
            return out_path
        else:
            print(f"[!] WARNING: Cached video seems too small ({file_size} bytes), re-downloading")
            out_path.unlink()
    try:
        print(f"[*] Downloading video: {video_id}")
        opts = _getOpts(browser=browser, cookies_file=cookies_file)
        opts.update({
            # Prefer H.264 MP4 + M4A audio (widest playback compatibility),
            # then progressively less specific fallbacks.
            'format': (
                'bestvideo[ext=mp4][vcodec^=avc]+bestaudio[ext=m4a]/'
                'best[ext=mp4]/'
                'best'
            ),
            'outtmpl': str(out_path),
            'merge_output_format': 'mp4',
            'postprocessors': [],
        })
        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([url])
        # Sanity-check the produced file before declaring success.
        if not out_path.exists():
            raise RuntimeError(f"Video file not created after download: {out_path}")
        file_size = out_path.stat().st_size
        if file_size < 1024 * 1024:
            raise RuntimeError(f"Downloaded video file is too small: {file_size} bytes")
        print(f"[+] Video downloaded successfully:")
        print(f"    File: {out_path}")
        print(f"    Size: {file_size / (1024*1024):.1f} MB")
        return out_path
    except yt_dlp.DownloadError as e:
        error_msg = str(e).lower()
        # Auth problems get an actionable ValueError; other download errors
        # become RuntimeError.
        if "sign in to confirm" in error_msg or "private video" in error_msg:
            raise ValueError(
                f"Authentication required for this video. Please try:\n"
                f"1. Close all browser windows and use --browser\n"
                f"2. Export fresh cookies.txt and use --cookies\n"
                f"3. Check if video is public/accessible\n"
                f"Original error: {e}"
            )
        else:
            raise RuntimeError(f"Video download failed: {e}")
    except Exception as e:
        # Remove the partial output so the next attempt starts clean.
        if out_path.exists():
            out_path.unlink()
        raise RuntimeError(f"Video download failed: {e}") from e
def downloadAudio(url: str,
                  browser: Optional[str] = None,
                  cookies_file: Optional[str] = None) -> Path:
    """Download audio-only from YouTube for transcription processing.

    Args:
        url: YouTube video URL to extract audio from.
        browser: Browser name for cookie extraction.
        cookies_file: Path to cookies.txt file.

    Returns:
        Path to the downloaded WAV audio file.

    Raises:
        ValueError: If URL is invalid or authentication is required.
        RuntimeError: If audio download or conversion fails.

    Note:
        The output is always in WAV format at the project's sample rate
        for consistency with the transcription pipeline.
    """
    # Validate the URL/auth up front; any failure is surfaced as ValueError.
    try:
        video_id = getId(url, browser=browser, cookies_file=cookies_file)
    except Exception as e:
        raise ValueError(f"Failed to validate video URL: {e}") from e
    # The FFmpeg post-processor appends the codec extension itself, so the
    # output template is extension-less and the final file should be <id>.wav.
    temp_path = CACHE_DIR / f"{video_id}"
    final_path = CACHE_DIR / f"{video_id}.wav"
    if final_path.exists():
        file_size = final_path.stat().st_size
        # Anything under 100 KiB is treated as a truncated/failed conversion.
        if file_size > 1024 * 100:
            print(f"[*] Audio already cached: {final_path}")
            return final_path
        else:
            print(f"[!] WARNING: Cached audio seems too small ({file_size} bytes), re-downloading")
            final_path.unlink()
    try:
        print(f"[*] Downloading audio: {video_id}")
        opts = _getOpts(browser=browser, cookies_file=cookies_file)
        opts.update({
            'format': 'bestaudio/best',
            'outtmpl': str(temp_path),
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'wav',
                'preferredquality': '192',
            }],
        })
        with yt_dlp.YoutubeDL(opts) as ydl:
            ydl.download([url])
        if not final_path.exists():
            # Fallback: accept any "<id>.*" file if the post-processor
            # produced an unexpected extension, rather than failing outright.
            temp_files = list(CACHE_DIR.glob(f"{video_id}.*"))
            if temp_files:
                print(f"[!] WARNING: Expected {final_path} but found {temp_files[0]}")
                final_path = temp_files[0]
            else:
                raise RuntimeError(f"Audio file not created after download: {final_path}")
        file_size = final_path.stat().st_size
        if file_size < 1024 * 100:
            raise RuntimeError(f"Downloaded audio file is too small: {file_size} bytes")
        print(f"[+] Audio downloaded successfully:")
        print(f"    File: {final_path}")
        print(f"    Size: {file_size / (1024*1024):.1f} MB")
        # Best-effort duration probe; failures only warn because the file is
        # already usable by the pipeline.
        try:
            from src.media import _get_duration
            duration = _get_duration(final_path)
            if duration > 0:
                print(f"    Duration: {duration:.1f}s ({_format_minutes_seconds(duration)})")
            else:
                print(f"[!] WARNING: Could not determine audio duration")
        except Exception as e:
            print(f"[!] WARNING: Audio validation failed: {e}")
        return final_path
    except yt_dlp.DownloadError as e:
        error_msg = str(e).lower()
        # Auth problems get an actionable ValueError; other download errors
        # become RuntimeError.
        if "sign in to confirm" in error_msg or "private video" in error_msg:
            raise ValueError(
                f"Authentication required for this video. Please try:\n"
                f"1. Close all browser windows and use --browser\n"
                f"2. Export fresh cookies.txt and use --cookies\n"
                f"3. Check if video is public/accessible\n"
                f"Original error: {e}"
            )
        else:
            raise RuntimeError(f"Audio download failed: {e}")
    except Exception as e:
        # Clean up partial artifacts so the next run starts from scratch.
        for path in [temp_path, final_path]:
            if path.exists():
                path.unlink()
        raise RuntimeError(f"Audio download failed: {e}") from e