fix(audio): remove vocal bleed with instrumental-only mix

2026-03-30 18:44:48 +01:00
parent 348369c69a
commit 3c9b3c8090
6 changed files with 534 additions and 40 deletions
--- a/main.py
+++ b/main.py
@@ -8,6 +8,7 @@ import asyncio
 import shutil
 import time

+from src.audio_separation import DEFAULT_MIX_MODE
 from src.core_utils import ConfigurationError
 from src.translation import TranslationConfig

@@ -54,6 +55,15 @@ Examples:
        "-wm",
        help="Whisper model to use (tiny, base, small, medium, large-v3). Default: auto-select based on VRAM",
    )
+    parser.add_argument(
+        "--mix-mode",
+        default=DEFAULT_MIX_MODE,
+        choices=[DEFAULT_MIX_MODE, "original-audio", "dub-only"],
+        help=(
+            "Final audio bed for the dubbed output. "
+            "Default 'instrumental-only' uses a no-vocals bed when separation succeeds."
+        ),
+    )
    parser.add_argument(
        "--translation-backend",
        default="lmstudio",
@@ -309,7 +319,36 @@ def main() -> None:
        print(f"[+] Subtitles generated: {subtitle_path}")

        print(f"\n{'=' * 60}")
-        print("STEP 7: FINAL VIDEO RENDERING")
+        print("STEP 7: AUDIO BED PREPARATION")
+        print(f"{'=' * 60}")
+
+        effective_mix_mode = args.mix_mode
+        background_audio_path = None
+        background_volume = 0.10
+
+        if effective_mix_mode == DEFAULT_MIX_MODE:
+            print(f"[*] Preparing default {DEFAULT_MIX_MODE} mix bed...")
+            separation_result = engine.separate_audio(audio_path, src.engines.TEMP_DIR)
+            if separation_result.warning:
+                print(f"[!] WARNING: {separation_result.warning}")
+
+            if separation_result.instrumental_path:
+                background_audio_path = separation_result.instrumental_path
+                background_volume = separation_result.recommended_bg_volume
+                print(f"[+] Background stem ready: {background_audio_path}")
+                if separation_result.vocals_path:
+                    print(f"[*] Original vocal stem isolated at: {separation_result.vocals_path}")
+            else:
+                effective_mix_mode = "dub-only"
+                print("[!] WARNING: Could not build a safe instrumental bed. Falling back to dub-only output.")
+        elif effective_mix_mode == "original-audio":
+            background_audio_path = audio_path
+            print("[!] WARNING: original-audio mix keeps the source speech bed and may reintroduce bleed.")
+        else:
+            print("[*] Dub-only mix selected. No source audio bed will be used.")
+
+        print(f"\n{'=' * 60}")
+        print("STEP 8: FINAL VIDEO RENDERING")
        print(f"{'=' * 60}")

        try:
@@ -328,6 +367,9 @@ def main() -> None:
                concat_manifest_path,
                final_output,
                subtitle_path=subtitle_path,
+                background_audio_path=background_audio_path,
+                mix_mode=effective_mix_mode,
+                background_volume=background_volume,
            )

            if final_output.exists():