"""Focused tests for vocal-bleed reduction in the final dub mix."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import math
|
|
import shutil
|
|
import subprocess
|
|
from pathlib import Path
|
|
|
|
import numpy as np
|
|
import pytest
|
|
import soundfile as sf
|
|
|
|
from src.audio_separation import AudioSeparator, DEFAULT_MIX_MODE
|
|
from src import media
|
|
|
|
|
|
# Both ffmpeg and ffprobe must be discoverable on PATH for the pipeline tests.
FFMPEG_READY = all(shutil.which(tool) is not None for tool in ("ffmpeg", "ffprobe"))
def _sine_wave(frequency: float, duration: float, sample_rate: int, amplitude: float) -> np.ndarray:
|
|
t = np.linspace(0.0, duration, int(sample_rate * duration), endpoint=False)
|
|
return (amplitude * np.sin(2.0 * math.pi * frequency * t)).astype(np.float32)
|
|
|
|
|
|
def _run(cmd: list[str]) -> None:
|
|
subprocess.run(cmd, check=True, capture_output=True, text=True, timeout=None)
|
|
|
|
|
|
def _extract_peak(signal: np.ndarray, sample_rate: int, frequency: float) -> float:
|
|
window = np.hanning(len(signal))
|
|
spectrum = np.fft.rfft(signal * window)
|
|
freqs = np.fft.rfftfreq(len(signal), d=1.0 / sample_rate)
|
|
index = int(np.argmin(np.abs(freqs - frequency)))
|
|
return float(np.abs(spectrum[index]))
|
|
|
|
|
|
@pytest.mark.skipif(not FFMPEG_READY, reason="FFmpeg is required for audio pipeline tests")
def test_default_mix_prefers_instrumental_bed_and_keeps_dub_prominent(tmp_path: Path):
    """End-to-end check of the default mix.

    Separation should strip the centered 440 Hz "voice" while keeping the
    out-of-phase 660 Hz "ambience", and the rendered mix should leave the
    1 kHz dub tone clearly dominant over any residual original dialogue.
    """
    rate = 24_000
    length_s = 2.0

    # Synthetic stereo source: identical voice in both channels, ambience
    # inverted between channels so it lives purely in the stereo side signal.
    voice = _sine_wave(440.0, length_s, rate, amplitude=0.35)
    side = _sine_wave(660.0, length_s, rate, amplitude=0.18)
    stereo = np.column_stack([voice + side, voice - side])

    source_wav = tmp_path / "original.wav"
    sf.write(source_wav, stereo, rate)

    dub_wav = tmp_path / "dub.wav"
    sf.write(dub_wav, _sine_wave(1000.0, length_s, rate, amplitude=0.30), rate)

    # FFmpeg concat-demuxer manifest pointing at the single dub segment.
    manifest = tmp_path / "dub_manifest.txt"
    manifest.write_text(f"file '{dub_wav.resolve().as_posix()}'\n", encoding="utf-8")

    # Mux the source audio under a black dummy video track.
    source_video = tmp_path / "video.mp4"
    _run(
        [
            "ffmpeg",
            "-y",
            "-v",
            "error",
            "-f",
            "lavfi",
            "-i",
            f"color=c=black:s=320x240:d={length_s}",
            "-i",
            str(source_wav),
            "-c:v",
            "libx264",
            "-pix_fmt",
            "yuv420p",
            "-c:a",
            "aac",
            "-shortest",
            str(source_video),
        ]
    )

    separation = AudioSeparator().separate_audio(source_wav, tmp_path)

    bed_path = separation.instrumental_path
    assert bed_path is not None
    assert bed_path.exists()

    bed, bed_rate = sf.read(bed_path, always_2d=True)
    source_data, _ = sf.read(source_wav, always_2d=True)

    voice_before = _extract_peak(source_data[:, 0], rate, 440.0)
    voice_after = _extract_peak(bed[:, 0], bed_rate, 440.0)
    side_after = _extract_peak(bed[:, 0], bed_rate, 660.0)

    # The centered voice must be heavily attenuated while ambience survives.
    assert voice_after < voice_before * 0.15
    assert side_after > 0.01

    dubbed_video = tmp_path / "dubbed.mp4"
    media.render_video(
        video_path=source_video,
        concat_file=manifest,
        output_path=dubbed_video,
        background_audio_path=bed_path,
        mix_mode=DEFAULT_MIX_MODE,
        background_volume=separation.recommended_bg_volume,
    )

    # Pull the rendered mix back out as PCM for spectral inspection.
    extracted_wav = tmp_path / "mixed.wav"
    _run(
        [
            "ffmpeg",
            "-y",
            "-v",
            "error",
            "-i",
            str(dubbed_video),
            "-vn",
            "-c:a",
            "pcm_s16le",
            str(extracted_wav),
        ]
    )

    mixed, mixed_rate = sf.read(extracted_wav, always_2d=True)
    left_channel = mixed[:, 0]

    dub_level = _extract_peak(left_channel, mixed_rate, 1000.0)
    residual_level = _extract_peak(left_channel, mixed_rate, 440.0)
    ambience_level = _extract_peak(left_channel, mixed_rate, 660.0)

    # The dub must dominate the residual original voice, and the preserved
    # ambience should be at least as strong as that residual.
    assert dub_level > residual_level * 4
    assert ambience_level > residual_level
@pytest.mark.skipif(not FFMPEG_READY, reason="FFmpeg is required for audio pipeline tests")
def test_separator_warns_and_returns_no_bed_for_mono_input(tmp_path: Path):
    """A mono source has no stereo field to exploit: expect no instrumental
    bed and an explicit warning that mentions the mono input."""
    mono_path = tmp_path / "mono.wav"
    sf.write(mono_path, _sine_wave(440.0, 1.0, 24_000, amplitude=0.30), 24_000)

    outcome = AudioSeparator().separate_audio(mono_path, tmp_path)

    assert outcome.instrumental_path is None
    assert outcome.warning is not None
    assert "mono" in outcome.warning.lower()