From 9cf9e3233f5a68eb1a68a2730f4ce877c73f5497 Mon Sep 17 00:00:00 2001
From: Ethanfel
Date: Fri, 17 Apr 2026 08:50:47 +0200
Subject: [PATCH] feat: add scan_video with average and nearest modes

Co-Authored-By: Claude Opus 4.6
---
 core/audio_scan.py       | 90 ++++++++++++++++++++++++++++++++++++++++
 tests/test_audio_scan.py | 60 ++++++++++++++++++++++++++-
 2 files changed, 149 insertions(+), 1 deletion(-)

diff --git a/core/audio_scan.py b/core/audio_scan.py
index df1896b..43eeff0 100644
--- a/core/audio_scan.py
+++ b/core/audio_scan.py
@@ -38,3 +38,93 @@ def build_profile(clip_paths: list[str]) -> dict | None:
         "mean_vector": arr.mean(axis=0),
         "clip_vectors": vectors,
     }
+
+
+def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+    """Cosine similarity between two vectors.
+
+    Returns value in [-1, 1]. Negative means anti-correlated (very
+    dissimilar). For threshold filtering this is fine — negative scores
+    never exceed the threshold. Scores near 0 may be uncorrelated or
+    weakly anti-correlated.
+    """
+    na = np.linalg.norm(a)
+    nb = np.linalg.norm(b)
+    if na == 0 or nb == 0:
+        return 0.0
+    return float(np.dot(a, b) / (na * nb))
+
+
+def scan_video(
+    video_path: str,
+    profile: dict,
+    mode: str = "average",
+    threshold: float = 0.7,
+    hop: float = 1.0,
+    window: float = 8.0,
+    cancel_flag: object = None,
+) -> list[tuple[float, float, float]]:
+    """Slide a window across the video audio and score against the profile.
+
+    Args:
+        video_path: path to video/audio file
+        profile: dict from build_profile()
+        mode: "average" (compare to mean) or "nearest" (max over all clips)
+        threshold: minimum cosine similarity to include
+        hop: step size in seconds; must be positive
+        window: window size in seconds (default 8s); must be positive
+        cancel_flag: object with _cancel bool attribute; checked each iteration
+
+    Returns:
+        list of (start_time, end_time, score) for regions above threshold
+
+    Raises:
+        ValueError: if hop or window is not positive, or is shorter than
+            one audio sample
+    """
+    # Reject bad step/window sizes before paying for the audio decode.
+    if hop <= 0 or window <= 0:
+        raise ValueError(
+            f"hop and window must be positive (hop={hop}, window={window})"
+        )
+
+    _log(f"audio_scan: loading {video_path}")
+    y, sr = librosa.load(video_path, sr=_SR, mono=True)
+    duration = len(y) / sr
+    _log(f"audio_scan: {duration:.1f}s loaded, scanning with hop={hop}s")
+
+    win_samples = int(window * sr)
+    hop_samples = int(hop * sr)
+    # A hop that rounds down to zero samples would never advance `pos`.
+    if win_samples < 1 or hop_samples < 1:
+        raise ValueError(
+            f"hop={hop}s and window={window}s must each span at least one"
+            f" sample at {sr} Hz"
+        )
+
+    results = []
+    pos = 0
+    while pos + win_samples <= len(y):
+        if cancel_flag and getattr(cancel_flag, '_cancel', False):
+            _log("audio_scan: cancelled")
+            return results
+
+        chunk = y[pos : pos + win_samples]
+        mfcc = librosa.feature.mfcc(y=chunk, sr=sr, n_mfcc=_N_MFCC)
+        vec = mfcc.mean(axis=1)
+
+        if mode == "nearest":
+            score = max(
+                _cosine_similarity(vec, cv) for cv in profile["clip_vectors"]
+            )
+        else:  # average
+            score = _cosine_similarity(vec, profile["mean_vector"])
+
+        if score >= threshold:
+            start_t = pos / sr
+            results.append((start_t, start_t + window, score))
+
+        pos += hop_samples
+
+    _log(f"audio_scan: {len(results)} regions above threshold {threshold}")
+    return results
diff --git a/tests/test_audio_scan.py b/tests/test_audio_scan.py
index 6358189..2e96e64 100644
--- a/tests/test_audio_scan.py
+++ b/tests/test_audio_scan.py
@@ -1,6 +1,6 @@
 import tempfile, os
 import numpy as np
-from core.audio_scan import build_profile, _extract_mfcc
+from core.audio_scan import build_profile, _extract_mfcc, scan_video
 
 
 def _make_wav(path: str, duration: float = 8.0, sr: int = 22050):
@@ -68,3 +68,61 @@ def test_build_profile_skips_missing_files():
 def test_build_profile_empty_returns_none():
     result = build_profile([])
     assert result is None
+
+
+def test_scan_video_finds_matching_region():
+    """A video made of the same sine wave as the reference should match."""
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ref:
+        _make_wav(ref.name, duration=8.0)
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as vid:
+        _make_wav(vid.name, duration=20.0)
+    try:
+        profile = build_profile([ref.name])
+        regions = scan_video(vid.name, profile, mode="average", threshold=0.5, hop=1.0)
+        assert len(regions) > 0
+        for start, end, score in regions:
+            assert abs((end - start) - 8.0) < 1e-9
+            assert score >= 0.5
+    finally:
+        os.unlink(ref.name)
+        os.unlink(vid.name)
+
+
+def test_scan_video_nearest_mode():
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ref:
+        _make_wav(ref.name, duration=8.0)
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as vid:
+        _make_wav(vid.name, duration=20.0)
+    try:
+        profile = build_profile([ref.name])
+        regions = scan_video(vid.name, profile, mode="nearest", threshold=0.5, hop=1.0)
+        assert len(regions) > 0
+    finally:
+        os.unlink(ref.name)
+        os.unlink(vid.name)
+
+
+def test_scan_video_high_threshold_no_match():
+    """Different frequencies with very high threshold should not match."""
+    import soundfile as sf
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ref:
+        t = np.linspace(0, 8.0, 22050 * 8, endpoint=False)
+        sf.write(ref.name, 0.5 * np.sin(2 * np.pi * 440 * t), 22050)
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as vid:
+        # White noise — very different from sine wave; seeded so the test is reproducible
+        sf.write(vid.name, np.random.default_rng(0).standard_normal(22050 * 20).astype(np.float32) * 0.1, 22050)
+    try:
+        profile = build_profile([ref.name])
+        regions = scan_video(vid.name, profile, mode="average", threshold=0.99, hop=1.0)
+        assert len(regions) == 0
+    finally:
+        os.unlink(ref.name)
+        os.unlink(vid.name)
+
+
+def test_scan_video_rejects_non_positive_hop():
+    """A zero hop would never advance the window, so it must be rejected."""
+    import pytest
+    profile = {"mean_vector": np.ones(4), "clip_vectors": [np.ones(4)]}
+    with pytest.raises(ValueError):
+        scan_video("unused.wav", profile, hop=0.0)