From 9cf9e3233f5a68eb1a68a2730f4ce877c73f5497 Mon Sep 17 00:00:00 2001
From: Ethanfel <ethan.fel@ts-pc.fr>
Date: Fri, 17 Apr 2026 08:50:47 +0200
Subject: [PATCH] feat: add scan_video with average and nearest modes

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 core/audio_scan.py       | 74 ++++++++++++++++++++++++++++++++++++++++
 tests/test_audio_scan.py | 52 +++++++++++++++++++++++++++-
 2 files changed, 125 insertions(+), 1 deletion(-)

diff --git a/core/audio_scan.py b/core/audio_scan.py
index df1896b..43eeff0 100644
--- a/core/audio_scan.py
+++ b/core/audio_scan.py
@@ -38,3 +38,77 @@ def build_profile(clip_paths: list[str]) -> dict | None:
         "mean_vector": arr.mean(axis=0),
         "clip_vectors": vectors,
     }
+
+
+def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+    """Return the cosine of the angle between vectors ``a`` and ``b``.
+
+    The result lies in [-1, 1]; a negative value means the vectors are
+    anti-correlated, so it can never pass a positive threshold. When
+    either vector has zero norm the similarity is defined here as 0.0,
+    which avoids a division by zero.
+    """
+    norm_a, norm_b = np.linalg.norm(a), np.linalg.norm(b)
+    if not (norm_a and norm_b):
+        return 0.0
+    cosine = np.dot(a, b) / (norm_a * norm_b)
+    return float(cosine)
+
+
+def scan_video(
+    video_path: str,
+    profile: dict,
+    mode: str = "average",
+    threshold: float = 0.7,
+    hop: float = 1.0,
+    window: float = 8.0,
+    cancel_flag: object = None,
+) -> list[tuple[float, float, float]]:
+    """Slide a window across the video audio and score against the profile.
+
+    Args:
+        video_path: path to video/audio file
+        profile: dict from build_profile()
+        mode: "nearest" (max over all clips); anything else uses the mean
+        threshold: minimum cosine similarity to include
+        hop: step size in seconds; must cover at least one sample
+        window: window size in seconds (default 8s); at least one sample
+        cancel_flag: object with _cancel bool attribute; checked each iteration
+
+    Returns:
+        list of (start_time, end_time, score) for regions above threshold
+
+    Raises:
+        ValueError: if hop or window is shorter than one sample
+    """
+    _log(f"audio_scan: loading {video_path}")
+    y, sr = librosa.load(video_path, sr=_SR, mono=True)
+    _log(f"audio_scan: {len(y) / sr:.1f}s loaded, scanning with hop={hop}s")
+
+    win_samples = int(window * sr)
+    hop_samples = int(hop * sr)
+    # A zero-sample hop would never advance the scan; fail instead of hanging.
+    if win_samples <= 0 or hop_samples <= 0:
+        raise ValueError(
+            f"hop={hop} and window={window} must each cover at least one sample"
+        )
+
+    results = []
+    for pos in range(0, len(y) - win_samples + 1, hop_samples):
+        if cancel_flag and getattr(cancel_flag, '_cancel', False):
+            _log("audio_scan: cancelled")
+            return results
+        chunk = y[pos : pos + win_samples]
+        mfcc = librosa.feature.mfcc(y=chunk, sr=sr, n_mfcc=_N_MFCC)
+        vec = mfcc.mean(axis=1)
+        if mode == "nearest":
+            score = max(
+                _cosine_similarity(vec, cv) for cv in profile["clip_vectors"]
+            )
+        else:  # average
+            score = _cosine_similarity(vec, profile["mean_vector"])
+        if score >= threshold:
+            start_t = pos / sr
+            results.append((start_t, start_t + window, score))
+    _log(f"audio_scan: {len(results)} regions above threshold {threshold}")
+    return results
diff --git a/tests/test_audio_scan.py b/tests/test_audio_scan.py
index 6358189..2e96e64 100644
--- a/tests/test_audio_scan.py
+++ b/tests/test_audio_scan.py
@@ -1,6 +1,6 @@
 import tempfile, os
 import numpy as np
-from core.audio_scan import build_profile, _extract_mfcc
+from core.audio_scan import build_profile, _extract_mfcc, scan_video
 
 
 def _make_wav(path: str, duration: float = 8.0, sr: int = 22050):
@@ -68,3 +68,53 @@ def test_build_profile_skips_missing_files():
 def test_build_profile_empty_returns_none():
     result = build_profile([])
     assert result is None
+
+
+def test_scan_video_finds_matching_region():
+    """A clip built from the same audio as the reference should match."""
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as sample:
+        _make_wav(sample.name, duration=8.0)
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as clip:
+        _make_wav(clip.name, duration=20.0)
+    try:
+        profile = build_profile([sample.name])
+        hits = scan_video(clip.name, profile, mode="average", threshold=0.5, hop=1.0)
+        assert hits
+        # Every hit spans the default 8 s window and clears the threshold.
+        assert all(abs((end - start) - 8.0) < 1e-9 for start, end, _ in hits)
+        assert all(score >= 0.5 for _, _, score in hits)
+    finally:
+        os.unlink(sample.name)
+        os.unlink(clip.name)
+
+
+def test_scan_video_nearest_mode():
+    """Nearest mode finds matches when the clip repeats the reference audio."""
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as sample:
+        _make_wav(sample.name, duration=8.0)
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as clip:
+        _make_wav(clip.name, duration=20.0)
+    try:
+        profile = build_profile([sample.name])
+        assert scan_video(clip.name, profile, mode="nearest", threshold=0.5, hop=1.0)
+    finally:
+        os.unlink(sample.name)
+        os.unlink(clip.name)
+
+
+def test_scan_video_high_threshold_no_match():
+    """Seeded white noise must not match a 440 Hz sine profile at 0.99."""
+    import soundfile as sf
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as ref:
+        t = np.linspace(0, 8.0, 22050 * 8, endpoint=False)
+        sf.write(ref.name, 0.5 * np.sin(2 * np.pi * 440 * t), 22050)
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as vid:
+        noise = np.random.default_rng(0).standard_normal(22050 * 20) * 0.1
+        sf.write(vid.name, noise.astype(np.float32), 22050)
+    try:
+        profile = build_profile([ref.name])
+        regions = scan_video(vid.name, profile, mode="average", threshold=0.99, hop=1.0)
+        assert len(regions) == 0
+    finally:
+        os.unlink(ref.name)
+        os.unlink(vid.name)