feat: add scan_video with average and nearest modes
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -38,3 +38,77 @@ def build_profile(clip_paths: list[str]) -> dict | None:
|
|||||||
"mean_vector": arr.mean(axis=0),
|
"mean_vector": arr.mean(axis=0),
|
||||||
"clip_vectors": vectors,
|
"clip_vectors": vectors,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
|
||||||
|
"""Cosine similarity between two vectors.
|
||||||
|
|
||||||
|
Returns value in [-1, 1]. Negative means anti-correlated (very
|
||||||
|
dissimilar). For threshold filtering this is fine — negative scores
|
||||||
|
never exceed the threshold. Scores near 0 may be uncorrelated or
|
||||||
|
weakly anti-correlated.
|
||||||
|
"""
|
||||||
|
na = np.linalg.norm(a)
|
||||||
|
nb = np.linalg.norm(b)
|
||||||
|
if na == 0 or nb == 0:
|
||||||
|
return 0.0
|
||||||
|
return float(np.dot(a, b) / (na * nb))
|
||||||
|
|
||||||
|
|
||||||
|
def scan_video(
    video_path: str,
    profile: dict,
    mode: str = "average",
    threshold: float = 0.7,
    hop: float = 1.0,
    window: float = 8.0,
    cancel_flag: object = None,
) -> list[tuple[float, float, float]]:
    """Slide a window across the file's audio and score it against a profile.

    The audio is loaded with ``librosa.load`` (mono, resampled to ``_SR``).
    For every window position an MFCC matrix with ``_N_MFCC`` coefficients
    is computed and averaged along axis 1 into a single vector, which is
    compared to the profile with cosine similarity. Trailing audio shorter
    than one full window is not scored.

    Args:
        video_path: path to a video/audio file readable by librosa.
        profile: dict from build_profile(); must provide "mean_vector"
            (used by "average") and "clip_vectors" (used by "nearest").
        mode: "nearest" takes the best score over all clip vectors; any
            other value (including the default "average") compares against
            the profile's mean vector.
        threshold: minimum cosine similarity for a window to be reported.
        hop: step between consecutive windows, in seconds; must be > 0.
        window: window length in seconds (default 8s); must be > 0.
        cancel_flag: optional object with a ``_cancel`` bool attribute. It is
            checked before every window; when set, the scan stops early and
            the regions collected so far are returned.

    Returns:
        List of (start_time, end_time, score) tuples, in seconds, for the
        windows whose score is at or above ``threshold``, in scan order.

    Raises:
        ValueError: if ``hop`` or ``window`` is not positive, or is shorter
            than one audio sample at the loaded sample rate.
    """
    # A non-positive hop would leave ``pos`` stuck (or moving backwards) and
    # the scan loop would never terminate, so reject it before doing any work.
    # ``not x > 0`` also rejects NaN.
    if not hop > 0:
        raise ValueError(f"hop must be a positive number of seconds, got {hop!r}")
    if not window > 0:
        raise ValueError(
            f"window must be a positive number of seconds, got {window!r}"
        )

    _log(f"audio_scan: loading {video_path}")
    y, sr = librosa.load(video_path, sr=_SR, mono=True)
    duration = len(y) / sr
    _log(f"audio_scan: {duration:.1f}s loaded, scanning with hop={hop}s")

    win_samples = int(window * sr)
    hop_samples = int(hop * sr)
    # Sub-sample sizes truncate to 0: a zero hop loops forever and a zero
    # window produces empty chunks, so treat both as invalid input.
    if hop_samples < 1:
        raise ValueError(f"hop of {hop!r}s is shorter than one sample at {sr} Hz")
    if win_samples < 1:
        raise ValueError(
            f"window of {window!r}s is shorter than one sample at {sr} Hz"
        )

    results = []
    pos = 0
    while pos + win_samples <= len(y):
        # Test for "no flag supplied" explicitly so that a flag object which
        # happens to be falsy (e.g. defines __bool__/__len__) is still honoured.
        if cancel_flag is not None and getattr(cancel_flag, "_cancel", False):
            _log("audio_scan: cancelled")
            return results

        chunk = y[pos : pos + win_samples]
        mfcc = librosa.feature.mfcc(y=chunk, sr=sr, n_mfcc=_N_MFCC)
        # Collapse the MFCC matrix to one vector per window.
        vec = mfcc.mean(axis=1)

        if mode == "nearest":
            score = max(
                _cosine_similarity(vec, cv) for cv in profile["clip_vectors"]
            )
        else:  # average
            score = _cosine_similarity(vec, profile["mean_vector"])

        if score >= threshold:
            start_t = pos / sr
            results.append((start_t, start_t + window, score))

        pos += hop_samples

    _log(f"audio_scan: {len(results)} regions above threshold {threshold}")
    return results
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import tempfile, os
|
import tempfile, os
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from core.audio_scan import build_profile, _extract_mfcc
|
from core.audio_scan import build_profile, _extract_mfcc, scan_video
|
||||||
|
|
||||||
|
|
||||||
def _make_wav(path: str, duration: float = 8.0, sr: int = 22050):
|
def _make_wav(path: str, duration: float = 8.0, sr: int = 22050):
|
||||||
@@ -68,3 +68,53 @@ def test_build_profile_skips_missing_files():
|
|||||||
def test_build_profile_empty_returns_none():
|
def test_build_profile_empty_returns_none():
|
||||||
result = build_profile([])
|
result = build_profile([])
|
||||||
assert result is None
|
assert result is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_scan_video_finds_matching_region():
    """A video made of the same sine wave as the reference should match."""
    # A temporary directory removes both files on exit, even if creating the
    # second file or building the profile raises part-way through setup.
    with tempfile.TemporaryDirectory() as tmp_dir:
        ref_path = os.path.join(tmp_dir, "ref.wav")
        vid_path = os.path.join(tmp_dir, "vid.wav")
        _make_wav(ref_path, duration=8.0)
        _make_wav(vid_path, duration=20.0)

        profile = build_profile([ref_path])
        regions = scan_video(
            vid_path, profile, mode="average", threshold=0.5, hop=1.0
        )

        assert len(regions) > 0
        for start, end, score in regions:
            # Every reported region spans exactly one default 8 s window.
            assert abs((end - start) - 8.0) < 1e-9
            assert score >= 0.5
|
||||||
|
|
||||||
|
|
||||||
|
def test_scan_video_nearest_mode():
    """"nearest" mode should also match a video of the reference sine wave."""
    # A temporary directory removes both files on exit, even if creating the
    # second file or building the profile raises part-way through setup.
    with tempfile.TemporaryDirectory() as tmp_dir:
        ref_path = os.path.join(tmp_dir, "ref.wav")
        vid_path = os.path.join(tmp_dir, "vid.wav")
        _make_wav(ref_path, duration=8.0)
        _make_wav(vid_path, duration=20.0)

        profile = build_profile([ref_path])
        regions = scan_video(
            vid_path, profile, mode="nearest", threshold=0.5, hop=1.0
        )

        assert len(regions) > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_scan_video_high_threshold_no_match():
    """Different frequencies with very high threshold should not match."""
    import soundfile as sf

    sr = 22050
    # A temporary directory removes both files on exit, even if writing the
    # second file or building the profile raises part-way through setup.
    with tempfile.TemporaryDirectory() as tmp_dir:
        ref_path = os.path.join(tmp_dir, "ref.wav")
        vid_path = os.path.join(tmp_dir, "vid.wav")

        # Reference: 8 s of a 440 Hz sine wave.
        t = np.linspace(0, 8.0, sr * 8, endpoint=False)
        sf.write(ref_path, 0.5 * np.sin(2 * np.pi * 440 * t), sr)

        # White noise — very different from sine wave. The generator is
        # seeded so the test sees the same signal on every run.
        rng = np.random.default_rng(0)
        noise = rng.standard_normal(sr * 20).astype(np.float32) * 0.1
        sf.write(vid_path, noise, sr)

        profile = build_profile([ref_path])
        regions = scan_video(
            vid_path, profile, mode="average", threshold=0.99, hop=1.0
        )

        assert len(regions) == 0
|
||||||
|
|||||||
Reference in New Issue
Block a user