8ab5bdba77
Mean-only vectors were too similar across different audio segments, causing everything to match even at threshold 0.99. Adding std captures temporal dynamics and makes the similarity scores much more spread out. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
119 lines
3.6 KiB
Python
119 lines
3.6 KiB
Python
"""Audio similarity scanning — MFCC-based profile matching."""
|
|
|
|
import numpy as np
|
|
import librosa
|
|
|
|
from .paths import _log
|
|
|
|
# Number of MFCC coefficients requested from librosa.feature.mfcc (n_mfcc).
_N_MFCC = 20
|
|
# Sample rate handed to librosa.load, both for reference clips (as the
# default of _extract_mfcc) and for the audio being scanned.
_SR = 22050
|
|
|
|
|
|
def _extract_mfcc(path: str, sr: int = _SR) -> np.ndarray:
    """Decode an audio file and summarise it as one MFCC feature vector.

    The file is loaded as mono audio at sample rate ``sr`` and ``_N_MFCC``
    MFCC coefficients are computed for it.  Every coefficient is then
    reduced along axis 1 to two statistics:

    - its mean (average spectral content), and
    - its standard deviation (how much it varies, i.e. the dynamics).

    Returns:
        A 1-D array holding all the means followed by all the standard
        deviations, i.e. ``2 * _N_MFCC`` values (40 with ``_N_MFCC = 20``).
    """
    samples, _ = librosa.load(path, sr=sr, mono=True)
    coeffs = librosa.feature.mfcc(y=samples, sr=sr, n_mfcc=_N_MFCC)
    per_coeff_mean = coeffs.mean(axis=1)
    per_coeff_std = coeffs.std(axis=1)
    return np.concatenate([per_coeff_mean, per_coeff_std])
|
|
|
|
|
|
def build_profile(clip_paths: list[str]) -> dict | None:
|
|
"""Extract MFCCs from reference clips.
|
|
|
|
Returns dict with:
|
|
- mean_vector: averaged MFCC across all clips (20,)
|
|
- clip_vectors: list of individual MFCC vectors
|
|
Returns None if no clips could be loaded.
|
|
"""
|
|
vectors = []
|
|
for p in clip_paths:
|
|
try:
|
|
vec = _extract_mfcc(p)
|
|
vectors.append(vec)
|
|
except Exception as e:
|
|
_log(f"audio_scan: skip {p}: {e}")
|
|
if not vectors:
|
|
return None
|
|
arr = np.stack(vectors)
|
|
return {
|
|
"mean_vector": arr.mean(axis=0),
|
|
"clip_vectors": vectors,
|
|
}
|
|
|
|
|
|
def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    The result lies in [-1, 1]: 1 for vectors pointing the same way, values
    around 0 for uncorrelated (or weakly anti-correlated) vectors, and
    negative values for anti-correlated, i.e. very dissimilar, vectors.
    Negative scores need no special handling when filtering by a threshold,
    because they never reach it.

    If either vector has zero length the similarity is reported as 0.0.
    """
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a != 0 and norm_b != 0:
        return float(np.dot(a, b) / (norm_a * norm_b))
    # A zero vector has no direction, so avoid dividing by zero.
    return 0.0
|
|
|
|
|
|
def scan_video(
    video_path: str,
    profile: dict,
    mode: str = "average",
    threshold: float = 0.7,
    hop: float = 1.0,
    window: float = 8.0,
    cancel_flag: object = None,
) -> list[tuple[float, float, float]]:
    """Slide a window across the video audio and score it against the profile.

    Each window gets the same feature vector as the reference clips (mean and
    std of every MFCC coefficient) and is compared by cosine similarity.

    Args:
        video_path: path to video/audio file
        profile: dict from build_profile()
        mode: "nearest" scores a window as the maximum similarity over all
            ``profile["clip_vectors"]``; any other value (including the
            default "average") compares against ``profile["mean_vector"]``
        threshold: minimum cosine similarity to include
        hop: step size in seconds; must be > 0
        window: window size in seconds (default 8s); must be > 0
        cancel_flag: object with _cancel bool attribute; checked each
            iteration.  When it is set the regions found so far are returned.

    Returns:
        list of (start_time, end_time, score) for regions above threshold.
        Only full windows are scored, so audio shorter than ``window`` (and
        any trailing partial window) yields no entries.

    Raises:
        ValueError: if ``hop`` or ``window`` is not a positive number.
    """
    # Reject bad step/size values before the (slow) decode.  A hop of zero
    # used to leave the scan position unchanged and loop forever.
    if not hop > 0:
        raise ValueError(f"hop must be a positive number of seconds, got {hop!r}")
    if not window > 0:
        raise ValueError(
            f"window must be a positive number of seconds, got {window!r}"
        )

    _log(f"audio_scan: loading {video_path}")
    y, sr = librosa.load(video_path, sr=_SR, mono=True)
    duration = len(y) / sr
    _log(f"audio_scan: {duration:.1f}s loaded, scanning with hop={hop}s")

    # A tiny positive hop/window can truncate to 0 samples; clamp to one
    # sample so the scan always advances and never slices an empty chunk.
    win_samples = max(1, int(window * sr))
    hop_samples = max(1, int(hop * sr))

    # The reference vectors do not change during the scan, so pick them once.
    if mode == "nearest":
        references = profile["clip_vectors"]
    else:  # average
        references = [profile["mean_vector"]]

    results = []
    # Start positions of every window that fits completely inside the audio.
    for pos in range(0, len(y) - win_samples + 1, hop_samples):
        if cancel_flag is not None and getattr(cancel_flag, "_cancel", False):
            _log("audio_scan: cancelled")
            return results

        chunk = y[pos : pos + win_samples]
        mfcc = librosa.feature.mfcc(y=chunk, sr=sr, n_mfcc=_N_MFCC)
        vec = np.concatenate([mfcc.mean(axis=1), mfcc.std(axis=1)])

        # With a single reference (average mode) max() is just that score.
        score = max(_cosine_similarity(vec, ref) for ref in references)

        if score >= threshold:
            start_t = pos / sr
            results.append((start_t, start_t + window, score))

    _log(f"audio_scan: {len(results)} regions above threshold {threshold}")
    return results
|