feat: add scan_video with average and nearest modes

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-17 08:50:47 +02:00
parent e17d8f67aa
commit 9cf9e3233f
2 changed files with 125 additions and 1 deletions
+74
View File
@@ -38,3 +38,77 @@ def build_profile(clip_paths: list[str]) -> dict | None:
"mean_vector": arr.mean(axis=0),
"clip_vectors": vectors,
}
def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine of the angle between ``a`` and ``b``.

    The value is the dot product divided by the product of the two
    Euclidean norms, so it nominally lies in [-1, 1]: 1 for vectors that
    point the same way, -1 for opposite directions, and values around 0
    for vectors that are close to orthogonal.

    When either vector has zero length the cosine is undefined; 0.0 is
    returned in that case instead of dividing by zero.
    """
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    # Only divide when both lengths are non-zero.
    if norm_a != 0 and norm_b != 0:
        return float(np.dot(a, b) / (norm_a * norm_b))
    return 0.0
def scan_video(
    video_path: str,
    profile: dict,
    mode: str = "average",
    threshold: float = 0.7,
    hop: float = 1.0,
    window: float = 8.0,
    cancel_flag: object = None,
) -> list[tuple[float, float, float]]:
    """Slide a window across the video's audio and score it against a profile.

    The audio is loaded with ``librosa.load(sr=_SR, mono=True)``. For each
    window position an MFCC matrix is computed, averaged along axis 1 into
    a single vector, and compared with the profile by cosine similarity.
    Only complete windows are scored, so audio shorter than ``window``
    (and any trailing remainder) yields no results.

    Args:
        video_path: path to the video/audio file.
        profile: dict from build_profile(); ``"mean_vector"`` is read in
            average mode and ``"clip_vectors"`` in nearest mode.
        mode: "nearest" scores a window by its best match over all clip
            vectors. Any other value, including the default "average",
            compares against the mean vector.
        threshold: minimum cosine similarity for a window to be reported.
        hop: step between window starts, in seconds. Must be > 0.
        window: window length in seconds (default 8s). Must be > 0.
        cancel_flag: optional object with a ``_cancel`` bool attribute. It
            is checked before every window; when true, the scan stops and
            the results collected so far are returned.

    Returns:
        list of (start_time, end_time, score) tuples, in seconds, for the
        windows whose score is at or above ``threshold``.

    Raises:
        ValueError: if ``hop`` or ``window`` is not a positive number, or
            if ``window`` is shorter than one audio sample.
    """
    # Validate before the potentially slow audio load. A non-positive hop
    # used to produce a zero step and an endless loop. ``not x > 0`` is
    # used so that NaN is rejected as well.
    if not hop > 0:
        raise ValueError(f"hop must be a positive number of seconds, got {hop!r}")
    if not window > 0:
        raise ValueError(
            f"window must be a positive number of seconds, got {window!r}"
        )

    _log(f"audio_scan: loading {video_path}")
    y, sr = librosa.load(video_path, sr=_SR, mono=True)
    duration = len(y) / sr
    _log(f"audio_scan: {duration:.1f}s loaded, scanning with hop={hop}s")

    win_samples = int(window * sr)
    if win_samples < 1:
        raise ValueError(
            f"window of {window!r}s is shorter than one sample at {sr} Hz"
        )
    # A positive hop below one sample truncates to 0, which would stall the
    # loop; advance by at least one sample per iteration.
    hop_samples = max(1, int(hop * sr))

    results: list[tuple[float, float, float]] = []
    pos = 0
    while pos + win_samples <= len(y):
        # getattr on None falls through to the default, so no separate
        # truthiness test is needed (a falsy flag object is still honoured).
        if getattr(cancel_flag, "_cancel", False):
            _log("audio_scan: cancelled")
            return results

        chunk = y[pos : pos + win_samples]
        mfcc = librosa.feature.mfcc(y=chunk, sr=sr, n_mfcc=_N_MFCC)
        vec = mfcc.mean(axis=1)

        if mode == "nearest":
            score = max(
                _cosine_similarity(vec, cv) for cv in profile["clip_vectors"]
            )
        else:  # average (also the fallback for unrecognised modes)
            score = _cosine_similarity(vec, profile["mean_vector"])

        if score >= threshold:
            start_t = pos / sr
            results.append((start_t, start_t + window, score))
        pos += hop_samples

    _log(f"audio_scan: {len(results)} regions above threshold {threshold}")
    return results