# 8-cut/core/audio_scan.py

"""Audio similarity scanning — MFCC + spectral contrast profile matching."""

import numpy as np
import librosa

from .paths import _log

_N_MFCC = 13            # MFCCs computed per frame (C0..C12); C0 is dropped, leaving 12 usable
_SR = 16_000            # analysis sample rate (Hz): lower sr = faster, no quality loss for style matching
_HOP_LENGTH = 1_024     # STFT hop in samples (~64 ms per frame at 16 kHz)
_N_FFT = 2_048          # STFT window length in samples
_WINDOW = 8.0           # default scan window, in seconds
_N_FEATURES = 62        # aggregated vector size: (12 mfcc + 12 delta + 7 sc) * 2 (mean + std)


def _extract_features_from_signal(y: np.ndarray, sr: int = _SR) -> np.ndarray:
    """Build the per-frame feature matrix for a raw audio signal.

    A single power-spectrogram STFT is computed and reused by every feature.
    Rows of the result, in order: 12 MFCCs (C0 removed), 12 delta-MFCCs and
    7 spectral-contrast rows, giving a (31, T) array where T is the number
    of STFT frames.
    """
    power_spec = np.abs(
        librosa.stft(y, n_fft=_N_FFT, hop_length=_HOP_LENGTH)
    ) ** 2

    # MFCCs are derived from the log-scaled mel projection of the shared STFT.
    mel_power = librosa.feature.melspectrogram(
        S=power_spec, sr=sr, hop_length=_HOP_LENGTH
    )
    log_mel = librosa.power_to_db(mel_power)
    all_cepstra = librosa.feature.mfcc(S=log_mel, sr=sr, n_mfcc=_N_MFCC)

    # C0 (energy) is dropped — it dominates the comparison and kills
    # discrimination between otherwise different material.
    cepstra = all_cepstra[1:]
    cepstra_delta = librosa.feature.delta(cepstra)

    contrast = librosa.feature.spectral_contrast(
        S=power_spec, sr=sr, hop_length=_HOP_LENGTH
    )

    return np.vstack((cepstra, cepstra_delta, contrast))  # (31, T)


def _aggregate(feature_matrix: np.ndarray) -> np.ndarray:
    """Summarise a (rows, T) feature matrix over time.

    Each row is reduced to its mean and its (population) standard deviation
    along the frame axis; the result lists all means first, then all stds.
    For the (31, T) matrix used in this module that is a (62,) vector.
    """
    row_means = np.mean(feature_matrix, axis=1)
    row_stds = np.std(feature_matrix, axis=1)
    return np.concatenate((row_means, row_stds))


def _extract_features(path: str, sr: int = _SR) -> np.ndarray:
    """Decode the audio at *path* and reduce it to one 62-dim feature vector.

    The file is loaded as mono at sample rate *sr*, turned into a per-frame
    feature matrix, and then collapsed over time (mean + std per feature).
    """
    # librosa.load also returns the sample rate; it equals the requested sr.
    signal, _loaded_sr = librosa.load(path, sr=sr, mono=True)
    return _aggregate(_extract_features_from_signal(signal, sr))


def build_profile(clip_paths: list[str]) -> dict | None:
    """Extract features from reference clips.

    Returns dict with:
      - mean_vector: averaged feature vector across all clips (62,)
      - clip_vectors: list of individual feature vectors
    Returns None if no clips could be loaded.
    """
    vectors = []
    for p in clip_paths:
        try:
            vec = _extract_features(p)
            vectors.append(vec)
        except Exception as e:
            _log(f"audio_scan: skip {p}: {e}")
    if not vectors:
        return None
    arr = np.stack(vectors)
    return {
        "mean_vector": arr.mean(axis=0),
        "clip_vectors": vectors,
    }


def _similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Map the Euclidean distance between *a* and *b* to a score in (0, 1].

    The score is 1 / (1 + distance): identical vectors give exactly 1.0 and
    the score decays towards 0 as the vectors move apart.
    """
    distance = np.linalg.norm(a - b)
    return float(1.0 / (1.0 + distance))


def scan_video(
    video_path: str,
    profile: dict,
    mode: str = "average",
    threshold: float = 0.05,
    hop: float = 1.0,
    window: float = _WINDOW,
    cancel_flag: object = None,
) -> list[tuple[float, float, float]]:
    """Slide a window across the video audio and score against the profile.

    Features are computed once for the whole file (one STFT); per-window
    mean/std vectors are then derived from prefix sums, so the cost of a
    window does not depend on its length.

    Args:
        video_path: path to video/audio file
        profile: dict from build_profile() ("mean_vector", "clip_vectors")
        mode: "nearest" scores each window against its closest clip vector;
            any other value scores against the profile's mean vector
        threshold: minimum similarity to include (0-1, default 0.05)
        hop: step size in seconds; values shorter than one STFT frame are
            rounded up to one frame
        window: window size in seconds (default 8s); values shorter than one
            STFT frame are rounded up to one frame
        cancel_flag: object with _cancel bool attribute; checked periodically

    Returns:
        list of (start_time, end_time, score) for regions above threshold,
        in ascending start order. Empty if the audio is shorter than the
        window or the scan is cancelled before scoring starts; a scan
        cancelled during "nearest" scoring returns the hits found so far.

    Raises:
        ValueError: if hop or window is not positive.
    """
    # Reject nonsensical geometry before paying for the decode.
    if hop <= 0 or window <= 0:
        raise ValueError(
            f"hop and window must be positive (got hop={hop}, window={window})"
        )

    def _cancelled() -> bool:
        return bool(cancel_flag and getattr(cancel_flag, '_cancel', False))

    _log(f"audio_scan: loading {video_path}")
    y, sr = librosa.load(video_path, sr=_SR, mono=True)
    duration = len(y) / sr
    _log(f"audio_scan: {duration:.1f}s loaded, extracting features...")

    if len(y) == 0:
        _log("audio_scan: no audio samples decoded")
        return []

    if _cancelled():
        return []

    # Compute features for the entire file at once (one STFT).  Force float64:
    # the prefix sums below are subtracted from each other, so accumulating
    # in a narrower dtype would lose precision on long files.
    feat = _extract_features_from_signal(y, sr).astype(np.float64, copy=False)
    n_feats, T = feat.shape
    fps = sr / _HOP_LENGTH  # frames per second

    # int() truncates; clamp to one frame so a sub-frame hop cannot produce a
    # zero step (np.arange would fail) and a sub-frame window cannot produce
    # a zero-length window (0/0 -> NaN scores).
    win_frames = max(1, int(window * fps))
    hop_frames = max(1, int(hop * fps))

    if win_frames > T:
        _log("audio_scan: video shorter than window")
        return []

    _log(f"audio_scan: scanning {T} frames, win={win_frames}, hop={hop_frames}")

    # Prefix sums of x and x^2 with a leading zero column, so that the sum
    # over frames [s, e) is simply prefix[e] - prefix[s].
    cumsum = np.zeros((n_feats, T + 1))
    cumsum[:, 1:] = np.cumsum(feat, axis=1)
    cumsq = np.zeros((n_feats, T + 1))
    cumsq[:, 1:] = np.cumsum(feat ** 2, axis=1)

    starts = np.arange(0, T - win_frames + 1, hop_frames)
    ends = starts + win_frames

    sums = cumsum[:, ends] - cumsum[:, starts]        # (n_feats, n_windows)
    sq_sums = cumsq[:, ends] - cumsq[:, starts]
    means = sums / win_frames
    # var = E[x^2] - E[x]^2; clip tiny negatives caused by rounding.
    stds = np.sqrt(np.maximum(sq_sums / win_frames - means ** 2, 0) + 1e-10)

    window_vectors = np.vstack([means, stds]).T  # (n_windows, 2 * n_feats)

    if _cancelled():
        return []

    results: list[tuple[float, float, float]] = []

    if mode == "nearest":
        # Compare each window to every clip vector, keep the closest clip.
        clip_vecs = np.stack(profile["clip_vectors"])  # (n_clips, 62)
        # Process in batches to bound memory and poll cancel_flag periodically.
        batch = 500
        for i in range(0, len(window_vectors), batch):
            if _cancelled():
                _log("audio_scan: cancelled")
                return results
            chunk = window_vectors[i:i + batch]
            # (batch, n_clips) pairwise distances
            dists = np.linalg.norm(chunk[:, None, :] - clip_vecs[None, :, :], axis=2)
            scores = 1.0 / (1.0 + dists.min(axis=1))  # min dist = max similarity
            for j in np.nonzero(scores >= threshold)[0]:
                start_t = float(starts[i + j] / fps)
                results.append((start_t, start_t + window, float(scores[j])))
    else:
        # Average mode: compare to mean vector
        ref = profile["mean_vector"]
        dists = np.linalg.norm(window_vectors - ref, axis=1)
        scores = 1.0 / (1.0 + dists)
        for i in np.nonzero(scores >= threshold)[0]:
            start_t = float(starts[i] / fps)
            results.append((start_t, start_t + window, float(scores[i])))

    _log(f"audio_scan: {len(results)} regions above threshold {threshold}")
    return results