diff --git a/core/audio_scan.py b/core/audio_scan.py index 43eeff0..bc615e8 100644 --- a/core/audio_scan.py +++ b/core/audio_scan.py @@ -10,10 +10,14 @@ _SR = 22050 def _extract_mfcc(path: str, sr: int = _SR) -> np.ndarray: - """Load audio from a file and return a mean MFCC vector (20-dim).""" + """Load audio from a file and return an MFCC feature vector (40-dim). + + Concatenates mean + std of each coefficient over time. + Mean captures average spectral content; std captures dynamics. + """ y, _ = librosa.load(path, sr=sr, mono=True) mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=_N_MFCC) - return mfcc.mean(axis=1) # average over time → (20,) + return np.concatenate([mfcc.mean(axis=1), mfcc.std(axis=1)]) def build_profile(clip_paths: list[str]) -> dict | None: @@ -95,7 +99,7 @@ def scan_video( chunk = y[pos : pos + win_samples] mfcc = librosa.feature.mfcc(y=chunk, sr=sr, n_mfcc=_N_MFCC) - vec = mfcc.mean(axis=1) + vec = np.concatenate([mfcc.mean(axis=1), mfcc.std(axis=1)]) if mode == "nearest": score = max( diff --git a/tests/test_audio_scan.py b/tests/test_audio_scan.py index 52e534c..b7961aa 100644 --- a/tests/test_audio_scan.py +++ b/tests/test_audio_scan.py @@ -16,7 +16,7 @@ def test_extract_mfcc_returns_1d_vector(): _make_wav(f.name) try: vec = _extract_mfcc(f.name) - assert vec.shape == (20,) + assert vec.shape == (40,) assert not np.isnan(vec).any() finally: os.unlink(f.name) @@ -29,7 +29,7 @@ def test_build_profile_single_clip(): profile = build_profile([f.name]) assert "mean_vector" in profile assert "clip_vectors" in profile - assert profile["mean_vector"].shape == (20,) + assert profile["mean_vector"].shape == (40,) assert len(profile["clip_vectors"]) == 1 finally: os.unlink(f.name) @@ -49,7 +49,7 @@ def test_build_profile_multiple_clips(): profile = build_profile(paths) assert len(profile["clip_vectors"]) == 3 - assert profile["mean_vector"].shape == (20,) + assert profile["mean_vector"].shape == (40,) finally: for p in paths: os.unlink(p)