feat: evaluate adapters on all dataset clips, not just clip_001

- _eval_sample gains a clip_idx param (default 0, backward compatible)
- Evaluator loops over all dataset clips per adapter and saves one WAV per clip
- Reference metrics are computed for all clips and averaged
- Comparison chart and summary use avg_metrics across all clips
- Eliminates bias from evaluating on an unrepresentative single clip

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-08 17:42:55 +02:00
parent 42ceb4b153
commit fdce9cbbf1
2 changed files with 102 additions and 77 deletions
@@ -93,17 +93,16 @@ def _load_npz(path: Path) -> dict:
 # ---------------------------------------------------------------------------

 def _eval_sample(generator, feature_utils_orig, dataset, seq_cfg, device, dtype,
-                 num_steps: int = 25, seed: int = 42):
-    """Run a quick no-CFG inference pass on a fixed training clip.
+                 num_steps: int = 25, seed: int = 42, clip_idx: int = 0):
+    """Run a quick no-CFG inference pass on a training clip.

-    Always uses dataset[0] and a fixed noise seed so samples across checkpoints
+    Uses dataset[clip_idx] and a fixed noise seed so samples across checkpoints
    are directly comparable — you can hear the model improve step by step.
    Returns (waveform [1, L] float32 cpu, sample_rate) or (None, None) on failure.
-    Uses fewer ODE steps than inference (8 vs 25) for speed.
    """
    generator.eval()
    try:
-        _, clip_f_cpu, sync_f_cpu, text_clip_cpu = dataset[0]
+        _, clip_f_cpu, sync_f_cpu, text_clip_cpu = dataset[clip_idx]
        clip_f    = clip_f_cpu.to(device, dtype)
        sync_f    = sync_f_cpu.to(device, dtype)
        text_clip = text_clip_cpu.to(device, dtype)