feat: cache pre-generated LoRA mels to disk for reuse

LoRA mel pre-generation runs a full ODE+CFG for every clip, which is slow. Cache results to a .pt file next to the output, keyed by a SHA-256 hash of the LoRA adapter content + generation parameters (seed, steps, CFG, duration, sample rate, npz file list). Automatically reused on subsequent runs when parameters haven't changed. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-10 00:30:20 +02:00
parent 0854bd2638
commit 4e6cc4d519
1 changed files with 57 additions and 1 deletions
@@ -27,6 +27,8 @@ BigVGAN checkpoint so it can be loaded with SelVA BigVGAN Loader.
 """
 import copy
 import hashlib
 import json as _json
 import random
 import threading
 from pathlib import Path
@@ -373,9 +375,34 @@ def _find_audio_for_npz(npz_path: Path):
    return None
 def _lora_mel_cache_key(lora_adapter_path, data_dir, seed, num_steps,
                        cfg_strength, duration, sample_rate):
    """Build a deterministic hash from all parameters that affect LoRA mel generation."""
    # Hash the LoRA adapter file content (not path — same file moved = same cache)
    h = hashlib.sha256()
    with open(lora_adapter_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    lora_hash = h.hexdigest()[:16]
    # Hash the sorted .npz file list (names only — content is deterministic per name)
    npz_names = sorted(p.name for p in Path(data_dir).glob("*.npz"))
    key_data = _json.dumps({
        "lora_hash": lora_hash,
        "npz_files": npz_names,
        "seed": seed,
        "num_steps": num_steps,
        "cfg_strength": cfg_strength,
        "duration": duration,
        "sample_rate": sample_rate,
    }, sort_keys=True)
    return hashlib.sha256(key_data.encode()).hexdigest()[:20]
 def _pregenerate_lora_mels(model, data_dir, lora_adapter_path, device, dtype,
                           sample_rate, duration, seed=42, num_steps=25,
-                           cfg_strength=4.5):
+                           cfg_strength=4.5, cache_dir=None):
    """Generate LoRA mels for all clips with matching audio in data_dir.
    Uses the LoRA adapter to run full ODE generation with CFG → VAE decode →
@@ -383,8 +410,26 @@ def _pregenerate_lora_mels(model, data_dir, lora_adapter_path, device, dtype,
    default (4.5) so the degraded mels the vocoder trains on are representative
    of what it will see at inference time.
    If cache_dir is provided, results are cached to disk and reused when
    generation parameters haven't changed.
    Returns list of (mel [n_mels, T_mel], audio [L]) CPU tensors.
    """
    # ── Check cache ──────────────────────────────────────────────────────────
    cache_path = None
    if cache_dir is not None:
        cache_key = _lora_mel_cache_key(
            lora_adapter_path, data_dir, seed, num_steps,
            cfg_strength, duration, sample_rate,
        )
        cache_path = Path(cache_dir) / f"lora_mels_{cache_key}.pt"
        if cache_path.exists():
            print(f"[BigVGAN] Loading cached LoRA mels: {cache_path.name}", flush=True)
            cached = torch.load(str(cache_path), map_location="cpu", weights_only=True)
            pairs = [(m, a) for m, a in zip(cached["mels"], cached["audios"])]
            print(f"[BigVGAN] Loaded {len(pairs)} cached mel/audio pairs", flush=True)
            return pairs
    from selva_core.model.lora import apply_lora, load_lora
    from selva_core.model.flow_matching import FlowMatching
@@ -524,6 +569,16 @@ def _pregenerate_lora_mels(model, data_dir, lora_adapter_path, device, dtype,
        soft_empty_cache()
    print(f"[BigVGAN] Pre-generated {len(pairs)} LoRA mel / clean audio pairs", flush=True)
    # ── Save cache ───────────────────────────────────────────────────────────
    if cache_path is not None and pairs:
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        torch.save({
            "mels":   [m for m, _ in pairs],
            "audios": [a for _, a in pairs],
        }, str(cache_path))
        print(f"[BigVGAN] Cached LoRA mels: {cache_path.name}", flush=True)
    return pairs
@@ -756,6 +811,7 @@ class SelvaBigvganTrainer:
                        model, data_dir, str(lora_path),
                        device, dtype, sample_rate,
                        seq_cfg.duration, seed=seed,
                        cache_dir=out_path.parent,
                    )
                    if not lora_mel_pairs:
                        raise RuntimeError(