fix: guard model cleanup in try/finally and fix DiTWrapper comments
- Wrap the training loop in try/finally so _unapply_lora always runs. Without this, an exception mid-training would leave LoRALinear wrappers in the cached DiTWrapper; a subsequent training run would then apply LoRA on top of the existing LoRA, silently doubling the effective rank.
- Fix misleading comments: diffusion.model is a DiTWrapper (not a DiffusionTransformer). The DiffusionTransformer lives at diffusion.model.model; _apply_lora reaches it recursively, but the direct attribute is the wrapper.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -95,7 +95,7 @@ class PrismAudioLoRALoader:
|
|||||||
# Merge LoRA weights in-place into the DiT's base linear layers.
|
# Merge LoRA weights in-place into the DiT's base linear layers.
|
||||||
# ComfyUI re-executes the upstream ModelLoader on the next queue run
|
# ComfyUI re-executes the upstream ModelLoader on the next queue run
|
||||||
# when inputs change, providing a fresh base model as needed.
|
# when inputs change, providing a fresh base model as needed.
|
||||||
dit = model["model"].model # DiffusionTransformer
|
dit = model["model"].model # DiTWrapper
|
||||||
|
|
||||||
if strength == 0.0:
|
if strength == 0.0:
|
||||||
print("[PrismAudio] LoRA strength=0.0 — skipping merge, base model unchanged.", flush=True)
|
print("[PrismAudio] LoRA strength=0.0 — skipping merge, base model unchanged.", flush=True)
|
||||||
|
|||||||
+62
-58
@@ -176,7 +176,7 @@ class PrismAudioLoRATrainer:
|
|||||||
diffusion.pretransform.to(device)
|
diffusion.pretransform.to(device)
|
||||||
|
|
||||||
# Freeze all DiT params, then apply LoRA (adds trainable lora_A/lora_B)
|
# Freeze all DiT params, then apply LoRA (adds trainable lora_A/lora_B)
|
||||||
dit = diffusion.model # DiffusionTransformer
|
dit = diffusion.model # DiTWrapper
|
||||||
for p in dit.parameters():
|
for p in dit.parameters():
|
||||||
p.requires_grad_(False)
|
p.requires_grad_(False)
|
||||||
|
|
||||||
@@ -205,76 +205,80 @@ class PrismAudioLoRATrainer:
|
|||||||
|
|
||||||
pbar = comfy.utils.ProgressBar(train_steps)
|
pbar = comfy.utils.ProgressBar(train_steps)
|
||||||
|
|
||||||
for step in range(1, train_steps + 1):
|
try:
|
||||||
npz_path, audio_path = random.choice(pairs)
|
for step in range(1, train_steps + 1):
|
||||||
|
npz_path, audio_path = random.choice(pairs)
|
||||||
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
# Encode audio to latent space
|
# Encode audio to latent space
|
||||||
audio = _load_audio(audio_path, device)
|
audio = _load_audio(audio_path, device)
|
||||||
x0 = diffusion.pretransform.encode(audio.float()).to(dtype) # [1, 64, L]
|
x0 = diffusion.pretransform.encode(audio.float()).to(dtype) # [1, 64, L]
|
||||||
|
|
||||||
# Build conditioning from features
|
# Build conditioning from features
|
||||||
metadata = (_load_metadata(npz_path, device, dtype),)
|
metadata = (_load_metadata(npz_path, device, dtype),)
|
||||||
conditioning = diffusion.conditioner(metadata, device)
|
conditioning = diffusion.conditioner(metadata, device)
|
||||||
cond_inputs = diffusion.get_conditioning_inputs(conditioning)
|
cond_inputs = diffusion.get_conditioning_inputs(conditioning)
|
||||||
|
|
||||||
# Rectified flow: interpolate between data and noise
|
# Rectified flow: interpolate between data and noise
|
||||||
t = torch.rand(x0.shape[0], device=device, dtype=dtype) # [1]
|
t = torch.rand(x0.shape[0], device=device, dtype=dtype) # [1]
|
||||||
noise = torch.randn_like(x0)
|
noise = torch.randn_like(x0)
|
||||||
# t expanded for broadcast: [1] -> [1, 1, 1]
|
# t expanded for broadcast: [1] -> [1, 1, 1]
|
||||||
t_bcast = t[:, None, None]
|
t_bcast = t[:, None, None]
|
||||||
x_t = (1.0 - t_bcast) * x0 + t_bcast * noise
|
x_t = (1.0 - t_bcast) * x0 + t_bcast * noise
|
||||||
v_target = noise - x0
|
v_target = noise - x0
|
||||||
|
|
||||||
with torch.amp.autocast(device_type=device.type, dtype=dtype):
|
with torch.amp.autocast(device_type=device.type, dtype=dtype):
|
||||||
v_pred = dit(x_t, t,
|
v_pred = dit(x_t, t,
|
||||||
cfg_scale=1.0,
|
cfg_scale=1.0,
|
||||||
cfg_dropout_prob=cfg_dropout_prob,
|
cfg_dropout_prob=cfg_dropout_prob,
|
||||||
**cond_inputs)
|
**cond_inputs)
|
||||||
|
|
||||||
loss = F.mse_loss(v_pred.float(), v_target.float())
|
loss = F.mse_loss(v_pred.float(), v_target.float())
|
||||||
|
|
||||||
if use_scaler:
|
if use_scaler:
|
||||||
scaler.scale(loss).backward()
|
scaler.scale(loss).backward()
|
||||||
scaler.step(optimizer)
|
scaler.step(optimizer)
|
||||||
scaler.update()
|
scaler.update()
|
||||||
else:
|
else:
|
||||||
loss.backward()
|
loss.backward()
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
optimizer.zero_grad()
|
optimizer.zero_grad()
|
||||||
|
|
||||||
if step % 50 == 0:
|
if step % 50 == 0:
|
||||||
print(f"[PrismAudio] step {step}/{train_steps} loss={loss.item():.6f}", flush=True)
|
print(f"[PrismAudio] step {step}/{train_steps} loss={loss.item():.6f}", flush=True)
|
||||||
|
|
||||||
if step % save_every == 0:
|
if step % save_every == 0:
|
||||||
ckpt_path = output_path.replace(".safetensors", f"_step{step}.safetensors")
|
ckpt_path = output_path.replace(".safetensors", f"_step{step}.safetensors")
|
||||||
save_file(_get_lora_state_dict(dit), ckpt_path)
|
save_file(_get_lora_state_dict(dit), ckpt_path)
|
||||||
print(f"[PrismAudio] Checkpoint: {ckpt_path}", flush=True)
|
print(f"[PrismAudio] Checkpoint: {ckpt_path}", flush=True)
|
||||||
|
|
||||||
pbar.update(1)
|
pbar.update(1)
|
||||||
|
|
||||||
# Save final weights
|
# Save final weights
|
||||||
save_file(_get_lora_state_dict(dit), output_path)
|
save_file(_get_lora_state_dict(dit), output_path)
|
||||||
|
|
||||||
# Save config alongside weights so the loader knows the structure
|
# Save config alongside weights so the loader knows the structure
|
||||||
config_path = output_path.replace(".safetensors", "_config.json")
|
config_path = output_path.replace(".safetensors", "_config.json")
|
||||||
with open(config_path, "w") as f:
|
with open(config_path, "w") as f:
|
||||||
json.dump({
|
json.dump({
|
||||||
"rank": lora_rank,
|
"rank": lora_rank,
|
||||||
"alpha": lora_alpha,
|
"alpha": lora_alpha,
|
||||||
"target_modules": sorted(target_attrs),
|
"target_modules": sorted(target_attrs),
|
||||||
}, f, indent=2)
|
}, f, indent=2)
|
||||||
|
|
||||||
print(f"[PrismAudio] LoRA saved: {output_path}", flush=True)
|
print(f"[PrismAudio] LoRA saved: {output_path}", flush=True)
|
||||||
|
|
||||||
# Restore model to base state (remove LoRA wrappers, restore original linears)
|
finally:
|
||||||
dit.eval()
|
# Always restore model to base state — even on exception.
|
||||||
_unapply_lora(dit)
|
# Without this, LoRA wrappers would persist in the cached model and
|
||||||
|
# subsequent training runs would apply LoRA on top of existing LoRA.
|
||||||
|
dit.eval()
|
||||||
|
_unapply_lora(dit)
|
||||||
|
|
||||||
if strategy == "offload_to_cpu":
|
if strategy == "offload_to_cpu":
|
||||||
diffusion.model.to(get_offload_device())
|
diffusion.model.to(get_offload_device())
|
||||||
diffusion.conditioner.to(get_offload_device())
|
diffusion.conditioner.to(get_offload_device())
|
||||||
diffusion.pretransform.to(get_offload_device())
|
diffusion.pretransform.to(get_offload_device())
|
||||||
soft_empty_cache()
|
soft_empty_cache()
|
||||||
|
|
||||||
return (output_path,)
|
return (output_path,)
|
||||||
|
|||||||
Reference in New Issue
Block a user