feat: add grad norm logging and richer experiment summary output
trainer:
- Track gradient norm before clipping at each optimizer step
- Log avg grad_norm per log_interval alongside loss in console output
- Include grad_norm_history in _train_inner return dict

scheduler:
- Add system block to summary (GPU name, VRAM, torch/CUDA version)
- Include full loss_history and grad_norm_history arrays in each experiment result (50-step resolution, not just save_every checkpoints)
- Add loss_std_last_quarter stability metric (std dev of raw loss over the last 25% of steps; a high value indicates unstable training)
- Add log_interval field so consumers know the x-axis resolution

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -44,6 +44,24 @@ from .selva_lora_trainer import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_system_info() -> dict:
    """Collect GPU / torch version info for the summary header.

    Returns:
        A dict with four keys:

        * ``torch_version`` -- ``torch.__version__`` as a plain string.
        * ``cuda_version`` -- ``torch.version.cuda``, or ``"N/A"`` when torch
          reports no CUDA version.
        * ``gpu_name`` -- name of CUDA device 0, or ``None``.
        * ``gpu_vram_gb`` -- total memory of CUDA device 0 in decimal
          gigabytes (bytes / 1e9) rounded to one decimal place, or ``None``.

        The GPU fields stay ``None`` when CUDA is unavailable or when the
        device query fails; this function never raises for that reason.
    """
    info: dict = {
        # str() keeps the value a plain string for serialization.
        "torch_version": str(torch.__version__),
        "cuda_version": torch.version.cuda or "N/A",
        "gpu_name": None,
        "gpu_vram_gb": None,
    }

    if not torch.cuda.is_available():
        return info

    try:
        gpu_name = torch.cuda.get_device_name(0)
        props = torch.cuda.get_device_properties(0)
    except Exception as exc:
        # Deliberately best-effort: system info is cosmetic and must not
        # abort a run. Record the failure instead of silently dropping it so
        # a summary with null GPU fields can be explained afterwards.
        import logging

        logging.getLogger(__name__).warning(
            "Could not query CUDA device 0 for the system summary: %s", exc
        )
        return info

    info["gpu_name"] = gpu_name
    info["gpu_vram_gb"] = round(props.total_memory / 1e9, 1)
    return info
|
||||||
|
|
||||||
|
|
||||||
# Defaults mirror SelvaLoraTrainer INPUT_TYPES defaults
|
# Defaults mirror SelvaLoraTrainer INPUT_TYPES defaults
|
||||||
_PARAM_DEFAULTS = {
|
_PARAM_DEFAULTS = {
|
||||||
"alpha": 0.0,
|
"alpha": 0.0,
|
||||||
@@ -286,6 +304,7 @@ class SelvaLoraScheduler:
|
|||||||
"sweep_file": str(exp_path),
|
"sweep_file": str(exp_path),
|
||||||
"started_at": datetime.now(timezone.utc).isoformat(),
|
"started_at": datetime.now(timezone.utc).isoformat(),
|
||||||
"completed_at": None,
|
"completed_at": None,
|
||||||
|
"system": _get_system_info(),
|
||||||
"data_dir": str(data_dir),
|
"data_dir": str(data_dir),
|
||||||
"n_clips": n_clips,
|
"n_clips": n_clips,
|
||||||
"experiments": [],
|
"experiments": [],
|
||||||
@@ -375,22 +394,35 @@ class SelvaLoraScheduler:
|
|||||||
|
|
||||||
duration = time.monotonic() - t_start
|
duration = time.monotonic() - t_start
|
||||||
loss_history = r["loss_history"]
|
loss_history = r["loss_history"]
|
||||||
|
grad_norm_history = r.get("grad_norm_history", [])
|
||||||
smoothed = _smooth_losses(loss_history) if loss_history else []
|
smoothed = _smooth_losses(loss_history) if loss_history else []
|
||||||
|
|
||||||
# Compute summary metrics
|
# Scalar summary metrics
|
||||||
final_loss = round(smoothed[-1], 6) if smoothed else None
|
final_loss = round(smoothed[-1], 6) if smoothed else None
|
||||||
min_loss = round(min(smoothed), 6) if smoothed else None
|
min_loss = round(min(smoothed), 6) if smoothed else None
|
||||||
min_idx = smoothed.index(min(smoothed)) if smoothed else None
|
min_idx = smoothed.index(min(smoothed)) if smoothed else None
|
||||||
min_loss_step = (min_idx + 1) * log_interval if min_idx is not None else None
|
min_loss_step = (min_idx + 1) * log_interval if min_idx is not None else None
|
||||||
|
|
||||||
|
# Stability: std-dev of raw loss over last 25% of steps
|
||||||
|
if loss_history:
|
||||||
|
quarter = max(1, len(loss_history) // 4)
|
||||||
|
last_q = loss_history[-quarter:]
|
||||||
|
loss_std_last_quarter = round(float(np.std(last_q)), 6)
|
||||||
|
else:
|
||||||
|
loss_std_last_quarter = None
|
||||||
|
|
||||||
exp_record["results"] = {
|
exp_record["results"] = {
|
||||||
"status": "completed",
|
"status": "completed",
|
||||||
"final_loss": final_loss,
|
"final_loss": final_loss,
|
||||||
"min_loss": min_loss,
|
"min_loss": min_loss,
|
||||||
"min_loss_step": min_loss_step,
|
"min_loss_step": min_loss_step,
|
||||||
|
"loss_std_last_quarter": loss_std_last_quarter,
|
||||||
"loss_at_steps": _loss_at_steps(
|
"loss_at_steps": _loss_at_steps(
|
||||||
loss_history, log_interval, save_every, 0, steps
|
loss_history, log_interval, save_every, 0, steps
|
||||||
),
|
),
|
||||||
|
"loss_history": [round(v, 6) for v in loss_history],
|
||||||
|
"grad_norm_history": grad_norm_history,
|
||||||
|
"log_interval": log_interval,
|
||||||
"duration_seconds": round(duration, 1),
|
"duration_seconds": round(duration, 1),
|
||||||
}
|
}
|
||||||
exp_record["adapter_path"] = r["adapter_path"]
|
exp_record["adapter_path"] = r["adapter_path"]
|
||||||
|
|||||||
@@ -552,6 +552,9 @@ class SelvaLoraTrainer:
|
|||||||
pbar_train = comfy.utils.ProgressBar(remaining)
|
pbar_train = comfy.utils.ProgressBar(remaining)
|
||||||
loss_history = []
|
loss_history = []
|
||||||
running_loss = 0.0
|
running_loss = 0.0
|
||||||
|
grad_norm_history = []
|
||||||
|
running_grad_norm = 0.0
|
||||||
|
grad_norm_count = 0
|
||||||
|
|
||||||
meta = {
|
meta = {
|
||||||
"variant": variant,
|
"variant": variant,
|
||||||
@@ -608,18 +611,27 @@ class SelvaLoraTrainer:
|
|||||||
running_loss += loss.item() * grad_accum
|
running_loss += loss.item() * grad_accum
|
||||||
|
|
||||||
if step % grad_accum == 0:
|
if step % grad_accum == 0:
|
||||||
torch.nn.utils.clip_grad_norm_(lora_A_params + lora_B_params, max_norm=1.0)
|
grad_norm = torch.nn.utils.clip_grad_norm_(
|
||||||
|
lora_A_params + lora_B_params, max_norm=1.0
|
||||||
|
).item()
|
||||||
|
running_grad_norm += grad_norm
|
||||||
|
grad_norm_count += 1
|
||||||
optimizer.step()
|
optimizer.step()
|
||||||
scheduler.step()
|
scheduler.step()
|
||||||
optimizer.zero_grad()
|
optimizer.zero_grad()
|
||||||
|
|
||||||
if step % log_interval == 0:
|
if step % log_interval == 0:
|
||||||
avg = running_loss / log_interval
|
avg = running_loss / log_interval
|
||||||
|
avg_gnorm = running_grad_norm / max(1, grad_norm_count)
|
||||||
loss_history.append(avg)
|
loss_history.append(avg)
|
||||||
|
grad_norm_history.append(round(avg_gnorm, 6))
|
||||||
lr_now = scheduler.get_last_lr()[0]
|
lr_now = scheduler.get_last_lr()[0]
|
||||||
print(f"[LoRA Trainer] step {step:5d}/{steps} "
|
print(f"[LoRA Trainer] step {step:5d}/{steps} "
|
||||||
f"loss={avg:.4f} lr={lr_now:.2e} bs={batch_size}", flush=True)
|
f"loss={avg:.4f} grad_norm={avg_gnorm:.4f} "
|
||||||
|
f"lr={lr_now:.2e} bs={batch_size}", flush=True)
|
||||||
running_loss = 0.0
|
running_loss = 0.0
|
||||||
|
running_grad_norm = 0.0
|
||||||
|
grad_norm_count = 0
|
||||||
|
|
||||||
# Live preview: send updated loss curve to ComfyUI frontend
|
# Live preview: send updated loss curve to ComfyUI frontend
|
||||||
preview_img = _draw_loss_curve(loss_history, log_interval, start_step,
|
preview_img = _draw_loss_curve(loss_history, log_interval, start_step,
|
||||||
@@ -697,6 +709,7 @@ class SelvaLoraTrainer:
|
|||||||
"adapter_path": str(final_path),
|
"adapter_path": str(final_path),
|
||||||
"loss_curve": loss_curve,
|
"loss_curve": loss_curve,
|
||||||
"loss_history": loss_history,
|
"loss_history": loss_history,
|
||||||
|
"grad_norm_history": grad_norm_history,
|
||||||
"meta": meta,
|
"meta": meta,
|
||||||
"completed": True,
|
"completed": True,
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user