From 2d200395af1f186cc8a54b59ec4cbe92eb731909 Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Mon, 6 Apr 2026 13:06:39 +0200 Subject: [PATCH] feat: add grad norm logging and richer experiment summary output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit trainer: - Track gradient norm before clipping at each optimizer step - Log avg grad_norm per log_interval alongside loss in console output - Include grad_norm_history in _train_inner return dict scheduler: - Add system block to summary (GPU name, VRAM, torch/CUDA version) - Include full loss_history and grad_norm_history arrays in each experiment result (50-step resolution, not just save_every checkpoints) - Add loss_std_last_quarter stability metric (std dev of raw loss over last 25% of steps — high value indicates unstable training) - Add log_interval field so consumers know the x-axis resolution Co-Authored-By: Claude Sonnet 4.6 --- nodes/selva_lora_scheduler.py | 58 +++++++++++++++++++++++++++-------- nodes/selva_lora_trainer.py | 37 ++++++++++++++-------- 2 files changed, 70 insertions(+), 25 deletions(-) diff --git a/nodes/selva_lora_scheduler.py b/nodes/selva_lora_scheduler.py index ad5866c..7576148 100644 --- a/nodes/selva_lora_scheduler.py +++ b/nodes/selva_lora_scheduler.py @@ -44,6 +44,24 @@ from .selva_lora_trainer import ( ) +def _get_system_info() -> dict: + """Collect GPU / torch version info for the summary header.""" + info: dict = { + "torch_version": torch.__version__, + "cuda_version": torch.version.cuda or "N/A", + "gpu_name": None, + "gpu_vram_gb": None, + } + if torch.cuda.is_available(): + try: + info["gpu_name"] = torch.cuda.get_device_name(0) + props = torch.cuda.get_device_properties(0) + info["gpu_vram_gb"] = round(props.total_memory / 1e9, 1) + except Exception: + pass + return info + + # Defaults mirror SelvaLoraTrainer INPUT_TYPES defaults _PARAM_DEFAULTS = { "alpha": 0.0, @@ -286,6 +304,7 @@ class SelvaLoraScheduler: "sweep_file": str(exp_path), "started_at": datetime.now(timezone.utc).isoformat(), "completed_at": None, + "system": _get_system_info(), "data_dir": str(data_dir), "n_clips": n_clips, "experiments": [], @@ -373,25 +392,38 @@ class SelvaLoraScheduler: ts_mode, ln_sigma, curr_switch, dropout, plus_ratio, ) - duration = time.monotonic() - t_start - loss_history = r["loss_history"] - smoothed = _smooth_losses(loss_history) if loss_history else [] + duration = time.monotonic() - t_start + loss_history = r["loss_history"] + grad_norm_history = r.get("grad_norm_history", []) + smoothed = _smooth_losses(loss_history) if loss_history else [] - # Compute summary metrics - final_loss = round(smoothed[-1], 6) if smoothed else None - min_loss = round(min(smoothed), 6) if smoothed else None - min_idx = smoothed.index(min(smoothed)) if smoothed else None + # Scalar summary metrics + final_loss = round(smoothed[-1], 6) if smoothed else None + min_loss = round(min(smoothed), 6) if smoothed else None + min_idx = smoothed.index(min(smoothed)) if smoothed else None min_loss_step = (min_idx + 1) * log_interval if min_idx is not None else None + # Stability: std-dev of raw loss over last 25% of steps + if loss_history: + quarter = max(1, len(loss_history) // 4) + last_q = loss_history[-quarter:] + loss_std_last_quarter = round(float(np.std(last_q)), 6) + else: + loss_std_last_quarter = None + exp_record["results"] = { - "status": "completed", - "final_loss": final_loss, - "min_loss": min_loss, - "min_loss_step": min_loss_step, - "loss_at_steps": _loss_at_steps( + "status": "completed", + "final_loss": final_loss, + "min_loss": min_loss, + "min_loss_step": min_loss_step, + "loss_std_last_quarter": loss_std_last_quarter, + "loss_at_steps": _loss_at_steps( loss_history, log_interval, save_every, 0, steps ), - "duration_seconds": round(duration, 1), + "loss_history": [round(v, 6) for v in loss_history], + "grad_norm_history": grad_norm_history, + "log_interval": log_interval, + "duration_seconds": round(duration, 1), } exp_record["adapter_path"] = r["adapter_path"] diff --git a/nodes/selva_lora_trainer.py b/nodes/selva_lora_trainer.py index 555a806..7ad619b 100644 --- a/nodes/selva_lora_trainer.py +++ b/nodes/selva_lora_trainer.py @@ -550,8 +550,11 @@ class SelvaLoraTrainer: log_interval = 50 remaining = steps - start_step pbar_train = comfy.utils.ProgressBar(remaining) - loss_history = [] - running_loss = 0.0 + loss_history = [] + running_loss = 0.0 + grad_norm_history = [] + running_grad_norm = 0.0 + grad_norm_count = 0 meta = { "variant": variant, @@ -608,18 +611,27 @@ class SelvaLoraTrainer: running_loss += loss.item() * grad_accum if step % grad_accum == 0: - torch.nn.utils.clip_grad_norm_(lora_A_params + lora_B_params, max_norm=1.0) + grad_norm = torch.nn.utils.clip_grad_norm_( + lora_A_params + lora_B_params, max_norm=1.0 + ).item() + running_grad_norm += grad_norm + grad_norm_count += 1 optimizer.step() scheduler.step() optimizer.zero_grad() if step % log_interval == 0: - avg = running_loss / log_interval + avg = running_loss / log_interval + avg_gnorm = running_grad_norm / max(1, grad_norm_count) loss_history.append(avg) + grad_norm_history.append(round(avg_gnorm, 6)) lr_now = scheduler.get_last_lr()[0] print(f"[LoRA Trainer] step {step:5d}/{steps} " - f"loss={avg:.4f} lr={lr_now:.2e} bs={batch_size}", flush=True) - running_loss = 0.0 + f"loss={avg:.4f} grad_norm={avg_gnorm:.4f} " + f"lr={lr_now:.2e} bs={batch_size}", flush=True) + running_loss = 0.0 + running_grad_norm = 0.0 + grad_norm_count = 0 # Live preview: send updated loss curve to ComfyUI frontend preview_img = _draw_loss_curve(loss_history, log_interval, start_step, @@ -693,10 +705,11 @@ class SelvaLoraTrainer: loss_curve = _pil_to_tensor(smoothed_img) return { - "patched_model": patched, - "adapter_path": str(final_path), - "loss_curve": loss_curve, - "loss_history": loss_history, - "meta": meta, - "completed": True, + "patched_model": patched, + "adapter_path": str(final_path), + "loss_curve": loss_curve, + "loss_history": loss_history, + "grad_norm_history": grad_norm_history, + "meta": meta, + "completed": True, }