feat: add grad norm logging and richer experiment summary output

trainer:
- Track gradient norm before clipping at each optimizer step
- Log avg grad_norm per log_interval alongside loss in console output
- Include grad_norm_history in _train_inner return dict

scheduler:
- Add system block to summary (GPU name, VRAM, torch/CUDA version)
- Include full loss_history and grad_norm_history arrays in each
  experiment result (50-step resolution, not just save_every checkpoints)
- Add loss_std_last_quarter stability metric (std dev of the raw, unsmoothed
  loss over the last 25% of logged loss_history entries — a high value
  indicates unstable training)
- Add log_interval field so consumers know the x-axis resolution

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-06 13:06:39 +02:00
parent 3ec380a27e
commit 2d200395af
2 changed files with 70 additions and 25 deletions
+45 -13
View File
@@ -44,6 +44,24 @@ from .selva_lora_trainer import (
)
def _get_system_info() -> dict:
    """Collect GPU / torch version info for the summary header.

    Returns:
        A dict with the keys ``torch_version``, ``cuda_version`` (``"N/A"``
        when torch reports no CUDA version), ``gpu_name`` and
        ``gpu_vram_gb``. The two GPU fields stay ``None`` when CUDA is not
        available, or when querying device 0 raises.
    """
    info: dict = {
        "torch_version": torch.__version__,
        "cuda_version": torch.version.cuda or "N/A",
        "gpu_name": None,
        "gpu_vram_gb": None,
    }
    if not torch.cuda.is_available():
        return info

    try:
        info["gpu_name"] = torch.cuda.get_device_name(0)
        props = torch.cuda.get_device_properties(0)
        # Decimal gigabytes (1e9 bytes), rounded to one decimal place.
        info["gpu_vram_gb"] = round(props.total_memory / 1e9, 1)
    except Exception as exc:
        # Best-effort: system info is purely informational, so a failed
        # device query must never abort the caller. Report it instead of
        # swallowing it silently so missing fields can be diagnosed.
        import logging

        logging.getLogger(__name__).warning(
            "Could not query GPU info for the summary: %s", exc
        )
    return info
# Defaults mirror SelvaLoraTrainer INPUT_TYPES defaults
_PARAM_DEFAULTS = {
"alpha": 0.0,
@@ -286,6 +304,7 @@ class SelvaLoraScheduler:
"sweep_file": str(exp_path),
"started_at": datetime.now(timezone.utc).isoformat(),
"completed_at": None,
"system": _get_system_info(),
"data_dir": str(data_dir),
"n_clips": n_clips,
"experiments": [],
@@ -373,25 +392,38 @@ class SelvaLoraScheduler:
ts_mode, ln_sigma, curr_switch, dropout, plus_ratio,
)
duration = time.monotonic() - t_start
loss_history = r["loss_history"]
smoothed = _smooth_losses(loss_history) if loss_history else []
duration = time.monotonic() - t_start
loss_history = r["loss_history"]
grad_norm_history = r.get("grad_norm_history", [])
smoothed = _smooth_losses(loss_history) if loss_history else []
# Compute summary metrics
final_loss = round(smoothed[-1], 6) if smoothed else None
min_loss = round(min(smoothed), 6) if smoothed else None
min_idx = smoothed.index(min(smoothed)) if smoothed else None
# Scalar summary metrics
final_loss = round(smoothed[-1], 6) if smoothed else None
min_loss = round(min(smoothed), 6) if smoothed else None
min_idx = smoothed.index(min(smoothed)) if smoothed else None
min_loss_step = (min_idx + 1) * log_interval if min_idx is not None else None
# Stability: std-dev of raw loss over last 25% of steps
if loss_history:
quarter = max(1, len(loss_history) // 4)
last_q = loss_history[-quarter:]
loss_std_last_quarter = round(float(np.std(last_q)), 6)
else:
loss_std_last_quarter = None
exp_record["results"] = {
"status": "completed",
"final_loss": final_loss,
"min_loss": min_loss,
"min_loss_step": min_loss_step,
"loss_at_steps": _loss_at_steps(
"status": "completed",
"final_loss": final_loss,
"min_loss": min_loss,
"min_loss_step": min_loss_step,
"loss_std_last_quarter": loss_std_last_quarter,
"loss_at_steps": _loss_at_steps(
loss_history, log_interval, save_every, 0, steps
),
"duration_seconds": round(duration, 1),
"loss_history": [round(v, 6) for v in loss_history],
"grad_norm_history": grad_norm_history,
"log_interval": log_interval,
"duration_seconds": round(duration, 1),
}
exp_record["adapter_path"] = r["adapter_path"]