From ca87c41a2e9473045cf809e6033d9a6703bb8678 Mon Sep 17 00:00:00 2001 From: Ethanfel Date: Fri, 27 Mar 2026 21:13:42 +0100 Subject: [PATCH] feat: add per-step timing to feature extraction logs Each step now prints elapsed seconds on completion. Total time printed at the end to identify bottlenecks. Co-Authored-By: Claude Sonnet 4.6 --- scripts/extract_features.py | 48 +++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/scripts/extract_features.py b/scripts/extract_features.py index d82aff5..6098710 100755 --- a/scripts/extract_features.py +++ b/scripts/extract_features.py @@ -10,6 +10,7 @@ Usage: import argparse import os import sys +import time import numpy as np import torch @@ -20,7 +21,21 @@ if _PLUGIN_DIR not in sys.path: sys.path.insert(0, _PLUGIN_DIR) +def _step(n, total, label): + """Print step header and return start time.""" + print(f"[extract] Step {n}/{total} — {label}...", flush=True) + return time.perf_counter() + + +def _done(t0, extra=""): + elapsed = time.perf_counter() - t0 + suffix = f" {extra}" if extra else "" + print(f"[extract] done in {elapsed:.1f}s{suffix}", flush=True) + + def main(): + t_total = time.perf_counter() + parser = argparse.ArgumentParser(description="PrismAudio feature extraction") parser.add_argument("--video", required=True, help="Path to input video") parser.add_argument("--cot_text", required=True, help="Chain-of-thought description") @@ -46,23 +61,23 @@ def main(): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # ------------------------------------------------------------------ - print("[extract] Step 1/6 — importing dependencies...", flush=True) + t0 = _step(1, 6, "importing dependencies") from data_utils.v2a_utils.feature_utils_288 import FeaturesUtils import torchvision.transforms as T from decord import VideoReader, cpu - print("[extract] Step 1/6 — done", flush=True) + _done(t0) # ------------------------------------------------------------------ - print("[extract] Step 2/6 — loading models (T5, VideoPrism, Synchformer)...", flush=True) + t0 = _step(2, 6, "loading models (T5, VideoPrism, Synchformer)") feat_utils = FeaturesUtils( vae_config_path=args.vae_config, synchformer_ckpt=args.synchformer_ckpt, device=device, ) - print("[extract] Step 2/6 — done", flush=True) + _done(t0) # ------------------------------------------------------------------ - print("[extract] Step 3/6 — reading and preprocessing video...", flush=True) + t0 = _step(3, 6, "reading and preprocessing video") vr = VideoReader(args.video, ctx=cpu(0)) fps = vr.get_avg_fps() total_frames = len(vr) @@ -96,30 +111,26 @@ def main(): T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), ]) sync_input = torch.stack([sync_transform(f) for f in sync_frames]).unsqueeze(0).to(device) - print("[extract] Step 3/6 — done", flush=True) + _done(t0) # ------------------------------------------------------------------ - print("[extract] Step 4/6 — encoding text with T5-Gemma...", flush=True) + t0 = _step(4, 6, "encoding text with T5-Gemma") text_features = feat_utils.encode_t5_text([args.cot_text]) - print(f"[extract] text_features shape: {tuple(text_features.shape)}", flush=True) - print("[extract] Step 4/6 — done", flush=True) + _done(t0, f"shape={tuple(text_features.shape)}") # ------------------------------------------------------------------ - print("[extract] Step 5/6 — encoding video with VideoPrism...", flush=True) + t0 = _step(5, 6, "encoding video with VideoPrism") global_video_features, video_features, global_text_features = \ feat_utils.encode_video_and_text_with_videoprism(clip_input, [args.cot_text]) - print(f"[extract] global_video_features : {tuple(global_video_features.shape)}", flush=True) - print(f"[extract] video_features : {tuple(video_features.shape)}", flush=True) - print(f"[extract] global_text_features : {tuple(global_text_features.shape)}", flush=True) - print("[extract] Step 5/6 — done", flush=True) + _done(t0, f"video={tuple(video_features.shape)} global={tuple(global_video_features.shape)}") # ------------------------------------------------------------------ - print("[extract] Step 6/6 — encoding video with Synchformer...", flush=True) + t0 = _step(6, 6, "encoding video with Synchformer") sync_features = feat_utils.encode_video_with_sync(sync_input) - print(f"[extract] sync_features shape: {tuple(sync_features.shape)}", flush=True) - print("[extract] Step 6/6 — done", flush=True) + _done(t0, f"shape={tuple(sync_features.shape)}") # ------------------------------------------------------------------ + t0 = time.perf_counter() print(f"[extract] Saving features to {args.output} ...", flush=True) np.savez( args.output, @@ -131,7 +142,8 @@ def main(): caption_cot=args.cot_text, duration=duration, ) - print(f"[extract] Done — features saved to {args.output}", flush=True) + print(f"[extract] Saved in {time.perf_counter() - t0:.1f}s", flush=True) + print(f"[extract] Total time: {time.perf_counter() - t_total:.1f}s", flush=True) if __name__ == "__main__":