fix: bug audit — broken test imports, training data overlap, cleanup

- Fix test_utils.py importing build_annotation_json_path from main
  instead of core.annotations (all 59 tests pass now)
- Fix get_training_data double-counting clips at same start_time
  in both positive and soft sets — subtract positive from soft
- Add cancel_flag to train_classifier so training can be interrupted
  between videos (TrainWorker passes self as cancel_flag)
- Remove orphaned core/export.py (was for deleted server API)
- Remove stale Dockerfile and docker-compose.yml (referenced server)
- Clean up leftover server/__pycache__ and client/ build artifacts
- Add torch to requirements.txt (was only mentioned in comments)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-18 12:55:58 +02:00
parent 7834b1d05c
commit e1789d4e71
8 changed files with 17 additions and 168 deletions
+6 -1
View File
@@ -240,7 +240,8 @@ def train_classifier(video_infos: list[tuple[str, list[float], list[float]]],
model_path: str | None = None,
tolerance: float = 12.0,
neg_margin: float = 120.0,
embed_model: str | None = None) -> dict:
embed_model: str | None = None,
cancel_flag: object = None) -> dict:
"""Train a classifier from labeled videos.
Args:
@@ -248,6 +249,7 @@ def train_classifier(video_infos: list[tuple[str, list[float], list[float]]],
model_path: if given, save model to this path
tolerance/neg_margin: labeling parameters
embed_model: embedding model name (e.g. "HUBERT_BASE", "BEATS"), defaults to WAV2VEC2_BASE
cancel_flag: object with _cancel attribute; if set, training aborts early
Returns:
dict with 'classifier', 'embed_model', and metadata, or None on failure.
@@ -257,6 +259,9 @@ def train_classifier(video_infos: list[tuple[str, list[float], list[float]]],
all_X, all_y = [], []
for vi, (vpath, gt_intense, gt_soft) in enumerate(video_infos):
if cancel_flag and getattr(cancel_flag, '_cancel', False):
_log("audio_scan: training cancelled")
return None
_log(f"audio_scan: training [{vi+1}/{len(video_infos)}] {os.path.basename(vpath)}")
y, _ = librosa.load(vpath, sr=_SR, mono=True)
+5
View File
@@ -283,6 +283,11 @@ class ProcessedDB:
else:
soft_by_video.setdefault(fn, set()).add(st)
# Remove positive times from soft to avoid conflicting labels
for fn in pos_by_video:
if fn in soft_by_video:
soft_by_video[fn] -= pos_by_video[fn]
result = []
for fn in pos_by_video:
sp = source_by_filename.get(fn, "")
-127
View File
@@ -1,127 +0,0 @@
import os
import subprocess
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable
from .ffmpeg import build_ffmpeg_command, build_audio_extract_command
from .paths import _log
class ExportRunner:
"""Run ffmpeg export jobs in a background thread pool.
Callbacks:
on_clip_done(path: str)
on_all_done()
on_error(msg: str)
"""
def __init__(
self,
input_path: str,
jobs: list[tuple[float, str, str | None, float]],
short_side: int | None = None,
image_sequence: bool = False,
max_workers: int | None = None,
encoder: str = "libx264",
on_clip_done: Callable[[str], None] | None = None,
on_all_done: Callable[[], None] | None = None,
on_error: Callable[[str], None] | None = None,
):
self._input = input_path
self._jobs = jobs
self._short_side = short_side
self._image_sequence = image_sequence
self._max_workers = max_workers
self._encoder = encoder
self._on_clip_done = on_clip_done
self._on_all_done = on_all_done
self._on_error = on_error
self._cancel = False
self._procs: list[subprocess.Popen] = []
self._procs_lock = threading.Lock()
self._thread: threading.Thread | None = None
def start(self):
self._thread = threading.Thread(target=self._run, daemon=True)
self._thread.start()
def cancel(self):
self._cancel = True
with self._procs_lock:
for proc in self._procs:
try:
proc.kill()
except OSError:
pass
def is_running(self) -> bool:
return self._thread is not None and self._thread.is_alive()
def _run_one(self, start: float, output: str,
portrait_ratio: str | None, crop_center: float) -> str:
if self._cancel:
raise RuntimeError("cancelled")
if self._image_sequence:
os.makedirs(output, exist_ok=True)
cmd = build_ffmpeg_command(
self._input, start, output,
short_side=self._short_side,
portrait_ratio=portrait_ratio,
crop_center=crop_center,
image_sequence=self._image_sequence,
encoder=self._encoder,
)
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
with self._procs_lock:
self._procs.append(proc)
try:
_, stderr = proc.communicate(timeout=120)
except subprocess.TimeoutExpired:
proc.kill()
raise RuntimeError("ffmpeg timed out")
finally:
with self._procs_lock:
self._procs.remove(proc)
if self._cancel:
raise RuntimeError("cancelled")
if proc.returncode != 0:
msg = stderr.decode(errors='replace')[-500:] if stderr else "ffmpeg failed"
raise RuntimeError(msg)
if self._image_sequence:
audio_cmd = build_audio_extract_command(self._input, start, output)
audio_result = subprocess.run(audio_cmd, capture_output=True, text=True, timeout=60)
if audio_result.returncode != 0:
msg = (audio_result.stderr or "audio extraction failed")[-500:]
raise RuntimeError(msg)
return output
def _run(self):
cap = self._max_workers or (os.cpu_count() or 2)
workers = min(len(self._jobs), cap)
try:
with ThreadPoolExecutor(max_workers=workers) as pool:
futures = {
pool.submit(self._run_one, s, o, pr, cc): o
for s, o, pr, cc in self._jobs
}
for fut in as_completed(futures):
if self._cancel:
break
try:
path = fut.result()
if self._on_clip_done:
self._on_clip_done(path)
except Exception as e:
if "cancelled" not in str(e) and self._on_error:
self._on_error(str(e))
return
except Exception as e:
if self._on_error:
self._on_error(str(e))
return
if self._cancel:
return
if self._on_all_done:
self._on_all_done()