diff --git a/core/audio_scan.py b/core/audio_scan.py index bffd23d..e3a35b7 100644 --- a/core/audio_scan.py +++ b/core/audio_scan.py @@ -65,6 +65,7 @@ _EMBED_MODELS = { "AST": 768, "AST_ML": 3072, # 768 * 4 "EAT": 768, + "EAT_LARGE": 1024, } _DEFAULT_EMBED_MODEL = "WAV2VEC2_BASE" @@ -104,11 +105,13 @@ def _get_w2v_model(model_name: str | None = None): _ast_feature_extractor = ASTFeatureExtractor.from_pretrained( "MIT/ast-finetuned-audioset-10-10-0.4593" ) - elif load_name == "EAT": + elif load_name in ("EAT", "EAT_LARGE"): from transformers import AutoModel + eat_repo = ("worstchan/EAT-large_epoch20_finetune_AS2M" + if load_name == "EAT_LARGE" + else "worstchan/EAT-base_epoch30_finetune_AS2M") _w2v_model = AutoModel.from_pretrained( - "worstchan/EAT-base_epoch30_finetune_AS2M", - trust_remote_code=True, + eat_repo, trust_remote_code=True, ).to(_w2v_device) else: import torchaudio @@ -254,7 +257,7 @@ def _extract_w2v_windows(y: np.ndarray, sr: int = _SR, model, device = _get_w2v_model(model_name) is_beats = (model_name or _DEFAULT_EMBED_MODEL) == "BEATS" is_ast = (model_name or _DEFAULT_EMBED_MODEL) in ("AST", "AST_ML") - is_eat = (model_name or _DEFAULT_EMBED_MODEL) == "EAT" + is_eat = (model_name or _DEFAULT_EMBED_MODEL) in ("EAT", "EAT_LARGE") ml_cfg = _ml_config(model_name or _DEFAULT_EMBED_MODEL) # Auto-size batches based on available GPU memory batch_size = 16 @@ -383,7 +386,7 @@ def _extract_w2v_targeted(y: np.ndarray, sr: int, gt_intense: list[float], is_beats = (model_name or _DEFAULT_EMBED_MODEL) == "BEATS" is_ast = (model_name or _DEFAULT_EMBED_MODEL) in ("AST", "AST_ML") - is_eat = (model_name or _DEFAULT_EMBED_MODEL) == "EAT" + is_eat = (model_name or _DEFAULT_EMBED_MODEL) in ("EAT", "EAT_LARGE") ml_cfg = _ml_config(model_name or _DEFAULT_EMBED_MODEL) for batch_start in range(0, len(valid_times), batch_size): diff --git a/core/db.py b/core/db.py index 1e2590e..52a1c6f 100644 --- a/core/db.py +++ b/core/db.py @@ -141,6 +141,92 @@ class ProcessedDB: " ON hard_negatives(filename, profile)" ) self._con.commit() + self._migrate_vid_folders() + + def _migrate_vid_folders(self) -> None: + """Migrate old clip_NNN group dirs → vid_NNN per-video folders. + + Old layout: export_folder/clip_NNN/clip_NNN_sub.mp4 + New layout: export_folder/vid_NNN/clip_NNN_sub.mp4 + + Rewrites output_path in DB and moves files on disk. + """ + # Check if any rows still use the old clip_NNN parent dir layout + row = self._con.execute( + "SELECT id FROM processed WHERE output_path LIKE '%/clip_%/%' LIMIT 1" + ).fetchone() + if not row: + return + + _log("Migrating old clip group dirs → vid folders …") + rows = self._con.execute( + "SELECT id, filename, profile, output_path FROM processed" + " ORDER BY profile, filename, output_path" + ).fetchall() + + # Assign vid_NNN per (profile, export_folder, filename) + vid_map: dict[tuple, str] = {} + vid_counters: dict[tuple, int] = {} + + for rid, filename, profile, op in rows: + parent = os.path.dirname(op) + export_folder = os.path.dirname(parent) + key = (profile, export_folder, filename) + if key not in vid_map: + counter_key = (profile, export_folder) + n = vid_counters.get(counter_key, 1) + vid_map[key] = f"vid_{n:03d}" + vid_counters[counter_key] = n + 1 + + updates: list[tuple[str, int]] = [] + moves: list[tuple[str, str]] = [] + dirs_to_create: set[str] = set() + old_dirs: set[str] = set() + + for rid, filename, profile, op in rows: + parent = os.path.dirname(op) + parent_name = os.path.basename(parent) + # Skip rows already using vid_NNN layout + if parent_name.startswith("vid_"): + continue + export_folder = os.path.dirname(parent) + key = (profile, export_folder, filename) + vid_name = vid_map[key] + new_path = os.path.join(export_folder, vid_name, os.path.basename(op)) + updates.append((new_path, rid)) + dirs_to_create.add(os.path.join(export_folder, vid_name)) + old_dirs.add(parent) + if os.path.exists(op): + moves.append((op, new_path)) + + if not updates: + return + + # Create vid directories + for d in sorted(dirs_to_create): + os.makedirs(d, exist_ok=True) + + # Move files + import shutil + for old, new in moves: + if os.path.exists(old) and not os.path.exists(new): + shutil.move(old, new) + + # Update DB + self._con.executemany( + "UPDATE processed SET output_path = ? WHERE id = ?", updates + ) + self._con.commit() + + # Remove empty old group directories + for d in sorted(old_dirs, reverse=True): + try: + if os.path.isdir(d) and not os.listdir(d): + os.rmdir(d) + except OSError: + pass + + _log(f"Migrated {len(updates)} rows, moved {len(moves)} files to vid folders") def add(self, filename: str, start_time: float, output_path: str, label: str = "", category: str = "", @@ -306,8 +392,8 @@ class ProcessedDB: def get_max_counter(self, folder: str, name: str) -> int: """Return the highest counter N found in output_paths matching folder/name_NNN*. - Parses the group directory component (e.g. 'clip_035') from stored - output_path values. Returns 0 if no matches exist. + Parses the counter from filenames (e.g. 'clip_035_0.mp4' → 35). + *folder* is typically the vid folder. Returns 0 if no matches exist. """ if not self._enabled: return 0 @@ -318,24 +404,66 @@ class ProcessedDB: (prefix + "%",), ).fetchall() max_n = 0 + name_prefix = name + "_" for (op,) in rows: - # output_path: .../folder/name_NNN/name_NNN_sub.ext - parent = os.path.basename(os.path.dirname(op)) - # parent should be "name_NNN" - parts = parent.rsplit("_", 1) - if len(parts) == 2: - try: - max_n = max(max_n, int(parts[1])) - except ValueError: - pass + stem = os.path.splitext(os.path.basename(op))[0] + # stem: "clip_035_0" or "clip_036_a1_0" + if not stem.startswith(name_prefix): + continue + rest = stem[len(name_prefix):] # "035_0" or "036_a1_0" + counter_str = rest.split("_")[0] + try: + max_n = max(max_n, int(counter_str)) + except ValueError: + pass return max_n + def get_vid_folder(self, filename: str, profile: str, + export_folder: str) -> str: + """Return the vid_NNN folder name for a source video. + + Checks existing DB output_paths first; if the video already has a + vid_NNN folder, returns it. Otherwise assigns the next available + number, also checking disk for orphan vid folders. + """ + if not self._enabled: + return "vid_001" + row = self._con.execute( + "SELECT output_path FROM processed" + " WHERE filename = ? AND profile = ? LIMIT 1", + (filename, profile), + ).fetchone() + if row: + parent = os.path.basename(os.path.dirname(row[0])) + if parent.startswith("vid_"): + return parent + # Collect all existing vid_NNN names from DB + disk + existing: set[str] = set() + rows = self._con.execute( + "SELECT DISTINCT output_path FROM processed WHERE profile = ?", + (profile,), + ).fetchall() + for (op,) in rows: + p = os.path.basename(os.path.dirname(op)) + if p.startswith("vid_"): + existing.add(p) + if os.path.isdir(export_folder): + for d in os.listdir(export_folder): + if d.startswith("vid_") and os.path.isdir( + os.path.join(export_folder, d) + ): + existing.add(d) + n = 1 + while f"vid_{n:03d}" in existing: + n += 1 + return f"vid_{n:03d}" + def get_export_folders(self, profile: str = "default", include_scan_exports: bool = False) -> list[str]: """Return distinct export folder names found in output_paths for a profile. Export paths follow the structure: - .../export_folder/group_dir/clip.mp4 + .../export_folder/vid_NNN/clip.mp4 The export folder is 2 levels up from the clip file. Returns folder names sorted alphabetically (e.g. ["mp4_Intense", "mp4_Soft"]). """ diff --git a/core/paths.py b/core/paths.py index d4de4a2..030835b 100644 --- a/core/paths.py +++ b/core/paths.py @@ -25,15 +25,19 @@ def _log(*args) -> None: def build_export_path(folder: str, basename: str, counter: int, sub: int | None = None) -> str: - group = f"{basename}_{counter:03d}" - name = f"{group}_{sub}" if sub is not None else group - return os.path.join(folder, group, name + ".mp4") + """Build clip output path. *folder* should be the vid folder (e.g. .../mp4/vid_001).""" + name = f"{basename}_{counter:03d}" + if sub is not None: + name = f"{name}_{sub}" + return os.path.join(folder, name + ".mp4") def build_sequence_dir(folder: str, basename: str, counter: int, sub: int | None = None) -> str: - group = f"{basename}_{counter:03d}" - name = f"{group}_{sub}" if sub is not None else group - return os.path.join(folder, group, name) + """Build WebP sequence output dir. *folder* should be the vid folder.""" + name = f"{basename}_{counter:03d}" + if sub is not None: + name = f"{name}_{sub}" + return os.path.join(folder, name) def format_time(seconds: float) -> str: diff --git a/docs/plans/2026-04-19-scan-history-negatives-design.md b/docs/plans/2026-04-19-scan-history-negatives-design.md index 5b7410b..9ac679d 100644 --- a/docs/plans/2026-04-19-scan-history-negatives-design.md +++ b/docs/plans/2026-04-19-scan-history-negatives-design.md @@ -1,6 +1,6 @@ -# Scan History & Hard Negative Management Design +# Scan History & Hard Negative Management — Final Design -Date: 2026-04-19 +Date: 2026-04-19 (implemented on `feat/training-ui`) ## Goal @@ -8,83 +8,198 @@ Date: 2026-04-19 2. Make hard negatives manageable — viewable, removable, and optionally disabled per training run 3. Fix latent bug: `get_export_folders()` doesn't filter by `scan_export` -## 1. Scan Result History +--- -### Current behavior - -`save_scan_results()` **replaces** all results for `(filename, profile, model)` on every scan. No history is preserved. - -### Change - -Keep the last N scan results per `(filename, profile, model)` with timestamps. The most recent is the "active" result displayed in the panel; older versions are accessible for comparison. - -### Schema change - -Add column to `scan_results`: - -```sql -ALTER TABLE scan_results ADD COLUMN scan_timestamp TEXT NOT NULL DEFAULT ''; -``` - -All rows from the same scan share the same timestamp string (e.g. `"20260419_143022"`). - -### save_scan_results changes - -Instead of `DELETE ... WHERE filename=? AND profile=? AND model=?`, the new flow: - -1. Insert new rows with current timestamp -2. Count distinct timestamps for this `(filename, profile, model)` -3. If count > N (default 5), delete rows belonging to the oldest timestamps - -### UI changes - -Add a small version dropdown/selector in `ScanResultsPanel` per model tab — shows timestamps of available scan versions. Selecting a version loads that version's results into the tab. The most recent is selected by default. - -The tab label shows the active version's region count, e.g. `HUBERT_XLARGE (12) [v3]`. - -### Cache interaction - -Embedding cache is per `(file, model)` and doesn't change across scans. Only the classifier output changes. History stores the classified regions (start, end, score), not embeddings. - -## 2. Hard Negative Management - -### Current behavior - -- Hard negatives stored in `hard_negatives` table: `(filename, profile, start_time, source_path)` -- No model column — applied globally within a profile -- Removable one-by-one via N toggle in scan panel, but no bulk management -- Always used in training — no way to disable - -### Changes - -#### Schema - -Add `source_model TEXT NOT NULL DEFAULT ''` column to `hard_negatives`. Populated when marking negatives from scan results (we know which model tab is active). - -#### Training toggle - -New checkbox in `TrainDialog`: **"Use hard negatives"** (default checked). When unchecked, `get_training_data()` skips the `hard_negatives` query entirely. Non-destructive — negatives remain in DB. - -#### Management dialog - -New `HardNegativesDialog` accessible from Train dialog via "Manage..." button next to the checkbox. Shows: - -- Table: filename, start time, source model, date added (if we add created_at) -- Filter by source model (dropdown) -- Multi-select + Delete button -- "Clear All" button with confirmation -- Count summary at top - -### Training integration - -`get_training_data()` gets a new `use_hard_negatives: bool = True` parameter. When False, the hard negatives query (lines 365-374 of db.py) is skipped entirely. - -## 3. Ghost Folder Fix +## 1. Ghost Folder Fix ### Bug -`get_export_folders()` queries all `output_path` rows without filtering `scan_export`. Folders that only contain scan-exported clips appear in training dropdowns with 0 clips. +`get_export_folders()` queried all `output_path` rows without filtering `scan_export`. Folders that only contained scan-exported clips appeared in training dropdowns with 0 clips. -### Fix +### Implementation (`core/db.py`) -Add `include_scan_exports` parameter to `get_export_folders()`. When False (default), only query rows with `scan_export = 0`. Also filter out folders with 0 clips from `get_training_stats()` result dict. +**`get_export_folders(profile, include_scan_exports=False)`** — new parameter. When `False` (default), the SQL query adds `AND scan_export = 0` to exclude scan-only folders. The `get_training_stats()` method passes this through and also filters its return dict to remove folders with 0 clips: + +```python +return {k: v for k, v in stats.items() if v["clips"] > 0} +``` + +### Test + +`tests/test_db.py::test_export_folders_excludes_scan_exports` — verifies scan-only folders are excluded by default and included when `include_scan_exports=True`. + +--- + +## 2. Scan Result History + +### Schema + +Added column to `scan_results`: + +```sql +scan_timestamp TEXT NOT NULL DEFAULT '' +``` + +All rows from the same scan share one timestamp string with **microsecond precision** (`%Y%m%d_%H%M%S_%f`, e.g. `"20260419_143022_123456"`). Microsecond precision prevents version collisions on fast successive scans. + +Migration adds the column via `ALTER TABLE` for existing databases. Legacy rows keep `scan_timestamp = ''`. + +### DB methods (`core/db.py`) + +**`save_scan_results(filename, profile, model, regions, max_versions=5)`** +1. Inserts new rows with current microsecond-precision timestamp +2. Counts distinct timestamps for this `(filename, profile, model)` +3. Prunes oldest timestamps beyond `max_versions` + +No more DELETE-then-INSERT — all versions coexist in the table. + +**`get_scan_versions(filename, profile, model)`** +Returns `[{timestamp, count, max_score}, ...]` ordered newest first. Filters `scan_timestamp != ''` so legacy rows don't appear as named versions. + +**`get_scan_results(filename, profile, scan_timestamp=None)`** +- With `scan_timestamp`: returns rows matching that exact version +- Without (default): uses `INNER JOIN` subquery with `MAX(scan_timestamp)` per model to return only the latest version. Legacy rows (empty timestamp) sort before any real timestamp, so they're returned when no versioned scans exist. + +### UI (`main.py` — `ScanResultsPanel`) + +Each model tab wraps its `QTableWidget` in a container `QWidget` with a `QComboBox` for version selection: + +``` +container (QWidget) +├── cmb_version (QComboBox) — hidden when ≤ 1 version +└── table (QTableWidget) +``` + +**Helper methods** unwrap this container: +- `_current_table()` — returns `QTableWidget` from active tab (handles both raw table and container) +- `_tab_table(index)` — same by tab index + +**Version combo** is populated by `_populate_version_combos()` after every `load_for_file()` and `add_scan_results()` call. Labels use `datetime.strptime` parsing with try/except fallback for robustness: + +``` +2026-04-19 14:30 (12 regions, best: 0.95) +``` + +**Version switching** via `_on_version_changed(model, idx)`: +1. Reads `scan_timestamp` from combo's `userData` +2. Calls `get_scan_results(filename, profile, scan_timestamp=ts)` +3. Repopulates the table in-place +4. **Clears the undo stack** — stale undo entries from a different version would corrupt data +5. Emits `regions_edited` to refresh the timeline + +**Tab switch** connects `tab_changed` signal to `_on_scan_regions_edited` (not just `_update_scan_export_count`), so the timeline updates scan regions when switching model tabs. + +### Cache interaction + +Embedding cache is per `(file, model)` and doesn't change across scans. History stores classified regions (start, end, score), not embeddings. + +### Test + +`tests/test_db.py::test_scan_result_history` — saves 3 versions, verifies counts, ordering, and latest-by-default behavior. + +--- + +## 3. Hard Negative Management + +### Schema + +Added column to `hard_negatives`: + +```sql +source_model TEXT NOT NULL DEFAULT '' +``` + +Migration adds the column via `ALTER TABLE` for existing databases. + +### DB methods (`core/db.py`) + +**`add_hard_negatives(filename, profile, times, source_path="", source_model="")`** — now stores which embedding model produced the scan that led to the negative marking. + +**`get_hard_negatives(profile)`** — returns all rows as `[{id, filename, start_time, source_path, source_model}, ...]` for the management dialog. + +**`delete_hard_negatives_by_ids(ids)`** — bulk delete by row IDs. + +**`get_training_data(..., use_hard_negatives=True)`** — new parameter. When `False`, the hard negatives query is skipped entirely. Non-destructive — negatives remain in DB. + +### Source model tracking (`main.py`) + +`_on_scan_negatives()` now passes `source_model=self._scan_panel.current_model_name()` when marking negatives from scan results. `current_model_name()` extracts the model name from the active tab text (stripping the count suffix). + +### Training toggle (`main.py` — `TrainDialog`) + +Checkbox **"Use hard negatives in training"** (default checked) with "Manage..." button in an HBox layout. The toggle: +- Updates live training stats preview via debounced `_update_stats()` +- Passes `use_hard_negatives` through `_open_train_dialog()` to `get_training_data()` + +### Management dialog (`main.py` — `HardNegativesDialog`) + +Accessible from TrainDialog's "Manage..." button. Features: + +| Component | Details | +|-----------|---------| +| **Filter combo** | `(all)` + each distinct `source_model` found in data | +| **Summary label** | `N hard negatives` | +| **Table** | File, Time (`{:.1f}s`), Source Model, hidden ID column | +| **Delete Selected** | Multi-select aware, skips hidden (filtered) rows | +| **Clear All** | **Filter-aware**: if a model filter is active, only deletes negatives for that model with an appropriate confirmation message. If `(all)`, deletes everything. | +| **Close** | Closes dialog, triggers stats refresh in parent TrainDialog | + +`blockSignals(True)` guards prevent spurious filter callbacks during `_load()` repopulation. + +### Tests + +- `test_hard_negatives_source_model` — verifies source_model stored and retrieved +- `test_training_data_skips_hard_negatives` — verifies `use_hard_negatives=False` excludes them +- `test_delete_hard_negatives_by_ids` — verifies bulk deletion by ID + +--- + +## 4. Runtime Fixes (discovered during testing) + +### EAT/torchvision ABI mismatch + +**Problem:** `torchvision` installed from PyPI (CPU build) was incompatible with `torch` from CUDA wheel index, causing `operator torchvision::nms does not exist`. + +**Fix:** Added `torchvision` to the explicit torch install line in both setup scripts: +```bash +pip install torch torchaudio torchvision --index-url "$TORCH_INDEX" +``` + +Also added `--extra-index-url "$TORCH_INDEX"` to the `pip install -r requirements.txt` line to prevent transitive dependencies (timm, ultralytics) from pulling CPU-only torch packages. + +Applied to: `setup_env.sh` (both conda and venv paths), `setup-windows.ps1`. + +### EAT / transformers 5.x incompatibility + +**Problem:** transformers 5.x broke EAT's remote model code (`'EATModel' object has no attribute 'all_tied_weights_keys'`). + +**Fix:** Pinned `transformers>=4.30,<5.0` in `requirements.txt`. + +### NumPy non-writable array warning + +**Problem:** Cached HuBERT/EAT embeddings loaded from disk are read-only numpy arrays. `torch.from_numpy()` on a non-writable array triggers a deprecation warning. + +**Fix:** In `core/audio_scan.py`, changed EAT preprocessing to copy the array: +```python +wav = torch.from_numpy(np.array(chunk)).unsqueeze(0).float() +``` + +### Timeline not updating on tab switch + +**Problem:** Switching model tabs in the scan results panel didn't refresh the timeline's highlighted regions because `tab_changed` was only connected to `_update_scan_export_count`. + +**Fix:** Connected `tab_changed` to `_on_scan_regions_edited` instead, which handles both timeline refresh and export count update. + +--- + +## File Summary + +| File | Changes | +|------|---------| +| `core/db.py` | Schema migrations, `get_export_folders` filter, versioned `save_scan_results`, `get_scan_versions`, version-aware `get_scan_results`, `add_hard_negatives` with `source_model`, `get_hard_negatives`, `delete_hard_negatives_by_ids`, `get_training_data` with `use_hard_negatives` | +| `main.py` | `HardNegativesDialog` class, `TrainDialog` hard neg toggle + manage button, `ScanResultsPanel` container/combo architecture, version combo population and switching, `current_model_name()`, tab-switch timeline fix | +| `core/audio_scan.py` | `np.array(chunk)` copy for read-only numpy arrays in EAT preprocessing | +| `requirements.txt` | `transformers>=4.30,<5.0` pin | +| `setup_env.sh` | `torchvision` in torch install, `--extra-index-url` on requirements install | +| `setup-windows.ps1` | `torchvision` in torch install, `--extra-index-url` on requirements install, removed skip-if-exists guard | +| `tests/test_db.py` | 5 tests covering all DB-layer changes | diff --git a/docs/plans/2026-04-19-scan-history-negatives-implementation.md b/docs/plans/2026-04-19-scan-history-negatives-implementation.md index 0c33946..af457f0 100644 --- a/docs/plans/2026-04-19-scan-history-negatives-implementation.md +++ b/docs/plans/2026-04-19-scan-history-negatives-implementation.md @@ -1,714 +1,94 @@ -# Scan History & Hard Negative Management Implementation Plan +# Scan History & Hard Negative Management — Implementation Log -> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. +> All tasks complete. See the design doc for the final specification. -**Goal:** Add scan result versioning, hard negative management dialog with training toggle, and fix ghost folder bug. - -**Architecture:** DB schema changes in `core/db.py` (new columns, new queries). UI changes in `main.py` (version selector in ScanResultsPanel, management dialog, training toggle). No changes to `core/audio_scan.py`. - -**Tech Stack:** SQLite (existing), PyQt6 (existing) - -**Key design notes:** -- Scan history stores N versions per `(filename, profile, model)` using a `scan_timestamp` column. All rows from one scan share the same timestamp. -- Hard negatives gain a `source_model` column (informational) and training gains a `use_hard_negatives` toggle. -- `get_export_folders()` must respect `scan_export` filter to prevent ghost folders. +**Branch:** `feat/training-ui` --- -### Task 1: Fix ghost folder bug in get_export_folders +### Task 1: Fix ghost folder bug in get_export_folders -- DONE -**Files:** -- Modify: `core/db.py:294-313` (get_export_folders) -- Modify: `core/db.py:410-443` (get_training_stats — filter out 0-clip folders) -- Test: `tests/test_db.py` +**Commit:** `2614a76 fix: get_export_folders respects scan_export filter` -**Step 1: Write failing test** - -```python -def test_export_folders_excludes_scan_exports(): - """Scan-export-only folders should not appear when include_scan_exports=False.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - path = f.name - try: - db = ProcessedDB(path) - # Manual export - db.add("a.mp4", 10.0, "/out/mp4_Intense/g1/clip.mp4", profile="test") - # Scan export to different folder - db.add("a.mp4", 20.0, "/out/mp4_ScanOnly/g1/clip.mp4", profile="test", - scan_export=True) - folders = db.get_export_folders("test") - assert "mp4_Intense" in folders - assert "mp4_ScanOnly" not in folders, "scan-only folder should be excluded" - # With include_scan_exports=True, both should appear - folders_all = db.get_export_folders("test", include_scan_exports=True) - assert "mp4_ScanOnly" in folders_all - finally: - os.unlink(path) -``` - -**Step 2: Fix get_export_folders** - -Add `include_scan_exports` parameter: - -```python -def get_export_folders(self, profile: str = "default", - include_scan_exports: bool = False) -> list[str]: - if not self._enabled: - return [] - if include_scan_exports: - rows = self._con.execute( - "SELECT DISTINCT output_path FROM processed WHERE profile = ?", - (profile,), - ).fetchall() - else: - rows = self._con.execute( - "SELECT DISTINCT output_path FROM processed" - " WHERE profile = ? AND scan_export = 0", - (profile,), - ).fetchall() - folder_names: set[str] = set() - for (op,) in rows: - grandparent = os.path.basename(os.path.dirname(os.path.dirname(op))) - if grandparent: - folder_names.add(grandparent) - return sorted(folder_names) -``` - -**Step 3: Update get_training_stats to pass through** - -```python - folders = self.get_export_folders(profile, include_scan_exports=include_scan_exports) -``` - -And filter out empty folders at the end: - -```python - return {k: v for k, v in stats.items() if v["clips"] > 0} -``` - -**Step 4: Run tests, commit** - -```bash -pytest tests/ -v -git add core/db.py tests/test_db.py -git commit -m "fix: get_export_folders respects scan_export filter" -``` +- `core/db.py` — `get_export_folders(profile, include_scan_exports=False)`: filters `scan_export = 0` by default +- `core/db.py` — `get_training_stats()`: passes `include_scan_exports` through, filters out 0-clip folders +- `tests/test_db.py` — `test_export_folders_excludes_scan_exports` --- -### Task 2: Scan result history — schema and DB methods +### Task 2: Scan result history — schema and DB methods -- DONE -**Files:** -- Modify: `core/db.py:86-98` (scan_results schema — add scan_timestamp column) -- Modify: `core/db.py:100-113` (migration — add scan_timestamp to existing tables) -- Modify: `core/db.py:447-468` (save_scan_results — version management) -- Add: `core/db.py` (get_scan_versions, load_scan_version, delete_scan_version) -- Test: `tests/test_db.py` +**Commit:** `4fb2ae1 feat: scan result history — keep N versions per (file, model)` -**Step 1: Write failing test** - -```python -def test_scan_result_history(): - """save_scan_results should keep multiple versions.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - path = f.name - try: - db = ProcessedDB(path) - # Save three versions - db.save_scan_results("v.mp4", "test", "MODEL_A", - [(0, 8, 0.9)]) - db.save_scan_results("v.mp4", "test", "MODEL_A", - [(0, 8, 0.8), (10, 18, 0.7)]) - db.save_scan_results("v.mp4", "test", "MODEL_A", - [(5, 13, 0.95)]) - versions = db.get_scan_versions("v.mp4", "test", "MODEL_A") - assert len(versions) == 3 - # Most recent first - assert versions[0]["count"] == 1 # latest: 1 region - assert versions[1]["count"] == 2 # middle: 2 regions - assert versions[2]["count"] == 1 # oldest: 1 region - # get_scan_results returns latest version by default - results = db.get_scan_results("v.mp4", "test") - assert len(results.get("MODEL_A", [])) == 1 - finally: - os.unlink(path) -``` - -**Step 2: Add scan_timestamp column** - -In the CREATE TABLE (line 87-98), add: - -```sql - scan_timestamp TEXT NOT NULL DEFAULT '' -``` - -In the migration block (lines 100-113), add: - -```python - ("scan_timestamp", "TEXT NOT NULL DEFAULT ''"), -``` - -**Step 3: Modify save_scan_results** - -Replace the current DELETE+INSERT with versioned insert + cleanup: - -```python -def save_scan_results(self, filename: str, profile: str, model: str, - regions: list[tuple[float, float, float]], - max_versions: int = 5) -> None: - if not self._enabled: - return - from datetime import datetime - ts = datetime.now().strftime("%Y%m%d_%H%M%S") - with self._lock: - self._con.executemany( - "INSERT INTO scan_results" - " (filename, profile, model, start_time, end_time, score," - " orig_start_time, orig_end_time, scan_timestamp)" - " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", - [(filename, profile, model, s, e, sc, s, e, ts) - for s, e, sc in regions], - ) - # Prune old versions beyond max_versions - versions = self._con.execute( - "SELECT DISTINCT scan_timestamp FROM scan_results" - " WHERE filename = ? AND profile = ? AND model = ?" - " ORDER BY scan_timestamp DESC", - (filename, profile, model), - ).fetchall() - if len(versions) > max_versions: - old_ts = [v[0] for v in versions[max_versions:]] - self._con.execute( - "DELETE FROM scan_results" - " WHERE filename = ? AND profile = ? AND model = ?" - f" AND scan_timestamp IN ({','.join('?' * len(old_ts))})", - (filename, profile, model, *old_ts), - ) - self._con.commit() -``` - -**Step 4: Add get_scan_versions** - -```python -def get_scan_versions(self, filename: str, profile: str, model: str - ) -> list[dict]: - """Return list of scan versions for (filename, profile, model). - - Returns [{timestamp, count, max_score}, ...] ordered newest first. - """ - if not self._enabled: - return [] - rows = self._con.execute( - "SELECT scan_timestamp, COUNT(*), MAX(score)" - " FROM scan_results" - " WHERE filename = ? AND profile = ? AND model = ?" - " AND scan_timestamp != ''" - " GROUP BY scan_timestamp" - " ORDER BY scan_timestamp DESC", - (filename, profile, model), - ).fetchall() - return [{"timestamp": ts, "count": cnt, "max_score": sc} - for ts, cnt, sc in rows] -``` - -**Step 5: Modify get_scan_results to support version selection** - -Add optional `scan_timestamp` parameter. When None (default), returns latest version: - -```python -def get_scan_results(self, filename: str, profile: str, - scan_timestamp: str | None = None - ) -> dict[str, list[tuple]]: - if not self._enabled: - return {} - if scan_timestamp: - rows = self._con.execute( - "SELECT id, model, start_time, end_time, score, disabled," - " orig_start_time, orig_end_time" - " FROM scan_results" - " WHERE filename = ? AND profile = ? AND scan_timestamp = ?" - " ORDER BY model, start_time", - (filename, profile, scan_timestamp), - ).fetchall() - else: - # For each model, get rows from the latest timestamp only - rows = self._con.execute( - "SELECT r.id, r.model, r.start_time, r.end_time, r.score," - " r.disabled, r.orig_start_time, r.orig_end_time" - " FROM scan_results r" - " INNER JOIN (" - " SELECT model, MAX(scan_timestamp) AS latest" - " FROM scan_results" - " WHERE filename = ? AND profile = ?" - " GROUP BY model" - " ) m ON r.model = m.model AND r.scan_timestamp = m.latest" - " WHERE r.filename = ? AND r.profile = ?" - " ORDER BY r.model, r.start_time", - (filename, profile, filename, profile), - ).fetchall() - result: dict[str, list] = {} - for row_id, model, s, e, sc, dis, os_, oe in rows: - result.setdefault(model, []).append( - (row_id, s, e, sc, bool(dis), - os_ if os_ is not None else s, - oe if oe is not None else e)) - return result -``` - -**Important:** Legacy rows (before this change) have `scan_timestamp = ''`. The `MAX(scan_timestamp)` query handles this correctly — empty string sorts before any real timestamp, so legacy rows are returned when they're the only version. The `get_scan_versions` query filters `scan_timestamp != ''` so legacy rows don't appear as named versions. - -**Step 6: Run tests, commit** - -```bash -pytest tests/ -v -git add core/db.py tests/test_db.py -git commit -m "feat: scan result history — keep N versions per (file, model)" -``` +- `core/db.py` — added `scan_timestamp TEXT NOT NULL DEFAULT ''` column with migration +- `core/db.py` — `save_scan_results()`: versioned insert with microsecond-precision timestamp (`%Y%m%d_%H%M%S_%f`), auto-prunes beyond `max_versions=5` +- `core/db.py` — `get_scan_versions()`: returns `[{timestamp, count, max_score}, ...]` newest first +- `core/db.py` — `get_scan_results(scan_timestamp=None)`: `INNER JOIN` subquery with `MAX(scan_timestamp)` for latest-by-default +- `tests/test_db.py` — `test_scan_result_history` --- -### Task 3: Scan history UI — version selector in ScanResultsPanel +### Task 3: Scan history UI — version selector in ScanResultsPanel -- DONE -**Files:** -- Modify: `main.py` (ScanResultsPanel — add version combo per tab) -- Modify: `main.py` (ScanResultsPanel.load_for_file — populate versions) +**Commit:** `8ed9fbf feat: scan version selector in results panel` -**Step 1: Add version combo to tab UI** - -In `ScanResultsPanel._add_tab()`, add a small QComboBox above the table. When no history exists, hide it. When versions exist, populate with timestamps and connect to a slot that reloads the tab with that version. - -```python -# In _add_tab, create a container widget with version combo + table -container = QWidget() -layout = QVBoxLayout(container) -layout.setContentsMargins(0, 0, 0, 0) - -cmb_version = QComboBox() -cmb_version.setMaximumWidth(200) -cmb_version.setToolTip("Scan version history") -cmb_version.hide() # Hidden when only 1 version -layout.addWidget(cmb_version) -layout.addWidget(table) - -self._tabs.addTab(container, label) -``` - -Store the combo and table as properties on the container widget for later access. - -**Step 2: Populate versions in load_for_file** - -After creating each model tab, query `get_scan_versions()`. If > 1 version, show the combo with entries like `"2026-04-19 14:30 (12 regions, best: 0.95)"`. Connect `currentIndexChanged` to reload that version's results. - -**Step 3: Version switching slot** - -When user selects a different version from the combo: -1. Call `db.get_scan_results(filename, profile, scan_timestamp=selected_ts)` -2. Repopulate the table with that version's rows -3. Update timeline regions - -**Step 4: Test manually, commit** - -```bash -git add main.py -git commit -m "feat: scan version selector in results panel" -``` +- `main.py` — `_add_tab()`: wraps table in container `QWidget` with version `QComboBox` (hidden when ≤ 1 version) +- `main.py` — `_current_table()` / `_tab_table(idx)`: unwrap container to get `QTableWidget` +- `main.py` — `_populate_version_combos()`: queries `get_scan_versions()`, formats labels with `datetime.strptime` + try/except fallback +- `main.py` — `_on_version_changed()`: reloads table from specific version, clears undo stack, emits `regions_edited` +- `main.py` — `current_model_name()`: extracts model name from tab text --- -### Task 4: Hard negatives — schema and training toggle +### Task 4: Hard negatives — schema and training toggle -- DONE -**Files:** -- Modify: `core/db.py:118-130` (hard_negatives schema — add source_model column) -- Modify: `core/db.py:548-560` (add_hard_negatives — accept source_model) -- Modify: `core/db.py:365-374` (get_training_data — use_hard_negatives parameter) -- Modify: `main.py` (TrainDialog — add "Use hard negatives" checkbox) -- Modify: `main.py` (_open_train_dialog — pass use_hard_negatives to get_training_data) -- Test: `tests/test_db.py` +**Commit:** `edc5784 feat: hard negative source_model tracking, training toggle` -**Step 1: Write failing test** - -```python -def test_hard_negatives_source_model(): - """Hard negatives should store source_model.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - path = f.name - try: - db = ProcessedDB(path) - db.add_hard_negatives("a.mp4", "test", [10.0, 20.0], - source_path="/a.mp4", source_model="HUBERT_XLARGE") - rows = db.get_hard_negatives("test") - assert len(rows) == 2 - assert all(r["source_model"] == "HUBERT_XLARGE" for r in rows) - finally: - os.unlink(path) - -def test_training_data_skips_hard_negatives(): - """get_training_data with use_hard_negatives=False should skip them.""" - with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f: - path = f.name - try: - db = ProcessedDB(path) - db.add("a.mp4", 10.0, "/out/folder/g/clip.mp4", profile="test", - source_path="/videos/a.mp4") - db.add_hard_negatives("a.mp4", "test", [500.0], source_path="/videos/a.mp4") - # With hard negatives - data_with = db.get_training_data("test", "folder", use_hard_negatives=True) - # Without hard negatives - data_without = db.get_training_data("test", "folder", use_hard_negatives=False) - # Both should find the video, but negative counts differ - assert len(data_with) >= 1 - neg_with = sum(len(vi[3]) for vi in data_with) - neg_without = sum(len(vi[3]) for vi in data_without) - assert neg_with > neg_without or neg_with == neg_without # depends on margin - finally: - os.unlink(path) -``` - -**Step 2: Add source_model column to hard_negatives** - -In CREATE TABLE (line 119-125), add: - -```sql - source_model TEXT NOT NULL DEFAULT '' -``` - -In migration section, add after the hard_negatives table creation: - -```python -hn_cols = { - row[1] - for row in self._con.execute("PRAGMA table_info(hard_negatives)").fetchall() -} -if "source_model" not in hn_cols: - self._con.execute( - "ALTER TABLE hard_negatives ADD COLUMN source_model TEXT NOT NULL DEFAULT ''" - ) -``` - -**Step 3: Update add_hard_negatives to accept source_model** - -```python -def add_hard_negatives(self, filename: str, profile: str, - times: list[float], source_path: str = "", - source_model: str = "") -> None: - if not self._enabled or not times: - return - with self._lock: - for t in times: - self._con.execute( - "INSERT INTO hard_negatives" - " (filename, profile, start_time, source_path, source_model)" - " VALUES (?, ?, ?, ?, ?)", - (filename, profile, t, source_path, source_model), - ) - self._con.commit() -``` - -**Step 4: Add get_hard_negatives (full rows for management dialog)** - -```python -def get_hard_negatives(self, profile: str) -> list[dict]: - """Return all hard negatives for a profile with full details.""" - if not self._enabled: - return [] - rows = self._con.execute( - "SELECT id, filename, start_time, source_path, source_model" - " FROM hard_negatives WHERE profile = ?" - " ORDER BY filename, start_time", - (profile,), - ).fetchall() - return [{"id": r[0], "filename": r[1], "start_time": r[2], - "source_path": r[3], "source_model": r[4]} for r in rows] -``` - -**Step 5: Add delete_hard_negatives_by_ids** - -```python -def delete_hard_negatives_by_ids(self, ids: list[int]) -> None: - """Delete hard negatives by row IDs.""" - if not self._enabled or not ids: - return - with self._lock: - self._con.execute( - f"DELETE FROM hard_negatives WHERE id IN ({','.join('?' * len(ids))})", - ids, - ) - self._con.commit() -``` - -**Step 6: Add use_hard_negatives parameter to get_training_data** - -In `get_training_data()` (line 315), add parameter: - -```python -def get_training_data(self, profile: str, positive_folder: str, - negative_folder: str = "", - fallback_video_dir: str = "", - include_scan_exports: bool = False, - use_hard_negatives: bool = True, - ) -> list[tuple[str, list[float], list[float], list[float]]]: -``` - -Then wrap the hard negatives query (lines 365-374) in a conditional: - -```python - if use_hard_negatives: - hard_rows = self._con.execute( - "SELECT filename, start_time, source_path FROM hard_negatives" - " WHERE profile = ?", - (profile,), - ).fetchall() - for fn, st, sp in hard_rows: - neg_by_video.setdefault(fn, set()).add(st) - if sp: - source_by_filename.setdefault(fn, sp) -``` - -**Step 7: Pass source_model when marking negatives from scan panel** - -In `main.py`, `_on_scan_negatives()` needs to pass the current scan model. The scan panel knows which tab is active: - -```python -def _on_scan_negatives(self, times: list) -> None: - if not self._file_path: - return - filename = os.path.basename(self._file_path) - # Get current model tab name for source_model - source_model = self._scan_panel.current_model_name() - self._db.add_hard_negatives(filename, self._profile, times, - source_path=self._file_path, - source_model=source_model) -``` - -Add `current_model_name()` to ScanResultsPanel: - -```python -def current_model_name(self) -> str: - """Return the model name of the currently active tab.""" - idx = self._tabs.currentIndex() - if idx >= 0: - return self._tabs.tabText(idx).split(" (")[0] # strip count suffix - return "" -``` - -**Step 8: Add training toggle to TrainDialog** - -After the existing `_chk_scan_exports` checkbox: - -```python -self._chk_hard_negatives = QCheckBox("Use hard negatives in training") -self._chk_hard_negatives.setChecked(True) -self._chk_hard_negatives.setToolTip( - "When unchecked, manually marked hard negatives are excluded from training.\n" - "Useful when training a new model type where old negatives may not apply.") -self._chk_hard_negatives.stateChanged.connect(lambda: self._debounce.start()) -form.addRow("", self._chk_hard_negatives) -``` - -Add property: - -```python -@property -def use_hard_negatives(self) -> bool: - return self._chk_hard_negatives.isChecked() -``` - -**Step 9: Wire toggle through _open_train_dialog** - -In `_open_train_dialog()`, pass the flag: - -```python - video_infos = self._db.get_training_data( - self._profile, pos_folder, negative_folder=neg_folder, - fallback_video_dir=video_dir, - include_scan_exports=inc_scan, - use_hard_negatives=dlg.use_hard_negatives, - ) -``` - -Also update `_update_stats()` in TrainDialog to pass it through for accurate counts: - -```python - use_neg = self._chk_hard_negatives.isChecked() if hasattr(self, '_chk_hard_negatives') else True - video_infos = self._db.get_training_data( - self._profile, folder, negative_folder=neg_folder, - fallback_video_dir=self._txt_video_dir.text(), - include_scan_exports=inc_scan, - use_hard_negatives=use_neg, - ) -``` - -**Step 10: Run tests, commit** - -```bash -pytest tests/ -v -git add core/db.py main.py tests/test_db.py -git commit -m "feat: hard negative source_model tracking, training toggle" -``` +- `core/db.py` — added `source_model TEXT NOT NULL DEFAULT ''` column to `hard_negatives` with migration +- `core/db.py` — `add_hard_negatives(source_model="")`: stores originating model +- `core/db.py` — `get_hard_negatives(profile)`: returns full rows as list of dicts +- `core/db.py` — `delete_hard_negatives_by_ids(ids)`: bulk delete by row IDs +- `core/db.py` — `get_training_data(use_hard_negatives=True)`: conditionally skips hard negatives query +- `main.py` — `TrainDialog`: "Use hard negatives" checkbox + "Manage..." button in HBox layout +- `main.py` — `_on_scan_negatives()`: passes `source_model=self._scan_panel.current_model_name()` +- `tests/test_db.py` — `test_hard_negatives_source_model`, `test_training_data_skips_hard_negatives`, `test_delete_hard_negatives_by_ids` --- -### Task 5: Hard negatives management dialog +### Task 5: Hard negatives management dialog -- DONE -**Files:** -- Modify: `main.py` (add HardNegativesDialog class) -- Modify: `main.py` (TrainDialog — add "Manage..." button) +**Commit:** `e6db83f feat: hard negatives management dialog with filter and bulk delete` -**Step 1: Create HardNegativesDialog** - -Place before TrainDialog class: - -```python -class HardNegativesDialog(QDialog): - """View and manage hard negative training examples.""" - - def __init__(self, db: ProcessedDB, profile: str, parent=None): - super().__init__(parent) - self.setWindowTitle("Hard Negatives") - self.setMinimumSize(600, 400) - self._db = db - self._profile = profile - - layout = QVBoxLayout(self) - - # Filter row - filter_row = QHBoxLayout() - filter_row.addWidget(QLabel("Filter model:")) - self._cmb_filter = QComboBox() - self._cmb_filter.addItem("(all)") - self._cmb_filter.currentIndexChanged.connect(self._apply_filter) - filter_row.addWidget(self._cmb_filter, 1) - layout.addLayout(filter_row) - - # Summary - self._lbl_summary = QLabel() - layout.addWidget(self._lbl_summary) - - # Table - self._table = QTableWidget(0, 4) - self._table.setHorizontalHeaderLabels( - ["File", "Time", "Source Model", "ID"]) - self._table.horizontalHeader().setSectionResizeMode( - 0, QHeaderView.ResizeMode.Stretch) - self._table.setEditTriggers(QTableWidget.EditTrigger.NoEditTriggers) - self._table.setSelectionBehavior(QTableWidget.SelectionBehavior.SelectRows) - self._table.setColumnHidden(3, True) # hide ID column - layout.addWidget(self._table) - - # Buttons - btn_row = QHBoxLayout() - btn_delete = QPushButton("Delete Selected") - btn_delete.clicked.connect(self._delete_selected) - btn_row.addWidget(btn_delete) - btn_clear = QPushButton("Clear All") - btn_clear.clicked.connect(self._clear_all) - btn_row.addWidget(btn_clear) - btn_row.addStretch() - btn_close = QPushButton("Close") - btn_close.clicked.connect(self.close) - btn_row.addWidget(btn_close) - layout.addLayout(btn_row) - - self._load() - - def _load(self): - rows = self._db.get_hard_negatives(self._profile) - models = sorted(set(r["source_model"] for r in rows if r["source_model"])) - self._cmb_filter.blockSignals(True) - self._cmb_filter.clear() - self._cmb_filter.addItem("(all)") - for m in models: - self._cmb_filter.addItem(m) - self._cmb_filter.blockSignals(False) - - self._table.setRowCount(len(rows)) - for i, r in enumerate(rows): - self._table.setItem(i, 0, QTableWidgetItem(r["filename"])) - self._table.setItem(i, 1, QTableWidgetItem(f'{r["start_time"]:.1f}s')) - self._table.setItem(i, 2, QTableWidgetItem(r["source_model"])) - item = QTableWidgetItem(str(r["id"])) - self._table.setItem(i, 3, item) - self._lbl_summary.setText(f"{len(rows)} hard negatives") - - def _apply_filter(self): - model = self._cmb_filter.currentText() - for row in range(self._table.rowCount()): - if model == "(all)": - self._table.setRowHidden(row, False) - else: - src = self._table.item(row, 2).text() - self._table.setRowHidden(row, src != model) - - def _delete_selected(self): - ids = [] - for row in sorted(set(i.row() for i in self._table.selectedItems()), reverse=True): - if not self._table.isRowHidden(row): - ids.append(int(self._table.item(row, 3).text())) - if ids: - self._db.delete_hard_negatives_by_ids(ids) - self._load() - - def _clear_all(self): - reply = QMessageBox.question( - self, "Clear All", - f"Delete all hard negatives for profile '{self._profile}'?", - QMessageBox.StandardButton.Yes | QMessageBox.StandardButton.No, - ) - if reply == QMessageBox.StandardButton.Yes: - all_rows = self._db.get_hard_negatives(self._profile) - self._db.delete_hard_negatives_by_ids([r["id"] for r in all_rows]) - self._load() -``` - -**Step 2: Add "Manage..." button to TrainDialog** - -After the hard negatives checkbox, add a button: - -```python -neg_row = QHBoxLayout() -neg_row.addWidget(self._chk_hard_negatives) -btn_manage_neg = QPushButton("Manage…") -btn_manage_neg.setFixedWidth(80) -btn_manage_neg.clicked.connect(self._manage_negatives) -neg_row.addWidget(btn_manage_neg) -form.addRow("", neg_row) # replaces the standalone checkbox addRow -``` - -Add handler: - -```python -def _manage_negatives(self): - dlg = HardNegativesDialog(self._db, self._profile, parent=self) - dlg.exec() - self._debounce.start() # refresh stats after potential deletions -``` - -**Step 3: Test manually, commit** - -```bash -pytest tests/ -v -git add main.py -git commit -m "feat: hard negatives management dialog with filter and bulk delete" -``` +- `main.py` — `HardNegativesDialog`: table with File/Time/Source Model/hidden ID columns, model filter combo, delete selected, filter-aware clear all, close button +- Filter-aware "Clear All": respects active model filter, shows appropriate confirmation message --- -### Task 6: Final integration test and push +### Task 6: Code review fixes -- DONE -**Step 1: Manual test checklist** +**Commit:** `5d45b8d fix: timestamp collision, undo stack invalidation, label parsing, filter-aware clear` -- [ ] Open Train dialog — verify no ghost folders appear -- [ ] Train with "Use hard negatives" unchecked — verify training works -- [ ] Train with "Use hard negatives" checked — verify negatives are used -- [ ] Open Manage dialog — verify negatives listed with source model -- [ ] Delete selected negatives — verify they're removed -- [ ] Scan a video — verify results saved with timestamp -- [ ] Rescan same video — verify version history appears -- [ ] Switch version in scan panel — verify correct results display -- [ ] Mark negative from scan results — verify source_model stored +Four issues found during code review: +1. **Timestamp collision** — second-precision timestamps could merge versions on sub-second calls. Fixed with microsecond precision `%f` +2. **Undo stack invalidation** — switching scan versions left stale undo entries. Fixed by clearing undo stack in `_on_version_changed()` +3. **Timestamp label fragile parsing** — hard-coded string slicing. Fixed with `datetime.strptime` + try/except fallback +4. **Clear All ignoring filter** — deleted all negatives regardless of model filter. Fixed to respect active filter -**Step 2: Push** +--- -```bash -git push -``` +### Runtime fixes (discovered during manual testing) + +| Commit | Fix | +|--------|-----| +| `a3c657c` | Install `torchvision` from CUDA wheel index (was pulling CPU build from PyPI) | +| `3c3b1d7` | Remove "skip if torch exists" guard in Windows setup so re-runs fix broken envs | +| `fd043f4` | Pin `transformers>=4.30,<5.0` — EAT remote model code incompatible with transformers 5.x | +| `7d6fee9` | Copy read-only numpy array before `torch.from_numpy()` in EAT preprocessing | +| `bd345ab` | Connect `tab_changed` to `_on_scan_regions_edited` so timeline refreshes on tab switch | +| `d8b3972` | Add `--extra-index-url` to `pip install -r requirements.txt` in both setup scripts | + +--- + +### Test results + +All 68 tests pass (5 new DB tests + 63 existing). diff --git a/main.py b/main.py index 0225744..51819fb 100755 --- a/main.py +++ b/main.py @@ -1756,16 +1756,18 @@ class TimelineWidget(QWidget): def mousePressEvent(self, event): x = event.position().x() - # Check for scan region edge drag - hit = self._hit_scan_edge(x) - if hit is not None: - idx, edge = hit - r = self._scan_regions[idx] - self._drag_idx = idx - self._drag_edge = edge - self._drag_start_val = r[0] - self._drag_end_val = r[1] - return + # Check for scan region edge drag — require Shift to avoid accidental resizes + mods = event.modifiers() + if mods & Qt.KeyboardModifier.ShiftModifier: + hit = self._hit_scan_edge(x) + if hit is not None: + idx, edge = hit + r = self._scan_regions[idx] + self._drag_idx = idx + self._drag_edge = edge + self._drag_start_val = r[0] + self._drag_end_val = r[1] + return self._seek(x) def mouseDoubleClickEvent(self, event): @@ -1801,9 +1803,9 @@ class TimelineWidget(QWidget): self.update() return - # Hover cursor: resize arrow near edges, normal otherwise - hit = self._hit_scan_edge(x) - if hit is not None: + # Hover cursor: resize arrow near edges (only with Shift held) + mods = event.modifiers() + if (mods & Qt.KeyboardModifier.ShiftModifier) and self._hit_scan_edge(x): self.setCursor(Qt.CursorShape.SizeHorCursor) else: self.unsetCursor() @@ -3224,6 +3226,67 @@ class MainWindow(QMainWindow): self._playlist._select(0) _log(f"Resumed session: {len(valid)} file(s)") + self._show_changelog() + + # ── Changelog ──────────────────────────────────────────── + + APP_VERSION = "1.0" + CHANGELOG: list[tuple[str, list[str]]] = [ + ("1.0", [ + "New export layout — clips are now stored in per-video " + "vid_NNN/ folders instead of per-clip " + "clip_NNN/ group dirs. " + "Each source video gets its own folder with flat clip files inside " + "(e.g. mp4/vid_001/clip_001_0.mp4). " + "Old databases are migrated automatically on startup: " + "DB paths are rewritten and files are moved to the new layout.", + "Counter is now per-video — clip numbering restarts in each " + "vid folder, and the DB is cross-checked to prevent overwrites " + "even if the export folder is temporarily empty.", + "Audio detection models — three new embedding models for " + "audio scanning: AST (Audio Spectrogram Transformer), " + "EAT (Efficient Audio Transformer), and multi-layer " + "HuBERT/Wav2Vec2 extraction. Classifier probabilities are now " + "calibrated with isotonic regression for more meaningful scores.", + "Scan result history — scan results are versioned per " + "(file, model); switch between past scan versions from a dropdown.", + "Hard negatives — management dialog to review, filter, and " + "bulk-delete hard negatives; source model is tracked per negative.", + "Scan workflow — disable/resize scan regions, undo edits, " + "interruptible Scan All with resume, audio prefetch, review mode.", + "Dataset statistics — dialog showing per-video clip breakdown " + "and class balance.", + "Waveform overlay on timeline.", + ]), + ] + + def _show_changelog(self) -> None: + last = self._settings.value("last_seen_version", "") + if last == self.APP_VERSION: + return + # Collect entries newer than last seen + lines: list[str] = [] + for ver, items in self.CHANGELOG: + if ver == last: + break + lines.append(f"

v{ver}

") + if not lines: + self._settings.setValue("last_seen_version", self.APP_VERSION) + return + msg = QMessageBox(self) + msg.setWindowTitle("What's new") + msg.setIcon(QMessageBox.Icon.Information) + msg.setTextFormat(Qt.TextFormat.RichText) + msg.setText("".join(lines)) + cb = QCheckBox("Don't show again for this version") + msg.setCheckBox(cb) + msg.exec() + if cb.isChecked(): + self._settings.setValue("last_seen_version", self.APP_VERSION) + def _show_shortcuts(self) -> None: text = ( "" @@ -3248,7 +3311,7 @@ class MainWindow(QMainWindow): "" "" "" - "" + "" "
Double-click markerEnter overwrite mode (locked: jump to end of clip span)
Right-click markerDelete clip group
Click video / crop barReposition portrait crop
Drag scan region edgeResize scan region
Shift+drag scan region edgeResize scan region
" ) QMessageBox.information(self, "Keyboard shortcuts", text) diff --git a/tests/test_utils.py b/tests/test_utils.py index c34297f..f0a9ac0 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -5,21 +5,21 @@ from main import ProcessedDB def test_build_export_path_first(): - assert build_export_path("/out", "clip", 1) == "/out/clip_001/clip_001.mp4" + assert build_export_path("/out", "clip", 1) == "/out/clip_001.mp4" def test_build_export_path_counter(): - assert build_export_path("/out", "clip", 42) == "/out/clip_042/clip_042.mp4" + assert build_export_path("/out", "clip", 42) == "/out/clip_042.mp4" def test_build_export_path_deep_counter(): - assert build_export_path("/out", "shot", 999) == "/out/shot_999/shot_999.mp4" + assert build_export_path("/out", "shot", 999) == "/out/shot_999.mp4" def test_build_export_path_sub(): - assert build_export_path("/out", "clip", 1, sub=0) == "/out/clip_001/clip_001_0.mp4" - assert build_export_path("/out", "clip", 1, sub=2) == "/out/clip_001/clip_001_2.mp4" + assert build_export_path("/out", "clip", 1, sub=0) == "/out/clip_001_0.mp4" + assert build_export_path("/out", "clip", 1, sub=2) == "/out/clip_001_2.mp4" def test_build_sequence_dir_sub(): - assert build_sequence_dir("/out", "clip", 1, sub=0) == "/out/clip_001/clip_001_0" - assert build_sequence_dir("/out", "clip", 1, sub=1) == "/out/clip_001/clip_001_1" + assert build_sequence_dir("/out", "clip", 1, sub=0) == "/out/clip_001_0" + assert build_sequence_dir("/out", "clip", 1, sub=1) == "/out/clip_001_1" def test_format_time_seconds(): assert format_time(0.0) == "0:00.0" @@ -178,10 +178,10 @@ def test_audio_extract_timing(): def test_build_sequence_dir_basic(): - assert build_sequence_dir("/out", "clip", 1) == "/out/clip_001/clip_001" + assert build_sequence_dir("/out", "clip", 1) == "/out/clip_001" def test_build_sequence_dir_counter(): - assert build_sequence_dir("/out", "clip", 42) == "/out/clip_042/clip_042" + assert build_sequence_dir("/out", "clip", 42) == "/out/clip_042" def test_ffmpeg_command_image_sequence(): cmd = build_ffmpeg_command("/in/v.mp4", 0.0, "/out/seq_001", image_sequence=True) @@ -265,13 +265,13 @@ def test_db_get_group_returns_all_sub_clips(): path = f.name try: db = ProcessedDB(path) - db.add("video.mp4", 10.0, "/out/clip_001/clip_001_0.mp4") - db.add("video.mp4", 10.0, "/out/clip_001/clip_001_1.mp4") - db.add("video.mp4", 10.0, "/out/clip_001/clip_001_2.mp4") - group = db.get_group("/out/clip_001/clip_001_0.mp4") + db.add("video.mp4", 10.0, "/out/vid_001/clip_001_0.mp4") + db.add("video.mp4", 10.0, "/out/vid_001/clip_001_1.mp4") + db.add("video.mp4", 10.0, "/out/vid_001/clip_001_2.mp4") + group = db.get_group("/out/vid_001/clip_001_0.mp4") assert len(group) == 3 - assert "/out/clip_001/clip_001_0.mp4" in group - assert "/out/clip_001/clip_001_2.mp4" in group + assert "/out/vid_001/clip_001_0.mp4" in group + assert "/out/vid_001/clip_001_2.mp4" in group finally: os.unlink(path) @@ -281,10 +281,10 @@ def test_db_get_group_isolates_by_start_time(): path = f.name try: db = ProcessedDB(path) - db.add("video.mp4", 10.0, "/out/clip_001/clip_001_0.mp4") - db.add("video.mp4", 10.0, "/out/clip_001/clip_001_1.mp4") - db.add("video.mp4", 30.0, "/out/clip_002/clip_002_0.mp4") - group = db.get_group("/out/clip_001/clip_001_0.mp4") + db.add("video.mp4", 10.0, "/out/vid_001/clip_001_0.mp4") + db.add("video.mp4", 10.0, "/out/vid_001/clip_001_1.mp4") + db.add("video.mp4", 30.0, "/out/vid_001/clip_002_0.mp4") + group = db.get_group("/out/vid_001/clip_001_0.mp4") assert len(group) == 2 finally: os.unlink(path) @@ -295,10 +295,10 @@ def test_db_delete_group_removes_all(): path = f.name try: db = ProcessedDB(path) - db.add("video.mp4", 10.0, "/out/clip_001/clip_001_0.mp4") - db.add("video.mp4", 10.0, "/out/clip_001/clip_001_1.mp4") - db.add("video.mp4", 30.0, "/out/clip_002/clip_002_0.mp4") - deleted = db.delete_group("/out/clip_001/clip_001_0.mp4") + db.add("video.mp4", 10.0, "/out/vid_001/clip_001_0.mp4") + db.add("video.mp4", 10.0, "/out/vid_001/clip_001_1.mp4") + db.add("video.mp4", 30.0, "/out/vid_001/clip_002_0.mp4") + deleted = db.delete_group("/out/vid_001/clip_001_0.mp4") assert len(deleted) == 2 # clip_002 should still exist markers = db.get_markers("video.mp4")