8bc42cabd9
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
99 lines
3.0 KiB
Python
99 lines
3.0 KiB
Python
#!/usr/bin/env python3
|
|
"""One-shot migration: generate dataset.json files from ~/.8cut.db history.
|
|
|
|
For each export folder that has records in the database, writes (or merges
|
|
into) a dataset.json with path (relative to the folder) and label per clip.
|
|
|
|
Usage:
|
|
python tools/migrate_dataset_json.py [--dry-run]
|
|
"""
|
|
import argparse
|
|
import json
|
|
import os
|
|
import sqlite3
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
|
|
DB_PATH = Path.home() / ".8cut.db"
|
|
|
|
|
|
def load_db_records(db_path: Path) -> list[dict]:
|
|
con = sqlite3.connect(db_path)
|
|
con.row_factory = sqlite3.Row
|
|
rows = con.execute(
|
|
"SELECT output_path, label FROM processed WHERE label != ''"
|
|
).fetchall()
|
|
con.close()
|
|
return [dict(r) for r in rows]
|
|
|
|
|
|
def build_entries_for_folder(folder: str, records: list[dict]) -> list[dict]:
|
|
return [
|
|
{"path": os.path.abspath(rec["output_path"]), "label": rec["label"]}
|
|
for rec in records
|
|
]
|
|
|
|
|
|
def merge_into_json(json_path: str, new_entries: list[dict], dry_run: bool) -> int:
|
|
"""Merge new_entries into existing json_path, returning count added/updated."""
|
|
existing: list[dict] = []
|
|
if os.path.exists(json_path):
|
|
with open(json_path, "r", encoding="utf-8") as f:
|
|
try:
|
|
existing = json.load(f)
|
|
except (json.JSONDecodeError, ValueError):
|
|
existing = []
|
|
|
|
by_path = {e["path"]: e for e in existing}
|
|
changed = 0
|
|
for entry in new_entries:
|
|
if by_path.get(entry["path"]) != entry:
|
|
by_path[entry["path"]] = entry
|
|
changed += 1
|
|
|
|
merged = list(by_path.values())
|
|
if not dry_run and changed:
|
|
with open(json_path, "w", encoding="utf-8") as f:
|
|
json.dump(merged, f, indent=2, ensure_ascii=False)
|
|
f.write("\n")
|
|
return changed
|
|
|
|
|
|
def main():
|
|
parser = argparse.ArgumentParser(description=__doc__)
|
|
parser.add_argument("--dry-run", action="store_true",
|
|
help="Show what would be written without writing")
|
|
args = parser.parse_args()
|
|
|
|
if not DB_PATH.exists():
|
|
print(f"Database not found: {DB_PATH}")
|
|
return
|
|
|
|
records = load_db_records(DB_PATH)
|
|
if not records:
|
|
print("No labelled records in database.")
|
|
return
|
|
|
|
# Group records by parent folder of output_path
|
|
by_folder: dict[str, list[dict]] = defaultdict(list)
|
|
for rec in records:
|
|
folder = os.path.dirname(rec["output_path"])
|
|
by_folder[folder].append(rec)
|
|
|
|
total_written = 0
|
|
for folder, recs in sorted(by_folder.items()):
|
|
entries = build_entries_for_folder(folder, recs)
|
|
json_path = os.path.join(folder, "dataset.json")
|
|
changed = merge_into_json(json_path, entries, args.dry_run)
|
|
status = "(dry run) " if args.dry_run else ""
|
|
print(f"{status}{json_path}: {len(entries)} clips, {changed} added/updated")
|
|
total_written += changed
|
|
|
|
suffix = " (dry run)" if args.dry_run else ""
|
|
print(f"\nTotal: {total_written} entries written{suffix}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|