Fix RAM leak: strip history snapshots from memory, load on demand

History tree nodes stored full data snapshots in memory (5-50MB each),
accumulating with every save. Now:

- New `history_snapshots` DB table stores node data separately
- `save_history_tree` and `sync_to_db` extract snapshots before saving
- In-memory tree nodes only hold metadata (id, parent, note, timestamp)
- Restore and preview load snapshots from DB on demand
- `save_and_snap` uses json roundtrip instead of deepcopy (1 copy not 2)
- `_src_cache` moved to AppState, cleared on file switch
- `strip_snapshots()` method on HistoryTree for explicit cleanup

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-19 09:48:47 +01:00
parent 79e1426036
commit eac4e4f08b
7 changed files with 173 additions and 23 deletions
+74 -4
View File
@@ -48,8 +48,18 @@ CREATE TABLE IF NOT EXISTS history_trees (
updated_at REAL NOT NULL updated_at REAL NOT NULL
); );
CREATE TABLE IF NOT EXISTS history_snapshots (
id INTEGER PRIMARY KEY AUTOINCREMENT,
data_file_id INTEGER NOT NULL REFERENCES data_files(id) ON DELETE CASCADE,
node_id TEXT NOT NULL,
snapshot_data TEXT NOT NULL,
updated_at REAL NOT NULL,
UNIQUE(data_file_id, node_id)
);
CREATE INDEX IF NOT EXISTS idx_data_files_project_id ON data_files(project_id); CREATE INDEX IF NOT EXISTS idx_data_files_project_id ON data_files(project_id);
CREATE INDEX IF NOT EXISTS idx_sequences_data_file_id ON sequences(data_file_id); CREATE INDEX IF NOT EXISTS idx_sequences_data_file_id ON sequences(data_file_id);
CREATE INDEX IF NOT EXISTS idx_history_snapshots_df ON history_snapshots(data_file_id);
""" """
@@ -314,22 +324,64 @@ class ProjectDB:
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def save_history_tree(self, data_file_id: int, tree_data: dict) -> None: def save_history_tree(self, data_file_id: int, tree_data: dict) -> None:
"""Save history tree, extracting node snapshots into separate table."""
now = time.time() now = time.time()
# Extract snapshot data from nodes into history_snapshots table
nodes = tree_data.get("nodes", {})
slim_tree = dict(tree_data)
slim_nodes = {}
for nid, node in nodes.items():
snap = node.get("data")
if snap:
self.conn.execute(
"INSERT INTO history_snapshots (data_file_id, node_id, snapshot_data, updated_at) "
"VALUES (?, ?, ?, ?) "
"ON CONFLICT(data_file_id, node_id) DO UPDATE SET "
"snapshot_data=excluded.snapshot_data, updated_at=excluded.updated_at",
(data_file_id, nid, json.dumps(snap), now),
)
# Store node without data in tree
slim_nodes[nid] = {k: v for k, v in node.items() if k != "data"}
slim_tree["nodes"] = slim_nodes
self.conn.execute( self.conn.execute(
"INSERT INTO history_trees (data_file_id, tree_data, updated_at) " "INSERT INTO history_trees (data_file_id, tree_data, updated_at) "
"VALUES (?, ?, ?) " "VALUES (?, ?, ?) "
"ON CONFLICT(data_file_id) DO UPDATE SET tree_data=excluded.tree_data, updated_at=excluded.updated_at", "ON CONFLICT(data_file_id) DO UPDATE SET tree_data=excluded.tree_data, updated_at=excluded.updated_at",
(data_file_id, json.dumps(tree_data), now), (data_file_id, json.dumps(slim_tree), now),
) )
self.conn.commit() self.conn.commit()
def get_history_tree(self, data_file_id: int) -> dict | None: def get_history_tree(self, data_file_id: int) -> dict | None:
"""Load history tree metadata (without snapshot data)."""
row = self.conn.execute( row = self.conn.execute(
"SELECT tree_data FROM history_trees WHERE data_file_id = ?", "SELECT tree_data FROM history_trees WHERE data_file_id = ?",
(data_file_id,), (data_file_id,),
).fetchone() ).fetchone()
return json.loads(row["tree_data"]) if row else None return json.loads(row["tree_data"]) if row else None
# ------------------------------------------------------------------
# History snapshots (per-node data, loaded on demand)
# ------------------------------------------------------------------
def get_node_snapshot(self, data_file_id: int, node_id: str) -> dict | None:
"""Load a single node's snapshot data on demand."""
row = self.conn.execute(
"SELECT snapshot_data FROM history_snapshots WHERE data_file_id = ? AND node_id = ?",
(data_file_id, node_id),
).fetchone()
return json.loads(row["snapshot_data"]) if row else None
def delete_node_snapshots(self, data_file_id: int, node_ids: set) -> None:
"""Delete snapshots for removed nodes."""
if not node_ids:
return
placeholders = ",".join("?" for _ in node_ids)
self.conn.execute(
f"DELETE FROM history_snapshots WHERE data_file_id = ? AND node_id IN ({placeholders})",
(data_file_id, *node_ids),
)
self.conn.commit()
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Import # Import
# ------------------------------------------------------------------ # ------------------------------------------------------------------
@@ -385,15 +437,30 @@ class ProjectDB:
(df_id, seq_num, json.dumps(item), now), (df_id, seq_num, json.dumps(item), now),
) )
# Import history tree # Import history tree (extract snapshots into separate table)
history_tree = data.get(KEY_HISTORY_TREE) history_tree = data.get(KEY_HISTORY_TREE)
if history_tree and isinstance(history_tree, dict): if history_tree and isinstance(history_tree, dict):
now = time.time() now = time.time()
nodes = history_tree.get("nodes", {})
slim_tree = dict(history_tree)
slim_nodes = {}
for nid, node in nodes.items():
snap = node.get("data")
if snap:
self.conn.execute(
"INSERT INTO history_snapshots (data_file_id, node_id, snapshot_data, updated_at) "
"VALUES (?, ?, ?, ?) "
"ON CONFLICT(data_file_id, node_id) DO UPDATE SET "
"snapshot_data=excluded.snapshot_data, updated_at=excluded.updated_at",
(df_id, nid, json.dumps(snap), now),
)
slim_nodes[nid] = {k: v for k, v in node.items() if k != "data"}
slim_tree["nodes"] = slim_nodes
self.conn.execute( self.conn.execute(
"INSERT INTO history_trees (data_file_id, tree_data, updated_at) " "INSERT INTO history_trees (data_file_id, tree_data, updated_at) "
"VALUES (?, ?, ?) " "VALUES (?, ?, ?) "
"ON CONFLICT(data_file_id) DO UPDATE SET tree_data=excluded.tree_data, updated_at=excluded.updated_at", "ON CONFLICT(data_file_id) DO UPDATE SET tree_data=excluded.tree_data, updated_at=excluded.updated_at",
(df_id, json.dumps(history_tree), now), (df_id, json.dumps(slim_tree), now),
) )
self.conn.execute("COMMIT") self.conn.execute("COMMIT")
@@ -445,9 +512,12 @@ class ProjectDB:
data["batch_data"] = batch_data data["batch_data"] = batch_data
t2 = time.time() t2 = time.time()
# Load history tree # Load history tree (metadata only, no snapshot data)
tree = self.get_history_tree(df["id"]) tree = self.get_history_tree(df["id"])
if tree: if tree:
# Strip any residual snapshot data from nodes
for node in tree.get("nodes", {}).values():
node.pop("data", None)
data["history_tree"] = tree data["history_tree"] = tree
t3 = time.time() t3 = time.time()
+5
View File
@@ -76,6 +76,11 @@ class HistoryTree:
return self.nodes[node_id]["data"] return self.nodes[node_id]["data"]
return None return None
def strip_snapshots(self) -> None:
"""Remove snapshot data from all nodes to free memory."""
for node in self.nodes.values():
node.pop("data", None)
def to_dict(self) -> dict[str, Any]: def to_dict(self) -> dict[str, Any]:
return {"nodes": self.nodes, "branches": self.branches, "head_id": self.head_id} return {"nodes": self.nodes, "branches": self.branches, "head_id": self.head_id}
+19 -1
View File
@@ -11,7 +11,7 @@ from utils import (
load_config, save_config, load_snippets, save_snippets, load_config, save_config, load_snippets, save_snippets,
load_json, save_json, generate_templates, DEFAULTS, load_json, save_json, generate_templates, DEFAULTS,
KEY_BATCH_DATA, KEY_SEQUENCE_NUMBER, KEY_BATCH_DATA, KEY_SEQUENCE_NUMBER,
resolve_path_case_insensitive, resolve_path_case_insensitive, sync_to_db,
) )
from tab_batch_ng import render_batch_processor from tab_batch_ng import render_batch_processor
from tab_timeline_ng import render_timeline_tab from tab_timeline_ng import render_timeline_tab
@@ -290,11 +290,19 @@ def index():
pane_state.db.load_full_data, pane_state.current_project, file_stem) pane_state.db.load_full_data, pane_state.current_project, file_stem)
if data is None: if data is None:
data, _ = await asyncio.to_thread(load_json, fp) data, _ = await asyncio.to_thread(load_json, fp)
if pane_state.db and pane_state.db_enabled and pane_state.current_project:
await asyncio.to_thread(
sync_to_db, pane_state.db, pane_state.current_project, fp, data)
tree = data.get('history_tree')
if tree and isinstance(tree, dict):
for node in tree.get('nodes', {}).values():
node.pop('data', None)
pane_state.data_cache = data pane_state.data_cache = data
pane_state.last_mtime = fp.stat().st_mtime if fp.exists() else 0 pane_state.last_mtime = fp.stat().st_mtime if fp.exists() else 0
pane_state.loaded_file = str(fp) pane_state.loaded_file = str(fp)
pane_state.file_path = fp pane_state.file_path = fp
pane_state.restored_indicator = None pane_state.restored_indicator = None
pane_state._src_cache = {'data': None, 'batch': [], 'name': None}
_render_batch_tab_content.refresh() _render_batch_tab_content.refresh()
logger.info("on_select END (%.3fs)", _time.perf_counter() - _t0) logger.info("on_select END (%.3fs)", _time.perf_counter() - _t0)
@@ -320,11 +328,21 @@ def index():
state.db.load_full_data, state.current_project, file_stem) state.db.load_full_data, state.current_project, file_stem)
if data is None: if data is None:
data, _ = await asyncio.to_thread(load_json, fp) data, _ = await asyncio.to_thread(load_json, fp)
# When loading from JSON fallback and DB is enabled, sync to DB
# so snapshots are persisted, then strip from memory
if state.db and state.db_enabled and state.current_project:
await asyncio.to_thread(
sync_to_db, state.db, state.current_project, fp, data)
tree = data.get('history_tree')
if tree and isinstance(tree, dict):
for node in tree.get('nodes', {}).values():
node.pop('data', None)
state.data_cache = data state.data_cache = data
state.last_mtime = fp.stat().st_mtime if fp.exists() else 0 state.last_mtime = fp.stat().st_mtime if fp.exists() else 0
state.loaded_file = str(fp) state.loaded_file = str(fp)
state.file_path = fp state.file_path = fp
state.restored_indicator = None state.restored_indicator = None
state._src_cache = {'data': None, 'batch': [], 'name': None}
if state._main_rendered: if state._main_rendered:
render_main_content.refresh() render_main_content.refresh()
logger.info("load_file END (%.3fs)", _time.perf_counter() - _t0) logger.info("load_file END (%.3fs)", _time.perf_counter() - _t0)
+1
View File
@@ -28,6 +28,7 @@ class AppState:
_main_rendered: bool = False _main_rendered: bool = False
_live_checkboxes: dict = field(default_factory=dict) _live_checkboxes: dict = field(default_factory=dict)
_live_refreshables: dict = field(default_factory=dict) _live_refreshables: dict = field(default_factory=dict)
_src_cache: dict = field(default_factory=lambda: {'data': None, 'batch': [], 'name': None})
def create_secondary(self) -> 'AppState': def create_secondary(self) -> 'AppState':
return AppState( return AppState(
+15 -9
View File
@@ -247,8 +247,8 @@ def render_batch_processor(state: AppState):
src_seq_select = ui.select([], label='Source Sequence:').classes('w-64') src_seq_select = ui.select([], label='Source Sequence:').classes('w-64')
# Track loaded source data # Track loaded source data (on state so it's cleared on file switch)
_src_cache = {'data': None, 'batch': [], 'name': None} _src_cache = state._src_cache
def _update_src(): def _update_src():
name = src_file_select.value name = src_file_select.value
@@ -359,11 +359,14 @@ def render_batch_processor(state: AppState):
data[KEY_BATCH_DATA] = batch_list data[KEY_BATCH_DATA] = batch_list
tree_data = data.get(KEY_HISTORY_TREE, {}) tree_data = data.get(KEY_HISTORY_TREE, {})
htree = HistoryTree(tree_data) htree = HistoryTree(tree_data)
t1 = time.perf_counter()
snapshot_payload = {k: copy.deepcopy(v) for k, v in data.items()
if k != KEY_HISTORY_TREE}
logger.info("save_and_snap deepcopy %.3fs", time.perf_counter() - t1)
note = commit_input.value if commit_input.value else _auto_change_note(htree, batch_list) note = commit_input.value if commit_input.value else _auto_change_note(htree, batch_list)
# Single serialization: json roundtrip gives us an isolated snapshot
# without the expensive deepcopy
t1 = time.perf_counter()
snapshot_json = json.dumps({k: v for k, v in data.items()
if k != KEY_HISTORY_TREE})
snapshot_payload = json.loads(snapshot_json)
logger.info("save_and_snap snapshot %.3fs", time.perf_counter() - t1)
try: try:
htree.commit(snapshot_payload, note=note) htree.commit(snapshot_payload, note=note)
except ValueError as e: except ValueError as e:
@@ -371,13 +374,16 @@ def render_batch_processor(state: AppState):
return return
data[KEY_HISTORY_TREE] = htree.to_dict() data[KEY_HISTORY_TREE] = htree.to_dict()
t1 = time.perf_counter() t1 = time.perf_counter()
snapshot = json.loads(json.dumps(data)) save_snapshot = json.loads(json.dumps(data))
await asyncio.to_thread(save_json, file_path, snapshot) await asyncio.to_thread(save_json, file_path, save_snapshot)
logger.info("save_and_snap save_json %.3fs", time.perf_counter() - t1) logger.info("save_and_snap save_json %.3fs", time.perf_counter() - t1)
if state.db_enabled and state.current_project and state.db: if state.db_enabled and state.current_project and state.db:
t1 = time.perf_counter() t1 = time.perf_counter()
await asyncio.to_thread(sync_to_db, state.db, state.current_project, file_path, snapshot) await asyncio.to_thread(sync_to_db, state.db, state.current_project, file_path, save_snapshot)
logger.info("save_and_snap sync_to_db %.3fs", time.perf_counter() - t1) logger.info("save_and_snap sync_to_db %.3fs", time.perf_counter() - t1)
# Free snapshot data from memory — it's persisted in DB now
htree.strip_snapshots()
data[KEY_HISTORY_TREE] = htree.to_dict()
state.restored_indicator = None state.restored_indicator = None
commit_input.set_value('') commit_input.set_value('')
logger.info("save_and_snap END (%.3fs)", time.perf_counter() - t_ss) logger.info("save_and_snap END (%.3fs)", time.perf_counter() - t_ss)
+33 -7
View File
@@ -13,7 +13,7 @@ from utils import save_json, sync_to_db, KEY_BATCH_DATA, KEY_HISTORY_TREE
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def _delete_nodes(htree, data, file_path, node_ids): def _delete_nodes(htree, data, file_path, node_ids, state=None):
"""Delete nodes with backup, branch cleanup, re-parenting, and head fallback.""" """Delete nodes with backup, branch cleanup, re-parenting, and head fallback."""
if 'history_tree_backup' not in data: if 'history_tree_backup' not in data:
data['history_tree_backup'] = [] data['history_tree_backup'] = []
@@ -52,6 +52,11 @@ def _delete_nodes(htree, data, file_path, node_ids):
else: else:
htree.head_id = None htree.head_id = None
data[KEY_HISTORY_TREE] = htree.to_dict() data[KEY_HISTORY_TREE] = htree.to_dict()
# Clean up DB snapshots for deleted nodes
if state and state.db_enabled and state.db and state.current_project:
df = state.db.get_data_file_by_names(state.current_project, file_path.stem)
if df:
state.db.delete_node_snapshots(df['id'], set(node_ids))
def _render_selection_picker(all_nodes, htree, state, refresh_fn): def _render_selection_picker(all_nodes, htree, state, refresh_fn):
@@ -159,7 +164,7 @@ def _render_batch_delete(htree, data, file_path, state, refresh_fn):
async def do_batch_delete(): async def do_batch_delete():
current_valid = state.timeline_selected_nodes & set(htree.nodes.keys()) current_valid = state.timeline_selected_nodes & set(htree.nodes.keys())
_delete_nodes(htree, data, file_path, current_valid) _delete_nodes(htree, data, file_path, current_valid, state=state)
snapshot = json.loads(json.dumps(data)) snapshot = json.loads(json.dumps(data))
await asyncio.to_thread(save_json, file_path, snapshot) await asyncio.to_thread(save_json, file_path, snapshot)
if state.db_enabled and state.current_project and state.db: if state.db_enabled and state.current_project and state.db:
@@ -344,7 +349,7 @@ def _render_node_manager(all_nodes, htree, data, file_path, restore_fn, refresh_
async def delete_selected(): async def delete_selected():
if sel_id in htree.nodes: if sel_id in htree.nodes:
_delete_nodes(htree, data, file_path, {sel_id}) _delete_nodes(htree, data, file_path, {sel_id}, state=state)
snapshot = json.loads(json.dumps(data)) snapshot = json.loads(json.dumps(data))
await asyncio.to_thread(save_json, file_path, snapshot) await asyncio.to_thread(save_json, file_path, snapshot)
if state and state.db_enabled and state.current_project and state.db: if state and state.db_enabled and state.current_project and state.db:
@@ -361,7 +366,7 @@ def _render_node_manager(all_nodes, htree, data, file_path, restore_fn, refresh_
# Data preview # Data preview
with ui.expansion('Data Preview', icon='preview').classes('w-full q-mt-sm'): with ui.expansion('Data Preview', icon='preview').classes('w-full q-mt-sm'):
_render_data_preview(sel_id, htree) _render_data_preview(sel_id, htree, state=state, file_path=file_path)
render_branch_nodes() render_branch_nodes()
@@ -566,7 +571,20 @@ async def _restore_node(data, node, htree, file_path, state: AppState):
"""Restore a history node as the current version (full replace, not merge).""" """Restore a history node as the current version (full replace, not merge)."""
t0 = time.perf_counter() t0 = time.perf_counter()
logger.info("_restore_node START: %s", node.get('note', 'Step')) logger.info("_restore_node START: %s", node.get('note', 'Step'))
node_data = json.loads(json.dumps(node.get('data', {}))) # Load snapshot from DB on demand (nodes no longer hold data in memory)
raw_snap = node.get('data')
if not raw_snap and state.db_enabled and state.db and state.current_project:
df = state.db.get_data_file_by_names(state.current_project, file_path.stem)
if df:
raw_snap = await asyncio.to_thread(
state.db.get_node_snapshot, df['id'], node['id'])
if not raw_snap:
# Last resort: read from JSON file on disk
from utils import load_json as _load_json
raw_file, _ = await asyncio.to_thread(_load_json, file_path)
tree_on_disk = raw_file.get(KEY_HISTORY_TREE, {})
raw_snap = tree_on_disk.get('nodes', {}).get(node['id'], {}).get('data', {})
node_data = json.loads(json.dumps(raw_snap)) if raw_snap else {}
# Preserve the history tree before clearing # Preserve the history tree before clearing
preserved_tree = data.get(KEY_HISTORY_TREE) preserved_tree = data.get(KEY_HISTORY_TREE)
preserved_backup = data.get('history_tree_backup') preserved_backup = data.get('history_tree_backup')
@@ -589,13 +607,21 @@ async def _restore_node(data, node, htree, file_path, state: AppState):
ui.notify('Restored!', type='positive') ui.notify('Restored!', type='positive')
def _render_data_preview(nid, htree): def _render_data_preview(nid, htree, state: AppState = None, file_path=None):
"""Render a read-only preview of the selected node's data.""" """Render a read-only preview of the selected node's data."""
if not nid or nid not in htree.nodes: if not nid or nid not in htree.nodes:
ui.label('No node selected.').classes('text-caption') ui.label('No node selected.').classes('text-caption')
return return
node_data = htree.nodes[nid].get('data', {}) # Load snapshot from DB on demand (not stored in memory)
node_data = htree.nodes[nid].get('data')
if not node_data and state and state.db_enabled and state.db and state.current_project and file_path:
df = state.db.get_data_file_by_names(state.current_project, file_path.stem)
if df:
node_data = state.db.get_node_snapshot(df['id'], nid)
if not node_data:
ui.label('Snapshot data not available.').classes('text-caption text-warning')
return
batch_list = node_data.get(KEY_BATCH_DATA, []) batch_list = node_data.get(KEY_BATCH_DATA, [])
if batch_list and isinstance(batch_list, list) and len(batch_list) > 0: if batch_list and isinstance(batch_list, list) and len(batch_list) > 0:
+26 -2
View File
@@ -210,6 +210,15 @@ def save_json(path: str | Path, data: dict[str, Any]) -> None:
os.replace(tmp, path) os.replace(tmp, path)
logger.info("save_json %s: %.3fs", path.name, time.time() - t0) logger.info("save_json %s: %.3fs", path.name, time.time() - t0)
def snapshot_data(data: dict[str, Any]) -> dict[str, Any]:
"""Create a thread-safe deep copy via JSON roundtrip.
Must be called on the main thread before passing data to asyncio.to_thread,
to avoid 'dict changed size during iteration' when the UI mutates data.
"""
return json.loads(json.dumps(data))
def get_file_mtime(path: str | Path) -> float: def get_file_mtime(path: str | Path) -> float:
"""Returns the modification time of a file, or 0 if it doesn't exist.""" """Returns the modification time of a file, or 0 if it doesn't exist."""
path = Path(path) path = Path(path)
@@ -279,14 +288,29 @@ def sync_to_db(db, project_name: str, file_path: Path, data: dict) -> None:
else: else:
db.conn.execute("DELETE FROM sequences WHERE data_file_id = ?", (df_id,)) db.conn.execute("DELETE FROM sequences WHERE data_file_id = ?", (df_id,))
# Sync history tree # Sync history tree (extract node snapshots into separate table)
history_tree = data.get(KEY_HISTORY_TREE) history_tree = data.get(KEY_HISTORY_TREE)
if history_tree and isinstance(history_tree, dict): if history_tree and isinstance(history_tree, dict):
nodes = history_tree.get("nodes", {})
slim_tree = dict(history_tree)
slim_nodes = {}
for nid, node in nodes.items():
snap = node.get("data")
if snap:
db.conn.execute(
"INSERT INTO history_snapshots (data_file_id, node_id, snapshot_data, updated_at) "
"VALUES (?, ?, ?, ?) "
"ON CONFLICT(data_file_id, node_id) DO UPDATE SET "
"snapshot_data=excluded.snapshot_data, updated_at=excluded.updated_at",
(df_id, nid, json.dumps(snap), now),
)
slim_nodes[nid] = {k: v for k, v in node.items() if k != "data"}
slim_tree["nodes"] = slim_nodes
db.conn.execute( db.conn.execute(
"INSERT INTO history_trees (data_file_id, tree_data, updated_at) " "INSERT INTO history_trees (data_file_id, tree_data, updated_at) "
"VALUES (?, ?, ?) " "VALUES (?, ?, ?) "
"ON CONFLICT(data_file_id) DO UPDATE SET tree_data=excluded.tree_data, updated_at=excluded.updated_at", "ON CONFLICT(data_file_id) DO UPDATE SET tree_data=excluded.tree_data, updated_at=excluded.updated_at",
(df_id, json.dumps(history_tree), now), (df_id, json.dumps(slim_tree), now),
) )
db.conn.execute("COMMIT") db.conn.execute("COMMIT")