fix: build object_info off the event loop so a slow mount can't hang ComfyUI
Root cause of 'refresh hangs ComfyUI': object_info is built by walking the model folders synchronously on the single aiohttp event loop. When the model tree is on a slow/stalling network mount (CIFS), that walk blocks in 'wait_for_response' and freezes the entire UI until the NAS answers.

Fix:
- Run the object_info build in a worker thread (folder-walk syscalls release the GIL, so the loop stays responsive). Uses ComfyUI's real node_info, resolved from the /object_info route closure, with a safe fallback to the in-loop build.
- Offload the Quick scan and Register work to a thread too (POST no longer freezes).
- Guard the incremental scanner against symlink cycles (visited realpaths).

Unit-tested: threaded build bypasses the in-loop handler; node_info resolves; cycle guard terminates.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
+89
-4
@@ -6,8 +6,9 @@ model/LoRA collection (especially on a network mount).
|
|||||||
|
|
||||||
It injects an aiohttp middleware that caches the huge /api/object_info response
|
It injects an aiohttp middleware that caches the huge /api/object_info response
|
||||||
in memory and on disk (survives restarts) and serves it gzipped, so the slow
|
in memory and on disk (survives restarts) and serves it gzipped, so the slow
|
||||||
build (which freezes ComfyUI's event loop) runs only on the first load or an
|
build runs only on the first load or an explicit refresh — not on every page
|
||||||
explicit refresh — not on every page load.
|
load. That build (and the refresh folder-walk) runs in a worker thread, so a
|
||||||
|
slow/stalling network model mount no longer freezes ComfyUI's event loop.
|
||||||
|
|
||||||
Three refresh modes are exposed (menu buttons, a graph node, and HTTP):
|
Three refresh modes are exposed (menu buttons, a graph node, and HTTP):
|
||||||
* full - clear ComfyUI's folder cache -> full re-walk of every model
|
* full - clear ComfyUI's folder cache -> full re-walk of every model
|
||||||
@@ -26,6 +27,7 @@ import os
|
|||||||
import gzip
|
import gzip
|
||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
|
import asyncio
|
||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
import threading
|
import threading
|
||||||
@@ -173,9 +175,17 @@ def _scan_root_incremental(root, old):
|
|||||||
"""Walk a root, scandir-ing only dirs whose mtime changed; reuse the rest."""
|
"""Walk a root, scandir-ing only dirs whose mtime changed; reuse the rest."""
|
||||||
new = {}
|
new = {}
|
||||||
scanned = reused = 0
|
scanned = reused = 0
|
||||||
|
visited = set() # real paths, to defend against symlink cycles on network mounts
|
||||||
stack = [root]
|
stack = [root]
|
||||||
while stack:
|
while stack:
|
||||||
d = stack.pop()
|
d = stack.pop()
|
||||||
|
try:
|
||||||
|
rp = os.path.realpath(d)
|
||||||
|
except OSError:
|
||||||
|
continue
|
||||||
|
if rp in visited:
|
||||||
|
continue
|
||||||
|
visited.add(rp)
|
||||||
try:
|
try:
|
||||||
m = os.path.getmtime(d)
|
m = os.path.getmtime(d)
|
||||||
except OSError:
|
except OSError:
|
||||||
@@ -344,6 +354,72 @@ def _check_node_signature():
|
|||||||
log.warning("Tenaciousload: node signature check failed: %s", e)
|
log.warning("Tenaciousload: node signature check failed: %s", e)
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# Off-loop object_info builder
|
||||||
|
# --------------------------------------------------------------------------- #
|
||||||
|
# Building object_info walks the model folders synchronously. On a slow/stalling
|
||||||
|
# network mount that walk blocks ComfyUI's single event loop = the whole UI
|
||||||
|
# hangs. We instead run the build in a worker thread: the folder-walk syscalls
|
||||||
|
# release the GIL while they wait on the NAS, so the event loop stays responsive.
|
||||||
|
_node_info_fn = None
|
||||||
|
_node_info_resolved = False
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_node_info_fn():
    """Locate ComfyUI's own ``node_info`` closure via the /object_info route.

    The GET handler registered for ``/object_info`` (or ``/api/object_info``)
    is inspected for a free variable named ``node_info``; if that cell holds a
    callable it is stored in the module-level ``_node_info_fn`` so the threaded
    build can call the very same function (no logic drift).

    Routes are added after custom nodes load, so this is done lazily on first
    use rather than at import time.

    Side effects:
        Sets ``_node_info_resolved`` to True unconditionally and, on success,
        sets ``_node_info_fn``. Never raises: any failure is logged and leaves
        ``_node_info_fn`` unchanged, so builds stay on the loop.
    """
    global _node_info_fn, _node_info_resolved
    # Mark the attempt up front so a failed lookup is not repeated per request.
    _node_info_resolved = True
    try:
        for route in PromptServer.instance.app.router.routes():
            if route.method != "GET":
                continue
            path = getattr(route.resource, "canonical", None)
            if path not in ("/object_info", "/api/object_info"):
                continue
            # Decorated handlers expose the original function via __wrapped__.
            fn = getattr(route.handler, "__wrapped__", route.handler)
            code = getattr(fn, "__code__", None)
            closure = getattr(fn, "__closure__", None)
            if not (code and closure):
                continue
            # co_freevars and __closure__ are parallel: name i <-> cell i.
            for name, cell in zip(code.co_freevars, closure):
                if name != "node_info":
                    continue
                try:
                    candidate = cell.cell_contents
                except ValueError:
                    # Empty cell: nothing usable here, keep checking other routes.
                    continue
                if callable(candidate):
                    _node_info_fn = candidate
                    log.info("Tenaciousload: threaded object_info build enabled")
                    return
        # Make the silent-fallback case visible in the log as well.
        log.warning("Tenaciousload: node_info not found on the /object_info route; builds stay on the loop")
    except Exception as e:  # pragma: no cover
        log.warning("Tenaciousload: could not resolve node_info (%s); builds stay on the loop", e)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_object_info_bytes():
    """Assemble the object_info payload the way ComfyUI's route does.

    Intended to run in a worker thread. Calls the resolved ``_node_info_fn``
    once per registered node class while ``folder_paths.cache_helper`` is
    active, logs (and skips) any node whose info cannot be built, and returns
    the resulting mapping serialized as UTF-8 encoded JSON bytes.
    """
    import nodes

    # Snapshot the keys so the mapping is not iterated live during the build.
    node_names = tuple(nodes.NODE_CLASS_MAPPINGS)
    info_by_node = {}
    with folder_paths.cache_helper:
        for node_name in node_names:
            try:
                info = _node_info_fn(node_name)
            except Exception:  # pragma: no cover
                log.error("Tenaciousload: node_info failed for '%s'", node_name, exc_info=True)
            else:
                info_by_node[node_name] = info
    payload = json.dumps(info_by_node)
    return payload.encode("utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
async def _build_object_info_off_loop():
    """Build object_info in a worker thread so the event loop is not blocked.

    Resolves ComfyUI's ``node_info`` function on first use, then runs
    ``_build_object_info_bytes`` in the loop's default executor.

    Returns:
        The raw JSON payload as ``bytes`` on success, or ``None`` when the
        caller should fall back to the normal in-loop handler (``node_info``
        could not be resolved, the build raised, or the result failed the
        size sanity check).
    """
    if _node_info_fn is None and not _node_info_resolved:
        _resolve_node_info_fn()
    if _node_info_fn is None:
        return None
    try:
        # Inside a coroutine the running loop is the one we want;
        # get_running_loop() is the documented way to obtain it.
        loop = asyncio.get_running_loop()
        raw = await loop.run_in_executor(None, _build_object_info_bytes)
        if isinstance(raw, (bytes, bytearray)) and len(raw) > 1000:  # sanity: real one is huge
            return bytes(raw)
        log.warning("Tenaciousload: threaded build looked wrong (%d bytes); falling back", len(raw or b""))
    except Exception as e:  # pragma: no cover
        log.warning("Tenaciousload: threaded build failed (%s); falling back", e)
    return None
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------------- #
|
# --------------------------------------------------------------------------- #
|
||||||
# object_info caching middleware
|
# object_info caching middleware
|
||||||
# --------------------------------------------------------------------------- #
|
# --------------------------------------------------------------------------- #
|
||||||
@@ -377,6 +453,13 @@ async def _object_info_cache_mw(request, handler):
|
|||||||
if "nocache" not in request.query and _mem["raw"] is not None:
|
if "nocache" not in request.query and _mem["raw"] is not None:
|
||||||
return _serve_cached(request)
|
return _serve_cached(request)
|
||||||
|
|
||||||
|
# MISS / refresh: build in a worker thread so a slow folder-walk does not
|
||||||
|
# freeze the event loop. Falls back to the normal in-loop handler.
|
||||||
|
raw = await _build_object_info_off_loop()
|
||||||
|
if raw is not None:
|
||||||
|
_store(raw)
|
||||||
|
return _serve_cached(request)
|
||||||
|
|
||||||
resp = await handler(request)
|
resp = await handler(request)
|
||||||
try:
|
try:
|
||||||
body = getattr(resp, "body", None)
|
body = getattr(resp, "body", None)
|
||||||
@@ -409,9 +492,11 @@ async def _refresh(request):
|
|||||||
except Exception:
|
except Exception:
|
||||||
data = {}
|
data = {}
|
||||||
mode = (data.get("mode") or "full").lower()
|
mode = (data.get("mode") or "full").lower()
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
|
||||||
if mode == "quick":
|
if mode == "quick":
|
||||||
summary = quick_rescan_all()
|
# run the folder walk off the loop so the UI stays responsive
|
||||||
|
summary = await loop.run_in_executor(None, quick_rescan_all)
|
||||||
invalidate_object_info_cache()
|
invalidate_object_info_cache()
|
||||||
rescanned = sum(s["scanned"] for s in summary)
|
rescanned = sum(s["scanned"] for s in summary)
|
||||||
log.info("Tenaciousload: quick refresh — %d folders touched, %d dirs rescanned", len(summary), rescanned)
|
log.info("Tenaciousload: quick refresh — %d folders touched, %d dirs rescanned", len(summary), rescanned)
|
||||||
@@ -420,7 +505,7 @@ async def _refresh(request):
|
|||||||
if mode == "register":
|
if mode == "register":
|
||||||
folder = data.get("folder") or "loras"
|
folder = data.get("folder") or "loras"
|
||||||
files = data.get("files") or []
|
files = data.get("files") or []
|
||||||
result = register_files(folder, files)
|
result = await loop.run_in_executor(None, register_files, folder, files)
|
||||||
invalidate_object_info_cache()
|
invalidate_object_info_cache()
|
||||||
log.info("Tenaciousload: register — %s", result)
|
log.info("Tenaciousload: register — %s", result)
|
||||||
return web.json_response({"status": "ok", "mode": "register", **result})
|
return web.json_response({"status": "ok", "mode": "register", **result})
|
||||||
|
|||||||
Reference in New Issue
Block a user