Add popular node metadata build CLI

This commit is contained in:
2026-07-02 21:57:31 +02:00
parent dddb136b16
commit 1895a0e677
2 changed files with 683 additions and 0 deletions
@@ -1,4 +1,5 @@
import json
import subprocess
import tempfile
import textwrap
import unittest
@@ -7,8 +8,15 @@ from pathlib import Path
from unittest import mock
from tools.generate_popular_node_signatures import (
build_artifact,
clone_or_update_repo,
extract_repo_signatures,
fetch_json,
main,
normalise_input_spec,
normalise_manager_entries,
rank_packs,
repo_cache_path,
write_artifact,
)
@@ -5499,5 +5507,267 @@ NODE_CLASS_MAPPINGS = {
self.assertEqual("2026-07-02T00:00:00Z", parsed["generated_at"])
class ManagerIngestionTests(unittest.TestCase):
def test_fetch_json_reads_and_decodes_json_with_clear_url_errors(self):
response = mock.Mock()
response.read.return_value = b'{"custom_nodes": []}'
response.__enter__ = mock.Mock(return_value=response)
response.__exit__ = mock.Mock(return_value=False)
with mock.patch("tools.generate_popular_node_signatures.urllib.request.urlopen", return_value=response):
self.assertEqual({"custom_nodes": []}, fetch_json("https://example.invalid/list.json"))
with mock.patch(
"tools.generate_popular_node_signatures.urllib.request.urlopen",
side_effect=OSError("network down"),
):
with self.assertRaisesRegex(RuntimeError, "https://example.invalid/list.json"):
fetch_json("https://example.invalid/list.json")
def test_normalise_manager_entries_accepts_git_clone_repos_and_skips_raw_file_installs(self):
manager_data = {
"custom_nodes": [
{
"author": "Alice",
"id": "alpha-id",
"title": "Alpha Nodes",
"files": ["https://github.com/example/alpha-nodes"],
"install_type": "git-clone",
"description": "Alpha description",
"downloads": "42",
},
{
"author": "Raw",
"id": "raw-id",
"title": "Raw File Node",
"files": ["https://raw.githubusercontent.com/example/raw-node.py"],
"install_type": "copy",
},
{
"author": "Bob",
"id": "reference-id",
"title": "Reference Nodes",
"reference": "https://github.com/example/reference-nodes.git",
"install_type": "git-clone",
"stars": 7,
},
]
}
entries = normalise_manager_entries(manager_data)
self.assertEqual(["alpha-id", "reference-id"], [entry["id"] for entry in entries])
self.assertEqual("https://github.com/example/alpha-nodes", entries[0]["repository"])
self.assertEqual("Alpha Nodes", entries[0]["title"])
self.assertEqual("Alice", entries[0]["author"])
self.assertEqual(42, entries[0]["metrics"]["downloads"])
self.assertEqual("https://github.com/example/reference-nodes.git", entries[1]["repository"])
def test_rank_packs_uses_popularity_metrics_then_stable_fallbacks(self):
packs = [
{
"id": "tie-b",
"title": "Tie B",
"repository": "https://github.com/example/tie-b",
"metrics": {"downloads": 5},
},
{
"id": "most",
"title": "Most",
"repository": "https://github.com/example/most",
"metrics": {"stars": 10},
},
{
"id": "tie-a",
"title": "Tie A",
"repository": "https://github.com/example/tie-a",
"metrics": {"favorites": 5},
},
{
"id": "none",
"title": "None",
"repository": "https://github.com/example/none",
"metrics": {},
},
]
ranked = rank_packs(packs)
self.assertEqual(["most", "tie-a", "tie-b", "none"], [pack["id"] for pack in ranked])
self.assertEqual([1, 2, 3, 4], [pack["rank"] for pack in ranked])
class RepoCacheTests(unittest.TestCase):
def test_repo_cache_path_is_safe_stable_and_collision_resistant(self):
with tempfile.TemporaryDirectory() as tmp:
cache_dir = Path(tmp)
first = repo_cache_path("https://github.com/Owner/Repo.git", cache_dir)
same = repo_cache_path("https://github.com/Owner/Repo.git", cache_dir)
collision = repo_cache_path("https://github.com/Other/Repo.git", cache_dir)
self.assertEqual(first, same)
self.assertNotEqual(first, collision)
self.assertEqual("repos", first.parent.name)
self.assertNotIn("..", first.name)
self.assertRegex(first.name, r"^github-com-owner-repo-[0-9a-f]{12}$")
def test_clone_or_update_repo_clones_missing_repo_and_pulls_existing_repo(self):
with tempfile.TemporaryDirectory() as tmp:
cache_dir = Path(tmp)
url = "https://github.com/example/pack.git"
expected_path = repo_cache_path(url, cache_dir)
with mock.patch("tools.generate_popular_node_signatures.subprocess.run") as run:
self.assertEqual(expected_path, clone_or_update_repo(url, cache_dir))
run.assert_called_once_with(
["git", "clone", "--depth", "1", url, str(expected_path)],
check=True,
stdout=subprocess.DEVNULL,
stderr=subprocess.PIPE,
text=True,
)
expected_path.mkdir(parents=True)
with mock.patch("tools.generate_popular_node_signatures.subprocess.run") as run:
self.assertEqual(expected_path, clone_or_update_repo(url, cache_dir))
run.assert_not_called()
with mock.patch("tools.generate_popular_node_signatures.subprocess.run") as run:
self.assertEqual(expected_path, clone_or_update_repo(url, cache_dir, refresh=True))
run.assert_called_once_with(
["git", "-C", str(expected_path), "pull", "--ff-only"],
check=True,
stdout=subprocess.DEVNULL,
stderr=subprocess.PIPE,
text=True,
)
class BuildArtifactTests(unittest.TestCase):
def _write_fixture_repo(self, path):
Path(path, "__init__.py").write_text(
textwrap.dedent(
'''
class GoodNode:
RETURN_TYPES = ("IMAGE",)
@classmethod
def INPUT_TYPES(cls):
return {
"required": {
"image": ("IMAGE",),
},
}
NODE_CLASS_MAPPINGS = {
"GoodNode": GoodNode,
}
'''
),
encoding="utf-8",
)
def test_build_artifact_continues_after_failed_repo_and_records_pack_error(self):
manager_data = {
"custom_nodes": [
{
"id": "broken-pack",
"title": "Broken Pack",
"files": ["https://github.com/example/broken-pack"],
"install_type": "git-clone",
"downloads": 20,
},
{
"id": "good-pack",
"title": "Good Pack",
"files": ["https://github.com/example/good-pack"],
"install_type": "git-clone",
"downloads": 10,
},
]
}
with tempfile.TemporaryDirectory() as tmp:
tmp_path = Path(tmp)
repo_dir = tmp_path / "good-repo"
repo_dir.mkdir()
self._write_fixture_repo(repo_dir)
output = tmp_path / "popular_node_signatures.json"
with (
mock.patch("tools.generate_popular_node_signatures.fetch_json", return_value=manager_data),
mock.patch(
"tools.generate_popular_node_signatures.clone_or_update_repo",
side_effect=[RuntimeError("clone failed"), repo_dir],
),
):
summary = build_artifact(
manager_url="https://example.invalid/manager.json",
cache_dir=tmp_path / "cache",
output=output,
limit=2,
generated_at="2026-07-02T00:00:00Z",
)
payload = json.loads(output.read_text(encoding="utf-8"))
self.assertEqual(2, summary["processed"])
self.assertEqual(1, summary["errors"])
self.assertEqual(1, summary["node_count"])
self.assertEqual("error", payload["packs"]["broken-pack"]["status"])
self.assertIn("clone failed", payload["packs"]["broken-pack"]["error"])
self.assertEqual("ok", payload["packs"]["good-pack"]["status"])
self.assertIn("GoodNode", payload["nodes"])
def test_cli_invokes_build_artifact_and_prints_summary(self):
with tempfile.TemporaryDirectory() as tmp:
output = Path(tmp, "artifact.json")
cache = Path(tmp, "cache")
fake_summary = {
"processed": 3,
"pack_count": 2,
"node_count": 7,
"errors": 1,
"output": output,
}
with (
mock.patch("tools.generate_popular_node_signatures.build_artifact", return_value=fake_summary) as build,
mock.patch("builtins.print") as print_mock,
):
exit_code = main(
[
"--manager-url",
"https://example.invalid/manager.json",
"--cache-dir",
str(cache),
"--output",
str(output),
"--limit",
"3",
"--refresh",
]
)
self.assertEqual(0, exit_code)
build.assert_called_once_with(
manager_url="https://example.invalid/manager.json",
cache_dir=cache,
output=output,
limit=3,
refresh=True,
generated_at=mock.ANY,
)
printed = print_mock.call_args.args[0]
self.assertIn("processed=3", printed)
self.assertIn("packs=2", printed)
self.assertIn("nodes=7", printed)
self.assertIn("errors=1", printed)
self.assertIn(str(output), printed)
if __name__ == "__main__":
unittest.main()
+413
View File
@@ -2,15 +2,24 @@
"""Generate UTFCN's popular_node_signatures.json artifact."""
import ast
import argparse
import hashlib
import json
import os
import re
import subprocess
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse
SCHEMA_VERSION = 1
MANAGER_LIST_URL = "https://raw.githubusercontent.com/ltdrdata/ComfyUI-Manager/main/custom-node-list.json"
REGISTRY_NODES_URL = "https://api.comfy.org/nodes"
DEFAULT_GENERATED_AT = "1970-01-01T00:00:00Z"
DEFAULT_CACHE_DIR = Path(".cache/utfcn-popular-node-repos")
DEFAULT_OUTPUT = Path("popular_node_signatures.json")
USER_AGENT = "ComfyUI-UTFCN popular node signature generator"
class UnsupportedStaticExpression(Exception):
@@ -41,6 +50,293 @@ _CLASS_SIGNATURE_ATTRS = {"INPUT_TYPES", "RETURN_NAMES", "RETURN_TYPES"}
_DYNAMIC_NAMESPACE_MUTATION = object()
_NAMESPACE_FUNCTIONS = {"globals", "locals", "vars"}
_NAMESPACE_DUNDER_MUTATORS = {"__delitem__", "__setitem__"}
_METRIC_FIELDS = (
"downloads",
"download_count",
"stars",
"github_stars",
"stargazers_count",
"favorites",
"favourites",
"installed",
"installs",
"install_count",
"count",
)
def fetch_json(url):
request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
try:
with urllib.request.urlopen(request, timeout=30) as response:
return json.loads(response.read().decode("utf-8"))
except Exception as exc:
raise RuntimeError(f"failed to fetch JSON from {url}: {exc}") from exc
def _manager_entries(raw):
if isinstance(raw, list):
return raw
if not isinstance(raw, dict):
return []
for key in ("custom_nodes", "customNodes", "nodes", "items"):
value = raw.get(key)
if isinstance(value, list):
return value
return []
def _coerce_int(value):
if isinstance(value, bool):
return 0
if isinstance(value, int):
return value
if isinstance(value, float):
return int(value)
if isinstance(value, str):
text = value.strip().replace(",", "")
if text.isdigit() or (text.startswith("-") and text[1:].isdigit()):
return int(text)
return 0
def _slug(value, default="unnamed-pack"):
text = str(value or "").strip().lower()
text = re.sub(r"[^a-z0-9]+", "-", text).strip("-")
return text or default
def github_repo_url(value):
if not isinstance(value, str):
return None
text = value.strip()
if not text:
return None
parsed = urlparse(text)
if parsed.scheme not in {"http", "https"} or parsed.netloc.lower() != "github.com":
return None
parts = [part for part in parsed.path.split("/") if part]
if len(parts) < 2:
return None
owner, repo = parts[0], parts[1]
return f"https://github.com/{owner}/{repo}"
def _normalise_repository_url(value):
if not isinstance(value, str):
return None
text = value.strip()
if not text:
return None
if re.match(r"^[A-Za-z0-9_.-]+@[A-Za-z0-9_.-]+:.+/.+(\.git)?$", text):
return text
parsed = urlparse(text)
if parsed.netloc.lower() == "github.com":
return github_repo_url(text)
if parsed.netloc.lower() == "raw.githubusercontent.com":
return None
if parsed.scheme not in {"http", "https", "git", "ssh"}:
return None
host = parsed.netloc.lower()
if not host:
return None
path_parts = [part for part in parsed.path.split("/") if part]
if len(path_parts) < 2:
return None
last = path_parts[-1].lower()
if not last.endswith(".git") and "." in last:
return None
return text
def _is_cloneable_repo_url(value):
return _normalise_repository_url(value) is not None
def _repository_candidates(item):
for key in ("repository", "repo", "git", "git_url", "url", "reference"):
value = item.get(key)
if isinstance(value, str):
yield value
elif isinstance(value, list):
for candidate in value:
yield candidate
files = item.get("files")
if isinstance(files, str):
yield files
elif isinstance(files, list):
for candidate in files:
yield candidate
def _manager_entry_repository(item):
install_type = str(item.get("install_type") or item.get("installType") or "").lower()
candidates = list(_repository_candidates(item))
if "git" in install_type:
for candidate in candidates:
repository = _normalise_repository_url(candidate)
if repository:
return repository
return None
for candidate in candidates:
repository = _normalise_repository_url(candidate)
if repository:
return repository
return None
def _entry_metrics(item):
metrics = {}
sources = [item]
for key in ("stats", "statistics", "metadata"):
value = item.get(key)
if isinstance(value, dict):
sources.append(value)
for source in sources:
for field in _METRIC_FIELDS:
value = _coerce_int(source.get(field))
if value:
metrics[field] = value
return metrics
def _pack_id_from_repository(repository):
parsed = urlparse(repository)
if parsed.netloc:
parts = [part for part in parsed.path.split("/") if part]
if parts:
return _slug(parts[-1].removesuffix(".git"))
return _slug(parsed.netloc)
if ":" in repository:
return _slug(repository.rsplit("/", 1)[-1].removesuffix(".git"))
return _slug(repository)
def normalise_manager_entries(raw):
entries = []
for manager_order, item in enumerate(_manager_entries(raw)):
if not isinstance(item, dict):
continue
repository = _manager_entry_repository(item)
if repository is None:
continue
pack_id = str(item.get("id") or "").strip()
if not pack_id:
pack_id = _slug(item.get("title") or _pack_id_from_repository(repository))
title = str(item.get("title") or pack_id).strip() or pack_id
entry = {
"id": pack_id,
"title": title,
"author": str(item.get("author") or "").strip(),
"repository": repository,
"manager_order": manager_order,
"metrics": _entry_metrics(item),
}
description = str(item.get("description") or "").strip()
if description:
entry["description"] = description
entries.append(entry)
return entries
def _popularity_score(pack):
return sum(_coerce_int(value) for value in pack.get("metrics", {}).values())
def rank_packs(packs, limit=None):
best_by_repository = {}
for pack in packs:
repository = pack.get("repository")
if not repository:
continue
candidate = dict(pack)
previous = best_by_repository.get(repository)
if previous is None:
best_by_repository[repository] = candidate
continue
candidate_key = (
_popularity_score(candidate),
-int(candidate.get("manager_order", 0)),
str(candidate.get("id", "")),
)
previous_key = (
_popularity_score(previous),
-int(previous.get("manager_order", 0)),
str(previous.get("id", "")),
)
if candidate_key > previous_key:
best_by_repository[repository] = candidate
ranked = sorted(
best_by_repository.values(),
key=lambda pack: (
-_popularity_score(pack),
str(pack.get("title", "")).lower(),
str(pack.get("id", "")),
str(pack.get("repository", "")),
),
)
if limit is not None:
ranked = ranked[:limit]
result = []
for index, pack in enumerate(ranked, start=1):
ranked_pack = dict(pack)
ranked_pack["rank"] = index
result.append(ranked_pack)
return result
def rank_entries(entries, limit=None):
return rank_packs(entries, limit)
def _repo_cache_slug(url):
text = str(url).strip()
parsed = urlparse(text)
if parsed.netloc:
parts = [parsed.netloc, *[part for part in parsed.path.split("/") if part]]
elif ":" in text:
host, path = text.split(":", 1)
host = host.split("@")[-1]
parts = [host, *[part for part in path.split("/") if part]]
else:
parts = [text]
if parts and parts[-1].endswith(".git"):
parts[-1] = parts[-1][:-4]
slug = "-".join(parts).lower()
slug = re.sub(r"[^a-z0-9]+", "-", slug).strip("-")
return slug[:80].strip("-") or "repo"
def repo_cache_path(url, cache_dir):
digest = hashlib.sha256(str(url).encode("utf-8")).hexdigest()[:12]
return Path(cache_dir) / "repos" / f"{_repo_cache_slug(url)}-{digest}"
def _run_git(command):
try:
subprocess.run(
command,
check=True,
stdout=subprocess.DEVNULL,
stderr=subprocess.PIPE,
text=True,
)
except subprocess.CalledProcessError as exc:
stderr = (exc.stderr or "").strip()
detail = f": {stderr}" if stderr else ""
raise RuntimeError(f"git command failed ({' '.join(command)}){detail}") from exc
def clone_or_update_repo(url, cache_dir, *, refresh=False):
target = repo_cache_path(url, cache_dir)
target.parent.mkdir(parents=True, exist_ok=True)
if target.exists():
if refresh:
_run_git(["git", "-C", str(target), "pull", "--ff-only"])
return target
_run_git(["git", "clone", "--depth", "1", url, str(target)])
return target
def _literal(node, env, allow_mutable_env=True):
@@ -2630,3 +2926,120 @@ def write_artifact(path, sources, packs, nodes, *, generated_at=DEFAULT_GENERATE
}
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(payload, indent=2, sort_keys=False) + "\n", encoding="utf-8")
def _pack_record_from_meta(pack, status, *, node_count=0, error=None):
record = {
"id": pack["id"],
"title": pack.get("title", pack["id"]),
"repository": pack.get("repository", ""),
"rank": pack.get("rank", 0),
"status": status,
"node_count": node_count,
}
if pack.get("author"):
record["author"] = pack["author"]
if pack.get("description"):
record["description"] = pack["description"]
if pack.get("metrics"):
record["metrics"] = dict(pack["metrics"])
if error is not None:
record["error"] = str(error)
return record
def _merge_pack_metadata(extracted_pack, pack):
merged = dict(extracted_pack)
if pack.get("author"):
merged["author"] = pack["author"]
if pack.get("description"):
merged["description"] = pack["description"]
if pack.get("metrics"):
merged["metrics"] = dict(pack["metrics"])
return merged
def build_artifact(
*,
manager_url=MANAGER_LIST_URL,
cache_dir=DEFAULT_CACHE_DIR,
output=DEFAULT_OUTPUT,
limit=1000,
refresh=False,
generated_at=None,
):
manager_raw = fetch_json(manager_url)
normalised = normalise_manager_entries(manager_raw)
ranked = rank_packs(normalised, limit)
packs = {}
nodes = {}
errors = 0
for pack in ranked:
try:
repo_dir = clone_or_update_repo(pack["repository"], cache_dir, refresh=refresh)
extracted = extract_repo_signatures(repo_dir, pack)
except Exception as exc:
errors += 1
packs[pack["id"]] = _pack_record_from_meta(pack, "error", error=exc)
continue
packs[pack["id"]] = _merge_pack_metadata(extracted["pack"], pack)
for node_type, node in sorted(extracted["nodes"].items()):
nodes.setdefault(node_type, node)
generated_at = generated_at if generated_at is not None else datetime.now(timezone.utc)
write_artifact(
Path(output),
sources={
"manager_url": manager_url,
"limit": limit,
"normalised_packs": len(normalised),
"processed_packs": len(ranked),
},
packs=packs,
nodes=nodes,
generated_at=generated_at,
)
return {
"processed": len(ranked),
"pack_count": len(packs),
"node_count": len(nodes),
"errors": errors,
"output": Path(output),
}
def main(argv=None):
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--manager-url", default=MANAGER_LIST_URL)
parser.add_argument("--cache-dir", type=Path, default=DEFAULT_CACHE_DIR)
parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT)
parser.add_argument("--limit", type=int, default=1000)
parser.add_argument("--refresh", action="store_true")
parser.add_argument("--quiet", action="store_true")
args = parser.parse_args(argv)
summary = build_artifact(
manager_url=args.manager_url,
cache_dir=args.cache_dir,
output=args.output,
limit=args.limit,
refresh=args.refresh,
generated_at=datetime.now(timezone.utc),
)
if not args.quiet:
print(
"wrote {output} processed={processed} packs={packs} nodes={nodes} errors={errors}".format(
output=summary["output"],
processed=summary["processed"],
packs=summary["pack_count"],
nodes=summary["node_count"],
errors=summary["errors"],
)
)
return 0
if __name__ == "__main__":
raise SystemExit(main())