From 036e6a21f6db724fc6eca7852fd7c86e661c36b7 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Wed, 6 May 2026 17:41:41 -0700 Subject: [PATCH 1/4] gfql: add benchmark residual triage tool for issue #880 --- ai/docs/gfql/README.md | 1 + bin/gfql_issue_880_triage.py | 14 + .../compute/gfql/benchmark_residual_triage.py | 324 ++++++++++++++++++ .../gfql/test_benchmark_residual_triage.py | 183 ++++++++++ 4 files changed, 522 insertions(+) create mode 100755 bin/gfql_issue_880_triage.py create mode 100644 graphistry/compute/gfql/benchmark_residual_triage.py create mode 100644 graphistry/tests/compute/gfql/test_benchmark_residual_triage.py diff --git a/ai/docs/gfql/README.md b/ai/docs/gfql/README.md index ce98239f9f..333783b90d 100644 --- a/ai/docs/gfql/README.md +++ b/ai/docs/gfql/README.md @@ -9,6 +9,7 @@ Guide for AI assistants working with GFQL (Graph Frame Query Language) in PyGrap - [`predicates_checklist.md`](./predicates_checklist.md) — End-to-end checklist for predicate implementations. - [`conformance.md`](./conformance.md) — Cypher TCK conformance harness and CI wiring. - [`../prompts/GFQL_LLM_GUIDE_MAINTENANCE.md`](../prompts/GFQL_LLM_GUIDE_MAINTENANCE.md) — Guidance for keeping AI assistants aligned with GFQL changes. +- `python bin/gfql_issue_880_triage.py --runs-dir ../pyg-bench/results/runs` — Builds benchmark-evidence markdown for splitting residual workaround-backed `#880` lanes into narrower child issues. ### Essential GFQL Operations ```python diff --git a/bin/gfql_issue_880_triage.py b/bin/gfql_issue_880_triage.py new file mode 100755 index 0000000000..b58cbbf772 --- /dev/null +++ b/bin/gfql_issue_880_triage.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python3 + +from pathlib import Path +import sys + +REPO_ROOT = Path(__file__).resolve().parent.parent +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from graphistry.compute.gfql.benchmark_residual_triage import run_cli + + +if __name__ == "__main__": + raise SystemExit(run_cli()) diff --git a/graphistry/compute/gfql/benchmark_residual_triage.py b/graphistry/compute/gfql/benchmark_residual_triage.py new file mode 100644 index 0000000000..47798335f7 --- /dev/null +++ b/graphistry/compute/gfql/benchmark_residual_triage.py @@ -0,0 +1,324 @@ +from __future__ import annotations + +import argparse +import json +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Tuple + + +@dataclass(frozen=True) +class ResidualLane: + run_dir: Path + run_dir_name: str + probe_path: Path + config_path: Optional[str] + remote_command: Optional[str] + backend: str + status: str + issue_refs: Tuple[str, ...] + suite: Optional[str] + upstream_query: Optional[str] + query_id: Optional[str] + semantic_scope: Optional[str] + notes: Tuple[str, ...] + latency_ms: Optional[float] + adapter_overhead_latency_ms: Optional[float] + rows_returned: Optional[int] + + @property + def lane_key(self) -> str: + upstream = self.upstream_query or "unknown-upstream" + query_id = self.query_id or "unknown-query-id" + return f"{upstream}:{query_id}:{self.backend}" + + +def _read_json(path: Path) -> Any: + with path.open("r", encoding="utf-8") as f: + return json.load(f) + + +def _coerce_float(value: Any) -> Optional[float]: + if value is None: + return None + if isinstance(value, (int, float)): + return float(value) + return None + + +def _coerce_int(value: Any) -> Optional[int]: + if value is None: + return None + if isinstance(value, bool): + return None + if isinstance(value, int): + return value + return None + + +def _coerce_str_list(value: Any) -> Tuple[str, ...]: + if not isinstance(value, list): + return tuple() + out: List[str] = [] + for item in value: + if isinstance(item, str): + out.append(item) + return tuple(out) + + +def _load_manifest_config_path(run_dir: Path) -> Optional[str]: + manifest_path = run_dir / "run-manifest.json" + if not manifest_path.exists(): + return None + payload = _read_json(manifest_path) + if not isinstance(payload, Mapping): + return None + config_path = payload.get("config_path") + return config_path if isinstance(config_path, str) else None + + +def _load_remote_command(run_dir: Path) -> Optional[str]: + remote_path = run_dir / "remote-execution.json" + if not remote_path.exists(): + return None + payload = _read_json(remote_path) + if not isinstance(payload, Mapping): + return None + command = payload.get("command") + return command if isinstance(command, str) else None + + +def iter_probe_result_paths(runs_dir: Path) -> Iterable[Path]: + for probe_path in sorted(runs_dir.glob("*/probe-results.json")): + if probe_path.is_file(): + yield probe_path + + +def load_residual_lanes(runs_dir: Path) -> List[ResidualLane]: + lanes: List[ResidualLane] = [] + for probe_path in iter_probe_result_paths(runs_dir): + run_dir = probe_path.parent + run_dir_name = run_dir.name + config_path = _load_manifest_config_path(run_dir) + remote_command = _load_remote_command(run_dir) + payload = _read_json(probe_path) + if not isinstance(payload, list): + continue + for entry in payload: + if not isinstance(entry, Mapping): + continue + backend = entry.get("backend") + status = entry.get("status") + if not isinstance(backend, str) or not isinstance(status, str): + continue + lanes.append( + ResidualLane( + run_dir=run_dir, + run_dir_name=run_dir_name, + probe_path=probe_path, + config_path=config_path, + remote_command=remote_command, + backend=backend, + status=status, + issue_refs=_coerce_str_list(entry.get("issue_refs")), + suite=entry.get("suite") if isinstance(entry.get("suite"), str) else None, + upstream_query=entry.get("upstream_query") if isinstance(entry.get("upstream_query"), str) else None, + query_id=entry.get("query_id") if isinstance(entry.get("query_id"), str) else None, + semantic_scope=entry.get("semantic_scope") if isinstance(entry.get("semantic_scope"), str) else None, + notes=_coerce_str_list(entry.get("notes")), + latency_ms=_coerce_float(entry.get("latency_ms")), + adapter_overhead_latency_ms=_coerce_float(entry.get("adapter_overhead_latency_ms")), + rows_returned=_coerce_int(entry.get("rows_returned")), + ) + ) + return lanes + + +def filter_issue_residuals( + lanes: Sequence[ResidualLane], + issue_ref: str, + backend: str = "gfql", + status: str = "partial", +) -> List[ResidualLane]: + return [ + lane + for lane in lanes + if lane.backend == backend and lane.status == status and issue_ref in lane.issue_refs + ] + + +def latest_per_lane_key(lanes: Sequence[ResidualLane]) -> List[ResidualLane]: + latest: Dict[str, ResidualLane] = {} + for lane in lanes: + prior = latest.get(lane.lane_key) + if prior is None or lane.probe_path.stat().st_mtime > prior.probe_path.stat().st_mtime: + latest[lane.lane_key] = lane + return sorted(latest.values(), key=lambda lane: lane.lane_key) + + +def classify_residual_bucket(lane: ResidualLane) -> str: + haystack = " ".join([lane.semantic_scope or "", *lane.notes]).lower() + if "ancestor-post" in haystack or "ancestor post" in haystack: + return "recursive_ancestor_row_join" + if "shortest" in haystack: + return "path_distance_plus_multi_table_join" + if "co-occurrence" in haystack or "cooccurrence" in haystack: + return "tag_cooccurrence_join_aggregation" + if "reply" in haystack and "author" in haystack: + return "reply_author_row_shaping_join" + if "employment" in haystack or "company" in haystack: + return "employment_company_row_join" + if "join" in haystack and "aggregate" in haystack: + return "joined_row_aggregation" + if "join" in haystack: + return "joined_row_projection" + return "unclassified_residual" + + +def bucket_residuals(lanes: Sequence[ResidualLane]) -> Dict[str, List[ResidualLane]]: + buckets: Dict[str, List[ResidualLane]] = {} + for lane in lanes: + bucket = classify_residual_bucket(lane) + buckets.setdefault(bucket, []).append(lane) + for values in buckets.values(): + values.sort(key=lambda lane: lane.lane_key) + return dict(sorted(buckets.items(), key=lambda kv: kv[0])) + + +def _format_issue_template_title(bucket: str) -> str: + return f"[FEA] GFQL residual row-bindings: {bucket.replace('_', ' ')}" + + +def render_markdown_report( + lanes: Sequence[ResidualLane], + issue_ref: str, + include_child_issue_templates: bool = True, +) -> str: + rows: List[str] = [] + rows.append(f"# Residual Triage For `{issue_ref}`") + rows.append("") + rows.append(f"- Residual lane count: **{len(lanes)}**") + rows.append("- Filter: backend=`gfql`, status=`partial`, matching issue ref") + rows.append("") + rows.append("## Lane Matrix") + rows.append("") + rows.append("| upstream_query | query_id | semantic_scope | latency_ms | run_dir |") + rows.append("|---|---|---|---:|---|") + for lane in lanes: + rows.append( + "| {upstream} | {query_id} | {scope} | {latency} | `{run_dir}` |".format( + upstream=lane.upstream_query or "-", + query_id=lane.query_id or "-", + scope=lane.semantic_scope or "-", + latency=f"{lane.latency_ms:.3f}" if lane.latency_ms is not None else "-", + run_dir=lane.run_dir_name, + ) + ) + + buckets = bucket_residuals(lanes) + rows.append("") + rows.append("## Proposed Child-Issue Split") + rows.append("") + rows.append("| bucket | lane_count | upstream_queries |") + rows.append("|---|---:|---|") + for bucket, bucket_lanes in buckets.items(): + upstreams = sorted({lane.upstream_query or "-" for lane in bucket_lanes}) + rows.append(f"| `{bucket}` | {len(bucket_lanes)} | {', '.join(upstreams)} |") + + if include_child_issue_templates: + rows.append("") + rows.append("## Child Issue Drafts") + rows.append("") + for bucket, bucket_lanes in buckets.items(): + rows.append(f"### {_format_issue_template_title(bucket)}") + rows.append("") + rows.append("Scope:") + rows.append("- Residual workaround-backed GFQL partial lanes grouped by shared benchmark semantics.") + rows.append("- Split from umbrella `{}` to keep root causes narrow and reproducible.".format(issue_ref)) + rows.append("") + rows.append("Repro Evidence:") + for lane in bucket_lanes: + rows.append( + "- `{upstream}` / `{query_id}` from `{run_dir}` (`probe-results.json`)".format( + upstream=lane.upstream_query or "-", + query_id=lane.query_id or "-", + run_dir=lane.run_dir_name, + ) + ) + if lane.config_path: + rows.append(f" - config: `{lane.config_path}`") + if lane.remote_command: + rows.append(f" - command: `{lane.remote_command}`") + if lane.notes: + rows.append(f" - workaround note: {lane.notes[-1]}") + rows.append("") + rows.append("Done when:") + rows.append("- Query lanes no longer require adapter workaround rows/joins under benchmark runs.") + rows.append(f"- `issue_refs` in benchmark probes no longer need `{issue_ref}` for these lanes.") + rows.append("") + return "\n".join(rows).rstrip() + "\n" + + +def build_arg_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Generate #880 residual triage report from pyg-bench run artifacts." + ) + parser.add_argument( + "--runs-dir", + default=None, + help="Path to pyg-bench results/runs directory. If omitted, auto-detects common local locations.", + ) + parser.add_argument( + "--issue-ref", + default="graphistry/pygraphistry#880", + help="Issue reference to filter on", + ) + parser.add_argument( + "--output", + default=None, + help="Optional output markdown path", + ) + parser.add_argument( + "--all-runs", + action="store_true", + help="Include all matching lanes instead of only latest per lane key", + ) + return parser + + +def _resolve_runs_dir(runs_dir_arg: Optional[str]) -> Path: + if runs_dir_arg: + return Path(runs_dir_arg).expanduser() + + candidates = [ + Path.cwd() / "results" / "runs", + Path.cwd().parent / "pyg-bench" / "results" / "runs", + ] + for candidate in candidates: + if candidate.exists(): + return candidate + + raise FileNotFoundError( + "Could not auto-detect pyg-bench runs directory. Pass --runs-dir explicitly." + ) + + +def run_cli(argv: Optional[Sequence[str]] = None) -> int: + parser = build_arg_parser() + args = parser.parse_args(argv) + runs_dir = _resolve_runs_dir(args.runs_dir) + lanes = load_residual_lanes(runs_dir) + lanes = filter_issue_residuals(lanes, issue_ref=args.issue_ref) + if not args.all_runs: + lanes = latest_per_lane_key(lanes) + report = render_markdown_report(lanes, issue_ref=args.issue_ref) + if args.output: + out_path = Path(args.output).expanduser() + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(report, encoding="utf-8") + print(report) + return 0 + + +if __name__ == "__main__": + raise SystemExit(run_cli()) diff --git a/graphistry/tests/compute/gfql/test_benchmark_residual_triage.py b/graphistry/tests/compute/gfql/test_benchmark_residual_triage.py new file mode 100644 index 0000000000..1391de9b3c --- /dev/null +++ b/graphistry/tests/compute/gfql/test_benchmark_residual_triage.py @@ -0,0 +1,183 @@ +from __future__ import annotations + +import json +import os +from pathlib import Path + +from graphistry.compute.gfql.benchmark_residual_triage import ( + ResidualLane, + _resolve_runs_dir, + bucket_residuals, + filter_issue_residuals, + latest_per_lane_key, + load_residual_lanes, + render_markdown_report, +) + + +def _write_json(path: Path, payload: object) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload), encoding="utf-8") + + +def test_load_filter_and_latest_selection(tmp_path: Path) -> None: + runs_dir = tmp_path / "runs" + issue = "graphistry/pygraphistry#880" + query_id = "new-topics" + upstream_query = "interactive-complex-4" + + run_a = runs_dir / "run-a" + run_b = runs_dir / "run-b" + + _write_json( + run_a / "probe-results.json", + [ + { + "backend": "gfql", + "status": "partial", + "issue_refs": [issue], + "suite": "snb-interactive", + "upstream_query": upstream_query, + "query_id": query_id, + "semantic_scope": "adapter_join_aggregation_workaround", + "notes": ["Adapter workaround joins and aggregates locally."], + "latency_ms": 12.0, + }, + { + "backend": "cypher", + "status": "ok", + "issue_refs": [], + "suite": "snb-interactive", + "upstream_query": upstream_query, + "query_id": query_id, + }, + ], + ) + _write_json(run_a / "run-manifest.json", {"config_path": "configs/suites/ic4.yaml"}) + _write_json(run_a / "remote-execution.json", {"command": "uv run python scripts/run_suite.py ..."}) + + _write_json( + run_b / "probe-results.json", + [ + { + "backend": "gfql", + "status": "partial", + "issue_refs": [issue], + "suite": "snb-interactive", + "upstream_query": upstream_query, + "query_id": query_id, + "semantic_scope": "adapter_join_aggregation_workaround", + "notes": ["Adapter workaround joins and aggregates locally."], + "latency_ms": 10.0, + } + ], + ) + _write_json(run_b / "run-manifest.json", {"config_path": "configs/suites/ic4.yaml"}) + + # Ensure run-b is newer for latest-per-key selection. + older = run_a / "probe-results.json" + newer = run_b / "probe-results.json" + os.utime(older, (1_700_000_000, 1_700_000_000)) + os.utime(newer, (1_700_000_100, 1_700_000_100)) + + lanes = load_residual_lanes(runs_dir) + assert len(lanes) == 3 + + residuals = filter_issue_residuals(lanes, issue_ref=issue) + assert len(residuals) == 2 + assert all(lane.backend == "gfql" for lane in residuals) + assert all(lane.status == "partial" for lane in residuals) + assert all(issue in lane.issue_refs for lane in residuals) + + latest = latest_per_lane_key(residuals) + assert len(latest) == 1 + assert latest[0].run_dir_name == "run-b" + + +def test_bucket_classification() -> None: + lane_join = ResidualLane( + run_dir=Path("/tmp/run1"), + run_dir_name="run1", + probe_path=Path("/tmp/run1/probe-results.json"), + config_path=None, + remote_command=None, + backend="gfql", + status="partial", + issue_refs=("graphistry/pygraphistry#880",), + suite="snb-interactive", + upstream_query="interactive-complex-4", + query_id="new-topics", + semantic_scope="adapter_join_aggregation_workaround", + notes=("Adapter workaround joins rows and aggregates locally.",), + latency_ms=1.0, + adapter_overhead_latency_ms=1.0, + rows_returned=10, + ) + lane_ancestor = ResidualLane( + run_dir=Path("/tmp/run2"), + run_dir_name="run2", + probe_path=Path("/tmp/run2/probe-results.json"), + config_path=None, + remote_command=None, + backend="gfql", + status="partial", + issue_refs=("graphistry/pygraphistry#880",), + suite="snb-interactive", + upstream_query="interactive-short-6", + query_id="message-forum", + semantic_scope="adapter_join_workaround", + notes=("Adapter workaround resolves ancestor-post plus forum/moderator rows locally.",), + latency_ms=2.0, + adapter_overhead_latency_ms=2.0, + rows_returned=20, + ) + + buckets = bucket_residuals([lane_join, lane_ancestor]) + assert "joined_row_aggregation" in buckets + assert "recursive_ancestor_row_join" in buckets + assert buckets["joined_row_aggregation"][0].query_id == "new-topics" + assert buckets["recursive_ancestor_row_join"][0].query_id == "message-forum" + + +def test_render_markdown_report_contains_evidence_blocks() -> None: + lane = ResidualLane( + run_dir=Path("/tmp/runx"), + run_dir_name="runx", + probe_path=Path("/tmp/runx/probe-results.json"), + config_path="configs/suites/snb-interactive-ic4-conformance-sf1.yaml", + remote_command="uv run python scripts/run_suite.py --suite snb-interactive ...", + backend="gfql", + status="partial", + issue_refs=("graphistry/pygraphistry#880",), + suite="snb-interactive", + upstream_query="interactive-complex-4", + query_id="new-topics", + semantic_scope="adapter_join_aggregation_workaround", + notes=("Adapter workaround joins rows and aggregates locally.",), + latency_ms=123.456, + adapter_overhead_latency_ms=120.0, + rows_returned=10, + ) + + report = render_markdown_report([lane], issue_ref="graphistry/pygraphistry#880") + assert "Residual Triage For `graphistry/pygraphistry#880`" in report + assert "interactive-complex-4" in report + assert "Child Issue Drafts" in report + assert "configs/suites/snb-interactive-ic4-conformance-sf1.yaml" in report + assert "uv run python scripts/run_suite.py" in report + + +def test_resolve_runs_dir_autodetect_and_explicit(tmp_path: Path, monkeypatch) -> None: + wd = tmp_path / "repo" + wd.mkdir(parents=True, exist_ok=True) + sibling_runs = tmp_path / "pyg-bench" / "results" / "runs" + sibling_runs.mkdir(parents=True, exist_ok=True) + + monkeypatch.chdir(wd) + resolved = _resolve_runs_dir(None) + assert resolved == sibling_runs + + explicit = tmp_path / "custom" / "runs" + explicit.mkdir(parents=True, exist_ok=True) + resolved_explicit = _resolve_runs_dir(str(explicit)) + assert resolved_explicit == explicit From 9dffbca055139c0800438188f2f626f5002f86c2 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Wed, 6 May 2026 18:08:24 -0700 Subject: [PATCH 2/4] chore(changelog): add #880 triage tooling entry --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e060a73887..9ede3be0f7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,9 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ### Documentation - **GFQL component-labeling examples + README clarity (#1324)**: Added concise WCC/SCC labeling examples for `compute_cugraph`, `compute_igraph('clusters')`, and local Cypher `CALL graphistry.cugraph.*` write/row modes in GFQL docs, clarified that component IDs are partition labels (not stable semantic IDs), and tightened the main README GFQL intro sentence for readability. +### Added +- **GFQL benchmark residual triage tooling for `#880`**: Added `graphistry.compute.gfql.benchmark_residual_triage` plus CLI wrapper `bin/gfql_issue_880_triage.py` to parse `pyg-bench` run artifacts, filter residual `gfql`/`partial` lanes by issue ref (default `graphistry/pygraphistry#880`), select latest lane evidence, bucket likely root-cause families, and emit split-ready child-issue draft markdown with concrete repro evidence (run dir, config path, command, workaround note). Includes focused regression coverage in `graphistry/tests/compute/gfql/test_benchmark_residual_triage.py` and a GFQL docs quick-reference pointer. + ## [0.55.1 - 2026-05-05] ### Tests From bac17b2d8f2a5e534e35a92922b37edfc1a91231 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Wed, 6 May 2026 18:30:58 -0700 Subject: [PATCH 3/4] revert: remove #880 triage tooling from pygraphistry --- CHANGELOG.md | 2 - ai/docs/gfql/README.md | 1 - bin/gfql_issue_880_triage.py | 14 - .../compute/gfql/benchmark_residual_triage.py | 324 ------------------ .../gfql/test_benchmark_residual_triage.py | 183 ---------- 5 files changed, 524 deletions(-) delete mode 100755 bin/gfql_issue_880_triage.py delete mode 100644 graphistry/compute/gfql/benchmark_residual_triage.py delete mode 100644 graphistry/tests/compute/gfql/test_benchmark_residual_triage.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ede3be0f7..72b7c7aa93 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,8 +11,6 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ### Documentation - **GFQL component-labeling examples + README clarity (#1324)**: Added concise WCC/SCC labeling examples for `compute_cugraph`, `compute_igraph('clusters')`, and local Cypher `CALL graphistry.cugraph.*` write/row modes in GFQL docs, clarified that component IDs are partition labels (not stable semantic IDs), and tightened the main README GFQL intro sentence for readability. -### Added -- **GFQL benchmark residual triage tooling for `#880`**: Added `graphistry.compute.gfql.benchmark_residual_triage` plus CLI wrapper `bin/gfql_issue_880_triage.py` to parse `pyg-bench` run artifacts, filter residual `gfql`/`partial` lanes by issue ref (default `graphistry/pygraphistry#880`), select latest lane evidence, bucket likely root-cause families, and emit split-ready child-issue draft markdown with concrete repro evidence (run dir, config path, command, workaround note). Includes focused regression coverage in `graphistry/tests/compute/gfql/test_benchmark_residual_triage.py` and a GFQL docs quick-reference pointer. ## [0.55.1 - 2026-05-05] diff --git a/ai/docs/gfql/README.md b/ai/docs/gfql/README.md index 333783b90d..ce98239f9f 100644 --- a/ai/docs/gfql/README.md +++ b/ai/docs/gfql/README.md @@ -9,7 +9,6 @@ Guide for AI assistants working with GFQL (Graph Frame Query Language) in PyGrap - [`predicates_checklist.md`](./predicates_checklist.md) — End-to-end checklist for predicate implementations. - [`conformance.md`](./conformance.md) — Cypher TCK conformance harness and CI wiring. - [`../prompts/GFQL_LLM_GUIDE_MAINTENANCE.md`](../prompts/GFQL_LLM_GUIDE_MAINTENANCE.md) — Guidance for keeping AI assistants aligned with GFQL changes. -- `python bin/gfql_issue_880_triage.py --runs-dir ../pyg-bench/results/runs` — Builds benchmark-evidence markdown for splitting residual workaround-backed `#880` lanes into narrower child issues. ### Essential GFQL Operations ```python diff --git a/bin/gfql_issue_880_triage.py b/bin/gfql_issue_880_triage.py deleted file mode 100755 index b58cbbf772..0000000000 --- a/bin/gfql_issue_880_triage.py +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env python3 - -from pathlib import Path -import sys - -REPO_ROOT = Path(__file__).resolve().parent.parent -if str(REPO_ROOT) not in sys.path: - sys.path.insert(0, str(REPO_ROOT)) - -from graphistry.compute.gfql.benchmark_residual_triage import run_cli - - -if __name__ == "__main__": - raise SystemExit(run_cli()) diff --git a/graphistry/compute/gfql/benchmark_residual_triage.py b/graphistry/compute/gfql/benchmark_residual_triage.py deleted file mode 100644 index 47798335f7..0000000000 --- a/graphistry/compute/gfql/benchmark_residual_triage.py +++ /dev/null @@ -1,324 +0,0 @@ -from __future__ import annotations - -import argparse -import json -from dataclasses import dataclass -from pathlib import Path -from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Tuple - - -@dataclass(frozen=True) -class ResidualLane: - run_dir: Path - run_dir_name: str - probe_path: Path - config_path: Optional[str] - remote_command: Optional[str] - backend: str - status: str - issue_refs: Tuple[str, ...] - suite: Optional[str] - upstream_query: Optional[str] - query_id: Optional[str] - semantic_scope: Optional[str] - notes: Tuple[str, ...] - latency_ms: Optional[float] - adapter_overhead_latency_ms: Optional[float] - rows_returned: Optional[int] - - @property - def lane_key(self) -> str: - upstream = self.upstream_query or "unknown-upstream" - query_id = self.query_id or "unknown-query-id" - return f"{upstream}:{query_id}:{self.backend}" - - -def _read_json(path: Path) -> Any: - with path.open("r", encoding="utf-8") as f: - return json.load(f) - - -def _coerce_float(value: Any) -> Optional[float]: - if value is None: - return None - if isinstance(value, (int, float)): - return float(value) - return None - - -def _coerce_int(value: Any) -> Optional[int]: - if value is None: - return None - if isinstance(value, bool): - return None - if isinstance(value, int): - return value - return None - - -def _coerce_str_list(value: Any) -> Tuple[str, ...]: - if not isinstance(value, list): - return tuple() - out: List[str] = [] - for item in value: - if isinstance(item, str): - out.append(item) - return tuple(out) - - -def _load_manifest_config_path(run_dir: Path) -> Optional[str]: - manifest_path = run_dir / "run-manifest.json" - if not manifest_path.exists(): - return None - payload = _read_json(manifest_path) - if not isinstance(payload, Mapping): - return None - config_path = payload.get("config_path") - return config_path if isinstance(config_path, str) else None - - -def _load_remote_command(run_dir: Path) -> Optional[str]: - remote_path = run_dir / "remote-execution.json" - if not remote_path.exists(): - return None - payload = _read_json(remote_path) - if not isinstance(payload, Mapping): - return None - command = payload.get("command") - return command if isinstance(command, str) else None - - -def iter_probe_result_paths(runs_dir: Path) -> Iterable[Path]: - for probe_path in sorted(runs_dir.glob("*/probe-results.json")): - if probe_path.is_file(): - yield probe_path - - -def load_residual_lanes(runs_dir: Path) -> List[ResidualLane]: - lanes: List[ResidualLane] = [] - for probe_path in iter_probe_result_paths(runs_dir): - run_dir = probe_path.parent - run_dir_name = run_dir.name - config_path = _load_manifest_config_path(run_dir) - remote_command = _load_remote_command(run_dir) - payload = _read_json(probe_path) - if not isinstance(payload, list): - continue - for entry in payload: - if not isinstance(entry, Mapping): - continue - backend = entry.get("backend") - status = entry.get("status") - if not isinstance(backend, str) or not isinstance(status, str): - continue - lanes.append( - ResidualLane( - run_dir=run_dir, - run_dir_name=run_dir_name, - probe_path=probe_path, - config_path=config_path, - remote_command=remote_command, - backend=backend, - status=status, - issue_refs=_coerce_str_list(entry.get("issue_refs")), - suite=entry.get("suite") if isinstance(entry.get("suite"), str) else None, - upstream_query=entry.get("upstream_query") if isinstance(entry.get("upstream_query"), str) else None, - query_id=entry.get("query_id") if isinstance(entry.get("query_id"), str) else None, - semantic_scope=entry.get("semantic_scope") if isinstance(entry.get("semantic_scope"), str) else None, - notes=_coerce_str_list(entry.get("notes")), - latency_ms=_coerce_float(entry.get("latency_ms")), - adapter_overhead_latency_ms=_coerce_float(entry.get("adapter_overhead_latency_ms")), - rows_returned=_coerce_int(entry.get("rows_returned")), - ) - ) - return lanes - - -def filter_issue_residuals( - lanes: Sequence[ResidualLane], - issue_ref: str, - backend: str = "gfql", - status: str = "partial", -) -> List[ResidualLane]: - return [ - lane - for lane in lanes - if lane.backend == backend and lane.status == status and issue_ref in lane.issue_refs - ] - - -def latest_per_lane_key(lanes: Sequence[ResidualLane]) -> List[ResidualLane]: - latest: Dict[str, ResidualLane] = {} - for lane in lanes: - prior = latest.get(lane.lane_key) - if prior is None or lane.probe_path.stat().st_mtime > prior.probe_path.stat().st_mtime: - latest[lane.lane_key] = lane - return sorted(latest.values(), key=lambda lane: lane.lane_key) - - -def classify_residual_bucket(lane: ResidualLane) -> str: - haystack = " ".join([lane.semantic_scope or "", *lane.notes]).lower() - if "ancestor-post" in haystack or "ancestor post" in haystack: - return "recursive_ancestor_row_join" - if "shortest" in haystack: - return "path_distance_plus_multi_table_join" - if "co-occurrence" in haystack or "cooccurrence" in haystack: - return "tag_cooccurrence_join_aggregation" - if "reply" in haystack and "author" in haystack: - return "reply_author_row_shaping_join" - if "employment" in haystack or "company" in haystack: - return "employment_company_row_join" - if "join" in haystack and "aggregate" in haystack: - return "joined_row_aggregation" - if "join" in haystack: - return "joined_row_projection" - return "unclassified_residual" - - -def bucket_residuals(lanes: Sequence[ResidualLane]) -> Dict[str, List[ResidualLane]]: - buckets: Dict[str, List[ResidualLane]] = {} - for lane in lanes: - bucket = classify_residual_bucket(lane) - buckets.setdefault(bucket, []).append(lane) - for values in buckets.values(): - values.sort(key=lambda lane: lane.lane_key) - return dict(sorted(buckets.items(), key=lambda kv: kv[0])) - - -def _format_issue_template_title(bucket: str) -> str: - return f"[FEA] GFQL residual row-bindings: {bucket.replace('_', ' ')}" - - -def render_markdown_report( - lanes: Sequence[ResidualLane], - issue_ref: str, - include_child_issue_templates: bool = True, -) -> str: - rows: List[str] = [] - rows.append(f"# Residual Triage For `{issue_ref}`") - rows.append("") - rows.append(f"- Residual lane count: **{len(lanes)}**") - rows.append("- Filter: backend=`gfql`, status=`partial`, matching issue ref") - rows.append("") - rows.append("## Lane Matrix") - rows.append("") - rows.append("| upstream_query | query_id | semantic_scope | latency_ms | run_dir |") - rows.append("|---|---|---|---:|---|") - for lane in lanes: - rows.append( - "| {upstream} | {query_id} | {scope} | {latency} | `{run_dir}` |".format( - upstream=lane.upstream_query or "-", - query_id=lane.query_id or "-", - scope=lane.semantic_scope or "-", - latency=f"{lane.latency_ms:.3f}" if lane.latency_ms is not None else "-", - run_dir=lane.run_dir_name, - ) - ) - - buckets = bucket_residuals(lanes) - rows.append("") - rows.append("## Proposed Child-Issue Split") - rows.append("") - rows.append("| bucket | lane_count | upstream_queries |") - rows.append("|---|---:|---|") - for bucket, bucket_lanes in buckets.items(): - upstreams = sorted({lane.upstream_query or "-" for lane in bucket_lanes}) - rows.append(f"| `{bucket}` | {len(bucket_lanes)} | {', '.join(upstreams)} |") - - if include_child_issue_templates: - rows.append("") - rows.append("## Child Issue Drafts") - rows.append("") - for bucket, bucket_lanes in buckets.items(): - rows.append(f"### {_format_issue_template_title(bucket)}") - rows.append("") - rows.append("Scope:") - rows.append("- Residual workaround-backed GFQL partial lanes grouped by shared benchmark semantics.") - rows.append("- Split from umbrella `{}` to keep root causes narrow and reproducible.".format(issue_ref)) - rows.append("") - rows.append("Repro Evidence:") - for lane in bucket_lanes: - rows.append( - "- `{upstream}` / `{query_id}` from `{run_dir}` (`probe-results.json`)".format( - upstream=lane.upstream_query or "-", - query_id=lane.query_id or "-", - run_dir=lane.run_dir_name, - ) - ) - if lane.config_path: - rows.append(f" - config: `{lane.config_path}`") - if lane.remote_command: - rows.append(f" - command: `{lane.remote_command}`") - if lane.notes: - rows.append(f" - workaround note: {lane.notes[-1]}") - rows.append("") - rows.append("Done when:") - rows.append("- Query lanes no longer require adapter workaround rows/joins under benchmark runs.") - rows.append(f"- `issue_refs` in benchmark probes no longer need `{issue_ref}` for these lanes.") - rows.append("") - return "\n".join(rows).rstrip() + "\n" - - -def build_arg_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser( - description="Generate #880 residual triage report from pyg-bench run artifacts." - ) - parser.add_argument( - "--runs-dir", - default=None, - help="Path to pyg-bench results/runs directory. If omitted, auto-detects common local locations.", - ) - parser.add_argument( - "--issue-ref", - default="graphistry/pygraphistry#880", - help="Issue reference to filter on", - ) - parser.add_argument( - "--output", - default=None, - help="Optional output markdown path", - ) - parser.add_argument( - "--all-runs", - action="store_true", - help="Include all matching lanes instead of only latest per lane key", - ) - return parser - - -def _resolve_runs_dir(runs_dir_arg: Optional[str]) -> Path: - if runs_dir_arg: - return Path(runs_dir_arg).expanduser() - - candidates = [ - Path.cwd() / "results" / "runs", - Path.cwd().parent / "pyg-bench" / "results" / "runs", - ] - for candidate in candidates: - if candidate.exists(): - return candidate - - raise FileNotFoundError( - "Could not auto-detect pyg-bench runs directory. Pass --runs-dir explicitly." - ) - - -def run_cli(argv: Optional[Sequence[str]] = None) -> int: - parser = build_arg_parser() - args = parser.parse_args(argv) - runs_dir = _resolve_runs_dir(args.runs_dir) - lanes = load_residual_lanes(runs_dir) - lanes = filter_issue_residuals(lanes, issue_ref=args.issue_ref) - if not args.all_runs: - lanes = latest_per_lane_key(lanes) - report = render_markdown_report(lanes, issue_ref=args.issue_ref) - if args.output: - out_path = Path(args.output).expanduser() - out_path.parent.mkdir(parents=True, exist_ok=True) - out_path.write_text(report, encoding="utf-8") - print(report) - return 0 - - -if __name__ == "__main__": - raise SystemExit(run_cli()) diff --git a/graphistry/tests/compute/gfql/test_benchmark_residual_triage.py b/graphistry/tests/compute/gfql/test_benchmark_residual_triage.py deleted file mode 100644 index 1391de9b3c..0000000000 --- a/graphistry/tests/compute/gfql/test_benchmark_residual_triage.py +++ /dev/null @@ -1,183 +0,0 @@ -from __future__ import annotations - -import json -import os -from pathlib import Path - -from graphistry.compute.gfql.benchmark_residual_triage import ( - ResidualLane, - _resolve_runs_dir, - bucket_residuals, - filter_issue_residuals, - latest_per_lane_key, - load_residual_lanes, - render_markdown_report, -) - - -def _write_json(path: Path, payload: object) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - path.write_text(json.dumps(payload), encoding="utf-8") - - -def test_load_filter_and_latest_selection(tmp_path: Path) -> None: - runs_dir = tmp_path / "runs" - issue = "graphistry/pygraphistry#880" - query_id = "new-topics" - upstream_query = "interactive-complex-4" - - run_a = runs_dir / "run-a" - run_b = runs_dir / "run-b" - - _write_json( - run_a / "probe-results.json", - [ - { - "backend": "gfql", - "status": "partial", - "issue_refs": [issue], - "suite": "snb-interactive", - "upstream_query": upstream_query, - "query_id": query_id, - "semantic_scope": "adapter_join_aggregation_workaround", - "notes": ["Adapter workaround joins and aggregates locally."], - "latency_ms": 12.0, - }, - { - "backend": "cypher", - "status": "ok", - "issue_refs": [], - "suite": "snb-interactive", - "upstream_query": upstream_query, - "query_id": query_id, - }, - ], - ) - _write_json(run_a / "run-manifest.json", {"config_path": "configs/suites/ic4.yaml"}) - _write_json(run_a / "remote-execution.json", {"command": "uv run python scripts/run_suite.py ..."}) - - _write_json( - run_b / "probe-results.json", - [ - { - "backend": "gfql", - "status": "partial", - "issue_refs": [issue], - "suite": "snb-interactive", - "upstream_query": upstream_query, - "query_id": query_id, - "semantic_scope": "adapter_join_aggregation_workaround", - "notes": ["Adapter workaround joins and aggregates locally."], - "latency_ms": 10.0, - } - ], - ) - _write_json(run_b / "run-manifest.json", {"config_path": "configs/suites/ic4.yaml"}) - - # Ensure run-b is newer for latest-per-key selection. - older = run_a / "probe-results.json" - newer = run_b / "probe-results.json" - os.utime(older, (1_700_000_000, 1_700_000_000)) - os.utime(newer, (1_700_000_100, 1_700_000_100)) - - lanes = load_residual_lanes(runs_dir) - assert len(lanes) == 3 - - residuals = filter_issue_residuals(lanes, issue_ref=issue) - assert len(residuals) == 2 - assert all(lane.backend == "gfql" for lane in residuals) - assert all(lane.status == "partial" for lane in residuals) - assert all(issue in lane.issue_refs for lane in residuals) - - latest = latest_per_lane_key(residuals) - assert len(latest) == 1 - assert latest[0].run_dir_name == "run-b" - - -def test_bucket_classification() -> None: - lane_join = ResidualLane( - run_dir=Path("/tmp/run1"), - run_dir_name="run1", - probe_path=Path("/tmp/run1/probe-results.json"), - config_path=None, - remote_command=None, - backend="gfql", - status="partial", - issue_refs=("graphistry/pygraphistry#880",), - suite="snb-interactive", - upstream_query="interactive-complex-4", - query_id="new-topics", - semantic_scope="adapter_join_aggregation_workaround", - notes=("Adapter workaround joins rows and aggregates locally.",), - latency_ms=1.0, - adapter_overhead_latency_ms=1.0, - rows_returned=10, - ) - lane_ancestor = ResidualLane( - run_dir=Path("/tmp/run2"), - run_dir_name="run2", - probe_path=Path("/tmp/run2/probe-results.json"), - config_path=None, - remote_command=None, - backend="gfql", - status="partial", - issue_refs=("graphistry/pygraphistry#880",), - suite="snb-interactive", - upstream_query="interactive-short-6", - query_id="message-forum", - semantic_scope="adapter_join_workaround", - notes=("Adapter workaround resolves ancestor-post plus forum/moderator rows locally.",), - latency_ms=2.0, - adapter_overhead_latency_ms=2.0, - rows_returned=20, - ) - - buckets = bucket_residuals([lane_join, lane_ancestor]) - assert "joined_row_aggregation" in buckets - assert "recursive_ancestor_row_join" in buckets - assert buckets["joined_row_aggregation"][0].query_id == "new-topics" - assert buckets["recursive_ancestor_row_join"][0].query_id == "message-forum" - - -def test_render_markdown_report_contains_evidence_blocks() -> None: - lane = ResidualLane( - run_dir=Path("/tmp/runx"), - run_dir_name="runx", - probe_path=Path("/tmp/runx/probe-results.json"), - config_path="configs/suites/snb-interactive-ic4-conformance-sf1.yaml", - remote_command="uv run python scripts/run_suite.py --suite snb-interactive ...", - backend="gfql", - status="partial", - issue_refs=("graphistry/pygraphistry#880",), - suite="snb-interactive", - upstream_query="interactive-complex-4", - query_id="new-topics", - semantic_scope="adapter_join_aggregation_workaround", - notes=("Adapter workaround joins rows and aggregates locally.",), - latency_ms=123.456, - adapter_overhead_latency_ms=120.0, - rows_returned=10, - ) - - report = render_markdown_report([lane], issue_ref="graphistry/pygraphistry#880") - assert "Residual Triage For `graphistry/pygraphistry#880`" in report - assert "interactive-complex-4" in report - assert "Child Issue Drafts" in report - assert "configs/suites/snb-interactive-ic4-conformance-sf1.yaml" in report - assert "uv run python scripts/run_suite.py" in report - - -def test_resolve_runs_dir_autodetect_and_explicit(tmp_path: Path, monkeypatch) -> None: - wd = tmp_path / "repo" - wd.mkdir(parents=True, exist_ok=True) - sibling_runs = tmp_path / "pyg-bench" / "results" / "runs" - sibling_runs.mkdir(parents=True, exist_ok=True) - - monkeypatch.chdir(wd) - resolved = _resolve_runs_dir(None) - assert resolved == sibling_runs - - explicit = tmp_path / "custom" / "runs" - explicit.mkdir(parents=True, exist_ok=True) - resolved_explicit = _resolve_runs_dir(str(explicit)) - assert resolved_explicit == explicit From 06a5c18040bda2555119c87f37032a2330aedb96 Mon Sep 17 00:00:00 2001 From: Leo Meyerovich Date: Wed, 6 May 2026 18:31:22 -0700 Subject: [PATCH 4/4] chore: drop accidental changelog whitespace diff --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 72b7c7aa93..e060a73887 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,6 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ### Documentation - **GFQL component-labeling examples + README clarity (#1324)**: Added concise WCC/SCC labeling examples for `compute_cugraph`, `compute_igraph('clusters')`, and local Cypher `CALL graphistry.cugraph.*` write/row modes in GFQL docs, clarified that component IDs are partition labels (not stable semantic IDs), and tightened the main README GFQL intro sentence for readability. - ## [0.55.1 - 2026-05-05] ### Tests