Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,13 @@ BENCHMARK_SPLIT ?= train
BENCHMARK_RUN ?= benchmarks/runs/$(BENCHMARK_SPLIT)
BENCHMARK_PARSER_URL ?= $(SCIENCEBEAM_PARSER_URL)

# Parameters for the dev-show-regressions / dev-show-improvements targets.
# SHOW_FIELD has no default; both targets list .require-SHOW_FIELD as a
# prerequisite (that rule is defined outside this excerpt).
SHOW_FIELD ?=
SHOW_METHOD ?= edit_sim
SHOW_CORPUS ?= biorxiv
SHOW_LIMIT ?= 10
SHOW_RUN_A ?= $(BENCHMARK_RUN)
# SHOW_RUN_B defaults to benchmarks/runs/baselines/<tool>/<version>/<split> built
# from the first entry of `baselines` in benchmarks/eval.yml, or to an empty
# string when that list is empty. stderr is discarded, so if the python3 lookup
# fails nothing is printed and the value also ends up empty.
SHOW_RUN_B ?= $(shell python3 -c "import yaml; b=yaml.safe_load(open('benchmarks/eval.yml')).get('baselines',[]); print('benchmarks/runs/baselines/'+b[0]['tool']+'/'+b[0]['version']+'/$(BENCHMARK_SPLIT)') if b else print('')" 2>/dev/null)

COMPARE_MODEL ?= segmentation
COMPARE_DOC_ID ?= $(basename $(notdir $(COMPARE_PDF)))
COMPARE_DOC_DIR = .temp/compare-with-grobid/by-doc/$(COMPARE_DOC_ID)
Expand Down Expand Up @@ -195,6 +202,32 @@ dev-benchmark-compare:
dev-benchmark: dev-benchmark-predict dev-benchmark-score


# Run benchmarks.show_cases in regression mode: list documents where SHOW_RUN_A
# scores lower than SHOW_RUN_B on SHOW_FIELD / SHOW_METHOD, limited to
# SHOW_LIMIT cases from SHOW_CORPUS. Depends on .require-SHOW_FIELD.
dev-show-regressions: .require-SHOW_FIELD
$(PYTHON) -m benchmarks.show_cases \
--run-a $(SHOW_RUN_A) \
--run-b $(SHOW_RUN_B) \
--field $(SHOW_FIELD) \
--method $(SHOW_METHOD) \
--corpus $(SHOW_CORPUS) \
--mode regression \
--data benchmarks/data \
--split $(BENCHMARK_SPLIT) \
--limit $(SHOW_LIMIT)


# Run benchmarks.show_cases in improvement mode: list documents where SHOW_RUN_A
# scores higher than SHOW_RUN_B on SHOW_FIELD / SHOW_METHOD, limited to
# SHOW_LIMIT cases from SHOW_CORPUS. Depends on .require-SHOW_FIELD.
dev-show-improvements: .require-SHOW_FIELD
$(PYTHON) -m benchmarks.show_cases \
--run-a $(SHOW_RUN_A) \
--run-b $(SHOW_RUN_B) \
--field $(SHOW_FIELD) \
--method $(SHOW_METHOD) \
--corpus $(SHOW_CORPUS) \
--mode improvement \
--data benchmarks/data \
--split $(BENCHMARK_SPLIT) \
--limit $(SHOW_LIMIT)


dev-benchmark-with-baselines:
$(PYTHON) -m benchmarks.run_local \
--config $(BENCHMARK_CONFIG) \
Expand Down
304 changes: 304 additions & 0 deletions benchmarks/show_cases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,304 @@
from __future__ import annotations

import argparse
import difflib
import json
import logging
import shutil
from io import BytesIO
from pathlib import Path
from typing import List, Optional, Tuple

from sciencebeam_judge.parsing.xml import parse_xml, parse_xml_mapping
from sciencebeam_judge.parsing.xpath.xpath_functions import register_functions
from sciencebeam_judge.resources import DEFAULT_XML_MAPPING_PATH

# Module-level logger; find_cases uses it to warn when a run has no scores directory.
LOGGER = logging.getLogger(__name__)

# ANSI escape sequences used by word_diff to colour its output.
_RESET = "\033[0m"  # reset all attributes
_RED = "\033[91m"  # bright red foreground
_GREEN = "\033[92m"  # bright green foreground


def word_diff(reference: Optional[str], candidate: Optional[str]) -> str:
    """Render a coloured, word-level diff of *candidate* against *reference*.

    Both inputs are split on whitespace and compared with
    ``difflib.SequenceMatcher``. Words only in the reference are wrapped as
    ``[-...-]`` in red, words only in the candidate as ``{+...+}`` in green,
    and unchanged runs are emitted as-is. A ``None`` input produces a short
    placeholder message instead of a diff.
    """
    if reference is None:
        if candidate is None:
            return "(both absent)"
        return f"(gold absent) {_GREEN}{candidate}{_RESET}"
    if candidate is None:
        return f"{_RED}(prediction absent){_RESET}"

    gold_tokens = reference.split()
    pred_tokens = candidate.split()

    def removed(lo: int, hi: int) -> str:
        return f"{_RED}[-{' '.join(gold_tokens[lo:hi])}-]{_RESET}"

    def added(lo: int, hi: int) -> str:
        return f"{_GREEN}{{+{' '.join(pred_tokens[lo:hi])}+}}{_RESET}"

    opcodes = difflib.SequenceMatcher(
        None, gold_tokens, pred_tokens, autojunk=False
    ).get_opcodes()

    chunks = []
    for tag, g_lo, g_hi, p_lo, p_hi in opcodes:
        if tag == "equal":
            chunks.append(" ".join(gold_tokens[g_lo:g_hi]))
            continue
        # A "replace" is rendered as a removal immediately followed by an addition.
        if tag in ("replace", "delete"):
            chunks.append(removed(g_lo, g_hi))
        if tag in ("replace", "insert"):
            chunks.append(added(p_lo, p_hi))
    return " ".join(chunks)


def _get_doc_score(field_scores: dict, method: str) -> Optional[float]:
ms = field_scores.get(method)
if ms is None:
return None
if "sim_sum" in ms:
ec = ms.get("expected_count", 0)
pc = ms.get("predicted_count", 0)
precision = ms["sim_sum"] / pc if pc else 0.0
recall = ms["sim_sum"] / ec if ec else 0.0
return (
2 * precision * recall / (precision + recall)
if (precision + recall)
else 0.0
)
return ms.get("f1")


def _load_doc_score(score_path: Path, field: str, method: str) -> Optional[float]:
if not score_path.exists():
return None
data = json.loads(score_path.read_text(encoding="utf-8"))
return _get_doc_score(data.get("fields", {}).get(field, {}), method)


def _extract_field_text(xml_path: Path, field: str, xml_mapping: dict) -> Optional[str]:
if not xml_path.exists():
return None
values = parse_xml(BytesIO(xml_path.read_bytes()), xml_mapping, fields=[field])
items = values.get(field, [])
return " | ".join(str(v) for v in items) if items else None


def _run_label(run: Path) -> str:
parts = list(run.parts)
try:
idx = parts.index("runs")
return "/".join(parts[idx + 1:])
except ValueError:
return run.name


def _comparison_label(run_b: Path) -> str:
    """Build a short, directory-safe label for the comparison run.

    Starts from ``_run_label(run_b)``, drops a leading ``baselines/`` prefix,
    removes the last path segment (the split) when there is more than one
    segment, and replaces the remaining ``/`` separators with ``-``.

        baselines/grobid/0.9.0-crf/train -> grobid-0.9.0-crf
    """
    prefix = "baselines/"
    label = _run_label(run_b)
    if label.startswith(prefix):
        label = label[len(prefix):]
    # Drop the trailing split segment, but only if another segment precedes it.
    head, separator, _ = label.rpartition("/")
    if separator:
        label = head
    return label.replace("/", "-")


def find_cases(
    run_a: Path,
    run_b: Path,
    field: str,
    method: str,
    corpus_filter: Optional[str],
    mode: str,
) -> List[Tuple[float, str, str, float, float]]:
    """Collect documents whose score differs between two runs in the *mode* direction.

    For every ``*.json`` score file under ``run_a/scores/<corpus>`` the
    matching file in ``run_b`` is loaded and ``delta = score_a - score_b`` is
    computed. Documents missing a score in either run are skipped.
    ``"regression"`` keeps negative deltas, ``"improvement"`` keeps positive
    ones.

    Returns ``(delta, corpus, record_id, score_a, score_b)`` tuples, largest
    magnitude first: most negative first for regressions, most positive first
    for improvements. Returns an empty list (after a warning) when *run_a* has
    no ``scores`` directory.
    """
    scores_root_a = run_a / "scores"
    if not scores_root_a.exists():
        LOGGER.warning("No scores directory in %s", run_a)
        return []

    if corpus_filter:
        corpus_names = [corpus_filter]
    else:
        corpus_names = sorted(
            entry.name for entry in scores_root_a.iterdir() if entry.is_dir()
        )

    want_regression = mode == "regression"
    want_improvement = mode == "improvement"

    found = []
    for corpus_name in corpus_names:
        dir_a = scores_root_a / corpus_name
        if not dir_a.exists():
            continue
        dir_b = run_b / "scores" / corpus_name
        for path_a in sorted(dir_a.glob("*.json")):
            score_a = _load_doc_score(path_a, field, method)
            score_b = _load_doc_score(dir_b / path_a.name, field, method)
            if score_a is None or score_b is None:
                continue
            delta = score_a - score_b
            if (want_regression and delta < 0) or (want_improvement and delta > 0):
                found.append((delta, corpus_name, path_a.stem, score_a, score_b))

    # Ascending puts the worst regression first; descending puts the best improvement first.
    found.sort(key=lambda case: case[0], reverse=want_improvement)
    return found


def _extract_texts(
    corpus: str,
    record_id: str,
    run_a: Path,
    run_b: Path,
    data_dir: Path,
    split: str,
    field: str,
    xml_mapping: dict,
) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Extract the text of *field* from the gold file and from both predictions.

    The gold document is read from ``data_dir/split/corpus/<id>.jats.xml`` and
    the predictions from ``<run>/predictions/corpus/<id>.tei.xml``.

    Returns ``(gold_text, text_a, text_b)``; each element is whatever
    ``_extract_field_text`` returns for that file.
    """
    sources = (
        data_dir / split / corpus / f"{record_id}.jats.xml",
        run_a / "predictions" / corpus / f"{record_id}.tei.xml",
        run_b / "predictions" / corpus / f"{record_id}.tei.xml",
    )
    gold_text, text_a, text_b = (
        _extract_field_text(source, field, xml_mapping) for source in sources
    )
    return gold_text, text_a, text_b


def _print_case(  # pylint: disable=too-many-arguments,too-many-positional-arguments
    delta: float,
    corpus: str,
    record_id: str,
    score_a: float,
    score_b: float,
    label_a: str,
    label_b: str,
    gold_text: Optional[str],
    text_a: Optional[str],
    text_b: Optional[str],
) -> None:
    """Print one case: a header line, the gold text, and a scored diff per run.

    The gold line is omitted when *gold_text* is ``None``. Labels are
    left-aligned to the longest label, with a minimum width of 4 (the length
    of ``gold``). A blank line is printed after each case.
    """
    width = max(4, len(label_a), len(label_b))
    print(f"record : {corpus}/{record_id} Δ={delta:+.3f}")
    if gold_text is not None:
        print(f" {'gold':<{width}} : {gold_text}")
    for label, score, text in (
        (label_a, score_a, text_a),
        (label_b, score_b, text_b),
    ):
        print(f" {label:<{width}} : {score:.3f} | {word_diff(gold_text, text)}")
    print()


def _copy_if_exists(src: Path, dst: Path) -> None:
if src.exists():
shutil.copy(src, dst)


def _export_case(
    out_dir: Path,
    record_id: str,
    corpus: str,
    field: str,
    gold_text: Optional[str],
    text_a: Optional[str],
    text_b: Optional[str],
    run_a: Path,
    run_b: Path,
    data_dir: Path,
    split: str,
) -> None:
    """Write one case's texts and source files into *out_dir*.

    Creates *out_dir* if needed, writes the gold / run-a / run-b field texts
    to ``.txt`` files (an empty file when a text is ``None``), then copies
    the source PDF, the gold JATS XML and both TEI predictions when those
    files exist.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    text_files = (
        (f"{record_id}.gold.{field}.txt", gold_text),
        (f"{record_id}.run-a.{field}.txt", text_a),
        (f"{record_id}.run-b.{field}.txt", text_b),
    )
    for file_name, content in text_files:
        target = out_dir / file_name
        target.write_text(content or "", encoding="utf-8")

    gold_dir = data_dir / split / corpus
    copies = (
        (gold_dir / f"{record_id}.pdf", out_dir / f"{record_id}.pdf"),
        (gold_dir / f"{record_id}.jats.xml", out_dir / f"{record_id}.jats.xml"),
        (
            run_a / "predictions" / corpus / f"{record_id}.tei.xml",
            out_dir / f"{record_id}.run-a.tei.xml",
        ),
        (
            run_b / "predictions" / corpus / f"{record_id}.tei.xml",
            out_dir / f"{record_id}.run-b.tei.xml",
        ),
    )
    for source, destination in copies:
        _copy_if_exists(source, destination)


def run_show_cases(  # pylint: disable=too-many-arguments,too-many-positional-arguments
    # pylint: disable=too-many-locals
    run_a: Path,
    run_b: Path,
    field: str,
    method: str,
    corpus: Optional[str],
    mode: str,
    data_dir: Path,
    split: str,
    limit: Optional[int],
) -> None:
    """Find, print and export regression or improvement cases between two runs.

    Prints a summary header, then for each of the first *limit* cases (all
    cases when *limit* is ``None``) prints a word-level diff and exports the
    related files under
    ``run_a/examples/vs-<comparison label>/<mode>/<field>/<method>/<corpus>``.
    """
    register_functions()
    xml_mapping = parse_xml_mapping(DEFAULT_XML_MAPPING_PATH)

    label_a = _run_label(run_a)
    label_b = _run_label(run_b)
    comp_label = _comparison_label(run_b)

    cases = find_cases(run_a, run_b, field, method, corpus, mode)
    corpus_label = corpus or "all corpora"
    print(f"{len(cases)} {mode}s: {field}/{method} ({corpus_label})")
    print(f" run-a ({label_a}): {run_a}")
    print(f" run-b ({label_b}): {run_b}")
    print()

    selected = cases if limit is None else cases[:limit]
    examples_base = run_a / "examples" / f"vs-{comp_label}" / mode / field / method
    for case in selected:
        delta, case_corpus, record_id, score_a, score_b = case
        gold_text, text_a, text_b = _extract_texts(
            case_corpus, record_id, run_a, run_b, data_dir, split, field, xml_mapping,
        )
        _print_case(
            delta, case_corpus, record_id, score_a, score_b,
            label_a, label_b, gold_text, text_a, text_b,
        )
        _export_case(
            examples_base / case_corpus, record_id, case_corpus, field,
            gold_text, text_a, text_b,
            run_a, run_b, data_dir, split,
        )

    if not selected:
        return
    print(f"Exported {len(selected)} examples to {examples_base}")


def main(argv=None) -> None:
    """Parse command-line arguments and run ``run_show_cases``.

    *argv* is passed to ``ArgumentParser.parse_args``; ``None`` makes argparse
    read ``sys.argv``. Logging is configured at WARNING level once the
    arguments have been parsed.
    """
    cli = argparse.ArgumentParser(
        description="Show per-document regressions or improvements between two runs"
    )
    # Declared in the order the options should appear in --help.
    option_specs = (
        ("--run-a", {"required": True, "type": Path, "help": "Primary run directory"}),
        ("--run-b", {"required": True, "type": Path, "help": "Baseline run directory"}),
        ("--field", {"required": True, "help": "Field name, e.g. title"}),
        ("--method", {"default": "edit_sim", "help": "Scoring method (default: edit_sim)"}),
        ("--corpus", {"default": None, "help": "Corpus filter (default: all)"}),
        ("--mode", {"required": True, "choices": ["regression", "improvement"]}),
        ("--data", {"default": "benchmarks/data", "type": Path}),
        ("--split", {"default": "train"}),
        ("--limit", {"type": int, "default": 10}),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    options = cli.parse_args(argv)

    logging.basicConfig(level=logging.WARNING, format="%(levelname)s %(message)s")

    run_show_cases(
        run_a=options.run_a,
        run_b=options.run_b,
        field=options.field,
        method=options.method,
        corpus=options.corpus,
        mode=options.mode,
        data_dir=options.data,
        split=options.split,
        limit=options.limit,
    )


if __name__ == "__main__":
    main()
Loading
Loading