From 922280b5577dec2bcc3d1759587d3e68f5012ed5 Mon Sep 17 00:00:00 2001 From: Daniel Ecer Date: Wed, 27 May 2026 11:10:52 +0100 Subject: [PATCH 1/2] Add GROBID baseline comparison to benchmark workflow part of https://github.com/eLifePathways/ScienceBeam2.0/issues/73 Inline GROBID prediction generation into benchmark.yml (run only when the predictions repo holds fewer predictions than the sampling config expects; otherwise they are fetched from the repo), score baseline predictions via a new --split override in score.py, and produce a side-by-side comparison report. Adds a baselines section to eval.yml declaring GROBID 0.9.0-crf as the reference tool; the workflow also reads an optional per-corpus variant field from eval.yml, defaulting to v1. --- .github/workflows/benchmark.yml | 167 +++++++++++++++++++++++++------- benchmarks/eval.yml | 4 + benchmarks/score.py | 7 +- benchmarks/tests/score_test.py | 78 +++++++++++++++ 4 files changed, 219 insertions(+), 37 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 7bd85d3c..4717d9d7 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -26,7 +26,7 @@ jobs: contains(github.event.pull_request.labels.*.name, 'benchmark:smoke') || contains(github.event.pull_request.labels.*.name, 'benchmark:full') runs-on: ubuntu-latest - timeout-minutes: 90 + timeout-minutes: 120 environment: benchmark permissions: contents: read @@ -88,20 +88,6 @@ jobs: type=registry,ref=ghcr.io/elifepathways/sciencebeam-parser_unstable:buildcache-builder type=registry,ref=ghcr.io/elifepathways/sciencebeam-parser_unstable:buildcache-runtime - - name: Start sciencebeam-parser - run: docker run -d --name sciencebeam-parser -p 8080:8070 ${{ steps.image_tag.outputs.value }} - - - name: Wait for parser - run: | - for i in $(seq 1 60); do - if curl -sf http://localhost:8080/; then - echo; echo "Parser up"; exit 0 - fi - sleep 5 - done - docker logs sciencebeam-parser >&2 - echo "Parser never became healthy" >&2; exit 1 - - name: Set baseline cache key id: baseline_key run: | @@ -117,6 +103,122 @@ jobs: ${{ steps.baseline_key.outputs.prefix 
}}- ${{ steps.baseline_key.outputs.broad_prefix }} + - name: Checkout predictions repo + uses: actions/checkout@v5 + with: + repository: elifepathways/sciencebeam-eval-predictions + path: sciencebeam-eval-predictions + token: ${{ secrets.PREDICTIONS_REPO_PAT }} + fetch-depth: 1 + filter: blob:none + sparse-checkout: .gitignore + sparse-checkout-cone-mode: false + + - name: Fetch or generate baseline predictions + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + SPLIT: ${{ env.BENCHMARK_SPLIT }} + MODE: ${{ env.BENCHMARK_MODE }} + run: | + python3 -c "import yaml; c=yaml.safe_load(open('benchmarks/eval.yml')); [print(b['tool']+'='+b['version']) for b in c.get('baselines', [])]" \ + | while IFS== read -r TOOL VERSION; do + [ -z "$TOOL" ] && continue + echo "=== Baseline ${TOOL}/${VERSION} ===" + EXPECTED=$(python3 -c "import yaml; c=yaml.safe_load(open('benchmarks/eval.yml')); print(sum(c.get('sampling',{}).get('${MODE}',{}).values()))") + EXISTING=$(git -C sciencebeam-eval-predictions ls-tree -r --name-only HEAD "${TOOL}/${VERSION}/" 2>/dev/null | grep '\.tei\.xml$' | wc -l) + echo "Predictions: ${EXISTING}/${EXPECTED}" + if [ "$EXISTING" -lt "$EXPECTED" ]; then + echo "Generating predictions for ${TOOL}/${VERSION}..." 
+ if [ "$TOOL" = "grobid" ]; then + IMAGE="grobid/grobid:${VERSION}"; PORT="8070:8070" + URL="http://localhost:8070"; HEALTH="/api/isalive" + else + IMAGE="ghcr.io/elifepathways/sciencebeam-parser:${VERSION}"; PORT="8080:8070" + URL="http://localhost:8080"; HEALTH="/" + fi + docker run -d --name baseline-container -p "${PORT}" "${IMAGE}" + for i in $(seq 1 60); do + if curl -sf "${URL}${HEALTH}"; then echo; echo "Ready"; break; fi + [ "$i" = "60" ] && { docker logs baseline-container >&2; echo "Never ready" >&2; exit 1; } + sleep 5 + done + uv run python -m benchmarks.predict \ + --config benchmarks/eval.yml \ + --mode "${MODE}" --split "${SPLIT}" \ + --data benchmarks/data \ + --out "benchmarks/runs/baseline-${TOOL}" \ + --parser-url "${URL}" \ + --parser-image "${TOOL}:${VERSION}" + docker stop baseline-container || true + DEST_BASE="sciencebeam-eval-predictions/${TOOL}/${VERSION}" + for CORPUS_DIR in "benchmarks/runs/baseline-${TOOL}/predictions"/*/; do + [ -d "${CORPUS_DIR}" ] || continue + CORPUS=$(basename "${CORPUS_DIR}") + VARIANT=$(python3 -c "import yaml; c=yaml.safe_load(open('benchmarks/eval.yml')); print(c['dataset']['splits'].get('${SPLIT}',{}).get('${CORPUS}',{}).get('variant','v1'))") + mkdir -p "${DEST_BASE}/${CORPUS}/${VARIANT}/${SPLIT}" + cp -r "${CORPUS_DIR}/." "${DEST_BASE}/${CORPUS}/${VARIANT}/${SPLIT}/" + done + mkdir -p "${DEST_BASE}/${SPLIT}" + [ -f "benchmarks/runs/baseline-${TOOL}/predictions/manifest.jsonl" ] \ + && cp "benchmarks/runs/baseline-${TOOL}/predictions/manifest.jsonl" "${DEST_BASE}/${SPLIT}/" + DATE=$(date --utc +%Y-%m-%dT%H:%M:%SZ) + echo "{\"tool\":\"${TOOL}\",\"version\":\"${VERSION}\",\"split\":\"${SPLIT}\",\"mode\":\"${MODE}\",\"generated_at\":\"${DATE}\"}" \ + | python3 -m json.tool > "${DEST_BASE}/${SPLIT}/metadata.json" + cd sciencebeam-eval-predictions + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + git add --sparse . + if ! 
git diff --cached --quiet; then + git commit -m "Add ${TOOL}/${VERSION} ${SPLIT} predictions (${MODE})" + git push + fi + cd .. + else + echo "Fetching predictions for ${TOOL}/${VERSION} from repo..." + git -C sciencebeam-eval-predictions sparse-checkout add "${TOOL}/${VERSION}" + git -C sciencebeam-eval-predictions checkout + for CORPUS in $(python3 -c "import yaml; c=yaml.safe_load(open('benchmarks/eval.yml')); print(' '.join(c['dataset']['splits'].get('${SPLIT}', {}).keys()))"); do + VARIANT=$(python3 -c "import yaml; c=yaml.safe_load(open('benchmarks/eval.yml')); print(c['dataset']['splits'].get('${SPLIT}',{}).get('${CORPUS}',{}).get('variant','v1'))") + SRC="sciencebeam-eval-predictions/${TOOL}/${VERSION}/${CORPUS}/${VARIANT}/${SPLIT}" + if [ -d "${SRC}" ]; then + mkdir -p "benchmarks/runs/baseline-${TOOL}/predictions/${CORPUS}" + cp -r "${SRC}/." "benchmarks/runs/baseline-${TOOL}/predictions/${CORPUS}/" + fi + done + fi + done + + - name: Score baseline predictions + run: | + python3 -c "import yaml; c=yaml.safe_load(open('benchmarks/eval.yml')); [print(b['tool']+'='+b['version']) for b in c.get('baselines', [])]" \ + | while IFS== read -r TOOL VERSION; do + [ -z "$TOOL" ] && continue + RUN_DIR="benchmarks/runs/baseline-${TOOL}" + if [ -d "${RUN_DIR}/predictions" ]; then + uv run python -m benchmarks.score \ + --config benchmarks/eval.yml \ + --run "${RUN_DIR}" \ + --split "${{ env.BENCHMARK_SPLIT }}" \ + --data benchmarks/data + else + echo "No predictions for ${TOOL}/${VERSION}, skipping" + fi + done + + - name: Start sciencebeam-parser + run: docker run -d --name sciencebeam-parser -p 8080:8070 ${{ steps.image_tag.outputs.value }} + + - name: Wait for parser + run: | + for i in $(seq 1 60); do + if curl -sf http://localhost:8080/; then + echo; echo "Parser up"; exit 0 + fi + sleep 5 + done + docker logs sciencebeam-parser >&2 + echo "Parser never became healthy" >&2; exit 1 + - name: Run predict env: HF_TOKEN: ${{ secrets.HF_TOKEN }} @@ -130,18 +232,6 @@ 
jobs: --parser-url http://localhost:8080 \ --parser-image "${{ steps.image_tag.outputs.value }}" - - name: Checkout predictions repo - if: github.ref == 'refs/heads/main' - uses: actions/checkout@v5 - with: - repository: elifepathways/sciencebeam-eval-predictions - path: sciencebeam-eval-predictions - token: ${{ secrets.PREDICTIONS_REPO_PAT }} - fetch-depth: 1 - filter: blob:none - sparse-checkout: .gitignore - sparse-checkout-cone-mode: false - - name: Push ScienceBeam Parser predictions if: github.ref == 'refs/heads/main' env: @@ -167,9 +257,7 @@ jobs: git config user.name "github-actions[bot]" git config user.email "github-actions[bot]@users.noreply.github.com" git add --sparse . - if git diff --cached --quiet; then - echo "No new predictions to push" - else + if ! git diff --cached --quiet; then git commit -m "Update sciencebeam-parser/main ${SPLIT} predictions (${IMAGE_TAG})" git push fi @@ -183,12 +271,19 @@ jobs: - name: Generate comparison report run: | - if [ -f ${{ env.BASELINE_DIR }}/summary.json ]; then - BASELINE_LABEL=$(cat ${{ env.BASELINE_DIR }}/label.txt 2>/dev/null || echo "baseline") - uv run python -m benchmarks.report \ - --summary "${BASELINE_LABEL}=${{ env.BASELINE_DIR }}/summary.json" \ - --summary "${{ steps.image_tag.outputs.value }}=${{ env.BENCHMARK_RUN }}/summary.json" \ - --out ${{ env.BENCHMARK_RUN }}/comparison.md + SUMMARY_ARGS=() + while IFS== read -r TOOL VERSION; do + [ -z "$TOOL" ] && continue + SUMMARY_PATH="benchmarks/runs/baseline-${TOOL}/summary.json" + [ -f "${SUMMARY_PATH}" ] && SUMMARY_ARGS+=("--summary" "${TOOL} ${VERSION}=${SUMMARY_PATH}") + done < <(python3 -c "import yaml; c=yaml.safe_load(open('benchmarks/eval.yml')); [print(b['tool']+'='+b['version']) for b in c.get('baselines', [])]") + if [ -f "${{ env.BASELINE_DIR }}/summary.json" ]; then + BASELINE_LABEL=$(cat "${{ env.BASELINE_DIR }}/label.txt" 2>/dev/null || echo "baseline") + SUMMARY_ARGS+=("--summary" "${BASELINE_LABEL}=${{ env.BASELINE_DIR }}/summary.json") 
+ fi + SUMMARY_ARGS+=("--summary" "${{ steps.image_tag.outputs.value }}=${{ env.BENCHMARK_RUN }}/summary.json") + if [ "${#SUMMARY_ARGS[@]}" -ge 4 ]; then + uv run python -m benchmarks.report "${SUMMARY_ARGS[@]}" --out "${{ env.BENCHMARK_RUN }}/comparison.md" fi - name: Write job summary diff --git a/benchmarks/eval.yml b/benchmarks/eval.yml index 9ba16e61..65747047 100644 --- a/benchmarks/eval.yml +++ b/benchmarks/eval.yml @@ -67,3 +67,7 @@ scoring: type: partial_list reference_doi: type: partial_ulist + +baselines: + - tool: grobid + version: 0.9.0-crf diff --git a/benchmarks/score.py b/benchmarks/score.py index 14dc4c48..6935c3fe 100644 --- a/benchmarks/score.py +++ b/benchmarks/score.py @@ -214,6 +214,7 @@ def run_score( # pylint: disable=too-many-locals run_dir: Path, data_dir: Path, out_path: Optional[Path], + split_override: Optional[str] = None, ) -> None: register_functions() xml_mapping = parse_xml_mapping(DEFAULT_XML_MAPPING_PATH) @@ -233,7 +234,7 @@ def run_score( # pylint: disable=too-many-locals else: LOGGER.warning("No run.json found in %s; split/corpus detection may be incomplete", run_dir) - split = (run_record or {}).get("split", "train") + split = split_override or (run_record or {}).get("split", "train") corpora = list(config["dataset"]["splits"][split].keys()) @@ -270,6 +271,9 @@ def main(argv: Optional[List[str]] = None) -> None: parser.add_argument( "--out", default=None, help="Output path for report.md (default: /report.md)" ) + parser.add_argument( + "--split", default=None, help="Dataset split override (default: read from run.json)" + ) args = parser.parse_args(argv) logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s") @@ -282,6 +286,7 @@ def main(argv: Optional[List[str]] = None) -> None: run_dir=Path(args.run), data_dir=Path(args.data), out_path=Path(args.out) if args.out else None, + split_override=args.split, ) diff --git a/benchmarks/tests/score_test.py b/benchmarks/tests/score_test.py index 284a1181..b5bb4aaa 
100644 --- a/benchmarks/tests/score_test.py +++ b/benchmarks/tests/score_test.py @@ -1,11 +1,16 @@ from __future__ import annotations +import json +from pathlib import Path +from unittest.mock import patch + from benchmarks.score import ( _build_field_measures, _build_field_scoring_types, _doc_scores_to_dict, _match_to_prf, _render_report, + run_score, ) @@ -222,3 +227,76 @@ def test_should_include_run_provenance_when_provided(self): result = _render_report({}, [], {}, run_record) assert "my-image:v1" in result assert "default" in result + + +class TestRunScoreSplitDetermination: + _CONFIG = { + "dataset": { + "splits": { + "train": {"train_corpus": {}}, + "validation": {"validation_corpus": {}}, + } + }, + "fields": ["title"], + "scoring": { + "default_methods": ["levenshtein"], + "default_type": "string", + "per_field": {}, + }, + } + + def _scored_corpora(self, mock_score_corpus) -> list: + return [call.args[0] for call in mock_score_corpus.call_args_list] + + def test_split_override_takes_precedence_over_run_json(self, tmp_path: Path): + run_dir = tmp_path / "run" + run_dir.mkdir() + (run_dir / "run.json").write_text(json.dumps({"split": "train"})) + + with patch("benchmarks.score.register_functions"), \ + patch("benchmarks.score.parse_xml_mapping"), \ + patch("benchmarks.score._score_corpus", return_value={"n": 0}) as mock_score: + run_score( + config=self._CONFIG, + run_dir=run_dir, + data_dir=tmp_path / "data", + out_path=tmp_path / "report.md", + split_override="validation", + ) + + assert self._scored_corpora(mock_score) == ["validation_corpus"] + + def test_uses_split_from_run_json_when_no_override(self, tmp_path: Path): + run_dir = tmp_path / "run" + run_dir.mkdir() + (run_dir / "run.json").write_text(json.dumps({"split": "validation"})) + + with patch("benchmarks.score.register_functions"), \ + patch("benchmarks.score.parse_xml_mapping"), \ + patch("benchmarks.score._score_corpus", return_value={"n": 0}) as mock_score: + run_score( + 
config=self._CONFIG, + run_dir=run_dir, + data_dir=tmp_path / "data", + out_path=tmp_path / "report.md", + split_override=None, + ) + + assert self._scored_corpora(mock_score) == ["validation_corpus"] + + def test_defaults_to_train_when_no_override_and_no_run_json(self, tmp_path: Path): + run_dir = tmp_path / "run" + run_dir.mkdir() + + with patch("benchmarks.score.register_functions"), \ + patch("benchmarks.score.parse_xml_mapping"), \ + patch("benchmarks.score._score_corpus", return_value={"n": 0}) as mock_score: + run_score( + config=self._CONFIG, + run_dir=run_dir, + data_dir=tmp_path / "data", + out_path=tmp_path / "report.md", + split_override=None, + ) + + assert self._scored_corpora(mock_score) == ["train_corpus"] From 73e36413ff0706d5bf1c8e6860de524ff287cd5b Mon Sep 17 00:00:00 2001 From: Daniel Ecer Date: Wed, 27 May 2026 11:23:12 +0100 Subject: [PATCH 2/2] Remove accept header as that was rejected by GROBID --- benchmarks/predict.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks/predict.py b/benchmarks/predict.py index eab4bef8..80069b47 100644 --- a/benchmarks/predict.py +++ b/benchmarks/predict.py @@ -14,7 +14,6 @@ LOGGER = logging.getLogger(__name__) -MEDIA_TYPE_TEI_XML = "application/tei+xml" CONVERT_ENDPOINT = "/api/processFulltextDocument" @@ -55,7 +54,6 @@ def _predict_one( response = client.post( f"{parser_url}{CONVERT_ENDPOINT}", files={"input": (pdf_path.name, pdf_path.read_bytes(), "application/pdf")}, - headers={"Accept": MEDIA_TYPE_TEI_XML}, timeout=timeout, ) response.raise_for_status()