From 922280b5577dec2bcc3d1759587d3e68f5012ed5 Mon Sep 17 00:00:00 2001
From: Daniel Ecer <de-code@users.noreply.github.com>
Date: Wed, 27 May 2026 11:10:52 +0100
Subject: [PATCH 1/2] Add GROBID baseline comparison to benchmark workflow

part of https://github.com/eLifePathways/ScienceBeam2.0/issues/73

Inline GROBID prediction generation into benchmark.yml (run only when
fewer predictions than expected are already stored), score baseline
predictions via a new --split override in score.py, and produce a
side-by-side comparison report. Adds a baselines section to eval.yml
declaring GROBID 0.9.0-crf as the reference tool; the workflow also reads
an optional per-corpus variant field (default v1) from the corpus config.
---
 .github/workflows/benchmark.yml | 167 +++++++++++++++++++++++++-------
 benchmarks/eval.yml             |   4 +
 benchmarks/score.py             |   7 +-
 benchmarks/tests/score_test.py  |  78 +++++++++++++++
 4 files changed, 219 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 7bd85d3c..4717d9d7 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -26,7 +26,7 @@ jobs:
       contains(github.event.pull_request.labels.*.name, 'benchmark:smoke') ||
       contains(github.event.pull_request.labels.*.name, 'benchmark:full')
     runs-on: ubuntu-latest
-    timeout-minutes: 90
+    timeout-minutes: 120
     environment: benchmark
     permissions:
       contents: read
@@ -88,20 +88,6 @@ jobs:
             type=registry,ref=ghcr.io/elifepathways/sciencebeam-parser_unstable:buildcache-builder
             type=registry,ref=ghcr.io/elifepathways/sciencebeam-parser_unstable:buildcache-runtime
 
-      - name: Start sciencebeam-parser
-        run: docker run -d --name sciencebeam-parser -p 8080:8070 ${{ steps.image_tag.outputs.value }}
-
-      - name: Wait for parser
-        run: |
-          for i in $(seq 1 60); do
-            if curl -sf http://localhost:8080/; then
-              echo; echo "Parser up"; exit 0
-            fi
-            sleep 5
-          done
-          docker logs sciencebeam-parser >&2
-          echo "Parser never became healthy" >&2; exit 1
-
       - name: Set baseline cache key
         id: baseline_key
         run: |
@@ -117,6 +103,122 @@ jobs:
             ${{ steps.baseline_key.outputs.prefix }}-
             ${{ steps.baseline_key.outputs.broad_prefix }}
 
+      - name: Checkout predictions repo
+        uses: actions/checkout@v5
+        with:
+          repository: elifepathways/sciencebeam-eval-predictions
+          path: sciencebeam-eval-predictions
+          token: ${{ secrets.PREDICTIONS_REPO_PAT }}
+          fetch-depth: 1
+          filter: blob:none
+          sparse-checkout: .gitignore
+          sparse-checkout-cone-mode: false
+
+      - name: Fetch or generate baseline predictions
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          SPLIT: ${{ env.BENCHMARK_SPLIT }}
+          MODE: ${{ env.BENCHMARK_MODE }}
+        run: |
+          python3 -c "import yaml; c=yaml.safe_load(open('benchmarks/eval.yml')); [print(b['tool']+'='+b['version']) for b in c.get('baselines', [])]" \
+          | while IFS== read -r TOOL VERSION; do  # one "tool=version" line per eval.yml baseline
+            [ -z "$TOOL" ] && continue  # ignore blank lines
+            echo "=== Baseline ${TOOL}/${VERSION} ==="
+            EXPECTED=$(python3 -c "import yaml; c=yaml.safe_load(open('benchmarks/eval.yml')); print(sum(c.get('sampling',{}).get('${MODE}',{}).values()))")
+            EXISTING=$(git -C sciencebeam-eval-predictions ls-tree -r --name-only HEAD "${TOOL}/${VERSION}/" 2>/dev/null | grep -c '\.tei\.xml$' || true)  # grep exits 1 on zero matches; that must not abort the step
+            echo "Predictions: ${EXISTING}/${EXPECTED}"
+            if [ "$EXISTING" -lt "$EXPECTED" ]; then  # stored set is incomplete: regenerate and push
+              echo "Generating predictions for ${TOOL}/${VERSION}..."
+              if [ "$TOOL" = "grobid" ]; then
+                IMAGE="grobid/grobid:${VERSION}"; PORT="8070:8070"
+                URL="http://localhost:8070"; HEALTH="/api/isalive"
+              else
+                IMAGE="ghcr.io/elifepathways/sciencebeam-parser:${VERSION}"; PORT="8080:8070"
+                URL="http://localhost:8080"; HEALTH="/"
+              fi
+              docker run -d --name baseline-container -p "${PORT}" "${IMAGE}"
+              for i in $(seq 1 60); do  # poll the health endpoint every 5s, up to 60 attempts
+                if curl -sf "${URL}${HEALTH}"; then echo; echo "Ready"; break; fi
+                [ "$i" = "60" ] && { docker logs baseline-container >&2; echo "Never ready" >&2; exit 1; }
+                sleep 5
+              done
+              uv run python -m benchmarks.predict \
+                --config benchmarks/eval.yml \
+                --mode "${MODE}" --split "${SPLIT}" \
+                --data benchmarks/data \
+                --out "benchmarks/runs/baseline-${TOOL}" \
+                --parser-url "${URL}" \
+                --parser-image "${TOOL}:${VERSION}"
+              docker rm -f baseline-container || true  # remove, not just stop, so the fixed name is free for the next baseline
+              DEST_BASE="sciencebeam-eval-predictions/${TOOL}/${VERSION}"
+              for CORPUS_DIR in "benchmarks/runs/baseline-${TOOL}/predictions"/*/; do
+                [ -d "${CORPUS_DIR}" ] || continue  # unmatched glob stays literal; skip it
+                CORPUS=$(basename "${CORPUS_DIR}")
+                VARIANT=$(python3 -c "import yaml; c=yaml.safe_load(open('benchmarks/eval.yml')); print(c['dataset']['splits'].get('${SPLIT}',{}).get('${CORPUS}',{}).get('variant','v1'))")
+                mkdir -p "${DEST_BASE}/${CORPUS}/${VARIANT}/${SPLIT}"
+                cp -r "${CORPUS_DIR}/." "${DEST_BASE}/${CORPUS}/${VARIANT}/${SPLIT}/"
+              done
+              mkdir -p "${DEST_BASE}/${SPLIT}"
+              [ -f "benchmarks/runs/baseline-${TOOL}/predictions/manifest.jsonl" ] \
+                && cp "benchmarks/runs/baseline-${TOOL}/predictions/manifest.jsonl" "${DEST_BASE}/${SPLIT}/"
+              DATE=$(date --utc +%Y-%m-%dT%H:%M:%SZ)
+              echo "{\"tool\":\"${TOOL}\",\"version\":\"${VERSION}\",\"split\":\"${SPLIT}\",\"mode\":\"${MODE}\",\"generated_at\":\"${DATE}\"}" \
+                | python3 -m json.tool > "${DEST_BASE}/${SPLIT}/metadata.json"
+              cd sciencebeam-eval-predictions
+              git config user.name "github-actions[bot]"
+              git config user.email "github-actions[bot]@users.noreply.github.com"
+              git add --sparse .
+              if ! git diff --cached --quiet; then  # commit and push only when something was staged
+                git commit -m "Add ${TOOL}/${VERSION} ${SPLIT} predictions (${MODE})"
+                git push
+              fi
+              cd ..
+            else  # enough predictions already stored: copy them into the run directory
+              echo "Fetching predictions for ${TOOL}/${VERSION} from repo..."
+              git -C sciencebeam-eval-predictions sparse-checkout add "${TOOL}/${VERSION}"
+              git -C sciencebeam-eval-predictions checkout
+              for CORPUS in $(python3 -c "import yaml; c=yaml.safe_load(open('benchmarks/eval.yml')); print(' '.join(c['dataset']['splits'].get('${SPLIT}', {}).keys()))"); do
+                VARIANT=$(python3 -c "import yaml; c=yaml.safe_load(open('benchmarks/eval.yml')); print(c['dataset']['splits'].get('${SPLIT}',{}).get('${CORPUS}',{}).get('variant','v1'))")
+                SRC="sciencebeam-eval-predictions/${TOOL}/${VERSION}/${CORPUS}/${VARIANT}/${SPLIT}"
+                if [ -d "${SRC}" ]; then
+                  mkdir -p "benchmarks/runs/baseline-${TOOL}/predictions/${CORPUS}"
+                  cp -r "${SRC}/." "benchmarks/runs/baseline-${TOOL}/predictions/${CORPUS}/"
+                fi
+              done
+            fi
+          done
+
+      - name: Score baseline predictions
+        run: |
+          python3 -c "import yaml; c=yaml.safe_load(open('benchmarks/eval.yml')); [print(b['tool']+'='+b['version']) for b in c.get('baselines', [])]" \
+          | while IFS== read -r TOOL VERSION; do  # one "tool=version" line per eval.yml baseline
+            [ -z "$TOOL" ] && continue  # ignore blank lines
+            RUN_DIR="benchmarks/runs/baseline-${TOOL}"
+            if [ -d "${RUN_DIR}/predictions" ]; then  # score only baselines that have predictions on disk
+              uv run python -m benchmarks.score \
+                --config benchmarks/eval.yml \
+                --run "${RUN_DIR}" \
+                --split "${{ env.BENCHMARK_SPLIT }}" \
+                --data benchmarks/data
+            else
+              echo "No predictions for ${TOOL}/${VERSION}, skipping"
+            fi
+          done
+
+      - name: Start sciencebeam-parser
+        run: docker run -d --name sciencebeam-parser -p 8080:8070 ${{ steps.image_tag.outputs.value }}
+
+      - name: Wait for parser
+        run: |
+          for i in $(seq 1 60); do
+            if curl -sf http://localhost:8080/; then
+              echo; echo "Parser up"; exit 0
+            fi
+            sleep 5
+          done
+          docker logs sciencebeam-parser >&2
+          echo "Parser never became healthy" >&2; exit 1
+
       - name: Run predict
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -130,18 +232,6 @@ jobs:
             --parser-url http://localhost:8080 \
             --parser-image "${{ steps.image_tag.outputs.value }}"
 
-      - name: Checkout predictions repo
-        if: github.ref == 'refs/heads/main'
-        uses: actions/checkout@v5
-        with:
-          repository: elifepathways/sciencebeam-eval-predictions
-          path: sciencebeam-eval-predictions
-          token: ${{ secrets.PREDICTIONS_REPO_PAT }}
-          fetch-depth: 1
-          filter: blob:none
-          sparse-checkout: .gitignore
-          sparse-checkout-cone-mode: false
-
       - name: Push ScienceBeam Parser predictions
         if: github.ref == 'refs/heads/main'
         env:
@@ -167,9 +257,7 @@ jobs:
           git config user.name "github-actions[bot]"
           git config user.email "github-actions[bot]@users.noreply.github.com"
           git add --sparse .
-          if git diff --cached --quiet; then
-            echo "No new predictions to push"
-          else
+          if ! git diff --cached --quiet; then
             git commit -m "Update sciencebeam-parser/main ${SPLIT} predictions (${IMAGE_TAG})"
             git push
           fi
@@ -183,12 +271,19 @@ jobs:
 
       - name: Generate comparison report
         run: |
-          if [ -f ${{ env.BASELINE_DIR }}/summary.json ]; then
-            BASELINE_LABEL=$(cat ${{ env.BASELINE_DIR }}/label.txt 2>/dev/null || echo "baseline")
-            uv run python -m benchmarks.report \
-              --summary "${BASELINE_LABEL}=${{ env.BASELINE_DIR }}/summary.json" \
-              --summary "${{ steps.image_tag.outputs.value }}=${{ env.BENCHMARK_RUN }}/summary.json" \
-              --out ${{ env.BENCHMARK_RUN }}/comparison.md
+          SUMMARY_ARGS=()
+          while IFS== read -r TOOL VERSION; do
+            [ -z "$TOOL" ] && continue
+            SUMMARY_PATH="benchmarks/runs/baseline-${TOOL}/summary.json"
+            [ -f "${SUMMARY_PATH}" ] && SUMMARY_ARGS+=("--summary" "${TOOL} ${VERSION}=${SUMMARY_PATH}")
+          done < <(python3 -c "import yaml; c=yaml.safe_load(open('benchmarks/eval.yml')); [print(b['tool']+'='+b['version']) for b in c.get('baselines', [])]")
+          if [ -f "${{ env.BASELINE_DIR }}/summary.json" ]; then
+            BASELINE_LABEL=$(cat "${{ env.BASELINE_DIR }}/label.txt" 2>/dev/null || echo "baseline")
+            SUMMARY_ARGS+=("--summary" "${BASELINE_LABEL}=${{ env.BASELINE_DIR }}/summary.json")
+          fi
+          SUMMARY_ARGS+=("--summary" "${{ steps.image_tag.outputs.value }}=${{ env.BENCHMARK_RUN }}/summary.json")
+          if [ "${#SUMMARY_ARGS[@]}" -ge 4 ]; then
+            uv run python -m benchmarks.report "${SUMMARY_ARGS[@]}" --out "${{ env.BENCHMARK_RUN }}/comparison.md"
           fi
 
       - name: Write job summary
diff --git a/benchmarks/eval.yml b/benchmarks/eval.yml
index 9ba16e61..65747047 100644
--- a/benchmarks/eval.yml
+++ b/benchmarks/eval.yml
@@ -67,3 +67,7 @@ scoring:
       type: partial_list
     reference_doi:
       type: partial_ulist
+
+baselines:
+  - tool: grobid
+    version: 0.9.0-crf
diff --git a/benchmarks/score.py b/benchmarks/score.py
index 14dc4c48..6935c3fe 100644
--- a/benchmarks/score.py
+++ b/benchmarks/score.py
@@ -214,6 +214,7 @@ def run_score(  # pylint: disable=too-many-locals
     run_dir: Path,
     data_dir: Path,
     out_path: Optional[Path],
+    split_override: Optional[str] = None,
 ) -> None:
     register_functions()
     xml_mapping = parse_xml_mapping(DEFAULT_XML_MAPPING_PATH)
@@ -233,7 +234,7 @@ def run_score(  # pylint: disable=too-many-locals
     else:
         LOGGER.warning("No run.json found in %s; split/corpus detection may be incomplete", run_dir)
 
-    split = (run_record or {}).get("split", "train")
+    split = split_override or (run_record or {}).get("split", "train")
 
     corpora = list(config["dataset"]["splits"][split].keys())
 
@@ -270,6 +271,9 @@ def main(argv: Optional[List[str]] = None) -> None:
     parser.add_argument(
         "--out", default=None, help="Output path for report.md (default: <run>/report.md)"
     )
+    parser.add_argument(
+        "--split", default=None, help="Dataset split override (default: read from run.json)"
+    )
     args = parser.parse_args(argv)
 
     logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
@@ -282,6 +286,7 @@ def main(argv: Optional[List[str]] = None) -> None:
         run_dir=Path(args.run),
         data_dir=Path(args.data),
         out_path=Path(args.out) if args.out else None,
+        split_override=args.split,
     )
 
 
diff --git a/benchmarks/tests/score_test.py b/benchmarks/tests/score_test.py
index 284a1181..b5bb4aaa 100644
--- a/benchmarks/tests/score_test.py
+++ b/benchmarks/tests/score_test.py
@@ -1,11 +1,16 @@
 from __future__ import annotations
 
+import json
+from pathlib import Path
+from unittest.mock import patch
+
 from benchmarks.score import (
     _build_field_measures,
     _build_field_scoring_types,
     _doc_scores_to_dict,
     _match_to_prf,
     _render_report,
+    run_score,
 )
 
 
@@ -222,3 +227,76 @@ def test_should_include_run_provenance_when_provided(self):
         result = _render_report({}, [], {}, run_record)
         assert "my-image:v1" in result
         assert "default" in result
+
+
+class TestRunScoreSplitDetermination:  # precedence: override, then run.json, then "train"
+    _CONFIG = {  # one corpus per split, so the scored corpus reveals the split
+        "dataset": {
+            "splits": {
+                "train": {"train_corpus": {}},
+                "validation": {"validation_corpus": {}},
+            }
+        },
+        "fields": ["title"],
+        "scoring": {
+            "default_methods": ["levenshtein"],
+            "default_type": "string",
+            "per_field": {},
+        },
+    }
+
+    def _scored_corpora(self, mock_score_corpus) -> list:  # first positional arg of each call
+        return [call.args[0] for call in mock_score_corpus.call_args_list]
+
+    def test_split_override_takes_precedence_over_run_json(self, tmp_path: Path):
+        run_dir = tmp_path / "run"
+        run_dir.mkdir()
+        (run_dir / "run.json").write_text(json.dumps({"split": "train"}))  # must be overridden
+
+        with patch("benchmarks.score.register_functions"), \
+             patch("benchmarks.score.parse_xml_mapping"), \
+             patch("benchmarks.score._score_corpus", return_value={"n": 0}) as mock_score:
+            run_score(
+                config=self._CONFIG,
+                run_dir=run_dir,
+                data_dir=tmp_path / "data",
+                out_path=tmp_path / "report.md",
+                split_override="validation",
+            )
+
+        assert self._scored_corpora(mock_score) == ["validation_corpus"]
+
+    def test_uses_split_from_run_json_when_no_override(self, tmp_path: Path):
+        run_dir = tmp_path / "run"
+        run_dir.mkdir()
+        (run_dir / "run.json").write_text(json.dumps({"split": "validation"}))
+
+        with patch("benchmarks.score.register_functions"), \
+             patch("benchmarks.score.parse_xml_mapping"), \
+             patch("benchmarks.score._score_corpus", return_value={"n": 0}) as mock_score:
+            run_score(
+                config=self._CONFIG,
+                run_dir=run_dir,
+                data_dir=tmp_path / "data",
+                out_path=tmp_path / "report.md",
+                split_override=None,  # fall back to run.json
+            )
+
+        assert self._scored_corpora(mock_score) == ["validation_corpus"]
+
+    def test_defaults_to_train_when_no_override_and_no_run_json(self, tmp_path: Path):
+        run_dir = tmp_path / "run"
+        run_dir.mkdir()  # deliberately no run.json
+
+        with patch("benchmarks.score.register_functions"), \
+             patch("benchmarks.score.parse_xml_mapping"), \
+             patch("benchmarks.score._score_corpus", return_value={"n": 0}) as mock_score:
+            run_score(
+                config=self._CONFIG,
+                run_dir=run_dir,
+                data_dir=tmp_path / "data",
+                out_path=tmp_path / "report.md",
+                split_override=None,
+            )
+
+        assert self._scored_corpora(mock_score) == ["train_corpus"]

From 73e36413ff0706d5bf1c8e6860de524ff287cd5b Mon Sep 17 00:00:00 2001
From: Daniel Ecer <de-code@users.noreply.github.com>
Date: Wed, 27 May 2026 11:23:12 +0100
Subject: [PATCH 2/2] Remove accept header as that was rejected by GROBID

---
 benchmarks/predict.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/benchmarks/predict.py b/benchmarks/predict.py
index eab4bef8..80069b47 100644
--- a/benchmarks/predict.py
+++ b/benchmarks/predict.py
@@ -14,7 +14,6 @@
 
 LOGGER = logging.getLogger(__name__)
 
-MEDIA_TYPE_TEI_XML = "application/tei+xml"
 CONVERT_ENDPOINT = "/api/processFulltextDocument"
 
 
@@ -55,7 +54,6 @@ def _predict_one(
     response = client.post(
         f"{parser_url}{CONVERT_ENDPOINT}",
         files={"input": (pdf_path.name, pdf_path.read_bytes(), "application/pdf")},
-        headers={"Accept": MEDIA_TYPE_TEI_XML},
         timeout=timeout,
     )
     response.raise_for_status()