microsoft · WaelAbuSeada · Apr 8, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
diff --git a/.github/workflows/claude-evaluation.yml b/.github/workflows/claude-evaluation.yml
@@ -23,6 +23,7 @@ on:
         options:
           - "bug-fix"
           - "test-generation"
+          - "code-review"
       test-run:
         description: "Indicate this is a test run (with few entries)"
         required: false

diff --git a/.github/workflows/copilot-evaluation.yml b/.github/workflows/copilot-evaluation.yml
@@ -30,6 +30,7 @@ on:
         options:
           - "bug-fix"
           - "test-generation"
+          - "code-review"
       test-run:
         description: "Indicate this is a test run (with few entries)"
         required: false
@@ -83,6 +84,7 @@ jobs:
     permissions:
       contents: read
       id-token: write
+      copilot-requests: write
     name: ${{ matrix.entry }}
     strategy:
       fail-fast: false
@@ -122,21 +124,11 @@ jobs:
       - name: Install GitHub Copilot CLI
         run: npm install -g @github/copilot@1.0.57
 
-      - name: Select PAT based on job index
-        id: select-pat
-        shell: pwsh
-        run: |
-          $patIndex = ${{ strategy.job-index }} % 4
-          echo "pat_index=$patIndex" >> $env:GITHUB_OUTPUT
-
       - name: Run GitHub Copilot CLI for entry ${{ matrix.entry }}
         timeout-minutes: 120
         shell: pwsh
         env:
-          COPILOT_GITHUB_TOKEN: ${{ steps.select-pat.outputs.pat_index == '0' &&
-            secrets.COPILOT_PAT || (steps.select-pat.outputs.pat_index == '1' &&
-            secrets.COPILOT_PAT2 || (steps.select-pat.outputs.pat_index == '2'&&
-            secrets.COPILOT_PAT3 || secrets.COPILOT_PAT4)) }}
+          COPILOT_GITHUB_TOKEN: ${{ github.token }}
         run: |
           Write-Output "::add-mask::$env:COPILOT_GITHUB_TOKEN"
 
@@ -153,7 +145,9 @@ jobs:
         if: always()
         with:
           name: evaluation-results-${{ github.run_id }}-${{ matrix.entry }}
-          path: ${{ env.EVALUATION_RESULTS_DIR }}/**/*.jsonl
+          path: |
+            ${{ env.EVALUATION_RESULTS_DIR }}/**/*.jsonl
+            ${{ env.EVALUATION_RESULTS_DIR }}/**/*.log
           retention-days: ${{ inputs.test-run && 1 || 30 }}
 
   summarize-results:

diff --git a/.github/workflows/summarize-results.yml b/.github/workflows/summarize-results.yml
@@ -108,7 +108,8 @@ jobs:
             --use-capi ${{ !inputs.mock && '--storage braintrust --storage kusto' || '' }}
 
       - name: Update leaderboard in a new branch
-        if: ${{ !inputs.mock && !inputs.skip-leaderboard }}
+        # Leaderboard updates are not supported yet for the code-review category (WIP), so skip this step for it.
+        if: ${{ !inputs.mock && !inputs.skip-leaderboard && inputs.category != 'code-review' }}
         run: |
           git fetch origin main
 

diff --git a/dataset/codereview.jsonl b/dataset/codereview.jsonl
diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json
@@ -0,0 +1,4 @@
+{
+    "runs": [],
+    "aggregate": []
+}
diff --git a/evaluator/scores.py b/evaluator/scores.py
@@ -19,3 +19,23 @@ def __call__(self, *, metadata: dict, **kwargs: object) -> bool:
 class PostPatchPassedRate:
     def __call__(self, *, metadata: dict, **kwargs: object) -> bool:
         return metadata.get("post_patch_passed", False)
+
+
+class PrecisionScore:
+    def __call__(self, *, metadata: dict, **kwargs: object) -> float:
+        return float(metadata.get("precision", 0.0))
+
+
+class RecallScore:
+    def __call__(self, *, metadata: dict, **kwargs: object) -> float:
+        return float(metadata.get("recall", 0.0))
+
+
+class F1Score:
+    def __call__(self, *, metadata: dict, **kwargs: object) -> float:
+        return float(metadata.get("f1", 0.0))
+
+
+class ValidReviewOutput:
+    def __call__(self, *, metadata: dict, **kwargs: object) -> bool:
+        return bool(metadata.get("valid_review_output", False))
diff --git a/scripts/BCBenchUtils.psm1 b/scripts/BCBenchUtils.psm1
@@ -490,13 +490,14 @@ function Get-BCBenchDatasetPath {
     param(
         [Parameter(Mandatory = $true)]
         # Category validation lives only here: every caller resolves the dataset path through this function, so there's no need to duplicate ValidateSet on each caller.
-        [ValidateSet("bug-fix", "test-generation")]
+        [ValidateSet("bug-fix", "test-generation", "code-review")]
         [string] $Category
     )
 
     switch ($Category) {
         "bug-fix" { $DatasetName = "bcbench.jsonl" }
         "test-generation" { $DatasetName = "bcbench.jsonl" }
+        "code-review" { $DatasetName = "codereview.jsonl" }
     }
 
     [string] $projectRoot = Split-Path $PSScriptRoot -Parent

diff --git a/src/bcbench/agent/copilot/metrics.py b/src/bcbench/agent/copilot/metrics.py
@@ -34,7 +34,12 @@ def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = N
         output_lines: Lines from Copilot CLI stderr output
         session_log_path: Optional path to session log file for tool usage parsing
 
-    Expected output format (new, v1.0.2+):
+    Expected output format (newest, v1.0.61+):
+        Changes    +23 -0
+        AI Credits 58.4 (1m 14s)
+        Tokens     ↑ 413.9k (368.1k cached) • ↓ 4.5k (500 reasoning)
+
+    Previous output format (v1.0.2..v1.0.60):
         Changes   +17 -0
         Requests  0.33 Premium (1m 45s)
         Tokens    ↑ 317.5k • ↓ 4.3k • 255.0k (cached)
@@ -83,26 +88,41 @@ def parse_metrics(output_lines: Sequence[str], session_log_path: Path | None = N
             seconds = float(duration_match.group(2))
             execution_time = minutes * 60 + seconds
 
-        # New format: "Requests  0.33 Premium (1m 45s)" — extract session time from parenthesized duration
+        # Previous format (v1.0.2..v1.0.60): "Requests  0.33 Premium (1m 45s)" — extract session time from parenthesized duration
         if execution_time is None:
             requests_match = re.search(r"Requests\s+[\d.]+\s+Premium\s+\((?:(\d+)m\s*)?(\d+(?:\.\d+)?)s\)", output_text)
             if requests_match:
                 minutes = int(requests_match.group(1)) if requests_match.group(1) else 0
                 seconds = float(requests_match.group(2))
                 execution_time = minutes * 60 + seconds
 
+        # Newest format (v1.0.61+): "AI Credits 58.4 (1m 14s)" — "Requests N Premium" was renamed to "AI Credits N"
+        if execution_time is None:
+            credits_match = re.search(r"AI Credits\s+[\d.]+\s+\((?:(\d+)m\s*)?(\d+(?:\.\d+)?)s\)", output_text)
+            if credits_match:
+                minutes = int(credits_match.group(1)) if credits_match.group(1) else 0
+                seconds = float(credits_match.group(2))
+                execution_time = minutes * 60 + seconds
+
         # Token usage — legacy format: "1.3m in, 11.6k out"
         usage_match = re.search(r"(\d+(?:\.\d+)?[km]?)\s+in,\s*(\d+(?:\.\d+)?[km]?)\s+out", output_text)
         if usage_match:
             prompt_tokens = _parse_token_count(usage_match.group(1))
             completion_tokens = _parse_token_count(usage_match.group(2))
 
-        # New format: "Tokens    ↑ 317.5k • ↓ 4.3k • 255.0k (cached)"
+        # Previous format (v1.0.2..v1.0.60): "Tokens    ↑ 317.5k • ↓ 4.3k • 255.0k (cached)"
+        # Newest format (v1.0.61+): "Tokens     ↑ 413.9k (368.1k cached) • ↓ 4.5k (500 reasoning)"
+        # Use separate ↑ / ↓ lookups to tolerate inline "(N cached)" / "(N reasoning)" annotations
+        # between the two values.
         if prompt_tokens is None:
-            tokens_match = re.search(r"Tokens\s+[^\d]*(\d+(?:\.\d+)?[km]?)\s*[•·]\s*[^\d]*(\d+(?:\.\d+)?[km]?)", output_text)
-            if tokens_match:
-                prompt_tokens = _parse_token_count(tokens_match.group(1))
-                completion_tokens = _parse_token_count(tokens_match.group(2))
+            tokens_line_match = re.search(r"Tokens\s+([^\n]+)", output_text)
+            if tokens_line_match:
+                tokens_line = tokens_line_match.group(1)
+                up_match = re.search(r"\u2191\s*(\d+(?:\.\d+)?[km]?)", tokens_line)
+                down_match = re.search(r"\u2193\s*(\d+(?:\.\d+)?[km]?)", tokens_line)
+                if up_match and down_match:
+                    prompt_tokens = _parse_token_count(up_match.group(1))
+                    completion_tokens = _parse_token_count(down_match.group(1))
 
         if execution_time is not None or llm_duration is not None or prompt_tokens is not None or completion_tokens is not None or turn_count is not None:
             return AgentMetrics(

diff --git a/src/bcbench/agent/shared/config.yaml b/src/bcbench/agent/shared/config.yaml
@@ -50,6 +50,19 @@ prompt:
     {{task}}
     {% endif %}
 
+  code-review-template: |
+    /al-code-review
+
+    Review ONLY the current working-tree AL file changes for this evaluation entry.
+    Use the working tree diff only (git diff HEAD), and focus on changed *.al files.
+    Do NOT review committed history or the HEAD commit, and do NOT compare commits (for example, do NOT use HEAD~1..HEAD or origin/main comparisons).
+
+    Save findings to a file named "review.json" in the repository root.
+    The file must contain valid JSON with a top-level object named findings.
+    Each finding must include: filePath, lineNumber, severity, issue, recommendation, domain, suggestedCode
+    Allowed severity values are: critical, high, medium, low.
+    If there are no findings, write an empty findings list.
+
 # controls:
 # 1. whether to copy custom instructions from `src/bcbench/agent/shared/instructions/<sanitized-repo>/`
 #    - Copilot: copies to repo/.github/ and renames AGENTS.md to copilot-instructions.md
@@ -59,14 +72,14 @@ prompt:
 # NOTE: the canonical source file is AGENTS.md; it is automatically renamed
 #       to the agent-specific filename (AgentType.instruction_filename) during setup
 instructions:
-  enabled: false
+  enabled: true
 
 # controls:
 # 1. whether to copy skills from `src/bcbench/agent/shared/instructions/<sanitized-repo>/skills/`
 #    - Copilot: copies to repo/.github/skills/
 #    - Claude: copies to repo/.claude/skills/
 skills:
-  enabled: false
+  enabled: true
 
 # controls:
 # 1. whether to copy custom agents from `src/bcbench/agent/shared/instructions/<sanitized-repo>/agents/`

diff --git a/src/bcbench/agent/shared/hooks/log_tool_usage.py b/src/bcbench/agent/shared/hooks/log_tool_usage.py
@@ -0,0 +1,51 @@
+"""Copilot/Claude PreToolUse hook: log tool invocations to a JSONL file.
+
+Reads the hook payload from stdin and appends one JSON line per call to the
+path in BCBENCH_TOOL_LOG. Used by both Copilot CLI (Linux runners) and Claude
+hooks via the `bash` field of the hook command spec; the legacy .ps1 in this
+directory mirrors the same behavior for the Windows `powershell` field.
+"""
+
+import contextlib
+import json
+import os
+import sys
+
+
+def _extract_tool_name(payload: dict) -> str | None:
+    name = payload.get("tool_name") or payload.get("toolName")
+    if name != "lsp":
+        return name
+
+    args = payload.get("toolArgs") or payload.get("tool_input")
+    if isinstance(args, str):
+        try:
+            args = json.loads(args)
+        except json.JSONDecodeError:
+            args = None
+    if isinstance(args, dict) and (op := args.get("operation")):
+        return f"lsp:{op}"
+    return name
+
+
+def main() -> None:
+    try:
+        payload = json.loads(sys.stdin.read() or "{}")
+    except json.JSONDecodeError:
+        return
+
+    name = _extract_tool_name(payload)
+    log_path = os.environ.get("BCBENCH_TOOL_LOG")
+    if not name or not log_path:
+        return
+
+    entry = {"tool_name": name, "timestamp": payload.get("timestamp", "")}
+    with open(log_path, "a", encoding="utf-8") as f:
+        f.write(json.dumps(entry) + "\n")
+
+
+if __name__ == "__main__":
+    with contextlib.suppress(Exception):
+        # Never block tool execution — silently fail.
+        main()
+    sys.exit(0)