microsoft · gggdttt · Jun 25, 2026 · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026
diff --git a/dataset/codereview.jsonl b/dataset/codereview.jsonl
diff --git a/dataset/judge_calibration.jsonl b/dataset/judge_calibration.jsonl
@@ -0,0 +1,24 @@
+{"expected": {"file": "src/Sales/SalesPost.Codeunit.al", "line_start": 142, "body": "Calling Commit() inside the repeat loop can leave partial data if a later iteration fails.", "severity": "medium"}, "candidate": {"file": "src/Sales/SalesPost.Codeunit.al", "line_start": 145, "body": "Move the Commit() out of the loop; committing per iteration breaks atomicity.", "severity": "medium"}, "should_match": true, "note": "commit-in-loop, paraphrased + different line"}
+{"expected": {"file": "src/Inventory/ItemAvail.Codeunit.al", "line_start": 58, "body": "Add SetLoadFields before FindSet so the whole record isn't loaded.", "severity": "medium"}, "candidate": {"file": "src/Inventory/ItemAvail.Codeunit.al", "line_start": 58, "body": "Use SetLoadFields to limit the columns read for this query.", "severity": "medium"}, "should_match": true, "note": "setloadfields performance, same issue"}
+{"expected": {"file": "src/Finance/Payment.Codeunit.al", "line_start": 77, "body": "Currency code 'USD' is hardcoded; read it from setup instead.", "severity": "medium"}, "candidate": {"file": "src/Finance/Payment.Codeunit.al", "line_start": 77, "body": "Don't hardcode the currency — pull it from the configuration record.", "severity": "medium"}, "should_match": true, "note": "hardcoded currency, same issue"}
+{"expected": {"file": "src/Sales/Customer.Codeunit.al", "line_start": 33, "body": "GET can fail when the customer is missing; handle the not-found case.", "severity": "medium"}, "candidate": {"file": "src/Sales/Customer.Codeunit.al", "line_start": 31, "body": "Missing error handling if the GET on Customer returns false.", "severity": "medium"}, "should_match": true, "note": "unchecked GET, same issue"}
+{"expected": {"file": "src/Sales/SalesLine.Table.al", "line_start": 210, "body": "You can't SetRange on a FlowField without calling CalcFields first.", "severity": "medium"}, "candidate": {"file": "src/Sales/SalesLine.Table.al", "line_start": 210, "body": "Filtering directly on this FlowField won't work as written.", "severity": "medium"}, "should_match": true, "note": "flowfield filter, same issue"}
+{"expected": {"file": "src/Reports/Statement.Report.al", "line_start": 90, "body": "Building the filter from a concatenated string risks filter injection; use SetFilter with parameters.", "severity": "medium"}, "candidate": {"file": "src/Reports/Statement.Report.al", "line_start": 92, "body": "Use parameterized SetFilter instead of concatenating user input into the filter.", "severity": "medium"}, "should_match": true, "note": "filter injection, same issue"}
+{"expected": {"file": "src/Inventory/Reorder.Codeunit.al", "line_start": 120, "body": "This runs a database read per record — an N+1 pattern.", "severity": "medium"}, "candidate": {"file": "src/Inventory/Reorder.Codeunit.al", "line_start": 118, "body": "Querying inside the loop causes N+1 queries; batch the lookup.", "severity": "medium"}, "should_match": true, "note": "n+1 query, same issue"}
+{"expected": {"file": "src/Sales/SalesPost.Codeunit.al", "line_start": 60, "body": "TestField(Quantity) is missing before posting.", "severity": "medium"}, "candidate": {"file": "src/Sales/SalesPost.Codeunit.al", "line_start": 60, "body": "Validate that Quantity is set before posting, e.g. with TestField.", "severity": "high"}, "should_match": true, "note": "missing testfield, severity differs"}
+{"expected": {"file": "src/Common/Util.Codeunit.al", "line_start": 12, "body": "Variable 'i' is declared but never used.", "severity": "low"}, "candidate": {"file": "src/Common/Util.Codeunit.al", "line_start": 12, "body": "Remove the unused variable i.", "severity": "high"}, "should_match": true, "note": "unused variable, severity differs"}
+{"expected": {"file": "src/Sales/SalesPost.Codeunit.al", "line_start": 142, "body": "Calling Commit() inside the repeat loop can leave partial data on failure.", "severity": "medium"}, "candidate": {"file": "src/Sales/SalesPost.Codeunit.al", "line_start": 142, "body": "The procedure name should be PascalCase to match conventions.", "severity": "medium"}, "should_match": false, "note": "commit-in-loop vs naming style, same line"}
+{"expected": {"file": "src/Admin/UserSetup.Page.al", "line_start": 45, "body": "Missing permission/SecurityFiltering check before exposing this data.", "severity": "medium"}, "candidate": {"file": "src/Admin/UserSetup.Page.al", "line_start": 45, "body": "Add a tooltip to this field for accessibility.", "severity": "medium"}, "should_match": false, "note": "permission vs tooltip, same line"}
+{"expected": {"file": "src/Common/Loop.Codeunit.al", "line_start": 88, "body": "Off-by-one: the range 1..Count skips the last element.", "severity": "medium"}, "candidate": {"file": "src/Common/Loop.Codeunit.al", "line_start": 88, "body": "Use a temporary record here to avoid locking the table.", "severity": "medium"}, "should_match": false, "note": "off-by-one vs locking, same line"}
+{"expected": {"file": "src/Sales/Customer.Codeunit.al", "line_start": 33, "body": "Possible null/empty reference: 'Customer' may be blank after the failed GET.", "severity": "medium"}, "candidate": {"file": "src/Sales/Customer.Codeunit.al", "line_start": 33, "body": "Add a comment explaining what this block does.", "severity": "medium"}, "should_match": false, "note": "null-ref vs missing comment, same line"}
+{"expected": {"file": "src/Inventory/ItemAvail.Codeunit.al", "line_start": 58, "body": "Use IsEmpty() instead of Count() = 0 for performance.", "severity": "medium"}, "candidate": {"file": "src/Inventory/ItemAvail.Codeunit.al", "line_start": 58, "body": "This label text needs to be translated/localized.", "severity": "medium"}, "should_match": false, "note": "isempty perf vs localization, same line"}
+{"expected": {"file": "src/Finance/Payment.Codeunit.al", "line_start": 77, "body": "Currency code 'USD' is hardcoded; read it from setup.", "severity": "medium"}, "candidate": {"file": "src/Finance/Payment.Codeunit.al", "line_start": 77, "body": "The magic number 1000 should be extracted into a named constant.", "severity": "medium"}, "should_match": false, "note": "hardcoded currency vs magic number — both 'hardcoding' but different code"}
+{"expected": {"file": "src/Inventory/Reorder.Codeunit.al", "line_start": 118, "body": "FindSet(true) should be FindSet(false) since records aren't modified.", "severity": "medium"}, "candidate": {"file": "src/Inventory/Reorder.Codeunit.al", "line_start": 118, "body": "Add SetLoadFields before this FindSet to limit columns.", "severity": "medium"}, "should_match": false, "note": "findset write-intent vs setloadfields — both about the same FindSet, different issue"}
+{"expected": {"file": "src/Sales/CustList.Codeunit.al", "line_start": 70, "body": "This loop reads every column of Customer; on large tables that's a major performance hit.", "severity": "medium"}, "candidate": {"file": "src/Sales/CustList.Codeunit.al", "line_start": 72, "body": "Call SetLoadFields(Name, \"No.\") before FindSet so only the needed fields are fetched.", "severity": "medium"}, "should_match": true, "note": "setloadfields: symptom (slow full read) vs fix (call SetLoadFields)"}
+{"expected": {"file": "src/Sales/SalesValidation.Codeunit.al", "line_start": 25, "body": "AA0217: the literal passed to Error() must be a Label variable.", "severity": "medium"}, "candidate": {"file": "src/Sales/SalesValidation.Codeunit.al", "line_start": 25, "body": "Hardcoding this error text breaks translation; move it to a Label with an Err suffix.", "severity": "medium"}, "should_match": true, "note": "hardcoded error string: rule number (AA0217) vs translation rationale"}
+{"expected": {"file": "src/Integration/Import.Codeunit.al", "line_start": 88, "body": "The InStream is read twice without resetting position; the second read returns nothing.", "severity": "medium"}, "candidate": {"file": "src/Integration/Import.Codeunit.al", "line_start": 90, "body": "Reset the stream position before reading it a second time.", "severity": "medium"}, "should_match": true, "note": "stream re-read: symptom (empty second read) vs fix (reset position)"}
+{"expected": {"file": "src/Admin/UserSetup.Page.al", "line_start": 12, "body": "This page exposes records without an OnOpenPage permission check.", "severity": "medium"}, "candidate": {"file": "src/Admin/UserSetup.Page.al", "line_start": 12, "body": "Add a SecurityFiltering/permission guard so unauthorized users can't read these records.", "severity": "medium"}, "should_match": true, "note": "missing permission check: terminology variance (OnOpenPage check vs SecurityFiltering guard)"}
+{"expected": {"file": "src/Sales/SalesPost.Codeunit.al", "line_start": 60, "body": "Missing TestField(Quantity) before posting.", "severity": "medium"}, "candidate": {"file": "src/Sales/SalesPost.Codeunit.al", "line_start": 60, "body": "Missing TestField(Type) before posting.", "severity": "medium"}, "should_match": false, "note": "missing TestField but different field (Quantity vs Type)"}
+{"expected": {"file": "src/Sales/SalesPost.Codeunit.al", "line_start": 142, "body": "Commit() inside the loop breaks atomicity — move it out.", "severity": "medium"}, "candidate": {"file": "src/Sales/SalesPost.Codeunit.al", "line_start": 142, "body": "There is no Commit() here, so the batch changes are never persisted.", "severity": "medium"}, "should_match": false, "note": "commit: too-many (in loop) vs missing entirely, opposite issue same line"}
+{"expected": {"file": "src/Inventory/Reorder.Codeunit.al", "line_start": 120, "body": "GET inside the loop is an N+1 performance problem.", "severity": "medium"}, "candidate": {"file": "src/Inventory/Reorder.Codeunit.al", "line_start": 120, "body": "The return value of this GET isn't checked; it can silently fail.", "severity": "medium"}, "should_match": false, "note": "same GET statement: N+1 performance vs unchecked-return correctness"}
+{"expected": {"file": "src/Sales/Notify.Codeunit.al", "line_start": 14, "body": "Hardcoded message string should be a Label (AA0217).", "severity": "medium"}, "candidate": {"file": "src/Sales/Notify.Codeunit.al", "line_start": 14, "body": "This Label uses a 'Msg' suffix but is passed to Error(); the suffix should be 'Err'.", "severity": "medium"}, "should_match": false, "note": "label: not-a-Label-at-all (AA0217) vs existing-Label-wrong-suffix"}
diff --git a/src/bcbench/agent/shared/config.yaml b/src/bcbench/agent/shared/config.yaml
@@ -54,7 +54,17 @@ prompt:
     /review
 
     Review only the uncommitted working-tree changes (git diff HEAD); do not compare commits such as HEAD~1..HEAD or origin/main.
-    Save the review findings to a file named "review.json" in the repository root (write an empty array if there are no findings).
+
+    Your only deliverable is a file named review.json in the repository root. You MUST write it before finishing; if you do not, your review is lost and counts as no output.
+
+    review.json must contain a single JSON array. Each finding is an object with these fields:
+      - file: repo-relative path of the file the finding refers to (string, required)
+      - line_start: 1-based line number where the issue starts (integer, required)
+      - line_end: line number where the issue ends (integer, optional)
+      - severity: one of critical, high, medium, or low (optional, defaults to medium)
+      - body: concise description of the issue (string, required)
+
+    If there are no findings, write an empty array. Write only valid JSON to review.json, with no surrounding markdown or commentary.
 
 # controls:
 # 1. whether to copy custom instructions from `src/bcbench/agent/shared/instructions/<sanitized-repo>/`

diff --git a/src/bcbench/commands/dataset.py b/src/bcbench/commands/dataset.py
@@ -150,7 +150,7 @@ def view_entry(
                 lines = str(comment.line_start)
                 if comment.line_end and comment.line_end != comment.line_start:
                     lines += f"-{comment.line_end}"
-                comment_table.add_row(comment.file, lines, comment.severity, comment.body)
+                comment_table.add_row(comment.file, lines, comment.severity_label, comment.body)
             console.print(comment_table)
         else:
             console.print("[dim]No expected comments[/dim]")

diff --git a/src/bcbench/commands/evaluate.py b/src/bcbench/commands/evaluate.py
@@ -22,6 +22,7 @@
 from bcbench.config import get_config
 from bcbench.dataset import BaseDatasetEntry, NL2ALEntry
 from bcbench.evaluate import EvaluationPipeline
+from bcbench.evaluate.codereview_judge_calibration import run_calibration
 from bcbench.logger import get_logger
 from bcbench.results import BaseEvaluationResult, CodeReviewResult, ExecutionBasedEvaluationResult, JudgeBasedEvaluationResult
 from bcbench.types import AgentMetrics, BCalLLMBackend, ContainerConfig, EvaluationCategory, EvaluationContext, ExperimentConfiguration
@@ -203,6 +204,31 @@ def evaluate_bcal(
     logger.info(f"Results saved to: {run_dir}")
 
 
+@evaluate_app.command("judge-calibration")
+def evaluate_judge_calibration(
+    model: CopilotModel = _config.judge.code_review_model,
+    work_dir: RepoPath = _config.paths.testbed_path,
+    min_accuracy: Annotated[float, typer.Option(help="Fail if judge accuracy falls below this")] = 0.8,
+) -> None:
+    """Run the LLM judge over the hand-labeled calibration set and report its precision/recall.
+
+    Intended for local/ad-hoc checks of the judge; exits non-zero if accuracy drops below
+    the threshold.
+    """
+    work_dir.mkdir(parents=True, exist_ok=True)
+    report = run_calibration(work_dir, model=model)
+
+    logger.info(f"Judge calibration ({model}) over {report.total} labeled pairs:")
+    logger.info(f"  precision={report.precision:.3f}  recall={report.recall:.3f}  accuracy={report.accuracy:.3f}")
+    logger.info(f"  TP={report.true_positives} FP={report.false_positives} TN={report.true_negatives} FN={report.false_negatives}")
+    for note in report.misclassified_notes:
+        logger.warning(f"  {note}")
+
+    if report.accuracy < min_accuracy:
+        logger.error(f"Judge accuracy {report.accuracy:.3f} is below the required {min_accuracy:.3f}")
+        raise typer.Exit(code=1)
+
+
 @evaluate_app.command("mock", hidden=True)
 def evaluate_mock(
     entry_id: Annotated[str, typer.Argument(help="Entry ID to run")],
@@ -302,7 +328,7 @@ def evaluate(self, context: EvaluationContext[BaseDatasetEntry]) -> None:
             case "invalid":
                 result = CodeReviewResult.create_invalid(context, output="MOCK_INVALID_REVIEW_OUTPUT", expected_comments=[])
             case "valid":
-                result = CodeReviewResult.create(context, output="[]", expected_comments=[], generated_comments=[], line_tolerance=0)
+                result = CodeReviewResult.create(context, output="[]", expected_comments=[], generated_comments=[])
             case "raw":
                 result = JudgeBasedEvaluationResult.create_raw(context, output="MOCK_PATCH_CONTENT")
             case "empty":

diff --git a/src/bcbench/config.py b/src/bcbench/config.py
@@ -9,6 +9,8 @@
 
 from dotenv import load_dotenv
 
+from bcbench.cli_options import CopilotModel
+
 __all__ = ["Config", "get_config"]
 
 
@@ -123,6 +125,21 @@ def default(cls) -> FilePatternConfig:
         )
 
 
+@dataclass(frozen=True)
+class JudgeConfig:
+    """Configuration for the code-review LLM semantic judge."""
+
+    code_review_model: CopilotModel
+    result_file: str
+
+    @classmethod
+    def default(cls) -> JudgeConfig:
+        return cls(
+            code_review_model="gpt-5.3-codex",
+            result_file="judge_results.json",
+        )
+
+
 @dataclass(frozen=True)
 class EnvironmentConfig:
     """Environment-specific configuration."""
@@ -152,6 +169,7 @@ class Config:
     env: EnvironmentConfig
     timeout: TimeoutConfig
     file_patterns: FilePatternConfig
+    judge: JudgeConfig
 
     @classmethod
     def load(cls) -> Config:
@@ -163,6 +181,7 @@ def load(cls) -> Config:
             env=EnvironmentConfig.from_environment(),
             timeout=TimeoutConfig.default(),
             file_patterns=FilePatternConfig.default(),
+            judge=JudgeConfig.default(),
         )
 
 

diff --git a/src/bcbench/dataset/codereview.py b/src/bcbench/dataset/codereview.py
@@ -20,10 +20,12 @@ def level(self) -> int:
     @classmethod
     def from_input(cls, value: str) -> Severity:
         normalized = value.strip().lower()
-        try:
+        if normalized in {s.value for s in cls}:
             return cls(normalized)
-        except ValueError:
-            return _SEVERITY_ALIASES.get(normalized, cls.MEDIUM)
+        if normalized in _SEVERITY_ALIASES:
+            return _SEVERITY_ALIASES[normalized]
+        valid = [s.value for s in cls] + list(_SEVERITY_ALIASES)
+        raise ValueError(f"Unknown severity {value!r}; expected one of {valid}")
 
 
 _SEVERITY_LEVELS: dict[Severity, int] = {
@@ -49,27 +51,30 @@ class ReviewComment(BaseModel):
     line_end: int | None = None
     domain: str | None = None
     body: str
-    severity: Severity
+    severity: Severity | None = None
 
     @field_validator("severity", mode="before")
     @classmethod
-    def _coerce_severity(cls, value: object) -> Severity:
-        if isinstance(value, Severity):
+    def _coerce_severity(cls, value: object) -> Severity | None:
+        if value is None or isinstance(value, Severity):
             return value
         return Severity.from_input(str(value))
 
+    @property
+    def severity_label(self) -> str:
+        return self.severity.value if self.severity is not None else "unspecified"
+
     def __str__(self) -> str:
         loc = f"{self.file}:{self.line_start}"
         if self.line_end and self.line_end != self.line_start:
             loc += f"-{self.line_end}"
-        return f"[{self.severity}] {loc}: {self.body}"
+        return f"[{self.severity_label}] {loc}: {self.body}"
 
 
 class CodeReviewEntry(BaseDatasetEntry):
     """Dataset entry for the code-review category."""
 
     expected_comments: list[ReviewComment] = Field(default_factory=list)
-    match_line_tolerance: int = Field(default=5, ge=0)
 
     def get_task(self) -> str:
         return self.patch

diff --git a/src/bcbench/evaluate/codereview.py b/src/bcbench/evaluate/codereview.py
@@ -69,7 +69,6 @@ def evaluate(self, context: EvaluationContext[CodeReviewEntry]) -> None:
             structural_matches = match_comments(
                 context.entry.expected_comments,
                 generated_comments,
-                context.entry.match_line_tolerance,
             )
             validated_matches = judge_comment_matches(
                 structural_matches,
@@ -80,7 +79,6 @@ def evaluate(self, context: EvaluationContext[CodeReviewEntry]) -> None:
                 output=output,
                 expected_comments=context.entry.expected_comments,
                 generated_comments=generated_comments,
-                line_tolerance=context.entry.match_line_tolerance,
                 matched_pairs=validated_matches,
             )
         logger.info(f"Parsed {len(result.generated_comments)} comments from {REVIEW_OUTPUT_FILE}")