Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 81 additions & 81 deletions dataset/codereview.jsonl

Large diffs are not rendered by default.

24 changes: 24 additions & 0 deletions dataset/judge_calibration.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{"expected": {"file": "src/Sales/SalesPost.Codeunit.al", "line_start": 142, "body": "Calling Commit() inside the repeat loop can leave partial data if a later iteration fails.", "severity": "medium"}, "candidate": {"file": "src/Sales/SalesPost.Codeunit.al", "line_start": 145, "body": "Move the Commit() out of the loop; committing per iteration breaks atomicity.", "severity": "medium"}, "should_match": true, "note": "commit-in-loop, paraphrased + different line"}
{"expected": {"file": "src/Inventory/ItemAvail.Codeunit.al", "line_start": 58, "body": "Add SetLoadFields before FindSet so the whole record isn't loaded.", "severity": "medium"}, "candidate": {"file": "src/Inventory/ItemAvail.Codeunit.al", "line_start": 58, "body": "Use SetLoadFields to limit the columns read for this query.", "severity": "medium"}, "should_match": true, "note": "setloadfields performance, same issue"}
{"expected": {"file": "src/Finance/Payment.Codeunit.al", "line_start": 77, "body": "Currency code 'USD' is hardcoded; read it from setup instead.", "severity": "medium"}, "candidate": {"file": "src/Finance/Payment.Codeunit.al", "line_start": 77, "body": "Don't hardcode the currency — pull it from the configuration record.", "severity": "medium"}, "should_match": true, "note": "hardcoded currency, same issue"}
{"expected": {"file": "src/Sales/Customer.Codeunit.al", "line_start": 33, "body": "GET can fail when the customer is missing; handle the not-found case.", "severity": "medium"}, "candidate": {"file": "src/Sales/Customer.Codeunit.al", "line_start": 31, "body": "Missing error handling if the GET on Customer returns false.", "severity": "medium"}, "should_match": true, "note": "unchecked GET, same issue"}
{"expected": {"file": "src/Sales/SalesLine.Table.al", "line_start": 210, "body": "You can't SetRange on a FlowField without calling CalcFields first.", "severity": "medium"}, "candidate": {"file": "src/Sales/SalesLine.Table.al", "line_start": 210, "body": "Filtering directly on this FlowField won't work as written.", "severity": "medium"}, "should_match": true, "note": "flowfield filter, same issue"}
{"expected": {"file": "src/Reports/Statement.Report.al", "line_start": 90, "body": "Building the filter from a concatenated string risks filter injection; use SetFilter with parameters.", "severity": "medium"}, "candidate": {"file": "src/Reports/Statement.Report.al", "line_start": 92, "body": "Use parameterized SetFilter instead of concatenating user input into the filter.", "severity": "medium"}, "should_match": true, "note": "filter injection, same issue"}
{"expected": {"file": "src/Inventory/Reorder.Codeunit.al", "line_start": 120, "body": "This runs a database read per record — an N+1 pattern.", "severity": "medium"}, "candidate": {"file": "src/Inventory/Reorder.Codeunit.al", "line_start": 118, "body": "Querying inside the loop causes N+1 queries; batch the lookup.", "severity": "medium"}, "should_match": true, "note": "n+1 query, same issue"}
{"expected": {"file": "src/Sales/SalesPost.Codeunit.al", "line_start": 60, "body": "TestField(Quantity) is missing before posting.", "severity": "medium"}, "candidate": {"file": "src/Sales/SalesPost.Codeunit.al", "line_start": 60, "body": "Validate that Quantity is set before posting, e.g. with TestField.", "severity": "high"}, "should_match": true, "note": "missing testfield, severity differs"}
{"expected": {"file": "src/Common/Util.Codeunit.al", "line_start": 12, "body": "Variable 'i' is declared but never used.", "severity": "low"}, "candidate": {"file": "src/Common/Util.Codeunit.al", "line_start": 12, "body": "Remove the unused variable i.", "severity": "high"}, "should_match": true, "note": "unused variable, severity differs"}
{"expected": {"file": "src/Sales/SalesPost.Codeunit.al", "line_start": 142, "body": "Calling Commit() inside the repeat loop can leave partial data on failure.", "severity": "medium"}, "candidate": {"file": "src/Sales/SalesPost.Codeunit.al", "line_start": 142, "body": "The procedure name should be PascalCase to match conventions.", "severity": "medium"}, "should_match": false, "note": "commit-in-loop vs naming style, same line"}
{"expected": {"file": "src/Admin/UserSetup.Page.al", "line_start": 45, "body": "Missing permission/SecurityFiltering check before exposing this data.", "severity": "medium"}, "candidate": {"file": "src/Admin/UserSetup.Page.al", "line_start": 45, "body": "Add a tooltip to this field for accessibility.", "severity": "medium"}, "should_match": false, "note": "permission vs tooltip, same line"}
{"expected": {"file": "src/Common/Loop.Codeunit.al", "line_start": 88, "body": "Off-by-one: the range 1..Count skips the last element.", "severity": "medium"}, "candidate": {"file": "src/Common/Loop.Codeunit.al", "line_start": 88, "body": "Use a temporary record here to avoid locking the table.", "severity": "medium"}, "should_match": false, "note": "off-by-one vs locking, same line"}
{"expected": {"file": "src/Sales/Customer.Codeunit.al", "line_start": 33, "body": "Possible null/empty reference: 'Customer' may be blank after the failed GET.", "severity": "medium"}, "candidate": {"file": "src/Sales/Customer.Codeunit.al", "line_start": 33, "body": "Add a comment explaining what this block does.", "severity": "medium"}, "should_match": false, "note": "null-ref vs missing comment, same line"}
{"expected": {"file": "src/Inventory/ItemAvail.Codeunit.al", "line_start": 58, "body": "Use IsEmpty() instead of Count() = 0 for performance.", "severity": "medium"}, "candidate": {"file": "src/Inventory/ItemAvail.Codeunit.al", "line_start": 58, "body": "This label text needs to be translated/localized.", "severity": "medium"}, "should_match": false, "note": "isempty perf vs localization, same line"}
{"expected": {"file": "src/Finance/Payment.Codeunit.al", "line_start": 77, "body": "Currency code 'USD' is hardcoded; read it from setup.", "severity": "medium"}, "candidate": {"file": "src/Finance/Payment.Codeunit.al", "line_start": 77, "body": "The magic number 1000 should be extracted into a named constant.", "severity": "medium"}, "should_match": false, "note": "hardcoded currency vs magic number — both 'hardcoding' but different code"}
{"expected": {"file": "src/Inventory/Reorder.Codeunit.al", "line_start": 118, "body": "FindSet(true) should be FindSet(false) since records aren't modified.", "severity": "medium"}, "candidate": {"file": "src/Inventory/Reorder.Codeunit.al", "line_start": 118, "body": "Add SetLoadFields before this FindSet to limit columns.", "severity": "medium"}, "should_match": false, "note": "findset write-intent vs setloadfields — both about the same FindSet, different issue"}
{"expected": {"file": "src/Sales/CustList.Codeunit.al", "line_start": 70, "body": "This loop reads every column of Customer; on large tables that's a major performance hit.", "severity": "medium"}, "candidate": {"file": "src/Sales/CustList.Codeunit.al", "line_start": 72, "body": "Call SetLoadFields(Name, \"No.\") before FindSet so only the needed fields are fetched.", "severity": "medium"}, "should_match": true, "note": "setloadfields: symptom (slow full read) vs fix (call SetLoadFields)"}
{"expected": {"file": "src/Sales/SalesValidation.Codeunit.al", "line_start": 25, "body": "AA0217: the literal passed to Error() must be a Label variable.", "severity": "medium"}, "candidate": {"file": "src/Sales/SalesValidation.Codeunit.al", "line_start": 25, "body": "Hardcoding this error text breaks translation; move it to a Label with an Err suffix.", "severity": "medium"}, "should_match": true, "note": "hardcoded error string: rule number (AA0217) vs translation rationale"}
{"expected": {"file": "src/Integration/Import.Codeunit.al", "line_start": 88, "body": "The InStream is read twice without resetting position; the second read returns nothing.", "severity": "medium"}, "candidate": {"file": "src/Integration/Import.Codeunit.al", "line_start": 90, "body": "Reset the stream position before reading it a second time.", "severity": "medium"}, "should_match": true, "note": "stream re-read: symptom (empty second read) vs fix (reset position)"}
{"expected": {"file": "src/Admin/UserSetup.Page.al", "line_start": 12, "body": "This page exposes records without an OnOpenPage permission check.", "severity": "medium"}, "candidate": {"file": "src/Admin/UserSetup.Page.al", "line_start": 12, "body": "Add a SecurityFiltering/permission guard so unauthorized users can't read these records.", "severity": "medium"}, "should_match": true, "note": "missing permission check: terminology variance (OnOpenPage check vs SecurityFiltering guard)"}
{"expected": {"file": "src/Sales/SalesPost.Codeunit.al", "line_start": 60, "body": "Missing TestField(Quantity) before posting.", "severity": "medium"}, "candidate": {"file": "src/Sales/SalesPost.Codeunit.al", "line_start": 60, "body": "Missing TestField(Type) before posting.", "severity": "medium"}, "should_match": false, "note": "missing TestField but different field (Quantity vs Type)"}
{"expected": {"file": "src/Sales/SalesPost.Codeunit.al", "line_start": 142, "body": "Commit() inside the loop breaks atomicity — move it out.", "severity": "medium"}, "candidate": {"file": "src/Sales/SalesPost.Codeunit.al", "line_start": 142, "body": "There is no Commit() here, so the batch changes are never persisted.", "severity": "medium"}, "should_match": false, "note": "commit: too-many (in loop) vs missing entirely, opposite issue same line"}
{"expected": {"file": "src/Inventory/Reorder.Codeunit.al", "line_start": 120, "body": "GET inside the loop is an N+1 performance problem.", "severity": "medium"}, "candidate": {"file": "src/Inventory/Reorder.Codeunit.al", "line_start": 120, "body": "The return value of this GET isn't checked; it can silently fail.", "severity": "medium"}, "should_match": false, "note": "same GET statement: N+1 performance vs unchecked-return correctness"}
{"expected": {"file": "src/Sales/Notify.Codeunit.al", "line_start": 14, "body": "Hardcoded message string should be a Label (AA0217).", "severity": "medium"}, "candidate": {"file": "src/Sales/Notify.Codeunit.al", "line_start": 14, "body": "This Label uses a 'Msg' suffix but is passed to Error(); the suffix should be 'Err'.", "severity": "medium"}, "should_match": false, "note": "label: not-a-Label-at-all (AA0217) vs existing-Label-wrong-suffix"}
12 changes: 11 additions & 1 deletion src/bcbench/agent/shared/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,17 @@ prompt:
/review

Review only the uncommitted working-tree changes (git diff HEAD); do not compare commits such as HEAD~1..HEAD or origin/main.
Save the review findings to a file named "review.json" in the repository root (write an empty array if there are no findings).

Your only deliverable is a file named review.json in the repository root. You MUST write it before finishing; if you do not, your review is lost and counts as no output.

review.json must contain a single JSON array. Each finding is an object with these fields:
- file: repo-relative path of the file the finding refers to (string, required)
- line_start: 1-based line number where the issue starts (integer, required)
- line_end: line number where the issue ends (integer, optional)
- severity: one of critical, high, medium, or low (optional, defaults to medium)
- body: concise description of the issue (string, required)

If there are no findings, write an empty array. Write only valid JSON to review.json, with no surrounding markdown or commentary.

# controls:
# 1. whether to copy custom instructions from `src/bcbench/agent/shared/instructions/<sanitized-repo>/`
Expand Down
2 changes: 1 addition & 1 deletion src/bcbench/commands/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def view_entry(
lines = str(comment.line_start)
if comment.line_end and comment.line_end != comment.line_start:
lines += f"-{comment.line_end}"
comment_table.add_row(comment.file, lines, comment.severity, comment.body)
comment_table.add_row(comment.file, lines, comment.severity_label, comment.body)
console.print(comment_table)
else:
console.print("[dim]No expected comments[/dim]")
Expand Down
28 changes: 27 additions & 1 deletion src/bcbench/commands/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from bcbench.config import get_config
from bcbench.dataset import BaseDatasetEntry, NL2ALEntry
from bcbench.evaluate import EvaluationPipeline
from bcbench.evaluate.codereview_judge_calibration import run_calibration
from bcbench.logger import get_logger
from bcbench.results import BaseEvaluationResult, CodeReviewResult, ExecutionBasedEvaluationResult, JudgeBasedEvaluationResult
from bcbench.types import AgentMetrics, BCalLLMBackend, ContainerConfig, EvaluationCategory, EvaluationContext, ExperimentConfiguration
Expand Down Expand Up @@ -203,6 +204,31 @@ def evaluate_bcal(
logger.info(f"Results saved to: {run_dir}")


@evaluate_app.command("judge-calibration")
def evaluate_judge_calibration(
model: CopilotModel = _config.judge.code_review_model,
work_dir: RepoPath = _config.paths.testbed_path,
min_accuracy: Annotated[float, typer.Option(help="Fail if judge accuracy falls below this")] = 0.8,
) -> None:
"""Run the LLM judge over the hand-labeled calibration set and report its precision/recall.

Intended for local/ad-hoc checks of the judge; exits non-zero if accuracy drops below
the threshold.
"""
work_dir.mkdir(parents=True, exist_ok=True)
report = run_calibration(work_dir, model=model)

logger.info(f"Judge calibration ({model}) over {report.total} labeled pairs:")
logger.info(f" precision={report.precision:.3f} recall={report.recall:.3f} accuracy={report.accuracy:.3f}")
logger.info(f" TP={report.true_positives} FP={report.false_positives} TN={report.true_negatives} FN={report.false_negatives}")
for note in report.misclassified_notes:
logger.warning(f" {note}")

if report.accuracy < min_accuracy:
logger.error(f"Judge accuracy {report.accuracy:.3f} is below the required {min_accuracy:.3f}")
raise typer.Exit(code=1)


@evaluate_app.command("mock", hidden=True)
def evaluate_mock(
entry_id: Annotated[str, typer.Argument(help="Entry ID to run")],
Expand Down Expand Up @@ -302,7 +328,7 @@ def evaluate(self, context: EvaluationContext[BaseDatasetEntry]) -> None:
case "invalid":
result = CodeReviewResult.create_invalid(context, output="MOCK_INVALID_REVIEW_OUTPUT", expected_comments=[])
case "valid":
result = CodeReviewResult.create(context, output="[]", expected_comments=[], generated_comments=[], line_tolerance=0)
result = CodeReviewResult.create(context, output="[]", expected_comments=[], generated_comments=[])
case "raw":
result = JudgeBasedEvaluationResult.create_raw(context, output="MOCK_PATCH_CONTENT")
case "empty":
Expand Down
19 changes: 19 additions & 0 deletions src/bcbench/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

from dotenv import load_dotenv

from bcbench.cli_options import CopilotModel

__all__ = ["Config", "get_config"]


Expand Down Expand Up @@ -123,6 +125,21 @@ def default(cls) -> FilePatternConfig:
)


@dataclass(frozen=True)
class JudgeConfig:
"""Configuration for the code-review LLM semantic judge."""

code_review_model: CopilotModel
result_file: str

@classmethod
def default(cls) -> JudgeConfig:
return cls(
code_review_model="gpt-5.3-codex",
result_file="judge_results.json",
)


@dataclass(frozen=True)
class EnvironmentConfig:
"""Environment-specific configuration."""
Expand Down Expand Up @@ -152,6 +169,7 @@ class Config:
env: EnvironmentConfig
timeout: TimeoutConfig
file_patterns: FilePatternConfig
judge: JudgeConfig

@classmethod
def load(cls) -> Config:
Expand All @@ -163,6 +181,7 @@ def load(cls) -> Config:
env=EnvironmentConfig.from_environment(),
timeout=TimeoutConfig.default(),
file_patterns=FilePatternConfig.default(),
judge=JudgeConfig.default(),
)


Expand Down
21 changes: 13 additions & 8 deletions src/bcbench/dataset/codereview.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,12 @@ def level(self) -> int:
@classmethod
def from_input(cls, value: str) -> Severity:
normalized = value.strip().lower()
try:
if normalized in {s.value for s in cls}:
return cls(normalized)
except ValueError:
return _SEVERITY_ALIASES.get(normalized, cls.MEDIUM)
if normalized in _SEVERITY_ALIASES:
return _SEVERITY_ALIASES[normalized]
valid = [s.value for s in cls] + list(_SEVERITY_ALIASES)
raise ValueError(f"Unknown severity {value!r}; expected one of {valid}")


_SEVERITY_LEVELS: dict[Severity, int] = {
Expand All @@ -49,27 +51,30 @@ class ReviewComment(BaseModel):
line_end: int | None = None
domain: str | None = None
body: str
severity: Severity
severity: Severity | None = None

@field_validator("severity", mode="before")
@classmethod
def _coerce_severity(cls, value: object) -> Severity:
if isinstance(value, Severity):
def _coerce_severity(cls, value: object) -> Severity | None:
if value is None or isinstance(value, Severity):
return value
return Severity.from_input(str(value))

@property
def severity_label(self) -> str:
return self.severity.value if self.severity is not None else "unspecified"

def __str__(self) -> str:
loc = f"{self.file}:{self.line_start}"
if self.line_end and self.line_end != self.line_start:
loc += f"-{self.line_end}"
return f"[{self.severity}] {loc}: {self.body}"
return f"[{self.severity_label}] {loc}: {self.body}"


class CodeReviewEntry(BaseDatasetEntry):
"""Dataset entry for the code-review category."""

expected_comments: list[ReviewComment] = Field(default_factory=list)
match_line_tolerance: int = Field(default=5, ge=0)

def get_task(self) -> str:
return self.patch
Expand Down
2 changes: 0 additions & 2 deletions src/bcbench/evaluate/codereview.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,6 @@ def evaluate(self, context: EvaluationContext[CodeReviewEntry]) -> None:
structural_matches = match_comments(
context.entry.expected_comments,
generated_comments,
context.entry.match_line_tolerance,
)
validated_matches = judge_comment_matches(
structural_matches,
Expand All @@ -80,7 +79,6 @@ def evaluate(self, context: EvaluationContext[CodeReviewEntry]) -> None:
output=output,
expected_comments=context.entry.expected_comments,
generated_comments=generated_comments,
line_tolerance=context.entry.match_line_tolerance,
matched_pairs=validated_matches,
)
logger.info(f"Parsed {len(result.generated_comments)} comments from {REVIEW_OUTPUT_FILE}")
Expand Down
Loading
Loading