From 160a17d227b18a878c452adc1568909058106b73 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 24 Jun 2026 14:38:27 +0000 Subject: [PATCH 1/5] Update leaderboard: GitHub Copilot CLI (claude-haiku-4.5) - Run 28105886936 --- docs/_data/code-review.json | 71 +++++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 2 deletions(-) diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json index f744d8bdb..17783d19d 100644 --- a/docs/_data/code-review.json +++ b/docs/_data/code-review.json @@ -1,4 +1,71 @@ { - "runs": [], - "aggregate": [] + "runs": [ + { + "total": 81, + "date": "2026-06-24", + "model": "claude-haiku-4-5", + "agent_name": "GitHub Copilot", + "category": "code-review", + "average_duration": 51.9, + "average_prompt_tokens": 113035.8, + "average_completion_tokens": 1590.2, + "average_llm_duration": 0.0, + "average_tool_usage": { + "report_intent": 0.93, + "bash": 3.19, + "view": 1.16, + "create": 0.8, + "read_bash": 0.37, + "stop_bash": 0.21, + "write_bash": 0.07, + "task": 0.01 + }, + "github_run_id": "28105886936", + "experiment": null, + "benchmark_version": "0.5.6", + "generated_comment_count": 251, + "expected_comment_count": 216, + "matched_comment_count": 51, + "incorrect_comment_count": 200, + "missed_comment_count": 165, + "precision": 0.203, + "recall": 0.236, + "f1": 0.218, + "f_beta_05": 0.209, + "f_beta_2": 0.229, + "macro_precision": 0.245, + "macro_recall": 0.329, + "macro_f1": 0.256, + "macro_f_beta_05": 0.242, + "macro_f_beta_2": 0.286, + "severity_mae": 0.784, + "valid_review_output_rate": 1.0 + } + ], + "aggregate": [ + { + "model": "claude-haiku-4-5", + "agent_name": "GitHub Copilot", + "category": "code-review", + "experiment": null, + "total": 81, + "num_runs": 1, + "average_duration": 51.9, + "benchmark_version": "0.5.6", + "f1": 0.218, + "f1_ci_low": null, + "f1_ci_high": null, + "f_beta_05": 0.209, + "f_beta_2": 0.229, + "precision": 0.203, + "recall": 0.236, + "macro_f1": 0.256, + "macro_f1_ci_low": null, + "macro_f1_ci_high": null, + "macro_f_beta_05": 0.242, + "macro_f_beta_2": 0.286, + "macro_precision": 0.245, + "macro_recall": 0.329 + } + ] } From d77f7a9f77d79a95ebe0d35122ebdee7334e77a1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 24 Jun 2026 14:47:37 +0000 Subject: [PATCH 2/5] Update leaderboard: GitHub Copilot CLI (claude-haiku-4.5) - Run 28106565168 --- docs/_data/code-review.json | 73 +++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 16 deletions(-) diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json index 17783d19d..4622a1e43 100644 --- a/docs/_data/code-review.json +++ b/docs/_data/code-review.json @@ -40,6 +40,47 @@ "macro_f_beta_2": 0.286, "severity_mae": 0.784, "valid_review_output_rate": 1.0 + }, + { + "total": 81, + "date": "2026-06-24", + "model": "claude-haiku-4-5", + "agent_name": "GitHub Copilot", + "category": "code-review", + "average_duration": 53.0, + "average_prompt_tokens": 124801.2, + "average_completion_tokens": 1724.0, + "average_llm_duration": 0.0, + "average_tool_usage": { + "report_intent": 0.94, + "bash": 3.65, + "view": 1.32, + "create": 0.73, + "read_bash": 0.37, + "stop_bash": 0.35, + "write_bash": 0.02, + "task": 0.01 + }, + "github_run_id": "28106565168", + "experiment": null, + "benchmark_version": "0.5.6", + "generated_comment_count": 222, + "expected_comment_count": 216, + "matched_comment_count": 40, + "incorrect_comment_count": 182, + "missed_comment_count": 176, + "precision": 0.18, + "recall": 0.185, + "f1": 0.183, + "f_beta_05": 0.181, + "f_beta_2": 0.184, + "macro_precision": 0.226, + "macro_recall": 0.263, + "macro_f1": 0.217, + "macro_f_beta_05": 0.213, + "macro_f_beta_2": 0.235, + "severity_mae": 0.725, + "valid_review_output_rate": 1.0 } ], "aggregate": [ @@ -49,23 +90,23 @@ "category": "code-review", "experiment": null, "total": 81, - "num_runs": 1, - "average_duration": 51.9, + "num_runs": 2, + "average_duration": 52.45, "benchmark_version": "0.5.6", - "f1": 0.218, - "f1_ci_low": null, - "f1_ci_high": null, - "f_beta_05": 0.209, - "f_beta_2": 0.229, - "precision": 0.203, - "recall": 0.236, - "macro_f1": 0.256, - "macro_f1_ci_low": null, - "macro_f1_ci_high": null, - "macro_f_beta_05": 0.242, - "macro_f_beta_2": 0.286, - "macro_precision": 0.245, - "macro_recall": 0.329 + "f1": 0.201, + "f1_ci_low": 0.183, + "f1_ci_high": 0.218, + "f_beta_05": 0.195, + "f_beta_2": 0.20650000000000002, + "precision": 0.1915, + "recall": 0.2105, + "macro_f1": 0.236, + "macro_f1_ci_low": 0.217, + "macro_f1_ci_high": 0.256, + "macro_f_beta_05": 0.22749999999999998, + "macro_f_beta_2": 0.26049999999999995, + "macro_precision": 0.2355, + "macro_recall": 0.29600000000000004 } ] } From 5f4e41d179e5e9499aa036e8db1a2cbf48bdf73c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 24 Jun 2026 14:56:26 +0000 Subject: [PATCH 3/5] Update leaderboard: GitHub Copilot CLI (claude-haiku-4.5) - Run 28107173987 --- docs/_data/code-review.json | 68 +++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 14 deletions(-) diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json index 4622a1e43..0ce6a15d8 100644 --- a/docs/_data/code-review.json +++ b/docs/_data/code-review.json @@ -81,6 +81,46 @@ "macro_f_beta_2": 0.235, "severity_mae": 0.725, "valid_review_output_rate": 1.0 + }, + { + "total": 81, + "date": "2026-06-24", + "model": "claude-haiku-4-5", + "agent_name": "GitHub Copilot", + "category": "code-review", + "average_duration": 53.6, + "average_prompt_tokens": 121482.7, + "average_completion_tokens": 1617.1, + "average_llm_duration": 0.0, + "average_tool_usage": { + "report_intent": 0.95, + "bash": 3.36, + "create": 0.85, + "stop_bash": 0.41, + "view": 1.25, + "read_bash": 0.47, + "write_bash": 0.01 + }, + "github_run_id": "28107173987", + "experiment": null, + "benchmark_version": "0.5.6", + "generated_comment_count": 230, + "expected_comment_count": 216, + "matched_comment_count": 48, + "incorrect_comment_count": 182, + "missed_comment_count": 168, + "precision": 0.209, + "recall": 0.222, + "f1": 0.215, + "f_beta_05": 0.211, + "f_beta_2": 0.219, + "macro_precision": 0.242, + "macro_recall": 0.292, + "macro_f1": 0.229, + "macro_f_beta_05": 0.219, + "macro_f_beta_2": 0.254, + "severity_mae": 0.729, + "valid_review_output_rate": 1.0 } ], "aggregate": [ @@ -90,23 +130,23 @@ "category": "code-review", "experiment": null, "total": 81, - "num_runs": 2, - "average_duration": 52.45, + "num_runs": 3, + "average_duration": 52.833333333333336, "benchmark_version": "0.5.6", - "f1": 0.201, + "f1": 0.205, "f1_ci_low": 0.183, - "f1_ci_high": 0.218, - "f_beta_05": 0.195, - "f_beta_2": 0.20650000000000002, - "precision": 0.1915, - "recall": 0.2105, - "macro_f1": 0.236, - "macro_f1_ci_low": 0.217, + "f1_ci_high": 0.217, + "f_beta_05": 0.20033333333333334, + "f_beta_2": 0.21066666666666667, + "precision": 0.19733333333333333, + "recall": 0.21433333333333335, + "macro_f1": 0.234, + "macro_f1_ci_low": 0.221, "macro_f1_ci_high": 0.256, - "macro_f_beta_05": 0.22749999999999998, - "macro_f_beta_2": 0.26049999999999995, - "macro_precision": 0.2355, - "macro_recall": 0.29600000000000004 + "macro_f_beta_05": 0.22466666666666665, + "macro_f_beta_2": 0.2583333333333333, + "macro_precision": 0.23766666666666666, + "macro_recall": 0.2946666666666667 } ] } From 2959486f8b0b32b7c3ab406b5e03f1237004f850 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 24 Jun 2026 15:07:31 +0000 Subject: [PATCH 4/5] Update leaderboard: GitHub Copilot CLI (claude-haiku-4.5) - Run 28107756834 --- docs/_data/code-review.json | 70 +++++++++++++++++++++++++++++-------- 1 file changed, 55 insertions(+), 15 deletions(-) diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json index 0ce6a15d8..58ae38ee1 100644 --- a/docs/_data/code-review.json +++ b/docs/_data/code-review.json @@ -121,6 +121,46 @@ "macro_f_beta_2": 0.254, "severity_mae": 0.729, "valid_review_output_rate": 1.0 + }, + { + "total": 81, + "date": "2026-06-24", + "model": "claude-haiku-4-5", + "agent_name": "GitHub Copilot", + "category": "code-review", + "average_duration": 50.0, + "average_prompt_tokens": 112477.8, + "average_completion_tokens": 1602.7, + "average_llm_duration": 0.0, + "average_tool_usage": { + "report_intent": 0.93, + "bash": 3.02, + "create": 0.8, + "read_bash": 0.43, + "stop_bash": 0.32, + "view": 0.98, + "write_bash": 0.06 + }, + "github_run_id": "28107756834", + "experiment": null, + "benchmark_version": "0.5.6", + "generated_comment_count": 242, + "expected_comment_count": 216, + "matched_comment_count": 47, + "incorrect_comment_count": 195, + "missed_comment_count": 169, + "precision": 0.194, + "recall": 0.218, + "f1": 0.205, + "f_beta_05": 0.198, + "f_beta_2": 0.212, + "macro_precision": 0.216, + "macro_recall": 0.285, + "macro_f1": 0.223, + "macro_f_beta_05": 0.21, + "macro_f_beta_2": 0.25, + "severity_mae": 0.681, + "valid_review_output_rate": 1.0 } ], "aggregate": [ @@ -130,23 +170,23 @@ "category": "code-review", "experiment": null, "total": 81, - "num_runs": 3, - "average_duration": 52.833333333333336, + "num_runs": 4, + "average_duration": 52.125, "benchmark_version": "0.5.6", "f1": 0.205, - "f1_ci_low": 0.183, - "f1_ci_high": 0.217, - "f_beta_05": 0.20033333333333334, - "f_beta_2": 0.21066666666666667, - "precision": 0.19733333333333333, - "recall": 0.21433333333333335, - "macro_f1": 0.234, - "macro_f1_ci_low": 0.221, - "macro_f1_ci_high": 0.256, - "macro_f_beta_05": 0.22466666666666665, - "macro_f_beta_2": 0.2583333333333333, - "macro_precision": 0.23766666666666666, - "macro_recall": 0.2946666666666667 + "f1_ci_low": 0.188, + "f1_ci_high": 0.216, + "f_beta_05": 0.19974999999999998, + "f_beta_2": 0.211, + "precision": 0.1965, + "recall": 0.21525, + "macro_f1": 0.231, + "macro_f1_ci_low": 0.22, + "macro_f1_ci_high": 0.249, + "macro_f_beta_05": 0.221, + "macro_f_beta_2": 0.25625, + "macro_precision": 0.23225, + "macro_recall": 0.29225 } ] } From b8e0939ae37078d0187d163f5a2af7bac110b8aa Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 24 Jun 2026 15:17:44 +0000 Subject: [PATCH 5/5] Update leaderboard: GitHub Copilot CLI (claude-haiku-4.5) - Run 28108519915 --- docs/_data/code-review.json | 73 +++++++++++++++++++++++++++++-------- 1 file changed, 57 insertions(+), 16 deletions(-) diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json index 58ae38ee1..8445e5229 100644 --- a/docs/_data/code-review.json +++ b/docs/_data/code-review.json @@ -161,6 +161,47 @@ "macro_f_beta_2": 0.25, "severity_mae": 0.681, "valid_review_output_rate": 1.0 + }, + { + "total": 81, + "date": "2026-06-24", + "model": "claude-haiku-4-5", + "agent_name": "GitHub Copilot", + "category": "code-review", + "average_duration": 53.9, + "average_prompt_tokens": 119386.4, + "average_completion_tokens": 1680.6, + "average_llm_duration": 0.0, + "average_tool_usage": { + "report_intent": 0.93, + "bash": 3.16, + "create": 0.83, + "read_bash": 0.42, + "stop_bash": 0.37, + "view": 1.42, + "write_bash": 0.02, + "task": 0.01 + }, + "github_run_id": "28108519915", + "experiment": null, + "benchmark_version": "0.5.6", + "generated_comment_count": 292, + "expected_comment_count": 216, + "matched_comment_count": 48, + "incorrect_comment_count": 244, + "missed_comment_count": 168, + "precision": 0.164, + "recall": 0.222, + "f1": 0.189, + "f_beta_05": 0.173, + "f_beta_2": 0.208, + "macro_precision": 0.205, + "macro_recall": 0.304, + "macro_f1": 0.216, + "macro_f_beta_05": 0.196, + "macro_f_beta_2": 0.254, + "severity_mae": 0.729, + "valid_review_output_rate": 1.0 } ], "aggregate": [ @@ -170,23 +211,23 @@ "category": "code-review", "experiment": null, "total": 81, - "num_runs": 4, - "average_duration": 52.125, + "num_runs": 5, + "average_duration": 52.48, "benchmark_version": "0.5.6", - "f1": 0.205, - "f1_ci_low": 0.188, - "f1_ci_high": 0.216, - "f_beta_05": 0.19974999999999998, - "f_beta_2": 0.211, - "precision": 0.1965, - "recall": 0.21525, - "macro_f1": 0.231, - "macro_f1_ci_low": 0.22, - "macro_f1_ci_high": 0.249, - "macro_f_beta_05": 0.221, - "macro_f_beta_2": 0.25625, - "macro_precision": 0.23225, - "macro_recall": 0.29225 + "f1": 0.202, + "f1_ci_low": 0.189, + "f1_ci_high": 0.213, + "f_beta_05": 0.1944, + "f_beta_2": 0.2104, + "precision": 0.19, + "recall": 0.2166, + "macro_f1": 0.228, + "macro_f1_ci_low": 0.219, + "macro_f1_ci_high": 0.248, + "macro_f_beta_05": 0.21600000000000003, + "macro_f_beta_2": 0.25579999999999997, + "macro_precision": 0.22679999999999997, + "macro_recall": 0.2946 } ] }