Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
233 changes: 231 additions & 2 deletions docs/_data/code-review.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,233 @@
{
"runs": [],
"aggregate": []
"runs": [
{
"total": 81,
"date": "2026-06-24",
"model": "claude-haiku-4-5",
"agent_name": "GitHub Copilot",
"category": "code-review",
"average_duration": 51.9,
"average_prompt_tokens": 113035.8,
"average_completion_tokens": 1590.2,
"average_llm_duration": 0.0,
"average_tool_usage": {
"report_intent": 0.93,
"bash": 3.19,
"view": 1.16,
"create": 0.8,
"read_bash": 0.37,
"stop_bash": 0.21,
"write_bash": 0.07,
"task": 0.01
},
"github_run_id": "28105886936",
"experiment": null,
"benchmark_version": "0.5.6",
"generated_comment_count": 251,
"expected_comment_count": 216,
"matched_comment_count": 51,
"incorrect_comment_count": 200,
"missed_comment_count": 165,
"precision": 0.203,
"recall": 0.236,
"f1": 0.218,
"f_beta_05": 0.209,
"f_beta_2": 0.229,
"macro_precision": 0.245,
"macro_recall": 0.329,
"macro_f1": 0.256,
"macro_f_beta_05": 0.242,
"macro_f_beta_2": 0.286,
"severity_mae": 0.784,
"valid_review_output_rate": 1.0
},
{
"total": 81,
"date": "2026-06-24",
"model": "claude-haiku-4-5",
"agent_name": "GitHub Copilot",
"category": "code-review",
"average_duration": 53.0,
"average_prompt_tokens": 124801.2,
"average_completion_tokens": 1724.0,
"average_llm_duration": 0.0,
"average_tool_usage": {
"report_intent": 0.94,
"bash": 3.65,
"view": 1.32,
"create": 0.73,
"read_bash": 0.37,
"stop_bash": 0.35,
"write_bash": 0.02,
"task": 0.01
},
"github_run_id": "28106565168",
"experiment": null,
"benchmark_version": "0.5.6",
"generated_comment_count": 222,
"expected_comment_count": 216,
"matched_comment_count": 40,
"incorrect_comment_count": 182,
"missed_comment_count": 176,
"precision": 0.18,
"recall": 0.185,
"f1": 0.183,
"f_beta_05": 0.181,
"f_beta_2": 0.184,
"macro_precision": 0.226,
"macro_recall": 0.263,
"macro_f1": 0.217,
"macro_f_beta_05": 0.213,
"macro_f_beta_2": 0.235,
"severity_mae": 0.725,
"valid_review_output_rate": 1.0
},
{
"total": 81,
"date": "2026-06-24",
"model": "claude-haiku-4-5",
"agent_name": "GitHub Copilot",
"category": "code-review",
"average_duration": 53.6,
"average_prompt_tokens": 121482.7,
"average_completion_tokens": 1617.1,
"average_llm_duration": 0.0,
"average_tool_usage": {
"report_intent": 0.95,
"bash": 3.36,
"create": 0.85,
"stop_bash": 0.41,
"view": 1.25,
"read_bash": 0.47,
"write_bash": 0.01
},
"github_run_id": "28107173987",
"experiment": null,
"benchmark_version": "0.5.6",
"generated_comment_count": 230,
"expected_comment_count": 216,
"matched_comment_count": 48,
"incorrect_comment_count": 182,
"missed_comment_count": 168,
"precision": 0.209,
"recall": 0.222,
"f1": 0.215,
"f_beta_05": 0.211,
"f_beta_2": 0.219,
"macro_precision": 0.242,
"macro_recall": 0.292,
"macro_f1": 0.229,
"macro_f_beta_05": 0.219,
"macro_f_beta_2": 0.254,
"severity_mae": 0.729,
"valid_review_output_rate": 1.0
},
{
"total": 81,
"date": "2026-06-24",
"model": "claude-haiku-4-5",
"agent_name": "GitHub Copilot",
"category": "code-review",
"average_duration": 50.0,
"average_prompt_tokens": 112477.8,
"average_completion_tokens": 1602.7,
"average_llm_duration": 0.0,
"average_tool_usage": {
"report_intent": 0.93,
"bash": 3.02,
"create": 0.8,
"read_bash": 0.43,
"stop_bash": 0.32,
"view": 0.98,
"write_bash": 0.06
},
"github_run_id": "28107756834",
"experiment": null,
"benchmark_version": "0.5.6",
"generated_comment_count": 242,
"expected_comment_count": 216,
"matched_comment_count": 47,
"incorrect_comment_count": 195,
"missed_comment_count": 169,
"precision": 0.194,
"recall": 0.218,
"f1": 0.205,
"f_beta_05": 0.198,
"f_beta_2": 0.212,
"macro_precision": 0.216,
"macro_recall": 0.285,
"macro_f1": 0.223,
"macro_f_beta_05": 0.21,
"macro_f_beta_2": 0.25,
"severity_mae": 0.681,
"valid_review_output_rate": 1.0
},
{
"total": 81,
"date": "2026-06-24",
"model": "claude-haiku-4-5",
"agent_name": "GitHub Copilot",
"category": "code-review",
"average_duration": 53.9,
"average_prompt_tokens": 119386.4,
"average_completion_tokens": 1680.6,
"average_llm_duration": 0.0,
"average_tool_usage": {
"report_intent": 0.93,
"bash": 3.16,
"create": 0.83,
"read_bash": 0.42,
"stop_bash": 0.37,
"view": 1.42,
"write_bash": 0.02,
"task": 0.01
},
"github_run_id": "28108519915",
"experiment": null,
"benchmark_version": "0.5.6",
"generated_comment_count": 292,
"expected_comment_count": 216,
"matched_comment_count": 48,
"incorrect_comment_count": 244,
"missed_comment_count": 168,
"precision": 0.164,
"recall": 0.222,
"f1": 0.189,
"f_beta_05": 0.173,
"f_beta_2": 0.208,
"macro_precision": 0.205,
"macro_recall": 0.304,
"macro_f1": 0.216,
"macro_f_beta_05": 0.196,
"macro_f_beta_2": 0.254,
"severity_mae": 0.729,
"valid_review_output_rate": 1.0
}
],
"aggregate": [
{
"model": "claude-haiku-4-5",
"agent_name": "GitHub Copilot",
"category": "code-review",
"experiment": null,
"total": 81,
"num_runs": 5,
"average_duration": 52.48,
"benchmark_version": "0.5.6",
"f1": 0.202,
"f1_ci_low": 0.189,
"f1_ci_high": 0.213,
"f_beta_05": 0.1944,
"f_beta_2": 0.2104,
"precision": 0.19,
"recall": 0.2166,
"macro_f1": 0.228,
"macro_f1_ci_low": 0.219,
"macro_f1_ci_high": 0.248,
"macro_f_beta_05": 0.21600000000000003,
"macro_f_beta_2": 0.25579999999999997,
"macro_precision": 0.22679999999999997,
"macro_recall": 0.2946
}
]
}
Loading