diff --git a/docs/_data/code-review.json b/docs/_data/code-review.json index f744d8bdb..8445e5229 100644 --- a/docs/_data/code-review.json +++ b/docs/_data/code-review.json @@ -1,4 +1,233 @@ { - "runs": [], - "aggregate": [] + "runs": [ + { + "total": 81, + "date": "2026-06-24", + "model": "claude-haiku-4-5", + "agent_name": "GitHub Copilot", + "category": "code-review", + "average_duration": 51.9, + "average_prompt_tokens": 113035.8, + "average_completion_tokens": 1590.2, + "average_llm_duration": 0.0, + "average_tool_usage": { + "report_intent": 0.93, + "bash": 3.19, + "view": 1.16, + "create": 0.8, + "read_bash": 0.37, + "stop_bash": 0.21, + "write_bash": 0.07, + "task": 0.01 + }, + "github_run_id": "28105886936", + "experiment": null, + "benchmark_version": "0.5.6", + "generated_comment_count": 251, + "expected_comment_count": 216, + "matched_comment_count": 51, + "incorrect_comment_count": 200, + "missed_comment_count": 165, + "precision": 0.203, + "recall": 0.236, + "f1": 0.218, + "f_beta_05": 0.209, + "f_beta_2": 0.229, + "macro_precision": 0.245, + "macro_recall": 0.329, + "macro_f1": 0.256, + "macro_f_beta_05": 0.242, + "macro_f_beta_2": 0.286, + "severity_mae": 0.784, + "valid_review_output_rate": 1.0 + }, + { + "total": 81, + "date": "2026-06-24", + "model": "claude-haiku-4-5", + "agent_name": "GitHub Copilot", + "category": "code-review", + "average_duration": 53.0, + "average_prompt_tokens": 124801.2, + "average_completion_tokens": 1724.0, + "average_llm_duration": 0.0, + "average_tool_usage": { + "report_intent": 0.94, + "bash": 3.65, + "view": 1.32, + "create": 0.73, + "read_bash": 0.37, + "stop_bash": 0.35, + "write_bash": 0.02, + "task": 0.01 + }, + "github_run_id": "28106565168", + "experiment": null, + "benchmark_version": "0.5.6", + "generated_comment_count": 222, + "expected_comment_count": 216, + "matched_comment_count": 40, + "incorrect_comment_count": 182, + "missed_comment_count": 176, + "precision": 0.18, + "recall": 0.185, + "f1": 0.183, + "f_beta_05": 0.181, + "f_beta_2": 0.184, + "macro_precision": 0.226, + "macro_recall": 0.263, + "macro_f1": 0.217, + "macro_f_beta_05": 0.213, + "macro_f_beta_2": 0.235, + "severity_mae": 0.725, + "valid_review_output_rate": 1.0 + }, + { + "total": 81, + "date": "2026-06-24", + "model": "claude-haiku-4-5", + "agent_name": "GitHub Copilot", + "category": "code-review", + "average_duration": 53.6, + "average_prompt_tokens": 121482.7, + "average_completion_tokens": 1617.1, + "average_llm_duration": 0.0, + "average_tool_usage": { + "report_intent": 0.95, + "bash": 3.36, + "create": 0.85, + "stop_bash": 0.41, + "view": 1.25, + "read_bash": 0.47, + "write_bash": 0.01 + }, + "github_run_id": "28107173987", + "experiment": null, + "benchmark_version": "0.5.6", + "generated_comment_count": 230, + "expected_comment_count": 216, + "matched_comment_count": 48, + "incorrect_comment_count": 182, + "missed_comment_count": 168, + "precision": 0.209, + "recall": 0.222, + "f1": 0.215, + "f_beta_05": 0.211, + "f_beta_2": 0.219, + "macro_precision": 0.242, + "macro_recall": 0.292, + "macro_f1": 0.229, + "macro_f_beta_05": 0.219, + "macro_f_beta_2": 0.254, + "severity_mae": 0.729, + "valid_review_output_rate": 1.0 + }, + { + "total": 81, + "date": "2026-06-24", + "model": "claude-haiku-4-5", + "agent_name": "GitHub Copilot", + "category": "code-review", + "average_duration": 50.0, + "average_prompt_tokens": 112477.8, + "average_completion_tokens": 1602.7, + "average_llm_duration": 0.0, + "average_tool_usage": { + "report_intent": 0.93, + "bash": 3.02, + "create": 0.8, + "read_bash": 0.43, + "stop_bash": 0.32, + "view": 0.98, + "write_bash": 0.06 + }, + "github_run_id": "28107756834", + "experiment": null, + "benchmark_version": "0.5.6", + "generated_comment_count": 242, + "expected_comment_count": 216, + "matched_comment_count": 47, + "incorrect_comment_count": 195, + "missed_comment_count": 169, + "precision": 0.194, + "recall": 0.218, + "f1": 0.205, + "f_beta_05": 0.198, + "f_beta_2": 0.212, + "macro_precision": 0.216, + "macro_recall": 0.285, + "macro_f1": 0.223, + "macro_f_beta_05": 0.21, + "macro_f_beta_2": 0.25, + "severity_mae": 0.681, + "valid_review_output_rate": 1.0 + }, + { + "total": 81, + "date": "2026-06-24", + "model": "claude-haiku-4-5", + "agent_name": "GitHub Copilot", + "category": "code-review", + "average_duration": 53.9, + "average_prompt_tokens": 119386.4, + "average_completion_tokens": 1680.6, + "average_llm_duration": 0.0, + "average_tool_usage": { + "report_intent": 0.93, + "bash": 3.16, + "create": 0.83, + "read_bash": 0.42, + "stop_bash": 0.37, + "view": 1.42, + "write_bash": 0.02, + "task": 0.01 + }, + "github_run_id": "28108519915", + "experiment": null, + "benchmark_version": "0.5.6", + "generated_comment_count": 292, + "expected_comment_count": 216, + "matched_comment_count": 48, + "incorrect_comment_count": 244, + "missed_comment_count": 168, + "precision": 0.164, + "recall": 0.222, + "f1": 0.189, + "f_beta_05": 0.173, + "f_beta_2": 0.208, + "macro_precision": 0.205, + "macro_recall": 0.304, + "macro_f1": 0.216, + "macro_f_beta_05": 0.196, + "macro_f_beta_2": 0.254, + "severity_mae": 0.729, + "valid_review_output_rate": 1.0 + } + ], + "aggregate": [ + { + "model": "claude-haiku-4-5", + "agent_name": "GitHub Copilot", + "category": "code-review", + "experiment": null, + "total": 81, + "num_runs": 5, + "average_duration": 52.48, + "benchmark_version": "0.5.6", + "f1": 0.202, + "f1_ci_low": 0.189, + "f1_ci_high": 0.213, + "f_beta_05": 0.1944, + "f_beta_2": 0.2104, + "precision": 0.19, + "recall": 0.2166, + "macro_f1": 0.228, + "macro_f1_ci_low": 0.219, + "macro_f1_ci_high": 0.248, + "macro_f_beta_05": 0.21600000000000003, + "macro_f_beta_2": 0.25579999999999997, + "macro_precision": 0.22679999999999997, + "macro_recall": 0.2946 + } + ] }