diff --git a/pybaseball/__init__.py b/pybaseball/__init__.py index cc223855..1f11d8a5 100644 --- a/pybaseball/__init__.py +++ b/pybaseball/__init__.py @@ -7,31 +7,31 @@ from .teamid_lookup import team_ids from .statcast import statcast, statcast_single_game from .statcast_pitcher import ( - statcast_pitcher, - statcast_pitcher_exitvelo_barrels, - statcast_pitcher_expected_stats, - statcast_pitcher_pitch_arsenal, - statcast_pitcher_arsenal_stats, - statcast_pitcher_percentile_ranks, - statcast_pitcher_spin_dir_comp + statcast_pitcher, + statcast_pitcher_exitvelo_barrels, + statcast_pitcher_expected_stats, + statcast_pitcher_pitch_arsenal, + statcast_pitcher_arsenal_stats, + statcast_pitcher_percentile_ranks, + statcast_pitcher_spin_dir_comp, ) from .statcast_batter import ( - statcast_batter, - statcast_batter_exitvelo_barrels, - statcast_batter_expected_stats, - statcast_batter_percentile_ranks, - statcast_batter_pitch_arsenal, - statcast_batter_bat_tracking + statcast_batter, + statcast_batter_exitvelo_barrels, + statcast_batter_expected_stats, + statcast_batter_percentile_ranks, + statcast_batter_pitch_arsenal, + statcast_batter_bat_tracking, ) from .statcast_running import statcast_sprint_speed, statcast_running_splits from .statcast_fielding import ( - statcast_outs_above_average, - statcast_outfield_directional_oaa, - statcast_outfield_catch_prob, - statcast_outfielder_jump, - statcast_catcher_poptime, - statcast_catcher_framing, - statcast_fielding_run_value + statcast_outs_above_average, + statcast_outfield_directional_oaa, + statcast_outfield_catch_prob, + statcast_outfielder_jump, + statcast_catcher_poptime, + statcast_catcher_framing, + statcast_fielding_run_value, ) from .league_batting_stats import batting_stats_bref from .league_batting_stats import batting_stats_range @@ -78,8 +78,9 @@ from .lahman import salaries from .lahman import schools from .lahman import series_post -from .lahman import teams_core -from .lahman import teams_upstream +from .lahman import teams + +# from .lahman import teams_upstream Not part of Lahman from .lahman import teams_franchises from .lahman import teams_half from .lahman import download_lahman @@ -98,7 +99,12 @@ from .plotting import spraychart from .plotting import plot_teams from .plotting import plot_strike_zone -from .datasources.fangraphs import (fg_batting_data, fg_pitching_data, fg_team_batting_data, fg_team_fielding_data, - fg_team_pitching_data) +from .datasources.fangraphs import ( + fg_batting_data, + fg_pitching_data, + fg_team_batting_data, + fg_team_fielding_data, + fg_team_pitching_data, +) from .split_stats import get_splits from .version import __version__ diff --git a/pybaseball/lahman.py b/pybaseball/lahman.py index 437096eb..b080eaa0 100644 --- a/pybaseball/lahman.py +++ b/pybaseball/lahman.py @@ -8,11 +8,12 @@ from . import cache -url = "https://github.com/chadwickbureau/baseballdatabank/archive/master.zip" -base_string = "baseballdatabank-master" +url = "https://github.com/jmaslek/LahmanDatabase/archive/refs/heads/main/baseballdb.zip" +base_string = "LahmanDatabase-main/baseballdb" _handle = None + def get_lahman_zip() -> Optional[ZipFile]: # Retrieve the Lahman database zip file, returns None if file already exists in cwd. # If we already have the zip file, keep re-using that. @@ -25,6 +26,7 @@ def get_lahman_zip() -> Optional[ZipFile]: _handle = ZipFile(BytesIO(s.content)) return _handle + def download_lahman(): # download entire lahman db to present working directory z = get_lahman_zip() @@ -34,103 +36,135 @@ def download_lahman(): # this way we'll now start using the extracted zip directory # instead of the session ZipFile object -def _get_file(tablename: str, quotechar: str = "'") -> pd.DataFrame: + +def _get_file(tablename: str, quotechar: str = "'", encoding="latin1") -> pd.DataFrame: z = get_lahman_zip() - f = f'{base_string}/{tablename}' + f = f"{base_string}/{tablename}" data = pd.read_csv( f"{path.join(cache.config.cache_directory, f)}" if z is None else z.open(f), header=0, - sep=',', - quotechar=quotechar + sep=",", + quotechar=quotechar, + encoding=encoding, ) return data # do this for every table in the lahman db so they can exist as separate functions def parks() -> pd.DataFrame: - return _get_file('core/Parks.csv') + return _get_file("Parks.csv") + def all_star_full() -> pd.DataFrame: - return _get_file("core/AllstarFull.csv") + return _get_file("AllstarFull.csv") + def appearances() -> pd.DataFrame: - return _get_file("core/Appearances.csv") + return _get_file("Appearances.csv") + def awards_managers() -> pd.DataFrame: - return _get_file("contrib/AwardsManagers.csv") + return _get_file("AwardsManagers.csv") + def awards_players() -> pd.DataFrame: - return _get_file("contrib/AwardsPlayers.csv") + return _get_file("AwardsPlayers.csv") + def awards_share_managers() -> pd.DataFrame: - return _get_file("contrib/AwardsShareManagers.csv") + return _get_file("AwardsShareManagers.csv") + def awards_share_players() -> pd.DataFrame: - return _get_file("contrib/AwardsSharePlayers.csv") + return _get_file("AwardsSharePlayers.csv") + def batting() -> pd.DataFrame: - return _get_file("core/Batting.csv") + return _get_file("Batting.csv") + def batting_post() -> pd.DataFrame: - return _get_file("core/BattingPost.csv") + return _get_file("BattingPost.csv") + def college_playing() -> pd.DataFrame: - return _get_file("contrib/CollegePlaying.csv") + return _get_file("CollegePlaying.csv") + def fielding() -> pd.DataFrame: - return _get_file("core/Fielding.csv") + return _get_file("Fielding.csv") + def fielding_of() -> pd.DataFrame: - return _get_file("core/FieldingOF.csv") + return _get_file("FieldingOF.csv") + def fielding_of_split() -> pd.DataFrame: - return _get_file("core/FieldingOFsplit.csv") + return _get_file("FieldingOFsplit.csv") + def fielding_post() -> pd.DataFrame: - return _get_file("core/FieldingPost.csv") + return _get_file("FieldingPost.csv") + def hall_of_fame() -> pd.DataFrame: - return _get_file("contrib/HallOfFame.csv") + return _get_file("HallOfFame.csv") + def home_games() -> pd.DataFrame: - return _get_file("core/HomeGames.csv") + return _get_file("HomeGames.csv") + def managers() -> pd.DataFrame: - return _get_file("core/Managers.csv") + return _get_file("Managers.csv") + def managers_half() -> pd.DataFrame: - return _get_file("core/ManagersHalf.csv") + return _get_file("ManagersHalf.csv") + def master() -> pd.DataFrame: # Alias for people -- the new name for master return people() + def people() -> pd.DataFrame: - return _get_file("core/People.csv") + return _get_file("People.csv") + def pitching() -> pd.DataFrame: - return _get_file("core/Pitching.csv") + return _get_file("Pitching.csv") + def pitching_post() -> pd.DataFrame: - return _get_file("core/PitchingPost.csv") + return _get_file("PitchingPost.csv") + def salaries() -> pd.DataFrame: - return _get_file("contrib/Salaries.csv") + return _get_file("Salaries.csv") + def schools() -> pd.DataFrame: - return _get_file("contrib/Schools.csv", quotechar='"') # different here bc of doublequotes used in some school names + return _get_file( + "Schools.csv", quotechar='"' + ) # different here bc of doublequotes used in some school names + def series_post() -> pd.DataFrame: - return _get_file("core/SeriesPost.csv") + return _get_file("SeriesPost.csv") + -def teams_core() -> pd.DataFrame: - return _get_file("core/Teams.csv") +def teams() -> pd.DataFrame: + return _get_file("Teams.csv") + + +# def teams_upstream() -> pd.DataFrame: +# return _get_file("Teams.csv") # manually maintained file -def teams_upstream() -> pd.DataFrame: - return _get_file("upstream/Teams.csv") # manually maintained file def teams_franchises() -> pd.DataFrame: - return _get_file("core/TeamsFranchises.csv") + return _get_file("TeamsFranchises.csv") + def teams_half() -> pd.DataFrame: - return _get_file("core/TeamsHalf.csv") + return _get_file("TeamsHalf.csv")