Revisiting Opening Day Win-Loss Statistics

baseball
code
Revisiting Opening Day win-loss statistics with new data and new methods
Author

Matthew Spence

Published

March 4, 2023

I wanted to revisit my earlier post about opening day win-loss statistics to update the numbers with eight additional years of data and also to fix the a few issues that I didn’t have the skills to address back in 2015.

First, my old script had separate entries for the two teams that renamed themselves during the period — the Los Angeles (California) Angels and the Miami (Florida) Marlins. I wanted those combined, no question. I also wondered about what to do with a franchise that renamed itself and moved. Should the Expos and the Nationals really get combined into a single franchise record?

My first attempts to fix the former and keep the Nats/Expos apart were unsuccessful. I combined the renamed teams by creating the win-loss records by team_id (as before), but then merging those with the historical franchise ID (hist_franchise_id, as before) and bringing over the current_team_id as well.

Code
import os, requests, zipfile, io
from IPython.display import HTML, Markdown
#from itables import show
import pandas as pd

DATADIR = os.path.join(os.getenv("MY_PROJECT_DIR"), "data", "gl")
YEARSTART, YEAREND = 1980, 2022 # will add +1 to YEAREND for range()

if not os.path.exists(DATADIR):
    os.makedirs(DATADIR)

if not os.path.exists(os.path.join(DATADIR,"CurrentNames.csv")):
    r = requests.get("https://www.retrosheet.org/CurrentNames.csv")
    with open(os.path.join(DATADIR,"CurrentNames.csv"),"w") as f:
        f.write("current_team_id,hist_franchise_id,league,division,location,nickname,alt_nickname,first_used,last_used,city,state\n")
        f.write(r.text)

for year in range(YEARSTART,YEAREND+1): 
    if not os.path.exists(os.path.join(DATADIR,f"gl{year}.txt")):
        zip_file_url = f"https://www.retrosheet.org/gamelogs/gl{year}.zip"
        r = requests.get(zip_file_url)
        z = zipfile.ZipFile(io.BytesIO(r.content))
        z.extractall(path=DATADIR)

fields = ['date','visit_team_id','visit_team_league','visit_game_number','home_team_id','home_team_league','home_game_number','visit_final_score','home_final_score']
cols = [0,3,4,5,6,7,8,9,10]

def assign_outcome(row):
    if row.final_score>row.opponent_final_score:
        return 'W'
    elif row.final_score<row.opponent_final_score:
        return 'L'
    else:
        return 'T' # 2000's opening of Reds vs Brewers ended in an official tie

def read_in_data(year):
    dat = pd.read_csv(f"{DATADIR}/gl{year}.txt", header=None,names=fields, usecols=cols, parse_dates=['date'])
    v = dat.loc[dat.visit_game_number==1,].rename(columns=lambda x: x.replace('visit_','')).rename(columns=lambda x: x.replace('home_','opponent_'))
    h = dat.loc[dat.home_game_number==1,].rename(columns=lambda x: x.replace('home_','')).rename(columns=lambda x: x.replace('visit_','opponent_'))
    games = pd.concat([v,h], ignore_index=True)
    games['outcome'] = games.apply(assign_outcome,axis=1)
    games['season'] = year
    return games

teamnames = pd.read_csv(os.path.join(DATADIR,'CurrentNames.csv'), parse_dates=['first_used','last_used']) 

dflist = []
for year in range(YEARSTART,YEAREND+1):
    g = read_in_data(year)
    dflist.append(g)
    # df = pd.concat([df,g], ignore_index=True)

df = pd.concat(dflist, ignore_index=True, sort=False)

# def f(group):
#     return pd.DataFrame({'count': group.value_counts(),
#                          'total': group.size})

winpct = (df.groupby('team_id').apply(lambda x: x.outcome.value_counts()/x.outcome.size, include_groups=False).round(3)*100)[:,"W"].sort_values(ascending=False)

wlcnt = df.groupby('team_id').apply(lambda x: x.outcome.value_counts(), include_groups=False).unstack(-1)

# drop any team name that was last_used before the start of our analysis. 
# "Drop duplicates" by current_team_id and hist_franchise_id will get rid of divisional changes (e.g. Pirates moving from NL E to NL C in 1994) and minor name changes (TBA Devil Rays --> Rays; Anaheim Angles --> LA Angels)
pctteams = teamnames[(teamnames['last_used'].dt.year>YEARSTART)|(teamnames['last_used'].isna())].drop_duplicates(subset=['current_team_id','hist_franchise_id'], keep="last") 

wlcntteam = pd.merge(wlcnt, pctteams[['current_team_id', 'hist_franchise_id']], left_index=True, right_on='hist_franchise_id', how="left").groupby(['current_team_id'])[['W','L','T']].sum()

finaltable = pd.merge(wlcntteam, pctteams, left_index=True, right_on='hist_franchise_id', how="left").assign(Team = lambda x: x['location'] + " " + x['nickname'], WnPct = lambda x: 100*round(x['W']/(x['W']+x['L']+x['T']), 3)).rename(columns={'W':'Wins','L':'Losses','T':'Ties', 'WnPct':'Win (%)'})[['Team','Wins','Losses', 'Ties', 'Win (%)']].sort_values("Win (%)", ascending=False)

HTML(finaltable.to_html(index=False))  
Table 1: Opening Day Records, 1980 – 2022
Team Wins Losses Ties Win (%)
New York Mets 31.0 12.0 0.0 72.1
Baltimore Orioles 28.0 15.0 0.0 65.1
Seattle Mariners 26.0 17.0 0.0 60.5
Chicago White Sox 25.0 18.0 0.0 58.1
Los Angeles Dodgers 25.0 18.0 0.0 58.1
Detroit Tigers 25.0 18.0 0.0 58.1
Cincinnati Reds 24.0 18.0 1.0 55.8
St. Louis Cardinals 23.0 20.0 0.0 53.5
New York Yankees 23.0 20.0 0.0 53.5
Toronto Blue Jays 23.0 20.0 0.0 53.5
San Francisco Giants 23.0 20.0 0.0 53.5
Houston Astros 23.0 20.0 0.0 53.5
Colorado Rockies 16.0 14.0 0.0 53.3
Tampa Bay Rays 13.0 12.0 0.0 52.0
Arizona Diamondbacks 13.0 12.0 0.0 52.0
Atlanta Braves 22.0 21.0 0.0 51.2
Milwaukee Brewers 22.0 20.0 1.0 51.2
Chicago Cubs 21.0 22.0 0.0 48.8
Pittsburgh Pirates 20.0 23.0 0.0 46.5
Los Angeles Angels 20.0 23.0 0.0 46.5
Boston Red Sox 20.0 23.0 0.0 46.5
Washington Nationals 20.0 23.0 0.0 46.5
Texas Rangers 19.0 24.0 0.0 44.2
Miami Marlins 12.0 18.0 0.0 40.0
Philadelphia Phillies 17.0 26.0 0.0 39.5
Oakland Athletics 17.0 26.0 0.0 39.5
San Diego Padres 17.0 26.0 0.0 39.5
Minnesota Twins 17.0 26.0 0.0 39.5
Cleveland Indians 16.0 27.0 0.0 37.2
Kansas City Royals 15.0 28.0 0.0 34.9
Code
wlcntteam_bycity = pd.merge(wlcnt, pctteams[['current_team_id', 'hist_franchise_id', 'city', 'state']], left_index=True, right_on='hist_franchise_id', how="left").groupby(['current_team_id','city'])[['W','L','T']].sum().reset_index()

wlcntteam_bycity.loc[wlcntteam_bycity['city'] == "Montreal", "current_team_id"] = "MON"

finaltable_exposfix = pd.merge(wlcntteam_bycity, pctteams, left_on=['current_team_id','city'], right_on=['hist_franchise_id','city'], how="left").assign(Team = lambda x: x['location'] + " " + x['nickname'], WnPct = lambda x: 100*round(x['W']/(x['W']+x['L']+x['T']), 3)).rename(columns={'W':'Wins','L':'Losses','T':'Ties', 'WnPct':'Win (%)'})[['Team','Wins','Losses','Ties', 'Win (%)']].sort_values("Win (%)", ascending=False)

HTML(finaltable_exposfix.to_html(index=False))  
Table 2: Opening Day Records, 1980 – 2022, Expos and Nationals separated
Team Wins Losses Ties Win (%)
New York Mets 31.0 12.0 0.0 72.1
Baltimore Orioles 28.0 15.0 0.0 65.1
Seattle Mariners 26.0 17.0 0.0 60.5
Chicago White Sox 25.0 18.0 0.0 58.1
Los Angeles Dodgers 25.0 18.0 0.0 58.1
Detroit Tigers 25.0 18.0 0.0 58.1
Cincinnati Reds 24.0 18.0 1.0 55.8
Toronto Blue Jays 23.0 20.0 0.0 53.5
Houston Astros 23.0 20.0 0.0 53.5
St. Louis Cardinals 23.0 20.0 0.0 53.5
San Francisco Giants 23.0 20.0 0.0 53.5
New York Yankees 23.0 20.0 0.0 53.5
Colorado Rockies 16.0 14.0 0.0 53.3
Tampa Bay Rays 13.0 12.0 0.0 52.0
Arizona Diamondbacks 13.0 12.0 0.0 52.0
Atlanta Braves 22.0 21.0 0.0 51.2
Milwaukee Brewers 22.0 20.0 1.0 51.2
Chicago Cubs 21.0 22.0 0.0 48.8
Montreal Expos 12.0 13.0 0.0 48.0
Pittsburgh Pirates 20.0 23.0 0.0 46.5
Los Angeles Angels 20.0 23.0 0.0 46.5
Boston Red Sox 20.0 23.0 0.0 46.5
Washington Nationals 8.0 10.0 0.0 44.4
Texas Rangers 19.0 24.0 0.0 44.2
Miami Marlins 12.0 18.0 0.0 40.0
Philadelphia Phillies 17.0 26.0 0.0 39.5
San Diego Padres 17.0 26.0 0.0 39.5
Oakland Athletics 17.0 26.0 0.0 39.5
Minnesota Twins 17.0 26.0 0.0 39.5
Cleveland Indians 16.0 27.0 0.0 37.2
Kansas City Royals 15.0 28.0 0.0 34.9