Revisiting Opening Day Win-Loss Statistics

baseball

code

Revisiting Opening Day win-loss statistics with new data and new methods

Author

Matthew Spence

Published

March 4, 2023

I wanted to revisit my earlier post about opening day win-loss statistics to update the numbers with eight additional years of data and also to fix the a few issues that I didn’t have the skills to address back in 2015.

First, my old script had separate entries for the two teams that renamed themselves during the period — the Los Angeles (California) Angels and the Miami (Florida) Marlins. I wanted those combined, no question. I also wondered about what to do with a franchise that renamed itself and moved. Should the Expos and the Nationals really get combined into a single franchise record?

My first attempts to fix the former and keep the Nats/Expos apart were unsuccessful. I combined the renamed teams by creating the win-loss records by team_id (as before), but then merging those with the historical franchise ID (hist_franchise_id, as before) and bringing over the current_team_id as well.

Code

import os, requests, zipfile, io
from IPython.display import HTML, Markdown
#from itables import show
import pandas as pd

DATADIR = os.path.join(os.getenv("MY_PROJECT_DIR"), "data", "gl")
YEARSTART, YEAREND = 1980, 2022 # will add +1 to YEAREND for range()

if not os.path.exists(DATADIR):
    os.makedirs(DATADIR)

if not os.path.exists(os.path.join(DATADIR,"CurrentNames.csv")):
    r = requests.get("https://www.retrosheet.org/CurrentNames.csv")
    with open(os.path.join(DATADIR,"CurrentNames.csv"),"w") as f:
        f.write("current_team_id,hist_franchise_id,league,division,location,nickname,alt_nickname,first_used,last_used,city,state\n")
        f.write(r.text)

for year in range(YEARSTART,YEAREND+1): 
    if not os.path.exists(os.path.join(DATADIR,f"gl{year}.txt")):
        zip_file_url = f"https://www.retrosheet.org/gamelogs/gl{year}.zip"
        r = requests.get(zip_file_url)
        z = zipfile.ZipFile(io.BytesIO(r.content))
        z.extractall(path=DATADIR)

fields = ['date','visit_team_id','visit_team_league','visit_game_number','home_team_id','home_team_league','home_game_number','visit_final_score','home_final_score']
cols = [0,3,4,5,6,7,8,9,10]

def assign_outcome(row):
    if row.final_score>row.opponent_final_score:
        return 'W'
    elif row.final_score<row.opponent_final_score:
        return 'L'
    else:
        return 'T' # 2000's opening of Reds vs Brewers ended in an official tie

def read_in_data(year):
    dat = pd.read_csv(f"{DATADIR}/gl{year}.txt", header=None,names=fields, usecols=cols, parse_dates=['date'])
    v = dat.loc[dat.visit_game_number==1,].rename(columns=lambda x: x.replace('visit_','')).rename(columns=lambda x: x.replace('home_','opponent_'))
    h = dat.loc[dat.home_game_number==1,].rename(columns=lambda x: x.replace('home_','')).rename(columns=lambda x: x.replace('visit_','opponent_'))
    games = pd.concat([v,h], ignore_index=True)
    games['outcome'] = games.apply(assign_outcome,axis=1)
    games['season'] = year
    return games

teamnames = pd.read_csv(os.path.join(DATADIR,'CurrentNames.csv'), parse_dates=['first_used','last_used']) 

dflist = []
for year in range(YEARSTART,YEAREND+1):
    g = read_in_data(year)
    dflist.append(g)
    # df = pd.concat([df,g], ignore_index=True)

df = pd.concat(dflist, ignore_index=True, sort=False)

# def f(group):
#     return pd.DataFrame({'count': group.value_counts(),
#                          'total': group.size})

winpct = (df.groupby('team_id').apply(lambda x: x.outcome.value_counts()/x.outcome.size, include_groups=False).round(3)*100)[:,"W"].sort_values(ascending=False)

wlcnt = df.groupby('team_id').apply(lambda x: x.outcome.value_counts(), include_groups=False).unstack(-1)

# drop any team name that was last_used before the start of our analysis. 
# "Drop duplicates" by current_team_id and hist_franchise_id will get rid of divisional changes (e.g. Pirates moving from NL E to NL C in 1994) and minor name changes (TBA Devil Rays --> Rays; Anaheim Angles --> LA Angels)
pctteams = teamnames[(teamnames['last_used'].dt.year>YEARSTART)|(teamnames['last_used'].isna())].drop_duplicates(subset=['current_team_id','hist_franchise_id'], keep="last") 

wlcntteam = pd.merge(wlcnt, pctteams[['current_team_id', 'hist_franchise_id']], left_index=True, right_on='hist_franchise_id', how="left").groupby(['current_team_id'])[['W','L','T']].sum()

finaltable = pd.merge(wlcntteam, pctteams, left_index=True, right_on='hist_franchise_id', how="left").assign(Team = lambda x: x['location'] + " " + x['nickname'], WnPct = lambda x: 100*round(x['W']/(x['W']+x['L']+x['T']), 3)).rename(columns={'W':'Wins','L':'Losses','T':'Ties', 'WnPct':'Win (%)'})[['Team','Wins','Losses', 'Ties', 'Win (%)']].sort_values("Win (%)", ascending=False)

HTML(finaltable.to_html(index=False))

Table 1: Opening Day Records, 1980 – 2022

Team	Wins	Losses	Ties	Win (%)
New York Mets	31.0	12.0	0.0	72.1
Baltimore Orioles	28.0	15.0	0.0	65.1
Seattle Mariners	26.0	17.0	0.0	60.5
Chicago White Sox	25.0	18.0	0.0	58.1
Los Angeles Dodgers	25.0	18.0	0.0	58.1
Detroit Tigers	25.0	18.0	0.0	58.1
Cincinnati Reds	24.0	18.0	1.0	55.8
St. Louis Cardinals	23.0	20.0	0.0	53.5
New York Yankees	23.0	20.0	0.0	53.5
Toronto Blue Jays	23.0	20.0	0.0	53.5
San Francisco Giants	23.0	20.0	0.0	53.5
Houston Astros	23.0	20.0	0.0	53.5
Colorado Rockies	16.0	14.0	0.0	53.3
Tampa Bay Rays	13.0	12.0	0.0	52.0
Arizona Diamondbacks	13.0	12.0	0.0	52.0
Atlanta Braves	22.0	21.0	0.0	51.2
Milwaukee Brewers	22.0	20.0	1.0	51.2
Chicago Cubs	21.0	22.0	0.0	48.8
Pittsburgh Pirates	20.0	23.0	0.0	46.5
Los Angeles Angels	20.0	23.0	0.0	46.5
Boston Red Sox	20.0	23.0	0.0	46.5
Washington Nationals	20.0	23.0	0.0	46.5
Texas Rangers	19.0	24.0	0.0	44.2
Miami Marlins	12.0	18.0	0.0	40.0
Philadelphia Phillies	17.0	26.0	0.0	39.5
Oakland Athletics	17.0	26.0	0.0	39.5
San Diego Padres	17.0	26.0	0.0	39.5
Minnesota Twins	17.0	26.0	0.0	39.5
Cleveland Indians	16.0	27.0	0.0	37.2
Kansas City Royals	15.0	28.0	0.0	34.9

Code

wlcntteam_bycity = pd.merge(wlcnt, pctteams[['current_team_id', 'hist_franchise_id', 'city', 'state']], left_index=True, right_on='hist_franchise_id', how="left").groupby(['current_team_id','city'])[['W','L','T']].sum().reset_index()

wlcntteam_bycity.loc[wlcntteam_bycity['city'] == "Montreal", "current_team_id"] = "MON"

finaltable_exposfix = pd.merge(wlcntteam_bycity, pctteams, left_on=['current_team_id','city'], right_on=['hist_franchise_id','city'], how="left").assign(Team = lambda x: x['location'] + " " + x['nickname'], WnPct = lambda x: 100*round(x['W']/(x['W']+x['L']+x['T']), 3)).rename(columns={'W':'Wins','L':'Losses','T':'Ties', 'WnPct':'Win (%)'})[['Team','Wins','Losses','Ties', 'Win (%)']].sort_values("Win (%)", ascending=False)

HTML(finaltable_exposfix.to_html(index=False))

Table 2: Opening Day Records, 1980 – 2022, Expos and Nationals separated

Team	Wins	Losses	Ties	Win (%)
New York Mets	31.0	12.0	0.0	72.1
Baltimore Orioles	28.0	15.0	0.0	65.1
Seattle Mariners	26.0	17.0	0.0	60.5
Chicago White Sox	25.0	18.0	0.0	58.1
Los Angeles Dodgers	25.0	18.0	0.0	58.1
Detroit Tigers	25.0	18.0	0.0	58.1
Cincinnati Reds	24.0	18.0	1.0	55.8
Toronto Blue Jays	23.0	20.0	0.0	53.5
Houston Astros	23.0	20.0	0.0	53.5
St. Louis Cardinals	23.0	20.0	0.0	53.5
San Francisco Giants	23.0	20.0	0.0	53.5
New York Yankees	23.0	20.0	0.0	53.5
Colorado Rockies	16.0	14.0	0.0	53.3
Tampa Bay Rays	13.0	12.0	0.0	52.0
Arizona Diamondbacks	13.0	12.0	0.0	52.0
Atlanta Braves	22.0	21.0	0.0	51.2
Milwaukee Brewers	22.0	20.0	1.0	51.2
Chicago Cubs	21.0	22.0	0.0	48.8
Montreal Expos	12.0	13.0	0.0	48.0
Pittsburgh Pirates	20.0	23.0	0.0	46.5
Los Angeles Angels	20.0	23.0	0.0	46.5
Boston Red Sox	20.0	23.0	0.0	46.5
Washington Nationals	8.0	10.0	0.0	44.4
Texas Rangers	19.0	24.0	0.0	44.2
Miami Marlins	12.0	18.0	0.0	40.0
Philadelphia Phillies	17.0	26.0	0.0	39.5
San Diego Padres	17.0	26.0	0.0	39.5
Oakland Athletics	17.0	26.0	0.0	39.5
Minnesota Twins	17.0	26.0	0.0	39.5
Cleveland Indians	16.0	27.0	0.0	37.2
Kansas City Royals	15.0	28.0	0.0	34.9