2. Data Preparation and Analysis¶

  • This is the second notebook, which contains components 2 and 3 of the assignment.

  • I will load the saved dataset (raw NBA data) from Part 1 into appropriate data structures. I have chosen pandas DataFrames as the main data structure, with dictionaries for the intermediate data-parsing steps.

  • I will apply any data preprocessing steps that might be required to clean, filter or transform the dataset before analysis.

  • I will analyse, characterise, and summarise the cleaned dataset, using tables and visualisations where appropriate.

  • I would like to identify which features indicate better player productivity, offensively and defensively, as well as the features that detract from it.

  • Then I would like to find a way to visualise the top players and their stats, i.e. those making an overall strongly positive contribution to their respective teams, while excluding players who fall short in key defensive/offensive areas (turnovers, fouls, points, blocks, steals and assists) or in overall productivity features (plusMinus score, minutes played, etc.).

  • Load packages and libraries
In [1]:
import http.client
import json
from pathlib import Path
import pandas as pd
from scipy.stats import shapiro

from datetime import datetime, date

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from IPython.display import HTML, Javascript, display

# !pip install plotly 
# !pip install cufflinks 
# !pip install chart_studio
# !pip install seaborn --upgrade
# !pip install print-versions # works for python >= 3.8
# !pip install pypalettes

from pypalettes import load_cmap
import seaborn as sns
import cufflinks as cf
import chart_studio.plotly as py
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.io as pio
from plotly.offline import init_notebook_mode
  • Basic notebook configurations for rendering plots
In [2]:
# magic function, renders matplotlib figures in notebook / inline
# %matplotlib inline 
%matplotlib 
pd.options.plotting.backend = "plotly"
# pio.renderers.default = 'notebook'
pio.renderers.default = "notebook_connected"
init_notebook_mode(connected=False)
cf.go_offline()
Using matplotlib backend: module://matplotlib_inline.backend_inline
In [3]:
import types

# list the names of every module currently imported into the notebook's global namespace
def imports():
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            yield val.__name__
list(imports())
Out[3]:
['builtins',
 'builtins',
 'json',
 'sys',
 'http',
 'pandas',
 'matplotlib',
 'matplotlib.pyplot',
 'matplotlib.ticker',
 'plotly.io',
 'seaborn',
 'cufflinks',
 'chart_studio.plotly',
 'plotly.express',
 'plotly.graph_objects',
 'plotly.figure_factory',
 'types']
In [4]:
from print_versions import print_versions

print_versions(globals())
json==2.0.9
ipykernel==6.28.0
pandas==2.2.2
scipy==1.13.1
matplotlib==3.9.2
pypalettes==0.1.4
seaborn==0.13.2
cufflinks==0.17.3
plotly==5.24.1
In [5]:
import sys
print(sys.executable)
print(sys.version)
print(sys.version_info)
/opt/anaconda3/bin/python
3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:28:27) [Clang 14.0.6 ]
sys.version_info(major=3, minor=12, micro=7, releaselevel='final', serial=0)
  • Set file read configurations
In [6]:
# directory for raw data storage
dir_raw = Path("raw")
# directory for storing clean pre-processed data
dir_data = Path("data")
# make sure it exists
dir_data.mkdir(parents=True, exist_ok=True)

Data Preprocessing¶

In [7]:
# init config, runtime variables and counters
teams = ["atlanta hawks", "boston celtics", "brooklyn nets", "charlotte hornets", "chicago bulls", 
         "cleveland cavaliers", "dallas mavericks", "denver nuggets", "detroit pistons", "golden state warriors", 
         "houston rockets", "indiana pacers", "la clippers", "los angeles lakers", "memphis grizzlies", "miami heat", 
         "milwaukee bucks", "minnesota timberwolves", "new orleans pelicans", "new york knicks", "oklahoma city thunder",
         "orlando magic", "philadelphia 76ers", "phoenix suns", "portland trail blazers", "sacramento kings", 
         "san antonio spurs", "toronto raptors", "utah jazz", "washington wizards"]
seasons = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]

player_files = {}
for team_name in teams:
    player_files[team_name] = []
    
stats_files = {}
for team_name in teams:
    for season in seasons:
        stats_files[team_name + ' ' + str(season)] = []

stats_rec_cnt = 0
player_indiv_cnt = 0

debug_mode = False
In [8]:
# iterate through raw data on disk
for fpath in dir_raw.iterdir():
    if fpath.suffix == ".json":
        with open(fpath, "r") as fin:
            jdata = fin.read()
            data = json.loads(jdata)
            
            # parse the filename
            parts = fpath.stem.split("_")
            
            # set season, team and file type
            team = ''
            season = ''
            data_type = parts[-1]
            if (len(parts) >= 5 and len(parts[3]) == 4):
                team = parts[0] + ' ' + parts[1] + ' ' + parts[2]
                season = parts[3]
            else:
                team = parts[0] + ' ' + parts[1]
                season = parts[2]
            
            # add data to dict before merging
            if (team in teams):
                number_records = len(data['response'])
                if debug_mode:
                    print("Reading data for the %s for season %s with data type %s with %s records" % (team, season, data_type, number_records))
                if (data_type == 'players'):   
                    player_files[team] = pd.DataFrame.from_dict(data["response"])
                    player_indiv_cnt += number_records
                elif (data_type == 'stats'):
                    stats_files[team + ' ' + str(season)] = pd.DataFrame.from_dict(data["response"])
                    stats_rec_cnt += number_records
                else:
                    print('Invalid data type: ', data_type)
                
print("Finished reading %s player statistics records" % (stats_rec_cnt))
print("Finished reading %s individual player records" % (player_indiv_cnt))
Finished reading 329856 player statistics records
Finished reading 8032 individual player records
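  • As a worked illustration of the file-name parsing above, assuming the raw files follow the <team>_<season>_<type>.json pattern implied by the code (the file name below is only an example, not necessarily one that exists on disk):
# illustrative sketch: how a hypothetical raw file name splits into team, season and data type
stem = "golden_state_warriors_2019_stats"   # e.g. raw/golden_state_warriors_2019_stats.json
parts = stem.split("_")
data_type = parts[-1]                        # 'stats' or 'players'
if len(parts) >= 5 and len(parts[3]) == 4:   # three-word team name followed by a 4-digit season
    team, season = " ".join(parts[:3]), parts[3]
else:                                        # two-word team name
    team, season = " ".join(parts[:2]), parts[2]
print(team, season, data_type)               # -> golden state warriors 2019 stats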

Data Parsing and Merging¶

  • Define function to get player age from DOB
In [9]:
# calculate player age from date of birth (DOB)
def get_player_age(birth_date):
    if (birth_date != '' and birth_date is not None):
        dob = datetime.strptime(birth_date, '%Y-%m-%d').date()
        today = date.today()
        return today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day))
    else:
        return None
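  • A quick sanity check of the helper (the dates below are illustrative examples only):
# illustrative example: age for a player born on 30 December 1988, relative to today's date
print(get_player_age('1988-12-30'))
print(get_player_age(None))  # a missing DOB returns None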
  • Concatenate all data from player statistics files
In [10]:
df_player_stats = pd.concat(stats_files)
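  • For reference, passing a dict to pd.concat turns the dict keys ('<team> <season>') into the outer level of a MultiIndex, which is what the season extraction further below relies on; a minimal standalone sketch with toy data:
# toy data only: dict keys become level 0 of the concatenated index
toy = {
    "boston celtics 2016": pd.DataFrame({"points": [12, 7]}),
    "utah jazz 2017": pd.DataFrame({"points": [20]}),
}
toy_df = pd.concat(toy).reset_index()
print(toy_df["level_0"].str.split(" ").str[-1])  # -> 2016, 2016, 2017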
  • Concatenate all data from individual player files
In [11]:
df_player_details = pd.concat(player_files)

Data Verification¶

  • Assert number of records loaded is as expected
In [12]:
print(df_player_stats.shape)
# 329856 is the expected total number of player statistics records (all 30 teams, seasons 2015-2024)
df_player_stats.shape[0] == 329856
(329856, 25)
Out[12]:
True
In [13]:
print(df_player_details.shape)
df_player_details.shape[0] == 866
(866, 10)
Out[13]:
True
  • 866 player-detail records is still high (possible duplicates), as there are only around 450 players in the NBA in a given season
  • The player details data is partly duplicated from season to season, so only the main information has been kept
  • Normalise the nested JSON fields and extract the features needed for merging and analysis
  • Drop the duplicated data and the nested JSON fields once they have been extracted
In [14]:
# Normalize nested JSON objects for player game statistics
df_player = pd.json_normalize(df_player_stats['player'])  # pull player_id and merge first/last name into player_name
df_team = pd.json_normalize(df_player_stats['team'])  # pull team_id, team_code and name (drop team name index)
df_game = pd.json_normalize(df_player_stats['game'])  # extract game ID

# Normalize nested JSON objects for player details
df_birth = pd.json_normalize(df_player_details['birth'])  # pull country and DOB
df_height = pd.json_normalize(df_player_details['height'])  # pull height in metres and merge feet/inches into 'feet'
df_leagues = pd.json_normalize(df_player_details['leagues'])  # pull standard.active, standard.jersey and standard.pos
df_nba = pd.json_normalize(df_player_details['nba'])  # pull pro and start (possible international-player insights)
df_weight = pd.json_normalize(df_player_details['weight'])  # pull kilograms and pounds
In [15]:
# Extract normalized data to player stats DF
df_player_stats = df_player_stats.reset_index(drop=False)

# level_0 holds the '<team> <season>' concatenation key; level_1 is the original per-file row index
del df_player_stats['level_1']
df_player_stats['season'] = df_player_stats['level_0'].apply(lambda x: x.split(' ')[-1])
del df_player_stats['level_0']

df_player_stats['player_id'] = df_player['id']
df_player_stats['player_name'] = df_player['firstname'] + ' ' + df_player['lastname']

del df_player_stats['player']

df_player_stats['team_code'] = df_team['code']
df_player_stats['team_name'] = df_team['name']
df_player_stats['team_id'] = df_team['id']

del df_player_stats['team']

df_player_stats['game_id'] = df_game['id']

del df_player_stats['game']

df_player_stats['min'] = pd.to_numeric(df_player_stats['min'], errors='coerce')
df_player_stats['plusMinus'] = pd.to_numeric(df_player_stats['plusMinus'], errors='coerce')
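  • errors='coerce' silently turns any value that cannot be parsed as a number into NaN, so it is worth checking how many values were affected; a quick check:
# count how many 'min' and 'plusMinus' values could not be parsed as numbers
print(df_player_stats[['min', 'plusMinus']].isna().sum())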
In [16]:
# Extract normalized data to player details DF
df_player_details = df_player_details.reset_index(drop=True)

df_player_details['country'] = df_birth['country']
df_player_details['DOB'] = df_birth['date']

df_player_details['age'] = df_player_details['DOB'].apply(get_player_age)
df_player_details['age'] = df_player_details['age'].fillna(0).astype('int')
del df_player_details['birth']

df_player_details['feet'] = df_height['feets'] + "'" + df_height['inches'] + '"'  # e.g. 6'11"
df_player_details['meters'] = pd.to_numeric(df_height['meters'], errors='coerce').round(2)
del df_player_details['height']

df_player_details['position'] = df_leagues['standard.pos']
df_player_details['active'] = df_leagues['standard.active']
df_player_details['jersey_number'] = df_leagues['standard.jersey']
del df_player_details['leagues']

df_player_details['years_pro'] = df_nba['pro']
df_player_details['rookie_year'] = df_nba['start']
del df_player_details['nba']

df_player_details['pounds'] = df_weight['pounds']
df_player_details['pounds'] = df_player_details['pounds'].astype('Int16')
del df_player_details['weight']
In [17]:
df_player_stats.shape # verify record count is the same and new column count
Out[17]:
(329856, 29)

Merge player performance statistics and individual player information¶

In [18]:
# merge data frames (player game statistics and player details)
df = pd.merge(df_player_stats, df_player_details, how='left', left_on='player_id', right_on='id')
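  • Since this is a left merge on player_id, any duplicate id in df_player_details would multiply the statistics rows; a quick sanity check, with one possible remedy sketched in the comments:
# a left merge preserves the row count only if 'id' is unique in df_player_details
print("stats rows:", df_player_stats.shape[0], "merged rows:", df.shape[0])
print("duplicate player ids in details:", df_player_details['id'].duplicated().sum())
# if the counts differ, one option (a sketch, not necessarily the right choice here) is:
# df_player_details = df_player_details.drop_duplicates(subset='id', keep='first')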
  • View features available for analysis
In [19]:
df.columns
Out[19]:
Index(['assists', 'blocks', 'comment', 'defReb', 'fga', 'fgm', 'fgp', 'fta',
       'ftm', 'ftp', 'min', 'offReb', 'pFouls', 'plusMinus', 'points', 'pos',
       'steals', 'totReb', 'tpa', 'tpm', 'tpp', 'turnovers', 'season',
       'player_id', 'player_name', 'team_code', 'team_name', 'team_id',
       'game_id', 'affiliation', 'college', 'firstname', 'id', 'lastname',
       'country', 'DOB', 'age', 'feet', 'meters', 'position', 'active',
       'jersey_number', 'years_pro', 'rookie_year', 'pounds'],
      dtype='object')
  • Set DataFrame index to season / year
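  • A minimal sketch of what indexing by season could look like, applied to a copy here since the CSV below is written as a flat table:
# sketch only: index the merged data by season without modifying df itself
df_by_season = df.set_index('season').sort_index()
df_by_season.head()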

Save to disk¶

In [20]:
# Save to CSV
df.to_csv('nba-stats-data.csv', index=False)
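  • A quick check that the saved file loads back as expected (same path as above):
# reload the saved CSV and confirm the shape matches the in-memory DataFrame
df_check = pd.read_csv('nba-stats-data.csv')
print(df_check.shape, df.shape)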