2. Data Preparation and Analysis¶

  • This is the second notebook, which contains components 2 and 3 of the assignment.

  • I will load the saved dataset (raw NBA data) from Part 1 into appropriate data structures. I have chosen pandas DataFrames as the main data structure, with dictionaries for the intermediate data-parsing steps.

  • I will apply any data preprocessing steps that might be required to clean, filter or transform the dataset before analysis.

  • I will analyse, characterise, and summarise the cleaned dataset, using tables and visualisations where appropriate.

  • I would like to identify which features indicate better player productivity, offensively and defensively, as well as the features that detract from it.

  • Then I would like to find a way to visualise the top players and their stats, i.e. those making an overall strongly positive contribution to their respective teams, while excluding players who fall short in key defensive/offensive areas (turnovers, fouls, points, blocks, steals and assists) or in overall productivity features (plusMinus score, minutes played, etc.).

  • Load packages and libraries
In [1]:
import http.client
import json
from pathlib import Path
import pandas as pd
from scipy.stats import shapiro

from datetime import datetime, date

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

from IPython.display import HTML, Javascript, display

# !pip install plotly 
# !pip install cufflinks 
# !pip install chart_studio
# !pip install seaborn --upgrade
# !pip install print-versions # works for python >= 3.8
# !pip install pypalettes

from pypalettes import load_cmap
import seaborn as sns
import cufflinks as cf
import chart_studio.plotly as py
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.io as pio
from plotly.offline import init_notebook_mode
  • Basic notebook configurations for rendering plots
In [2]:
# magic function, renders matplotlib figures in notebook / inline
# %matplotlib inline 
%matplotlib 
pd.options.plotting.backend = "plotly"
# pio.renderers.default = 'notebook'
pio.renderers.default = "notebook_connected"
init_notebook_mode(connected=False)
cf.go_offline()
Using matplotlib backend: module://matplotlib_inline.backend_inline
In [3]:
import types

# list the names of every module currently imported into the notebook's global namespace
def imports():
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            yield val.__name__
list(imports())
Out[3]:
['builtins',
 'builtins',
 'json',
 'sys',
 'http',
 'pandas',
 'matplotlib',
 'matplotlib.pyplot',
 'matplotlib.ticker',
 'plotly.io',
 'seaborn',
 'cufflinks',
 'chart_studio.plotly',
 'plotly.express',
 'plotly.graph_objects',
 'plotly.figure_factory',
 'types']
In [4]:
from print_versions import print_versions

print_versions(globals())
json==2.0.9
ipykernel==6.28.0
pandas==2.2.2
scipy==1.13.1
matplotlib==3.9.2
pypalettes==0.1.4
seaborn==0.13.2
cufflinks==0.17.3
plotly==5.24.1
In [5]:
import sys
print(sys.executable)
print(sys.version)
print(sys.version_info)
/opt/anaconda3/bin/python
3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 08:28:27) [Clang 14.0.6 ]
sys.version_info(major=3, minor=12, micro=7, releaselevel='final', serial=0)
  • Set file read configurations
In [6]:
# directory for raw data storage
dir_raw = Path("raw")
# directory for storing clean pre-processed data
dir_data = Path("data")
# make sure it exists
dir_data.mkdir(parents=True, exist_ok=True)

Data Preprocessing¶

In [7]:
# init config, runtime variables and counters
teams = ["atlanta hawks", "boston celtics", "brooklyn nets", "charlotte hornets", "chicago bulls", 
         "cleveland cavaliers", "dallas mavericks", "denver nuggets", "detroit pistons", "golden state warriors", 
         "houston rockets", "indiana pacers", "la clippers", "los angeles lakers", "memphis grizzlies", "miami heat", 
         "milwaukee bucks", "minnesota timberwolves", "new orleans pelicans", "new york knicks", "oklahoma city thunder",
         "orlando magic", "philadelphia 76ers", "phoenix suns", "portland trail blazers", "sacramento kings", 
         "san antonio spurs", "toronto raptors", "utah jazz", "washington wizards"]
seasons = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]

player_files = {}
for team_name in teams:
    player_files[team_name] = []
    
stats_files = {}
for team_name in teams:
    for season in seasons:
        stats_files[team_name + ' ' + str(season)] = []

stats_rec_cnt = 0
player_indiv_cnt = 0

debug_mode = False
In [8]:
# iterate through raw data on disk
for fpath in dir_raw.iterdir():
    if fpath.suffix == ".json":
        with open(fpath, "r") as fin:
            jdata = fin.read()
            data = json.loads(jdata)
            
            # parse the filename
            parts = fpath.stem.split("_")
            
            # set season, team and file type
            team = ''
            season = ''
            data_type = parts[-1]
            if (len(parts) >= 5 and len(parts[3]) == 4):
                team = parts[0] + ' ' + parts[1] + ' ' + parts[2]
                season = parts[3]
            else:
                team = parts[0] + ' ' + parts[1]
                season = parts[2]
            
            # add data to dict before merging
            if (team in teams):
                number_records = len(data['response'])
                if debug_mode:
                    print("Reading data for the %s for season %s with data type %s with %s records" % (team, season, data_type, number_records))
                if (data_type == 'players'):   
                    player_files[team] = pd.DataFrame.from_dict(data["response"])
                    player_indiv_cnt += number_records
                elif (data_type == 'stats'):
                    stats_files[team + ' ' + str(season)] = pd.DataFrame.from_dict(data["response"])
                    stats_rec_cnt += number_records
                else:
                    print('Invalid data type: ', data_type)
                
print("Finished reading %s player statistics records" % (stats_rec_cnt))
print("Finished reading %s individual player records" % (player_indiv_cnt))
Finished reading 329856 player statistics records
Finished reading 8032 individual player records
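  • As a worked illustration of the file-name parsing above, assuming the raw files follow the <team>_<season>_<type>.json pattern implied by the code (the file name below is only an example, not necessarily one that exists on disk):
# illustrative sketch: how a hypothetical raw file name splits into team, season and data type
stem = "golden_state_warriors_2019_stats"   # e.g. raw/golden_state_warriors_2019_stats.json
parts = stem.split("_")
data_type = parts[-1]                        # 'stats' or 'players'
if len(parts) >= 5 and len(parts[3]) == 4:   # three-word team name followed by a 4-digit season
    team, season = " ".join(parts[:3]), parts[3]
else:                                        # two-word team name
    team, season = " ".join(parts[:2]), parts[2]
print(team, season, data_type)               # -> golden state warriors 2019 stats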

Data Parsing and Merging¶

  • Define function to get player age from DOB
In [9]:
# calculate player age from date of birth (DOB)
def get_player_age(birth_date):
    if (birth_date != '' and birth_date is not None):
        dob = datetime.strptime(birth_date, '%Y-%m-%d').date()
        today = date.today()
        return today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day))
    else:
        return None
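  • A quick sanity check of the helper (the dates below are illustrative examples only):
# illustrative example: age for a player born on 30 December 1988, relative to today's date
print(get_player_age('1988-12-30'))
print(get_player_age(None))  # a missing DOB returns None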
  • Concatenate all data from player statistics files
In [10]:
df_player_stats = pd.concat(stats_files)
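  • For reference, passing a dict to pd.concat turns the dict keys ('<team> <season>') into the outer level of a MultiIndex, which is what the season extraction further below relies on; a minimal standalone sketch with toy data:
# toy data only: dict keys become level 0 of the concatenated index
toy = {
    "boston celtics 2016": pd.DataFrame({"points": [12, 7]}),
    "utah jazz 2017": pd.DataFrame({"points": [20]}),
}
toy_df = pd.concat(toy).reset_index()
print(toy_df["level_0"].str.split(" ").str[-1])  # -> 2016, 2016, 2017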
  • Concatenate all data from individual player files
In [11]:
df_player_details = pd.concat(player_files)

Data Verification¶

  • Assert number of records loaded is as expected
In [12]:
print(df_player_stats.shape)
# 329856 is the expected total number of player statistics records (all 30 teams, seasons 2015-2024)
df_player_stats.shape[0] == 329856
(329856, 25)
Out[12]:
True
In [13]:
print(df_player_details.shape)
df_player_details.shape[0] == 866
(866, 10)
Out[13]:
True
  • 866 player-detail records is still high (possible duplicates), as there are only around 450 players in the NBA in a given season
  • The player details data is partly duplicated from season to season, so only the main information has been kept
  • Normalise the nested JSON fields and extract the features needed for merging and analysis
  • Drop the duplicated data and the nested JSON fields once they have been extracted
In [14]:
# Normalize nested JSON objects for player game statistics
df_player = pd.json_normalize(df_player_stats['player'])  # pull player_id and merge first/last name into player_name
df_team = pd.json_normalize(df_player_stats['team'])  # pull team_id, team_code and name (drop team name index)
df_game = pd.json_normalize(df_player_stats['game'])  # extract game ID

# Normalize nested JSON objects for player details
df_birth = pd.json_normalize(df_player_details['birth'])  # pull country and DOB
df_height = pd.json_normalize(df_player_details['height'])  # pull height in metres and merge feet/inches into 'feet'
df_leagues = pd.json_normalize(df_player_details['leagues'])  # pull standard.active, standard.jersey and standard.pos
df_nba = pd.json_normalize(df_player_details['nba'])  # pull pro and start (possible international-player insights)
df_weight = pd.json_normalize(df_player_details['weight'])  # pull kilograms and pounds
In [15]:
# Extract normalized data to player stats DF
df_player_stats = df_player_stats.reset_index(drop=False)

# level_0 holds the '<team> <season>' concatenation key; level_1 is the original per-file row index
del df_player_stats['level_1']
df_player_stats['season'] = df_player_stats['level_0'].apply(lambda x: x.split(' ')[-1])
del df_player_stats['level_0']

df_player_stats['player_id'] = df_player['id']
df_player_stats['player_name'] = df_player['firstname'] + ' ' + df_player['lastname']

del df_player_stats['player']

df_player_stats['team_code'] = df_team['code']
df_player_stats['team_name'] = df_team['name']
df_player_stats['team_id'] = df_team['id']

del df_player_stats['team']

df_player_stats['game_id'] = df_game['id']

del df_player_stats['game']

df_player_stats['min'] = pd.to_numeric(df_player_stats['min'], errors='coerce')
df_player_stats['plusMinus'] = pd.to_numeric(df_player_stats['plusMinus'], errors='coerce')
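  • errors='coerce' silently turns any value that cannot be parsed as a number into NaN, so it is worth checking how many values were affected; a quick check:
# count how many 'min' and 'plusMinus' values could not be parsed as numbers
print(df_player_stats[['min', 'plusMinus']].isna().sum())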
In [16]:
# Extract normalized data to player details DF
df_player_details = df_player_details.reset_index(drop=True)

df_player_details['country'] = df_birth['country']
df_player_details['DOB'] = df_birth['date']

df_player_details['age'] = df_player_details['DOB'].apply(get_player_age)
df_player_details['age'] = df_player_details['age'].fillna(0).astype('int')
del df_player_details['birth']

df_player_details['feet'] = df_height['feets'] + "'" + df_height['inches'] + '"'  # e.g. 6'11"
df_player_details['meters'] = pd.to_numeric(df_height['meters'], errors='coerce').round(2)
del df_player_details['height']

df_player_details['position'] = df_leagues['standard.pos']
df_player_details['active'] = df_leagues['standard.active']
df_player_details['jersey_number'] = df_leagues['standard.jersey']
del df_player_details['leagues']

df_player_details['years_pro'] = df_nba['pro']
df_player_details['rookie_year'] = df_nba['start']
del df_player_details['nba']

df_player_details['pounds'] = df_weight['pounds']
df_player_details['pounds'] = df_player_details['pounds'].astype('Int16')
del df_player_details['weight']
In [17]:
df_player_stats.shape # verify record count is the same and new column count
Out[17]:
(329856, 29)

Merge player performance statistics and individual player information¶

In [18]:
# merge data frames (player game statistics and player details)
df = pd.merge(df_player_stats, df_player_details, how='left', left_on='player_id', right_on='id')
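  • Since this is a left merge on player_id, any duplicate id in df_player_details would multiply the statistics rows; a quick sanity check, with one possible remedy sketched in the comments:
# a left merge preserves the row count only if 'id' is unique in df_player_details
print("stats rows:", df_player_stats.shape[0], "merged rows:", df.shape[0])
print("duplicate player ids in details:", df_player_details['id'].duplicated().sum())
# if the counts differ, one option (a sketch, not necessarily the right choice here) is:
# df_player_details = df_player_details.drop_duplicates(subset='id', keep='first')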
  • View features available for analysis
In [19]:
df.columns
Out[19]:
Index(['assists', 'blocks', 'comment', 'defReb', 'fga', 'fgm', 'fgp', 'fta',
       'ftm', 'ftp', 'min', 'offReb', 'pFouls', 'plusMinus', 'points', 'pos',
       'steals', 'totReb', 'tpa', 'tpm', 'tpp', 'turnovers', 'season',
       'player_id', 'player_name', 'team_code', 'team_name', 'team_id',
       'game_id', 'affiliation', 'college', 'firstname', 'id', 'lastname',
       'country', 'DOB', 'age', 'feet', 'meters', 'position', 'active',
       'jersey_number', 'years_pro', 'rookie_year', 'pounds'],
      dtype='object')
  • Set DataFrame index to season / year
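  • A minimal sketch of what indexing by season could look like, applied to a copy here since the CSV below is written as a flat table:
# sketch only: index the merged data by season without modifying df itself
df_by_season = df.set_index('season').sort_index()
df_by_season.head()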

Save to disk¶

In [20]:
# Save to CSV
df.to_csv('nba-stats-data.csv', index=False)
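  • A quick check that the saved file loads back as expected (same path as above):
# reload the saved CSV and confirm the shape matches the in-memory DataFrame
df_check = pd.read_csv('nba-stats-data.csv')
print(df_check.shape, df.shape)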