2. Data Preparation and Analysis¶
This is the second notebook which contains componets 2 and 3 of the assignment.
I will load the saved dataset (raw NBA data) from Part 1 into appropriate data structures, I have chosen to use pandas data frames as the main data structure using dictionaries for intermediary data parsing steps.
I will apply any data preprocessing steps that might be required to clean, filter or transform the dataset before analysis.
I will Analyse, characterise, and summarise the cleaned dataset, using tables and visualisations where appropriate.
I would like to find what features that indicate better player productivity offensively and defensively as well as detractors.
Then I would like to find a way to visualise the top players and their stats for making an overall strongly positive contribution to their respective teams, excluding players that fail in key defensive/offensive areas (turnovers, fouls, points, blocks, steals and assists) or overall productivity features like (plusMinus score, minutes played etc).
- Load packages and libraries
import http.client
import json
from pathlib import Path
import pandas as pd
from scipy.stats import shapiro
from datetime import datetime, date
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import plotly.io as pio
from IPython.display import HTML, Javascript, display
# !pip install plotly
# !pip install cufflinks
# !pip install chart_studio
# !pip install seaborn --upgrade
# !pip install print-versions # works for python >= 3.8
# !pip install pypalettes
from pypalettes import load_cmap
import seaborn as sns
import cufflinks as cf
import chart_studio.plotly as py
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.io as pio
from plotly.offline import init_notebook_mode
- Basic notebook configurations for rendering plots
# magic function, renders matplotlib figures in notebook / inline
# %matplotlib inline
%matplotlib
pd.options.plotting.backend = "plotly"
# pio.renderers.default = 'notebook'
pio.renderers.default = "notebook_connected"
init_notebook_mode(connected=False)
cf.go_offline()
Using matplotlib backend: module://matplotlib_inline.backend_inline
import types
def imports():
for name, val in globals().items():
if isinstance(val, types.ModuleType):
yield val.__name__
list(imports())
['builtins', 'builtins', 'json', 'sys', 'http', 'pandas', 'matplotlib', 'matplotlib.pyplot', 'matplotlib.ticker', 'plotly.io', 'seaborn', 'cufflinks', 'chart_studio.plotly', 'plotly.express', 'plotly.graph_objects', 'plotly.figure_factory', 'types']
from print_versions import print_versions
print_versions(globals())
json==2.0.9 ipykernel==6.28.0 pandas==2.2.2 scipy==1.13.1 matplotlib==3.9.2 pypalettes==0.1.4 seaborn==0.13.2 cufflinks==0.17.3 plotly==5.24.1
import sys
print(sys.executable)
print(sys.version)
print(sys.version_info)
/opt/anaconda3/bin/python 3.12.7 | packaged by Anaconda, Inc. | (main, Oct 4 2024, 08:28:27) [Clang 14.0.6 ] sys.version_info(major=3, minor=12, micro=7, releaselevel='final', serial=0)
- Set file read configurations
# directory for raw data storage
dir_raw = Path("raw")
# directory for storing clean pre-processed data
dir_data = Path("data")
# make sure it exists
dir_data.mkdir(parents=True, exist_ok=True)
Data Preprocessing¶
# init config, runtime variables and counters
teams = ["atlanta hawks", "boston celtics", "brooklyn nets", "charlotte hornets", "chicago bulls",
"cleveland cavaliers", "dallas mavericks", "denver nuggets", "detroit pistons", "golden state warriors",
"houston rockets", "indiana pacers", "la clippers", "los angeles lakers", "memphis grizzlies", "miami heat",
"milwaukee bucks", "minnesota timberwolves", "new orleans pelicans", "new york knicks", "oklahoma city thunder",
"orlando magic", "philadelphia 76ers", "phoenix suns", "portland trail blazers", "sacramento kings",
"san antonio spurs", "toronto raptors", "utah jazz", "washington wizards"]
seasons = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
player_files = {}
for team_name in teams:
player_files[team_name] = []
stats_files = {}
for team_name in teams:
for season in seasons:
stats_files[team_name + ' ' + str(season)] = []
stats_rec_cnt = 0
player_indiv_cnt = 0
debug_mode = False
# iterate through raw data on disk
for fpath in dir_raw.iterdir():
if fpath.suffix == ".json":
with open(fpath, "r") as fin:
jdata = fin.read()
data = json.loads(jdata)
# parse the filename
parts = fpath.stem.split("_")
# set season, team and file type
team = ''
season = ''
data_type = parts[-1]
if (len(parts) >= 5 and len(parts[3]) == 4):
team = parts[0] + ' ' + parts[1] + ' ' + parts[2]
season = parts[3]
else:
team = parts[0] + ' ' + parts[1]
season = parts[2]
# add data to dict before merging
if (team in teams):
number_records = len(data['response'])
if debug_mode:
print("Reading data for the %s for season %s with data type %s with %s records" % (team, season, data_type, number_records))
if (data_type == 'players'):
player_files[team] = pd.DataFrame.from_dict(data["response"])
player_indiv_cnt += number_records
elif (data_type == 'stats'):
stats_files[team + ' ' + str(season)] = pd.DataFrame.from_dict(data["response"])
stats_rec_cnt += number_records
else:
print('Invalid data type: ', data_type)
print("Finished reading %s player statistics records" % (stats_rec_cnt))
print("Finished reading %s individual player records" % (player_indiv_cnt))
Finished reading 329856 player statistics records Finished reading 8032 individual player records
Data Parsing and Merging¶
- Define function to get player age from DOB
# calculate player age from date of birth (DOB)
def get_player_age(birth_date):
if (birth_date != '' and birth_date is not None):
dob = datetime.strptime(birth_date, '%Y-%m-%d').date()
today = date.today()
return today.year - dob.year - ((today.month, today.day) < (dob.month, dob.day))
else:
return None
- Concatenate all data from player statitics files
df_player_stats = pd.concat(stats_files)
- Concatenate all data from individual player files
df_player_details = pd.concat(player_files)
Data Verification¶
- Assert number of records loaded is as expected
print(df_player_stats.shape)
# 309697 (18553 for 6x teams 21-23) is total number of player stats records
df_player_stats.shape[0] == 329856
(329856, 25)
True
print(df_player_details.shape)
df_player_details.shape[0] == 866
(866, 10)
True
- 716 still high (possible dupes) as there are 450 ish players in the NBA per season
- Player details data in parts is duplicated from season to season so have kept the main information
- Normalise nested JSON fields and extract the features needed for merging and analysis
- Drop duplicate data and nested JSON fields when ready
# Normalize nested JSON objects for player game statistics
df_player = pd.json_normalize(df_player_stats['player'])# pull player_id and merge first/last name for player_name
df_team = pd.json_normalize(df_player_stats['team']) # pull team_id, team_code and name (drop team name index)
df_game = pd.json_normalize(df_player_stats['game']) # Extract game ID
# Normalize nested JSON objects for player details
df_birth = pd.json_normalize(df_player_details['birth'])# pull country and DOB
df_height = pd.json_normalize(df_player_details['height'])# pull height in mteres and merge feets/inches into ft_height
df_leagues = pd.json_normalize(df_player_details['leagues'])# pull standard.active standard.jersey standard.pos
df_nba = pd.json_normalize(df_player_details['nba'])# pull pro and start for international player insights ?
df_weight = pd.json_normalize(df_player_details['weight'])# pull kilograms and pounds
# Extract normalized data to player stats DF
df_player_stats = df_player_stats.reset_index(drop=False)
del df_player_stats['level_1']
df_player_stats['season'] = df_player_stats['level_0'].apply(lambda x: x.split(' ')[-1])
del df_player_stats['level_0']
df_player_stats['player_id'] = df_player['id']
df_player_stats['player_name'] = df_player['firstname'] + ' ' + df_player['lastname']
del df_player_stats['player']
df_player_stats['team_code'] = df_team['code']
df_player_stats['team_name'] = df_team['name']
df_player_stats['team_id'] = df_team['id']
del df_player_stats['team']
df_player_stats['game_id'] = df_game['id']
del df_player_stats['game']
df_player_stats['min'] = pd.to_numeric(df_player_stats['min'], errors='coerce')
df_player_stats['plusMinus'] = pd.to_numeric(df_player_stats['plusMinus'], errors='coerce')
# Extract normalized data to player details DF
df_player_details = df_player_details.reset_index(drop=True)
df_player_details['country'] = df_birth['country']
df_player_details['DOB'] = df_birth['date']
df_player_details['age'] = df_player_details['DOB'].apply(get_player_age)
df_player_details['age'] = df_player_details['age'].fillna(0).astype('int')
del df_player_details['birth']
df_player_details['feet'] = df_height['feets'] + '\"' + df_height['inches']
df_player_details['meters'] = df_height['meters']
df_player_details['meters'] = pd.to_numeric(df_player_details['meters'], errors='coerce')
df_player_details['meters'] = df_player_details['meters'].apply(lambda x: round(float(x), 2) if x != None else x)
del df_player_details['height']
df_player_details['position'] = df_leagues['standard.pos']
df_player_details['active'] = df_leagues['standard.active']
df_player_details['jersey_number'] = df_leagues['standard.jersey']
del df_player_details['leagues']
df_player_details['years_pro'] = df_nba['pro']
df_player_details['rookie_year'] = df_nba['start']
del df_player_details['nba']
df_player_details['pounds'] = df_weight['pounds']
df_player_details['pounds'] = df_player_details['pounds'].astype('Int16')
del df_player_details['weight']
df_player_stats.shape # verify record count is the same and new column count
(329856, 29)
Merge player performance statistics and individual player information¶
# merge data frames (player game statistics and player details)
df = pd.merge(df_player_stats, df_player_details, how='left', left_on='player_id', right_on='id')
- View features available for analysis
df.columns
Index(['assists', 'blocks', 'comment', 'defReb', 'fga', 'fgm', 'fgp', 'fta', 'ftm', 'ftp', 'min', 'offReb', 'pFouls', 'plusMinus', 'points', 'pos', 'steals', 'totReb', 'tpa', 'tpm', 'tpp', 'turnovers', 'season', 'player_id', 'player_name', 'team_code', 'team_name', 'team_id', 'game_id', 'affiliation', 'college', 'firstname', 'id', 'lastname', 'country', 'DOB', 'age', 'feet', 'meters', 'position', 'active', 'jersey_number', 'years_pro', 'rookie_year', 'pounds'], dtype='object')
- Set DataFrame index to season / year
Download to disk¶
# Save to CSV
df.to_csv('nba-stats-data.csv', index=False)