Modules for Import¶

In [1]:
import pandas as pd

# !pip install pandasai
# !pip install langchain_community
# !pip install langchain_openai

import os
from pandasai import Agent
from pandasai import SmartDataframe
from pandasai.llm import OpenAI
from pandasai.helpers.openai_info import get_openai_callback
import logging
import openai

from langchain_openai.chat_models import ChatOpenAI

# Create a custom logging configuration
logging.basicConfig(level=logging.CRITICAL)

Read NBA Stats Data CSV from disk¶

In [2]:
# Load your data into a pandas DataFrame
df = pd.read_csv('nba-stats-data.csv', low_memory=False)
# set season (year) as main DF index to allow for time series analysis of player data
df = df.reset_index().set_index('season')
df.index = pd.to_datetime(df.index, format='%Y').to_period('Y')
df = df.sort_index(axis=0)
df.head()
Out[2]:
index assists blocks comment defReb fga fgm fgp fta ftm ... DOB age feet meters position active jersey_number years_pro rookie_year pounds
season
2015 34 0.0 1.0 NaN 1.0 3.0 2.0 66.7 2.0 1.0 ... 1991-07-01 33.0 6"10 2.08 F-C True 33.0 8.0 2013.0 240.0
2015 35 0.0 1.0 NaN 1.0 3.0 2.0 66.7 2.0 1.0 ... 1991-07-01 33.0 6"10 2.08 F-C True 33.0 8.0 2013.0 240.0
2015 36 2.0 0.0 NaN 7.0 7.0 3.0 42.9 2.0 2.0 ... 1988-07-16 36.0 6"7 2.01 F True 1.0 9.0 2012.0 237.0
2015 37 0.0 0.0 NaN 3.0 4.0 0.0 0.0 0.0 0.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2015 38 1.0 1.0 NaN 5.0 5.0 3.0 60.0 6.0 2.0 ... 1992-03-22 33.0 7"3 2.21 C False 40.0 1.0 2015.0 265.0

5 rows × 45 columns

Analysis: Speak to the Data!¶

In [3]:
df.columns
Out[3]:
Index(['index', 'assists', 'blocks', 'comment', 'defReb', 'fga', 'fgm', 'fgp',
       'fta', 'ftm', 'ftp', 'min', 'offReb', 'pFouls', 'plusMinus', 'points',
       'pos', 'steals', 'totReb', 'tpa', 'tpm', 'tpp', 'turnovers',
       'player_id', 'player_name', 'team_code', 'team_name', 'team_id',
       'game_id', 'affiliation', 'college', 'firstname', 'id', 'lastname',
       'country', 'DOB', 'age', 'feet', 'meters', 'position', 'active',
       'jersey_number', 'years_pro', 'rookie_year', 'pounds'],
      dtype='object')

Set Open AI API Key¶

In [4]:
os.environ["OPENAI_API_KEY"] = "*******"

llm = OpenAI(model="gpt-4o", openai_api_key=openai.api_key)

Set Large Language Model & other DF confirgurations¶

In [5]:
df_nba = SmartDataframe(df, config={"llm": llm, "conversational": True, "verbose": True})
In [6]:
df_nba.chat("Which are the top 20 player_names by assists?")
Out[6]:
player_name assists
season
2017 Rajon Rondo 25.0
2018 Russell Westbrook 24.0
2020 Russell Westbrook 24.0
2018 Russell Westbrook 24.0
2020 Russell Westbrook 24.0
2023 Tyrese Haliburton 23.0
2016 Russell Westbrook 22.0
2016 Russell Westbrook 22.0
2024 Trae Young 22.0
2017 Rajon Rondo 21.0
2017 Russell Westbrook 21.0
2020 Russell Westbrook 21.0
2017 Russell Westbrook 21.0
2020 Russell Westbrook 21.0
2018 Russell Westbrook 21.0
2022 James Harden 21.0
2018 Russell Westbrook 21.0
2020 Russell Westbrook 21.0
2024 Elfrid Payton 21.0
2020 Russell Westbrook 21.0

Chat with (query data)¶

In [7]:
df_nba.chat("Which are the top 10 player_names by assists?")
Out[7]:
player_name assists
season
2017 Rajon Rondo 25.0
2018 Russell Westbrook 24.0
2020 Russell Westbrook 24.0
2018 Russell Westbrook 24.0
2020 Russell Westbrook 24.0
2023 Tyrese Haliburton 23.0
2016 Russell Westbrook 22.0
2016 Russell Westbrook 22.0
2024 Trae Young 22.0
2017 Rajon Rondo 21.0
In [8]:
df_nba.chat("Which are the top 5 player_names group by points?")
Out[8]:
player_name points
1167 Kevin Durant 35170.0
1669 Russell Westbrook 33398.0
1220 Kyrie Irving 29574.0
78 Andre Drummond 27846.0
1497 Nikola Vucevic 27172.0
In [9]:
df_nba.chat("Which are the top 5 player_names group by blocks?")
Out[9]:
player_name blocks
0 JaVale McGee 2648.0
1 Andre Drummond 2478.0
2 Mason Plumlee 1866.0
3 Kevin Durant 1640.0
4 Rudy Gobert 1586.0
In [10]:
df_nba.chat("Which are the top 5 player_names group by assists?")
Out[10]:
player_name assists
0 Russell Westbrook 13532.0
1 Kyle Lowry 9276.0
2 Eric Bledsoe 7404.0
3 James Harden 6952.0
4 Kyrie Irving 6658.0
In [11]:
df_nba.chat("Which are the top 5 distinct player_names by pounds?")
Out[11]:
player_name pounds
season
2015 Jusuf Nurkic 290.0
2015 Boban Marjanovic 290.0
2019 Zion Williamson 284.0
2015 Nikola Jokic 284.0
2015 Joel Embiid 280.0
In [12]:
df_nba.chat("Which are the top 5 distinct player_names by meters?")
Out[12]:
player_name meters
season
2017 Boban Marjanovic 2.21
2015 Edy Tavares 2.21
2017 Kristaps Porzingis 2.21
2021 Bol Bol 2.18
2021 Rudy Gobert 2.16
In [13]:
df_nba.chat("Which are the top 5 player_names group by plusMinus?")
Out[13]:
player_name plusMinus
1167 Kevin Durant 8420.0
1763 Stephen Curry 5453.0
1220 Kyrie Irving 5234.0
401 Danny Green 5004.0
1214 Kyle Lowry 4852.0
In [14]:
df_nba.chat("Which are the top 5 player_names by all stats excluding pFouls and turnovers?")
<string>:3: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
{'type': 'dataframe', 'value':            player_name  total_stats
season                             
2024     Jaylen Nowell    416102.93
2024     Jaylen Nowell    416101.93
2024     Jaylen Nowell    416100.93
2024    Kira Lewis Jr.    415965.85
2024     Jaylen Nowell    414590.93}
Out[14]:
player_name total_stats
season
2024 Jaylen Nowell 416102.93
2024 Jaylen Nowell 416101.93
2024 Jaylen Nowell 416100.93
2024 Kira Lewis Jr. 415965.85
2024 Jaylen Nowell 414590.93
In [15]:
df_nba.chat("Which are the best players by single game stats?")
<string>:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Out[15]:
player_name points assists totReb steals blocks composite_score
season
2022 Luka Doncic 60.0 10.0 21.0 2.0 1.0 106.2
2023 Joel Embiid 70.0 5.0 18.0 1.0 1.0 103.1
2022 Joel Embiid 59.0 8.0 11.0 1.0 7.0 100.2
2022 Donovan Mitchell 71.0 11.0 8.0 0.0 1.0 99.1
2017 James Harden 60.0 11.0 10.0 4.0 1.0 98.5
In [19]:
df_nba.chat("Plot points scored by LeBron James for each season, using different colors for each bar")
Out[19]:
'/Users/conorheffron/workspace/data-science-in-python/assignment-1/submission/v2/exports/charts/temp_chart.png'
<Figure size 1000x600 with 0 Axes>
No description has been provided for this image
No description has been provided for this image
In [20]:
df_nba.chat("Plot heatmap of Lebron James performance by season")
<string>:3: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
Out[20]:
'/Users/conorheffron/workspace/data-science-in-python/assignment-1/submission/v2/exports/charts/temp_chart.png'
No description has been provided for this image
In [18]:
df_nba.chat("Plot heatmap of the best 14 players by player_name and stats")
Out[18]:
'/Users/conorheffron/workspace/data-science-in-python/assignment-1/submission/v2/exports/charts/temp_chart.png'
No description has been provided for this image
In [ ]: