Modules for Import¶
In [1]:
import pandas as pd
!pip install pandasai
# !pip install langchain_community
# !pip install langchain_openai
import os
# from pandasai import Agent
from pandasai import SmartDataframe
from pandasai.llm import OpenAI
# from pandasai.helpers.openai_info import get_openai_callback
import logging
import openai
# from langchain_openai.chat_models import ChatOpenAI
# Create a custom logging configuration
logging.basicConfig(level=logging.CRITICAL)
Requirement already satisfied: pandasai in /opt/anaconda3/lib/python3.12/site-packages (2.4.2) Requirement already satisfied: astor<0.9.0,>=0.8.1 in /opt/anaconda3/lib/python3.12/site-packages (from pandasai) (0.8.1) Requirement already satisfied: duckdb<2.0.0,>=1.0.0 in /opt/anaconda3/lib/python3.12/site-packages (from pandasai) (1.2.2) Requirement already satisfied: faker<20.0.0,>=19.12.0 in /opt/anaconda3/lib/python3.12/site-packages (from pandasai) (19.13.0) Requirement already satisfied: jinja2<4.0.0,>=3.1.3 in /opt/anaconda3/lib/python3.12/site-packages (from pandasai) (3.1.4) Requirement already satisfied: matplotlib<4.0.0,>=3.7.1 in /opt/anaconda3/lib/python3.12/site-packages (from pandasai) (3.9.2) Requirement already satisfied: openai<2 in /opt/anaconda3/lib/python3.12/site-packages (from pandasai) (1.75.0) Requirement already satisfied: pandas==1.5.3 in /opt/anaconda3/lib/python3.12/site-packages (from pandasai) (1.5.3) Requirement already satisfied: pillow<11.0.0,>=10.1.0 in /opt/anaconda3/lib/python3.12/site-packages (from pandasai) (10.4.0) Requirement already satisfied: pydantic<3,>=1 in /opt/anaconda3/lib/python3.12/site-packages (from pandasai) (2.11.3) Requirement already satisfied: python-dotenv<2.0.0,>=1.0.0 in /opt/anaconda3/lib/python3.12/site-packages (from pandasai) (1.1.0) Requirement already satisfied: requests<3.0.0,>=2.31.0 in /opt/anaconda3/lib/python3.12/site-packages (from pandasai) (2.32.3) Requirement already satisfied: scipy<2.0.0,>=1.9.0 in /opt/anaconda3/lib/python3.12/site-packages (from pandasai) (1.13.1) Requirement already satisfied: sqlalchemy<3,>=1.4 in /opt/anaconda3/lib/python3.12/site-packages (from pandasai) (2.0.34) Requirement already satisfied: sqlglot<26.0.0,>=25.0.3 in /opt/anaconda3/lib/python3.12/site-packages (from sqlglot[rs]<26.0.0,>=25.0.3->pandasai) (25.34.1) Requirement already satisfied: torch==2.2.0 in /opt/anaconda3/lib/python3.12/site-packages (from pandasai) (2.2.0) Requirement already satisfied: numpy>=1.21.0 in /opt/anaconda3/lib/python3.12/site-packages (from pandas==1.5.3->pandasai) (1.26.4) Requirement already satisfied: python-dateutil>=2.8.1 in /opt/anaconda3/lib/python3.12/site-packages (from pandas==1.5.3->pandasai) (2.9.0.post0) Requirement already satisfied: pytz>=2020.1 in /opt/anaconda3/lib/python3.12/site-packages (from pandas==1.5.3->pandasai) (2024.1) Requirement already satisfied: filelock in /opt/anaconda3/lib/python3.12/site-packages (from torch==2.2.0->pandasai) (3.13.1) Requirement already satisfied: typing-extensions>=4.8.0 in /opt/anaconda3/lib/python3.12/site-packages (from torch==2.2.0->pandasai) (4.13.2) Requirement already satisfied: sympy in /opt/anaconda3/lib/python3.12/site-packages (from torch==2.2.0->pandasai) (1.13.2) Requirement already satisfied: networkx in /opt/anaconda3/lib/python3.12/site-packages (from torch==2.2.0->pandasai) (3.3) Requirement already satisfied: fsspec in /opt/anaconda3/lib/python3.12/site-packages (from torch==2.2.0->pandasai) (2024.6.1) Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.12/site-packages (from jinja2<4.0.0,>=3.1.3->pandasai) (2.1.3) Requirement already satisfied: contourpy>=1.0.1 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib<4.0.0,>=3.7.1->pandasai) (1.2.0) Requirement already satisfied: cycler>=0.10 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib<4.0.0,>=3.7.1->pandasai) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib<4.0.0,>=3.7.1->pandasai) (4.51.0) Requirement already satisfied: kiwisolver>=1.3.1 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib<4.0.0,>=3.7.1->pandasai) (1.4.4) Requirement already satisfied: packaging>=20.0 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib<4.0.0,>=3.7.1->pandasai) (24.1) Requirement already satisfied: pyparsing>=2.3.1 in /opt/anaconda3/lib/python3.12/site-packages (from matplotlib<4.0.0,>=3.7.1->pandasai) (3.1.2) Requirement already satisfied: anyio<5,>=3.5.0 in /opt/anaconda3/lib/python3.12/site-packages (from openai<2->pandasai) (4.2.0) Requirement already satisfied: distro<2,>=1.7.0 in /opt/anaconda3/lib/python3.12/site-packages (from openai<2->pandasai) (1.9.0) Requirement already satisfied: httpx<1,>=0.23.0 in /opt/anaconda3/lib/python3.12/site-packages (from openai<2->pandasai) (0.27.0) Requirement already satisfied: jiter<1,>=0.4.0 in /opt/anaconda3/lib/python3.12/site-packages (from openai<2->pandasai) (0.9.0) Requirement already satisfied: sniffio in /opt/anaconda3/lib/python3.12/site-packages (from openai<2->pandasai) (1.3.0) Requirement already satisfied: tqdm>4 in /opt/anaconda3/lib/python3.12/site-packages (from openai<2->pandasai) (4.66.5) Requirement already satisfied: annotated-types>=0.6.0 in /opt/anaconda3/lib/python3.12/site-packages (from pydantic<3,>=1->pandasai) (0.6.0) Requirement already satisfied: pydantic-core==2.33.1 in /opt/anaconda3/lib/python3.12/site-packages (from pydantic<3,>=1->pandasai) (2.33.1) Requirement already satisfied: typing-inspection>=0.4.0 in /opt/anaconda3/lib/python3.12/site-packages (from pydantic<3,>=1->pandasai) (0.4.0) Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.12/site-packages (from requests<3.0.0,>=2.31.0->pandasai) (3.3.2) Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.12/site-packages (from requests<3.0.0,>=2.31.0->pandasai) (3.7) Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.12/site-packages (from requests<3.0.0,>=2.31.0->pandasai) (2.2.3) Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.12/site-packages (from requests<3.0.0,>=2.31.0->pandasai) (2025.1.31) Requirement already satisfied: greenlet!=0.4.17 in /opt/anaconda3/lib/python3.12/site-packages (from sqlalchemy<3,>=1.4->pandasai) (3.0.1) Requirement already satisfied: sqlglotrs==0.3.0 in /opt/anaconda3/lib/python3.12/site-packages (from sqlglot[rs]<26.0.0,>=25.0.3->pandasai) (0.3.0) Requirement already satisfied: httpcore==1.* in /opt/anaconda3/lib/python3.12/site-packages (from httpx<1,>=0.23.0->openai<2->pandasai) (1.0.2) Requirement already satisfied: h11<0.15,>=0.13 in /opt/anaconda3/lib/python3.12/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai<2->pandasai) (0.14.0) Requirement already satisfied: six>=1.5 in /opt/anaconda3/lib/python3.12/site-packages (from python-dateutil>=2.8.1->pandas==1.5.3->pandasai) (1.16.0) Requirement already satisfied: mpmath<1.4,>=1.1.0 in /opt/anaconda3/lib/python3.12/site-packages (from sympy->torch==2.2.0->pandasai) (1.3.0)
Read NBA Stats Data CSV from disk¶
In [2]:
# Load your data into a pandas DataFrame
df = pd.read_csv('nba-stats-data.csv', low_memory=False)
# set season (year) as main DF index to allow for time series analysis of player data
df = df.reset_index().set_index('season')
df.index = pd.to_datetime(df.index, format='%Y').to_period('Y')
df = df.sort_index(axis=0)
df.head()
Out[2]:
index | assists | blocks | comment | defReb | fga | fgm | fgp | fta | ftm | ... | DOB | age | feet | meters | position | active | jersey_number | years_pro | rookie_year | pounds | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
season | |||||||||||||||||||||
2015 | 34 | 0.0 | 1.0 | NaN | 1.0 | 3.0 | 2.0 | 66.7 | 2.0 | 1.0 | ... | 1991-07-01 | 33.0 | 6"10 | 2.08 | F-C | True | 33.0 | 8.0 | 2013.0 | 240.0 |
2015 | 35 | 0.0 | 1.0 | NaN | 1.0 | 3.0 | 2.0 | 66.7 | 2.0 | 1.0 | ... | 1991-07-01 | 33.0 | 6"10 | 2.08 | F-C | True | 33.0 | 8.0 | 2013.0 | 240.0 |
2015 | 36 | 2.0 | 0.0 | NaN | 7.0 | 7.0 | 3.0 | 42.9 | 2.0 | 2.0 | ... | 1988-07-16 | 36.0 | 6"7 | 2.01 | F | True | 1.0 | 9.0 | 2012.0 | 237.0 |
2015 | 37 | 0.0 | 0.0 | NaN | 3.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2015 | 38 | 1.0 | 1.0 | NaN | 5.0 | 5.0 | 3.0 | 60.0 | 6.0 | 2.0 | ... | 1992-03-22 | 33.0 | 7"3 | 2.21 | C | False | 40.0 | 1.0 | 2015.0 | 265.0 |
5 rows × 45 columns
Analysis: Speak to the Data!¶
In [3]:
df.columns
Out[3]:
Index(['index', 'assists', 'blocks', 'comment', 'defReb', 'fga', 'fgm', 'fgp', 'fta', 'ftm', 'ftp', 'min', 'offReb', 'pFouls', 'plusMinus', 'points', 'pos', 'steals', 'totReb', 'tpa', 'tpm', 'tpp', 'turnovers', 'player_id', 'player_name', 'team_code', 'team_name', 'team_id', 'game_id', 'affiliation', 'college', 'firstname', 'id', 'lastname', 'country', 'DOB', 'age', 'feet', 'meters', 'position', 'active', 'jersey_number', 'years_pro', 'rookie_year', 'pounds'], dtype='object')
Set Open AI API Key¶
In [4]:
os.environ["OPENAI_API_KEY"] = "*******"
llm = OpenAI(openai_api_key=openai.api_key)# Default is BambooLLM
Set Large Language Model & other DF confirgurations¶
In [5]:
df_nba = SmartDataframe(df, config={"llm": llm, "conversational": True, "verbose": True})
Chat with (query data)¶
In [6]:
df_nba.chat("Which are the top 10 player_names by assists?")
player_name assists season 2017 Rajon Rondo 25.0 2020 Russell Westbrook 24.0 2018 Russell Westbrook 24.0 2020 Russell Westbrook 24.0 2018 Russell Westbrook 24.0 2023 Tyrese Haliburton 23.0 2016 Russell Westbrook 22.0 2024 Trae Young 22.0 2016 Russell Westbrook 22.0 2022 James Harden 21.0
Out[6]:
player_name | assists | |
---|---|---|
season | ||
2017 | Rajon Rondo | 25.0 |
2020 | Russell Westbrook | 24.0 |
2018 | Russell Westbrook | 24.0 |
2020 | Russell Westbrook | 24.0 |
2018 | Russell Westbrook | 24.0 |
2023 | Tyrese Haliburton | 23.0 |
2016 | Russell Westbrook | 22.0 |
2024 | Trae Young | 22.0 |
2016 | Russell Westbrook | 22.0 |
2022 | James Harden | 21.0 |
In [7]:
df_nba.chat("Which are the top 5 player_names by points?")
{'type': 'dataframe', 'value': player_name points season 2023 Luka Doncic 73.0 2022 Damian Lillard 71.0 2022 Donovan Mitchell 71.0 2016 Devin Booker 70.0 2023 Joel Embiid 70.0}
Out[7]:
player_name | points | |
---|---|---|
season | ||
2023 | Luka Doncic | 73.0 |
2022 | Damian Lillard | 71.0 |
2022 | Donovan Mitchell | 71.0 |
2016 | Devin Booker | 70.0 |
2023 | Joel Embiid | 70.0 |
In [8]:
df_nba.chat("Which are the top 5 player_names group by blocks?")
Out[8]:
player_name | blocks | |
---|---|---|
766 | JaVale McGee | 2648.0 |
78 | Andre Drummond | 2478.0 |
1356 | Mason Plumlee | 1866.0 |
1167 | Kevin Durant | 1640.0 |
1667 | Rudy Gobert | 1586.0 |
In [9]:
df_nba.chat("Which are the top 5 player_names group by assists?")
Out[9]:
player_name | assists | |
---|---|---|
1669 | Russell Westbrook | 13532.0 |
1214 | Kyle Lowry | 9276.0 |
596 | Eric Bledsoe | 7404.0 |
846 | James Harden | 6952.0 |
1220 | Kyrie Irving | 6658.0 |
In [10]:
df_nba.chat("Which are the top 5 distinct player_names by pounds?")
Out[10]:
player_name | pounds | |
---|---|---|
24268 | Jusuf Nurkic | 290.0 |
40609 | Boban Marjanovic | 290.0 |
24267 | Nikola Jokic | 284.0 |
216367 | Zion Williamson | 284.0 |
25525 | Joel Embiid | 280.0 |
In [11]:
df_nba.chat("Which are the top 5 distinct player_names by meters?")
Out[11]:
player_name | meters | |
---|---|---|
season | ||
2015 | Edy Tavares | 2.21 |
2015 | Kristaps Porzingis | 2.21 |
2015 | Boban Marjanovic | 2.21 |
2019 | Bol Bol | 2.18 |
2015 | Rudy Gobert | 2.16 |
In [12]:
df_nba.chat("Which are the top 5 player_names group by plusMinus?")
Out[12]:
player_name | plusMinus | |
---|---|---|
0 | Kevin Durant | 8420.0 |
1 | Stephen Curry | 5453.0 |
2 | Kyrie Irving | 5234.0 |
3 | Danny Green | 5004.0 |
4 | Kyle Lowry | 4852.0 |
In [13]:
df_nba.chat("Which are the top 5 player_names by all stats excluding pFouls and turnovers?")
{'type': 'dataframe', 'value': index assists blocks defReb fga fgm fgp \ player_name Norman Powell 706349115 3078.0 543.0 4728.0 19740.0 9177.0 91376.1 Alec Burks 497168280 3495.0 405.0 5481.0 15078.0 6201.0 68439.9 Pascal Siakam 486231160 4826.0 792.0 7044.0 18754.0 9278.0 68422.8 Evan Fournier 485077443 4860.0 342.0 4206.0 21393.0 9360.0 74368.5 JaVale McGee 483537388 1216.0 2648.0 7460.0 11980.0 7416.0 146170.4 fta ftm ftp ... player_id team_id game_id \ player_name ... Norman Powell 5439.0 4506.0 110602.5 ... 998634.0 72555 15541269 Alec Burks 4986.0 4122.0 101819.4 ... 162540.0 49170 14186610 Pascal Siakam 5542.0 4190.0 82276.6 ... 697424.0 50820 11585508 Evan Fournier 4521.0 3639.0 99586.8 ... 330813.0 44814 11155605 JaVale McGee 3408.0 2140.0 79749.6 ... 1036408.0 44384 18544508 id age meters jersey_number years_pro \ player_name Norman Powell 998634.0 71331.0 4371.90 55224.0 13806.0 Alec Burks 162540.0 63855.0 3831.30 9675.0 19350.0 Pascal Siakam 697424.0 45136.0 2955.68 62608.0 7280.0 Evan Fournier 330813.0 59808.0 3700.62 24297.0 16821.0 JaVale McGee 1036408.0 108632.0 6253.68 0.0 38168.0 rookie_year pounds player_name Norman Powell 4636515.0 494715.0 Alec Burks 3891285.0 414090.0 Pascal Siakam 2935296.0 334880.0 Evan Fournier 3760428.0 383145.0 JaVale McGee 5895488.0 792720.0 [5 rows x 29 columns]}
Out[13]:
index | assists | blocks | defReb | fga | fgm | fgp | fta | ftm | ftp | ... | player_id | team_id | game_id | id | age | meters | jersey_number | years_pro | rookie_year | pounds | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
player_name | |||||||||||||||||||||
Norman Powell | 706349115 | 3078.0 | 543.0 | 4728.0 | 19740.0 | 9177.0 | 91376.1 | 5439.0 | 4506.0 | 110602.5 | ... | 998634.0 | 72555 | 15541269 | 998634.0 | 71331.0 | 4371.90 | 55224.0 | 13806.0 | 4636515.0 | 494715.0 |
Alec Burks | 497168280 | 3495.0 | 405.0 | 5481.0 | 15078.0 | 6201.0 | 68439.9 | 4986.0 | 4122.0 | 101819.4 | ... | 162540.0 | 49170 | 14186610 | 162540.0 | 63855.0 | 3831.30 | 9675.0 | 19350.0 | 3891285.0 | 414090.0 |
Pascal Siakam | 486231160 | 4826.0 | 792.0 | 7044.0 | 18754.0 | 9278.0 | 68422.8 | 5542.0 | 4190.0 | 82276.6 | ... | 697424.0 | 50820 | 11585508 | 697424.0 | 45136.0 | 2955.68 | 62608.0 | 7280.0 | 2935296.0 | 334880.0 |
Evan Fournier | 485077443 | 4860.0 | 342.0 | 4206.0 | 21393.0 | 9360.0 | 74368.5 | 4521.0 | 3639.0 | 99586.8 | ... | 330813.0 | 44814 | 11155605 | 330813.0 | 59808.0 | 3700.62 | 24297.0 | 16821.0 | 3760428.0 | 383145.0 |
JaVale McGee | 483537388 | 1216.0 | 2648.0 | 7460.0 | 11980.0 | 7416.0 | 146170.4 | 3408.0 | 2140.0 | 79749.6 | ... | 1036408.0 | 44384 | 18544508 | 1036408.0 | 108632.0 | 6253.68 | 0.0 | 38168.0 | 5895488.0 | 792720.0 |
5 rows × 29 columns
In [14]:
df_nba.chat("Which are the best players by single game stats?")
Out[14]:
player_name | points | assists | blocks | |
---|---|---|---|---|
0 | Luka Doncic | 73.0 | NaN | NaN |
1 | Donovan Mitchell | 71.0 | NaN | NaN |
2 | Damian Lillard | 71.0 | NaN | NaN |
3 | Devin Booker | 70.0 | NaN | NaN |
4 | Joel Embiid | 70.0 | NaN | NaN |
5 | Rajon Rondo | NaN | 25.0 | NaN |
6 | Russell Westbrook | NaN | 24.0 | NaN |
7 | Russell Westbrook | NaN | 24.0 | NaN |
8 | Russell Westbrook | NaN | 24.0 | NaN |
9 | Russell Westbrook | NaN | 24.0 | NaN |
10 | Hassan Whiteside | NaN | NaN | 11.0 |
11 | Hassan Whiteside | NaN | NaN | 10.0 |
12 | Hassan Whiteside | NaN | NaN | 10.0 |
13 | Hassan Whiteside | NaN | NaN | 10.0 |
14 | Anthony Davis | NaN | NaN | 10.0 |
In [15]:
df_nba.chat("Plot points scored by LeBron James showing for each season, using different colors for each bar")
Traceback (most recent call last): File "/opt/anaconda3/lib/python3.12/site-packages/pandasai/pipelines/chat/generate_chat_pipeline.py", line 335, in run ).run(input) ^^^^^^^^^^ File "/opt/anaconda3/lib/python3.12/site-packages/pandasai/pipelines/pipeline.py", line 137, in run raise e File "/opt/anaconda3/lib/python3.12/site-packages/pandasai/pipelines/pipeline.py", line 101, in run step_output = logic.execute( ^^^^^^^^^^^^^^ File "/opt/anaconda3/lib/python3.12/site-packages/pandasai/pipelines/chat/code_execution.py", line 113, in execute raise e File "/opt/anaconda3/lib/python3.12/site-packages/pandasai/pipelines/chat/code_execution.py", line 85, in execute result = self.execute_code(code_to_run, code_context) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/anaconda3/lib/python3.12/site-packages/pandasai/pipelines/chat/code_execution.py", line 174, in execute_code exec(code, environment) File "<string>", line 12, in <module> File "/opt/anaconda3/lib/python3.12/site-packages/pandasai/safe_libs/restricted_matplotlib.py", line 75, in __getattr__ raise AttributeError(f"'{name}' is not allowed in RestrictedMatplotlib") AttributeError: 'subplots_adjust' is not allowed in RestrictedMatplotlib
Out[15]:
"Unfortunately, I was not able to answer your question, because of the following error:\n\n'subplots_adjust' is not allowed in RestrictedMatplotlib\n"
<Figure size 1000x600 with 0 Axes>
In [16]:
df_nba.chat("Plot heatmap of the best 7 player_names")
Out[16]:
'/Users/conorheffron/workspace/data-science-in-python/assignment-1/submission/v2/exports/charts/temp_chart.png'
In [ ]: