Python Course — Nontapat Thaiprayoon

03 Data Science

# pip install pandas
# pip install matplotlib
# pip install seaborn

Series

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Build a Series of blood-group labels; the index is the default 0..3.
series = pd.Series(data=["A", "B", "AB", "O"])
# Assigning to a label that is not yet in the index appends a new entry.
series.loc[4] = "O-"
print(series)
0     A
1     B
2    AB
3     O
4    O-
dtype: object
# Notebook echo: a bare expression at the end of a cell displays its value.
series
0     A
1     B
2    AB
3     O
4    O-
dtype: object

DataFrame

import pandas as pd
pd.__version__
'2.2.0'
import pandas as pd
# A flat list of numbers becomes a single-column DataFrame.
data = [1, 2, 3, 4, 5]
df1 = pd.DataFrame(data=data)
# display(df1)

# A list of [name, age] rows becomes a two-column DataFrame; the column
# labels are supplied explicitly.
data = [["Alice", 21], ["Bob", 22], ["Cathy", 23]]
df2 = pd.DataFrame(data=data, columns=["Name", "Age"])
df2
Name Age
0 Alice 21
1 Bob 22
2 Cathy 23
# Column-oriented input: each dict key becomes a column label and each
# Series supplies that column's values.
d = {
    "name": pd.Series(["Alice", "Bob", "Cathy", "Dave"]),
    "Age": pd.Series([21, 22, 23, 24]),
    "Score": pd.Series([80, 85, 90, 95]),
}

df3 = pd.DataFrame(data=d)
df3
name Age Score
0 Alice 21 80
1 Bob 22 85
2 Cathy 23 90
3 Dave 24 95
# Print a summary of df3: index range, columns, non-null counts, dtypes and memory usage.
df3.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    4 non-null      object
 1   Age     4 non-null      int64 
 2   Score   4 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 228.0+ bytes
# Column labels of df3, returned as a pandas Index.
df3.columns
Index(['name', 'Age', 'Score'], dtype='object')
# Iterating over a column yields its values one at a time.
for student_name in df3['name']:
    print(student_name)
Alice
Bob
Cathy
Dave
# Same iteration pattern, this time over the numeric Age column.
for age in df3['Age']:
    print(age)
21
22
23
24
# Display the full df3 table.
df3
name Age Score
0 Alice 21 80
1 Bob 22 85
2 Cathy 23 90
3 Dave 24 95
# Boolean mask: True for the rows whose Score is greater than 80.
high_score = df3['Score'] > 80
# Indexing with the mask keeps only the rows marked True.
df4 = df3[high_score]
df4
name Age Score
1 Bob 22 85
2 Cathy 23 90
3 Dave 24 95
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Remote CSV with the Titanic passenger data used in the following cells.
titanic_url = 'https://nontapatnon.github.io/python-course-master/datascience/Titanic-Dataset.csv'
df = pd.read_csv(titanic_url)
# Preview the first rows.
df.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# Print a summary of df; the non-null counts reveal which columns have missing values.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
# Column labels of df, returned as a pandas Index.
df.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
# Name and Sex of every row where Survived == 0.
# A single .loc call selects rows and columns together instead of chaining
# two [] lookups, so no intermediate DataFrame is built along the way.
df.loc[df['Survived'] == 0, ['Name', 'Sex']]
Name Sex
0 Braund, Mr. Owen Harris male
4 Allen, Mr. William Henry male
5 Moran, Mr. James male
6 McCarthy, Mr. Timothy J male
7 Palsson, Master. Gosta Leonard male
... ... ...
884 Sutehall, Mr. Henry Jr male
885 Rice, Mrs. William (Margaret Norton) female
886 Montvila, Rev. Juozas male
888 Johnston, Miss. Catherine Helen "Carrie" female
890 Dooley, Mr. Patrick male

549 rows × 2 columns

# Show the first five rows of df.
df.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# Show the first ten rows of df.
df.head(10)
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S
7 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 NaN S
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NaN C
# Show the last five rows of df.
df.tail()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.00 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.00 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.45 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.00 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.75 NaN Q
import matplotlib.pyplot as plt
# Number of rows for each distinct value of the Survived column.
df['Survived'].value_counts()
Survived
0    549
1    342
Name: count, dtype: int64
# Number of rows for every (Sex, Survived) combination.
df.groupby(['Sex','Survived'])[['Survived']].count()
Survived
Sex Survived
female 0 81
1 233
male 0 468
1 109
# Mean of the Survived column per Sex group, drawn as a bar chart.
survival_rate_by_sex = df.groupby('Sex')[['Survived']].mean()
survival_rate_by_sex.plot.bar()
plt.show()

png

import matplotlib.pyplot as plt
import seaborn as sns
# One figure with two side-by-side panels.
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(18, 8))
rate_ax, count_ax = ax

# Left panel: mean of Survived for each Sex group.
rate_ax.set_title('Survived vs Sex')
df.groupby('Sex')[['Survived']].mean().plot.bar(ax=rate_ax)

# Right panel: row counts per Sex, split by Survived value.
count_ax.set_title('Sex:Survived vs Dead')
sns.countplot(data=df, x='Sex', hue='Survived', ax=count_ax)
plt.show()

png

# Bar chart of the number of rows per Pclass value.
# The DataFrame is passed as data= so this call matches the keyword style of
# the other countplot calls in this notebook.
sns.countplot(data=df, x="Pclass")
<Axes: xlabel='Pclass', ylabel='count'>

png

# Count rows per Pclass, with hue="Survived" splitting each class by Survived value.
sns.countplot(data = df, x="Pclass", hue="Survived")
<Axes: xlabel='Pclass', ylabel='count'>

png

# Same grouping as the plain countplot, but stat="percent" puts percentages
# rather than raw counts on the y-axis.
sns.countplot(data = df , x="Pclass", hue="Survived", stat="percent")
<Axes: xlabel='Pclass', ylabel='percent'>

png

Time Series Data

# Remote CSV with one passenger count per (year, month).
flights_url = "https://nontapatnon.github.io/python-course-master/datascience/flight2.csv"
df_f = pd.read_csv(flights_url)
df_f
year month passengers
0 1949 Jan 112
1 1949 Feb 118
2 1949 Mar 132
3 1949 Apr 129
4 1949 May 121
... ... ... ...
139 1960 Aug 606
140 1960 Sep 508
141 1960 Oct 461
142 1960 Nov 390
143 1960 Dec 432

144 rows × 3 columns

# Keep only the May rows, then plot that month's passengers across the years.
may_only = "month == 'May'"
df_may = df_f.query(may_only)
sns.lineplot(x="year", y="passengers", data=df_may)
<Axes: xlabel='year', ylabel='passengers'>

png

# Reshape long -> wide: one row per year, one column per month, with the
# passenger counts as cell values.
passengers_by_year_month = df_f.set_index(["year", "month"])["passengers"]
df_wide = passengers_by_year_month.unstack("month")
df_wide.head()
month Apr Aug Dec Feb Jan Jul Jun Mar May Nov Oct Sep
year
1949 129 148 118 118 112 148 135 132 121 104 119 136
1950 135 170 140 126 115 170 149 141 125 114 133 158
1951 163 199 166 150 145 199 178 178 172 146 162 184
1952 181 242 194 180 171 230 218 193 183 172 191 209
1953 235 272 201 196 196 264 243 236 229 180 211 237
# Passing a single Series: its index (year) becomes the x-axis and its
# values (the May column) the y-axis.
sns.lineplot(data= df_wide["May"])
<Axes: xlabel='year', ylabel='May'>

png

# Passing the whole wide-form DataFrame: the index (year) is the x-axis.
# Presumably one line is drawn per month column — verify on the figure.
sns.lineplot(data = df_wide)
<Axes: xlabel='year'>

png

# Long-form data: every year appears on several rows (one per month).
# NOTE(review): seaborn's lineplot aggregates repeated x values (mean with an
# error band by default) — confirm against the seaborn docs.
sns.lineplot(data=df_f, x="year", y="passengers")
<Axes: xlabel='year', ylabel='passengers'>

png

# pip install plotly
import plotly.express as px
import plotly.io as pio
# Make 'iframe' the default plotly renderer for the figures shown below.
pio.renderers.default = 'iframe'

# Load the iris sample data that ships with plotly express.
# NOTE: this rebinds `df`, which held the Titanic data loaded earlier in this notebook.
df = px.data.iris()
df.head()
sepal_length sepal_width petal_length petal_width species species_id
0 5.1 3.5 1.4 0.2 setosa 1
1 4.9 3.0 1.4 0.2 setosa 1
2 4.7 3.2 1.3 0.2 setosa 1
3 4.6 3.1 1.5 0.2 setosa 1
4 5.0 3.6 1.4 0.2 setosa 1
# Scatter plot of sepal width vs. sepal length: colour encodes species,
# marker size encodes petal length, and petal width is added to the hover box.
fig = px.scatter(
    data_frame=df,
    x="sepal_width",
    y="sepal_length",
    color="species",
    size="petal_length",
    hover_data=["petal_width"],
)
fig.show()
import plotly.express as px
# Load the tips sample data that ships with plotly express (rebinds `df` again)
# and preview the first rows.
df = px.data.tips()
df.head()
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
# Display the whole tips table; the notebook shows the first and last rows.
df
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
... ... ... ... ... ... ... ...
239 29.03 5.92 Male No Sat Dinner 3
240 27.18 2.00 Female Yes Sat Dinner 2
241 22.67 2.00 Male Yes Sat Dinner 2
242 17.82 1.75 Male No Sat Dinner 2
243 18.78 3.00 Female No Thur Dinner 2

244 rows × 7 columns

import plotly.express as px
# Reload the tips sample data so this cell can run on its own.
df = px.data.tips()

# 2-D density heatmap of total_bill against tip; text_auto=True writes each
# cell's value inside the cell.
fig = px.density_heatmap(
    data_frame=df,
    x="total_bill",
    y="tip",
    text_auto=True,
)
fig.show()