Python: Pandas
Utilizzo l’environment conda py3
1
~$ conda activate py3
Versione modulo installato
1
2
3
4
5
6
7
8
9
10
11
~$ pip show pandas
Name: pandas
Version: 1.0.5
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: None
Author-email: None
License: BSD
Location: /home/user/miniconda3/envs/py3/lib/python3.7/site-packages
Requires: pytz, python-dateutil, numpy
Required-by: seaborn
Pandas
Series
1
2
import numpy as np
import pandas as pd
1
2
3
4
labels = ['a', 'b', 'c'] # list
my_data = [10,20,30] # list
arr = np.array(my_data) # array
d = {'a':10,'b':20,'c':30} # dictionary
1
pd.Series(data = my_data)
1
2
3
4
0 10
1 20
2 30
dtype: int64
1
pd.Series(data = my_data, index=labels)
1
2
3
4
a 10
b 20
c 30
dtype: int64
1
pd.Series(my_data, labels)
1
2
3
4
a 10
b 20
c 30
dtype: int64
1
pd.Series(arr, labels) # con numpy array funziona come lista
1
2
3
4
a 10
b 20
c 30
dtype: int64
1
pd.Series(d) # le key come index e value come elementi
1
2
3
4
a 10
b 20
c 30
dtype: int64
1
pd.Series(data=[sum,print,len]) # un pandas series può contenere anche reference di funzioni
1
2
3
4
0 <built-in function sum>
1 <built-in function print>
2 <built-in function len>
dtype: object
1
2
3
# filtering
ser1 = pd.Series([1,2,3,4],['USA','Germany','USSR','Japan'])
ser1
1
2
3
4
5
USA 1
Germany 2
USSR 3
Japan 4
dtype: int64
1
2
ser2 = pd.Series([1,2,5,4],['USA','Germany','Italy','Japan'])
ser2
1
2
3
4
5
USA 1
Germany 2
Italy 5
Japan 4
dtype: int64
1
ser1['USA']
1
1
1
2
ser3 = pd.Series(data=labels)
ser3
1
2
3
4
0 a
1 b
2 c
dtype: object
1
ser3[0]
1
'a'
1
ser1 + ser2
1
2
3
4
5
6
Germany 4.0
Italy NaN
Japan 8.0
USA 2.0
USSR NaN
dtype: float64
DataFrames
1
2
3
import numpy as np
import pandas as pd
from numpy.random import randn
1
2
# set seed
np.random.seed(101)
1
2
df = pd.DataFrame(randn(5,4),['A','B','C','D','E'],['W','X','Y','Z'])
df
W | X | Y | Z | |
---|---|---|---|---|
A | 0.302665 | 1.693723 | -1.706086 | -1.159119 |
B | -0.134841 | 0.390528 | 0.166905 | 0.184502 |
C | 0.807706 | 0.072960 | 0.638787 | 0.329646 |
D | -0.497104 | -0.754070 | -0.943406 | 0.484752 |
E | -0.116773 | 1.901755 | 0.238127 | 1.996652 |
1
2
# dettagli data frame
df.info()
1
2
3
4
5
6
7
8
9
10
11
<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, A to E
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 W 5 non-null float64
1 X 5 non-null float64
2 Y 5 non-null float64
3 Z 5 non-null float64
dtypes: float64(4)
memory usage: 200.0+ bytes
Selection
1
2
3
4
5
6
# selezione colonna
df['W'] # una series
# df[df.columns[0]] # dalla posizione colonna
# df[df.columns[:-1]] # dalla posizione colonna, escludo l'ultima
# df.iloc[:,0] # dalla posizione colonna
# df.W # non conviene usarlo come comando perché si confonde con i methods
1
2
3
4
5
6
A 2.706850
B 0.651118
C -2.018168
D 0.188695
E 0.190794
Name: W, dtype: float64
1
type(df['W'])
1
pandas.core.series.Series
1
type(df)
1
pandas.core.frame.DataFrame
1
2
# selezione colonne
df[['W','Z']]
W | Z | |
---|---|---|
A | 2.706850 | 0.503826 |
B | 0.651118 | 0.605965 |
C | -2.018168 | -0.589001 |
D | 0.188695 | 0.955057 |
E | 0.190794 | 0.683509 |
1
2
3
# aggiungere colonne
df['new'] = df['W'] + df['Y']
df
W | X | Y | Z | new | |
---|---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 | 3.614819 |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 | -0.196959 |
C | -2.018168 | 0.740122 | 0.528813 | -0.589001 | -1.489355 |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 | -0.744542 |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 | 2.796762 |
1
2
3
# rimuovere colonne
df.drop('new', axis=1, inplace=True) # axis=0 è di default ed è la riga, inplace va a sostituire il df originale
df
W | X | Y | Z | |
---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
C | -2.018168 | 0.740122 | 0.528813 | -0.589001 |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
1
2
# rimuovere righe
df.drop('E')
W | X | Y | Z | |
---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
C | -2.018168 | 0.740122 | 0.528813 | -0.589001 |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 |
1
df.shape # è una tupla, 0 riguarda i record e 1 le colonne
1
(5, 4)
1
2
# selezione righe, label riga
df.loc['A'] # location, anche le righe sono pandas series
1
2
3
4
5
W 2.706850
X 0.628133
Y 0.907969
Z 0.503826
Name: A, dtype: float64
1
2
# selezione righe, posizione riga
df.iloc[0]
1
2
3
4
5
W 2.706850
X 0.628133
Y 0.907969
Z 0.503826
Name: A, dtype: float64
1
2
# selezione subset righe e colonne
df.loc['B','Y']
1
-0.8480769834036315
1
2
# selezione subset righe e colonne
df.loc[['A','B'],['W','Y']]
W | Y | |
---|---|---|
A | 2.706850 | 0.907969 |
B | 0.651118 | -0.848077 |
Conditional selection
1
df
W | X | Y | Z | |
---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
C | -2.018168 | 0.740122 | 0.528813 | -0.589001 |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
1
df[df > 0]
W | X | Y | Z | |
---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
B | 0.651118 | NaN | NaN | 0.605965 |
C | NaN | 0.740122 | 0.528813 | NaN |
D | 0.188695 | NaN | NaN | 0.955057 |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
1
2
# filtro df in base a condizioni singole colonna
df[df['W']>0]
W | X | Y | Z | |
---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
1
df[df['Z']<0]
W | X | Y | Z | |
---|---|---|---|---|
C | -2.018168 | 0.740122 | 0.528813 | -0.589001 |
1
df[df['W']>0][['Y','X']]
Y | X | |
---|---|---|
A | 0.907969 | 0.628133 |
B | -0.848077 | -0.319318 |
D | -0.933237 | -0.758872 |
E | 2.605967 | 1.978757 |
1
df
W | X | Y | Z | |
---|---|---|---|---|
A | 0.302665 | 1.693723 | -1.706086 | -1.159119 |
B | -0.134841 | 0.390528 | 0.166905 | 0.184502 |
C | 0.807706 | 0.072960 | 0.638787 | 0.329646 |
D | -0.497104 | -0.754070 | -0.943406 | 0.484752 |
E | -0.116773 | 1.901755 | 0.238127 | 1.996652 |
1
2
# filtro record che ha il massimo rispetto la colonna W
df.loc[[df['W'].idxmax()]]
W | X | Y | Z | |
---|---|---|---|---|
C | 0.807706 | 0.07296 | 0.638787 | 0.329646 |
1
2
3
# multiple conditions, and e or
# df[(df['W']>0) and (df['Y']>1)] # and e or funziona solo su scalari e non vettori o series
df[(df['W']>0) & (df['Y']>1)] # l'or con |
W | X | Y | Z | |
---|---|---|---|---|
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
1
2
# reset l'index
df.reset_index() # se si vuole sostituire l'originale basta inplace=True
index | W | X | Y | Z | |
---|---|---|---|---|---|
0 | A | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
1 | B | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
2 | C | -2.018168 | 0.740122 | 0.528813 | -0.589001 |
3 | D | 0.188695 | -0.758872 | -0.933237 | 0.955057 |
4 | E | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
1
2
3
# sostituzione dell'index
newind = 'CA NY WY OR CO'.split()
newind
1
['CA', 'NY', 'WY', 'OR', 'CO']
1
2
df['States'] = newind
df
W | X | Y | Z | States | |
---|---|---|---|---|---|
A | 2.706850 | 0.628133 | 0.907969 | 0.503826 | CA |
B | 0.651118 | -0.319318 | -0.848077 | 0.605965 | NY |
C | -2.018168 | 0.740122 | 0.528813 | -0.589001 | WY |
D | 0.188695 | -0.758872 | -0.933237 | 0.955057 | OR |
E | 0.190794 | 1.978757 | 2.605967 | 0.683509 | CO |
1
df.set_index('States')
W | X | Y | Z | |
---|---|---|---|---|
States | ||||
CA | 2.706850 | 0.628133 | 0.907969 | 0.503826 |
NY | 0.651118 | -0.319318 | -0.848077 | 0.605965 |
WY | -2.018168 | 0.740122 | 0.528813 | -0.589001 |
OR | 0.188695 | -0.758872 | -0.933237 | 0.955057 |
CO | 0.190794 | 1.978757 | 2.605967 | 0.683509 |
Multi-Index and Index Hierarchy
1
2
3
import numpy as np
import pandas as pd
from numpy.random import randn
1
2
3
4
5
# Index Levels
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]
hier_index = list(zip(outside,inside))
hier_index = pd.MultiIndex.from_tuples(hier_index)
1
list(zip(outside,inside))
1
[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]
1
hier_index
1
2
3
4
5
6
7
MultiIndex([('G1', 1),
('G1', 2),
('G1', 3),
('G2', 1),
('G2', 2),
('G2', 3)],
)
1
2
df = pd.DataFrame(randn(6,2),hier_index,['A','B'])
df
A | B | ||
---|---|---|---|
G1 | 1 | -1.094109 | 0.224428 |
2 | -1.927079 | 0.359976 | |
3 | 0.607172 | 0.892118 | |
G2 | 1 | 0.345103 | 0.023081 |
2 | -1.820372 | -1.061236 | |
3 | 0.983283 | 1.096904 |
1
df.loc['G1']
A | B | |
---|---|---|
1 | -1.094109 | 0.224428 |
2 | -1.927079 | 0.359976 |
3 | 0.607172 | 0.892118 |
1
df.loc['G1'].loc[1]
1
2
3
A -1.094109
B 0.224428
Name: 1, dtype: float64
1
df.index.names
1
FrozenList([None, None])
1
2
3
# imposto i names dei multilivelli
df.index.names = ['Groups', 'Num']
df
A | B | ||
---|---|---|---|
Groups | Num | ||
G1 | 1 | -1.094109 | 0.224428 |
2 | -1.927079 | 0.359976 | |
3 | 0.607172 | 0.892118 | |
G2 | 1 | 0.345103 | 0.023081 |
2 | -1.820372 | -1.061236 | |
3 | 0.983283 | 1.096904 |
1
df.loc['G2'].loc[2].iloc[1]
1
-1.0612359393184747
1
df.loc['G2'].loc[2]['B']
1
df.loc['G2'].loc[2][['B']][0]
1
-1.0612359393184747
1
2
# cross section
df.xs('G1')
A | B | |
---|---|---|
Num | ||
1 | -1.094109 | 0.224428 |
2 | -1.927079 | 0.359976 |
3 | 0.607172 | 0.892118 |
1
2
# cross section filtering
df.xs(1,level='Num')
A | B | |
---|---|---|
Groups | ||
G1 | -1.094109 | 0.224428 |
G2 | 0.345103 | 0.023081 |
Missing Data
1
2
import numpy as np
import pandas as pd
1
2
d = {'A':[1,2,np.nan],'B':[5,np.nan,np.nan],'C':[1,2,3]}
d
1
{'A': [1, 2, nan], 'B': [5, nan, nan], 'C': [1, 2, 3]}
1
2
df = pd.DataFrame(d)
df
A | B | C | |
---|---|---|---|
0 | 1.0 | 5.0 | 1 |
1 | 2.0 | NaN | 2 |
2 | NaN | NaN | 3 |
1
2
# dropping with missing
df.dropna() # axis = 0 default agisce sulle righe
A | B | C | |
---|---|---|---|
0 | 1.0 | 5.0 | 1 |
1
df.dropna(thresh=2) # tiene le righe con almeno 2 valori non-NaN (le altre vengono droppate)
A | B | C | |
---|---|---|---|
0 | 1.0 | 5.0 | 1 |
1 | 2.0 | NaN | 2 |
1
2
# sostituzione tutti i missing
df.fillna(value='FILL VALUE')
A | B | C | |
---|---|---|---|
0 | 1 | 5 | 1 |
1 | 2 | FILL VALUE | 2 |
2 | FILL VALUE | FILL VALUE | 3 |
1
2
# sostituzione missing colonna con media
df['A'].fillna(value=df['A'].mean())
1
2
3
4
0 1.0
1 2.0
2 1.5
Name: A, dtype: float64
Groupby
1
import pandas as pd
1
2
3
4
5
# Create dataframe
data = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
'Person':['Sam','Charlie','Amy','Vanessa','Carl','Sarah'],
'Sales':[200,120,340,124,243,350]}
data
1
2
3
{'Company': ['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
'Person': ['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
'Sales': [200, 120, 340, 124, 243, 350]}
1
2
df = pd.DataFrame(data)
df
Company | Person | Sales | |
---|---|---|---|
0 | GOOG | Sam | 200 |
1 | GOOG | Charlie | 120 |
2 | MSFT | Amy | 340 |
3 | MSFT | Vanessa | 124 |
4 | FB | Carl | 243 |
5 | FB | Sarah | 350 |
1
2
# groupby
df.groupby('Company').sum() # sum, count, mean, std, max ...
Sales | |
---|---|
Company | |
FB | 593 |
GOOG | 320 |
MSFT | 464 |
1
df.groupby('Company').sum().loc['FB']
1
2
Sales 593
Name: FB, dtype: int64
1
df.groupby('Company').max()
Person | Sales | |
---|---|---|
Company | ||
FB | Sarah | 350 |
GOOG | Sam | 200 |
MSFT | Vanessa | 340 |
1
df.groupby('Company').describe()
Sales | ||||||||
---|---|---|---|---|---|---|---|---|
count | mean | std | min | 25% | 50% | 75% | max | |
Company | ||||||||
FB | 2.0 | 296.5 | 75.660426 | 243.0 | 269.75 | 296.5 | 323.25 | 350.0 |
GOOG | 2.0 | 160.0 | 56.568542 | 120.0 | 140.00 | 160.0 | 180.00 | 200.0 |
MSFT | 2.0 | 232.0 | 152.735065 | 124.0 | 178.00 | 232.0 | 286.00 | 340.0 |
1
df.groupby('Company').describe().transpose()['FB']
1
2
3
4
5
6
7
8
9
Sales count 2.000000
mean 296.500000
std 75.660426
min 243.000000
25% 269.750000
50% 296.500000
75% 323.250000
max 350.000000
Name: FB, dtype: float64
Merging Joining and Concatenating
1
import pandas as pd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7']},
index=[4, 5, 6, 7])
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
'B': ['B8', 'B9', 'B10', 'B11'],
'C': ['C8', 'C9', 'C10', 'C11'],
'D': ['D8', 'D9', 'D10', 'D11']},
index=[8, 9, 10, 11])
1
df1
A | B | C | D | |
---|---|---|---|---|
0 | A0 | B0 | C0 | D0 |
1 | A1 | B1 | C1 | D1 |
2 | A2 | B2 | C2 | D2 |
3 | A3 | B3 | C3 | D3 |
1
df2
A | B | C | D | |
---|---|---|---|---|
4 | A4 | B4 | C4 | D4 |
5 | A5 | B5 | C5 | D5 |
6 | A6 | B6 | C6 | D6 |
7 | A7 | B7 | C7 | D7 |
1
df3
A | B | C | D | |
---|---|---|---|---|
8 | A8 | B8 | C8 | D8 |
9 | A9 | B9 | C9 | D9 |
10 | A10 | B10 | C10 | D10 |
11 | A11 | B11 | C11 | D11 |
1
2
# concatenation
pd.concat([df1,df2,df3]) # default axis=0 per riga
A | B | C | D | |
---|---|---|---|---|
0 | A0 | B0 | C0 | D0 |
1 | A1 | B1 | C1 | D1 |
2 | A2 | B2 | C2 | D2 |
3 | A3 | B3 | C3 | D3 |
4 | A4 | B4 | C4 | D4 |
5 | A5 | B5 | C5 | D5 |
6 | A6 | B6 | C6 | D6 |
7 | A7 | B7 | C7 | D7 |
8 | A8 | B8 | C8 | D8 |
9 | A9 | B9 | C9 | D9 |
10 | A10 | B10 | C10 | D10 |
11 | A11 | B11 | C11 | D11 |
1
2
# merging
# inner (intersect), outer (union), right, left
1
2
3
4
5
6
7
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
1
left
key | A | B | |
---|---|---|---|
0 | K0 | A0 | B0 |
1 | K1 | A1 | B1 |
2 | K2 | A2 | B2 |
3 | K3 | A3 | B3 |
1
right
key | C | D | |
---|---|---|---|
0 | K0 | C0 | D0 |
1 | K1 | C1 | D1 |
2 | K2 | C2 | D2 |
3 | K3 | C3 | D3 |
1
pd.merge(left,right,how='inner',on='key') # default how inner
key | A | B | C | D | |
---|---|---|---|---|---|
0 | K0 | A0 | B0 | C0 | D0 |
1 | K1 | A1 | B1 | C1 | D1 |
2 | K2 | A2 | B2 | C2 | D2 |
3 | K3 | A3 | B3 | C3 | D3 |
1
2
3
4
5
6
7
8
9
10
# più complicato
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
1
left
key1 | key2 | A | B | |
---|---|---|---|---|
0 | K0 | K0 | A0 | B0 |
1 | K0 | K1 | A1 | B1 |
2 | K1 | K0 | A2 | B2 |
3 | K2 | K1 | A3 | B3 |
1
right
key1 | key2 | C | D | |
---|---|---|---|---|
0 | K0 | K0 | C0 | D0 |
1 | K1 | K0 | C1 | D1 |
2 | K1 | K0 | C2 | D2 |
3 | K2 | K0 | C3 | D3 |
1
pd.merge(left, right, on=['key1', 'key2'])
key1 | key2 | A | B | C | D | |
---|---|---|---|---|---|---|
0 | K0 | K0 | A0 | B0 | C0 | D0 |
1 | K1 | K0 | A2 | B2 | C1 | D1 |
2 | K1 | K0 | A2 | B2 | C2 | D2 |
1
# join come il merge ma le key usate sono gli index delle tabelle (non le colonne)
1
2
3
4
5
6
7
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
1
left
A | B | |
---|---|---|
K0 | A0 | B0 |
K1 | A1 | B1 |
K2 | A2 | B2 |
1
right
C | D | |
---|---|---|
K0 | C0 | D0 |
K2 | C2 | D2 |
K3 | C3 | D3 |
1
left.join(right) # default sulla index key
A | B | C | D | |
---|---|---|---|---|
K0 | A0 | B0 | C0 | D0 |
K1 | A1 | B1 | NaN | NaN |
K2 | A2 | B2 | C2 | D2 |
Operations
1
import pandas as pd
1
2
df = pd.DataFrame({'col1':[1,2,3,4],'col2':[444,555,666,444],'col3':['abc','def','ghi','xyz']})
df.head()
col1 | col2 | col3 | |
---|---|---|---|
0 | 1 | 444 | abc |
1 | 2 | 555 | def |
2 | 3 | 666 | ghi |
3 | 4 | 444 | xyz |
1
df['col2']
1
2
3
4
5
0 444
1 555
2 666
3 444
Name: col2, dtype: int64
1
df[['col2']]
col2 | |
---|---|
0 | 444 |
1 | 555 |
2 | 666 |
3 | 444 |
1
df['col2'].unique()
1
array([444, 555, 666])
1
len(df['col2'].unique())
1
3
1
df['col2'].nunique()
1
3
1
2
# distribuzione di frequenza
df['col2'].value_counts()
1
2
3
4
444 2
555 1
666 1
Name: col2, dtype: int64
1
df.groupby('col2').count() # mette in mezzo altre variabili
col1 | col3 | |
---|---|---|
col2 | ||
444 | 2 | 2 |
555 | 1 | 1 |
666 | 1 | 1 |
1
2
# select data with conditional selection
df[(df['col1']>2) & (df['col2']==444)]
col1 | col2 | col3 | |
---|---|---|---|
3 | 4 | 444 | xyz |
1
2
def times2(x):
    """Return the input multiplied by two (element-wise on a Series)."""
    doubled = 2 * x
    return doubled
1
df['col1'].sum()
1
10
1
2
# apply broadcasts the function to each element of the column
df['col1'].apply(times2)
1
2
3
4
5
0 2
1 4
2 6
3 8
Name: col1, dtype: int64
1
2
# applico funzione vettoriale ad una colonna intera (importante!)
df['col1'].apply(lambda x: x*2)
1
2
3
4
5
0 2
1 4
2 6
3 8
Name: col1, dtype: int64
1
list(map(lambda x: x*2, df['col1']))
1
[2, 4, 6, 8]
1
df['col3'].apply(len)
1
2
3
4
5
0 3
1 3
2 3
3 3
Name: col3, dtype: int64
1
2
# removing columns
df.drop('col1',axis=1) # con inplace=True va a sostituire
col2 | col3 | |
---|---|---|
0 | 444 | abc |
1 | 555 | def |
2 | 666 | ghi |
3 | 444 | xyz |
1
2
# list of columns name
df.columns
1
Index(['col1', 'col2', 'col3'], dtype='object')
1
df.index
1
RangeIndex(start=0, stop=4, step=1)
1
2
# sorting and ordering, ordine sort
df.sort_values(by='col2')
col1 | col2 | col3 | |
---|---|---|---|
0 | 1 | 444 | abc |
3 | 4 | 444 | xyz |
1 | 2 | 555 | def |
2 | 3 | 666 | ghi |
1
df.sort_values(by='col2').reset_index().drop('index',axis=1)
col1 | col2 | col3 | |
---|---|---|---|
0 | 1 | 444 | abc |
1 | 4 | 444 | xyz |
2 | 2 | 555 | def |
3 | 3 | 666 | ghi |
1
2
# cerca missing nan
df.isnull()
1
2
3
4
5
6
7
8
# pivot table
data = {'A':['foo','foo','foo','bar','bar','bar'],
'B':['one','one','two','two','one','one'],
'C':['x','y','x','y','x','y'],
'D':[1,3,2,5,4,1]}
df = pd.DataFrame(data)
df
A | B | C | D | |
---|---|---|---|---|
0 | foo | one | x | 1 |
1 | foo | one | y | 3 |
2 | foo | two | x | 2 |
3 | bar | two | y | 5 |
4 | bar | one | x | 4 |
5 | bar | one | y | 1 |
1
2
# long to wide
df.pivot_table(values='D',index=['A','B'],columns='C')
C | x | y | |
---|---|---|---|
A | B | ||
bar | one | 4.0 | 1.0 |
two | NaN | 5.0 | |
foo | one | 1.0 | 3.0 |
two | 2.0 | NaN |
Data Input and Output
1
!conda install -y xlrd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
Collecting package metadata (current_repodata.json): done
Solving environment: done
## Package Plan ##
environment location: /home/user/miniconda3/envs/py3
added / updated specs:
- xlrd
The following packages will be downloaded:
package | build
---------------------------|-----------------
xlrd-1.2.0 | py37_0 175 KB
------------------------------------------------------------
Total: 175 KB
The following NEW packages will be INSTALLED:
xlrd pkgs/main/linux-64::xlrd-1.2.0-py37_0
Downloading and Extracting Packages
xlrd-1.2.0 | 175 KB | ##################################### | 100%
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
1
2
3
4
5
6
7
# conda install xlrd
# conda install lxml
# conda install html5lib
# conda install BeautifulSoup4
# conda install sqlalchemy
1
2
import numpy as np
import pandas as pd
1
pwd
1
'/media/user/Public/Python/Course 001'
CSV ed EXCEL
1
2
3
4
5
# import csv
pd.read_csv('example')
# import excel
pd.read_excel('Excel_Sample.xlsx',sheet_name='Sheet1')
1
2
3
4
5
# export csv
df.to_csv('example',index=False)
# export excel
df.to_excel('Excel_Sample.xlsx',sheet_name='Sheet1')
HTML
1
df = pd.read_html('http://www.fdic.gov/bank/individual/failed/banklist.html')
1
type(df)
1
list
1
df[0].head()
Bank Name | City | ST | CERT | Acquiring Institution | Closing Date | |
---|---|---|---|---|---|---|
0 | The First State Bank | Barboursville | WV | 14361 | MVB Bank, Inc. | April 3, 2020 |
1 | Ericson State Bank | Ericson | NE | 18265 | Farmers and Merchants Bank | February 14, 2020 |
2 | City National Bank of New Jersey | Newark | NJ | 21111 | Industrial Bank | November 1, 2019 |
3 | Resolute Bank | Maumee | OH | 58317 | Buckeye State Bank | October 25, 2019 |
4 | Louisa Community Bank | Louisa | KY | 58112 | Kentucky Farmers Bank Corporation | October 25, 2019 |
SQL
1
2
3
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
1
2
# sql light engine
engine = create_engine('sqlite:///:memory:')
1
2
df = pd.read_csv('example')
df
a | b | c | d | |
---|---|---|---|---|
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
2 | 8 | 9 | 10 | 11 |
3 | 12 | 13 | 14 | 15 |
1
df.to_sql('data', engine)
1
2
sql_df = pd.read_sql('data',con=engine)
sql_df
index | a | b | c | d | |
---|---|---|---|---|---|
0 | 0 | 0 | 1 | 2 | 3 |
1 | 1 | 4 | 5 | 6 | 7 |
2 | 2 | 8 | 9 | 10 | 11 |
3 | 3 | 12 | 13 | 14 | 15 |
Altro
1
2
3
4
5
df = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
'Person':['Sam','Charlie','Amy','Vanessa','Carl','Sarah'],
'Sales':[200,120,340,124,243,350]}
df = pd.DataFrame(df)
df
Company | Person | Sales | |
---|---|---|---|
0 | GOOG | Sam | 200 |
1 | GOOG | Charlie | 120 |
2 | MSFT | Amy | 340 |
3 | MSFT | Vanessa | 124 |
4 | FB | Carl | 243 |
5 | FB | Sarah | 350 |
1
df.info()
1
2
3
4
5
6
7
8
9
10
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Company 6 non-null object
1 Person 6 non-null object
2 Sales 6 non-null int64
dtypes: int64(1), object(2)
memory usage: 272.0+ bytes
1
2
3
4
# quanti iniziano con C (seguono codici che danno stessi risultati)
sum(df['Person'].apply(lambda x: x[0]=='C'))
df['Person'][df['Person'].apply(lambda x: x[0]=='C')].count()
len(df[df['Person'].apply(lambda x: x[0]=='C')].index)
1
2
3
# seleziono solo colonne numeriche
df.select_dtypes(include=np.number)
df.loc[:, df.dtypes == np.float64]