pandas advanced operations
import pandas as pd
import numpy as np
from pandas import DataFrame
Replace operation
- The replace operation can be applied to both a Series and a DataFrame.
- Single-value replacement
  - Normal replacement: replace every matching element: to_replace=15, value='e'
  - Column-specific single-value replacement: to_replace={column label: value to replace}, value='new value'
- Multi-value replacement
  - List replacement: to_replace=[...], value=[...]
  - Dictionary replacement (recommended): to_replace={old_value: new_value, old_value: new_value} (illustrated in the sketch below)
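As a quick, hedged illustration of the forms listed above (the values 10/15/20 and the replacement strings are assumptions chosen only for demonstration):

#Sketch of the replace forms on a small Series (illustrative only)
s = pd.Series([10, 15, 20, 15])
s.replace(to_replace=15, value='e')               #single-value replacement
s.replace(to_replace=[10, 20], value=['a', 'b'])  #list replacement: values are paired element-wise
s.replace(to_replace={10: 'a', 20: 'b'})          #dictionary replacement (recommended)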
df = DataFrame(data=np.random.randint(0,20,size=(8,5)))
df
|   | 0  | 1  | 2  | 3  | 4  |
|---|----|----|----|----|----|
| 0 | 17 | 0  | 17 | 11 | 6  |
| 1 | 8  | 3  | 7  | 7  | 3  |
| 2 | 9  | 13 | 4  | 19 | 3  |
| 3 | 12 | 3  | 0  | 10 | 16 |
| 4 | 8  | 5  | 5  | 4  | 14 |
| 5 | 14 | 18 | 2  | 1  | 3  |
| 6 | 14 | 6  | 8  | 15 | 5  |
| 7 | 2  | 7  | 7  | 18 | 11 |
#Global replacement
df.replace(to_replace=1,value='one')
|   | 0  | 1  | 2  | 3   | 4  |
|---|----|----|----|-----|----|
| 0 | 17 | 0  | 17 | 11  | 6  |
| 1 | 8  | 3  | 7  | 7   | 3  |
| 2 | 9  | 13 | 4  | 19  | 3  |
| 3 | 12 | 3  | 0  | 10  | 16 |
| 4 | 8  | 5  | 5  | 4   | 14 |
| 5 | 14 | 18 | 2  | one | 3  |
| 6 | 14 | 6  | 8  | 15  | 5  |
| 7 | 2  | 7  | 7  | 18  | 11 |
#Multi-value replacement: replace several values at once, each mapped to its own replacement
df.replace(to_replace={1:'one',3:'three'})
|   | 0  | 1     | 2  | 3   | 4     |
|---|----|-------|----|-----|-------|
| 0 | 17 | 0     | 17 | 11  | 6     |
| 1 | 8  | three | 7  | 7   | three |
| 2 | 9  | 13    | 4  | 19  | three |
| 3 | 12 | three | 0  | 10  | 16    |
| 4 | 8  | 5     | 5  | 4   | 14    |
| 5 | 14 | 18    | 2  | one | three |
| 6 | 14 | 6     | 8  | 15  | 5     |
| 7 | 2  | 7     | 7  | 18  | 11    |
#Column-specific replacement: the dictionary key is the column label and the dictionary value is the element to replace within that column (note: df was regenerated before this cell, so the numbers differ from the earlier output)
df.replace(to_replace={3:17},value='one')
|   | 0  | 1  | 2  | 3   | 4  |
|---|----|----|----|-----|----|
| 0 | 13 | 10 | 14 | 2   | 17 |
| 1 | 17 | 2  | 4  | one | 9  |
| 2 | 2  | 3  | 10 | one | 13 |
| 3 | 17 | 15 | 19 | 13  | 15 |
| 4 | 10 | 5  | 17 | 7   | 4  |
| 5 | 3  | 2  | 0  | 0   | 13 |
| 6 | 6  | 8  | 0  | 16  | 1  |
| 7 | 15 | 13 | 10 | 6   | 18 |
Mapping operation
- Concept: build a mapping relationship (a lookup table) that binds each value to a specific label or string, i.e. provides an alternative representation for each element value.
- Example: create a df with two columns, name and salary, then map each name to its corresponding English name.
dic = {
'name':['Zhang San','Li Si','Wang Wu','Zhang San'],
'salary':[5000,6000,5500,5000]
}
df = DataFrame(data=dic)
df
|   | name      | salary |
|---|-----------|--------|
| 0 | Zhang San | 5000   |
| 1 | Li Si     | 6000   |
| 2 | Wang Wu   | 5500   |
| 3 | Zhang San | 5000   |
#Mapping relation table
dic = {
'Li Si':'Jerry',
'Wang Wu':'Jack',
'Zhang San':'Tom'
}
e_name = df['name'].map(dic)
df['e_name'] = e_name
df
|   | name      | salary | e_name |
|---|-----------|--------|--------|
| 0 | Zhang San | 5000   | Tom    |
| 1 | Li Si     | 6000   | Jerry  |
| 2 | Wang Wu   | 5500   | Jack   |
| 3 | Zhang San | 5000   | Tom    |
- map is a Series method and can only be called on a Series
Computing tools
- Demand: the portion of each salary above 3000 is taxed at 50%; calculate everyone's after-tax salary
#The part of the salary above 3000 is taxed at 50%
def after_sal(s):
    return s - (s - 3000) * 0.5
after_salary = df['salary'].map(after_sal)
df['after_salary'] = after_salary
df
|   | name      | salary | e_name | after_salary |
|---|-----------|--------|--------|--------------|
| 0 | Zhang San | 5000   | Tom    | 4000.0       |
| 1 | Li Si     | 6000   | Jerry  | 4500.0       |
| 2 | Wang Wu   | 5500   | Jack   | 4250.0       |
| 3 | Zhang San | 5000   | Tom    | 4000.0       |
- The apply method of Series can also act as a computing tool, just like map
- As a computing tool, apply is generally much faster than map (a rough timing sketch follows the example below)
def after_sal(s):
    return s - (s - 3000) * 0.5
after_salary = df['salary'].apply(after_sal)
after_salary
0 4000.0
1 4500.0
2 4250.0
3 4000.0
Name: salary, dtype: float64
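To check the efficiency claim for yourself, a minimal timing sketch along these lines can be used (the Series size and the use of time.perf_counter are assumptions made purely for illustration):

#Rough timing comparison of map vs apply on a larger Series (illustrative only)
import time

big = pd.Series(np.random.randint(3000, 10000, size=1_000_000))

start = time.perf_counter()
big.map(after_sal)
print('map  :', time.perf_counter() - start)

start = time.perf_counter()
big.apply(after_sal)
print('apply:', time.perf_counter() - start)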
Random sampling (implemented with take and permutation)
- take()
- np.random.permutation(n): returns a random permutation of the integers 0 to n-1
df = DataFrame(data=np.random.random(size=(100,3)),columns=['A','B','C'])
df.head()
|   | A        | B        | C        |
|---|----------|----------|----------|
| 0 | 0.443835 | 0.973331 | 0.626923 |
| 1 | 0.366619 | 0.225035 | 0.719683 |
| 2 | 0.431639 | 0.154259 | 0.051419 |
| 3 | 0.406994 | 0.065048 | 0.145229 |
| 4 | 0.965849 | 0.385788 | 0.208316 |
#Shuffle the rows
df.take(np.random.permutation(100),axis=0)
|     | A        | B        | C        |
|-----|----------|----------|----------|
| 88  | 0.882121 | 0.368332 | 0.158629 |
| 96  | 0.319008 | 0.251546 | 0.009901 |
| 28  | 0.696431 | 0.178469 | 0.125718 |
| 23  | 0.268010 | 0.639662 | 0.137387 |
| 22  | 0.599899 | 0.089535 | 0.930574 |
| ... | ...      | ...      | ...      |
| 50  | 0.468952 | 0.562839 | 0.671872 |
| 79  | 0.237526 | 0.228043 | 0.699325 |
| 87  | 0.932147 | 0.394211 | 0.540368 |
| 90  | 0.980401 | 0.823402 | 0.624730 |
| 92  | 0.699101 | 0.108088 | 0.542962 |

100 rows × 3 columns
#Shuffle both the rows and the columns
df.take(np.random.permutation(100),axis=0).take(indices=np.random.permutation(3),axis=1)
|     | B        | A        | C        |
|-----|----------|----------|----------|
| 46  | 0.274198 | 0.689843 | 0.119163 |
| 6   | 0.493573 | 0.854749 | 0.134710 |
| 55  | 0.572823 | 0.159019 | 0.124300 |
| 65  | 0.982499 | 0.822922 | 0.788931 |
| 5   | 0.578393 | 0.904401 | 0.866710 |
| ... | ...      | ...      | ...      |
| 30  | 0.734058 | 0.948066 | 0.029316 |
| 59  | 0.103037 | 0.748319 | 0.025966 |
| 48  | 0.988017 | 0.194801 | 0.162126 |
| 80  | 0.594385 | 0.471618 | 0.945606 |
| 70  | 0.750110 | 0.592528 | 0.470297 |

100 rows × 3 columns
#Shuffle the rows and columns, then take the first 5 rows as a random sample
df.take(np.random.permutation(100),axis=0).take(indices=np.random.permutation(3),axis=1)[0:5]
|    | A        | B        | C        |
|----|----------|----------|----------|
| 38 | 0.605719 | 0.843236 | 0.339796 |
| 32 | 0.779512 | 0.944551 | 0.109526 |
| 70 | 0.243308 | 0.799665 | 0.374344 |
| 37 | 0.929103 | 0.907324 | 0.259208 |
| 21 | 0.963018 | 0.868906 | 0.874337 |
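Note that pandas also provides a built-in sampling method, DataFrame.sample, which achieves a similar effect without building a permutation by hand. A hedged sketch, not part of the original example:

#Alternative: DataFrame.sample draws random rows or columns directly (illustrative)
df.sample(n=5)           #5 random rows
df.sample(frac=0.1)      #a random 10% of the rows
df.sample(n=2, axis=1)   #2 random columns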
Grouping and processing of data
df = DataFrame({'item':['Apple','Banana','Orange','Banana','Orange','Apple'],
'price':[4,3,3,2.5,4,2],
'color':['red','yellow','yellow','green','green','green'],
'weight':[12,20,50,30,20,44]})
df
|   | item   | price | color  | weight |
|---|--------|-------|--------|--------|
| 0 | Apple  | 4.0   | red    | 12     |
| 1 | Banana | 3.0   | yellow | 20     |
| 2 | Orange | 3.0   | yellow | 50     |
| 3 | Banana | 2.5   | green  | 30     |
| 4 | Orange | 4.0   | green  | 20     |
| 5 | Apple  | 2.0   | green  | 44     |
df.groupby(by='item')#The DataFrameGroupBy object is returned, and the grouping results are stored in this object
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fea8c556460>
#The groups attribute shows the grouping details
df.groupby(by='item').groups
{'Apple': [0, 5], 'Banana': [1, 3], 'Orange': [2, 4]}
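A hedged aside: the grouped object also exposes get_group for pulling out the rows of a single group directly (shown only as an illustration):

#Inspect the rows of one group (illustrative)
df.groupby(by='item').get_group('Apple')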
#Calculate the average price of each fruit
df.groupby(by='item').mean()
df.groupby(by='item').mean()['price'] #Not recommended: wasted computing power
item
Apple 3.00
Banana 2.75
Orange 3.50
Name: price, dtype: float64
df.groupby(by='item')['price'].mean()#Recommended way
item
Apple 3.00
Banana 2.75
Orange 3.50
Name: price, dtype: float64
#Multiple grouping conditions
df.groupby(by=['item','color'])
df.groupby(by=['color','item']).sum()
| color  | item   | price | weight |
|--------|--------|-------|--------|
| green  | Apple  | 2.0   | 44     |
|        | Banana | 2.5   | 30     |
|        | Orange | 4.0   | 20     |
| red    | Apple  | 4.0   | 12     |
| yellow | Banana | 3.0   | 20     |
|        | Orange | 3.0   | 50     |
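If several statistics are needed at once, the agg method accepts a list or a dict of aggregations; a small sketch (assumed here, not from the original notebook):

#Multiple aggregations in one pass (illustrative)
df.groupby(by='item')['price'].agg(['mean', 'max', 'min'])
df.groupby(by='item').agg({'price': 'mean', 'weight': 'sum'})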
#Demand: calculate the average price of each fruit, then merge the averages back into the original table
mean_price_s = df.groupby(by='item')['price'].mean()
mean_price_s
item
Apple 3.00
Banana 2.75
Orange 3.50
Name: price, dtype: float64
dic = mean_price_s.to_dict() #Mapping relation table
df['mean_price'] = df['item'].map(dic)
df
|   | item   | price | color  | weight | mean_price |
|---|--------|-------|--------|--------|------------|
| 0 | Apple  | 4.0   | red    | 12     | 3.00       |
| 1 | Banana | 3.0   | yellow | 20     | 2.75       |
| 2 | Orange | 3.0   | yellow | 50     | 3.50       |
| 3 | Banana | 2.5   | green  | 30     | 2.75       |
| 4 | Orange | 4.0   | green  | 20     | 3.50       |
| 5 | Apple  | 2.0   | green  | 44     | 3.00       |
#Average weight of fruit of each color
mean_weight_s = df.groupby(by='color')['weight'].mean()
dic = mean_weight_s.to_dict()
mean_weight = df['color'].map(dic)
df['mean_weight'] = mean_weight
df
|   | item   | price | color  | weight | mean_price | mean_weight |
|---|--------|-------|--------|--------|------------|-------------|
| 0 | Apple  | 4.0   | red    | 12     | 3.00       | 12.000000   |
| 1 | Banana | 3.0   | yellow | 20     | 2.75       | 35.000000   |
| 2 | Orange | 3.0   | yellow | 50     | 3.50       | 35.000000   |
| 3 | Banana | 2.5   | green  | 30     | 2.75       | 31.333333   |
| 4 | Orange | 4.0   | green  | 20     | 3.50       | 31.333333   |
| 5 | Apple  | 2.0   | green  | 44     | 3.00       | 31.333333   |
Advanced data aggregation
- After grouping with groupby, transform and apply let you run custom functions for more flexible operations
- df.groupby('item')['price'].sum() <==> df.groupby('item')['price'].apply(sum)
- Both transform and apply carry out the computation; you simply pass the function into transform or apply
- transform and apply can also take a lambda expression (see the sketch after this list)
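For instance, a lambda version equivalent to the my_mean example that follows might look like this (an assumed illustration, not from the original notebook):

#Lambda passed directly to transform / apply (illustrative)
df.groupby(by='item')['price'].transform(lambda p: sum(p) / len(p))
df.groupby(by='item')['price'].apply(lambda p: sum(p) / len(p))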
#Hand-written mean used to demonstrate passing a custom function
def my_mean(p):
    total = 0
    for i in p:
        total += i
    return total / len(p)
# transform returns a result aligned with the original rows, so the average price of each fruit can be merged into the original table without an extra mapping step
df.groupby(by='item')['price'].transform(my_mean)
0 3.00
1 2.75
2 3.50
3 2.75
4 3.50
5 3.00
Name: price, dtype: float64
# apply returns one value per group, so the average price of each fruit must be mapped back before it can be merged into the original table
df.groupby(by='item')['price'].apply(my_mean)
item
Apple 3.00
Banana 2.75
Orange 3.50
Name: price, dtype: float64
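Putting the two together, the mean_price column built earlier via to_dict and map could be added in a single step with transform; this is a hedged sketch rather than what the original notebook ran:

#One-step version of the earlier mean_price mapping (illustrative)
df['mean_price'] = df.groupby(by='item')['price'].transform('mean')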
Data loading
df = pd.read_csv('./data/type-.txt')
df
|   | Hello. - I'm fine. - he's fine, too         |
|---|---------------------------------------------|
| 0 | Maybe - probably - maybe                    |
| 1 | However - not necessarily - not necessarily |
df = pd.read_csv('./data/type-.txt',sep='-')
df
|   | Hello   | I'm fine        | So is he        |
|---|---------|-----------------|-----------------|
| 0 | perhaps | Probably        | be on the cards |
| 1 | however | not necessarily | Not necessarily |
df = pd.read_csv('./data/type-.txt',sep='-',header=None)
df
|   | 0       | 1               | 2               |
|---|---------|-----------------|-----------------|
| 0 | Hello   | I'm fine        | So is he        |
| 1 | perhaps | Probably        | be on the cards |
| 2 | however | not necessarily | Not necessarily |
#\s+ matches one or more whitespace characters
df = pd.read_csv('./data/test.txt',sep='\s+',header=None)
df
|   | 0       | 1               | 2               |
|---|---------|-----------------|-----------------|
| 0 | Hello   | I'm fine        | So is he        |
| 1 | perhaps | Probably        | be on the cards |
| 2 | however | not necessarily | Not necessarily |
#Modify column index
df.columns = ['a','b','c']
df
|   | a       | b               | c               |
|---|---------|-----------------|-----------------|
| 0 | Hello   | I'm fine        | So is he        |
| 1 | perhaps | Probably        | be on the cards |
| 2 | however | not necessarily | Not necessarily |
df.rename(index={0:'a1',1:'a2',2:'a3'})
|    | a       | b               | c               |
|----|---------|-----------------|-----------------|
| a1 | Hello   | I'm fine        | So is he        |
| a2 | perhaps | Probably        | be on the cards |
| a3 | however | not necessarily | Not necessarily |
#The names parameter sets the column labels directly while reading the data
df = pd.read_csv('./data/test.txt',sep='\s+',header=None,names=['A','B','C'])
df
|   | A       | B               | C               |
|---|---------|-----------------|-----------------|
| 0 | Hello   | I'm fine        | So is he        |
| 1 | perhaps | Probably        | be on the cards |
| 2 | however | not necessarily | Not necessarily |
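A small hedged sketch of the reverse direction, writing the frame back out and restoring it with an index column (the file name ./data/out.csv and the options are assumptions, not from the original):

#Round-trip sketch: save the frame, then restore it with the saved index (illustrative)
df.to_csv('./data/out.csv')                  #the index is written as the first column
pd.read_csv('./data/out.csv', index_col=0)   #use that first column as the index again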
Pivot table
- A pivot table is a table format that dynamically arranges and summarizes data. Most people have used PivotTables in Excel and know how powerful they are; in pandas the equivalent is pivot_table.
- Advantages of pivot tables:
  - Highly flexible: you can tailor the analysis and calculations to your own needs
  - Clear structure that makes the data easy to read and understand
  - Very practical: a workhorse for building reports
df = pd.read_csv('./data/Pivot table-Basketball match.csv')
df.head()
|   | opponent  | Victory or defeat | Home and away | Hit | Number of shots | Shooting percentage | 3-point hit rate | backboard | Assists | score |
|---|-----------|-------------------|---------------|-----|-----------------|---------------------|------------------|-----------|---------|-------|
| 0 | warrior   | win               | passenger     | 10  | 23              | 0.435               | 0.444            | 6         | 11      | 27    |
| 1 | king      | win               | passenger     | 8   | 21              | 0.381               | 0.286            | 3         | 9       | 27    |
| 2 | Calf      | win               | main          | 10  | 19              | 0.526               | 0.462            | 3         | 7       | 29    |
| 3 | Grizzlies | negative          | main          | 8   | 20              | 0.400               | 0.250            | 5         | 8       | 22    |
| 4 | 76 people | win               | passenger     | 10  | 20              | 0.500               | 0.250            | 3         | 13      | 27    |
pivot_table has four key parameters: index, values, columns and aggfunc
- index parameter: the criterion used to group the data for the summary
- Every pivot_table must have an index. For example, to look at Harden's average stats in wins versus losses, group by the win/loss column and take the mean of each stat:
df.pivot_table(index='Victory or defeat')
| Victory or defeat | 3-point hit rate | Assists  | Hit       | score     | Shooting percentage | Number of shots | backboard |
|-------------------|------------------|----------|-----------|-----------|---------------------|-----------------|-----------|
| win               | 0.42819          | 9.666667 | 10.238095 | 32.952381 | 0.481524            | 21.142857       | 5.142857  |
| negative          | 0.29350          | 8.500000 | 7.500000  | 27.250000 | 0.354250            | 21.250000       | 4.750000  |
- To see Harden's data against each opponent, split by home and away games, the grouping criteria are opponent and Home and away:
df.pivot_table(index=['opponent','Home and away'])
| opponent   | Home and away | 3-point hit rate | Assists | Hit  | score | Shooting percentage | Number of shots | backboard |
|------------|---------------|------------------|---------|------|-------|---------------------|-----------------|-----------|
| 76 people  | main          | 0.4290           | 7.0     | 8.0  | 29.0  | 0.381               | 21.0            | 4.0       |
|            | passenger     | 0.2500           | 13.0    | 10.0 | 27.0  | 0.500               | 20.0            | 3.0       |
| warrior    | passenger     | 0.4440           | 11.0    | 10.0 | 27.0  | 0.435               | 23.0            | 6.0       |
| king       | passenger     | 0.2860           | 9.0     | 8.0  | 27.0  | 0.381               | 21.0            | 3.0       |
| sun        | passenger     | 0.5450           | 7.0     | 12.0 | 48.0  | 0.545               | 22.0            | 2.0       |
| Calf       | main          | 0.4620           | 7.0     | 10.0 | 29.0  | 0.526               | 19.0            | 3.0       |
| Knicks     | main          | 0.3850           | 10.0    | 12.0 | 37.0  | 0.444               | 27.0            | 2.0       |
|            | passenger     | 0.3530           | 9.0     | 9.0  | 31.0  | 0.391               | 23.0            | 5.0       |
| pioneer    | passenger     | 0.5710           | 3.0     | 16.0 | 48.0  | 0.552               | 29.0            | 8.0       |
| Nuggets    | main          | 0.1430           | 9.0     | 6.0  | 21.0  | 0.375               | 16.0            | 8.0       |
| Walker     | main          | 0.3330           | 10.0    | 8.0  | 29.0  | 0.364               | 22.0            | 8.0       |
|            | passenger     | 0.2500           | 15.0    | 9.0  | 26.0  | 0.429               | 21.0            | 5.0       |
| Lakers     | passenger     | 0.4440           | 9.0     | 13.0 | 36.0  | 0.591               | 22.0            | 4.0       |
| Grizzlies  | main          | 0.3395           | 8.0     | 9.5  | 30.0  | 0.420               | 22.5            | 4.5       |
|            | passenger     | 0.3610           | 7.5     | 7.5  | 24.5  | 0.383               | 19.5            | 4.5       |
| sir        | main          | 0.8750           | 13.0    | 19.0 | 56.0  | 0.760               | 25.0            | 2.0       |
|            | passenger     | 0.3330           | 3.0     | 8.0  | 29.0  | 0.421               | 19.0            | 5.0       |
| Raptor     | main          | 0.2730           | 11.0    | 8.0  | 38.0  | 0.320               | 25.0            | 6.0       |
| basket net | main          | 0.6150           | 8.0     | 13.0 | 37.0  | 0.650               | 20.0            | 10.0      |
| eagle      | passenger     | 0.5450           | 11.0    | 8.0  | 29.0  | 0.533               | 15.0            | 3.0       |
| knight     | main          | 0.4290           | 13.0    | 8.0  | 35.0  | 0.381               | 21.0            | 11.0      |
| pelican    | main          | 0.4000           | 17.0    | 8.0  | 26.0  | 0.500               | 16.0            | 1.0       |
| wasp       | passenger     | 0.4000           | 11.0    | 8.0  | 27.0  | 0.444               | 18.0            | 10.0      |
- values parameter: selects which columns are calculated
- If we only need Harden's score, rebounds and assists, split by home/away and by win/loss:
df.pivot_table(index=['Home and away','Victory or defeat'],values=['score','backboard','Assists'])
| Home and away | Victory or defeat | Assists   | score     | backboard |
|---------------|-------------------|-----------|-----------|-----------|
| main          | win               | 10.555556 | 34.222222 | 5.444444  |
|               | negative          | 8.666667  | 29.666667 | 5.000000  |
| passenger     | win               | 9.000000  | 32.000000 | 4.916667  |
|               | negative          | 8.000000  | 20.000000 | 4.000000  |
- aggfunc parameter: sets the aggregation function applied during data aggregation
  - When aggfunc is not set, it defaults to aggfunc='mean', i.e. the mean is calculated.
- To get James Harden's total score, total rebounds and total assists, split by home/away and by win/loss:
df.pivot_table(index=['Home and away','Victory or defeat'],values=['score','backboard','Assists'],aggfunc='sum')
| Home and away | Victory or defeat | Assists | score | backboard |
|---------------|-------------------|---------|-------|-----------|
| main          | win               | 95      | 308   | 49        |
|               | negative          | 26      | 89    | 15        |
| passenger     | win               | 108     | 384   | 59        |
|               | negative          | 8       | 20    | 4         |
df.pivot_table(index=['Home and away','Victory or defeat'],aggfunc={'score':'sum','backboard':'mean','Assists':'min'})
| Home and away | Victory or defeat | Assists | score | backboard |
|---------------|-------------------|---------|-------|-----------|
| main          | win               | 7       | 308   | 5.444444  |
|               | negative          | 7       | 89    | 5.000000  |
| passenger     | win               | 3       | 384   | 4.916667  |
|               | negative          | 8       | 20    | 4.000000  |
- columns parameter: sets the fields used as the column hierarchy
  - It further splits the values field across columns
#Get the total score at home games and at away games
df.pivot_table(index='Home and away',values='score',aggfunc='sum')
| Home and away | score |
|---------------|-------|
| main          | 397   |
| passenger     | 404   |
#Break the home and away totals down by opponent to see where the points came from
df.pivot_table(index='Home and away',values='score',aggfunc='sum',columns='opponent',fill_value=0)
| Home and away | 76 people | warrior | king | sun | Calf | Knicks | pioneer | Nuggets | Walker | Lakers | Grizzlies | sir | Raptor | basket net | eagle | knight | pelican | wasp |
|---------------|-----------|---------|------|-----|------|--------|---------|---------|--------|--------|-----------|-----|--------|------------|-------|--------|---------|------|
| main          | 29        | 0       | 0    | 0   | 29   | 37     | 0       | 21      | 29     | 0      | 60        | 56  | 38     | 37         | 0     | 35     | 26      | 0    |
| passenger     | 27        | 27      | 27   | 48  | 0    | 31     | 48      | 0       | 26     | 36     | 49        | 29  | 0      | 0          | 29    | 0      | 0       | 27   |
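A hedged extra: pivot_table can also append row and column totals through the margins parameter (shown only as an illustration, not part of the original notebook):

#Add an 'All' row/column with grand totals (illustrative)
df.pivot_table(index='Home and away', values='score', columns='opponent', aggfunc='sum', fill_value=0, margins=True)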
Cross table
- A cross table is a special kind of pivot table used to group data and count how often each combination occurs
- pd.crosstab(index, columns)
  - index: the grouping data that becomes the row index of the crosstab
  - columns: the data that becomes the column index of the crosstab
import pandas as pd
from pandas import DataFrame
df = DataFrame({'sex':['man','man','women','women','man','women','man','women','women'],
'age':[15,23,25,17,35,57,24,31,22],
'smoke':[True,False,False,True,True,False,False,True,False],
'height':[168,179,181,166,173,178,188,190,160]})
df
|   | sex   | age | smoke | height |
|---|-------|-----|-------|--------|
| 0 | man   | 15  | True  | 168    |
| 1 | man   | 23  | False | 179    |
| 2 | women | 25  | False | 181    |
| 3 | women | 17  | True  | 166    |
| 4 | man   | 35  | True  | 173    |
| 5 | women | 57  | False | 178    |
| 6 | man   | 24  | False | 188    |
| 7 | women | 31  | True  | 190    |
| 8 | women | 22  | False | 160    |
- Find the number of smokers of each sex
#grouping
df.groupby(by='sex')['smoke'].sum()
#pivot table
df.pivot_table(index='sex',values='smoke',aggfunc='sum')
pd.crosstab(df.smoke,df.sex)
| smoke | man | women |
|-------|-----|-------|
| False | 2   | 3     |
| True  | 2   | 2     |
pd.crosstab(df.sex,df.smoke)
| sex   | False | True |
|-------|-------|------|
| man   | 2     | 2    |
| women | 3     | 2    |
- Find the smoking status of people at each age
pd.crosstab(df.smoke,df.age)
| smoke | 15 | 17 | 22 | 23 | 24 | 25 | 31 | 35 | 57 |
|-------|----|----|----|----|----|----|----|----|----|
| False | 0  | 0  | 1  | 1  | 1  | 1  | 0  | 0  | 1  |
| True  | 1  | 1  | 0  | 0  | 0  | 0  | 1  | 1  | 0  |
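As with pivot_table, crosstab also supports totals and proportions; a hedged illustrative sketch:

#Counts with row/column totals, and the same table as proportions (illustrative)
pd.crosstab(df.sex, df.smoke, margins=True)
pd.crosstab(df.sex, df.smoke, normalize=True)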