1, Viewing of Pandas data
1.1 view Dataframe: print(df);print(df.head(6));print(df.tail(6));print(df.sample(6))
1.2 view Series: print(ser); print(df[‘salary’])
1.3 view DF basic information: print(df.info()); print(df.shape)
1.4 view df basic statistical indicators: print(df.describe())
1.5 view df column name: print(df.columns)
1.6 view df index: print(df.index)
1.7 viewing df data: print(df.values)
1.8 check the number of missing df values: print(df.isnull().sum().sum())
1.9 frequency of viewing df data column (Series): print(df ['salary ']. value_counts())
1.10 view the unique value of df data column (Series): print(df ['salary ']. unique())
1.11 view df completely repeated lines: print(df.duplicated())
1.12 view duplicate rows of df data column (Series): print(df ['salary ']. duplicated())
2, Deletion of Pandas data
2.1 delete single line: by single index value
df = pd.read_csv('data.csv') df.set_index('positionId', inplace=True) df.drop(5204912, inplace=True) print(df)
2.2 delete multiple rows: list by index value
df = pd.read_csv('data.csv') df.set_index('positionId', inplace=True) df.drop([5204912, 6467417], inplace=True) print(df)
2.3 delete lines by criteria: filter first and then delete
Filter data analysis posts with salary greater than 40000 yuan, and delete those qualified by the Department
df = pd.read_csv('data.csv' , usecols=['positionId', 'positionName', 'salary']) df.set_index('positionId', inplace=True) # DF = DF [(DF ['salary '] > 40000) & (DF ['positionname']. Str.contains ('data analysis'))] df.drop(df[~((df['salary']>40000) & (df['positionName'].str.contains('Data analysis')))].index, inplace=True) print(df)
2.4 delete single column: by column name axis=1
df = pd.read_csv('data.csv' , usecols=['positionId', 'positionName', 'salary', 'createTime']) df.set_index('positionId', inplace=True) df.drop('createTime', axis=1, inplace=True) print(df)
2.5 delete multiple columns: list by column name axis=1
df = pd.read_csv('data.csv' , usecols=['positionId', 'positionName', 'salary', 'createTime']) df.set_index('positionId', inplace=True) df.drop(['createTime', 'salary'], axis=1, inplace=True) print(df)
2.6 delete single column: by column number axis=1
df = pd.read_csv('data.csv' , usecols=['positionId', 'positionName', 'salary', 'createTime']) df.set_index('positionId', inplace=True) df.drop(df.columns[2], axis=1, inplace=True) print(df)
2.7 delete multiple columns: list by column number axis=1
df = pd.read_csv('data.csv' , usecols=['positionId', 'positionName', 'salary', 'createTime']) df.set_index('positionId', inplace=True) df.drop(df.columns[[1,2]], axis=1, inplace=True) print(df)
3, Increase of Pandas data
3.1 insert columns by position: Insert
import pandas as pd from random import randint df = pd.read_csv('data.csv' , usecols=['positionId', 'positionName', 'salary', 'createTime']) df.set_index('positionId', inplace=True) # Add a list after the last column df.insert(df.shape[1], 'salary1', [randint(20000, 50000) for i in range(df.shape[0])]) # Add a column of scalars before the first column df.insert(0, 'salary', 2500, allow_duplicates=True) print(df)
3.2 add a column: scalar value
3.3 add a column: calculation column
3.4 add a column: list data column
3.5 add a column: comparison column
3.6 add a column: judgment column
3.7 add a column: proportion column - reference variable
import pandas as pd import numpy as np from random import randint df = pd.read_csv('data.csv' , usecols=['positionId', 'positionName', 'salary']) df.columns = ['ID', 'position','salary'] df.set_index('ID', inplace=True) # Add scalar column df['Transportation subsidy'] = 1000 # Add calculated column df['Salary supplement'] = df['salary'] + df['Transportation subsidy'] # Add list data column df['Salary 2'] = [randint(20000, 50000) for i in range(df.shape[0])] # Add comparison column df['High salary'] = df[['Salary supplement', 'Salary 2']].max(axis=1) # Add judgment column df['Meet expectations'] = np.where(df['High salary']>40000, 'yes', 'no') # Increase proportion column - reference variable sal2_sum = df['Salary 2'].sum() df.eval(f'Proportion of salary(%) = 100*Salary 2/{sal2_sum}', inplace=True) print(df)
3.8 add multiple columns
import pandas as pd df = pd.read_csv('data.csv' , usecols=['positionId', 'positionName', 'createTime','salary']) df.columns = ['ID', 'position', 'time','salary'] df.set_index('ID', inplace=True) df = df.assign(Salary supplement=df.salary+1000, particular year=df.time.str[:4], quarter=pd.to_datetime(df.time).dt.to_period('Q')) print(df)
4, Modification of Pandas data
4.1 modify column name, index and index name
df = pd.read_csv('data.csv' , usecols=['positionId', 'positionName', 'createTime','salary']) df.columns = ['ID', 'position', 'time','salary'] df.set_index('ID', inplace=True) df.rename_axis('ID number', inplace=True) print(df.index)
4.2 modify the value of the specified row and column:. LOC /. Iloc
df = pd.read_csv('data.csv' , usecols=['positionId', 'positionName', 'createTime','salary']) df.columns = ['ID', 'position', 'time','salary'] df.set_index('ID', inplace=True) df.loc[6802721,'salary'] = 9999 df.iloc[1,2] = 8888 print(df)
4.3 modify the value of the specified column
df = pd.read_csv('data.csv' , usecols=['positionId', 'positionName', 'createTime','salary']) df.columns = ['ID', 'position', 'time','salary'] df.set_index('ID', inplace=True) print(df.dtypes) df['salary'].replace(30000, '3 ten thousand', inplace=True) df['position'] = df['position'].str.replace('data', 'Data') print(df.dtypes) print(df)
4.4 modify multiple values of DF: exact matching of all values
df = pd.read_csv('data.csv' , usecols=['positionId', 'positionName', 'createTime','salary']) df.columns = ['ID', 'position', 'time','salary'] df.set_index('ID', inplace=True) print(df.dtypes) df.replace(['Data analysis', 30000], ['Data analysis', '3 ten thousand'], inplace=True) print(df.dtypes) print(df)
5, Screening of Pandas data (CURD synthesis)
5.1 filter all columns in a single line
5.1.1 filter by index name, and the result is Series
5.1.2 filter by index number, and the result is Series
5.1.3 filter by column values without duplicate values, and the result is Dataframe
import pandas as pd df = pd.read_csv('data.csv' , usecols=['positionId', 'positionName', 'createTime','salary']) df.columns = ['ID', 'position', 'time','salary'] df.set_index('ID', inplace=True) print(df.loc[6802721]) print(df.iloc[1]) df.reset_index(inplace=True) print(df) print(df.loc[df['ID']==6877668])
5.2 filter all columns in multiple rows, and the result is Dataframe
5.2.1 filter by index name
5.2.2 filter by index number
5.2.3 filter by column value
import pandas as pd df = pd.read_csv('data.csv' , usecols=['positionId', 'positionName', 'createTime','salary']) df.columns = ['ID', 'position', 'time','salary'] df.set_index('ID', inplace=True) print(df) print(df.loc[[6802721, 5204912]]) df.reset_index(inplace=True) # Extract 0,1 rows print(df.iloc[[0, 1]]) # Extract data after 10 rows print(df.iloc[10:]) # Extract integer column of 3 print(df.iloc[[i%3==0 for i in range(df.shape[0])]]) # Extract rows with salary greater than 40000 and position including data analysis print(df.loc[(df['salary']>40000) & (df['position'].str.contains('Data analysis'))]) # The extracted position is the column of data analysis and data analyst print(df.loc[df['position'].isin(['Data analysis','Data Analyst'])])
5.3 delete all rows in a single column
5.3.1 filter by column name, and the result is Series
5.3.2 filter by column number, and the result is Series
df = pd.read_csv('data.csv' , usecols=['positionId', 'positionName', 'createTime','salary']) df.columns = ['ID', 'position', 'time','salary'] df.set_index('ID', inplace=True) print(df) print(df.loc[:, 'salary']) print(df.iloc[:, 1])
5.4 delete all rows in multiple columns
5.4.1 filter by column name
5.4.2 filter by column number
import pandas as pd df = pd.read_csv('data.csv' , usecols=['positionId', 'positionName', 'createTime','salary']) df.columns = ['ID', 'position', 'time','salary'] df.set_index('ID', inplace=True) print(df) print(df.loc[:, ['position', 'salary']]) print(df.loc[:, df.columns.str.endswith('position')]) print(df.iloc[:, [0,2]]) # Filter odd columns print(df.iloc[:, [i for i in range(df.shape[1]) if i%2]]) print(df.iloc[:, [i%2==1 for i in range(df.shape[1])]]) # Filter even columns print(df.iloc[:, [i%2==0 for i in range(df.shape[1])]])
5.5 filter single row and single column
5.5.1 filter by row and column name
5.5.2 filter by line number
df = pd.read_csv('data.csv' , usecols=['positionId', 'positionName', 'createTime','salary']) df.columns = ['ID', 'position', 'time','salary'] df.set_index('ID', inplace=True) print(df) print(df.loc[6802721, 'salary']) print(df.iloc[0, 2])
5.6 filter multiple rows and columns
5.6.1 filter by row / column name list
5.6.2 filter by line number list
df = pd.read_csv('data.csv' , usecols=['positionId', 'positionName', 'createTime','salary']) df.columns = ['ID', 'position', 'time','salary'] df.set_index('ID', inplace=True) print(df) print(df.loc[[6802721, 6877668, 6467417], ['position', 'salary']]) print(df.iloc[[0, 2, 4], [0, 2]])
5.6.3 extract the last two columns of rows 10-20
df = pd.read_csv('data.csv' , usecols=['positionId', 'positionName', 'createTime','salary']) df.columns = ['ID', 'position', 'time','salary'] df.set_index('ID', inplace=True) print(df) print(df.iloc[10: 20, (df.shape[1]-2):])
5.7 df.query() query
df = pd.read_csv('data.csv' , usecols=['positionId', 'positionName', 'createTime','salary']) df.columns = ['ID', 'position', 'time','salary'] df.set_index('ID', inplace=True) print(df) print(df.query("position=='Data analysis' & salary>40000")) sal_avg = int(df.salary.mean()) # Calculate average salary print(df.query(f"position=='Data analysis' & salary>{sal_avg}")) print(df.loc[(df['position']=='Data analysis') & (df['salary']>sal_avg)]) # Compare with query