XGBoost learning notes
from xgboost import XGBRegressor as XGBR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LinearRegression as LinearR
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import datetime
data = load_boston()   #The Boston dataset is small and simple, but it is enough to illustrate the issues discussed below
data
{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02, 4.9800e+00], [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02, 9.1400e+00], [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02, 4.0300e+00], ..., [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02, 5.6400e+00], [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02, 6.4800e+00], [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02, 7.8800e+00]]), 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. , 18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6, 15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2, 13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7, 21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9, 35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5, 19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. , 20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2, 23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8, 33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4, 21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. , 20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6, 23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4, 15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4, 17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7, 25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4, 23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. , 32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3, 34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4, 20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. , 26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3, 31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1, 22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6, 42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. , 36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4, 32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. , 20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1, 20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2, 22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1, 21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6, 19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7, 32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1, 18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8, 16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8, 13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3, 8.8, 7.2, 10.5, 7.4, 10.2, 11.5, 15.1, 23.2, 9.7, 13.8, 12.7, 13.1, 12.5, 8.5, 5. , 6.3, 5.6, 7.2, 12.1, 8.3, 8.5, 5. , 11.9, 27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3, 7. , 7.2, 7.5, 10.4, 8.8, 8.4, 16.7, 14.2, 20.8, 13.4, 11.7, 8.3, 10.2, 10.9, 11. , 9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4, 9.6, 8.7, 8.4, 12.8, 10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4, 15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7, 19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2, 29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8, 20.6, 21.2, 19.1, 20.6, 15.2, 7. , 8.1, 13.6, 20.1, 21.8, 24.5, 23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. 
, 11.9]), 'feature_names': array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7'), 'DESCR': ".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:** \n\n :Number of Instances: 506 \n\n :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n\n :Attribute Information (in order):\n - CRIM per capita crime rate by town\n - ZN proportion of residential land zoned for lots over 25,000 sq.ft.\n - INDUS proportion of non-retail business acres per town\n - CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n - NOX nitric oxides concentration (parts per 10 million)\n - RM average number of rooms per dwelling\n - AGE proportion of owner-occupied units built prior to 1940\n - DIS weighted distances to five Boston employment centres\n - RAD index of accessibility to radial highways\n - TAX full-value property-tax rate per $10,000\n - PTRATIO pupil-teacher ratio by town\n - B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n - LSTAT % lower status of the population\n - MEDV Median value of owner-occupied homes in $1000's\n\n :Missing Attribute Values: None\n\n :Creator: Harrison, D. and Rubinfeld, D.L.\n\nThis is a copy of UCI ML housing dataset.\nhttps://archive.ics.uci.edu/ml/machine-learning-databases/housing/\n\n\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\nprices and the demand for clean air', J. Environ. Economics & Management,\nvol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n...', Wiley, 1980. N.B. Various transformations are used in the table on\npages 244-261 of the latter.\n\nThe Boston house-price data has been used in many machine learning papers that address regression\nproblems. \n \n.. topic:: References\n\n - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\n", 'filename': 'f:\\Anaconda3\\lib\\site-packages\\sklearn\\datasets\\data\\boston_house_prices.csv'}
X = data.data
y = data.target
X.shape
(506, 13)
y.shape
(506,)
y
array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. , 18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6, 15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2, 13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7, 21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9, 35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5, 19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. , 20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2, 23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8, 33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4, 21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. , 20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6, 23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4, 15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4, 17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7, 25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4, 23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. , 32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3, 34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4, 20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. , 26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3, 31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1, 22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6, 42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. , 36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4, 32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. , 20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1, 20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2, 22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1, 21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6, 19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7, 32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1, 18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8, 16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8, 13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3, 8.8, 7.2, 10.5, 7.4, 10.2, 11.5, 15.1, 23.2, 9.7, 13.8, 12.7, 13.1, 12.5, 8.5, 5. , 6.3, 5.6, 7.2, 12.1, 8.3, 8.5, 5. , 11.9, 27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3, 7. , 7.2, 7.5, 10.4, 8.8, 8.4, 16.7, 14.2, 20.8, 13.4, 11.7, 8.3, 10.2, 10.9, 11. , 9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4, 9.6, 8.7, 8.4, 12.8, 10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4, 15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7, 19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2, 29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8, 20.6, 21.2, 19.1, 20.6, 15.2, 7. , 8.1, 13.6, 20.1, 21.8, 24.5, 23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9])
Xtrain,Xtest,Ytrain,Ytest = TTS(X,y,test_size=0.3,random_state=420)
reg = XGBR(n_estimators=100).fit(Xtrain,Ytrain) #train
reg.predict(Xtest) #Traditional interface predict
reg.score(Xtest,Ytest)   #Which evaluation metric does score return here? Pressing Shift+Tab on the method shows that it is R^2
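A quick check (a minimal sketch; `r2_score` is imported here only for the comparison): the `score` method of a sklearn-style regressor returns R^2, so it should agree with `r2_score` computed on the same predictions.

```python
from sklearn.metrics import r2_score

# XGBRegressor.score follows the sklearn regressor convention and returns R^2,
# so it matches r2_score computed on the test-set predictions.
print(reg.score(Xtest, Ytest))
print(r2_score(Ytest, reg.predict(Xtest)))
```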
y.mean()
MSE(Ytest,reg.predict(Xtest))   #The MSE is roughly one third of y.mean(), so the result is mediocre: neither particularly good nor bad
reg.feature_importances_
#One advantage of tree models: we can inspect the feature importance scores and use SelectFromModel for feature selection
#xgboost can therefore be used for embedded (model-based) feature selection
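A minimal sketch of that embedded selection with SelectFromModel, under the assumption that we keep features with above-median importance (the `threshold="median"` choice and the refit below are illustrative, not part of the original notes):

```python
from sklearn.feature_selection import SelectFromModel

# Select features whose importance in a fitted XGBRegressor is above the median importance.
selector = SelectFromModel(XGBR(n_estimators=100), threshold="median").fit(Xtrain, Ytrain)
Xtrain_sel = selector.transform(Xtrain)
Xtest_sel = selector.transform(Xtest)

# Refit on the reduced feature matrix and compare the test R^2 with the full-feature model.
print(Xtrain_sel.shape)
print(XGBR(n_estimators=100).fit(Xtrain_sel, Ytrain).score(Xtest_sel, Ytest))
```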
reg = XGBR(n_estimators=100) #Untrained models imported in cross validation
CVS(reg,Xtrain,Ytrain,cv=5).mean()   #Which metric does this return, remember? The same default as reg.score: R^2 for regression, accuracy for classification
0.8017863029875325
#Discussion of rigorous vs. non-rigorous cross validation: should we cross validate on the training set or on the full data?
CVS(reg,Xtrain,Ytrain,cv=5)   #the five individual fold scores

array([0.83340801, 0.77096033, 0.83473392, 0.80424149, 0.76558778])
#Rigorous (cross validate on the training set only) vs. not rigorous (cross validate on the full data)
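A minimal sketch of the two options (the full-data variant is shown only to make the contrast explicit; the rest of these notes cross validate on the training set so the test set stays untouched):

```python
# "Rigorous": cross validate only on the training split; Xtest/Ytest remain a truly unseen hold-out.
print(CVS(XGBR(n_estimators=100), Xtrain, Ytrain, cv=5).mean())

# "Not rigorous": cross validate on the full data; the eventual test samples have already
# influenced model selection, so the final test score is optimistically biased.
print(CVS(XGBR(n_estimators=100), X, y, cv=5).mean())
```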
CVS(reg,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()
-16.041115480238048
#Let's take a look at all the model evaluation indicators available in sklearn
import sklearn

sorted(sklearn.metrics.SCORERS.keys())
['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'brier_score_loss', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score']
#A comparison was made using random forest and linear regression
rfr = RFR(n_estimators=100)
CVS(rfr,Xtrain,Ytrain,cv=5).mean()   #0.7975497480638329
0.7975497480638329
CVS(rfr,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()#-16.998723616338033
-16.998723616338033
lr = LinearR()
CVS(lr,Xtrain,Ytrain,cv=5).mean()   #0.6835070597278085
0.6835070597278085
CVS(lr,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()#-25.34950749364844
-25.34950749364844
#The parameter silent: when the data is huge and the algorithm is expected to run slowly, this parameter lets you monitor the training progress of the model
reg = XGBR(n_estimators=10,silent=True)
#In the xgboost library, silent=True suppresses printing of the training process and only returns the result; the default is False, which prints the training process
#In the sklearn API, xgboost defaults to silent=True, so the training process is not printed; set it to False manually if you want it printed
CVS(reg,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()   #-92.67865836936579
-92.67865836936579
def plot_learning_curve(estimator, title, X, y,
                        ax=None,     #Select the subplot to draw on
                        ylim=None,   #Set the range of the y axis
                        cv=None,     #Cross validation
                        n_jobs=None  #Number of threads to use
                        ):
    from sklearn.model_selection import learning_curve
    import matplotlib.pyplot as plt
    import numpy as np

    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y
                                                            ,shuffle=True
                                                            ,cv=cv
                                                            ,random_state=420
                                                            ,n_jobs=n_jobs)
    if ax is None:   #if no axes object is passed in, draw on the current axes
        ax = plt.gca()
    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    ax.grid()   #Drawing a grid is optional
    ax.plot(train_sizes, np.mean(train_scores, axis=1), 'o-'
            , color="r", label="Training score")
    ax.plot(train_sizes, np.mean(test_scores, axis=1), 'o-'
            , color="g", label="Test score")
    ax.legend(loc="best")
    return ax
cv = KFold(n_splits=5, shuffle = True, random_state=42) #Cross validation mode
plot_learning_curve(XGBR(n_estimators=100,random_state=420)
                    ,"XGB",Xtrain,Ytrain,ax=None,cv=cv)
plt.show()

![output_28_0.png](output_28_0.png)
#=====[TIME WARNING: 25 seconds]=====#
axisx = range(10,1010,50)
rs = []
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    rs.append(CVS(reg,Xtrain,Ytrain,cv=cv).mean())
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="red",label="XGB")
plt.legend()
plt.show()
660 0.8046775284172915
![output_29_1.png](output_29_1.png)
#The selected n_estimators (660) is unusually large. Should we simply pick the n_estimators value with the highest R^2?
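The next cell does not simply maximize R^2; it balances bias against variance through the controllable part of the generalization error. As a reminder of the heuristic (this is exactly the quantity `ge` computed in the code below, with 1 minus the mean cross-validated R^2 standing in for the bias):

$$ E_{\text{controllable}} \;\approx\; \text{bias}^2 + \text{variance} \;\approx\; \bigl(1-\overline{R^2_{cv}}\bigr)^2 + \operatorname{Var}\bigl(R^2_{cv}\bigr) $$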
#======[TIME WARNING: 20s]=======#
axisx = range(50,1050,50)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    #Record 1 - bias (the mean R^2)
    rs.append(cvresult.mean())
    #Record the variance
    var.append(cvresult.var())
    #Compute the controllable part of the generalization error
    ge.append((1 - cvresult.mean())**2+cvresult.var())
#Print the parameter value with the highest R^2, together with its variance
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
#Print the parameter value with the lowest variance, together with its R^2
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
#Print the parameter value with the smallest controllable generalization error, together with its R^2, variance and generalization error
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="red",label="XGB")
plt.legend()
plt.show()
650 0.80476050359201 0.01053673846018678
50 0.7857724708830981 0.009072727885598212
150 0.8032842414878519 0.009747694343514357 0.04844478399052411
![output_31_1.png](output_31_1.png)
axisx = range(100,300,10)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)*0.01
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
#Add the variance lines
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
180 0.8038787848970184 0.00959321570484315
180 0.8038787848970184 0.00959321570484315
180 0.8038787848970184 0.00959321570484315 0.04805674671831314
![output_32_1.png](output_32_1.png)
#What about the controllable part of the generalization error?
plt.figure(figsize=(20,5))
plt.plot(axisx,ge,c="gray",linestyle='-.')
plt.show()

![output_33_0.png](output_33_0.png)
#Verify whether the effect of the model actually improves on the test set
time0 = time()
print(XGBR(n_estimators=100,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)

0.9197580267581366
0.0787498950958252
time0 = time()
print(XGBR(n_estimators=660,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)

0.9208745746309475
0.36807847023010254
time0 = time()
print(XGBR(n_estimators=180,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)

0.9231068620728082
0.12366437911987305
axisx = np.linspace(0,1,20)
rs = []
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    rs.append(CVS(reg,Xtrain,Ytrain,cv=cv).mean())
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="green",label="XGB")
plt.legend()
plt.show()
0.7368421052631579 0.837609040251761
![output_37_1.png](output_37_1.png)
#Continue to refine the learning curve
axisx = np.linspace(0.05,1,20)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
0.65 0.8302530801197368 0.008708816667924316
0.7999999999999999 0.8277414964661117 0.007159903723250457
0.7999999999999999 0.8277414964661117 0.007159903723250457 0.036832895762985055
![output_38_1.png](output_38_1.png)
#Refine the learning curve further
axisx = np.linspace(0.75,1,25)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
0.7708333333333334 0.833489187182165 0.005575077682875093
0.7708333333333334 0.833489187182165 0.005575077682875093
0.7708333333333334 0.833489187182165 0.005575077682875093 0.033300928468131166
![output_39_1.png](output_39_1.png)
reg = XGBR(n_estimators=180
           #,subsample=0.7708333333333334
           ,random_state=420).fit(Xtrain,Ytrain)
reg.score(Xtest,Ytest)
0.9159462982185405
MSE(Ytest,reg.predict(Xtest))
7.821523502888769
#First, define a scoring function that prints the cross validation results on Xtrain for several metrics at once
def regassess(reg, Xtrain, Ytrain, cv, scoring=["r2"], show=True):
    score = []
    for i in range(len(scoring)):
        #Cross validated mean for this metric (computed once, reused for printing and for the returned list)
        s = CVS(reg, Xtrain, Ytrain, cv=cv, scoring=scoring[i]).mean()
        if show:
            print("{}:{:.2f}".format(scoring[i]   #Name of the evaluation metric
                                     , s))
        score.append(s)
    return score
reg = XGBR(n_estimators=180,random_state=420)
regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"])
r2:0.80
neg_mean_squared_error:-13.48

[0.8038787848970184, -13.482301822063182]
regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"],show=False)
[0.8038787848970184, -13.482301822063182]
from time import time
import datetime

for i in [0,0.2,0.5,1]:
    time0=time()
    reg = XGBR(n_estimators=180,random_state=420,learning_rate=i)
    print("learning_rate = {}".format(i))
    regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"])
    print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
    print("\t")
learning_rate = 0
r2:-6.76
neg_mean_squared_error:-567.55
00:01:561781

learning_rate = 0.2
r2:0.81
neg_mean_squared_error:-13.32
00:01:848888

learning_rate = 0.5
r2:0.81
neg_mean_squared_error:-13.24
00:01:541875

learning_rate = 1
r2:0.72
neg_mean_squared_error:-19.11
00:01:499027
axisx = np.arange(0.05,1,0.05)
rs = []
te = []
for i in axisx:
    reg = XGBR(n_estimators=180,random_state=420,learning_rate=i)
    score = regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"],show=False)
    test = reg.fit(Xtrain,Ytrain).score(Xtest,Ytest)
    rs.append(score[0])
    te.append(test)
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,te,c="gray",label="test")
plt.plot(axisx,rs,c="green",label="train")
plt.legend()
plt.show()
0.55 0.8125604372670463
![output_47_1.png](output_47_1.png)
for booster in ["gbtree","gblinear","dart"]:
    reg = XGBR(n_estimators=180
               ,learning_rate=0.1
               ,random_state=420
               ,booster=booster).fit(Xtrain,Ytrain)
    print(booster)
    print(reg.score(Xtest,Ytest))

gbtree
0.9231068620728082
gblinear
0.6286510307485139
dart
0.923106843149575
#Default objective: reg:linear
reg = XGBR(n_estimators=180,random_state=420).fit(Xtrain,Ytrain)
reg.score(Xtest, Ytest)
0.9231068620728082
MSE(Ytest,reg.predict(Xtest))
7.155205217161047
#Implementation with the native xgboost library
import xgboost as xgb
#Read the data using the DMatrix class
dtrain = xgb.DMatrix(Xtrain,Ytrain)   #Both the feature matrix and the label are passed in
dtest = xgb.DMatrix(Xtest,Ytest)
#Unfortunately a DMatrix cannot be opened and inspected directly, so the data is usually examined in pandas first and then wrapped in a DMatrix
dtrain
<xgboost.core.DMatrix at 0x2770de3bdd8>
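That said, a DMatrix still exposes a few accessors for basic inspection (a small sketch using standard `DMatrix` methods):

```python
# Inspect a DMatrix without converting back to pandas.
print(dtrain.num_row(), dtrain.num_col())   # number of samples and number of features
print(dtrain.get_label()[:5])               # the first few labels
```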
import pandas as pd
pd.DataFrame(Xtrain)
  | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.03041 | 0.0 | 5.19 | 0.0 | 0.515 | 5.895 | 59.6 | 5.6150 | 5.0 | 224.0 | 20.2 | 394.81 | 10.56 |
1 | 0.04113 | 25.0 | 4.86 | 0.0 | 0.426 | 6.727 | 33.5 | 5.4007 | 4.0 | 281.0 | 19.0 | 396.90 | 5.29 |
2 | 10.23300 | 0.0 | 18.10 | 0.0 | 0.614 | 6.185 | 96.7 | 2.1705 | 24.0 | 666.0 | 20.2 | 379.70 | 18.03 |
3 | 0.17142 | 0.0 | 6.91 | 0.0 | 0.448 | 5.682 | 33.8 | 5.1004 | 3.0 | 233.0 | 17.9 | 396.90 | 10.21 |
4 | 0.05059 | 0.0 | 4.49 | 0.0 | 0.449 | 6.389 | 48.0 | 4.7794 | 3.0 | 247.0 | 18.5 | 396.90 | 9.62 |
5 | 0.13587 | 0.0 | 10.59 | 1.0 | 0.489 | 6.064 | 59.1 | 4.2392 | 4.0 | 277.0 | 18.6 | 381.32 | 14.66 |
6 | 0.04981 | 21.0 | 5.64 | 0.0 | 0.439 | 5.998 | 21.4 | 6.8147 | 4.0 | 243.0 | 16.8 | 396.90 | 8.43 |
7 | 0.02543 | 55.0 | 3.78 | 0.0 | 0.484 | 6.696 | 56.4 | 5.7321 | 5.0 | 370.0 | 17.6 | 396.90 | 7.18 |
8 | 0.10793 | 0.0 | 8.56 | 0.0 | 0.520 | 6.195 | 54.4 | 2.7778 | 5.0 | 384.0 | 20.9 | 393.49 | 13.00 |
9 | 0.02498 | 0.0 | 1.89 | 0.0 | 0.518 | 6.540 | 59.7 | 6.2669 | 1.0 | 422.0 | 15.9 | 389.96 | 8.65 |
10 | 0.09299 | 0.0 | 25.65 | 0.0 | 0.581 | 5.961 | 92.9 | 2.0869 | 2.0 | 188.0 | 19.1 | 378.09 | 17.93 |
11 | 0.15876 | 0.0 | 10.81 | 0.0 | 0.413 | 5.961 | 17.5 | 5.2873 | 4.0 | 305.0 | 19.2 | 376.94 | 9.88 |
12 | 6.71772 | 0.0 | 18.10 | 0.0 | 0.713 | 6.749 | 92.6 | 2.3236 | 24.0 | 666.0 | 20.2 | 0.32 | 17.44 |
13 | 0.03768 | 80.0 | 1.52 | 0.0 | 0.404 | 7.274 | 38.3 | 7.3090 | 2.0 | 329.0 | 12.6 | 392.20 | 6.62 |
14 | 5.20177 | 0.0 | 18.10 | 1.0 | 0.770 | 6.127 | 83.4 | 2.7227 | 24.0 | 666.0 | 20.2 | 395.43 | 11.48 |
15 | 11.08740 | 0.0 | 18.10 | 0.0 | 0.718 | 6.411 | 100.0 | 1.8589 | 24.0 | 666.0 | 20.2 | 318.75 | 15.02 |
16 | 0.11432 | 0.0 | 8.56 | 0.0 | 0.520 | 6.781 | 71.3 | 2.8561 | 5.0 | 384.0 | 20.9 | 395.58 | 7.67 |
17 | 0.05602 | 0.0 | 2.46 | 0.0 | 0.488 | 7.831 | 53.6 | 3.1992 | 3.0 | 193.0 | 17.8 | 392.63 | 4.45 |
18 | 0.24103 | 0.0 | 7.38 | 0.0 | 0.493 | 6.083 | 43.7 | 5.4159 | 5.0 | 287.0 | 19.6 | 396.90 | 12.79 |
19 | 0.09378 | 12.5 | 7.87 | 0.0 | 0.524 | 5.889 | 39.0 | 5.4509 | 5.0 | 311.0 | 15.2 | 390.50 | 15.71 |
20 | 8.71675 | 0.0 | 18.10 | 0.0 | 0.693 | 6.471 | 98.8 | 1.7257 | 24.0 | 666.0 | 20.2 | 391.98 | 17.12 |
21 | 7.36711 | 0.0 | 18.10 | 0.0 | 0.679 | 6.193 | 78.1 | 1.9356 | 24.0 | 666.0 | 20.2 | 96.73 | 21.52 |
22 | 1.38799 | 0.0 | 8.14 | 0.0 | 0.538 | 5.950 | 82.0 | 3.9900 | 4.0 | 307.0 | 21.0 | 232.60 | 27.71 |
23 | 14.33370 | 0.0 | 18.10 | 0.0 | 0.614 | 6.229 | 88.0 | 1.9512 | 24.0 | 666.0 | 20.2 | 383.32 | 13.11 |
24 | 28.65580 | 0.0 | 18.10 | 0.0 | 0.597 | 5.155 | 100.0 | 1.5894 | 24.0 | 666.0 | 20.2 | 210.97 | 20.08 |
25 | 0.80271 | 0.0 | 8.14 | 0.0 | 0.538 | 5.456 | 36.6 | 3.7965 | 4.0 | 307.0 | 21.0 | 288.99 | 11.69 |
26 | 1.00245 | 0.0 | 8.14 | 0.0 | 0.538 | 6.674 | 87.3 | 4.2390 | 4.0 | 307.0 | 21.0 | 380.23 | 11.98 |
27 | 9.91655 | 0.0 | 18.10 | 0.0 | 0.693 | 5.852 | 77.8 | 1.5004 | 24.0 | 666.0 | 20.2 | 338.16 | 29.97 |
28 | 0.13158 | 0.0 | 10.01 | 0.0 | 0.547 | 6.176 | 72.5 | 2.7301 | 6.0 | 432.0 | 17.8 | 393.30 | 12.04 |
29 | 0.14231 | 0.0 | 10.01 | 0.0 | 0.547 | 6.254 | 84.2 | 2.2565 | 6.0 | 432.0 | 17.8 | 388.74 | 10.45 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
324 | 0.13117 | 0.0 | 8.56 | 0.0 | 0.520 | 6.127 | 85.2 | 2.1224 | 5.0 | 384.0 | 20.9 | 387.69 | 14.09 |
325 | 1.35472 | 0.0 | 8.14 | 0.0 | 0.538 | 6.072 | 100.0 | 4.1750 | 4.0 | 307.0 | 21.0 | 376.73 | 13.04 |
326 | 0.10153 | 0.0 | 12.83 | 0.0 | 0.437 | 6.279 | 74.5 | 4.0522 | 5.0 | 398.0 | 18.7 | 373.66 | 11.97 |
327 | 0.22927 | 0.0 | 6.91 | 0.0 | 0.448 | 6.030 | 85.5 | 5.6894 | 3.0 | 233.0 | 17.9 | 392.74 | 18.80 |
328 | 0.04666 | 80.0 | 1.52 | 0.0 | 0.404 | 7.107 | 36.6 | 7.3090 | 2.0 | 329.0 | 12.6 | 354.31 | 8.61 |
329 | 0.08014 | 0.0 | 5.96 | 0.0 | 0.499 | 5.850 | 41.5 | 3.9342 | 5.0 | 279.0 | 19.2 | 396.90 | 8.77 |
330 | 0.40771 | 0.0 | 6.20 | 1.0 | 0.507 | 6.164 | 91.3 | 3.0480 | 8.0 | 307.0 | 17.4 | 395.24 | 21.46 |
331 | 0.13642 | 0.0 | 10.59 | 0.0 | 0.489 | 5.891 | 22.3 | 3.9454 | 4.0 | 277.0 | 18.6 | 396.90 | 10.87 |
332 | 9.32909 | 0.0 | 18.10 | 0.0 | 0.713 | 6.185 | 98.7 | 2.2616 | 24.0 | 666.0 | 20.2 | 396.90 | 18.13 |
333 | 0.09103 | 0.0 | 2.46 | 0.0 | 0.488 | 7.155 | 92.2 | 2.7006 | 3.0 | 193.0 | 17.8 | 394.12 | 4.82 |
334 | 0.01301 | 35.0 | 1.52 | 0.0 | 0.442 | 7.241 | 49.3 | 7.0379 | 1.0 | 284.0 | 15.5 | 394.74 | 5.49 |
335 | 0.59005 | 0.0 | 21.89 | 0.0 | 0.624 | 6.372 | 97.9 | 2.3274 | 4.0 | 437.0 | 21.2 | 385.76 | 11.12 |
336 | 1.12658 | 0.0 | 19.58 | 1.0 | 0.871 | 5.012 | 88.0 | 1.6102 | 5.0 | 403.0 | 14.7 | 343.28 | 12.12 |
337 | 0.07886 | 80.0 | 4.95 | 0.0 | 0.411 | 7.148 | 27.7 | 5.1167 | 4.0 | 245.0 | 19.2 | 396.90 | 3.56 |
338 | 0.21719 | 0.0 | 10.59 | 1.0 | 0.489 | 5.807 | 53.8 | 3.6526 | 4.0 | 277.0 | 18.6 | 390.94 | 16.03 |
339 | 0.53700 | 0.0 | 6.20 | 0.0 | 0.504 | 5.981 | 68.1 | 3.6715 | 8.0 | 307.0 | 17.4 | 378.35 | 11.65 |
340 | 3.32105 | 0.0 | 19.58 | 1.0 | 0.871 | 5.403 | 100.0 | 1.3216 | 5.0 | 403.0 | 14.7 | 396.90 | 26.82 |
341 | 1.49632 | 0.0 | 19.58 | 0.0 | 0.871 | 5.404 | 100.0 | 1.5916 | 5.0 | 403.0 | 14.7 | 341.60 | 13.28 |
342 | 0.38735 | 0.0 | 25.65 | 0.0 | 0.581 | 5.613 | 95.6 | 1.7572 | 2.0 | 188.0 | 19.1 | 359.29 | 27.26 |
343 | 0.06617 | 0.0 | 3.24 | 0.0 | 0.460 | 5.868 | 25.8 | 5.2146 | 4.0 | 430.0 | 16.9 | 382.44 | 9.97 |
344 | 0.78570 | 20.0 | 3.97 | 0.0 | 0.647 | 7.014 | 84.6 | 2.1329 | 5.0 | 264.0 | 13.0 | 384.07 | 14.79 |
345 | 1.41385 | 0.0 | 19.58 | 1.0 | 0.871 | 6.129 | 96.0 | 1.7494 | 5.0 | 403.0 | 14.7 | 321.02 | 15.12 |
346 | 0.06047 | 0.0 | 2.46 | 0.0 | 0.488 | 6.153 | 68.8 | 3.2797 | 3.0 | 193.0 | 17.8 | 387.11 | 13.15 |
347 | 8.49213 | 0.0 | 18.10 | 0.0 | 0.584 | 6.348 | 86.1 | 2.0527 | 24.0 | 666.0 | 20.2 | 83.45 | 17.64 |
348 | 0.17134 | 0.0 | 10.01 | 0.0 | 0.547 | 5.928 | 88.2 | 2.4631 | 6.0 | 432.0 | 17.8 | 344.91 | 15.76 |
349 | 0.03871 | 52.5 | 5.32 | 0.0 | 0.405 | 6.209 | 31.3 | 7.3172 | 6.0 | 293.0 | 16.6 | 396.90 | 7.14 |
350 | 0.12650 | 25.0 | 5.13 | 0.0 | 0.453 | 6.762 | 43.4 | 7.9809 | 8.0 | 284.0 | 19.7 | 395.58 | 9.50 |
351 | 6.96215 | 0.0 | 18.10 | 0.0 | 0.700 | 5.713 | 97.0 | 1.9265 | 24.0 | 666.0 | 20.2 | 394.43 | 17.11 |
352 | 0.09164 | 0.0 | 10.81 | 0.0 | 0.413 | 6.065 | 7.8 | 5.2873 | 4.0 | 305.0 | 19.2 | 390.91 | 5.52 |
353 | 5.58107 | 0.0 | 18.10 | 0.0 | 0.713 | 6.436 | 87.9 | 2.3158 | 24.0 | 666.0 | 20.2 | 100.19 | 16.22 |
354 rows × 13 columns
#Write out the parameters
param = {'silent':True            #The default is False (prints the training process); it is usually switched to True manually
         ,'objective':'reg:linear'
         ,"eta":0.1}
num_round = 180   #n_estimators
#xgb.train: only the training data and the number of boosting rounds are passed directly; all other parameters go in through the param dict
bst = xgb.train(param, dtrain, num_round)
#The predict interface
preds = bst.predict(dtest)
preds
array([ 6.4613175, 22.123888 , 30.755163 , 13.424351 , 8.378565 , 23.608477 , 14.2151165, 16.026499 , 15.498961 , 14.10649 , 24.030867 , 34.36362 , 21.461111 , 28.839497 , 19.568035 , 10.188658 , 19.42369 , 23.539951 , 22.850523 , 23.198708 , 17.82486 , 16.07219 , 27.602034 , 20.773046 , 20.868807 , 15.865789 , 22.076588 , 29.292158 , 22.841051 , 15.770392 , 36.680496 , 21.057947 , 20.137005 , 23.777853 , 22.70615 , 23.863268 , 15.595315 , 24.565872 , 17.720552 , 33.95111 , 18.784286 , 20.483374 , 37.10668 , 18.068268 , 12.73839 , 31.186407 , 45.895035 , 12.696718 , 10.773068 , 36.064293 , 26.262571 , 19.908836 , 20.715096 , 48.814903 , 27.550056 , 25.225826 , 17.15366 , 21.215551 , 17.426773 , 18.478971 , 14.6453705, 22.841473 , 18.869593 , 29.990978 , 29.933191 , 18.756853 , 18.784918 , 16.33361 , 23.155968 , 19.144344 , 29.724382 , 42.121906 , 31.544363 , 23.017508 , 19.536028 , 23.851992 , 41.790577 , 28.676506 , 20.036425 , 21.723856 , 19.537868 , 46.349495 , 23.119637 , 8.071444 , 26.358177 , 24.85706 , 17.057547 , 20.084204 , 18.54005 , 7.157663 , 20.593962 , 15.451031 , 45.09552 , 34.435097 , 22.969654 , 10.10335 , 10.803318 , 18.42058 , 7.800361 , 11.79309 , 30.755335 , 10.80648 , 26.122625 , 22.589502 , 31.219454 , 42.283318 , 19.274109 , 7.3861685, 23.055706 , 14.315018 , 45.136368 , 21.243176 , 19.715647 , 24.533583 , 18.24247 , 28.382742 , 23.41182 , 19.962458 , 45.916683 , 17.521889 , 24.13039 , 26.147182 , 18.418781 , 17.606575 , 14.540631 , 20.595512 , 32.59128 , 10.155618 , 20.53032 , 21.477484 , 17.450048 , 20.154486 , 8.010227 , 30.482618 , 29.677181 , 20.357098 , 18.222181 , 14.14504 , 10.100547 , 18.85027 , 41.85804 , 17.44544 , 22.907183 , 21.02398 , 29.799366 , 20.219465 , 12.404763 , 45.750965 , 25.56757 , 22.000706 , 14.194921 , 27.102774 ], dtype=float32)
from sklearn.metrics import r2_score

r2_score(Ytest,preds)
0.9260984298390122
MSE(Ytest,preds)
6.87682821415069
import xgboost as xgb

#For convenience, use the full data here
dfull = xgb.DMatrix(X,y)
#Set the parameters
param1 = {'silent':True,'obj':'reg:linear',"gamma":0}
num_round = 100
n_fold = 5   #like KFold in sklearn
#Use xgboost's own cross validation, xgb.cv
time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round, n_fold)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:00:610364
#What does xgb.cv return?
cvresult1   #As the number of trees increases, how does the performance of the model change?
  | train-rmse-mean | train-rmse-std | test-rmse-mean | test-rmse-std |
---|---|---|---|---|
0 | 17.105578 | 0.129116 | 17.163215 | 0.584296 |
1 | 12.337973 | 0.097557 | 12.519736 | 0.473458 |
2 | 8.994071 | 0.065756 | 9.404534 | 0.472310 |
3 | 6.629481 | 0.050323 | 7.250335 | 0.500342 |
4 | 4.954406 | 0.033209 | 5.920812 | 0.591874 |
5 | 3.781454 | 0.029604 | 5.045190 | 0.687971 |
6 | 2.947767 | 0.038786 | 4.472030 | 0.686492 |
7 | 2.357748 | 0.042040 | 4.179314 | 0.737935 |
8 | 1.951907 | 0.044972 | 3.979878 | 0.798198 |
9 | 1.660895 | 0.044894 | 3.870751 | 0.812331 |
10 | 1.464296 | 0.049422 | 3.816196 | 0.835251 |
11 | 1.323362 | 0.056240 | 3.788125 | 0.841643 |
12 | 1.214468 | 0.046524 | 3.766973 | 0.848989 |
13 | 1.137311 | 0.044522 | 3.741199 | 0.872370 |
14 | 1.064629 | 0.042245 | 3.729194 | 0.879429 |
15 | 1.010286 | 0.038892 | 3.717997 | 0.879572 |
16 | 0.941258 | 0.038360 | 3.706736 | 0.878032 |
17 | 0.883599 | 0.056640 | 3.693886 | 0.873913 |
18 | 0.829674 | 0.057284 | 3.693296 | 0.883429 |
19 | 0.772332 | 0.042899 | 3.687510 | 0.880928 |
20 | 0.731557 | 0.049150 | 3.687037 | 0.879180 |
21 | 0.690698 | 0.041190 | 3.677507 | 0.882060 |
22 | 0.657743 | 0.042137 | 3.675343 | 0.883635 |
23 | 0.619988 | 0.054097 | 3.671006 | 0.879224 |
24 | 0.585414 | 0.052585 | 3.670951 | 0.867470 |
25 | 0.548723 | 0.054440 | 3.673598 | 0.863241 |
26 | 0.527266 | 0.049630 | 3.673988 | 0.867116 |
27 | 0.504405 | 0.040376 | 3.671702 | 0.864566 |
28 | 0.468534 | 0.033020 | 3.671324 | 0.862536 |
29 | 0.448633 | 0.032191 | 3.675074 | 0.864713 |
... | ... | ... | ... | ... |
70 | 0.071057 | 0.015411 | 3.668067 | 0.859435 |
71 | 0.067946 | 0.013960 | 3.667708 | 0.859370 |
72 | 0.065197 | 0.012475 | 3.668174 | 0.859307 |
73 | 0.062789 | 0.012538 | 3.668738 | 0.859471 |
74 | 0.060294 | 0.012669 | 3.668950 | 0.860112 |
75 | 0.058278 | 0.012055 | 3.669084 | 0.859966 |
76 | 0.055402 | 0.011065 | 3.669627 | 0.859505 |
77 | 0.053819 | 0.011072 | 3.669904 | 0.859294 |
78 | 0.051280 | 0.011215 | 3.670185 | 0.859204 |
79 | 0.048748 | 0.009988 | 3.670092 | 0.859250 |
80 | 0.046972 | 0.009233 | 3.669869 | 0.858892 |
81 | 0.044753 | 0.008664 | 3.669702 | 0.858676 |
82 | 0.043148 | 0.008636 | 3.669704 | 0.858921 |
83 | 0.041823 | 0.008355 | 3.669596 | 0.858843 |
84 | 0.040257 | 0.008378 | 3.669730 | 0.858459 |
85 | 0.038518 | 0.007731 | 3.669835 | 0.858698 |
86 | 0.036694 | 0.006928 | 3.669705 | 0.858958 |
87 | 0.034932 | 0.006174 | 3.669722 | 0.858715 |
88 | 0.033947 | 0.006206 | 3.669964 | 0.858547 |
89 | 0.032706 | 0.006176 | 3.669988 | 0.858516 |
90 | 0.031317 | 0.006171 | 3.670116 | 0.858512 |
91 | 0.029697 | 0.005473 | 3.669930 | 0.858759 |
92 | 0.028561 | 0.005599 | 3.669906 | 0.858549 |
93 | 0.027585 | 0.005694 | 3.669822 | 0.858554 |
94 | 0.026436 | 0.005414 | 3.669985 | 0.858390 |
95 | 0.025204 | 0.005145 | 3.669921 | 0.858313 |
96 | 0.024422 | 0.005242 | 3.669983 | 0.858255 |
97 | 0.023661 | 0.005117 | 3.669947 | 0.858331 |
98 | 0.022562 | 0.004704 | 3.669868 | 0.858578 |
99 | 0.021496 | 0.004738 | 3.669824 | 0.858305 |
100 rows × 4 columns
plt.figure(figsize=(20,5))
plt.grid()
plt.plot(range(1,101),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,101),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.legend()
plt.show()

#What can we see from this picture?
#How can we judge the generalization ability of the model from the graph?
#Given this picture, what should the goal of parameter tuning be?
![output_66_0.png](output_66_0.png)
#What is the default evaluation metric for a regression model in xgboost?
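One way to answer this is to look at the column names of the cv result computed above: they read train-rmse-mean, test-rmse-mean and so on, so the default metric for regression is rmse. A small sketch reusing `cvresult1` (the next cell then switches the metric to mae via `eval_metric`):

```python
# The column names of xgb.cv's result reveal the evaluation metric used by default.
print(cvresult1.columns.tolist())   # e.g. ['train-rmse-mean', 'train-rmse-std', 'test-rmse-mean', 'test-rmse-std']
```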
param1 = {'silent':True,'obj':'reg:linear',"gamma":0,"eval_metric":"mae"}
cvresult1 = xgb.cv(param1, dfull, num_round, n_fold)

plt.figure(figsize=(20,5))
plt.grid()
#num_round is still 100 at this point, so plot num_round points rather than a hard-coded 180
plt.plot(range(1,num_round+1),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,num_round+1),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.legend()
plt.show()
![output_68_0.png](output_68_0.png)
param1 = {'silent':True,'obj':'reg:linear',"gamma":0}
param2 = {'silent':True,'obj':'reg:linear',"gamma":20}
num_round = 180
n_fold = 5

time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round, n_fold)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:01:083104
time0 = time()
cvresult2 = xgb.cv(param2, dfull, num_round, n_fold)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:01:359378
plt.figure(figsize=(20,5))
plt.grid()
plt.plot(range(1,181),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,181),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.plot(range(1,181),cvresult2.iloc[:,0],c="green",label="train,gamma=20")
plt.plot(range(1,181),cvresult2.iloc[:,2],c="blue",label="test,gamma=20")
plt.legend()
plt.show()

#Can you see from here how gamma controls overfitting? It restrains learning on the training set, i.e. it lowers performance on the training set
![output_71_0.png](output_71_0.png)
import xgboost as xgb
import matplotlib.pyplot as plt
from time import time
import datetime
from sklearn.datasets import load_breast_cancer

data2 = load_breast_cancer()
x2 = data2.data
y2 = data2.target
dfull2 = xgb.DMatrix(x2,y2)

param1 = {'silent':True,'obj':'binary:logistic',"gamma":0,"nfold":5
          ,"eval_metrics":"error"
          }
param2 = {'silent':True,'obj':'binary:logistic',"gamma":1,"nfold":5}
num_round = 100
time0 = time()
cvresult1 = xgb.cv(param1, dfull2, num_round, metrics=("error"))
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:00:271581
time0 = time()
cvresult2 = xgb.cv(param2, dfull2, num_round, metrics=("error"))
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:00:443810
plt.figure(figsize=(20,5))
plt.grid()
plt.plot(range(1,101),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,101),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.plot(range(1,101),cvresult2.iloc[:,0],c="green",label="train,gamma=1")
plt.plot(range(1,101),cvresult2.iloc[:,2],c="blue",label="test,gamma=1")
plt.legend()
plt.show()

![output_76_0.png](output_76_0.png)
dfull = xgb.DMatrix(X,y)

param1 = {'silent':True
          ,'obj':'reg:linear'
          ,"subsample":1
          ,"max_depth":6
          ,"eta":0.3
          ,"gamma":0
          ,"lambda":1
          ,"alpha":0
          ,"colsample_bytree":1
          ,"colsample_bylevel":1
          ,"colsample_bynode":1
          ,"nfold":5}
num_round = 200
time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

fig,ax = plt.subplots(1,figsize=(15,8))
ax.set_ylim(top=5)
ax.grid()
ax.plot(range(1,201),cvresult1.iloc[:,0],c="red",label="train,original")
ax.plot(range(1,201),cvresult1.iloc[:,2],c="orange",label="test,original")
ax.legend(fontsize="xx-large")
plt.show()
00:00:513584
![output_78_1.png](output_78_1.png)
param1 = {'silent':True
          ,'obj':'reg:linear'
          ,"subsample":1
          ,"max_depth":6
          ,"eta":0.3
          ,"gamma":0
          ,"lambda":1
          ,"alpha":0
          ,"colsample_bytree":1
          ,"colsample_bylevel":1
          ,"colsample_bynode":1
          ,"nfold":5}
num_round = 200

time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

fig,ax = plt.subplots(1,figsize=(15,8))
ax.set_ylim(top=5)
ax.grid()
ax.plot(range(1,201),cvresult1.iloc[:,0],c="red",label="train,original")
ax.plot(range(1,201),cvresult1.iloc[:,2],c="orange",label="test,original")

param2 = {'silent':True
          ,'obj':'reg:linear'
          ,"max_depth":2
          ,"eta":0.05
          ,"gamma":0
          ,"lambda":1
          ,"alpha":0
          ,"colsample_bytree":1
          ,"colsample_bylevel":0.4
          ,"colsample_bynode":1
          ,"nfold":5}

param3 = {'silent':True
          ,'obj':'reg:linear'
          ,"subsample":1
          ,"eta":0.05
          ,"gamma":20
          ,"lambda":3.5
          ,"alpha":0.2
          ,"max_depth":4
          ,"colsample_bytree":0.4
          ,"colsample_bylevel":0.6
          ,"colsample_bynode":1
          ,"nfold":5}

time0 = time()
cvresult2 = xgb.cv(param2, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

time0 = time()
cvresult3 = xgb.cv(param3, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

ax.plot(range(1,201),cvresult2.iloc[:,0],c="green",label="train,last")
ax.plot(range(1,201),cvresult2.iloc[:,2],c="blue",label="test,last")
ax.plot(range(1,201),cvresult3.iloc[:,0],c="gray",label="train,this")
ax.plot(range(1,201),cvresult3.iloc[:,2],c="pink",label="test,this")
ax.legend(fontsize="xx-large")
plt.show()

00:00:532621
00:00:223373
00:00:259346

![output_79_1.png](output_79_1.png)
import pickle
dtrain = xgb.DMatrix(Xtrain,Ytrain)

#Set the parameters and train the model
param = {'silent':True
         ,'obj':'reg:linear'
         ,"subsample":1
         ,"eta":0.05
         ,"gamma":20
         ,"lambda":3.5
         ,"alpha":0.2
         ,"max_depth":4
         ,"colsample_bytree":0.4
         ,"colsample_bylevel":0.6
         ,"colsample_bynode":1}
num_round = 180

bst = xgb.train(param, dtrain, num_round)
#Save the model
pickle.dump(bst, open("xgboostonboston.dat","wb"))
#Note: with open we usually use "w" or "r" as the mode, but those only work for text files (.txt)
#When what we want to save is not text but the model object itself, we use "wb" and "rb" as the mode
#"wb" means write in binary and "rb" means read in binary; the file saved this way holds the model and can be loaded and called later
#See where the model is saved
import sys
sys.path
['C:\\Pythonwork\\micro-class\\11 xgboost', 'C:\\Python\\python37.zip', 'C:\\Python\\DLLs', 'C:\\Python\\lib', 'C:\\Python', '', 'C:\\Python\\lib\\site-packages', 'C:\\Python\\lib\\site-packages\\win32', 'C:\\Python\\lib\\site-packages\\win32\\lib', 'C:\\Python\\lib\\site-packages\\Pythonwin', 'C:\\Python\\lib\\site-packages\\IPython\\extensions', 'C:\\Users\\Shuyu\\.ipython']
#After reopening jupyter lab
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pickle
import xgboost as xgb

data = load_boston()
X = data.data
y = data.target
Xtrain,Xtest,Ytrain,Ytest = TTS(X,y,test_size=0.3,random_state=420)
#Note: if the saved model was built with the xgboost library itself, the data passed in for prediction must also be xgboost's own data type (DMatrix)
dtest = xgb.DMatrix(Xtest,Ytest)
#Load the model
loaded_model = pickle.load(open("xgboostonboston.dat", "rb"))
print("Loaded model from: xgboostonboston.dat")
Loaded model from: xgboostonboston.dat
#Make predictions by calling the predict interface directly
ypreds = loaded_model.predict(dtest)
ypreds
array([ 9.244746, 22.536953, 28.47614 , 13.126131, 9.944413, 21.356094, 15.187935, 15.559099, 15.629611, 15.555439, 21.427156, 35.502792, 20.827318, 29.397932, 21.669186, 11.906522, 21.464252, 26.143337, 26.300356, 23.474188, 18.186035, 15.851086, 22.928507, 22.919674, 20.557487, 16.27315 , 22.000988, 25.230766, 23.12165 , 16.663473, 34.747093, 20.003593, 20.617601, 23.74025 , 23.044952, 24.849056, 15.414761, 23.383522, 18.500463, 33.790466, 18.009186, 18.729418, 33.181175, 18.834534, 15.085677, 27.601177, 42.75243 , 15.359873, 10.37829 , 37.5367 , 27.097404, 20.73775 , 20.198935, 46.20087 , 26.959623, 24.566458, 18.678255, 20.913795, 17.369501, 17.823708, 15.136806, 24.533068, 19.465569, 30.474009, 29.571526, 19.773672, 21.554045, 17.590807, 22.250225, 18.275839, 29.012346, 40.198055, 30.235825, 23.174484, 20.191778, 23.742437, 38.217915, 27.173447, 21.068003, 20.5974 , 18.412853, 45.326836, 22.941956, 9.055015, 27.04054 , 23.45833 , 17.310354, 20.762442, 15.6619 , 12.178641, 21.293903, 19.826134, 41.0362 , 31.300192, 24.400661, 11.267941, 15.763796, 20.984198, 9.232577, 11.090055, 32.739227, 16.265066, 24.975492, 24.905188, 34.348663, 41.02216 , 20.181097, 8.897793, 22.894953, 15.023113, 45.222473, 21.289068, 22.882399, 24.792355, 19.141815, 27.372849, 24.132881, 19.243576, 43.235798, 17.438314, 24.561804, 24.187195, 17.001463, 18.172377, 15.483843, 23.802166, 31.079023, 10.322498, 21.977345, 19.267714, 15.559681, 19.336842, 8.979549, 28.35794 , 29.80491 , 21.987814, 19.893597, 19.730898, 10.501988, 17.405378, 40.51527 , 17.420282, 24.272373, 19.771631, 32.620422, 19.19032 , 12.364113, 38.63305 , 24.189354, 23.38174 , 16.924698, 22.633028], dtype=float32)
from sklearn.metrics import mean_squared_error as MSE, r2_score

MSE(Ytest,ypreds)
9.107608696116197
r2_score(Ytest,ypreds)
0.9021254331073938
bst = xgb.train(param, dtrain, num_round)
import joblib

#Again, you can check where the model is saved
joblib.dump(bst,"xgboost-boston.dat")
['xgboost-boston.dat']
loaded_model = joblib.load("xgboost-boston.dat")
dtest = xgb.DMatrix(Xtest,Ytest)
ypreds = loaded_model.predict(dtest)
ypreds
array([ 9.244746, 22.536953, 28.47614 , 13.126131, 9.944413, 21.356094, 15.187935, 15.559099, 15.629611, 15.555439, 21.427156, 35.502792, 20.827318, 29.397932, 21.669186, 11.906522, 21.464252, 26.143337, 26.300356, 23.474188, 18.186035, 15.851086, 22.928507, 22.919674, 20.557487, 16.27315 , 22.000988, 25.230766, 23.12165 , 16.663473, 34.747093, 20.003593, 20.617601, 23.74025 , 23.044952, 24.849056, 15.414761, 23.383522, 18.500463, 33.790466, 18.009186, 18.729418, 33.181175, 18.834534, 15.085677, 27.601177, 42.75243 , 15.359873, 10.37829 , 37.5367 , 27.097404, 20.73775 , 20.198935, 46.20087 , 26.959623, 24.566458, 18.678255, 20.913795, 17.369501, 17.823708, 15.136806, 24.533068, 19.465569, 30.474009, 29.571526, 19.773672, 21.554045, 17.590807, 22.250225, 18.275839, 29.012346, 40.198055, 30.235825, 23.174484, 20.191778, 23.742437, 38.217915, 27.173447, 21.068003, 20.5974 , 18.412853, 45.326836, 22.941956, 9.055015, 27.04054 , 23.45833 , 17.310354, 20.762442, 15.6619 , 12.178641, 21.293903, 19.826134, 41.0362 , 31.300192, 24.400661, 11.267941, 15.763796, 20.984198, 9.232577, 11.090055, 32.739227, 16.265066, 24.975492, 24.905188, 34.348663, 41.02216 , 20.181097, 8.897793, 22.894953, 15.023113, 45.222473, 21.289068, 22.882399, 24.792355, 19.141815, 27.372849, 24.132881, 19.243576, 43.235798, 17.438314, 24.561804, 24.187195, 17.001463, 18.172377, 15.483843, 23.802166, 31.079023, 10.322498, 21.977345, 19.267714, 15.559681, 19.336842, 8.979549, 28.35794 , 29.80491 , 21.987814, 19.893597, 19.730898, 10.501988, 17.405378, 40.51527 , 17.420282, 24.272373, 19.771631, 32.620422, 19.19032 , 12.364113, 38.63305 , 24.189354, 23.38174 , 16.924698, 22.633028], dtype=float32)
MSE(Ytest, ypreds)
9.107608696116197
r2_score(Ytest,ypreds)
0.9021254331073938
#Using the sklearn-style model
from xgboost import XGBRegressor as XGBR

bst = XGBR(n_estimators=200
           ,eta=0.05,gamma=20
           ,reg_lambda=3.5
           ,reg_alpha=0.2
           ,max_depth=4
           ,colsample_bytree=0.4
           ,colsample_bylevel=0.6).fit(Xtrain,Ytrain)   #Training completed
joblib.dump(bst,"xgboost-boston-sklearn.dat")
['xgboost-boston-sklearn.dat']
loaded_model = joblib.load("xgboost-boston-sklearn.dat")
#Here Xtest (a plain numpy array) can be passed in directly, because the model uses the sklearn API
ypreds = loaded_model.predict(Xtest)
Xtest
array([[4.15292e+01, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01, 3.29460e+02, 2.73800e+01], [2.73100e-02, 0.00000e+00, 7.07000e+00, ..., 1.78000e+01, 3.96900e+02, 9.14000e+00], [3.15000e-02, 9.50000e+01, 1.47000e+00, ..., 1.70000e+01, 3.96900e+02, 4.56000e+00], ..., [5.08300e-02, 0.00000e+00, 5.19000e+00, ..., 2.02000e+01, 3.89710e+02, 5.68000e+00], [3.77498e+00, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01, 2.20100e+01, 1.71500e+01], [1.96091e+01, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01, 3.96900e+02, 1.34400e+01]])
dtest
<xgboost.core.DMatrix at 0x29e30670668>
ypreds
array([ 9.350334 , 21.501623 , 30.219057 , 13.021226 , 9.883689 , 20.977922 , 16.023008 , 15.8910475, 15.512305 , 15.706607 , 22.096102 , 35.381573 , 20.3307 , 27.129421 , 19.997156 , 10.935587 , 20.25071 , 26.188572 , 26.711943 , 22.600443 , 18.23832 , 15.876045 , 26.263977 , 22.706024 , 20.18491 , 15.891692 , 21.4781 , 29.047956 , 23.371012 , 17.167185 , 35.699898 , 20.490337 , 20.195292 , 23.81444 , 23.106022 , 25.709312 , 15.0182905, 22.621248 , 18.576109 , 34.25664 , 17.46115 , 19.159126 , 34.79234 , 17.766731 , 17.141891 , 27.755646 , 39.786766 , 22.49913 , 10.246634 , 36.76105 , 26.294876 , 20.75917 , 19.893272 , 46.62629 , 26.549704 , 24.040398 , 17.769514 , 20.76889 , 16.139618 , 17.494894 , 16.005596 , 24.28487 , 19.15237 , 31.407684 , 27.862312 , 18.877817 , 20.50497 , 16.094156 , 22.622025 , 17.762297 , 28.518019 , 41.146317 , 32.52681 , 23.117966 , 19.125128 , 24.141544 , 39.041847 , 25.901724 , 20.974117 , 19.626917 , 18.567612 , 46.46465 , 23.03303 , 9.912106 , 26.407642 , 23.466772 , 16.985506 , 20.73746 , 15.679997 , 11.697191 , 21.320868 , 20.333689 , 41.616425 , 31.659132 , 25.605923 , 12.362759 , 14.593165 , 20.577328 , 9.253377 , 11.1253805, 32.878246 , 15.840851 , 24.695955 , 24.882996 , 34.643425 , 41.556873 , 19.726238 , 8.808649 , 23.04128 , 14.709186 , 46.10303 , 21.435535 , 21.97892 , 24.299171 , 19.591938 , 27.527737 , 23.80468 , 18.782711 , 44.266346 , 17.328068 , 23.030151 , 23.801643 , 16.483137 , 18.219353 , 15.713125 , 23.655058 , 32.294373 , 10.60579 , 22.099716 , 19.26955 , 14.293162 , 19.386055 , 8.824598 , 26.909697 , 29.539446 , 20.38691 , 20.832077 , 22.507433 , 11.142808 , 17.685743 , 40.230915 , 17.526121 , 23.09964 , 19.899158 , 31.775164 , 19.718151 , 12.164877 , 40.867558 , 24.465397 , 22.134802 , 15.041253 , 28.63522 ], dtype=float32)
MSE(Ytest, ypreds)
10.198269690947479
r2_score(Ytest,ypreds)
0.8904046866351292
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from xgboost import XGBClassifier as XGBC
from sklearn.datasets import make_blobs   #To create our own data set
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import confusion_matrix as cm, recall_score as recall, roc_auc_score as auc
class_1 = 500   #Category 1 has 500 samples
class_2 = 50    #Category 2 has only 50
centers = [[0.0, 0.0], [2.0, 2.0]]   #Centers of the two categories
clusters_std = [1.5, 0.5]   #Standard deviations of the two clusters; the class with more samples is usually more spread out
X, y = make_blobs(n_samples=[class_1, class_2],
                  centers=centers,
                  cluster_std=clusters_std,
                  random_state=0, shuffle=False)
X.shape
(550, 2)
y
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
(y == 1).sum() / y.shape[0] #9%
0.09090909090909091
Xtrain, Xtest, Ytrain, Ytest = TTS(X,y,test_size=0.3,random_state=420)
#Modeling with the sklearn API
clf = XGBC().fit(Xtrain,Ytrain)
ypred = clf.predict(Xtest)
ypred
array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
clf.score(Xtest,Ytest) #Default model evaluation indicator - Accuracy
0.9272727272727272
cm(Ytest,ypred,labels=[1,0])   #The minority class (label 1) is listed first
array([[ 9, 4], [ 8, 144]], dtype=int64)
recall(Ytest,ypred)
0.6923076923076923
auc(Ytest,clf.predict_proba(Xtest)[:,1])
0.9671052631578947
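The usual recommendation is to set `scale_pos_weight` to the ratio of negative to positive samples. A small sketch of computing it from the training labels (the variable name `spw` is just for illustration; for this 500-vs-50 blob data it comes out close to 10, which is why 10 is tried next):

```python
# scale_pos_weight is conventionally set to (number of negative samples) / (number of positive samples).
spw = (Ytrain == 0).sum() / (Ytrain == 1).sum()
print(spw)
```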
#Set scale_pos_weight to the negative / positive sample ratio
clf_ = XGBC(scale_pos_weight=10).fit(Xtrain,Ytrain)
ypred_ = clf_.predict(Xtest)

clf_.score(Xtest,Ytest)
cm(Ytest,ypred_,labels=[1,0])
recall(Ytest,ypred_)
auc(Ytest,clf_.predict_proba(Xtest)[:,1])   #only this last expression is displayed below
0.9696356275303644
#How do the recall, auc and accuracy of the model change as the sample weight gradually increases?
for i in [1,5,10,20,30]:
    clf_ = XGBC(scale_pos_weight=i).fit(Xtrain,Ytrain)
    ypred_ = clf_.predict(Xtest)
    print(i)
    print("\tAccuracy:{}".format(clf_.score(Xtest,Ytest)))
    print("\tRecall:{}".format(recall(Ytest,ypred_)))
    print("\tAUC:{}".format(auc(Ytest,clf_.predict_proba(Xtest)[:,1])))
1
	Accuracy:0.9272727272727272
	Recall:0.6923076923076923
	AUC:0.9671052631578947
5
	Accuracy:0.9454545454545454
	Recall:0.9230769230769231
	AUC:0.9665991902834008
10
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9696356275303644
20
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9706477732793523
30
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9701417004048584
#Negative / positive sample ratio of 20
clf_ = XGBC(scale_pos_weight=20).fit(Xtrain,Ytrain)
ypred_ = clf_.predict(Xtest)
clf_.score(Xtest,Ytest)
0.9515151515151515
cm(Ytest,ypred_,labels=[1,0])
array([[ 13, 0], [ 8, 144]], dtype=int64)
recall(Ytest,ypred_)
1.0
auc(Ytest,clf_.predict_proba(Xtest)[:,1])
0.9706477732793523
dtrain = xgb.DMatrix(Xtrain,Ytrain)
dtest = xgb.DMatrix(Xtest,Ytest)
#Take a look at the predict interface of the xgboost library
param = {'silent':True,'objective':'binary:logistic',"eta":0.1,"scale_pos_weight":1}
num_round = 100
bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dtest)
#See what preds returns
preds
array([0.00110357, 0.00761518, 0.00110357, 0.00110357, 0.93531454, 0.00466839, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00410493, 0.00454478, 0.00571528, 0.00751026, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00712637, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00793251, 0.00466839, 0.00110357, 0.00339395, 0.00657186, 0.00110357, 0.00457053, 0.00571528, 0.0026763 , 0.00110357, 0.00110357, 0.00110357, 0.00884932, 0.00712637, 0.00110357, 0.00712637, 0.00466839, 0.00110357, 0.00110357, 0.00712637, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.63748044, 0.00110357, 0.00793251, 0.00110357, 0.00451971, 0.00644181, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00751026, 0.00712637, 0.00110357, 0.00866458, 0.00110357, 0.00110357, 0.00110357, 0.91610426, 0.00110357, 0.00110357, 0.89246494, 0.0026763 , 0.00501714, 0.00761518, 0.00884932, 0.00339395, 0.00110357, 0.93531454, 0.00110357, 0.00110357, 0.00110357, 0.82530665, 0.00751026, 0.00110357, 0.35174078, 0.00110357, 0.00110357, 0.70393246, 0.00110357, 0.76804197, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.96656513, 0.00110357, 0.00571528, 0.25400913, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00457053, 0.00110357, 0.00110357, 0.00110357, 0.89246494, 0.00110357, 0.9518535 , 0.0026763 , 0.00712637, 0.00110357, 0.00501714, 0.00110357, 0.00110357, 0.00571528, 0.00110357, 0.00110357, 0.00712637, 0.00110357, 0.00110357, 0.00712637, 0.00110357, 0.25136763, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.8904051 , 0.3876418 , 0.00110357, 0.00457053, 0.00657186, 0.9366597 , 0.00866458, 0.00110357, 0.00501714, 0.00501714, 0.00110357, 0.00110357, 0.00368543, 0.00501714, 0.9830577 , 0.00110357, 0.00644181, 0.00110357, 0.00571528, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00466839, 0.00110357, 0.00110357, 0.92388713, 0.90231985, 0.80084217], dtype=float32)
#Set our own threshold
ypred = preds.copy()
ypred[preds > 0.5] = 1
ypred
array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.], dtype=float32)
ypred[ypred != 1] = 0
#Write out the parameters
scale_pos_weight = [1,5,10]
names = ["negative vs positive: 1"
         ,"negative vs positive: 5"
         ,"negative vs positive: 10"]
[*zip(names,scale_pos_weight)]
[('negative vs positive: 1', 1), ('negative vs positive: 5', 5), ('negative vs positive: 10', 10)]
#Import the model evaluation metrics
from sklearn.metrics import accuracy_score as accuracy, recall_score as recall, roc_auc_score as auc

for name,i in zip(names,scale_pos_weight):
    param = {'silent':True,'objective':'binary:logistic'
             ,"eta":0.1,"scale_pos_weight":i}
    num_round = 100
    clf = xgb.train(param, dtrain, num_round)
    preds = clf.predict(dtest)
    ypred = preds.copy()
    ypred[preds > 0.5] = 1
    ypred[ypred != 1] = 0
    print(name)
    print("\tAccuracy:{}".format(accuracy(Ytest,ypred)))
    print("\tRecall:{}".format(recall(Ytest,ypred)))
    print("\tAUC:{}".format(auc(Ytest,preds)))
negative vs positive: 1
	Accuracy:0.9272727272727272
	Recall:0.6923076923076923
	AUC:0.9741902834008097
negative vs positive: 5
	Accuracy:0.9393939393939394
	Recall:0.8461538461538461
	AUC:0.9635627530364372
negative vs positive: 10
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9665991902834008
#Of course, we can also try different thresholds
for name,i in zip(names,scale_pos_weight):
    for thres in [0.3,0.5,0.7,0.9]:
        param = {'silent':True,'objective':'binary:logistic'
                 ,"eta":0.1,"scale_pos_weight":i}
        clf = xgb.train(param, dtrain, num_round)
        preds = clf.predict(dtest)
        ypred = preds.copy()
        ypred[preds > thres] = 1
        ypred[ypred != 1] = 0
        print("{},thresholds:{}".format(name,thres))
        print("\tAccuracy:{}".format(accuracy(Ytest,ypred)))
        print("\tRecall:{}".format(recall(Ytest,ypred)))
        print("\tAUC:{}".format(auc(Ytest,preds)))
negative vs positive: 1,thresholds:0.3
	Accuracy:0.9393939393939394
	Recall:0.8461538461538461
	AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.5
	Accuracy:0.9272727272727272
	Recall:0.6923076923076923
	AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.7
	Accuracy:0.9212121212121213
	Recall:0.6153846153846154
	AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.9
	Accuracy:0.9515151515151515
	Recall:0.5384615384615384
	AUC:0.9741902834008097
negative vs positive: 5,thresholds:0.3
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.5
	Accuracy:0.9393939393939394
	Recall:0.8461538461538461
	AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.7
	Accuracy:0.9272727272727272
	Recall:0.6923076923076923
	AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.9
	Accuracy:0.9212121212121213
	Recall:0.6153846153846154
	AUC:0.9635627530364372
negative vs positive: 10,thresholds:0.3
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.5
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.7
	Accuracy:0.9393939393939394
	Recall:0.8461538461538461
	AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.9
	Accuracy:0.9212121212121213
	Recall:0.6153846153846154
	AUC:0.9665991902834008