Ensemble learning - XGBoost learning

Posted by Sir Mildred Pierce on Sat, 19 Feb 2022 15:18:46 +0100

XGBoost-related learning notes

from xgboost import XGBRegressor as XGBR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LinearRegression as LinearR
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import datetime
data = load_boston()
#The Boston data set is very simple, but it involves many problems
data
{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
        19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
        20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
        23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
        33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
        21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
        20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
        23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
        15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,
        17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,
        25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,
        23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,
        32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,
        34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,
        20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,
        26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,
        31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1,
        22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,
        42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,
        36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,
        32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,
        20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,
        20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,
        22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,
        21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6,
        19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,
        32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,
        18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,
        16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,
        13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3,  8.8,
         7.2, 10.5,  7.4, 10.2, 11.5, 15.1, 23.2,  9.7, 13.8, 12.7, 13.1,
        12.5,  8.5,  5. ,  6.3,  5.6,  7.2, 12.1,  8.3,  8.5,  5. , 11.9,
        27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3,  7. ,  7.2,  7.5, 10.4,
         8.8,  8.4, 16.7, 14.2, 20.8, 13.4, 11.7,  8.3, 10.2, 10.9, 11. ,
         9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4,  9.6,  8.7,  8.4, 12.8,
        10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,
        15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,
        19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,
        29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,
        20.6, 21.2, 19.1, 20.6, 15.2,  7. ,  8.1, 13.6, 20.1, 21.8, 24.5,
        23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9]),
 'feature_names': array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
        'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7'),
 'DESCR': ".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:**  \n\n    :Number of Instances: 506 \n\n    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n\n    :Attribute Information (in order):\n        - CRIM     per capita crime rate by town\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n        - INDUS    proportion of non-retail business acres per town\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n        - NOX      nitric oxides concentration (parts per 10 million)\n        - RM       average number of rooms per dwelling\n        - AGE      proportion of owner-occupied units built prior to 1940\n        - DIS      weighted distances to five Boston employment centres\n        - RAD      index of accessibility to radial highways\n        - TAX      full-value property-tax rate per $10,000\n        - PTRATIO  pupil-teacher ratio by town\n        - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n        - LSTAT    % lower status of the population\n        - MEDV     Median value of owner-occupied homes in $1000's\n\n    :Missing Attribute Values: None\n\n    :Creator: Harrison, D. and Rubinfeld, D.L.\n\nThis is a copy of UCI ML housing dataset.\nhttps://archive.ics.uci.edu/ml/machine-learning-databases/housing/\n\n\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\nprices and the demand for clean air', J. Environ. Economics & Management,\nvol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n...', Wiley, 1980.   N.B. Various transformations are used in the table on\npages 244-261 of the latter.\n\nThe Boston house-price data has been used in many machine learning papers that address regression\nproblems.   \n     \n.. topic:: References\n\n   - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n   - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\n",
 'filename': 'f:\\Anaconda3\\lib\\site-packages\\sklearn\\datasets\\data\\boston_house_prices.csv'}
X = data.data
y = data.target
X.shape
(506, 13)
y.shape
(506,)
y
array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4,
       17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7,
       25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4,
       23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. ,
       32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3,
       34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4,
       20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. ,
       26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3,
       31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1,
       22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6,
       42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. ,
       36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4,
       32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. ,
       20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1,
       20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2,
       22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1,
       21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6,
       19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7,
       32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1,
       18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8,
       16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8,
       13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3,  8.8,
        7.2, 10.5,  7.4, 10.2, 11.5, 15.1, 23.2,  9.7, 13.8, 12.7, 13.1,
       12.5,  8.5,  5. ,  6.3,  5.6,  7.2, 12.1,  8.3,  8.5,  5. , 11.9,
       27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3,  7. ,  7.2,  7.5, 10.4,
        8.8,  8.4, 16.7, 14.2, 20.8, 13.4, 11.7,  8.3, 10.2, 10.9, 11. ,
        9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4,  9.6,  8.7,  8.4, 12.8,
       10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4,
       15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7,
       19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2,
       29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8,
       20.6, 21.2, 19.1, 20.6, 15.2,  7. ,  8.1, 13.6, 20.1, 21.8, 24.5,
       23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9])
Xtrain,Xtest,Ytrain,Ytest = TTS(X,y,test_size=0.3,random_state=420)
reg = XGBR(n_estimators=100).fit(Xtrain,Ytrain) #train
reg.predict(Xtest) #Traditional interface predict
reg.score(Xtest,Ytest) #Which evaluation metric does score return here? Shift+Tab on the method shows that for regressors it is R^2
y.mean()
MSE(Ytest,reg.predict(Xtest))#It can be seen that the mean square error is about 1 / 3 of the average value y.mean(), and the result is neither good nor bad
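As a quick check (a small sketch, assuming the fitted reg from above is still in memory): for regressors, score is the same R^2 that sklearn.metrics.r2_score computes, and the MSE can be put next to the scale of the target.

from sklearn.metrics import r2_score
pred = reg.predict(Xtest)
print(reg.score(Xtest,Ytest))     #R^2 via the estimator interface
print(r2_score(Ytest,pred))       #same value via the metrics module
print(MSE(Ytest,pred)/y.mean())   #MSE relative to the average house price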
reg.feature_importances_ #An advantage of tree models: the importance score of each feature can be inspected and used with SelectFromModel for feature selection
#xgboost can therefore be used for embedded feature selection (see the sketch below)
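A minimal sketch of that embedded feature selection, using sklearn's SelectFromModel on the fitted regressor (the "median" threshold here is only an illustrative choice):

from sklearn.feature_selection import SelectFromModel
selector = SelectFromModel(reg,threshold="median",prefit=True) #keep features whose importance is above the median
X_selected = selector.transform(Xtrain)
print(Xtrain.shape,"->",X_selected.shape)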
reg = XGBR(n_estimators=100) #Untrained models imported in cross validation
CVS(reg,Xtrain,Ytrain,cv=5).mean()
#Which evaluation metric is returned here, remember? The same one as reg.score: R^2 for regression, accuracy for classification
0.8017863029875325
#Rigorous vs. non-rigorous cross validation: should we cross-validate on the training set or on the full data?
CVS(reg,Xtrain,Ytrain,cv=5)
array([0.83340801, 0.77096033, 0.83473392, 0.80424149, 0.76558778])
#Rigorous vs. not rigorous
CVS(reg,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()
-16.041115480238048
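On the rigorous vs. non-rigorous point: cross-validating on the full data lets the final test set take part in model selection, while cross-validating only on the training split keeps Xtest untouched until the very end. A small sketch contrasting the two (using the unfitted reg defined above):

print(CVS(reg,X,y,cv=5).mean())             #non-rigorous: the whole data set participates in selection
print(CVS(reg,Xtrain,Ytrain,cv=5).mean())   #rigorous: only the training split is used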
#Let's take a look at all the model evaluation indicators in sklearn
import sklearn
sorted(sklearn.metrics.SCORERS.keys())
['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'brier_score_loss',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'mutual_info_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'v_measure_score']
#A comparison was made using random forest and linear regression
rfr = RFR(n_estimators=100)
CVS(rfr,Xtrain,Ytrain,cv=5).mean()#0.7975497480638329
0.7975497480638329
CVS(rfr,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()#-16.998723616338033
-16.998723616338033
lr = LinearR()
CVS(lr,Xtrain,Ytrain,cv=5).mean()#0.6835070597278085
0.6835070597278085
CVS(lr,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()#-25.34950749364844
-25.34950749364844
#The silent parameter: when the data set is huge and training is expected to be slow, printing the training process lets you monitor progress
reg = XGBR(n_estimators=10,silent=True)#In the native xgboost library, silent=True suppresses the training log and only returns the result; the default is False, which prints the training process
#In the sklearn wrapper, the default is silent=True, so nothing is printed; set it to False manually if you want the log
CVS(reg,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()#-92.67865836936579
-92.67865836936579
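Note: in recent xgboost releases the silent parameter has been removed; assuming xgboost >= 1.0, the equivalent knob is verbosity.

reg = XGBR(n_estimators=10,verbosity=0) #0 = silent, 1 = warnings, 2 = info, 3 = debug
CVS(reg,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()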
def plot_learning_curve(estimator,title, X, y, 
                        ax=None, #Select the subplot to draw on
                        ylim=None, #Set the range of the y-axis
                        cv=None, #Cross validation splitter
                        n_jobs=None #Number of parallel jobs
                       ):
    
    from sklearn.model_selection import learning_curve
    import matplotlib.pyplot as plt
    import numpy as np
    
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y
                                                            ,shuffle=True
                                                            ,cv=cv
                                                            ,random_state=420
                                                            ,n_jobs=n_jobs)      
    if ax is None:
        ax = plt.gca() #Use the current axes if no subplot is passed in
    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    ax.grid() #Draw grid lines (optional)
    ax.plot(train_sizes, np.mean(train_scores, axis=1), 'o-'
            , color="r",label="Training score")
    ax.plot(train_sizes, np.mean(test_scores, axis=1), 'o-'
            , color="g",label="Test score")
    ax.legend(loc="best")
    return ax
cv = KFold(n_splits=5, shuffle = True, random_state=42) #Cross validation mode
plot_learning_curve(XGBR(n_estimators=100,random_state=420)
                    ,"XGB",Xtrain,Ytrain,ax=None,cv=cv)
plt.show()

[Figure: output_28_0.png]

#=====[TIME WARNING: 25 seconds]=====#

axisx = range(10,1010,50)
rs = []
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    rs.append(CVS(reg,Xtrain,Ytrain,cv=cv).mean())
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="red",label="XGB")
plt.legend()
plt.show()
660 0.8046775284172915

[Figure: output_29_1.png]

#The selected n_estimators is unusually large. Should we simply choose the n_estimators value with the highest R^2?
#======[TIME WARNING: 20s]=======#
axisx = range(50,1050,50)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    #Record the mean R^2 (1 - R^2 will be treated as the bias)
    rs.append(cvresult.mean())
    #Record variance
    var.append(cvresult.var())
    #Calculate the controllable part of generalization error
    ge.append((1 - cvresult.mean())**2+cvresult.var())
#Print the parameter value corresponding to the highest R2, and print the variance under this parameter
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
#Print the parameter value corresponding to the lowest variance, and print R2 under this parameter
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
#Print the parameter value of the controllable part of generalization error, and print R2, variance and controllable part of generalization error under this parameter
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="red",label="XGB")
plt.legend()
plt.show()
650 0.80476050359201 0.01053673846018678
50 0.7857724708830981 0.009072727885598212
150 0.8032842414878519 0.009747694343514357 0.04844478399052411

[Figure: output_31_1.png]
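For reference, the quantity tracked in the loop above uses 1 - R^2 as the bias term, so the controllable part of the generalization error is approximated as bias^2 + variance. A small helper (a sketch, reusing the cross-validation scores) makes the decomposition explicit:

def controllable_generalization_error(cv_scores):
    bias = 1 - np.mean(cv_scores)    #1 - R^2 plays the role of the bias
    variance = np.var(cv_scores)     #spread of the scores across folds
    return bias**2 + variance

scores = CVS(XGBR(n_estimators=180,random_state=420),Xtrain,Ytrain,cv=cv)
print(controllable_generalization_error(scores))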

axisx = range(100,300,10)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)*0.01
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
#Add variance line
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
180 0.8038787848970184 0.00959321570484315
180 0.8038787848970184 0.00959321570484315
180 0.8038787848970184 0.00959321570484315 0.04805674671831314

[Figure: output_32_1.png]

#What about the controllable part of generalization error?
plt.figure(figsize=(20,5))
plt.plot(axisx,ge,c="gray",linestyle='-.')
plt.show()

[Figure: output_33_0.png]

#Verify whether the model actually improves on the test set
time0 = time()
print(XGBR(n_estimators=100,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)
0.9197580267581366
0.0787498950958252
time0 = time()
print(XGBR(n_estimators=660,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)
0.9208745746309475
0.36807847023010254
time0 = time()
print(XGBR(n_estimators=180,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)
0.9231068620728082
0.12366437911987305
axisx = np.linspace(0,1,20)
rs = []
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    rs.append(CVS(reg,Xtrain,Ytrain,cv=cv).mean())
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="green",label="XGB")
plt.legend()
plt.show()
0.7368421052631579 0.837609040251761

[Figure: output_37_1.png]

#Continue to refine the learning curve
axisx = np.linspace(0.05,1,20)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
0.65 0.8302530801197368 0.008708816667924316
0.7999999999999999 0.8277414964661117 0.007159903723250457
0.7999999999999999 0.8277414964661117 0.007159903723250457 0.036832895762985055

[Figure: output_38_1.png]

#Refine learning curve
axisx = np.linspace(0.75,1,25)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
0.7708333333333334 0.833489187182165 0.005575077682875093
0.7708333333333334 0.833489187182165 0.005575077682875093
0.7708333333333334 0.833489187182165 0.005575077682875093 0.033300928468131166

[Figure: output_39_1.png]

reg = XGBR(n_estimators=180
         #  ,subsample=0.7708333333333334
           ,random_state=420).fit(Xtrain,Ytrain)
reg.score(Xtest,Ytest)
0.9159462982185405
MSE(Ytest,reg.predict(Xtest))
7.821523502888769
#First, let's define a scoring function, which can help us directly print the cross validation results on Xtrain
def regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2"],show=True):
    score = []
    for i in range(len(scoring)):
        if show:
            print("{}:{:.2f}".format(scoring[i] #Name of model evaluation indicator
                                     ,CVS(reg
                                          ,Xtrain,Ytrain
                                          ,cv=cv,scoring=scoring[i]).mean()))
        score.append(CVS(reg,Xtrain,Ytrain,cv=cv,scoring=scoring[i]).mean())
    return score
reg = XGBR(n_estimators=180,random_state=420)
regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"])
r2:0.80
neg_mean_squared_error:-13.48





[0.8038787848970184, -13.482301822063182]
regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"],show=False)
[0.8038787848970184, -13.482301822063182]
from time import time
import datetime

for i in [0,0.2,0.5,1]:
    time0=time()
    reg = XGBR(n_estimators=180,random_state=420,learning_rate=i)
    print("learning_rate = {}".format(i))
    regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"])
    print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
    print("\t")
learning_rate = 0
r2:-6.76
neg_mean_squared_error:-567.55
00:01:561781
	
learning_rate = 0.2
r2:0.81
neg_mean_squared_error:-13.32
00:01:848888
	
learning_rate = 0.5
r2:0.81
neg_mean_squared_error:-13.24
00:01:541875
	
learning_rate = 1
r2:0.72
neg_mean_squared_error:-19.11
00:01:499027
axisx = np.arange(0.05,1,0.05)
rs = []
te = []
for i in axisx:
    reg = XGBR(n_estimators=180,random_state=420,learning_rate=i)
    score = regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"],show=False)
    test = reg.fit(Xtrain,Ytrain).score(Xtest,Ytest)
    rs.append(score[0])
    te.append(test)
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,te,c="gray",label="test")
plt.plot(axisx,rs,c="green",label="train")
plt.legend()
plt.show()
0.55 0.8125604372670463

[Figure: output_47_1.png]

for booster in ["gbtree","gblinear","dart"]:
    reg = XGBR(n_estimators=180
               ,learning_rate=0.1
               ,random_state=420
               ,booster=booster).fit(Xtrain,Ytrain)
    print(booster)
    print(reg.score(Xtest,Ytest))
gbtree
0.9231068620728082
gblinear
0.6286510307485139
dart
0.923106843149575
#Default objective: reg:linear
reg = XGBR(n_estimators=180,random_state=420).fit(Xtrain,Ytrain)
reg.score(Xtest, Ytest)
0.9231068620728082
MSE(Ytest,reg.predict(Xtest))
7.155205217161047
#Implementation with the native xgboost API
import xgboost as xgb
#Read data using DMatrix class
dtrain = xgb.DMatrix(Xtrain,Ytrain) #Both feature matrix and label are passed in
dtest = xgb.DMatrix(Xtest,Ytest)
#A DMatrix cannot be opened and inspected directly, so the data is usually examined with pandas first and then wrapped in a DMatrix
dtrain
<xgboost.core.DMatrix at 0x2770de3bdd8>
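Although a DMatrix cannot be displayed like a DataFrame, a few accessor methods let you sanity-check what went in (a small sketch with the dtrain built above):

print(dtrain.num_row(),dtrain.num_col()) #number of samples and features
print(dtrain.get_label()[:5])            #the first few labels that were passed in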
import pandas as pd
pd.DataFrame(Xtrain)
[Output: preview of the Xtrain DataFrame; the 13 columns are the Boston features listed above]

354 rows × 13 columns

#Set the parameters
param = {'silent':True #The default is False; set it to True to silence the training log
         ,'objective':'reg:linear'
         ,"eta":0.1}
num_round = 180 #n_estimators
#xgb.train: only the training data and the number of boosting rounds are passed directly; all other settings go in through the params dict
bst = xgb.train(param, dtrain, num_round)
#Interface predict
preds = bst.predict(dtest)
preds
array([ 6.4613175, 22.123888 , 30.755163 , 13.424351 ,  8.378565 ,
       23.608477 , 14.2151165, 16.026499 , 15.498961 , 14.10649  ,
       24.030867 , 34.36362  , 21.461111 , 28.839497 , 19.568035 ,
       10.188658 , 19.42369  , 23.539951 , 22.850523 , 23.198708 ,
       17.82486  , 16.07219  , 27.602034 , 20.773046 , 20.868807 ,
       15.865789 , 22.076588 , 29.292158 , 22.841051 , 15.770392 ,
       36.680496 , 21.057947 , 20.137005 , 23.777853 , 22.70615  ,
       23.863268 , 15.595315 , 24.565872 , 17.720552 , 33.95111  ,
       18.784286 , 20.483374 , 37.10668  , 18.068268 , 12.73839  ,
       31.186407 , 45.895035 , 12.696718 , 10.773068 , 36.064293 ,
       26.262571 , 19.908836 , 20.715096 , 48.814903 , 27.550056 ,
       25.225826 , 17.15366  , 21.215551 , 17.426773 , 18.478971 ,
       14.6453705, 22.841473 , 18.869593 , 29.990978 , 29.933191 ,
       18.756853 , 18.784918 , 16.33361  , 23.155968 , 19.144344 ,
       29.724382 , 42.121906 , 31.544363 , 23.017508 , 19.536028 ,
       23.851992 , 41.790577 , 28.676506 , 20.036425 , 21.723856 ,
       19.537868 , 46.349495 , 23.119637 ,  8.071444 , 26.358177 ,
       24.85706  , 17.057547 , 20.084204 , 18.54005  ,  7.157663 ,
       20.593962 , 15.451031 , 45.09552  , 34.435097 , 22.969654 ,
       10.10335  , 10.803318 , 18.42058  ,  7.800361 , 11.79309  ,
       30.755335 , 10.80648  , 26.122625 , 22.589502 , 31.219454 ,
       42.283318 , 19.274109 ,  7.3861685, 23.055706 , 14.315018 ,
       45.136368 , 21.243176 , 19.715647 , 24.533583 , 18.24247  ,
       28.382742 , 23.41182  , 19.962458 , 45.916683 , 17.521889 ,
       24.13039  , 26.147182 , 18.418781 , 17.606575 , 14.540631 ,
       20.595512 , 32.59128  , 10.155618 , 20.53032  , 21.477484 ,
       17.450048 , 20.154486 ,  8.010227 , 30.482618 , 29.677181 ,
       20.357098 , 18.222181 , 14.14504  , 10.100547 , 18.85027  ,
       41.85804  , 17.44544  , 22.907183 , 21.02398  , 29.799366 ,
       20.219465 , 12.404763 , 45.750965 , 25.56757  , 22.000706 ,
       14.194921 , 27.102774 ], dtype=float32)
from sklearn.metrics import r2_score
r2_score(Ytest,preds)
0.9260984298390122
MSE(Ytest,preds)
6.87682821415069
import xgboost as xgb

#For convenience, use full data
dfull = xgb.DMatrix(X,y)
#Set parameters
param1 = {'silent':True,'obj':'reg:linear',"gamma":0}
num_round = 100
n_fold=5 #Number of folds, analogous to KFold in sklearn
#Use xgb.cv
time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round,n_fold)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:00:610364
#What does xgb.cv return?
cvresult1 #As the number of trees increases, how does the effect of our model change
    train-rmse-mean  train-rmse-std  test-rmse-mean  test-rmse-std
0         17.105578        0.129116       17.163215       0.584296
1         12.337973        0.097557       12.519736       0.473458
2          8.994071        0.065756        9.404534       0.472310
3          6.629481        0.050323        7.250335       0.500342
4          4.954406        0.033209        5.920812       0.591874
5          3.781454        0.029604        5.045190       0.687971
6          2.947767        0.038786        4.472030       0.686492
7          2.357748        0.042040        4.179314       0.737935
8          1.951907        0.044972        3.979878       0.798198
9          1.660895        0.044894        3.870751       0.812331
10         1.464296        0.049422        3.816196       0.835251
11         1.323362        0.056240        3.788125       0.841643
12         1.214468        0.046524        3.766973       0.848989
13         1.137311        0.044522        3.741199       0.872370
14         1.064629        0.042245        3.729194       0.879429
15         1.010286        0.038892        3.717997       0.879572
16         0.941258        0.038360        3.706736       0.878032
17         0.883599        0.056640        3.693886       0.873913
18         0.829674        0.057284        3.693296       0.883429
19         0.772332        0.042899        3.687510       0.880928
20         0.731557        0.049150        3.687037       0.879180
21         0.690698        0.041190        3.677507       0.882060
22         0.657743        0.042137        3.675343       0.883635
23         0.619988        0.054097        3.671006       0.879224
24         0.585414        0.052585        3.670951       0.867470
25         0.548723        0.054440        3.673598       0.863241
26         0.527266        0.049630        3.673988       0.867116
27         0.504405        0.040376        3.671702       0.864566
28         0.468534        0.033020        3.671324       0.862536
29         0.448633        0.032191        3.675074       0.864713
..              ...             ...             ...            ...
70         0.071057        0.015411        3.668067       0.859435
71         0.067946        0.013960        3.667708       0.859370
72         0.065197        0.012475        3.668174       0.859307
73         0.062789        0.012538        3.668738       0.859471
74         0.060294        0.012669        3.668950       0.860112
75         0.058278        0.012055        3.669084       0.859966
76         0.055402        0.011065        3.669627       0.859505
77         0.053819        0.011072        3.669904       0.859294
78         0.051280        0.011215        3.670185       0.859204
79         0.048748        0.009988        3.670092       0.859250
80         0.046972        0.009233        3.669869       0.858892
81         0.044753        0.008664        3.669702       0.858676
82         0.043148        0.008636        3.669704       0.858921
83         0.041823        0.008355        3.669596       0.858843
84         0.040257        0.008378        3.669730       0.858459
85         0.038518        0.007731        3.669835       0.858698
86         0.036694        0.006928        3.669705       0.858958
87         0.034932        0.006174        3.669722       0.858715
88         0.033947        0.006206        3.669964       0.858547
89         0.032706        0.006176        3.669988       0.858516
90         0.031317        0.006171        3.670116       0.858512
91         0.029697        0.005473        3.669930       0.858759
92         0.028561        0.005599        3.669906       0.858549
93         0.027585        0.005694        3.669822       0.858554
94         0.026436        0.005414        3.669985       0.858390
95         0.025204        0.005145        3.669921       0.858313
96         0.024422        0.005242        3.669983       0.858255
97         0.023661        0.005117        3.669947       0.858331
98         0.022562        0.004704        3.669868       0.858578
99         0.021496        0.004738        3.669824       0.858305

100 rows × 4 columns

plt.figure(figsize=(20,5))
plt.grid()
plt.plot(range(1,101),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,101),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.legend()
plt.show()

#What can we see from this picture?
#How to observe the generalization ability of the model from the graph?
#From the perspective of this diagram, what is the parameter adjustment goal of the model?

[Figure: output_66_0.png]

#What is the default evaluation metric for regression in xgboost? It is rmse; here we switch it to mae
param1 = {'silent':True,'obj':'reg:linear',"gamma":0,"eval_metric":"mae"}
cvresult1 = xgb.cv(param1, dfull, num_round,n_fold)

plt.figure(figsize=(20,5))
plt.grid()
plt.plot(range(1,num_round+1),cvresult1.iloc[:,0],c="red",label="train,gamma=0") #match the number of boosting rounds actually run
plt.plot(range(1,num_round+1),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.legend()
plt.show()

[Figure: output_68_0.png]

param1 = {'silent':True,'obj':'reg:linear',"gamma":0}
param2 = {'silent':True,'obj':'reg:linear',"gamma":20}
num_round = 180
n_fold=5

time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round,n_fold)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:01:083104
time0 = time()
cvresult2 = xgb.cv(param2, dfull, num_round,n_fold)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:01:359378
plt.figure(figsize=(20,5))
plt.grid()
plt.plot(range(1,181),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,181),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.plot(range(1,181),cvresult2.iloc[:,0],c="green",label="train,gamma=20")
plt.plot(range(1,181),cvresult2.iloc[:,2],c="blue",label="test,gamma=20")
plt.legend()
plt.show()

#From here, can you see how gamma controls overfitting? It restrains learning on the training set: training performance drops, and the train and test curves move closer together

[Figure: output_71_0.png]

import xgboost as xgb
import matplotlib.pyplot as plt
from time import time
import datetime
from sklearn.datasets import load_breast_cancer
data2 = load_breast_cancer()

x2 = data2.data
y2 = data2.target

dfull2 = xgb.DMatrix(x2,y2)

param1 = {'silent':True,'obj':'binary:logistic',"gamma":0,"nfold":5
          ,"eval_metric":"error"
         }
param2 = {'silent':True,'obj':'binary:logistic',"gamma":1,"nfold":5}
num_round = 100
time0 = time()
cvresult1 = xgb.cv(param1, dfull2, num_round,metrics=("error"))
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:00:271581
time0 = time()
cvresult2 = xgb.cv(param2, dfull2, num_round,metrics=("error")) 
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:00:443810
plt.figure(figsize=(20,5))
plt.grid()
plt.plot(range(1,101),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,101),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.plot(range(1,101),cvresult2.iloc[:,0],c="green",label="train,gamma=1")
plt.plot(range(1,101),cvresult2.iloc[:,2],c="blue",label="test,gamma=1")
plt.legend()
plt.show()

[Figure: output_76_0.png]

dfull = xgb.DMatrix(X,y)

param1 = {'silent':True
          ,'obj':'reg:linear'
          ,"subsample":1
          ,"max_depth":6
          ,"eta":0.3
          ,"gamma":0
          ,"lambda":1
          ,"alpha":0
          ,"colsample_bytree":1
          ,"colsample_bylevel":1
          ,"colsample_bynode":1
          ,"nfold":5}
num_round = 200
time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

fig,ax = plt.subplots(1,figsize=(15,8))
ax.set_ylim(top=5)
ax.grid()
ax.plot(range(1,201),cvresult1.iloc[:,0],c="red",label="train,original")
ax.plot(range(1,201),cvresult1.iloc[:,2],c="orange",label="test,original")
ax.legend(fontsize="xx-large")
plt.show()
00:00:513584

[Figure: output_78_1.png]

param1 = {'silent':True
          ,'obj':'reg:linear'
          ,"subsample":1
          ,"max_depth":6
          ,"eta":0.3
          ,"gamma":0
          ,"lambda":1
          ,"alpha":0
          ,"colsample_bytree":1
          ,"colsample_bylevel":1
          ,"colsample_bynode":1
          ,"nfold":5}
num_round = 200

time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

fig,ax = plt.subplots(1,figsize=(15,8))
ax.set_ylim(top=5)
ax.grid()
ax.plot(range(1,201),cvresult1.iloc[:,0],c="red",label="train,original")
ax.plot(range(1,201),cvresult1.iloc[:,2],c="orange",label="test,original")

param2 = {'silent':True
          ,'obj':'reg:linear'
          ,"max_depth":2
          ,"eta":0.05
          ,"gamma":0
          ,"lambda":1
          ,"alpha":0
          ,"colsample_bytree":1
          ,"colsample_bylevel":0.4
          ,"colsample_bynode":1
          ,"nfold":5}

param3 = {'silent':True
          ,'obj':'reg:linear'
          ,"subsample":1
          ,"eta":0.05
          ,"gamma":20
          ,"lambda":3.5
          ,"alpha":0.2
          ,"max_depth":4
          ,"colsample_bytree":0.4
          ,"colsample_bylevel":0.6
          ,"colsample_bynode":1
          ,"nfold":5}

time0 = time()
cvresult2 = xgb.cv(param2, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

time0 = time()
cvresult3 = xgb.cv(param3, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

ax.plot(range(1,201),cvresult2.iloc[:,0],c="green",label="train,last")
ax.plot(range(1,201),cvresult2.iloc[:,2],c="blue",label="test,last")
ax.plot(range(1,201),cvresult3.iloc[:,0],c="gray",label="train,this")
ax.plot(range(1,201),cvresult3.iloc[:,2],c="pink",label="test,this")
ax.legend(fontsize="xx-large")
plt.show()
00:00:532621
00:00:223373
00:00:259346

[Figure: output_79_1.png]

import pickle
dtrain = xgb.DMatrix(Xtrain,Ytrain)

#Set parameters and train the model
param = {'silent':True
          ,'obj':'reg:linear'
          ,"subsample":1
          ,"eta":0.05
          ,"gamma":20
          ,"lambda":3.5
          ,"alpha":0.2
          ,"max_depth":4
          ,"colsample_bytree":0.4
          ,"colsample_bylevel":0.6
          ,"colsample_bynode":1}
num_round = 180

bst = xgb.train(param, dtrain, num_round)
#Save model
pickle.dump(bst, open("xgboostonboston.dat","wb"))

#Note: with open we usually use "w" or "r" as the mode, but those text modes only work for plain text files (e.g. .txt)
#When saving the model object itself rather than text, use "wb" and "rb" as the modes
#wb means write in binary and rb means read in binary; the file written this way stores the model and can be loaded and called later
#See where the model is saved?
import sys
sys.path
['C:\\Pythonwork\\micro-class\\11 xgboost',
 'C:\\Python\\python37.zip',
 'C:\\Python\\DLLs',
 'C:\\Python\\lib',
 'C:\\Python',
 '',
 'C:\\Python\\lib\\site-packages',
 'C:\\Python\\lib\\site-packages\\win32',
 'C:\\Python\\lib\\site-packages\\win32\\lib',
 'C:\\Python\\lib\\site-packages\\Pythonwin',
 'C:\\Python\\lib\\site-packages\\IPython\\extensions',
 'C:\\Users\\Shuyu\\.ipython']
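Since only a file name was passed to open, the file is written to the current working directory rather than anywhere on sys.path; a quick way to check (a small sketch):

import os
print(os.getcwd())                            #the directory the model was saved to
print(os.path.exists("xgboostonboston.dat"))  #confirm the file is there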
#Reopen Jupyter Lab (a fresh session, to test loading the saved model)

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pickle
import xgboost as xgb

data = load_boston()

X = data.data
y = data.target

Xtrain,Xtest,Ytrain,Ytest = TTS(X,y,test_size=0.3,random_state=420)
#Note that if the model we save is built in the xgboost library, the imported data type must also be the data type in the xgboost library
dtest = xgb.DMatrix(Xtest,Ytest)
#Import model
loaded_model = pickle.load(open("xgboostonboston.dat", "rb"))
print("Loaded model from: xgboostonboston.dat")
Loaded model from: xgboostonboston.dat
#Make a prediction and directly call the interface predict
ypreds = loaded_model.predict(dtest)
ypreds
array([ 9.244746, 22.536953, 28.47614 , 13.126131,  9.944413, 21.356094,
       15.187935, 15.559099, 15.629611, 15.555439, 21.427156, 35.502792,
       20.827318, 29.397932, 21.669186, 11.906522, 21.464252, 26.143337,
       26.300356, 23.474188, 18.186035, 15.851086, 22.928507, 22.919674,
       20.557487, 16.27315 , 22.000988, 25.230766, 23.12165 , 16.663473,
       34.747093, 20.003593, 20.617601, 23.74025 , 23.044952, 24.849056,
       15.414761, 23.383522, 18.500463, 33.790466, 18.009186, 18.729418,
       33.181175, 18.834534, 15.085677, 27.601177, 42.75243 , 15.359873,
       10.37829 , 37.5367  , 27.097404, 20.73775 , 20.198935, 46.20087 ,
       26.959623, 24.566458, 18.678255, 20.913795, 17.369501, 17.823708,
       15.136806, 24.533068, 19.465569, 30.474009, 29.571526, 19.773672,
       21.554045, 17.590807, 22.250225, 18.275839, 29.012346, 40.198055,
       30.235825, 23.174484, 20.191778, 23.742437, 38.217915, 27.173447,
       21.068003, 20.5974  , 18.412853, 45.326836, 22.941956,  9.055015,
       27.04054 , 23.45833 , 17.310354, 20.762442, 15.6619  , 12.178641,
       21.293903, 19.826134, 41.0362  , 31.300192, 24.400661, 11.267941,
       15.763796, 20.984198,  9.232577, 11.090055, 32.739227, 16.265066,
       24.975492, 24.905188, 34.348663, 41.02216 , 20.181097,  8.897793,
       22.894953, 15.023113, 45.222473, 21.289068, 22.882399, 24.792355,
       19.141815, 27.372849, 24.132881, 19.243576, 43.235798, 17.438314,
       24.561804, 24.187195, 17.001463, 18.172377, 15.483843, 23.802166,
       31.079023, 10.322498, 21.977345, 19.267714, 15.559681, 19.336842,
        8.979549, 28.35794 , 29.80491 , 21.987814, 19.893597, 19.730898,
       10.501988, 17.405378, 40.51527 , 17.420282, 24.272373, 19.771631,
       32.620422, 19.19032 , 12.364113, 38.63305 , 24.189354, 23.38174 ,
       16.924698, 22.633028], dtype=float32)
from sklearn.metrics import mean_squared_error as MSE, r2_score
MSE(Ytest,ypreds)
9.107608696116197
r2_score(Ytest,ypreds)
0.9021254331073938
bst = xgb.train(param, dtrain, num_round)
import joblib

#You can also see where the model is saved
joblib.dump(bst,"xgboost-boston.dat")
['xgboost-boston.dat']
loaded_model = joblib.load("xgboost-boston.dat")
dtest = xgb.DMatrix(Xtest,Ytest)
ypreds = loaded_model.predict(dtest)
ypreds
array([ 9.244746, 22.536953, 28.47614 , 13.126131,  9.944413, 21.356094,
       15.187935, 15.559099, 15.629611, 15.555439, 21.427156, 35.502792,
       20.827318, 29.397932, 21.669186, 11.906522, 21.464252, 26.143337,
       26.300356, 23.474188, 18.186035, 15.851086, 22.928507, 22.919674,
       20.557487, 16.27315 , 22.000988, 25.230766, 23.12165 , 16.663473,
       34.747093, 20.003593, 20.617601, 23.74025 , 23.044952, 24.849056,
       15.414761, 23.383522, 18.500463, 33.790466, 18.009186, 18.729418,
       33.181175, 18.834534, 15.085677, 27.601177, 42.75243 , 15.359873,
       10.37829 , 37.5367  , 27.097404, 20.73775 , 20.198935, 46.20087 ,
       26.959623, 24.566458, 18.678255, 20.913795, 17.369501, 17.823708,
       15.136806, 24.533068, 19.465569, 30.474009, 29.571526, 19.773672,
       21.554045, 17.590807, 22.250225, 18.275839, 29.012346, 40.198055,
       30.235825, 23.174484, 20.191778, 23.742437, 38.217915, 27.173447,
       21.068003, 20.5974  , 18.412853, 45.326836, 22.941956,  9.055015,
       27.04054 , 23.45833 , 17.310354, 20.762442, 15.6619  , 12.178641,
       21.293903, 19.826134, 41.0362  , 31.300192, 24.400661, 11.267941,
       15.763796, 20.984198,  9.232577, 11.090055, 32.739227, 16.265066,
       24.975492, 24.905188, 34.348663, 41.02216 , 20.181097,  8.897793,
       22.894953, 15.023113, 45.222473, 21.289068, 22.882399, 24.792355,
       19.141815, 27.372849, 24.132881, 19.243576, 43.235798, 17.438314,
       24.561804, 24.187195, 17.001463, 18.172377, 15.483843, 23.802166,
       31.079023, 10.322498, 21.977345, 19.267714, 15.559681, 19.336842,
        8.979549, 28.35794 , 29.80491 , 21.987814, 19.893597, 19.730898,
       10.501988, 17.405378, 40.51527 , 17.420282, 24.272373, 19.771631,
       32.620422, 19.19032 , 12.364113, 38.63305 , 24.189354, 23.38174 ,
       16.924698, 22.633028], dtype=float32)
MSE(Ytest, ypreds)
9.107608696116197
r2_score(Ytest,ypreds)
0.9021254331073938
#Using the model in sklearn
from xgboost import XGBRegressor as XGBR

bst = XGBR(n_estimators=200
           ,eta=0.05,gamma=20
           ,reg_lambda=3.5
           ,reg_alpha=0.2
           ,max_depth=4
           ,colsample_bytree=0.4
           ,colsample_bylevel=0.6).fit(Xtrain,Ytrain) #Training completed
joblib.dump(bst,"xgboost-boston-sklearn.dat")
['xgboost-boston-sklearn.dat']
loaded_model = joblib.load("xgboost-boston-sklearn.dat")
#Here Xtest can be passed in directly as a numpy array
ypreds = loaded_model.predict(Xtest)
Xtest
array([[4.15292e+01, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,
        3.29460e+02, 2.73800e+01],
       [2.73100e-02, 0.00000e+00, 7.07000e+00, ..., 1.78000e+01,
        3.96900e+02, 9.14000e+00],
       [3.15000e-02, 9.50000e+01, 1.47000e+00, ..., 1.70000e+01,
        3.96900e+02, 4.56000e+00],
       ...,
       [5.08300e-02, 0.00000e+00, 5.19000e+00, ..., 2.02000e+01,
        3.89710e+02, 5.68000e+00],
       [3.77498e+00, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,
        2.20100e+01, 1.71500e+01],
       [1.96091e+01, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01,
        3.96900e+02, 1.34400e+01]])
dtest
<xgboost.core.DMatrix at 0x29e30670668>
ypreds
array([ 9.350334 , 21.501623 , 30.219057 , 13.021226 ,  9.883689 ,
       20.977922 , 16.023008 , 15.8910475, 15.512305 , 15.706607 ,
       22.096102 , 35.381573 , 20.3307   , 27.129421 , 19.997156 ,
       10.935587 , 20.25071  , 26.188572 , 26.711943 , 22.600443 ,
       18.23832  , 15.876045 , 26.263977 , 22.706024 , 20.18491  ,
       15.891692 , 21.4781   , 29.047956 , 23.371012 , 17.167185 ,
       35.699898 , 20.490337 , 20.195292 , 23.81444  , 23.106022 ,
       25.709312 , 15.0182905, 22.621248 , 18.576109 , 34.25664  ,
       17.46115  , 19.159126 , 34.79234  , 17.766731 , 17.141891 ,
       27.755646 , 39.786766 , 22.49913  , 10.246634 , 36.76105  ,
       26.294876 , 20.75917  , 19.893272 , 46.62629  , 26.549704 ,
       24.040398 , 17.769514 , 20.76889  , 16.139618 , 17.494894 ,
       16.005596 , 24.28487  , 19.15237  , 31.407684 , 27.862312 ,
       18.877817 , 20.50497  , 16.094156 , 22.622025 , 17.762297 ,
       28.518019 , 41.146317 , 32.52681  , 23.117966 , 19.125128 ,
       24.141544 , 39.041847 , 25.901724 , 20.974117 , 19.626917 ,
       18.567612 , 46.46465  , 23.03303  ,  9.912106 , 26.407642 ,
       23.466772 , 16.985506 , 20.73746  , 15.679997 , 11.697191 ,
       21.320868 , 20.333689 , 41.616425 , 31.659132 , 25.605923 ,
       12.362759 , 14.593165 , 20.577328 ,  9.253377 , 11.1253805,
       32.878246 , 15.840851 , 24.695955 , 24.882996 , 34.643425 ,
       41.556873 , 19.726238 ,  8.808649 , 23.04128  , 14.709186 ,
       46.10303  , 21.435535 , 21.97892  , 24.299171 , 19.591938 ,
       27.527737 , 23.80468  , 18.782711 , 44.266346 , 17.328068 ,
       23.030151 , 23.801643 , 16.483137 , 18.219353 , 15.713125 ,
       23.655058 , 32.294373 , 10.60579  , 22.099716 , 19.26955  ,
       14.293162 , 19.386055 ,  8.824598 , 26.909697 , 29.539446 ,
       20.38691  , 20.832077 , 22.507433 , 11.142808 , 17.685743 ,
       40.230915 , 17.526121 , 23.09964  , 19.899158 , 31.775164 ,
       19.718151 , 12.164877 , 40.867558 , 24.465397 , 22.134802 ,
       15.041253 , 28.63522  ], dtype=float32)
MSE(Ytest, ypreds)
10.198269690947479
r2_score(Ytest,ypreds)
0.8904046866351292
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from xgboost import XGBClassifier as XGBC
from sklearn.datasets import make_blobs #Self created data set
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import confusion_matrix as cm, recall_score as recall, roc_auc_score as auc
class_1 = 500 #Category 1 has 500 samples
class_2 = 50 #Category 2 has only 50
centers = [[0.0, 0.0], [2.0, 2.0]] #Set the center of two categories
clusters_std = [1.5, 0.5] #Set the standard deviation of each class; the class with more samples is usually more spread out
X, y = make_blobs(n_samples=[class_1, class_2],
                  centers=centers,
                  cluster_std=clusters_std,
                  random_state=0, shuffle=False)
X.shape
(550, 2)
y
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
(y == 1).sum() / y.shape[0] #9%
0.09090909090909091
Xtrain, Xtest, Ytrain, Ytest = TTS(X,y,test_size=0.3,random_state=420)
#Modeling with the sklearn API#

clf = XGBC().fit(Xtrain,Ytrain)
ypred = clf.predict(Xtest)
ypred
array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
clf.score(Xtest,Ytest) #Default model evaluation indicator - Accuracy
0.9272727272727272
cm(Ytest,ypred,labels=[1,0]) #Put the minority class first
array([[  9,   4],
       [  8, 144]], dtype=int64)
recall(Ytest,ypred)
0.6923076923076923
auc(Ytest,clf.predict_proba(Xtest)[:,1])
0.9671052631578947
#Negative / positive sample ratio
clf_ = XGBC(scale_pos_weight=10).fit(Xtrain,Ytrain)
ypred_ = clf_.predict(Xtest)
clf_.score(Xtest,Ytest)

cm(Ytest,ypred_,labels=[1,0])

recall(Ytest,ypred_)

auc(Ytest,clf_.predict_proba(Xtest)[:,1])
0.9696356275303644
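scale_pos_weight is meant to be roughly the ratio of negative to positive samples, so instead of hard-coding 10 it can be computed from the training labels (a small sketch):

ratio = (Ytrain == 0).sum() / (Ytrain == 1).sum() #negative count divided by positive count, roughly 10 for this data
clf_ = XGBC(scale_pos_weight=ratio).fit(Xtrain,Ytrain)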
#How do recall, AUC and accuracy change as scale_pos_weight gradually increases?
for i in [1,5,10,20,30]:
    clf_ = XGBC(scale_pos_weight=i).fit(Xtrain,Ytrain)
    ypred_ = clf_.predict(Xtest)
    print(i)
    print("\tAccuracy:{}".format(clf_.score(Xtest,Ytest)))
    print("\tRecall:{}".format(recall(Ytest,ypred_)))
    print("\tAUC:{}".format(auc(Ytest,clf_.predict_proba(Xtest)[:,1])))
1
	Accuracy:0.9272727272727272
	Recall:0.6923076923076923
	AUC:0.9671052631578947
5
	Accuracy:0.9454545454545454
	Recall:0.9230769230769231
	AUC:0.9665991902834008
10
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9696356275303644
20
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9706477732793523
30
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9701417004048584
#Negative / positive sample ratio
clf_ = XGBC(scale_pos_weight=20).fit(Xtrain,Ytrain)
ypred_ = clf_.predict(Xtest)
clf_.score(Xtest,Ytest)
0.9515151515151515
cm(Ytest,ypred_,labels=[1,0])
array([[ 13,   0],
       [  8, 144]], dtype=int64)
recall(Ytest,ypred_)
1.0
auc(Ytest,clf_.predict_proba(Xtest)[:,1])
0.9706477732793523
dtrain = xgb.DMatrix(Xtrain,Ytrain)
dtest = xgb.DMatrix(Xtest,Ytest)
#Take a look at the predict interface of the xgboost library
param = {'silent':True,'objective':'binary:logistic',"eta":0.1,"scale_pos_weight":1}
num_round = 100
bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dtest)
#What does preds contain? For binary:logistic, predict returns the predicted probability of the positive class
preds
array([0.00110357, 0.00761518, 0.00110357, 0.00110357, 0.93531454,
       0.00466839, 0.00110357, 0.00110357, 0.00110357, 0.00110357,
       0.00110357, 0.00410493, 0.00454478, 0.00571528, 0.00751026,
       0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357,
       0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357,
       0.00712637, 0.00110357, 0.00110357, 0.00110357, 0.00110357,
       0.00110357, 0.00110357, 0.00110357, 0.00793251, 0.00466839,
       0.00110357, 0.00339395, 0.00657186, 0.00110357, 0.00457053,
       0.00571528, 0.0026763 , 0.00110357, 0.00110357, 0.00110357,
       0.00884932, 0.00712637, 0.00110357, 0.00712637, 0.00466839,
       0.00110357, 0.00110357, 0.00712637, 0.00110357, 0.00110357,
       0.00110357, 0.00110357, 0.63748044, 0.00110357, 0.00793251,
       0.00110357, 0.00451971, 0.00644181, 0.00110357, 0.00110357,
       0.00110357, 0.00110357, 0.00751026, 0.00712637, 0.00110357,
       0.00866458, 0.00110357, 0.00110357, 0.00110357, 0.91610426,
       0.00110357, 0.00110357, 0.89246494, 0.0026763 , 0.00501714,
       0.00761518, 0.00884932, 0.00339395, 0.00110357, 0.93531454,
       0.00110357, 0.00110357, 0.00110357, 0.82530665, 0.00751026,
       0.00110357, 0.35174078, 0.00110357, 0.00110357, 0.70393246,
       0.00110357, 0.76804197, 0.00110357, 0.00110357, 0.00110357,
       0.00110357, 0.96656513, 0.00110357, 0.00571528, 0.25400913,
       0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00457053,
       0.00110357, 0.00110357, 0.00110357, 0.89246494, 0.00110357,
       0.9518535 , 0.0026763 , 0.00712637, 0.00110357, 0.00501714,
       0.00110357, 0.00110357, 0.00571528, 0.00110357, 0.00110357,
       0.00712637, 0.00110357, 0.00110357, 0.00712637, 0.00110357,
       0.25136763, 0.00110357, 0.00110357, 0.00110357, 0.00110357,
       0.00110357, 0.8904051 , 0.3876418 , 0.00110357, 0.00457053,
       0.00657186, 0.9366597 , 0.00866458, 0.00110357, 0.00501714,
       0.00501714, 0.00110357, 0.00110357, 0.00368543, 0.00501714,
       0.9830577 , 0.00110357, 0.00644181, 0.00110357, 0.00571528,
       0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00466839,
       0.00110357, 0.00110357, 0.92388713, 0.90231985, 0.80084217],
      dtype=float32)
#Set your own threshold
ypred = preds.copy()
ypred[preds > 0.5] = 1
ypred
array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.], dtype=float32)
ypred[ypred != 1] = 0
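The same thresholding can also be written in one line with numpy (a small sketch):

ypred = (preds > 0.5).astype(int) #1 where the predicted probability exceeds the threshold, else 0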
#Set up the parameters
scale_pos_weight = [1,5,10]
names = ["negative vs positive: 1"
         ,"negative vs positive: 5"
         ,"negative vs positive: 10"]
[*zip(names,scale_pos_weight)]
[('negative vs positive: 1', 1),
 ('negative vs positive: 5', 5),
 ('negative vs positive: 10', 10)]
#Import model evaluation indicators
from sklearn.metrics import accuracy_score as accuracy, recall_score as recall, roc_auc_score as auc

for name,i in zip(names,scale_pos_weight):
    param = {'silent':True,'objective':'binary:logistic'
            ,"eta":0.1,"scale_pos_weight":i}
    num_round = 100
    clf = xgb.train(param, dtrain, num_round)
    preds = clf.predict(dtest)
    ypred = preds.copy()
    ypred[preds > 0.5] = 1
    ypred[ypred != 1] = 0
    print(name)
    print("\tAccuracy:{}".format(accuracy(Ytest,ypred)))
    print("\tRecall:{}".format(recall(Ytest,ypred)))
    print("\tAUC:{}".format(auc(Ytest,preds)))
negative vs positive: 1
	Accuracy:0.9272727272727272
	Recall:0.6923076923076923
	AUC:0.9741902834008097
negative vs positive: 5
	Accuracy:0.9393939393939394
	Recall:0.8461538461538461
	AUC:0.9635627530364372
negative vs positive: 10
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9665991902834008
#Of course, we can also try different thresholds
for name,i in zip(names,scale_pos_weight):
    for thres in [0.3,0.5,0.7,0.9]:
        param= {'silent':True,'objective':'binary:logistic'
                ,"eta":0.1,"scale_pos_weight":i}
        clf = xgb.train(param, dtrain, num_round)
        preds = clf.predict(dtest)
        ypred = preds.copy()
        ypred[preds > thres] = 1
        ypred[ypred != 1] = 0
        print("{},thresholds:{}".format(name,thres))
        print("\tAccuracy:{}".format(accuracy(Ytest,ypred)))
        print("\tRecall:{}".format(recall(Ytest,ypred)))
        print("\tAUC:{}".format(auc(Ytest,preds)))
negative vs positive: 1,thresholds:0.3
	Accuracy:0.9393939393939394
	Recall:0.8461538461538461
	AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.5
	Accuracy:0.9272727272727272
	Recall:0.6923076923076923
	AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.7
	Accuracy:0.9212121212121213
	Recall:0.6153846153846154
	AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.9
	Accuracy:0.9515151515151515
	Recall:0.5384615384615384
	AUC:0.9741902834008097
negative vs positive: 5,thresholds:0.3
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.5
	Accuracy:0.9393939393939394
	Recall:0.8461538461538461
	AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.7
	Accuracy:0.9272727272727272
	Recall:0.6923076923076923
	AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.9
	Accuracy:0.9212121212121213
	Recall:0.6153846153846154
	AUC:0.9635627530364372
negative vs positive: 10,thresholds:0.3
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.5
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.7
	Accuracy:0.9393939393939394
	Recall:0.8461538461538461
	AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.9
	Accuracy:0.9212121212121213
	Recall:0.6153846153846154
	AUC:0.9665991902834008

Topics: Python Machine Learning