XGBoost learning notes
from xgboost import XGBRegressor as XGBR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.linear_model import LinearRegression as LinearR
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import datetime
data = load_boston()   #The Boston dataset is small and simple, but it is enough to illustrate the issues discussed below
data
{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02, 4.9800e+00], [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02, 9.1400e+00], [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02, 4.0300e+00], ..., [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02, 5.6400e+00], [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02, 6.4800e+00], [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02, 7.8800e+00]]), 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. , 18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6, 15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2, 13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7, 21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9, 35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5, 19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. , 20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2, 23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8, 33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4, 21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. , 20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6, 23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4, 15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4, 17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7, 25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4, 23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. , 32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3, 34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4, 20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. , 26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3, 31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1, 22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6, 42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. , 36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4, 32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. , 20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1, 20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2, 22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1, 21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6, 19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7, 32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1, 18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8, 16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8, 13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3, 8.8, 7.2, 10.5, 7.4, 10.2, 11.5, 15.1, 23.2, 9.7, 13.8, 12.7, 13.1, 12.5, 8.5, 5. , 6.3, 5.6, 7.2, 12.1, 8.3, 8.5, 5. , 11.9, 27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3, 7. , 7.2, 7.5, 10.4, 8.8, 8.4, 16.7, 14.2, 20.8, 13.4, 11.7, 8.3, 10.2, 10.9, 11. , 9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4, 9.6, 8.7, 8.4, 12.8, 10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4, 15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7, 19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2, 29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8, 20.6, 21.2, 19.1, 20.6, 15.2, 7. , 8.1, 13.6, 20.1, 21.8, 24.5, 23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. 
, 11.9]), 'feature_names': array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7'), 'DESCR': ".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:** \n\n :Number of Instances: 506 \n\n :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n\n :Attribute Information (in order):\n - CRIM per capita crime rate by town\n - ZN proportion of residential land zoned for lots over 25,000 sq.ft.\n - INDUS proportion of non-retail business acres per town\n - CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n - NOX nitric oxides concentration (parts per 10 million)\n - RM average number of rooms per dwelling\n - AGE proportion of owner-occupied units built prior to 1940\n - DIS weighted distances to five Boston employment centres\n - RAD index of accessibility to radial highways\n - TAX full-value property-tax rate per $10,000\n - PTRATIO pupil-teacher ratio by town\n - B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n - LSTAT % lower status of the population\n - MEDV Median value of owner-occupied homes in $1000's\n\n :Missing Attribute Values: None\n\n :Creator: Harrison, D. and Rubinfeld, D.L.\n\nThis is a copy of UCI ML housing dataset.\nhttps://archive.ics.uci.edu/ml/machine-learning-databases/housing/\n\n\nThis dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n\nThe Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\nprices and the demand for clean air', J. Environ. Economics & Management,\nvol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n...', Wiley, 1980. N.B. Various transformations are used in the table on\npages 244-261 of the latter.\n\nThe Boston house-price data has been used in many machine learning papers that address regression\nproblems. \n \n.. topic:: References\n\n - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\n", 'filename': 'f:\\Anaconda3\\lib\\site-packages\\sklearn\\datasets\\data\\boston_house_prices.csv'}
X = data.data
y = data.target
X.shape
(506, 13)
y.shape
(506,)
y
array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. , 18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6, 15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2, 13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7, 21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9, 35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5, 19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. , 20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2, 23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8, 33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4, 21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. , 20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6, 23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4, 15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21.5, 19.6, 15.3, 19.4, 17. , 15.6, 13.1, 41.3, 24.3, 23.3, 27. , 50. , 50. , 50. , 22.7, 25. , 50. , 23.8, 23.8, 22.3, 17.4, 19.1, 23.1, 23.6, 22.6, 29.4, 23.2, 24.6, 29.9, 37.2, 39.8, 36.2, 37.9, 32.5, 26.4, 29.6, 50. , 32. , 29.8, 34.9, 37. , 30.5, 36.4, 31.1, 29.1, 50. , 33.3, 30.3, 34.6, 34.9, 32.9, 24.1, 42.3, 48.5, 50. , 22.6, 24.4, 22.5, 24.4, 20. , 21.7, 19.3, 22.4, 28.1, 23.7, 25. , 23.3, 28.7, 21.5, 23. , 26.7, 21.7, 27.5, 30.1, 44.8, 50. , 37.6, 31.6, 46.7, 31.5, 24.3, 31.7, 41.7, 48.3, 29. , 24. , 25.1, 31.5, 23.7, 23.3, 22. , 20.1, 22.2, 23.7, 17.6, 18.5, 24.3, 20.5, 24.5, 26.2, 24.4, 24.8, 29.6, 42.8, 21.9, 20.9, 44. , 50. , 36. , 30.1, 33.8, 43.1, 48.8, 31. , 36.5, 22.8, 30.7, 50. , 43.5, 20.7, 21.1, 25.2, 24.4, 35.2, 32.4, 32. , 33.2, 33.1, 29.1, 35.1, 45.4, 35.4, 46. , 50. , 32.2, 22. , 20.1, 23.2, 22.3, 24.8, 28.5, 37.3, 27.9, 23.9, 21.7, 28.6, 27.1, 20.3, 22.5, 29. , 24.8, 22. , 26.4, 33.1, 36.1, 28.4, 33.4, 28.2, 22.8, 20.3, 16.1, 22.1, 19.4, 21.6, 23.8, 16.2, 17.8, 19.8, 23.1, 21. , 23.8, 23.1, 20.4, 18.5, 25. , 24.6, 23. , 22.2, 19.3, 22.6, 19.8, 17.1, 19.4, 22.2, 20.7, 21.1, 19.5, 18.5, 20.6, 19. , 18.7, 32.7, 16.5, 23.9, 31.2, 17.5, 17.2, 23.1, 24.5, 26.6, 22.9, 24.1, 18.6, 30.1, 18.2, 20.6, 17.8, 21.7, 22.7, 22.6, 25. , 19.9, 20.8, 16.8, 21.9, 27.5, 21.9, 23.1, 50. , 50. , 50. , 50. , 50. , 13.8, 13.8, 15. , 13.9, 13.3, 13.1, 10.2, 10.4, 10.9, 11.3, 12.3, 8.8, 7.2, 10.5, 7.4, 10.2, 11.5, 15.1, 23.2, 9.7, 13.8, 12.7, 13.1, 12.5, 8.5, 5. , 6.3, 5.6, 7.2, 12.1, 8.3, 8.5, 5. , 11.9, 27.9, 17.2, 27.5, 15. , 17.2, 17.9, 16.3, 7. , 7.2, 7.5, 10.4, 8.8, 8.4, 16.7, 14.2, 20.8, 13.4, 11.7, 8.3, 10.2, 10.9, 11. , 9.5, 14.5, 14.1, 16.1, 14.3, 11.7, 13.4, 9.6, 8.7, 8.4, 12.8, 10.5, 17.1, 18.4, 15.4, 10.8, 11.8, 14.9, 12.6, 14.1, 13. , 13.4, 15.2, 16.1, 17.8, 14.9, 14.1, 12.7, 13.5, 14.9, 20. , 16.4, 17.7, 19.5, 20.2, 21.4, 19.9, 19. , 19.1, 19.1, 20.1, 19.9, 19.6, 23.2, 29.8, 13.8, 13.3, 16.7, 12. , 14.6, 21.4, 23. , 23.7, 25. , 21.8, 20.6, 21.2, 19.1, 20.6, 15.2, 7. , 8.1, 13.6, 20.1, 21.8, 24.5, 23.1, 19.7, 18.3, 21.2, 17.5, 16.8, 22.4, 20.6, 23.9, 22. , 11.9])
Xtrain,Xtest,Ytrain,Ytest = TTS(X,y,test_size=0.3,random_state=420)
reg = XGBR(n_estimators=100).fit(Xtrain,Ytrain) #train
reg.predict(Xtest) #Traditional interface predict
reg.score(Xtest,Ytest)   #Which evaluation metric does score return here? Pressing Shift+Tab on the method shows that it is R^2
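A quick check (a minimal sketch; `r2_score` is imported here only for the comparison): the `score` method of a sklearn-style regressor returns R^2, so it should agree with `r2_score` computed on the same predictions.

```python
from sklearn.metrics import r2_score

# XGBRegressor.score follows the sklearn regressor convention and returns R^2,
# so it matches r2_score computed on the test-set predictions.
print(reg.score(Xtest, Ytest))
print(r2_score(Ytest, reg.predict(Xtest)))
```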
y.mean()
MSE(Ytest,reg.predict(Xtest))   #The MSE is roughly one third of y.mean(), so the result is mediocre: neither particularly good nor bad
reg.feature_importances_
#One advantage of tree models: we can inspect the feature importance scores and use SelectFromModel for feature selection
#xgboost can therefore be used for embedded (model-based) feature selection
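A minimal sketch of that embedded selection with SelectFromModel, under the assumption that we keep features with above-median importance (the `threshold="median"` choice and the refit below are illustrative, not part of the original notes):

```python
from sklearn.feature_selection import SelectFromModel

# Select features whose importance in a fitted XGBRegressor is above the median importance.
selector = SelectFromModel(XGBR(n_estimators=100), threshold="median").fit(Xtrain, Ytrain)
Xtrain_sel = selector.transform(Xtrain)
Xtest_sel = selector.transform(Xtest)

# Refit on the reduced feature matrix and compare the test R^2 with the full-feature model.
print(Xtrain_sel.shape)
print(XGBR(n_estimators=100).fit(Xtrain_sel, Ytrain).score(Xtest_sel, Ytest))
```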
reg = XGBR(n_estimators=100) #Untrained models imported in cross validation
CVS(reg,Xtrain,Ytrain,cv=5).mean()   #Which metric does this return, remember? The same default as reg.score: R^2 for regression, accuracy for classification
0.8017863029875325
#Discussion of rigorous vs. non-rigorous cross validation: should we cross validate on the training set or on the full data?
CVS(reg,Xtrain,Ytrain,cv=5)   #the five individual fold scores

array([0.83340801, 0.77096033, 0.83473392, 0.80424149, 0.76558778])
#Rigorous (cross validate on the training set only) vs. not rigorous (cross validate on the full data)
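A minimal sketch of the two options (the full-data variant is shown only to make the contrast explicit; the rest of these notes cross validate on the training set so the test set stays untouched):

```python
# "Rigorous": cross validate only on the training split; Xtest/Ytest remain a truly unseen hold-out.
print(CVS(XGBR(n_estimators=100), Xtrain, Ytrain, cv=5).mean())

# "Not rigorous": cross validate on the full data; the eventual test samples have already
# influenced model selection, so the final test score is optimistically biased.
print(CVS(XGBR(n_estimators=100), X, y, cv=5).mean())
```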
CVS(reg,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()
-16.041115480238048
#Let's take a look at all the model evaluation indicators available in sklearn
import sklearn

sorted(sklearn.metrics.SCORERS.keys())
['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'brier_score_loss', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score']
#A comparison was made using random forest and linear regression
rfr = RFR(n_estimators=100)
CVS(rfr,Xtrain,Ytrain,cv=5).mean()   #0.7975497480638329
0.7975497480638329
CVS(rfr,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()#-16.998723616338033
-16.998723616338033
lr = LinearR()
CVS(lr,Xtrain,Ytrain,cv=5).mean()   #0.6835070597278085
0.6835070597278085
CVS(lr,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()#-25.34950749364844
-25.34950749364844
#The parameter silent: when the data is huge and the algorithm is expected to run slowly, this parameter lets you monitor the training progress of the model
reg = XGBR(n_estimators=10,silent=True)
#In the xgboost library, silent=True suppresses printing of the training process and only returns the result; the default is False, which prints the training process
#In the sklearn API, xgboost defaults to silent=True, so the training process is not printed; set it to False manually if you want it printed
CVS(reg,Xtrain,Ytrain,cv=5,scoring='neg_mean_squared_error').mean()   #-92.67865836936579
-92.67865836936579
def plot_learning_curve(estimator, title, X, y,
                        ax=None,     #Select the subplot to draw on
                        ylim=None,   #Set the range of the y axis
                        cv=None,     #Cross validation
                        n_jobs=None  #Number of threads to use
                        ):
    from sklearn.model_selection import learning_curve
    import matplotlib.pyplot as plt
    import numpy as np

    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y
                                                            ,shuffle=True
                                                            ,cv=cv
                                                            ,random_state=420
                                                            ,n_jobs=n_jobs)
    if ax is None:   #if no axes object is passed in, draw on the current axes
        ax = plt.gca()
    ax.set_title(title)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    ax.grid()   #Drawing a grid is optional
    ax.plot(train_sizes, np.mean(train_scores, axis=1), 'o-'
            , color="r", label="Training score")
    ax.plot(train_sizes, np.mean(test_scores, axis=1), 'o-'
            , color="g", label="Test score")
    ax.legend(loc="best")
    return ax
cv = KFold(n_splits=5, shuffle = True, random_state=42) #Cross validation mode
plot_learning_curve(XGBR(n_estimators=100,random_state=420)
                    ,"XGB",Xtrain,Ytrain,ax=None,cv=cv)
plt.show()

![output_28_0.png](output_28_0.png)
#=====[TIME WARNING: 25 seconds]=====#
axisx = range(10,1010,50)
rs = []
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    rs.append(CVS(reg,Xtrain,Ytrain,cv=cv).mean())
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="red",label="XGB")
plt.legend()
plt.show()
660 0.8046775284172915
![output_29_1.png](output_29_1.png)
#The selected n_estimators (660) is unusually large. Should we simply pick the n_estimators value with the highest R^2?
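The next cell does not simply maximize R^2; it balances bias against variance through the controllable part of the generalization error. As a reminder of the heuristic (this is exactly the quantity `ge` computed in the code below, with 1 minus the mean cross-validated R^2 standing in for the bias):

$$ E_{\text{controllable}} \;\approx\; \text{bias}^2 + \text{variance} \;\approx\; \bigl(1-\overline{R^2_{cv}}\bigr)^2 + \operatorname{Var}\bigl(R^2_{cv}\bigr) $$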
#======[TIME WARNING: 20s]=======#
axisx = range(50,1050,50)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    #Record 1 - bias (the mean R^2)
    rs.append(cvresult.mean())
    #Record the variance
    var.append(cvresult.var())
    #Compute the controllable part of the generalization error
    ge.append((1 - cvresult.mean())**2+cvresult.var())
#Print the parameter value with the highest R^2, together with its variance
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
#Print the parameter value with the lowest variance, together with its R^2
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
#Print the parameter value with the smallest controllable generalization error, together with its R^2, variance and generalization error
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="red",label="XGB")
plt.legend()
plt.show()
650 0.80476050359201 0.01053673846018678
50 0.7857724708830981 0.009072727885598212
150 0.8032842414878519 0.009747694343514357 0.04844478399052411
![output_31_1.png](output_31_1.png)
axisx = range(100,300,10)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)*0.01
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
#Add the variance lines
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
180 0.8038787848970184 0.00959321570484315
180 0.8038787848970184 0.00959321570484315
180 0.8038787848970184 0.00959321570484315 0.04805674671831314
![output_32_1.png](output_32_1.png)
#What about the controllable part of the generalization error?
plt.figure(figsize=(20,5))
plt.plot(axisx,ge,c="gray",linestyle='-.')
plt.show()

![output_33_0.png](output_33_0.png)
#Verify whether the effect of the model actually improves on the test set
time0 = time()
print(XGBR(n_estimators=100,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)

0.9197580267581366
0.0787498950958252
time0 = time()
print(XGBR(n_estimators=660,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)

0.9208745746309475
0.36807847023010254
time0 = time()
print(XGBR(n_estimators=180,random_state=420).fit(Xtrain,Ytrain).score(Xtest,Ytest))
print(time()-time0)

0.9231068620728082
0.12366437911987305
axisx = np.linspace(0,1,20)
rs = []
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    rs.append(CVS(reg,Xtrain,Ytrain,cv=cv).mean())
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="green",label="XGB")
plt.legend()
plt.show()
0.7368421052631579 0.837609040251761
![output_37_1.png](output_37_1.png)
#Continue to refine the learning curve
axisx = np.linspace(0.05,1,20)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
0.65 0.8302530801197368 0.008708816667924316
0.7999999999999999 0.8277414964661117 0.007159903723250457
0.7999999999999999 0.8277414964661117 0.007159903723250457 0.036832895762985055
![output_38_1.png](output_38_1.png)
#Refine the learning curve further
axisx = np.linspace(0.75,1,25)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=180,subsample=i,random_state=420)
    cvresult = CVS(reg,Xtrain,Ytrain,cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2+cvresult.var())
print(axisx[rs.index(max(rs))],max(rs),var[rs.index(max(rs))])
print(axisx[var.index(min(var))],rs[var.index(min(var))],min(var))
print(axisx[ge.index(min(ge))],rs[ge.index(min(ge))],var[ge.index(min(ge))],min(ge))
rs = np.array(rs)
var = np.array(var)
plt.figure(figsize=(20,5))
plt.plot(axisx,rs,c="black",label="XGB")
plt.plot(axisx,rs+var,c="red",linestyle='-.')
plt.plot(axisx,rs-var,c="red",linestyle='-.')
plt.legend()
plt.show()
0.7708333333333334 0.833489187182165 0.005575077682875093
0.7708333333333334 0.833489187182165 0.005575077682875093
0.7708333333333334 0.833489187182165 0.005575077682875093 0.033300928468131166
![output_39_1.png](output_39_1.png)
reg = XGBR(n_estimators=180
           #,subsample=0.7708333333333334
           ,random_state=420).fit(Xtrain,Ytrain)
reg.score(Xtest,Ytest)
0.9159462982185405
MSE(Ytest,reg.predict(Xtest))
7.821523502888769
#First, define a scoring function that prints the cross validation results on Xtrain for several metrics at once
def regassess(reg, Xtrain, Ytrain, cv, scoring=["r2"], show=True):
    score = []
    for i in range(len(scoring)):
        #Cross validated mean for this metric (computed once, reused for printing and for the returned list)
        s = CVS(reg, Xtrain, Ytrain, cv=cv, scoring=scoring[i]).mean()
        if show:
            print("{}:{:.2f}".format(scoring[i]   #Name of the evaluation metric
                                     , s))
        score.append(s)
    return score
reg = XGBR(n_estimators=180,random_state=420)
regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"])
r2:0.80
neg_mean_squared_error:-13.48

[0.8038787848970184, -13.482301822063182]
regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"],show=False)
[0.8038787848970184, -13.482301822063182]
from time import time
import datetime

for i in [0,0.2,0.5,1]:
    time0=time()
    reg = XGBR(n_estimators=180,random_state=420,learning_rate=i)
    print("learning_rate = {}".format(i))
    regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"])
    print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
    print("\t")
learning_rate = 0
r2:-6.76
neg_mean_squared_error:-567.55
00:01:561781

learning_rate = 0.2
r2:0.81
neg_mean_squared_error:-13.32
00:01:848888

learning_rate = 0.5
r2:0.81
neg_mean_squared_error:-13.24
00:01:541875

learning_rate = 1
r2:0.72
neg_mean_squared_error:-19.11
00:01:499027
axisx = np.arange(0.05,1,0.05)
rs = []
te = []
for i in axisx:
    reg = XGBR(n_estimators=180,random_state=420,learning_rate=i)
    score = regassess(reg,Xtrain,Ytrain,cv,scoring = ["r2","neg_mean_squared_error"],show=False)
    test = reg.fit(Xtrain,Ytrain).score(Xtest,Ytest)
    rs.append(score[0])
    te.append(test)
print(axisx[rs.index(max(rs))],max(rs))
plt.figure(figsize=(20,5))
plt.plot(axisx,te,c="gray",label="test")
plt.plot(axisx,rs,c="green",label="train")
plt.legend()
plt.show()
0.55 0.8125604372670463
![output_47_1.png](output_47_1.png)
for booster in ["gbtree","gblinear","dart"]:
    reg = XGBR(n_estimators=180
               ,learning_rate=0.1
               ,random_state=420
               ,booster=booster).fit(Xtrain,Ytrain)
    print(booster)
    print(reg.score(Xtest,Ytest))

gbtree
0.9231068620728082
gblinear
0.6286510307485139
dart
0.923106843149575
#Default objective: reg:linear
reg = XGBR(n_estimators=180,random_state=420).fit(Xtrain,Ytrain)
reg.score(Xtest, Ytest)
0.9231068620728082
MSE(Ytest,reg.predict(Xtest))
7.155205217161047
#Implementation with the native xgboost library
import xgboost as xgb
#Read the data using the DMatrix class
dtrain = xgb.DMatrix(Xtrain,Ytrain)   #Both the feature matrix and the label are passed in
dtest = xgb.DMatrix(Xtest,Ytest)
#Unfortunately a DMatrix cannot be opened and inspected directly, so the data is usually examined in pandas first and then wrapped in a DMatrix
dtrain
<xgboost.core.DMatrix at 0x2770de3bdd8>
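That said, a DMatrix still exposes a few accessors for basic inspection (a small sketch using standard `DMatrix` methods):

```python
# Inspect a DMatrix without converting back to pandas.
print(dtrain.num_row(), dtrain.num_col())   # number of samples and number of features
print(dtrain.get_label()[:5])               # the first few labels
```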
import pandas as pd
pd.DataFrame(Xtrain)
  | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.03041 | 0.0 | 5.19 | 0.0 | 0.515 | 5.895 | 59.6 | 5.6150 | 5.0 | 224.0 | 20.2 | 394.81 | 10.56 |
1 | 0.04113 | 25.0 | 4.86 | 0.0 | 0.426 | 6.727 | 33.5 | 5.4007 | 4.0 | 281.0 | 19.0 | 396.90 | 5.29 |
2 | 10.23300 | 0.0 | 18.10 | 0.0 | 0.614 | 6.185 | 96.7 | 2.1705 | 24.0 | 666.0 | 20.2 | 379.70 | 18.03 |
3 | 0.17142 | 0.0 | 6.91 | 0.0 | 0.448 | 5.682 | 33.8 | 5.1004 | 3.0 | 233.0 | 17.9 | 396.90 | 10.21 |
4 | 0.05059 | 0.0 | 4.49 | 0.0 | 0.449 | 6.389 | 48.0 | 4.7794 | 3.0 | 247.0 | 18.5 | 396.90 | 9.62 |
5 | 0.13587 | 0.0 | 10.59 | 1.0 | 0.489 | 6.064 | 59.1 | 4.2392 | 4.0 | 277.0 | 18.6 | 381.32 | 14.66 |
6 | 0.04981 | 21.0 | 5.64 | 0.0 | 0.439 | 5.998 | 21.4 | 6.8147 | 4.0 | 243.0 | 16.8 | 396.90 | 8.43 |
7 | 0.02543 | 55.0 | 3.78 | 0.0 | 0.484 | 6.696 | 56.4 | 5.7321 | 5.0 | 370.0 | 17.6 | 396.90 | 7.18 |
8 | 0.10793 | 0.0 | 8.56 | 0.0 | 0.520 | 6.195 | 54.4 | 2.7778 | 5.0 | 384.0 | 20.9 | 393.49 | 13.00 |
9 | 0.02498 | 0.0 | 1.89 | 0.0 | 0.518 | 6.540 | 59.7 | 6.2669 | 1.0 | 422.0 | 15.9 | 389.96 | 8.65 |
10 | 0.09299 | 0.0 | 25.65 | 0.0 | 0.581 | 5.961 | 92.9 | 2.0869 | 2.0 | 188.0 | 19.1 | 378.09 | 17.93 |
11 | 0.15876 | 0.0 | 10.81 | 0.0 | 0.413 | 5.961 | 17.5 | 5.2873 | 4.0 | 305.0 | 19.2 | 376.94 | 9.88 |
12 | 6.71772 | 0.0 | 18.10 | 0.0 | 0.713 | 6.749 | 92.6 | 2.3236 | 24.0 | 666.0 | 20.2 | 0.32 | 17.44 |
13 | 0.03768 | 80.0 | 1.52 | 0.0 | 0.404 | 7.274 | 38.3 | 7.3090 | 2.0 | 329.0 | 12.6 | 392.20 | 6.62 |
14 | 5.20177 | 0.0 | 18.10 | 1.0 | 0.770 | 6.127 | 83.4 | 2.7227 | 24.0 | 666.0 | 20.2 | 395.43 | 11.48 |
15 | 11.08740 | 0.0 | 18.10 | 0.0 | 0.718 | 6.411 | 100.0 | 1.8589 | 24.0 | 666.0 | 20.2 | 318.75 | 15.02 |
16 | 0.11432 | 0.0 | 8.56 | 0.0 | 0.520 | 6.781 | 71.3 | 2.8561 | 5.0 | 384.0 | 20.9 | 395.58 | 7.67 |
17 | 0.05602 | 0.0 | 2.46 | 0.0 | 0.488 | 7.831 | 53.6 | 3.1992 | 3.0 | 193.0 | 17.8 | 392.63 | 4.45 |
18 | 0.24103 | 0.0 | 7.38 | 0.0 | 0.493 | 6.083 | 43.7 | 5.4159 | 5.0 | 287.0 | 19.6 | 396.90 | 12.79 |
19 | 0.09378 | 12.5 | 7.87 | 0.0 | 0.524 | 5.889 | 39.0 | 5.4509 | 5.0 | 311.0 | 15.2 | 390.50 | 15.71 |
20 | 8.71675 | 0.0 | 18.10 | 0.0 | 0.693 | 6.471 | 98.8 | 1.7257 | 24.0 | 666.0 | 20.2 | 391.98 | 17.12 |
21 | 7.36711 | 0.0 | 18.10 | 0.0 | 0.679 | 6.193 | 78.1 | 1.9356 | 24.0 | 666.0 | 20.2 | 96.73 | 21.52 |
22 | 1.38799 | 0.0 | 8.14 | 0.0 | 0.538 | 5.950 | 82.0 | 3.9900 | 4.0 | 307.0 | 21.0 | 232.60 | 27.71 |
23 | 14.33370 | 0.0 | 18.10 | 0.0 | 0.614 | 6.229 | 88.0 | 1.9512 | 24.0 | 666.0 | 20.2 | 383.32 | 13.11 |
24 | 28.65580 | 0.0 | 18.10 | 0.0 | 0.597 | 5.155 | 100.0 | 1.5894 | 24.0 | 666.0 | 20.2 | 210.97 | 20.08 |
25 | 0.80271 | 0.0 | 8.14 | 0.0 | 0.538 | 5.456 | 36.6 | 3.7965 | 4.0 | 307.0 | 21.0 | 288.99 | 11.69 |
26 | 1.00245 | 0.0 | 8.14 | 0.0 | 0.538 | 6.674 | 87.3 | 4.2390 | 4.0 | 307.0 | 21.0 | 380.23 | 11.98 |
27 | 9.91655 | 0.0 | 18.10 | 0.0 | 0.693 | 5.852 | 77.8 | 1.5004 | 24.0 | 666.0 | 20.2 | 338.16 | 29.97 |
28 | 0.13158 | 0.0 | 10.01 | 0.0 | 0.547 | 6.176 | 72.5 | 2.7301 | 6.0 | 432.0 | 17.8 | 393.30 | 12.04 |
29 | 0.14231 | 0.0 | 10.01 | 0.0 | 0.547 | 6.254 | 84.2 | 2.2565 | 6.0 | 432.0 | 17.8 | 388.74 | 10.45 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
324 | 0.13117 | 0.0 | 8.56 | 0.0 | 0.520 | 6.127 | 85.2 | 2.1224 | 5.0 | 384.0 | 20.9 | 387.69 | 14.09 |
325 | 1.35472 | 0.0 | 8.14 | 0.0 | 0.538 | 6.072 | 100.0 | 4.1750 | 4.0 | 307.0 | 21.0 | 376.73 | 13.04 |
326 | 0.10153 | 0.0 | 12.83 | 0.0 | 0.437 | 6.279 | 74.5 | 4.0522 | 5.0 | 398.0 | 18.7 | 373.66 | 11.97 |
327 | 0.22927 | 0.0 | 6.91 | 0.0 | 0.448 | 6.030 | 85.5 | 5.6894 | 3.0 | 233.0 | 17.9 | 392.74 | 18.80 |
328 | 0.04666 | 80.0 | 1.52 | 0.0 | 0.404 | 7.107 | 36.6 | 7.3090 | 2.0 | 329.0 | 12.6 | 354.31 | 8.61 |
329 | 0.08014 | 0.0 | 5.96 | 0.0 | 0.499 | 5.850 | 41.5 | 3.9342 | 5.0 | 279.0 | 19.2 | 396.90 | 8.77 |
330 | 0.40771 | 0.0 | 6.20 | 1.0 | 0.507 | 6.164 | 91.3 | 3.0480 | 8.0 | 307.0 | 17.4 | 395.24 | 21.46 |
331 | 0.13642 | 0.0 | 10.59 | 0.0 | 0.489 | 5.891 | 22.3 | 3.9454 | 4.0 | 277.0 | 18.6 | 396.90 | 10.87 |
332 | 9.32909 | 0.0 | 18.10 | 0.0 | 0.713 | 6.185 | 98.7 | 2.2616 | 24.0 | 666.0 | 20.2 | 396.90 | 18.13 |
333 | 0.09103 | 0.0 | 2.46 | 0.0 | 0.488 | 7.155 | 92.2 | 2.7006 | 3.0 | 193.0 | 17.8 | 394.12 | 4.82 |
334 | 0.01301 | 35.0 | 1.52 | 0.0 | 0.442 | 7.241 | 49.3 | 7.0379 | 1.0 | 284.0 | 15.5 | 394.74 | 5.49 |
335 | 0.59005 | 0.0 | 21.89 | 0.0 | 0.624 | 6.372 | 97.9 | 2.3274 | 4.0 | 437.0 | 21.2 | 385.76 | 11.12 |
336 | 1.12658 | 0.0 | 19.58 | 1.0 | 0.871 | 5.012 | 88.0 | 1.6102 | 5.0 | 403.0 | 14.7 | 343.28 | 12.12 |
337 | 0.07886 | 80.0 | 4.95 | 0.0 | 0.411 | 7.148 | 27.7 | 5.1167 | 4.0 | 245.0 | 19.2 | 396.90 | 3.56 |
338 | 0.21719 | 0.0 | 10.59 | 1.0 | 0.489 | 5.807 | 53.8 | 3.6526 | 4.0 | 277.0 | 18.6 | 390.94 | 16.03 |
339 | 0.53700 | 0.0 | 6.20 | 0.0 | 0.504 | 5.981 | 68.1 | 3.6715 | 8.0 | 307.0 | 17.4 | 378.35 | 11.65 |
340 | 3.32105 | 0.0 | 19.58 | 1.0 | 0.871 | 5.403 | 100.0 | 1.3216 | 5.0 | 403.0 | 14.7 | 396.90 | 26.82 |
341 | 1.49632 | 0.0 | 19.58 | 0.0 | 0.871 | 5.404 | 100.0 | 1.5916 | 5.0 | 403.0 | 14.7 | 341.60 | 13.28 |
342 | 0.38735 | 0.0 | 25.65 | 0.0 | 0.581 | 5.613 | 95.6 | 1.7572 | 2.0 | 188.0 | 19.1 | 359.29 | 27.26 |
343 | 0.06617 | 0.0 | 3.24 | 0.0 | 0.460 | 5.868 | 25.8 | 5.2146 | 4.0 | 430.0 | 16.9 | 382.44 | 9.97 |
344 | 0.78570 | 20.0 | 3.97 | 0.0 | 0.647 | 7.014 | 84.6 | 2.1329 | 5.0 | 264.0 | 13.0 | 384.07 | 14.79 |
345 | 1.41385 | 0.0 | 19.58 | 1.0 | 0.871 | 6.129 | 96.0 | 1.7494 | 5.0 | 403.0 | 14.7 | 321.02 | 15.12 |
346 | 0.06047 | 0.0 | 2.46 | 0.0 | 0.488 | 6.153 | 68.8 | 3.2797 | 3.0 | 193.0 | 17.8 | 387.11 | 13.15 |
347 | 8.49213 | 0.0 | 18.10 | 0.0 | 0.584 | 6.348 | 86.1 | 2.0527 | 24.0 | 666.0 | 20.2 | 83.45 | 17.64 |
348 | 0.17134 | 0.0 | 10.01 | 0.0 | 0.547 | 5.928 | 88.2 | 2.4631 | 6.0 | 432.0 | 17.8 | 344.91 | 15.76 |
349 | 0.03871 | 52.5 | 5.32 | 0.0 | 0.405 | 6.209 | 31.3 | 7.3172 | 6.0 | 293.0 | 16.6 | 396.90 | 7.14 |
350 | 0.12650 | 25.0 | 5.13 | 0.0 | 0.453 | 6.762 | 43.4 | 7.9809 | 8.0 | 284.0 | 19.7 | 395.58 | 9.50 |
351 | 6.96215 | 0.0 | 18.10 | 0.0 | 0.700 | 5.713 | 97.0 | 1.9265 | 24.0 | 666.0 | 20.2 | 394.43 | 17.11 |
352 | 0.09164 | 0.0 | 10.81 | 0.0 | 0.413 | 6.065 | 7.8 | 5.2873 | 4.0 | 305.0 | 19.2 | 390.91 | 5.52 |
353 | 5.58107 | 0.0 | 18.10 | 0.0 | 0.713 | 6.436 | 87.9 | 2.3158 | 24.0 | 666.0 | 20.2 | 100.19 | 16.22 |
354 rows × 13 columns
#Write out the parameters
param = {'silent':True            #The default is False (prints the training process); it is usually switched to True manually
         ,'objective':'reg:linear'
         ,"eta":0.1}
num_round = 180   #n_estimators
#xgb.train: only the training data and the number of boosting rounds are passed directly; all other parameters go in through the param dict
bst = xgb.train(param, dtrain, num_round)
#The predict interface
preds = bst.predict(dtest)
preds
array([ 6.4613175, 22.123888 , 30.755163 , 13.424351 , 8.378565 , 23.608477 , 14.2151165, 16.026499 , 15.498961 , 14.10649 , 24.030867 , 34.36362 , 21.461111 , 28.839497 , 19.568035 , 10.188658 , 19.42369 , 23.539951 , 22.850523 , 23.198708 , 17.82486 , 16.07219 , 27.602034 , 20.773046 , 20.868807 , 15.865789 , 22.076588 , 29.292158 , 22.841051 , 15.770392 , 36.680496 , 21.057947 , 20.137005 , 23.777853 , 22.70615 , 23.863268 , 15.595315 , 24.565872 , 17.720552 , 33.95111 , 18.784286 , 20.483374 , 37.10668 , 18.068268 , 12.73839 , 31.186407 , 45.895035 , 12.696718 , 10.773068 , 36.064293 , 26.262571 , 19.908836 , 20.715096 , 48.814903 , 27.550056 , 25.225826 , 17.15366 , 21.215551 , 17.426773 , 18.478971 , 14.6453705, 22.841473 , 18.869593 , 29.990978 , 29.933191 , 18.756853 , 18.784918 , 16.33361 , 23.155968 , 19.144344 , 29.724382 , 42.121906 , 31.544363 , 23.017508 , 19.536028 , 23.851992 , 41.790577 , 28.676506 , 20.036425 , 21.723856 , 19.537868 , 46.349495 , 23.119637 , 8.071444 , 26.358177 , 24.85706 , 17.057547 , 20.084204 , 18.54005 , 7.157663 , 20.593962 , 15.451031 , 45.09552 , 34.435097 , 22.969654 , 10.10335 , 10.803318 , 18.42058 , 7.800361 , 11.79309 , 30.755335 , 10.80648 , 26.122625 , 22.589502 , 31.219454 , 42.283318 , 19.274109 , 7.3861685, 23.055706 , 14.315018 , 45.136368 , 21.243176 , 19.715647 , 24.533583 , 18.24247 , 28.382742 , 23.41182 , 19.962458 , 45.916683 , 17.521889 , 24.13039 , 26.147182 , 18.418781 , 17.606575 , 14.540631 , 20.595512 , 32.59128 , 10.155618 , 20.53032 , 21.477484 , 17.450048 , 20.154486 , 8.010227 , 30.482618 , 29.677181 , 20.357098 , 18.222181 , 14.14504 , 10.100547 , 18.85027 , 41.85804 , 17.44544 , 22.907183 , 21.02398 , 29.799366 , 20.219465 , 12.404763 , 45.750965 , 25.56757 , 22.000706 , 14.194921 , 27.102774 ], dtype=float32)
from sklearn.metrics import r2_score

r2_score(Ytest,preds)
0.9260984298390122
MSE(Ytest,preds)
6.87682821415069
import xgboost as xgb

#For convenience, use the full data here
dfull = xgb.DMatrix(X,y)
#Set the parameters
param1 = {'silent':True,'obj':'reg:linear',"gamma":0}
num_round = 100
n_fold = 5   #like KFold in sklearn
#Use xgboost's own cross validation, xgb.cv
time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round, n_fold)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:00:610364
#What does xgb.cv return?
cvresult1   #As the number of trees increases, how does the performance of the model change?
  | train-rmse-mean | train-rmse-std | test-rmse-mean | test-rmse-std |
---|---|---|---|---|
0 | 17.105578 | 0.129116 | 17.163215 | 0.584296 |
1 | 12.337973 | 0.097557 | 12.519736 | 0.473458 |
2 | 8.994071 | 0.065756 | 9.404534 | 0.472310 |
3 | 6.629481 | 0.050323 | 7.250335 | 0.500342 |
4 | 4.954406 | 0.033209 | 5.920812 | 0.591874 |
5 | 3.781454 | 0.029604 | 5.045190 | 0.687971 |
6 | 2.947767 | 0.038786 | 4.472030 | 0.686492 |
7 | 2.357748 | 0.042040 | 4.179314 | 0.737935 |
8 | 1.951907 | 0.044972 | 3.979878 | 0.798198 |
9 | 1.660895 | 0.044894 | 3.870751 | 0.812331 |
10 | 1.464296 | 0.049422 | 3.816196 | 0.835251 |
11 | 1.323362 | 0.056240 | 3.788125 | 0.841643 |
12 | 1.214468 | 0.046524 | 3.766973 | 0.848989 |
13 | 1.137311 | 0.044522 | 3.741199 | 0.872370 |
14 | 1.064629 | 0.042245 | 3.729194 | 0.879429 |
15 | 1.010286 | 0.038892 | 3.717997 | 0.879572 |
16 | 0.941258 | 0.038360 | 3.706736 | 0.878032 |
17 | 0.883599 | 0.056640 | 3.693886 | 0.873913 |
18 | 0.829674 | 0.057284 | 3.693296 | 0.883429 |
19 | 0.772332 | 0.042899 | 3.687510 | 0.880928 |
20 | 0.731557 | 0.049150 | 3.687037 | 0.879180 |
21 | 0.690698 | 0.041190 | 3.677507 | 0.882060 |
22 | 0.657743 | 0.042137 | 3.675343 | 0.883635 |
23 | 0.619988 | 0.054097 | 3.671006 | 0.879224 |
24 | 0.585414 | 0.052585 | 3.670951 | 0.867470 |
25 | 0.548723 | 0.054440 | 3.673598 | 0.863241 |
26 | 0.527266 | 0.049630 | 3.673988 | 0.867116 |
27 | 0.504405 | 0.040376 | 3.671702 | 0.864566 |
28 | 0.468534 | 0.033020 | 3.671324 | 0.862536 |
29 | 0.448633 | 0.032191 | 3.675074 | 0.864713 |
... | ... | ... | ... | ... |
70 | 0.071057 | 0.015411 | 3.668067 | 0.859435 |
71 | 0.067946 | 0.013960 | 3.667708 | 0.859370 |
72 | 0.065197 | 0.012475 | 3.668174 | 0.859307 |
73 | 0.062789 | 0.012538 | 3.668738 | 0.859471 |
74 | 0.060294 | 0.012669 | 3.668950 | 0.860112 |
75 | 0.058278 | 0.012055 | 3.669084 | 0.859966 |
76 | 0.055402 | 0.011065 | 3.669627 | 0.859505 |
77 | 0.053819 | 0.011072 | 3.669904 | 0.859294 |
78 | 0.051280 | 0.011215 | 3.670185 | 0.859204 |
79 | 0.048748 | 0.009988 | 3.670092 | 0.859250 |
80 | 0.046972 | 0.009233 | 3.669869 | 0.858892 |
81 | 0.044753 | 0.008664 | 3.669702 | 0.858676 |
82 | 0.043148 | 0.008636 | 3.669704 | 0.858921 |
83 | 0.041823 | 0.008355 | 3.669596 | 0.858843 |
84 | 0.040257 | 0.008378 | 3.669730 | 0.858459 |
85 | 0.038518 | 0.007731 | 3.669835 | 0.858698 |
86 | 0.036694 | 0.006928 | 3.669705 | 0.858958 |
87 | 0.034932 | 0.006174 | 3.669722 | 0.858715 |
88 | 0.033947 | 0.006206 | 3.669964 | 0.858547 |
89 | 0.032706 | 0.006176 | 3.669988 | 0.858516 |
90 | 0.031317 | 0.006171 | 3.670116 | 0.858512 |
91 | 0.029697 | 0.005473 | 3.669930 | 0.858759 |
92 | 0.028561 | 0.005599 | 3.669906 | 0.858549 |
93 | 0.027585 | 0.005694 | 3.669822 | 0.858554 |
94 | 0.026436 | 0.005414 | 3.669985 | 0.858390 |
95 | 0.025204 | 0.005145 | 3.669921 | 0.858313 |
96 | 0.024422 | 0.005242 | 3.669983 | 0.858255 |
97 | 0.023661 | 0.005117 | 3.669947 | 0.858331 |
98 | 0.022562 | 0.004704 | 3.669868 | 0.858578 |
99 | 0.021496 | 0.004738 | 3.669824 | 0.858305 |
100 rows × 4 columns
plt.figure(figsize=(20,5))
plt.grid()
plt.plot(range(1,101),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,101),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.legend()
plt.show()

#What can we see from this picture?
#How can we judge the generalization ability of the model from the graph?
#Given this picture, what should the goal of parameter tuning be?
![output_66_0.png](output_66_0.png)
#What is the default evaluation metric for a regression model in xgboost?
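One way to answer this is to look at the column names of the cv result computed above: they read train-rmse-mean, test-rmse-mean and so on, so the default metric for regression is rmse. A small sketch reusing `cvresult1` (the next cell then switches the metric to mae via `eval_metric`):

```python
# The column names of xgb.cv's result reveal the evaluation metric used by default.
print(cvresult1.columns.tolist())   # e.g. ['train-rmse-mean', 'train-rmse-std', 'test-rmse-mean', 'test-rmse-std']
```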
param1 = {'silent':True,'obj':'reg:linear',"gamma":0,"eval_metric":"mae"}
cvresult1 = xgb.cv(param1, dfull, num_round, n_fold)

plt.figure(figsize=(20,5))
plt.grid()
#num_round is still 100 at this point, so plot num_round points rather than a hard-coded 180
plt.plot(range(1,num_round+1),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,num_round+1),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.legend()
plt.show()
![output_68_0.png](output_68_0.png)
param1 = {'silent':True,'obj':'reg:linear',"gamma":0}
param2 = {'silent':True,'obj':'reg:linear',"gamma":20}
num_round = 180
n_fold = 5

time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round, n_fold)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:01:083104
time0 = time()
cvresult2 = xgb.cv(param2, dfull, num_round, n_fold)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:01:359378
plt.figure(figsize=(20,5))
plt.grid()
plt.plot(range(1,181),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,181),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.plot(range(1,181),cvresult2.iloc[:,0],c="green",label="train,gamma=20")
plt.plot(range(1,181),cvresult2.iloc[:,2],c="blue",label="test,gamma=20")
plt.legend()
plt.show()

#Can you see from here how gamma controls overfitting? It restrains learning on the training set, i.e. it lowers performance on the training set
![output_71_0.png](output_71_0.png)
import xgboost as xgb
import matplotlib.pyplot as plt
from time import time
import datetime
from sklearn.datasets import load_breast_cancer

data2 = load_breast_cancer()
x2 = data2.data
y2 = data2.target
dfull2 = xgb.DMatrix(x2,y2)

param1 = {'silent':True,'obj':'binary:logistic',"gamma":0,"nfold":5
          ,"eval_metrics":"error"
          }
param2 = {'silent':True,'obj':'binary:logistic',"gamma":1,"nfold":5}
num_round = 100
time0 = time()
cvresult1 = xgb.cv(param1, dfull2, num_round, metrics=("error"))
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:00:271581
time0 = time()
cvresult2 = xgb.cv(param2, dfull2, num_round, metrics=("error"))
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))
00:00:443810
plt.figure(figsize=(20,5))
plt.grid()
plt.plot(range(1,101),cvresult1.iloc[:,0],c="red",label="train,gamma=0")
plt.plot(range(1,101),cvresult1.iloc[:,2],c="orange",label="test,gamma=0")
plt.plot(range(1,101),cvresult2.iloc[:,0],c="green",label="train,gamma=1")
plt.plot(range(1,101),cvresult2.iloc[:,2],c="blue",label="test,gamma=1")
plt.legend()
plt.show()

![output_76_0.png](output_76_0.png)
dfull = xgb.DMatrix(X,y)

param1 = {'silent':True
          ,'obj':'reg:linear'
          ,"subsample":1
          ,"max_depth":6
          ,"eta":0.3
          ,"gamma":0
          ,"lambda":1
          ,"alpha":0
          ,"colsample_bytree":1
          ,"colsample_bylevel":1
          ,"colsample_bynode":1
          ,"nfold":5}
num_round = 200
time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

fig,ax = plt.subplots(1,figsize=(15,8))
ax.set_ylim(top=5)
ax.grid()
ax.plot(range(1,201),cvresult1.iloc[:,0],c="red",label="train,original")
ax.plot(range(1,201),cvresult1.iloc[:,2],c="orange",label="test,original")
ax.legend(fontsize="xx-large")
plt.show()
00:00:513584
![output_78_1.png](output_78_1.png)
param1 = {'silent':True
          ,'obj':'reg:linear'
          ,"subsample":1
          ,"max_depth":6
          ,"eta":0.3
          ,"gamma":0
          ,"lambda":1
          ,"alpha":0
          ,"colsample_bytree":1
          ,"colsample_bylevel":1
          ,"colsample_bynode":1
          ,"nfold":5}
num_round = 200

time0 = time()
cvresult1 = xgb.cv(param1, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

fig,ax = plt.subplots(1,figsize=(15,8))
ax.set_ylim(top=5)
ax.grid()
ax.plot(range(1,201),cvresult1.iloc[:,0],c="red",label="train,original")
ax.plot(range(1,201),cvresult1.iloc[:,2],c="orange",label="test,original")

param2 = {'silent':True
          ,'obj':'reg:linear'
          ,"max_depth":2
          ,"eta":0.05
          ,"gamma":0
          ,"lambda":1
          ,"alpha":0
          ,"colsample_bytree":1
          ,"colsample_bylevel":0.4
          ,"colsample_bynode":1
          ,"nfold":5}

param3 = {'silent':True
          ,'obj':'reg:linear'
          ,"subsample":1
          ,"eta":0.05
          ,"gamma":20
          ,"lambda":3.5
          ,"alpha":0.2
          ,"max_depth":4
          ,"colsample_bytree":0.4
          ,"colsample_bylevel":0.6
          ,"colsample_bynode":1
          ,"nfold":5}

time0 = time()
cvresult2 = xgb.cv(param2, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

time0 = time()
cvresult3 = xgb.cv(param3, dfull, num_round)
print(datetime.datetime.fromtimestamp(time()-time0).strftime("%M:%S:%f"))

ax.plot(range(1,201),cvresult2.iloc[:,0],c="green",label="train,last")
ax.plot(range(1,201),cvresult2.iloc[:,2],c="blue",label="test,last")
ax.plot(range(1,201),cvresult3.iloc[:,0],c="gray",label="train,this")
ax.plot(range(1,201),cvresult3.iloc[:,2],c="pink",label="test,this")
ax.legend(fontsize="xx-large")
plt.show()

00:00:532621
00:00:223373
00:00:259346

![output_79_1.png](output_79_1.png)
import pickle
dtrain = xgb.DMatrix(Xtrain,Ytrain)

#Set the parameters and train the model
param = {'silent':True
         ,'obj':'reg:linear'
         ,"subsample":1
         ,"eta":0.05
         ,"gamma":20
         ,"lambda":3.5
         ,"alpha":0.2
         ,"max_depth":4
         ,"colsample_bytree":0.4
         ,"colsample_bylevel":0.6
         ,"colsample_bynode":1}
num_round = 180

bst = xgb.train(param, dtrain, num_round)
#Save the model
pickle.dump(bst, open("xgboostonboston.dat","wb"))
#Note: with open we usually use "w" or "r" as the mode, but those only work for text files (.txt)
#When what we want to save is not text but the model object itself, we use "wb" and "rb" as the mode
#"wb" means write in binary and "rb" means read in binary; the file saved this way holds the model and can be loaded and called later
#See where the model is saved
import sys
sys.path
['C:\\Pythonwork\\micro-class\\11 xgboost', 'C:\\Python\\python37.zip', 'C:\\Python\\DLLs', 'C:\\Python\\lib', 'C:\\Python', '', 'C:\\Python\\lib\\site-packages', 'C:\\Python\\lib\\site-packages\\win32', 'C:\\Python\\lib\\site-packages\\win32\\lib', 'C:\\Python\\lib\\site-packages\\Pythonwin', 'C:\\Python\\lib\\site-packages\\IPython\\extensions', 'C:\\Users\\Shuyu\\.ipython']
#After reopening jupyter lab
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pickle
import xgboost as xgb

data = load_boston()
X = data.data
y = data.target
Xtrain,Xtest,Ytrain,Ytest = TTS(X,y,test_size=0.3,random_state=420)
#Note: if the saved model was built with the xgboost library itself, the data passed in for prediction must also be xgboost's own data type (DMatrix)
dtest = xgb.DMatrix(Xtest,Ytest)
#Load the model
loaded_model = pickle.load(open("xgboostonboston.dat", "rb"))
print("Loaded model from: xgboostonboston.dat")
Loaded model from: xgboostonboston.dat
#Make predictions by calling the predict interface directly
ypreds = loaded_model.predict(dtest)
ypreds
array([ 9.244746, 22.536953, 28.47614 , 13.126131, 9.944413, 21.356094, 15.187935, 15.559099, 15.629611, 15.555439, 21.427156, 35.502792, 20.827318, 29.397932, 21.669186, 11.906522, 21.464252, 26.143337, 26.300356, 23.474188, 18.186035, 15.851086, 22.928507, 22.919674, 20.557487, 16.27315 , 22.000988, 25.230766, 23.12165 , 16.663473, 34.747093, 20.003593, 20.617601, 23.74025 , 23.044952, 24.849056, 15.414761, 23.383522, 18.500463, 33.790466, 18.009186, 18.729418, 33.181175, 18.834534, 15.085677, 27.601177, 42.75243 , 15.359873, 10.37829 , 37.5367 , 27.097404, 20.73775 , 20.198935, 46.20087 , 26.959623, 24.566458, 18.678255, 20.913795, 17.369501, 17.823708, 15.136806, 24.533068, 19.465569, 30.474009, 29.571526, 19.773672, 21.554045, 17.590807, 22.250225, 18.275839, 29.012346, 40.198055, 30.235825, 23.174484, 20.191778, 23.742437, 38.217915, 27.173447, 21.068003, 20.5974 , 18.412853, 45.326836, 22.941956, 9.055015, 27.04054 , 23.45833 , 17.310354, 20.762442, 15.6619 , 12.178641, 21.293903, 19.826134, 41.0362 , 31.300192, 24.400661, 11.267941, 15.763796, 20.984198, 9.232577, 11.090055, 32.739227, 16.265066, 24.975492, 24.905188, 34.348663, 41.02216 , 20.181097, 8.897793, 22.894953, 15.023113, 45.222473, 21.289068, 22.882399, 24.792355, 19.141815, 27.372849, 24.132881, 19.243576, 43.235798, 17.438314, 24.561804, 24.187195, 17.001463, 18.172377, 15.483843, 23.802166, 31.079023, 10.322498, 21.977345, 19.267714, 15.559681, 19.336842, 8.979549, 28.35794 , 29.80491 , 21.987814, 19.893597, 19.730898, 10.501988, 17.405378, 40.51527 , 17.420282, 24.272373, 19.771631, 32.620422, 19.19032 , 12.364113, 38.63305 , 24.189354, 23.38174 , 16.924698, 22.633028], dtype=float32)
from sklearn.metrics import mean_squared_error as MSE, r2_score

MSE(Ytest,ypreds)
9.107608696116197
r2_score(Ytest,ypreds)
0.9021254331073938
bst = xgb.train(param, dtrain, num_round)
import joblib

#Again, you can check where the model is saved
joblib.dump(bst,"xgboost-boston.dat")
['xgboost-boston.dat']
loaded_model = joblib.load("xgboost-boston.dat")
dtest = xgb.DMatrix(Xtest,Ytest)
ypreds = loaded_model.predict(dtest)
ypreds
array([ 9.244746, 22.536953, 28.47614 , 13.126131, 9.944413, 21.356094, 15.187935, 15.559099, 15.629611, 15.555439, 21.427156, 35.502792, 20.827318, 29.397932, 21.669186, 11.906522, 21.464252, 26.143337, 26.300356, 23.474188, 18.186035, 15.851086, 22.928507, 22.919674, 20.557487, 16.27315 , 22.000988, 25.230766, 23.12165 , 16.663473, 34.747093, 20.003593, 20.617601, 23.74025 , 23.044952, 24.849056, 15.414761, 23.383522, 18.500463, 33.790466, 18.009186, 18.729418, 33.181175, 18.834534, 15.085677, 27.601177, 42.75243 , 15.359873, 10.37829 , 37.5367 , 27.097404, 20.73775 , 20.198935, 46.20087 , 26.959623, 24.566458, 18.678255, 20.913795, 17.369501, 17.823708, 15.136806, 24.533068, 19.465569, 30.474009, 29.571526, 19.773672, 21.554045, 17.590807, 22.250225, 18.275839, 29.012346, 40.198055, 30.235825, 23.174484, 20.191778, 23.742437, 38.217915, 27.173447, 21.068003, 20.5974 , 18.412853, 45.326836, 22.941956, 9.055015, 27.04054 , 23.45833 , 17.310354, 20.762442, 15.6619 , 12.178641, 21.293903, 19.826134, 41.0362 , 31.300192, 24.400661, 11.267941, 15.763796, 20.984198, 9.232577, 11.090055, 32.739227, 16.265066, 24.975492, 24.905188, 34.348663, 41.02216 , 20.181097, 8.897793, 22.894953, 15.023113, 45.222473, 21.289068, 22.882399, 24.792355, 19.141815, 27.372849, 24.132881, 19.243576, 43.235798, 17.438314, 24.561804, 24.187195, 17.001463, 18.172377, 15.483843, 23.802166, 31.079023, 10.322498, 21.977345, 19.267714, 15.559681, 19.336842, 8.979549, 28.35794 , 29.80491 , 21.987814, 19.893597, 19.730898, 10.501988, 17.405378, 40.51527 , 17.420282, 24.272373, 19.771631, 32.620422, 19.19032 , 12.364113, 38.63305 , 24.189354, 23.38174 , 16.924698, 22.633028], dtype=float32)
MSE(Ytest, ypreds)
9.107608696116197
r2_score(Ytest,ypreds)
0.9021254331073938
#Using the sklearn-style model
from xgboost import XGBRegressor as XGBR

bst = XGBR(n_estimators=200
           ,eta=0.05,gamma=20
           ,reg_lambda=3.5
           ,reg_alpha=0.2
           ,max_depth=4
           ,colsample_bytree=0.4
           ,colsample_bylevel=0.6).fit(Xtrain,Ytrain)   #Training completed
joblib.dump(bst,"xgboost-boston-sklearn.dat")
['xgboost-boston-sklearn.dat']
loaded_model = joblib.load("xgboost-boston-sklearn.dat")
#Here Xtest (a plain numpy array) can be passed in directly, because the model uses the sklearn API
ypreds = loaded_model.predict(Xtest)
Xtest
array([[4.15292e+01, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01, 3.29460e+02, 2.73800e+01], [2.73100e-02, 0.00000e+00, 7.07000e+00, ..., 1.78000e+01, 3.96900e+02, 9.14000e+00], [3.15000e-02, 9.50000e+01, 1.47000e+00, ..., 1.70000e+01, 3.96900e+02, 4.56000e+00], ..., [5.08300e-02, 0.00000e+00, 5.19000e+00, ..., 2.02000e+01, 3.89710e+02, 5.68000e+00], [3.77498e+00, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01, 2.20100e+01, 1.71500e+01], [1.96091e+01, 0.00000e+00, 1.81000e+01, ..., 2.02000e+01, 3.96900e+02, 1.34400e+01]])
dtest
<xgboost.core.DMatrix at 0x29e30670668>
ypreds
array([ 9.350334 , 21.501623 , 30.219057 , 13.021226 , 9.883689 , 20.977922 , 16.023008 , 15.8910475, 15.512305 , 15.706607 , 22.096102 , 35.381573 , 20.3307 , 27.129421 , 19.997156 , 10.935587 , 20.25071 , 26.188572 , 26.711943 , 22.600443 , 18.23832 , 15.876045 , 26.263977 , 22.706024 , 20.18491 , 15.891692 , 21.4781 , 29.047956 , 23.371012 , 17.167185 , 35.699898 , 20.490337 , 20.195292 , 23.81444 , 23.106022 , 25.709312 , 15.0182905, 22.621248 , 18.576109 , 34.25664 , 17.46115 , 19.159126 , 34.79234 , 17.766731 , 17.141891 , 27.755646 , 39.786766 , 22.49913 , 10.246634 , 36.76105 , 26.294876 , 20.75917 , 19.893272 , 46.62629 , 26.549704 , 24.040398 , 17.769514 , 20.76889 , 16.139618 , 17.494894 , 16.005596 , 24.28487 , 19.15237 , 31.407684 , 27.862312 , 18.877817 , 20.50497 , 16.094156 , 22.622025 , 17.762297 , 28.518019 , 41.146317 , 32.52681 , 23.117966 , 19.125128 , 24.141544 , 39.041847 , 25.901724 , 20.974117 , 19.626917 , 18.567612 , 46.46465 , 23.03303 , 9.912106 , 26.407642 , 23.466772 , 16.985506 , 20.73746 , 15.679997 , 11.697191 , 21.320868 , 20.333689 , 41.616425 , 31.659132 , 25.605923 , 12.362759 , 14.593165 , 20.577328 , 9.253377 , 11.1253805, 32.878246 , 15.840851 , 24.695955 , 24.882996 , 34.643425 , 41.556873 , 19.726238 , 8.808649 , 23.04128 , 14.709186 , 46.10303 , 21.435535 , 21.97892 , 24.299171 , 19.591938 , 27.527737 , 23.80468 , 18.782711 , 44.266346 , 17.328068 , 23.030151 , 23.801643 , 16.483137 , 18.219353 , 15.713125 , 23.655058 , 32.294373 , 10.60579 , 22.099716 , 19.26955 , 14.293162 , 19.386055 , 8.824598 , 26.909697 , 29.539446 , 20.38691 , 20.832077 , 22.507433 , 11.142808 , 17.685743 , 40.230915 , 17.526121 , 23.09964 , 19.899158 , 31.775164 , 19.718151 , 12.164877 , 40.867558 , 24.465397 , 22.134802 , 15.041253 , 28.63522 ], dtype=float32)
MSE(Ytest, ypreds)
10.198269690947479
r2_score(Ytest,ypreds)
0.8904046866351292
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from xgboost import XGBClassifier as XGBC
from sklearn.datasets import make_blobs   #To create our own data set
from sklearn.model_selection import train_test_split as TTS
from sklearn.metrics import confusion_matrix as cm, recall_score as recall, roc_auc_score as auc
class_1 = 500   #Category 1 has 500 samples
class_2 = 50    #Category 2 has only 50
centers = [[0.0, 0.0], [2.0, 2.0]]   #Centers of the two categories
clusters_std = [1.5, 0.5]   #Standard deviations of the two clusters; the class with more samples is usually more spread out
X, y = make_blobs(n_samples=[class_1, class_2],
                  centers=centers,
                  cluster_std=clusters_std,
                  random_state=0, shuffle=False)
X.shape
(550, 2)
y
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
(y == 1).sum() / y.shape[0] #9%
0.09090909090909091
Xtrain, Xtest, Ytrain, Ytest = TTS(X,y,test_size=0.3,random_state=420)
#Modeling with the sklearn API
clf = XGBC().fit(Xtrain,Ytrain)
ypred = clf.predict(Xtest)
ypred
array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
clf.score(Xtest,Ytest) #Default model evaluation indicator - Accuracy
0.9272727272727272
cm(Ytest,ypred,labels=[1,0])   #The minority class (label 1) is listed first
array([[ 9, 4], [ 8, 144]], dtype=int64)
recall(Ytest,ypred)
0.6923076923076923
auc(Ytest,clf.predict_proba(Xtest)[:,1])
0.9671052631578947
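The usual recommendation is to set `scale_pos_weight` to the ratio of negative to positive samples. A small sketch of computing it from the training labels (the variable name `spw` is just for illustration; for this 500-vs-50 blob data it comes out close to 10, which is why 10 is tried next):

```python
# scale_pos_weight is conventionally set to (number of negative samples) / (number of positive samples).
spw = (Ytrain == 0).sum() / (Ytrain == 1).sum()
print(spw)
```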
#Set scale_pos_weight to the negative / positive sample ratio
clf_ = XGBC(scale_pos_weight=10).fit(Xtrain,Ytrain)
ypred_ = clf_.predict(Xtest)

clf_.score(Xtest,Ytest)
cm(Ytest,ypred_,labels=[1,0])
recall(Ytest,ypred_)
auc(Ytest,clf_.predict_proba(Xtest)[:,1])   #only this last expression is displayed below
0.9696356275303644
#How do the recall, auc and accuracy of the model change as the sample weight gradually increases?
for i in [1,5,10,20,30]:
    clf_ = XGBC(scale_pos_weight=i).fit(Xtrain,Ytrain)
    ypred_ = clf_.predict(Xtest)
    print(i)
    print("\tAccuracy:{}".format(clf_.score(Xtest,Ytest)))
    print("\tRecall:{}".format(recall(Ytest,ypred_)))
    print("\tAUC:{}".format(auc(Ytest,clf_.predict_proba(Xtest)[:,1])))
1
	Accuracy:0.9272727272727272
	Recall:0.6923076923076923
	AUC:0.9671052631578947
5
	Accuracy:0.9454545454545454
	Recall:0.9230769230769231
	AUC:0.9665991902834008
10
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9696356275303644
20
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9706477732793523
30
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9701417004048584
#Negative / positive sample ratio of 20
clf_ = XGBC(scale_pos_weight=20).fit(Xtrain,Ytrain)
ypred_ = clf_.predict(Xtest)
clf_.score(Xtest,Ytest)
0.9515151515151515
cm(Ytest,ypred_,labels=[1,0])
array([[ 13, 0], [ 8, 144]], dtype=int64)
recall(Ytest,ypred_)
1.0
auc(Ytest,clf_.predict_proba(Xtest)[:,1])
0.9706477732793523
dtrain = xgb.DMatrix(Xtrain,Ytrain)
dtest = xgb.DMatrix(Xtest,Ytest)
#Take a look at the predict interface of the xgboost library
param = {'silent':True,'objective':'binary:logistic',"eta":0.1,"scale_pos_weight":1}
num_round = 100
bst = xgb.train(param, dtrain, num_round)
preds = bst.predict(dtest)
#See what preds returns
preds
array([0.00110357, 0.00761518, 0.00110357, 0.00110357, 0.93531454, 0.00466839, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00410493, 0.00454478, 0.00571528, 0.00751026, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00712637, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00793251, 0.00466839, 0.00110357, 0.00339395, 0.00657186, 0.00110357, 0.00457053, 0.00571528, 0.0026763 , 0.00110357, 0.00110357, 0.00110357, 0.00884932, 0.00712637, 0.00110357, 0.00712637, 0.00466839, 0.00110357, 0.00110357, 0.00712637, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.63748044, 0.00110357, 0.00793251, 0.00110357, 0.00451971, 0.00644181, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00751026, 0.00712637, 0.00110357, 0.00866458, 0.00110357, 0.00110357, 0.00110357, 0.91610426, 0.00110357, 0.00110357, 0.89246494, 0.0026763 , 0.00501714, 0.00761518, 0.00884932, 0.00339395, 0.00110357, 0.93531454, 0.00110357, 0.00110357, 0.00110357, 0.82530665, 0.00751026, 0.00110357, 0.35174078, 0.00110357, 0.00110357, 0.70393246, 0.00110357, 0.76804197, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.96656513, 0.00110357, 0.00571528, 0.25400913, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00457053, 0.00110357, 0.00110357, 0.00110357, 0.89246494, 0.00110357, 0.9518535 , 0.0026763 , 0.00712637, 0.00110357, 0.00501714, 0.00110357, 0.00110357, 0.00571528, 0.00110357, 0.00110357, 0.00712637, 0.00110357, 0.00110357, 0.00712637, 0.00110357, 0.25136763, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.8904051 , 0.3876418 , 0.00110357, 0.00457053, 0.00657186, 0.9366597 , 0.00866458, 0.00110357, 0.00501714, 0.00501714, 0.00110357, 0.00110357, 0.00368543, 0.00501714, 0.9830577 , 0.00110357, 0.00644181, 0.00110357, 0.00571528, 0.00110357, 0.00110357, 0.00110357, 0.00110357, 0.00466839, 0.00110357, 0.00110357, 0.92388713, 0.90231985, 0.80084217], dtype=float32)
#Set our own threshold
ypred = preds.copy()
ypred[preds > 0.5] = 1
ypred
array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.], dtype=float32)
ypred[ypred != 1] = 0
#Write out the parameters
scale_pos_weight = [1,5,10]
names = ["negative vs positive: 1"
         ,"negative vs positive: 5"
         ,"negative vs positive: 10"]
[*zip(names,scale_pos_weight)]
[('negative vs positive: 1', 1), ('negative vs positive: 5', 5), ('negative vs positive: 10', 10)]
#Import the model evaluation metrics
from sklearn.metrics import accuracy_score as accuracy, recall_score as recall, roc_auc_score as auc

for name,i in zip(names,scale_pos_weight):
    param = {'silent':True,'objective':'binary:logistic'
             ,"eta":0.1,"scale_pos_weight":i}
    num_round = 100
    clf = xgb.train(param, dtrain, num_round)
    preds = clf.predict(dtest)
    ypred = preds.copy()
    ypred[preds > 0.5] = 1
    ypred[ypred != 1] = 0
    print(name)
    print("\tAccuracy:{}".format(accuracy(Ytest,ypred)))
    print("\tRecall:{}".format(recall(Ytest,ypred)))
    print("\tAUC:{}".format(auc(Ytest,preds)))
negative vs positive: 1
	Accuracy:0.9272727272727272
	Recall:0.6923076923076923
	AUC:0.9741902834008097
negative vs positive: 5
	Accuracy:0.9393939393939394
	Recall:0.8461538461538461
	AUC:0.9635627530364372
negative vs positive: 10
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9665991902834008
#Of course, we can also try different thresholds
for name,i in zip(names,scale_pos_weight):
    for thres in [0.3,0.5,0.7,0.9]:
        param = {'silent':True,'objective':'binary:logistic'
                 ,"eta":0.1,"scale_pos_weight":i}
        clf = xgb.train(param, dtrain, num_round)
        preds = clf.predict(dtest)
        ypred = preds.copy()
        ypred[preds > thres] = 1
        ypred[ypred != 1] = 0
        print("{},thresholds:{}".format(name,thres))
        print("\tAccuracy:{}".format(accuracy(Ytest,ypred)))
        print("\tRecall:{}".format(recall(Ytest,ypred)))
        print("\tAUC:{}".format(auc(Ytest,preds)))
negative vs positive: 1,thresholds:0.3
	Accuracy:0.9393939393939394
	Recall:0.8461538461538461
	AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.5
	Accuracy:0.9272727272727272
	Recall:0.6923076923076923
	AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.7
	Accuracy:0.9212121212121213
	Recall:0.6153846153846154
	AUC:0.9741902834008097
negative vs positive: 1,thresholds:0.9
	Accuracy:0.9515151515151515
	Recall:0.5384615384615384
	AUC:0.9741902834008097
negative vs positive: 5,thresholds:0.3
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.5
	Accuracy:0.9393939393939394
	Recall:0.8461538461538461
	AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.7
	Accuracy:0.9272727272727272
	Recall:0.6923076923076923
	AUC:0.9635627530364372
negative vs positive: 5,thresholds:0.9
	Accuracy:0.9212121212121213
	Recall:0.6153846153846154
	AUC:0.9635627530364372
negative vs positive: 10,thresholds:0.3
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.5
	Accuracy:0.9515151515151515
	Recall:1.0
	AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.7
	Accuracy:0.9393939393939394
	Recall:0.8461538461538461
	AUC:0.9665991902834008
negative vs positive: 10,thresholds:0.9
	Accuracy:0.9212121212121213
	Recall:0.6153846153846154
	AUC:0.9665991902834008