7.2 evaluating model performance with k-fold cross-validation
k-fold cross-validation helps address the problem of overfitting: the data are split into k folds, and each fold in turn serves as the test set while a model is trained on the remaining k-1 folds.
library(modeldata)
library(e1071)
data(mlc_churn)
churnTrain <- mlc_churn
# Assign each row to one of 10 folds (shuffle first, e.g. ind <- sample(ind),
# if the rows might be ordered)
ind <- cut(1:nrow(churnTrain), breaks = 10, labels = FALSE)
accuracies <- c()
for (i in 1:10) {
  # Train on the nine folds outside fold i
  fit <- svm(churn ~ ., churnTrain[ind != i, ])
  # Predict on the held-out fold i
  predictions <- predict(fit, churnTrain[ind == i, !names(churnTrain) %in% c("churn")])
  correct_count <- sum(as.data.frame(predictions) == churnTrain[ind == i, c("churn")])
  accuracies <- append(accuracies, correct_count / nrow(churnTrain[ind == i, ]))
}
accuracies
[1] 0.920 0.874 0.906 0.902 0.874 0.890 0.884 0.904 0.922 0.902
7.3 cross-validation with the e1071 package
# e1071 cross-validation
library(e1071)
# Drop identifier-like columns before modeling
churnTrain <- mlc_churn[, !names(mlc_churn) %in% c("state", "area_code", "account_length")]
set.seed(2)
ind <- sample(2, nrow(churnTrain), replace = TRUE, prob = c(0.7, 0.3))
trainset <- churnTrain[ind == 1, ]
testset <- churnTrain[ind == 2, ]
# tunecontrol specifies 10-fold cross-validation
tuned <- tune.svm(churn ~ ., data = trainset, gamma = 10^-2, cost = 10^2,
                  tunecontrol = tune.control(cross = 10))
summary(tuned)
# Error estimation of 'svm' using 10-fold cross validation: 0.07473914
tuned$performances
  gamma cost      error dispersion
1  0.01  100 0.07473914 0.01605983
svmfit <- tuned$best.model
table(trainset[, "churn", drop = TRUE], predict(svmfit))
       yes   no
  yes  400   93
  no    14 2972
The tune function performs parameter optimization by grid search. Run ?tune to see the other tuning wrappers.
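To search an actual grid rather than a single point, pass vectors of candidate values; a minimal sketch (these particular ranges are illustrative, not from the original run):

# Evaluate every gamma/cost combination with 10-fold cross-validation
tuned_grid <- tune.svm(churn ~ ., data = trainset,
                       gamma = 10^(-3:-1), cost = 10^(0:2),
                       tunecontrol = tune.control(cross = 10))
tuned_grid$best.parameters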
7.4 cross-validation with the caret package
# caret: 10-fold cross-validation repeated 3 times
library(caret)
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
model <- train(churn ~ ., data = trainset, method = "rpart",
               preProcess = "scale", trControl = control)
model
CART

3479 samples
  19 predictor
   2 classes: 'yes', 'no'

Pre-processing: scaled (69)
Resampling: Cross-Validated (10 fold, repeated 3 times)
Summary of sample sizes: 3131, 3132, 3130, 3130, 3131, 3132, ...
Resampling results across tuning parameters:

  cp          Accuracy   Kappa
  0.04056795  0.9301563  0.6720876
  0.05578093  0.9147378  0.5774005
  0.07505071  0.8731365  0.2416613

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.04056795.
trainControl sets the resampling parameters; its method argument accepts boot, boot632, cv, repeatedcv, LOOCV, LGOCV, none, oob, adaptive_cv, adaptive_boot, adaptive_LGOCV, and others.
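For example, a hedged sketch of leave-group-out cross-validation (the 80/20 split and 25 repeats below are illustrative choices, not values from the text):

# LGOCV: repeated random train/holdout splits; p is the training proportion
lgo_control <- trainControl(method = "LGOCV", p = 0.8, number = 25)
lgo_model <- train(churn ~ ., data = trainset, method = "rpart",
                   trControl = lgo_control)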
7.5 ranking variable importance with the caret package
Once a supervised model has been fitted, we can vary each input and compare how sensitive the model's output is to those changes, which measures how important each feature is to the model.
# Importance ranking
importance <- varImp(model, scale = FALSE)
importance
rpart variable importance

  only 20 most important variables shown (out of 69)

                              Overall
international_planyes         196.919
total_day_minutes             172.870
total_day_charge              161.582
number_customer_service_calls 155.274
total_intl_minutes            133.221
total_intl_charge             124.817
total_intl_calls               48.700
voice_mail_planyes             40.912
total_eve_charge               31.116
total_eve_minutes              31.116
...
plot(importance)
Model objects produced by some classification packages, such as rpart, store variable importance directly, so it can be inspected and printed.
library(rpart)
model.rp <- rpart(churn ~ ., data = trainset)
model.rp$variable.importance
             total_day_charge             total_day_minutes                         state
                   161.951011                    161.951011                     85.145010
           total_intl_minutes number_customer_service_calls             total_intl_charge
                    80.504234                     76.323349                     75.141041
...
7.6 ranking variable importance with the rminer package
This approach takes considerably longer to run, and it appears to work only with numeric predictors.
# rminer
install.packages("rminer")
library(rminer)
model <- fit(Species ~ ., iris, model = "svm")
# Sensitivity analysis: vary each input and measure the response
variable.importance <- Importance(model, iris, method = "sensv")
L <- list(runs = 1, sen = t(variable.importance$imp),
          sresponses = variable.importance$sresponses)
mgraph(L, graph = "IMP", leg = names(iris), col = "gray", Grid = 4)
7.7 finding highly correlated features with the caret package
Remove the non-numeric attributes, compute the correlation matrix, set the cutoff to 0.75, and select the highly correlated attributes.
# findCorrelation
new_train <- trainset[, !names(trainset) %in% c("churn", "international_plan", "voice_mail_plan")]
cor_mat <- cor(new_train)
highlyCorrelated <- findCorrelation(cor_mat, cutoff = 0.75)
names(new_train)[highlyCorrelated]
[1] "total_night_minutes" "total_intl_charge"   "total_day_charge"    "total_eve_charge"
You can also use the leaps, genetic, and anneal functions in the subselect package to achieve a similar effect, as sketched below.
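A hedged sketch with subselect's anneal (the subset size of 5 is arbitrary, and the call follows the subselect documentation rather than the original text):

# Simulated annealing search for a 5-variable subset based on the correlation matrix
library(subselect)
sel <- anneal(cor_mat, kmin = 5, kmax = 5)
names(new_train)[sel$bestsets]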
7.8 feature selection with the caret package
Feature selection finds the attribute subset with the lowest prediction error, which helps us judge which features should be used to build an accurate model. The recursive feature elimination function rfe can select the required features automatically.
# caret feature selection
library(modeldata)
library(caret)
data(mlc_churn)
churnTrain <- mlc_churn
ind <- sample(2, nrow(churnTrain), replace = TRUE, prob = c(0.7, 0.3))
trainset <- churnTrain[ind == 1, ]
testset <- churnTrain[ind == 2, ]
# Feature transformation: recode yes/no factors as binary indicator columns
intl_plan <- model.matrix(~ trainset.international_plan - 1,
                          data = data.frame(trainset$international_plan))
colnames(intl_plan) <- c("trainset.international_planno" = "intl_no",
                         "trainset.international_planyes" = "intl_yes")
voice_plan <- model.matrix(~ trainset.voice_mail_plan - 1,
                           data = data.frame(trainset$voice_mail_plan))
colnames(voice_plan) <- c("trainset.voice_mail_planno" = "voice_no",
                          "trainset.voice_mail_planyes" = "voice_yes")
trainset$international_plan <- NULL
trainset$voice_mail_plan <- NULL
trainset <- cbind(intl_plan, voice_plan, trainset)
# The same transformation for testset
intl_plan <- model.matrix(~ testset.international_plan - 1,
                          data = data.frame(testset$international_plan))
colnames(intl_plan) <- c("testset.international_planno" = "intl_no",
                         "testset.international_planyes" = "intl_yes")
voice_plan <- model.matrix(~ testset.voice_mail_plan - 1,
                           data = data.frame(testset$voice_mail_plan))
colnames(voice_plan) <- c("testset.voice_mail_planno" = "voice_no",
                          "testset.voice_mail_planyes" = "voice_yes")
testset$international_plan <- NULL
testset$voice_mail_plan <- NULL
testset <- cbind(intl_plan, voice_plan, testset)
# Linear discriminant analysis as the screening model, with cross-validation (cv)
ldaControl <- rfeControl(functions = ldaFuncs, method = "cv")
# Recursive feature elimination
ldaProfile <- rfe(trainset[, names(trainset) != "churn"][, -c(5, 6, 7)],
                  trainset[, "churn", drop = TRUE],
                  sizes = c(1:18), rfeControl = ldaControl)
ldaProfile

Recursive feature selection

Outer resampling method: Cross-Validated (10 fold)

Resampling performance over subset size:

 Variables Accuracy   Kappa AccuracySD KappaSD Selected
         1   0.8554 0.02245   0.009841 0.06008
         2   0.8554 0.02245   0.009841 0.06008
         3   0.8494 0.23064   0.013363 0.10679
...
plot(ldaProfile, type = c("o", "g"))
# Optimal variable subset
ldaProfile$optVariables
# The fitted model
ldaProfile$fit
Call:
lda(x, y)

Prior probabilities of groups:
      yes        no
0.1417074 0.8582926

Group means:
...
postResample(predict(ldaProfile, testset[, names(testset) != "churn"]),
             testset[, "churn", drop = TRUE])
 Accuracy     Kappa
0.8520710 0.2523709
7.9 evaluating the performance of a regression model
Three common metrics: root mean square error (RMSE), relative squared error (RSE), and the coefficient of determination (R-squared).
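Written out, with predictions \(\hat{y}_i\), observations \(y_i\), mean \(\bar{y}\), and n observations (matching the computations in the code below):

\[
\mathrm{RMSE}=\sqrt{\frac{1}{n}\sum_{i=1}^{n}(\hat{y}_i-y_i)^2},\qquad
\mathrm{RSE}=\frac{\sum_{i=1}^{n}(\hat{y}_i-y_i)^2}{\sum_{i=1}^{n}(\bar{y}-y_i)^2},\qquad
R^2=1-\mathrm{RSE}
\]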
# Performance evaluation of a regression model
library(car)
data("Quartet")
plot(Quartet$x, Quartet$y3)
lmfit <- lm(Quartet$y3 ~ Quartet$x)
abline(lmfit, col = "red")
predicted <- predict(lmfit, newdata = Quartet[c("x")])
actual <- Quartet$y3
# RMSE() is from the caret package; that package is a treasure chest, it has everything
rmse <- RMSE(predicted, actual)
mu <- mean(actual)
rse <- mean((predicted - actual)^2) / mean((mu - actual)^2)
rsquare <- 1 - rse
c(rmse, rse, rsquare)
[1] 1.118286 0.333676 0.666324
# Recompute with the robust rlm from the MASS package
library(MASS)
plot(Quartet$x, Quartet$y3)
rlmfit <- rlm(Quartet$y3 ~ Quartet$x)
abline(rlmfit, col = "red")
predicted <- predict(rlmfit, newdata = Quartet["x"])
rmse <- (mean((predicted - actual)^2))^0.5
rse <- mean((predicted - actual)^2) / mean((mu - actual)^2)
rsquare <- 1 - rse
c(rmse, rse, rsquare)
[1] 1.2790448 0.4365067 0.5634933
Extension:
library(e1071)
tune(lm, y3 ~ x, data = Quartet)

Error estimation of 'lm' using 10-fold cross validation: 2.351897
The cv.lm cross-validation function in the DAAG package achieves the same effect; a sketch follows.
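A hedged sketch (argument names per the DAAG documentation; in newer DAAG versions the function is spelled CVlm):

# 10-fold cross-validation of the linear model on Quartet
library(DAAG)
cv.lm(data = Quartet, form.lm = formula(y3 ~ x), m = 10)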
7.10 evaluating a model's predictive power with a confusion matrix
A confusion matrix yields the model's precision, recall, specificity, and accuracy.
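In terms of true/false positives and negatives (TP, FP, TN, FN):

\[
\mathrm{Precision}=\frac{TP}{TP+FP},\quad
\mathrm{Recall}=\frac{TP}{TP+FN},\quad
\mathrm{Specificity}=\frac{TN}{TN+FP},\quad
\mathrm{Accuracy}=\frac{TP+TN}{TP+FP+TN+FN}
\]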
# Confusion matrix
svm.model <- train(churn ~ ., data = trainset, method = "svmRadial")
svm.pred <- predict(svm.model, testset[, names(testset) != "churn"])
table(svm.pred, testset[, "churn", drop = TRUE])

svm.pred  yes   no
     yes   12    1
     no   198 1290

confusionMatrix(svm.pred, testset[, "churn", drop = TRUE])
Confusion Matrix and Statistics

          Reference
Prediction  yes   no
       yes   12    1
       no   198 1290

               Accuracy : 0.8674
                 95% CI : (0.8492, 0.8842)
    No Information Rate : 0.8601
    P-Value [Acc > NIR] : 0.2183

                  Kappa : 0.0928
7.11 evaluating a model's predictive power with ROCR
The receiver operating characteristic (ROC) curve is a standard performance graphic for binary classification systems. The curve traces the true positive rate against the false positive rate at different cutoff points, and classification performance is usually measured by the area under the curve (AUC).
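One helpful interpretation: AUC is the probability that the classifier scores a randomly chosen positive example above a randomly chosen negative one,

\[
\mathrm{AUC}=P\big(s(X^{+})>s(X^{-})\big)
\]

where \(s\) is the classifier's score and \(X^{+}\), \(X^{-}\) are random positive and negative examples.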
install.packages("ROCR") library(ROCR) svmfit <- svm(churn~.,data = trainset, prob = TRUE) pred <- predict(svmfit, testset[,names(testset)!="churn"],probability = TRUE) # Probability of marking yes pred.prob <- attr(pred,"probabilities") pred.to.roc <- pred.prob[, 2] # forecast pred.rocr <- prediction(pred.to.roc, testset$churn) # Performance evaluation perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff") perf.tpr.rocr <- performance(pred.rocr, "tpr", "fpr") plot(perf.tpr.rocr, colorize = T, main = paste("AUC:", (perf.rocr@y.values)))
7.12 comparing ROC curves with the caret package
install.packages("pROC") library(pROC) # Repeat 3 times with 10 fold cross control <- trainControl(method = "repeatedcv", number = 10, repeats = 3,classProbs = TRUE, summaryFunction = twoClassSummary) # glm classification glm.model <- train(churn ~., data = trainset,method = "glm", metric = "ROC", trControl = control) # svm classification svm.model <- train(churn ~., data = trainset,method = "svmRadial", metric = "ROC", trControl = control) rpart.model <- train(churn ~., data = trainset,method = "rpart", metric = "ROC", trControl = control) # Forecast separately glm.probs <- predict(glm.model, testset[,names(testset) != "churn"], type = "prob") svm.probs <- predict(svm.model, testset[,names(testset) != "churn"], type = "prob") rpart.probs <- predict(rpart.model, testset[,names(testset) != "churn"], type = "prob") # Generate ROC curve in a graph glm.ROC <- roc(response = testset[,c("churn"),drop=TRUE], predictor = glm.probs$yes, levels = levels(testset[,"churn",drop=TRUE])) plot(glm.ROC, type = "S", col = 'red') svm.ROC <- roc(response = testset[,c("churn"),drop=TRUE], predictor = svm.probs$yes, levels = levels(testset[,"churn",drop=TRUE])) plot(svm.ROC, add = TRUE, col = 'green') rpart.ROC <- roc(response = testset[,c("churn"),drop=TRUE], predictor = rpart.probs$yes, levels = levels(testset[,"churn",drop=TRUE])) plot(rpart.ROC, add = TRUE, col = 'blue')
Note: this code initially failed with the message: Error in colnames(data): argument "data" is missing, with no default. The cause turned out to be that a tibble does not drop dimensions by default when a single column is subset; adding drop = TRUE solves it.
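A minimal sketch of the difference (using the churn column from the sections above):

# A tibble keeps the data-frame shape when subsetting one column
class(testset[, "churn"])               # still a tibble / data frame
class(testset[, "churn", drop = TRUE])  # the underlying factor vector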
7.13 comparing model performance differences with the caret package
# Model resampling
cv.values <- resamples(list(glm = glm.model, svm = svm.model, rpart = rpart.model))
summary(cv.values)

Call:
summary.resamples(object = cv.values)

Models: glm, svm, rpart
Number of resamples: 30

ROC
           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
glm   0.7483007 0.7977494 0.8187351 0.8200275 0.8357939 0.8953608    0
svm   0.8356721 0.8594691 0.8722842 0.8778421 0.9018812 0.9184929    0
rpart 0.6446078 0.7146965 0.7666979 0.7773052 0.8459407 0.9173395    0

Sens
           Min.   1st Qu.    Median      Mean   3rd Qu. Max. NA's
glm   0.1600000 0.2088235 0.2672549 0.2667320 0.3184314 0.40    0
svm   0.2600000 0.3879412 0.4454902 0.4460654 0.5049020 0.66    0
rpart 0.2352941 0.3800000 0.4411765 0.4622876 0.5637255 0.76    0

Spec
           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
glm   0.9509804 0.9639639 0.9673203 0.9666242 0.9705882 0.9771242    0
svm   0.9346405 0.9542484 0.9672131 0.9641094 0.9729749 0.9836601    0
rpart 0.9705882 0.9836601 0.9901639 0.9885492 0.9934641 1.0000000    0

dotplot(cv.values, metric = "ROC")
bwplot(cv.values, layout = c(3, 1))
Extension: the densityplot, splom, and xyplot functions can also be used to visualize the resampling results; a sketch follows.
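A minimal sketch of those calls (caret supplies resamples methods for each lattice function; xyplot compares two models at a time, so the model pair below is an illustrative choice):

# Density curves of the resampled ROC values per model
densityplot(cv.values, metric = "ROC")
# Scatterplot matrix of ROC values across all models
splom(cv.values, metric = "ROC")
# Pairwise scatterplot of two models' ROC values
xyplot(cv.values, models = c("glm", "svm"), metric = "ROC")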