# Chapter 7 model evaluation notes

Posted by siefkencp on Fri, 04 Mar 2022 06:52:25 +0100

# 7.2 k-fold cross validation model performance

This method can solve the problem of over adaptation,

```library(modeldata)
library(e1071)
data(mlc_churn)
churnTrain <- mlc_churn
ind <- cut(1:nrow(churnTrain), breaks = 10, labels = F)
accuracies <- c()
for (i in 1:10) {
fit <- svm(churn~.,churnTrain[ind!=i,])
predictions <- predict(fit, churnTrain[ind==i,!names(churnTrain) %in% c("churn")])
correct_count <- sum(as.data.frame(predictions) == churnTrain[ind==i,c("churn")])
accuracies <- append(correct_count/nrow(churnTrain[ind==i,]),accuracies)
}
accuracies
[1] 0.920 0.874 0.906 0.902 0.874 0.890 0.884 0.904 0.922 0.902
```

# 7.3 cross validation of e1071 package

```# e1071 cross validation
library(e1071)
churnTrain <- churn[,!names(churn) %in% c('state',
'area_code', "account_length")]
set.seed(2)
ind <- sample(2,nrow(churnTrain),replace = TRUE,
prob = c(0.7,0.3))
trainset <- churnTrain[ind==1,]
testset <- churnTrain[ind==2,]
tuned <- tune.svm(churn~., data = trainset, gamma = 10^-2,
cost = 10^2, tunedcontrol = tune.control(cross = 10))
summary(tuned)
# Error estimation of 'svm' using 10-fold cross validation: 0.07473914
tuned\$performances
gamma cost      error dispersion
1  0.01  100 0.07473914 0.01605983
svmfit <- tuned\$best.model
table(trainset[,c("churn")], predict(svmfit))
yes   no
yes  400   93
no    14 2972
```

The tune function uses the wind format search method to complete parameter optimization. View other optimization functions? tune

# 7.4 cross validation of caret package

```# caret
library(caret)
# Repeat 3 times with 10 fold cross
control <- trainControl(method = "repeatedcv",number = 10,
repeats = 3)
model <- train(churn~.,data = trainset, method = "rpart",
preProcess="scale", trControl=control)
model
CART

3479 samples
19 predictor
2 classes: 'yes', 'no'

Pre-processing: scaled (69)
Resampling: Cross-Validated (10 fold, repeated 3 times)
Summary of sample sizes: 3131, 3132, 3130, 3130, 3131, 3132, ...
Resampling results across tuning parameters:

cp          Accuracy   Kappa
0.04056795  0.9301563  0.6720876
0.05578093  0.9147378  0.5774005
0.07505071  0.8731365  0.2416613

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.04056795.
```

# 7.5 importance ranking of variables by caret package

After the supervised learning model is obtained, the input value can be changed and the sensitivity of the output effect of a given model can be compared to evaluate the importance of different features to the model.

```# Importance ranking
importance <- varImp(model, scale = FALSE)
importance
rpart variable importance

only 20 most important variables shown (out of 69)

Overall
international_planyes         196.919
total_day_minutes             172.870
total_day_charge              161.582
number_customer_service_calls 155.274
total_intl_minutes            133.221
total_intl_charge             124.817
total_intl_calls               48.700
voice_mail_planyes             40.912
total_eve_charge               31.116
total_eve_minutes              31.116
...
plot(importance)
```

The objects generated from the training model in some classification algorithm packages such as extended rpart contain the importance of variables, which can be viewed and output.

```library(rpart)
model.rp <- rpart(churn~.,data = trainset)
model.rp\$variable.importance
total_day_charge             total_day_minutes                         state
161.951011                    161.951011                     85.145010
total_intl_minutes number_customer_service_calls             total_intl_charge
80.504234                     76.323349                     75.141041
...
```

# 7.6 rimier package to sort the importance of variables

This takes a lot of effort. It seems that only numerical variables can do it.

```# rminier
install.packages("rminer")
library(rminer)
model <- fit(Species~., iris,model = "svm")
variable.importance <- Importance(model,iris,method = "sensv")
L=list(runs=1,sen=t(variable.importance\$imp),sresponses=variable.importance\$sresponses)
mgraph(L,graph = "IMP",leg = names(iris),col = "gray", Grid = 4)
```

# 7.7 caret packages find highly correlated features

Remove the non numerical attributes, calculate the correlation to obtain a correlation matrix, set the threshold to 0.75, and select the highly correlated attributes.

```# findCorrelation
new_train <- trainset[,!names(trainset) %in% c('churn',
'international_plan', "voice_mail_plan")]
cor_mat <- cor(new_train)
highlyCorrelated <- findCorrelation(cor_mat, cutoff = 0.75)
names(new_train)[highlyCorrelated]
[1] "total_night_minutes" "total_intl_charge"   "total_day_charge"    "total_eve_charge"
```

You can also use the leaps, genetic and annual functions in the subselect package to achieve the same effect.

# 7.8 feature selection using caret package

Feature selection can select the attribute subset with the lowest prediction error, which helps us to judge which features should be used to establish an accurate model. Recursive feature exclusion function rfe can automatically select the required features.

```# caret selection feature
library(modeldata)
library(caret)
data(mlc_churn)
churnTrain <- mlc_churn
ind <- sample(2,nrow(churnTrain),replace = TRUE,
prob = c(0.7,0.3))
trainset <- churnTrain[ind==1,]
testset <- churnTrain[ind==2,]
# Feature transformation should be that yes no changes to 1, 0, and factor changes to binary attribute
intl_plan <- model.matrix(~ trainset.international_plan - 1,
data = data.frame(trainset\$international_plan))
colnames(intl_plan) <- c("trainset.international_planno"="intl_no",
"trainset.international_planyes"="intl_yes")

voice_plan <- model.matrix(~ trainset.voice_mail_plan - 1,
data = data.frame(trainset\$voice_mail_plan))
colnames(voice_plan) <- c("trainset.voice_mail_planno"="voice_no",
"trainset.voice_mail_planyes"="voice_yes")
trainset\$international_plan <- NULL
trainset\$voice_mail_plan <- NULL
trainset <- cbind(intl_plan, voice_plan,trainset)
# testset does the same
intl_plan <- model.matrix(~ testset.international_plan - 1,
data = data.frame(testset\$international_plan))
colnames(intl_plan) <- c("testset.international_planno"="intl_no",
"testset.international_planyes"="intl_yes")

voice_plan <- model.matrix(~ testset.voice_mail_plan - 1,
data = data.frame(testset\$voice_mail_plan))
colnames(voice_plan) <- c("testset.voice_mail_planno"="voice_no",
"testset.voice_mail_planyes"="voice_yes")
testset\$international_plan <- NULL
testset\$voice_mail_plan <- NULL
testset <- cbind(intl_plan, voice_plan,testset)
# Linear discriminant analysis creates a feature screening algorithm, cross validation method cv
ldaControl <- rfeControl(functions = ldaFuncs, method = "cv")
# Reverse feature filtering
ldaProfile <- rfe(trainset[,names(trainset)!='churn'][,-c(5,6,7)],
trainset[,'churn'],sizes = c(1:18), rfeControl = ldaControl)
ldaProfile

Recursive feature selection

Outer resampling method: Cross-Validated (10 fold)

Resampling performance over subset size:

Variables Accuracy   Kappa AccuracySD KappaSD Selected
1   0.8554 0.02245   0.009841 0.06008
2   0.8554 0.02245   0.009841 0.06008
3   0.8494 0.23064   0.013363 0.10679
...
plot(ldaProfile, type = c("o","g"))
# Optimal variable subset
ldaProfile\$optVariables
# Appropriate model
ldaProfile\$fit
Call:
lda(x, y)

Prior probabilities of groups:
yes        no
0.1417074 0.8582926

Group means:
postResample(predict(ldaProfile, testset[,names(testset)!="churn"]), testset[,c("churn")])
Accuracy     Kappa
0.8520710 0.2523709
```

extend

# 7.9 evaluating the performance of regression model

Root mean square error method RMSE, relative square difference RSE, determination coefficient R-Square.

```# Performance evaluation of regression model
library(car)
data("Quartet")
plot(Quartet\$x,Quartet\$y3)
lmfit<- lm(Quartet\$y3~Quartet\$x)
abline(lmfit, col="red")
predicted <- predict(lmfit,newdata = Quartet[c("x")])
actual <- Quartet\$y3
# The function of caret package used here. This package is a treasure. It has everything
rmse <- RMSE(predicted, actual)
mu <- mean(actual)
rse <- mean((predicted-actual)^2)/mean((mu-actual)^2)
Rsquare <- 1-rse
c(rmse,rse,Rsquare)
[1] 1.118286 0.333676 0.666324
# MASS package rlm recalculation
library(MASS)
plot(Quartet\$x,Quartet\$y3)
rlmfit <- rlm(Quartet\$y3~Quartet\$x)
abline(rlmfit,col='red')
predicted <- predict(rlmfit, newdata = Quartet['x'])
rmse <- (mean((predicted-actual)^2))^0.5
rse <- mean((predicted-actual)^2)/mean((mu-actual)^2)
rsqure <- 1-rse
c(rmse,rse,Rsquare)
[1] 1.2790448 0.4365067 0.6663240
```

extend

```library(e1071)
tune(lm,y3~x,data = Quartet)
Error estimation of 'lm' using 10-fold cross validation: 2.351897
```

Cross validation function of dacv.train LM can achieve the same effect

# 7.10 using confusion matrix to evaluate the prediction ability of the model

The accuracy, recall, specificity and accuracy of the model

```# Confusion matrix
svm.model <- train(churn~., data=trainset,method = "svmRadial")
svm.pred <- predict(svm.model, testset[,names(testset)!="churn"])
table(svm.pred, testset[,"churn",drop=TRUE])
svm.pred  yes   no
yes   12    1
no   198 1290
confusionMatrix(svm.pred,testset[,"churn",drop=TRUE])
Confusion Matrix and Statistics

Reference
Prediction  yes   no
yes   12    1
no   198 1290

Accuracy : 0.8674
95% CI : (0.8492, 0.8842)
No Information Rate : 0.8601
P-Value [Acc > NIR] : 0.2183

Kappa : 0.0928
```

# 7.11 prediction ability of ROCR evaluation model

The receiver operating curve ROC is a common performance display graph of binary classification system. The true Yang and false Yang rates of different tangent points are marked on the curve. The classification performance of the model is usually measured based on the area under the curve AUC.

```install.packages("ROCR")
library(ROCR)
svmfit <- svm(churn~.,data = trainset, prob = TRUE)
pred <- predict(svmfit, testset[,names(testset)!="churn"],probability = TRUE)
# Probability of marking yes
pred.prob <- attr(pred,"probabilities")
pred.to.roc <- pred.prob[, 2]
# forecast
pred.rocr <- prediction(pred.to.roc, testset\$churn)
# Performance evaluation
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr <- performance(pred.rocr, "tpr", "fpr")
plot(perf.tpr.rocr, colorize = T, main = paste("AUC:", (perf.rocr@y.values)))
```

# 7.12 compare ROC curves using caret package

```install.packages("pROC")
library(pROC)
# Repeat 3 times with 10 fold cross
control <- trainControl(method = "repeatedcv", number = 10,
repeats = 3,classProbs = TRUE,
summaryFunction = twoClassSummary)
# glm classification
glm.model <- train(churn ~., data = trainset,method = "glm",
metric = "ROC", trControl = control)
# svm classification
svm.model <- train(churn ~., data = trainset,method = "svmRadial",
metric = "ROC", trControl = control)
rpart.model <- train(churn ~., data = trainset,method = "rpart",
metric = "ROC", trControl = control)
# Forecast separately
glm.probs <- predict(glm.model, testset[,names(testset) != "churn"], type = "prob")
svm.probs <- predict(svm.model, testset[,names(testset) != "churn"], type = "prob")
rpart.probs <- predict(rpart.model, testset[,names(testset) != "churn"], type = "prob")
# Generate ROC curve in a graph
glm.ROC <- roc(response = testset[,c("churn"),drop=TRUE],
predictor = glm.probs\$yes,
levels = levels(testset[,"churn",drop=TRUE]))
plot(glm.ROC, type = "S", col = 'red')

svm.ROC <- roc(response = testset[,c("churn"),drop=TRUE],
predictor = svm.probs\$yes,
levels = levels(testset[,"churn",drop=TRUE]))
plot(svm.ROC, add = TRUE, col = 'green')
rpart.ROC <- roc(response = testset[,c("churn"),drop=TRUE],
predictor = rpart.probs\$yes,
levels = levels(testset[,"churn",drop=TRUE]))
plot(rpart.ROC, add = TRUE, col = 'blue')
```

Error in colnames (data): argument "data" is missing, with no default. Finally, if it is found that tibble does not reduce the dimension by default, add drop=TRUE to solve it.

# 7.13 performance differences of caret package comparison model

```# Model resampling
cv.values <- resamples(list(glm=glm.model, svm=svm.model, rpart = rpart.model))
summary(cv.values)
Call:
summary.resamples(object = cv.values)

Models: glm, svm, rpart
Number of resamples: 30

ROC
Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
glm   0.7483007 0.7977494 0.8187351 0.8200275 0.8357939 0.8953608    0
svm   0.8356721 0.8594691 0.8722842 0.8778421 0.9018812 0.9184929    0
rpart 0.6446078 0.7146965 0.7666979 0.7773052 0.8459407 0.9173395    0

Sens
Min.   1st Qu.    Median      Mean   3rd Qu. Max. NA's
glm   0.1600000 0.2088235 0.2672549 0.2667320 0.3184314 0.40    0
svm   0.2600000 0.3879412 0.4454902 0.4460654 0.5049020 0.66    0
rpart 0.2352941 0.3800000 0.4411765 0.4622876 0.5637255 0.76    0

Spec
Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
glm   0.9509804 0.9639639 0.9673203 0.9666242 0.9705882 0.9771242    0
svm   0.9346405 0.9542484 0.9672131 0.9641094 0.9729749 0.9836601    0
rpart 0.9705882 0.9836601 0.9901639 0.9885492 0.9934641 1.0000000    0
dotplot(cv.values, metric = "ROC")
bwplot(cv.values, layout=c(3,1))
```

Extensions can also use densityplot Splom and xyplot function visualization