# ggpairs
library(ggplot2)
library(GGally)
#> Registered S3 method overwritten by 'GGally':
#> method from
#> +.gg ggplot2
library(hrbrthemes)
#> NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
#> Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
#> if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
# %>%
library(tidyverse)
#> ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
#> ✓ tibble 3.0.3 ✓ dplyr 1.0.2
#> ✓ tidyr 1.1.2 ✓ stringr 1.4.0
#> ✓ readr 1.3.1 ✓ forcats 0.5.0
#> ✓ purrr 0.3.4
#> ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
# kfold
library(caret)
#> Loading required package: lattice
#>
#> Attaching package: 'caret'
#> The following object is masked from 'package:purrr':
#>
#> lift
kfoldcv = 10
# lda
library(MASS)
#>
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#>
#> select
# prediction
library(ROCR)
#partimat
library(klaR)
library(knitr)
library(kableExtra)
#>
#> Attaching package: 'kableExtra'
#> The following object is masked from 'package:dplyr':
#>
#> group_rows
library(parallel)
library(future)
#>
#> Attaching package: 'future'
#> The following object is masked from 'package:caret':
#>
#> cluster
library(doParallel)
#> Loading required package: foreach
#>
#> Attaching package: 'foreach'
#> The following objects are masked from 'package:purrr':
#>
#> accumulate, when
#> Loading required package: iterators
cl <- future::makeClusterPSOCK(4, outfile = NULL, verbose = FALSE)
get_metric <- function(caret_CM_object, metric){
  # Extract a single named statistic (e.g. 'sensitivity') from a caret confusionMatrix object
  caret_CM_object %>% broom::tidy() %>%
    dplyr::filter(term==metric) %>%
    dplyr::select(estimate) %>% pull()
}
get_metrics_func <- function(caret_CM_objects, metric, func){
  # Summarize one metric (with mean, sd, ...) across a list of confusion matrices
  caret_CM_objects %>%
    purrr::map( ~ get_metric(.x, metric)) %>%
    unlist() %>% func()
}
get_out_of_folds_CM <- function(classifier, THRESHOLD = 0.5){
  # One confusion matrix per cross-validation fold, built from caret's saved
  # out-of-fold predictions. THRESHOLD was previously taken from the global
  # environment; it now defaults to 0.5 so the existing calls, which pass no
  # threshold, still run.
  classifier$pred %>%
    dplyr::mutate(pred2 = ifelse(Up > THRESHOLD, 'Up', 'Down')) %>%
    dplyr::mutate(pred2 = factor(pred2, levels = c('Down', 'Up'))) %>%
    dplyr::group_split(Resample) %>%
    purrr::map( ~ caret::confusionMatrix(data=.x$pred2, reference=.x$obs,
                                         positive='Up'))
}
plot_folds_CM <- function(classifier,THRESHOLD ){
out_of_folds_CM <- classifier$pred %>%
dplyr::mutate(pred2 = ifelse(Up>THRESHOLD, 'Up', 'Down')) %>%
dplyr::mutate(pred2 = factor(pred2, levels=c('Down','Up'))) %>%
dplyr::group_split(Resample) %>%
purrr::map(~ caret::confusionMatrix(data=.x$pred2, reference=.x$obs))
out_of_folds_CM_vis <- out_of_folds_CM %>%
purrr::map(~ .x$table %>% broom::tidy() %>%
ggplot(aes(x=Reference, y=Prediction, fill=n, label=n)) +
geom_tile() + geom_text(aes(label=n), size=5, color='white'))
gridExtra::grid.arrange(grobs=out_of_folds_CM_vis, ncol=2)
}
tuned_call <- function(preds, decision_rule_threshold){
calls <- rep('Down', length(preds))
calls[preds>decision_rule_threshold]='Up'
factor(calls, levels=c('Down','Up'))
}
get_optimal_threshold <- function(classifier){
thresholds = seq(0.001,0.5,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
GMeans <- rep(NA, length(thresholds))
preds <- predict(classifier, type='prob')[,2]
y <- train$BlueTarp_rest
for (i in 1:length(thresholds)){
yhat <- tuned_call(preds, thresholds[i])
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
sensitivity
results <- tibble::tibble(y=y, yhat=yhat)
caret::confusionMatrix(yhat, y, positive = "Up")
cm <- results %>% yardstick::conf_mat(truth=y, yhat)
trueNegative = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve = cm$table[2,2]
#paste0("True Positive: ", truePositve )
#paste0("False Negative: ", falseNegative )
#paste0("True Negative: ", trueNegative )
#paste0("False Positive: ", falsePositive )
conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive
predictedConditionPositive = truePositve + falsePositive
predictedConditionNegative = trueNegative + falseNegative
TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS
F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
F_1_scores[i] <- F_1_score
GMeans[i] = sqrt(TPR*SPC)
}
plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
opt_thresh = thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]
return(opt_thresh)
}
stat_fho <- function(classifier, testdata, threshold){
y <- testdata$BlueTarp_rest
preds <- predict(classifier, newdata = testdata, type = "prob")[,2]
yhat <- tuned_call(preds, threshold)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
sensitivity
results <- tibble::tibble(y=y, yhat=yhat)
caret::confusionMatrix(yhat, y, positive = "Up")
cm <- results %>% yardstick::conf_mat(truth=y, yhat)
trueNegative = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve = cm$table[2,2]
conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive
predictedConditionPositive = truePositve + falsePositive
predictedConditionNegative = trueNegative + falseNegative
TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS
F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
# accuracy, sensitivity, specificity, FDR, precision
return(c(accuracy, TPR, SPC, FDR, PPV ))
}
In this project, you will use classification methods covered in this course to solve a real historical data mining problem: locating displaced persons living in makeshift shelters following the destruction caused by the 2010 earthquake in Haiti.
Following that earthquake, rescue workers, mostly from the United States military, needed to get food and water to the displaced persons. But with communications destroyed, roads impassable, and thousands of square miles to cover, actually locating the people who needed help was challenging. As part of the rescue effort, a team from the Rochester Institute of Technology was flying an aircraft to collect high resolution geo-referenced imagery.
HaitiPixels <- read.csv(file = '/Users/awells/UVA/SYS6018/Disaster_Relief_Project/HaitiPixels.csv')
head(HaitiPixels)
#> Class Red Green Blue
#> 1 Vegetation 64 67 50
#> 2 Vegetation 64 67 50
#> 3 Vegetation 64 66 49
#> 4 Vegetation 75 82 53
#> 5 Vegetation 74 82 54
#> 6 Vegetation 72 76 52
HaitiPixels$Class <- as.factor(HaitiPixels$Class)
#summary(HaitiPixels)
dim(HaitiPixels)
#> [1] 63241 4
summary(HaitiPixels)
#> Class Red Green Blue
#> Blue Tarp : 2022 Min. : 48 Min. : 48.0 Min. : 44.0
#> Rooftop : 9903 1st Qu.: 80 1st Qu.: 78.0 1st Qu.: 63.0
#> Soil :20566 Median :163 Median :148.0 Median :123.0
#> Various Non-Tarp: 4744 Mean :163 Mean :153.7 Mean :125.1
#> Vegetation :26006 3rd Qu.:255 3rd Qu.:226.0 3rd Qu.:181.0
#> Max. :255 Max. :255.0 Max. :255.0
attach(HaitiPixels)
detach(HaitiPixels)
# Blue Tarp
BlueTarp_rest <- rep("0",length(HaitiPixels$Class))
BlueTarp_rest[HaitiPixels$Class == "Blue Tarp"] <- "1"
HaitiPixels <- data.frame(HaitiPixels,BlueTarp_rest )
HaitiPixels$BlueTarp_rest <- as.factor(HaitiPixels$BlueTarp_rest)
#summary(HaitiPixels)
attach(HaitiPixels)
#> The following object is masked _by_ .GlobalEnv:
#>
#> BlueTarp_rest
BlueTarp_rest <- as.factor(BlueTarp_rest)
#### Delete these 2 lines when ready for full set
#index <- createDataPartition(HaitiPixels$BlueTarp_rest, p = 0.1, list = FALSE) # ~15K
#index <- createDataPartition(HaitiPixels$BlueTarp_rest, p = 0.25, list = FALSE) # ~15K
#index <- createDataPartition(HaitiPixels$BlueTarp_rest, p = 0.32, list = FALSE) # ~20K
#index <- createDataPartition(HaitiPixels$BlueTarp_rest, p = 0.5, list = FALSE) # ~30K
#data <- HaitiPixels[index, ]
#HaitiPixels <- data
####
summary(HaitiPixels)
#> Class Red Green Blue
#> Blue Tarp : 2022 Min. : 48 Min. : 48.0 Min. : 44.0
#> Rooftop : 9903 1st Qu.: 80 1st Qu.: 78.0 1st Qu.: 63.0
#> Soil :20566 Median :163 Median :148.0 Median :123.0
#> Various Non-Tarp: 4744 Mean :163 Mean :153.7 Mean :125.1
#> Vegetation :26006 3rd Qu.:255 3rd Qu.:226.0 3rd Qu.:181.0
#> Max. :255 Max. :255.0 Max. :255.0
#> BlueTarp_rest
#> 0:61219
#> 1: 2022
#>
#>
#>
#>
HaitiPixels %>% ggpairs(upper=list(continuous=wrap("cor", size=3)))
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Given the structure of the data set (HaitiPixels, \(n=\) 63241), a natural choice is a one-vs-rest approach: the multi-class classification problem is recast as a set of binary classifications (a sketch of building such labels for every class follows the list):
Blue Tarp vs [ Rooftop , Soil , Various Non-Tarp, Vegetation ]
Rooftop vs [ Blue Tarp, Soil , Various Non-Tarp, Vegetation ]
Soil vs [ Blue Tarp, Rooftop , Various Non-Tarp, Vegetation ]
Various Non-Tarp vs [ Blue Tarp, Rooftop , Soil , Vegetation ]
Vegetation vs [ Blue Tarp, Rooftop , Soil , Various Non-Tarp ]
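Only the Blue Tarp vs. rest label is carried forward in this analysis (it was constructed above). As a minimal illustrative sketch, not part of the original pipeline, the same labeling could be generated for every class; the object name ovr_labels and the “Up”/“Down” coding here are assumptions for illustration only.
# Illustrative sketch: build a one-vs-rest factor for each of the five classes
ovr_labels <- lapply(levels(HaitiPixels$Class), function(cls) {
  factor(ifelse(HaitiPixels$Class == cls, "Up", "Down"), levels = c("Down", "Up"))
})
names(ovr_labels) <- paste0(make.names(levels(HaitiPixels$Class)), "_rest")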
# Earlier approach (commented out below): hold out 5% of HaitiPixels (~3000 observations)
#data.fho = sort(sample(nrow(HaitiPixels), nrow(HaitiPixels)*.05))
#fho <- HaitiPixels[data.fho,]
train <- HaitiPixels
# shuffle just to be safe
train <- dplyr::sample_n(train, nrow(train))
#fho <- dplyr::sample_n(fho, nrow(fho))
train$BlueTarp_rest <- as.factor(train$BlueTarp_rest)
levels(train$BlueTarp_rest) <- c("Down","Up")
y <- train$BlueTarp_rest
Since we are looking for temporary shelters made using blue tarps, we generated a variable named “BlueTarp_rest”, a binary response with two levels:
“Up” - if a blue tarp was visible (i.e., up)
“Down” - if a blue tarp was not visible.
It was known that the people whose homes had been destroyed by the earthquake were creating temporary shelters using blue tarps, and these blue tarps would be good indicators of where the displaced persons were – if only they could be located in time, out of the thousands of images that would be collected every day.
#orthovnir057_ROI_NON_Blue_Tarps.txt
ovnir057.NoBlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir057_ROI_NON_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")
colnames(ovnir057.NoBlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Down",length(ovnir057.NoBlueTarps$ID))
ovnir057.NoBlueTarps <- data.frame(ovnir057.NoBlueTarps,BlueTarp_rest )
#orthovnir067_ROI_Blue_Tarps.txt
ovnir067.BlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir067_ROI_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")
colnames(ovnir067.BlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Up",length(ovnir067.BlueTarps$ID))
ovnir067.BlueTarps <- data.frame(ovnir067.BlueTarps,BlueTarp_rest )
#orthovnir067_ROI_NOT_Blue_Tarps.txt
ovnir067.NoBlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir067_ROI_NOT_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")
colnames(ovnir067.NoBlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Down",length(ovnir067.NoBlueTarps$ID))
ovnir067.NoBlueTarps <- data.frame(ovnir067.NoBlueTarps,BlueTarp_rest )
#orthovnir069_ROI_Blue_Tarps.txt
ovnir069.BlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir069_ROI_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")
colnames(ovnir069.BlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Up",length(ovnir069.BlueTarps$ID))
ovnir069.BlueTarps <- data.frame(ovnir069.BlueTarps,BlueTarp_rest )
#orthovnir069_ROI_NOT_Blue_Tarps.txt
ovnir069.NoBlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir069_ROI_NOT_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")
colnames(ovnir069.NoBlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Down",length(ovnir069.NoBlueTarps$ID))
ovnir069.NoBlueTarps <- data.frame(ovnir069.NoBlueTarps,BlueTarp_rest )
#orthovnir078_ROI_Blue_Tarps.txt
ovnir078.BlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir078_ROI_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")
colnames(ovnir078.BlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Up",length(ovnir078.BlueTarps$ID))
ovnir078.BlueTarps <- data.frame(ovnir078.BlueTarps,BlueTarp_rest )
#orthovnir078_ROI_NON_Blue_Tarps.txt
ovnir078.NoBlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir078_ROI_NON_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")
colnames(ovnir078.NoBlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Down",length(ovnir078.NoBlueTarps$ID))
ovnir078.NoBlueTarps <- data.frame(ovnir078.NoBlueTarps,BlueTarp_rest )
#summary(ovnir067.NoBlueTarps)
fho <- rbind(
ovnir057.NoBlueTarps,
ovnir067.BlueTarps,
ovnir067.NoBlueTarps,
ovnir069.BlueTarps,
ovnir069.NoBlueTarps,
ovnir078.BlueTarps,
ovnir078.NoBlueTarps
)
fho$BlueTarp_rest <- factor(fho$BlueTarp_rest)
#### Delete these 2 lines when ready for full set
#index.fho <- createDataPartition(fho$BlueTarp_rest, p = 0.1, list = FALSE)
#data.fho <- fho[index.fho, ]
#fho <- data.fho
####
# Scale Data
#fho.scale <- scale(subset(fho, select= -c(BlueTarp_rest)), center = TRUE, scale = TRUE)
#df <- data.frame(fho.scale, fho$BlueTarp_rest)
#fho <- df
colnames(fho) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue","BlueTarp_rest")
Our goal is to test a variety of algorithms from this course on the imagery data collected during the relief efforts made in Haiti in response to the 2010 earthquake. This leaves a Hold-Out Test Data Set of \(n=\) 2004177 observations.
The first classification method that we used is K-nearest neighbors (KNN) classification. Given a value for K and a prediction point \(x_0\), KNN first identifies the K training observations that are closest to \(x_0\), represented by \(N_0\). The optimal value for K will depend on the bias-variance trade-off. A small value of K provides the most flexible fit, which will have low bias but high variance; this variance is due to the fact that the prediction in a given region depends on only a few observations (just one when \(K=1\)). In contrast, larger values of K provide a smoother and less variable fit: the prediction in a region is an average of several points, so changing one observation has a smaller effect. We’ll use the caret package in R to apply the KNN classification procedure using a 10-fold cross-validation evaluation procedure.
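For classification, KNN estimates the conditional probability of class \(j\) at \(x_0\) as the fraction of the neighbors in \(N_0\) that belong to that class, and then assigns \(x_0\) to the class with the largest estimated probability:
\[ \Pr(Y = j \mid X = x_0) = \frac{1}{K} \sum_{i \in N_0} I(y_i = j) \]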
set.seed(1)
registerDoParallel(cl)
# https://blog.revolutionanalytics.com/2016/05/using-caret-to-compare-models.html
classifier.knn <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data=train, method="knn",
preProcess=c("center","scale"),
tuneGrid=data.frame(k=seq(1,17,2)),
trControl = caret::trainControl("cv", number=kfoldcv,
returnResamp='all', savePredictions='final',
classProbs=TRUE))
stopCluster(cl)
classifier.knn
#> k-Nearest Neighbors
#>
#> 63241 samples
#> 3 predictor
#> 2 classes: 'Down', 'Up'
#>
#> Pre-processing: centered (3), scaled (3)
#> Resampling: Cross-Validated (10 fold)
#> Summary of sample sizes: 56918, 56917, 56917, 56917, 56916, 56917, ...
#> Resampling results across tuning parameters:
#>
#> k Accuracy Kappa
#> 1 0.9965687 0.9442231
#> 3 0.9972803 0.9560412
#> 5 0.9972961 0.9564047
#> 7 0.9972645 0.9560470
#> 9 0.9971696 0.9545976
#> 11 0.9970747 0.9530375
#> 13 0.9970431 0.9524515
#> 15 0.9969956 0.9517298
#> 17 0.9970431 0.9524377
#>
#> Accuracy was used to select the optimal model using the largest value.
#> The final value used for the model was k = 5.
bestk = classifier.knn$bestTune
Training across various values of K, we can see that our optimal K is 5. So we’ll proceed with the analysis using this value.
classifier.knn$resample %>%
dplyr::group_split(Resample) %>%
purrr::map(rowid_to_column) %>%
dplyr::bind_rows() %>%
ggplot(aes(rowid, Accuracy)) + geom_point() +
geom_smooth(formula='y~x', method='loess', span=.03) +
geom_line(classifier.knn$results, mapping=aes(seq_along(Accuracy), Accuracy),
size=2, color='red')
Next we’ll look at the Confusion Matrix from the model. We’ll use the image below as a reference for analyzing the matrix.
out_of_folds_CM.knn <- get_out_of_folds_CM(classifier.knn)
classifier.knn.sensitivity.mean <- get_metrics_func(out_of_folds_CM.knn, 'sensitivity', mean)
classifier.knn.sensitivity.sd <- get_metrics_func(out_of_folds_CM.knn, 'sensitivity', sd)
classifier.knn.specificity.mean <- get_metrics_func(out_of_folds_CM.knn, 'specificity', mean)
classifier.knn.specificity.sd <- get_metrics_func(out_of_folds_CM.knn, 'specificity', sd)
Reviewing the confusion matrix, we have:
Sensitivity Mean: 0.959457152611813
Sensitivity Standard Deviation: 0.0137156602761885
Specificity Mean: 0.998562542574257
Specificity Standard Deviation: 0.000467102491419399
From the 10-fold cross-validation we can see how well the model identifies true positive references to a blue tarp. We can measure the sensitivity, also known as the true positive rate: the fraction of positive observations that are correctly identified using a given threshold value. The initial default threshold is 0.5. The false positive rate is 1-specificity: the fraction of negative observations that we classify incorrectly using that same threshold value of 0.5. We can view the receiver operating characteristic (ROC) curve to examine these two statistics across thresholds. The ideal ROC curve hugs the top left corner, indicating a high true positive rate and a low false positive rate.
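As a minimal sketch of these two quantities at the default 0.5 threshold, we can reuse the tuned_call() helper defined above on the fitted class probabilities (all object names below already exist in this report):
# Sensitivity and false positive rate at the default 0.5 threshold (sketch)
yhat_default <- tuned_call(predict(classifier.knn, type='prob')[,2], 0.5)
mean(yhat_default[y=='Up']=='Up')    # sensitivity (true positive rate)
mean(yhat_default[y=='Down']=='Up')  # 1 - specificity (false positive rate)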
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.knn, type='prob') %>%
yardstick::roc_curve(truth=train$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
#> Warning: The `yardstick.event_first` option has been deprecated as of yardstick 0.0.7 and will be completely ignored in a future version.
#> Instead, set the following argument directly in the metric function:
#> `options(yardstick.event_first = TRUE)` -> `event_level = 'first'` (the default)
#> `options(yardstick.event_first = FALSE)` -> `event_level = 'second'`
#> This warning is displayed once per session.
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
#ggplot(ROC_curve_plot)
ROC_curve_plot
predict(classifier.knn, type='prob') %>%
yardstick::roc_auc(truth=train$BlueTarp_rest, Up)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 roc_auc binary 1.00
ROC_curve %>% head()
#> # A tibble: 6 x 4
#> .threshold specificity sensitivity one_minus_specificity
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -Inf 0 1 1
#> 2 0 0 1 1
#> 3 0.0701 0.992 1 0.00804
#> 4 0.0703 0.992 1 0.00802
#> 5 0.0820 0.997 0.989 0.00327
#> 6 0.143 0.997 0.988 0.00327
predict(classifier.knn, type='prob') %>% head()
#> Down Up
#> 1 1 0
#> 2 1 0
#> 3 1 0
#> 4 1 0
#> 5 1 0
#> 6 1 0
We can also look at the ROC curve for each fold of the KNN model. Overlaying the 10 ROC curves gives a sense of their fold-to-fold variation (a per-fold AUC sketch follows the plot code below).
ROC_curveS <- classifier.knn$pred %>%
dplyr::group_split(Resample) %>%
purrr::map(~ yardstick::roc_curve(.x, truth=obs, Up)) %>%
purrr::map(~ dplyr::mutate(.x, one_minus_specificity = 1-specificity)) %>%
purrr::map(~ geom_line(.x, mapping=aes(one_minus_specificity, sensitivity)))
ROC_curveS[[1]] <- ggplot() + ROC_curveS[[1]]
Reduce("+", ROC_curveS) +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC Curves')
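As a minimal complementary sketch, the fold-to-fold variation seen in the curves above can also be summarized numerically with one out-of-fold AUC per resample, using the predictions that caret saved in classifier.knn$pred:
# One out-of-fold AUC per resample; their spread mirrors the variation in the curves above
classifier.knn$pred %>%
  dplyr::group_split(Resample) %>%
  purrr::map_dbl(~ yardstick::roc_auc(.x, truth=obs, Up)$.estimate)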
We can examine Precision and Recall together with the Precision-Recall (P-R) curve.
Precision:
\[ \frac{true \ positives}{ true \ positives + false \ positives}\]
Recall:
\[ \frac{true \ positives}{ true \ positives + false \ negatives}\]
The ideal P-R curve hugs the top right corner, representing both high recall and high precision, which indicates the model is returning accurate results for the positive class.
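As a minimal sketch, precision and recall at the default 0.5 threshold can also be read directly off the out-of-fold predictions saved by caret (event_level = "second" marks "Up" as the positive class, matching the yardstick option set earlier):
# Out-of-fold precision and recall at the default 0.5 threshold (sketch)
yardstick::precision(classifier.knn$pred, truth=obs, estimate=pred, event_level="second")
yardstick::recall(classifier.knn$pred, truth=obs, estimate=pred, event_level="second")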
PR_curve <- predict(classifier.knn, type='prob') %>%
yardstick::pr_curve(truth=train$BlueTarp_rest, estimate=Up)
PR_curve_plot <- PR_curve %>%
ggplot(aes(x=recall, y=precision)) +
geom_line() + geom_point() +
#geom_abline(slope=0, intercept=0.5, linetype='dashed', color='red') +
ggtitle('Precision Recall Curve')
PR_curve_plot
We can also look at the P-R curve for each fold of the KNN model. Overlaying the 10 P-R curves gives a sense of their fold-to-fold variation.
PR_curveS <- classifier.knn$pred %>%
dplyr::group_split(Resample) %>%
purrr::map(~ yardstick::pr_curve(.x, truth=obs, Up)) %>%
purrr::map(~ geom_line(.x, mapping=aes(recall, precision)))
PR_curveS[[1]] <- ggplot() + PR_curveS[[1]]
Reduce("+", PR_curveS) +
#geom_abline(slope=-1, intercept=1, linetype='dashed', color='red') +
#xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('Precision Recall Curves')
classifier.knn.recall.mean <- get_metrics_func(out_of_folds_CM.knn, 'recall', mean)
classifier.knn.recall.sd <- get_metrics_func(out_of_folds_CM.knn, 'recall', sd)
classifier.knn.precision.mean <- get_metrics_func(out_of_folds_CM.knn, 'precision', mean)
classifier.knn.precision.sd <- get_metrics_func(out_of_folds_CM.knn, 'precision', sd)
Recall Mean: 0.959457152611813
Recall Standard Deviation: 0.0137156602761885
Precision Mean: 0.9568424151997
Precision Standard Deviation: 0.0131157372711998
To find the optimal threshold, we’ll look at the \(F_1\) score.
The \(F_1\) score is a more useful measure than accuracy for problems with uneven class distributions since \(F_1\) is defined as:
\[ F_1 = 2 \cdot \frac{\frac{\Sigma \ True \ Positive}{\Sigma \ Predicted \ Condition \ Positive} \cdot \frac{\Sigma \ True \ Positive}{\Sigma \ Condition \ Positive}}{\frac{\Sigma \ True \ Positive}{\Sigma \ Predicted \ Condition \ Positive}+\frac{\Sigma \ True \ Positive}{\Sigma \ Condition \ Positive}} = 2 \cdot \frac{Precision \cdot Recall}{Precision + Recall}\]
It takes into account both false positives and false negatives. The \(F_1\) score lies between 0 and 1, with 1 indicating perfect precision and recall.
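As a minimal check of this definition, yardstick::f_meas() with beta = 1 computes exactly the \(F_1\) score; applied to the out-of-fold predictions at the default 0.5 threshold, it gives a baseline against which the threshold search below can be compared:
# Out-of-fold F1 at the default 0.5 threshold (sketch); the search below tunes the threshold
yardstick::f_meas(classifier.knn$pred, truth=obs, estimate=pred, beta=1, event_level="second")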
preds <- predict(classifier.knn, type='prob')[,2]
opt_thresh.knn <- get_optimal_threshold(classifier.knn)
yhat <- tuned_call(preds, opt_thresh.knn)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.001143436
sensitivity
#> [1] 0.9693373
results <- tibble::tibble(y=y, yhat=yhat)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 61149 62
#> Up 70 1960
#>
#> Accuracy : 0.9979
#> 95% CI : (0.9975, 0.9983)
#> No Information Rate : 0.968
#> P-Value [Acc > NIR] : <2e-16
#>
#> Kappa : 0.9663
#>
#> Mcnemar's Test P-Value : 0.5423
#>
#> Sensitivity : 0.96934
#> Specificity : 0.99886
#> Pos Pred Value : 0.96552
#> Neg Pred Value : 0.99899
#> Prevalence : 0.03197
#> Detection Rate : 0.03099
#> Detection Prevalence : 0.03210
#> Balanced Accuracy : 0.98410
#>
#> 'Positive' Class : Up
#>
cm <- results %>% yardstick::conf_mat(truth=y, yhat)
trueNegative = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve = cm$table[2,2]
conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive
predictedConditionPositive = truePositve + falsePositive
predictedConditionNegative = trueNegative + falseNegative
TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS
F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
Now using the optimal threshold we get:
True Negative = 61149
False Negative = 62
False Positive = 70
True Positive = 1960
Condition Positive = 2022
Condition Negative = 61219
Predicted Condition Positive = 2030
Predicted Condition Negative = 61211
Total Population = 63241
Prevalence = 0.0319729289543176
Accuracy = 0.99791274647776
Positive predictive value, precision = 0.96551724137931
False Discovery rate = 0.0344827586206897
False Omission Rate = 0.00101288984006143
Negative Predictive Value = 0.998987110159939
True Positive Rate, Recall, Sensitivity = 0.969337289812067
False Positive Rate = 0.00114343586141557
False Negative Rate = 0.0306627101879327
Specificity, Selectivity, True Negative rate = 0.998856564138584
Positive likelihood ratio = 847.740850642928
Negative likelihood ratio = 0.030697811166087
Diagnostic Odds Ratio = 27615.6774193548
\(F_1\) Score = 0.967423494570582
From our plot we selected our optimal threshold as 0.499.
plot_folds_CM(classifier.knn,opt_thresh.knn)
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.knn, type='prob') %>%
yardstick::roc_curve(truth=train$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
temp = predict(classifier.knn, type='prob') %>% yardstick::roc_auc(truth=train$BlueTarp_rest, Up)
knn.Accuracy = accuracy
knn.AUC = temp$.estimate
twoclassSum = twoClassSummary( data = data.frame(obs = classifier.knn$pred$obs,
pred = classifier.knn$pred$pred,
Down = classifier.knn$pred$Down,
Up = classifier.knn$pred$Up), lev = levels(classifier.knn$pred$obs))
knn.ROC = "TRUE" #unname(twoclassSum[1])
knn.Threshold = opt_thresh.knn
knn.Sensitivity = knn.Recall= knn.Power = TPR
knn.Specificity= 1-FPR
knn.FDR = FDR
knn.Precision=PPV
df = data.frame(c(knn.Accuracy, knn.AUC, knn.ROC, knn.Threshold, knn.Sensitivity, knn.Specificity, knn.FDR , knn.Precision))
colnames(df) <- c(paste0("KNN (k =",bestk,")"))
rownames(df) <- c("Accuracy","AUC", "ROC", "Threshold","Sensitivity=Recall=Power","Specificity=1-FPR","FDR","Precision=PPV")
df %>% kbl() %>% kable_styling()
| | KNN (k =5) |
|---|---|
| Accuracy | 0.99791274647776 |
| AUC | 0.999862454860983 |
| ROC | TRUE |
| Threshold | 0.499 |
| Sensitivity=Recall=Power | 0.969337289812067 |
| Specificity=1-FPR | 0.998856564138584 |
| FDR | 0.0344827586206897 |
| Precision=PPV | 0.96551724137931 |
The next classification method that we used is linear discriminant analysis (LDA). LDA assumes that the observations within each class are drawn from a multivariate Gaussian distribution with a class-specific mean vector and a covariance matrix that is common to all K classes. Again, we’ll use the caret package in R to apply the LDA classification procedure using a 10-fold cross-validation evaluation procedure.
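Under this assumption, LDA assigns an observation \(x\) to the class \(k\) with the largest linear discriminant score, where \(\mu_k\) is the class mean vector, \(\Sigma\) the common covariance matrix, and \(\pi_k\) the prior probability of class \(k\):
\[ \delta_k(x) = x^T \Sigma^{-1} \mu_k - \frac{1}{2}\mu_k^T \Sigma^{-1} \mu_k + \log \pi_k \]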
set.seed(1)
registerDoSEQ()
# https://blog.revolutionanalytics.com/2016/05/using-caret-to-compare-models.html
classifier.lda <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data=train, method="lda",
preProcess=c("center","scale"),
#tuneGrid=data.frame(.dimen = seq(1,10,1)),#seq(1,1000,5)),
trControl = caret::trainControl("cv", number=kfoldcv,
returnResamp='all', savePredictions='final',
classProbs=TRUE))
classifier.lda
#> Linear Discriminant Analysis
#>
#> 63241 samples
#> 3 predictor
#> 2 classes: 'Down', 'Up'
#>
#> Pre-processing: centered (3), scaled (3)
#> Resampling: Cross-Validated (10 fold)
#> Summary of sample sizes: 56918, 56917, 56917, 56917, 56916, 56917, ...
#> Resampling results:
#>
#> Accuracy Kappa
#> 0.9839029 0.7525493
classifier.lda$resample %>%
dplyr::group_split(Resample) %>%
purrr::map(rowid_to_column) %>%
dplyr::bind_rows() %>%
ggplot(aes(rowid, Accuracy)) + geom_point() +
geom_smooth(formula='y~x', method='loess', span=.03) +
geom_line(classifier.lda$results, mapping=aes(seq_along(Accuracy), Accuracy),
size=2, color='red')
Next we’ll look at the Confusion Matrix from the model.
out_of_folds_CM.lda <- get_out_of_folds_CM(classifier.lda)
classifier.lda.sensitivity.mean <- get_metrics_func(out_of_folds_CM.lda, 'sensitivity', mean)
classifier.lda.sensitivity.sd <- get_metrics_func(out_of_folds_CM.lda, 'sensitivity', sd)
classifier.lda.specificity.mean <- get_metrics_func(out_of_folds_CM.lda, 'specificity', mean)
classifier.lda.specificity.sd <- get_metrics_func(out_of_folds_CM.lda, 'specificity', sd)
Reviewing the confusion matrix, we have:
Sensitivity Mean: 0.800692581573428
Sensitivity Standard Deviation: 0.0275020332680672
Specificity Mean: 0.989954135219603
Specificity Standard Deviation: 0.0014653493042761
Next we’ll look at the ROC curve.
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.lda, type='prob') %>%
yardstick::roc_curve(truth=train$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
predict(classifier.lda, type='prob') %>%
yardstick::roc_auc(truth=train$BlueTarp_rest, Up)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 roc_auc binary 0.989
ROC_curve %>% head()
#> # A tibble: 6 x 4
#> .threshold specificity sensitivity one_minus_specificity
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -Inf 0 1 1
#> 2 1.85e-16 0 1 1
#> 3 5.08e-16 0.0000163 1 1.00
#> 4 4.04e-15 0.0000327 1 1.00
#> 5 1.28e-14 0.0000490 1 1.00
#> 6 1.61e-14 0.0000653 1 1.00
predict(classifier.lda, type='prob') %>% head()
#> Down Up
#> 1 0.9999986 1.355539e-06
#> 2 0.9999983 1.697476e-06
#> 3 1.0000000 4.967167e-08
#> 4 0.9973435 2.656453e-03
#> 5 1.0000000 4.102756e-09
#> 6 1.0000000 5.173050e-09
We can also look at the ROC curve for each fold of the LDA model. Overlaying the 10 ROC curves gives a sense of their fold-to-fold variation.
ROC_curveS <- classifier.lda$pred %>%
dplyr::group_split(Resample) %>%
purrr::map(~ yardstick::roc_curve(.x, truth=obs, Up)) %>%
purrr::map(~ dplyr::mutate(.x, one_minus_specificity = 1-specificity)) %>%
purrr::map(~ geom_line(.x, mapping=aes(one_minus_specificity, sensitivity)))
ROC_curveS[[1]] <- ggplot() + ROC_curveS[[1]]
Reduce("+", ROC_curveS) +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC Curves')
Next, we’ll look at the P-R curve.
PR_curve <- predict(classifier.lda, type='prob') %>%
yardstick::pr_curve(truth=train$BlueTarp_rest, estimate=Up)
PR_curve_plot <- PR_curve %>%
ggplot(aes(x=recall, y=precision)) +
geom_line() + geom_point() +
#geom_abline(slope=0, intercept=0.5, linetype='dashed', color='red') +
ggtitle('Precision Recall Curve')
PR_curve_plot
We can also look at the P-R curve for each fold of the LDA model. Overlaying the 10 P-R curves gives a sense of their fold-to-fold variation.
PR_curveS <- classifier.lda$pred %>%
dplyr::group_split(Resample) %>%
purrr::map(~ yardstick::pr_curve(.x, truth=obs, Up)) %>%
purrr::map(~ geom_line(.x, mapping=aes(recall, precision)))
PR_curveS[[1]] <- ggplot() + PR_curveS[[1]]
Reduce("+", PR_curveS) +
#geom_abline(slope=-1, intercept=1, linetype='dashed', color='red') +
#xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('Precision Recall Curves')
classifier.lda.recall.mean <- get_metrics_func(out_of_folds_CM.lda, 'recall', mean)
classifier.lda.recall.sd <- get_metrics_func(out_of_folds_CM.lda, 'recall', sd)
classifier.lda.precision.mean <- get_metrics_func(out_of_folds_CM.lda, 'precision', mean)
classifier.lda.precision.sd <- get_metrics_func(out_of_folds_CM.lda, 'precision', sd)
Recall Mean: 0.800692581573428
Recall Standard Deviation: 0.0275020332680672
Precision Mean: 0.7257850068744
Precision Standard Deviation: 0.0259033842126505
To find the optimal threshold, we’ll look at the \(F_1\) score.
preds <- predict(classifier.lda, type='prob')[,2]
opt_thresh.lda <- get_optimal_threshold(classifier.lda)
yhat <- tuned_call(preds, opt_thresh.lda)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.009996896
sensitivity
#> [1] 0.8021761
results <- tibble::tibble(y=y, yhat=yhat)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 60607 400
#> Up 612 1622
#>
#> Accuracy : 0.984
#> 95% CI : (0.983, 0.985)
#> No Information Rate : 0.968
#> P-Value [Acc > NIR] : < 2.2e-16
#>
#> Kappa : 0.754
#>
#> Mcnemar's Test P-Value : 3.295e-11
#>
#> Sensitivity : 0.80218
#> Specificity : 0.99000
#> Pos Pred Value : 0.72605
#> Neg Pred Value : 0.99344
#> Prevalence : 0.03197
#> Detection Rate : 0.02565
#> Detection Prevalence : 0.03533
#> Balanced Accuracy : 0.89609
#>
#> 'Positive' Class : Up
#>
cm <- results %>% yardstick::conf_mat(truth=y, yhat)
trueNegative = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve = cm$table[2,2]
conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive
predictedConditionPositive = truePositve + falsePositive
predictedConditionNegative = trueNegative + falseNegative
TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS
F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
Now using the optimal threshold we get:
True Negative = 60607
False Negative = 400
False Positive = 612
True Positive = 1622
Condition Positive = 2022
Condition Negative = 61219
Predicted Condition Positive = 2234
Predicted Condition Negative = 61007
Total Population = 63241
Prevalence = 0.0319729289543176
Accuracy = 0.983997722996158
Positive predictive value, precision = 0.726051924798568
False Discovery rate = 0.273948075201432
False Omission Rate = 0.00655662464963037
Negative Predictive Value = 0.99344337535037
True Positive Rate, Recall, Sensitivity = 0.80217606330366
False Positive Rate = 0.00999689638837616
False Negative Rate = 0.19782393669634
Specificity, Selectivity, True Negative rate = 0.990003103611624
Positive likelihood ratio = 80.242510489194
Negative likelihood ratio = 0.199821531846375
Diagnostic Odds Ratio = 401.570890522876
\(F_1\) Score = 0.762218045112782
From our plot we selected our optimal threshold as 0.495.
plot_folds_CM(classifier.lda,opt_thresh.lda)
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.lda, type='prob') %>%
yardstick::roc_curve(truth=train$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
temp = predict(classifier.lda, type='prob') %>% yardstick::roc_auc(truth=train$BlueTarp_rest, Up)
lda.Accuracy = accuracy
lda.AUC = temp$.estimate
twoclassSum = twoClassSummary( data = data.frame(obs = classifier.lda$pred$obs,
pred = classifier.lda$pred$pred,
Down = classifier.lda$pred$Down,
Up = classifier.lda$pred$Up), lev = levels(classifier.lda$pred$obs))
lda.ROC = "TRUE" #unname(twoclassSum[1])
lda.Threshold = opt_thresh.lda
lda.Sensitivity = lda.Recall= lda.Power = TPR
lda.Specificity= 1-FPR
lda.FDR = FDR
lda.Precision=PPV
df = data.frame(c(lda.Accuracy, lda.AUC, lda.ROC, lda.Threshold, lda.Sensitivity, lda.Specificity, lda.FDR , lda.Precision))
colnames(df) <- c("LDA")
rownames(df) <- c("Accuracy","AUC", "ROC", "Threshold","Sensitivity=Recall=Power","Specificity=1-FPR","FDR","Precision=PPV")
df %>% kbl() %>% kable_styling()
| | LDA |
|---|---|
| Accuracy | 0.983997722996158 |
| AUC | 0.988876770009065 |
| ROC | TRUE |
| Threshold | 0.495 |
| Sensitivity=Recall=Power | 0.80217606330366 |
| Specificity=1-FPR | 0.990003103611624 |
| FDR | 0.273948075201432 |
| Precision=PPV | 0.726051924798568 |
The next classification method that we used is quadratic discriminant analysis (QDA). Like LDA, the QDA classifier results from assuming that the observations in each class are drawn from a Gaussian distribution and plugging estimates of the parameters into Bayes’ theorem in order to perform prediction; unlike LDA, however, QDA allows each class to have its own covariance matrix. Again, we’ll use the caret package in R to apply the QDA classification procedure using a 10-fold cross-validation evaluation procedure.
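Because each class now has its own covariance matrix \(\Sigma_k\), the discriminant score is quadratic in \(x\):
\[ \delta_k(x) = -\frac{1}{2}(x-\mu_k)^T \Sigma_k^{-1} (x-\mu_k) - \frac{1}{2}\log|\Sigma_k| + \log \pi_k \]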
set.seed(1)
registerDoSEQ()
# https://blog.revolutionanalytics.com/2016/05/using-caret-to-compare-models.html
classifier.qda <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data=train, method="qda",
preProcess=c("center","scale"),
#tuneGrid=data.frame(.dimen = seq(1,10,1)),#seq(1,1000,5)),
trControl = caret::trainControl("cv", number=kfoldcv,
returnResamp='all', savePredictions='final',
classProbs=TRUE))
classifier.qda
#> Quadratic Discriminant Analysis
#>
#> 63241 samples
#> 3 predictor
#> 2 classes: 'Down', 'Up'
#>
#> Pre-processing: centered (3), scaled (3)
#> Resampling: Cross-Validated (10 fold)
#> Summary of sample sizes: 56918, 56917, 56917, 56917, 56916, 56917, ...
#> Resampling results:
#>
#> Accuracy Kappa
#> 0.9945605 0.9050437
classifier.qda$resample %>%
dplyr::group_split(Resample) %>%
purrr::map(rowid_to_column) %>%
dplyr::bind_rows() %>%
ggplot(aes(rowid, Accuracy)) + geom_point() +
geom_smooth(formula='y~x', method='loess', span=.03) +
geom_line(classifier.qda$results, mapping=aes(seq_along(Accuracy), Accuracy),
size=2, color='red')
Next we’ll look at the Confusion Matrix from the model.
out_of_folds_CM.qda <- get_out_of_folds_CM(classifier.qda)
classifier.qda.sensitivity.mean <- get_metrics_func(out_of_folds_CM.qda, 'sensitivity', mean)
classifier.qda.sensitivity.sd <- get_metrics_func(out_of_folds_CM.qda, 'sensitivity', sd)
classifier.qda.specificity.mean <- get_metrics_func(out_of_folds_CM.qda, 'specificity', mean)
classifier.qda.specificity.sd <- get_metrics_func(out_of_folds_CM.qda, 'specificity', sd)
Reviewing the confusion matrix, we have:
Sensitivity Mean: 0.838777252109447
Sensitivity Standard Deviation: 0.0190825467050525
Specificity Mean: 0.999705978438419
Specificity Standard Deviation: 0.000200795915437059
Next we’ll look at the ROC curve.
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.qda, type='prob') %>%
yardstick::roc_curve(truth=train$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
predict(classifier.qda, type='prob') %>%
yardstick::roc_auc(truth=train$BlueTarp_rest, Up)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 roc_auc binary 0.998
ROC_curve %>% head()
#> # A tibble: 6 x 4
#> .threshold specificity sensitivity one_minus_specificity
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -Inf 0 1 1
#> 2 4.45e-33 0 1 1
#> 3 9.90e-31 0.0000163 1 1.00
#> 4 1.92e-29 0.0000327 1 1.00
#> 5 1.14e-28 0.0000490 1 1.00
#> 6 1.29e-28 0.0000653 1 1.00
predict(classifier.qda, type='prob') %>% head()
#> Down Up
#> 1 0.9994748 5.251838e-04
#> 2 0.9987765 1.223490e-03
#> 3 1.0000000 1.144881e-08
#> 4 0.9977478 2.252187e-03
#> 5 1.0000000 2.539137e-11
#> 6 1.0000000 3.682938e-14
We can also look at the ROC curve for each fold of the QDA model.
ROC_curveS <- classifier.qda$pred %>%
dplyr::group_split(Resample) %>%
purrr::map(~ yardstick::roc_curve(.x, truth=obs, Up)) %>%
purrr::map(~ dplyr::mutate(.x, one_minus_specificity = 1-specificity)) %>%
purrr::map(~ geom_line(.x, mapping=aes(one_minus_specificity, sensitivity)))
ROC_curveS[[1]] <- ggplot() + ROC_curveS[[1]]
Reduce("+", ROC_curveS) +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC Curves')
Next, we’ll look at the P-R curve.
PR_curve <- predict(classifier.qda, type='prob') %>%
yardstick::pr_curve(truth=train$BlueTarp_rest, estimate=Up)
PR_curve_plot <- PR_curve %>%
ggplot(aes(x=recall, y=precision)) +
geom_line() + geom_point() +
#geom_abline(slope=0, intercept=0.5, linetype='dashed', color='red') +
ggtitle('Precision Recall Curve')
PR_curve_plot
We can also look at the P-R curve for each fold of the QDA model. Overlaying the 10 P-R curves gives a sense of their fold-to-fold variation.
PR_curveS <- classifier.qda$pred %>%
dplyr::group_split(Resample) %>%
purrr::map(~ yardstick::pr_curve(.x, truth=obs, Up)) %>%
purrr::map(~ geom_line(.x, mapping=aes(recall, precision)))
PR_curveS[[1]] <- ggplot() + PR_curveS[[1]]
Reduce("+", PR_curveS) +
#geom_abline(slope=-1, intercept=1, linetype='dashed', color='red') +
#xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('Precision Recall Curves')
classifier.qda.recall.mean <- get_metrics_func(out_of_folds_CM.qda, 'recall', mean)
classifier.qda.recall.sd <- get_metrics_func(out_of_folds_CM.qda, 'recall', sd)
classifier.qda.precision.mean <- get_metrics_func(out_of_folds_CM.qda, 'precision', mean)
classifier.qda.precision.sd <- get_metrics_func(out_of_folds_CM.qda, 'precision', sd)
Recall Mean: 0.838777252109447
Recall Standard Deviation: 0.0190825467050525
Precision Mean: 0.989610677170821
Precision Standard Deviation: 0.00701203785618355
To find the optimal threshold, we’ll look at the \(F_1\) score.
preds <- predict(classifier.qda, type='prob')[,2]
opt_thresh.qda <- get_optimal_threshold(classifier.qda)
yhat <- tuned_call(preds, opt_thresh.qda)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.0009964227
sensitivity
#> [1] 0.867458
results <- tibble::tibble(y=y, yhat=yhat)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 61158 268
#> Up 61 1754
#>
#> Accuracy : 0.9948
#> 95% CI : (0.9942, 0.9953)
#> No Information Rate : 0.968
#> P-Value [Acc > NIR] : < 2.2e-16
#>
#> Kappa : 0.9116
#>
#> Mcnemar's Test P-Value : < 2.2e-16
#>
#> Sensitivity : 0.86746
#> Specificity : 0.99900
#> Pos Pred Value : 0.96639
#> Neg Pred Value : 0.99564
#> Prevalence : 0.03197
#> Detection Rate : 0.02774
#> Detection Prevalence : 0.02870
#> Balanced Accuracy : 0.93323
#>
#> 'Positive' Class : Up
#>
cm <- results %>% yardstick::conf_mat(truth=y, yhat)
trueNegative = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve = cm$table[2,2]
conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive
predictedConditionPositive = truePositve + falsePositive
predictedConditionNegative = trueNegative + falseNegative
TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS
F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
Now using the optimal threshold we get:
True Negative = 61158
False Negative = 268
False Positive = 61
True Positive = 1754
Condition Positive = 2022
Condition Negative = 61219
Predicted Condition Positive = 1815
Predicted Condition Negative = 61426
Total Population = 63241
Prevalence = 0.0319729289543176
Accuracy = 0.994797678721083
Positive predictive value, precision = 0.966391184573003
False Discovery rate = 0.0336088154269972
False Omission Rate = 0.00436297333376746
Negative Predictive Value = 0.995637026666233
True Positive Rate, Recall, Sensitivity = 0.867457962413452
False Positive Rate = 0.000996422679233571
False Negative Rate = 0.132542037586548
Specificity, Selectivity, True Negative rate = 0.999003577320766
Positive likelihood ratio = 870.57227870474
Negative likelihood ratio = 0.13267423720545
Diagnostic Odds Ratio = 6561.72816246636
\(F_1\) Score = 0.914255929111285
From our plot we selected our optimal threshold as 0.22.
plot_folds_CM(classifier.qda,opt_thresh.qda)
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.qda, type='prob') %>%
yardstick::roc_curve(truth=train$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
temp = predict(classifier.qda, type='prob') %>% yardstick::roc_auc(truth=train$BlueTarp_rest, Up)
qda.Accuracy = accuracy
qda.AUC = temp$.estimate
twoclassSum = twoClassSummary( data = data.frame(obs = classifier.qda$pred$obs,
pred = classifier.qda$pred$pred,
Down = classifier.qda$pred$Down,
Up = classifier.qda$pred$Up), lev = levels(classifier.qda$pred$obs))
qda.ROC = "TRUE" #unname(twoclassSum[1])
qda.Threshold = opt_thresh.qda
qda.Sensitivity = qda.Recall= qda.Power = TPR
qda.Specificity= 1-FPR
qda.FDR = FDR
qda.Precision=PPV
df = data.frame(c(qda.Accuracy, qda.AUC, qda.ROC, qda.Threshold, qda.Sensitivity, qda.Specificity, qda.FDR , qda.Precision))
colnames(df) <- c("QDA")
rownames(df) <- c("Accuracy","AUC", "ROC", "Threshold","Sensitivity=Recall=Power","Specificity=1-FPR","FDR","Precision=PPV")
df %>% kbl() %>% kable_styling()
| | QDA |
|---|---|
| Accuracy | 0.994797678721083 |
| AUC | 0.998217543931761 |
| ROC | TRUE |
| Threshold | 0.22 |
| Sensitivity=Recall=Power | 0.867457962413452 |
| Specificity=1-FPR | 0.999003577320766 |
| FDR | 0.0336088154269972 |
| Precision=PPV | 0.966391184573003 |
The next classification method that we used is logistic regression, fit as a generalized linear model (glm) with the logit link. Again, we’ll use the caret package in R to apply the logistic regression procedure using a 10-fold cross-validation evaluation procedure.
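With the three color channels as predictors, logistic regression models the probability of the “Up” class through the logistic function:
\[ \Pr(Y = Up \mid Red, Green, Blue) = \frac{e^{\beta_0 + \beta_1 Red + \beta_2 Green + \beta_3 Blue}}{1 + e^{\beta_0 + \beta_1 Red + \beta_2 Green + \beta_3 Blue}} \]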
set.seed(1)
registerDoSEQ()
# https://blog.revolutionanalytics.com/2016/05/using-caret-to-compare-models.html
classifier.glm <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data=train, method="glm",
preProcess=c("center","scale"),
#tuneGrid=data.frame(.dimen = seq(1,10,1)),#seq(1,1000,5)),
trControl = caret::trainControl("cv", number=kfoldcv,
returnResamp='all', savePredictions='final',
classProbs=TRUE))
classifier.glm
#> Generalized Linear Model
#>
#> 63241 samples
#> 3 predictor
#> 2 classes: 'Down', 'Up'
#>
#> Pre-processing: centered (3), scaled (3)
#> Resampling: Cross-Validated (10 fold)
#> Summary of sample sizes: 56918, 56917, 56917, 56917, 56916, 56917, ...
#> Resampling results:
#>
#> Accuracy Kappa
#> 0.9953037 0.9209187
classifier.glm$resample %>%
dplyr::group_split(Resample) %>%
purrr::map(rowid_to_column) %>%
dplyr::bind_rows() %>%
ggplot(aes(rowid, Accuracy)) + geom_point() +
geom_smooth(formula='y~x', method='loess', span=.03) +
geom_line(classifier.glm$results, mapping=aes(seq_along(Accuracy), Accuracy),
size=2, color='red')
Next we’ll look at the Confusion Matrix from the model.
out_of_folds_CM.glm <- get_out_of_folds_CM(classifier.glm)
classifier.glm.sensitivity.mean <- get_metrics_func(out_of_folds_CM.glm, 'sensitivity', mean)
classifier.glm.sensitivity.sd <- get_metrics_func(out_of_folds_CM.glm, 'sensitivity', sd)
classifier.glm.specificity.mean <- get_metrics_func(out_of_folds_CM.glm, 'specificity', mean)
classifier.glm.specificity.sd <- get_metrics_func(out_of_folds_CM.glm, 'specificity', sd)
Reviewing the confusion matrix, we have:
Sensitivity Mean: 0.885758181729503
Sensitivity Standard Deviation: 0.0207876255423969
Specificity Mean: 0.998921904929239
Specificity Standard Deviation: 0.000279754908412848
Next we’ll look at the ROC curve.
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.glm, type='prob') %>%
yardstick::roc_curve(truth=train$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
predict(classifier.glm, type='prob') %>%
yardstick::roc_auc(truth=train$BlueTarp_rest, Up)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 roc_auc binary 0.999
ROC_curve %>% head()
#> # A tibble: 6 x 4
#> .threshold specificity sensitivity one_minus_specificity
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -Inf 0 1 1
#> 2 2.22e-16 0 1 1
#> 3 9.39e-14 0.216 1 0.784
#> 4 9.40e-14 0.216 1 0.784
#> 5 9.41e-14 0.216 1 0.784
#> 6 9.43e-14 0.216 1 0.784
predict(classifier.glm, type='prob') %>% head()
#> Down Up
#> 1 0.9917534 8.246618e-03
#> 2 0.9999974 2.620194e-06
#> 3 1.0000000 2.220446e-16
#> 4 0.9999852 1.478614e-05
#> 5 1.0000000 2.220446e-16
#> 6 1.0000000 2.220446e-16
We can also look at the ROC curve for each fold of the logistic regression (glm) model.
ROC_curveS <- classifier.glm$pred %>%
dplyr::group_split(Resample) %>%
purrr::map(~ yardstick::roc_curve(.x, truth=obs, Up)) %>%
purrr::map(~ dplyr::mutate(.x, one_minus_specificity = 1-specificity)) %>%
purrr::map(~ geom_line(.x, mapping=aes(one_minus_specificity, sensitivity)))
ROC_curveS[[1]] <- ggplot() + ROC_curveS[[1]]
Reduce("+", ROC_curveS) +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC Curves')
Next, we’ll look at the P-R curve.
PR_curve <- predict(classifier.glm, type='prob') %>%
yardstick::pr_curve(truth=train$BlueTarp_rest, estimate=Up)
PR_curve_plot <- PR_curve %>%
ggplot(aes(x=recall, y=precision)) +
geom_line() + geom_point() +
#geom_abline(slope=0, intercept=0.5, linetype='dashed', color='red') +
ggtitle('Precision Recall Curve')
PR_curve_plot
We can also look at the P-R curve for each fold of the logistic regression model. Overlaying the 10 P-R curves gives a sense of their fold-to-fold variation.
PR_curveS <- classifier.glm$pred %>%
dplyr::group_split(Resample) %>%
purrr::map(~ yardstick::pr_curve(.x, truth=obs, Up)) %>%
purrr::map(~ geom_line(.x, mapping=aes(recall, precision)))
PR_curveS[[1]] <- ggplot() + PR_curveS[[1]]
Reduce("+", PR_curveS) +
#geom_abline(slope=-1, intercept=1, linetype='dashed', color='red') +
#xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('Precision Recall Curves')
classifier.glm.recall.mean <- get_metrics_func(out_of_folds_CM.glm, 'recall', mean)
classifier.glm.recall.sd <- get_metrics_func(out_of_folds_CM.glm, 'recall', sd)
classifier.glm.precision.mean <- get_metrics_func(out_of_folds_CM.glm, 'precision', mean)
classifier.glm.precision.sd <- get_metrics_func(out_of_folds_CM.glm, 'precision', sd)
Recall Mean: 0.885758181729503
Recall Standard Deviation: 0.0207876255423969
Precision Mean: 0.964588763609958
Precision Standard Deviation: 0.00862150259548997
To find the optimal threshold, we’ll look at the \(F_1\) score.
preds <- predict(classifier.glm, type='prob')[,2]
opt_thresh.glm <- get_optimal_threshold(classifier.glm)
yhat <- tuned_call(preds, opt_thresh.glm)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.001862167
sensitivity
#> [1] 0.9263106
results <- tibble::tibble(y=y, yhat=yhat)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 61105 149
#> Up 114 1873
#>
#> Accuracy : 0.9958
#> 95% CI : (0.9953, 0.9963)
#> No Information Rate : 0.968
#> P-Value [Acc > NIR] : < 2e-16
#>
#> Kappa : 0.9323
#>
#> Mcnemar's Test P-Value : 0.03604
#>
#> Sensitivity : 0.92631
#> Specificity : 0.99814
#> Pos Pred Value : 0.94263
#> Neg Pred Value : 0.99757
#> Prevalence : 0.03197
#> Detection Rate : 0.02962
#> Detection Prevalence : 0.03142
#> Balanced Accuracy : 0.96222
#>
#> 'Positive' Class : Up
#>
cm <- results %>% yardstick::conf_mat(truth=y, yhat)
trueNegative = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve = cm$table[2,2]
conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive
predictedConditionPositive = truePositve + falsePositive
predictedConditionNegative = trueNegative + falseNegative
TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS
F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
Now using the optimal threshold we get:
True Negative = 61105
False Negative = 149
False Positive = 114
True Positive = 1873
Condition Positive = 2022
Condition Negative = 61219
Predicted Condition Positive = 1987
Predicted Condition Negative = 61254
Total Population = 63241
Prevalence = 0.0319729289543176
Accuracy = 0.995841305482203
Positive predictive value, precision = 0.942627075993961
False Discovery rate = 0.0573729240060393
False Omission Rate = 0.00243249420446012
Negative Predictive Value = 0.99756750579554
True Positive Rate, Recall, Sensitivity = 0.926310583580613
False Positive Rate = 0.00186216697430536
False Negative Rate = 0.0736894164193867
Specificity, Selectivity, True Negative rate = 0.998137833025695
Positive likelihood ratio = 497.436908914224
Negative likelihood ratio = 0.0738268944239986
Diagnostic Odds Ratio = 6737.88207935947
\(F_1\) Score = 0.934397605387877
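As a quick sanity check, the reported \(F_1\) score follows directly from the precision and recall listed above:
PPV <- 0.942627075993961  # precision at the tuned threshold
TPR <- 0.926310583580613  # recall at the tuned threshold
2 * (PPV * TPR) / (PPV + TPR)
#> [1] 0.9343976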
From our plot we selected our optimal threshold as 0.214.
plot_folds_CM(classifier.glm,opt_thresh.glm)
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> (this warning is emitted once per fold, 10 times in total)
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.glm, type='prob') %>%
yardstick::roc_curve(truth=train$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
temp = predict(classifier.glm, type='prob') %>% yardstick::roc_auc(truth=train$BlueTarp_rest, Up)
glm.Accuracy = accuracy
glm.AUC = temp$.estimate
twoclassSum = twoClassSummary( data = data.frame(obs = classifier.glm$pred$obs,
pred = classifier.glm$pred$pred,
Down = classifier.glm$pred$Down,
Up = classifier.glm$pred$Up), lev = levels(classifier.glm$pred$obs))
glm.ROC = "TRUE" #unname(twoclassSum[1])
glm.Threshold = opt_thresh.glm
glm.Sensitivity = glm.Recall= glm.Power = TPR
glm.Specificity= 1-FPR
glm.FDR = FDR
glm.Precision=PPV
df = data.frame(c(glm.Accuracy, glm.AUC, glm.ROC, glm.Threshold, glm.Sensitivity, glm.Specificity, glm.FDR , glm.Precision))
colnames(df) <- c("Logistic Regression")
rownames(df) <- c("Accuracy","AUC", "ROC", "Threshold","Sensitivity=Recall=Power","Specificity=1-FPR","FDR","Precision=PPV")
df %>% kbl() %>% kable_styling()
| | Logistic Regression |
|---|---|
| Accuracy | 0.995841305482203 |
| AUC | 0.998506949374034 |
| ROC | TRUE |
| Threshold | 0.214 |
| Sensitivity=Recall=Power | 0.926310583580613 |
| Specificity=1-FPR | 0.998137833025695 |
| FDR | 0.0573729240060393 |
| Precision=PPV | 0.942627075993961 |
A random forest uses an ensemble of decision trees grown in a way that decorrelates the trees. Each tree casts a vote for a class, and the class with the most votes becomes the prediction for that observation.
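A minimal, self-contained sketch of this voting mechanism (using the iris data purely for illustration, not the tarp data) inspects the per-tree votes via randomForest's predict.all argument:
library(randomForest)
set.seed(1)
rf_demo <- randomForest(Species ~ ., data = iris, ntree = 25)
votes <- predict(rf_demo, iris[1:3, ], predict.all = TRUE)
votes$individual[, 1:5]  # class predicted by each of the first five trees
votes$aggregate          # majority vote across all 25 trees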
set.seed(1)
# Random Forest (method = 'rf')
# For classification and regression using package randomForest with tuning parameters:
# Number of Randomly Selected Predictors (mtry, numeric)
classifier.rf <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data=train, method="rf",
preProcess=c("center","scale"),
nodesize=5, importance=TRUE,
tuneGrid=data.frame(mtry=1:(ncol(HaitiPixels)-1)),
trControl = caret::trainControl("cv", number=kfoldcv,
returnResamp='all', savePredictions='final',
classProbs=TRUE))
stopCluster(cl)
classifier.rf
#> Random Forest
#>
#> 63241 samples
#> 3 predictor
#> 2 classes: 'Down', 'Up'
#>
#> Pre-processing: centered (3), scaled (3)
#> Resampling: Cross-Validated (10 fold)
#> Summary of sample sizes: 56918, 56917, 56917, 56917, 56916, 56917, ...
#> Resampling results across tuning parameters:
#>
#> mtry Accuracy Kappa
#> 1 0.9970431 0.9518832
#> 2 0.9970589 0.9522075
#> 3 0.9970115 0.9513643
#> 4 0.9970589 0.9521707
#>
#> Accuracy was used to select the optimal model using the largest value.
#> The final value used for the model was mtry = 2.
classifier.rf.bestMtry <- classifier.rf$bestTune$mtry
classifier.rf.nTree <- classifier.rf$finalModel$ntree
The argument mtry indicates the number of predictors considered at each split of a tree. Training across several values of mtry, we can see that our optimal mtry is 2, and the number of trees was left at the default of 500. We'll proceed with the analysis using these values.
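As a convenience, caret's ggplot method for train objects draws the cross-validated accuracy against mtry directly (a quick sketch, equivalent in spirit to the manual fold-level plot below):
ggplot(classifier.rf) + ggtitle('CV Accuracy vs mtry')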
classifier.rf$resample %>%
dplyr::group_split(Resample) %>%
purrr::map(rowid_to_column) %>%
dplyr::bind_rows() %>%
ggplot(aes(rowid, Accuracy)) + geom_point() +
geom_smooth(formula='y~x', method='loess', span=.03) +
geom_line(classifier.rf$results, mapping=aes(seq_along(Accuracy), Accuracy),
size=2, color='red')
Next we’ll look at the Confusion Matrix from the model.
out_of_folds_CM.rf <- get_out_of_folds_CM(classifier.rf)
classifier.rf.sensitivity.mean <- get_metrics_func(out_of_folds_CM.rf, 'sensitivity', mean)
classifier.rf.sensitivity.sd <- get_metrics_func(out_of_folds_CM.rf, 'sensitivity', sd)
classifier.rf.specificity.mean <- get_metrics_func(out_of_folds_CM.rf, 'specificity', mean)
classifier.rf.specificity.sd <- get_metrics_func(out_of_folds_CM.rf, 'specificity', sd)
Reviewing the confusion matrix, we have:
Sensitivity Mean: 0.948570940837926
Sensitivity Standard Deviation: 0.0158306673347466
Specificity Mean: 0.998660555098661
Specificity Standard Deviation: 0.000351159803790381
Next we’ll look at the ROC curve.
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.rf, type='prob') %>%
yardstick::roc_curve(truth=train$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
predict(classifier.rf, type='prob') %>%
yardstick::roc_auc(truth=train$BlueTarp_rest, Up)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 roc_auc binary 0.994
ROC_curve %>% head()
#> # A tibble: 6 x 4
#> .threshold specificity sensitivity one_minus_specificity
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -Inf 0 1 1
#> 2 0 0 1 1
#> 3 0.002 0.988 0.989 0.0123
#> 4 0.004 0.990 0.989 0.00956
#> 5 0.006 0.992 0.989 0.00822
#> 6 0.008 0.993 0.989 0.00750
predict(classifier.rf, type='prob') %>% head()
#> Down Up
#> 1 1 0
#> 2 1 0
#> 3 1 0
#> 4 1 0
#> 5 1 0
#> 6 1 0
We can also look at the ROC curve for each fold of the random forest model.
ROC_curveS <- classifier.rf$pred %>%
dplyr::group_split(Resample) %>%
purrr::map(~ yardstick::roc_curve(.x, truth=obs, Up)) %>%
purrr::map(~ dplyr::mutate(.x, one_minus_specificity = 1-specificity)) %>%
purrr::map(~ geom_line(.x, mapping=aes(one_minus_specificity, sensitivity)))
ROC_curveS[[1]] <- ggplot() + ROC_curveS[[1]]
Reduce("+", ROC_curveS) +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC Curves')
Next, we’ll look at the P-R curve.
PR_curve <- predict(classifier.rf, type='prob') %>%
yardstick::pr_curve(truth=train$BlueTarp_rest, estimate=Up)
PR_curve_plot <- PR_curve %>%
ggplot(aes(x=recall, y=precision)) +
geom_line() + geom_point() +
#geom_abline(slope=0, intercept=0.5, linetype='dashed', color='red') +
ggtitle('Precision Recall Curve')
PR_curve_plot
We can also look at the P-R curve for each fold of the random forest model. Overlaying the 10 curves gives a sense of the fold-to-fold variation.
PR_curveS <- classifier.rf$pred %>%
dplyr::group_split(Resample) %>%
purrr::map(~ yardstick::pr_curve(.x, truth=obs, Up)) %>%
purrr::map(~ geom_line(.x, mapping=aes(recall, precision)))
PR_curveS[[1]] <- ggplot() + PR_curveS[[1]]
Reduce("+", PR_curveS) +
#geom_abline(slope=-1, intercept=1, linetype='dashed', color='red') +
#xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('Precision Recall Curves')
classifier.rf.recall.mean <- get_metrics_func(out_of_folds_CM.rf, 'recall', mean)
classifier.rf.recall.sd <- get_metrics_func(out_of_folds_CM.rf, 'recall', sd)
classifier.rf.precision.mean <- get_metrics_func(out_of_folds_CM.rf, 'precision', mean)
classifier.rf.precision.sd <- get_metrics_func(out_of_folds_CM.rf, 'precision', sd)
Recall Mean: 0.948570940837926
Recall Standard Deviation: 0.0158306673347466
Precision Mean: 0.95907902771856
Precision Standard Deviation: 0.0102829571574502
To find the optimal threshold, we'll use the \(F_1\) score.
preds <- predict(classifier.rf, type='prob')[,2]
opt_thresh.rf <- get_optimal_threshold(classifier.rf)
yhat <- tuned_call(preds, opt_thresh.rf)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.0003593656
sensitivity
#> [1] 0.9836795
results <- tibble::tibble(y=y, yhat=yhat)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 61197 33
#> Up 22 1989
#>
#> Accuracy : 0.9991
#> 95% CI : (0.9989, 0.9993)
#> No Information Rate : 0.968
#> P-Value [Acc > NIR] : <2e-16
#>
#> Kappa : 0.9859
#>
#> Mcnemar's Test P-Value : 0.1775
#>
#> Sensitivity : 0.98368
#> Specificity : 0.99964
#> Pos Pred Value : 0.98906
#> Neg Pred Value : 0.99946
#> Prevalence : 0.03197
#> Detection Rate : 0.03145
#> Detection Prevalence : 0.03180
#> Balanced Accuracy : 0.99166
#>
#> 'Positive' Class : Up
#>
cm <- results %>% yardstick::conf_mat(truth=y, yhat)
trueNegative = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve = cm$table[2,2]
conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive
predictedConditionPositive = truePositve + falsePositive
predictedConditionNegative = trueNegative + falseNegative
TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS
F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
Now using the optimal threshold we get:
True Negative = 61197
False Negative = 33
False Positive = 22
True Positive = 1989
Condition Positive = 2022
Condition Negative = 61219
Predicted Condition Positive = 2011
Predicted Condition Negative = 61230
Total Population = 63241
Prevalence = 0.0319729289543176
Accuracy = 0.9991303110324
Positive predictive value, precision = 0.989060169070114
False Discovery rate = 0.0109398309298856
False Omission Rate = 0.000538951494365507
Negative Predictive Value = 0.999461048505634
True Positive Rate, Recall, Sensitivity = 0.983679525222552
False Positive Rate = 0.000359365556444895
False Negative Rate = 0.0163204747774481
Specificity, Selectivity, True Negative rate = 0.999640634443555
Positive likelihood ratio = 2737.26712975452
Negative likelihood ratio = 0.0163263419023905
Diagnostic Odds Ratio = 167659.549586777
\(F_1\) Score = 0.986362509298289
From our plot we selected our optimal threshold as 0.495.
plot_folds_CM(classifier.rf,opt_thresh.rf)
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> (this warning is emitted once per fold, 10 times in total)
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.rf, type='prob') %>%
yardstick::roc_curve(truth=train$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
temp = predict(classifier.rf, type='prob') %>% yardstick::roc_auc(truth=train$BlueTarp_rest, Up)
rf.Accuracy = accuracy
rf.AUC = temp$.estimate
twoclassSum = twoClassSummary( data = data.frame(obs = classifier.rf$pred$obs,
pred = classifier.rf$pred$pred,
Down = classifier.rf$pred$Down,
Up = classifier.rf$pred$Up), lev = levels(classifier.rf$pred$obs))
rf.ROC = "TRUE" #unname(twoclassSum[1])
rf.Threshold = opt_thresh.rf
rf.Sensitivity = rf.Recall= rf.Power = TPR
rf.Specificity= 1-FPR
rf.FDR = FDR
rf.Precision=PPV
df = data.frame(c(rf.Accuracy, rf.AUC, rf.ROC, rf.Threshold, rf.Sensitivity, rf.Specificity, rf.FDR , rf.Precision))
colnames(df) <- c(paste0("Random Forest ( mtry = ",classifier.rf.bestMtry,", ntree=",classifier.rf.nTree,")"))
rownames(df) <- c("Accuracy","AUC", "ROC", "Threshold","Sensitivity=Recall=Power","Specificity=1-FPR","FDR","Precision=PPV")
df %>% kbl() %>% kable_styling()
| | Random Forest ( mtry = 2, ntree=500) |
|---|---|
| Accuracy | 0.9991303110324 |
| AUC | 0.994483115853513 |
| ROC | TRUE |
| Threshold | 0.495 |
| Sensitivity=Recall=Power | 0.983679525222552 |
| Specificity=1-FPR | 0.999640634443555 |
| FDR | 0.0109398309298856 |
| Precision=PPV | 0.989060169070114 |
The support vector machine (SVM) is an extension of the support vector classifier that results from enlarging the feature space using kernels. The linear kernel takes the form
\[K(x_i,x_{i^\prime}) =\sum_{j=1}^{p}x_{ij}x_{{i^\prime}j}\]
where the kernel quantifies the similarity of a pair of observations using Pearson correlation.
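Written out as a small helper (purely illustrative; kernlab computes this internally), the linear kernel is just the inner product of two observations:
linear_kernel <- function(x, x_prime) sum(x * x_prime)
linear_kernel(c(1, 2, 3), c(4, 5, 6))  # 1*4 + 2*5 + 3*6
#> [1] 32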
set.seed(1)
# https://blog.revolutionanalytics.com/2016/05/using-caret-to-compare-models.html
classifier.svm.linear <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data=train, method="svmLinear",
preProcess=c("center","scale"),
tuneGrid=data.frame(C=c(10^((-1):3))),
trControl = caret::trainControl("cv", number=kfoldcv,
returnResamp='all', savePredictions='final',
classProbs=TRUE))
stopCluster(cl)
classifier.svm.linear
#> Support Vector Machines with Linear Kernel
#>
#> 63241 samples
#> 3 predictor
#> 2 classes: 'Down', 'Up'
#>
#> Pre-processing: centered (3), scaled (3)
#> Resampling: Cross-Validated (10 fold)
#> Summary of sample sizes: 56918, 56917, 56917, 56917, 56916, 56917, ...
#> Resampling results across tuning parameters:
#>
#> C Accuracy Kappa
#> 1e-01 0.9952246 0.9190427
#> 1e+00 0.9954144 0.9229297
#> 1e+01 0.9953669 0.9219726
#> 1e+02 0.9953986 0.9223722
#> 1e+03 0.9953669 0.9218230
#>
#> Accuracy was used to select the optimal model using the largest value.
#> The final value used for the model was C = 1.
classifier.svm.linear.bestC <- classifier.svm.linear$bestTune$C
The margin is the smallest distance from the training observations to the separating hyperplane. In the software's parameterization, when the cost argument C is small the margin is wide and many support vectors violate it; when C is large the margin is narrow and few support vectors violate it. (Equivalently, C acts as the price of margin violations rather than a budget for them.) Training across various values of cost, we can see that our optimal C is 1, so we'll proceed with the analysis using this value.
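A minimal sketch of this effect, assuming kernlab is available (it is the backend caret uses for svmLinear) and fitting on a small random subsample so it runs quickly; larger cost typically leaves fewer support vectors:
library(kernlab)
set.seed(1)
idx <- sample(nrow(train), 2000)
sapply(c(0.1, 1, 10, 100), function(C) {
  fit <- ksvm(BlueTarp_rest ~ Red + Green + Blue, data = train[idx, ],
              kernel = 'vanilladot', C = C, scaled = TRUE)
  length(SVindex(fit))  # number of support vectors at this cost
})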
classifier.svm.linear$resample %>%
dplyr::group_split(Resample) %>%
purrr::map(rowid_to_column) %>%
dplyr::bind_rows() %>%
ggplot(aes(rowid, Accuracy)) + geom_point() +
geom_smooth(formula='y~x', method='loess', span=.03) +
geom_line(classifier.svm.linear$results, mapping=aes(seq_along(Accuracy), Accuracy),
size=2, color='red')
Next we’ll look at the Confusion Matrix from the model.
out_of_folds_CM.svm.linear <- get_out_of_folds_CM(classifier.svm.linear)
classifier.svm.linear.sensitivity.mean <- get_metrics_func(out_of_folds_CM.svm.linear, 'sensitivity', mean)
classifier.svm.linear.sensitivity.sd <- get_metrics_func(out_of_folds_CM.svm.linear, 'sensitivity', sd)
classifier.svm.linear.specificity.mean <- get_metrics_func(out_of_folds_CM.svm.linear, 'specificity', mean)
classifier.svm.linear.specificity.sd <- get_metrics_func(out_of_folds_CM.svm.linear, 'specificity', sd)
Reviewing the confusion matrix, we have:
Sensitivity Mean: 0.888726040091694
Sensitivity Standard Deviation: 0.0158463375204761
Specificity Mean: 0.998938239460438
Specificity Standard Deviation: 0.000319807616920906
Next we’ll look at the ROC curve.
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.svm.linear, type='prob') %>%
yardstick::roc_curve(truth=train$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
predict(classifier.svm.linear, type='prob') %>%
yardstick::roc_auc(truth=train$BlueTarp_rest, Up)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 roc_auc binary 0.998
ROC_curve %>% head()
#> # A tibble: 6 x 4
#> .threshold specificity sensitivity one_minus_specificity
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -Inf 0 1 1
#> 2 0. 0 1 1
#> 3 1.11e-16 0.139 1 0.861
#> 4 2.22e-16 0.165 1 0.835
#> 5 3.33e-16 0.178 1 0.822
#> 6 4.44e-16 0.185 1 0.815
predict(classifier.svm.linear, type='prob') %>% head()
#> Down Up
#> 1 0.9923076 7.692367e-03
#> 2 0.9999997 2.702223e-07
#> 3 1.0000000 3.330669e-16
#> 4 0.9999970 2.972985e-06
#> 5 1.0000000 0.000000e+00
#> 6 1.0000000 0.000000e+00
We can also look at the ROC curve for each fold of the linear-kernel SVM model.
ROC_curveS <- classifier.svm.linear$pred %>%
dplyr::group_split(Resample) %>%
purrr::map(~ yardstick::roc_curve(.x, truth=obs, Up)) %>%
purrr::map(~ dplyr::mutate(.x, one_minus_specificity = 1-specificity)) %>%
purrr::map(~ geom_line(.x, mapping=aes(one_minus_specificity, sensitivity)))
ROC_curveS[[1]] <- ggplot() + ROC_curveS[[1]]
Reduce("+", ROC_curveS) +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC Curves')
Next, we’ll look at the P-R curve.
PR_curve <- predict(classifier.svm.linear, type='prob') %>%
yardstick::pr_curve(truth=train$BlueTarp_rest, estimate=Up)
PR_curve_plot <- PR_curve %>%
ggplot(aes(x=recall, y=precision)) +
geom_line() + geom_point() +
#geom_abline(slope=0, intercept=0.5, linetype='dashed', color='red') +
ggtitle('Precision Recall Curve')
PR_curve_plot
We can also look at the P-R curve for each fold of the linear-kernel SVM model. Overlaying the 10 curves gives a sense of the fold-to-fold variation.
PR_curveS <- classifier.svm.linear$pred %>%
dplyr::group_split(Resample) %>%
purrr::map(~ yardstick::pr_curve(.x, truth=obs, Up)) %>%
purrr::map(~ geom_line(.x, mapping=aes(recall, precision)))
PR_curveS[[1]] <- ggplot() + PR_curveS[[1]]
Reduce("+", PR_curveS) +
#geom_abline(slope=-1, intercept=1, linetype='dashed', color='red') +
#xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('Precision Recall Curves')
classifier.svm.linear.recall.mean <- get_metrics_func(out_of_folds_CM.svm.linear, 'recall', mean)
classifier.svm.linear.recall.sd <- get_metrics_func(out_of_folds_CM.svm.linear, 'recall', sd)
classifier.svm.linear.precision.mean <- get_metrics_func(out_of_folds_CM.svm.linear, 'precision', mean)
classifier.svm.linear.precision.sd <- get_metrics_func(out_of_folds_CM.svm.linear, 'precision', sd)
Recall Mean: 0.888726040091694
Recall Standard Deviation: 0.0158463375204761
Precision Mean: 0.965193426042111
Precision Standard Deviation: 0.0101404124635792
To find the optimal threshold, we'll use the \(F_1\) score.
preds <- predict(classifier.svm.linear, type='prob')[,2]
opt_thresh.svm.linear <- get_optimal_threshold(classifier.svm.linear)
yhat <- tuned_call(preds, opt_thresh.svm.linear)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.001666149
sensitivity
#> [1] 0.921365
results <- tibble::tibble(y=y, yhat=yhat)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 61117 159
#> Up 102 1863
#>
#> Accuracy : 0.9959
#> 95% CI : (0.9953, 0.9964)
#> No Information Rate : 0.968
#> P-Value [Acc > NIR] : < 2.2e-16
#>
#> Kappa : 0.9324
#>
#> Mcnemar's Test P-Value : 0.0005276
#>
#> Sensitivity : 0.92136
#> Specificity : 0.99833
#> Pos Pred Value : 0.94809
#> Neg Pred Value : 0.99741
#> Prevalence : 0.03197
#> Detection Rate : 0.02946
#> Detection Prevalence : 0.03107
#> Balanced Accuracy : 0.95985
#>
#> 'Positive' Class : Up
#>
cm <- results %>% yardstick::conf_mat(truth=y, yhat)
trueNegative = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve = cm$table[2,2]
conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive
predictedConditionPositive = truePositve + falsePositive
predictedConditionNegative = trueNegative + falseNegative
TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS
F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
Now using the optimal threshold we get:
True Negative = 61117
False Negative = 159
False Positive = 102
True Positive = 1863
Condition Positive = 2022
Condition Negative = 61219
Predicted Condition Positive = 1965
Predicted Condition Negative = 61276
Total Population = 63241
Prevalence = 0.0319729289543176
Accuracy = 0.99587293053557
Positive predictive value, precision = 0.948091603053435
False Discovery rate = 0.0519083969465649
False Omission Rate = 0.00259481689405314
Negative Predictive Value = 0.997405183105947
True Positive Rate, Recall, Sensitivity = 0.921364985163205
False Positive Rate = 0.00166614939806269
False Negative Rate = 0.0786350148367952
Specificity, Selectivity, True Negative rate = 0.998333850601937
Positive likelihood ratio = 552.990617908885
Negative likelihood ratio = 0.0787662511787844
Diagnostic Odds Ratio = 7020.65427302997
\(F_1\) Score = 0.934537246049661
From our plot we selected our optimal threshold as 0.233.
plot_folds_CM(classifier.svm.linear,opt_thresh.svm.linear)
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> (this warning is emitted once per fold, 10 times in total)
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.svm.linear, type='prob') %>%
yardstick::roc_curve(truth=train$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
temp = predict(classifier.svm.linear, type='prob') %>% yardstick::roc_auc(truth=train$BlueTarp_rest, Up)
svm.linear.Accuracy = accuracy
svm.linear.AUC = temp$.estimate
twoclassSum = twoClassSummary( data = data.frame(obs = classifier.svm.linear$pred$obs,
pred = classifier.svm.linear$pred$pred,
Down = classifier.svm.linear$pred$Down,
Up = classifier.svm.linear$pred$Up), lev = levels(classifier.svm.linear$pred$obs))
svm.linear.ROC = "TRUE" #unname(twoclassSum[1])
svm.linear.Threshold = opt_thresh.svm.linear
svm.linear.Sensitivity = svm.linear.Recall= svm.linear.Power = TPR
svm.linear.Specificity= 1-FPR
svm.linear.FDR = FDR
svm.linear.Precision=PPV
df = data.frame(c(svm.linear.Accuracy, svm.linear.AUC, svm.linear.ROC, svm.linear.Threshold, svm.linear.Sensitivity, svm.linear.Specificity, svm.linear.FDR , svm.linear.Precision))
colnames(df) <- c(paste0("SVM Linear ( Cost = ",classifier.svm.linear.bestC,")" ))
rownames(df) <- c("Accuracy","AUC", "ROC", "Threshold","Sensitivity=Recall=Power","Specificity=1-FPR","FDR","Precision=PPV")
df %>% kbl() %>% kable_styling()
| | SVM Linear ( Cost = 1) |
|---|---|
| Accuracy | 0.99587293053557 |
| AUC | 0.997848791117502 |
| ROC | TRUE |
| Threshold | 0.233 |
| Sensitivity=Recall=Power | 0.921364985163205 |
| Specificity=1-FPR | 0.998333850601937 |
| FDR | 0.0519083969465649 |
| Precision=PPV | 0.948091603053435 |
The radial kernel of the SVM takes the form
\[K(x_i,x_{i^\prime}) = \exp\left(-\gamma\sum_{j=1}^{p}(x_{ij} - x_{{i^\prime}j})^2\right)\]
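As a direct transcription (illustrative only; \(\gamma\) plays the role of kernlab's sigma in the rbfdot kernel):
radial_kernel <- function(x, x_prime, gamma) exp(-gamma * sum((x - x_prime)^2))
radial_kernel(c(1, 2, 3), c(1, 2, 4), gamma = 1.2)  # exp(-1.2 * 1)
#> [1] 0.3011942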
set.seed(1)
grid <- expand.grid(sigma = seq(0.8,1.8,.2),
C = c(10^((2):6))
)
# https://blog.revolutionanalytics.com/2016/05/using-caret-to-compare-models.html
classifier.svm.radial <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data=train, method="svmRadial",
preProcess=c("center","scale"),
tuneGrid=grid,
trControl = caret::trainControl("cv", number=kfoldcv,
returnResamp='all', savePredictions='final',
classProbs=TRUE))
stopCluster(cl)
classifier.svm.radial
#> Support Vector Machines with Radial Basis Function Kernel
#>
#> 63241 samples
#> 3 predictor
#> 2 classes: 'Down', 'Up'
#>
#> Pre-processing: centered (3), scaled (3)
#> Resampling: Cross-Validated (10 fold)
#> Summary of sample sizes: 56918, 56917, 56917, 56917, 56916, 56917, ...
#> Resampling results across tuning parameters:
#>
#> sigma C Accuracy Kappa
#> 0.8 1e+02 0.9969324 0.9500417
#> 0.8 1e+03 0.9970747 0.9522542
#> 0.8 1e+04 0.9972803 0.9557741
#> 0.8 1e+05 0.9972486 0.9551823
#> 0.8 1e+06 0.9606135 0.7975858
#> 1.0 1e+02 0.9970115 0.9512629
#> 1.0 1e+03 0.9970905 0.9525768
#> 1.0 1e+04 0.9972961 0.9561093
#> 1.0 1e+05 0.9973277 0.9563655
#> 1.0 1e+06 0.9929947 0.8547633
#> 1.2 1e+02 0.9970273 0.9515025
#> 1.2 1e+03 0.9972170 0.9546950
#> 1.2 1e+04 0.9972170 0.9547799
#> 1.2 1e+05 0.9973751 0.9570940
#> 1.2 1e+06 0.8931935 0.6229545
#> 1.4 1e+02 0.9970431 0.9517665
#> 1.4 1e+03 0.9972645 0.9555070
#> 1.4 1e+04 0.9972645 0.9555061
#> 1.4 1e+05 0.9972487 0.9549891
#> 1.4 1e+06 0.9597518 0.7016095
#> 1.6 1e+02 0.9970431 0.9517705
#> 1.6 1e+03 0.9972645 0.9554496
#> 1.6 1e+04 0.9973751 0.9572456
#> 1.6 1e+05 0.9972170 0.9543396
#> 1.6 1e+06 0.9453186 0.6469249
#> 1.8 1e+02 0.9971380 0.9533904
#> 1.8 1e+03 0.9972961 0.9560316
#> 1.8 1e+04 0.9973277 0.9564755
#> 1.8 1e+05 0.9972170 0.9543601
#> 1.8 1e+06 0.9545834 0.7186193
#>
#> Accuracy was used to select the optimal model using the largest value.
#> The final values used for the model were sigma = 1.2 and C = 1e+05.
classifier.svm.radial.bestC <- classifier.svm.radial$bestTune$C
classifier.svm.radial.bestSigma <- classifier.svm.radial$bestTune$sigma
Training across various values of cost and sigma, we can see that our optimal C is 1e+05 and our optimal sigma is 1.2. So we'll proceed with the analysis using these values.
classifier.svm.radial$resample %>%
dplyr::group_split(Resample) %>%
purrr::map(rowid_to_column) %>%
dplyr::bind_rows() %>%
ggplot(aes(rowid, Accuracy)) + geom_point() +
geom_smooth(formula='y~x', method='loess', span=.03) +
geom_line(classifier.svm.radial$results, mapping=aes(seq_along(Accuracy), Accuracy),
size=2, color='red')
Next we’ll look at the Confusion Matrix from the model.
out_of_folds_CM.svm.radial <- get_out_of_folds_CM(classifier.svm.radial)
classifier.svm.radial.sensitivity.mean <- get_metrics_func(out_of_folds_CM.svm.radial, 'sensitivity', mean)
classifier.svm.radial.sensitivity.sd <- get_metrics_func(out_of_folds_CM.svm.radial, 'sensitivity', sd)
classifier.svm.radial.specificity.mean <- get_metrics_func(out_of_folds_CM.svm.radial, 'specificity', mean)
classifier.svm.radial.specificity.sd <- get_metrics_func(out_of_folds_CM.svm.radial, 'specificity', sd)
Reviewing the confusion matrix, we have:
Sensitivity Mean: 0.950051212017754
Sensitivity Standard Deviation: 0.0247402222819946
Specificity Mean: 0.998938242129043
Specificity Standard Deviation: 0.000258257553779507
Next we’ll look at the ROC curve.
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.svm.radial, type='prob') %>%
yardstick::roc_curve(truth=train$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
predict(classifier.svm.radial, type='prob') %>%
yardstick::roc_auc(truth=train$BlueTarp_rest, Up)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 roc_auc binary 1.00
ROC_curve %>% head()
#> # A tibble: 6 x 4
#> .threshold specificity sensitivity one_minus_specificity
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -Inf 0 1 1
#> 2 0. 0 1 1
#> 3 1.11e-16 0.687 1 0.313
#> 4 2.22e-16 0.696 1 0.304
#> 5 3.33e-16 0.700 1 0.300
#> 6 4.44e-16 0.703 1 0.297
predict(classifier.svm.radial, type='prob') %>% head()
#> Down Up
#> 1 1.0000000 0.000000e+00
#> 2 0.9999186 8.143069e-05
#> 3 1.0000000 0.000000e+00
#> 4 0.9896867 1.031325e-02
#> 5 1.0000000 0.000000e+00
#> 6 1.0000000 0.000000e+00
We can also look at the ROC curve for each fold of the radial-kernel SVM model.
ROC_curveS <- classifier.svm.radial$pred %>%
dplyr::group_split(Resample) %>%
purrr::map(~ yardstick::roc_curve(.x, truth=obs, Up)) %>%
purrr::map(~ dplyr::mutate(.x, one_minus_specificity = 1-specificity)) %>%
purrr::map(~ geom_line(.x, mapping=aes(one_minus_specificity, sensitivity)))
ROC_curveS[[1]] <- ggplot() + ROC_curveS[[1]]
Reduce("+", ROC_curveS) +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC Curves')
Next, we’ll look at the P-R curve.
PR_curve <- predict(classifier.svm.radial, type='prob') %>%
yardstick::pr_curve(truth=train$BlueTarp_rest, estimate=Up)
PR_curve_plot <- PR_curve %>%
ggplot(aes(x=recall, y=precision)) +
geom_line() + geom_point() +
#geom_abline(slope=0, intercept=0.5, linetype='dashed', color='red') +
ggtitle('Precision Recall Curve')
PR_curve_plot
We can also look at the P-R curve for each fold of the radial-kernel SVM model. Overlaying the 10 curves gives a sense of the fold-to-fold variation.
PR_curveS <- classifier.svm.radial$pred %>%
dplyr::group_split(Resample) %>%
purrr::map(~ yardstick::pr_curve(.x, truth=obs, Up)) %>%
purrr::map(~ geom_line(.x, mapping=aes(recall, precision)))
PR_curveS[[1]] <- ggplot() + PR_curveS[[1]]
Reduce("+", PR_curveS) +
#geom_abline(slope=-1, intercept=1, linetype='dashed', color='red') +
#xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('Precision Recall Curves')
classifier.svm.radial.recall.mean <- get_metrics_func(out_of_folds_CM.svm.radial, 'recall', mean)
classifier.svm.radial.recall.sd <- get_metrics_func(out_of_folds_CM.svm.radial, 'recall', sd)
classifier.svm.radial.precision.mean <- get_metrics_func(out_of_folds_CM.svm.radial, 'precision', mean)
classifier.svm.radial.precision.sd <- get_metrics_func(out_of_folds_CM.svm.radial, 'precision', sd)
Recall Mean: 0.950051212017754
Recall Standard Deviation: 0.0247402222819946
Precision Mean: 0.967355639517883
Precision Standard Deviation: 0.0074508221585993
To find the optimal threshold, we'll use the \(F_1\) score.
preds <- predict(classifier.svm.radial, type='prob')[,2]
opt_thresh.svm.radial <- get_optimal_threshold(classifier.svm.radial)
yhat <- tuned_call(preds, opt_thresh.svm.radial)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.001519136
sensitivity
#> [1] 0.9772502
results <- tibble::tibble(y=y, yhat=yhat)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 61126 46
#> Up 93 1976
#>
#> Accuracy : 0.9978
#> 95% CI : (0.9974, 0.9982)
#> No Information Rate : 0.968
#> P-Value [Acc > NIR] : < 2.2e-16
#>
#> Kappa : 0.9649
#>
#> Mcnemar's Test P-Value : 9.553e-05
#>
#> Sensitivity : 0.97725
#> Specificity : 0.99848
#> Pos Pred Value : 0.95505
#> Neg Pred Value : 0.99925
#> Prevalence : 0.03197
#> Detection Rate : 0.03125
#> Detection Prevalence : 0.03272
#> Balanced Accuracy : 0.98787
#>
#> 'Positive' Class : Up
#>
cm <- results %>% yardstick::conf_mat(truth=y, yhat)
trueNegative = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve = cm$table[2,2]
conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive
predictedConditionPositive = truePositve + falsePositive
predictedConditionNegative = trueNegative + falseNegative
TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS
F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
Now using the optimal threshold we get:
True Negative = 61126
False Negative = 46
False Positive = 93
True Positive = 1976
Condition Positive = 2022
Condition Negative = 61219
Predicted Condition Positive = 2069
Predicted Condition Negative = 61172
Total Population = 63241
Prevalence = 0.0319729289543176
Accuracy = 0.997802058790974
Positive predictive value, precision = 0.955050749154181
False Discovery rate = 0.0449492508458192
False Omission Rate = 0.00075197802916367
Negative Predictive Value = 0.999248021970836
True Positive Rate, Recall, Sensitivity = 0.977250247279921
False Positive Rate = 0.00151913621588069
False Negative Rate = 0.0227497527200791
Specificity, Selectivity, True Negative rate = 0.998480863784119
Positive likelihood ratio = 643.293364389564
Negative likelihood ratio = 0.0227843652745235
Diagnostic Odds Ratio = 28233.9822346891
\(F_1\) Score = 0.966022977267172
From our plot we selected our optimal threshold as 0.335.
plot_folds_CM(classifier.svm.radial,opt_thresh.svm.radial)
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> (this warning is emitted once per fold, 10 times in total)
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.svm.radial, type='prob') %>%
yardstick::roc_curve(truth=train$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
temp = predict(classifier.svm.radial, type='prob') %>% yardstick::roc_auc(truth=train$BlueTarp_rest, Up)
svm.radial.Accuracy = accuracy
svm.radial.AUC = temp$.estimate
twoclassSum = twoClassSummary( data = data.frame(obs = classifier.svm.radial$pred$obs,
pred = classifier.svm.radial$pred$pred,
Down = classifier.svm.radial$pred$Down,
Up = classifier.svm.radial$pred$Up), lev = levels(classifier.svm.radial$pred$obs))
svm.radial.ROC = "TRUE" #unname(twoclassSum[1])
svm.radial.Threshold = opt_thresh.svm.radial
svm.radial.Sensitivity = svm.radial.Recall= svm.radial.Power = TPR
svm.radial.Specificity= 1-FPR
svm.radial.FDR = FDR
svm.radial.Precision=PPV
df = data.frame(c(svm.radial.Accuracy, svm.radial.AUC, svm.radial.ROC, svm.radial.Threshold, svm.radial.Sensitivity, svm.radial.Specificity, svm.radial.FDR , svm.radial.Precision))
colnames(df) <- c(paste0("SVM Radial ( Cost = ",classifier.svm.radial.bestC,", Sigma = ",classifier.svm.radial.bestSigma ,")" ))
rownames(df) <- c("Accuracy","AUC", "ROC", "Threshold","Sensitivity=Recall=Power","Specificity=1-FPR","FDR","Precision=PPV")
df %>% kbl() %>% kable_styling()
| | SVM Radial ( Cost = 1e+05, Sigma = 1.2) |
|---|---|
| Accuracy | 0.997802058790974 |
| AUC | 0.999820301064707 |
| ROC | TRUE |
| Threshold | 0.335 |
| Sensitivity=Recall=Power | 0.977250247279921 |
| Specificity=1-FPR | 0.998480863784119 |
| FDR | 0.0449492508458192 |
| Precision=PPV | 0.955050749154181 |
The polynomial kernel of the SVM takes the form of
\[K(x_i,x_{i^\prime}) = \left(1+\sum_{j=1}^{p}x_{ij}x_{{i^\prime}j}\right)^d\]
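As a direct transcription (illustrative only; kernlab's polydot generalizes this to (scale * <x, x'> + offset)^degree with both defaulting to 1, and the scale is what caret tunes below):
poly_kernel <- function(x, x_prime, d, scale = 1) (1 + scale * sum(x * x_prime))^d
poly_kernel(c(1, 2), c(3, 4), d = 4)  # (1 + 11)^4
#> [1] 20736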
set.seed(1)
grid <- expand.grid(scale = seq(0.6,1.2,0.2),
C = c(10^((2):4)),
degree = seq(4,6,1)
)
# https://blog.revolutionanalytics.com/2016/05/using-caret-to-compare-models.html
classifier.svm.poly <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data=train, method="svmPoly",
preProcess=c("center","scale"),
tuneGrid=grid,
trControl = caret::trainControl("cv", number=kfoldcv,
returnResamp='all', savePredictions='final',
classProbs=TRUE))
stopCluster(cl)
classifier.svm.poly
#> Support Vector Machines with Polynomial Kernel
#>
#> 63241 samples
#> 3 predictor
#> 2 classes: 'Down', 'Up'
#>
#> Pre-processing: centered (3), scaled (3)
#> Resampling: Cross-Validated (10 fold)
#> Summary of sample sizes: 56918, 56917, 56917, 56917, 56916, 56917, ...
#> Resampling results across tuning parameters:
#>
#> scale C degree Accuracy Kappa
#> 0.6 100 4 0.9969798 0.95073253
#> 0.6 100 5 0.9969798 0.95039113
#> 0.6 100 6 0.9969482 0.94973290
#> 0.6 1000 4 0.9971222 0.95283161
#> 0.6 1000 5 0.9969166 0.94921460
#> 0.6 1000 6 0.9953667 0.92889594
#> 0.6 10000 4 0.9637691 0.66318022
#> 0.6 10000 5 0.9450249 0.33139394
#> 0.6 10000 6 0.8959164 0.35610998
#> 0.8 100 4 0.9969798 0.95056708
#> 0.8 100 5 0.9969957 0.95060633
#> 0.8 100 6 0.9969166 0.94916759
#> 0.8 1000 4 0.9970905 0.95235627
#> 0.8 1000 5 0.9799512 0.82988695
#> 0.8 1000 6 0.9256509 0.60509759
#> 0.8 10000 4 0.9056307 0.43616018
#> 0.8 10000 5 0.9268660 0.15758162
#> 0.8 10000 6 0.9594508 0.20609072
#> 1.0 100 4 0.9970747 0.95209703
#> 1.0 100 5 0.9969324 0.94944514
#> 1.0 100 6 0.9967110 0.94559460
#> 1.0 1000 4 0.9968376 0.94849150
#> 1.0 1000 5 0.9475411 0.61267836
#> 1.0 1000 6 0.9569166 0.58566453
#> 1.0 10000 4 0.9779145 0.40916541
#> 1.0 10000 5 0.9518177 0.25844373
#> 1.0 10000 6 0.8896854 0.05280171
#> 1.2 100 4 0.9971064 0.95260152
#> 1.2 100 5 0.9958889 0.92687426
#> 1.2 100 6 0.9585005 0.76295369
#> 1.2 1000 4 0.9627126 0.77478578
#> 1.2 1000 5 0.9767552 0.56539850
#> 1.2 1000 6 0.9179317 0.38212066
#> 1.2 10000 4 0.9635912 0.24989693
#> 1.2 10000 5 0.9750423 0.31543508
#> 1.2 10000 6 0.9179759 0.01846190
#>
#> Accuracy was used to select the optimal model using the largest value.
#> The final values used for the model were degree = 4, scale = 0.6 and C = 1000.
classifier.svm.poly.bestC <- classifier.svm.poly$bestTune$C
classifier.svm.poly.bestDegree <- classifier.svm.poly$bestTune$degree
classifier.svm.poly.bestScale <- classifier.svm.poly$bestTune$scale
Training across various values of cost, degree, and scale, we can see that our optimal C is 1000, our optimal degree is 4, and our optimal scale is 0.6. So we'll proceed with the analysis using these values.
We can also look at accuracy as a function of scale for the various values of cost and polynomial degree, as sketched below.
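One quick way to get that view (a convenience sketch using caret's ggplot method for train objects, which facets over the tuning grid):
ggplot(classifier.svm.poly) + ggtitle('CV Accuracy across the scale/C/degree grid')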
classifier.svm.poly$resample %>%
dplyr::group_split(Resample) %>%
purrr::map(rowid_to_column) %>%
dplyr::bind_rows() %>%
ggplot(aes(rowid, Accuracy)) + geom_point() +
geom_smooth(formula='y~x', method='loess', span=.03) +
geom_line(classifier.svm.poly$results, mapping=aes(seq_along(Accuracy), Accuracy),
size=2, color='red')
Next we’ll look at the Confusion Matrix from the model.
out_of_folds_CM.svm.poly <- get_out_of_folds_CM(classifier.svm.poly)
classifier.svm.poly.sensitivity.mean <- get_metrics_func(out_of_folds_CM.svm.poly, 'sensitivity', mean)
classifier.svm.poly.sensitivity.sd <- get_metrics_func(out_of_folds_CM.svm.poly, 'sensitivity', sd)
classifier.svm.poly.specificity.mean <- get_metrics_func(out_of_folds_CM.svm.poly, 'specificity', mean)
classifier.svm.poly.specificity.sd <- get_metrics_func(out_of_folds_CM.svm.poly, 'specificity', sd)
Reviewing the confusion matrix, we have:
Sensitivity Mean: 0.942640101448568
Sensitivity Standard Deviation: 0.0259690507584868
Specificity Mean: 0.998921910266449
Specificity Standard Deviation: 0.000463281024253893
Next we’ll look at the ROC curve.
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.svm.poly, type='prob') %>%
yardstick::roc_curve(truth=train$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
predict(classifier.svm.poly, type='prob') %>%
yardstick::roc_auc(truth=train$BlueTarp_rest, Up)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 roc_auc binary 1.00
ROC_curve %>% head()
#> # A tibble: 6 x 4
#> .threshold specificity sensitivity one_minus_specificity
#> <dbl> <dbl> <dbl> <dbl>
#> 1 -Inf 0 1 1
#> 2 0. 0 1 1
#> 3 1.11e-16 0.202 1 0.798
#> 4 2.22e-16 0.220 1 0.780
#> 5 3.33e-16 0.227 1 0.773
#> 6 4.44e-16 0.231 1 0.769
predict(classifier.svm.poly, type='prob') %>% head()
#> Down Up
#> 1 0.9999989 1.120733e-06
#> 2 0.9999999 9.594520e-08
#> 3 0.9999898 1.023643e-05
#> 4 0.9995981 4.019017e-04
#> 5 0.9979354 2.064604e-03
#> 6 1.0000000 5.073072e-10
We can also look at the ROC curve for each fold of the polynomial-kernel SVM model.
ROC_curveS <- classifier.svm.poly$pred %>%
dplyr::group_split(Resample) %>%
purrr::map(~ yardstick::roc_curve(.x, truth=obs, Up)) %>%
purrr::map(~ dplyr::mutate(.x, one_minus_specificity = 1-specificity)) %>%
purrr::map(~ geom_line(.x, mapping=aes(one_minus_specificity, sensitivity)))
ROC_curveS[[1]] <- ggplot() + ROC_curveS[[1]]
Reduce("+", ROC_curveS) +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC Curves')
Next, we’ll look at the P-R curve.
PR_curve <- predict(classifier.svm.poly, type='prob') %>%
yardstick::pr_curve(truth=train$BlueTarp_rest, estimate=Up)
PR_curve_plot <- PR_curve %>%
ggplot(aes(x=recall, y=precision)) +
geom_line() + geom_point() +
#geom_abline(slope=0, intercept=0.5, linetype='dashed', color='red') +
ggtitle('Precision Recall Curve')
PR_curve_plot
We can also look at the P-R curve for each fold of the polynomial-kernel SVM model. Overlaying the 10 curves gives a sense of the fold-to-fold variation.
PR_curveS <- classifier.svm.poly$pred %>%
dplyr::group_split(Resample) %>%
purrr::map(~ yardstick::pr_curve(.x, truth=obs, Up)) %>%
purrr::map(~ geom_line(.x, mapping=aes(recall, precision)))
PR_curveS[[1]] <- ggplot() + PR_curveS[[1]]
Reduce("+", PR_curveS) +
#geom_abline(slope=-1, intercept=1, linetype='dashed', color='red') +
#xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('Precision Recall Curves')
classifier.svm.poly.recall.mean <- get_metrics_func(out_of_folds_CM.svm.poly, 'recall', mean)
classifier.svm.poly.recall.sd <- get_metrics_func(out_of_folds_CM.svm.poly, 'recall', sd)
classifier.svm.poly.precision.mean <- get_metrics_func(out_of_folds_CM.svm.poly, 'precision', mean)
classifier.svm.poly.precision.sd <- get_metrics_func(out_of_folds_CM.svm.poly, 'precision', sd)
Recall Mean: 0.942640101448568
Recall Standard Deviation: 0.0259690507584868
Precision Mean: 0.966767401633039
Precision Standard Deviation: 0.013585405388514
To find the optimal threshold, we'll use the \(F_1\) score.
preds <- predict(classifier.svm.poly, type='prob')[,2]
opt_thresh.svm.poly <- get_optimal_threshold(classifier.svm.poly)
yhat <- tuned_call(preds, opt_thresh.svm.poly)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.001176105
sensitivity
#> [1] 0.9495549
results <- tibble::tibble(y=y, yhat=yhat)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 61147 102
#> Up 72 1920
#>
#> Accuracy : 0.9972
#> 95% CI : (0.9968, 0.9976)
#> No Information Rate : 0.968
#> P-Value [Acc > NIR] : < 2e-16
#>
#> Kappa : 0.9552
#>
#> Mcnemar's Test P-Value : 0.02791
#>
#> Sensitivity : 0.94955
#> Specificity : 0.99882
#> Pos Pred Value : 0.96386
#> Neg Pred Value : 0.99833
#> Prevalence : 0.03197
#> Detection Rate : 0.03036
#> Detection Prevalence : 0.03150
#> Balanced Accuracy : 0.97419
#>
#> 'Positive' Class : Up
#>
cm <- results %>% yardstick::conf_mat(truth=y, yhat)
trueNegative = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve = cm$table[2,2]
conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive
predictedConditionPositive = truePositve + falsePositive
predictedConditionNegative = trueNegative + falseNegative
TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS
F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
Now using the optimal threshold we get:
True Negative = 61147
False Negative = 102
False Positive = 72
True Positive = 1920
Condition Positive = 2022
Condition Negative = 61219
Predicted Condition Positive = 1992
Predicted Condition Negative = 61249
Total Population = 63241
Prevalence = 0.0319729289543176
Accuracy = 0.997248620357047
Positive predictive value, precision = 0.963855421686747
False Discovery rate = 0.036144578313253
False Omission Rate = 0.00166533331156427
Negative Predictive Value = 0.998334666688436
True Positive Rate, Recall, Sensitivity = 0.949554896142433
False Positive Rate = 0.00117610545745602
False Negative Rate = 0.0504451038575668
Specificity, Selectivity, True Negative rate = 0.998823894542544
Positive likelihood ratio = 807.37223870755
Negative likelihood ratio = 0.0505045024785579
Diagnostic Odds Ratio = 15986.1437908497
\(F_1\) Score = 0.956651718983558
From our plot we selected our optimal threshold as 0.39.
plot_folds_CM(classifier.svm.poly,opt_thresh.svm.poly)
#> Warning: 'tidy.table' is deprecated.
#> See help("Deprecated")
#> (this warning is emitted once per fold, 10 times in total)
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.svm.poly, type='prob') %>%
yardstick::roc_curve(truth=train$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
temp = predict(classifier.svm.poly, type='prob') %>% yardstick::roc_auc(truth=train$BlueTarp_rest, Up)
svm.poly.Accuracy = accuracy
svm.poly.AUC = temp$.estimate
twoclassSum = twoClassSummary( data = data.frame(obs = classifier.svm.poly$pred$obs,
pred = classifier.svm.poly$pred$pred,
Down = classifier.svm.poly$pred$Down,
Up = classifier.svm.poly$pred$Up), lev = levels(classifier.svm.poly$pred$obs))
svm.poly.ROC = "TRUE" #unname(twoclassSum[1])
svm.poly.Threshold = opt_thresh.svm.poly
svm.poly.Sensitivity = svm.poly.Recall= svm.poly.Power = TPR
svm.poly.Specificity= 1-FPR
svm.poly.FDR = FDR
svm.poly.Precision=PPV
df = data.frame(c(svm.poly.Accuracy, svm.poly.AUC, svm.poly.ROC, svm.poly.Threshold, svm.poly.Sensitivity, svm.poly.Specificity, svm.poly.FDR , svm.poly.Precision))
colnames(df) <- c(paste0("SVM Polynomial ( Cost = ",classifier.svm.poly.bestC,", Degree = ",classifier.svm.poly.bestDegree , ", Scale = ",classifier.svm.poly.bestScale,")" ))
rownames(df) <- c("Accuracy","AUC", "ROC", "Threshold","Sensitivity=Recall=Power","Specificity=1-FPR","FDR","Precision=PPV")
df %>% kbl() %>% kable_styling()
| | SVM Polynomial ( Cost = 1000, Degree = 4, Scale = 0.6) |
|---|---|
| Accuracy | 0.997248620357047 |
| AUC | 0.999714294526813 |
| ROC | TRUE |
| Threshold | 0.39 |
| Sensitivity=Recall=Power | 0.949554896142433 |
| Specificity=1-FPR | 0.998823894542544 |
| FDR | 0.036144578313253 |
| Precision=PPV | 0.963855421686747 |
df = data.frame(c(knn.Accuracy, knn.AUC, knn.ROC, knn.Threshold, knn.Sensitivity, knn.Specificity, knn.FDR , knn.Precision),
c(lda.Accuracy, lda.AUC, lda.ROC, lda.Threshold, lda.Sensitivity, lda.Specificity, lda.FDR , lda.Precision),
c(qda.Accuracy, qda.AUC, qda.ROC, qda.Threshold, qda.Sensitivity, qda.Specificity, qda.FDR , qda.Precision),
c(glm.Accuracy, glm.AUC, glm.ROC, glm.Threshold, glm.Sensitivity, glm.Specificity, glm.FDR , glm.Precision))
colnames(df) <- c(paste0("KNN (k =",bestk,")"), "LDA","QDA","Logistic Regression")
rownames(df) <- c("Accuracy","AUC", "ROC", "Threshold","Sensitivity=Recall=Power","Specificity=1-FPR","FDR","Precision=PPV")
df %>% kbl() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| | KNN (k =5) | LDA | QDA | Logistic Regression |
|---|---|---|---|---|
| Accuracy | 0.99791274647776 | 0.983997722996158 | 0.994797678721083 | 0.995841305482203 |
| AUC | 0.999862454860983 | 0.988876770009065 | 0.998217543931761 | 0.998506949374034 |
| ROC | TRUE | TRUE | TRUE | TRUE |
| Threshold | 0.499 | 0.495 | 0.22 | 0.214 |
| Sensitivity=Recall=Power | 0.969337289812067 | 0.80217606330366 | 0.867457962413452 | 0.926310583580613 |
| Specificity=1-FPR | 0.998856564138584 | 0.990003103611624 | 0.999003577320766 | 0.998137833025695 |
| FDR | 0.0344827586206897 | 0.273948075201432 | 0.0336088154269972 | 0.0573729240060393 |
| Precision=PPV | 0.96551724137931 | 0.726051924798568 | 0.966391184573003 | 0.942627075993961 |
df = data.frame(c(rf.Accuracy, rf.AUC, rf.ROC, rf.Threshold, rf.Sensitivity, rf.Specificity, rf.FDR , rf.Precision),
c(svm.linear.Accuracy, svm.linear.AUC, svm.linear.ROC, svm.linear.Threshold, svm.linear.Sensitivity, svm.linear.Specificity, svm.linear.FDR , svm.linear.Precision),
c(svm.radial.Accuracy, svm.radial.AUC, svm.radial.ROC, svm.radial.Threshold, svm.radial.Sensitivity, svm.radial.Specificity, svm.radial.FDR , svm.radial.Precision),
c(svm.poly.Accuracy, svm.poly.AUC, svm.poly.ROC, svm.poly.Threshold, svm.poly.Sensitivity, svm.poly.Specificity, svm.poly.FDR , svm.poly.Precision))
colnames(df) <- c(paste0("Random Forest ( mtry = ",classifier.rf.bestMtry,", ntree=",classifier.rf.nTree,")")
, paste0("SVM Linear ( Cost = ",classifier.svm.linear.bestC,")" ),paste0("SVM Radial ( Cost = ",classifier.svm.radial.bestC,", Sigma = ",classifier.svm.radial.bestSigma ,")" )
,paste0("SVM Polynomial ( Cost = ",classifier.svm.poly.bestC,", Degree = ",classifier.svm.poly.bestDegree , ", Scale = ",classifier.svm.poly.bestScale,")" ))
rownames(df) <- c("Accuracy","AUC", "ROC", "Threshold","Sensitivity=Recall=Power","Specificity=1-FPR","FDR","Precision=PPV")
df %>% kbl() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| | Random Forest ( mtry = 2, ntree=500) | SVM Linear ( Cost = 1) | SVM Radial ( Cost = 1e+05, Sigma = 1.2) | SVM Polynomial ( Cost = 1000, Degree = 4, Scale = 0.6) |
|---|---|---|---|---|
| Accuracy | 0.9991303110324 | 0.99587293053557 | 0.997802058790974 | 0.997248620357047 |
| AUC | 0.994483115853513 | 0.997848791117502 | 0.999820301064707 | 0.999714294526813 |
| ROC | TRUE | TRUE | TRUE | TRUE |
| Threshold | 0.495 | 0.233 | 0.335 | 0.39 |
| Sensitivity=Recall=Power | 0.983679525222552 | 0.921364985163205 | 0.977250247279921 | 0.949554896142433 |
| Specificity=1-FPR | 0.999640634443555 | 0.998333850601937 | 0.998480863784119 | 0.998823894542544 |
| FDR | 0.0109398309298856 | 0.0519083969465649 | 0.0449492508458192 | 0.036144578313253 |
| Precision=PPV | 0.989060169070114 | 0.948091603053435 | 0.955050749154181 | 0.963855421686747 |
y <- fho$BlueTarp_rest
preds <- predict(classifier.knn, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 1977410 2501
#> Up 12287 11979
#>
#> Accuracy : 0.9926
#> 95% CI : (0.9925, 0.9927)
#> No Information Rate : 0.9928
#> P-Value [Acc > NIR] : 0.9948
#>
#> Kappa : 0.6148
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.827279
#> Specificity : 0.993825
#> Pos Pred Value : 0.493654
#> Neg Pred Value : 0.998737
#> Prevalence : 0.007225
#> Detection Rate : 0.005977
#> Detection Prevalence : 0.012108
#> Balanced Accuracy : 0.910552
#>
#> 'Positive' Class : Up
#>
thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
yhat <- tuned_call(preds, thresholds[i])
results <- tibble::tibble(y=y, yhat=yhat)
cm <- results %>%yardstick::conf_mat(truth=y, yhat)
trueNegative = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve = cm$table[2,2]
conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive
predictedConditionPositive = truePositve + falsePositive
predictedConditionNegative = trueNegative + falseNegative
TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
PPV = truePositve / predictedConditionPositive
TPR = truePositve / conditionPositive
F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
F_1_scores[i] <- F_1_score
}
#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.knn.Threshold = min(c(knn.Threshold, thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.knn.ROC = "TRUE"
statistics = stat_fho(classifier.knn, fho, fho.knn.Threshold)
fho.knn.accuracy = statistics[1]
fho.knn.sensitivity = statistics[2]
fho.knn.specificity = statistics[3]
fho.knn.FDR = statistics[4]
fho.knn.precision = statistics[5]
yhat <- tuned_call(preds, fho.knn.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 1977282 2499
#> Up 12415 11981
#>
#> Accuracy : 0.9926
#> 95% CI : (0.9924, 0.9927)
#> No Information Rate : 0.9928
#> P-Value [Acc > NIR] : 0.9998
#>
#> Kappa : 0.6129
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.827417
#> Specificity : 0.993760
#> Pos Pred Value : 0.491105
#> Neg Pred Value : 0.998738
#> Prevalence : 0.007225
#> Detection Rate : 0.005978
#> Detection Prevalence : 0.012173
#> Balanced Accuracy : 0.910589
#>
#> 'Positive' Class : Up
#>
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.knn, fho, type='prob') %>%
yardstick::roc_curve(truth=fho$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
PR_curve <- predict(classifier.knn, fho, type='prob') %>%
yardstick::pr_curve(truth=fho$BlueTarp_rest, estimate=Up)
PR_curve_plot <- PR_curve %>%
ggplot(aes(x=recall, y=precision)) +
geom_line() + geom_point() +
#geom_abline(slope=0, intercept=0.5, linetype='dashed', color='red') +
ggtitle('Precision Recall Curve')
PR_curve_plot
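The fho.knn.AUC value reported in the table below is computed elsewhere in the analysis; for reference, an area under this ROC curve can also be obtained directly from the hold-out probabilities with yardstick::roc_auc(). This is a minimal sketch under the same assumptions as above (classifier.knn and fho available, 'Up' treated as the event class via the yardstick.event_first option):
# Sketch only: AUC for the KNN model on the final hold-out set.
# Assumes classifier.knn and fho exist as above; with
# options(yardstick.event_first = FALSE), 'Up' is the event class.
predict(classifier.knn, fho, type = 'prob') %>%
  dplyr::mutate(truth = fho$BlueTarp_rest) %>%
  yardstick::roc_auc(truth, Up)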
df = data.frame(c(fho.knn.accuracy, fho.knn.AUC, fho.knn.ROC, fho.knn.Threshold, fho.knn.sensitivity, fho.knn.specificity, fho.knn.FDR , fho.knn.precision))
colnames(df) <- c("FHO KNN")
rownames(df) <- c("Accuracy","AUC", "ROC", "Threshold","Sensitivity=Recall=Power","Specificity=1-FPR","FDR","Precision=PPV")
df %>% kbl() %>% kable_styling()
| | FHO KNN |
|---|---|
Accuracy | 0.992558541486106 |
AUC | 0.961298412282614 |
ROC | TRUE |
Threshold | 0.499 |
Sensitivity=Recall=Power | 0.827417127071823 |
Specificity=1-FPR | 0.993760356476388 |
FDR | 0.50889490080341 |
Precision=PPV | 0.49110509919659 |
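The stat_fho() helper used above is defined earlier in the analysis and is not reproduced here. Judging only by how its return values are indexed below (accuracy, sensitivity, specificity, FDR, precision), it computes something along these lines; this sketch uses a different name and is not the actual definition:
# Hedged sketch (NOT the actual stat_fho definition): hold-out metrics at a threshold.
stat_fho_sketch <- function(classifier, holdout, threshold) {
  preds <- predict(classifier, newdata = holdout, type = "prob")[, 2]
  yhat  <- tuned_call(preds, threshold)
  cm    <- caret::confusionMatrix(yhat, holdout$BlueTarp_rest, positive = "Up")
  ppv   <- unname(cm$byClass["Pos Pred Value"])
  c(accuracy    = unname(cm$overall["Accuracy"]),
    sensitivity = unname(cm$byClass["Sensitivity"]),
    specificity = unname(cm$byClass["Specificity"]),
    FDR         = 1 - ppv,
    precision   = ppv)
}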
y <- fho$BlueTarp_rest
preds <- predict(classifier.lda, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 1955452 2332
#> Up 34245 12148
#>
#> Accuracy : 0.9817
#> 95% CI : (0.9816, 0.9819)
#> No Information Rate : 0.9928
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.3924
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.838950
#> Specificity : 0.982789
#> Pos Pred Value : 0.261850
#> Neg Pred Value : 0.998809
#> Prevalence : 0.007225
#> Detection Rate : 0.006061
#> Detection Prevalence : 0.023148
#> Balanced Accuracy : 0.910870
#>
#> 'Positive' Class : Up
#>
thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
yhat <- tuned_call(preds, thresholds[i])
results <- tibble::tibble(y=y, yhat=yhat)
cm <- results %>%yardstick::conf_mat(truth=y, yhat)
trueNegative = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve = cm$table[2,2]
conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive
predictedConditionPositive = truePositve + falsePositive
predictedConditionNegative = trueNegative + falseNegative
TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
PPV = truePositve / predictedConditionPositive
TPR = truePositve / conditionPositive
F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
F_1_scores[i] <- F_1_score
}
#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.lda.Threshold = min(c(lda.Threshold, thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.lda.ROC = "TRUE"
statistics = stat_fho(classifier.lda, fho, fho.lda.Threshold)
fho.lda.accuracy = statistics[1]
fho.lda.sensitivity = statistics[2]
fho.lda.specificity = statistics[3]
fho.lda.FDR = statistics[4]
fho.lda.precision = statistics[5]
yhat <- tuned_call(preds, fho.lda.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 1955437 2320
#> Up 34260 12160
#>
#> Accuracy : 0.9817
#> 95% CI : (0.9816, 0.9819)
#> No Information Rate : 0.9928
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.3927
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.839779
#> Specificity : 0.982781
#> Pos Pred Value : 0.261956
#> Neg Pred Value : 0.998815
#> Prevalence : 0.007225
#> Detection Rate : 0.006067
#> Detection Prevalence : 0.023162
#> Balanced Accuracy : 0.911280
#>
#> 'Positive' Class : Up
#>
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.lda, fho, type='prob') %>%
yardstick::roc_curve(truth=fho$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
PR_curve <- predict(classifier.lda, fho, type='prob') %>%
yardstick::pr_curve(truth=fho$BlueTarp_rest, estimate=Up)
PR_curve_plot <- PR_curve %>%
ggplot(aes(x=recall, y=precision)) +
geom_line() + geom_point() +
#geom_abline(slope=0, intercept=0.5, linetype='dashed', color='red') +
ggtitle('Precision Recall Curve')
PR_curve_plot
df = data.frame(c(fho.lda.accuracy, fho.lda.AUC, fho.lda.ROC, fho.lda.Threshold, fho.lda.sensitivity, fho.lda.specificity, fho.lda.FDR , fho.lda.precision))
colnames(df) <- c("FHO LDA")
rownames(df) <- c("Accuracy","AUC", "ROC", "Threshold","Sensitivity=Recall=Power","Specificity=1-FPR","FDR","Precision=PPV")
df %>% kbl() %>% kable_styling()
| | FHO LDA |
|---|---|
Accuracy | 0.981748119053357 |
AUC | 0.992115543668651 |
ROC | TRUE |
Threshold | 0.493 |
Sensitivity=Recall=Power | 0.839779005524862 |
Specificity=1-FPR | 0.982781297855905 |
FDR | 0.738043946574752 |
Precision=PPV | 0.261956053425248 |
y <- fho$BlueTarp_rest
preds <- predict(classifier.qda, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 1986046 4422
#> Up 3651 10058
#>
#> Accuracy : 0.996
#> 95% CI : (0.9959, 0.9961)
#> No Information Rate : 0.9928
#> P-Value [Acc > NIR] : < 2.2e-16
#>
#> Kappa : 0.7116
#>
#> Mcnemar's Test P-Value : < 2.2e-16
#>
#> Sensitivity : 0.694613
#> Specificity : 0.998165
#> Pos Pred Value : 0.733679
#> Neg Pred Value : 0.997778
#> Prevalence : 0.007225
#> Detection Rate : 0.005019
#> Detection Prevalence : 0.006840
#> Balanced Accuracy : 0.846389
#>
#> 'Positive' Class : Up
#>
thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
yhat <- tuned_call(preds, thresholds[i])
results <- tibble::tibble(y=y, yhat=yhat)
cm <- results %>%yardstick::conf_mat(truth=y, yhat)
trueNegative = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve = cm$table[2,2]
conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive
predictedConditionPositive = truePositve + falsePositive
predictedConditionNegative = trueNegative + falseNegative
TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
PPV = truePositve / predictedConditionPositive
TPR = truePositve / conditionPositive
F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
F_1_scores[i] <- F_1_score
}
#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.qda.Threshold = min(c(qda.Threshold,thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.qda.ROC = "TRUE"
statistics = stat_fho(classifier.qda, fho, fho.qda.Threshold)
fho.qda.accuracy = statistics[1]
fho.qda.sensitivity = statistics[2]
fho.qda.specificity = statistics[3]
fho.qda.FDR = statistics[4]
fho.qda.precision = statistics[5]
yhat <- tuned_call(preds, fho.qda.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 1982981 3464
#> Up 6716 11016
#>
#> Accuracy : 0.9949
#> 95% CI : (0.9948, 0.995)
#> No Information Rate : 0.9928
#> P-Value [Acc > NIR] : < 2.2e-16
#>
#> Kappa : 0.6814
#>
#> Mcnemar's Test P-Value : < 2.2e-16
#>
#> Sensitivity : 0.760773
#> Specificity : 0.996625
#> Pos Pred Value : 0.621250
#> Neg Pred Value : 0.998256
#> Prevalence : 0.007225
#> Detection Rate : 0.005497
#> Detection Prevalence : 0.008848
#> Balanced Accuracy : 0.878699
#>
#> 'Positive' Class : Up
#>
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.qda, fho, type='prob') %>%
yardstick::roc_curve(truth=fho$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
PR_curve <- predict(classifier.qda, fho, type='prob') %>%
yardstick::pr_curve(truth=fho$BlueTarp_rest, estimate=Up)
PR_curve_plot <- PR_curve %>%
ggplot(aes(x=recall, y=precision)) +
geom_line() + geom_point() +
#geom_abline(slope=0, intercept=0.5, linetype='dashed', color='red') +
ggtitle('Precision Recall Curve')
PR_curve_plot
df = data.frame(c(fho.qda.accuracy, fho.qda.AUC, fho.qda.ROC, fho.qda.Threshold, fho.qda.sensitivity, fho.qda.specificity, fho.qda.FDR , fho.qda.precision))
colnames(df) <- c("FHO QDA")
rownames(df) <- c("Accuracy","AUC", "ROC", "Threshold","Sensitivity=Recall=Power","Specificity=1-FPR","FDR","Precision=PPV")
df %>% kbl() %>% kable_styling()
| | FHO QDA |
|---|---|
Accuracy | 0.994920608309546 |
AUC | 0.991500075918026 |
ROC | TRUE |
Threshold | 0.22 |
Sensitivity=Recall=Power | 0.760773480662983 |
Specificity=1-FPR | 0.996624611687106 |
FDR | 0.378750281976088 |
Precision=PPV | 0.621249718023912 |
y <- fho$BlueTarp_rest
preds <- predict(classifier.glm, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 1969383 170
#> Up 20314 14310
#>
#> Accuracy : 0.9898
#> 95% CI : (0.9896, 0.9899)
#> No Information Rate : 0.9928
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.5786
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.988260
#> Specificity : 0.989790
#> Pos Pred Value : 0.413297
#> Neg Pred Value : 0.999914
#> Prevalence : 0.007225
#> Detection Rate : 0.007140
#> Detection Prevalence : 0.017276
#> Balanced Accuracy : 0.989025
#>
#> 'Positive' Class : Up
#>
thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
yhat <- tuned_call(preds, thresholds[i])
results <- tibble::tibble(y=y, yhat=yhat)
cm <- results %>%yardstick::conf_mat(truth=y, yhat)
trueNegative = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve = cm$table[2,2]
conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive
predictedConditionPositive = truePositve + falsePositive
predictedConditionNegative = trueNegative + falseNegative
TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
PPV = truePositve / predictedConditionPositive
TPR = truePositve / conditionPositive
F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
F_1_scores[i] <- F_1_score
}
#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.glm.Threshold = min(c(glm.Threshold, thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.glm.ROC = "TRUE"
statistics = stat_fho(classifier.glm, fho, fho.glm.Threshold)
fho.glm.accuracy = statistics[1]
fho.glm.sensitivity = statistics[2]
fho.glm.specificity = statistics[3]
fho.glm.FDR = statistics[4]
fho.glm.precision = statistics[5]
yhat <- tuned_call(preds, fho.glm.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 1920306 97
#> Up 69391 14383
#>
#> Accuracy : 0.9653
#> 95% CI : (0.9651, 0.9656)
#> No Information Rate : 0.9928
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.2839
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.993301
#> Specificity : 0.965125
#> Pos Pred Value : 0.171688
#> Neg Pred Value : 0.999949
#> Prevalence : 0.007225
#> Detection Rate : 0.007177
#> Detection Prevalence : 0.041800
#> Balanced Accuracy : 0.979213
#>
#> 'Positive' Class : Up
#>
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.glm, fho, type='prob') %>%
yardstick::roc_curve(truth=fho$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
PR_curve <- predict(classifier.glm, fho, type='prob') %>%
yardstick::pr_curve(truth=fho$BlueTarp_rest, estimate=Up)
PR_curve_plot <- PR_curve %>%
ggplot(aes(x=recall, y=precision)) +
geom_line() + geom_point() +
#geom_abline(slope=0, intercept=0.5, linetype='dashed', color='red') +
ggtitle('Precision Recall Curve')
PR_curve_plot
df = data.frame(c(fho.glm.accuracy, fho.glm.AUC, fho.glm.ROC, fho.glm.Threshold, fho.glm.sensitivity, fho.glm.specificity, fho.glm.FDR , fho.glm.precision))
colnames(df) <- c("FHO Logistic Regression ")
rownames(df) <- c("Accuracy","AUC", "ROC", "Threshold","Sensitivity=Recall=Power","Specificity=1-FPR","FDR","Precision=PPV")
df %>% kbl() %>% kable_styling()
| | FHO Logistic Regression |
|---|---|
Accuracy | 0.965328411612348 |
AUC | 0.999413070684321 |
ROC | TRUE |
Threshold | 0.214 |
Sensitivity=Recall=Power | 0.993301104972376 |
Specificity=1-FPR | 0.965124840616436 |
FDR | 0.828311886742904 |
Precision=PPV | 0.171688113257096 |
y <- fho$BlueTarp_rest
preds <- predict(classifier.rf, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 1979710 3002
#> Up 9987 11478
#>
#> Accuracy : 0.9935
#> 95% CI : (0.9934, 0.9936)
#> No Information Rate : 0.9928
#> P-Value [Acc > NIR] : < 2.2e-16
#>
#> Kappa : 0.6355
#>
#> Mcnemar's Test P-Value : < 2.2e-16
#>
#> Sensitivity : 0.792680
#> Specificity : 0.994981
#> Pos Pred Value : 0.534731
#> Neg Pred Value : 0.998486
#> Prevalence : 0.007225
#> Detection Rate : 0.005727
#> Detection Prevalence : 0.010710
#> Balanced Accuracy : 0.893830
#>
#> 'Positive' Class : Up
#>
thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
yhat <- tuned_call(preds, thresholds[i])
results <- tibble::tibble(y=y, yhat=yhat)
cm <- results %>%yardstick::conf_mat(truth=y, yhat)
trueNegative = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve = cm$table[2,2]
conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive
predictedConditionPositive = truePositve + falsePositive
predictedConditionNegative = trueNegative + falseNegative
TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
PPV = truePositve / predictedConditionPositive
TPR = truePositve / conditionPositive
F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
F_1_scores[i] <- F_1_score
}
#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.rf.Threshold = min(c(rf.Threshold,thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.rf.ROC = "TRUE"
statistics = stat_fho(classifier.rf, fho, fho.rf.Threshold)
fho.rf.accuracy = statistics[1]
fho.rf.sensitivity = statistics[2]
fho.rf.specificity = statistics[3]
fho.rf.FDR = statistics[4]
fho.rf.precision = statistics[5]
yhat <- tuned_call(preds, fho.rf.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 1979537 2993
#> Up 10160 11487
#>
#> Accuracy : 0.9934
#> 95% CI : (0.9933, 0.9935)
#> No Information Rate : 0.9928
#> P-Value [Acc > NIR] : < 2.2e-16
#>
#> Kappa : 0.6327
#>
#> Mcnemar's Test P-Value : < 2.2e-16
#>
#> Sensitivity : 0.793301
#> Specificity : 0.994894
#> Pos Pred Value : 0.530651
#> Neg Pred Value : 0.998490
#> Prevalence : 0.007225
#> Detection Rate : 0.005732
#> Detection Prevalence : 0.010801
#> Balanced Accuracy : 0.894097
#>
#> 'Positive' Class : Up
#>
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.rf, fho, type='prob') %>%
yardstick::roc_curve(truth=fho$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
PR_curve <- predict(classifier.rf, fho, type='prob') %>%
yardstick::pr_curve(truth=fho$BlueTarp_rest, estimate=Up)
PR_curve_plot <- PR_curve %>%
ggplot(aes(x=recall, y=precision)) +
geom_line() + geom_point() +
#geom_abline(slope=0, intercept=0.5, linetype='dashed', color='red') +
ggtitle('Precision Recall Curve')
PR_curve_plot
df = data.frame(c(fho.rf.accuracy, fho.rf.AUC, fho.rf.ROC, fho.rf.Threshold, fho.rf.sensitivity, fho.rf.specificity, fho.rf.FDR , fho.rf.precision))
colnames(df) <- c("FHO Random Forest")
rownames(df) <- c("Accuracy","AUC", "ROC", "Threshold","Sensitivity=Recall=Power","Specificity=1-FPR","FDR","Precision=PPV")
df %>% kbl() %>% kable_styling()
| | FHO Random Forest |
|---|---|
Accuracy | 0.993437206394445 |
AUC | 0.980334789679392 |
ROC | TRUE |
Threshold | 0.495 |
Sensitivity=Recall=Power | 0.793301104972376 |
Specificity=1-FPR | 0.994893694869118 |
FDR | 0.469349101492124 |
Precision=PPV | 0.530650898507876 |
y <- fho$BlueTarp_rest
preds <- predict(classifier.svm.linear, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 1949566 162
#> Up 40131 14318
#>
#> Accuracy : 0.9799
#> 95% CI : (0.9797, 0.9801)
#> No Information Rate : 0.9928
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.4087
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.988812
#> Specificity : 0.979831
#> Pos Pred Value : 0.262962
#> Neg Pred Value : 0.999917
#> Prevalence : 0.007225
#> Detection Rate : 0.007144
#> Detection Prevalence : 0.027168
#> Balanced Accuracy : 0.984321
#>
#> 'Positive' Class : Up
#>
thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
yhat <- tuned_call(preds, thresholds[i])
results <- tibble::tibble(y=y, yhat=yhat)
cm <- results %>%yardstick::conf_mat(truth=y, yhat)
trueNegative = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve = cm$table[2,2]
conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive
predictedConditionPositive = truePositve + falsePositive
predictedConditionNegative = trueNegative + falseNegative
TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
PPV = truePositve / predictedConditionPositive
TPR = truePositve / conditionPositive
F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
F_1_scores[i] <- F_1_score
}
#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.svm.linear.Threshold = min(c(svm.linear.Threshold, thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.svm.linear.ROC = "TRUE"
statistics = stat_fho(classifier.svm.linear, fho, fho.svm.linear.Threshold)
fho.svm.linear.accuracy = statistics[1]
fho.svm.linear.sensitivity = statistics[2]
fho.svm.linear.specificity = statistics[3]
fho.svm.linear.FDR = statistics[4]
fho.svm.linear.precision = statistics[5]
yhat <- tuned_call(preds, fho.svm.linear.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 1894813 103
#> Up 94884 14377
#>
#> Accuracy : 0.9526
#> 95% CI : (0.9523, 0.9529)
#> No Information Rate : 0.9928
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.2225
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.992887
#> Specificity : 0.952312
#> Pos Pred Value : 0.131584
#> Neg Pred Value : 0.999946
#> Prevalence : 0.007225
#> Detection Rate : 0.007174
#> Detection Prevalence : 0.054517
#> Balanced Accuracy : 0.972600
#>
#> 'Positive' Class : Up
#>
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.svm.linear, fho, type='prob') %>%
yardstick::roc_curve(truth=fho$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
PR_curve <- predict(classifier.svm.linear, fho, type='prob') %>%
yardstick::pr_curve(truth=fho$BlueTarp_rest, estimate=Up)
PR_curve_plot <- PR_curve %>%
ggplot(aes(x=recall, y=precision)) +
geom_line() + geom_point() +
#geom_abline(slope=0, intercept=0.5, linetype='dashed', color='red') +
ggtitle('Precision Recall Curve')
PR_curve_plot
df = data.frame(c(fho.svm.linear.accuracy, fho.svm.linear.AUC, fho.svm.linear.ROC, fho.svm.linear.Threshold, fho.svm.linear.sensitivity, fho.svm.linear.specificity, fho.svm.linear.FDR , fho.svm.linear.precision))
colnames(df) <- c("FHO SVM w/ Linear")
rownames(df) <- c("Accuracy","AUC", "ROC", "Threshold","Sensitivity=Recall=Power","Specificity=1-FPR","FDR","Precision=PPV")
df %>% kbl() %>% kable_styling()
| | FHO SVM w/ Linear |
|---|---|
Accuracy | 0.952605483447819 |
AUC | 0.999092429259135 |
ROC | TRUE |
Threshold | 0.233 |
Sensitivity=Recall=Power | 0.992886740331492 |
Specificity=1-FPR | 0.952312337004076 |
FDR | 0.86841599472822 |
Precision=PPV | 0.13158400527178 |
y <- fho$BlueTarp_rest
preds <- predict(classifier.svm.radial, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 1977443 3524
#> Up 12254 10956
#>
#> Accuracy : 0.9921
#> 95% CI : (0.992, 0.9922)
#> No Information Rate : 0.9928
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.5776
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.756630
#> Specificity : 0.993841
#> Pos Pred Value : 0.472038
#> Neg Pred Value : 0.998221
#> Prevalence : 0.007225
#> Detection Rate : 0.005467
#> Detection Prevalence : 0.011581
#> Balanced Accuracy : 0.875236
#>
#> 'Positive' Class : Up
#>
thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
yhat <- tuned_call(preds, thresholds[i])
results <- tibble::tibble(y=y, yhat=yhat)
cm <- results %>%yardstick::conf_mat(truth=y, yhat)
trueNegative = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve = cm$table[2,2]
conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive
predictedConditionPositive = truePositve + falsePositive
predictedConditionNegative = trueNegative + falseNegative
TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
PPV = truePositve / predictedConditionPositive
TPR = truePositve / conditionPositive
F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
F_1_scores[i] <- F_1_score
}
#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.svm.radial.Threshold = min(c(svm.radial.Threshold, thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.svm.radial.ROC = "TRUE"
statistics = stat_fho(classifier.svm.radial, fho, fho.svm.radial.Threshold)
fho.svm.radial.accuracy = statistics[1]
fho.svm.radial.sensitivity = statistics[2]
fho.svm.radial.specificity = statistics[3]
fho.svm.radial.FDR = statistics[4]
fho.svm.radial.precision = statistics[5]
yhat <- tuned_call(preds, fho.svm.radial.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 1973658 3121
#> Up 16039 11359
#>
#> Accuracy : 0.9904
#> 95% CI : (0.9903, 0.9906)
#> No Information Rate : 0.9928
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.5381
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.784461
#> Specificity : 0.991939
#> Pos Pred Value : 0.414592
#> Neg Pred Value : 0.998421
#> Prevalence : 0.007225
#> Detection Rate : 0.005668
#> Detection Prevalence : 0.013670
#> Balanced Accuracy : 0.888200
#>
#> 'Positive' Class : Up
#>
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.svm.radial, fho, type='prob') %>%
yardstick::roc_curve(truth=fho$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
PR_curve <- predict(classifier.svm.radial, fho, type='prob') %>%
yardstick::pr_curve(truth=fho$BlueTarp_rest, estimate=Up)
PR_curve_plot <- PR_curve %>%
ggplot(aes(x=recall, y=precision)) +
geom_line() + geom_point() +
#geom_abline(slope=0, intercept=0.5, linetype='dashed', color='red') +
ggtitle('Precision Recall Curve')
PR_curve_plot
df = data.frame(c(fho.svm.radial.accuracy, fho.svm.radial.AUC, fho.svm.radial.ROC, fho.svm.radial.Threshold, fho.svm.radial.sensitivity, fho.svm.radial.specificity, fho.svm.radial.FDR , fho.svm.radial.precision))
colnames(df) <- c("FHO SVM w/ Radial")
rownames(df) <- c("Accuracy","AUC", "ROC", "Threshold","Sensitivity=Recall=Power","Specificity=1-FPR","FDR","Precision=PPV")
df %>% kbl() %>% kable_styling()
| | FHO SVM w/ Radial |
|---|---|
Accuracy | 0.990439966130736 |
AUC | 0.986298521512439 |
ROC | TRUE |
Threshold | 0.335 |
Sensitivity=Recall=Power | 0.784461325966851 |
Specificity=1-FPR | 0.991938973622617 |
FDR | 0.585407693992262 |
Precision=PPV | 0.414592306007738 |
y <- fho$BlueTarp_rest
preds <- predict(classifier.svm.poly, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 1979177 7205
#> Up 10520 7275
#>
#> Accuracy : 0.9912
#> 95% CI : (0.991, 0.9913)
#> No Information Rate : 0.9928
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.4464
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.502417
#> Specificity : 0.994713
#> Pos Pred Value : 0.408823
#> Neg Pred Value : 0.996373
#> Prevalence : 0.007225
#> Detection Rate : 0.003630
#> Detection Prevalence : 0.008879
#> Balanced Accuracy : 0.748565
#>
#> 'Positive' Class : Up
#>
thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
yhat <- tuned_call(preds, thresholds[i])
results <- tibble::tibble(y=y, yhat=yhat)
cm <- results %>%yardstick::conf_mat(truth=y, yhat)
trueNegative = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve = cm$table[2,2]
conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive
predictedConditionPositive = truePositve + falsePositive
predictedConditionNegative = trueNegative + falseNegative
TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
PPV = truePositve / predictedConditionPositive
TPR = truePositve / conditionPositive
F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
F_1_scores[i] <- F_1_score
}
#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.svm.poly.Threshold = min(c(svm.poly.Threshold, thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.svm.poly.ROC = "TRUE"
statistics = stat_fho(classifier.svm.poly, fho, fho.svm.poly.Threshold)
fho.svm.poly.accuracy = statistics[1]
fho.svm.poly.sensitivity = statistics[2]
fho.svm.poly.specificity = statistics[3]
fho.svm.poly.FDR = statistics[4]
fho.svm.poly.precision = statistics[5]
yhat <- tuned_call(preds, fho.svm.poly.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction Down Up
#> Down 1977300 7107
#> Up 12397 7373
#>
#> Accuracy : 0.9903
#> 95% CI : (0.9901, 0.9904)
#> No Information Rate : 0.9928
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.4258
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.509185
#> Specificity : 0.993769
#> Pos Pred Value : 0.372939
#> Neg Pred Value : 0.996419
#> Prevalence : 0.007225
#> Detection Rate : 0.003679
#> Detection Prevalence : 0.009864
#> Balanced Accuracy : 0.751477
#>
#> 'Positive' Class : Up
#>
options(yardstick.event_first = FALSE)
ROC_curve <- predict(classifier.svm.poly, fho, type='prob') %>%
yardstick::roc_curve(truth=fho$BlueTarp_rest, estimate=Up) %>%
dplyr::mutate(one_minus_specificity = 1-specificity)
# Sensitivity: TP/(TP+FN)
# 1-Specificity: FP/(TN+FP)
ROC_curve_plot <- ROC_curve %>%
ggplot(aes(x=one_minus_specificity, y=sensitivity)) +
geom_line() + geom_point() +
geom_abline(slope=1, intercept=0, linetype='dashed', color='red') +
xlab("one_minus_specificity\n(false positive rate)") +
ggtitle('ROC curve')
ROC_curve_plot
PR_curve <- predict(classifier.svm.poly, fho, type='prob') %>%
yardstick::pr_curve(truth=fho$BlueTarp_rest, estimate=Up)
PR_curve_plot <- PR_curve %>%
ggplot(aes(x=recall, y=precision)) +
geom_line() + geom_point() +
#geom_abline(slope=0, intercept=0.5, linetype='dashed', color='red') +
ggtitle('Precision Recall Curve')
PR_curve_plot
df = data.frame(c(fho.svm.poly.accuracy, fho.svm.poly.AUC, fho.svm.poly.ROC, fho.svm.poly.Threshold, fho.svm.poly.sensitivity, fho.svm.poly.specificity, fho.svm.poly.FDR , fho.svm.poly.precision))
colnames(df) <- c("FHO SVM w/ Polynomial")
rownames(df) <- c("Accuracy","AUC", "ROC", "Threshold","Sensitivity=Recall=Power","Specificity=1-FPR","FDR","Precision=PPV")
df %>% kbl() %>% kable_styling()
| | FHO SVM w/ Polynomial |
|---|---|
Accuracy | 0.990268324604064 |
AUC | 0.837972518519624 |
ROC | TRUE |
Threshold | 0.39 |
Sensitivity=Recall=Power | 0.509185082872928 |
Specificity=1-FPR | 0.993769403079966 |
FDR | 0.627061203844208 |
Precision=PPV | 0.372938796155792 |
df = data.frame(c(fho.knn.accuracy, fho.knn.AUC, fho.knn.ROC, fho.knn.Threshold, fho.knn.sensitivity, fho.knn.specificity, fho.knn.FDR , fho.knn.precision),
c(fho.lda.accuracy, fho.lda.AUC, fho.lda.ROC, fho.lda.Threshold, fho.lda.sensitivity, fho.lda.specificity, fho.lda.FDR , fho.lda.precision),
c(fho.qda.accuracy, fho.qda.AUC, fho.qda.ROC, fho.qda.Threshold, fho.qda.sensitivity, fho.qda.specificity, fho.qda.FDR , fho.qda.precision)
,
c(fho.glm.accuracy, fho.glm.AUC, fho.glm.ROC, fho.glm.Threshold, fho.glm.sensitivity, fho.glm.specificity, fho.glm.FDR , fho.glm.precision)
)
colnames(df) <- c("FHO KNN" , "FHO LDA","FHO QDA","FHO Logistic Regression")
rownames(df) <- c("Accuracy","AUC", "ROC", "Threshold","Sensitivity=Recall=Power","Specificity=1-FPR","FDR","Precision=PPV")
df %>% kbl() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| | FHO KNN | FHO LDA | FHO QDA | FHO Logistic Regression |
|---|---|---|---|---|
Accuracy | 0.992558541486106 | 0.981748119053357 | 0.994920608309546 | 0.965328411612348 |
AUC | 0.961298412282614 | 0.992115543668651 | 0.991500075918026 | 0.999413070684321 |
ROC | TRUE | TRUE | TRUE | TRUE |
Threshold | 0.499 | 0.493 | 0.22 | 0.214 |
Sensitivity=Recall=Power | 0.827417127071823 | 0.839779005524862 | 0.760773480662983 | 0.993301104972376 |
Specificity=1-FPR | 0.993760356476388 | 0.982781297855905 | 0.996624611687106 | 0.965124840616436 |
FDR | 0.50889490080341 | 0.738043946574752 | 0.378750281976088 | 0.828311886742904 |
Precision=PPV | 0.49110509919659 | 0.261956053425248 | 0.621249718023912 | 0.171688113257096 |
df = data.frame(c(fho.rf.accuracy, fho.rf.AUC, fho.rf.ROC, fho.rf.Threshold, fho.rf.sensitivity, fho.rf.specificity, fho.rf.FDR , fho.rf.precision),
c(fho.svm.linear.accuracy, fho.svm.linear.AUC, fho.svm.linear.ROC, fho.svm.linear.Threshold, fho.svm.linear.sensitivity, fho.svm.linear.specificity, fho.svm.linear.FDR , fho.svm.linear.precision)
,
c(fho.svm.radial.accuracy, fho.svm.radial.AUC, fho.svm.radial.ROC, fho.svm.radial.Threshold, fho.svm.radial.sensitivity, fho.svm.radial.specificity, fho.svm.radial.FDR , fho.svm.radial.precision)
,
c(fho.svm.poly.accuracy, fho.svm.poly.AUC, fho.svm.poly.ROC, fho.svm.poly.Threshold, fho.svm.poly.sensitivity, fho.svm.poly.specificity, fho.svm.poly.FDR , fho.svm.poly.precision))
colnames(df) <- c("FHO Random Forest","FHO SVM Linear","FHO SVM Radial","FHO SVM Polynomial")
rownames(df) <- c("Accuracy","AUC", "ROC", "Threshold","Sensitivity=Recall=Power","Specificity=1-FPR","FDR","Precision=PPV")
df %>% kbl() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| | FHO Random Forest | FHO SVM Linear | FHO SVM Radial | FHO SVM Polynomial |
|---|---|---|---|---|
Accuracy | 0.993437206394445 | 0.952605483447819 | 0.990439966130736 | 0.990268324604064 |
AUC | 0.980334789679392 | 0.999092429259135 | 0.986298521512439 | 0.837972518519624 |
ROC | TRUE | TRUE | TRUE | TRUE |
Threshold | 0.495 | 0.233 | 0.335 | 0.39 |
Sensitivity=Recall=Power | 0.793301104972376 | 0.992886740331492 | 0.784461325966851 | 0.509185082872928 |
Specificity=1-FPR | 0.994893694869118 | 0.952312337004076 | 0.991938973622617 | 0.993769403079966 |
FDR | 0.469349101492124 | 0.86841599472822 | 0.585407693992262 | 0.627061203844208 |
Precision=PPV | 0.530650898507876 | 0.13158400527178 | 0.414592306007738 | 0.372938796155792 |
Because all of the data within HaitiPixels is used for model generation, the models' respective performances can be measured fairly accurately. Across the algorithms, the time it takes to generate each model is also worth noting and should be taken into account when deciding which approach to take. Using 10-fold cross-validation within the caret package, we get the following timings:
timing <- c(classifier.knn$times$everything[3],
classifier.lda$times$everything[3],
classifier.qda$times$everything[3],
classifier.glm$times$everything[3],
classifier.rf$times$everything[3],
classifier.svm.linear$times$everything[3],
classifier.svm.radial$times$everything[3],
classifier.svm.poly$times$everything[3])
df = data.frame(timing)
colnames(df) <- c( "Elapsed Time (seconds)")
rownames(df) <- c("KNN","LDA","QDA", "Logistic Regression", "Random Forest", "SVM with Linear Kernel","SVM with Radial Kernel","SVM with Polynomial Kernel" )
df %>% kbl() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| | Elapsed Time (seconds) |
|---|---|
KNN | 191.085 |
LDA | 3.920 |
QDA | 3.758 |
Logistic Regression | 7.579 |
Random Forest | 323.020 |
SVM with Linear Kernel | 281.842 |
SVM with Radial Kernel | 4868.989 |
SVM with Polynomial Kernel | 11306.910 |
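For reference, these elapsed times come straight from the times$everything element that caret stores on each fitted train object. A minimal sketch of the kind of timed, 10-fold setup assumed above (the formula, data, and method here are placeholders, not the exact calls used in this analysis):
# Sketch only: a 10-fold cross-validated fit; caret records wall-clock time
# in fit$times$everything, whose third element is the elapsed time tabled above.
ctrl <- caret::trainControl(method = "cv", number = 10,
                            classProbs = TRUE, savePredictions = TRUE)
fit <- caret::train(BlueTarp_rest ~ ., data = train,   # placeholder formula/data
                    method = "lda", trControl = ctrl)
fit$times$everything[3]   # elapsed seconds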
We can see that some models have much longer elapsed run times, but this is a necessary cost of keeping the full training data set intact rather than carving off a separate validation set; using the full set strengthens each model. We used \(F_1\) scores at different thresholds because \(F_1\) balances precision and recall, making it a suitable criterion for choosing the decision threshold on this unbalanced classification data set before computing the other metrics used in this analysis.
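The threshold sweep repeated for each model above can be written as a single helper. A compact sketch, assuming a vector of predicted probabilities preds and labels y as in the blocks above (tuned_call() is the helper defined earlier; the name f1_at is ours):
# Sketch only: F1 across candidate thresholds, consolidating the loops above.
# F1 = 2 * (precision * recall) / (precision + recall), with 'Up' as positive.
f1_at <- function(preds, y, threshold) {
  yhat <- tuned_call(preds, threshold)
  tp <- sum(yhat == 'Up'   & y == 'Up')
  fp <- sum(yhat == 'Up'   & y == 'Down')
  fn <- sum(yhat == 'Down' & y == 'Up')
  precision <- tp / (tp + fp)
  recall    <- tp / (tp + fn)
  2 * precision * recall / (precision + recall)
}
thresholds <- seq(0.001, 0.499, 0.001)
F_1_scores <- sapply(thresholds, function(t) f1_at(preds, y, t))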
The metrics we calculated for this application were precision, the proportion of predicted blue tarps that truly are blue tarps; recall (sensitivity), the proportion of actual blue tarps that are identified; accuracy, the number of correct predictions over the total number of predictions; false discovery rate, the proportion of positive calls that are Type I errors; and specificity, the proportion of negatives that are correctly identified. Because these observations drive the delivery of support and supplies to those in need, greater emphasis is placed on a high sensitivity and a low false discovery rate: ideally, every blue tarp is correctly identified and few non-tarp pixels are flagged as blue tarps.
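In confusion-matrix terms these metrics reduce to a few ratios of the TP/FP/TN/FN counts. A small illustrative sketch (the function name cm_metrics is ours; the example counts are copied from the tuned-threshold KNN hold-out confusion matrix shown earlier):
# Sketch only: the metrics discussed above, from raw confusion-matrix counts.
cm_metrics <- function(tp, fp, tn, fn) {
  c(accuracy    = (tp + tn) / (tp + fp + tn + fn),
    sensitivity = tp / (tp + fn),   # recall = power
    specificity = tn / (tn + fp),   # 1 - FPR
    precision   = tp / (tp + fp),   # PPV
    FDR         = fp / (tp + fp))   # 1 - precision
}
cm_metrics(tp = 11981, fp = 12415, tn = 1977282, fn = 2499)   # FHO KNN counts above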
name.ranks <- c("KNN","LDA","QDA", "Logistic Regression", "Random Forest", "SVM with Linear Kernel","SVM with Radial Kernel","SVM with Polynomial Kernel" )
accuracy.ranks <- c(knn.Accuracy,lda.Accuracy,qda.Accuracy,glm.Accuracy,rf.Accuracy,svm.linear.Accuracy,svm.radial.Accuracy,svm.poly.Accuracy)
auc.ranks <- c(knn.AUC,lda.AUC,qda.AUC,glm.AUC,rf.AUC,svm.linear.AUC,svm.radial.AUC,svm.poly.AUC)
sensitivity.ranks <- c(knn.Sensitivity,lda.Sensitivity,qda.Sensitivity,glm.Sensitivity,rf.Sensitivity,svm.linear.Sensitivity,svm.radial.Sensitivity,svm.poly.Sensitivity)
specificity.ranks <- c(knn.Specificity,lda.Specificity,qda.Specificity,glm.Specificity,rf.Specificity,svm.linear.Specificity,svm.radial.Specificity,svm.poly.Specificity)
fdr.ranks <- c(knn.FDR, lda.FDR,qda.FDR,glm.FDR,rf.FDR,svm.linear.FDR,svm.radial.FDR,svm.poly.FDR)
precision.ranks <- c(knn.Precision, lda.Precision,qda.Precision,glm.Precision,rf.Precision,svm.linear.Precision,svm.radial.Precision,svm.poly.Precision)
i1 <- sort(accuracy.ranks, index.return=TRUE, decreasing = TRUE)$ix
i2 <- sort(auc.ranks, index.return=TRUE, decreasing = TRUE)$ix
i3 <- sort(sensitivity.ranks, index.return=TRUE, decreasing = TRUE)$ix
i4 <- sort(specificity.ranks, index.return=TRUE, decreasing = TRUE)$ix
i5 <- sort(fdr.ranks, index.return=TRUE, decreasing = FALSE)$ix
i6 <- sort(precision.ranks, index.return=TRUE, decreasing = TRUE)$ix
myString = paste0("If we look at the statistics provided and ordering the models by accuracy, the best model is the ",name.ranks[i1][1], " with an accuracy of ", format(round(accuracy.ranks[i1][1], 6), nsmall = 6)," followed by ", name.ranks[i1][2], " with an accuracy of ",format(round(accuracy.ranks[i1][2], 6), nsmall = 6), " and ", name.ranks[i1][3], " with an accuracy of ",format(round(accuracy.ranks[i1][3], 6), nsmall = 6),". ","AUC, or Area Under Curve, is a measure of performance across all possible classification thresholds. Lowering the classification threshold classifies more items as positive. As such, you’ll want a model with a high AUC value. Ordering the models by AUC, the best model is the ", name.ranks[i2][1], " with an AUC of ", format(round(auc.ranks[i2][1], 6), nsmall = 6), " followed by ", name.ranks[i2][2], " with an AUC of ", format(round(auc.ranks[i2][2], 6), nsmall = 6), " and ", name.ranks[i2][3], " with an AUC of ", format(round(auc.ranks[i2][3], 6), nsmall = 6), ". ","Ordering the models by sensitivity, the best model is the ", name.ranks[i3][1], " with a sensitivity of ", format(round(sensitivity.ranks[i3][1], 6), nsmall = 6), " followed by ", name.ranks[i3][2], " with a sensitivity of ", format(round(sensitivity.ranks[i3][2], 6), nsmall = 6), " and ", name.ranks[i3][3], " with a sensitivity of ", format(round(sensitivity.ranks[i3][3], 6), nsmall = 6), ". ","Ordering the models by specificity, the best model is the ", name.ranks[i4][1], " with a specificity of ", format(round(specificity.ranks[i4][1], 6), nsmall = 6), " followed by ", name.ranks[i4][2], " with a specificity of ", format(round(specificity.ranks[i4][2], 6), nsmall = 6), " and ", name.ranks[i4][3], " with a specificity of ", format(round(specificity.ranks[i4][3], 6), nsmall = 6), ". ","The False Discovery Rate (FDR) is the expected proportion of type I errors ( you get a false positive ). The optimal model is one with a minimum value for FDR. The best model is the ", name.ranks[i5][1], " with a FDR of ", format(round(fdr.ranks[i5][1], 6), nsmall = 6), " followed by ", name.ranks[i5][2], " with a FDR of ", format(round(fdr.ranks[i5][2], 6), nsmall = 6), " and ", name.ranks[i5][3], " with a FDR of ", format(round(fdr.ranks[i5][3], 6), nsmall = 6), ". ","Lastly, you'll want a higher value for precision. Ordering the models by precision, the best model is the ", name.ranks[i6][1], " with a precision of ", format(round(precision.ranks[i6][1], 6), nsmall = 6), " followed by ", name.ranks[i6][2], " with a precision of ", format(round(precision.ranks[i6][2], 6), nsmall = 6), " and ", name.ranks[i6][3], " with a precision of ", format(round(precision.ranks[i6][3], 6), nsmall = 6), ".")
If we look at the statistics provided and ordering the models by accuracy, the best model is the Random Forest with an accuracy of 0.999130 followed by KNN with an accuracy of 0.997913 and SVM with Radial Kernel with an accuracy of 0.997802. AUC, or Area Under Curve, is a measure of performance across all possible classification thresholds. Lowering the classification threshold classifies more items as positive. As such, you’ll want a model with a high AUC value. Ordering the models by AUC, the best model is the KNN with an AUC of 0.999862 followed by SVM with Radial Kernel with an AUC of 0.999820 and SVM with Polynomial Kernel with an AUC of 0.999714. Ordering the models by sensitivity, the best model is the Random Forest with a sensitivity of 0.983680 followed by SVM with Radial Kernel with a sensitivity of 0.977250 and KNN with a sensitivity of 0.969337. Ordering the models by specificity, the best model is the Random Forest with a specificity of 0.999641 followed by QDA with a specificity of 0.999004 and KNN with a specificity of 0.998857. The False Discovery Rate (FDR) is the expected proportion of type I errors ( you get a false positive ). The optimal model is one with a minimum value for FDR. The best model is the Random Forest with a FDR of 0.010940 followed by QDA with a FDR of 0.033609 and KNN with a FDR of 0.034483. Lastly, you’ll want a higher value for precision. Ordering the models by precision, the best model is the Random Forest with a precision of 0.989060 followed by QDA with a precision of 0.966391 and KNN with a precision of 0.965517.
name.ranks <- c("KNN","LDA","QDA", "Logistic Regression", "Random Forest", "SVM with Linear Kernel","SVM with Radial Kernel","SVM with Polynomial Kernel" )
accuracy.ranks <- c(
fho.knn.accuracy,
fho.lda.accuracy,
fho.qda.accuracy,
fho.glm.accuracy,
fho.rf.accuracy,
fho.svm.linear.accuracy,
fho.svm.radial.accuracy,
fho.svm.poly.accuracy)
auc.ranks <- c(fho.knn.AUC,
fho.lda.AUC,
fho.qda.AUC,
fho.glm.AUC,
fho.rf.AUC,
fho.svm.linear.AUC,
fho.svm.radial.AUC,
fho.svm.poly.AUC)
sensitivity.ranks <- c(fho.knn.sensitivity,
fho.lda.sensitivity,
fho.qda.sensitivity,
fho.glm.sensitivity,
fho.rf.sensitivity,
fho.svm.linear.sensitivity,
fho.svm.radial.sensitivity,
fho.svm.poly.sensitivity)
specificity.ranks <- c(fho.knn.specificity,
fho.lda.specificity,
fho.qda.specificity,
fho.glm.specificity,
fho.rf.specificity,
fho.svm.linear.specificity,
fho.svm.radial.specificity,
fho.svm.poly.specificity)
fdr.ranks <- c(fho.knn.FDR,
fho.lda.FDR,
fho.qda.FDR,
fho.glm.FDR,
fho.rf.FDR,
fho.svm.linear.FDR,
fho.svm.radial.FDR,
fho.svm.poly.FDR)
precision.ranks <- c(fho.knn.precision,
fho.lda.precision,
fho.qda.precision,
fho.glm.precision,
fho.rf.precision,
fho.svm.linear.precision,
fho.svm.radial.precision,
fho.svm.poly.precision)
i1 <- sort(accuracy.ranks, index.return=TRUE, decreasing = TRUE)$ix
i2 <- sort(auc.ranks, index.return=TRUE, decreasing = TRUE)$ix
i3 <- sort(sensitivity.ranks, index.return=TRUE, decreasing = TRUE)$ix
i4 <- sort(specificity.ranks, index.return=TRUE, decreasing = TRUE)$ix
i5 <- sort(fdr.ranks, index.return=TRUE, decreasing = FALSE)$ix
i6 <- sort(precision.ranks, index.return=TRUE, decreasing = TRUE)$ix
myString2 = paste0("After running our models against the FHO provided and ordering the models that performed well on the hold-out data according to accuracy was ",name.ranks[i1][1], " with an accuracy of ", format(round(accuracy.ranks[i1][1], 6), nsmall = 6)," followed by ", name.ranks[i1][2], " with an accuracy of ",format(round(accuracy.ranks[i1][2], 6), nsmall = 6), " and ", name.ranks[i1][3], " with an accuracy of ",format(round(accuracy.ranks[i1][3], 6), nsmall = 6),". ","Ordering according to AUC, the best model is the ", name.ranks[i2][1], " with an AUC of ", format(round(auc.ranks[i2][1], 6), nsmall = 6), " followed by ", name.ranks[i2][2], " with an AUC of ", format(round(auc.ranks[i2][2], 6), nsmall = 6), " and ", name.ranks[i2][3], " with an AUC of ", format(round(auc.ranks[i2][3], 6), nsmall = 6), ". ","Ordering the results by sensitivity, the best model is the ", name.ranks[i3][1], " with a sensitivity of ", format(round(sensitivity.ranks[i3][1], 6), nsmall = 6), " followed by ", name.ranks[i3][2], " with a sensitivity of ", format(round(sensitivity.ranks[i3][2], 6), nsmall = 6), " and ", name.ranks[i3][3], " with a sensitivity of ", format(round(sensitivity.ranks[i3][3], 6), nsmall = 6), ". ","Ordering the models again by specificity, the best model is the ", name.ranks[i4][1], " with a specificity of ", format(round(specificity.ranks[i4][1], 6), nsmall = 6), " followed by ", name.ranks[i4][2], " with a specificity of ", format(round(specificity.ranks[i4][2], 6), nsmall = 6), " and ", name.ranks[i4][3], " with a specificity of ", format(round(specificity.ranks[i4][3], 6), nsmall = 6), ". ","The algorithms according to the FDR performance on the hold-out data was ", name.ranks[i5][1], " with a FDR of ", format(round(fdr.ranks[i5][1], 6), nsmall = 6), " followed by ", name.ranks[i5][2], " with a FDR of ", format(round(fdr.ranks[i5][2], 6), nsmall = 6), " and ", name.ranks[i5][3], " with a FDR of ", format(round(fdr.ranks[i5][3], 6), nsmall = 6), ". ","Lastly against the precision, the best model is the ", name.ranks[i6][1], " with a precision of ", format(round(precision.ranks[i6][1], 6), nsmall = 6), " followed by ", name.ranks[i6][2], " with a precision of ", format(round(precision.ranks[i6][2], 6), nsmall = 6), " and ", name.ranks[i6][3], " with a precision of ", format(round(precision.ranks[i6][3], 6), nsmall = 6), ".")
After running our models against the FHO provided and ordering the models that performed well on the hold-out data according to accuracy was QDA with an accuracy of 0.994921 followed by Random Forest with an accuracy of 0.993437 and KNN with an accuracy of 0.992559. Ordering according to AUC, the best model is the Logistic Regression with an AUC of 0.999413 followed by SVM with Linear Kernel with an AUC of 0.999092 and LDA with an AUC of 0.992116. Ordering the results by sensitivity, the best model is the Logistic Regression with a sensitivity of 0.993301 followed by SVM with Linear Kernel with a sensitivity of 0.992887 and LDA with a sensitivity of 0.839779. Ordering the models again by specificity, the best model is the QDA with a specificity of 0.996625 followed by Random Forest with a specificity of 0.994894 and SVM with Polynomial Kernel with a specificity of 0.993769. The algorithms according to the FDR performance on the hold-out data was QDA with a FDR of 0.378750 followed by Random Forest with a FDR of 0.469349 and KNN with a FDR of 0.508895. Lastly against the precision, the best model is the QDA with a precision of 0.621250 followed by Random Forest with a precision of 0.530651 and KNN with a precision of 0.491105.
When deciding which model to select, accuracy is a natural place to start. Ranking each algorithm by accuracy on the training data and on the FHO test set:
name.ranks <- c("KNN","LDA","QDA", "Logistic Regression", "Random Forest", "SVM with Linear Kernel","SVM with Radial Kernel","SVM with Polynomial Kernel" )
accuracy.ranks <- c(knn.Accuracy,lda.Accuracy,qda.Accuracy,glm.Accuracy,rf.Accuracy,svm.linear.Accuracy,svm.radial.Accuracy,svm.poly.Accuracy)
i1 <- sort(accuracy.ranks, index.return=TRUE, decreasing = TRUE)$ix
ACC1 <- name.ranks[i1]
accuracy.ranks <- c(
fho.knn.accuracy,
fho.lda.accuracy,
fho.qda.accuracy,
fho.glm.accuracy,
fho.rf.accuracy,
fho.svm.linear.accuracy,
fho.svm.radial.accuracy,
fho.svm.poly.accuracy)
i2 <- sort(accuracy.ranks, index.return=TRUE, decreasing = TRUE)$ix
ACC2 <- name.ranks[i2]
df = data.frame(ACC1,ACC2)
colnames(df) <- c("TRAIN" , "FHO TEST")
rownames(df) <- c("Rank 1","Rank 2","Rank 3","Rank 4","Rank 5","Rank 6","Rank 7","Rank 8")
df %>% kbl() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| | TRAIN | FHO TEST |
|---|---|---|
Rank 1 | Random Forest | QDA |
Rank 2 | KNN | Random Forest |
Rank 3 | SVM with Radial Kernel | KNN |
Rank 4 | SVM with Polynomial Kernel | SVM with Radial Kernel |
Rank 5 | SVM with Linear Kernel | SVM with Polynomial Kernel |
Rank 6 | Logistic Regression | LDA |
Rank 7 | QDA | Logistic Regression |
Rank 8 | LDA | SVM with Linear Kernel |
We can see that the non-linear classifiers rank near the top, particularly the Random Forest, K-Nearest Neighbors, and the SVM with a radial kernel, while the linear classifiers (linear discriminant analysis, SVM with a linear kernel) rank much lower on accuracy.
However, accuracy is not the only metric that should guide model selection. We can also rank the models by precision:
name.ranks <- c("KNN","LDA","QDA", "Logistic Regression", "Random Forest", "SVM with Linear Kernel","SVM with Radial Kernel","SVM with Polynomial Kernel" )
precision.ranks <- c(knn.Precision, lda.Precision,qda.Precision,glm.Precision,rf.Precision,svm.linear.Precision,svm.radial.Precision,svm.poly.Precision)
i1 <- sort(precision.ranks, index.return=TRUE, decreasing = TRUE)$ix
ACC1 <- name.ranks[i1]
precision.ranks <- c(fho.knn.precision,
fho.lda.precision,
fho.qda.precision,
fho.glm.precision,
fho.rf.precision,
fho.svm.linear.precision,
fho.svm.radial.precision,
fho.svm.poly.precision)
i2 <- sort(precision.ranks, index.return=TRUE, decreasing = TRUE)$ix
ACC2 <- name.ranks[i2]
df = data.frame(ACC1,ACC2)
colnames(df) <- c("TRAIN" , "FHO TEST")
rownames(df) <- c("Rank 1","Rank 2","Rank 3","Rank 4","Rank 5","Rank 6","Rank 7","Rank 8")
df %>% kbl() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| | TRAIN | FHO TEST |
|---|---|---|
Rank 1 | Random Forest | QDA |
Rank 2 | QDA | Random Forest |
Rank 3 | KNN | KNN |
Rank 4 | SVM with Polynomial Kernel | SVM with Radial Kernel |
Rank 5 | SVM with Radial Kernel | SVM with Polynomial Kernel |
Rank 6 | SVM with Linear Kernel | LDA |
Rank 7 | Logistic Regression | Logistic Regression |
Rank 8 | LDA | SVM with Linear Kernel |
and by recall:
name.ranks <- c("KNN","LDA","QDA", "Logistic Regression", "Random Forest", "SVM with Linear Kernel","SVM with Radial Kernel","SVM with Polynomial Kernel" )
sensitivity.ranks <- c(knn.Sensitivity,lda.Sensitivity,qda.Sensitivity,glm.Sensitivity,rf.Sensitivity,svm.linear.Sensitivity,svm.radial.Sensitivity,svm.poly.Sensitivity)
i1 <- sort(sensitivity.ranks, index.return=TRUE, decreasing = TRUE)$ix
ACC1 <- name.ranks[i1]
sensitivity.ranks <- c(fho.knn.sensitivity,
fho.lda.sensitivity,
fho.qda.sensitivity,
fho.glm.sensitivity,
fho.rf.sensitivity,
fho.svm.linear.sensitivity,
fho.svm.radial.sensitivity,
fho.svm.poly.sensitivity)
i2 <- sort(sensitivity.ranks, index.return=TRUE, decreasing = TRUE)$ix
ACC2 <- name.ranks[i2]
df = data.frame(ACC1,ACC2)
colnames(df) <- c("TRAIN" , "FHO TEST")
rownames(df) <- c("Rank 1","Rank 2","Rank 3","Rank 4","Rank 5","Rank 6","Rank 7","Rank 8")
df %>% kbl() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| | TRAIN | FHO TEST |
|---|---|---|
Rank 1 | Random Forest | Logistic Regression |
Rank 2 | SVM with Radial Kernel | SVM with Linear Kernel |
Rank 3 | KNN | LDA |
Rank 4 | SVM with Polynomial Kernel | KNN |
Rank 5 | Logistic Regression | Random Forest |
Rank 6 | SVM with Linear Kernel | SVM with Radial Kernel |
Rank 7 | QDA | QDA |
Rank 8 | LDA | SVM with Polynomial Kernel |
It is important to note that these rankings are built from point estimates taken from the 10-fold cross-validation training. With that caveat, I would recommend the Random Forest model for detection of blue tarps. Its mean and standard deviation point estimates from the 10-fold cross-validation (for sensitivity, specificity, and precision) are:
Sensitivity Mean: 0.948570940837926
Sensitivity Standard Deviation: 0.0158306673347466
Specificity Mean: 0.998660555098661
Specificity Standard Deviation: 0.000351159803790381
Precision Mean: 0.95907902771856
Precision Standard Deviation: 0.0102829571574502
The KNN model is a close second. Its mean and standard deviation point estimates from the 10-fold cross-validation (for the same metrics) are:
Sensitivity Mean: 0.959457152611813
Sensitivity Standard Deviation: 0.0137156602761885
Specificity Mean: 0.998562542574257
Specificity Standard Deviation: 0.000467102491419399
Precision Mean: 0.9568424151997
Precision Standard Deviation: 0.0131157372711998
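These per-fold means and standard deviations can be recovered from the resampled predictions that caret keeps when predictions are saved. A minimal sketch for the Random Forest model, assuming classifier.rf$pred holds the out-of-fold predictions and rf.Threshold is the tuned cutoff used above:
# Sketch only: mean and SD of out-of-fold sensitivity for the Random Forest model.
fold_sens <- classifier.rf$pred %>%
  dplyr::filter(mtry == classifier.rf$bestTune$mtry) %>%   # keep the winning tuning value
  dplyr::mutate(pred2 = factor(ifelse(Up > rf.Threshold, 'Up', 'Down'),
                               levels = c('Down', 'Up'))) %>%
  dplyr::group_split(Resample) %>%
  purrr::map_dbl(~ caret::confusionMatrix(.x$pred2, .x$obs,
                                          positive = 'Up')$byClass["Sensitivity"])
mean(fold_sens); sd(fold_sens)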
The models do not behave the same when their predictions are measured against the training data as when they are measured against the final hold-out test set, and this difference can be attributed to many factors. The reasoning behind which algorithm to use for detecting blue tarps also helps explain why the training and test rankings do not match exactly yet tend to fall into the same groupings for linear versus non-linear models. It is important to keep in mind that the data form an unbalanced classification set: the predictors are the additive RGB color channels, and the response is an unbalanced binary class (an unequal distribution of classes in both the training data set and the FHO). This imbalance means that, within the metrics, there is a greater chance of negatives being classified as false positives. Although the models tested on the FHO data have low precision overall compared to the training data, their sensitivity and specificity remain relatively high and consistent with the training results. In addition, ranking the models by the False Discovery Rate:
name.ranks <- c("KNN","LDA","QDA", "Logistic Regression", "Random Forest", "SVM with Linear Kernel","SVM with Radial Kernel","SVM with Polynomial Kernel" )
fdr.ranks <- c(knn.FDR, lda.FDR,qda.FDR,glm.FDR,rf.FDR,svm.linear.FDR,svm.radial.FDR,svm.poly.FDR)
i1 <- sort(fdr.ranks, index.return=TRUE, decreasing = FALSE)$ix
FDR1 <- name.ranks[i1]
fdr.ranks <- c(fho.knn.FDR,
fho.lda.FDR,
fho.qda.FDR,
fho.glm.FDR,
fho.rf.FDR,
fho.svm.linear.FDR,
fho.svm.radial.FDR,
fho.svm.poly.FDR)
i2 <- sort(fdr.ranks, index.return=TRUE, decreasing = FALSE)$ix
FDR2 <- name.ranks[i2]
df = data.frame(FDR1,FDR2)
colnames(df) <- c("TRAIN" , "FHO TEST")
rownames(df) <- c("Rank 1","Rank 2","Rank 3","Rank 4","Rank 5","Rank 6","Rank 7","Rank 8")
df %>% kbl() %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| | TRAIN | FHO TEST |
|---|---|---|
Rank 1 | Random Forest | QDA |
Rank 2 | QDA | Random Forest |
Rank 3 | KNN | KNN |
Rank 4 | SVM with Polynomial Kernel | SVM with Radial Kernel |
Rank 5 | SVM with Radial Kernel | SVM with Polynomial Kernel |
Rank 6 | SVM with Linear Kernel | LDA |
Rank 7 | Logistic Regression | Logistic Regression |
Rank 8 | LDA | SVM with Linear Kernel |
Random Forest and KNN both rank near the top, indicating that these models have lower chances of getting a false positive.
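This mirrors the precision ranking above, which is expected: FDR is the complement of precision (FDR = FP / (TP + FP) = 1 - PPV), so the two orderings are necessarily the reverse of one another. A one-line check against the hold-out Random Forest values reported earlier:
# Sketch only: FDR and precision sum to one.
fho.rf.FDR + fho.rf.precision   # 0.469349... + 0.530651... = 1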
Extending the part 1 discussion of software aspects, we expected the SVMs to take the longest to run, and the elapsed-time table above bears this out: the time required for 10-fold cross-validation over their tuning parameters far exceeds that of LDA, QDA, logistic regression, and the other models. To avoid overfitting in the Random Forest model, ntree was fixed at 500 rather than iterated across a range of values, since Random Forest already appeared to be one of the top models; the trade-off between run time and small potential gains in accuracy pushed the analysis toward saving time where possible. We iterated across reasonable values of cost, sigma, degree, and so on for the various models, and we do not expect the rankings to change position. Overall, the quadratic, polynomial, and other non-linear algorithms outperformed the linear ones, with the Random Forest and K-Nearest Neighbors models standing above the rest; however, it is worth pointing out that, given the uncertainty implied by the per-fold means and standard deviations of the various metrics, the models could flip-flop in the rankings.
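For reference, the kind of grid search referred to here can be expressed as an explicit caret tuneGrid. A minimal sketch for the radial-kernel SVM (the grid values are illustrative, not the exact grid used in this analysis; the formula and data are placeholders):
# Sketch only: an explicit tuning grid for the radial-kernel SVM.
# caret's "svmRadial" method tunes over sigma and C (cost).
grid <- expand.grid(sigma = c(0.6, 1.2, 2.4),
                    C     = 10^(0:5))
ctrl <- caret::trainControl(method = "cv", number = 10,
                            classProbs = TRUE, savePredictions = TRUE)
fit  <- caret::train(BlueTarp_rest ~ ., data = train,   # placeholder formula/data
                     method = "svmRadial", trControl = ctrl,
                     tuneGrid = grid, preProcess = c("center", "scale"))
fit$bestTune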