# ggpairs
library(ggplot2)
library(GGally)
#> Registered S3 method overwritten by 'GGally':
#>   method from   
#>   +.gg   ggplot2
library(hrbrthemes)
#> NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
#>       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
#>       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
# %>%
library(tidyverse)
#> ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
#> ✓ tibble  3.0.3     ✓ dplyr   1.0.2
#> ✓ tidyr   1.1.2     ✓ stringr 1.4.0
#> ✓ readr   1.3.1     ✓ forcats 0.5.0
#> ✓ purrr   0.3.4
#> ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag()    masks stats::lag()

# kfold
library(caret)
#> Loading required package: lattice
#> 
#> Attaching package: 'caret'
#> The following object is masked from 'package:purrr':
#> 
#>     lift
kfoldcv = 10

# lda
library(MASS)
#> 
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#> 
#>     select

# prediction
library(ROCR)

#partimat
library(klaR)

library(knitr)
library(kableExtra)
#> 
#> Attaching package: 'kableExtra'
#> The following object is masked from 'package:dplyr':
#> 
#>     group_rows
get_metric <- function(caret_CM_object, metric){
  caret_CM_object %>% broom::tidy() %>% 
    dplyr::filter(term==metric) %>%
    dplyr::select(estimate) %>% pull()
}

get_metrics_func <- function(caret_CM_objects, metric, func){
  caret_CM_objects %>% 
    purrr::map( ~ get_metric(.x, metric)) %>%
    unlist() %>% func
}

# Out-of-fold confusion matrix for each resample at a given decision threshold
get_out_of_folds_CM <- function(classifier, THRESHOLD){
  classifier$pred %>%
    dplyr::mutate(pred2 = ifelse(Up > THRESHOLD, 'Up', 'Down')) %>%
    dplyr::mutate(pred2 = factor(pred2, levels = c('Down', 'Up'))) %>%
    dplyr::group_split(Resample) %>%
    purrr::map( ~ caret::confusionMatrix(data = .x$pred2, reference = .x$obs, 
                                         positive = 'Up'))
}

# Plot the out-of-fold confusion matrix for each resample as a tile grid
plot_folds_CM <- function(classifier, THRESHOLD){
  out_of_folds_CM <- classifier$pred %>% 
    dplyr::mutate(pred2 = ifelse(Up>THRESHOLD, 'Up', 'Down')) %>%
    dplyr::mutate(pred2 = factor(pred2, levels=c('Down','Up'))) %>%
    dplyr::group_split(Resample) %>%
    purrr::map(~ caret::confusionMatrix(data=.x$pred2, reference=.x$obs))
  
  out_of_folds_CM_vis <- out_of_folds_CM %>% 
    purrr::map(~ .x$table %>% broom::tidy() %>%
                   ggplot(aes(x=Reference, y=Prediction, fill=n, label=n)) +
                   geom_tile() + geom_text(aes(label=n), size=5, color='white'))
   
  gridExtra::grid.arrange(grobs=out_of_folds_CM_vis, ncol=2)
}

tuned_call <- function(preds, decision_rule_threshold){
  calls <- rep('Down', length(preds))
  calls[preds>decision_rule_threshold]='Up'
  factor(calls, levels=c('Down','Up'))
}

# Sweep candidate thresholds and return the one that maximizes the F1 score
get_optimal_threshold <- function(classifier){
  
  thresholds = seq(0.001, 0.5, .001) # candidate thresholds; stay away from the extreme tails
  F_1_scores <- rep(NA, length(thresholds))
  GMeans <- rep(NA, length(thresholds))
  
  preds <- predict(classifier, type='prob')[,2]
  y <- train$BlueTarp_rest
  for (i in 1:length(thresholds)){
    yhat <- tuned_call(preds, thresholds[i])
    # Confusion matrix at this threshold
    results <- tibble::tibble(y = y, yhat = yhat)
    cm <- results %>% yardstick::conf_mat(truth = y, estimate = yhat)
    
    trueNegative  = cm$table[1,1]
    falseNegative = cm$table[1,2]
    falsePositive = cm$table[2,1]
    truePositve   = cm$table[2,2]
    
    #paste0("True Positive: ", truePositve )
    #paste0("False Negative: ", falseNegative )
    #paste0("True Negative: ", trueNegative )
    #paste0("False Positive: ", falsePositive )
    conditionPositive = truePositve + falseNegative
    conditionNegative = trueNegative + falsePositive 
    predictedConditionPositive = truePositve +  falsePositive 
    predictedConditionNegative = trueNegative + falseNegative
    
    TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
    
    prevalence = conditionPositive / TotalPopulation
    accuracy = (truePositve + trueNegative) / TotalPopulation
    PPV = truePositve / predictedConditionPositive
    FDR = falsePositive / predictedConditionPositive
    FOR = falseNegative / predictedConditionNegative
    NPV = trueNegative / predictedConditionNegative
    TPR = truePositve / conditionPositive
    FPR = falsePositive / conditionNegative
    FNR = falseNegative / conditionPositive
    SPC = TNR = trueNegative / conditionNegative
    LR.PLUS = TPR / FPR
    LR.MINUS = FNR/TNR
    DOR = LR.PLUS / LR.MINUS
    
    F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
    F_1_scores[i] <- F_1_score
    GMeans[i] = sqrt(TPR*SPC)
  }
  
  plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
  opt_thresh = thresholds[tail(which(F_1_scores == max(F_1_scores, na.rm = TRUE)), n = 1)]
  return(opt_thresh)
}

stat_fho <- function(classifier, testdata, threshold){
  y <- testdata$BlueTarp_rest
  preds <- predict(classifier, newdata = testdata, type = "prob")[,2]
  yhat <- tuned_call(preds, threshold)
  
  # Confusion matrix on the hold-out data at the chosen threshold
  results <- tibble::tibble(y = y, yhat = yhat)
  cm <- results %>% yardstick::conf_mat(truth = y, estimate = yhat)
  
  trueNegative  = cm$table[1,1]
  falseNegative = cm$table[1,2]
  falsePositive = cm$table[2,1]
  truePositve   = cm$table[2,2]
  
  conditionPositive = truePositve + falseNegative
  conditionNegative = trueNegative + falsePositive 
  predictedConditionPositive = truePositve +  falsePositive 
  predictedConditionNegative = trueNegative + falseNegative
  
  TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
  
  prevalence = conditionPositive / TotalPopulation
  accuracy = (truePositve + trueNegative) / TotalPopulation
  PPV = truePositve / predictedConditionPositive
  FDR = falsePositive / predictedConditionPositive
  FOR = falseNegative / predictedConditionNegative
  NPV = trueNegative / predictedConditionNegative
  TPR = truePositve / conditionPositive
  FPR = falsePositive / conditionNegative
  FNR = falseNegative / conditionPositive
  SPC = TNR = trueNegative / conditionNegative
  LR.PLUS = TPR / FPR
  LR.MINUS = FNR/TNR
  DOR = LR.PLUS / LR.MINUS
  
  F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
  
  # accuracy, sensitivity, specificity, FDR, precision
  return(c(accuracy, TPR, SPC, FDR, PPV  ))
}

Introduction

In this project, we use classification methods covered in this course to solve a real historical data-mining problem: locating displaced persons living in makeshift shelters following the destruction caused by the 2010 earthquake in Haiti.

Following that earthquake, rescue workers, mostly from the United States military, needed to get food and water to the displaced persons. But with destroyed communications, impassable roads, and thousands of square miles to cover, actually locating the people who needed help was challenging. As part of the rescue effort, a team from the Rochester Institute of Technology was flying an aircraft to collect high-resolution geo-referenced imagery.

Given the structure of the data set (HaitiPixels, \(n=\) 63241), a natural decision is to use a one-vs-rest approach. Our data form a multi-class classification problem that we decompose into binary classifications:

  • Blue Tarp vs [Rooftop, Soil, Various Non-Tarp, Vegetation]

  • Rooftop vs [Blue Tarp, Soil, Various Non-Tarp, Vegetation]

  • Soil vs [Blue Tarp, Rooftop, Various Non-Tarp, Vegetation]

  • Various Non-Tarp vs [Blue Tarp, Rooftop, Soil, Vegetation]

  • Vegetation vs [Blue Tarp, Rooftop, Soil, Various Non-Tarp]

Since we are looking for temporary shelters made using blue tarps, we generated a variable named “BlueTarp_rest”, a binary response with two levels:

  • “Up” - if a blue tarp was visible (or up)

  • “Down” - if a blue tarp was not visible.

Orthovnir Hold-Out Test Data Set

It was known that the people whose homes had been destroyed by the earthquake were creating temporary shelters using blue tarps, and these blue tarps would be good indicators of where the displaced persons were – if only they could be located in time, out of the thousands of images that would be collected every day.

#orthovnir057_ROI_NON_Blue_Tarps.txt

ovnir057.NoBlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir057_ROI_NON_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")

colnames(ovnir057.NoBlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Down",length(ovnir057.NoBlueTarps$ID))
ovnir057.NoBlueTarps <- data.frame(ovnir057.NoBlueTarps,BlueTarp_rest )

#orthovnir067_ROI_Blue_Tarps.txt

ovnir067.BlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir067_ROI_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")

colnames(ovnir067.BlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Up",length(ovnir067.BlueTarps$ID))
ovnir067.BlueTarps <- data.frame(ovnir067.BlueTarps,BlueTarp_rest )

#orthovnir067_ROI_NOT_Blue_Tarps.txt

ovnir067.NoBlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir067_ROI_NOT_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")

colnames(ovnir067.NoBlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Down",length(ovnir067.NoBlueTarps$ID))
ovnir067.NoBlueTarps <- data.frame(ovnir067.NoBlueTarps,BlueTarp_rest )

#orthovnir069_ROI_Blue_Tarps.txt

ovnir069.BlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir069_ROI_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")

colnames(ovnir069.BlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Up",length(ovnir069.BlueTarps$ID))
ovnir069.BlueTarps <- data.frame(ovnir069.BlueTarps,BlueTarp_rest )

#orthovnir069_ROI_NOT_Blue_Tarps.txt
ovnir069.NoBlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir069_ROI_NOT_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")

colnames(ovnir069.NoBlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Down",length(ovnir069.NoBlueTarps$ID))
ovnir069.NoBlueTarps <- data.frame(ovnir069.NoBlueTarps,BlueTarp_rest )

#orthovnir078_ROI_Blue_Tarps.txt

ovnir078.BlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir078_ROI_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")

colnames(ovnir078.BlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Up",length(ovnir078.BlueTarps$ID))
ovnir078.BlueTarps <- data.frame(ovnir078.BlueTarps,BlueTarp_rest )

#orthovnir078_ROI_NON_Blue_Tarps.txt
ovnir078.NoBlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir078_ROI_NON_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")

colnames(ovnir078.NoBlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Down",length(ovnir078.NoBlueTarps$ID))
ovnir078.NoBlueTarps <- data.frame(ovnir078.NoBlueTarps,BlueTarp_rest )

#summary(ovnir067.NoBlueTarps)

fho <- rbind(
  ovnir057.NoBlueTarps,
  ovnir067.BlueTarps,
  ovnir067.NoBlueTarps,
  ovnir069.BlueTarps,
  ovnir069.NoBlueTarps,
  ovnir078.BlueTarps,
  ovnir078.NoBlueTarps
  )
fho$BlueTarp_rest <- factor(fho$BlueTarp_rest)

#### Delete these 2 lines when ready for full set
#index.fho <- createDataPartition(fho$BlueTarp_rest, p = 0.1, list = FALSE) 
#data.fho <- fho[index.fho, ]
#fho <- data.fho
#### 

# Scale Data
#fho.scale <- scale(subset(fho, select= -c(BlueTarp_rest)), center = TRUE, scale = TRUE)
#df <- data.frame(fho.scale, fho$BlueTarp_rest)
#fho <- df
colnames(fho) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue","BlueTarp_rest")
Sample Data: Makeshift Village

Our goal is to test a variety of algorithms from this course on the imagery data collected during the relief efforts made in Haiti in response to the 2010 earthquake. Combining these files gives a hold-out test data set of \(n=\) 2004177 observations.

K-Folds Out of Sampling Performance: K-Nearest Neighbors

The first classification method we use is K-nearest neighbors (KNN). Given a value for K and a prediction point \(x_0\), KNN first identifies the K training observations that are closest to \(x_0\), represented by \(N_0\), and classifies \(x_0\) to the most common class among these neighbors. The optimal value for K depends on the bias-variance trade-off. A small value of K provides the most flexible fit, which has low bias but high variance; the variance arises because the prediction in a given region depends on only a handful of observations. In contrast, larger values of K provide a smoother and less variable fit: the prediction in a region is an average over several points, so changing one observation has a smaller effect. We’ll use the caret package in R to apply the KNN classification procedure with a 10-fold cross-validation evaluation procedure.
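
The exact caret calls are not reproduced in this report; the sketch below shows the kind of fit we have in mind, reusing the kfoldcv setting from the setup chunk. The names fitControl and classifier.knn, the candidate grid of k values, and the use of the Red, Green and Blue bands as predictors of BlueTarp_rest in a data frame called train are assumptions for illustration.

# Shared resampling setup: 10-fold CV, class probabilities, out-of-fold predictions kept
fitControl <- caret::trainControl(method = "cv", number = kfoldcv,
                                  classProbs = TRUE, savePredictions = "final")
# KNN fit over an illustrative grid of k values
classifier.knn <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data = train,
                               method = "knn",
                               preProcess = c("center", "scale"),
                               tuneGrid = data.frame(k = seq(3, 15, by = 2)),
                               trControl = fitControl)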

Training across various values of K, we can see that our optimal K is 5. So we’ll proceed with the analysis using this value.

Confusion Matrix

Next we’ll look at the Confusion Matrix from the model. We’ll use the image below as a reference when analyzing the matrix.

Confusion Matrix Reference


Reviewing the confusion matrix, we have:

  • Sensitivity Mean: 0.959457152611813

  • Sensitivity Standard Deviation: 0.0137156602761885

  • Specificity Mean: 0.998562542574257

  • Specificity Standard Deviation: 0.000467102491419399

Receiver Operating Characteristic Curve

From the 10-fold cross-validation we obtain out-of-fold predicted probabilities that each pixel is a blue tarp. We can measure the sensitivity, also known as the true positive rate: the fraction of blue-tarp observations that are correctly identified at a given threshold value. The initial default threshold is 0.5. The false positive rate is 1 - specificity: the fraction of non-tarp observations that we classify incorrectly at that same threshold of 0.5. We can view the receiver operating characteristic (ROC) curve to visualize these two statistics. The ideal ROC curve hugs the top left corner, indicating a high true positive rate and a low false positive rate.

We can also look at each ROC curve from each fold from the KNN model. For the 10 different ROC curves, we can even get a sense of the variation in ROC curves.
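
As an illustration, the pooled out-of-fold ROC curve and AUC can be computed with the ROCR package loaded above. This is a hedged sketch using the object names from the KNN sketch, not the exact plotting code behind the figures.

# ROC from the pooled out-of-fold predictions saved by caret
rocr_pred <- ROCR::prediction(classifier.knn$pred$Up, classifier.knn$pred$obs,
                              label.ordering = c("Down", "Up"))
rocr_roc  <- ROCR::performance(rocr_pred, measure = "tpr", x.measure = "fpr")
plot(rocr_roc); abline(0, 1, lty = 2)              # ROC curve with chance line
ROCR::performance(rocr_pred, "auc")@y.values[[1]]  # area under the curve
# Per-fold curves: split classifier.knn$pred by Resample and repeat the steps above.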

Precision-Recall Curve

We can measure the precision and the recall with the Precision-Recall (P-R) curve.

Precision:

\[ \frac{true \ positives}{ true \ positives + false \ positives}\]

Recall:

\[ \frac{true \ positives}{ true \ positives + false \ negatives}\]

The ideal P-R curve hugs the top right corner, representing both high recall and high precision: the model is returning accurate results (high precision) while capturing a majority of all positive results (high recall).

We can also look at each P-R curve from each fold from the KNN model. For the 10 different P-R curves, we can even get a sense of the variation in P-R curves.
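
The same ROCR prediction object from the ROC sketch can be reused for a pooled P-R curve; again this is a sketch rather than the exact code behind the figures.

# Precision-recall curve from the pooled out-of-fold predictions
rocr_pr <- ROCR::performance(rocr_pred, measure = "prec", x.measure = "rec")
plot(rocr_pr)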

  • Recall Mean: 0.959457152611813

  • Recall Standard Deviation: 0.0137156602761885

  • Precision Mean: 0.9568424151997

  • Precision Standard Deviation: 0.0131157372711998

To find the optimal threshold, we’ll look at the \(F_1\) score.

The \(F_1\) score is a more useful measure than accuracy for problems with uneven class distributions, since it takes both false positives and false negatives into account. It is defined as:

\[ F_1 = 2 * \frac{\frac{\Sigma \ True \ Positive}{\Sigma \ Predicted \ Condition \ Positive}*\frac{\Sigma True \ Positive}{\Sigma \ Condition \ Positive}}{\frac{\Sigma \ True \ Positive}{\Sigma \ Predicted \ Condition \ Positive}+\frac{\Sigma \ True \ Positive}{\Sigma \ Condition \ Positive}}\]

The \(F_1\) score ranges between 0 and 1, with 1 meaning perfect precision and recall.
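
The chunk below relies on preds, y, and opt_thresh.knn, whose construction is not shown at this point; a hedged sketch of how they would be obtained with the helper functions defined at the top of this report (object names as in the earlier sketches):

# Predicted probability of 'Up' for the training pixels and the observed labels
preds <- predict(classifier.knn, type = "prob")[, 2]
y <- train$BlueTarp_rest
# Threshold on the candidate grid that maximizes the F1 score
opt_thresh.knn <- get_optimal_threshold(classifier.knn)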

yhat <- tuned_call(preds, opt_thresh.knn)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.001143436
sensitivity
#> [1] 0.9693373
results <- tibble::tibble(y=y, yhat=yhat)

caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Down    Up
#>       Down 61149    62
#>       Up      70  1960
#>                                           
#>                Accuracy : 0.9979          
#>                  95% CI : (0.9975, 0.9983)
#>     No Information Rate : 0.968           
#>     P-Value [Acc > NIR] : <2e-16          
#>                                           
#>                   Kappa : 0.9663          
#>                                           
#>  Mcnemar's Test P-Value : 0.5423          
#>                                           
#>             Sensitivity : 0.96934         
#>             Specificity : 0.99886         
#>          Pos Pred Value : 0.96552         
#>          Neg Pred Value : 0.99899         
#>              Prevalence : 0.03197         
#>          Detection Rate : 0.03099         
#>    Detection Prevalence : 0.03210         
#>       Balanced Accuracy : 0.98410         
#>                                           
#>        'Positive' Class : Up              
#> 
cm <- results %>% yardstick::conf_mat(truth = y, estimate = yhat)

trueNegative  = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve   = cm$table[2,2]

conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive 
predictedConditionPositive = truePositve +  falsePositive 
predictedConditionNegative = trueNegative + falseNegative

TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative

prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS

F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))

Now using the optimal threshold we get:

  • True Negative = 61149

  • False Negative = 62

  • False Positive = 70

  • True Positive = 1960

  • Condition Positive = 2022

  • Condition Negative = 61219

  • Predicted Condition Positive = 2030

  • Predicted Condition Negative = 61211

  • Total Population = 63241

  • Prevalence = 0.0319729289543176

  • Accuracy = 0.99791274647776

  • Positive predictive value, precision = 0.96551724137931

  • False Discovery rate = 0.0344827586206897

  • False Omission Rate = 0.00101288984006143

  • Negative Predictive Value = 0.998987110159939

  • True Positive Rate, Recall, Sensitivity = 0.969337289812067

  • False Positive Rate = 0.00114343586141557

  • False Negative Rate = 0.0306627101879327

  • Specificity, Selectivity, True Negative rate = 0.998856564138584

  • Positive likelihood ratio = 847.740850642928

  • Negative likelihood ratio = 0.030697811166087

  • Diagnostic Odds Ratio = 27615.6774193548

  • \(F_1\) Score = 0.967423494570582

From our plot we selected our optimal threshold as 0.499.

Summary Stats K Nearest Neighbor

KNN (k =5)
Accuracy 0.99791274647776
AUC 0.999862454860983
ROC TRUE
Threshold 0.499
Sensitivity=Recall=Power 0.969337289812067
Specificity=1-FPR 0.998856564138584
FDR 0.0344827586206897
Precision=PPV 0.96551724137931

K-Folds Out of Sampling Performance: Linear Discriminant Analysis

The next classification method we use is linear discriminant analysis (LDA). LDA assumes that the observations within each class are drawn from a multivariate Gaussian distribution with a class-specific mean vector and a covariance matrix that is common to all classes. Again, we’ll use the caret package in R to apply the LDA classification procedure with a 10-fold cross-validation evaluation procedure.
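
A minimal sketch of the LDA fit, reusing the fitControl object and assumed names from the KNN sketch:

classifier.lda <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data = train,
                               method = "lda",
                               preProcess = c("center", "scale"),
                               trControl = fitControl)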

Receiver Operating Characteristic Curve

Next we’ll look at the ROC curve.

We can also look at each ROC curve from each fold from the LDA model. For the 10 different ROC curves, we can even get a sense of the variation in ROC curves.

Precision-Recall Curve

Next, we’ll look at the P-R curve.

We can also look at each P-R curve from each fold from the LDA model. For the 10 different P-R curves, we can even get a sense of the variation in P-R curves.

  • Recall Mean: 0.800692581573428

  • Recall Standard Deviation: 0.0275020332680672

  • Precision Mean: 0.7257850068744

  • Precision Standard Deviation: 0.0259033842126505

To find the optimal threshold, we’ll look at the \(F_1\) score.

yhat <- tuned_call(preds, opt_thresh.lda)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.009996896
sensitivity
#> [1] 0.8021761
results <- tibble::tibble(y=y, yhat=yhat)

caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Down    Up
#>       Down 60607   400
#>       Up     612  1622
#>                                         
#>                Accuracy : 0.984         
#>                  95% CI : (0.983, 0.985)
#>     No Information Rate : 0.968         
#>     P-Value [Acc > NIR] : < 2.2e-16     
#>                                         
#>                   Kappa : 0.754         
#>                                         
#>  Mcnemar's Test P-Value : 3.295e-11     
#>                                         
#>             Sensitivity : 0.80218       
#>             Specificity : 0.99000       
#>          Pos Pred Value : 0.72605       
#>          Neg Pred Value : 0.99344       
#>              Prevalence : 0.03197       
#>          Detection Rate : 0.02565       
#>    Detection Prevalence : 0.03533       
#>       Balanced Accuracy : 0.89609       
#>                                         
#>        'Positive' Class : Up            
#> 
cm <- results %>% yardstick::conf_mat(truth = y, estimate = yhat)

trueNegative  = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve   = cm$table[2,2]

conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive 
predictedConditionPositive = truePositve +  falsePositive 
predictedConditionNegative = trueNegative + falseNegative

TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative

prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS

F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))

Now using the optimal threshold we get:

  • True Negative = 60607

  • False Negative = 400

  • False Positive = 612

  • True Positive = 1622

  • Condition Positive = 2022

  • Condition Negative = 61219

  • Predicted Condition Positive = 2234

  • Predicted Condition Negative = 61007

  • Total Population = 63241

  • Prevalence = 0.0319729289543176

  • Accuracy = 0.983997722996158

  • Positive predictive value, precision = 0.726051924798568

  • False Discovery rate = 0.273948075201432

  • False Omission Rate = 0.00655662464963037

  • Negative Predictive Value = 0.99344337535037

  • True Positive Rate, Recall, Sensitivity = 0.80217606330366

  • False Positive Rate = 0.00999689638837616

  • False Negative Rate = 0.19782393669634

  • Specificity, Selectivity, True Negative rate = 0.990003103611624

  • Positive likelihood ratio = 80.242510489194

  • Negative likelihood ratio = 0.199821531846375

  • Diagnostic Odds Ratio = 401.570890522876

  • \(F_1\) Score = 0.762218045112782

From our plot we selected our optimal threshold as 0.495.

Summary Stats LDA

LDA
Accuracy 0.983997722996158
AUC 0.988876770009065
ROC TRUE
Threshold 0.495
Sensitivity=Recall=Power 0.80217606330366
Specificity=1-FPR 0.990003103611624
FDR 0.273948075201432
Precision=PPV 0.726051924798568

K-Folds Out of Sampling Performance: Quadratic Discriminant Analysis

The next classification method we use is quadratic discriminant analysis (QDA). Like LDA, the QDA classifier results from assuming that the observations from each class are drawn from a Gaussian distribution and plugging estimates for the parameters into Bayes’ theorem in order to perform prediction; unlike LDA, QDA allows each class to have its own covariance matrix. Again, we’ll use the caret package in R to apply the QDA classification procedure with a 10-fold cross-validation evaluation procedure.
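
A minimal sketch of the QDA fit, under the same assumptions as the earlier sketches:

classifier.qda <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data = train,
                               method = "qda",
                               preProcess = c("center", "scale"),
                               trControl = fitControl)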

Receiver Operating Characteristic Curve

Next we’ll look at the ROC curve.

We can also look at each ROC curve from each fold from the qda model.

Precision-Recall Curve

Next, we’ll look at the P-R curve.

We can also look at each P-R curve from each fold from the QDA model. For the 10 different P-R curves, we can even get a sense of the variation in P-R curves.

  • Recall Mean: 0.838777252109447

  • Recall Standard Deviation: 0.0190825467050525

  • Precision Mean: 0.989610677170821

  • Precision Standard Deviation: 0.00701203785618355

To find the optimal threshold, we’ll look at the \(F_1\) score.

yhat <- tuned_call(preds, opt_thresh.qda)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.0009964227
sensitivity
#> [1] 0.867458
results <- tibble::tibble(y=y, yhat=yhat)

caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Down    Up
#>       Down 61158   268
#>       Up      61  1754
#>                                           
#>                Accuracy : 0.9948          
#>                  95% CI : (0.9942, 0.9953)
#>     No Information Rate : 0.968           
#>     P-Value [Acc > NIR] : < 2.2e-16       
#>                                           
#>                   Kappa : 0.9116          
#>                                           
#>  Mcnemar's Test P-Value : < 2.2e-16       
#>                                           
#>             Sensitivity : 0.86746         
#>             Specificity : 0.99900         
#>          Pos Pred Value : 0.96639         
#>          Neg Pred Value : 0.99564         
#>              Prevalence : 0.03197         
#>          Detection Rate : 0.02774         
#>    Detection Prevalence : 0.02870         
#>       Balanced Accuracy : 0.93323         
#>                                           
#>        'Positive' Class : Up              
#> 
cm <- results %>% yardstick::conf_mat(truth = y, estimate = yhat)

trueNegative  = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve   = cm$table[2,2]

conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive 
predictedConditionPositive = truePositve +  falsePositive 
predictedConditionNegative = trueNegative + falseNegative

TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative

prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS

F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))

Now using the optimal threshold we get:

  • True Negative = 61158

  • False Negative = 268

  • False Positive = 61

  • True Positive = 1754

  • Condition Positive = 2022

  • Condition Negative = 61219

  • Predicted Condition Positive = 1815

  • Predicted Condition Negative = 61426

  • Total Population = 63241

  • Prevalence = 0.0319729289543176

  • Accuracy = 0.994797678721083

  • Positive predictive value, precision = 0.966391184573003

  • False Discovery rate = 0.0336088154269972

  • False Omission Rate = 0.00436297333376746

  • Negative Predictive Value = 0.995637026666233

  • True Positive Rate, Recall, Sensitivity = 0.867457962413452

  • False Positive Rate = 0.000996422679233571

  • False Negative Rate = 0.132542037586548

  • Specificity, Selectivity, True Negative rate = 0.999003577320766

  • Positive likelihood ratio = 870.57227870474

  • Negative likelihood ratio = 0.13267423720545

  • Diagnostic Odds Ratio = 6561.72816246636

  • \(F_1\) Score = 0.914255929111285

From our plot we selected our optimal threshold as 0.22.

Summary Stats qda

QDA
Accuracy 0.994797678721083
AUC 0.998217543931761
ROC TRUE
Threshold 0.22
Sensitivity=Recall=Power 0.867457962413452
Specificity=1-FPR 0.999003577320766
FDR 0.0336088154269972
Precision=PPV 0.966391184573003

K-Folds Out of Sampling Performance: Logistic Regression

The next classification method we use is logistic regression (glm). Again, we’ll use the caret package in R to apply the logistic regression classification procedure with a 10-fold cross-validation evaluation procedure.
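
A minimal sketch of the logistic regression fit, under the same assumptions as the earlier sketches:

classifier.glm <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data = train,
                               method = "glm", family = "binomial",
                               preProcess = c("center", "scale"),
                               trControl = fitControl)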

Receiver Operating Characteristic Curve

Next we’ll look at the ROC curve.

We can also look at each ROC curve from each fold from the glm model.

Precision-Recall Curve

Next, we’ll look at the P-R curve.

We can also look at each P-R curve from each fold from the logistic regression model. For the 10 different P-R curves, we can even get a sense of the variation in P-R curves.

  • Recall Mean: 0.885758181729503

  • Recall Standard Deviation: 0.0207876255423969

  • Precision Mean: 0.964588763609958

  • Precision Standard Deviation: 0.00862150259548997

To find the optimal threshold, we’ll look at the \(F_1\) score.

yhat <- tuned_call(preds, opt_thresh.glm)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.001862167
sensitivity
#> [1] 0.9263106
results <- tibble::tibble(y=y, yhat=yhat)

caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Down    Up
#>       Down 61105   149
#>       Up     114  1873
#>                                           
#>                Accuracy : 0.9958          
#>                  95% CI : (0.9953, 0.9963)
#>     No Information Rate : 0.968           
#>     P-Value [Acc > NIR] : < 2e-16         
#>                                           
#>                   Kappa : 0.9323          
#>                                           
#>  Mcnemar's Test P-Value : 0.03604         
#>                                           
#>             Sensitivity : 0.92631         
#>             Specificity : 0.99814         
#>          Pos Pred Value : 0.94263         
#>          Neg Pred Value : 0.99757         
#>              Prevalence : 0.03197         
#>          Detection Rate : 0.02962         
#>    Detection Prevalence : 0.03142         
#>       Balanced Accuracy : 0.96222         
#>                                           
#>        'Positive' Class : Up              
#> 
cm <- results %>% yardstick::conf_mat(truth = y, estimate = yhat)

trueNegative  = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve   = cm$table[2,2]

conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive 
predictedConditionPositive = truePositve +  falsePositive 
predictedConditionNegative = trueNegative + falseNegative

TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative

prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS

F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))

Now using the optimal threshold we get:

  • True Negative = 61105

  • False Negative = 149

  • False Positive = 114

  • True Positive = 1873

  • Condition Positive = 2022

  • Condition Negative = 61219

  • Predicted Condition Positive = 1987

  • Predicted Condition Negative = 61254

  • Total Population = 63241

  • Prevalence = 0.0319729289543176

  • Accuracy = 0.995841305482203

  • Positive predictive value, precision = 0.942627075993961

  • False Discovery rate = 0.0573729240060393

  • False Omission Rate = 0.00243249420446012

  • Negative Predictive Value = 0.99756750579554

  • True Positive Rate, Recall, Sensitivity = 0.926310583580613

  • False Positive Rate = 0.00186216697430536

  • False Negative Rate = 0.0736894164193867

  • Specificity, Selectivity, True Negative rate = 0.998137833025695

  • Positive likelihood ratio = 497.436908914224

  • Negative likelihood ratio = 0.0738268944239986

  • Diagnostic Odds Ratio = 6737.88207935947

  • \(F_1\) Score = 0.934397605387877

From our plot we selected our optimal threshold as 0.214.

Summary Stats glm

Logistic Regression
Accuracy 0.995841305482203
AUC 0.998506949374034
ROC TRUE
Threshold 0.214
Sensitivity=Recall=Power 0.926310583580613
Specificity=1-FPR 0.998137833025695
FDR 0.0573729240060393
Precision=PPV 0.942627075993961

K-Folds Out of Sampling Performance: Random Forest

A random forest fits a collection of decision trees on bootstrapped samples in a way that decorrelates the trees: at each split only a random subset of the predictors is considered. Each tree casts a class vote, and the majority vote determines the class that an observation is assigned to.

The argument mtry indicates the number of predictors that should be considered at each split of a tree. Training across various values of mtry, we can see that our optimal mtry is 2; the number of trees was left at the default of 500. So we’ll proceed with the analysis using these values.
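
A hedged sketch of the random forest fit behind these results (the mtry grid is illustrative; ntree = 500 matches the default noted above, and the other names follow the earlier sketches):

classifier.rf <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data = train,
                              method = "rf", ntree = 500,
                              tuneGrid = data.frame(mtry = 1:3),
                              trControl = fitControl)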

Receiver Operating Characteristic Curve

Next we’ll look at the ROC curve.

We can also look at each ROC curve from each fold from the rf model.

Precision-Recall Curve

Next, we’ll look at the P-R curve.

We can also look at each P-R curve from each fold from the Random Forest model. For the 10 different P-R curves, we can even get a sense of the variation in P-R curves.

  • Recall Mean: 0.948570940837926

  • Recall Standard Deviation: 0.0158306673347466

  • Precision Mean: 0.95907902771856

  • Precision Standard Deviation: 0.0102829571574502

To find the optimal threshold, we’ll look at the \(F_1\) score.

yhat <- tuned_call(preds, opt_thresh.rf)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.0003593656
sensitivity
#> [1] 0.9836795
results <- tibble::tibble(y=y, yhat=yhat)

caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Down    Up
#>       Down 61197    33
#>       Up      22  1989
#>                                           
#>                Accuracy : 0.9991          
#>                  95% CI : (0.9989, 0.9993)
#>     No Information Rate : 0.968           
#>     P-Value [Acc > NIR] : <2e-16          
#>                                           
#>                   Kappa : 0.9859          
#>                                           
#>  Mcnemar's Test P-Value : 0.1775          
#>                                           
#>             Sensitivity : 0.98368         
#>             Specificity : 0.99964         
#>          Pos Pred Value : 0.98906         
#>          Neg Pred Value : 0.99946         
#>              Prevalence : 0.03197         
#>          Detection Rate : 0.03145         
#>    Detection Prevalence : 0.03180         
#>       Balanced Accuracy : 0.99166         
#>                                           
#>        'Positive' Class : Up              
#> 
cm <- results %>% yardstick::conf_mat(truth = y, estimate = yhat)

trueNegative  = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve   = cm$table[2,2]

conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive 
predictedConditionPositive = truePositve +  falsePositive 
predictedConditionNegative = trueNegative + falseNegative

TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative

prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS

F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))

Now using the optimal threshold we get:

  • True Negative = 61197

  • False Negative = 33

  • False Positive = 22

  • True Positive = 1989

  • Condition Positive = 2022

  • Condition Negative = 61219

  • Predicted Condition Positive = 2011

  • Predicted Condition Negative = 61230

  • Total Population = 63241

  • Prevalence = 0.0319729289543176

  • Accuracy = 0.9991303110324

  • Positive predictive value, precision = 0.989060169070114

  • False Discovery rate = 0.0109398309298856

  • False Omission Rate = 0.000538951494365507

  • Negative Predictive Value = 0.999461048505634

  • True Positive Rate, Recall, Sensitivity = 0.983679525222552

  • False Positive Rate = 0.000359365556444895

  • False Negative Rate = 0.0163204747774481

  • Specificity, Selectivity, True Negative rate = 0.999640634443555

  • Positive likelihood ratio = 2737.26712975452

  • Negative likelihood ratio = 0.0163263419023905

  • Diagnostic Odds Ratio = 167659.549586777

  • \(F_1\) Score = 0.986362509298289

From our plot we selected our optimal threshold as 0.495.

Summary Stats Random Forest

Random Forest ( mtry = 2, ntree=500)
Accuracy 0.9991303110324
AUC 0.994483115853513
ROC TRUE
Threshold 0.495
Sensitivity=Recall=Power 0.983679525222552
Specificity=1-FPR 0.999640634443555
FDR 0.0109398309298856
Precision=PPV 0.989060169070114

K-Folds Out of Sampling Performance: Support Vector Machines with Linear Kernel

The support vector machine (SVM) is an extension of the support vector classifier that results from enlarging the feature space using kernels. The linear kernel takes the form of

\[K(x_i,x_{i^\prime}) =\sum_{j=1}^{p}x_{ij}x_{{i^\prime}j}\]

where the kernel quantifies the similarity of a pair of observations using Pearson correlation.

We can compute the distance from each training observation to a given separating hyperplane; the smallest such distance is the margin. When the cost argument is small, the margins will be wide and many support vectors will violate the margin; when the cost argument is large, the margins will be narrow and few support vectors will violate the margin. The parameter C can be thought of as a budget for the amount that the margin can be violated by the n observations. Training across various values of cost, we can see that our optimal C is 1, so we’ll proceed with the analysis using this value.
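
A minimal sketch of the linear-kernel SVM fit, under the same assumptions as the earlier sketches (the cost grid is illustrative and includes the selected value C = 1):

classifier.svm.linear <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data = train,
                                      method = "svmLinear",
                                      preProcess = c("center", "scale"),
                                      tuneGrid = data.frame(C = c(0.01, 0.1, 1, 10, 100)),
                                      trControl = fitControl)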

Receiver Operating Characteristic Curve

Next we’ll look at the ROC curve.

We can also look at each ROC curve from each fold from the SVM with Linear Kernel model.

Precision-Recall Curve

Next, we’ll look at the P-R curve.

We can also look at each P-R curve from each fold from the SVM with Linear Kernel model. For the 10 different P-R curves, we can even get a sense of the variation in P-R curves.

  • Recall Mean: 0.888726040091694

  • Recall Standard Deviation: 0.0158463375204761

  • Precision Mean: 0.965193426042111

  • Precision Standard Deviation: 0.0101404124635792

To find the optimal threshold, we’ll look at the \(F_1\) score.

yhat <- tuned_call(preds, opt_thresh.svm.linear)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.001666149
sensitivity
#> [1] 0.921365
results <- tibble::tibble(y=y, yhat=yhat)

caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Down    Up
#>       Down 61117   159
#>       Up     102  1863
#>                                           
#>                Accuracy : 0.9959          
#>                  95% CI : (0.9953, 0.9964)
#>     No Information Rate : 0.968           
#>     P-Value [Acc > NIR] : < 2.2e-16       
#>                                           
#>                   Kappa : 0.9324          
#>                                           
#>  Mcnemar's Test P-Value : 0.0005276       
#>                                           
#>             Sensitivity : 0.92136         
#>             Specificity : 0.99833         
#>          Pos Pred Value : 0.94809         
#>          Neg Pred Value : 0.99741         
#>              Prevalence : 0.03197         
#>          Detection Rate : 0.02946         
#>    Detection Prevalence : 0.03107         
#>       Balanced Accuracy : 0.95985         
#>                                           
#>        'Positive' Class : Up              
#> 
cm <- results %>% yardstick::conf_mat(truth = y, estimate = yhat)

trueNegative  = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve   = cm$table[2,2]

conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive 
predictedConditionPositive = truePositve +  falsePositive 
predictedConditionNegative = trueNegative + falseNegative

TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative

prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS

F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))

Now using the optimal threshold we get:

  • True Negative = 61117

  • False Negative = 159

  • False Positive = 102

  • True Positive = 1863

  • Condition Positive = 2022

  • Condition Negative = 61219

  • Predicted Condition Positive = 1965

  • Predicted Condition Negative = 61276

  • Total Population = 63241

  • Prevalence = 0.0319729289543176

  • Accuracy = 0.99587293053557

  • Positive predictive value, precision = 0.948091603053435

  • False Discovery rate = 0.0519083969465649

  • False Omission Rate = 0.00259481689405314

  • Negative Predictive Value = 0.997405183105947

  • True Positive Rate, Recall, Sensitivity = 0.921364985163205

  • False Positive Rate = 0.00166614939806269

  • False Negative Rate = 0.0786350148367952

  • Specificity, Selectivity, True Negative rate = 0.998333850601937

  • Positive likelihood ratio = 552.990617908885

  • Negative likelihood ratio = 0.0787662511787844

  • Diagnostic Odds Ratio = 7020.65427302997

  • \(F_1\) Score = 0.934537246049661

From our plot we selected our optimal threshold as 0.233.

Summary Stats SVM Linear

SVM Linear ( Cost = 1)
Accuracy 0.99587293053557
AUC 0.997848791117502
ROC TRUE
Threshold 0.233
Sensitivity=Recall=Power 0.921364985163205
Specificity=1-FPR 0.998333850601937
FDR 0.0519083969465649
Precision=PPV 0.948091603053435

K-Folds Out of Sampling Performance: Support Vector Machines with Radial Basis Function Kernel

The radial kernel of the SVM takes the form

\[K(x_i,x_{i^\prime}) = \exp\left(-\gamma\sum_{j=1}^{p}(x_{ij} - x_{{i^\prime}j})^2\right)\]
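
A hedged sketch of the call that could produce the tuning results printed below; the sigma and C grid values are taken from that output, while the other settings mirror the earlier sketches.

classifier.svm.radial <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data = train,
                                      method = "svmRadial",
                                      preProcess = c("center", "scale"),
                                      tuneGrid = expand.grid(sigma = seq(0.8, 1.8, by = 0.2),
                                                             C = 10^(2:6)),
                                      trControl = fitControl)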

classifier.svm.radial
#> Support Vector Machines with Radial Basis Function Kernel 
#> 
#> 63241 samples
#>     3 predictor
#>     2 classes: 'Down', 'Up' 
#> 
#> Pre-processing: centered (3), scaled (3) 
#> Resampling: Cross-Validated (10 fold) 
#> Summary of sample sizes: 56918, 56917, 56917, 56917, 56916, 56917, ... 
#> Resampling results across tuning parameters:
#> 
#>   sigma  C      Accuracy   Kappa    
#>   0.8    1e+02  0.9969324  0.9500417
#>   0.8    1e+03  0.9970747  0.9522542
#>   0.8    1e+04  0.9972803  0.9557741
#>   0.8    1e+05  0.9972486  0.9551823
#>   0.8    1e+06  0.9606135  0.7975858
#>   1.0    1e+02  0.9970115  0.9512629
#>   1.0    1e+03  0.9970905  0.9525768
#>   1.0    1e+04  0.9972961  0.9561093
#>   1.0    1e+05  0.9973277  0.9563655
#>   1.0    1e+06  0.9929947  0.8547633
#>   1.2    1e+02  0.9970273  0.9515025
#>   1.2    1e+03  0.9972170  0.9546950
#>   1.2    1e+04  0.9972170  0.9547799
#>   1.2    1e+05  0.9973751  0.9570940
#>   1.2    1e+06  0.8931935  0.6229545
#>   1.4    1e+02  0.9970431  0.9517665
#>   1.4    1e+03  0.9972645  0.9555070
#>   1.4    1e+04  0.9972645  0.9555061
#>   1.4    1e+05  0.9972487  0.9549891
#>   1.4    1e+06  0.9597518  0.7016095
#>   1.6    1e+02  0.9970431  0.9517705
#>   1.6    1e+03  0.9972645  0.9554496
#>   1.6    1e+04  0.9973751  0.9572456
#>   1.6    1e+05  0.9972170  0.9543396
#>   1.6    1e+06  0.9453186  0.6469249
#>   1.8    1e+02  0.9971380  0.9533904
#>   1.8    1e+03  0.9972961  0.9560316
#>   1.8    1e+04  0.9973277  0.9564755
#>   1.8    1e+05  0.9972170  0.9543601
#>   1.8    1e+06  0.9545834  0.7186193
#> 
#> Accuracy was used to select the optimal model using the largest value.
#> The final values used for the model were sigma = 1.2 and C = 1e+05.
classifier.svm.radial.bestC <- classifier.svm.radial$bestTune$C
classifier.svm.radial.bestSigma <- classifier.svm.radial$bestTune$sigma

Training across various values of Cost and Sigma, we can see that our optimal C is 1e+05 and our optimal sigma is 1.2. So we’ll proceed with the analysis using these values.

Receiver Operating Characteristic Curve

Next we’ll look at the ROC curve.

We can also look at each ROC curve from each fold from the SVM with Radial Kernel model.

Precision-Recall Curve

Next, we’ll look at the P-R curve.

We can also look at each P-R curve from each fold from the SVM with Radial Kernel model. For the 10 different P-R curves, we can even get a sense of the variation in P-R curves.

  • Recall Mean: 0.950051212017754

  • Recall Standard Deviation: 0.0247402222819946

  • Precision Mean: 0.967355639517883

  • Precision Standard Deviation: 0.0074508221585993

To find the optimal threshold, we’ll look at the \(F_1\) score.

yhat <- tuned_call(preds, opt_thresh.svm.radial)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.001519136
sensitivity
#> [1] 0.9772502
results <- tibble::tibble(y=y, yhat=yhat)

caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Down    Up
#>       Down 61126    46
#>       Up      93  1976
#>                                           
#>                Accuracy : 0.9978          
#>                  95% CI : (0.9974, 0.9982)
#>     No Information Rate : 0.968           
#>     P-Value [Acc > NIR] : < 2.2e-16       
#>                                           
#>                   Kappa : 0.9649          
#>                                           
#>  Mcnemar's Test P-Value : 9.553e-05       
#>                                           
#>             Sensitivity : 0.97725         
#>             Specificity : 0.99848         
#>          Pos Pred Value : 0.95505         
#>          Neg Pred Value : 0.99925         
#>              Prevalence : 0.03197         
#>          Detection Rate : 0.03125         
#>    Detection Prevalence : 0.03272         
#>       Balanced Accuracy : 0.98787         
#>                                           
#>        'Positive' Class : Up              
#> 
cm <- results %>% yardstick::conf_mat(truth = y, estimate = yhat)

trueNegative  = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve   = cm$table[2,2]

conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive 
predictedConditionPositive = truePositve +  falsePositive 
predictedConditionNegative = trueNegative + falseNegative

TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative

prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS

F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))

Now using the optimal threshold we get:

  • True Negative = 61126

  • False Negative = 46

  • False Positive = 93

  • True Positive = 1976

  • Condition Positive = 2022

  • Condition Negative = 61219

  • Predicted Condition Positive = 2069

  • Predicted Condition Negative = 61172

  • Total Population = 63241

  • Prevalence = 0.0319729289543176

  • Accuracy = 0.997802058790974

  • Positive predictive value, precision = 0.955050749154181

  • False Discovery rate = 0.0449492508458192

  • False Omission Rate = 0.00075197802916367

  • Negative Predictive Value = 0.999248021970836

  • True Positive Rate, Recall, Sensitivity = 0.977250247279921

  • False Positive Rate = 0.00151913621588069

  • False Negative Rate = 0.0227497527200791

  • Specificity, Selectivity, True Negative rate = 0.998480863784119

  • Positive likelihood ratio = 643.293364389564

  • Negative likelihood ratio = 0.0227843652745235

  • Diagnostic Odds Ratio = 28233.9822346891

  • \(F_1\) Score = 0.966022977267172

From our plot we selected our optimal threshold as 0.335.

Summary Stats SVM Radial

SVM Radial ( Cost = 1e+05, Sigma = 1.2)
Accuracy 0.997802058790974
AUC 0.999820301064707
ROC TRUE
Threshold 0.335
Sensitivity=Recall=Power 0.977250247279921
Specificity=1-FPR 0.998480863784119
FDR 0.0449492508458192
Precision=PPV 0.955050749154181

K-Folds Out of Sampling Performance: Support Vector Machines with Polynomial Kernel

The polynomial kernel of the SVM takes the form of

\[K(x_i,x_{i^\prime}) = \left(1+\sum_{j=1}^{p}x_{ij}x_{{i^\prime}j}\right)^d\]
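
A hedged sketch of the call that could produce the tuning results printed below; the degree, scale and C grid values are taken from that output, while the other settings mirror the earlier sketches.

classifier.svm.poly <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data = train,
                                    method = "svmPoly",
                                    preProcess = c("center", "scale"),
                                    tuneGrid = expand.grid(degree = 4:6,
                                                           scale = c(0.6, 0.8, 1.0, 1.2),
                                                           C = c(100, 1000, 10000)),
                                    trControl = fitControl)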

classifier.svm.poly
#> Support Vector Machines with Polynomial Kernel 
#> 
#> 63241 samples
#>     3 predictor
#>     2 classes: 'Down', 'Up' 
#> 
#> Pre-processing: centered (3), scaled (3) 
#> Resampling: Cross-Validated (10 fold) 
#> Summary of sample sizes: 56918, 56917, 56917, 56917, 56916, 56917, ... 
#> Resampling results across tuning parameters:
#> 
#>   scale  C      degree  Accuracy   Kappa     
#>   0.6      100  4       0.9969798  0.95073253
#>   0.6      100  5       0.9969798  0.95039113
#>   0.6      100  6       0.9969482  0.94973290
#>   0.6     1000  4       0.9971222  0.95283161
#>   0.6     1000  5       0.9969166  0.94921460
#>   0.6     1000  6       0.9953667  0.92889594
#>   0.6    10000  4       0.9637691  0.66318022
#>   0.6    10000  5       0.9450249  0.33139394
#>   0.6    10000  6       0.8959164  0.35610998
#>   0.8      100  4       0.9969798  0.95056708
#>   0.8      100  5       0.9969957  0.95060633
#>   0.8      100  6       0.9969166  0.94916759
#>   0.8     1000  4       0.9970905  0.95235627
#>   0.8     1000  5       0.9799512  0.82988695
#>   0.8     1000  6       0.9256509  0.60509759
#>   0.8    10000  4       0.9056307  0.43616018
#>   0.8    10000  5       0.9268660  0.15758162
#>   0.8    10000  6       0.9594508  0.20609072
#>   1.0      100  4       0.9970747  0.95209703
#>   1.0      100  5       0.9969324  0.94944514
#>   1.0      100  6       0.9967110  0.94559460
#>   1.0     1000  4       0.9968376  0.94849150
#>   1.0     1000  5       0.9475411  0.61267836
#>   1.0     1000  6       0.9569166  0.58566453
#>   1.0    10000  4       0.9779145  0.40916541
#>   1.0    10000  5       0.9518177  0.25844373
#>   1.0    10000  6       0.8896854  0.05280171
#>   1.2      100  4       0.9971064  0.95260152
#>   1.2      100  5       0.9958889  0.92687426
#>   1.2      100  6       0.9585005  0.76295369
#>   1.2     1000  4       0.9627126  0.77478578
#>   1.2     1000  5       0.9767552  0.56539850
#>   1.2     1000  6       0.9179317  0.38212066
#>   1.2    10000  4       0.9635912  0.24989693
#>   1.2    10000  5       0.9750423  0.31543508
#>   1.2    10000  6       0.9179759  0.01846190
#> 
#> Accuracy was used to select the optimal model using the largest value.
#> The final values used for the model were degree = 4, scale = 0.6 and C = 1000.
classifier.svm.poly.bestC <- classifier.svm.poly$bestTune$C
classifier.svm.poly.bestDegree <- classifier.svm.poly$bestTune$degree
classifier.svm.poly.bestScale <- classifier.svm.poly$bestTune$scale

Training across various values of Cost, Degree and Scale, we can see that our optimal C is 1000, our optimal degree is 4, and our optimal scale is 0.6. So we’ll proceed with the analysis using these values.

We can look at the plots of scale versus accuracy across the various cost and polynomial degree values.

Receiver Operating Characteristic Curve

Next we’ll look at the ROC curve.

We can also look at each ROC curve from each fold from the SVM with Polynomial Kernel model.

Precision-Recall Curve

Next, we’ll look at the P-R curve.

We can also look at each P-R curve from each fold from the SVM with Polynomial Kernel model. For the 10 different P-R curves, we can even get a sense of the variation in P-R curves.

  • Recall Mean: 0.942640101448568

  • Recall Standard Deviation: 0.0259690507584868

  • Precision Mean: 0.966767401633039

  • Precision Standard Deviation: 0.013585405388514

To find the optimal threshold, we’ll look at the \(F_1\) score.

yhat <- tuned_call(preds, opt_thresh.svm.poly)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.001176105
sensitivity
#> [1] 0.9495549
results <- tibble::tibble(y=y, yhat=yhat)

caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Down    Up
#>       Down 61147   102
#>       Up      72  1920
#>                                           
#>                Accuracy : 0.9972          
#>                  95% CI : (0.9968, 0.9976)
#>     No Information Rate : 0.968           
#>     P-Value [Acc > NIR] : < 2e-16         
#>                                           
#>                   Kappa : 0.9552          
#>                                           
#>  Mcnemar's Test P-Value : 0.02791         
#>                                           
#>             Sensitivity : 0.94955         
#>             Specificity : 0.99882         
#>          Pos Pred Value : 0.96386         
#>          Neg Pred Value : 0.99833         
#>              Prevalence : 0.03197         
#>          Detection Rate : 0.03036         
#>    Detection Prevalence : 0.03150         
#>       Balanced Accuracy : 0.97419         
#>                                           
#>        'Positive' Class : Up              
#> 
cm <- results %>% yardstick::conf_mat(truth = y, estimate = yhat)

trueNegative  = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve   = cm$table[2,2]

conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive 
predictedConditionPositive = truePositve +  falsePositive 
predictedConditionNegative = trueNegative + falseNegative

TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative

prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS

F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
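For reference, the quantities derived above follow the standard contingency-table definitions, where \(TP\), \(FP\), \(TN\) and \(FN\) are the true/false positive/negative counts:

\[
\mathrm{PPV} = \frac{TP}{TP+FP}, \quad
\mathrm{TPR} = \frac{TP}{TP+FN}, \quad
\mathrm{TNR} = \frac{TN}{TN+FP}, \quad
\mathrm{FPR} = \frac{FP}{TN+FP}, \quad
\mathrm{FNR} = \frac{FN}{TP+FN}, \quad
\mathrm{FDR} = \frac{FP}{TP+FP} = 1-\mathrm{PPV}
\]

\[
\mathrm{LR}^{+} = \frac{\mathrm{TPR}}{\mathrm{FPR}}, \qquad
\mathrm{LR}^{-} = \frac{\mathrm{FNR}}{\mathrm{TNR}}, \qquad
\mathrm{DOR} = \frac{\mathrm{LR}^{+}}{\mathrm{LR}^{-}}, \qquad
F_1 = \frac{2\,\mathrm{PPV}\cdot\mathrm{TPR}}{\mathrm{PPV}+\mathrm{TPR}}
\]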

Now using the optimal threshold we get:

  • True Negative = 61147

  • False Negative = 102

  • False Positive = 72

  • True Positive = 1920

  • Condition Positive = 2022

  • Condition Negative = 61219

  • Predicted Condition Positive = 1992

  • Predicted Condition Negative = 61249

  • Total Population = 63241

  • Prevalence = 0.0319729289543176

  • Accuracy = 0.997248620357047

  • Positive predictive value, precision = 0.963855421686747

  • False Discovery rate = 0.036144578313253

  • False Omission Rate = 0.00166533331156427

  • Negative Predictive Value = 0.998334666688436

  • True Positive Rate, Recall, Sensitivity = 0.949554896142433

  • False Positive Rate = 0.00117610545745602

  • False Negative Rate = 0.0504451038575668

  • Specificity, Selectivity, True Negative rate = 0.998823894542544

  • Positive likelihood ratio = 807.37223870755

  • Negative likelihood ratio = 0.0505045024785579

  • Diagnostic Odds Ratio = 15986.1437908497

  • \(F_1\) Score = 0.956651718983558

From our plot, we selected 0.39 as our optimal threshold.
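As a minimal sketch of how such a value can be read off the \(F_1\) grid search (assuming a thresholds vector and its matching F_1_scores, as in the tuning loops used later for the hold-out evaluations); note that which.max returns the first maximum and silently drops NAs, whereas the loops below keep the last maximising threshold:

# Threshold with the largest F1 score over the search grid.
opt_thresh <- thresholds[which.max(F_1_scores)]
opt_thresh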

Summary Stats SVM Poly

| Metric | SVM Polynomial (Cost = 1000, Degree = 4, Scale = 0.6) |
|---|---|
| Accuracy | 0.997248620357047 |
| AUC | 0.999714294526813 |
| ROC | TRUE |
| Threshold | 0.39 |
| Sensitivity=Recall=Power | 0.949554896142433 |
| Specificity=1-FPR | 0.998823894542544 |
| FDR | 0.036144578313253 |
| Precision=PPV | 0.963855421686747 |

K-Fold Out-of-Sample Performance: Table 2

| Metric | KNN (k = 5) | LDA | QDA | Logistic Regression |
|---|---|---|---|---|
| Accuracy | 0.99791274647776 | 0.983997722996158 | 0.994797678721083 | 0.995841305482203 |
| AUC | 0.999862454860983 | 0.988876770009065 | 0.998217543931761 | 0.998506949374034 |
| ROC | TRUE | TRUE | TRUE | TRUE |
| Threshold | 0.499 | 0.495 | 0.22 | 0.214 |
| Sensitivity=Recall=Power | 0.969337289812067 | 0.80217606330366 | 0.867457962413452 | 0.926310583580613 |
| Specificity=1-FPR | 0.998856564138584 | 0.990003103611624 | 0.999003577320766 | 0.998137833025695 |
| FDR | 0.0344827586206897 | 0.273948075201432 | 0.0336088154269972 | 0.0573729240060393 |
| Precision=PPV | 0.96551724137931 | 0.726051924798568 | 0.966391184573003 | 0.942627075993961 |

| Metric | Random Forest (mtry = 2, ntree = 500) | SVM Linear (Cost = 1) | SVM Radial (Cost = 1e+05, Sigma = 1.2) | SVM Polynomial (Cost = 1000, Degree = 4, Scale = 0.6) |
|---|---|---|---|---|
| Accuracy | 0.9991303110324 | 0.99587293053557 | 0.997802058790974 | 0.997248620357047 |
| AUC | 0.994483115853513 | 0.997848791117502 | 0.999820301064707 | 0.999714294526813 |
| ROC | TRUE | TRUE | TRUE | TRUE |
| Threshold | 0.495 | 0.233 | 0.335 | 0.39 |
| Sensitivity=Recall=Power | 0.983679525222552 | 0.921364985163205 | 0.977250247279921 | 0.949554896142433 |
| Specificity=1-FPR | 0.999640634443555 | 0.998333850601937 | 0.998480863784119 | 0.998823894542544 |
| FDR | 0.0109398309298856 | 0.0519083969465649 | 0.0449492508458192 | 0.036144578313253 |
| Precision=PPV | 0.989060169070114 | 0.948091603053435 | 0.955050749154181 | 0.963855421686747 |

Hold-Out Test Sample Performance: K-Nearest Neighbors

y <- fho$BlueTarp_rest
preds <- predict(classifier.knn, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1977410    2501
#>       Up     12287   11979
#>                                           
#>                Accuracy : 0.9926          
#>                  95% CI : (0.9925, 0.9927)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : 0.9948          
#>                                           
#>                   Kappa : 0.6148          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.827279        
#>             Specificity : 0.993825        
#>          Pos Pred Value : 0.493654        
#>          Neg Pred Value : 0.998737        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.005977        
#>    Detection Prevalence : 0.012108        
#>       Balanced Accuracy : 0.910552        
#>                                           
#>        'Positive' Class : Up              
#> 

thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
  yhat <- tuned_call(preds, thresholds[i])
  results <- tibble::tibble(y=y, yhat=yhat)
  cm <- results %>% yardstick::conf_mat(truth=y, yhat)
  
  trueNegative  = cm$table[1,1]
  falseNegative = cm$table[1,2]
  falsePositive = cm$table[2,1]
  truePositve   = cm$table[2,2]

  conditionPositive = truePositve + falseNegative
  conditionNegative = trueNegative + falsePositive 
  predictedConditionPositive = truePositve +  falsePositive 
  predictedConditionNegative = trueNegative + falseNegative
  
  TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative

  PPV = truePositve / predictedConditionPositive
  TPR = truePositve / conditionPositive

  F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
  F_1_scores[i] <- F_1_score
}

#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
# Use the smaller of the CV-tuned threshold and the (last) threshold that maximises F1 on the hold-out predictions
fho.knn.Threshold = min(c(knn.Threshold, thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.knn.ROC = "TRUE"

statistics = stat_fho(classifier.knn, fho, fho.knn.Threshold)

fho.knn.accuracy = statistics[1]
fho.knn.sensitivity = statistics[2]
fho.knn.specificity = statistics[3]
fho.knn.FDR = statistics[4]
fho.knn.precision = statistics[5]

yhat <- tuned_call(preds, fho.knn.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1977282    2499
#>       Up     12415   11981
#>                                           
#>                Accuracy : 0.9926          
#>                  95% CI : (0.9924, 0.9927)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : 0.9998          
#>                                           
#>                   Kappa : 0.6129          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.827417        
#>             Specificity : 0.993760        
#>          Pos Pred Value : 0.491105        
#>          Neg Pred Value : 0.998738        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.005978        
#>    Detection Prevalence : 0.012173        
#>       Balanced Accuracy : 0.910589        
#>                                           
#>        'Positive' Class : Up              
#> 

Hold-Out Test Sample Performance: Linear Discriminant Analysis

y <- fho$BlueTarp_rest
preds <- predict(classifier.lda, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1955452    2332
#>       Up     34245   12148
#>                                           
#>                Accuracy : 0.9817          
#>                  95% CI : (0.9816, 0.9819)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : 1               
#>                                           
#>                   Kappa : 0.3924          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.838950        
#>             Specificity : 0.982789        
#>          Pos Pred Value : 0.261850        
#>          Neg Pred Value : 0.998809        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.006061        
#>    Detection Prevalence : 0.023148        
#>       Balanced Accuracy : 0.910870        
#>                                           
#>        'Positive' Class : Up              
#> 

thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
  yhat <- tuned_call(preds, thresholds[i])
  results <- tibble::tibble(y=y, yhat=yhat)
  cm <- results %>% yardstick::conf_mat(truth=y, yhat)
  
  trueNegative  = cm$table[1,1]
  falseNegative = cm$table[1,2]
  falsePositive = cm$table[2,1]
  truePositve   = cm$table[2,2]
  
  conditionPositive = truePositve + falseNegative
  conditionNegative = trueNegative + falsePositive 
  predictedConditionPositive = truePositve +  falsePositive 
  predictedConditionNegative = trueNegative + falseNegative
  
  TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
  
  PPV = truePositve / predictedConditionPositive
  TPR = truePositve / conditionPositive
  
  F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
  F_1_scores[i] <- F_1_score
}

#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.lda.Threshold = min(c(lda.Threshold, thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.lda.ROC = "TRUE"

statistics = stat_fho(classifier.lda, fho, fho.lda.Threshold)

fho.lda.accuracy = statistics[1]
fho.lda.sensitivity = statistics[2]
fho.lda.specificity = statistics[3]
fho.lda.FDR = statistics[4]
fho.lda.precision = statistics[5]

yhat <- tuned_call(preds, fho.lda.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1955437    2320
#>       Up     34260   12160
#>                                           
#>                Accuracy : 0.9817          
#>                  95% CI : (0.9816, 0.9819)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : 1               
#>                                           
#>                   Kappa : 0.3927          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.839779        
#>             Specificity : 0.982781        
#>          Pos Pred Value : 0.261956        
#>          Neg Pred Value : 0.998815        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.006067        
#>    Detection Prevalence : 0.023162        
#>       Balanced Accuracy : 0.911280        
#>                                           
#>        'Positive' Class : Up              
#> 

Summary

| Metric | FHO LDA |
|---|---|
| Accuracy | 0.981748119053357 |
| AUC | 0.992115543668651 |
| ROC | TRUE |
| Threshold | 0.493 |
| Sensitivity=Recall=Power | 0.839779005524862 |
| Specificity=1-FPR | 0.982781297855905 |
| FDR | 0.738043946574752 |
| Precision=PPV | 0.261956053425248 |

Hold-Out Test Sample Performance: Quadratic Discriminant Analysis

y <- fho$BlueTarp_rest
preds <- predict(classifier.qda, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1986046    4422
#>       Up      3651   10058
#>                                           
#>                Accuracy : 0.996           
#>                  95% CI : (0.9959, 0.9961)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : < 2.2e-16       
#>                                           
#>                   Kappa : 0.7116          
#>                                           
#>  Mcnemar's Test P-Value : < 2.2e-16       
#>                                           
#>             Sensitivity : 0.694613        
#>             Specificity : 0.998165        
#>          Pos Pred Value : 0.733679        
#>          Neg Pred Value : 0.997778        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.005019        
#>    Detection Prevalence : 0.006840        
#>       Balanced Accuracy : 0.846389        
#>                                           
#>        'Positive' Class : Up              
#> 

thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
  yhat <- tuned_call(preds, thresholds[i])
  results <- tibble::tibble(y=y, yhat=yhat)
  cm <- results %>% yardstick::conf_mat(truth=y, yhat)
  
  trueNegative  = cm$table[1,1]
  falseNegative = cm$table[1,2]
  falsePositive = cm$table[2,1]
  truePositve   = cm$table[2,2]
  
  conditionPositive = truePositve + falseNegative
  conditionNegative = trueNegative + falsePositive 
  predictedConditionPositive = truePositve +  falsePositive 
  predictedConditionNegative = trueNegative + falseNegative
  
  TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
  
  PPV = truePositve / predictedConditionPositive
  TPR = truePositve / conditionPositive
  
  F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
  F_1_scores[i] <- F_1_score
}

#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.qda.Threshold = min(c(qda.Threshold,thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.qda.ROC = "TRUE"

statistics = stat_fho(classifier.qda, fho, fho.qda.Threshold)

fho.qda.accuracy = statistics[1]
fho.qda.sensitivity = statistics[2]
fho.qda.specificity = statistics[3]
fho.qda.FDR = statistics[4]
fho.qda.precision = statistics[5]

yhat <- tuned_call(preds, fho.qda.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1982981    3464
#>       Up      6716   11016
#>                                          
#>                Accuracy : 0.9949         
#>                  95% CI : (0.9948, 0.995)
#>     No Information Rate : 0.9928         
#>     P-Value [Acc > NIR] : < 2.2e-16      
#>                                          
#>                   Kappa : 0.6814         
#>                                          
#>  Mcnemar's Test P-Value : < 2.2e-16      
#>                                          
#>             Sensitivity : 0.760773       
#>             Specificity : 0.996625       
#>          Pos Pred Value : 0.621250       
#>          Neg Pred Value : 0.998256       
#>              Prevalence : 0.007225       
#>          Detection Rate : 0.005497       
#>    Detection Prevalence : 0.008848       
#>       Balanced Accuracy : 0.878699       
#>                                          
#>        'Positive' Class : Up             
#> 

Summary

| Metric | FHO QDA |
|---|---|
| Accuracy | 0.994920608309546 |
| AUC | 0.991500075918026 |
| ROC | TRUE |
| Threshold | 0.22 |
| Sensitivity=Recall=Power | 0.760773480662983 |
| Specificity=1-FPR | 0.996624611687106 |
| FDR | 0.378750281976088 |
| Precision=PPV | 0.621249718023912 |

Hold-Out Test Sample Performance: Logistic Regression

y <- fho$BlueTarp_rest
preds <- predict(classifier.glm, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1969383     170
#>       Up     20314   14310
#>                                           
#>                Accuracy : 0.9898          
#>                  95% CI : (0.9896, 0.9899)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : 1               
#>                                           
#>                   Kappa : 0.5786          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.988260        
#>             Specificity : 0.989790        
#>          Pos Pred Value : 0.413297        
#>          Neg Pred Value : 0.999914        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.007140        
#>    Detection Prevalence : 0.017276        
#>       Balanced Accuracy : 0.989025        
#>                                           
#>        'Positive' Class : Up              
#> 

thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
  yhat <- tuned_call(preds, thresholds[i])
  results <- tibble::tibble(y=y, yhat=yhat)
  cm <- results %>% yardstick::conf_mat(truth=y, yhat)
  
  trueNegative  = cm$table[1,1]
  falseNegative = cm$table[1,2]
  falsePositive = cm$table[2,1]
  truePositve   = cm$table[2,2]
  
  conditionPositive = truePositve + falseNegative
  conditionNegative = trueNegative + falsePositive 
  predictedConditionPositive = truePositve +  falsePositive 
  predictedConditionNegative = trueNegative + falseNegative
  
  TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
  
  PPV = truePositve / predictedConditionPositive
  TPR = truePositve / conditionPositive
  
  F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
  F_1_scores[i] <- F_1_score
}

#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.glm.Threshold = min(c(glm.Threshold, thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.glm.ROC = "TRUE"

statistics = stat_fho(classifier.glm, fho, fho.glm.Threshold)

fho.glm.accuracy = statistics[1]
fho.glm.sensitivity = statistics[2]
fho.glm.specificity = statistics[3]
fho.glm.FDR = statistics[4]
fho.glm.precision = statistics[5]

yhat <- tuned_call(preds, fho.glm.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1920306      97
#>       Up     69391   14383
#>                                           
#>                Accuracy : 0.9653          
#>                  95% CI : (0.9651, 0.9656)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : 1               
#>                                           
#>                   Kappa : 0.2839          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.993301        
#>             Specificity : 0.965125        
#>          Pos Pred Value : 0.171688        
#>          Neg Pred Value : 0.999949        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.007177        
#>    Detection Prevalence : 0.041800        
#>       Balanced Accuracy : 0.979213        
#>                                           
#>        'Positive' Class : Up              
#> 

Summary

| Metric | FHO Logistic Regression |
|---|---|
| Accuracy | 0.965328411612348 |
| AUC | 0.999413070684321 |
| ROC | TRUE |
| Threshold | 0.214 |
| Sensitivity=Recall=Power | 0.993301104972376 |
| Specificity=1-FPR | 0.965124840616436 |
| FDR | 0.828311886742904 |
| Precision=PPV | 0.171688113257096 |

Hold-Out Test Sample Performance: Random Forest

y <- fho$BlueTarp_rest
preds <- predict(classifier.rf, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1979710    3002
#>       Up      9987   11478
#>                                           
#>                Accuracy : 0.9935          
#>                  95% CI : (0.9934, 0.9936)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : < 2.2e-16       
#>                                           
#>                   Kappa : 0.6355          
#>                                           
#>  Mcnemar's Test P-Value : < 2.2e-16       
#>                                           
#>             Sensitivity : 0.792680        
#>             Specificity : 0.994981        
#>          Pos Pred Value : 0.534731        
#>          Neg Pred Value : 0.998486        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.005727        
#>    Detection Prevalence : 0.010710        
#>       Balanced Accuracy : 0.893830        
#>                                           
#>        'Positive' Class : Up              
#> 

thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
  yhat <- tuned_call(preds, thresholds[i])
  results <- tibble::tibble(y=y, yhat=yhat)
  cm <- results %>% yardstick::conf_mat(truth=y, yhat)
  
  trueNegative  = cm$table[1,1]
  falseNegative = cm$table[1,2]
  falsePositive = cm$table[2,1]
  truePositve   = cm$table[2,2]
  
  conditionPositive = truePositve + falseNegative
  conditionNegative = trueNegative + falsePositive 
  predictedConditionPositive = truePositve +  falsePositive 
  predictedConditionNegative = trueNegative + falseNegative
  
  TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
  
  PPV = truePositve / predictedConditionPositive
  TPR = truePositve / conditionPositive
  
  F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
  F_1_scores[i] <- F_1_score
}

#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.rf.Threshold = min(c(rf.Threshold,thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.rf.ROC = "TRUE"

statistics = stat_fho(classifier.rf, fho, fho.rf.Threshold)

fho.rf.accuracy = statistics[1]
fho.rf.sensitivity = statistics[2]
fho.rf.specificity = statistics[3]
fho.rf.FDR = statistics[4]
fho.rf.precision = statistics[5]

yhat <- tuned_call(preds, fho.rf.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1979537    2993
#>       Up     10160   11487
#>                                           
#>                Accuracy : 0.9934          
#>                  95% CI : (0.9933, 0.9935)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : < 2.2e-16       
#>                                           
#>                   Kappa : 0.6327          
#>                                           
#>  Mcnemar's Test P-Value : < 2.2e-16       
#>                                           
#>             Sensitivity : 0.793301        
#>             Specificity : 0.994894        
#>          Pos Pred Value : 0.530651        
#>          Neg Pred Value : 0.998490        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.005732        
#>    Detection Prevalence : 0.010801        
#>       Balanced Accuracy : 0.894097        
#>                                           
#>        'Positive' Class : Up              
#> 

Summary

| Metric | FHO Random Forest |
|---|---|
| Accuracy | 0.993437206394445 |
| AUC | 0.980334789679392 |
| ROC | TRUE |
| Threshold | 0.495 |
| Sensitivity=Recall=Power | 0.793301104972376 |
| Specificity=1-FPR | 0.994893694869118 |
| FDR | 0.469349101492124 |
| Precision=PPV | 0.530650898507876 |

Hold-Out Test Sample Performance: Support Vector Machines with Linear Kernel

y <- fho$BlueTarp_rest
preds <- predict(classifier.svm.linear, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1949566     162
#>       Up     40131   14318
#>                                           
#>                Accuracy : 0.9799          
#>                  95% CI : (0.9797, 0.9801)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : 1               
#>                                           
#>                   Kappa : 0.4087          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.988812        
#>             Specificity : 0.979831        
#>          Pos Pred Value : 0.262962        
#>          Neg Pred Value : 0.999917        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.007144        
#>    Detection Prevalence : 0.027168        
#>       Balanced Accuracy : 0.984321        
#>                                           
#>        'Positive' Class : Up              
#> 

thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
  yhat <- tuned_call(preds, thresholds[i])
  results <- tibble::tibble(y=y, yhat=yhat)
  cm <- results %>% yardstick::conf_mat(truth=y, yhat)
  
  trueNegative  = cm$table[1,1]
  falseNegative = cm$table[1,2]
  falsePositive = cm$table[2,1]
  truePositve   = cm$table[2,2]
  
  conditionPositive = truePositve + falseNegative
  conditionNegative = trueNegative + falsePositive 
  predictedConditionPositive = truePositve +  falsePositive 
  predictedConditionNegative = trueNegative + falseNegative
  
  TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
  
  PPV = truePositve / predictedConditionPositive
  TPR = truePositve / conditionPositive
  
  F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
  F_1_scores[i] <- F_1_score
}

#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.svm.linear.Threshold = min(c(svm.linear.Threshold, thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.svm.linear.ROC = "TRUE"

statistics = stat_fho(classifier.svm.linear, fho, fho.svm.linear.Threshold)

fho.svm.linear.accuracy = statistics[1]
fho.svm.linear.sensitivity = statistics[2]
fho.svm.linear.specificity = statistics[3]
fho.svm.linear.FDR = statistics[4]
fho.svm.linear.precision = statistics[5]

yhat <- tuned_call(preds, fho.svm.linear.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1894813     103
#>       Up     94884   14377
#>                                           
#>                Accuracy : 0.9526          
#>                  95% CI : (0.9523, 0.9529)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : 1               
#>                                           
#>                   Kappa : 0.2225          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.992887        
#>             Specificity : 0.952312        
#>          Pos Pred Value : 0.131584        
#>          Neg Pred Value : 0.999946        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.007174        
#>    Detection Prevalence : 0.054517        
#>       Balanced Accuracy : 0.972600        
#>                                           
#>        'Positive' Class : Up              
#> 

Hold-Out Test Sample Performance: Support Vector Machines with Radial Basis Function Kernel

y <- fho$BlueTarp_rest
preds <- predict(classifier.svm.radial, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1977443    3524
#>       Up     12254   10956
#>                                          
#>                Accuracy : 0.9921         
#>                  95% CI : (0.992, 0.9922)
#>     No Information Rate : 0.9928         
#>     P-Value [Acc > NIR] : 1              
#>                                          
#>                   Kappa : 0.5776         
#>                                          
#>  Mcnemar's Test P-Value : <2e-16         
#>                                          
#>             Sensitivity : 0.756630       
#>             Specificity : 0.993841       
#>          Pos Pred Value : 0.472038       
#>          Neg Pred Value : 0.998221       
#>              Prevalence : 0.007225       
#>          Detection Rate : 0.005467       
#>    Detection Prevalence : 0.011581       
#>       Balanced Accuracy : 0.875236       
#>                                          
#>        'Positive' Class : Up             
#> 

thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
  yhat <- tuned_call(preds, thresholds[i])
  results <- tibble::tibble(y=y, yhat=yhat)
  cm <- results %>% yardstick::conf_mat(truth=y, yhat)
  
  trueNegative  = cm$table[1,1]
  falseNegative = cm$table[1,2]
  falsePositive = cm$table[2,1]
  truePositve   = cm$table[2,2]
  
  conditionPositive = truePositve + falseNegative
  conditionNegative = trueNegative + falsePositive 
  predictedConditionPositive = truePositve +  falsePositive 
  predictedConditionNegative = trueNegative + falseNegative
  
  TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
  
  PPV = truePositve / predictedConditionPositive
  TPR = truePositve / conditionPositive
  
  F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
  F_1_scores[i] <- F_1_score
}

#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.svm.radial.Threshold = min(c(svm.radial.Threshold, thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.svm.radial.ROC = "TRUE"

statistics = stat_fho(classifier.svm.radial, fho, fho.svm.radial.Threshold)

fho.svm.radial.accuracy = statistics[1]
fho.svm.radial.sensitivity = statistics[2]
fho.svm.radial.specificity = statistics[3]
fho.svm.radial.FDR = statistics[4]
fho.svm.radial.precision = statistics[5]

yhat <- tuned_call(preds, fho.svm.radial.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1973658    3121
#>       Up     16039   11359
#>                                           
#>                Accuracy : 0.9904          
#>                  95% CI : (0.9903, 0.9906)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : 1               
#>                                           
#>                   Kappa : 0.5381          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.784461        
#>             Specificity : 0.991939        
#>          Pos Pred Value : 0.414592        
#>          Neg Pred Value : 0.998421        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.005668        
#>    Detection Prevalence : 0.013670        
#>       Balanced Accuracy : 0.888200        
#>                                           
#>        'Positive' Class : Up              
#> 

Hold-Out Test Sample Performance: Support Vector Machines with Polynomial Kernel

y <- fho$BlueTarp_rest
preds <- predict(classifier.svm.poly, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1979177    7205
#>       Up     10520    7275
#>                                          
#>                Accuracy : 0.9912         
#>                  95% CI : (0.991, 0.9913)
#>     No Information Rate : 0.9928         
#>     P-Value [Acc > NIR] : 1              
#>                                          
#>                   Kappa : 0.4464         
#>                                          
#>  Mcnemar's Test P-Value : <2e-16         
#>                                          
#>             Sensitivity : 0.502417       
#>             Specificity : 0.994713       
#>          Pos Pred Value : 0.408823       
#>          Neg Pred Value : 0.996373       
#>              Prevalence : 0.007225       
#>          Detection Rate : 0.003630       
#>    Detection Prevalence : 0.008879       
#>       Balanced Accuracy : 0.748565       
#>                                          
#>        'Positive' Class : Up             
#> 

thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
  yhat <- tuned_call(preds, thresholds[i])
  results <- tibble::tibble(y=y, yhat=yhat)
  cm <- results %>% yardstick::conf_mat(truth=y, yhat)
  
  trueNegative  = cm$table[1,1]
  falseNegative = cm$table[1,2]
  falsePositive = cm$table[2,1]
  truePositve   = cm$table[2,2]
  
  conditionPositive = truePositve + falseNegative
  conditionNegative = trueNegative + falsePositive 
  predictedConditionPositive = truePositve +  falsePositive 
  predictedConditionNegative = trueNegative + falseNegative
  
  TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
  
  PPV = truePositve / predictedConditionPositive
  TPR = truePositve / conditionPositive
  
  F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
  F_1_scores[i] <- F_1_score
}

#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.svm.poly.Threshold = min(c(svm.poly.Threshold, thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.svm.poly.ROC = "TRUE"

statistics = stat_fho(classifier.svm.poly, fho, fho.svm.poly.Threshold)

fho.svm.poly.accuracy = statistics[1]
fho.svm.poly.sensitivity = statistics[2]
fho.svm.poly.specificity = statistics[3]
fho.svm.poly.FDR = statistics[4]
fho.svm.poly.precision = statistics[5]

yhat <- tuned_call(preds, fho.svm.poly.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1977300    7107
#>       Up     12397    7373
#>                                           
#>                Accuracy : 0.9903          
#>                  95% CI : (0.9901, 0.9904)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : 1               
#>                                           
#>                   Kappa : 0.4258          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.509185        
#>             Specificity : 0.993769        
#>          Pos Pred Value : 0.372939        
#>          Neg Pred Value : 0.996419        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.003679        
#>    Detection Prevalence : 0.009864        
#>       Balanced Accuracy : 0.751477        
#>                                           
#>        'Positive' Class : Up              
#> 

Hold-Out Test Sample Performance: Table 3

| Metric | FHO KNN | FHO LDA | FHO QDA | FHO Logistic Regression |
|---|---|---|---|---|
| Accuracy | 0.992558541486106 | 0.981748119053357 | 0.994920608309546 | 0.965328411612348 |
| AUC | 0.961298412282614 | 0.992115543668651 | 0.991500075918026 | 0.999413070684321 |
| ROC | TRUE | TRUE | TRUE | TRUE |
| Threshold | 0.499 | 0.493 | 0.22 | 0.214 |
| Sensitivity=Recall=Power | 0.827417127071823 | 0.839779005524862 | 0.760773480662983 | 0.993301104972376 |
| Specificity=1-FPR | 0.993760356476388 | 0.982781297855905 | 0.996624611687106 | 0.965124840616436 |
| FDR | 0.50889490080341 | 0.738043946574752 | 0.378750281976088 | 0.828311886742904 |
| Precision=PPV | 0.49110509919659 | 0.261956053425248 | 0.621249718023912 | 0.171688113257096 |

| Metric | FHO Random Forest | FHO SVM Linear | FHO SVM Radial | FHO SVM Polynomial |
|---|---|---|---|---|
| Accuracy | 0.993437206394445 | 0.952605483447819 | 0.990439966130736 | 0.990268324604064 |
| AUC | 0.980334789679392 | 0.999092429259135 | 0.986298521512439 | 0.837972518519624 |
| ROC | TRUE | TRUE | TRUE | TRUE |
| Threshold | 0.495 | 0.233 | 0.335 | 0.39 |
| Sensitivity=Recall=Power | 0.793301104972376 | 0.992886740331492 | 0.784461325966851 | 0.509185082872928 |
| Specificity=1-FPR | 0.994893694869118 | 0.952312337004076 | 0.991938973622617 | 0.993769403079966 |
| FDR | 0.469349101492124 | 0.86841599472822 | 0.585407693992262 | 0.627061203844208 |
| Precision=PPV | 0.530650898507876 | 0.13158400527178 | 0.414592306007738 | 0.372938796155792 |

Conclusions

When selecting an optimal model, using all of the data within HaitiPixels for model generation lets each model’s performance be measured fairly accurately. Across all of the algorithms, the time it takes to train each model is worth noting and should be taken into account when deciding which approach to take. Using 10-fold cross-validation within the caret package, we get the following timings:

| Model | Elapsed Time (seconds) |
|---|---|
| KNN | 191.085 |
| LDA | 3.920 |
| QDA | 3.758 |
| Logistic Regression | 7.579 |
| Random Forest | 323.020 |
| SVM with Linear Kernel | 281.842 |
| SVM with Radial Kernel | 4868.989 |
| SVM with Polynomial Kernel | 11306.910 |
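As a hedged sketch of one way such timings can be pulled from the fitted models (assuming the classifier.* objects above are caret::train objects, whose $times$everything element stores the system.time of the full call):

# Elapsed training time (seconds) recorded by caret for a few of the fitted models.
c(KNN = classifier.knn$times$everything[['elapsed']],
  LDA = classifier.lda$times$everything[['elapsed']],
  RandomForest = classifier.rf$times$everything[['elapsed']])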

We can see that some models have much longer elapsed run times, but this is a necessary cost since we want to avoid holding data back from the model training set; using the full set strengthens each model. We used \(F_1\) scores at different thresholds because \(F_1\) is well suited to measuring performance on an unbalanced classification data set, and it drives the thresholds behind the metrics reported in this analysis.

The metrics we calculated for this application were precision, the proportion of predicted positives that are truly positive; recall (sensitivity), the proportion of actual positives that are correctly identified; accuracy, the number of correct predictions over the total number of predictions; false discovery rate, the rate of Type I errors among predicted positives; and specificity, the proportion of negatives that are correctly identified. Since the observations are a matter of providing support and supplies to those in need, a greater emphasis is placed on higher sensitivity and a lower false discovery rate: we want all blue tarps to be correctly identified, and we want to avoid flagging non-blue-tarp pixels as blue tarps.

name.ranks <- c("KNN","LDA","QDA", "Logistic Regression", "Random Forest", "SVM with Linear Kernel","SVM with Radial Kernel","SVM with Polynomial Kernel" )

accuracy.ranks <- c(knn.Accuracy,lda.Accuracy,qda.Accuracy,glm.Accuracy,rf.Accuracy,svm.linear.Accuracy,svm.radial.Accuracy,svm.poly.Accuracy)

auc.ranks <- c(knn.AUC,lda.AUC,qda.AUC,glm.AUC,rf.AUC,svm.linear.AUC,svm.radial.AUC,svm.poly.AUC)

sensitivity.ranks <- c(knn.Sensitivity,lda.Sensitivity,qda.Sensitivity,glm.Sensitivity,rf.Sensitivity,svm.linear.Sensitivity,svm.radial.Sensitivity,svm.poly.Sensitivity)

specificity.ranks <- c(knn.Specificity,lda.Specificity,qda.Specificity,glm.Specificity,rf.Specificity,svm.linear.Specificity,svm.radial.Specificity,svm.poly.Specificity)

fdr.ranks <- c(knn.FDR, lda.FDR,qda.FDR,glm.FDR,rf.FDR,svm.linear.FDR,svm.radial.FDR,svm.poly.FDR)

precision.ranks <- c(knn.Precision, lda.Precision,qda.Precision,glm.Precision,rf.Precision,svm.linear.Precision,svm.radial.Precision,svm.poly.Precision)

i1 <- sort(accuracy.ranks, index.return=TRUE, decreasing = TRUE)$ix
i2 <- sort(auc.ranks, index.return=TRUE, decreasing = TRUE)$ix
i3 <- sort(sensitivity.ranks, index.return=TRUE, decreasing = TRUE)$ix
i4 <- sort(specificity.ranks, index.return=TRUE, decreasing = TRUE)$ix
i5 <- sort(fdr.ranks, index.return=TRUE, decreasing = FALSE)$ix
i6 <- sort(precision.ranks, index.return=TRUE, decreasing = TRUE)$ix

myString = paste0("If we look at the statistics provided and ordering the models by accuracy, the best model is the ",name.ranks[i1][1], " with an accuracy of ", format(round(accuracy.ranks[i1][1], 6), nsmall = 6)," followed by ", name.ranks[i1][2], " with an accuracy of ",format(round(accuracy.ranks[i1][2], 6), nsmall = 6), " and ", name.ranks[i1][3], " with an accuracy of ",format(round(accuracy.ranks[i1][3], 6), nsmall = 6),". ","AUC, or Area Under Curve, is a measure of performance across all possible classification thresholds. Lowering the classification threshold classifies more items as positive. As such, you’ll want a model with a high AUC value. Ordering the models by AUC, the best model is the ", name.ranks[i2][1], " with an AUC of ", format(round(auc.ranks[i2][1], 6), nsmall = 6), " followed by ", name.ranks[i2][2], " with an AUC of ", format(round(auc.ranks[i2][2], 6), nsmall = 6), " and ", name.ranks[i2][3], " with an AUC of ", format(round(auc.ranks[i2][3], 6), nsmall = 6), ". ","Ordering the models by sensitivity, the best model is the ", name.ranks[i3][1], " with a sensitivity of ", format(round(sensitivity.ranks[i3][1], 6), nsmall = 6), " followed by ", name.ranks[i3][2], " with a sensitivity of ", format(round(sensitivity.ranks[i3][2], 6), nsmall = 6), " and ", name.ranks[i3][3], " with a sensitivity of ", format(round(sensitivity.ranks[i3][3], 6), nsmall = 6), ". ","Ordering the models by specificity, the best model is the ", name.ranks[i4][1], " with a specificity of ", format(round(specificity.ranks[i4][1], 6), nsmall = 6), " followed by ", name.ranks[i4][2], " with a specificity of ", format(round(specificity.ranks[i4][2], 6), nsmall = 6), " and ", name.ranks[i4][3], " with a specificity of ", format(round(specificity.ranks[i4][3], 6), nsmall = 6), ". ","The False Discovery Rate (FDR) is the expected proportion of type I errors ( you get a false positive ). The optimal model is one with a minimum value for FDR. The best model is the ", name.ranks[i5][1], " with a FDR of ", format(round(fdr.ranks[i5][1], 6), nsmall = 6), " followed by ", name.ranks[i5][2], " with a FDR of ", format(round(fdr.ranks[i5][2], 6), nsmall = 6), " and ", name.ranks[i5][3], " with a FDR of ", format(round(fdr.ranks[i5][3], 6), nsmall = 6), ". ","Lastly, you'll want a higher value for precision. Ordering the models by precision, the best model is the ", name.ranks[i6][1], " with a precision of ", format(round(precision.ranks[i6][1], 6), nsmall = 6), " followed by ", name.ranks[i6][2], " with a precision of ", format(round(precision.ranks[i6][2], 6), nsmall = 6), " and ", name.ranks[i6][3], " with a precision of ", format(round(precision.ranks[i6][3], 6), nsmall = 6), ".")

If we look at the statistics provided and ordering the models by accuracy, the best model is the Random Forest with an accuracy of 0.999130 followed by KNN with an accuracy of 0.997913 and SVM with Radial Kernel with an accuracy of 0.997802. AUC, or Area Under Curve, is a measure of performance across all possible classification thresholds. Lowering the classification threshold classifies more items as positive. As such, you’ll want a model with a high AUC value. Ordering the models by AUC, the best model is the KNN with an AUC of 0.999862 followed by SVM with Radial Kernel with an AUC of 0.999820 and SVM with Polynomial Kernel with an AUC of 0.999714. Ordering the models by sensitivity, the best model is the Random Forest with a sensitivity of 0.983680 followed by SVM with Radial Kernel with a sensitivity of 0.977250 and KNN with a sensitivity of 0.969337. Ordering the models by specificity, the best model is the Random Forest with a specificity of 0.999641 followed by QDA with a specificity of 0.999004 and KNN with a specificity of 0.998857. The False Discovery Rate (FDR) is the expected proportion of type I errors ( you get a false positive ). The optimal model is one with a minimum value for FDR. The best model is the Random Forest with a FDR of 0.010940 followed by QDA with a FDR of 0.033609 and KNN with a FDR of 0.034483. Lastly, you’ll want a higher value for precision. Ordering the models by precision, the best model is the Random Forest with a precision of 0.989060 followed by QDA with a precision of 0.966391 and KNN with a precision of 0.965517.

name.ranks <- c("KNN","LDA","QDA", "Logistic Regression", "Random Forest", "SVM with Linear Kernel","SVM with Radial Kernel","SVM with Polynomial Kernel" )

accuracy.ranks <- c(
  fho.knn.accuracy,
  fho.lda.accuracy,
  fho.qda.accuracy,
  fho.glm.accuracy,
  fho.rf.accuracy,
  fho.svm.linear.accuracy,
  fho.svm.radial.accuracy,
  fho.svm.poly.accuracy)

auc.ranks <- c(fho.knn.AUC,
               fho.lda.AUC,
               fho.qda.AUC,
               fho.glm.AUC,
               fho.rf.AUC,
               fho.svm.linear.AUC,
               fho.svm.radial.AUC,
               fho.svm.poly.AUC)

sensitivity.ranks <- c(fho.knn.sensitivity,
                       fho.lda.sensitivity,
                       fho.qda.sensitivity,
                       fho.glm.sensitivity,
                       fho.rf.sensitivity,
                       fho.svm.linear.sensitivity,
                       fho.svm.radial.sensitivity,
                       fho.svm.poly.sensitivity)

specificity.ranks <- c(fho.knn.specificity,
                       fho.lda.specificity,
                       fho.qda.specificity,
                       fho.glm.specificity,
                       fho.rf.specificity,
                       fho.svm.linear.specificity,
                       fho.svm.radial.specificity,
                       fho.svm.poly.specificity)

fdr.ranks <- c(fho.knn.FDR,
               fho.lda.FDR,
               fho.qda.FDR,
               fho.glm.FDR,
               fho.rf.FDR,
               fho.svm.linear.FDR,
               fho.svm.radial.FDR,
               fho.svm.poly.FDR)

precision.ranks <- c(fho.knn.precision,
                     fho.lda.precision,
                     fho.qda.precision,
                     fho.glm.precision,
                     fho.rf.precision,
                     fho.svm.linear.precision,
                     fho.svm.radial.precision,
                     fho.svm.poly.precision)

i1 <- sort(accuracy.ranks, index.return=TRUE, decreasing = TRUE)$ix
i2 <- sort(auc.ranks, index.return=TRUE, decreasing = TRUE)$ix
i3 <- sort(sensitivity.ranks, index.return=TRUE, decreasing = TRUE)$ix
i4 <- sort(specificity.ranks, index.return=TRUE, decreasing = TRUE)$ix
i5 <- sort(fdr.ranks, index.return=TRUE, decreasing = FALSE)$ix
i6 <- sort(precision.ranks, index.return=TRUE, decreasing = TRUE)$ix

myString2 = paste0("After running our models against the FHO provided and ordering the models that performed well on the hold-out data according to accuracy was ",name.ranks[i1][1], " with an accuracy of ", format(round(accuracy.ranks[i1][1], 6), nsmall = 6)," followed by ", name.ranks[i1][2], " with an accuracy of ",format(round(accuracy.ranks[i1][2], 6), nsmall = 6), " and ", name.ranks[i1][3], " with an accuracy of ",format(round(accuracy.ranks[i1][3], 6), nsmall = 6),". ","Ordering according to AUC, the best model is the ", name.ranks[i2][1], " with an AUC of ", format(round(auc.ranks[i2][1], 6), nsmall = 6), " followed by ", name.ranks[i2][2], " with an AUC of ", format(round(auc.ranks[i2][2], 6), nsmall = 6), " and ", name.ranks[i2][3], " with an AUC of ", format(round(auc.ranks[i2][3], 6), nsmall = 6), ". ","Ordering the results by sensitivity, the best model is the ", name.ranks[i3][1], " with a sensitivity of ", format(round(sensitivity.ranks[i3][1], 6), nsmall = 6), " followed by ", name.ranks[i3][2], " with a sensitivity of ", format(round(sensitivity.ranks[i3][2], 6), nsmall = 6), " and ", name.ranks[i3][3], " with a sensitivity of ", format(round(sensitivity.ranks[i3][3], 6), nsmall = 6), ". ","Ordering the models again by specificity, the best model is the ", name.ranks[i4][1], " with a specificity of ", format(round(specificity.ranks[i4][1], 6), nsmall = 6), " followed by ", name.ranks[i4][2], " with a specificity of ", format(round(specificity.ranks[i4][2], 6), nsmall = 6), " and ", name.ranks[i4][3], " with a specificity of ", format(round(specificity.ranks[i4][3], 6), nsmall = 6), ". ","The algorithms according to the FDR performance on the hold-out data was ", name.ranks[i5][1], " with a FDR of ", format(round(fdr.ranks[i5][1], 6), nsmall = 6), " followed by ", name.ranks[i5][2], " with a FDR of ", format(round(fdr.ranks[i5][2], 6), nsmall = 6), " and ", name.ranks[i5][3], " with a FDR of ", format(round(fdr.ranks[i5][3], 6), nsmall = 6), ". ","Lastly against the precision, the best model is the ", name.ranks[i6][1], " with a precision of ", format(round(precision.ranks[i6][1], 6), nsmall = 6), " followed by ", name.ranks[i6][2], " with a precision of ", format(round(precision.ranks[i6][2], 6), nsmall = 6), " and ", name.ranks[i6][3], " with a precision of ", format(round(precision.ranks[i6][3], 6), nsmall = 6), ".")

After running our models against the FHO provided and ordering the models that performed well on the hold-out data according to accuracy was QDA with an accuracy of 0.994921 followed by Random Forest with an accuracy of 0.993437 and KNN with an accuracy of 0.992559. Ordering according to AUC, the best model is the Logistic Regression with an AUC of 0.999413 followed by SVM with Linear Kernel with an AUC of 0.999092 and LDA with an AUC of 0.992116. Ordering the results by sensitivity, the best model is the Logistic Regression with a sensitivity of 0.993301 followed by SVM with Linear Kernel with a sensitivity of 0.992887 and LDA with a sensitivity of 0.839779. Ordering the models again by specificity, the best model is the QDA with a specificity of 0.996625 followed by Random Forest with a specificity of 0.994894 and SVM with Polynomial Kernel with a specificity of 0.993769. The algorithms according to the FDR performance on the hold-out data was QDA with a FDR of 0.378750 followed by Random Forest with a FDR of 0.469349 and KNN with a FDR of 0.508895. Lastly against the precision, the best model is the QDA with a precision of 0.621250 followed by Random Forest with a precision of 0.530651 and KNN with a precision of 0.491105.

When deciding which model to select, accuracy can play a part in choosing an algorithm. If we rank each algorithm by accuracy:

| Rank | TRAIN | FHO TEST |
|---|---|---|
| 1 | Random Forest | QDA |
| 2 | KNN | Random Forest |
| 3 | SVM with Radial Kernel | KNN |
| 4 | SVM with Polynomial Kernel | SVM with Radial Kernel |
| 5 | SVM with Linear Kernel | SVM with Polynomial Kernel |
| 6 | Logistic Regression | LDA |
| 7 | QDA | Logistic Regression |
| 8 | LDA | SVM with Linear Kernel |
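A hedged sketch of how such a side-by-side ranking could be assembled from name.ranks and the ordering indices; train.i1 and fho.i1 are hypothetical names here, since the code above reuses i1 for both the training and the FHO ordering and the two would need to be kept separately:

# Side-by-side accuracy ranking: training order vs hold-out (FHO) order.
rank_table <- tibble::tibble(
  Rank       = 1:8,
  TRAIN      = name.ranks[train.i1],  # models sorted by training accuracy
  `FHO TEST` = name.ranks[fho.i1]     # models sorted by hold-out accuracy
)
knitr::kable(rank_table)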

We can see that the algorithms based on non-linear decision boundaries rank near the top, especially the Random Forest model, K-Nearest Neighbors and the Support Vector Machine with a Radial Kernel, while the algorithms based on linear decision boundaries (Linear Discriminant Analysis, the Support Vector Machine with a Linear Kernel) ranked much lower on accuracy.

However, accuracy is not the only metric for deciding which model to use. We can also rank the models by precision:

| Rank | TRAIN | FHO TEST |
|---|---|---|
| 1 | Random Forest | QDA |
| 2 | QDA | Random Forest |
| 3 | KNN | KNN |
| 4 | SVM with Polynomial Kernel | SVM with Radial Kernel |
| 5 | SVM with Radial Kernel | SVM with Polynomial Kernel |
| 6 | SVM with Linear Kernel | LDA |
| 7 | Logistic Regression | Logistic Regression |
| 8 | LDA | SVM with Linear Kernel |

and by recall:

| Rank | TRAIN | FHO TEST |
|---|---|---|
| 1 | Random Forest | Logistic Regression |
| 2 | SVM with Radial Kernel | SVM with Linear Kernel |
| 3 | KNN | LDA |
| 4 | SVM with Polynomial Kernel | KNN |
| 5 | Logistic Regression | Random Forest |
| 6 | SVM with Linear Kernel | SVM with Radial Kernel |
| 7 | QDA | QDA |
| 8 | LDA | SVM with Polynomial Kernel |

It’s important to note that these rankings are built from point estimates produced by the 10-fold cross-validation training. With that in mind, I would recommend using the Random Forest model for detection of blue tarps. The mean and standard deviation of the 10-fold cross-validation point estimates for this model (Sensitivity, Specificity and Precision) are:

  • Sensitivity Mean: 0.948570940837926

  • Sensitivity Standard Deviation: 0.0158306673347466

  • Specificity Mean: 0.998660555098661

  • Specificity Standard Deviation: 0.000351159803790381

  • Precision Mean: 0.95907902771856

  • Precision Standard Deviation: 0.0102829571574502

with the KNN model as a close second. The mean and standard deviation of the 10-fold cross-validation point estimates for this model (Sensitivity, Specificity and Precision) are listed below, followed by a sketch of how such per-fold summaries can be computed:

  • Sensitivity Mean: 0.959457152611813

  • Sensitivity Standard Deviation: 0.0137156602761885

  • Specificity Mean: 0.998562542574257

  • Specificity Standard Deviation: 0.000467102491419399

  • Precision Mean: 0.9568424151997

  • Precision Standard Deviation: 0.0131157372711998
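A minimal sketch of how such per-fold summaries can be computed from the saved resampling predictions (assuming the classifiers were trained with savePredictions enabled, so classifier.rf$pred holds the out-of-fold predictions, and using a 0.5 probability cutoff rather than the tuned threshold):

# Per-fold sensitivity, specificity and precision for the Random Forest model,
# then their mean and standard deviation across the 10 folds.
fold_stats <- classifier.rf$pred %>%
  dplyr::mutate(call = factor(ifelse(Up > 0.5, 'Up', 'Down'), levels = c('Down', 'Up'))) %>%
  dplyr::group_by(Resample) %>%
  dplyr::summarise(sensitivity = mean(call[obs == 'Up'] == 'Up'),
                   specificity = mean(call[obs == 'Down'] == 'Down'),
                   precision   = mean(obs[call == 'Up'] == 'Up'))

fold_stats %>%
  dplyr::summarise(dplyr::across(sensitivity:precision, list(mean = mean, sd = sd)))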

The models do not behave the same when measuring predictions against the training data as they do against the final hold-out test set, and the difference can be attributed to many factors. This also partly explains why the training-data and test-data ranks do not match exactly, even though the linear and non-linear models tend to fall into the same general areas. It’s important to keep in mind that the data form an unbalanced classification set: the predictor variables are the additive Red, Green and Blue channels of the RGB color spectrum, and the response is an unbalanced binary classification (an unequal distribution of classes in both the training data set and the FHO). With so few positives relative to negatives, there is a greater chance that negatives are classified as false positives, which depresses precision. Although the models tested on the FHO data have low precision overall compared to the training data, the sensitivity and specificity tend to remain relatively high and consistent with the training data. In addition, based on the False Discovery Rate:

| Rank | TRAIN | FHO TEST |
|---|---|---|
| 1 | Random Forest | QDA |
| 2 | QDA | Random Forest |
| 3 | KNN | KNN |
| 4 | SVM with Polynomial Kernel | SVM with Radial Kernel |
| 5 | SVM with Radial Kernel | SVM with Polynomial Kernel |
| 6 | SVM with Linear Kernel | LDA |
| 7 | Logistic Regression | Logistic Regression |
| 8 | LDA | SVM with Linear Kernel |

Random Forest and KNN both rank near the top, indicating that these models have lower chances of getting a false positive.

In an extension of the Part 1 discussion of software aspects, we expected the SVMs to take the longest to run, which is shown in the elapsed-time table above. The time required for 10-fold cross-validation over a grid of tuning parameters far exceeds that of LDA, QDA, logistic regression, etc. To avoid overfitting in the Random Forest model, ntree was set to 500 and was not iterated across other values, since Random Forest already appeared to be one of the top models. The trade-off of long run times for small increases in accuracy forces the analysis to lean toward saving time where possible. We iterated across reasonable values of cost, sigma, degree, etc. for the various models, and we do not expect the results to change position. Quadratic- and polynomial-based algorithms exceeded the linear-based algorithms, with Random Forest and K-Nearest Neighbors standing above the rest; however, it’s worth pointing out that the uncertainties implied by the mean and standard deviation of the various metrics can have the models flip-flop in ranking.