# ggpairs
library(ggplot2)
library(GGally)
#> Registered S3 method overwritten by 'GGally':
#>   method from   
#>   +.gg   ggplot2
library(hrbrthemes)
#> NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
#>       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
#>       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
# %>%
library(tidyverse)
#> ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
#> ✓ tibble  3.0.3     ✓ dplyr   1.0.2
#> ✓ tidyr   1.1.2     ✓ stringr 1.4.0
#> ✓ readr   1.3.1     ✓ forcats 0.5.0
#> ✓ purrr   0.3.4
#> ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag()    masks stats::lag()

# kfold
library(caret)
#> Loading required package: lattice
#> 
#> Attaching package: 'caret'
#> The following object is masked from 'package:purrr':
#> 
#>     lift
kfoldcv = 10

# lda
library(MASS)
#> 
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#> 
#>     select

# prediction
library(ROCR)

#partimat
library(klaR)

library(knitr)
library(kableExtra)
#> 
#> Attaching package: 'kableExtra'
#> The following object is masked from 'package:dplyr':
#> 
#>     group_rows
get_metric <- function(caret_CM_object, metric){
  caret_CM_object %>% broom::tidy() %>% 
    dplyr::filter(term==metric) %>%
    dplyr::select(estimate) %>% pull()
}

get_metrics_func <- function(caret_CM_objects, metric, func){
  caret_CM_objects %>% 
    purrr::map( ~ get_metric(.x, metric)) %>%
    unlist() %>% func
}

# Out-of-fold confusion matrix for each resample at a given decision threshold
get_out_of_folds_CM <- function(classifier, THRESHOLD){
  classifier$pred %>%
    dplyr::mutate(pred2 = ifelse(Up > THRESHOLD, 'Up', 'Down')) %>%
    dplyr::mutate(pred2 = factor(pred2, levels = c('Down', 'Up'))) %>%
    dplyr::group_split(Resample) %>%
    purrr::map( ~ caret::confusionMatrix(data = .x$pred2, reference = .x$obs, 
                                         positive = 'Up'))
}

# Plot the out-of-fold confusion matrix for each resample as a tile grid
plot_folds_CM <- function(classifier, THRESHOLD){
  out_of_folds_CM <- classifier$pred %>% 
    dplyr::mutate(pred2 = ifelse(Up>THRESHOLD, 'Up', 'Down')) %>%
    dplyr::mutate(pred2 = factor(pred2, levels=c('Down','Up'))) %>%
    dplyr::group_split(Resample) %>%
    purrr::map(~ caret::confusionMatrix(data=.x$pred2, reference=.x$obs))
  
  out_of_folds_CM_vis <- out_of_folds_CM %>% 
    purrr::map(~ .x$table %>% broom::tidy() %>%
                   ggplot(aes(x=Reference, y=Prediction, fill=n, label=n)) +
                   geom_tile() + geom_text(aes(label=n), size=5, color='white'))
   
  gridExtra::grid.arrange(grobs=out_of_folds_CM_vis, ncol=2)
}

tuned_call <- function(preds, decision_rule_threshold){
  calls <- rep('Down', length(preds))
  calls[preds>decision_rule_threshold]='Up'
  factor(calls, levels=c('Down','Up'))
}

# Sweep candidate thresholds and return the one that maximizes the F1 score
get_optimal_threshold <- function(classifier){
  
  thresholds = seq(0.001, 0.5, .001) # candidate thresholds; stay away from the extreme tails
  F_1_scores <- rep(NA, length(thresholds))
  GMeans <- rep(NA, length(thresholds))
  
  preds <- predict(classifier, type='prob')[,2]
  y <- train$BlueTarp_rest
  for (i in 1:length(thresholds)){
    yhat <- tuned_call(preds, thresholds[i])
    # Confusion matrix at this threshold
    results <- tibble::tibble(y = y, yhat = yhat)
    cm <- results %>% yardstick::conf_mat(truth = y, estimate = yhat)
    
    trueNegative  = cm$table[1,1]
    falseNegative = cm$table[1,2]
    falsePositive = cm$table[2,1]
    truePositve   = cm$table[2,2]
    
    #paste0("True Positive: ", truePositve )
    #paste0("False Negative: ", falseNegative )
    #paste0("True Negative: ", trueNegative )
    #paste0("False Positive: ", falsePositive )
    conditionPositive = truePositve + falseNegative
    conditionNegative = trueNegative + falsePositive 
    predictedConditionPositive = truePositve +  falsePositive 
    predictedConditionNegative = trueNegative + falseNegative
    
    TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
    
    prevalence = conditionPositive / TotalPopulation
    accuracy = (truePositve + trueNegative) / TotalPopulation
    PPV = truePositve / predictedConditionPositive
    FDR = falsePositive / predictedConditionPositive
    FOR = falseNegative / predictedConditionNegative
    NPV = trueNegative / predictedConditionNegative
    TPR = truePositve / conditionPositive
    FPR = falsePositive / conditionNegative
    FNR = falseNegative / conditionPositive
    SPC = TNR = trueNegative / conditionNegative
    LR.PLUS = TPR / FPR
    LR.MINUS = FNR/TNR
    DOR = LR.PLUS / LR.MINUS
    
    F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
    F_1_scores[i] <- F_1_score
    GMeans[i] = sqrt(TPR*SPC)
  }
  
  plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
  opt_thresh = thresholds[tail(which(F_1_scores == max(F_1_scores, na.rm = TRUE)), n = 1)]
  return(opt_thresh)
}

stat_fho <- function(classifier, testdata, threshold){
  y <- testdata$BlueTarp_rest
  preds <- predict(classifier, newdata = testdata, type = "prob")[,2]
  yhat <- tuned_call(preds, threshold)
  
  # Confusion matrix on the hold-out data at the chosen threshold
  results <- tibble::tibble(y = y, yhat = yhat)
  cm <- results %>% yardstick::conf_mat(truth = y, estimate = yhat)
  
  trueNegative  = cm$table[1,1]
  falseNegative = cm$table[1,2]
  falsePositive = cm$table[2,1]
  truePositve   = cm$table[2,2]
  
  conditionPositive = truePositve + falseNegative
  conditionNegative = trueNegative + falsePositive 
  predictedConditionPositive = truePositve +  falsePositive 
  predictedConditionNegative = trueNegative + falseNegative
  
  TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
  
  prevalence = conditionPositive / TotalPopulation
  accuracy = (truePositve + trueNegative) / TotalPopulation
  PPV = truePositve / predictedConditionPositive
  FDR = falsePositive / predictedConditionPositive
  FOR = falseNegative / predictedConditionNegative
  NPV = trueNegative / predictedConditionNegative
  TPR = truePositve / conditionPositive
  FPR = falsePositive / conditionNegative
  FNR = falseNegative / conditionPositive
  SPC = TNR = trueNegative / conditionNegative
  LR.PLUS = TPR / FPR
  LR.MINUS = FNR/TNR
  DOR = LR.PLUS / LR.MINUS
  
  F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
  
  # accuracy, sensitivity, specificity, FDR, precision
  return(c(accuracy, TPR, SPC, FDR, PPV  ))
}

Introduction

In this project, we use classification methods covered in this course to solve a real historical data-mining problem: locating displaced persons living in makeshift shelters following the destruction caused by the 2010 earthquake in Haiti.

Following that earthquake, rescue workers, mostly from the United States military, needed to get food and water to the displaced persons. But with destroyed communications, impassable roads, and thousands of square miles to cover, actually locating the people who needed help was challenging. As part of the rescue effort, a team from the Rochester Institute of Technology was flying an aircraft to collect high-resolution geo-referenced imagery.

Given the structure of the data set (HaitiPixels, \(n=\) 63241), a natural decision is to use a one-vs-rest approach. Our data form a multi-class classification problem that we decompose into binary classifications:

  • Blue Tarp vs [Rooftop, Soil, Various Non-Tarp, Vegetation]

  • Rooftop vs [Blue Tarp, Soil, Various Non-Tarp, Vegetation]

  • Soil vs [Blue Tarp, Rooftop, Various Non-Tarp, Vegetation]

  • Various Non-Tarp vs [Blue Tarp, Rooftop, Soil, Vegetation]

  • Vegetation vs [Blue Tarp, Rooftop, Soil, Various Non-Tarp]

Since we are looking for temporary shelters made using blue tarps, we generated a variable named “BlueTarp_rest”, a binary response with two levels:

  • “Up” - if a blue tarp was visible (or up)

  • “Down” - if a blue tarp was not visible.

Orthovnir Hold-Out Test Data Set

It was known that the people whose homes had been destroyed by the earthquake were creating temporary shelters using blue tarps, and these blue tarps would be good indicators of where the displaced persons were – if only they could be located in time, out of the thousands of images that would be collected every day.

#orthovnir057_ROI_NON_Blue_Tarps.txt

ovnir057.NoBlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir057_ROI_NON_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")

colnames(ovnir057.NoBlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Down",length(ovnir057.NoBlueTarps$ID))
ovnir057.NoBlueTarps <- data.frame(ovnir057.NoBlueTarps,BlueTarp_rest )

#orthovnir067_ROI_Blue_Tarps.txt

ovnir067.BlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir067_ROI_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")

colnames(ovnir067.BlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Up",length(ovnir067.BlueTarps$ID))
ovnir067.BlueTarps <- data.frame(ovnir067.BlueTarps,BlueTarp_rest )

#orthovnir067_ROI_NOT_Blue_Tarps.txt

ovnir067.NoBlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir067_ROI_NOT_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")

colnames(ovnir067.NoBlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Down",length(ovnir067.NoBlueTarps$ID))
ovnir067.NoBlueTarps <- data.frame(ovnir067.NoBlueTarps,BlueTarp_rest )

#orthovnir069_ROI_Blue_Tarps.txt

ovnir069.BlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir069_ROI_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")

colnames(ovnir069.BlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Up",length(ovnir069.BlueTarps$ID))
ovnir069.BlueTarps <- data.frame(ovnir069.BlueTarps,BlueTarp_rest )

#orthovnir069_ROI_NOT_Blue_Tarps.txt
ovnir069.NoBlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir069_ROI_NOT_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")

colnames(ovnir069.NoBlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Down",length(ovnir069.NoBlueTarps$ID))
ovnir069.NoBlueTarps <- data.frame(ovnir069.NoBlueTarps,BlueTarp_rest )

#orthovnir078_ROI_Blue_Tarps.txt

ovnir078.BlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir078_ROI_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")

colnames(ovnir078.BlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Up",length(ovnir078.BlueTarps$ID))
ovnir078.BlueTarps <- data.frame(ovnir078.BlueTarps,BlueTarp_rest )

#orthovnir078_ROI_NON_Blue_Tarps.txt
ovnir078.NoBlueTarps <- read.delim("/Users/awells/UVA/SYS6018/Disaster_Relief_Project/hold_out_data/orthovnir078_ROI_NON_Blue_Tarps.txt",comment.char=";", header = FALSE, sep = "")

colnames(ovnir078.NoBlueTarps) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue")
BlueTarp_rest <- rep("Down",length(ovnir078.NoBlueTarps$ID))
ovnir078.NoBlueTarps <- data.frame(ovnir078.NoBlueTarps,BlueTarp_rest )

#summary(ovnir067.NoBlueTarps)

fho <- rbind(
  ovnir057.NoBlueTarps,
  ovnir067.BlueTarps,
  ovnir067.NoBlueTarps,
  ovnir069.BlueTarps,
  ovnir069.NoBlueTarps,
  ovnir078.BlueTarps,
  ovnir078.NoBlueTarps
  )
fho$BlueTarp_rest <- factor(fho$BlueTarp_rest)

#### Delete these 2 lines when ready for full set
#index.fho <- createDataPartition(fho$BlueTarp_rest, p = 0.1, list = FALSE) 
#data.fho <- fho[index.fho, ]
#fho <- data.fho
#### 

# Scale Data
#fho.scale <- scale(subset(fho, select= -c(BlueTarp_rest)), center = TRUE, scale = TRUE)
#df <- data.frame(fho.scale, fho$BlueTarp_rest)
#fho <- df
colnames(fho) <- c("ID", "X", "Y", "Map X", "Map Y", "Lat", "Lon", "Red", "Green", "Blue","BlueTarp_rest")
Sample Data: Makeshift Village

Our goal is to test a variety of algorithms from this course on the imagery data collected during the relief efforts made in Haiti in response to the 2010 earthquake. Combining these files gives a hold-out test data set of \(n=\) 2004177 observations.

K-Folds Out of Sampling Performance: K-Nearest Neighbors

The first classification method we use is K-nearest neighbors (KNN). Given a value for K and a prediction point \(x_0\), KNN first identifies the K training observations that are closest to \(x_0\), represented by \(N_0\), and classifies \(x_0\) to the most common class among these neighbors. The optimal value for K depends on the bias-variance trade-off. A small value of K provides the most flexible fit, which has low bias but high variance; the variance arises because the prediction in a given region depends on only a handful of observations. In contrast, larger values of K provide a smoother and less variable fit: the prediction in a region is an average over several points, so changing one observation has a smaller effect. We’ll use the caret package in R to apply the KNN classification procedure with a 10-fold cross-validation evaluation procedure.
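
The exact caret calls are not reproduced in this report; the sketch below shows the kind of fit we have in mind, reusing the kfoldcv setting from the setup chunk. The names fitControl and classifier.knn, the candidate grid of k values, and the use of the Red, Green and Blue bands as predictors of BlueTarp_rest in a data frame called train are assumptions for illustration.

# Shared resampling setup: 10-fold CV, class probabilities, out-of-fold predictions kept
fitControl <- caret::trainControl(method = "cv", number = kfoldcv,
                                  classProbs = TRUE, savePredictions = "final")
# KNN fit over an illustrative grid of k values
classifier.knn <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data = train,
                               method = "knn",
                               preProcess = c("center", "scale"),
                               tuneGrid = data.frame(k = seq(3, 15, by = 2)),
                               trControl = fitControl)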

Training across various values of K, we can see that our optimal K is 5. So we’ll proceed with the analysis using this value.

Confusion Matrix

Next we’ll look at the Confusion Matrix from the model. We’ll use the image below as a reference when analyzing the matrix.

Confusion Matrix Reference


Reviewing the confusion matrix, we have:

  • Sensitivity Mean: 0.959457152611813

  • Sensitivity Standard Deviation: 0.0137156602761885

  • Specificity Mean: 0.998562542574257

  • Specificity Standard Deviation: 0.000467102491419399

Receiver Operating Characteristic Curve

From the 10-fold cross-validation we obtain out-of-fold predicted probabilities that each pixel is a blue tarp. We can measure the sensitivity, also known as the true positive rate: the fraction of blue-tarp observations that are correctly identified at a given threshold value. The initial default threshold is 0.5. The false positive rate is 1 - specificity: the fraction of non-tarp observations that we classify incorrectly at that same threshold of 0.5. We can view the receiver operating characteristic (ROC) curve to visualize these two statistics. The ideal ROC curve hugs the top left corner, indicating a high true positive rate and a low false positive rate.

We can also look at each ROC curve from each fold from the KNN model. For the 10 different ROC curves, we can even get a sense of the variation in ROC curves.
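
As an illustration, the pooled out-of-fold ROC curve and AUC can be computed with the ROCR package loaded above. This is a hedged sketch using the object names from the KNN sketch, not the exact plotting code behind the figures.

# ROC from the pooled out-of-fold predictions saved by caret
rocr_pred <- ROCR::prediction(classifier.knn$pred$Up, classifier.knn$pred$obs,
                              label.ordering = c("Down", "Up"))
rocr_roc  <- ROCR::performance(rocr_pred, measure = "tpr", x.measure = "fpr")
plot(rocr_roc); abline(0, 1, lty = 2)              # ROC curve with chance line
ROCR::performance(rocr_pred, "auc")@y.values[[1]]  # area under the curve
# Per-fold curves: split classifier.knn$pred by Resample and repeat the steps above.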

Precision-Recall Curve

We can measure the precision and the recall with the Precision-Recall (P-R) curve.

Precision:

\[ \frac{true \ positives}{ true \ positives + false \ positives}\]

Recall:

\[ \frac{true \ positives}{ true \ positives + false \ negatives}\]

The ideal P-R curve hugs the top right corner, representing both high recall and high precision: the model is returning accurate results (high precision) while capturing a majority of all positive results (high recall).

We can also look at each P-R curve from each fold from the KNN model. For the 10 different P-R curves, we can even get a sense of the variation in P-R curves.
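
The same ROCR prediction object from the ROC sketch can be reused for a pooled P-R curve; again this is a sketch rather than the exact code behind the figures.

# Precision-recall curve from the pooled out-of-fold predictions
rocr_pr <- ROCR::performance(rocr_pred, measure = "prec", x.measure = "rec")
plot(rocr_pr)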

  • Recall Mean: 0.959457152611813

  • Recall Standard Deviation: 0.0137156602761885

  • Precision Mean: 0.9568424151997

  • Precision Standard Deviation: 0.0131157372711998

To find the optimal threshold, we’ll look at the \(F_1\) score.

The \(F_1\) score is a more useful measure than accuracy for problems with uneven class distributions, since it takes both false positives and false negatives into account. It is defined as:

\[ F_1 = 2 * \frac{\frac{\Sigma \ True \ Positive}{\Sigma \ Predicted \ Condition \ Positive}*\frac{\Sigma True \ Positive}{\Sigma \ Condition \ Positive}}{\frac{\Sigma \ True \ Positive}{\Sigma \ Predicted \ Condition \ Positive}+\frac{\Sigma \ True \ Positive}{\Sigma \ Condition \ Positive}}\]

The \(F_1\) score ranges between 0 and 1, with 1 meaning perfect precision and recall.
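
The chunk below relies on preds, y, and opt_thresh.knn, whose construction is not shown at this point; a hedged sketch of how they would be obtained with the helper functions defined at the top of this report (object names as in the earlier sketches):

# Predicted probability of 'Up' for the training pixels and the observed labels
preds <- predict(classifier.knn, type = "prob")[, 2]
y <- train$BlueTarp_rest
# Threshold on the candidate grid that maximizes the F1 score
opt_thresh.knn <- get_optimal_threshold(classifier.knn)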

yhat <- tuned_call(preds, opt_thresh.knn)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.001143436
sensitivity
#> [1] 0.9693373
results <- tibble::tibble(y=y, yhat=yhat)

caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Down    Up
#>       Down 61149    62
#>       Up      70  1960
#>                                           
#>                Accuracy : 0.9979          
#>                  95% CI : (0.9975, 0.9983)
#>     No Information Rate : 0.968           
#>     P-Value [Acc > NIR] : <2e-16          
#>                                           
#>                   Kappa : 0.9663          
#>                                           
#>  Mcnemar's Test P-Value : 0.5423          
#>                                           
#>             Sensitivity : 0.96934         
#>             Specificity : 0.99886         
#>          Pos Pred Value : 0.96552         
#>          Neg Pred Value : 0.99899         
#>              Prevalence : 0.03197         
#>          Detection Rate : 0.03099         
#>    Detection Prevalence : 0.03210         
#>       Balanced Accuracy : 0.98410         
#>                                           
#>        'Positive' Class : Up              
#> 
cm <- results %>% yardstick::conf_mat(truth = y, estimate = yhat)

trueNegative  = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve   = cm$table[2,2]

conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive 
predictedConditionPositive = truePositve +  falsePositive 
predictedConditionNegative = trueNegative + falseNegative

TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative

prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS

F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))

Now using the optimal threshold we get:

  • True Negative = 61149

  • False Negative = 62

  • False Positive = 70

  • True Positive = 1960

  • Condition Positive = 2022

  • Condition Negative = 61219

  • Predicted Condition Positive = 2030

  • Predicted Condition Negative = 61211

  • Total Population = 63241

  • Prevalence = 0.0319729289543176

  • Accuracy = 0.99791274647776

  • Positive predictive value, precision = 0.96551724137931

  • False Discovery rate = 0.0344827586206897

  • False Omission Rate = 0.00101288984006143

  • Negative Predictive Value = 0.998987110159939

  • True Positive Rate, Recall, Sensitivity = 0.969337289812067

  • False Positive Rate = 0.00114343586141557

  • False Negative Rate = 0.0306627101879327

  • Specificity, Selectivity, True Negative rate = 0.998856564138584

  • Positive likelihood ratio = 847.740850642928

  • Negative likelihood ratio = 0.030697811166087

  • Diagnostic Odds Ratio = 27615.6774193548

  • \(F_1\) Score = 0.967423494570582

From our plot we selected our optimal threshold as 0.499.

Summary Stats K Nearest Neighbor

KNN (k =5)
Accuracy 0.99791274647776
AUC 0.999862454860983
ROC TRUE
Threshold 0.499
Sensitivity=Recall=Power 0.969337289812067
Specificity=1-FPR 0.998856564138584
FDR 0.0344827586206897
Precision=PPV 0.96551724137931

K-Folds Out of Sampling Performance: Linear Discriminant Analysis

The next classification method we use is linear discriminant analysis (LDA). LDA assumes that the observations within each class are drawn from a multivariate Gaussian distribution with a class-specific mean vector and a covariance matrix that is common to all classes. Again, we’ll use the caret package in R to apply the LDA classification procedure with a 10-fold cross-validation evaluation procedure.
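
A minimal sketch of the LDA fit, reusing the fitControl object and assumed names from the KNN sketch:

classifier.lda <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data = train,
                               method = "lda",
                               preProcess = c("center", "scale"),
                               trControl = fitControl)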

Receiver Operating Characteristic Curve

Next we’ll look at the ROC curve.

We can also look at each ROC curve from each fold from the LDA model. For the 10 different ROC curves, we can even get a sense of the variation in ROC curves.

Precision-Recall Curve

Next, we’ll look at the P-R curve.

We can also look at each P-R curve from each fold from the LDA model. For the 10 different P-R curves, we can even get a sense of the variation in P-R curves.

  • Recall Mean: 0.800692581573428

  • Recall Standard Deviation: 0.0275020332680672

  • Precision Mean: 0.7257850068744

  • Precision Standard Deviation: 0.0259033842126505

To find the optimal threshold, we’ll look at the \(F_1\) score.

yhat <- tuned_call(preds, opt_thresh.lda)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.009996896
sensitivity
#> [1] 0.8021761
results <- tibble::tibble(y=y, yhat=yhat)

caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Down    Up
#>       Down 60607   400
#>       Up     612  1622
#>                                         
#>                Accuracy : 0.984         
#>                  95% CI : (0.983, 0.985)
#>     No Information Rate : 0.968         
#>     P-Value [Acc > NIR] : < 2.2e-16     
#>                                         
#>                   Kappa : 0.754         
#>                                         
#>  Mcnemar's Test P-Value : 3.295e-11     
#>                                         
#>             Sensitivity : 0.80218       
#>             Specificity : 0.99000       
#>          Pos Pred Value : 0.72605       
#>          Neg Pred Value : 0.99344       
#>              Prevalence : 0.03197       
#>          Detection Rate : 0.02565       
#>    Detection Prevalence : 0.03533       
#>       Balanced Accuracy : 0.89609       
#>                                         
#>        'Positive' Class : Up            
#> 
cm <- results %>% yardstick::conf_mat(truth = y, estimate = yhat)

trueNegative  = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve   = cm$table[2,2]

conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive 
predictedConditionPositive = truePositve +  falsePositive 
predictedConditionNegative = trueNegative + falseNegative

TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative

prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS

F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))

Now using the optimal threshold we get:

  • True Negative = 60607

  • False Negative = 400

  • False Positive = 612

  • True Positive = 1622

  • Condition Positive = 2022

  • Condition Negative = 61219

  • Predicted Condition Positive = 2234

  • Predicted Condition Negative = 61007

  • Total Population = 63241

  • Prevalence = 0.0319729289543176

  • Accuracy = 0.983997722996158

  • Positive predictive value, precision = 0.726051924798568

  • False Discovery rate = 0.273948075201432

  • False Omission Rate = 0.00655662464963037

  • Negative Predictive Value = 0.99344337535037

  • True Positive Rate, Recall, Sensitivity = 0.80217606330366

  • False Positive Rate = 0.00999689638837616

  • False Negative Rate = 0.19782393669634

  • Specificity, Selectivity, True Negative rate = 0.990003103611624

  • Positive likelihood ratio = 80.242510489194

  • Negative likelihood ratio = 0.199821531846375

  • Diagnostic Odds Ratio = 401.570890522876

  • \(F_1\) Score = 0.762218045112782

From our plot we selected our optimal threshold as 0.495.

Summary Stats LDA

LDA
Accuracy 0.983997722996158
AUC 0.988876770009065
ROC TRUE
Threshold 0.495
Sensitivity=Recall=Power 0.80217606330366
Specificity=1-FPR 0.990003103611624
FDR 0.273948075201432
Precision=PPV 0.726051924798568

K-Folds Out of Sampling Performance: Quadratic Discriminant Analysis

The next classification method we use is quadratic discriminant analysis (QDA). Like LDA, the QDA classifier results from assuming that the observations from each class are drawn from a Gaussian distribution and plugging estimates for the parameters into Bayes’ theorem in order to perform prediction; unlike LDA, QDA allows each class to have its own covariance matrix. Again, we’ll use the caret package in R to apply the QDA classification procedure with a 10-fold cross-validation evaluation procedure.
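
A minimal sketch of the QDA fit, under the same assumptions as the earlier sketches:

classifier.qda <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data = train,
                               method = "qda",
                               preProcess = c("center", "scale"),
                               trControl = fitControl)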

Receiver Operating Characteristic Curve

Next we’ll look at the ROC curve.

We can also look at each ROC curve from each fold from the qda model.

Precision-Recall Curve

Next, we’ll look at the P-R curve.

We can also look at each P-R curve from each fold from the QDA model. For the 10 different P-R curves, we can even get a sense of the variation in P-R curves.

  • Recall Mean: 0.838777252109447

  • Recall Standard Deviation: 0.0190825467050525

  • Precision Mean: 0.989610677170821

  • Precision Standard Deviation: 0.00701203785618355

To find the optimal threshold, we’ll look at the \(F_1\) score.

yhat <- tuned_call(preds, opt_thresh.qda)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.0009964227
sensitivity
#> [1] 0.867458
results <- tibble::tibble(y=y, yhat=yhat)

caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Down    Up
#>       Down 61158   268
#>       Up      61  1754
#>                                           
#>                Accuracy : 0.9948          
#>                  95% CI : (0.9942, 0.9953)
#>     No Information Rate : 0.968           
#>     P-Value [Acc > NIR] : < 2.2e-16       
#>                                           
#>                   Kappa : 0.9116          
#>                                           
#>  Mcnemar's Test P-Value : < 2.2e-16       
#>                                           
#>             Sensitivity : 0.86746         
#>             Specificity : 0.99900         
#>          Pos Pred Value : 0.96639         
#>          Neg Pred Value : 0.99564         
#>              Prevalence : 0.03197         
#>          Detection Rate : 0.02774         
#>    Detection Prevalence : 0.02870         
#>       Balanced Accuracy : 0.93323         
#>                                           
#>        'Positive' Class : Up              
#> 
cm <- results %>% yardstick::conf_mat(truth = y, estimate = yhat)

trueNegative  = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve   = cm$table[2,2]

conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive 
predictedConditionPositive = truePositve +  falsePositive 
predictedConditionNegative = trueNegative + falseNegative

TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative

prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS

F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))

Now using the optimal threshold we get:

  • True Negative = 61158

  • False Negative = 268

  • False Positive = 61

  • True Positive = 1754

  • Condition Positive = 2022

  • Condition Negative = 61219

  • Predicted Condition Positive = 1815

  • Predicted Condition Negative = 61426

  • Total Population = 63241

  • Prevalence = 0.0319729289543176

  • Accuracy = 0.994797678721083

  • Positive predictive value, precision = 0.966391184573003

  • False Discovery rate = 0.0336088154269972

  • False Omission Rate = 0.00436297333376746

  • Negative Predictive Value = 0.995637026666233

  • True Positive Rate, Recall, Sensitivity = 0.867457962413452

  • False Positive Rate = 0.000996422679233571

  • False Negative Rate = 0.132542037586548

  • Specificity, Selectivity, True Negative rate = 0.999003577320766

  • Positive likelihood ratio = 870.57227870474

  • Negative likelihood ratio = 0.13267423720545

  • Diagnostic Odds Ratio = 6561.72816246636

  • \(F_1\) Score = 0.914255929111285

From our plot we selected our optimal threshold as 0.22.

Summary Stats qda

QDA
Accuracy 0.994797678721083
AUC 0.998217543931761
ROC TRUE
Threshold 0.22
Sensitivity=Recall=Power 0.867457962413452
Specificity=1-FPR 0.999003577320766
FDR 0.0336088154269972
Precision=PPV 0.966391184573003

K-Folds Out of Sampling Performance: Logistic Regression

The next classification method we use is logistic regression (glm). Again, we’ll use the caret package in R to apply the logistic regression classification procedure with a 10-fold cross-validation evaluation procedure.
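
A minimal sketch of the logistic regression fit, under the same assumptions as the earlier sketches:

classifier.glm <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data = train,
                               method = "glm", family = "binomial",
                               preProcess = c("center", "scale"),
                               trControl = fitControl)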

Receiver Operating Characteristic Curve

Next we’ll look at the ROC curve.

We can also look at each ROC curve from each fold from the glm model.

Precision-Recall Curve

Next, we’ll look at the P-R curve.

We can also look at each P-R curve from each fold from the logistic regression model. For the 10 different P-R curves, we can even get a sense of the variation in P-R curves.

  • Recall Mean: 0.885758181729503

  • Recall Standard Deviation: 0.0207876255423969

  • Precision Mean: 0.964588763609958

  • Precision Standard Deviation: 0.00862150259548997

To find the optimal threshold, we’ll look at the \(F_1\) score.

yhat <- tuned_call(preds, opt_thresh.glm)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.001862167
sensitivity
#> [1] 0.9263106
results <- tibble::tibble(y=y, yhat=yhat)

caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Down    Up
#>       Down 61105   149
#>       Up     114  1873
#>                                           
#>                Accuracy : 0.9958          
#>                  95% CI : (0.9953, 0.9963)
#>     No Information Rate : 0.968           
#>     P-Value [Acc > NIR] : < 2e-16         
#>                                           
#>                   Kappa : 0.9323          
#>                                           
#>  Mcnemar's Test P-Value : 0.03604         
#>                                           
#>             Sensitivity : 0.92631         
#>             Specificity : 0.99814         
#>          Pos Pred Value : 0.94263         
#>          Neg Pred Value : 0.99757         
#>              Prevalence : 0.03197         
#>          Detection Rate : 0.02962         
#>    Detection Prevalence : 0.03142         
#>       Balanced Accuracy : 0.96222         
#>                                           
#>        'Positive' Class : Up              
#> 
cm <- results %>% yardstick::conf_mat(truth = y, estimate = yhat)

trueNegative  = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve   = cm$table[2,2]

conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive 
predictedConditionPositive = truePositve +  falsePositive 
predictedConditionNegative = trueNegative + falseNegative

TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative

prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS

F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))

Now using the optimal threshold we get:

  • True Negative = 61105

  • False Negative = 149

  • False Positive = 114

  • True Positive = 1873

  • Condition Positive = 2022

  • Condition Negative = 61219

  • Predicted Condition Positive = 1987

  • Predicted Condition Negative = 61254

  • Total Population = 63241

  • Prevalence = 0.0319729289543176

  • Accuracy = 0.995841305482203

  • Positive predictive value, precision = 0.942627075993961

  • False Discovery rate = 0.0573729240060393

  • False Omission Rate = 0.00243249420446012

  • Negative Predictive Value = 0.99756750579554

  • True Positive Rate, Recall, Sensitivity = 0.926310583580613

  • False Positive Rate = 0.00186216697430536

  • False Negative Rate = 0.0736894164193867

  • Specificity, Selectivity, True Negative rate = 0.998137833025695

  • Positive likelihood ratio = 497.436908914224

  • Negative likelihood ratio = 0.0738268944239986

  • Diagnostic Odds Ratio = 6737.88207935947

  • \(F_1\) Score = 0.934397605387877

From our plot we selected our optimal threshold as 0.214.

Summary Stats glm

Logistic Regression
Accuracy 0.995841305482203
AUC 0.998506949374034
ROC TRUE
Threshold 0.214
Sensitivity=Recall=Power 0.926310583580613
Specificity=1-FPR 0.998137833025695
FDR 0.0573729240060393
Precision=PPV 0.942627075993961

K-Folds Out of Sampling Performance: Random Forest

A random forest fits a collection of decision trees on bootstrapped samples in a way that decorrelates the trees: at each split only a random subset of the predictors is considered. Each tree casts a class vote, and the majority vote determines the class that an observation is assigned to.

The argument mtry indicates the number of predictors that should be considered at each split of a tree. Training across various values of mtry, we can see that our optimal mtry is 2; the number of trees was left at the default of 500. So we’ll proceed with the analysis using these values.
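
A hedged sketch of the random forest fit behind these results (the mtry grid is illustrative; ntree = 500 matches the default noted above, and the other names follow the earlier sketches):

classifier.rf <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data = train,
                              method = "rf", ntree = 500,
                              tuneGrid = data.frame(mtry = 1:3),
                              trControl = fitControl)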

Receiver Operating Characteristic Curve

Next we’ll look at the ROC curve.

We can also look at each ROC curve from each fold from the rf model.

Precision-Recall Curve

Next, we’ll look at the P-R curve.

We can also look at each P-R curve from each fold from the Random Forest model. For the 10 different P-R curves, we can even get a sense of the variation in P-R curves.

  • Recall Mean: 0.948570940837926

  • Recall Standard Deviation: 0.0158306673347466

  • Precision Mean: 0.95907902771856

  • Precision Standard Deviation: 0.0102829571574502

To find the optimal threshold, we’ll look at the \(F_1\) score.

yhat <- tuned_call(preds, opt_thresh.rf)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.0003593656
sensitivity
#> [1] 0.9836795
results <- tibble::tibble(y=y, yhat=yhat)

caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Down    Up
#>       Down 61197    33
#>       Up      22  1989
#>                                           
#>                Accuracy : 0.9991          
#>                  95% CI : (0.9989, 0.9993)
#>     No Information Rate : 0.968           
#>     P-Value [Acc > NIR] : <2e-16          
#>                                           
#>                   Kappa : 0.9859          
#>                                           
#>  Mcnemar's Test P-Value : 0.1775          
#>                                           
#>             Sensitivity : 0.98368         
#>             Specificity : 0.99964         
#>          Pos Pred Value : 0.98906         
#>          Neg Pred Value : 0.99946         
#>              Prevalence : 0.03197         
#>          Detection Rate : 0.03145         
#>    Detection Prevalence : 0.03180         
#>       Balanced Accuracy : 0.99166         
#>                                           
#>        'Positive' Class : Up              
#> 
cm <- results %>% yardstick::conf_mat(truth = y, estimate = yhat)

trueNegative  = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve   = cm$table[2,2]

conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive 
predictedConditionPositive = truePositve +  falsePositive 
predictedConditionNegative = trueNegative + falseNegative

TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative

prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS

F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))

Now using the optimal threshold we get:

  • True Negative = 61197

  • False Negative = 33

  • False Positive = 22

  • True Positive = 1989

  • Condition Positive = 2022

  • Condition Negative = 61219

  • Predicted Condition Positive = 2011

  • Predicted Condition Negative = 61230

  • Total Population = 63241

  • Prevalence = 0.0319729289543176

  • Accuracy = 0.9991303110324

  • Positive predictive value, precision = 0.989060169070114

  • False Discovery rate = 0.0109398309298856

  • False Omission Rate = 0.000538951494365507

  • Negative Predictive Value = 0.999461048505634

  • True Positive Rate, Recall, Sensitivity = 0.983679525222552

  • False Positive Rate = 0.000359365556444895

  • False Negative Rate = 0.0163204747774481

  • Specificity, Selectivity, True Negative rate = 0.999640634443555

  • Positive likelihood ratio = 2737.26712975452

  • Negative likelihood ratio = 0.0163263419023905

  • Diagnostic Odds Ratio = 167659.549586777

  • \(F_1\) Score = 0.986362509298289

From our plot we selected our optimal threshold as 0.495.

Summary Stats Random Forest

Random Forest ( mtry = 2, ntree=500)
Accuracy 0.9991303110324
AUC 0.994483115853513
ROC TRUE
Threshold 0.495
Sensitivity=Recall=Power 0.983679525222552
Specificity=1-FPR 0.999640634443555
FDR 0.0109398309298856
Precision=PPV 0.989060169070114

K-Folds Out of Sampling Performance: Support Vector Machines with Linear Kernel

The support vector machine (SVM) is an extension of the support vector classifier that results from enlarging the feature space using kernels. The linear kernel takes the form of

\[K(x_i,x_{i^\prime}) =\sum_{j=1}^{p}x_{ij}x_{{i^\prime}j}\]

where the kernel quantifies the similarity of a pair of observations using Pearson correlation.

We can compute the distance from each training observation to a given separating hyperplane; the smallest such distance is the margin. When the cost argument is small, the margins will be wide and many support vectors will violate the margin; when the cost argument is large, the margins will be narrow and few support vectors will violate the margin. The parameter C can be thought of as a budget for the amount that the margin can be violated by the n observations. Training across various values of cost, we can see that our optimal C is 1, so we’ll proceed with the analysis using this value.
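
A minimal sketch of the linear-kernel SVM fit, under the same assumptions as the earlier sketches (the cost grid is illustrative and includes the selected value C = 1):

classifier.svm.linear <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data = train,
                                      method = "svmLinear",
                                      preProcess = c("center", "scale"),
                                      tuneGrid = data.frame(C = c(0.01, 0.1, 1, 10, 100)),
                                      trControl = fitControl)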

Receiver Operating Characteristic Curve

Next we’ll look at the ROC curve.

We can also look at each ROC curve from each fold from the SVM with Linear Kernel model.

Precision-Recall Curve

Next, we’ll look at the P-R curve.

We can also look at each P-R curve from each fold from the SVM with Linear Kernel model. For the 10 different P-R curves, we can even get a sense of the variation in P-R curves.

  • Recall Mean: 0.888726040091694

  • Recall Standard Deviation: 0.0158463375204761

  • Precision Mean: 0.965193426042111

  • Precision Standard Deviation: 0.0101404124635792

To find the optimal threshold, we’ll look at the \(F_1\) score.

yhat <- tuned_call(preds, opt_thresh.svm.linear)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.001666149
sensitivity
#> [1] 0.921365
results <- tibble::tibble(y=y, yhat=yhat)

caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Down    Up
#>       Down 61117   159
#>       Up     102  1863
#>                                           
#>                Accuracy : 0.9959          
#>                  95% CI : (0.9953, 0.9964)
#>     No Information Rate : 0.968           
#>     P-Value [Acc > NIR] : < 2.2e-16       
#>                                           
#>                   Kappa : 0.9324          
#>                                           
#>  Mcnemar's Test P-Value : 0.0005276       
#>                                           
#>             Sensitivity : 0.92136         
#>             Specificity : 0.99833         
#>          Pos Pred Value : 0.94809         
#>          Neg Pred Value : 0.99741         
#>              Prevalence : 0.03197         
#>          Detection Rate : 0.02946         
#>    Detection Prevalence : 0.03107         
#>       Balanced Accuracy : 0.95985         
#>                                           
#>        'Positive' Class : Up              
#> 
cm <- results %>% yardstick::conf_mat(truth = y, estimate = yhat)

trueNegative  = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve   = cm$table[2,2]

conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive 
predictedConditionPositive = truePositve +  falsePositive 
predictedConditionNegative = trueNegative + falseNegative

TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative

prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS

F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))

Now using the optimal threshold we get:

  • True Negative = 61117

  • False Negative = 159

  • False Positive = 102

  • True Positive = 1863

  • Condition Positive = 2022

  • Condition Negative = 61219

  • Predicted Condition Positive = 1965

  • Predicted Condition Negative = 61276

  • Total Population = 63241

  • Prevalence = 0.0319729289543176

  • Accuracy = 0.99587293053557

  • Positive predictive value, precision = 0.948091603053435

  • False Discovery rate = 0.0519083969465649

  • False Omission Rate = 0.00259481689405314

  • Negative Predictive Value = 0.997405183105947

  • True Positive Rate, Recall, Sensitivity = 0.921364985163205

  • False Positive Rate = 0.00166614939806269

  • False Negative Rate = 0.0786350148367952

  • Specificity, Selectivity, True Negative rate = 0.998333850601937

  • Positive likelihood ratio = 552.990617908885

  • Negative likelihood ratio = 0.0787662511787844

  • Diagnostic Odds Ratio = 7020.65427302997

  • \(F_1\) Score = 0.934537246049661

From our plot we selected our optimal threshold as 0.233.

Summary Stats SVM Linear

SVM Linear ( Cost = 1)
Accuracy 0.99587293053557
AUC 0.997848791117502
ROC TRUE
Threshold 0.233
Sensitivity=Recall=Power 0.921364985163205
Specificity=1-FPR 0.998333850601937
FDR 0.0519083969465649
Precision=PPV 0.948091603053435

K-Folds Out of Sampling Performance: Support Vector Machines with Radial Basis Function Kernel

The radial kernel of the SVM takes the form

\[K(x_i,x_{i^\prime}) = \exp\left(-\gamma\sum_{j=1}^{p}(x_{ij} - x_{{i^\prime}j})^2\right)\]
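
A hedged sketch of the call that could produce the tuning results printed below; the sigma and C grid values are taken from that output, while the other settings mirror the earlier sketches.

classifier.svm.radial <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data = train,
                                      method = "svmRadial",
                                      preProcess = c("center", "scale"),
                                      tuneGrid = expand.grid(sigma = seq(0.8, 1.8, by = 0.2),
                                                             C = 10^(2:6)),
                                      trControl = fitControl)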

classifier.svm.radial
#> Support Vector Machines with Radial Basis Function Kernel 
#> 
#> 63241 samples
#>     3 predictor
#>     2 classes: 'Down', 'Up' 
#> 
#> Pre-processing: centered (3), scaled (3) 
#> Resampling: Cross-Validated (10 fold) 
#> Summary of sample sizes: 56918, 56917, 56917, 56917, 56916, 56917, ... 
#> Resampling results across tuning parameters:
#> 
#>   sigma  C      Accuracy   Kappa    
#>   0.8    1e+02  0.9969324  0.9500417
#>   0.8    1e+03  0.9970747  0.9522542
#>   0.8    1e+04  0.9972803  0.9557741
#>   0.8    1e+05  0.9972486  0.9551823
#>   0.8    1e+06  0.9606135  0.7975858
#>   1.0    1e+02  0.9970115  0.9512629
#>   1.0    1e+03  0.9970905  0.9525768
#>   1.0    1e+04  0.9972961  0.9561093
#>   1.0    1e+05  0.9973277  0.9563655
#>   1.0    1e+06  0.9929947  0.8547633
#>   1.2    1e+02  0.9970273  0.9515025
#>   1.2    1e+03  0.9972170  0.9546950
#>   1.2    1e+04  0.9972170  0.9547799
#>   1.2    1e+05  0.9973751  0.9570940
#>   1.2    1e+06  0.8931935  0.6229545
#>   1.4    1e+02  0.9970431  0.9517665
#>   1.4    1e+03  0.9972645  0.9555070
#>   1.4    1e+04  0.9972645  0.9555061
#>   1.4    1e+05  0.9972487  0.9549891
#>   1.4    1e+06  0.9597518  0.7016095
#>   1.6    1e+02  0.9970431  0.9517705
#>   1.6    1e+03  0.9972645  0.9554496
#>   1.6    1e+04  0.9973751  0.9572456
#>   1.6    1e+05  0.9972170  0.9543396
#>   1.6    1e+06  0.9453186  0.6469249
#>   1.8    1e+02  0.9971380  0.9533904
#>   1.8    1e+03  0.9972961  0.9560316
#>   1.8    1e+04  0.9973277  0.9564755
#>   1.8    1e+05  0.9972170  0.9543601
#>   1.8    1e+06  0.9545834  0.7186193
#> 
#> Accuracy was used to select the optimal model using the largest value.
#> The final values used for the model were sigma = 1.2 and C = 1e+05.
classifier.svm.radial.bestC <- classifier.svm.radial$bestTune$C
classifier.svm.radial.bestSigma <- classifier.svm.radial$bestTune$sigma

Training across various values of Cost and Sigma, we can see that our optimal C is 1e+05 and our optimal sigma is 1.2. So we’ll proceed with the analysis using these values.

Receiver Operating Characteristic Curve

Next we’ll look at the ROC curve.

We can also look at each ROC curve from each fold from the SVM with Radial Kernel model.

Precision-Recall Curve

Next, we’ll look at the P-R curve.

We can also look at each P-R curve from each fold from the SVM with Radial Kernel model. For the 10 different P-R curves, we can even get a sense of the variation in P-R curves.

  • Recall Mean: 0.950051212017754

  • Recall Standard Deviation: 0.0247402222819946

  • Precision Mean: 0.967355639517883

  • Precision Standard Deviation: 0.0074508221585993

To find the optimal threshold, we’ll look at the \(F_1\) score.

yhat <- tuned_call(preds, opt_thresh.svm.radial)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.001519136
sensitivity
#> [1] 0.9772502
results <- tibble::tibble(y=y, yhat=yhat)

caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Down    Up
#>       Down 61126    46
#>       Up      93  1976
#>                                           
#>                Accuracy : 0.9978          
#>                  95% CI : (0.9974, 0.9982)
#>     No Information Rate : 0.968           
#>     P-Value [Acc > NIR] : < 2.2e-16       
#>                                           
#>                   Kappa : 0.9649          
#>                                           
#>  Mcnemar's Test P-Value : 9.553e-05       
#>                                           
#>             Sensitivity : 0.97725         
#>             Specificity : 0.99848         
#>          Pos Pred Value : 0.95505         
#>          Neg Pred Value : 0.99925         
#>              Prevalence : 0.03197         
#>          Detection Rate : 0.03125         
#>    Detection Prevalence : 0.03272         
#>       Balanced Accuracy : 0.98787         
#>                                           
#>        'Positive' Class : Up              
#> 
cm <- results %>% yardstick::conf_mat(truth = y, estimate = yhat)

trueNegative  = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve   = cm$table[2,2]

conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive 
predictedConditionPositive = truePositve +  falsePositive 
predictedConditionNegative = trueNegative + falseNegative

TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative

prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS

F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))

Now using the optimal threshold we get:

  • True Negative = 61126

  • False Negative = 46

  • False Positive = 93

  • True Positive = 1976

  • Condition Positive = 2022

  • Condition Negative = 61219

  • Predicted Condition Positive = 2069

  • Predicted Condition Negative = 61172

  • Total Population = 63241

  • Prevalence = 0.0319729289543176

  • Accuracy = 0.997802058790974

  • Positive predictive value, precision = 0.955050749154181

  • False Discovery rate = 0.0449492508458192

  • False Omission Rate = 0.00075197802916367

  • Negative Predictive Value = 0.999248021970836

  • True Positive Rate, Recall, Sensitivity = 0.977250247279921

  • False Positive Rate = 0.00151913621588069

  • False Negative Rate = 0.0227497527200791

  • Specificity, Selectivity, True Negative rate = 0.998480863784119

  • Positive likelihood ratio = 643.293364389564

  • Negative likelihood ratio = 0.0227843652745235

  • Diagnostic Odds Ratio = 28233.9822346891

  • \(F_1\) Score = 0.966022977267172

From our plot we selected our optimal threshold as 0.335.

Summary Stats SVM Radial

SVM Radial ( Cost = 1e+05, Sigma = 1.2)
Accuracy 0.997802058790974
AUC 0.999820301064707
ROC TRUE
Threshold 0.335
Sensitivity=Recall=Power 0.977250247279921
Specificity=1-FPR 0.998480863784119
FDR 0.0449492508458192
Precision=PPV 0.955050749154181

K-Folds Out of Sampling Performance: Support Vector Machines with Polynomial Kernel

The polynomial kernel of the SVM takes the form of

\[K(x_i,x_{i^\prime}) = \left(1+\sum_{j=1}^{p}x_{ij}x_{{i^\prime}j}\right)^d\]
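
A hedged sketch of the call that could produce the tuning results printed below; the degree, scale and C grid values are taken from that output, while the other settings mirror the earlier sketches.

classifier.svm.poly <- caret::train(BlueTarp_rest ~ Red + Green + Blue, data = train,
                                    method = "svmPoly",
                                    preProcess = c("center", "scale"),
                                    tuneGrid = expand.grid(degree = 4:6,
                                                           scale = c(0.6, 0.8, 1.0, 1.2),
                                                           C = c(100, 1000, 10000)),
                                    trControl = fitControl)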

classifier.svm.poly
#> Support Vector Machines with Polynomial Kernel 
#> 
#> 63241 samples
#>     3 predictor
#>     2 classes: 'Down', 'Up' 
#> 
#> Pre-processing: centered (3), scaled (3) 
#> Resampling: Cross-Validated (10 fold) 
#> Summary of sample sizes: 56918, 56917, 56917, 56917, 56916, 56917, ... 
#> Resampling results across tuning parameters:
#> 
#>   scale  C      degree  Accuracy   Kappa     
#>   0.6      100  4       0.9969798  0.95073253
#>   0.6      100  5       0.9969798  0.95039113
#>   0.6      100  6       0.9969482  0.94973290
#>   0.6     1000  4       0.9971222  0.95283161
#>   0.6     1000  5       0.9969166  0.94921460
#>   0.6     1000  6       0.9953667  0.92889594
#>   0.6    10000  4       0.9637691  0.66318022
#>   0.6    10000  5       0.9450249  0.33139394
#>   0.6    10000  6       0.8959164  0.35610998
#>   0.8      100  4       0.9969798  0.95056708
#>   0.8      100  5       0.9969957  0.95060633
#>   0.8      100  6       0.9969166  0.94916759
#>   0.8     1000  4       0.9970905  0.95235627
#>   0.8     1000  5       0.9799512  0.82988695
#>   0.8     1000  6       0.9256509  0.60509759
#>   0.8    10000  4       0.9056307  0.43616018
#>   0.8    10000  5       0.9268660  0.15758162
#>   0.8    10000  6       0.9594508  0.20609072
#>   1.0      100  4       0.9970747  0.95209703
#>   1.0      100  5       0.9969324  0.94944514
#>   1.0      100  6       0.9967110  0.94559460
#>   1.0     1000  4       0.9968376  0.94849150
#>   1.0     1000  5       0.9475411  0.61267836
#>   1.0     1000  6       0.9569166  0.58566453
#>   1.0    10000  4       0.9779145  0.40916541
#>   1.0    10000  5       0.9518177  0.25844373
#>   1.0    10000  6       0.8896854  0.05280171
#>   1.2      100  4       0.9971064  0.95260152
#>   1.2      100  5       0.9958889  0.92687426
#>   1.2      100  6       0.9585005  0.76295369
#>   1.2     1000  4       0.9627126  0.77478578
#>   1.2     1000  5       0.9767552  0.56539850
#>   1.2     1000  6       0.9179317  0.38212066
#>   1.2    10000  4       0.9635912  0.24989693
#>   1.2    10000  5       0.9750423  0.31543508
#>   1.2    10000  6       0.9179759  0.01846190
#> 
#> Accuracy was used to select the optimal model using the largest value.
#> The final values used for the model were degree = 4, scale = 0.6 and C = 1000.
classifier.svm.poly.bestC <- classifier.svm.poly$bestTune$C
classifier.svm.poly.bestDegree <- classifier.svm.poly$bestTune$degree
classifier.svm.poly.bestScale <- classifier.svm.poly$bestTune$scale

Training across various values of Cost, Degree and Scale, we can see that our optimal C is 1000, our optimal degree is 4, and our optimal scale is 0.6. So we’ll proceed with the analysis using these values.

We can look at the plots of scale versus accuracy across the various cost and polynomial degree values.

Receiver Operating Characteristic Curve

Next we’ll look at the ROC curve.

We can also look at each ROC curve from each fold from the SVM with Polynomial Kernel model.

Precision-Recall Curve

Next, we’ll look at the P-R curve.

We can also look at each P-R curve from each fold from the SVM with Polynomial Kernel model. For the 10 different P-R curves, we can even get a sense of the variation in P-R curves.

  • Recall Mean: 0.942640101448568

  • Recall Standard Deviation: 0.0259690507584868

  • Precision Mean: 0.966767401633039

  • Precision Standard Deviation: 0.013585405388514

To find the optimal threshold, we’ll look at the \(F_1\) score.

yhat <- tuned_call(preds, opt_thresh.svm.poly)
one_minus_specificity <- mean(yhat[y=='Down']=='Up')
sensitivity <- mean(yhat[y=='Up']=='Up')
one_minus_specificity
#> [1] 0.001176105
sensitivity
#> [1] 0.9495549
results <- tibble::tibble(y=y, yhat=yhat)

caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction  Down    Up
#>       Down 61147   102
#>       Up      72  1920
#>                                           
#>                Accuracy : 0.9972          
#>                  95% CI : (0.9968, 0.9976)
#>     No Information Rate : 0.968           
#>     P-Value [Acc > NIR] : < 2e-16         
#>                                           
#>                   Kappa : 0.9552          
#>                                           
#>  Mcnemar's Test P-Value : 0.02791         
#>                                           
#>             Sensitivity : 0.94955         
#>             Specificity : 0.99882         
#>          Pos Pred Value : 0.96386         
#>          Neg Pred Value : 0.99833         
#>              Prevalence : 0.03197         
#>          Detection Rate : 0.03036         
#>    Detection Prevalence : 0.03150         
#>       Balanced Accuracy : 0.97419         
#>                                           
#>        'Positive' Class : Up              
#> 
cm <- results %>% yardstick::conf_mat(truth = y, estimate = yhat)

trueNegative  = cm$table[1,1]
falseNegative = cm$table[1,2]
falsePositive = cm$table[2,1]
truePositve   = cm$table[2,2]

conditionPositive = truePositve + falseNegative
conditionNegative = trueNegative + falsePositive 
predictedConditionPositive = truePositve +  falsePositive 
predictedConditionNegative = trueNegative + falseNegative

TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative

prevalence = conditionPositive / TotalPopulation
accuracy = (truePositve + trueNegative) / TotalPopulation
PPV = truePositve / predictedConditionPositive
FDR = falsePositive / predictedConditionPositive
FOR = falseNegative / predictedConditionNegative
NPV = trueNegative / predictedConditionNegative
TPR = truePositve / conditionPositive
FPR = falsePositive / conditionNegative
FNR = falseNegative / conditionPositive
SPC = TNR = trueNegative / conditionNegative
LR.PLUS = TPR / FPR
LR.MINUS = FNR/TNR
DOR = LR.PLUS / LR.MINUS

F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
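For reference, the quantities derived above follow the standard contingency-table definitions, where \(TP\), \(FP\), \(TN\) and \(FN\) are the true/false positive/negative counts:

\[
\mathrm{PPV} = \frac{TP}{TP+FP}, \quad
\mathrm{TPR} = \frac{TP}{TP+FN}, \quad
\mathrm{TNR} = \frac{TN}{TN+FP}, \quad
\mathrm{FPR} = \frac{FP}{TN+FP}, \quad
\mathrm{FNR} = \frac{FN}{TP+FN}, \quad
\mathrm{FDR} = \frac{FP}{TP+FP} = 1-\mathrm{PPV}
\]

\[
\mathrm{LR}^{+} = \frac{\mathrm{TPR}}{\mathrm{FPR}}, \qquad
\mathrm{LR}^{-} = \frac{\mathrm{FNR}}{\mathrm{TNR}}, \qquad
\mathrm{DOR} = \frac{\mathrm{LR}^{+}}{\mathrm{LR}^{-}}, \qquad
F_1 = \frac{2\,\mathrm{PPV}\cdot\mathrm{TPR}}{\mathrm{PPV}+\mathrm{TPR}}
\]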

Now using the optimal threshold we get:

  • True Negative = 61147

  • False Negative = 102

  • False Positive = 72

  • True Positive = 1920

  • Condition Positive = 2022

  • Condition Negative = 61219

  • Predicted Condition Positive = 1992

  • Predicted Condition Negative = 61249

  • Total Population = 63241

  • Prevalence = 0.0319729289543176

  • Accuracy = 0.997248620357047

  • Positive predictive value, precision = 0.963855421686747

  • False Discovery rate = 0.036144578313253

  • False Omission Rate = 0.00166533331156427

  • Negative Predictive Value = 0.998334666688436

  • True Positive Rate, Recall, Sensitivity = 0.949554896142433

  • False Positive Rate = 0.00117610545745602

  • False Negative Rate = 0.0504451038575668

  • Specificity, Selectivity, True Negative rate = 0.998823894542544

  • Positive likelihood ratio = 807.37223870755

  • Negative likelihood ratio = 0.0505045024785579

  • Diagnostic Odds Ratio = 15986.1437908497

  • \(F_1\) Score = 0.956651718983558

From our plot, we selected 0.39 as our optimal threshold.
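As a minimal sketch of how such a value can be read off the \(F_1\) grid search (assuming a thresholds vector and its matching F_1_scores, as in the tuning loops used later for the hold-out evaluations); note that which.max returns the first maximum and silently drops NAs, whereas the loops below keep the last maximising threshold:

# Threshold with the largest F1 score over the search grid.
opt_thresh <- thresholds[which.max(F_1_scores)]
opt_thresh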

Summary Stats SVM Poly

| Metric | SVM Polynomial (Cost = 1000, Degree = 4, Scale = 0.6) |
|---|---|
| Accuracy | 0.997248620357047 |
| AUC | 0.999714294526813 |
| ROC | TRUE |
| Threshold | 0.39 |
| Sensitivity=Recall=Power | 0.949554896142433 |
| Specificity=1-FPR | 0.998823894542544 |
| FDR | 0.036144578313253 |
| Precision=PPV | 0.963855421686747 |

K-Fold Out-of-Sample Performance: Table 2

| Metric | KNN (k = 5) | LDA | QDA | Logistic Regression |
|---|---|---|---|---|
| Accuracy | 0.99791274647776 | 0.983997722996158 | 0.994797678721083 | 0.995841305482203 |
| AUC | 0.999862454860983 | 0.988876770009065 | 0.998217543931761 | 0.998506949374034 |
| ROC | TRUE | TRUE | TRUE | TRUE |
| Threshold | 0.499 | 0.495 | 0.22 | 0.214 |
| Sensitivity=Recall=Power | 0.969337289812067 | 0.80217606330366 | 0.867457962413452 | 0.926310583580613 |
| Specificity=1-FPR | 0.998856564138584 | 0.990003103611624 | 0.999003577320766 | 0.998137833025695 |
| FDR | 0.0344827586206897 | 0.273948075201432 | 0.0336088154269972 | 0.0573729240060393 |
| Precision=PPV | 0.96551724137931 | 0.726051924798568 | 0.966391184573003 | 0.942627075993961 |

| Metric | Random Forest (mtry = 2, ntree = 500) | SVM Linear (Cost = 1) | SVM Radial (Cost = 1e+05, Sigma = 1.2) | SVM Polynomial (Cost = 1000, Degree = 4, Scale = 0.6) |
|---|---|---|---|---|
| Accuracy | 0.9991303110324 | 0.99587293053557 | 0.997802058790974 | 0.997248620357047 |
| AUC | 0.994483115853513 | 0.997848791117502 | 0.999820301064707 | 0.999714294526813 |
| ROC | TRUE | TRUE | TRUE | TRUE |
| Threshold | 0.495 | 0.233 | 0.335 | 0.39 |
| Sensitivity=Recall=Power | 0.983679525222552 | 0.921364985163205 | 0.977250247279921 | 0.949554896142433 |
| Specificity=1-FPR | 0.999640634443555 | 0.998333850601937 | 0.998480863784119 | 0.998823894542544 |
| FDR | 0.0109398309298856 | 0.0519083969465649 | 0.0449492508458192 | 0.036144578313253 |
| Precision=PPV | 0.989060169070114 | 0.948091603053435 | 0.955050749154181 | 0.963855421686747 |

Hold-Out Test Sample Performance: K-Nearest Neighbors

y <- fho$BlueTarp_rest
preds <- predict(classifier.knn, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1977410    2501
#>       Up     12287   11979
#>                                           
#>                Accuracy : 0.9926          
#>                  95% CI : (0.9925, 0.9927)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : 0.9948          
#>                                           
#>                   Kappa : 0.6148          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.827279        
#>             Specificity : 0.993825        
#>          Pos Pred Value : 0.493654        
#>          Neg Pred Value : 0.998737        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.005977        
#>    Detection Prevalence : 0.012108        
#>       Balanced Accuracy : 0.910552        
#>                                           
#>        'Positive' Class : Up              
#> 

thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
  yhat <- tuned_call(preds, thresholds[i])
  results <- tibble::tibble(y=y, yhat=yhat)
  cm <- results %>% yardstick::conf_mat(truth=y, yhat)
  
  trueNegative  = cm$table[1,1]
  falseNegative = cm$table[1,2]
  falsePositive = cm$table[2,1]
  truePositve   = cm$table[2,2]

  conditionPositive = truePositve + falseNegative
  conditionNegative = trueNegative + falsePositive 
  predictedConditionPositive = truePositve +  falsePositive 
  predictedConditionNegative = trueNegative + falseNegative
  
  TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative

  PPV = truePositve / predictedConditionPositive
  TPR = truePositve / conditionPositive

  F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
  F_1_scores[i] <- F_1_score
}

#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
# Use the smaller of the CV-tuned threshold and the (last) threshold that maximises F1 on the hold-out predictions
fho.knn.Threshold = min(c(knn.Threshold, thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.knn.ROC = "TRUE"

statistics = stat_fho(classifier.knn, fho, fho.knn.Threshold)

fho.knn.accuracy = statistics[1]
fho.knn.sensitivity = statistics[2]
fho.knn.specificity = statistics[3]
fho.knn.FDR = statistics[4]
fho.knn.precision = statistics[5]

yhat <- tuned_call(preds, fho.knn.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1977282    2499
#>       Up     12415   11981
#>                                           
#>                Accuracy : 0.9926          
#>                  95% CI : (0.9924, 0.9927)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : 0.9998          
#>                                           
#>                   Kappa : 0.6129          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.827417        
#>             Specificity : 0.993760        
#>          Pos Pred Value : 0.491105        
#>          Neg Pred Value : 0.998738        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.005978        
#>    Detection Prevalence : 0.012173        
#>       Balanced Accuracy : 0.910589        
#>                                           
#>        'Positive' Class : Up              
#> 

Hold-Out Test Sample Performance: Linear Discriminant Analysis

y <- fho$BlueTarp_rest
preds <- predict(classifier.lda, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1955452    2332
#>       Up     34245   12148
#>                                           
#>                Accuracy : 0.9817          
#>                  95% CI : (0.9816, 0.9819)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : 1               
#>                                           
#>                   Kappa : 0.3924          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.838950        
#>             Specificity : 0.982789        
#>          Pos Pred Value : 0.261850        
#>          Neg Pred Value : 0.998809        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.006061        
#>    Detection Prevalence : 0.023148        
#>       Balanced Accuracy : 0.910870        
#>                                           
#>        'Positive' Class : Up              
#> 

thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
  yhat <- tuned_call(preds, thresholds[i])
  results <- tibble::tibble(y=y, yhat=yhat)
  cm <- results %>% yardstick::conf_mat(truth=y, yhat)
  
  trueNegative  = cm$table[1,1]
  falseNegative = cm$table[1,2]
  falsePositive = cm$table[2,1]
  truePositve   = cm$table[2,2]
  
  conditionPositive = truePositve + falseNegative
  conditionNegative = trueNegative + falsePositive 
  predictedConditionPositive = truePositve +  falsePositive 
  predictedConditionNegative = trueNegative + falseNegative
  
  TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
  
  PPV = truePositve / predictedConditionPositive
  TPR = truePositve / conditionPositive
  
  F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
  F_1_scores[i] <- F_1_score
}

#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.lda.Threshold = min(c(lda.Threshold, thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.lda.ROC = "TRUE"

statistics = stat_fho(classifier.lda, fho, fho.lda.Threshold)

fho.lda.accuracy = statistics[1]
fho.lda.sensitivity = statistics[2]
fho.lda.specificity = statistics[3]
fho.lda.FDR = statistics[4]
fho.lda.precision = statistics[5]

yhat <- tuned_call(preds, fho.lda.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1955437    2320
#>       Up     34260   12160
#>                                           
#>                Accuracy : 0.9817          
#>                  95% CI : (0.9816, 0.9819)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : 1               
#>                                           
#>                   Kappa : 0.3927          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.839779        
#>             Specificity : 0.982781        
#>          Pos Pred Value : 0.261956        
#>          Neg Pred Value : 0.998815        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.006067        
#>    Detection Prevalence : 0.023162        
#>       Balanced Accuracy : 0.911280        
#>                                           
#>        'Positive' Class : Up              
#> 

Summary

| Metric | FHO LDA |
|---|---|
| Accuracy | 0.981748119053357 |
| AUC | 0.992115543668651 |
| ROC | TRUE |
| Threshold | 0.493 |
| Sensitivity=Recall=Power | 0.839779005524862 |
| Specificity=1-FPR | 0.982781297855905 |
| FDR | 0.738043946574752 |
| Precision=PPV | 0.261956053425248 |

Hold-Out Test Sample Performance: Quadratic Discriminant Analysis

y <- fho$BlueTarp_rest
preds <- predict(classifier.qda, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1986046    4422
#>       Up      3651   10058
#>                                           
#>                Accuracy : 0.996           
#>                  95% CI : (0.9959, 0.9961)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : < 2.2e-16       
#>                                           
#>                   Kappa : 0.7116          
#>                                           
#>  Mcnemar's Test P-Value : < 2.2e-16       
#>                                           
#>             Sensitivity : 0.694613        
#>             Specificity : 0.998165        
#>          Pos Pred Value : 0.733679        
#>          Neg Pred Value : 0.997778        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.005019        
#>    Detection Prevalence : 0.006840        
#>       Balanced Accuracy : 0.846389        
#>                                           
#>        'Positive' Class : Up              
#> 

thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
  yhat <- tuned_call(preds, thresholds[i])
  results <- tibble::tibble(y=y, yhat=yhat)
  cm <- results %>% yardstick::conf_mat(truth=y, yhat)
  
  trueNegative  = cm$table[1,1]
  falseNegative = cm$table[1,2]
  falsePositive = cm$table[2,1]
  truePositve   = cm$table[2,2]
  
  conditionPositive = truePositve + falseNegative
  conditionNegative = trueNegative + falsePositive 
  predictedConditionPositive = truePositve +  falsePositive 
  predictedConditionNegative = trueNegative + falseNegative
  
  TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
  
  PPV = truePositve / predictedConditionPositive
  TPR = truePositve / conditionPositive
  
  F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
  F_1_scores[i] <- F_1_score
}

#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.qda.Threshold = min(c(qda.Threshold,thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.qda.ROC = "TRUE"

statistics = stat_fho(classifier.qda, fho, fho.qda.Threshold)

fho.qda.accuracy = statistics[1]
fho.qda.sensitivity = statistics[2]
fho.qda.specificity = statistics[3]
fho.qda.FDR = statistics[4]
fho.qda.precision = statistics[5]

yhat <- tuned_call(preds, fho.qda.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1982981    3464
#>       Up      6716   11016
#>                                          
#>                Accuracy : 0.9949         
#>                  95% CI : (0.9948, 0.995)
#>     No Information Rate : 0.9928         
#>     P-Value [Acc > NIR] : < 2.2e-16      
#>                                          
#>                   Kappa : 0.6814         
#>                                          
#>  Mcnemar's Test P-Value : < 2.2e-16      
#>                                          
#>             Sensitivity : 0.760773       
#>             Specificity : 0.996625       
#>          Pos Pred Value : 0.621250       
#>          Neg Pred Value : 0.998256       
#>              Prevalence : 0.007225       
#>          Detection Rate : 0.005497       
#>    Detection Prevalence : 0.008848       
#>       Balanced Accuracy : 0.878699       
#>                                          
#>        'Positive' Class : Up             
#> 

Summary

| Metric | FHO QDA |
|---|---|
| Accuracy | 0.994920608309546 |
| AUC | 0.991500075918026 |
| ROC | TRUE |
| Threshold | 0.22 |
| Sensitivity=Recall=Power | 0.760773480662983 |
| Specificity=1-FPR | 0.996624611687106 |
| FDR | 0.378750281976088 |
| Precision=PPV | 0.621249718023912 |

Hold-Out Test Sample Performance: Logistic Regression

y <- fho$BlueTarp_rest
preds <- predict(classifier.glm, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1969383     170
#>       Up     20314   14310
#>                                           
#>                Accuracy : 0.9898          
#>                  95% CI : (0.9896, 0.9899)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : 1               
#>                                           
#>                   Kappa : 0.5786          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.988260        
#>             Specificity : 0.989790        
#>          Pos Pred Value : 0.413297        
#>          Neg Pred Value : 0.999914        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.007140        
#>    Detection Prevalence : 0.017276        
#>       Balanced Accuracy : 0.989025        
#>                                           
#>        'Positive' Class : Up              
#> 

thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
  yhat <- tuned_call(preds, thresholds[i])
  results <- tibble::tibble(y=y, yhat=yhat)
  cm <- results %>% yardstick::conf_mat(truth=y, yhat)
  
  trueNegative  = cm$table[1,1]
  falseNegative = cm$table[1,2]
  falsePositive = cm$table[2,1]
  truePositve   = cm$table[2,2]
  
  conditionPositive = truePositve + falseNegative
  conditionNegative = trueNegative + falsePositive 
  predictedConditionPositive = truePositve +  falsePositive 
  predictedConditionNegative = trueNegative + falseNegative
  
  TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
  
  PPV = truePositve / predictedConditionPositive
  TPR = truePositve / conditionPositive
  
  F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
  F_1_scores[i] <- F_1_score
}

#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.glm.Threshold = min(c(glm.Threshold, thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.glm.ROC = "TRUE"

statistics = stat_fho(classifier.glm, fho, fho.glm.Threshold)

fho.glm.accuracy = statistics[1]
fho.glm.sensitivity = statistics[2]
fho.glm.specificity = statistics[3]
fho.glm.FDR = statistics[4]
fho.glm.precision = statistics[5]

yhat <- tuned_call(preds, fho.glm.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1920306      97
#>       Up     69391   14383
#>                                           
#>                Accuracy : 0.9653          
#>                  95% CI : (0.9651, 0.9656)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : 1               
#>                                           
#>                   Kappa : 0.2839          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.993301        
#>             Specificity : 0.965125        
#>          Pos Pred Value : 0.171688        
#>          Neg Pred Value : 0.999949        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.007177        
#>    Detection Prevalence : 0.041800        
#>       Balanced Accuracy : 0.979213        
#>                                           
#>        'Positive' Class : Up              
#> 

Summary

| Metric | FHO Logistic Regression |
|---|---|
| Accuracy | 0.965328411612348 |
| AUC | 0.999413070684321 |
| ROC | TRUE |
| Threshold | 0.214 |
| Sensitivity=Recall=Power | 0.993301104972376 |
| Specificity=1-FPR | 0.965124840616436 |
| FDR | 0.828311886742904 |
| Precision=PPV | 0.171688113257096 |

Hold-Out Test Sample Performance: Random Forest

y <- fho$BlueTarp_rest
preds <- predict(classifier.rf, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1979710    3002
#>       Up      9987   11478
#>                                           
#>                Accuracy : 0.9935          
#>                  95% CI : (0.9934, 0.9936)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : < 2.2e-16       
#>                                           
#>                   Kappa : 0.6355          
#>                                           
#>  Mcnemar's Test P-Value : < 2.2e-16       
#>                                           
#>             Sensitivity : 0.792680        
#>             Specificity : 0.994981        
#>          Pos Pred Value : 0.534731        
#>          Neg Pred Value : 0.998486        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.005727        
#>    Detection Prevalence : 0.010710        
#>       Balanced Accuracy : 0.893830        
#>                                           
#>        'Positive' Class : Up              
#> 

thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
  yhat <- tuned_call(preds, thresholds[i])
  results <- tibble::tibble(y=y, yhat=yhat)
  cm <- results %>% yardstick::conf_mat(truth=y, yhat)
  
  trueNegative  = cm$table[1,1]
  falseNegative = cm$table[1,2]
  falsePositive = cm$table[2,1]
  truePositve   = cm$table[2,2]
  
  conditionPositive = truePositve + falseNegative
  conditionNegative = trueNegative + falsePositive 
  predictedConditionPositive = truePositve +  falsePositive 
  predictedConditionNegative = trueNegative + falseNegative
  
  TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
  
  PPV = truePositve / predictedConditionPositive
  TPR = truePositve / conditionPositive
  
  F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
  F_1_scores[i] <- F_1_score
}

#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.rf.Threshold = min(c(rf.Threshold,thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.rf.ROC = "TRUE"

statistics = stat_fho(classifier.rf, fho, fho.rf.Threshold)

fho.rf.accuracy = statistics[1]
fho.rf.sensitivity = statistics[2]
fho.rf.specificity = statistics[3]
fho.rf.FDR = statistics[4]
fho.rf.precision = statistics[5]

yhat <- tuned_call(preds, fho.rf.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1979537    2993
#>       Up     10160   11487
#>                                           
#>                Accuracy : 0.9934          
#>                  95% CI : (0.9933, 0.9935)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : < 2.2e-16       
#>                                           
#>                   Kappa : 0.6327          
#>                                           
#>  Mcnemar's Test P-Value : < 2.2e-16       
#>                                           
#>             Sensitivity : 0.793301        
#>             Specificity : 0.994894        
#>          Pos Pred Value : 0.530651        
#>          Neg Pred Value : 0.998490        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.005732        
#>    Detection Prevalence : 0.010801        
#>       Balanced Accuracy : 0.894097        
#>                                           
#>        'Positive' Class : Up              
#> 

Summary

| Metric | FHO Random Forest |
|---|---|
| Accuracy | 0.993437206394445 |
| AUC | 0.980334789679392 |
| ROC | TRUE |
| Threshold | 0.495 |
| Sensitivity=Recall=Power | 0.793301104972376 |
| Specificity=1-FPR | 0.994893694869118 |
| FDR | 0.469349101492124 |
| Precision=PPV | 0.530650898507876 |

Hold-Out Test Sample Performance: Support Vector Machines with Linear Kernel

y <- fho$BlueTarp_rest
preds <- predict(classifier.svm.linear, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1949566     162
#>       Up     40131   14318
#>                                           
#>                Accuracy : 0.9799          
#>                  95% CI : (0.9797, 0.9801)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : 1               
#>                                           
#>                   Kappa : 0.4087          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.988812        
#>             Specificity : 0.979831        
#>          Pos Pred Value : 0.262962        
#>          Neg Pred Value : 0.999917        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.007144        
#>    Detection Prevalence : 0.027168        
#>       Balanced Accuracy : 0.984321        
#>                                           
#>        'Positive' Class : Up              
#> 

thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
  yhat <- tuned_call(preds, thresholds[i])
  results <- tibble::tibble(y=y, yhat=yhat)
  cm <- results %>% yardstick::conf_mat(truth=y, yhat)
  
  trueNegative  = cm$table[1,1]
  falseNegative = cm$table[1,2]
  falsePositive = cm$table[2,1]
  truePositve   = cm$table[2,2]
  
  conditionPositive = truePositve + falseNegative
  conditionNegative = trueNegative + falsePositive 
  predictedConditionPositive = truePositve +  falsePositive 
  predictedConditionNegative = trueNegative + falseNegative
  
  TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
  
  PPV = truePositve / predictedConditionPositive
  TPR = truePositve / conditionPositive
  
  F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
  F_1_scores[i] <- F_1_score
}

#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.svm.linear.Threshold = min(c(svm.linear.Threshold, thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.svm.linear.ROC = "TRUE"

statistics = stat_fho(classifier.svm.linear, fho, fho.svm.linear.Threshold)

fho.svm.linear.accuracy = statistics[1]
fho.svm.linear.sensitivity = statistics[2]
fho.svm.linear.specificity = statistics[3]
fho.svm.linear.FDR = statistics[4]
fho.svm.linear.precision = statistics[5]

yhat <- tuned_call(preds, fho.svm.linear.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1894813     103
#>       Up     94884   14377
#>                                           
#>                Accuracy : 0.9526          
#>                  95% CI : (0.9523, 0.9529)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : 1               
#>                                           
#>                   Kappa : 0.2225          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.992887        
#>             Specificity : 0.952312        
#>          Pos Pred Value : 0.131584        
#>          Neg Pred Value : 0.999946        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.007174        
#>    Detection Prevalence : 0.054517        
#>       Balanced Accuracy : 0.972600        
#>                                           
#>        'Positive' Class : Up              
#> 

Hold-Out Test Sample Performance: Support Vector Machines with Radial Basis Function Kernel

y <- fho$BlueTarp_rest
preds <- predict(classifier.svm.radial, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1977443    3524
#>       Up     12254   10956
#>                                          
#>                Accuracy : 0.9921         
#>                  95% CI : (0.992, 0.9922)
#>     No Information Rate : 0.9928         
#>     P-Value [Acc > NIR] : 1              
#>                                          
#>                   Kappa : 0.5776         
#>                                          
#>  Mcnemar's Test P-Value : <2e-16         
#>                                          
#>             Sensitivity : 0.756630       
#>             Specificity : 0.993841       
#>          Pos Pred Value : 0.472038       
#>          Neg Pred Value : 0.998221       
#>              Prevalence : 0.007225       
#>          Detection Rate : 0.005467       
#>    Detection Prevalence : 0.011581       
#>       Balanced Accuracy : 0.875236       
#>                                          
#>        'Positive' Class : Up             
#> 

thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
  yhat <- tuned_call(preds, thresholds[i])
  results <- tibble::tibble(y=y, yhat=yhat)
  cm <- results %>% yardstick::conf_mat(truth=y, yhat)
  
  trueNegative  = cm$table[1,1]
  falseNegative = cm$table[1,2]
  falsePositive = cm$table[2,1]
  truePositve   = cm$table[2,2]
  
  conditionPositive = truePositve + falseNegative
  conditionNegative = trueNegative + falsePositive 
  predictedConditionPositive = truePositve +  falsePositive 
  predictedConditionNegative = trueNegative + falseNegative
  
  TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
  
  PPV = truePositve / predictedConditionPositive
  TPR = truePositve / conditionPositive
  
  F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
  F_1_scores[i] <- F_1_score
}

#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.svm.radial.Threshold = min(c(svm.radial.Threshold, thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.svm.radial.ROC = "TRUE"

statistics = stat_fho(classifier.svm.radial, fho, fho.svm.radial.Threshold)

fho.svm.radial.accuracy = statistics[1]
fho.svm.radial.sensitivity = statistics[2]
fho.svm.radial.specificity = statistics[3]
fho.svm.radial.FDR = statistics[4]
fho.svm.radial.precision = statistics[5]

yhat <- tuned_call(preds, fho.svm.radial.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1973658    3121
#>       Up     16039   11359
#>                                           
#>                Accuracy : 0.9904          
#>                  95% CI : (0.9903, 0.9906)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : 1               
#>                                           
#>                   Kappa : 0.5381          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.784461        
#>             Specificity : 0.991939        
#>          Pos Pred Value : 0.414592        
#>          Neg Pred Value : 0.998421        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.005668        
#>    Detection Prevalence : 0.013670        
#>       Balanced Accuracy : 0.888200        
#>                                           
#>        'Positive' Class : Up              
#> 

Hold-Out Test Sample Performance: Support Vector Machines with Polynomial Kernel

y <- fho$BlueTarp_rest
preds <- predict(classifier.svm.poly, newdata = fho, type = "prob")[,2]
yhat <- tuned_call(preds, 0.5)
caret::confusionMatrix(yhat, y, positive = "Up") 
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1979177    7205
#>       Up     10520    7275
#>                                          
#>                Accuracy : 0.9912         
#>                  95% CI : (0.991, 0.9913)
#>     No Information Rate : 0.9928         
#>     P-Value [Acc > NIR] : 1              
#>                                          
#>                   Kappa : 0.4464         
#>                                          
#>  Mcnemar's Test P-Value : <2e-16         
#>                                          
#>             Sensitivity : 0.502417       
#>             Specificity : 0.994713       
#>          Pos Pred Value : 0.408823       
#>          Neg Pred Value : 0.996373       
#>              Prevalence : 0.007225       
#>          Detection Rate : 0.003630       
#>    Detection Prevalence : 0.008879       
#>       Balanced Accuracy : 0.748565       
#>                                          
#>        'Positive' Class : Up             
#> 

thresholds = seq(0.001,0.499,.001) # Avoid Threshold Tails
F_1_scores <- rep(NA, length(thresholds))
for (i in 1:length(thresholds)){
  yhat <- tuned_call(preds, thresholds[i])
  results <- tibble::tibble(y=y, yhat=yhat)
  cm <- results %>% yardstick::conf_mat(truth=y, yhat)
  
  trueNegative  = cm$table[1,1]
  falseNegative = cm$table[1,2]
  falsePositive = cm$table[2,1]
  truePositve   = cm$table[2,2]
  
  conditionPositive = truePositve + falseNegative
  conditionNegative = trueNegative + falsePositive 
  predictedConditionPositive = truePositve +  falsePositive 
  predictedConditionNegative = trueNegative + falseNegative
  
  TotalPopulation = truePositve + falsePositive + trueNegative + falseNegative
  
  PPV = truePositve / predictedConditionPositive
  TPR = truePositve / conditionPositive
  
  F_1_score = 2 * ((PPV * TPR)/(PPV + TPR))
  F_1_scores[i] <- F_1_score
}

#plot(thresholds, F_1_scores, xlab = 'Thresholds', ylab = 'F1 Score', type = 'l')
fho.svm.poly.Threshold = min(c(svm.poly.Threshold, thresholds[tail(which(na.omit(F_1_scores) == max(na.omit(F_1_scores))), n=1)]))
fho.svm.poly.ROC = "TRUE"

statistics = stat_fho(classifier.svm.poly, fho, fho.svm.poly.Threshold)

fho.svm.poly.accuracy = statistics[1]
fho.svm.poly.sensitivity = statistics[2]
fho.svm.poly.specificity = statistics[3]
fho.svm.poly.FDR = statistics[4]
fho.svm.poly.precision = statistics[5]

yhat <- tuned_call(preds, fho.svm.poly.Threshold)
caret::confusionMatrix(yhat, y, positive = "Up")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction    Down      Up
#>       Down 1977300    7107
#>       Up     12397    7373
#>                                           
#>                Accuracy : 0.9903          
#>                  95% CI : (0.9901, 0.9904)
#>     No Information Rate : 0.9928          
#>     P-Value [Acc > NIR] : 1               
#>                                           
#>                   Kappa : 0.4258          
#>                                           
#>  Mcnemar's Test P-Value : <2e-16          
#>                                           
#>             Sensitivity : 0.509185        
#>             Specificity : 0.993769        
#>          Pos Pred Value : 0.372939        
#>          Neg Pred Value : 0.996419        
#>              Prevalence : 0.007225        
#>          Detection Rate : 0.003679        
#>    Detection Prevalence : 0.009864        
#>       Balanced Accuracy : 0.751477        
#>                                           
#>        'Positive' Class : Up              
#> 

Hold-Out Test Sample Performance: Table 3

| Metric | FHO KNN | FHO LDA | FHO QDA | FHO Logistic Regression |
|---|---|---|---|---|
| Accuracy | 0.992558541486106 | 0.981748119053357 | 0.994920608309546 | 0.965328411612348 |
| AUC | 0.961298412282614 | 0.992115543668651 | 0.991500075918026 | 0.999413070684321 |
| ROC | TRUE | TRUE | TRUE | TRUE |
| Threshold | 0.499 | 0.493 | 0.22 | 0.214 |
| Sensitivity=Recall=Power | 0.827417127071823 | 0.839779005524862 | 0.760773480662983 | 0.993301104972376 |
| Specificity=1-FPR | 0.993760356476388 | 0.982781297855905 | 0.996624611687106 | 0.965124840616436 |
| FDR | 0.50889490080341 | 0.738043946574752 | 0.378750281976088 | 0.828311886742904 |
| Precision=PPV | 0.49110509919659 | 0.261956053425248 | 0.621249718023912 | 0.171688113257096 |

| Metric | FHO Random Forest | FHO SVM Linear | FHO SVM Radial | FHO SVM Polynomial |
|---|---|---|---|---|
| Accuracy | 0.993437206394445 | 0.952605483447819 | 0.990439966130736 | 0.990268324604064 |
| AUC | 0.980334789679392 | 0.999092429259135 | 0.986298521512439 | 0.837972518519624 |
| ROC | TRUE | TRUE | TRUE | TRUE |
| Threshold | 0.495 | 0.233 | 0.335 | 0.39 |
| Sensitivity=Recall=Power | 0.793301104972376 | 0.992886740331492 | 0.784461325966851 | 0.509185082872928 |
| Specificity=1-FPR | 0.994893694869118 | 0.952312337004076 | 0.991938973622617 | 0.993769403079966 |
| FDR | 0.469349101492124 | 0.86841599472822 | 0.585407693992262 | 0.627061203844208 |
| Precision=PPV | 0.530650898507876 | 0.13158400527178 | 0.414592306007738 | 0.372938796155792 |

Conclusions

When selecting an optimal model, using all of the data within HaitiPixels for model generation lets each model’s performance be measured fairly accurately. Across all of the algorithms, the time it takes to train each model is worth noting and should be taken into account when deciding which approach to take. Using 10-fold cross-validation within the caret package, we get the following timings:

| Model | Elapsed Time (seconds) |
|---|---|
| KNN | 191.085 |
| LDA | 3.920 |
| QDA | 3.758 |
| Logistic Regression | 7.579 |
| Random Forest | 323.020 |
| SVM with Linear Kernel | 281.842 |
| SVM with Radial Kernel | 4868.989 |
| SVM with Polynomial Kernel | 11306.910 |
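As a hedged sketch of one way such timings can be pulled from the fitted models (assuming the classifier.* objects above are caret::train objects, whose $times$everything element stores the system.time of the full call):

# Elapsed training time (seconds) recorded by caret for a few of the fitted models.
c(KNN = classifier.knn$times$everything[['elapsed']],
  LDA = classifier.lda$times$everything[['elapsed']],
  RandomForest = classifier.rf$times$everything[['elapsed']])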

We can see that some models have much longer elapsed run times, but this is a necessary cost since we want to avoid holding data back from the model training set; using the full set strengthens each model. We used \(F_1\) scores at different thresholds because \(F_1\) is well suited to measuring performance on an unbalanced classification data set, and it drives the thresholds behind the metrics reported in this analysis.

The metrics we calculated for this application were precision, the proportion of predicted positives that are truly positive; recall (sensitivity), the proportion of actual positives that are correctly identified; accuracy, the number of correct predictions over the total number of predictions; false discovery rate, the rate of Type I errors among predicted positives; and specificity, the proportion of negatives that are correctly identified. Since the observations are a matter of providing support and supplies to those in need, a greater emphasis is placed on higher sensitivity and a lower false discovery rate: we want all blue tarps to be correctly identified, and we want to avoid flagging non-blue-tarp pixels as blue tarps.

name.ranks <- c("KNN","LDA","QDA", "Logistic Regression", "Random Forest", "SVM with Linear Kernel","SVM with Radial Kernel","SVM with Polynomial Kernel" )

accuracy.ranks <- c(knn.Accuracy,lda.Accuracy,qda.Accuracy,glm.Accuracy,rf.Accuracy,svm.linear.Accuracy,svm.radial.Accuracy,svm.poly.Accuracy)

auc.ranks <- c(knn.AUC,lda.AUC,qda.AUC,glm.AUC,rf.AUC,svm.linear.AUC,svm.radial.AUC,svm.poly.AUC)

sensitivity.ranks <- c(knn.Sensitivity,lda.Sensitivity,qda.Sensitivity,glm.Sensitivity,rf.Sensitivity,svm.linear.Sensitivity,svm.radial.Sensitivity,svm.poly.Sensitivity)

specificity.ranks <- c(knn.Specificity,lda.Specificity,qda.Specificity,glm.Specificity,rf.Specificity,svm.linear.Specificity,svm.radial.Specificity,svm.poly.Specificity)

fdr.ranks <- c(knn.FDR, lda.FDR,qda.FDR,glm.FDR,rf.FDR,svm.linear.FDR,svm.radial.FDR,svm.poly.FDR)

precision.ranks <- c(knn.Precision, lda.Precision,qda.Precision,glm.Precision,rf.Precision,svm.linear.Precision,svm.radial.Precision,svm.poly.Precision)

i1 <- sort(accuracy.ranks, index.return=TRUE, decreasing = TRUE)$ix
i2 <- sort(auc.ranks, index.return=TRUE, decreasing = TRUE)$ix
i3 <- sort(sensitivity.ranks, index.return=TRUE, decreasing = TRUE)$ix
i4 <- sort(specificity.ranks, index.return=TRUE, decreasing = TRUE)$ix
i5 <- sort(fdr.ranks, index.return=TRUE, decreasing = FALSE)$ix
i6 <- sort(precision.ranks, index.return=TRUE, decreasing = TRUE)$ix

myString = paste0("If we look at the statistics provided and ordering the models by accuracy, the best model is the ",name.ranks[i1][1], " with an accuracy of ", format(round(accuracy.ranks[i1][1], 6), nsmall = 6)," followed by ", name.ranks[i1][2], " with an accuracy of ",format(round(accuracy.ranks[i1][2], 6), nsmall = 6), " and ", name.ranks[i1][3], " with an accuracy of ",format(round(accuracy.ranks[i1][3], 6), nsmall = 6),". ","AUC, or Area Under Curve, is a measure of performance across all possible classification thresholds. Lowering the classification threshold classifies more items as positive. As such, you’ll want a model with a high AUC value. Ordering the models by AUC, the best model is the ", name.ranks[i2][1], " with an AUC of ", format(round(auc.ranks[i2][1], 6), nsmall = 6), " followed by ", name.ranks[i2][2], " with an AUC of ", format(round(auc.ranks[i2][2], 6), nsmall = 6), " and ", name.ranks[i2][3], " with an AUC of ", format(round(auc.ranks[i2][3], 6), nsmall = 6), ". ","Ordering the models by sensitivity, the best model is the ", name.ranks[i3][1], " with a sensitivity of ", format(round(sensitivity.ranks[i3][1], 6), nsmall = 6), " followed by ", name.ranks[i3][2], " with a sensitivity of ", format(round(sensitivity.ranks[i3][2], 6), nsmall = 6), " and ", name.ranks[i3][3], " with a sensitivity of ", format(round(sensitivity.ranks[i3][3], 6), nsmall = 6), ". ","Ordering the models by specificity, the best model is the ", name.ranks[i4][1], " with a specificity of ", format(round(specificity.ranks[i4][1], 6), nsmall = 6), " followed by ", name.ranks[i4][2], " with a specificity of ", format(round(specificity.ranks[i4][2], 6), nsmall = 6), " and ", name.ranks[i4][3], " with a specificity of ", format(round(specificity.ranks[i4][3], 6), nsmall = 6), ". ","The False Discovery Rate (FDR) is the expected proportion of type I errors ( you get a false positive ). The optimal model is one with a minimum value for FDR. The best model is the ", name.ranks[i5][1], " with a FDR of ", format(round(fdr.ranks[i5][1], 6), nsmall = 6), " followed by ", name.ranks[i5][2], " with a FDR of ", format(round(fdr.ranks[i5][2], 6), nsmall = 6), " and ", name.ranks[i5][3], " with a FDR of ", format(round(fdr.ranks[i5][3], 6), nsmall = 6), ". ","Lastly, you'll want a higher value for precision. Ordering the models by precision, the best model is the ", name.ranks[i6][1], " with a precision of ", format(round(precision.ranks[i6][1], 6), nsmall = 6), " followed by ", name.ranks[i6][2], " with a precision of ", format(round(precision.ranks[i6][2], 6), nsmall = 6), " and ", name.ranks[i6][3], " with a precision of ", format(round(precision.ranks[i6][3], 6), nsmall = 6), ".")

If we look at the statistics provided and ordering the models by accuracy, the best model is the Random Forest with an accuracy of 0.999130 followed by KNN with an accuracy of 0.997913 and SVM with Radial Kernel with an accuracy of 0.997802. AUC, or Area Under Curve, is a measure of performance across all possible classification thresholds. Lowering the classification threshold classifies more items as positive. As such, you’ll want a model with a high AUC value. Ordering the models by AUC, the best model is the KNN with an AUC of 0.999862 followed by SVM with Radial Kernel with an AUC of 0.999820 and SVM with Polynomial Kernel with an AUC of 0.999714. Ordering the models by sensitivity, the best model is the Random Forest with a sensitivity of 0.983680 followed by SVM with Radial Kernel with a sensitivity of 0.977250 and KNN with a sensitivity of 0.969337. Ordering the models by specificity, the best model is the Random Forest with a specificity of 0.999641 followed by QDA with a specificity of 0.999004 and KNN with a specificity of 0.998857. The False Discovery Rate (FDR) is the expected proportion of type I errors ( you get a false positive ). The optimal model is one with a minimum value for FDR. The best model is the Random Forest with a FDR of 0.010940 followed by QDA with a FDR of 0.033609 and KNN with a FDR of 0.034483. Lastly, you’ll want a higher value for precision. Ordering the models by precision, the best model is the Random Forest with a precision of 0.989060 followed by QDA with a precision of 0.966391 and KNN with a precision of 0.965517.

name.ranks <- c("KNN","LDA","QDA", "Logistic Regression", "Random Forest", "SVM with Linear Kernel","SVM with Radial Kernel","SVM with Polynomial Kernel" )

accuracy.ranks <- c(
  fho.knn.accuracy,
  fho.lda.accuracy,
  fho.qda.accuracy,
  fho.glm.accuracy,
  fho.rf.accuracy,
  fho.svm.linear.accuracy,
  fho.svm.radial.accuracy,
  fho.svm.poly.accuracy)

auc.ranks <- c(fho.knn.AUC,
               fho.lda.AUC,
               fho.qda.AUC,
               fho.glm.AUC,
               fho.rf.AUC,
               fho.svm.linear.AUC,
               fho.svm.radial.AUC,
               fho.svm.poly.AUC)

sensitivity.ranks <- c(fho.knn.sensitivity,
                       fho.lda.sensitivity,
                       fho.qda.sensitivity,
                       fho.glm.sensitivity,
                       fho.rf.sensitivity,
                       fho.svm.linear.sensitivity,
                       fho.svm.radial.sensitivity,
                       fho.svm.poly.sensitivity)

specificity.ranks <- c(fho.knn.specificity,
                       fho.lda.specificity,
                       fho.qda.specificity,
                       fho.glm.specificity,
                       fho.rf.specificity,
                       fho.svm.linear.specificity,
                       fho.svm.radial.specificity,
                       fho.svm.poly.specificity)

fdr.ranks <- c(fho.knn.FDR,
               fho.lda.FDR,
               fho.qda.FDR,
               fho.glm.FDR,
               fho.rf.FDR,
               fho.svm.linear.FDR,
               fho.svm.radial.FDR,
               fho.svm.poly.FDR)

precision.ranks <- c(fho.knn.precision,
                     fho.lda.precision,
                     fho.qda.precision,
                     fho.glm.precision,
                     fho.rf.precision,
                     fho.svm.linear.precision,
                     fho.svm.radial.precision,
                     fho.svm.poly.precision)

i1 <- sort(accuracy.ranks, index.return=TRUE, decreasing = TRUE)$ix
i2 <- sort(auc.ranks, index.return=TRUE, decreasing = TRUE)$ix
i3 <- sort(sensitivity.ranks, index.return=TRUE, decreasing = TRUE)$ix
i4 <- sort(specificity.ranks, index.return=TRUE, decreasing = TRUE)$ix
i5 <- sort(fdr.ranks, index.return=TRUE, decreasing = FALSE)$ix
i6 <- sort(precision.ranks, index.return=TRUE, decreasing = TRUE)$ix

myString2 = paste0("After running our models against the FHO provided and ordering the models that performed well on the hold-out data according to accuracy was ",name.ranks[i1][1], " with an accuracy of ", format(round(accuracy.ranks[i1][1], 6), nsmall = 6)," followed by ", name.ranks[i1][2], " with an accuracy of ",format(round(accuracy.ranks[i1][2], 6), nsmall = 6), " and ", name.ranks[i1][3], " with an accuracy of ",format(round(accuracy.ranks[i1][3], 6), nsmall = 6),". ","Ordering according to AUC, the best model is the ", name.ranks[i2][1], " with an AUC of ", format(round(auc.ranks[i2][1], 6), nsmall = 6), " followed by ", name.ranks[i2][2], " with an AUC of ", format(round(auc.ranks[i2][2], 6), nsmall = 6), " and ", name.ranks[i2][3], " with an AUC of ", format(round(auc.ranks[i2][3], 6), nsmall = 6), ". ","Ordering the results by sensitivity, the best model is the ", name.ranks[i3][1], " with a sensitivity of ", format(round(sensitivity.ranks[i3][1], 6), nsmall = 6), " followed by ", name.ranks[i3][2], " with a sensitivity of ", format(round(sensitivity.ranks[i3][2], 6), nsmall = 6), " and ", name.ranks[i3][3], " with a sensitivity of ", format(round(sensitivity.ranks[i3][3], 6), nsmall = 6), ". ","Ordering the models again by specificity, the best model is the ", name.ranks[i4][1], " with a specificity of ", format(round(specificity.ranks[i4][1], 6), nsmall = 6), " followed by ", name.ranks[i4][2], " with a specificity of ", format(round(specificity.ranks[i4][2], 6), nsmall = 6), " and ", name.ranks[i4][3], " with a specificity of ", format(round(specificity.ranks[i4][3], 6), nsmall = 6), ". ","The algorithms according to the FDR performance on the hold-out data was ", name.ranks[i5][1], " with a FDR of ", format(round(fdr.ranks[i5][1], 6), nsmall = 6), " followed by ", name.ranks[i5][2], " with a FDR of ", format(round(fdr.ranks[i5][2], 6), nsmall = 6), " and ", name.ranks[i5][3], " with a FDR of ", format(round(fdr.ranks[i5][3], 6), nsmall = 6), ". ","Lastly against the precision, the best model is the ", name.ranks[i6][1], " with a precision of ", format(round(precision.ranks[i6][1], 6), nsmall = 6), " followed by ", name.ranks[i6][2], " with a precision of ", format(round(precision.ranks[i6][2], 6), nsmall = 6), " and ", name.ranks[i6][3], " with a precision of ", format(round(precision.ranks[i6][3], 6), nsmall = 6), ".")

After running our models against the FHO provided and ordering the models that performed well on the hold-out data according to accuracy was QDA with an accuracy of 0.994921 followed by Random Forest with an accuracy of 0.993437 and KNN with an accuracy of 0.992559. Ordering according to AUC, the best model is the Logistic Regression with an AUC of 0.999413 followed by SVM with Linear Kernel with an AUC of 0.999092 and LDA with an AUC of 0.992116. Ordering the results by sensitivity, the best model is the Logistic Regression with a sensitivity of 0.993301 followed by SVM with Linear Kernel with a sensitivity of 0.992887 and LDA with a sensitivity of 0.839779. Ordering the models again by specificity, the best model is the QDA with a specificity of 0.996625 followed by Random Forest with a specificity of 0.994894 and SVM with Polynomial Kernel with a specificity of 0.993769. The algorithms according to the FDR performance on the hold-out data was QDA with a FDR of 0.378750 followed by Random Forest with a FDR of 0.469349 and KNN with a FDR of 0.508895. Lastly against the precision, the best model is the QDA with a precision of 0.621250 followed by Random Forest with a precision of 0.530651 and KNN with a precision of 0.491105.

When deciding which model to select, accuracy can play a part in choosing an algorithm. If we rank each algorithm by accuracy:

| Rank | TRAIN | FHO TEST |
|---|---|---|
| 1 | Random Forest | QDA |
| 2 | KNN | Random Forest |
| 3 | SVM with Radial Kernel | KNN |
| 4 | SVM with Polynomial Kernel | SVM with Radial Kernel |
| 5 | SVM with Linear Kernel | SVM with Polynomial Kernel |
| 6 | Logistic Regression | LDA |
| 7 | QDA | Logistic Regression |
| 8 | LDA | SVM with Linear Kernel |
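A hedged sketch of how such a side-by-side ranking could be assembled from name.ranks and the ordering indices; train.i1 and fho.i1 are hypothetical names here, since the code above reuses i1 for both the training and the FHO ordering and the two would need to be kept separately:

# Side-by-side accuracy ranking: training order vs hold-out (FHO) order.
rank_table <- tibble::tibble(
  Rank       = 1:8,
  TRAIN      = name.ranks[train.i1],  # models sorted by training accuracy
  `FHO TEST` = name.ranks[fho.i1]     # models sorted by hold-out accuracy
)
knitr::kable(rank_table)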

We can see that the algorithms based on non-linear decision boundaries rank near the top, especially the Random Forest model, K-Nearest Neighbors and the Support Vector Machine with a Radial Kernel, while the algorithms based on linear decision boundaries (Linear Discriminant Analysis, the Support Vector Machine with a Linear Kernel) ranked much lower on accuracy.

However, accuracy is not the only metric for deciding which model to use. We can also rank the models by precision:

| Rank | TRAIN | FHO TEST |
|---|---|---|
| 1 | Random Forest | QDA |
| 2 | QDA | Random Forest |
| 3 | KNN | KNN |
| 4 | SVM with Polynomial Kernel | SVM with Radial Kernel |
| 5 | SVM with Radial Kernel | SVM with Polynomial Kernel |
| 6 | SVM with Linear Kernel | LDA |
| 7 | Logistic Regression | Logistic Regression |
| 8 | LDA | SVM with Linear Kernel |

and by recall:

| Rank | TRAIN | FHO TEST |
|---|---|---|
| 1 | Random Forest | Logistic Regression |
| 2 | SVM with Radial Kernel | SVM with Linear Kernel |
| 3 | KNN | LDA |
| 4 | SVM with Polynomial Kernel | KNN |
| 5 | Logistic Regression | Random Forest |
| 6 | SVM with Linear Kernel | SVM with Radial Kernel |
| 7 | QDA | QDA |
| 8 | LDA | SVM with Polynomial Kernel |

It’s important to note that these rankings are built from point estimates produced by the 10-fold cross-validation training. With that in mind, I would recommend using the Random Forest model for detection of blue tarps. The mean and standard deviation of the 10-fold cross-validation point estimates for this model (Sensitivity, Specificity and Precision) are:

  • Sensitivity Mean: 0.948570940837926

  • Sensitivity Standard Deviation: 0.0158306673347466

  • Specificity Mean: 0.998660555098661

  • Specificity Standard Deviation: 0.000351159803790381

  • Precision Mean: 0.95907902771856

  • Precision Standard Deviation: 0.0102829571574502

with the KNN model as a close second. The mean and standard deviation of the 10-fold cross-validation point estimates for this model (Sensitivity, Specificity and Precision) are listed below, followed by a sketch of how such per-fold summaries can be computed:

  • Sensitivity Mean: 0.959457152611813

  • Sensitivity Standard Deviation: 0.0137156602761885

  • Specificity Mean: 0.998562542574257

  • Specificity Standard Deviation: 0.000467102491419399

  • Precision Mean: 0.9568424151997

  • Precision Standard Deviation: 0.0131157372711998
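A minimal sketch of how such per-fold summaries can be computed from the saved resampling predictions (assuming the classifiers were trained with savePredictions enabled, so classifier.rf$pred holds the out-of-fold predictions, and using a 0.5 probability cutoff rather than the tuned threshold):

# Per-fold sensitivity, specificity and precision for the Random Forest model,
# then their mean and standard deviation across the 10 folds.
fold_stats <- classifier.rf$pred %>%
  dplyr::mutate(call = factor(ifelse(Up > 0.5, 'Up', 'Down'), levels = c('Down', 'Up'))) %>%
  dplyr::group_by(Resample) %>%
  dplyr::summarise(sensitivity = mean(call[obs == 'Up'] == 'Up'),
                   specificity = mean(call[obs == 'Down'] == 'Down'),
                   precision   = mean(obs[call == 'Up'] == 'Up'))

fold_stats %>%
  dplyr::summarise(dplyr::across(sensitivity:precision, list(mean = mean, sd = sd)))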

The models do not behave the same when measuring predictions against the training data as they do against the final hold-out test set, and the difference can be attributed to many factors. This also partly explains why the training-data and test-data ranks do not match exactly, even though the linear and non-linear models tend to fall into the same general areas. It’s important to keep in mind that the data form an unbalanced classification set: the predictor variables are the additive Red, Green and Blue channels of the RGB color spectrum, and the response is an unbalanced binary classification (an unequal distribution of classes in both the training data set and the FHO). With so few positives relative to negatives, there is a greater chance that negatives are classified as false positives, which depresses precision. Although the models tested on the FHO data have low precision overall compared to the training data, the sensitivity and specificity tend to remain relatively high and consistent with the training data. In addition, based on the False Discovery Rate:

| Rank | TRAIN | FHO TEST |
|---|---|---|
| 1 | Random Forest | QDA |
| 2 | QDA | Random Forest |
| 3 | KNN | KNN |
| 4 | SVM with Polynomial Kernel | SVM with Radial Kernel |
| 5 | SVM with Radial Kernel | SVM with Polynomial Kernel |
| 6 | SVM with Linear Kernel | LDA |
| 7 | Logistic Regression | Logistic Regression |
| 8 | LDA | SVM with Linear Kernel |

Random Forest and KNN both rank near the top, indicating that these models have lower chances of getting a false positive.

In an extension of the Part 1 discussion of software aspects, we expected the SVMs to take the longest to run, which is shown in the elapsed-time table above. The time required for 10-fold cross-validation over a grid of tuning parameters far exceeds that of LDA, QDA, logistic regression, etc. To avoid overfitting in the Random Forest model, ntree was set to 500 and was not iterated across other values, since Random Forest already appeared to be one of the top models. The trade-off of long run times for small increases in accuracy forces the analysis to lean toward saving time where possible. We iterated across reasonable values of cost, sigma, degree, etc. for the various models, and we do not expect the results to change position. Quadratic- and polynomial-based algorithms exceeded the linear-based algorithms, with Random Forest and K-Nearest Neighbors standing above the rest; however, it’s worth pointing out that the uncertainties implied by the mean and standard deviation of the various metrics can have the models flip-flop in ranking.