## Coding 4 Conservation 2022
## Model Evaluation & Comparison
## Script made by Samantha Sambado & Tatum Katz
## Material derived from Tatum Katz, An Bui, Ana Sofia Guerra, Caroline Owens, Cara Brooks

# Today we will be discussing model evaluation and comparison.
# This topic will include a review of concepts you have learned throughout the course.
# Model evaluation can be frustrating because it is less about strict rules
# and more about a biologists intuition after exhausting all evaluation tools.
# The key concept --> balance goodness of fit & complexity. 

# This R script will  
# (1) Briefly review concepts from previous coding sessions - practice is good!
# (2) Try a couple model evaluation tools on a single dataset about penguins
      # Residual analysis 
      # AIC/BIC
      # Cross-validation 
# (3) Provide code that you can explore at another time 
      # Power analysis
      # Confusion matrix

# Quick notes on my coding style :
  # QUESTION is meant to be a discussion point in class
  # EXERCISE is meant to be an activity done as a group or on your own
  # CHECK IN is meant for all of us to take a moment to reflect of what we have learned


# I tend to put a TON of information in scripts. 
# All of this information is not meant to be consumed during just one class.
# The intention for this script is to serve as a road map when you do your own analysis outside of class.
# We will not get through all of this information today,
# but you can always email me if you have additional questions.
# email : sbsambado@ucsb.edu


# Let's begin!



#---------------------------------------------------------#
# SECTION 0. The Set Up 
#---------------------------------------------------------#

# Clears your history & environment
rm(list=ls()) 

# Upload necessary packages

# If this is your first time using a package make sure to 
# install.packages("readr")

# To find out more information about a package, you can type
# ?readr()
# or click the `Help` button in bottom right console 

library(readr) # reads in csv files
library(palmerpenguins) # contains the penguin dataset if the csv file didn't upload
library(ggplot2) # ggplot visualization package 
library(tidyverse) # manipulates data
library(psych) # has the pairs.panel() function for exploring relationships within your dataset
library(lme4) # generalized linear model package 
library(lmerTest) # generalized linear model package
library(kableExtra) # makes summary tables

# Upload necessary dataset
# This dataset should be part of your .Rproj so you don't have to worry about file paths

# Dataset 1 : Palmer Penguins 
penguins_full <- read_csv("palmerpenguins.csv") # read in data
# The database we will use is known as The Palmer Archipelago penguins in Antarctica. 
# These data were collected from 2007-2009 by Dr. Kristen Gorman's team at the Palmer Station Long Term Ecological Research Program
# There are 344 individual penguins of 3 species on 3 islands
# More information on this dataset can be found at -> https://allisonhorst.github.io/palmerpenguins/articles/intro.html

penguins <- na.omit(penguins_full) # remove NAs

#---------------------------------------------------------#
# SECTION 1. Review - Data exploration 
#---------------------------------------------------------#

# Explore the database by numbers #
head(penguins) # shows the first 6 rows in the dataset
dim(penguins) # 333 rows of 9 variables 
names(penguins) # names gives you the names of the variables
# if column names are messy or not easily interpretative, 
# you can use the same command and "[column number]" to change to something more explicit
names(penguins)[1] <- "id" # change the 1st column in the dataset penguins to "id"
summary(penguins) # summary tells you the summary statistics (mean, min, quantiles) of each of the variables


# Make sure data structures are correct for your analysis #
str(penguins) # str() will tell you how the variables are categorized (numerical, integer, character, factor)
# what I see is
# numeric values: id, bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g, year
# character values: species, island, sex


## QUESTION(s) ##
# Do these classifications make sense for numeric and character values? 
# Would it be wrong to classify year as a character for certain circumstances?
# If I wanted to transform year from numeric to character value, what would be potential code for that?
  # ****write code for the transformation below****


# Visually explore the database #

## EXERCISE 1.1 ##
#  Write code to make a ggplot 
#  of the `penguin` dataset  
# `body_mass_g` as your Y variable, 
# `sex` as your X variable
#  in the shape of a boxplot
  # ****write code for the transformation below****


# Explore the numeric vs numeric relationships #
# **Note: I am focusing on my outcome of interest, the body mass (g) of penguins

# Plot 1
ggplot(penguins, aes(x = bill_length_mm, y = body_mass_g)) +
  geom_point() # seems like there is potentially a linear relationship but notice the clusters

# Plot 2
ggplot(penguins, aes(x = bill_depth_mm, y = body_mass_g)) +
  geom_point() # looks like there are trends but they may be clustered by another variable

# Plot 3
ggplot(penguins, aes(x = flipper_length_mm, y = body_mass_g)) +
  geom_point() # looks like a positive linear relationship

# Plot 4a
ggplot(penguins, aes(x = year, y = body_mass_g)) +
  geom_point() # because my data are clustered around few years, I will make the personal call to treat year as a character

## EXERCISE 1.2 ##
# Changing year from numeric to character value
  # ****write code for the transformation below****


# Explore the character vs numeric relationships #

# Plot 4b
ggplot(penguins, aes(x = year, y = body_mass_g)) +
  geom_boxplot() # notice the geom_* changes... year looks better as a boxplot

# Plot 5
ggplot(penguins, aes(x = sex, y = body_mass_g)) +
  geom_boxplot() # means look slightly different for female and male

# Plot 6
ggplot(penguins, aes(x = species, y = body_mass_g)) +
  geom_boxplot() # Gentoo species look like they have an average mean larger than the other species

# Plot 7
ggplot(penguins, aes(x = island, y = body_mass_g)) +
  geom_boxplot() # means look significantly different between Biscoe and other two islands


## QUESTION ##
# After this data exploration, what variables would you want to include in your model? 


## ANSWERS ##

# EXERCISE 1.1
ggplot() # shows a blank plot
ggplot(penguins, aes(x = sex, y = body_mass_g)) # shows axes labels
  geom_boxplot() # shows data, with axes labels, on a plot

# EXERCISE 1.2
penguins$year <- as.character(penguins$year)

#---------------------------------------------------------#
# SECTION 2. Review - Model building 
#---------------------------------------------------------#

## CHECK IN ##
# After some data exploration and with some knowledge of the system, the modeler (aka yourself),
# has some idea of what variable is most interesting to understand and possibly an idea of what 
# variables are associated with the most interesting variable. 
# This sort of curiosity and intuition drives model building and model evaluation.


# Terminology:

# Y variable / dependent variable / response variable / outcome of interest: 
  # the variable you are trying to understand
# X variable(s) /  independent variable(s) / explanatory variable / input(s) of interest / : 
  # the variable(s) you think explain your main variable of interest
# FULL model: include all variables
# NULL model: no variables, just the slope intercept (aka also known as the "mean" model)
# model: includes only the variables the modeler deems is "appropriate"


## EXERCISE 2.1 ##
# Write a FULL model for the penguin dataset with body_mass_g as your outcome of interest. 


## EXERCISE 2.2 ##
# Write a NULL model


# Because I am currently not sure which is the best model to explain the variation in penguin body mass
# I will build many different models using the same dataset.
# There are many packages in R that can build multiple models automatically but to emphasize the point of
# model building and evaluation. 

# To start the model building process, I can look at correlation plots to quantify associations 
# to see which variables could be important to include. 
# I like the function `pairs.panels()` because 
# it shows the distribution of each variable in blue histograms
# gives the correlation or rho of each bivariate relationship
# gives rough linear trends of each bivariate relationship 

pairs.panels(penguins) 
# if you are overwhelmed by this plot, 
# you can also subset the penguin dataset to 4 or 5 variables 
# you think are most likely to be associated with your variable of interest (body_mass_g)


## QUESTION ## 
# Based on our exploratory plots from above, 
# what variables do you think would be good to include in our different models?

# Some candidate models could be: 
mod_1_var <- lm(body_mass_g ~ flipper_length_mm, 
                data = penguins)

mod_2_var <- lm(body_mass_g ~ flipper_length_mm + species, 
                data = penguins)

mod_3_var <- lm(body_mass_g ~ flipper_length_mm + species + bill_length_mm, 
                data = penguins)


## CHECK IN ##
# But wait, are any of these independent variables correlated with each other?
# ** Remember, we want to follow rules of parsimony by 
# ** creating the most simple model that explains the most variation in our outcome variable

# To easily check if my dependent variables are correlated, I will remove body mass and quickly glance at correlation values

penguins_subset <- subset(penguins, select = -c(id, body_mass_g)) # subset columns out of penguin
head(penguins_subset) # always check to make sure your subset worked

# re-run correlation plot with just dependent variables
pairs.panels(penguins_subset)

# From the `pairs.panels()` plot I can see that 
# flipper_length_mm & species have a correlation value of 0.85 
# bill_depth_mm & species have a correlation value of -0.74
# bill_length_mm & species have a correlation value of 0.73
# flipper_length_mm & bill_length_mm have a correlation value of 0.65
# For me, this information helps me to decide which variables may be redundant to optimize parsimony


# **Remember, correlation rho can range from -1 to 1
# ** 0 means there is no correlation between 2 variables
# ** - 1 means the 2 variables are perfectly negatively correlated (aka a high value of 1 variable is correlated with a low value of the other variable)
# ** 1 means the 2 variables are perfectly positively correlated
# ** People have many cut offs for what two variables should/should not be included as independent variables, but as the biologist you can make the final call 


## ANSWERS ##

# EXERCISE 2.1
full_model <- lm(body_mass_g ~ species + island + bill_length_mm + bill_depth_mm + flipper_length_mm + year + sex, 
                 data = penguins)

# EXERCISE 2.2
null_model <- lm(body_mass_g ~ 1, 
                 data = penguins)


#---------------------------------------------------------#
# SECTION 3. Tool kit - Residual analysis 
#---------------------------------------------------------#

# Meant to evaluate linear models (LM) and generalized linear models (GLM)

# CONCEPT: Residuals are the difference between the model estimates and actual data. 

# Diagnostics of each model #

# Visually check the residuals

# ** Note:
# Residuals vs fitted plot: VARIANCE, looking for straight line with no distinct patterns (checking for equal variance)
# Normal Q-Q plot: NORMALITY, looking for circles to fall on 1:1 line, okay if there's some outliers off the dotted line but there shouldn't be a cluster of dots off the 1:1 line
# Scale-Location: VARIANCE, looking for straight line to check for constant variance
# Residuals vs Leverage: OUTLIERS, looking for points to be grouped together 

# Diagnostics of each model
plot(mod_1_var) # Residuals vs fitted: There is a non-straight pattern & Q-Q: Cluster of points of 1:1 line 
plot(mod_2_var) # Residuals vs fitted: There is a non-straight pattern with clusters
plot(mod_3_var) # Residuals vs fitted: There is a non-straight pattern with clusters  & Q-Q: Cluster of points of 1:1 line 
# May have to perform a transformation or choose another model type like a glm but for the sake of time let's carry on with this example


# Finally, we can compare the R-squared & F-statistic value for each model to see how much variation in the y-variable is explained by each combination of predictors. We are looking for a model with high adjusted R-squared.

summary(mod_1_var) # R-square =  0.7621 ; F-statistic = 1060 
summary(mod_2_var) # R-square =  0.7870 ; F-statistic = 405.3 
summary(mod_3_var) # R-square =  0.8243 ; F-statistic = 384.7 

# >> This method suggests that the variation in body mass is best explained by FLIPPER LENGTH & SPECIES & BILL LENGTH.
# Or specifically, the model explains 82% of the variation in the dataset. 
# Does this seem reasonable in terms of goodness of fit and complexity?

#---------------------------------------------------------#
# SECTION 4. Tool kit - AIC & BIC 
#---------------------------------------------------------#

# Meant to evaluate any models built on the same data

# CONCEPT: Evaluates how good the model fits the data and penalizes how many parameters there are in the model.

# We can also compare models using the AIC & BIC.
# Remember that AIC/BIC  is only a valid comparison between models that were fitted using exactly the same data. 
# AIC and BIC should generally lead us to similar results, but BIC places a higher penalty on extra parameters. This helps to enforce the principle of parsimony - that a simpler model is best.
# The absolute value of the AIC/BIC doesn't matter; we are only concerned about the *relative* value of AIC/BIC. 
# A model with a smaller AIC/BIC value performs relatively better than a model with a larger AIC/BIC value. Don't get tripped up by negative AICs/BIC.
# **We are looking for overall smallest, not closest to 0**.


# It is often convenient to print all of these values for each model as a neat table, so that your reader can compare them all at a glance. 
# The function kable() in the package knitr allows us to print a dataframe as a table in html, pdf, or latex format. 
# Look up the help documentation for kable using ?kable. You can specify left, right, or center alignment, give the number of digits to round to for the whole table or for each column individually, edit the names of your columns, and more to get the perfect beautiful table for your report.

# calculate AIC of each model
results <- AIC(full_model, null_model, 
               mod_1_var, mod_2_var, mod_3_var)

# add other metrics to your table
models <- list(full_model, null_model, 
               mod_1_var, mod_2_var, mod_3_var) # make sure you keep your models in the same order here as they were when you created your results table

# add a column for BIC to the results
results$BIC <- sapply(models, BIC)

model_summaries <- lapply(models, summary) # #look up ?lapply if you have not used this function before

# now we will use a for loop to easily extract the R^2 and adj R^2 value for each model from its summary, and store them in new columns in the results table
for(i in 1:length(models)){ # this creates a variable i that starts with the value i=1
  results$rsq[i] <- model_summaries[[i]]$r.squared # we assign the rsq value from model i to the i'th row of the column 'rsq' in the table 'results'
  results$adj_rsq[i] <- model_summaries[[i]]$adj.r.squared # same for adjusted rsq
} # now we go back to the beginning of the for-loop, add 1 to the value of i, and do everything again

# **Note: if you were typing in .Rmd file (we're currently in .R file), the table could be knitted into an interactive .html file
kable(results, digits = 2, align = "c") %>%
  kableExtra::kable_styling()

# >> This method suggests that the variation in body mass is best explained by the FULL model
# Does this seem reasonable in terms of goodness of fit and complexity?

##### ON YOUR OWN TIME ####

# There are many ways to calculate AIC and BIC. 
# I am just going to mention two other functions but feel free to explore outside of this class!

# Alternative Method 1: Stepwise
# Model selection is a very complex problem for statisticians, and we have just scratched the surface here. 
# There are many other ways to choose appropriate combinations of predictor variables. 
# One example is stepwise model selection. This process starts with a 'full model' (including all possible predictors) and eliminates variables one by one, calculating AIC each time. 
# When the AIC can't be reduced any more by taking away variables or putting them back, you have the final optimized model. 
# You can try stepwise AIC in R:

# install.packages("MASS")
library(MASS) # necessary package

# stepAIC function doesn't like missing values so make sure to remove any missing values
# Write out the full model
full_model <- lm(body_mass_g ~ species + island + bill_length_mm + bill_depth_mm + flipper_length_mm + year + sex, 
                 data = penguins)

# Write out the null model
null_model <- lm(body_mass_g ~ 1, 
                 data = penguins)

# calculate AIC
stepAIC(full_model, scope = c(upper = full_model, lower = null_model))
# >> This method suggests that the variation in body mass is best explained by ISLAND... Does that make sense?
# Does this seem reasonable in terms of goodness of fit and complexity?


# Alternative Method 2: MumIN
# For this, we use the package `MuMIn` which is useful for excluding/including each predictor 
# when you're dealing with a model that has many predictors. 

#install.packages("MuMIn")
library(MuMIn) # necessary package

penguins_nona <- na.omit(penguins) # must remove all NAs for MuMIn to work 
global.model <- lm(body_mass_g ~ species + island + bill_length_mm + bill_depth_mm + flipper_length_mm + year + sex, 
                 data = penguins_nona) 

# this makes all the possible models that are subsets of global.model
penguin.dredge <- dredge(global.model)
penguin.dredge

# >> This method suggests that the variation in body mass is best explained by SEX & SPECIES... 
# Does this seem reasonable in terms of goodness of fit and complexity?


#---------------------------------------------------------#
# SECTION 6. Tool kit - Cross validation  
#---------------------------------------------------------#

# Meant to evaluate any type of model

# CONCEPT: Validates your model. Beware of overly complex models. 

# Cross-validation is a tool to validate your model using another set of data
  # train data: the data you use to build your model (in-bag)
  # test data: the data you test the performance of your model with 

# **Note: There is a lot of extra code here that isn't helpful to code along with during class
# but may be helpful to look through on your own time.
# The point of this section is to show that the simpler model may not be the best at predicting your outcome

# must remove all NAs for this to work
penguins_nona <- na.omit(penguins) 

## Model 1
# Let's just say this was our best model but we want to test it
mod_best <- lm(body_mass_g ~ flipper_length_mm +  bill_length_mm  + species, 
               data = penguins_nona)

# separate your data into a training set (most of the data) and a test set (a few observations, or <10% of rows)
splitter <- sample(1:nrow(penguins_nona), 20, replace = F) # pick 20 random rows from penguin to reserve as test data

# leave those rows out of the training data
psub_train <- penguins_nona[-splitter, ]

# use them to create a set of test data
psub_test <- penguins_nona[splitter, ]

# fit the final model, using JUST your training set as the data argument
mod_split <- lm(body_mass_g ~ flipper_length_mm  + bill_length_mm  + species, 
                data = psub_train)

# use the fitted model to predict values for your test data
mod_prediction <- predict(mod_split, psub_test)

# Visually check 
plot(psub_test$body_mass_g, pch = 1, # plot the actual test data values
     ylab = "Body Mass (g)", xlab = "Individual Penguin") 
points(mod_prediction, pch = 20, col = "red") # plot the model predictions for those points

## Model 2
# let's see what a simpler model would predict, 
# everything stays the same except I am changing the model
# let's just say this was our best model but we want to test it
mod_simpler <- lm(body_mass_g ~ flipper_length_mm +  species, 
               data = penguins_nona)

# separate your data into a training set (most of the data) and a test set (a few observations, or <10% of rows)
splitter_simpler <- sample(1:nrow(penguins_nona), 20, replace = F) # pick 20 random rows from penguin to reserve as test data

# leave those rows out of the training data
psub_train_simpler <- penguins_nona[-splitter_simpler, ]

# use them to create a set of test data
psub_test_simpler <- penguins_nona[splitter_simpler, ]

# fit the final model, using JUST your training set as the data argument
mod_split_simpler <- lm(body_mass_g ~ flipper_length_mm  + species, 
                data = psub_train_simpler)

# use the fitted model to predict values for your test data
mod_prediction_simpler <- predict(mod_split_simpler, psub_test_simpler)

# Compare Model 1 & Model 2 
plot(psub_test$body_mass_g, pch = 1, # plot the actual test data values
     ylab = "Body Mass (g)", xlab = "Individual Penguin") 
points(mod_prediction, pch = 20, col = "red") # plot the model predictions for those points
points(mod_prediction_simpler, pch = 20, col = "blue") # plot the simpler model predictions for those points

## >> This method suggests that Model 1 is better than Model 2 at predicting the "real" data
# Does this seem reasonable in terms of goodness of fit and complexity?
  ## Model 1 is slightly more complex than Model 2 but may be worth it considering it is still a relatively simple model

## QUESTION ##
#  How well does this model fit the data? How well does it predict penguin body mass?



#---------------------------------------------------------#
# IN SUMMARY   
#---------------------------------------------------------#

# We saw within this dataset there was disagreement of what the "best" model to describe the variation of body mass
  
  # Residual analysis > flipper_length_mm + species + bill_length_mm

  # AIC > species + island + bill_length_mm + bill_depth_mm + flipper_length_mm + year + sex
    # stepAIC(): island
    # MuMIn(): sex  + species

  # BIC > species + island + bill_length_mm + bill_depth_mm + flipper_length_mm + year + sex

  # Cross-validation > flipper_length_mm  + bill_length_mm  + species

# Model evaluation and comparison is an art form
# Strive for the model that achieves the best goodness of fit while maintaining simplicity
# Use as many evalution tools as possible, but also bring your biological intuition


# Thank you - Misaotra anao!



# ---------------- END OF CLASS ---------------- #



# Outside of class you can explore additional information on model evaluation tools

# Some resources I used to make this script 
  # kable tips to make nice tables
  # https://bookdown.org/yihui/rmarkdown-cookbook/kable.html#change-column-names

  # how to interpret lm() summary output
  # https://feliperego.github.io/blog/2015/10/23/Interpreting-Model-Output-In-R

  # how to interpret lm() diagnostic plots
  # https://data.library.virginia.edu/diagnostic-plots/



#---------------------------------------------------------#
# ADDITION MODEL EVALUATION AND COMPARISON TOOLS
#---------------------------------------------------------#

#### Tool kit - Confusion Matrix ####

# Example came from : https://www.digitalocean.com/community/tutorials/confusion-matrix-in-r

# Meant for binary data (0/1, infected/uninfected, yes/no)

# CONCEPT: Categorizes the predictions of your model against the actual data values to calculate the model's accuracy 


# Install required packages
install.packages('caret')

# Import required library
library(caret)

# Creates vectors having data points
expected_value <- factor(c(1,0,1,0,1,1,1,0,0,1))
predicted_value <- factor(c(1,0,0,1,1,1,0,0,0,1))

# Creating confusion matrix
example <- confusionMatrix(data=predicted_value, reference = expected_value)

# Display results 
example

# Output a table with the predicted values vs actual values
table(expected_value,predicted_value)
# 3 true negative (tn)
# 1 false negative (fn)
# 2 false positive (fp)
# 4 true positive (tp)


# Using the accuracy formula :
# tp + tn / (tp + tn + fp + fn)
# 4 + 3 / 10
# 7 / 10
# .7

# >> The accuracy of the model is 70% 



#### Tool kit - AUC of the ROC ####


# Example came from : https://www.r-bloggers.com/2016/08/roc-curves-in-two-lines-of-r-code/

# Meant for binary data (0/1, infected/uninfected, yes/no)

# CONCEPT: Characterize the sensitivity and specificity tradeoffs of the model.

# Sensitivity describes the true positive rate (tp / p )
# Specificity describes the true negative rate (tn / n )

# create a function
simple_roc <- function(labels, scores){
  labels <- labels[order(scores, decreasing=TRUE)]
  data.frame(TPR=cumsum(labels)/sum(labels), FPR=cumsum(!labels)/sum(!labels), labels)
}

set.seed(1) # set seed for reproducibility 
sim_widget_data <- function(N, noise=100){
  x <- runif(N, min=0, max=100) # create your x variable 
  y <- 122 - x/2 + rnorm(N, sd=noise) # create your y variable
  bad_widget <- factor(y > 100) # classify a threshold
  data.frame(x, y, bad_widget) # make a data frame
}
widget_data <- sim_widget_data(500, 10) # simulate data

test_set_idx <- sample(1:nrow(widget_data), size=floor(nrow(widget_data)/4)) # test data idex

test_set <- widget_data[test_set_idx,] # test data
training_set <- widget_data[-test_set_idx,] # training data

library(ggplot2) # necessary package
library(dplyr) # necessary package

test_set %>% # from your test data
  ggplot(aes(x=x, y=y, col=bad_widget)) +  # use this data and color by threshold
  scale_color_manual(values=c("black", "red")) + # make colors correspond to threshold level
  geom_point() + # make a scatterplot
  ggtitle("Bad widgets related to x") # give plot a title

