## Coding 4 Conservation 2022
## Multivariate Model Formulation (Mixed Models, Part 1)
## David Klinges and Emily Ruhs

## Introduction/Preperation ---------------

## How to run a line of code
# (review from Sophia's tutorial from last time)
# ctrl/cmd + Enter, OR "Run" button
# Can also run a a chunk of code at the same time by highlighting

# All packages for today should already be installed from the preparation
# for the course (see https://coding4conservation.org/preparation). You can 
# also alternatively install and load the `tidyverse` meta-package, which 
# includes `dplyr`, `ggplot2`, and a number of other useful packages all in
# one. However `tidyverse` isn't necessary for this tutorial

# install.packages("tidyverse")
# library(tidyverse)
library(dplyr) # for curating and re-shaping data
library(lme4) # generalized linear, nonlinear, and mixed model package
library(ggplot2) #g gplot visualization package. We will alternate between 
# this and the base R plot() functions

## Create a header like this ------------

# You can explore and quickly navigate between headers, or "folds", using the 
# menu at the bottom of the script window

# You can close all the headers by using option + cmd + o on Mac or
# Alt + o on Windows
  
# You can open everything with shift + option + cmd + o on Mac or 
# Shift + Alt + o on Windows
  
## Loading in and exploring data ----------

# read.table() 
lemur_data <- read.csv("Lemurdata_glm.csv")
# read_csv()

# Glance at the data
head(lemur_data)
# Or use the tidyverse/dplyr function glimpse(), which displays the data differently.
# I find this way easier to read if there are many columns
glimpse(lemur_data)

## Curate data using dplyr ---------

unique(lemur_data$year)

## Group by and summarize by individual. Get the mean
id_grouped <- group_by(lemur_data, ID)
id_means <- summarize(id_grouped, mean_GIparasites = mean(GIparasites))

glimpse(id_means)

year_grouped <- group_by(lemur_data, year)
year_means <- summarize(year_grouped, mean_GIparasites = mean(GIparasites))

glimpse(year_means)

## EXERCISE: Using the functions group_by() and summarize(), find the min and max
# number of GIparasites for each location 

## HINT --------------

# Above, we put "ID" as the second argument in group_by() to group by ID, and
# we put "year" as the second argument in group_by() to group by year. Perhaps
# we can do the same thing with other columns in lemur_data?

# Also, we used the mean() function to acquire the mean. There are similar 
# functions in R called min() and max() that may be useful here!

## ANSWER ----------------

# We'll first group by location....
location_grouped <- group_by(lemur_data, location)

# Then summarize to minima for each location.....
location_mins <- summarize(location_grouped, 
                           min_GIparasites = min(GIparasites))
location_mins

# And then summarize to maxima for each location
location_maxs <- summarize(location_grouped, 
                           max_GIparasites = max(GIparasites))
location_maxs

# We can also acquire both the min and max at the same time
location_minmaxs <- summarize(location_grouped, 
                              min_GIparasites = min(GIparasites),
                              max_GIparasites = max(GIparasites))

location_minmaxs

## Fitting a model ---------

## using glm() 
glm_model <- glm(GIparasites ~ sex + year, data = lemur_data, 
                 family = "poisson")

summary(glm_model)

## Predicting using the model -------------

# You may notice that we have a missing year, 2020, in our dataset. This was 
# because due to the COVID-19 pandemic, we weren't able to collect data in 2020.
# However, we still care about understanding the number of GIparasites in 2020,
# so we can better understand trends over time. So, let's try to predict new
# data using our model

## Now we'll predict using the mode we just fitted
lemur_data$predictions <- predict(glm_model, type = "response")

glimpse(lemur_data)

hist(lemur_data$GIparasites)
hist(lemur_data$predictions)

# Notice that we have only generated unique predictions for each group of our
# predictor variables: 
# c("Female", "Male") x c(2018, 2019, 2021, 2022) = 8 possible combinations
unique(lemur_data$predictions)
length(unique(lemur_data$predictions))

## Creating a ggplot visual of data and predictions -----------

## Base R version
plot(lemur_data$year, lemur_data$GIparasites)

## ggplot2 version
ggplot(data = lemur_data) +
  geom_point(aes(year, GIparasites))

# Same plot but slightly different code notation
ggplot() +
  geom_point(data = lemur_data, aes(year, GIparasites))

ggplot(lemur_data) +
  geom_point(aes(year, GIparasites, color = sex))

ggplot(lemur_data) +
  geom_point(aes(year, GIparasites, color = sex)) +
  geom_line(aes(year, predictions, color = sex))
# This plot displays both our data (points) as well as the predictions from
# the model (lines).

# For a publication-ready figure, it would be nice to clean this up a bit.
# For instance, perhaps we only care about displaying the mean number of 
# GIparasites per sex per year, rather than every individual. We can use the 
# same group_by() and summarize() functions we used above, but this time we'll
# group by two variables at the same time: year AND sex

lemur_data_means <- group_by(lemur_data, year, sex)
lemur_data_means <- summarize(lemur_data_means,
                              mean_GIparasites = mean(GIparasites, na.rm = T),
                              mean_predictions = mean(predictions, na.rm = T))
glimpse(lemur_data_means)

# When you group by more than one variable, it's a good practice to use ungroup()
# to de-group the data frame
lemur_data_means <- ungroup(lemur_data_means)
glimpse(lemur_data_means)

ggplot(lemur_data_means) +
  geom_point(aes(year, mean_GIparasites, color = sex)) +
  geom_line(aes(year, mean_predictions, color = sex))

# Note that our model does not perfectly fit the data: the points (data) do not
# fall exactly along the lines (predictions from the model)

## Predict using simulated data: extrapolation -----------------

# Now, we want to try predicting for years outside the domain of our observation,
# 2018 - 2022. CAUTION: this means we're extrapolating! Given that our model
# was fitted only to a few years of data, it may generate very incorrect 
# predictions when we depart our domain of observation

lemur_data_sim <- lemur_data %>% 
  bind_rows(
    data.frame(
      year = c(2016, 2016, 2017, 2017, 2020, 2020, 2023, 2023),
      sex = c("Female", "Male", "Female", "Male", "Female", "Male", "Female", "Male")
    )
  )

lemur_data_sim$predictions <- predict(glm_model, 
                                             lemur_data_sim, 
                                             type = "response")

# We can the copy + paste the code from above, to reapply it to the new dataset
# We'll use ctrl/cmd + f to search for a word, and then replace it with our new
# word ()
lemur_data_sim_means <- group_by(lemur_data_sim, year, sex)
lemur_data_sim_means <- summarize(lemur_data_sim_means,
                              mean_GIparasites = mean(GIparasites, na.rm = T),
                              mean_predictions = mean(predictions, na.rm = T))
lemur_data_sim_means <- ungroup(lemur_data_sim_means)

# Now, we can plot again using the same format as above
ggplot(lemur_data_sim_means) +
  geom_point(aes(year, mean_GIparasites, color = sex)) +
  geom_line(aes(year, mean_predictions, color = sex))
# Notice the warning message: this was because we didn't have data for years
# 2016, 2017, and 2023, only predictions

## More customization of ggplots ----------

ggplot(lemur_data_sim_means) +
  geom_point(aes(year, mean_GIparasites, color = sex)) +
  geom_line(aes(year, mean_predictions, color = sex)) +
  labs(x = "Year",
       y = "Mean # of GI Parasites",
       color = "Sex") +
  scale_color_manual(values = c("black", "yellow")) +
  theme_bw()

ggplot(lemur_data_sim_means) +
  geom_point(aes(year, mean_GIparasites, color = sex)) +
  geom_line(aes(year, mean_predictions, color = sex)) +
  labs(x = "Year",
       y = "Mean # of GI Parasites",
       color = "Sex") +
  scale_color_discrete(l = 40, c = 30) +
  theme_bw()

ggplot(lemur_data) +
  geom_point(aes(age, tail, color = age)) +
  scale_color_gradient(low = "black", high = "yellow") +
  theme_bw()
  
# https://www.w3schools.com/colors/colors_picker.asp
ggplot(lemur_data) +
  geom_point(aes(age, tail, color = age)) +
  scale_color_gradient(low = "#b3ffcc", high = "#00802b") +
  theme_bw()

## INTERACTIVE: customize your plot. Using our lemur data and the above code as
## taemplates, try building your own plot with colors and labels of your choice!

## What colors do you like the most? Try choosing some new ones from this URL:
# https://www.w3schools.com/colors/colors_picker.asp

# Perhaps you want to change the axis lables to Francais or Malagasy?

