# C4C July 27, 2002
# Week 4 - Community biodiversity analysis code
# Dr. Katherine I. Young
# Resources for data = https://onlinelibrary.wiley.com/doi/abs/10.1111/ecog.04487
# Additional resources for code = https://rpubs.com/an-bui/vegan-cheat-sheet

library(tidyverse)
library(vegan)
library(RColorBrewer)
library(gridExtra)
library(grid)
library(RVAideMemoire) # allows pairwise comparisons of adonis function for PERMANOVA
library(BiodiversityR) # includes the vegan package which is used for most community ecology analyses

# Load the data sets
# To begin doing community analyses on species data, the dataframe needs to only include species counts and the sites these counts were taken from. The sample sites are  the rows and the species sampled are the columns of the dataframe. The birds dataframe below is the general format of how community level data should be prepared.

birds <- read.csv('bird-comm.csv') # Load the bird data


env <-  read.csv('env-var.csv')# Load the environmental data


######################################
#### Proportional and total abundance

# Knowing the abundance of different species in a community can provide insight into less visible aspects of a community such as competition or predation. Most often, there are many rare species and few common. 
# We will assess the abundance of bird species within these communities in several ways

# First we can look at the abundance of all birds by site
bird.abun <- rowSums(birds[, -1])
env$bird.abun <- bird.abun # Add calculated abundance into the environmental dataframe for analysis later

# evaluate the distribution of abundance
hist(bird.abun, xlab = "Species abundance", main = NULL) # Looks like we have a fairly normal distribution of abundance in this dataset

qqnorm(bird.abun) # Test for normality
qqline(bird.abun)

shapiro.test(bird.abun) # Test for normality

# We can now plot the total bird abundance by land cover type
pal <- c("#1B9E77", "#E7298A", "#7570B3") 

plot_abun <- ggplot(env, aes(x = land_cover, y=bird.abun, fill = land_cover)) +
  geom_boxplot() +
  scale_fill_manual(values = pal) +
  scale_x_discrete(labels = c("dry \n (n = 96)", "mix \n (n = 59)", "riparian \n (n = 55)")) +
  theme(legend.position = "none",
        plot.background = element_rect("white"),
        panel.background = element_rect("white"),
        panel.grid = element_line("grey90"),
        axis.line = element_line("gray25"),
        axis.text = element_text(size = 12, color = "gray25"),
        axis.title = element_text(color = "gray25"),
        legend.text = element_text(size = 12)) + 
  labs(x = "Ecological landtype",
       y = "Number of species per site",
       title = "Bird abundance")
plot_abun

# Does bird abundance differ by land cover type?

abund_aov <- aov(bird.abun ~ land_cover, data = env)
summary(abund_aov) # There is a significant difference in total bird abundance by land cover type

#Df Sum Sq Mean Sq F value  Pr(>F)   
#land_cover    2   1292   646.0   4.985 0.00768 **
#Residuals   207  26824   129.6    

# Which land cover types differ from each other in bird abundance?
# We will use a Tukey's post hoc test to assess pairwise differences from our ANOVA above

TukeyHSD(abund_aov, conf.level=.95) # Abundance differs significantly between the riparian and dry land cover types.

#diff       lwr       upr     p adj
#mixed-dry      2.794492 -1.651028  7.240011 0.3007101
#riparian-dry   6.043182  1.498643 10.587721 0.0054908
#riparian-mixed 3.248690 -1.788211  8.285591 0.2824280


# Now evaluate the abundance of each species

(spp.abun <- (colSums(birds[, -1])))
(tot.abun <- sum(rowSums(birds[, -1])))
(prop.abun <- (colSums(birds[, -1]))/tot.abun)
rank.abun <- rank(prop.abun, ties.method = "first") 

par(mfrow = c(1,3))
hist(prop.abun, xlab = "Proportional abundance", main = NULL) 
hist(log(spp.abun), xlab = "Log abundance", main = NULL)
plot(rank.abun, prop.abun, xlim = rev(range(rank.abun)), 
     ylab = "Proportional abundance", xlab = "Species rank")
par(mfrow=c(1,1))


######################################
### How many species are in my communities?
# Calculate and evaluate bird species richness
# Richness represents the number of species present in a community/sample

richness <- specnumber(birds[,-1]) # Used to calculate the number of bird species, need to exclude the sites column
length(richness) # Look at length, should be 210 which is the number of sites sampled

env$S <- richness # Add calculated richness into the environmental dataframe for analysis later

# Are there differences in S (species richness) across the land cover types sampled?
# Use analysis of variance to assess

rich_aov <- aov(S ~ land_cover, data = env)
summary(rich_aov) # No significant difference in bird species richness across land cover types sampled

#              Df Sum Sq Mean Sq F value Pr(>F)  
#ELT           2     86    43.0   2.774 0.0648 
#Residuals   207   3209    15.5 

# plot species richness

plot_rich <- ggplot(env, aes(x = land_cover, y=S, fill = land_cover)) +
  geom_boxplot() +
  scale_fill_manual(values = pal) +
  scale_x_discrete(labels = c("dry \n (n = 96)", "mix \n (n = 59)", "riparian \n (n = 55)")) +
  theme(legend.position = "none",
        plot.background = element_rect("white"),
        panel.background = element_rect("white"),
        panel.grid = element_line("grey90"),
        axis.line = element_line("gray25"),
        axis.text = element_text(size = 12, color = "gray25"),
        axis.title = element_text(color = "gray25"),
        legend.text = element_text(size = 12)) + 
  labs(x = "Ecological landtype",
       y = "Number of species per site",
       title = "Species richness")
plot_rich


######################################
#### Species accumulation curves
# Species accumulation curves are a way to assess how effective ones sampling effort was at capturing the species represented in a certain area. 
# They are a representation of the cumulative number of species recorded by the researcher as a function of the cumulative effort spent sampling

spa <- specaccum(birds[, -1]) # Use the specaccum function to calculate # species vs # of sites sampled for the birds dataframe
plot(spa, ci.type="poly", col="blue", lwd=2, ci.lty=0, ci.col="lightblue") # if the plot comes to an asymptote then there was sufficient sampling to describe the brids present in the study area.


# Look at species accumulation by land cover type
# the BiodiversityR package allows us to estimate species accumulation based on a factor, like land cover type, using the accumcomp function
# Resource for code https://rpubs.com/Roeland-KINDT/694021

# Calculated accumulated species richness and standard deviation for each land cover type
Accum.1 <- accumcomp(birds[,-1], y = env, factor = "land_cover", 
                     method = "exact", conditioned = FALSE, plotit = FALSE)
Accum.1

# Render the species accumulation data into long format for plotting in ggplot
accum.long1 <- accumcomp.long(Accum.1, ci = NA, label.freq = 5) 
head(accum.long1)

# Create a clean plot theme to use in ggplot
BioR.theme <- theme(
  panel.background = element_blank(),
  panel.border = element_blank(),
  panel.grid = element_blank(),
  axis.line = element_line("gray25"),
  text = element_text(size = 12),
  axis.text = element_text(size = 10, colour = "gray25"),
  axis.title = element_text(size = 14, colour = "gray25"),
  legend.title = element_text(size = 14),
  legend.text = element_text(size = 14),
  legend.key = element_blank())

# Plot the species accumulation data by land cover type. 
plotgg1 <- ggplot(data=accum.long1, aes(x = Sites, y = Richness, ymax = UPR, ymin = LWR)) + 
  scale_x_continuous(expand=c(0, 1), sec.axis = dup_axis(labels=NULL, name=NULL)) +
  scale_y_continuous(sec.axis = dup_axis(labels=NULL, name=NULL)) +
  geom_line(aes(colour=Grouping), size=2) +
  geom_point(data=subset(accum.long1, labelit==TRUE), 
             aes(colour=Grouping, shape=Grouping), size=5) +
  geom_ribbon(aes(colour=Grouping, fill=after_scale(alpha(colour, 0.3))), show.legend=FALSE) + 
  BioR.theme +
  scale_colour_manual(values = pal) +
  labs(x = "Sites", y = "Bird richness", colour = "land_cover", shape = "land_cover")
plotgg1

# All land cover types come to aymptote showing efficient sampling to determine the diversity of birds present

######################################
### How diverse are my communities?
# Diversity represents both the species richness and evenness within a community. In other words how many species are present and how abundance are these species relative to the others within the community
# Calculate shannon and simpson diversity as well as evenness
# Shannon's diversity
Shannon <- diversity(birds[,-1], index = "shannon")# Calculate Shannon diversity
env$H <- Shannon # Add calculated Shannon's H into the environmental dataframe for analysis later

shan_aov <- aov(H ~ land_cover, data = env)
summary(shan_aov) # No significant difference in Shannon's H across land cover types sampled

#Df Sum Sq Mean Sq F value Pr(>F)  
#land_cover    2  0.399 0.19958   2.817 0.0621 .
#Residuals   207 14.668 0.07086  

# Simpson's diversity
Simpson <- diversity(birds[,-1], index = "simpson")
env$D <- Simpson # Add calculated Simpson's D into the environmental dataframe for analysis later

simp_aov <- aov(D ~ land_cover, data = env)
summary(simp_aov) # No significant differences in Simpson's H across land cover types sampled

#Df  Sum Sq  Mean Sq F value Pr(>F)  
#land_cover    2 0.00583 0.002916   2.776 0.0646 .
#Residuals   207 0.21744 0.001050 

# Evenness represents XXXXXX
# Pielou's evennes (J)
Even <- env$H/log(specnumber(birds[,-1]))
env$J <- Even

even_aov <- aov(J ~ land_cover, data = env)
summary(even_aov) # No significant differences in Evenness across land cover types sampled

# To see other diversity estimators use specpool
specpool(birds[, -1])

#Df  Sum Sq  Mean Sq F value Pr(>F)
#land_cover    2 0.00323 0.001615   1.411  0.246
#Residuals   207 0.23696 0.001145               

# Represent Diversity indices in column format and stack graphs

# Make a new dataframe with means for plotting
div_plot_df <- env %>%
  group_by(land_cover) %>%
  summarize(mean_H = round(mean(H), 2),
            err_H = sd(H)/sqrt(length(H)),
            mean_D = round(mean(D), 2),
            err_D = sd(D)/sqrt(length(D)), 
            mean_J = round(mean(J), 2),
            err_J = sd(J)/sqrt(length(J))) 

clean_background <- theme(plot.background = element_rect("white"),
                          panel.background = element_rect("white"),
                          panel.grid = element_line("white"),
                          axis.line = element_line("gray25"),
                          axis.text = element_text(size = 12, color = "gray25"),
                          axis.title = element_text(color = "gray25"),
                          legend.text = element_text(size = 12),
                          legend.key = element_rect("white"))
# Plot Shannon's H
plot_shandiv <- ggplot(div_plot_df, aes(x = land_cover, y = mean_H, fill = land_cover)) +
  geom_col(color = "black") +
  scale_fill_manual(values = pal) +
  geom_errorbar(aes(ymin = mean_H - err_H, ymax = mean_H + err_H), width = 0.5) + # need to make sure the mean and err is from the correct indice
  geom_text(aes(x = land_cover, y = mean_H + err_H + 0.07 , label = mean_H)) +
  # scale_x_discrete(labels = c("dry \n (n = 96)", "mix \n (n = 59)", "riparian \n (n = 55)")) +
  scale_y_continuous(limits = c(0, 2.75), expand = c(0,0), labels = scales::number_format(accuracy=0.01)) +
  clean_background + 
  theme(legend.position = "none",
        axis.title.x = element_blank(),
        axis.text.x = element_blank()) +
  labs(y = "Mean Shannon diversity",
       title = "Shannon diversity")
plot_shandiv

# Plot Simpson's D

plot_simpdiv <- ggplot(div_plot_df, aes(x = land_cover, y = mean_D, fill = land_cover)) +
  geom_col(color = "black") +
  scale_fill_manual(values = pal) +
  geom_errorbar(aes(ymin = mean_D - err_D, ymax = mean_D + err_D), width = 0.5) + # need to make sure the mean and err is from the correct indice
  geom_text(aes(x = land_cover, y = mean_D + err_D + 0.07 , label = mean_D)) +
  # scale_x_discrete(labels = c("dry \n (n = 96)", "mix \n (n = 59)", "riparian \n (n = 55)")) +
  scale_y_continuous(limits = c(0, 1.0), expand = c(0,0)) +
  clean_background + 
  theme(legend.position = "none",
        axis.title.x = element_blank(),
        axis.text.x = element_blank()) +
  labs(y = "Mean Simpson diversity", 
       title = "Simpson diversity")
plot_simpdiv

# Plot evenness 

plot_even <- ggplot(div_plot_df, aes(x = land_cover, y = mean_J, fill = land_cover)) +
  geom_col(color = "black") +
  scale_fill_manual(values = pal) +
  geom_errorbar(aes(ymin = mean_D - err_J, ymax = mean_J + err_J), width = 0.5) + # need to make sure the mean and err is from the correct indice
  geom_text(aes(x = land_cover, y = mean_J + err_D + 0.07 , label = mean_J)) +
  scale_x_discrete(labels = c("dry \n (n = 96)", "mix \n (n = 59)", "riparian \n (n = 55)")) +
  scale_y_continuous(limits = c(0, 1.0), expand = c(0,0)) +
  clean_background + 
  theme(legend.position = "none") +
  labs(x = "Ecological landtype",
       y = "Mean Evenness", 
       title = "Evenness")
plot_even

# Combine plots into a single figure

alpha <- grid.arrange(plot_shandiv, plot_simpdiv, plot_even, ncol = 1)


######################################
### How different are my communities?
# Beta diversity represents how similar or dissimilar species compostion is between sample sites or habitats etc. It is an essential to understanding the overall diversity of a landscape and can be useful in answering questions about how certain species separate or inhabit the same areas. 
# beta diversity is calculated using 'vegdist' in vegan
# the function handles many version of beta
# We will use Bray-Curtis distance 
# This line creates an object named 'bray' that contains our pairwise distance calculations

bray <- vegdist(birds[, -1], method="bray", binary=FALSE) #binary=FALSE means you look at the number of individuals.  TRUE would give the result for presence-absence (Sorenson's index)
bray


# If we want to assess differences in community composition we need to use a permutational multivariate analysis of variance, or a perMANOVA. This test will assess the differences between communities based on how dissimilar they are. 

bird.perm <- adonis(birds[,-1] ~ land_cover, data = land_cover)
bird.perm$aov.tab # Bird communities differ by land cover type


# 'betadisper' is a simple way to view the variance in beta in our groups
# Distance to the centroid tells us about the variation within each group

mod.bd <- betadisper(bray, env$land_cover)

# Create a simple plot of the results using base R
boxplot(mod.bd, xlab="Land cover", notch = TRUE, col = pal)

# How do we know which land cover types differed in Beta diversity?
# We need to do pairwise comparisons
# We can use the package "RVAideMemoire" to run a pairwise Permanova

pair.comp <- RVAideMemoire::pairwise.perm.manova(birds[,-1], env$land_cover, nperm = 999,
                                                 test = "Pillai", p.method = "fdr")
pair.comp # There are differences across all land cover types

# We can also brute force these comparisons by subsetting our datasets
# First, create subsetted birds data by land cover for comparisons

# dry vs mixed
birds.dry.mix <- birds %>%
  mutate(land_cover = env$land_cover) %>% # this will put land cover into the birds dataframe
  relocate(land_cover, .after = site) %>% # move land cover to front of dataframe
  select(-site) %>% # get rid of the site column
  filter(land_cover == "dry" | land_cover == "mixed")

perm.dry.mix <- adonis(birds.dry.mix[,-1] ~ land_cover, data = birds.dry.mix)
perm.dry.mix$aov.tab # Bird communities differ by p = 0.037

# dry vs riparian
birds.dry.rip <- birds %>%
  mutate(land_cover = env$land_cover) %>% # this will put land cover into the birds dataframe
  relocate(land_cover, .after = site) %>% # move land cover to front of dataframe
  select(-site) %>% # get rid of the site column
  filter(land_cover == "dry" | land_cover == "riparian")

perm.dry.rip <- adonis(birds.dry.rip[,-1] ~ land_cover, data = birds.dry.rip)
perm.dry.rip$aov.tab # Bird communities differ by p = 0.001

# mixed vs riparian
birds.mix.rip <- birds %>%
  mutate(land_cover = env$land_cover) %>% # this will put land cover into the birds dataframe
  relocate(land_cover, .after = site) %>% # move land cover to front of dataframe
  select(-site) %>% # get rid of the site column
  filter(land_cover == "mixed" | land_cover == "riparian")

perm.mix.rip <- adonis(birds.mix.rip[,-1] ~ land_cover, data = birds.mix.rip)
perm.mix.rip$aov.tab # Bird communities differ by p = 0.002

# All communities have significantly dissimilar bird species composition


######################################
# Can we visualize how bird communities differ in space
# PCA or principal components analysis form of ordination that will reduce the species data matrix to a smaller number of synthetic variables, or axes, in represent this reduced dataset in space.

# Lets pull out a dataframe that has the site names and land cover types just for plotting
site_type <- env %>%
  select(site, land_cover)

# use the function rda in vegan to run a PCA on the matrix of bird counts by site
birdPCA <- rda(birds[,-1])
birdPCA

# Extract the PCA scores from the PCA by site
# We will add in the site names and land cover types for plotting
PCAscores <- scores(birdPCA, display = "sites") %>% 
  as.data.frame() %>% 
  mutate(site = site_type$site, land_cover = site_type$land_cover)

# Extract the PCA scores from the PCA by species
PCAvect <- scores(birdPCA, display = "species") %>% 
  as.data.frame()

# Plot the PCA
plot_PCA <- ggplot() +
  geom_point(data = PCAscores, aes(x = PC1, y = PC2, color = land_cover)) +
  scale_color_manual(values = pal) +
  geom_vline(xintercept = c(0), color = "grey70", linetype = 2) +
  geom_hline(yintercept = c(0), color = "grey70", linetype = 2) +
  geom_segment(data = PCAvect, aes(x = 0, y = 0, xend = PC1, yend = PC2), arrow = arrow(length = unit(0.2, "cm"))) +
  geom_text(data = PCAvect, aes(x = PC1, y = PC2, label = rownames(PCAvect))) +
  clean_background +
  labs(x = "PC1 (23.57%)",
       y = "PC2 (12.23%)",
       title = "Principal Components Analysis") 
plot_PCA

# There is still a lot of overlap between bird communities but certain bird species explain most of the variation between communities. 

######################################
# An alternative to a PCA is an NMDS, Nonmetric multidimensional scaling analysis. This is very commonly used by community ecologists.
# NMDS also reduces our species data into axes, but does so relative to all other species. Imagine an axis that describes relative abundance of ACFL (Acadian Flycatchers) - all points exist somewhere on that axis relative to the others. Now consider another axis describing the relative abundance of KEWA (Kentucky Warblers) - all points still exist somewhere on that axis, but now there are two axes describing the position of your points. NMDS allows you to collapse all these species axes into 2 to plot in space in order to visualize the differences between samples and sites.

# Run the NMDS using metaMDS()
bird_NMDS <- metaMDS(birds[,-1])

# Assess the model performance using stressplot()
# This lets us know how likely or unlikely it is that we might missinterpret the community separation
stressplot(bird_NMDS)

# extract NMDS output to plot in ggplot
plot_df <- scores(bird_NMDS, display = "sites") %>% 
  as.data.frame() %>%
  mutate(site = site_type$site, land_cover = site_type$land_cover)

# Create NMDS plot colored by land cover type
plot_nmds <- ggplot(plot_df, aes(x = NMDS1, y = NMDS2, color = land_cover, shape = land_cover)) +
  geom_point(size = 3, alpha = 0.8) +
  scale_color_manual(values = pal) +
  stat_ellipse(linetype = 2, size = 1) +
  clean_background +
  labs(title = "NMDS")
plot_nmds

# We can use the function envfit() in vegan to determine the relative contribution of both bird species and the environment to the separation we see in our communities

# envfit() will take the NMDS output and the species matrix 
fit <- envfit(bird_NMDS, birds[,-1], perm = 999) 

# extract p-values for each species
fit_pvals <- fit$vectors$pvals %>% 
  as.data.frame() %>% 
  rownames_to_column("species") %>% 
  dplyr::rename("pvals" = ".")

# extract coordinates for species, only keep species with p-val = 0.001
fit_spp <- fit %>% 
  scores(., display = "vectors") %>% 
  as.data.frame() %>% 
  rownames_to_column("species") %>% 
  full_join(., fit_pvals, by = "species") %>% 
  filter(pvals == 0.001)

# Create a new plot
nmds_plot_new <- ggplot(plot_df, aes(x = NMDS1, y = NMDS2)) +
  coord_fixed() +
  geom_point(aes(color = land_cover, shape = land_cover), size = 3, alpha = 0.8) +
  stat_ellipse(aes(color = land_cover)) +
  scale_color_manual(values = pal) +
  geom_segment(data = fit_spp, aes(x = 0, xend = NMDS1, y = 0, yend = NMDS2),
               arrow = arrow(length = unit(0.25, "cm")),
               col = "black") +
  geom_text(data = fit_spp, aes(label = species)) +
  clean_background
nmds_plot_new

# The vectors or arrows indicate bird species with significant contribution to the separation in the communities by land cover type. 

######################################
##### How does environment affect communities? 
# Let's go back to our NMDS for a moment

# envfit() will take the NMDS output and the environmental variable matrix 
env.fit <- envfit(bird_NMDS, env[,2:5], perm = 999) 

# extract p-values for each environmental variable
env_pvals <- env.fit$vectors$pvals %>% 
  as.data.frame() %>% 
  rownames_to_column("environment") %>% 
  dplyr::rename("pvals" = ".")

# extract coordinates for environmental variables, only keep variables with p-val = 0.001
fit_env <- env.fit %>% 
  scores(., display = "vectors") %>% 
  as.data.frame() %>% 
  rownames_to_column("environment") %>% 
  full_join(., env_pvals, by = "environment") %>% 
  filter(pvals == 0.001)

# Create a new plot
nmds_plot_new <- ggplot(plot_df, aes(x = NMDS1, y = NMDS2)) +
  coord_fixed() +
  geom_point(aes(color = land_cover, shape = land_cover), size = 3, alpha = 0.8) +
  stat_ellipse(aes(color = land_cover)) +
  scale_color_manual(values = pal) +
  geom_segment(data = fit_env, aes(x = 0, xend = NMDS1, y = 0, yend = NMDS2),
               arrow = arrow(length = unit(0.25, "cm")),
               col = "black") +
  geom_text(data = fit_env, aes(label = environment)) +
  clean_background
nmds_plot_new
