# Step 1: Run the regression and save it as a new object
SexTail_Model <- lm(tail ~ sex, data = lemur_data)
# Step 3 : Examine the output
summary(SexTail_Model)
# Step 4 : Plot the diagnostics (hint: see line X for plotting all at once)
par(mfrow = c(2.2))
plot(SexTail_Model)
plot(SexTail_Model,1)
## 0.3 Examine data
# Explore the database #
# print the top 10 lines
head(lemur.data)
## 0.2 Load dataset
lemur.data <- read.table("lemur_data.txt", stringsAsFactors = F, header = T)
## 0.2 Load dataset
lemur.data <- read.table("lemur_data.txt", stringsAsFactors = F, header = T)
## 0.3 Examine data
# Explore the database #
# print the top 10 lines
head(lemur.data)
dim(lemur.data)
names(lemur.data)
#
summary(lemur.data)
str(lemur.data)
attach(lemur.data)
View(lemur.data)
tail
## 0.4 let's check our out outcome variable of interest - lemur tail length
hist(taille, col='grey')
## 0.4 let's check our out outcome variable of interest - lemur tail length
hist(tail, col='grey')
## 0.4 let's check our out outcome variable of interest - lemur tail length
hist(tail, col='grey')
boxplot(tail, ylab='Height (cm)')
## 0.5 now let's check out our potential predictor variables
par(mfrow=c(2,2))
hist(age, col='grey') ; plot(sexe, col='grey', main="Gender") ; hist(GIparasites, col='grey') ; plot(malaria, col='grey', main="Malaria") ;
## 0.5 now let's check out our potential predictor variables
par(mfrow=c(2,2))
hist(age, col='grey') ; plot(sex, col='grey', main="Gender") ; hist(GIparasites, col='grey') ; plot(malaria, col='grey', main="Malaria") ;
pairs(lemur.data)
## 0.5 now let's check out our potential predictor variables
par(mfrow=c(2,2))
hist(age, col='grey') ; plot(sex, col='grey', main="Gender") ; hist(GIparasites, col='grey') ; plot(malaria, col='grey', main="Malaria") ;
pairs(lemur.data)
View(lemur.data)
hist(age, col='grey') ; plot(sex, col='grey', main="Gender") ; hist(GIparasites, col='grey') ; plot(malaria, col='grey', main="Malaria") ;
## 0.5 now let's check out our potential predictor variables
par(mfrow=c(2,2))
hist(age, col='grey') ; plot(sex, col='grey', main="Gender") ; hist(GIparasites, col='grey') ; plot(malaria, col='grey', main="Malaria") ;
## 0.5 now let's check out our potential predictor variables
par(mfrow=c(2,2))
hist(age, col='grey') ; plot(sex, col='grey', main="Gender") ; hist(GIparasites, col='grey') ; plot(malaria, col='grey', main="Malaria")
plot(sex, col='grey', main="Gender")
hist(GIparasites, col='grey')
plot(malaria, col='grey', main="Malaria")
par(mfrow=c(1,1))
# Relationship between height and age (univariate analyses)
plot(age, taille, pch=19)
m1=lm(tail~age, data=lemur.data)
abline(m1)
summary(m1)
# Height and age
par(mfrow=c(1,1))
plot(age, tail, pch=19)
m1=lm(tail~age, data=lemur.data)
abline(m1)
summary(m1)
# GI Parasites
plot(GIparasites, tail, pch=19)
m2=lm(tail~GIparasites, data=lemur.data)
abline(m2, col = 'red')
# GI Parasites
plot(GIparasites, tail, pch=19, main = "GI Parasites & Tail Length")
m2=lm(tail~GIparasites, data=lemur.data)
abline(m2, col = 'red')
summary(m2)
# Sex and Age
boxplot(tail~sex)
plot(age, tail, col=c('blue','red')[sex],pch=19)
plot(age, tail, col=c('blue','red')[sex],pch=19)
plot(age, tail, col=c('blue','red')[sex],pch=19)
# allows us to access the variables in the dataframe without having to call the dataframe
# i.e. can print 'tail' intead of 'lemur.data$tail'
attach(lemur.data)
plot(age, tail, col=c('blue','red')[sex],pch=19)
plot(lemur.data$age, lemur.data$tail, col=c('blue','red')[sex],pch=19)
# Sex and Age
boxplot(tail~sex)
plot(age, tail, col=c('blue','red')[sex],pch=19)
abline(lm(tail[sexe=='Female']~age[sexe=='Female']), col='blue',lwd=2)
abline(lm(tail[sexe=='Male']~age[sexe=='Male']), col='red',lwd=2)
# Sex and Age
boxplot(tail~sex)
plot(age, tail, col=c('blue','red')[sex],pch=19)
abline(lm(tail[sex=='Female']~age[sex=='Female']), col='blue',lwd=2)
abline(lm(tail[sex=='Male']~age[sex=='Male']), col='red',lwd=2)
m3=lm(tail~age+sex, data=lemur.data)
summary(m3)
# Include all relevant variables
m4=lm(tail~age+sex+GIparasites+malaria, data=lemur.data)
summary(m4)
# Stepwise selection to exclude variables that do not help explain the outcome variable
step(m4)   # One can also do forward selection (add one variable at a time and see how the model improves) ; do backwards selection (the opposite) ; or multimodel selection (a combination of methods)
m5=step(m4)
summary(m5)
# Simple verification of model assumptions
plot(m5)
# Simple verification of model assumptions
par(mfrow = c(2.2))
# Simple verification of model assumptions
par(mfrow = c(2,2))
plot(m5)
?(step)
# this is the best-fit model
summary(m5)
shapiro.test(resid(m5))
plot(m5)
# Instead of tail length, let's explore determinants of parasite numbers and of infectious status
# For this, we need to understand first what type of outcome variable we're dealing with and the most appropriate model
hist(GIparasites, col='grey')
# Instead of tail length, let's explore determinants of parasite numbers and of infectious status
# For this, we need to understand first what type of outcome variable we're dealing with and the most appropriate model
par(mfrow=c(1,1))
hist(GIparasites, col='grey')
plot(sickGIparasites, col="grey", main="Infection status")
plot(as.numberic(sickGIparasites), col="grey", main="Infection status")
plot(as.numeric(sickGIparasites), col="grey", main="Infection status")
plot(sickGIparasites, col="grey", main="Infection status")
# The variable "Number of parasites" is count data and it's poisson distributed.
# This type of variable is typically modelled with poisson models
m6=glm(GIparasites~age+sexe++malaria, family='poisson', data=lemur.data)
# The variable "Number of parasites" is count data and it's poisson distributed.
# This type of variable is typically modelled with poisson models
m6=glm(GIparasites~age+sex++malaria, family='poisson', data=lemur.data)
summary(m6)
m7=step(m6)
summary(m7)
# The variable "infectious status" is a binary variable
# This type of variable is typically modelled with binomial models (also known as logistic regression)
m8=glm(sickGIparasites~age+sex++malaria, family='binomial', data=lemur.data)
## Let's change some categorical variables to numeric ones for later analysis
test <- unclass(sex)
test
## Let's change some categorical variables to numeric ones for later analysis
test <- sapply(sex, unclass)
test
## Let's change some categorical variables to numeric ones for later analysis
test <- sapply(lemur.data$sex, unclass)
lemur_data <- read.csv("~/Desktop/C4C-LinearRegression/lemur_data.txt", sep="")
View(lemur_data)
## Let's change some categorical variables to numeric ones for later analysis
test <- sapply(lemur.data$sex, unclass)
## 0.2 Load dataset
lemur.data <- read.table("lemur_data.txt", stringsAsFactors = F, header = T)
## Let's change some categorical variables to numeric ones for later analysis
test <- sapply(lemur.data$sex, unclass)
## Let's change some categorical variables to numeric ones for later analysis
test <- unclass(lemur.data$sex)
## Let's change some categorical variables to numeric ones for later analysis
lemur.data$sex <- unclass(lemur.data$sex)
View(lemur.data)
## Let's change some categorical variables to numeric ones for later analysis
## Males = 0
## Females = 1
lemur.data$sex <- factor(lemur.data$sex,
levels=c(0,1),
labels=c("Male","Female"))
View(lemur.data)
lemur_data <- read.csv("~/Desktop/C4C-LinearRegression/lemur_data.txt", sep="")
View(lemur_data)
type(lemur.data$sex)
## Let's change some categorical variables to numeric ones for later analysis
## Males = 0
## Females = 1
lemur.data$sex[lemur.data$sex == "Male"] <- "0"
## 0.2 Load dataset
lemur.data <- read.table("lemur_data.txt", stringsAsFactors = F, header = T)
## Let's change some categorical variables to numeric ones for later analysis
## Males = 0
## Females = 1
lemur.data$sex[lemur.data$sex == "Male"] <- "0"
lemur.data$sex[lemur.data$sex == "Female"] <- "1"
# Sex and Age
boxplot(tail~sex)
plot(age, tail, col=c('blue','red')[sex],pch=19)
plot(sickGIparasites, col="grey", main="Infection status")
plot(age, tail, col=c('blue','red')[sex],pch=19)
abline(lm(tail[sex=='Female']~age[sex=='Female']), col='blue',lwd=2)
abline(lm(tail[sex=='Male']~age[sex=='Male']), col='red',lwd=2)
plot(age, tail, col=c('blue','red')[lemur.data$sex],pch=19)
abline(lm(tail[sex=='Female']~age[sex=='Female']), col='blue',lwd=2)
plot(age, tail, col=c('blue','red')[lemur.data$sex],pch=19)
## 0.2 Load dataset
lemur.data <- read.table("lemur_data.txt", stringsAsFactors = F, header = T)
## 0.2 Load dataset
lemur.data <- read.table("lemur_data.txt", stringsAsFactors = F, header = T)
## 0.3 Explore the data
# print the top 10 lines
head(lemur.data)
## 0.4 let's check our out outcome variable of interest - lemur height
hist(height, col='grey')
# access the variables in the dataframe without having to call the dataframe
# i.e. can print 'height' intead of 'lemur.data$height'
attach(lemur.data)
## 0.4 let's check our out outcome variable of interest - lemur height
hist(height, col='grey')
boxplot(height, ylab='Height (cm)')
## 1.1 AGE
par(mfrow=c(1,1))
plot(age, height, pch=19, main = "Age & Height")
m1=lm(height~age, data=lemur.data)
abline(m1, col = 'red')
summary(m1)
# 2.1 Sex and Age
boxplot(height~sex)
plot(age, height, col=c('blue','red')[lemur.data$sex],pch=19)
abline(lm(height[sex=='Female']~age[sex=='Female']), col='blue',lwd=2)
abline(lm(height[sex=='Male']~age[sex=='Male']), col='red',lwd=2)
plot(age, height, col=c('blue','red')[sex],pch=19)
plot(age, height, col=sex,pch=19)
View(lemur.data)
ggplot(lemur.data,aes(x=age,y=height,col=sex))+geom_point()
library(tidyverse)
ggplot(lemur.data,aes(x=age,y=height,col=sex))+geom_point()
## 4.2 Number of Parasites
# The variable "Number of parasites" is count data and it's poisson distributed.
# This type of variable is typically modeled with poisson models
m6=glm(GIparasites~age+sex++malaria, family='poisson', data=lemur.data)
## 4.2 Number of Parasites
# The variable "Number of parasites" is count data and it's poisson distributed.
# This type of variable is typically modeled with poisson models
m6=glm(GIparasites~age+sex+malaria, family='poisson', data=lemur.data)
## 4.2 Stepwise selection
m7=step(m6)
summary(m7)
#----------------------------------------------
# 5. Back-transformation of estimates for GLMs
#----------------------------------------------
# Remember that each family has a certain link to associate the linear predictor with the response variable
# To be able to interpret the results given in the model output, we need to back-transform them
# Let's look at the intercept of our poisson model of GI parasites and one coefficient
m7tab=summary(m7)$coefficients
intercept=exp(m7tab[1,1]) ; intercept
effet.sex=exp(m7tab[2,1]) ; effet.sexe
effet.sex=exp(m7tab[2,1]) ; effet.sexe
effet.sex=exp(m7tab[2,1]) ; effet.sex
summary(m7)
## 4.3 Diagnostics
plot(m7)
library(tidyverse)
## 0.2 Load dataset
lemur.data <- read.table("lemur_data.txt", stringsAsFactors = F, header = T)
## 0.3 Explore the data
# print the top 10 lines
head(lemur.data)
# examine dimensions of dataframe
dim(lemur.data)
# print column names
names(lemur.data)
# basic stats on each variable
summary(lemur.data)
# list type of each variable
str(lemur.data)
# access the variables in the dataframe without having to call the dataframe
# i.e. can print 'height' intead of 'lemur.data$height'
attach(lemur.data)
dettach(lemur.data)
detach(lemur.data)
# access the variables in the dataframe without having to call the dataframe
# i.e. can print 'height' intead of 'lemur.data$height'
attach(lemur.data)
## 0.4 let's check our out outcome variable of interest - lemur height
hist(lemur.data$height, col='grey')
## 0.2 Load dataset
data <- read.table("lemur_data.txt", stringsAsFactors = F, header = T)
## 0.4 let's check our out outcome variable of interest - lemur height
hist(data$height, col='grey')
boxplot(data$height, ylab='Height (cm)')
plot(age, height, pch=19, main = "Age & Height")
m1=lm(height~age, data=data)
abline(m1, col = 'red')
summary(m1)
# 2.1 Sex and Age
boxplot(height~sex)
plot(age, height, col=c('blue','red')[sex],pch=19)
detach(lemur.data)
detach(lemur.data)
## 0.2 Load dataset
lemur.data <- read.table("lemur_data.txt", stringsAsFactors = F, header = T)
# access the variables in the dataframe without having to call the dataframe
# i.e. can print 'height' intead of 'lemur.data$height'
attach(lemur.data)
## 0.4 let's check our out outcome variable of interest - lemur height
hist(height, col='grey')
boxplot(height, ylab='Height (cm)')
## Let's change some categorical variables to numeric ones for later analysis
## Males = 0
## Females = 1
lemur.data$sex[lemur.data$sex == "Male"] <- "0"
## 0.2 Load dataset
lemur.data <- read.table("lemur_data.txt", stringsAsFactors = F, header = T)
# access the variables in the dataframe without having to call the dataframe
# i.e. can print 'height' intead of 'lemur.data$height'
attach(lemur.data)
detach(lemur.data)
# access the variables in the dataframe without having to call the dataframe
# i.e. can print 'height' intead of 'lemur.data$height'
attach(lemur.data)
detach(lemur.data)
## 0.2 Load dataset
lemur.data <- read.table("lemur_data.txt", stringsAsFactors = F, header = T)
# access the variables in the dataframe without having to call the dataframe
# i.e. can print 'height' intead of 'lemur.data$height'
attach(lemur.data)
## 0.4 let's check our out outcome variable of interest - lemur height
hist(height, col='grey')
boxplot(height, ylab='Height (cm)')
## 1.1 Age
par(mfrow=c(1,1))
plot(age, height, pch=19, main = "Age & Height")
m1=lm(height~age, data=lemur.data)
abline(m1, col = 'red')
summary(m1)
# 2.1 Sex and Age
boxplot(height~sex)
plot(age, height, col=c('blue','red')[sex],pch=19)
abline(lm(height[sex=='Female']~age[sex=='Female']), col='blue',lwd=2)
abline(lm(height[sex=='Male']~age[sex=='Male']), col='red',lwd=2)
m3=lm(height~age+sex, data=lemur.data)
summary(m3)
# 2.2 Biggest multivariate of them all -- all variables!
m4=lm(height~age+sex+GIparasites+malaria, data=lemur.data)
summary(m4)
#-----------------------------------
# 3. MODEL SELECTION & DIAGNOSTICS
#-----------------------------------
## 3.1 Stepwise selection
# Now that we've built many different linear models, we can perform model selection to choose the best
# Stepwise selection to exclude variables that do not help explain the outcome variable
# perform on model with all variables
m5=step(m4)
# this is the best-fit model -- it says that out of all of our predictors, age, sex, and presence of parasites together are most predictive of height length
summary(m5)
## 3.2 Model diagnostics
# remember, we need to make sure it doesn't violate any of the assumptions!
par(mfrow = c(2,2))
plot(m5)
#--------------------------------------#
#   4. GENERALIZED LINEAR REGRESSION
#--------------------------------------#
## 4.1 Examine data
# Instead of height, let's explore determinants of parasite numbers and of infectious status
# For this, we need to understand first what type of outcome variable we're dealing with and the most appropriate model
par(mfrow=c(1,1))
hist(GIparasites, col='grey')
plot(sickGIparasites, col="grey", main="Infection status")
## 4.2 Number of Parasites
# The variable "Number of parasites" is count data and it's poisson distributed.
# This type of variable is typically modeled with poisson models
m6=glm(GIparasites~age+sex++malaria, family='poisson', data=lemur.data)
summary(m6)
## 4.2 Stepwise selection
m7=step(m6)
summary(m7)
## 4.3 Diagnostics
par(mfrow=c(2,2))
plot(m7)
## 4.3 Backtransform
# Remember that each family has a certain link to associate the linear predictor with the response variable
# To be able to interpret the results given in the model output, we need to back-transform them
# Let's look at the intercept of our poisson model of GI parasites and one coefficient
m7tab=summary(m7)$coefficients
intercept=exp(m7tab[1,1]) ; intercept
effet.sex=exp(m7tab[2,1]) ; effet.sex
plot(age, height, col=factor(sex),pch=19)
legend("topleft",
legend = levels(factor(sex)),
pch = 19,
col = factor(levels(factor(sex))))
# 2.1 Sex and Age
boxplot(height~sex)
plot(age, height, col=factor(sex),pch=19)
legend("topleft",
legend = levels(factor(sex)),
pch = 19,
col = factor(levels(factor(sex))))
abline(lm(height[sex=='Female']~age[sex=='Female']), col='black',lwd=2)
abline(lm(height[sex=='Male']~age[sex=='Male']), col='red',lwd=2)
hist(GIparasites, col='grey')
plot(sickGIparasites, col="grey", main="Infection status")
hist(sickGIparasites, col="grey", main="Infection status")
plot(sickGIparasites, col="grey", main="Infection status")
barplot(sickGIparasites, col="grey", main="Infection status")
boxplot(sickGIparasites, col="grey", main="Infection status")
plot(as.factor(sickGIparasites), col="grey", main="Infection status")
hist(GIparasites, col='grey')
plot(as.factor(sickGIparasites), col="grey", main="Infection status")
detach(lemur.data)
lemur_data <- read.csv("~/Desktop/C4C-LinearRegression/lemur_data.txt", sep="")
View(lemur_data)
#----------------------------------#
#   0. CREATE WORK ENVIRONMENT
#----------------------------------#
## 0.1 check that you are working in the right directory ("C4C-Linear Regression")
getwd()
## 0.2 Load dataset
lemur.data <- read.table("lemur_data.txt", stringsAsFactors = F, header = T)
## 0.3 Explore the data
# print the top 10 lines
head(lemur.data)
# examine dimensions of dataframe
dim(lemur.data)
# print column names
names(lemur.data)
# basic stats on each variable
summary(lemur.data)
# list type of each variable
str(lemur.data)
# access the variables in the dataframe without having to call the dataframe
# i.e. can print 'height' intead of 'lemur.data$height'
attach(lemur.data)
## 0.4 let's check our out outcome variable of interest - lemur height
hist(height, col='grey')
boxplot(height, ylab='Height (cm)')
## 1.1 Age
par(mfrow=c(1,1))
plot(age, height, pch=19, main = "Age & Height")
m1=lm(height~age, data=lemur.data)
abline(m1, col = 'red')
summary(m1)
# 2.1 Sex and Age
boxplot(height~sex)
plot(age, height, col=factor(sex),pch=19)
legend("topleft",
legend = levels(factor(sex)),
pch = 19,
col = factor(levels(factor(sex))))
abline(lm(height[sex=='Female']~age[sex=='Female']), col='black',lwd=2)
abline(lm(height[sex=='Male']~age[sex=='Male']), col='red',lwd=2)
m3=lm(height~age+sex, data=lemur.data)
summary(m3)
# 2.2 Biggest multivariate of them all -- all variables!
m4=lm(height~age+sex+GIparasites+malaria, data=lemur.data)
summary(m4)
#-----------------------------------
# 3. MODEL SELECTION & DIAGNOSTICS
#-----------------------------------
## 3.1 Stepwise selection
# Now that we've built many different linear models, we can perform model selection to choose the best
# Stepwise selection to exclude variables that do not help explain the outcome variable
# perform on model with all variables
m5=step(m4)
# this is the best-fit model -- it says that out of all of our predictors, age, sex, and presence of parasites together are most predictive of height length
summary(m5)
## 3.2 Model diagnostics
# remember, we need to make sure it doesn't violate any of the assumptions!
par(mfrow = c(2,2))
plot(m5)
#--------------------------------------#
#   4. GENERALIZED LINEAR REGRESSION
#--------------------------------------#
## 4.1 Examine data
# Instead of height, let's explore determinants of parasite numbers and of infectious status
# For this, we need to understand first what type of outcome variable we're dealing with and the most appropriate model
par(mfrow=c(1,1))
hist(GIparasites, col='grey')
plot(as.factor(sickGIparasites), col="grey", main="Infection status")
## 4.2 Number of Parasites
# The variable "Number of parasites" is count data and it's poisson distributed.
# This type of variable is typically modeled with poisson models
m6=glm(GIparasites~age+sex+malaria, family='poisson', data=lemur.data)
summary(m6)
## 4.2 Stepwise selection
m7=step(m6)
summary(m7)
## 4.3 Backtransform
# Remember that each family has a certain link to associate the linear predictor with the response variable
# To be able to interpret the results given in the model output, we need to back-transform them
# Let's look at the intercept of our poisson model of GI parasites and one coefficient
m7tab=summary(m7)$coefficients
intercept=exp(m7tab[1,1]) ; intercept
effet.sex=exp(m7tab[2,1]) ; effet.sex
#-----------------------------------
# 3. MODEL SELECTION & DIAGNOSTICS
#-----------------------------------
## 3.1 Stepwise selection
# Now that we've built many different linear models, we can perform model selection to choose the best
# Stepwise selection to exclude variables that do not help explain the outcome variable
# perform on model with all variables
m5=step(m4)
#-----------------------------------
# 3. MODEL SELECTION & DIAGNOSTICS
#-----------------------------------
## 3.1 Stepwise selection
# Now that we've built many different linear models, we can perform model selection to choose the best
# Stepwise selection to exclude variables that do not help explain the outcome variable
# let's start from the null model and work our way up to all predictors
m5 <- step(intercept_only, direction='forward', scope=formula(m4), trace=0)
#
?lemur.data
#-----------------------------------
# 3. MODEL SELECTION & DIAGNOSTICS
#-----------------------------------
## 3.1 Stepwise selection
# Now that we've built many different linear models, we can perform model selection to choose the best
# Stepwise selection to exclude variables that do not help explain the outcome variable
# Step function : choose a model by AIC in a stepwise algorithm
# the default is 'both', but you can also select 'backward' or 'forward' selection
# start with full model
m5=step(m4)
