################################################################################
### R BASICS WORKSHOP                                                        ###
### PRESENTATION 7: DATA GENERATION                                          ###
###                                                                          ###
### Center for Conservation and Sustainable Development                      ###
### Missouri Botanical Garden                                                ###
### Website: rbasicsworkshop.weebly.com                                      ###
################################################################################

# The material below is largely based on section 3.4 of "R for beginners". It is
# divided into three parts:

#	A) Why would you want to generate data with R?
#	B) Generating regular sequences
#	C) Generating random sequences
#	D) Examples of simulations that combine regular and random sequences

# This presentation will cover sections A through C and the first example under D.
# After the presentation, participants will study examples 2-5 under D.

#################################################################################################################################
# A) Why would you want to generate data with R?
#################################################################################################################################

#1. Publish papers based on fake data

#This is a way to mischievously advance your career: new theories may
#be accepted because their supporters made use of any trick - rational,
#rhetorical or ribald - in order to advance their cause
#(P. K. Feyerabend, 1975, "Against Method"), see:
#https://plato.stanford.edu/entries/feyerabend/

#Two examples:
#a) see in the workshop's website the pdf copy of a paper by Staple et al. (2011);
#   if interested, see  a piece featuring Staple's scientific fraud in the New York
#   Times Magazine, published in 2013 under the title "The Mind of a Con Man". 
#b) yet another retraction of a paper published in a high-end journal:
#   http://www.sciencemag.org/content/346/6215/1366
#   https://osf.io/preprints/metaarxiv/qy2se/

#2. Explore theoretical models (numerically and graphically)


#3. Conduct computer simulations


#################################################################################################################################
# B) Generating regular sequences
#################################################################################################################################

#operator ":"
x <- 1:30
x
#notice operator precedence
1:10-1
1:(10-1)

#funtion c
c(1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5)

#function rep
?rep
rep(1, 30)
rep(1:4, times = 3)
rep(1:4, times = c(3, 2, 1, 5))
rep(1:4, length.out = 30)
rep(1:4, each = 3) 

#function scan
z <- scan()
z

#function seq
seq(1, 5, 0.5)
seq(length.out=20, from=1, to=5)

#function sequence
sequence(4:5)
sequence(c(10,5))

#function gl (generate factor levels)
gl(3, 5)
gl(3, 5, length=30)
gl(2, 6, label=c("Male", "Female"))
gl(2, 10)
gl(2, 1, length=20)
gl(2, 2, length=20)

#function expand.grid
expand.grid(a=c(60,80), p=c(100, 300), sex=c("Male", "Female"))
expand.grid(a=c(60,80), sex=c("Male", "Female"), p=c(100, 300))
expand.grid(sex=c("Male", "Female"), a=c(60,80), p=c(100, 300))


#################################################################################################################################
# C) Generating random sequences
#################################################################################################################################

#The normal distribution is used as an example,
#but there are many other distributions available (see below)

#random deviates 
rnorm(10)
hist(rnorm(1000))
?rnorm

#functions "pnorm", "dnorm" and "qnorm" are useful to visualize
#and explore the parent distribution from which the data is generated: 

#distribution function
pnorm(q=seq(-4,4,0.1), mean = 0, sd = 1)
plot(seq(-4,4,0.1), pnorm(q=seq(-4,4,0.1), mean = 0, sd = 1), type="o")
pnorm(q=2.7, mean = 0, sd = 1)
pnorm(q=0, mean = 0, sd = 1)

#probability density function
dnorm(x=seq(-4,4,0.1), mean = 0, sd = 1, log = FALSE)
plot(seq(-4,4,0.1), dnorm(x=seq(-4,4,0.1), mean = 0, sd = 1), type="o") 
dnorm(x=2.7, mean = 0, sd = 1, log = FALSE)
dnorm(x=0, mean = 0, sd = 1, log = FALSE)
#A side comment: function "dnorm" and other functions for probability
#densities (e.g., "dpois", "dbinom", "dbeta") are central in maximum likelihood
#estimation. For a brief informal introduction to maximum likelihood estimation
#in R see: http://www.r-bloggers.com/fitting-a-model-by-maximum-likelihood/
#For a more formal, but very accessible, introduction see chapter 6 of
#Bolker, B. 2007. Ecological Models and Data in R. An excellent book, freely
#available here: http://ms.mcmaster.ca/~bolker/emdbook/ 

#quantile function
qnorm(p=0.975, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
qnorm(p=0.5, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)

### some distributions available in R (from section 3.4 of "R for beginners"):
##Gaussian (normal)
#rnorm(n, mean=0, sd=1)
##exponential
#rexp(n, rate=1)
##gamma
#rgamma(n, shape, scale=1)
##Poisson
#rpois(n, lambda)
##Weibull
#rweibull(n, shape, scale=1)
##Cauchy
#rcauchy(n, location=0, scale=1)
##beta
#rbeta(n, shape1, shape2)
##Student t 
#rt(n, df)
##Fisher-Snedecor (F)
#rf(n, df1, df2)
##Pearson chi-squared
#rchisq(n, df)
##binomial
#rbinom(n, size, prob)
##multinomial
#rmultinom(n, size, prob)
##geometric
#rgeom(n, prob)
##hypergeometric
#rhyper(nn, m, n, k)
##logistic
#rlogis(n, location=0, scale=1)
##lognormal
#rlnorm(n, meanlog=0, sdlog=1)
##negative binomial
#rnbinom(n, size, prob)
##uniform
#runif(n, min=0, max=1)
##Wilcoxon's statistics
#rwilcox(nn, m, n)
#rsignrank(nn, n)


#################################################################################################################################
# D) Examples of simulations that combine regular and random sequences
#################################################################################################################################

############
# Example 1
############

# Based on the Brownian model of evolution (Felsenstein, 1985.
# Phylogenies and the comparative method. Am.Nat. 125:1-15), the code below
# generates values for 150 evolutionary trait changes with an evolutionary rate = 1.
# First visualize the parent distribution from which trait changes will be sampled:

plot(seq(-4,4,0.1), dnorm(x=seq(-4,4,0.1), mean = 0, sd = 1), type="o", 
	xlab="Trait change", ylab="Probability density", main="Distribution of trait change",
	bty="n", cex.lab=1.5, cex.axis=1.5, cex.main=2) 

# next generate the trait change data:

change.trait.1 <- rnorm(150, mean=0, sd=1)

# plot the changes across evolutionary time:

plot(1:150, change.trait.1, xlab="Evolutionary time", ylab="Evolutionary change in trait 1", type="l")

# examine the distribution of evolutionary trait change:

hist(change.trait.1, xlab="Evolutionary change in trait 1", breaks=20)

# use function "cumsum" to plot the trait value across evolutionary time, assumming the
# ancestral trait value was zero:

?cumsum
plot(0:150, c(0,cumsum(change.trait.1)), xlab="Evolutionary time", ylab="Trait 1", type="l")

############
# Example 2
############
 
# Increasingly, studies of trait evolution consider models with parameters
# that vary through time (e.g., O�Meara et al. 2006. Testing for different
# rates of continuous trait evolution. Evolution 60: 922�933). Based on the
# Brownian model of evolution, the code below generates values for 150 evolutionary
# trait changes, with an evolutionary rate = 1 during the first 100 time units and
# evolutionary rate = 16 during the last 50 time units. First visualize the parent
# distributions from which trait changes will be sampled:

plot(seq(-8,8,0.1), dnorm(x=seq(-8,8,0.1), mean = 0, sd = 1), type="o", 
	xlab="Trait change", ylab="Probability density", main="Distributions of trait change",
	bty="n", cex.lab=1.5, cex.axis=1.5, cex.main=2) 
points(seq(-8,8,0.1), dnorm(x=seq(-8,8,0.1), mean = 0, sd = 4), type="o", col="red") 

# next generate the trait change data:

change.trait.2 <- rnorm(150, mean=0, sd=rep(c(1,4), times=c(100,50)))

# plot the changes across evolutionary time:

plot(1:150, change.trait.2, xlab="Evolutionary time", ylab="Evolutionary change in trait 2", type="l")

# add a vertical line at the time where the evolutionary rate changed:

abline(v=100, lty=2, col="red")

# examine the distribution of evolutionary trait change:

hist(change.trait.2, xlab="Evolutionary change in trait 2", breaks=20)

# use function "cumsum" to plot the trait value across evolutionary time, assumming the
# ancestral trait value was zero:

plot(0:150, c(0,cumsum(change.trait.2)), xlab="Evolutionary time", ylab="Trait 2", type="l")

# add a vertical line at the time where the evolutionary rate changed:

abline(v=100, lty=2, col="red")

############
# Example 3
############

# Beyond changes in the rate of evolution, the mode of evolution may shift through time
# (Hunt, G. et al. 2015. Simple versus complex models of trait evolution and stasis as
# a response to environmental change. PNAS 112: 4885�4890). This example models a shift
# in the mode of evolution: from a Brownian model of evolution to a "General Random Walk",
# abbreviated GRW (Hunt, G. 2008. Fitting and Comparing Models of Phyletic Evolution:
# Random Walks and beyond. Paleobiology 32: 578-601). The Brownian model is non-directional
# because the mean of trait change is zero. In contrast, the GRW can describe evolutionary
# trends, because the mean of trait change is not zero. The code below generates values for
# 150 evolutionary trait changes with an evolutionary rate = 1, and an evolutionary trend
# starting 60 time units ago, characterized by a mean of trait change = 2. First visualize
# the parent distributions from which trait changes will be sampled:

plot(seq(-4,6,0.1), dnorm(x=seq(-4,6,0.1), mean = 0, sd = 1), type="o", 
	xlab="Trait change", ylab="Probability density", main="Distributions of trait change",
	bty="n", cex.lab=1.5, cex.axis=1.5, cex.main=2) 
points(seq(-4,6,0.1), dnorm(x=seq(-4,6,0.1), mean = 2, sd = 1), type="o", col="red")

# next generate the trait change data:

change.trait.3 <- rnorm(150, mean=rep(c(0,2), times=c(90,60)), sd=1)

# plot the changes across evolutionary time:

plot(1:150, change.trait.3, xlab="Evolutionary time", ylab="Evolutionary change in trait 3", type="l")

# add a vertical line at the time where the mode of evolution changed:

abline(v=90, lty=2, col="red")

# examine the distribution of evolutionary trait change:

hist(change.trait.3, xlab="Evolutionary change in trait 3", breaks=20)

# use function "cumsum" to plot the trait value across evolutionary time, assumming the
# ancestral trait value was zero:

plot(0:150, c(0,cumsum(change.trait.3)), xlab="Evolutionary time", ylab="Trait 3", type="l")

# add a vertical line at the time where the mode of evolution changed:

abline(v=90, lty=2, col="red")

############
# Example 4 
############

# In the spatial analysis of "point patterns", the standard model for
# "complete spatial randomness" (CSR) is commonly used as a baseline hypothesis.
# In that model points follow a homogeneous Poisson process over a study region,
# so that the number of observed points in the study region is described by a
# Poisson distribution. To simulate a spatial point patter based on this model
# (following Bailey and Gatrell, 1995, "Interactive spatial data analysis", Prentice Hall),
# imagine a rectangular study region of 10 x 10 spatial units, and examine
# the Poisson distribution to decide which value of "lambda" you would like to use:

?dpois
plot(seq(0,10,1), dpois(seq(0,10,1), lambda=1), type="o", xlab= "Points in the study region")
plot(seq(0,10,1), ppois(seq(0,10,1), lambda=1), type="o", xlab= "Points in the study region")
ppois(0, lambda=1)

# next draw a value from the Poisson distribution, using the value of "lambda" you chose:

number.obs.points <- rpois(1, lambda=25)
number.obs.points

# now carry on the simulation by sampling spatial coordinates for points across
# the 10 x 10 study region:

x.coor <- runif(number.obs.points, min = 0, max = 10)
y.coor <- runif(number.obs.points, min = 0, max = 10)
plot(x.coor, y.coor, xlim=c(0,10), ylim=c(0,10), pch=19)

############
# Example 5
############

# This example places the code in Example 4 within a loop (more specifically, a "for loop").
# In day 4 of the workshop we will study loops in detail, for now you may want to informally
# examine the code. Note how the loop starts with this line of code:
# for (i in 1:100){
# and finishes with a curly bracket in the last line of code:
# } 
# Make sure to select all the code below, from the starting line to the final curly bracket,
# and run it all at once. If you want to stop the loop before it finishes the 100 iterations,
# you can press the "escape" key. 
 
for (i in 1:100){
	# next draw a value from the Poisson distribution, using the value of "lambda" you chose:
	number.obs.points <- rpois(1, lambda=25)
	# now carry on the simulation by sampling spatial coordinates for points across
	# the 10 x 10 study region:
	x.coor <- runif(number.obs.points, min = 0, max = 10)
	y.coor <- runif(number.obs.points, min = 0, max = 10)
	plot(x.coor, y.coor, xlim=c(0,10), ylim=c(0,10), pch=19)
	Sys.sleep(1) #this slows down the computer, so that there is more time to look at the plot in each iteration
}