################################################################################
### R BASICS WORKSHOP                                                        ###
### PRESENTATION 7: DATA GENERATION                                          ###
###                                                                          ###
### Center for Conservation and Sustainable Development                      ###
### Missouri Botanical Garden                                                ###
### Website: rbasicsworkshop.weebly.com                                      ###
################################################################################

# The material below is largely based on section 3.4 of "R for beginners". It is
# divided into four parts:
# A) Why would you want to generate data with R?
# B) Generating regular sequences
# C) Generating random sequences
# D) Examples of simulations that combine regular and random sequences

# This presentation will cover sections A through C and the first example under D.
# After the presentation, participants will study examples 2-5 under D.

################################################################################
# A) Why would you want to generate data with R?
################################################################################

#1. Publish papers based on fake data
#This is a way to mischievously advance your career: new theories may
#be accepted because their supporters made use of any trick - rational,
#rhetorical or ribald - in order to advance their cause
#(P. K. Feyerabend, 1975, "Against Method"); see:
#https://plato.stanford.edu/entries/feyerabend/

#Two examples:
#a) see on the workshop's website the pdf copy of a paper by Stapel et al. (2011);
#   if interested, see a piece featuring Stapel's scientific fraud in the New York
#   Times Magazine, published in 2013 under the title "The Mind of a Con Man".
#b) yet another retraction of a paper published in a high-end journal:
#   http://www.sciencemag.org/content/346/6215/1366
#   https://osf.io/preprints/metaarxiv/qy2se/

#2. Explore theoretical models (numerically and graphically); see the small
#   sketch below for an example.

#3. Conduct computer simulations
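
# A small, illustrative sketch (not part of "R for beginners") of exploring a
# theoretical model graphically: logistic population growth,
# N(t) = K / (1 + ((K - N0)/N0) * exp(-r*t)). The parameter values below are
# arbitrary, chosen only for illustration.
K <- 100    # carrying capacity
N0 <- 2     # initial population size
r <- 0.5    # intrinsic growth rate
curve(K / (1 + ((K - N0)/N0) * exp(-r*x)), from=0, to=30,
      xlab="Time", ylab="Population size", main="Logistic growth")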
################################################################################
# B) Generating regular sequences
################################################################################

#operator ":"
x <- 1:30
x

#notice operator precedence
1:10-1
1:(10-1)

#function c
c(1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5)

#function rep
?rep
rep(1, 30)
rep(1:4, times = 3)
rep(1:4, times = c(3, 2, 1, 5))
rep(1:4, length.out = 30)
rep(1:4, each = 3)

#function scan
z <- scan()
z

#function seq
seq(1, 5, 0.5)
seq(length.out=20, from=1, to=5)

#function sequence
sequence(4:5)
sequence(c(10,5))

#function gl (generate factor levels)
gl(3, 5)
gl(3, 5, length=30)
gl(2, 6, labels=c("Male", "Female"))
gl(2, 10)
gl(2, 1, length=20)
gl(2, 2, length=20)

#function expand.grid
expand.grid(a=c(60,80), p=c(100, 300), sex=c("Male", "Female"))
expand.grid(a=c(60,80), sex=c("Male", "Female"), p=c(100, 300))
expand.grid(sex=c("Male", "Female"), a=c(60,80), p=c(100, 300))

################################################################################
# C) Generating random sequences
################################################################################

#The normal distribution is used as an example,
#but there are many other distributions available (see below)

#random deviates
rnorm(10)
hist(rnorm(1000))
?rnorm

#functions "pnorm", "dnorm" and "qnorm" are useful to visualize
#and explore the parent distribution from which the data are generated:

#distribution function
pnorm(q=seq(-4,4,0.1), mean = 0, sd = 1)
plot(seq(-4,4,0.1), pnorm(q=seq(-4,4,0.1), mean = 0, sd = 1), type="o")
pnorm(q=2.7, mean = 0, sd = 1)
pnorm(q=0, mean = 0, sd = 1)

#probability density function
dnorm(x=seq(-4,4,0.1), mean = 0, sd = 1, log = FALSE)
plot(seq(-4,4,0.1), dnorm(x=seq(-4,4,0.1), mean = 0, sd = 1), type="o")
dnorm(x=2.7, mean = 0, sd = 1, log = FALSE)
dnorm(x=0, mean = 0, sd = 1, log = FALSE)

#A side comment: function "dnorm" and other functions for probability
#densities (e.g., "dpois", "dbinom", "dbeta") are central in maximum likelihood
#estimation. For a brief informal introduction to maximum likelihood estimation
#in R see: http://www.r-bloggers.com/fitting-a-model-by-maximum-likelihood/
#For a more formal, but very accessible, introduction see chapter 6 of
#Bolker, B. 2007. Ecological Models and Data in R. An excellent book, freely
#available here: http://ms.mcmaster.ca/~bolker/emdbook/
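
# A minimal sketch (not part of "R for beginners") of maximum likelihood
# estimation with "dnorm" and "optim"; the simulated data and starting values
# below are arbitrary, chosen only for illustration:
set.seed(1)                              # makes the random draws reproducible
obs <- rnorm(100, mean=5, sd=2)          # simulated observations
negloglik <- function(par) -sum(dnorm(obs, mean=par[1], sd=par[2], log=TRUE))
optim(par=c(0, 1), fn=negloglik, method="L-BFGS-B",
      lower=c(-Inf, 0.0001))$par         # estimates should be close to 5 and 2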
#quantile function
qnorm(p=0.975, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
qnorm(p=0.5, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)

### some distributions available in R (from section 3.4 of "R for beginners"):
##Gaussian (normal)
#rnorm(n, mean=0, sd=1)
##exponential
#rexp(n, rate=1)
##gamma
#rgamma(n, shape, scale=1)
##Poisson
#rpois(n, lambda)
##Weibull
#rweibull(n, shape, scale=1)
##Cauchy
#rcauchy(n, location=0, scale=1)
##beta
#rbeta(n, shape1, shape2)
##Student t
#rt(n, df)
##Fisher-Snedecor (F)
#rf(n, df1, df2)
##Pearson chi-squared
#rchisq(n, df)
##binomial
#rbinom(n, size, prob)
##multinomial
#rmultinom(n, size, prob)
##geometric
#rgeom(n, prob)
##hypergeometric
#rhyper(nn, m, n, k)
##logistic
#rlogis(n, location=0, scale=1)
##lognormal
#rlnorm(n, meanlog=0, sdlog=1)
##negative binomial
#rnbinom(n, size, prob)
##uniform
#runif(n, min=0, max=1)
##Wilcoxon's statistics
#rwilcox(nn, m, n)
#rsignrank(nn, n)

################################################################################
# D) Examples of simulations that combine regular and random sequences
################################################################################

############
# Example 1
############

# Based on the Brownian model of evolution (Felsenstein, 1985.
# Phylogenies and the comparative method. Am. Nat. 125:1-15), the code below
# generates values for 150 evolutionary trait changes with an evolutionary rate = 1.

# First visualize the parent distribution from which trait changes will be sampled:
plot(seq(-4,4,0.1), dnorm(x=seq(-4,4,0.1), mean = 0, sd = 1), type="o",
     xlab="Trait change", ylab="Probability density",
     main="Distribution of trait change", bty="n",
     cex.lab=1.5, cex.axis=1.5, cex.main=2)

# next generate the trait change data:
change.trait.1 <- rnorm(150, mean=0, sd=1)

# plot the changes across evolutionary time:
plot(1:150, change.trait.1, xlab="Evolutionary time",
     ylab="Evolutionary change in trait 1", type="l")

# examine the distribution of evolutionary trait change:
hist(change.trait.1, xlab="Evolutionary change in trait 1", breaks=20)

# use function "cumsum" to plot the trait value across evolutionary time, assuming the
# ancestral trait value was zero:
?cumsum
plot(0:150, c(0,cumsum(change.trait.1)), xlab="Evolutionary time",
     ylab="Trait 1", type="l")
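
# A small, illustrative extension of Example 1 (not part of the original
# material): simulate several independent Brownian trajectories and plot them
# together, to see how variation among lineages accumulates through time.
# The number of replicates (20) is arbitrary.
several.walks <- replicate(20, c(0, cumsum(rnorm(150, mean=0, sd=1))))
matplot(0:150, several.walks, type="l", lty=1, col="grey40",
        xlab="Evolutionary time", ylab="Trait value",
        main="20 independent Brownian trajectories")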
First visualize the parent # distributions from which trait changes will be sampled: plot(seq(-8,8,0.1), dnorm(x=seq(-8,8,0.1), mean = 0, sd = 1), type="o", xlab="Trait change", ylab="Probability density", main="Distributions of trait change", bty="n", cex.lab=1.5, cex.axis=1.5, cex.main=2) points(seq(-8,8,0.1), dnorm(x=seq(-8,8,0.1), mean = 0, sd = 4), type="o", col="red") # next generate the trait change data: change.trait.2 <- rnorm(150, mean=0, sd=rep(c(1,4), times=c(100,50))) # plot the changes across evolutionary time: plot(1:150, change.trait.2, xlab="Evolutionary time", ylab="Evolutionary change in trait 2", type="l") # add a vertical line at the time where the evolutionary rate changed: abline(v=100, lty=2, col="red") # examine the distribution of evolutionary trait change: hist(change.trait.2, xlab="Evolutionary change in trait 2", breaks=20) # use function "cumsum" to plot the trait value across evolutionary time, assumming the # ancestral trait value was zero: plot(0:150, c(0,cumsum(change.trait.2)), xlab="Evolutionary time", ylab="Trait 2", type="l") # add a vertical line at the time where the evolutionary rate changed: abline(v=100, lty=2, col="red") ############ # Example 3 ############ # Beyond changes in the rate of evolution, the mode of evolution may shift through time # (Hunt, G. et al. 2015. Simple versus complex models of trait evolution and stasis as # a response to environmental change. PNAS 112: 4885�4890). This example models a shift # in the mode of evolution: from a Brownian model of evolution to a "General Random Walk", # abbreviated GRW (Hunt, G. 2008. Fitting and Comparing Models of Phyletic Evolution: # Random Walks and beyond. Paleobiology 32: 578-601). The Brownian model is non-directional # because the mean of trait change is zero. In contrast, the GRW can describe evolutionary # trends, because the mean of trait change is not zero. The code below generates values for # 150 evolutionary trait changes with an evolutionary rate = 1, and an evolutionary trend # starting 60 time units ago, characterized by a mean of trait change = 2. First visualize # the parent distributions from which trait changes will be sampled: plot(seq(-4,6,0.1), dnorm(x=seq(-4,6,0.1), mean = 0, sd = 1), type="o", xlab="Trait change", ylab="Probability density", main="Distributions of trait change", bty="n", cex.lab=1.5, cex.axis=1.5, cex.main=2) points(seq(-4,6,0.1), dnorm(x=seq(-4,6,0.1), mean = 2, sd = 1), type="o", col="red") # next generate the trait change data: change.trait.3 <- rnorm(150, mean=rep(c(0,2), times=c(90,60)), sd=1) # plot the changes across evolutionary time: plot(1:150, change.trait.3, xlab="Evolutionary time", ylab="Evolutionary change in trait 3", type="l") # add a vertical line at the time where the mode of evolution changed: abline(v=90, lty=2, col="red") # examine the distribution of evolutionary trait change: hist(change.trait.3, xlab="Evolutionary change in trait 3", breaks=20) # use function "cumsum" to plot the trait value across evolutionary time, assumming the # ancestral trait value was zero: plot(0:150, c(0,cumsum(change.trait.3)), xlab="Evolutionary time", ylab="Trait 3", type="l") # add a vertical line at the time where the mode of evolution changed: abline(v=90, lty=2, col="red") ############ # Example 4 ############ # In the spatial analysis of "point patterns", the standard model for # "complete spatial randomness" (CSR) is commonly used as a baseline hypothesis. 
############
# Example 4
############

# In the spatial analysis of "point patterns", the standard model for
# "complete spatial randomness" (CSR) is commonly used as a baseline hypothesis.
# In that model points follow a homogeneous Poisson process over a study region,
# so that the number of observed points in the study region is described by a
# Poisson distribution. To simulate a spatial point pattern based on this model
# (following Bailey and Gatrell, 1995, "Interactive spatial data analysis", Prentice Hall),
# imagine a rectangular study region of 10 x 10 spatial units, and examine
# the Poisson distribution to decide which value of "lambda" you would like to use:
?dpois
plot(seq(0,10,1), dpois(seq(0,10,1), lambda=1), type="o",
     xlab="Points in the study region")
plot(seq(0,10,1), ppois(seq(0,10,1), lambda=1), type="o",
     xlab="Points in the study region")
ppois(0, lambda=1)

# next draw a value from the Poisson distribution, using the value of "lambda" you chose:
number.obs.points <- rpois(1, lambda=25)
number.obs.points

# now carry on the simulation by sampling spatial coordinates for points across
# the 10 x 10 study region:
x.coor <- runif(number.obs.points, min = 0, max = 10)
y.coor <- runif(number.obs.points, min = 0, max = 10)
plot(x.coor, y.coor, xlim=c(0,10), ylim=c(0,10), pch=19)

############
# Example 5
############

# This example places the code in Example 4 within a loop (more specifically, a "for loop").
# In day 4 of the workshop we will study loops in detail; for now you may want to informally
# examine the code. Note how the loop starts with this line of code:
# for (i in 1:100){
# and finishes with a curly bracket in the last line of code:
# }
# Make sure to select all the code below, from the starting line to the final curly bracket,
# and run it all at once. If you want to stop the loop before it finishes the 100 iterations,
# you can press the "escape" key.

for (i in 1:100){
  # draw a value from the Poisson distribution, using the value of "lambda" you chose:
  number.obs.points <- rpois(1, lambda=25)
  # sample spatial coordinates for points across the 10 x 10 study region:
  x.coor <- runif(number.obs.points, min = 0, max = 10)
  y.coor <- runif(number.obs.points, min = 0, max = 10)
  plot(x.coor, y.coor, xlim=c(0,10), ylim=c(0,10), pch=19)
  Sys.sleep(1) # pause for one second, so that there is time to look at the plot in each iteration
}
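
# A small, illustrative follow-up (not part of the original material): instead
# of plotting each realization, store the number of points simulated in each of
# 1000 realizations and compare their distribution to the Poisson distribution
# used to generate them; the number of realizations (1000) is arbitrary.
points.per.realization <- rpois(1000, lambda=25)
hist(points.per.realization, freq=FALSE,
     breaks=seq(-0.5, max(points.per.realization) + 0.5, by=1),
     xlab="Points in the study region", main="Simulated vs. theoretical")
points(0:50, dpois(0:50, lambda=25), type="o", col="red")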