################################################################################ ### R BASICS WORKSHOP ### ### PRESENTATION 7: DATA GENERATION ### ### ### ### Center for Conservation and Sustainable Development ### ### Missouri Botanical Garden ### ### Website: rbasicsworkshop.weebly.com ### ################################################################################ ### A. WHY GENERATE DATA IN R? ################################################# # 1. Faking data for publications (the easy way to Science or Nature) # e.g.: https://en.wikipedia.org/wiki/Diederik_Stapel # 2. Exploring theoretical models (numerically or graphically) # 3. Generating computer simulations # B. CREATING REGULAR SEQUENCES ################################################ # Function *c* - concatenate x <- c(1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5) x # Operator *:* - sequence of integers x <- 1:30 x ## IMPORTANT: Pay attention to the priority of operators 1:10-1 # This is not the same as this: 1:(10-1) # Function: *rep* - repeat rep(1, 30) rep(1:4, times=3) rep(1:4, each = 3) # Function *seq* - sequence seq(1, 5, 0.5) seq(length = 9, from = 1, to = 5) # Function *sequence* - vector of sequences sequence(nvec = 4:6) sequence(nvec = c(10.5)) # Function: *gl* - generate levels of a factor gl(n = 3, k = 5) gl(3, 5, length = 30) gl(2, 6, label = c("Male", "Female")) gl(2, 10) gl(2, 1, length = 20) gl(2, 2, length = 20) # Function *expand.grid* - combine levels expand.grid(a = c(60, 80), p = c(100, 300), sex = c("Male", "Female")) ### C. GENERATING RANDOM SEQUENCES ############################################# # Function *rnorm* - random values of a normal distribution x.norm <- rnorm(n = 1000, mean = 0, sd = 1) x.norm hist(x.norm) ?rnorm ## Some other statistical distributions available in R ## # Poisson x.pois <- rpois(n = 1000, lambda = 5) hist (x.pois) # Uniform x.unif <- runif(n = 1000, min = 0, max = 1) hist (x.unif) # Exponential x.exp <- rexp(n = 100, rate = 1) hist(x.exp) par(mfrow = c (2,2)) # This creates a window with several panels (2x2) hist (x.norm, main = "Normal") hist (x.pois, main = "Poisson") hist (x.unif, main = "Uniform") hist (x.exp, main = "Exponential") # Gamma # rgamma (n, shape, scale = 1) # Weibull # rweibull (n, shape, scale = 1) # Cauchy # rcauchy (n, location = 0, scale = 1) # Beta # rbeta (n, shape1, shape2) # Student t # rt (n, df) # Fisher-Snedecor (F) # rf (n, df1, df2) # Pearson chi-squared # rchisq (n, df) # Binomial # rbinom (n, size, prob) # Multinomial # rmultinom (n, size, prob) # Geometric # rgeom (n, prob) # Hypergeometric # rhyper (nn, m, n, k) # Logistic # rlogis (n, location = 0, scale = 1) # Lognormal # rlnorm (n, meanlog = 0, sdlog = 1) # Negative binomial # rnbinom (n, size, prob) # Wilcoxon's statistics # rwilcox (nn, m, n) ### D. AN EXAMPLE BASED ON SPATIAL ANALYSIS #################################### ## In this example, we will simulate the distribution of individuals (e.g. trees) ## Within a rectangular space (e.g. a plot), and we will study the behavior of a ## measure of aggregation under various theoretical distribution models. # Open the package *spatstat*, which is needed for the analyses library (spatstat) # Define the domain (space used) for the simulation max.coord <- 125 min.coord <- -125 domain <- cbind(c(min.coord, max.coord), c(min.coord, max.coord)) colnames(domain) <- c("x", "y") plot(domain, type = "n") # Define several numbers needed for simulations clumps.n <- 10^2 # The number of groups n.per.clump <- 4^2 # The number of individuals within each group n <- n.per.clump * clumps.n # The total number of individuals # This is an object of class *owin* which will be needed later. It defines the # spatial window for analysis distri.window <- owin(xrange = c(-100, 100), yrange = c(-100, 100)) class(distri.window) ## 1. Perfectly random distribution of individuals ## rand.x <- runif (n, min.coord, max.coord) # uniform random sequence rand.y <- runif (n, min.coord, max.coord) # uniform random sequence coords.rand <- cbind (rand.x, rand.y) colnames (coords.rand) <- c ("x", "y") dim (coords.rand) # This calculates the "pair-correlation function", which describes the spatial # distribution of individuals at various distances - values of more than 1 # indicate aggregation (high density of individuals), values lower than one # indicate segregation (low density of individuals). The value of one would be # expected under a completely random distribution. To learn more, read the help # file of function *pcf* pcf.rand <- pcf(as.ppp(coords.rand, W = distri.window)) # Plot the PCF par(mfrow = c(1,2)) plot(domain, type = "n", asp = 1) plot(distri.window, col = "red", add = TRUE) points(coords.rand) plot (pcf.rand) ## 2. Perfectly regular distribution of individuals ## reg.x <- seq(domain[1, "x"], domain[2, "x"], length.out = sqrt(n)) reg.y <- seq(domain[1, "y"], domain[2, "y"], length.out = sqrt(n)) coords.reg <- expand.grid (reg.x, reg.y) colnames (coords.reg) <- c ("x", "y") dim (coords.reg) pcf.reg <- pcf(as.ppp(coords.reg, W = distri.window)) # Plot pair(mfrow = c(1,2)) plot(domain, type = "n", asp = 1) plot(distri.window, col = "red", add = TRUE) points(coords.reg) plot(pcf.reg) ## 3. Highly aggregated distribution ## rand.x <- rnorm (n, 0, 6) rand.y <- rnorm (n, 0, 6) coords.agreg <- cbind(rand.x, rand.y) colnames(coords.agreg) <- c("x", "y") pcf.agreg <- pcf(as.ppp(coords.agreg, W = distri.window)) # Plot par(mfrow = c(1,2)) plot(domain, type = "n", asp = 1) plot(distri.window, col = "red", add = TRUE) points(coords.agreg) plot(pcf.agreg) ## 4. Weakly aggregated distribution ## rand.x <- rnorm(n, 0, 25) rand.y <- rnorm(n, 0, 25) coords.agreg.2 <- cbind(rand.x, rand.y) colnames(coords.agreg.2) <- c("x", "y") pcf.agreg.2 <- pcf(as.ppp(coords.agreg.2, W = distri.window)) par(mfrow = c(1,2)) plot(domain, type = "n", asp = 1) plot(distri.window, col = "red", add = TRUE) points(coords.agreg.2) plot(pcf.agreg) ## 5. Distribution at two levels - regulate distribution of groups and aggregated ## distribution of individuals within groups ## reg.clump.x <- seq(domain[1, "x"], domain[2, "x"], length.out = sqrt(clumps.n)) reg.clump.y <- seq(domain[1, "y"], domain[2, "y"], length.out = sqrt(clumps.n)) coords.clump.reg <- expand.grid(reg.clump.x, reg.clump.y) colnames(coords.clump.reg) <- c("x", "y") dist.x <- rnorm(n, 0, 2) dist.y <- rnorm(n, 0, 2) coords.reg.agreg <- cbind(coords.clump.reg["x"] + dist.x, coords.clump.reg ["y"] + dist.y) pcf.reg.agreg <- pcf(as.ppp(coords.reg.agreg, W = distri.window)) par(mfrow = c (1,2)) plot(domain, type = "n", asp = 1) plot(distri.window, col = "red", add = TRUE) points(coords.reg.agreg) points(coords.clump.reg, col = "red") plot(pcf.reg.agreg)