################################################################################
### R BASICS WORKSHOP                                                        ###
### PRESENTATION 7: DATA GENERATION                                          ###
###                                                                          ###
### Center for Conservation and Sustainable Development                      ###
### Missouri Botanical Garden                                                ###
### Website: rbasicsworkshop.weebly.com                                      ###
################################################################################


### A. WHY GENERATE DATA IN R? #################################################

# 1. Faking data for publications (the easy way to Science or Nature)
# e.g.: https://en.wikipedia.org/wiki/Diederik_Stapel

# 2. Exploring theoretical models (numerically or graphically)

# 3. Generating computer simulations




# B. CREATING REGULAR SEQUENCES ################################################

# Function *c* - concatenate
x <- c(1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5)
x

# Operator *:* - sequence of integers
x <- 1:30
x

## IMPORTANT: Pay attention to the priority of operators
1:10-1 # This is not the same as this:
1:(10-1)

# Function: *rep* - repeat
rep(1, 30)

rep(1:4, times=3)

rep(1:4, each = 3)


# Function *seq* - sequence
seq(1, 5, 0.5)

seq(length = 9, from = 1, to = 5)


# Function *sequence* - vector of sequences
sequence(nvec = 4:6)

sequence(nvec = c(10.5))


# Function: *gl* - generate levels of a factor
gl(n = 3, k = 5)

gl(3, 5, length = 30)

gl(2, 6, label = c("Male", "Female"))

gl(2, 10)

gl(2, 1, length = 20)

gl(2, 2, length = 20)


# Function *expand.grid* - combine levels
expand.grid(a = c(60, 80), p = c(100, 300), sex = c("Male", "Female"))




### C. GENERATING RANDOM SEQUENCES #############################################

# Function *rnorm* - random values of a normal distribution
x.norm <- rnorm(n = 1000, mean = 0, sd = 1)

x.norm

hist(x.norm)

?rnorm


## Some other statistical distributions available in R ##

# Poisson
x.pois <- rpois(n = 1000, lambda = 5)
hist (x.pois)

# Uniform
x.unif <- runif(n = 1000, min = 0, max = 1)
hist (x.unif)

# Exponential
x.exp <- rexp(n = 100, rate = 1)
hist(x.exp)


par(mfrow = c (2,2)) # This creates a window with several panels (2x2)
hist (x.norm, main = "Normal")
hist (x.pois, main = "Poisson")
hist (x.unif, main = "Uniform")
hist (x.exp, main = "Exponential")



# Gamma
# rgamma (n, shape, scale = 1)

# Weibull
# rweibull (n, shape, scale = 1)

# Cauchy
# rcauchy (n, location = 0, scale = 1)

# Beta
# rbeta (n, shape1, shape2)

# Student t
# rt (n, df)

# Fisher-Snedecor (F)
# rf (n, df1, df2)

# Pearson chi-squared
# rchisq (n, df)

# Binomial
# rbinom (n, size, prob)

# Multinomial
# rmultinom (n, size, prob)

# Geometric
# rgeom (n, prob)

# Hypergeometric
# rhyper (nn, m, n, k)

# Logistic
# rlogis (n, location = 0, scale = 1)

# Lognormal
# rlnorm (n, meanlog = 0, sdlog = 1)

# Negative binomial
# rnbinom (n, size, prob)

# Wilcoxon's statistics
# rwilcox (nn, m, n)



### D. AN EXAMPLE BASED ON SPATIAL ANALYSIS ####################################

## In this example, we will simulate the distribution of individuals (e.g. trees)
## Within a rectangular space (e.g. a plot), and we will study the behavior of a
## measure of aggregation under various theoretical distribution models.


# Open the package *spatstat*, which is needed for the analyses
library (spatstat)


# Define the domain (space used) for the simulation
max.coord <- 125
min.coord <- -125

domain <- cbind(c(min.coord, max.coord), c(min.coord, max.coord))
colnames(domain) <- c("x", "y")

plot(domain, type = "n")


# Define several numbers needed for simulations
clumps.n <- 10^2 # The number of groups

n.per.clump <- 4^2 # The number of individuals within each group

n <- n.per.clump * clumps.n # The total number of individuals


# This is an object of class *owin* which will be needed later. It defines the
# spatial window for analysis
distri.window <- owin(xrange = c(-100, 100), yrange = c(-100, 100))

class(distri.window)


## 1. Perfectly random distribution of individuals ##
rand.x <- runif (n, min.coord, max.coord) # uniform random sequence
rand.y <- runif (n, min.coord, max.coord) # uniform random sequence

coords.rand <- cbind (rand.x, rand.y)
colnames (coords.rand) <- c ("x", "y")
dim (coords.rand)

# This calculates the "pair-correlation function", which describes the spatial
# distribution of individuals at various distances - values of more than 1
# indicate aggregation (high density of individuals), values lower than one
# indicate segregation (low density of individuals). The value of one would be
# expected under a completely random distribution. To learn more, read the help
# file of function *pcf*

pcf.rand <- pcf(as.ppp(coords.rand, W = distri.window))


# Plot the PCF
par(mfrow = c(1,2))
  plot(domain, type = "n", asp = 1)
  plot(distri.window, col = "red", add = TRUE)
  points(coords.rand)
  plot (pcf.rand)



## 2. Perfectly regular distribution of individuals ##
reg.x <- seq(domain[1, "x"], domain[2, "x"], length.out = sqrt(n))
reg.y <- seq(domain[1, "y"], domain[2, "y"], length.out = sqrt(n))

coords.reg <- expand.grid (reg.x, reg.y)
colnames (coords.reg) <- c ("x", "y")
dim (coords.reg)

pcf.reg <- pcf(as.ppp(coords.reg, W = distri.window))

# Plot
pair(mfrow = c(1,2))
  plot(domain, type = "n", asp = 1)
  plot(distri.window, col = "red", add = TRUE)
  points(coords.reg)
  plot(pcf.reg)



## 3. Highly aggregated distribution ##
rand.x <- rnorm (n, 0, 6)
rand.y <- rnorm (n, 0, 6)

coords.agreg <- cbind(rand.x, rand.y)
colnames(coords.agreg) <- c("x", "y")

pcf.agreg <- pcf(as.ppp(coords.agreg, W = distri.window))


# Plot
par(mfrow = c(1,2))
  plot(domain, type = "n", asp = 1)
  plot(distri.window, col = "red", add = TRUE)
  points(coords.agreg)
  plot(pcf.agreg)



## 4. Weakly aggregated distribution ##
rand.x <- rnorm(n, 0, 25)
rand.y <- rnorm(n, 0, 25)

coords.agreg.2 <- cbind(rand.x, rand.y)
colnames(coords.agreg.2) <- c("x", "y")

pcf.agreg.2 <- pcf(as.ppp(coords.agreg.2, W = distri.window))

par(mfrow = c(1,2))
  plot(domain, type = "n", asp = 1)
  plot(distri.window, col = "red", add = TRUE)
  points(coords.agreg.2)
  plot(pcf.agreg)



## 5. Distribution at two levels - regulate distribution of groups and aggregated
## distribution of individuals within groups ##
reg.clump.x <- seq(domain[1, "x"], domain[2, "x"], length.out = sqrt(clumps.n))
reg.clump.y <- seq(domain[1, "y"], domain[2, "y"], length.out = sqrt(clumps.n))

coords.clump.reg <- expand.grid(reg.clump.x, reg.clump.y)
colnames(coords.clump.reg) <- c("x", "y")

dist.x <- rnorm(n, 0, 2)
dist.y <- rnorm(n, 0, 2)

coords.reg.agreg <- cbind(coords.clump.reg["x"] + dist.x,
  coords.clump.reg ["y"] + dist.y)

pcf.reg.agreg <- pcf(as.ppp(coords.reg.agreg, W = distri.window))


par(mfrow = c (1,2))
  plot(domain, type = "n", asp = 1)
  plot(distri.window, col = "red", add = TRUE)
  points(coords.reg.agreg)
  points(coords.clump.reg, col = "red")
  plot(pcf.reg.agreg)