################################################################################
### R BASICS WORKSHOP                                                        ###
### EXERCISE 9.2: Flow control                                               ###
###                                                                          ###
### Center for Conservation and Sustainable Development                      ###
### Missouri Botanical Garden                                                ###
### Website: rbasicsworkshop.weebly.com                                      ###
################################################################################

## OBJECTIVE:
## Practice the use of 'for' loops

# Data randomization can be used to examine the statistical significance of the
# eigenvalues of a principal component analysis (PCA), and thus the statistical
# significance of each principal component. The code below examines the
# statistical significance of the PCA on the morphological characters in the
# Edgar Anderson Iris data. It is based on the method described in section
# 2.1.2 of Peres-Neto et al. 2005. How many principal components? Stopping
# rules for determining the number of non-trivial axes revisited. Computational
# Statistics & Data Analysis 49:974–997. Study the code carefully.

data(iris)  # Loads the Iris data from the 'datasets' package
iris[1:5, ] # Prints the first 5 rows of the dataset

## TASK 1 ##
## Create an object called morpho containing only the first 4 columns
## of the data.frame iris

pca.iris <- prcomp(morpho, scale.=TRUE) # This conducts a PCA

summary(pca.iris) # Reports a summary of the results of the PCA
str(pca.iris)     # Shows the structure of the object pca.iris containing the results

pca.iris$sdev # This is a vector containing the square root of the eigenvalues

empirical.ev <- pca.iris$sdev
names(empirical.ev) <- paste("PC", 1:length(empirical.ev), sep=" ")
# This puts the empirical eigenvalues in an object. Randomized eigenvalues
# will be generated and compared to these empirical values.

## TASK 2 ##
## Make a barplot showing the change in empirical eigenvalues

k <- 999
# This creates an object that defines how many iterations the randomization
# test will use

rand.ev <- matrix(data=NA, nrow=k, ncol=length(empirical.ev))
rownames(rand.ev) <- paste("rand", 1:k, sep="_")
# This creates a matrix full of NAs that will be filled, with each iteration
# of the loop, with randomized eigenvalues

for (i in 1:k)
{
  rand.morpho <- morpho # This copies the empirical morpho data

  rand.morpho[,1] <- sample(rand.morpho[,1])
  # This randomizes the position of values in the first variable (column) of
  # rand.morpho. The test is based on comparing the empirical eigenvalues of a
  # PCA with the eigenvalues of PCAs in which the relationships among the
  # variables have been randomized. This implies the random re-arrangement of
  # each morphological variable independently.

  ## TASK 3 ##
  ## Repeat the step above for all other columns of the dataset independently

  ## TASK 4 ##
  ## Run a new principal component analysis, but this time using the
  ## randomized morpho data. Save the results into an object named rand.pca.iris

  ## TASK 5 ##
  ## The line below puts the randomized eigenvalues into the empty *rand.ev*
  ## matrix. With each iteration of the loop, it is supposed to put the
  ## values in a different row of the matrix. However, it always places the
  ## values in the last row. Fix the problem with this line.

  rand.ev[k,] <- rand.pca.iris$sdev

} # Loop ends

rand.ev <- rbind(empirical.ev, rand.ev)
# This combines by rows the empirical and random eigenvalues. This puts the
# empirical values in the first row of the *rand.ev* matrix
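
# Aside (not one of the tasks): once Tasks 3-5 are solved, the column-by-column
# shuffling done inside the loop can also be written as a single call. The line
# below is only a sketch of that alternative; 'rand.morpho.alt' is an
# illustrative name and this line is not needed for the exercise.
rand.morpho.alt <- as.data.frame(lapply(morpho, sample)) # shuffles every column independently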

## TASK 6 ##
## Open a graphics window and divide it into 4 panels with the functions
## *par* or *layout*

hist(rand.ev[,1]^2, breaks=100, xlab="Eigenvalue", cex.lab=1.5, cex.axis=1.5,
  main="PC 1")
# This creates a histogram with the distribution of the first eigenvalue (column 1)

abline(v=rand.ev[1,1]^2, col="red")
# This highlights with a line the position of the empirical eigenvalue.

sum(rand.ev[,1] >= rand.ev[1,1]) / (k+1)
# This calculates the p-value. The p-value is calculated as the proportion of
# eigenvalues in the randomization that are equal to or larger than the
# empirical eigenvalue. Note that we are summing a logical vector, which is
# equivalent to counting the number of times that the comparison >= is TRUE.

## TASK 7 ##
## Create the corresponding figures for eigenvalues 2, 3 and 4 in a way
## that the figure for each eigenvalue is in a single panel of the divided
## window. Also, calculate the corresponding p-values.

## TASK 8 ##
## Repeat the figure, but this time make it look nice and save it as a
## tiff file with a high resolution (600 ppi).

################################################################################
### TASK SOLUTIONS ##############################################################
################################################################################

## TASK 1 ##
morpho <- iris[,1:4]

## TASK 2 ##
barplot(empirical.ev^2, col="firebrick1", border="firebrick1",
  xlab="Principal Components", ylab="Eigenvalue")

## TASK 3 ##
rand.morpho[,2] <- sample(rand.morpho[,2])
rand.morpho[,3] <- sample(rand.morpho[,3])
rand.morpho[,4] <- sample(rand.morpho[,4])

## TASK 4 ##
rand.pca.iris <- prcomp(rand.morpho, scale.=TRUE)

## TASK 5 ##
rand.ev[i,] <- rand.pca.iris$sdev

## TASK 6 ##
par(mfrow=c(2,2))
# or
layout(mat=matrix(1:4, ncol=2))
layout.show(4)

## TASK 7 ##
hist(rand.ev[,2]^2, breaks=100, xlab="Eigenvalue", cex.lab=1.5, cex.axis=1.5,
  main="PC 2")
abline(v=rand.ev[1,2]^2, col="red")
sum(rand.ev[,2] >= rand.ev[1,2]) / (k+1)

hist(rand.ev[,3]^2, breaks=100, xlab="Eigenvalue", cex.lab=1.5, cex.axis=1.5,
  main="PC 3")
abline(v=rand.ev[1,3]^2, col="red")
sum(rand.ev[,3] >= rand.ev[1,3]) / (k+1)

hist(rand.ev[,4]^2, breaks=100, xlab="Eigenvalue", cex.lab=1.5, cex.axis=1.5,
  main="PC 4")
abline(v=rand.ev[1,4]^2, col="red")
sum(rand.ev[,4] >= rand.ev[1,4]) / (k+1)

## TASK 8 ##
getwd() # Check the working directory and change it if necessary

line.wd <- 2
line.col <- "black"
bar.col <- "firebrick3"
main.size <- 1.75
axis.size <- 1.5
lab.size <- 1.5
breaks.n <- 50

tiff(filename="PCA_exercise.tiff", width=25, height=20, units="cm",
  pointsize=12, res=600)

par(mfrow=c(2,2), mar=c(5, 6, 4, 2), mgp=c(3.75, 1, 0))

hist(rand.ev[,1]^2, breaks=breaks.n, xlab="Eigenvalue", las=1,
  cex.lab=lab.size, cex.axis=axis.size, main="PC 1", cex.main=main.size,
  col=bar.col, border=bar.col)
abline(v=rand.ev[1,1]^2, col=line.col, lwd=line.wd)

hist(rand.ev[,2]^2, breaks=breaks.n, xlab="Eigenvalue", las=1,
  cex.lab=lab.size, cex.axis=axis.size, main="PC 2", cex.main=main.size,
  col=bar.col, border=bar.col)
abline(v=rand.ev[1,2]^2, col=line.col, lwd=line.wd)

hist(rand.ev[,3]^2, breaks=breaks.n, xlab="Eigenvalue", las=1,
  cex.lab=lab.size, cex.axis=axis.size, main="PC 3", cex.main=main.size,
  col=bar.col, border=bar.col)
abline(v=rand.ev[1,3]^2, col=line.col, lwd=line.wd)

hist(rand.ev[,4]^2, breaks=breaks.n, xlab="Eigenvalue", las=1,
  cex.lab=lab.size, cex.axis=axis.size, main="PC 4", cex.main=main.size,
  col=bar.col, border=bar.col)
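# The abline() call below marks the empirical PC 4 eigenvalue on the last panel;
# dev.off() then closes the tiff device, which is what actually writes the file
# to the working directory.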
abline(v=rand.ev[1,4]^2, col=line.col, lwd=line.wd)

dev.off()
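
# Aside (not one of the tasks): the four p-values can also be obtained in a
# single call by comparing every column of rand.ev against its first row, which
# holds the empirical eigenvalues. This is only a sketch of an alternative to
# the four sum(...) lines above.
apply(rand.ev, 2, function(x) sum(x >= x[1]) / (k + 1))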