################################################################################
### R BASICS WORKSHOP                                                        ###
### PRESENTATION 6: OBJECT MANIPULATION - INDEXING                           ###
###                                                                          ###
### Center for Conservation and Sustainable Development                      ###
### Missouri Botanical Garden                                                ###
### Website: rbasicsworkshop.weebly.com                                      ###
################################################################################

### INTRODUCTION ###############################################################

# The indexing system is an efficient and flexible way to selectively access
# elements of an object. Indexing can be numeric, logical or by names. To index,
# we use square brackets *[]* and the *$* operator. In addition, this section
# will cover some other useful functions like *which*, *unique* and *str*.

# This part of the workshop is divided into the following sections:

## Types of Indexation ##
# A. Numerical Indexing
# B. Logical Indexing
# C. Indexing using *which*
# D. Indexing by names
# E. Replacing elements of an object

## Indexing for different classes of objects ##
# F. Indexing vectors
# G. Indexing matrices
# H. Indexing data frames
# I. Indexing lists
# J. Indexing other objects - e.g. with object of class "lm"

################################################################################
############################# TYPES OF INDEXATION ##############################
################################################################################

### A. NUMERICAL INDEXING ######################################################

# Suppose you have a vector with data on species of 20 tree individuals
# (four species labels, each repeated for five consecutive individuals)
spp <- rep(paste("sp", c("a", "b", "c", "d"), sep = "_"), each = 5)
spp
class(spp)
length(spp)

# Suppose now that we have a vector with measurements of wood density for
# each individual
wood.density <- c(
  8.0766242, 9.8493313, 2.9028278, 10.0433943, 0.1470901,
  12.5288041, 10.6120501, 14.6478501, 8.2003356, 17.9935623,
  12.4214381, 18.3749778, 24.0950527, 19.3236943, 15.5498672,
  22.0520207, 28.9908186, 17.5659344, 26.0387389, 14.1152262
)
wood.density
class(wood.density)
length(wood.density)

# The number within square brackets is the position of the element to be
# extracted
spp[2]  # Extracts the second element in the vector
spp[10] # Extracts the tenth element
wood.density[2]
wood.density[10]

# You can also extract more than one element at a time
spp[c(7, 7, 7)] # The same position may be requested several times
wood.density[c(2, 5, 7)]

## IMPORTANT: numerical indexing can use the minus sign *-* to extract all
## items except those indicated in brackets
spp[2]  # This extracts the second element
spp[-2] # This extracts all items except the second!
wood.density[c(2, 5, 7)]  # This extracts the elements 2, 5 and 7
wood.density[-c(2, 5, 7)] # This extracts everything except items 2, 5 and 7

### B. LOGICAL INDEXING ########################################################

# TRUE or FALSE values are used within the brackets *[]* to extract elements.
# This extracts elements corresponding to values of TRUE.
wood.density
wood.density < 15 # This generates a logical vector where values are TRUE when
                  # *wood.density < 15*
wood.density[wood.density < 15] # This extracts the elements of *wood.density*
                                # that are less than 15
spp[wood.density < 15]
spp == "sp_b"
wood.density[spp == "sp_b"]

# You can also use more complex conditions to extract values
wood.density[spp == "sp_b" | spp == "sp_c"]
wood.density[spp == "sp_b" & spp == "sp_c"] # Empty: no element can be equal to
                                            # both "sp_b" and "sp_c"
wood.density[spp == "sp_b" & wood.density > 15]
wood.density[wood.density < 15 & wood.density > 25] # Empty: the two conditions
                                                    # cannot both be TRUE
wood.density[wood.density > 15 & wood.density < 25]
wood.density[wood.density < 15 | wood.density > 25]

### C. INDEXING USING THE FUNCTION *which* #####################################

# This function gives the position of the elements that meet a certain condition

# In what elements of *wood.density* are the values greater than 15?
which(wood.density > 15) # IMPORTANT: These are not the elements in
                         # *wood.density* that meet the condition, but their
                         # positions in the vector

# This, in turn, extracts values of *wood.density* greater than 15
wood.density[which(wood.density > 15)]

# This extracts the species of individuals with values of *wood.density*
# greater than 15
spp[which(wood.density > 15)]

which(spp == "sp_b")
wood.density[which(spp == "sp_b")]

### D. INDEXING BY NAME ########################################################

# Names of elements can be used within brackets *[]* to extract elements

# For this type of indexing, the elements must have names. Here the names are
# the positions in reverse order, so a name never matches its own position.
wood.density
names(wood.density) <- rev(seq_along(wood.density))
wood.density

spp
names(spp) <- rev(seq_along(spp))
spp

wood.density["3"] # Extracts the value in *wood.density* that has the name "3"

## IMPORTANT: Note the difference between name and numeric indexing:
spp["15"] # Extracts the value in *spp* that has the name "15"
spp[15]   # This, in turn, extracts the fifteenth value in *spp*

wood.density[spp == "sp_c"] # This, in turn, extracts the values in
                            # *wood.density* that correspond to "sp_c" in *spp*

# You can also extract several elements by name
wood.density[c("4", "8", "20")]
wood.density[c(4, 8, 20)] # These two lines are NOT equivalent

spp[c("4", "8", "20")]
spp[c(4, 8, 20)]

### E. REPLACE VALUES WITHIN AN OBJECT #########################################

# The indexing system allows us to replace or re-write values of particular
# elements within an object
spp
spp[c(1, 4, 18)]
spp[c(1, 4, 18)] <- "spp_x" # Overwrites elements 1, 4 and 18
spp

spp <- c(spp, "sp_aslj") # Appends one more element at the end of the vector
spp

################################################################################
######################## INDEXATION OF DIFFERENT CLASSES #######################
################################################################################

### F. INDEXING VECTORS ########################################################

# We already used vectors to practice the different kinds of indexation
# (numerical, logical, and by name). The objects needed (to be extracted) are
# placed between square brackets
letters[c(1, 4, 6)]

### G. INDEXING MATRICES #######################################################

# Let's open a file (CarbonDioxideYearlyEmissions.txt) to practice indexing
# matrices. This file contains data on CO2 emissions by country (columns) per
# year (rows).
# Read the tab-separated file chosen interactively; the first column is used
# as row names (the years)
CO2 <- read.table(file = file.choose(), header = TRUE, row.names = 1,
                  sep = "\t")
dim(CO2)
class(CO2) # The *read.table* function always produces a data frame

# Let's transform the data frame to a matrix
CO2 <- as.matrix(CO2)
class(CO2)
head(CO2)

# The most common way to index a matrix is by row and column. Within square
# brackets you specify the rows needed, then a comma, then the columns needed
CO2[150, 30] # This extracts the value in row 150 and column 30
rownames(CO2)[150]
colnames(CO2)[30]

## IMPORTANT: rows are always specified first, followed by columns
CO2[200, 45]
CO2[45, 200]
CO2[240, 155]

# You can also extract multiple columns and/or rows at the same time
CO2[c(200, 45, 240), c(45, 200, 155)]

## IMPORTANT: When you want all the elements of a row or column,
## simply do not specify anything for rows or columns. For example:
CO2[, 100]  # This extracts all rows and column 100
CO2[, 240]
CO2[10, ]   # This extracts row 10 and all columns
CO2[-10, ]  # This extracts all items except row 10
CO2[, -100] # This extracts all items except column 100

# Matrices can also be indexed by names of rows or columns
try(CO2[2010, ]) # This generates an error because there is no row 2010.
                 # The call is wrapped in *try* so that the deliberate error
                 # is reported without stopping the script when it is sourced
CO2["2010", ]    # This does NOT generate an error because we are asking for
                 # the row with name "2010"
CO2["2010", "United.States"]

# How have CO2 emissions changed in the United States?
years <- as.numeric(rownames(CO2))
plot(CO2[, "United.States"] ~ years, col = "forestgreen")

# How have emissions changed in the 21st century?
plot(CO2[years > 2000, "United.States"] ~ years[years > 2000],
     col = "forestgreen", type = "b")

# How do emissions compare among the United States, Mexico and Ecuador?
plot(CO2[, "United.States"] ~ years, col = "forestgreen", type = "l", lwd = 4,
     ylab = "Emissions")
points(CO2[, "Mexico"] ~ years, col = "navy", type = "l", lwd = 4)
points(CO2[, "Ecuador"] ~ years, col = "gold", type = "l", lwd = 4)

# We could mess up data by changing some values to 0
CO2[years > 1950, "United.States"] <- 0
plot(CO2[, "United.States"] ~ years, col = "forestgreen", type = "b")

## IMPORTANT: matrices can also be indexed by element number, not only by
## row and column
M <- matrix(letters[-26], ncol = 5) # 25 letters arranged in 5 columns
colnames(M) <- paste("var", seq_len(ncol(M)), sep = "_")
M
class(M)
dim(M)

# These commands extract the same element
M[2, 2]
M[7]

M[5, 5]
M[25]

### H. INDEXING DATA FRAMES ####################################################

# Indexing data frames is very similar to indexing matrices except for
# these two aspects:

# 1. Data frames cannot be indexed by item number, only by rows and columns
M.df <- as.data.frame(M)

class(M)
M[2, 2]
M[7]

class(M.df)
M.df[2, 2]
try(M.df[7]) # Deliberate error: a single index on a data frame selects
             # columns, and *M.df* has only 5 columns. Wrapped in *try* so the
             # error is shown without stopping the script when it is sourced

# 2. The columns in a data frame can also be indexed by variable name using
# *$* after the name of the object. This cannot be done for matrices:
colnames(M)
M.df[, "var_2"]
M.df$var_2

M[, "var_2"]
try(M$var_2) # Deliberate error: *$* does not work on a matrix. Wrapped in
             # *try* for the same reason as above

# This opens the "iris" dataset in the "datasets" package.
# (http://en.wikipedia.org/wiki/Iris_flower_data_set).
data(iris)
help(iris)
class(iris)
dim(iris)
str(iris) # The *str* function reports a summary of the structure of an object

morpho <- iris[, 1:4]
morpho # The first four columns are morphological variables

species <- iris$Species
species # This last column has names of species
class(species)
levels(species)

species <- as.vector(species)
class(species)
unique(species) # Returns the unique values

# Graphic of sepal length vs. sepal width for I. setosa, I. versicolor and
# I. virginica. The first call only sets up the axes (type = "n"); each species
# is then added in its own colour using logical indexing.
plot(morpho$Sepal.Length ~ morpho$Sepal.Width, type = "n")
points(morpho$Sepal.Length[species == "setosa"] ~
         morpho$Sepal.Width[species == "setosa"], col = "gold")
points(morpho$Sepal.Length[species == "versicolor"] ~
         morpho$Sepal.Width[species == "versicolor"], col = "navy")
points(morpho$Sepal.Length[species == "virginica"] ~
         morpho$Sepal.Width[species == "virginica"], col = "red")

### I. INDEXING LISTS ##########################################################

L1 <- list(c(0.01, 3.1), c(0.02, 4.0, 0.1), c("a"), c(0.01, 2.9), c(0.03),
           c(0.04, 3.4, 8.2, 1.6))
class(L1)
L1
length(L1)
str(L1)

names(L1) <- paste("elem", seq_along(L1), sep = "_")
L1

L1.1 <- L1[1] # *[]* extracts the first element of the list as a list
L1.1
class(L1.1)

L1.1 <- L1[[1]] # *[[]]* extracts the first element of the list as the vector
                # it contains
L1.1
class(L1.1)

L1[-1]
L1[1:3]

L1["elem_1"] # In lists, indexing can also be done by element names
class(L1["elem_1"])

L1[["elem_1"]]
class(L1[["elem_1"]])

L1$elem_1
class(L1$elem_1)

# Other manipulations:
L1[2:4][1]
L1[1:3][-1]

L1[[1]]
L1[[1]][2]

L1[[1]]
L1[[1]][1]

L1[[1]] <- 3 # Replaces the whole first element of the list
L1

L1[[2]]
L1[[2]] > 2
L1[[2]][L1[[2]] < 1]

### J. INDEXING OF AN OBJECT OF CLASS 'LM' #####################################

# Let's open a data file (BatsEnviroAmerica.txt) to create a linear model
# and practice indexing.

# This file contains data on species richness of bats across the New World in
# 100x100 km cells. There is also information on several environmental variables
# in each cell.
# Read the tab-separated file chosen interactively
bat.data <- read.table(file = file.choose(), header = TRUE, sep = "\t")
dim(bat.data)
class(bat.data)
colnames(bat.data)

length(which(bat.data$richness == 0)) # By indexing, we can know how many cells
                                      # have a richness of 0 (empty cells)
length(which(bat.data$richness > 0))

# For the following analyses, we will remove all empty cells.
# A logical mask is used instead of *-which(...)*: if no cell had richness 0,
# *which* would return an empty vector and the negative index would drop every
# row. *%in%* never returns NA, so rows with missing richness are kept.
dim(bat.data)
bat.data <- bat.data[!(bat.data$richness %in% 0), ]
dim(bat.data)

hist(bat.data$richness) # By indexing the column "richness", we can create
                        # a histogram of the values of bat richness per cell

plot(bat.data$richness ~ bat.data$temp_AVG)

par(mfrow = c(1, 2)) # Two panels side by side
plot(log(bat.data$richness) ~ bat.data$temp_AVG)
plot(log(bat.data$richness) ~ bat.data$ele_RANGE)

model.1 <- lm(log(bat.data$richness) ~
                scale(bat.data$temp_AVG) + scale(bat.data$ele_RANGE))
summary(model.1)

class(model.1)
str(model.1) # Although this is an object of class "lm", it has the structure
             # of a list

model.1[[1]]           # Numerical indexing for extracting coefficients
model.1$coefficients   # Indexing by names to extract coefficients

model.1$coefficients[1] # Indexing by name and number for the intercept
model.1$coefficients["(Intercept)"] # Double indexing by names for the
                                    # intercept

model.1$coefficients[2]
model.1$coefficients[3]

model.1$residuals[1:10]
model.1$fitted.values[1:10]

plot(log(bat.data$richness) ~ bat.data$temp_AVG)
points(model.1$fitted.values ~ bat.data$temp_AVG, col = "red")

library(car) # Open the package *car*
avPlots(model = model.1) # This produces added-variable ("partial regression")
                         # plots showing the effects of each variable