################################################################################
### R BASICS WORKSHOP                                                        ###
### PRESENTATION 6: OBJECT MANIPULATION - INDEXING                           ###
###                                                                          ###
### Center for Conservation and Sustainable Development                      ###
### Missouri Botanical Garden                                                ###
### Website: rbasicsworkshop.weebly.com                                      ###
################################################################################

### INTRODUCTION ###############################################################

# The indexing system is an efficient and flexible way to selectively access
# elements of an object. Indexing can be numerical, logical or by name. To
# index, we use square brackets *[]* and the *$* operator. In addition, this
# section will discuss the functions *which* and *str*.

# This part of the workshop is divided into the following sections:

## Four approaches to indexing ##
# A. Numerical indexing
# B. Logical indexing
# C. Indexing using the function *which*
# D. Indexing with names

## Editing objects ##
# E. Replacing values within an object

## Indexing different classes of objects ##
# F. Indexing vectors
# G. Indexing matrices
# H. Indexing data frames
# I. Indexing lists
# J. Indexing other objects - e.g., objects of class "lm"

################################################################################
##################### FOUR APPROACHES TO INDEXING ##############################
################################################################################

### A. NUMERICAL INDEXING ######################################################

# Numeric values are used within the brackets *[]* to select elements in an
# object. The numeric values indicate the positions of the elements to be
# selected.
# Suppose you have a vector with data on the species to which 20 individual
# trees belong (4 species, 5 consecutive individuals of each):
spp <- rep(paste("sp", c("a", "b", "c", "d"), sep = "_"), each = 5)
spp
class(spp)
length(spp)

# Suppose you also have a vector with measurements of wood density for
# each of the 20 individual trees above:
wood.density <- c(
  8.0766242, 9.8493313, 2.9028278, 10.0433943, 0.1470901,
  12.5288041, 10.6120501, 14.6478501, 8.2003356, 17.9935623,
  12.4214381, 18.3749778, 24.0950527, 19.3236943, 15.5498672,
  22.0520207, 28.9908186, 17.5659344, 26.0387389, 14.1152262
)
wood.density
class(wood.density)
length(wood.density)

# In the code below, the number within square brackets is the position of the
# element to be selectively accessed:
spp[2]   # selects the second element in the vector spp
spp[10]  # selects the tenth element in the vector spp
wood.density[2]
wood.density[10]

# You can also select more than one element at a time by giving a vector of
# positions (here the second, fifth and seventh elements of each vector)
spp[c(2, 5, 7)]
wood.density[c(2, 5, 7)]

## IMPORTANT: numerical indexing can use the minus sign *-* to select all
## elements except those indicated in brackets
spp[2]   # This selects the second element in the vector spp
spp[-2]  # This selects all elements except the second element in the vector spp

# This selects the second, fifth and seventh elements in the vector
# wood.density
wood.density[c(2, 5, 7)]

# This selects all elements in vector wood.density except the second, fifth
# and seventh elements
wood.density[-c(2, 5, 7)]

### B. LOGICAL INDEXING ########################################################

# Logical values (TRUE or FALSE) are used within the brackets *[]* to select
# elements in an object. The elements selected are those corresponding to TRUE
# values.
wood.density

# This generates a logical vector where values are TRUE when the value of wood
# density is less than 15
wood.density < 15

# This selects the elements of *wood.density* that are less than 15
wood.density[wood.density < 15]

spp[wood.density < 15]
spp == "sp_b"
wood.density[spp == "sp_b"]

# You can also use more complex criteria to select values.
# *|* means "or" and *&* means "and"; both work element by element.
wood.density[spp == "sp_b" | spp == "sp_c"]
# The next line returns an empty vector: no element of *spp* can be equal to
# "sp_b" and "sp_c" at the same time
wood.density[spp == "sp_b" & spp == "sp_c"]
wood.density[spp == "sp_b" & wood.density > 15]
# The next line also returns an empty vector: no value can be less than 15 and
# greater than 25 at the same time
wood.density[wood.density < 15 & wood.density > 25]
wood.density[wood.density > 15 & wood.density < 25]
wood.density[wood.density < 15 | wood.density > 25]

### C. INDEXING USING THE FUNCTION *which* #####################################

# This function gives the position of the elements that meet a given criterion

# In what elements of *wood.density* are the values greater than 15?
# IMPORTANT: These are not the elements in *wood.density* that meet the
# criterion, but their positions in the vector
which(wood.density > 15)

# The code below selects values of *wood.density* greater than 15
wood.density[which(wood.density > 15)]

# The code below selects the species names corresponding to individuals with
# *wood.density* values greater than 15
spp[which(wood.density > 15)]

which(spp == "sp_b")
wood.density[which(spp == "sp_b")]

### D. INDEXING WITH NAMES #####################################################

# Names of elements can be used within brackets *[]* to extract elements.
# This approach to indexing assumes the elements have names. Here the names
# run in reverse order (the first element is named "20", the last one "1") so
# that names and positions do not coincide:
wood.density
names(wood.density) <- rev(seq_along(wood.density))
wood.density

spp
names(spp) <- rev(seq_along(spp))
spp

wood.density["3"]  # Selects the value in *wood.density* that has the name "3"

## IMPORTANT: Note the difference between name and numeric indexing:
spp["15"]  # Selects the value in *spp* that has the name "15"
spp[15]    # This, in contrast, selects the fifteenth value in *spp*

# This code selects the values in *wood.density* that correspond to "sp_c"
# in *spp*
wood.density[spp == "sp_c"]

# You can also use names to select several elements at once
wood.density[c("4", "8", "20")]
# This and the previous line of code are NOT equivalent
wood.density[c(4, 8, 20)]

spp[c("4", "8", "20")]
spp[c(4, 8, 20)]

################################################################################
############################## EDITING OBJECTS #################################
################################################################################

### E. REPLACE VALUES WITHIN AN OBJECT #########################################

# The indexing system allows us to replace or re-write values of particular
# elements within an object
spp
spp[c(1, 4, 18)]
spp[c(1, 4, 18)] <- "spp_x"
spp

# *c* can be used to append a new element at the end of the vector
spp <- c(spp, "sp_aslj")
spp

################################################################################
################## INDEXING DIFFERENT CLASSES OF OBJECTS #######################
################################################################################

### F. INDEXING VECTORS ########################################################

# We already used vectors to practice the different approaches to indexing
# (numerical, logical and by name). The elements to be selected are placed
# between square brackets
letters[c(1, 4, 6)]

### G. INDEXING MATRICES #######################################################

# Let's open a file (CarbonDioxideYearlyEmissions.txt) to practice indexing
# matrices. This file contains data on CO2 emissions by country (columns) per
# year (rows).
CO2 <- read.table(
  file = file.choose(),
  header = TRUE,
  row.names = 1,
  sep = "\t"
)
dim(CO2)
class(CO2)  # The *read.table* function always produces a data frame

# Let's convert the data frame to a matrix
CO2 <- as.matrix(CO2)
class(CO2)
head(CO2)

# A common way to index a matrix is by rows and columns. Within square
# brackets you specify the rows to be selected, then a comma and then the
# columns to be selected:
CO2[150, 30]  # This selects the value in row 150 and column 30
rownames(CO2)[150]
colnames(CO2)[30]

## IMPORTANT: rows are always specified first followed by columns
CO2[200, 45]
CO2[45, 200]
CO2[240, 155]

# You can also select multiple columns and/or rows at once
CO2[c(200, 45, 240), c(45, 200, 155)]

## IMPORTANT: To select all elements in a row (or column),
## simply omit the row (or column) index, but do not forget
## the comma. For example:
CO2[, 100]   # This selects all rows and column 100
CO2[, 240]
CO2[10, ]    # This selects row 10 and all columns
CO2[-10, ]   # This selects all items except row 10
CO2[, -100]  # This selects all items except column 100

# Matrices can also be indexed by names of rows or columns.
# The next line generates an error because there is no row number 2010. It is
# wrapped in *try* so that the error message is shown but the rest of the
# script keeps running when the whole file is executed at once.
try(CO2[2010, ])
# This does NOT generate an error because we are asking for the row with
# name "2010"
CO2["2010", ]
CO2["2010", "United.States"]

# How have CO2 emissions changed in the United States?
years <- as.numeric(rownames(CO2))
plot(CO2[, "United.States"] ~ years, col = "forestgreen")

# How have emissions changed in the 21st century?
plot(
  CO2[years > 2000, "United.States"] ~ years[years > 2000],
  col = "forestgreen",
  type = "b"
)

# How do emissions compare among the United States, Mexico and Ecuador?
plot(
  CO2[, "United.States"] ~ years,
  col = "forestgreen",
  type = "l",
  lwd = 4,
  ylab = "Emissions"
)
# *points* adds the other two countries as lines on top of the existing plot
points(CO2[, "Mexico"] ~ years, col = "navy", type = "l", lwd = 4)
points(CO2[, "Ecuador"] ~ years, col = "gold", type = "l", lwd = 4)

# We could mess up data by changing some values to 0
CO2[years > 1950, "United.States"] <- 0
plot(CO2[, "United.States"] ~ years, col = "forestgreen", type = "b")

## IMPORTANT: matrices can also be indexed by element number, not only by
## row and column
M <- matrix(letters[-26], ncol = 5)  # 25 letters arranged in 5 rows x 5 columns
colnames(M) <- paste("var", seq_len(ncol(M)), sep = "_")
M
class(M)
dim(M)

# Each pair of lines below selects the same element of matrix M (elements are
# numbered down the columns, one column after another):
M[2, 2]
M[7]

M[5, 5]
M[25]

### H. INDEXING DATA FRAMES ####################################################

# Indexing data frames is very similar to indexing matrices except for
# these two aspects:

# 1. Data frames cannot be indexed by element number, only
#    by rows and columns
M.df <- as.data.frame(M)

class(M)
M[2, 2]
M[7]

class(M.df)
M.df[2, 2]
# With a single index a data frame selects columns, and *M.df* has no seventh
# column, so the next line generates an error. *try* shows the error message
# but lets the rest of the script keep running.
try(M.df[7])

# 2. The columns in a data frame can also be indexed by variable name using
#    *$* after the name of the object. This cannot be done for matrices:
colnames(M)
M.df[, "var_2"]
M.df$var_2
M[, "var_2"]
try(M$var_2)  # Generates an error: *$* does not work on matrices

# This opens the "iris" dataset in the "datasets" package.
# (http://en.wikipedia.org/wiki/Iris_flower_data_set).
data(iris)
help(iris)
class(iris)
dim(iris)
str(iris)  # the *str* function reports a summary of the structure of an object

morpho <- iris[, 1:4]
morpho  # The first four columns are morphological variables

species <- iris$Species
species  # This last column has names of species
class(species)
levels(species)
species <- as.vector(species)
class(species)
unique(species)  # Returns a vector with the unique values

# Graphic of sepal length vs. sepal width for I. setosa, I. versicolor and
# I. virginica. The first call uses type = "n" to draw empty axes only; each
# species is then added in its own color.
plot(morpho$Sepal.Length ~ morpho$Sepal.Width, type = "n")
points(
  morpho$Sepal.Length[species == "setosa"] ~
    morpho$Sepal.Width[species == "setosa"],
  col = "gold"
)
points(
  morpho$Sepal.Length[species == "versicolor"] ~
    morpho$Sepal.Width[species == "versicolor"],
  col = "navy"
)
points(
  morpho$Sepal.Length[species == "virginica"] ~
    morpho$Sepal.Width[species == "virginica"],
  col = "red"
)

### I. INDEXING LISTS ##########################################################

L1 <- list(
  c(0.01, 3.1),
  c(0.02, 4.0, 0.1),
  c("a"),
  c(0.01, 2.9),
  c(0.03),
  c(0.04, 3.4, 8.2, 1.6)
)
class(L1)
L1
length(L1)
str(L1)
names(L1) <- paste("elem", seq_along(L1), sep = "_")
L1

L1.1 <- L1[1]  # *[]* selects the first element of the list, as a list
L1.1
class(L1.1)

# *[[]]* selects the first element of the list, as the class of the element
L1.1 <- L1[[1]]
L1.1
class(L1.1)

L1[-1]
L1[1:3]

L1["elem_1"]  # In lists, indexing can also be done by element names
class(L1["elem_1"])
L1[["elem_1"]]
class(L1[["elem_1"]])
L1$elem_1
class(L1$elem_1)

# Other manipulations:
L1[2:4][1]   # first element of the sub-list made of elements 2 to 4
L1[1:3][-1]  # sub-list made of elements 1 to 3, without its first element

L1[[1]]
L1[[1]][2]   # second value within the first element of the list

L1[[1]]
L1[[1]][1]

L1[[1]] <- 3  # replaces the whole first element of the list
L1

L1[[2]]
L1[[2]] > 2
L1[[2]][L1[[2]] < 1]  # values of the second element that are less than 1

### J. INDEXING OF AN OBJECT OF CLASS 'LM' #####################################

# Let's open a data file (BatsEnviroAmerica.txt) to create a linear model
# and practice indexing. This file contains data on species richness of bats
# across the New World in 100x100 km cells. There is also information on
# several environmental variables in each cell.
bat.data <- read.table(file = file.choose(), header = TRUE, sep = "\t")
dim(bat.data)
class(bat.data)
colnames(bat.data)

# We may use indexing to know how many cells have species richness equal to 0
# (empty cells)
length(which(bat.data$richness == 0))
length(which(bat.data$richness > 0))

# In the following analyses, we will remove all empty cells.
# A logical index is used instead of *-which(...)*: if no cell had richness
# equal to 0, *which* would return an empty vector and the negative index
# would then drop every row instead of none. *%in%* never returns NA, so rows
# with missing richness are kept, as they were with *which*.
dim(bat.data)
bat.data <- bat.data[!(bat.data$richness %in% 0), ]
dim(bat.data)

# By indexing the column "richness", we can create a histogram of the values
# of bat richness per cell
hist(bat.data$richness)

plot(bat.data$richness ~ bat.data$temp_AVG)

# Draw the next two plots side by side; *par* returns the previous settings,
# which are stored and restored afterwards so later plots use the full device
old.par <- par(mfrow = c(1, 2))
plot(log(bat.data$richness) ~ bat.data$temp_AVG)
plot(log(bat.data$richness) ~ bat.data$ele_RANGE)
par(old.par)

# Linear model of log richness as a function of the two (scaled) environmental
# variables
model.1 <- lm(
  log(bat.data$richness) ~
    scale(bat.data$temp_AVG) + scale(bat.data$ele_RANGE)
)
summary(model.1)

class(model.1)
str(model.1)  # The class of this object is "lm", which is a particular kind
              # of list

model.1[[1]]           # Numerical indexing for extracting coefficients
model.1$coefficients   # Indexing by names to extract coefficients

# Indexing by name and number for the intercept
model.1$coefficients[1]
# Double indexing by names for the intercept
model.1$coefficients["(Intercept)"]

model.1$coefficients[2]
model.1$coefficients[3]

model.1$residuals[1:10]
model.1$fitted.values[1:10]

# Observed values (default color) and values fitted by the model (red)
plot(log(bat.data$richness) ~ bat.data$temp_AVG)
points(model.1$fitted.values ~ bat.data$temp_AVG, col = "red")