################################################################################
### R BASICS WORKSHOP                                                        ###
### PRESENTATION 6: OBJECT MANIPULATION - INDEXING                           ###
###                                                                          ###
### Center for Conservation and Sustainable Development                      ###
### Missouri Botanical Garden                                                ###
### Website: rbasicsworkshop.weebly.com                                      ###
################################################################################


### INTRODUCTION ###############################################################
# The indexing system is an efficient and flexible way to selectively access
# elements of an object. Indexing can be numerical, logical or by name. To
# index, we use square brackets *[]* and the *$* operator. In addition, this
# section will discuss the functions *which* and *str*.

# This part of the workshop is divided into the following sections:

## Four approaches to indexing ##
# A. Numerical indexing
# B. Logical indexing
# C. Indexing using the function *which*
# D. Indexing with names

## Editing objects ##
# E. Replacing values within an object

## Indexing different classes of objects ##
# F. Indexing vectors
# G. Indexing matrices
# H. Indexing data frames
# I. Indexing lists
# J. Indexing other objects - e.g., objects of class "lm"


################################################################################
##################### FOUR APPROACHES TO INDEXING ##############################
################################################################################


### A. NUMERICAL INDEXING ######################################################
# With numerical indexing, the numbers placed inside the brackets *[]* are the
# positions of the elements to be extracted from the object.


# Example data: the species identity of 20 individual trees (4 species with
# 5 individuals each):
spp <- rep(paste0("sp_", c("a", "b", "c", "d")), each = 5)

spp
class(spp)
length(spp)

# Example data: one wood density measurement for each of those 20 trees,
# given in the same order as *spp*:
wood.density <- c(
  8.0766242, 9.8493313, 2.9028278, 10.0433943, 0.1470901,
  12.5288041, 10.6120501, 14.6478501, 8.2003356, 17.9935623,
  12.4214381, 18.3749778, 24.0950527, 19.3236943, 15.5498672,
  22.0520207, 28.9908186, 17.5659344, 26.0387389, 14.1152262
)

wood.density
class(wood.density)
length(wood.density)

# A single number inside the brackets is the position of the one element
# to extract:
spp[2] # second element of spp
spp[10] # tenth element of spp

wood.density[2]
wood.density[10]

# A vector of positions extracts several elements in one go
spp[c(7, 7, 7)] # a position may be repeated: element 7 is returned 3 times
wood.density[c(2, 5, 7)]

## IMPORTANT: a minus sign *-* in front of the positions inverts the
## selection: everything is returned EXCEPT the positions listed
spp[2] # only the second element of spp
spp[-2] # every element of spp but the second

wood.density[c(2, 5, 7)] # only the second, fifth and seventh elements of
# wood.density
wood.density[-c(2, 5, 7)] # every element of wood.density but the second,
# fifth and seventh


### B. LOGICAL INDEXING ########################################################
# With logical indexing, a vector of TRUE/FALSE values goes inside the brackets
# *[]*; only the elements lined up with a TRUE value are returned.

wood.density
wood.density < 15 # Logical vector: TRUE wherever wood density is below 15

wood.density[wood.density < 15] # Keeps only the wood densities below 15

spp[wood.density < 15] # Species of the trees whose wood density is below 15

spp == "sp_b" # TRUE for the trees that belong to species "sp_b"
wood.density[spp == "sp_b"] # Wood densities of the "sp_b" trees

# Criteria can be combined with *|* (or) and *&* (and)
wood.density[spp == "sp_b" | spp == "sp_c"] # Trees of either species

wood.density[spp == "sp_b" & spp == "sp_c"] # Empty: no tree is both species

wood.density[spp == "sp_b" & wood.density > 15] # "sp_b" trees denser than 15

wood.density[wood.density < 15 & wood.density > 25] # Empty: no value can be
                                                    # both below 15 and above 25

wood.density[wood.density > 15 & wood.density < 25] # Strictly between 15 and 25

wood.density[wood.density < 15 | wood.density > 25] # Below 15 or above 25


### C. INDEXING USING THE FUNCTION *which* #####################################
# *which* takes a logical vector and returns the positions of its TRUE values,
# i.e. the positions of the elements that meet a criterion

# At which positions of *wood.density* is the value greater than 15?
which(wood.density > 15) # IMPORTANT: the result holds positions within
                         # *wood.density*, not the values stored there

# Those positions can then be used for numerical indexing. This returns the
# values of *wood.density* greater than 15:
wood.density[which(wood.density > 15)]

# And this returns the species of the individuals whose *wood.density* is
# greater than 15:
spp[which(wood.density > 15)]


which(spp == "sp_b") # Positions of the "sp_b" individuals
wood.density[which(spp == "sp_b")] # Wood densities of those individuals


### D. INDEXING WITH NAMES ########################################################
# When the elements of an object have names, those names (as character strings)
# can be placed inside the brackets *[]* to extract elements

# First give every element a name. The names used here are the numbers from
# the vector length down to 1, so the name of an element differs from its
# position:
wood.density
names(wood.density) <- rev(seq_along(wood.density))
wood.density

spp
names(spp) <- rev(seq_along(spp))
spp

wood.density["3"] # The element of *wood.density* whose name is "3"

## IMPORTANT: indexing by name and indexing by position are different things:
spp["15"] # The element of *spp* whose name is "15"
spp[15] # The element of *spp* in the fifteenth position

wood.density[spp == "sp_c"] # Logical indexing still works on named vectors:
                            # values of *wood.density* for "sp_c" in *spp*

# A character vector of names extracts several elements at once
wood.density[c("4", "8", "20")]
wood.density[c(4, 8, 20)] # NOT equivalent to the previous line: these are
                          # positions, not names

spp[c("4", "8", "20")]
spp[c(4, 8, 20)]


################################################################################
############################## EDITING OBJECTS #################################
################################################################################


### E. REPLACE VALUES WITHIN AN OBJECT #########################################
# Placing an indexed object on the left-hand side of an assignment overwrites
# only the selected elements; the rest of the object is left untouched

spp
spp[c(1, 4, 18)] # Current values in positions 1, 4 and 18


spp[c(1, 4, 18)] <- "spp_x" # The single value is recycled to all 3 positions
spp

spp <- c(spp, "sp_aslj") # Concatenation adds one more element at the end
spp


################################################################################
################## INDEXING DIFFERENT CLASSES OF OBJECTS #######################
################################################################################


### F. INDEXING VECTORS ########################################################
# All the approaches practiced so far (numerical, logical and by name) were
# applied to vectors: whatever is to be selected goes between square brackets

letters[c(1, 4, 6)] # First, fourth and sixth letters of the alphabet


### G. INDEXING MATRICES #######################################################

# To practice indexing matrices, read the file CarbonDioxideYearlyEmissions.txt
# (pick it in the dialog that opens). It holds CO2 emissions with one column
# per country and one row per year; the first column supplies the row names.

CO2 <- read.table(
  file = file.choose(), header = TRUE, row.names = 1, sep = "\t"
)

dim(CO2)
class(CO2) # *read.table* always returns a data frame

# Turn the data frame into a matrix
CO2 <- as.matrix(CO2)
class(CO2)
head(CO2)

# Matrices are usually indexed by row and column: inside the square brackets
# give the rows, then a comma, then the columns:
CO2[150, 30] # The value in row 150, column 30

rownames(CO2)[150] # Name of row 150
colnames(CO2)[30] # Name of column 30

## IMPORTANT: the row index always comes first, the column index second
CO2[200, 45]
CO2[45, 200]
CO2[240, 155]

# Several rows and/or columns can be requested together; the result is the
# sub-matrix formed by every combination of the rows and columns listed
CO2[c(200, 45, 240), c(45, 200, 155)]

## IMPORTANT: leaving the row (or column) index empty selects every row
## (or column). The comma must still be written. For example:

CO2[, 100] # All rows of column 100
CO2[, 240]

CO2[10, ] # All columns of row 10

CO2[-10, ] # Everything except row 10
CO2[, -100] # Everything except column 100

# Row and column names can also be used as indices
CO2[2010, ] # Error: the number 2010 is a position and there is no row 2010
CO2["2010", ] # No error: the string "2010" asks for the row NAMED "2010"

CO2["2010", "United.States"]

# How have CO2 emissions changed in the United States?
# The row names of CO2 are the years; convert them to numbers for plotting.
years <- as.numeric(rownames(CO2))

plot(CO2[, "United.States"] ~ years, col = "forestgreen")

# How have emissions changed in the 21st century?
# The same logical vector (years > 2000) indexes both the rows of the matrix
# and the vector of years, so the two stay aligned.
plot(CO2[years > 2000, "United.States"] ~ years[years > 2000],
  col = "forestgreen", type = "b")

# How do emissions compare among the United States, Mexico and Ecuador?
# The y-axis label describes what is plotted (it previously read "Issues").
plot(CO2[, "United.States"] ~ years, col = "forestgreen", type = "l",
  lwd = 4, ylab = "CO2 emissions")
points(CO2[, "Mexico"] ~ years, col = "navy", type = "l", lwd = 4)
points(CO2[, "Ecuador"] ~ years, col = "gold", type = "l", lwd = 4)

# Indexing on the left-hand side of an assignment replaces values. Here we
# deliberately mess up the data by setting the United States values after
# 1950 to 0 (this overwrites the matrix CO2 held in memory, not the file)
CO2[years > 1950, "United.States"] <- 0

plot(CO2[, "United.States"] ~ years, col = "forestgreen", type = "b")


## IMPORTANT: besides [row, column], a matrix also accepts a single index,
## which is the element number counted down the columns

# A 5 x 5 character matrix filled, column by column, with the letters a to y
M <- matrix(letters[1:25], ncol = 5)
colnames(M) <- paste0("var_", seq_len(ncol(M)))

M
class(M)
dim(M)

# Each pair of lines below picks one and the same element of M:
M[2, 2] # row 2, column 2 ...
M[7] # ... is the 7th element (5 in column 1, then 2 more)

M[5, 5] # row 5, column 5 ...
M[25] # ... is the 25th (last) element


### H. INDEXING DATA FRAMES ####################################################
# Data frames are indexed much like matrices, with two differences:

# 1. A data frame cannot be indexed by element number. A single index inside
# *[]* is treated as a column selection, not as an element number

M.df <- as.data.frame(M)

class(M)
M[2, 2]
M[7] # Matrix: the 7th element

class(M.df)
M.df[2, 2]
M.df[7] # Data frame: asks for column 7, an error unless M.df has 7+ columns

# 2. A column of a data frame can also be extracted by its variable name with
# *$* written after the name of the object. Matrices do not support *$*:

colnames(M)

M.df[, "var_2"]
M.df$var_2 # Same column, extracted with *$*

M[, "var_2"]
M$var_2 # Error: *$* cannot be used on a matrix

# Load the "iris" dataset that ships with the "datasets" package
# (http://en.wikipedia.org/wiki/Iris_flower_data_set) and open its help page.
data(iris)
help(iris)

class(iris)
dim(iris)

str(iris) # *str* prints a compact summary of the structure of an object

morpho <- iris[, 1:4]
morpho # Columns 1 to 4 hold the morphological measurements

species <- iris$Species
species # The remaining column holds the species of each flower

class(species)
levels(species)

species <- as.character(species) # From factor to plain character vector
class(species)
unique(species) # The distinct values present in the vector

# Plot of sepal length vs. sepal width, one colour per species.
# type = "n" draws the axes only; each species is then added with *points*,
# using logical indexing to pick its rows.
plot(morpho$Sepal.Length ~ morpho$Sepal.Width, type = "n")

points(morpho$Sepal.Length[species == "setosa"] ~
    morpho$Sepal.Width[species == "setosa"], col = "gold")

points(morpho$Sepal.Length[species == "versicolor"] ~
    morpho$Sepal.Width[species == "versicolor"], col = "navy")

points(morpho$Sepal.Length[species == "virginica"] ~
    morpho$Sepal.Width[species == "virginica"], col = "red")


### I. INDEXING LISTS ##########################################################

# A list whose elements are vectors of different lengths and types
L1 <- list(c(0.01, 3.1), c(0.02, 4.0, 0.1), c("a"), c(0.01, 2.9), c(0.03),
    c(0.04, 3.4, 8.2, 1.6))

class(L1)
L1

length(L1)
str(L1)

# Name the elements "elem_1", "elem_2", ... (*seq_along* is the safe way to
# number the elements of an object: unlike 1:length(x), it never counts
# backwards when the object is empty)
names(L1) <- paste("elem", seq_along(L1), sep = "_")
L1

L1.1 <- L1[1] # *[]* selects the first element of the list, as a list
L1.1
class(L1.1)

L1.1 <- L1[[1]] # *[[]]* selects the first element of the list, as the class
                # of the element itself

L1.1
class(L1.1)

L1[-1] # All elements except the first; the result is still a list
L1[1:3] # The first three elements; the result is still a list

L1["elem_1"] # In lists, indexing can also be done by element names
class(L1["elem_1"])

L1[["elem_1"]]
class(L1[["elem_1"]])

L1$elem_1 # *$* followed by a name works like *[[]]* with that name
class(L1$elem_1)

# Other manipulations:
# Indices can be chained; each one is applied to the result of the previous
L1[2:4][1] # First element of the sub-list made of elements 2 to 4
L1[1:3][-1] # Sub-list made of elements 1 to 3, without its first element

L1[[1]]
L1[[1]][2] # Second value of the vector stored in the first element

L1[[1]]
L1[[1]][1] # First value of the vector stored in the first element

L1[[1]] <- 3 # Replaces the whole first element of the list with the value 3
L1

L1[[2]]
L1[[2]] > 2 # Logical vector computed on the second element

L1[[2]][L1[[2]] < 1] # Values of the second element that are smaller than 1


### J. INDEXING OF AN OBJECT OF CLASS 'LM' #####################################

# Let's open a data file (BatsEnviroAmerica.txt) to create a linear model
# and practice indexing. This file contains data on species richness of bats
# across the New World in 100x100 km cells. There is also information on
# several environmental variables in each cell.

bat.data <- read.table(file = file.choose(), header = TRUE, sep = "\t")
dim(bat.data)
class(bat.data)

colnames(bat.data)

length(which(bat.data$richness == 0)) # We may use indexing to know how many cells
                                      # have species richness equal to 0 (empty cells)
length(which(bat.data$richness > 0))

# In the following analyses, we will remove all empty cells.
# A logical index is used rather than bat.data[-which(...), ]: when no cell
# has richness 0, *which* returns an empty vector, and a negative empty index
# selects NO rows, silently deleting all the data. *%in%* never returns NA,
# so rows with missing richness are kept, as they were by the *which* version.
dim(bat.data)
bat.data <- bat.data[!(bat.data$richness %in% 0), ]
dim(bat.data)

hist(bat.data$richness) # By indexing the column "richness", we can create
                        # a histogram of the values of bat richness per cell

plot(bat.data$richness ~ bat.data$temp_AVG)

# Draw the next two plots side by side. *par* returns the previous settings,
# which are saved and restored afterwards so that later plots use the whole
# graphics device again.
old.par <- par(mfrow = c(1, 2))
  plot(log(bat.data$richness) ~ bat.data$temp_AVG)
  plot(log(bat.data$richness) ~ bat.data$ele_RANGE)
par(old.par)

# Linear model of log richness on the two scaled environmental variables
model.1 <- lm(log(bat.data$richness) ~ scale(bat.data$temp_AVG) +
    scale(bat.data$ele_RANGE))

summary(model.1)

class(model.1)
str(model.1) # The class of this object is "lm", which is a particular kind of list

# Because the model is a list, it can be indexed like any other list
model.1[[1]] # Numerical indexing for extracting coefficients
model.1$coefficients # Indexing by names to extract coefficients

model.1$coefficients[1] # Indexing by name and number for the intercept
model.1$coefficients["(Intercept)"] # Double indexing by names for the intercept

model.1$coefficients[2] # Coefficient of the first predictor in the formula
model.1$coefficients[3] # Coefficient of the second predictor in the formula

model.1$residuals[1:10] # First ten residuals

model.1$fitted.values[1:10] # First ten fitted values

# Observed data with the fitted values of the model overlaid in red
plot(log(bat.data$richness) ~ bat.data$temp_AVG)
points(model.1$fitted.values ~ bat.data$temp_AVG, col = "red")