################################################################################
### R BASICS WORKSHOP                                                        ###
### PRESENTATION 6: OBJECT MANIPULATION - INDEXING                           ###
###                                                                          ###
### Center for Conservation and Sustainable Development                      ###
### Missouri Botanical Garden                                                ###
### Website: rbasicsworkshop.weebly.com                                      ###
################################################################################


### INTRODUCTION ###############################################################
# The indexing system is an efficient and flexible way to selectively access
# elements of an object. Indexing can be numerical, logical or by name. To index,
# we use square brackets *[]* and the *$* operator. In addition, this section
# will cover some other useful functions like *which*, *unique* and *str*.

# This part of the workshop is divided into the following sections:

## Types of Indexation ##
# A. Numerical Indexing
# B. Logical Indexing
# C. Indexing using *which*
# D. Indexing by name

# E. Replacing elements of an object

## Indexing for different classes of objects ##
# F. Indexing vectors
# G. Indexing matrices
# H. Indexing data frames
# I. Indexing lists
# J. Indexing other objects - e.g. with object of class "lm"





################################################################################
############################# TYPES OF INDEXATION ##############################
################################################################################


### A. NUMERICAL INDEXING ######################################################

# Example data: a character vector with the species of 20 individual trees
# (four species codes, each repeated five times in a row)
species.codes <- paste("sp", c("a", "b", "c", "d"), sep = "_")
spp <- rep(species.codes, each = 5)
spp

class(spp)
length(spp)

# A second vector holds one wood density measurement for each individual
wood.density <- c(
  8.0766242, 9.8493313, 2.9028278, 10.0433943, 0.1470901,
  12.5288041, 10.6120501, 14.6478501, 8.2003356, 17.9935623,
  12.4214381, 18.3749778, 24.0950527, 19.3236943, 15.5498672,
  22.0520207, 28.9908186, 17.5659344, 26.0387389, 14.1152262
)

wood.density

class(wood.density)
length(wood.density)



# A number inside square brackets is the position of the element to extract
spp[2]   # Second element of the vector
spp[10]  # Tenth element

wood.density[2]
wood.density[10]

# A vector of positions extracts several elements at once; a position may be
# repeated, in which case the same element is returned more than once
spp[c(7, 7, 7)]
wood.density[c(2, 5, 7)]


## IMPORTANT: a minus sign *-* in front of the positions extracts every element
## EXCEPT the ones indicated in brackets

spp[2]   # Only the second element
spp[-2]  # Every element but the second!

wood.density[c(2, 5, 7)]   # Elements 2, 5 and 7
wood.density[-c(2, 5, 7)]  # Everything except elements 2, 5 and 7



### B. LOGICAL INDEXING ########################################################
# A logical vector (TRUE / FALSE values) can be used within the brackets *[]*.
# Only the elements at positions holding TRUE are extracted.

wood.density
wood.density < 15  # A comparison returns a logical vector that is TRUE
                   # wherever *wood.density < 15*

wood.density[wood.density < 15]  # Values of *wood.density* that are below 15

spp[wood.density < 15]  # The condition can be built from a different vector:
                        # species of the individuals with wood density < 15

spp == "sp_b"
wood.density[spp == "sp_b"]  # Wood density of the individuals of "sp_b"



# Conditions can be combined with *|* (or) and *&* (and)
wood.density[spp == "sp_b" | spp == "sp_c"]  # Individuals of "sp_b" OR "sp_c"

wood.density[spp == "sp_b" & spp == "sp_c"]  # Nothing is extracted: no element
                                             # of *spp* equals both values

wood.density[spp == "sp_b" & wood.density > 15]  # "sp_b" AND density above 15


wood.density[wood.density < 15 & wood.density > 25]  # Nothing is extracted: a
                                                     # value cannot be below 15
                                                     # and above 25 at once

wood.density[wood.density > 15 & wood.density < 25]  # Values between 15 and 25

wood.density[wood.density < 15 | wood.density > 25]  # Values outside 15 to 25



### C. INDEXING USING THE FUNCTION *which* #####################################
# *which* takes a logical vector and returns the positions where it is TRUE

# At which positions of *wood.density* is the value greater than 15?
which(wood.density > 15)  # IMPORTANT: the result holds POSITIONS in the vector,
                          # not the values of *wood.density* themselves

# Feeding those positions back into *[]* extracts the values greater than 15
wood.density[which(wood.density > 15)]


# The same positions can index another vector: species of the individuals
# whose *wood.density* is greater than 15
spp[which(wood.density > 15)]


which(spp == "sp_b")                # Positions of the individuals of "sp_b"
wood.density[which(spp == "sp_b")]  # Their wood density values



### D. INDEXING BY NAME ########################################################
# Element names, written as character strings within *[]*, can also be used
# to extract elements

# This only works when the elements have names. Here the names are the numbers
# from the vector length down to 1, so names and positions do NOT coincide:
wood.density
names(wood.density) <- rev(seq_along(wood.density))
wood.density

spp
names(spp) <- rev(seq_along(spp))
spp


wood.density["3"]  # The element of *wood.density* whose NAME is "3"


## IMPORTANT: Note the difference between name and numeric indexing:
spp["15"]  # The element of *spp* whose name is "15"
spp[15]    # The element of *spp* in the fifteenth position


wood.density[spp == "sp_c"]  # Logical indexing still works on named vectors:
                             # values in *wood.density* that correspond to
                             # "sp_c" in *spp*


# Several elements can be extracted by name at once
wood.density[c("4", "8", "20")]  # Elements NAMED "4", "8" and "20"
wood.density[c(4, 8, 20)]        # Elements in POSITIONS 4, 8 and 20; these two
                                 # lines are NOT equivalent

spp[c("4", "8", "20")]
spp[c(4, 8, 20)]




### E. REPLACE VALUES WITHIN AN OBJECT #########################################
# Indexing on the left-hand side of an assignment overwrites the selected
# elements and leaves the rest of the object untouched

spp
spp[c(1, 4, 18)]  # Current values in positions 1, 4 and 18


spp[c(1, 4, 18)] <- "spp_x"  # The single new value is written to all three
spp

spp <- c(spp, "sp_aslj")  # *c* appends a new element at the end of the vector
spp





################################################################################
######################## INDEXATION OF DIFFERENT CLASSES #######################
################################################################################


### F. INDEXING VECTORS ########################################################
# Vectors were used above to practice every kind of indexing (numerical,
# logical, and by name). In all cases, what identifies the elements to extract
# goes between square brackets after the name of the vector.

letters[c(1, 4, 6)]  # *letters* is a built-in vector with the lowercase alphabet



### G. INDEXING MATRICES #######################################################

# Let's open a file (CarbonDioxideYearlyEmissions.txt) to practice indexing
# matrices. This file contains data on CO2 emissions by country (columns) per
# year (rows).
# *file.choose* opens a dialog to pick the file, so this line needs an
# interactive R session. The first column of the file becomes the row names.

CO2 <- read.table(file = file.choose(), header = TRUE, row.names = 1, sep = "\t")

dim(CO2)
class(CO2) # The *read.table* function always produces a data frame

# Let's transform the data frame into a matrix
CO2 <- as.matrix(CO2)
class(CO2)
head(CO2)

# The most common way to index a matrix is by row and column. Within square
# brackets you specify the rows needed, then a comma, then the columns needed
CO2[150, 30] # This extracts the value in row 150 and column 30


rownames(CO2)[150] # Name of row 150
colnames(CO2)[30]  # Name of column 30


## IMPORTANT: rows are always specified first, followed by columns
CO2[200, 45]
CO2[45, 200]
CO2[240, 155]


# You can also extract multiple columns and/or rows at the same time. The
# result is the sub-matrix with every combination of the requested rows and
# columns, not only the three cells extracted above
CO2[c(200, 45, 240), c(45, 200, 155)]


## IMPORTANT: When you want all the elements of a row or column,
## simply do not specify anything for rows or columns. For example:

CO2[, 100] # This extracts all rows and column 100
CO2[, 240]

CO2[10, ] # This extracts row 10 and all columns

CO2[-10, ] # This extracts all items except row 10
CO2[, -100] # This extracts all items except column 100


# Matrices can also be indexed by names of rows or columns
# The next line is expected to fail with "subscript out of bounds" because
# the number 2010 is read as a row POSITION and there is no row 2010. It is
# wrapped in *try* so that the error message is shown but the rest of the
# script keeps running when the file is executed with *source*.
try(CO2[2010, ])
CO2["2010", ] # This does NOT generate an error because we are asking for the
              # row with NAME "2010"

CO2["2010", "United.States"]

# How have CO2 emissions changed in the United States?
years <- as.numeric(rownames(CO2)) # Row names are the years, stored as text

plot(CO2[, "United.States"] ~ years, col = "forestgreen")


# How have emissions changed in the 21st century?
plot(CO2[years > 2000, "United.States"] ~ years[years > 2000],
  col = "forestgreen", type = "b")


# How do emissions compare among the United States, Mexico and Ecuador?
plot(CO2[, "United.States"] ~ years, col = "forestgreen", type = "l",
  lwd = 4, ylab = "CO2 emissions")
points(CO2[, "Mexico"] ~ years, col = "navy", type = "l", lwd = 4)
points(CO2[, "Ecuador"] ~ years, col = "gold", type = "l", lwd = 4)


# Indexing also allows replacing values. Here we (deliberately) mess up the
# data by changing the United States values after 1950 to 0
CO2[years > 1950, "United.States"] <- 0

plot(CO2[, "United.States"] ~ years, col = "forestgreen", type = "b")


## IMPORTANT: matrices can also be indexed by element number, not only by
## row and column. Elements are numbered down the first column, then down
## the second column, and so on.

# A 5 x 5 character matrix filled column by column with the letters "a" to "y"
M <- matrix(letters[-26], ncol = 5)
# *seq_len(ncol(M))* gives 1, 2, ..., number of columns
colnames(M) <- paste("var", seq_len(ncol(M)), sep = "_")

M
class(M)
dim(M)

# These commands extract the same element
M[2, 2] # Row 2 of column 2 ...
M[7]    # ... is the 7th element: 5 elements of column 1, plus 2

M[5, 5] # Last row of the last column ...
M[25]   # ... is the 25th (last) element




### H. INDEXING DATA FRAMES ####################################################
# Indexing data frames is very similar to indexing matrices except for
# these two aspects:

# 1. Data frames cannot be indexed by element number, only by rows and
# columns. A single number within *[]* is read as a COLUMN number instead.

M.df <- as.data.frame(M)

class(M)
M[2, 2]
M[7] # In a matrix, this is the 7th element

class(M.df)
M.df[2, 2]
# In a data frame, *[7]* asks for the 7th column. *M.df* has only 5 columns,
# so this is expected to fail with "undefined columns selected". It is wrapped
# in *try* so that the error message is shown but the script keeps running.
try(M.df[7])


# 2. The columns in a data frame can also be indexed by variable name using
# *$* after the name of the object. This cannot be done for matrices:

colnames(M)

M.df[, "var_2"]
M.df$var_2

M[, "var_2"]
# *$* is not defined for matrices, so this is expected to fail with
# "$ operator is invalid for atomic vectors" (again wrapped in *try*)
try(M$var_2)



# Load the "iris" dataset from the "datasets" package and open its help page
# (http://en.wikipedia.org/wiki/Iris_flower_data_set).
data(iris)
help(iris)

class(iris)
dim(iris)

str(iris)  # *str* reports a compact summary of the structure of an object


morpho <- iris[, 1:4]
morpho  # Columns 1 to 4 hold the morphological measurements

species <- iris$Species
species  # The last column holds the species name of each flower


class(species)   # *Species* is stored as a factor ...
levels(species)  # ... and these are its levels

species <- as.vector(species)  # Convert the factor into a character vector
class(species)
unique(species)  # Each distinct value, reported only once


# Plot of sepal length vs. sepal width, with one color per species.
# First draw empty axes (type = "n") that span the values of all flowers ...
plot(morpho$Sepal.Length ~ morpho$Sepal.Width, type = "n")

# ... then add the points of each species, selected by logical indexing
points(
  morpho$Sepal.Length[species == "setosa"] ~
    morpho$Sepal.Width[species == "setosa"],
  col = "gold"
)

points(
  morpho$Sepal.Length[species == "versicolor"] ~
    morpho$Sepal.Width[species == "versicolor"],
  col = "navy"
)

points(
  morpho$Sepal.Length[species == "virginica"] ~
    morpho$Sepal.Width[species == "virginica"],
  col = "red"
)



### I. INDEXING LISTS ##########################################################

# A list with six elements: five numeric vectors of different lengths and one
# character vector (the third element)
L1 <- list(c(0.01, 3.1), c(0.02, 4.0, 0.1), c("a"), c(0.01, 2.9), c(0.03),
    c(0.04, 3.4, 8.2, 1.6))

class(L1)
L1

length(L1)
str(L1)

# Name the elements "elem_1", "elem_2", ...; *seq_along(L1)* gives one number
# per element of the list
names(L1) <- paste("elem", seq_along(L1), sep = "_")
L1

L1.1 <- L1[1] # *[]* extracts the first element of the list as a list
L1.1
class(L1.1)

L1.1 <- L1[[1]] # *[[]]* extracts the first element of the list as the vector
                # it contains

L1.1
class(L1.1)


L1[-1]  # All elements except the first (still a list)
L1[1:3] # The first three elements (still a list)

L1["elem_1"] # In lists, indexing can also be done by element names
class(L1["elem_1"])

L1[["elem_1"]]
class(L1[["elem_1"]])

L1$elem_1 # *$* followed by a name works like *[[]]* with that name
class(L1$elem_1)


# Other manipulations:
# Indexing can be chained; each *[]* acts on the result of the previous one
L1[2:4][1]  # First element of the sub-list holding elements 2 to 4
L1[1:3][-1] # Sub-list holding elements 1 to 3, without its first element

L1[[1]]
L1[[1]][2] # Second value of the vector stored in the first element

L1[[1]]
L1[[1]][1] # First value of the vector stored in the first element

L1[[1]] <- 3 # Replaces the whole content of the first element
L1

L1[[2]]
L1[[2]] > 2 # Logical vector built from the vector stored in element 2

L1[[2]][L1[[2]] < 1] # Values below 1 within the second element



### J. INDEXING OF AN OBJECT OF CLASS 'LM' #####################################

# Let's open a data file (BatsEnviroAmerica.txt) to create a linear model
# and practice indexing.
# This file contains data on species richness of bats across the New World in
# 100x100 km cells. There is also information on several environmental variables
# in each cell.
# *file.choose* opens a dialog to pick the file, so this line needs an
# interactive R session.

bat.data <- read.table(file = file.choose(), header = TRUE, sep = "\t")
dim(bat.data)
class(bat.data)

colnames(bat.data)

length(which(bat.data$richness == 0)) # By indexing, we can know how many cells
                                      # have a richness of 0 (empty cells)
length(which(bat.data$richness > 0))


# For the following analyses, we will remove all empty cells
dim(bat.data)
empty.cells <- which(bat.data$richness == 0) # Row numbers of the empty cells
# IMPORTANT: negative indexing with an EMPTY vector of positions removes every
# row instead of none, so only drop rows when there is something to drop
if (length(empty.cells) > 0) {
  bat.data <- bat.data[-empty.cells, ]
}
dim(bat.data)

hist(bat.data$richness) # By indexing the column "richness", we can create a
                        # histogram of the values of bat richness per cell

plot(bat.data$richness ~ bat.data$temp_AVG)

# Draw two plots side by side. *par* returns the previous settings, which are
# stored and restored afterwards so that later plots use a single panel again
old.par <- par(mfrow = c(1, 2))
plot(log(bat.data$richness) ~ bat.data$temp_AVG)
plot(log(bat.data$richness) ~ bat.data$ele_RANGE)
par(old.par)

# Model of log richness as a function of two standardized (*scale*) variables.
# With the *data* argument, the formula refers to the columns of *bat.data*
# directly by name
model.1 <- lm(log(richness) ~ scale(temp_AVG) + scale(ele_RANGE),
    data = bat.data)

summary(model.1)

class(model.1)
str(model.1) # Although this is an object of class "lm", it has the structure of a list

model.1[[1]] # Numerical indexing for extracting coefficients
model.1$coefficients # Indexing by names to extract coefficients

model.1$coefficients[1] # Indexing by name and number for the intercept
model.1$coefficients["(Intercept)"] # Double indexing by names for the intercept

model.1$coefficients[2] # Coefficient of the first predictor in the formula
model.1$coefficients[3] # Coefficient of the second predictor in the formula

model.1$residuals[1:10] # First ten residuals

model.1$fitted.values[1:10] # First ten fitted values

# Observed values (default color) and values fitted by the model (red)
plot(log(bat.data$richness) ~ bat.data$temp_AVG)
points(model.1$fitted.values ~ bat.data$temp_AVG, col = "red")


library(car) # Open the package *car*
avPlots(model = model.1) # This produces a graph of "partial residuals"
                         # showing the effects of each variable