Matrices

# A matrix is an atomic vector carrying a dimension attribute,
# so its elements are arranged in rows and columns.

my_vec <- seq_len(12)

# fix the number of rows; values fill down the columns by default
m <- matrix(my_vec, nrow = 4)
print(m)

# fixing the number of columns instead gives the same 4 x 3 layout
m <- matrix(my_vec, ncol = 3)
print(m)

# byrow = TRUE fills across each row before moving to the next one
m <- matrix(my_vec, ncol = 3, byrow = TRUE)
print(m)

Lists

# A list is a vector too, but not an atomic one: each element
# can hold things of different types and different sizes

my_list <- list(1:10, matrix(1:8, nrow = 4, byrow = TRUE), letters[1:3], pi)
str(my_list)
print(my_list)


# using [] gives you a single item, which is of type list
my_list[4]
# no, can't subtract a number from a list! The error is intentional, so the
# call sits inside try(): the message is still shown, but a sourced or
# knitted script keeps running past this line
try(my_list[4] - 3)


# single brackets give you only the element in that slot,
# which is always of type list

# to grab the object itself, use [[]]
my_list[[4]]
my_list[[4]] - 3 # now we have the contents

# if a list has 10 elements it is like a train with 10 cars
# [[5]] gives you the contents of car #5
# [c(4,5,6)] gives you a little train with cars 4, 5, and 6

# once the double bracket is called, you can access individual elements as before

my_list[[2]]
my_list[[2]][4, 1]

# list items can be given names at the moment the list is built

my_list2 <- list(tester = FALSE, little_m = matrix(seq_len(9), nrow = 3))

# the dollar sign pulls out an element by its name

my_list2$little_m[2, 3] # row 2, column 3
my_list2$little_m       # the whole matrix
my_list2$little_m[2, ]  # second row, all columns
my_list2$little_m[2]    # what does this give you?

# unlist() flattens everything back into a single atomic vector;
# elements of mixed data types are coerced to a common type
unrolled <- unlist(my_list2)
print(unrolled)


# The most common use of list: output from a linear model is a list

library(ggplot2)
# simulate two independent uniform variables and regress one on the other
y_var <- runif(10)
x_var <- runif(10)
my_model <- lm(y_var ~ x_var)

# scatterplot of the raw data; qplot() is deprecated since ggplot2 3.4.0,
# so the plot is built with ggplot() + geom_point() instead
ggplot(data.frame(x_var, y_var), aes(x = x_var, y = y_var)) +
  geom_point()

# look at the printed output of my_model
print(my_model)

# the summary object holds the full results
print(summary(my_model))

# drill down into the summary to get what we want
str(summary(my_model))
summary(my_model)$coefficients
summary(my_model)$coefficients["x_var", "Pr(>|t|)"] # by row and column name
summary(my_model)$coefficients[2, 4]                # same cell, by position

# alternatively, flatten the summary with unlist

u <- unlist(summary(my_model))
print(u)

# the 2 x 4 coefficient table is flattened column by column:
# element 2 is the slope estimate, element 8 is the slope p-value
my_slope <- u$coefficients2
my_pval <- u$coefficients8

Data Frames

# a data frame is a list of equal-length vectors, each of which is a column

var_a <- 1:12
var_b <- rep(c("Con", "LowN", "HighN"), each = 4)
var_c <- runif(12)
d_frame <- data.frame(var_a, var_b, var_c)
print(d_frame)
str(d_frame)

# add another row with rbind
# make sure you add a list, with each item corresponding to a column

# alternative: build the new row as a one-row data frame
# new_data <- data.frame(list(var_a=13,var_b="HighN",var_c=0.668),stringsAsFactors=FALSE)
new_data <- list(var_a = 13, var_b = "HighN", var_c = 0.668)
print(new_data)
str(new_data)

# now bind them
d_frame <- rbind(d_frame, new_data)
str(d_frame)
tail(d_frame) # the appended row is the last one shown


# adding another column is a little easier

# alternative: wrap the new column in its own data frame
# new_var <- data.frame(var_d=runif(13))
new_var <- runif(13) # one value per row, including the row just added
d_frame <- cbind(d_frame, new_var)
head(d_frame)

Important Distinctions Between Lists and Matrices

# build a matrix and a data frame that share the same structure
z_mat <- matrix(1:30, ncol = 3, byrow = TRUE)
z_dframe <- as.data.frame(z_mat) # coerce the matrix to a data frame

str(z_mat)    # an atomic vector with 2 dimensions
str(z_dframe) # note horizontal layout of variables!

head(z_dframe) # note the automatically assigned variable names
head(z_mat)    # note identical layout

# a single element is referenced the same way in both
z_mat[3, 3]
z_dframe[3, 3]

# the same holds for a whole column

z_mat[, 3]
z_dframe[, 3]
z_dframe$V3 # note use of $ and named variable column
# and for a whole row
z_mat[3, ]
z_dframe[3, ] # note variable names and row number shown

# what happens if we reference only one dimension?

z_mat[2]    # takes the second element of atomic vector (column fill)
z_dframe[2] # takes second atomic vector (= column) from list
z_dframe["V2"]
z_dframe$V2

Eliminating missing values

# use complete.cases with atomic vector
zd <- runif(10)
zd[c(5, 7)] <- NA # knock out two values so there is something to clean
print(zd)

# logical vector: TRUE where a value is present, FALSE where it is NA
complete.cases(zd)

# clean them out; R names are case-sensitive, so this must be zd, not zD
zd[complete.cases(zd)]

which(!complete.cases(zd)) # find NA slots

# complete.cases on a matrix works row by row

m <- matrix(1:20, nrow = 5)
m[1, 1] <- NA
m[5, 4] <- NA
print(m)

# keep only the rows that have no NA in any column
m[complete.cases(m), ]

# now get complete cases for only certain columns!
m[complete.cases(m[, c(1, 2)]), ] # drops row 1
m[complete.cases(m[, c(2, 3)]), ] # no drops
m[complete.cases(m[, c(3, 4)]), ] # drops row 5
m[complete.cases(m[, c(1, 4)]), ] # drops rows 1 and 5

Techniques for assignments and subsetting matrices and data frames

# the same subsetting rules apply to each dimension of a matrix
m <- matrix(1:12, nrow = 3)
dimnames(m) <- list(
  paste0("Species", LETTERS[seq_len(nrow(m))]),
  paste0("Site", seq_len(ncol(m)))
)
print(m)

# subsetting by position
m[1:2, 3:4]
# the same subset by character strings (but no negative elements)
m[c("SpeciesA", "SpeciesB"), c("Site3", "Site4")]

# leave a blank before or after the comma to take full rows or columns
m[1:2, ]

m[, 3:4]

# logicals allow more complex subsetting

# e.g. select all columns for which the totals are > 15

# first look at the logical vector on its own
colSums(m) > 15
m[, colSums(m) > 15]


# e.g. select all rows for which the row total is 22
m[rowSums(m) == 22, ]

# note == for logical equal and != for logical NOT equal
m[rowSums(m) != 22, ]

# e.g., choose all rows for which numbers for site 1 are less than 3
# AND choose all columns for which the numbers for species A are less than 5

# first, inspect the logical for rows
m[, "Site1"] < 3

# use it as the row index, keeping all columns
m[m[, "Site1"] < 3, ]

# next, inspect the logical for columns
m["SpeciesA", ] < 5

# use it as the column index, keeping all rows
m[, m["SpeciesA", ] < 5]

# now apply both at once
m[m[, "Site1"] < 3, m["SpeciesA", ] < 5]

# and compare with the full m
print(m)


# caution! subscripting a single row returns a vector, not a matrix
z <- m[1, ]
print(z)
str(z)

# to keep the result as a matrix, add the drop=FALSE option

z2 <- m[1, , drop = FALSE]
print(z2)
str(z2)

# caution #2: always give both dimensions, or you will select a single matrix element

m2 <- matrix(runif(9), nrow = 3)
print(m2)
m2[2, ]

# with a single index, only the second element is pulled
m2[2]

# better to state both the row and the column index
m2[2, 1]
# logicals also work for assignment, not just for subsetting
m2[m2 > 0.6] <- NA
print(m2)


# A few changes for working with data frames:

data <- read.csv(file = "antcountydata.csv", header = TRUE, sep = ",")
str(data)

# the data frame is a list of vectors, so it is set up like a matrix
data[3, 2]

# you can specify just the column names

data_names <- data[c("state", "county")]
str(data_names) # must match the object assigned on the line above


# or in matrix style
data_names <- data[, c("county", "ecoregion")]
str(data_names)


# as before, with matrices, selecting only a single column changes it
# from a data frame to a vector
data_names <- data[, "county"]
str(data_names)

Data curation

Exporting and importing data

First create a tiny data set in Excel:

# comments at the top
# beaucoup metadata
ID, Treatment, Biomass, Notes
1, Control, 30.3, 
2, HighN, 13.0, 
3, HighN, NA, broken scale
4, Control, 35.3,

Use read.table to bring in data

# lines starting with "#" in the file are skipped as comments
my_data <- read.table(
  file = "path/to/data.csv",
  header = TRUE,
  sep = ",",
  comment.char = "#"
)

# look at the structure of the imported object
str(my_data)

# append a new column of random uniform values
my_data[["newVar"]] <- runif(4)
head(my_data)

Use write.table to export to a data file

# write.table() has no HEADER argument (and no ...), so passing one stops
# with an "unused argument" error; the header row is controlled by col.names
write.table(x = my_data,
            file = "Path/To/OutputFileName.csv",
            col.names = TRUE,
            sep = ",")

But this is not a good way to save or share data objects if we are working in R. Some researchers use the save() function, which preserves the whole environment, but once it is restored with load(), the variable names cannot be changed. It is better to use `saveRDS()`.

saveRDS(): useful when you are working only in R

# the .RDS suffix is not required, but it makes the file type clear
saveRDS(object = my_data, file = "Path/To/FileName.RDS")

This only saves a single R object as a binary, but remember, you can bundle up many things into a single list!

Use readRDS() to restore it.

readRDS()

# read the object back from the same path it was written to with saveRDS();
# the result can be assigned to any name
data_in <- readRDS("Path/To/FileName.RDS")