Different function types in R
z <- 1:10
# built-in functions ("prefix" functions)
mean(z)
## [1] 5.5
# "in-fix" functions
z + 100
## [1] 101 102 103 104 105 106 107 108 109 110
# equivalent "pre-fix" functions
`+`(z,100)
## [1] 101 102 103 104 105 106 107 108 109 110
# user-defined functions
# --------------------------------------
# FUNCTION my_fun
# description: calculate maximum of sin of x + x
# inputs: numeric vector
# outputs: 1-element numeric vector
########################################
my_fun <- function(x=runif(5)) {
z <- max(sin(x) + x)
return(z)
} # end of my_fun
# --------------------------------------
my_fun()
## [1] 1.813425
## [1] 9.455979
# anonymous functions
# unnamed, used for simple calculations, usually with a single input, by convention called x
function(x) x + 3 # anonymous function
## function(x) x + 3
function(x) x + 3 (10) # try to provide input
## function(x) x + 3 (10)
(function(x) x + 3) (10) # use of parentheses to call
## [1] 13
Functions that call functions
# first create some short user-defined functions
my_sum <- function(a,b) a + b
my_dif <- function(a,b) a - b
my_mult <- function(a,b) a*b
# we already know that built in functions can be called directly from within a function
funct_1 <- function(a=3,b=2) sum(a,b)
funct_1()
## [1] 5
funct_2 <- function(a=3,b=2) my_sum(a,b)
funct_2()
## [1] 5
funct_3 <- function(a=3,b=2) my_mult(a,b)
funct_3()
## [1] 6
# each time we want to use a different one of the "my" functions, we have to create a new function to call it.
# now pass data AND another function into a function as parameters:
algebra <- function(x=my_sum,a=3,b=2) x(a,b)
algebra(x=my_sum)
## [1] 5
## [1] 1
## [1] 6
# This also works for passing in built-in functions
algebra(x=sum)
## [1] 5
# but this will not work
# algebra(x=mean)
# problem: all the functions called must use same inputs
# most r functions work on a single vector
algebra_vec <- function(x=mean,a=1:10) x(a)
algebra_vec(x=sqrt)
## [1] 1.000000 1.414214 1.732051 2.000000 2.236068 2.449490 2.645751 2.828427
## [9] 3.000000 3.162278
## [1] 5.5
## [1] 3.02765
First Task: Apply a function to each row (or column of a matrix
m <- matrix(1:20,nrow=5,byrow=TRUE)
print(m)
## [,1] [,2] [,3] [,4]
## [1,] 1 2 3 4
## [2,] 5 6 7 8
## [3,] 9 10 11 12
## [4,] 13 14 15 16
## [5,] 17 18 19 20
for loop
solution
# create a vector of lists to hold the output
output <- vector("list",nrow(m))
str(output)
## List of 5
## $ : NULL
## $ : NULL
## $ : NULL
## $ : NULL
## $ : NULL
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
# run the function in a for loop for each
# row of the matrix
for (i in seq_len(nrow(m))) {
output[i] <- my_fun(m[i,])
}
print(output)
## [[1]]
## [1] 3.243198
##
## [[2]]
## [1] 8.989358
##
## [[3]]
## [1] 11.46343
##
## [[4]]
## [1] 15.7121
##
## [[5]]
## [1] 20.91295
apply
solution
# using the apply function to do the same thing
# apply(X,MARGIN,FUN,...)
# X = vector or an array (= matrix)
# MARGIN 1=row,2=column, c(1,2)=rows and columns
# ... optional arguments to FUN
# apply function my_fun to each row of m
row_out <- apply(X=m,
MARGIN=1,
FUN=my_fun)
print(row_out)
## [1] 3.243198 8.989358 11.463427 15.712097 20.912945
# apply function my_fun to each column of m
apply(m,2,my_fun)
## [1] 16.03860 17.24901 19.14988 20.91295
# apply function my_fun to each element of m
apply(m,c(1,2),my_fun)
## [,1] [,2] [,3] [,4]
## [1,] 1.841471 2.909297 3.141120 3.243198
## [2,] 4.041076 5.720585 7.656987 8.989358
## [3,] 9.412118 9.455979 10.000010 11.463427
## [4,] 13.420167 14.990607 15.650288 15.712097
## [5,] 16.038603 17.249013 19.149877 20.912945
apply
solutions with anonymous function
apply(m,1,function(x) max(sin(x) + x))
## [1] 3.243198 8.989358 11.463427 15.712097 20.912945
apply(m,2,function(x) max(sin(x) + x))
## [1] 16.03860 17.24901 19.14988 20.91295
apply(m,c(1,2),function(x) max(sin(x) + x))
## [,1] [,2] [,3] [,4]
## [1,] 1.841471 2.909297 3.141120 3.243198
## [2,] 4.041076 5.720585 7.656987 8.989358
## [3,] 9.412118 9.455979 10.000010 11.463427
## [4,] 13.420167 14.990607 15.650288 15.712097
## [5,] 16.038603 17.249013 19.149877 20.912945
What happens to output of variable length?
# first, modification of code to simply reshuffle the order of the elements
apply(m,1,sample)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 3 7 11 14 18
## [2,] 4 6 12 15 19
## [3,] 1 5 10 16 20
## [4,] 2 8 9 13 17
# caution! array output is each vector is in a column! to preserve original matrix dimensions, we need to transpose
t(apply(m,1,sample))
## [,1] [,2] [,3] [,4]
## [1,] 1 4 2 3
## [2,] 5 8 6 7
## [3,] 12 10 11 9
## [4,] 13 16 14 15
## [5,] 18 20 17 19
# function to choose a random number of elements
# from each row and pick them in random order
apply(m,1,function(x) x[sample(seq_along(x),size=sample(seq_along(x),size=1))])
## [[1]]
## [1] 4 2
##
## [[2]]
## [1] 6
##
## [[3]]
## [1] 12
##
## [[4]]
## [1] 13 16 14 15
##
## [[5]]
## [1] 17 20 19 18
# thus, output from apply can be a matrix, vector, or a list!
for loop
solution
output <- vector("list",ncol(df))
print(output)
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
for (i in seq_len(ncol(df))) {
output[i] <- sd(df[,i])/mean(df[,i])
}
print(output)
## [[1]]
## [1] 0.5700665
##
## [[2]]
## [1] 0.5765338
##
## [[3]]
## [1] 0.5915379
lapply
solution
# using lapply to do the same thing
# lapply(X,FUN,...)
# X is a vector (atomic or list)
# FUN is a function applied to each element of the list
# (note that a data frame is a list of vectors!)
# ... additional inputs to FUN
# note that output is always a list!
# note that names are retained from columns of data frame
summary_out <- lapply(X=df,
FUN=function(x) sd(x)/mean(x))
print(summary_out)
## $x
## [1] 0.5700665
##
## $y
## [1] 0.5765338
##
## $z
## [1] 0.5915379
# sapply tries to simplify output to vector or matrix (s(implify)apply)
# vapply requires specified output formats (v(erify)apply)
# these are special cases of lapply
Challenge: what if not all dataframe columns are of same type?
treatment <- rep(c("Control","Treatment"),each=(nrow(df)/2))
print(treatment)
## [1] "Control" "Control" "Control" "Control" "Control" "Control"
## [7] "Control" "Control" "Control" "Control" "Treatment" "Treatment"
## [13] "Treatment" "Treatment" "Treatment" "Treatment" "Treatment" "Treatment"
## [19] "Treatment" "Treatment"
df2 <- cbind(treatment,df)
head(df2)
## treatment x y z
## 1 Control 0.3392265 0.3468184 0.31756599
## 2 Control 0.9016315 0.0599094 0.27838238
## 3 Control 0.3074128 0.9749445 0.38565506
## 4 Control 0.6370175 0.5417987 0.09976567
## 5 Control 0.9239644 0.9230761 0.05844815
## 6 Control 0.2577916 0.5683634 0.37664110
for loop
solution
output2 <- vector("list",ncol(df2))
for (i in seq_len(ncol(df2))) {
if(!is.numeric(df2[,i])) next
output2[i] <- sd(df2[,i])/mean(df2[,i])
}
print(output2)
## [[1]]
## NULL
##
## [[2]]
## [1] 0.5700665
##
## [[3]]
## [1] 0.5765338
##
## [[4]]
## [1] 0.5915379
lapply
solution
lapply(df2,function(x) if(is.numeric(x)) sd(x)/mean(x))
## $treatment
## NULL
##
## $x
## [1] 0.5700665
##
## $y
## [1] 0.5765338
##
## $z
## [1] 0.5915379
# if you wanted the output as a vector, you could
# just unlist it:
z <- lapply(df2,function(x) if(is.numeric(x)) sd(x)/mean(x))
z <- unlist(z)
print(z) # note difference in output length!
## x y z
## 0.5700665 0.5765338 0.5915379
Third Task: split/apply/combine for groups in a data frame
for loop
solution
# use df2 for this, and split over two groups
print(df2)
## treatment x y z
## 1 Control 0.33922647 0.3468184 0.31756599
## 2 Control 0.90163148 0.0599094 0.27838238
## 3 Control 0.30741277 0.9749445 0.38565506
## 4 Control 0.63701748 0.5417987 0.09976567
## 5 Control 0.92396440 0.9230761 0.05844815
## 6 Control 0.25779164 0.5683634 0.37664110
## 7 Control 0.70278618 0.9810845 0.77763563
## 8 Control 0.26147183 0.6286650 0.97629934
## 9 Control 0.17905253 0.4118574 0.38720294
## 10 Control 0.70387882 0.1229274 0.81894170
## 11 Treatment 0.32304699 0.8263627 0.87636709
## 12 Treatment 0.89144666 0.6623240 0.25934992
## 13 Treatment 0.46937590 0.3071462 0.15581514
## 14 Treatment 0.11615511 0.1782750 0.76897682
## 15 Treatment 0.82666517 0.8736802 0.62136204
## 16 Treatment 0.54662088 0.3686728 0.83734998
## 17 Treatment 0.40139619 0.7524497 0.78154587
## 18 Treatment 0.29203800 0.1199348 0.14517409
## 19 Treatment 0.44798560 0.4917904 0.96049613
## 20 Treatment 0.04883076 0.2679076 0.56577596
g <- unique(df2$treatment)
print(g)
## [1] "Control" "Treatment"
out_g <- vector("list",length(g))
names(out_g) <- g
print(out_g)
## $Control
## NULL
##
## $Treatment
## NULL
for (i in seq_along(g)){
df_sub <- df2[df2$treatment==g[i],]
out_g[i] <- sd(df_sub$x)/mean(df_sub$x)
}
print(out_g)
## $Control
## [1] 0.5422297
##
## $Treatment
## [1] 0.6207971
tapply
solution
# using tapply to do the same thing (t(agged)apply)
# tapply(X,INDEX,FUN...)
# X is a vector (atomic or list) to be subset
# index is a list of factors (or character strings) # with one or more groups
# FUN is a function applied to each element of the different subsetted groups
# ... additional inputs to FUN
z <- tapply(X=df2$x,
INDEX=df2$treatment,
FUN= function(x) sd(x)/mean(x))
print(z)
## Control Treatment
## 0.5422297 0.6207971
Fourth Task: Replicate a stochastic process
# --------------------------------------
# FUNCTION pop_gen
# description: generate a stochastic population track of varying length
# inputs: number of time steps
# outputs: population track
########################################
pop_gen <- function(z=sample.int(n=10,size=1)) {
n <- runif(z)
return(n) # note returns a numeric vector of stochastic length
} # end of pop_gen
# --------------------------------------
pop_gen()
## [1] 0.47087167 0.77715957 0.41940079 0.34269929 0.02144105
for loop
solution
n_reps <- 20
list_out <- vector("list",n_reps)
for(i in seq_len(n_reps)){
list_out[i] <- pop_gen()
}
head(list_out)
## [[1]]
## [1] 0.81813
##
## [[2]]
## [1] 0.9377393
##
## [[3]]
## [1] 0.4615917
##
## [[4]]
## [1] 0.1489041
##
## [[5]]
## [1] 0.849458
##
## [[6]]
## [1] 0.03035621
## [1] 0.81813
replicate
solution
# using replicate to do the same thing
# replicate(n,expr)
# n is the number of times the operation is to be repeated
# expr is a function (base, or user-defined), or an expression (like an anonymous function, but without the function(x) header; just the bare code for execution).
z_out <- replicate(n=5,
expr=pop_gen())
print(z_out)
## [[1]]
## [1] 0.5009026
##
## [[2]]
## [1] 0.21892746 0.56898385 0.30724567 0.10753162 0.63140082 0.13409410 0.46176958
## [8] 0.06802756
##
## [[3]]
## [1] 0.22405529 0.62196035 0.72118236 0.68082775 0.83924651 0.09190961 0.03381506
## [8] 0.42078000 0.19644869
##
## [[4]]
## [1] 0.5193734 0.5982519 0.5785883
##
## [[5]]
## [1] 0.4583318 0.3772320 0.4943908 0.1950180 0.3092008 0.5193125 0.9014765
## [8] 0.4887908
Fifth Task: Sweep a function with all parameter combinations
# use previous example of parameter sweep for
# species area function S=cA^z
# this has parameters c, z, and A as inputs
# first, let's set up a data frame
# with all parameter combiinations
a_pars <- 1:10
c_pars <- c(100,150,125)
z_pars <- c(0.10,0.16,0.26,0.30)
df <- expand.grid(a=a_pars,c=c_pars,z=z_pars)
head(df)
## a c z
## 1 1 100 0.1
## 2 2 100 0.1
## 3 3 100 0.1
## 4 4 100 0.1
## 5 5 100 0.1
## 6 6 100 0.1
for loop
solution
df_out <-cbind(df,s=NA)
for (i in seq_len(nrow(df))) {
df_out$s[i] <- df$c[i]*(df$a[i]^df$z[i])
}
head(df_out)
## a c z s
## 1 1 100 0.1 100.0000
## 2 2 100 0.1 107.1773
## 3 3 100 0.1 111.6123
## 4 4 100 0.1 114.8698
## 5 5 100 0.1 117.4619
## 6 6 100 0.1 119.6231
mapply
solution
# using mapply to do the same thing (m(ultiple)apply)
# mapply(FUN,...,MoreArgs)
# FUN is the function to be used (note it is listed first!)
#...arguments to vectorize over(vectors or lists)
#MoreArgs list of additional arguments that are constant in all of the different runs
df_out$s <- mapply(FUN=function(a, c, z) c*(a^z), df$a,df$c,df$z)
head(df_out)
## a c z s
## 1 1 100 0.1 100.0000
## 2 2 100 0.1 107.1773
## 3 3 100 0.1 111.6123
## 4 4 100 0.1 114.8698
## 5 5 100 0.1 117.4619
## 6 6 100 0.1 119.6231
the “correct” solution
# no need for loops or mapply for this simple
# function. We can just vectorize it with a single line of code!
df_out$s <- df_out$c*(df_out$a^df_out$z)
head(df_out)
## a c z s
## 1 1 100 0.1 100.0000
## 2 2 100 0.1 107.1773
## 3 3 100 0.1 111.6123
## 4 4 100 0.1 114.8698
## 5 5 100 0.1 117.4619
## 6 6 100 0.1 119.6231