Different function types in R

z <- 1:10


# built-in functions ("prefix" functions)
mean(z)
## [1] 5.5
# "in-fix" functions
z + 100
##  [1] 101 102 103 104 105 106 107 108 109 110
# equivalent "pre-fix" functions
`+`(z,100)
##  [1] 101 102 103 104 105 106 107 108 109 110
# user-defined functions
# --------------------------------------
# FUNCTION my_fun
# description: calculate maximum of sin of x + x
# inputs: numeric vector
# outputs: 1-element numeric vector
########################################
my_fun <- function(x=runif(5)) {

z <- max(sin(x) + x)
return(z)

} # end of my_fun
# --------------------------------------
my_fun()
## [1] 1.813425
my_fun(z)
## [1] 9.455979
# anonymous functions
# unnamed, used for simple calculations, usually with a single input, by convention called x

function(x) x + 3 # anonymous function
## function(x) x + 3
function(x) x + 3 (10) # try to provide input
## function(x) x + 3 (10)
(function(x) x + 3) (10) # use of parentheses to call
## [1] 13

Functions that call functions

# first create some short user-defined functions
my_sum <- function(a,b) a + b
my_dif <- function(a,b) a - b
my_mult <- function(a,b) a*b

# we already know that built in functions can be called directly from within a function

funct_1 <- function(a=3,b=2) sum(a,b)
funct_1()
## [1] 5
funct_2 <- function(a=3,b=2) my_sum(a,b)
funct_2()
## [1] 5
funct_3 <- function(a=3,b=2) my_mult(a,b)
funct_3()
## [1] 6
# each time we want to use a different one of the "my" functions, we have to create a new function to call it. 

# now pass data AND another function into a function as parameters:

algebra <- function(x=my_sum,a=3,b=2) x(a,b)

algebra(x=my_sum)
## [1] 5
algebra(x=my_dif)
## [1] 1
algebra(x=my_mult)
## [1] 6
# This also works for passing in built-in functions
algebra(x=sum)
## [1] 5
# but this will not work
# algebra(x=mean)

# problem: all the functions called must use same inputs
# most r functions work on a single vector
algebra_vec <- function(x=mean,a=1:10) x(a)
  algebra_vec(x=sqrt)
##  [1] 1.000000 1.414214 1.732051 2.000000 2.236068 2.449490 2.645751 2.828427
##  [9] 3.000000 3.162278
  algebra_vec(x=mean)
## [1] 5.5
  algebra_vec(x=sd)
## [1] 3.02765

First Task: Apply a function to each row (or column of a matrix

m <- matrix(1:20,nrow=5,byrow=TRUE)
print(m)
##      [,1] [,2] [,3] [,4]
## [1,]    1    2    3    4
## [2,]    5    6    7    8
## [3,]    9   10   11   12
## [4,]   13   14   15   16
## [5,]   17   18   19   20

for loop solution

# create a vector of lists to hold the output
output <- vector("list",nrow(m))
str(output)
## List of 5
##  $ : NULL
##  $ : NULL
##  $ : NULL
##  $ : NULL
##  $ : NULL
print(output)
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
# run the function in a for loop for each
# row of the matrix

for (i in seq_len(nrow(m))) {
  output[i] <- my_fun(m[i,])
}

print(output)
## [[1]]
## [1] 3.243198
## 
## [[2]]
## [1] 8.989358
## 
## [[3]]
## [1] 11.46343
## 
## [[4]]
## [1] 15.7121
## 
## [[5]]
## [1] 20.91295

apply solution

# using the apply function to do the same thing
# apply(X,MARGIN,FUN,...)
# X = vector or an array (= matrix)
# MARGIN 1=row,2=column, c(1,2)=rows and columns
# ... optional arguments to FUN

# apply function my_fun to each row of m
row_out <- apply(X=m,
                 MARGIN=1,
                 FUN=my_fun)
print(row_out)
## [1]  3.243198  8.989358 11.463427 15.712097 20.912945
# apply function my_fun to each column of m
apply(m,2,my_fun)
## [1] 16.03860 17.24901 19.14988 20.91295
# apply function my_fun to each element of m
apply(m,c(1,2),my_fun)
##           [,1]      [,2]      [,3]      [,4]
## [1,]  1.841471  2.909297  3.141120  3.243198
## [2,]  4.041076  5.720585  7.656987  8.989358
## [3,]  9.412118  9.455979 10.000010 11.463427
## [4,] 13.420167 14.990607 15.650288 15.712097
## [5,] 16.038603 17.249013 19.149877 20.912945

apply solutions with anonymous function

apply(m,1,function(x) max(sin(x) + x))
## [1]  3.243198  8.989358 11.463427 15.712097 20.912945
apply(m,2,function(x) max(sin(x) + x))
## [1] 16.03860 17.24901 19.14988 20.91295
apply(m,c(1,2),function(x) max(sin(x) + x))
##           [,1]      [,2]      [,3]      [,4]
## [1,]  1.841471  2.909297  3.141120  3.243198
## [2,]  4.041076  5.720585  7.656987  8.989358
## [3,]  9.412118  9.455979 10.000010 11.463427
## [4,] 13.420167 14.990607 15.650288 15.712097
## [5,] 16.038603 17.249013 19.149877 20.912945

What happens to output of variable length?

# first, modification of code to simply reshuffle the order of the elements
apply(m,1,sample)
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    3    7   11   14   18
## [2,]    4    6   12   15   19
## [3,]    1    5   10   16   20
## [4,]    2    8    9   13   17
# caution! array output is each vector is in a column! to preserve original matrix dimensions, we need to transpose
t(apply(m,1,sample))
##      [,1] [,2] [,3] [,4]
## [1,]    1    4    2    3
## [2,]    5    8    6    7
## [3,]   12   10   11    9
## [4,]   13   16   14   15
## [5,]   18   20   17   19
# function to choose a random number of elements
# from each row and pick them in random order
apply(m,1,function(x) x[sample(seq_along(x),size=sample(seq_along(x),size=1))])
## [[1]]
## [1] 4 2
## 
## [[2]]
## [1] 6
## 
## [[3]]
## [1] 12
## 
## [[4]]
## [1] 13 16 14 15
## 
## [[5]]
## [1] 17 20 19 18
# thus, output from apply can be a matrix, vector, or a list!

Second Task: Apply a function to every column of a data frame

df <- data.frame(x=runif(20),y=runif(20),z=runif(20))

for loop solution

output <- vector("list",ncol(df))
print(output)
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
for (i in seq_len(ncol(df))) {
  output[i] <- sd(df[,i])/mean(df[,i])
}

print(output)
## [[1]]
## [1] 0.5700665
## 
## [[2]]
## [1] 0.5765338
## 
## [[3]]
## [1] 0.5915379

lapply solution

# using lapply to do the same thing
# lapply(X,FUN,...)
# X is a vector (atomic or list)
# FUN is a function applied to each element of the list
# (note that a data frame is a list of vectors!)
# ... additional inputs to FUN

# note that output is always a list!
# note that names are retained from columns of data frame
summary_out <- lapply(X=df,
                      FUN=function(x) sd(x)/mean(x))
print(summary_out)
## $x
## [1] 0.5700665
## 
## $y
## [1] 0.5765338
## 
## $z
## [1] 0.5915379
# sapply tries to simplify output to vector or matrix (s(implify)apply)
# vapply requires specified output formats (v(erify)apply)
# these are special cases of lapply

Challenge: what if not all dataframe columns are of same type?

treatment <- rep(c("Control","Treatment"),each=(nrow(df)/2))
print(treatment)
##  [1] "Control"   "Control"   "Control"   "Control"   "Control"   "Control"  
##  [7] "Control"   "Control"   "Control"   "Control"   "Treatment" "Treatment"
## [13] "Treatment" "Treatment" "Treatment" "Treatment" "Treatment" "Treatment"
## [19] "Treatment" "Treatment"
df2 <- cbind(treatment,df)
head(df2)
##   treatment         x         y          z
## 1   Control 0.3392265 0.3468184 0.31756599
## 2   Control 0.9016315 0.0599094 0.27838238
## 3   Control 0.3074128 0.9749445 0.38565506
## 4   Control 0.6370175 0.5417987 0.09976567
## 5   Control 0.9239644 0.9230761 0.05844815
## 6   Control 0.2577916 0.5683634 0.37664110

for loop solution

output2 <- vector("list",ncol(df2))
for (i in seq_len(ncol(df2))) {
  if(!is.numeric(df2[,i])) next
    output2[i] <- sd(df2[,i])/mean(df2[,i])
}
print(output2)
## [[1]]
## NULL
## 
## [[2]]
## [1] 0.5700665
## 
## [[3]]
## [1] 0.5765338
## 
## [[4]]
## [1] 0.5915379

lapply solution

lapply(df2,function(x) if(is.numeric(x)) sd(x)/mean(x))
## $treatment
## NULL
## 
## $x
## [1] 0.5700665
## 
## $y
## [1] 0.5765338
## 
## $z
## [1] 0.5915379
# if you wanted the output as a vector, you could 
# just unlist it:
z <- lapply(df2,function(x) if(is.numeric(x)) sd(x)/mean(x))
z <- unlist(z)

print(z) # note difference in output length!
##         x         y         z 
## 0.5700665 0.5765338 0.5915379

Third Task: split/apply/combine for groups in a data frame

for loop solution

# use df2 for this, and split over two groups
print(df2)
##    treatment          x         y          z
## 1    Control 0.33922647 0.3468184 0.31756599
## 2    Control 0.90163148 0.0599094 0.27838238
## 3    Control 0.30741277 0.9749445 0.38565506
## 4    Control 0.63701748 0.5417987 0.09976567
## 5    Control 0.92396440 0.9230761 0.05844815
## 6    Control 0.25779164 0.5683634 0.37664110
## 7    Control 0.70278618 0.9810845 0.77763563
## 8    Control 0.26147183 0.6286650 0.97629934
## 9    Control 0.17905253 0.4118574 0.38720294
## 10   Control 0.70387882 0.1229274 0.81894170
## 11 Treatment 0.32304699 0.8263627 0.87636709
## 12 Treatment 0.89144666 0.6623240 0.25934992
## 13 Treatment 0.46937590 0.3071462 0.15581514
## 14 Treatment 0.11615511 0.1782750 0.76897682
## 15 Treatment 0.82666517 0.8736802 0.62136204
## 16 Treatment 0.54662088 0.3686728 0.83734998
## 17 Treatment 0.40139619 0.7524497 0.78154587
## 18 Treatment 0.29203800 0.1199348 0.14517409
## 19 Treatment 0.44798560 0.4917904 0.96049613
## 20 Treatment 0.04883076 0.2679076 0.56577596
g <- unique(df2$treatment)
print(g)
## [1] "Control"   "Treatment"
out_g <- vector("list",length(g))
names(out_g) <- g
print(out_g)
## $Control
## NULL
## 
## $Treatment
## NULL
for (i in seq_along(g)){
  df_sub <- df2[df2$treatment==g[i],]
  out_g[i] <- sd(df_sub$x)/mean(df_sub$x)
}
print(out_g)
## $Control
## [1] 0.5422297
## 
## $Treatment
## [1] 0.6207971

tapply solution

# using tapply to do the same thing (t(agged)apply)
# tapply(X,INDEX,FUN...)
# X is a vector (atomic or list) to be subset
# index is a list of factors (or character strings) # with one or more groups
# FUN is a function applied to each element of the different subsetted groups
# ... additional inputs to FUN

z <- tapply(X=df2$x,
            INDEX=df2$treatment,
            FUN= function(x) sd(x)/mean(x))
print(z)
##   Control Treatment 
## 0.5422297 0.6207971

Fourth Task: Replicate a stochastic process

# --------------------------------------
# FUNCTION pop_gen
# description: generate a stochastic population track of varying length
# inputs: number of time steps
# outputs: population track
########################################
pop_gen <- function(z=sample.int(n=10,size=1)) {

n <- runif(z)

return(n) # note returns a numeric vector of stochastic length

} # end of pop_gen
# --------------------------------------
pop_gen()
## [1] 0.47087167 0.77715957 0.41940079 0.34269929 0.02144105

for loop solution

n_reps <- 20
list_out <- vector("list",n_reps)
for(i in seq_len(n_reps)){
  list_out[i] <- pop_gen()
}
head(list_out)
## [[1]]
## [1] 0.81813
## 
## [[2]]
## [1] 0.9377393
## 
## [[3]]
## [1] 0.4615917
## 
## [[4]]
## [1] 0.1489041
## 
## [[5]]
## [1] 0.849458
## 
## [[6]]
## [1] 0.03035621
list_out[[1]]
## [1] 0.81813

replicate solution

# using replicate to do the same thing
# replicate(n,expr)
# n is the number of times the operation is to be repeated
# expr is a function (base, or user-defined), or an expression (like an anonymous function, but without the function(x) header; just the bare code for execution).

z_out <- replicate(n=5,
                   expr=pop_gen())
print(z_out)
## [[1]]
## [1] 0.5009026
## 
## [[2]]
## [1] 0.21892746 0.56898385 0.30724567 0.10753162 0.63140082 0.13409410 0.46176958
## [8] 0.06802756
## 
## [[3]]
## [1] 0.22405529 0.62196035 0.72118236 0.68082775 0.83924651 0.09190961 0.03381506
## [8] 0.42078000 0.19644869
## 
## [[4]]
## [1] 0.5193734 0.5982519 0.5785883
## 
## [[5]]
## [1] 0.4583318 0.3772320 0.4943908 0.1950180 0.3092008 0.5193125 0.9014765
## [8] 0.4887908

Fifth Task: Sweep a function with all parameter combinations

# use previous example of parameter sweep for 
# species area function S=cA^z
# this has parameters c, z, and A as inputs

# first, let's set up a data frame
# with all parameter combiinations
a_pars <- 1:10
c_pars <- c(100,150,125)
z_pars <- c(0.10,0.16,0.26,0.30)
df <- expand.grid(a=a_pars,c=c_pars,z=z_pars)
head(df)
##   a   c   z
## 1 1 100 0.1
## 2 2 100 0.1
## 3 3 100 0.1
## 4 4 100 0.1
## 5 5 100 0.1
## 6 6 100 0.1

for loop solution

df_out <-cbind(df,s=NA)

for (i in seq_len(nrow(df))) {
  df_out$s[i] <- df$c[i]*(df$a[i]^df$z[i])
}
head(df_out)
##   a   c   z        s
## 1 1 100 0.1 100.0000
## 2 2 100 0.1 107.1773
## 3 3 100 0.1 111.6123
## 4 4 100 0.1 114.8698
## 5 5 100 0.1 117.4619
## 6 6 100 0.1 119.6231

mapply solution

# using mapply to do the same thing (m(ultiple)apply)
# mapply(FUN,...,MoreArgs)
# FUN is the function to be used (note it is listed first!)
#...arguments to vectorize over(vectors or lists)
#MoreArgs list of additional arguments that are constant in all of the different runs
df_out$s <- mapply(FUN=function(a, c, z) c*(a^z), df$a,df$c,df$z)

head(df_out)
##   a   c   z        s
## 1 1 100 0.1 100.0000
## 2 2 100 0.1 107.1773
## 3 3 100 0.1 111.6123
## 4 4 100 0.1 114.8698
## 5 5 100 0.1 117.4619
## 6 6 100 0.1 119.6231

the “correct” solution

# no need for loops or mapply for this simple 
# function. We can just vectorize it with a single line of code!

df_out$s <- df_out$c*(df_out$a^df_out$z)
head(df_out)
##   a   c   z        s
## 1 1 100 0.1 100.0000
## 2 2 100 0.1 107.1773
## 3 3 100 0.1 111.6123
## 4 4 100 0.1 114.8698
## 5 5 100 0.1 117.4619
## 6 6 100 0.1 119.6231