randomForest {r} package demo

Computational Biology, University of Vermont

### randomForest tutorial ###
### Computational Biology: University of Vermont 
### April 26, 2017
### Matthias Nevins 

### Notes on randomForest ###

# "randomForest" is an ensemble learning model
# (also refered to as a "machine learning" tool). 
# 
# The package uses a random forest algorithm to sample data randomly 
# and then construct and analyzes multiple random decision trees (ensemble). 
# The performance of each random decision tree is compared and results are used to
# determine the mode of a classification or the mean of a regression. 
# The model can be trained on a subset of the data and then tested for accuracy. 

## See notes on decision trees and random forest ## 
############ randomForest #######################

# Begin by installing the randomForest package
#install.packages("randomForest")
library("randomForest")
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## INPUTS ## 
#help("randomForest")
# relevant input arguments 
  # x = data frame or matrix of predictor variable(s)
  # y = response vector (if y is a factor, classification is assumed)
  # ntree = number of decision trees to grow 
  # mtry = number of variables randomly sampled as candidates at each split
    ## mtry should be a value lower than the total number of variables
    ## for classification trees, use roughly the square root of the number of variables
  # sampsize = number of rows that are randomly sampled for each tree 
    ## sampsize should be lower than the total number of rows in your data set
  # nodesize = minimum size of terminal nodes (the larger the number, the smaller each tree)
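  # A template call wiring these arguments together (a sketch only; "myData" 
  # is a hypothetical data frame whose first column, "y", is the response):
  # myFit <- randomForest(x = myData[, -1], y = myData$y,
  #                       ntree = 500,                          # trees to grow
  #                       mtry = floor(sqrt(ncol(myData) - 1)), # sqrt rule
  #                       sampsize = floor(0.8 * nrow(myData)), # rows per tree
  #                       nodesize = 5)                         # min terminal node size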

### OUTPUTS ##
# prediction by randomForest is the majority vote of the trees for a 
# classification, or their mean for a regression
# confusion matrix (classification) 
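# To see the ensemble vote directly, predict() on a randomForest object can 
# return every tree's prediction via predict.all = TRUE; the aggregate answer 
# is the majority vote across trees. A minimal sketch on the built-in iris 
# data (used in Example 1 below):
# rfDemo <- randomForest(Species ~ ., data = iris, ntree = 50)
# oneRow <- predict(rfDemo, iris[1, ], predict.all = TRUE)
# table(oneRow$individual) # per-tree votes; oneRow$aggregate is their mode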

EXAMPLE 1

#### EXAMPLE 1: Iris data ####
# Call to "iris" data set available in R 
#data(iris) 
#View(iris) 
#str(iris) # numeric predictor variables, Species variable is categorical
#summary(iris)

# Store iris in a new data frame 

Dframe <- iris

# Let's get started 
set.seed(123) # to get reproducible random results

# Split the iris data into a training set and a testing set:
# train the model with 70% of the data and test it 
# with the remaining 30% of the data. 
help(sample)
spl <- sample(2, nrow(Dframe), replace = TRUE, prob = c(0.7, 0.3)) # a 1 (train) or 2 (test) for each row
print(spl)
##   [1] 1 2 1 2 2 1 1 2 1 1 2 1 1 1 1 2 1 1 1 2 2 1 1 2 1 2 1 1 1 1 2 2 1 2 1
##  [36] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 2 2 1 1 1 1 1 2 1 2 2 2 1
##  [71] 2 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1
## [106] 2 2 1 1 1 2 1 1 2 2 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 2 1 1 2 2 2 1
## [141] 1 1 1 1 2 1 1 1 1 2
str(spl)
##  int [1:150] 1 2 1 2 2 1 1 2 1 1 ...
# define the training data 
trainData <- Dframe[spl==1,]
head(trainData)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 3           4.7         3.2          1.3         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 7           4.6         3.4          1.4         0.3  setosa
## 9           4.4         2.9          1.4         0.2  setosa
## 10          4.9         3.1          1.5         0.1  setosa
# Test data 
testData <- Dframe[spl==2,]
head(testData)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 2           4.9         3.0          1.4         0.2  setosa
## 4           4.6         3.1          1.5         0.2  setosa
## 5           5.0         3.6          1.4         0.2  setosa
## 8           5.0         3.4          1.5         0.2  setosa
## 11          5.4         3.7          1.5         0.2  setosa
## 16          5.7         4.4          1.5         0.4  setosa
# Generate the random forest from the training data 
# (note: mtry = 3 is above the square-root guideline here, since sqrt(4) = 2)
irisRF <- randomForest(Species ~ ., data = trainData, mtry = 3, ntree = 200, proximity = TRUE) 
help(randomForest)

# Print the random forest model: OOB error estimate and confusion matrix
print(irisRF)
## 
## Call:
##  randomForest(formula = Species ~ ., data = trainData, mtry = 3,      ntree = 200, proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 200
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 2.83%
## Confusion matrix:
##            setosa versicolor virginica class.error
## setosa         35          0         0  0.00000000
## versicolor      0         35         1  0.02777778
## virginica       0          2        33  0.05714286
# Confusion matrix for the training data (predict() without newdata returns 
# the OOB predictions)
table(predict(irisRF),trainData$Species) 
##             
##              setosa versicolor virginica
##   setosa         35          0         0
##   versicolor      0         35         2
##   virginica       0          1        33
# Plot random forest 
plot(irisRF)

# Look at the importance of the independent variables
importance(irisRF)
##              MeanDecreaseGini
## Sepal.Length        0.4378989
## Sepal.Width         0.2784602
## Petal.Length       39.0131656
## Petal.Width        30.1709471
# Plot importance 
varImpPlot(irisRF)

# Now use the fitted forest to predict the testing data
help("predict.randomForest")

irisPred <- predict(irisRF,newdata=testData)
print(irisPred)
##          2          4          5          8         11         16 
##     setosa     setosa     setosa     setosa     setosa     setosa 
##         20         21         24         26         31         32 
##     setosa     setosa     setosa     setosa     setosa     setosa 
##         34         37         50         53         58         59 
##     setosa     setosa     setosa versicolor versicolor versicolor 
##         65         67         68         69         71         73 
## versicolor versicolor versicolor versicolor  virginica  virginica 
##         84         87         88         89         97        104 
##  virginica versicolor versicolor versicolor versicolor  virginica 
##        106        107        111        114        115        118 
##  virginica versicolor  virginica  virginica  virginica  virginica 
##        126        132        134        137        138        139 
##  virginica  virginica  virginica  virginica  virginica  virginica 
##        145        150 
##  virginica  virginica 
## Levels: setosa versicolor virginica
table(irisPred, testData$Species)
##             
## irisPred     setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         11         1
##   virginica       0          3        14
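# Overall test accuracy is the proportion of counts on the diagonal 
# (a sketch, not run above):
# sum(diag(table(irisPred, testData$Species))) / nrow(testData)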
# Now, let's look at the margin: the proportion of (OOB) votes for the correct 
# class minus the maximum proportion for the other classes; a positive margin 
# means a correct classification
help("margin.randomForest")
plot(margin(irisRF)) # margins are computed from the OOB votes on the training data

EXAMPLE 2

#------------------------------------------------------
####### EXAMPLE 2: Using MASS package ##################

# Begin by importing two libraries 
library(randomForest)
library(MASS)

# set seed 
#help("set.seed")
set.seed(1234)

# Store data "birthwt" data set from the MASS package into a DataFrame 

#help("birthwt")
dFrame <- birthwt

# Identify the predictor variables and the target variable
# Identify the categorical target variable 
#head(dFrame)
#str(dFrame)
#View(dFrame)

# see how many unique values each variable takes 
# first, for "low"
length(unique(dFrame$low))
## [1] 2
hist(dFrame$low) # two unique values 

length(unique(dFrame$bwt))
## [1] 131
hist(dFrame$bwt) # continuous variable 

# another way to view unique values for all variables
apply(dFrame,2,function(x) length(unique(x))) 
##   low   age   lwt  race smoke   ptl    ht    ui   ftv   bwt 
##     2    24    75     3     2     4     2     2     6   131
help(apply) # MARGIN = 2 applies the function over columns
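# sapply() gives the same per-column summary with simpler syntax (a sketch):
# sapply(dFrame, function(x) length(unique(x)))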

# Now convert the categorical variables with as.factor() so that 
# randomForest treats them as categories rather than as numeric values

# Begin by placing the names of the variables you need to convert into a new 
# placeholder variable cVars (categorical variables)
cVars <- c("low", "race", "smoke", "ptl", "ht", "ui", "ftv")

# use a for loop to convert each flagged column of the data frame (dFrame) 
for(i in cVars){
  dFrame[,i]=as.factor(dFrame[,i])
}
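# An equivalent one-liner, if you prefer to avoid the loop (a sketch):
# dFrame[cVars] <- lapply(dFrame[cVars], as.factor)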
str(dFrame) # check that the flagged variables are now factors 
## 'data.frame':    189 obs. of  10 variables:
##  $ low  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ age  : int  19 33 20 21 18 21 22 17 29 26 ...
##  $ lwt  : int  182 155 105 108 107 124 118 103 123 113 ...
##  $ race : Factor w/ 3 levels "1","2","3": 2 3 1 1 1 3 1 3 1 1 ...
##  $ smoke: Factor w/ 2 levels "0","1": 1 1 2 2 2 1 1 1 2 2 ...
##  $ ptl  : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ht   : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ui   : Factor w/ 2 levels "0","1": 2 1 1 2 2 1 1 1 1 1 ...
##  $ ftv  : Factor w/ 6 levels "0","1","2","3",..: 1 4 2 3 1 1 2 2 2 1 ...
##  $ bwt  : int  2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
# the categorical variables have been converted to factors

# Load the "caTools" library to assist in splitting the 
# sampled data
#help("caTools")
library(caTools)

ivar <- sample.split(Y = dFrame$low, SplitRatio = 0.7) # TRUE = train, FALSE = test
ivar
##   [1]  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE
##  [12]  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [23]  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
##  [34]  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE
##  [45]  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE
##  [56]  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE
##  [67]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE
##  [78]  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE
##  [89]  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [100] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [111] FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE
## [122] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE
## [133]  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE
## [144]  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE
## [155]  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [166] FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE
## [177]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE
## [188]  TRUE  TRUE
# Now assign the split data to a "train" and a "test" data frame
trainD <- dFrame[ivar,]
testD <- dFrame[!ivar,]
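# For comparison, a plain (non-stratified) base-R split would look like this 
# (a sketch; sample.split above additionally preserves the proportion of "low"):
# idx    <- sample(nrow(dFrame), size = floor(0.7 * nrow(dFrame)))
# trainD <- dFrame[idx, ]
# testD  <- dFrame[-idx, ]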


#### FIT THE MODEL ####
# use the randomForest model to predict the target variable ("low") and to 
# gauge the importance of each predictor variable

ranForMod <- randomForest(low~., data = trainD, mtry=2, ntree=200)
print(ranForMod)
## 
## Call:
##  randomForest(formula = low ~ ., data = trainD, mtry = 2, ntree = 200) 
##                Type of random forest: classification
##                      Number of trees: 200
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 0.76%
## Confusion matrix:
##    0  1 class.error
## 0 90  1  0.01098901
## 1  0 41  0.00000000
# Error rate: 
# OOB (out-of-bag) error is the misclassification rate estimated on the 
# samples each tree did not see during training; a low rate = high tree strength 
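# The OOB error can also be inspected as a function of forest size via the 
# fitted object's err.rate matrix (one row per tree; the first column is the 
# overall OOB rate). A sketch:
# head(ranForMod$err.rate)
# plot(ranForMod) # plots the same error traces against the number of trees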

### Importance of each Variable ###
# A high mean decrease in Gini (or a high mean decrease in model accuracy)
# indicates a high importance of the variable within the model. 
# Note: "low" is defined as bwt < 2.5 kg, so keeping bwt as a predictor leaks 
# the response; this is why bwt dominates the importance scores below and why 
# the error rates in this example are nearly perfect. 

importance(ranForMod)
##       MeanDecreaseGini
## age          3.9675709
## lwt          3.7500810
## race         1.6278126
## smoke        0.6814731
## ptl          1.6076642
## ht           0.8557607
## ui           0.3148854
## ftv          2.4081853
## bwt         38.6020990
varImpPlot(ranForMod)
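# By default randomForest() records only MeanDecreaseGini. To also get the 
# permutation-based MeanDecreaseAccuracy, refit with importance = TRUE 
# (a sketch; "ranForMod2" is a new name introduced here):
# ranForMod2 <- randomForest(low ~ ., data = trainD, mtry = 2, ntree = 200,
#                            importance = TRUE)
# importance(ranForMod2) # now includes MeanDecreaseAccuracy
# varImpPlot(ranForMod2)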

### Predictions for model ### 

predictMod <- predict(ranForMod, testD, type = "class")
print(predictMod)
##  86  88  89  91  94  96  99 101 112 114 115 123 126 127 129 134 137 140 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
## 145 147 148 155 163 166 173 174 180 184 186 195 207 209 212 213 216 217 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
## 218 219 220   4  10  15  17  20  23  28  30  35  37  42  51  56  59  61 
##   0   0   0   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  65  79  82 
##   1   1   1 
## Levels: 0 1
tableP <- table(predictions=predictMod, actual=testD$low)
print(tableP)
##            actual
## predictions  0  1
##           0 39  0
##           1  0 18
sum(diag(tableP)) / sum(tableP) # overall accuracy: correct predictions / total
## [1] 1
## Plotting the ROC (Receiver Operating Characteristic) curve ###
# expressed graphically in 2-D:
# plots the relationship between sensitivity (the true positive rate) 
# and the false positive rate (FPR)

#install.packages("pROC")
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
pModProb <- predict(ranForMod, testD, type = "prob")
print(pModProb)
##         0     1
## 86  0.690 0.310
## 88  0.860 0.140
## 89  0.835 0.165
## 91  0.935 0.065
## 94  1.000 0.000
## 96  0.895 0.105
## 99  0.835 0.165
## 101 0.955 0.045
## 112 0.990 0.010
## 114 0.975 0.025
## 115 0.890 0.110
## 123 0.980 0.020
## 126 0.930 0.070
## 127 0.940 0.060
## 129 0.920 0.080
## 134 0.965 0.035
## 137 0.830 0.170
## 140 0.960 0.040
## 145 0.970 0.030
## 147 0.920 0.080
## 148 0.920 0.080
## 155 0.870 0.130
## 163 0.955 0.045
## 166 0.935 0.065
## 173 0.965 0.035
## 174 0.965 0.035
## 180 0.910 0.090
## 184 0.980 0.020
## 186 0.985 0.015
## 195 0.995 0.005
## 207 0.980 0.020
## 209 0.970 0.030
## 212 0.955 0.045
## 213 0.825 0.175
## 216 0.865 0.135
## 217 0.975 0.025
## 218 0.925 0.075
## 219 0.990 0.010
## 220 0.980 0.020
## 4   0.245 0.755
## 10  0.350 0.650
## 15  0.085 0.915
## 17  0.200 0.800
## 20  0.240 0.760
## 23  0.205 0.795
## 28  0.285 0.715
## 30  0.115 0.885
## 35  0.140 0.860
## 37  0.125 0.875
## 42  0.235 0.765
## 51  0.135 0.865
## 56  0.410 0.590
## 59  0.240 0.760
## 61  0.095 0.905
## 65  0.270 0.730
## 79  0.375 0.625
## 82  0.180 0.820
## attr(,"class")
## [1] "matrix" "votes"
auc <- auc(testD$low, pModProb[,2]) # area under the ROC curve (stored, not printed)
plot(roc(testD$low, pModProb[,2]))

# Finding the best mtry by "tuning" the model
bestmtry <- tuneRF(trainD, trainD$low, ntreeTry = 200, 
                   stepFactor = 1.5, 
                   improve = 0.01, 
                   trace = TRUE, 
                   plot = TRUE)
## mtry = 3  OOB error = 0.76% 
## Searching left ...
## mtry = 2     OOB error = 0% 
## 1 0.01 
## Searching right ...
## mtry = 4     OOB error = 0.76% 
## -Inf 0.01
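# Caveat: the trainD passed as x above still contains the response "low" 
# itself (and bwt, which defines low), which is why the tuned OOB error 
# reaches 0%. A sketch of tuning on the genuine predictors only:
# bestmtry2 <- tuneRF(trainD[, !(names(trainD) %in% c("low", "bwt"))],
#                     trainD$low, ntreeTry = 200,
#                     stepFactor = 1.5, improve = 0.01,
#                     trace = TRUE, plot = TRUE)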