# EXAMPLE 1
#### EXAMPLE 1: Iris data ####
# Uses the built-in "iris" data set: 150 rows, 4 numeric predictor
# variables, and one categorical target variable (Species).
#data(iris)
#View(iris)
#str(iris) # numeric predictor variables, Species variable is categorical
#summary(iris)
# Store iris in a new data frame
Dframe <- iris
# Let's get started
set.seed(123) # to get reproducible random results
# Split iris data into training data and testing data:
# train the model with ~70% of the rows and test it with the
# remaining ~30%. The assignment is drawn per row, so the split
# is only approximately 70/30.
help(sample)
# Draw label 1 or 2 for each row: 1 -> training set, 2 -> test set
spl <- sample(2,nrow(Dframe),replace=TRUE,prob=c(0.7,0.3))
print(spl)
## [1] 1 2 1 2 2 1 1 2 1 1 2 1 1 1 1 2 1 1 1 2 2 1 1 2 1 2 1 1 1 1 2 2 1 2 1
## [36] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 2 2 1 1 1 1 1 2 1 2 2 2 1
## [71] 2 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1
## [106] 2 2 1 1 1 2 1 1 2 2 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 2 1 1 2 2 2 1
## [141] 1 1 1 1 2 1 1 1 1 2
str(spl)
## int [1:150] 1 2 1 2 2 1 1 2 1 1 ...
# Training rows: those assigned label 1 above
trainData <- Dframe[spl==1,]
head(trainData)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
# Test rows: those assigned label 2
testData <- Dframe[spl==2,]
head(testData)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 2 4.9 3.0 1.4 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 16 5.7 4.4 1.5 0.4 setosa
# Load the randomForest package before the first model fit.
# (The original script only loaded it in Example 2 at the bottom of the
# file, so this call errored in a fresh session. Loading it again later
# is harmless.)
library(randomForest)
# Generate random forest with training data.
# mtry = 3: predictors tried at each split (4 are available);
# ntree = 200: number of trees;
# proximity = TRUE: also compute the case-proximity matrix.
irisRF <- randomForest(Species~.,data=trainData, mtry= 3, ntree=200,proximity=TRUE)
help(randomForest)
# Print Random Forest model and see the importance features
print(irisRF)
##
## Call:
## randomForest(formula = Species ~ ., data = trainData, mtry = 3, ntree = 200, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 200
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 2.83%
## Confusion matrix:
## setosa versicolor virginica class.error
## setosa 35 0 0 0.00000000
## versicolor 0 35 1 0.02777778
## virginica 0 2 33 0.05714286
# Confusion matrix for the training data.
# Note: predict() on a randomForest object with no newdata returns
# the out-of-bag predictions, not refitted training predictions.
table(predict(irisRF),trainData$Species)
##
## setosa versicolor virginica
## setosa 35 0 0
## versicolor 0 35 2
## virginica 0 1 33
# Plot error rates versus number of trees grown
plot(irisRF)
# Look at importance of the independent variables
importance(irisRF)
## MeanDecreaseGini
## Sepal.Length 0.4378989
## Sepal.Width 0.2784602
## Petal.Length 39.0131656
## Petal.Width 30.1709471
# Plot variable importance (petal measurements dominate, per above)
varImpPlot(irisRF)
# Now use the trained forest to predict classes for the test data
help("predict.randomForest")
irisPred <- predict(irisRF,newdata=testData)
print(irisPred)
## 2 4 5 8 11 16
## setosa setosa setosa setosa setosa setosa
## 20 21 24 26 31 32
## setosa setosa setosa setosa setosa setosa
## 34 37 50 53 58 59
## setosa setosa setosa versicolor versicolor versicolor
## 65 67 68 69 71 73
## versicolor versicolor versicolor versicolor virginica virginica
## 84 87 88 89 97 104
## virginica versicolor versicolor versicolor versicolor virginica
## 106 107 111 114 115 118
## virginica versicolor virginica virginica virginica virginica
## 126 132 134 137 138 139
## virginica virginica virginica virginica virginica virginica
## 145 150
## virginica virginica
## Levels: setosa versicolor virginica
# Confusion matrix of predicted vs actual species on the test set
table(irisPred, testData$Species)
##
## irisPred setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 11 1
## virginica 0 3 14
# Now, let's look at the margin, positive or negative;
# if positive it means correct classification.
# NOTE(review): margin() on a randomForest object computes margins from
# the forest's own (training/OOB) votes; it is unclear that the
# testData$Species argument has any effect here -- verify against
# help("margin.randomForest").
help("margin.randomForest")
plot(margin(irisRF,testData$Species))
# EXAMPLE 2
#------------------------------------------------------
####### EXAMPLE 2: Using MASS package ##################
# Begin by importing two libraries
library(randomForest)
library(MASS)
# set seed for reproducible sampling
#help("set.seed")
set.seed(1234)
# Store the "birthwt" data set from the MASS package into a data frame
#help("birthwt")
dFrame <- birthwt
# Identify predictor variables and the target variable;
# the categorical target variable is "low"
#head(dFrame)
#str(dFrame)
#View(dFrame)
# see how many unique values are within each variable
# for "low"
length(unique(dFrame$low))
## [1] 2
hist(dFrame$low) # two unique values -> binary target
length(unique(dFrame$bwt))
## [1] 131
hist(dFrame$bwt) # continuous variable
# another way to view unique values for all variables
# (apply() coerces the data frame to a matrix, which is acceptable
# here because we only count unique values per column)
apply(dFrame,2,function(x) length(unique(x)))
## low age lwt race smoke ptl ht ui ftv bwt
## 2 24 75 3 2 4 2 2 6 131
help(apply) # MARGIN = 2 in this call means "apply over columns"
# Now convert the categorical variables with as.factor so that
# randomForest treats them as categorical factors rather than as
# plain numeric codes.
# Begin by placing the variables to convert into a placeholder
# vector cVars (categorical variables)
# Columns of birthwt that are categorical and must become factors so
# randomForest treats them as classes, not numbers.
factor_cols <- c("low", "race", "smoke", "ptl", "ht", "ui", "ftv")
# Convert each listed column in place
for (colname in factor_cols) {
  dFrame[[colname]] <- as.factor(dFrame[[colname]])
}
str(dFrame) # confirm the conversion: the listed variables are now factors
## 'data.frame': 189 obs. of 10 variables:
## $ low : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ age : int 19 33 20 21 18 21 22 17 29 26 ...
## $ lwt : int 182 155 105 108 107 124 118 103 123 113 ...
## $ race : Factor w/ 3 levels "1","2","3": 2 3 1 1 1 3 1 3 1 1 ...
## $ smoke: Factor w/ 2 levels "0","1": 1 1 2 2 2 1 1 1 2 2 ...
## $ ptl : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
## $ ht : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ ui : Factor w/ 2 levels "0","1": 2 1 1 2 2 1 1 1 1 1 ...
## $ ftv : Factor w/ 6 levels "0","1","2","3",..: 1 4 2 3 1 1 2 2 2 1 ...
## $ bwt : int 2523 2551 2557 2594 2600 2622 2637 2637 2663 2665 ...
# categorical variables have been converted
# Acquire the "caTools" library to split the sampled data;
# sample.split keeps the class ratio of the target similar in
# both partitions (a stratified split)
#help("caTools")
library(caTools)
# TRUE -> row goes to the training set, FALSE -> test set;
# SplitRatio = 0.7 places ~70% of each class of "low" in training
ivar <- sample.split(Y = dFrame$low, SplitRatio = 0.7)
ivar
## [1] TRUE FALSE TRUE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE
## [12] TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [23] TRUE TRUE TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE
## [34] TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE
## [45] TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE
## [56] TRUE TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE FALSE
## [67] TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE
## [78] TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
## [89] TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [100] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [111] FALSE TRUE FALSE TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE
## [122] FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE
## [133] TRUE TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE FALSE TRUE
## [144] TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE
## [155] TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [166] FALSE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE
## [177] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE
## [188] TRUE TRUE
# Now assign the split data to a "train" and a "test" data frame
trainD <- dFrame[ivar,]
testD <- dFrame[!ivar,]
#### FIT THE MODEL ####
# use the randomForest model to estimate the importance of
# each variable as it relates to the target variable ("low")
# NOTE(review): in birthwt, "low" is defined as bwt < 2.5 kg, so keeping
# "bwt" among the predictors of low~. leaks the answer into the model.
# That is why bwt dominates the importance scores below and the test
# accuracy comes out exactly 1. For an honest model, use low ~ . - bwt.
ranForMod <- randomForest(low~., data = trainD, mtry=2, ntree=200)
print(ranForMod)
##
## Call:
## randomForest(formula = low ~ ., data = trainD, mtry = 2, ntree = 200)
## Type of random forest: classification
## Number of trees: 200
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 0.76%
## Confusion matrix:
## 0 1 class.error
## 0 90 1 0.01098901
## 1 0 41 0.00000000
# Error rate:
# OOB (out-of-bag) mis-classification rate;
# a low rate = high tree strength
### Importance of each Variable ###
# A high mean decrease in Gini (or a high mean decrease in accuracy)
# indicates a high importance of the variable within the model
importance(ranForMod)
## MeanDecreaseGini
## age 3.9675709
## lwt 3.7500810
## race 1.6278126
## smoke 0.6814731
## ptl 1.6076642
## ht 0.8557607
## ui 0.3148854
## ftv 2.4081853
## bwt 38.6020990
varImpPlot(ranForMod)
### Predictions for the model ###
# type = "class" returns the predicted factor level for each test row
predictMod <- predict(ranForMod, testD, type = "class")
print(predictMod)
## 86 88 89 91 94 96 99 101 112 114 115 123 126 127 129 134 137 140
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 145 147 148 155 163 166 173 174 180 184 186 195 207 209 212 213 216 217
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 218 219 220 4 10 15 17 20 23 28 30 35 37 42 51 56 59 61
## 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 65 79 82
## 1 1 1
## Levels: 0 1
# Confusion matrix: predictions (rows) versus actual values (columns)
tableP <- table(predictions=predictMod, actual=testD$low)
print(tableP)
## actual
## predictions 0 1
## 0 39 0
## 1 0 18
# Overall accuracy: correct predictions / total predictions
sum(diag(tableP)/sum(tableP))
## [1] 1
## Plotting the ROC (Receiver Operating Characteristic) curve ###
# expressed graphically in 2-D;
# plots the relationship between sensitivity (true positive rate)
# and the false positive rate (FPR) across decision thresholds
#install.packages("pROC")
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# type = "prob" returns the per-class vote fractions instead of labels
pModProb <- predict(ranForMod, testD, type = "prob")
print(pModProb)
## 0 1
## 86 0.690 0.310
## 88 0.860 0.140
## 89 0.835 0.165
## 91 0.935 0.065
## 94 1.000 0.000
## 96 0.895 0.105
## 99 0.835 0.165
## 101 0.955 0.045
## 112 0.990 0.010
## 114 0.975 0.025
## 115 0.890 0.110
## 123 0.980 0.020
## 126 0.930 0.070
## 127 0.940 0.060
## 129 0.920 0.080
## 134 0.965 0.035
## 137 0.830 0.170
## 140 0.960 0.040
## 145 0.970 0.030
## 147 0.920 0.080
## 148 0.920 0.080
## 155 0.870 0.130
## 163 0.955 0.045
## 166 0.935 0.065
## 173 0.965 0.035
## 174 0.965 0.035
## 180 0.910 0.090
## 184 0.980 0.020
## 186 0.985 0.015
## 195 0.995 0.005
## 207 0.980 0.020
## 209 0.970 0.030
## 212 0.955 0.045
## 213 0.825 0.175
## 216 0.865 0.135
## 217 0.975 0.025
## 218 0.925 0.075
## 219 0.990 0.010
## 220 0.980 0.020
## 4 0.245 0.755
## 10 0.350 0.650
## 15 0.085 0.915
## 17 0.200 0.800
## 20 0.240 0.760
## 23 0.205 0.795
## 28 0.285 0.715
## 30 0.115 0.885
## 35 0.140 0.860
## 37 0.125 0.875
## 42 0.235 0.765
## 51 0.135 0.865
## 56 0.410 0.590
## 59 0.240 0.760
## 61 0.095 0.905
## 65 0.270 0.730
## 79 0.375 0.625
## 82 0.180 0.820
## attr(,"class")
## [1] "matrix" "votes"
# Area under the ROC curve, using the predicted probability of class "1".
# NOTE(review): assigning to the name "auc" masks the pROC::auc()
# function in the global environment for the rest of the session --
# a different variable name (e.g. auc_value) would be safer.
auc <- auc(testD$low, pModProb[,2])
plot(roc(testD$low, pModProb[,2]))
# Finding the best mtry by "tuning" the model.
# BUG FIX: tuneRF's first argument (x) must contain predictors only.
# The original call passed the whole trainD -- which still contains the
# target "low" -- so the target was used to predict itself. That leak is
# why the recorded run below reports a 0% OOB error at mtry = 2.
# (bwt also remains a leaky predictor of low; see the model-fit note.)
bestmtry <- tuneRF(trainD[, names(trainD) != "low"], trainD$low,
                   ntreeTry = 200,
                   stepFactor = 1.5,
                   improve = 0.01,
                   trace = TRUE,
                   plot = TRUE)
## Recorded output from the ORIGINAL (leaky) call, kept for reference:
## mtry = 3 OOB error = 0.76%
## Searching left ...
## mtry = 2 OOB error = 0%
## 1 0.01
## Searching right ...
## mtry = 4 OOB error = 0.76%
## -Inf 0.01