Required file: This tutorial uses publicly available climate data collected by two different weather stations in the Rocky Mountains of Colorado, USA. The dataset is available here.
Install the required package and read in the data
#install.packages("plyr")
library(plyr)
# Read the climate records. 'stringsAsFactors = TRUE' is spelled out because
# R >= 4.0.0 no longer converts text columns to factors by default, and the
# 'str' output shown below lists StationName and DataSource as factors.
climate <- read.csv(file = "ClimateData.csv", stringsAsFactors = TRUE)
# Month and Year are used as grouping labels, so store them as factors
climate$Month <- as.factor(climate$Month)
climate$Year <- as.factor(climate$Year)
# Inspect the structure of the resulting data frame
str(climate)
## 'data.frame': 2166 obs. of 14 variables:
## $ StationName: Factor w/ 2 levels "BisonLake","NorthLostTrail": 1 1 1 1 1 1 1 1 1 1 ...
## $ DataSource : Factor w/ 1 level "SNOTEL": 1 1 1 1 1 1 1 1 1 1 ...
## $ Elevation : num 3316 3316 3316 3316 3316 ...
## $ Latitude : num 39.8 39.8 39.8 39.8 39.8 ...
## $ Longitude : num 107 107 107 107 107 ...
## $ Month : Factor w/ 12 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Day : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Year : Factor w/ 3 levels "2010","2011",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ MaxAirTemp : num -3 -5.3 -6.6 -6.5 -4.7 -4.8 -11.6 -6.2 -1.3 0.1 ...
## $ MinAirTemp : num -13.3 -10.8 -14.5 -14.6 -9.4 -13.5 -18.6 -15.7 -9.8 -9.1 ...
## $ AvgAirTemp : num -7.7 -8.2 -11.4 -9.9 -7.2 -7.4 -15.3 -9.6 -6.1 -5.9 ...
## $ Precip : int 3 8 0 0 3 0 0 0 3 0 ...
## $ AccumPrecip: int 315 318 325 325 325 328 328 328 328 330 ...
## $ Snowdepth : int 1190 1140 1220 1140 1140 1190 1220 1220 1190 1140 ...
Using ‘ddply’: a simple example
# Split 'climate' by month and compute the mean air temperature of each piece.
# The result is not assigned, so it is printed straight to the console.
ddply(
  .data = climate,                  # data frame to split
  .variables = "Month",             # column that defines the subsets
  .fun = function(monthRows) {      # applied once per subset
    with(monthRows, mean(AvgAirTemp))
  }
)
## Month V1
## 1 1 -6.8715054
## 2 2 -7.6479290
## 3 3 -2.1327957
## 4 4 0.5316667
## 5 5 4.4129032
## 6 6 11.3645714
## 7 7 13.1733696
## 8 8 12.5160920
## 9 9 9.2017143
## 10 10 3.0005376
## 11 11 -3.1505556
## 12 12 -6.2610811
# Same calculation, but each subset returns a one-row data frame so that the
# result column gets a descriptive name
monthlyData <- ddply(
  .data = climate,
  .variables = "Month",
  .fun = function(monthRows) {
    data.frame(MeanAirTemp = mean(monthRows$AvgAirTemp))
  }
)
print(monthlyData)
## Month MeanAirTemp
## 1 1 -6.8715054
## 2 2 -7.6479290
## 3 3 -2.1327957
## 4 4 0.5316667
## 5 5 4.4129032
## 6 6 11.3645714
## 7 7 13.1733696
## 8 8 12.5160920
## 9 9 9.2017143
## 10 10 3.0005376
## 11 11 -3.1505556
## 12 12 -6.2610811
Using ‘ddply’ to subset data based on multiple factors and perform a calculation
# Split by two columns at once: one subset per month-year combination, each
# reduced to its mean air temperature
monthYearData <- ddply(
  .data = climate,
  .variables = c("Month", "Year"),
  .fun = function(monthYearRows) {
    data.frame(MeanAirTemp = mean(monthYearRows$AvgAirTemp))
  }
)
print(monthYearData)
## Month Year MeanAirTemp
## 1 1 2010 -6.2516129
## 2 1 2011 -8.3516129
## 3 1 2012 -6.0112903
## 4 2 2010 -7.6535714
## 5 2 2011 -8.1000000
## 6 2 2012 -7.2137931
## 7 3 2010 -3.2661290
## 8 3 2011 -2.5500000
## 9 3 2012 -0.5822581
## 10 4 2010 -0.0850000
## 11 4 2011 -0.9433333
## 12 4 2012 2.6233333
## 13 5 2010 3.7225806
## 14 5 2011 2.9016129
## 15 5 2012 6.6145161
## 16 6 2010 11.0927273
## 17 6 2011 9.5516667
## 18 6 2012 13.4266667
## 19 7 2010 13.1819672
## 20 7 2011 12.9852459
## 21 7 2012 13.3500000
## 22 8 2010 11.2420000
## 23 8 2011 13.0838710
## 24 8 2012 12.9758065
## 25 9 2010 10.0690909
## 26 9 2011 8.4983333
## 27 9 2012 9.1100000
## 28 10 2010 3.4435484
## 29 10 2011 2.3532258
## 30 10 2012 3.2048387
## 31 11 2010 -4.9083333
## 32 11 2011 -3.7150000
## 33 11 2012 -0.8283333
## 34 12 2010 -4.5258065
## 35 12 2011 -6.9196721
## 36 12 2012 -7.3483871
Using ‘ddply’ to calculate multiple summary statistics
# For each month, summarise daily air temperature and precipitation by their
# mean and standard deviation; every subset yields one row with four columns
monthlyData <- ddply(
  .data = climate,
  .variables = "Month",
  .fun = function(monthRows) {
    with(monthRows, data.frame(
      meanAirTemp = mean(AvgAirTemp),
      sdAirTemp = sd(AvgAirTemp),
      meanPrecip = mean(Precip),
      sdPrecip = sd(Precip)
    ))
  }
)
print(monthlyData)
## Month meanAirTemp sdAirTemp meanPrecip sdPrecip
## 1 1 -6.8715054 4.188091 2.919355 5.148492
## 2 2 -7.6479290 4.176737 4.059172 5.977840
## 3 3 -2.1327957 4.502906 2.715054 4.934105
## 4 4 0.5316667 4.608730 4.550000 6.605631
## 5 5 4.4129032 4.285922 2.403226 5.096240
## 6 6 11.3645714 3.453551 0.720000 3.029814
## 7 7 13.1733696 2.296492 2.385870 5.623722
## 8 8 12.5160920 2.023411 1.735632 4.067238
## 9 9 9.2017143 2.823108 1.577143 4.588033
## 10 10 3.0005376 4.557495 2.666667 6.232146
## 11 11 -3.1505556 4.967127 3.255556 6.708722
## 12 12 -6.2610811 4.864646 4.378378 7.978066
Using ‘summarise’ within ‘ddply’
# Mean air temperature per month, this time letting 'summarise' build the
# output columns instead of writing an anonymous function
monthlyData <- ddply(
  .data = climate,                  # data frame to split
  .variables = "Month",             # column that defines the subsets
  .fun = summarise,                 # helper that evaluates the expressions below
  MeanAirTemp = mean(AvgAirTemp)    # evaluated within each subset
)
print(monthlyData)
## Month MeanAirTemp
## 1 1 -6.8715054
## 2 2 -7.6479290
## 3 3 -2.1327957
## 4 4 0.5316667
## 5 5 4.4129032
## 6 6 11.3645714
## 7 7 13.1733696
## 8 8 12.5160920
## 9 9 9.2017143
## 10 10 3.0005376
## 11 11 -3.1505556
## 12 12 -6.2610811
# Mean air temperature for every month-year combination, via 'summarise'
monthYearData <- ddply(
  .data = climate,
  .variables = c("Month", "Year"),
  .fun = summarise,
  MeanAirTemp = mean(AvgAirTemp)
)
print(monthYearData)
## Month Year MeanAirTemp
## 1 1 2010 -6.2516129
## 2 1 2011 -8.3516129
## 3 1 2012 -6.0112903
## 4 2 2010 -7.6535714
## 5 2 2011 -8.1000000
## 6 2 2012 -7.2137931
## 7 3 2010 -3.2661290
## 8 3 2011 -2.5500000
## 9 3 2012 -0.5822581
## 10 4 2010 -0.0850000
## 11 4 2011 -0.9433333
## 12 4 2012 2.6233333
## 13 5 2010 3.7225806
## 14 5 2011 2.9016129
## 15 5 2012 6.6145161
## 16 6 2010 11.0927273
## 17 6 2011 9.5516667
## 18 6 2012 13.4266667
## 19 7 2010 13.1819672
## 20 7 2011 12.9852459
## 21 7 2012 13.3500000
## 22 8 2010 11.2420000
## 23 8 2011 13.0838710
## 24 8 2012 12.9758065
## 25 9 2010 10.0690909
## 26 9 2011 8.4983333
## 27 9 2012 9.1100000
## 28 10 2010 3.4435484
## 29 10 2011 2.3532258
## 30 10 2012 3.2048387
## 31 11 2010 -4.9083333
## 32 11 2011 -3.7150000
## 33 11 2012 -0.8283333
## 34 12 2010 -4.5258065
## 35 12 2011 -6.9196721
## 36 12 2012 -7.3483871
# Several 'summarise' expressions can be supplied at once: here the monthly
# mean and standard deviation of air temperature
monthlyData <- ddply(
  .data = climate,
  .variables = "Month",
  .fun = summarise,
  meanAirTemp = mean(AvgAirTemp),
  sdAirTemp = sd(AvgAirTemp)
)
print(monthlyData)
## Month meanAirTemp sdAirTemp
## 1 1 -6.8715054 4.188091
## 2 2 -7.6479290 4.176737
## 3 3 -2.1327957 4.502906
## 4 4 0.5316667 4.608730
## 5 5 4.4129032 4.285922
## 6 6 11.3645714 3.453551
## 7 7 13.1733696 2.296492
## 8 8 12.5160920 2.023411
## 9 9 9.2017143 2.823108
## 10 10 3.0005376 4.557495
## 11 11 -3.1505556 4.967127
## 12 12 -6.2610811 4.864646
Using ‘transform’ and ‘mutate’ within ‘ddply’
# 'transform' keeps every input row: the data are split by month, the monthly
# mean is computed per subset, and it is attached to a copy of the input as a
# new column
x <- ddply(
  .data = climate,
  .variables = "Month",
  .fun = transform,
  MonthlyMeanTemp = mean(AvgAirTemp)
)
head(x)
## StationName DataSource Elevation Latitude Longitude Month Day Year
## 1 BisonLake SNOTEL 3316.224 39.76487 107.3568 1 1 2010
## 2 BisonLake SNOTEL 3316.224 39.76487 107.3568 1 2 2010
## 3 BisonLake SNOTEL 3316.224 39.76487 107.3568 1 3 2010
## 4 BisonLake SNOTEL 3316.224 39.76487 107.3568 1 4 2010
## 5 BisonLake SNOTEL 3316.224 39.76487 107.3568 1 5 2010
## 6 BisonLake SNOTEL 3316.224 39.76487 107.3568 1 6 2010
## MaxAirTemp MinAirTemp AvgAirTemp Precip AccumPrecip Snowdepth
## 1 -3.0 -13.3 -7.7 3 315 1190
## 2 -5.3 -10.8 -8.2 8 318 1140
## 3 -6.6 -14.5 -11.4 0 325 1220
## 4 -6.5 -14.6 -9.9 0 325 1140
## 5 -4.7 -9.4 -7.2 3 325 1140
## 6 -4.8 -13.5 -7.4 0 328 1190
## MonthlyMeanTemp
## 1 -6.871505
## 2 -6.871505
## 3 -6.871505
## 4 -6.871505
## 5 -6.871505
## 6 -6.871505
# 'mutate' behaves like 'transform', except that later expressions may refer
# to columns created by earlier ones in the same call (here the range is built
# from the two averages defined just before it)
x <- ddply(
  .data = climate,
  .variables = "Month",
  .fun = mutate,
  AvgMaxTemp = mean(MaxAirTemp),
  AvgMinTemp = mean(MinAirTemp),
  MonthlyMeanTempRange = AvgMaxTemp - AvgMinTemp
)
head(x)
## StationName DataSource Elevation Latitude Longitude Month Day Year
## 1 BisonLake SNOTEL 3316.224 39.76487 107.3568 1 1 2010
## 2 BisonLake SNOTEL 3316.224 39.76487 107.3568 1 2 2010
## 3 BisonLake SNOTEL 3316.224 39.76487 107.3568 1 3 2010
## 4 BisonLake SNOTEL 3316.224 39.76487 107.3568 1 4 2010
## 5 BisonLake SNOTEL 3316.224 39.76487 107.3568 1 5 2010
## 6 BisonLake SNOTEL 3316.224 39.76487 107.3568 1 6 2010
## MaxAirTemp MinAirTemp AvgAirTemp Precip AccumPrecip Snowdepth AvgMaxTemp
## 1 -3.0 -13.3 -7.7 3 315 1190 -1.437097
## 2 -5.3 -10.8 -8.2 8 318 1140 -1.437097
## 3 -6.6 -14.5 -11.4 0 325 1220 -1.437097
## 4 -6.5 -14.6 -9.9 0 325 1140 -1.437097
## 5 -4.7 -9.4 -7.2 3 325 1140 -1.437097
## 6 -4.8 -13.5 -7.4 0 328 1190 -1.437097
## AvgMinTemp MonthlyMeanTempRange
## 1 -11.52258 10.08548
## 2 -11.52258 10.08548
## 3 -11.52258 10.08548
## 4 -11.52258 10.08548
## 5 -11.52258 10.08548
## 6 -11.52258 10.08548
Using plyr for plotting
# Create side-by-side boxplots of the mean daily air temperatures, one panel
# per weather station
old_par <- par(mfrow = c(1, 2))  # par() returns the settings it replaces
# 'd_ply' discards return values, so it suits functions called only for their
# side effects (here: drawing a plot for each station's subset)
d_ply(climate,
      "StationName",
      function(station) {
        boxplot(station$AvgAirTemp,
                # as.character() makes the label work whether StationName is
                # stored as a factor or as plain text
                xlab = as.character(unique(station$StationName)),
                ylab = "Mean Daily Air Temperature (degrees C)")
      })
# Restore the previous layout so later base-graphics plots are not affected
par(old_par)
Using other types of functions within ‘ddply’
# Fit a separate linear regression of precipitation on air temperature for
# each month and collect the two fitted coefficients as one row per month
precipTemp <- ddply(
  .data = climate,
  .variables = "Month",
  .fun = function(monthRows) {
    fit <- lm(Precip ~ AvgAirTemp, data = monthRows)
    coefs <- coef(fit)
    names(coefs) <- c("Intercept", "Slope")
    coefs
  }
)
print(precipTemp)
## Month Intercept Slope
## 1 1 3.354372 0.06330741
## 2 2 5.467214 0.18410763
## 3 3 2.079644 -0.29792359
## 4 4 4.881196 -0.62293967
## 5 5 4.645158 -0.50804029
## 6 6 4.601663 -0.34155827
## 7 7 12.195870 -0.74468419
## 8 8 13.021219 -0.90168619
## 9 9 5.909191 -0.47078707
## 10 10 3.837858 -0.39032702
## 11 11 2.230940 -0.32521754
## 12 12 5.397384 0.16275233
Creating graphs using ‘ddply’ output
# Compute the monthly mean and standard deviation of air temperature; this
# summary table is the input for the graph drawn further down
monthlyData <- ddply(
  .data = climate,
  .variables = "Month",
  .fun = summarise,
  meanAirTemp = mean(AvgAirTemp),
  sdAirTemp = sd(AvgAirTemp)
)
print(monthlyData)
## Month meanAirTemp sdAirTemp
## 1 1 -6.8715054 4.188091
## 2 2 -7.6479290 4.176737
## 3 3 -2.1327957 4.502906
## 4 4 0.5316667 4.608730
## 5 5 4.4129032 4.285922
## 6 6 11.3645714 3.453551
## 7 7 13.1733696 2.296492
## 8 8 12.5160920 2.023411
## 9 9 9.2017143 2.823108
## 10 10 3.0005376 4.557495
## 11 11 -3.1505556 4.967127
## 12 12 -6.2610811 4.864646
library(ggplot2)
# Plot the monthly means as points with error bars spanning one standard
# deviation either side of the mean; the layers are added in a single chain
p <- ggplot(monthlyData, aes(x = Month, y = meanAirTemp, colour = Month)) +
  geom_point(position = position_dodge(width = 0.3), stat = "identity", size = 3) +
  geom_errorbar(
    aes(ymin = meanAirTemp - sdAirTemp, ymax = meanAirTemp + sdAirTemp),
    width = .1,
    position = position_dodge(.3)
  )
print(p)