Required file: This tutorial uses publicly available climate data collected by two weather stations in the Rocky Mountains of Colorado, USA. The dataset is available here; the code below reads it from a file named ClimateData.csv.

Install the required package and read in the data

#install.packages("plyr")
library(plyr)

# Read the weather-station records into a data frame.
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 no longer converts
# text columns to factors by default; with it, StationName and DataSource are
# factors (as in the str() listing below) on every R version.
climate <- read.csv(file = "ClimateData.csv", stringsAsFactors = TRUE)

# Store Month and Year as factors so they act as grouping categories
climate$Month <- as.factor(climate$Month)
climate$Year <- as.factor(climate$Year)

# Inspect the resulting column types
str(climate)
## 'data.frame':    2166 obs. of  14 variables:
##  $ StationName: Factor w/ 2 levels "BisonLake","NorthLostTrail": 1 1 1 1 1 1 1 1 1 1 ...
##  $ DataSource : Factor w/ 1 level "SNOTEL": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Elevation  : num  3316 3316 3316 3316 3316 ...
##  $ Latitude   : num  39.8 39.8 39.8 39.8 39.8 ...
##  $ Longitude  : num  107 107 107 107 107 ...
##  $ Month      : Factor w/ 12 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Day        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Year       : Factor w/ 3 levels "2010","2011",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ MaxAirTemp : num  -3 -5.3 -6.6 -6.5 -4.7 -4.8 -11.6 -6.2 -1.3 0.1 ...
##  $ MinAirTemp : num  -13.3 -10.8 -14.5 -14.6 -9.4 -13.5 -18.6 -15.7 -9.8 -9.1 ...
##  $ AvgAirTemp : num  -7.7 -8.2 -11.4 -9.9 -7.2 -7.4 -15.3 -9.6 -6.1 -5.9 ...
##  $ Precip     : int  3 8 0 0 3 0 0 0 3 0 ...
##  $ AccumPrecip: int  315 318 325 325 325 328 328 328 328 330 ...
##  $ Snowdepth  : int  1190 1140 1220 1140 1140 1190 1220 1220 1190 1140 ...

Using ‘ddply’: a simple example

# Split 'climate' by month and compute the mean air temperature of each piece.
# The function returns a bare number, so 'ddply' names the result column V1.
ddply(.data = climate,          # data frame to split
      .variables = "Month",     # column that defines the subsets
      .fun = function(monthRows) mean(monthRows$AvgAirTemp)) # run per subset
##    Month         V1
## 1      1 -6.8715054
## 2      2 -7.6479290
## 3      3 -2.1327957
## 4      4  0.5316667
## 5      5  4.4129032
## 6      6 11.3645714
## 7      7 13.1733696
## 8      8 12.5160920
## 9      9  9.2017143
## 10    10  3.0005376
## 11    11 -3.1505556
## 12    12 -6.2610811
# Same calculation, but the function now returns a one-row data frame so that
# the result column gets a meaningful name instead of V1
monthlyData <- ddply(climate,
                     .variables = "Month",
                     .fun = function(monthRows) {
                       data.frame(MeanAirTemp = mean(monthRows$AvgAirTemp))
                     })
print(monthlyData)
##    Month MeanAirTemp
## 1      1  -6.8715054
## 2      2  -7.6479290
## 3      3  -2.1327957
## 4      4   0.5316667
## 5      5   4.4129032
## 6      6  11.3645714
## 7      7  13.1733696
## 8      8  12.5160920
## 9      9   9.2017143
## 10    10   3.0005376
## 11    11  -3.1505556
## 12    12  -6.2610811

Using ‘ddply’ to subset data based on multiple factors and perform a calculation

# Passing several column names makes 'ddply' build one subset per
# month-year combination; the mean air temperature is computed for each
monthYearData <- ddply(climate,
                       .variables = c("Month", "Year"),
                       .fun = function(monthYearRows) {
                         data.frame(MeanAirTemp = mean(monthYearRows$AvgAirTemp))
                       })
print(monthYearData)
##    Month Year MeanAirTemp
## 1      1 2010  -6.2516129
## 2      1 2011  -8.3516129
## 3      1 2012  -6.0112903
## 4      2 2010  -7.6535714
## 5      2 2011  -8.1000000
## 6      2 2012  -7.2137931
## 7      3 2010  -3.2661290
## 8      3 2011  -2.5500000
## 9      3 2012  -0.5822581
## 10     4 2010  -0.0850000
## 11     4 2011  -0.9433333
## 12     4 2012   2.6233333
## 13     5 2010   3.7225806
## 14     5 2011   2.9016129
## 15     5 2012   6.6145161
## 16     6 2010  11.0927273
## 17     6 2011   9.5516667
## 18     6 2012  13.4266667
## 19     7 2010  13.1819672
## 20     7 2011  12.9852459
## 21     7 2012  13.3500000
## 22     8 2010  11.2420000
## 23     8 2011  13.0838710
## 24     8 2012  12.9758065
## 25     9 2010  10.0690909
## 26     9 2011   8.4983333
## 27     9 2012   9.1100000
## 28    10 2010   3.4435484
## 29    10 2011   2.3532258
## 30    10 2012   3.2048387
## 31    11 2010  -4.9083333
## 32    11 2011  -3.7150000
## 33    11 2012  -0.8283333
## 34    12 2010  -4.5258065
## 35    12 2011  -6.9196721
## 36    12 2012  -7.3483871

Using ‘ddply’ to calculate multiple summary statistics

# For each month, summarise daily air temperature and daily precipitation by
# their mean and standard deviation; each statistic becomes its own column
monthlyData <- ddply(climate,
                     .variables = "Month",
                     .fun = function(monthRows) {
                       temp <- monthRows$AvgAirTemp
                       precip <- monthRows$Precip

                       data.frame(meanAirTemp = mean(temp),
                                  sdAirTemp = sd(temp),
                                  meanPrecip = mean(precip),
                                  sdPrecip = sd(precip))
                     })
print(monthlyData)
##    Month meanAirTemp sdAirTemp meanPrecip sdPrecip
## 1      1  -6.8715054  4.188091   2.919355 5.148492
## 2      2  -7.6479290  4.176737   4.059172 5.977840
## 3      3  -2.1327957  4.502906   2.715054 4.934105
## 4      4   0.5316667  4.608730   4.550000 6.605631
## 5      5   4.4129032  4.285922   2.403226 5.096240
## 6      6  11.3645714  3.453551   0.720000 3.029814
## 7      7  13.1733696  2.296492   2.385870 5.623722
## 8      8  12.5160920  2.023411   1.735632 4.067238
## 9      9   9.2017143  2.823108   1.577143 4.588033
## 10    10   3.0005376  4.557495   2.666667 6.232146
## 11    11  -3.1505556  4.967127   3.255556 6.708722
## 12    12  -6.2610811  4.864646   4.378378 7.978066

Using ‘summarise’ within ‘ddply’

# Mean air temperature per month, this time using the 'summarise' helper:
# arguments after the helper are evaluated inside each subset, so columns can
# be referred to by bare name
monthlyData <- ddply(.data = climate,       # data frame to split
                     .variables = "Month",  # column that defines the subsets
                     .fun = summarise,      # helper applied to each subset
                     MeanAirTemp = mean(AvgAirTemp)) # output column = expression
print(monthlyData)
##    Month MeanAirTemp
## 1      1  -6.8715054
## 2      2  -7.6479290
## 3      3  -2.1327957
## 4      4   0.5316667
## 5      5   4.4129032
## 6      6  11.3645714
## 7      7  13.1733696
## 8      8  12.5160920
## 9      9   9.2017143
## 10    10   3.0005376
## 11    11  -3.1505556
## 12    12  -6.2610811
# Mean air temperature for every month-year combination, via 'summarise'
monthYearData <- ddply(.data = climate,
                       .variables = c("Month", "Year"),
                       .fun = summarise,
                       MeanAirTemp = mean(AvgAirTemp))
print(monthYearData)
##    Month Year MeanAirTemp
## 1      1 2010  -6.2516129
## 2      1 2011  -8.3516129
## 3      1 2012  -6.0112903
## 4      2 2010  -7.6535714
## 5      2 2011  -8.1000000
## 6      2 2012  -7.2137931
## 7      3 2010  -3.2661290
## 8      3 2011  -2.5500000
## 9      3 2012  -0.5822581
## 10     4 2010  -0.0850000
## 11     4 2011  -0.9433333
## 12     4 2012   2.6233333
## 13     5 2010   3.7225806
## 14     5 2011   2.9016129
## 15     5 2012   6.6145161
## 16     6 2010  11.0927273
## 17     6 2011   9.5516667
## 18     6 2012  13.4266667
## 19     7 2010  13.1819672
## 20     7 2011  12.9852459
## 21     7 2012  13.3500000
## 22     8 2010  11.2420000
## 23     8 2011  13.0838710
## 24     8 2012  12.9758065
## 25     9 2010  10.0690909
## 26     9 2011   8.4983333
## 27     9 2012   9.1100000
## 28    10 2010   3.4435484
## 29    10 2011   2.3532258
## 30    10 2012   3.2048387
## 31    11 2010  -4.9083333
## 32    11 2011  -3.7150000
## 33    11 2012  -0.8283333
## 34    12 2010  -4.5258065
## 35    12 2011  -6.9196721
## 36    12 2012  -7.3483871
# 'summarise' accepts several name = expression pairs; here each month gets
# both the mean and the standard deviation of air temperature
monthlyData <- ddply(.data = climate,
                     .variables = "Month",
                     .fun = summarise,
                     meanAirTemp = mean(AvgAirTemp),
                     sdAirTemp = sd(AvgAirTemp))
print(monthlyData)
##    Month meanAirTemp sdAirTemp
## 1      1  -6.8715054  4.188091
## 2      2  -7.6479290  4.176737
## 3      3  -2.1327957  4.502906
## 4      4   0.5316667  4.608730
## 5      5   4.4129032  4.285922
## 6      6  11.3645714  3.453551
## 7      7  13.1733696  2.296492
## 8      8  12.5160920  2.023411
## 9      9   9.2017143  2.823108
## 10    10   3.0005376  4.557495
## 11    11  -3.1505556  4.967127
## 12    12  -6.2610811  4.864646

Using ‘transform’ and ‘mutate’ within ‘ddply’

# With 'transform' as the helper, 'ddply' keeps every input row and appends the
# per-subset result as a new column (here: the mean temperature of the row's
# month, repeated on each row of that month)
x <- ddply(.data = climate,
           .variables = "Month",
           .fun = transform,
           MonthlyMeanTemp = mean(AvgAirTemp))
head(x)
##   StationName DataSource Elevation Latitude Longitude Month Day Year
## 1   BisonLake     SNOTEL  3316.224 39.76487  107.3568     1   1 2010
## 2   BisonLake     SNOTEL  3316.224 39.76487  107.3568     1   2 2010
## 3   BisonLake     SNOTEL  3316.224 39.76487  107.3568     1   3 2010
## 4   BisonLake     SNOTEL  3316.224 39.76487  107.3568     1   4 2010
## 5   BisonLake     SNOTEL  3316.224 39.76487  107.3568     1   5 2010
## 6   BisonLake     SNOTEL  3316.224 39.76487  107.3568     1   6 2010
##   MaxAirTemp MinAirTemp AvgAirTemp Precip AccumPrecip Snowdepth
## 1       -3.0      -13.3       -7.7      3         315      1190
## 2       -5.3      -10.8       -8.2      8         318      1140
## 3       -6.6      -14.5      -11.4      0         325      1220
## 4       -6.5      -14.6       -9.9      0         325      1140
## 5       -4.7       -9.4       -7.2      3         325      1140
## 6       -4.8      -13.5       -7.4      0         328      1190
##   MonthlyMeanTemp
## 1       -6.871505
## 2       -6.871505
## 3       -6.871505
## 4       -6.871505
## 5       -6.871505
## 6       -6.871505
# 'mutate' also appends columns to a copy of the input, but evaluates its
# arguments one after another, so a later expression can use a column created
# by an earlier one (MonthlyMeanTempRange uses AvgMaxTemp and AvgMinTemp)
x <- ddply(.data = climate,
           .variables = "Month",
           .fun = mutate,
           AvgMaxTemp = mean(MaxAirTemp),
           AvgMinTemp = mean(MinAirTemp),
           MonthlyMeanTempRange = AvgMaxTemp - AvgMinTemp)
head(x)
##   StationName DataSource Elevation Latitude Longitude Month Day Year
## 1   BisonLake     SNOTEL  3316.224 39.76487  107.3568     1   1 2010
## 2   BisonLake     SNOTEL  3316.224 39.76487  107.3568     1   2 2010
## 3   BisonLake     SNOTEL  3316.224 39.76487  107.3568     1   3 2010
## 4   BisonLake     SNOTEL  3316.224 39.76487  107.3568     1   4 2010
## 5   BisonLake     SNOTEL  3316.224 39.76487  107.3568     1   5 2010
## 6   BisonLake     SNOTEL  3316.224 39.76487  107.3568     1   6 2010
##   MaxAirTemp MinAirTemp AvgAirTemp Precip AccumPrecip Snowdepth AvgMaxTemp
## 1       -3.0      -13.3       -7.7      3         315      1190  -1.437097
## 2       -5.3      -10.8       -8.2      8         318      1140  -1.437097
## 3       -6.6      -14.5      -11.4      0         325      1220  -1.437097
## 4       -6.5      -14.6       -9.9      0         325      1140  -1.437097
## 5       -4.7       -9.4       -7.2      3         325      1140  -1.437097
## 6       -4.8      -13.5       -7.4      0         328      1190  -1.437097
##   AvgMinTemp MonthlyMeanTempRange
## 1  -11.52258             10.08548
## 2  -11.52258             10.08548
## 3  -11.52258             10.08548
## 4  -11.52258             10.08548
## 5  -11.52258             10.08548
## 6  -11.52258             10.08548

Using plyr for plotting

# Draw one boxplot of daily mean air temperature per weather station, side by
# side in a single plotting window
oldPar <- par(mfrow = c(1, 2)) # 1 row x 2 columns; remember previous settings
d_ply(climate,
      "StationName",
      function(x) {
        # 'd_ply' discards whatever the function returns, so the function is
        # called purely for its plotting side effect
        boxplot(x$AvgAirTemp,
                # label each panel with its station; as.character() so the
                # label is plain text whether StationName is factor or character
                xlab = as.character(unique(x$StationName)),
                ylab = "Mean Daily Air Temperature (degrees C)")
      })
par(oldPar) # restore the layout so later base-graphics plots are not split

Using other types of functions within ‘ddply’

# Fit a separate linear regression of precipitation on air temperature for
# each month and collect the two fitted coefficients as columns
precipTemp <- ddply(climate,
                    .variables = "Month",
                    .fun = function(monthRows) {
                      fit <- lm(Precip ~ AvgAirTemp, data = monthRows)
                      estimates <- coef(fit)
                      # relabel "(Intercept)" / "AvgAirTemp" for the output
                      names(estimates) <- c("Intercept", "Slope")
                      estimates
                    })
print(precipTemp)
##    Month Intercept       Slope
## 1      1  3.354372  0.06330741
## 2      2  5.467214  0.18410763
## 3      3  2.079644 -0.29792359
## 4      4  4.881196 -0.62293967
## 5      5  4.645158 -0.50804029
## 6      6  4.601663 -0.34155827
## 7      7 12.195870 -0.74468419
## 8      8 13.021219 -0.90168619
## 9      9  5.909191 -0.47078707
## 10    10  3.837858 -0.39032702
## 11    11  2.230940 -0.32521754
## 12    12  5.397384  0.16275233

Creating graphs using ‘ddply’ output

# Build the per-month table (mean and standard deviation of air temperature)
# that the graph below is drawn from
monthlyData <- ddply(.data = climate,
                     .variables = "Month",
                     .fun = summarise,
                     meanAirTemp = mean(AvgAirTemp),
                     sdAirTemp = sd(AvgAirTemp))
print(monthlyData)
##    Month meanAirTemp sdAirTemp
## 1      1  -6.8715054  4.188091
## 2      2  -7.6479290  4.176737
## 3      3  -2.1327957  4.502906
## 4      4   0.5316667  4.608730
## 5      5   4.4129032  4.285922
## 6      6  11.3645714  3.453551
## 7      7  13.1733696  2.296492
## 8      8  12.5160920  2.023411
## 9      9   9.2017143  2.823108
## 10    10   3.0005376  4.557495
## 11    11  -3.1505556  4.967127
## 12    12  -6.2610811  4.864646
library(ggplot2)

# One coloured point per month at the mean temperature, with error bars
# spanning one standard deviation above and below the mean
p <- ggplot(monthlyData, aes(x = Month, y = meanAirTemp, colour = Month)) +
  geom_point(position = position_dodge(width = 0.3),
             stat = "identity",
             size = 3) +
  geom_errorbar(
    aes(ymin = meanAirTemp - sdAirTemp, ymax = meanAirTemp + sdAirTemp),
    width = .1,
    position = position_dodge(.3)
  )
print(p)