> # load packages
require(plyr); require(reshape2)
> # Make immutable data frame
baseball_i <- idata.frame(baseball)
> # Example 1 - idata.frame more than twice as fast
system.time( replicate(50, ddply( baseball, "year", summarise, mean(rbi))) )
user system elapsed
14.812 0.252 15.065
> system.time( replicate(50, ddply( baseball_i, "year", summarise, mean(rbi))) )
user system elapsed
6.895 0.020 6.915
> # Example 2 - Bummer, colwise() does not work with idata.frames
> colwise(max, is.numeric) ( baseball ) # works
year stint g ab r h X2b X3b hr rbi sb cs bb so ibb hbp sh sf gidp
1 2007 4 165 705 177 257 64 28 73 NA NA NA 232 NA NA NA NA NA NA
> colwise(max, is.numeric) ( baseball_i ) # doesn't work
Error: is.data.frame(df) is not TRUE
> # Example 3 - idata.frame twice as fast
system.time( replicate(100, baseball[baseball$year == "1884", ] ) )
user system elapsed
1.155 0.048 1.203
> system.time( replicate(100, baseball_i[baseball_i$year == "1884", ] ) )
user system elapsed
0.598 0.011 0.609
> # Example 4 - idata.frame much faster (over 15 times faster in this run)
system.time( replicate(50, melt(baseball[, 1:4], id = 1) ) )
user system elapsed
16.587 1.169 17.755
> system.time( replicate(50, melt(baseball_i[, 1:4], id = 1) ) )
user system elapsed
0.869 0.196 1.065
> # And you can convert back to a regular data frame with as.data.frame()
d <- as.data.frame(baseball_i)
str(d)
'data.frame': 21699 obs. of 23 variables:
$ id : chr "ansonca01" "forceda01" "mathebo01" "startjo01" ...
$ year : int 1871 1871 1871 1871 1871 1871 1871 1872 1872 1872 ...
$ stint: int 1 1 1 1 1 1 1 1 1 1 ...
$ team : chr "RC1" "WS3" "FW1" "NY2" ...
$ lg : chr "" "" "" "" ...
$ g : int 25 32 19 33 29 29 29 46 37 25 ...
$ ab : int 120 162 89 161 128 146 145 217 174 130 ...
$ r : int 29 45 15 35 35 40 36 60 26 40 ...
$ h : int 39 45 24 58 45 47 37 90 46 53 ...
$ X2b : int 11 9 3 5 3 6 5 10 3 11 ...
$ X3b : int 3 4 1 1 7 5 7 7 0 0 ...
$ hr : int 0 0 0 1 3 1 2 0 0 0 ...
$ rbi : int 16 29 10 34 23 21 23 50 15 16 ...
$ sb : int 6 8 2 4 3 2 2 6 0 2 ...
$ cs : int 2 0 1 2 1 2 2 6 1 2 ...
$ bb : int 2 4 2 3 1 4 9 16 1 1 ...
$ so : int 1 0 0 0 0 1 1 3 1 0 ...
$ ibb : int NA NA NA NA NA NA NA NA NA NA ...
$ hbp : int NA NA NA NA NA NA NA NA NA NA ...
$ sh : int NA NA NA NA NA NA NA NA NA NA ...
$ sf : int NA NA NA NA NA NA NA NA NA NA ...
$ gidp : int NA NA NA NA NA NA NA NA NA NA ...
$ teamf: Factor w/ 132 levels "ALT","ANA","ARI",..: 99 127 51 79 35 35 122 86 16 122 ...
> # idata.frame doesn't work with the doBy package
require(doBy)
summaryBy(rbi ~ year, baseball_i, FUN=c(mean), na.rm=T)
Error in as.vector(x, mode) :
cannot coerce type 'environment' to vector of type 'any'
> # idata.frame does work with aggregate() in base R, though the speed gain is minimal,
# and aggregate() is considerably faster than ddply() either way
system.time( replicate(100, aggregate(rbi ~ year, baseball, mean) ) )
user system elapsed
4.117 0.423 4.541
> system.time( replicate(100, aggregate(rbi ~ year, baseball_i, mean) ) )
user system elapsed
3.908 0.383 4.291
> system.time( replicate(100, ddply( baseball_i, "year", summarise, mean(rbi)) ) )
user system elapsed
14.015 0.048 14.082