Повышение производительности ggplot2

Пакет ggplot2 — лучшая система построения графиков, с которой я когда-либо работал, за исключением того, что на больших наборах данных (~50 тыс. точек) его производительность оставляет желать лучшего. Я собираюсь предоставлять веб-аналитику через Shiny, используя ggplot2 в качестве бэкенда для построения графиков, но производительность меня не вполне устраивает, особенно по сравнению с базовой графикой. Мой вопрос: существуют ли конкретные способы повысить эту производительность?

Отправной точкой является следующий пример кода:

library(ggplot2)

# One observation per second over a full day, with sorted uniform values.
n <- 86400
dat <- data.frame(id = seq_len(n), val = sort(runif(n)))

# Open a fresh graphics device for the benchmarks to draw on.
dev.new()

# A shared base plot and the three geom combinations being compared.
gg_base <- ggplot(dat, aes(x = id, y = val))
gg_point <- gg_base + geom_point()
gg_line <- gg_base + geom_line()
gg_both <- gg_point + geom_line()

# ggplot2 timings, reported per step by benchplot().
benchplot(gg_point)
benchplot(gg_line)
benchplot(gg_both)

# Base graphics timings for the same data: points first, then a line.
system.time(plot(dat))
system.time(plot(dat, type = 'l'))

Я получаю следующие тайминги на моём MacPro с дисплеем Retina:

> benchplot(gg_point)
       step user.self sys.self elapsed
1 construct     0.000    0.000   0.000
2     build     0.321    0.078   0.398
3    render     0.271    0.088   0.359
4      draw     2.013    0.018   2.218
5     TOTAL     2.605    0.184   2.975
> benchplot(gg_line)
       step user.self sys.self elapsed
1 construct     0.000    0.000   0.000
2     build     0.330    0.073   0.403
3    render     0.622    0.095   0.717
4      draw     2.078    0.009   2.266
5     TOTAL     3.030    0.177   3.386
> benchplot(gg_both)
       step user.self sys.self elapsed
1 construct     0.000    0.000   0.000
2     build     0.602    0.155   0.757
3    render     0.866    0.186   1.051
4      draw     4.020    0.030   4.238
5     TOTAL     5.488    0.371   6.046
> system.time(plot(dat))
   user  system elapsed 
  1.133   0.004   1.138 
# Note that the timing below depended heavily on whether or not the graphics device
# was in view. Not in view made performance much, much better.
> system.time(plot(dat, type = 'l'))
   user  system elapsed 
  1.230   0.003   1.233

Дополнительная информация о моей настройке:

> sessionInfo()
R version 2.15.3 (2013-03-01)
Platform: x86_64-apple-darwin9.8.0/x86_64 (64-bit)

locale:
[1] C/UTF-8/C/C/C/C

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] ggplot2_0.9.3.1

loaded via a namespace (and not attached):
 [1] MASS_7.3-23        RColorBrewer_1.0-5 colorspace_1.2-1   dichromat_2.0-0   
 [5] digest_0.6.3       grid_2.15.3        gtable_0.1.2       labeling_0.1      
 [9] munsell_0.4        plyr_1.8           proto_0.3-10       reshape2_1.2.2    
[13] scales_0.2.3       stringr_0.6.2

#' Query per-interval average, maximum and minimum of a column
#'
#' Builds an SQL query that buckets the rows of `table` by time interval and
#' by `group`, runs it through `getquery()`, and decodes the bucket index back
#' into a `time` column.
#'
#' @param table name of the table
#' @param start start time/date
#' @param end end time/date
#' @param aggregate one of "days", "hours", "mins" or "weeks"
#' @param group grouping variable
#' @param column name of the target column (y axis)
#' @return The result of `getquery()` (presumably a data frame with columns
#'   `grouping`, `yavg`, `ymax`, `ymin` and `timediv`, as named in the query)
#'   with `grouping` converted to a factor and an added `time` column:
#'   `POSIXct` for "mins" and "hours", `Date` for "days" and "weeks".
#' @export
minmaxdata <- function(table, start, end,
                       aggregate = c("days", "hours", "mins", "weeks"),
                       group = 1, column) {
  # Dates: convert both bounds to whole milliseconds since the epoch.
  start <- round(unclass(as.POSIXct(start)) * 1000)
  end <- round(unclass(as.POSIXct(end)) * 1000)

  # Must aggregate: rejects anything outside the allowed choices.
  aggregate <- match.arg(aggregate)

  # Bucket width in milliseconds.
  mod <- switch(aggregate,
    "mins" = 1000 * 60,
    "hours" = 1000 * 60 * 60,
    "days" = 1000 * 60 * 60 * 24,
    "weeks" = 1000 * 60 * 60 * 24 * 7,
    stop("invalid aggregate value")
  )

  # We need to add the time difference between GMT and local time to make the
  # modulo work. The difference is requested in explicit units: subtracting
  # two POSIXct values yields a difftime whose unit (secs/mins/hours) is
  # chosen automatically from its magnitude, so its bare number cannot be
  # assumed to be hours. Rounding to whole minutes drops the sub-second
  # residue left by format(), which prints whole seconds by default.
  now <- Sys.time()
  gmt_wall_clock <- as.POSIXct(format(now, tz = "GMT"))
  offset_mins <- round(as.numeric(difftime(gmt_wall_clock, now, units = "mins")))
  delta <- 1000 * 60 * 60 * (24 - offset_mins / 60)

  # Form the query.
  # NOTE(review): table, column and group are pasted into the SQL verbatim;
  # confirm that callers never pass untrusted input here.
  query <- paste(
    "SELECT", group, "AS grouping, AVG(", column, ") AS yavg, MAX(", column,
    ") AS ymax, MIN(", column, ") AS ymin, ((CMilliseconds_g +", delta,
    ") DIV", mod, ") AS timediv FROM", table,
    "WHERE CMilliseconds_g BETWEEN", start, "AND", end,
    "GROUP BY", group, ", timediv;"
  )
  mydata <- getquery(query)

  # Data: turn the bucket index back into seconds since the epoch, undoing
  # the shift that was applied inside the query.
  mydata$time <- structure(
    mod * mydata[["timediv"]] / 1000 - delta / 1000,
    class = c("POSIXct", "POSIXt")
  )
  mydata$grouping <- as.factor(mydata$grouping)

  # Round timestamps. round() on a date-time returns POSIXlt, so convert back
  # to POSIXct before storing the result in the data frame column.
  if (aggregate %in% c("mins", "hours")) {
    mydata$time <- as.POSIXct(round(mydata$time, aggregate))
  } else {
    mydata$time <- as.Date(mydata$time)
  }

  mydata
}

Ответ 1