# This is accompanying code for
# https://jozef.io/r003-aggregation/

# Original data source:
# http://ec.europa.eu/eurostat/web/sector-accounts/data/annual-data

# Important note - the figures are presented
#  - in millions € for euro area countries and euro area / EU aggregates,
#  - millions of national currency otherwise

# Read and prepare data -------------------------------------------------------

# Download the wide-format table, keeping strings as character vectors.
gdi <- read.csv(
  stringsAsFactors = FALSE,
  file = "https://jozef.io/post/data/ESA2010_pretty.csv"
)

# NOTE(review): `2:67` hard-codes the number of columns in the CSV - confirm
# it still matches the file if the data source is ever refreshed.
gdi <- reshape(
  data = gdi,
  direction = "long",  # we are going from wide to long
  varying = 2:67,      # columns that will be stacked into 1
  idvar = "country"    # identifying the subject in rows
)

# Simple aggregations ---------------------------------------------------------

# Mean of GrossSaving per country, without removing missing values.
aggregate(
  x = gdi["GrossSaving"],
  by = list(country = gdi[["country"]]),
  FUN = mean
)

# Same aggregation; `na.rm = TRUE` is forwarded by aggregate() to mean().
aggregate(
  x = gdi["GrossSaving"],
  by = list(country = gdi[["country"]]),
  FUN = mean,
  na.rm = TRUE
)

# Grouping by more variables --------------------------------------------------

# Decade label built from the first three characters of `time` plus "0s".
# Computed once here and reused by every decade-level aggregation below.
decade <- paste0(substr(gdi[["time"]], 1L, 3L), "0s")

aggregate(
  x = gdi["GrossSaving"],
  by = list(decade = decade, country = gdi[["country"]]),
  FUN = mean,
  na.rm = TRUE
)

# Aggregating more variables --------------------------------------------------

aggregate(
  x = gdi[c("ConspC", "AGDIpC", "GrossSaving")],
  by = list(decade = decade, country = gdi[["country"]]),
  FUN = mean,
  na.rm = TRUE
)

# Very simple custom aggregation function -------------------------------------

# Summarise a vector by its maximum, its minimum and their difference.
#
# Arguments:
#   v      vector to summarise.
#   na.rm  passed on to max() and min(); defaults to FALSE, which matches the
#          behaviour of calling max(v) and min(v) directly.
#
# Returns a vector with elements named `max`, `min` and `range`.
dummyaggfun <- function(v, na.rm = FALSE) {
  # Compute each extreme once instead of twice.
  hi <- max(v, na.rm = na.rm)
  lo <- min(v, na.rm = na.rm)
  c(max = hi, min = lo, range = hi - lo)
}

# Add `na.rm = TRUE` to this call to have aggregate() forward it to
# dummyaggfun() and ignore missing values within each group.
aggregate(
  gdi["GrossSaving"],
  by = list(decade = decade, country = gdi[["country"]]),
  FUN = dummyaggfun
)

# Answers to the Exercises ----------------------------------------------------

# |- Exercise 1. --------------------------------------------------------------

# Looking at the `aggregate(state.x77, list(Region = state.region), mean)`
# example in `?aggregate`, how does R know how to match the states to the
# regions? Would the example still work if the data in `state.x77`
# were sorted differently?

r1 <- aggregate(
  state.x77,
  list(Region = state.region),
  mean
)

# Same call, but with the rows of `state.x77` reordered by population while
# `state.region` keeps its original order.
r2 <- aggregate(
  state.x77[order(state.x77[, "Population"]), ],
  list(Region = state.region),
  mean
)

r1 == r2

# As we can see, the results are not identical. This is because R does not
# know the connection between the `x` and `by` arguments. The example from
# help works because the state data are in the same order in all the state
# datasets (alphabetical order of the state names).
?datasets::state

# |- Exercise 2. --------------------------------------------------------------

# What is the difference between
# `aggregate(x = gdi["GrossSaving"], by = gdi["country"], FUN = mean)`
# and
# `aggregate(x = gdi[["GrossSaving"]], by = gdi["country"], FUN = mean)`
# What is the issue with the latter?
# Looking at the code, why does the latter still work?

r21 <- aggregate(
  x = gdi["GrossSaving"],
  by = gdi["country"],
  FUN = mean
)

r22 <- aggregate(
  x = gdi[["GrossSaving"]],
  by = gdi["country"],
  FUN = mean
)

str(gdi["GrossSaving"])    # The x argument in the former case is a data.frame
str(gdi[["GrossSaving"]])  # The x argument in the latter case is a vector

# Therefore the latter case will not provide us with a proper column name
# for the aggregated column:
names(r21)
names(r22)

# The latter still works because the default method, aggregate.default,
# uses the time series method if x is a time series, and otherwise coerces
# x to a data frame and calls the data frame method:
x <- gdi[["GrossSaving"]]
if (!is.data.frame(x)) {
  x <- as.data.frame(x)
}