# This is accompanying code for
# https://jozef.io/r003-aggregation/

# Original data source:
# http://ec.europa.eu/eurostat/web/sector-accounts/data/annual-data
# Important note - the figures are presented
# - in millions € for euro area countries and euro area / EU aggregates,
# - millions of national currency otherwise

# Read and prepare data -------------------------------------------------------
# Download the wide-format table; text columns stay character, not factor
gdi <- read.csv(
  file = "https://jozef.io/post/data/ESA2010_pretty.csv",
  stringsAsFactors = FALSE
)

# Convert from wide to long: columns 2 to 67 are stacked, and `country`
# identifies the subject each stacked row belongs to. The long table
# replaces the wide one under the same name.
gdi <- reshape(
  data = gdi,
  direction = "long",
  varying = 2:67,
  idvar = "country"
)

# Simple aggregations ---------------------------------------------------------
# Mean of GrossSaving per country. Without na.rm, a single NA among a
# country's values makes that country's mean NA.
aggregate(
  x = gdi["GrossSaving"],
  by = list(country = gdi[["country"]]),
  FUN = mean
)

# Same aggregation, but na.rm = TRUE is forwarded to mean() so missing
# values are dropped before averaging.
aggregate(
  x = gdi["GrossSaving"],
  by = list(country = gdi[["country"]]),
  FUN = mean,
  na.rm = TRUE
)

# Grouping by more variables --------------------------------------------------
# Mean of GrossSaving per decade and country. The decade label is the first
# three characters of `time` followed by "0s".
aggregate(
  x = gdi["GrossSaving"],
  by = list(
    decade = paste0(substr(gdi[["time"]], 1L, 3L), "0s"),
    country = gdi[["country"]]
  ),
  FUN = mean,
  na.rm = TRUE
)

# Aggregating more variables --------------------------------------------------
# Means of three columns at once, grouped by decade and country; every
# column selected in `x` is aggregated with the same function.
aggregate(
  x = gdi[c("ConspC", "AGDIpC", "GrossSaving")],
  by = list(
    decade = paste0(substr(gdi[["time"]], 1L, 3L), "0s"),
    country = gdi[["country"]]
  ),
  FUN = mean,
  na.rm = TRUE
)

# Very simple custom aggregation function -------------------------------------
# Summarise a vector by its maximum, its minimum and their difference.
#
# Arguments:
#   v      Vector to summarise; `-` must be defined for its elements.
#   na.rm  Should missing values be dropped before computing the extremes?
#          Defaults to FALSE, which keeps the previous behaviour: any NA in
#          `v` makes all three results NA.
#
# Returns a named vector with the elements `max`, `min` and `range`.
dummyaggfun <- function(v, na.rm = FALSE) {
  # Evaluate each extreme once and reuse it for the range
  hi <- max(v, na.rm = na.rm)
  lo <- min(v, na.rm = na.rm)
  c(max = hi, min = lo, range = hi - lo)
}

# Apply the custom summary per decade and country. Because dummyaggfun
# returns three values per group, the aggregated GrossSaving column is a
# matrix with one column per returned element.
aggregate(
  x = gdi["GrossSaving"],
  by = list(
    decade = paste0(substr(gdi[["time"]], 1L, 3L), "0s"),
    country = gdi[["country"]]
  ),
  FUN = dummyaggfun
)

# Answers to the Exercises ----------------------------------------------------

# |- Exercise 1. --------------------------------------------------------------
# Looking at the `aggregate(state.x77, list(Region = state.region), mean)`
# example in `?aggregate`, how does R know how to match the states to the
# regions ? Would the example still work if the data in `state.x77`
# were sorted differently ?
# Region means with the rows of state.x77 in their original order
r1 <- aggregate(
  x = state.x77,
  by = list(Region = state.region),
  FUN = mean
)

# Same call after sorting the rows of state.x77 by population, while
# state.region is left in its original order
r2 <- aggregate(
  x = state.x77[order(state.x77[, "Population"]), ],
  by = list(Region = state.region),
  FUN = mean
)

# Element-wise comparison of the two results
r1 == r2

# As we can see, the results are not identical. This is because R does not
# know the connection between the `x` and `by` arguments. The example from
# help works because the state data are in the same order in all the state
# datasets (alphabetical order of the state names).

# Open the help page for the `state` topic of the datasets package
help("state", package = "datasets")

# |- Exercise 2. --------------------------------------------------------------
# What is the difference between
# `aggregate(x = gdi["GrossSaving"], by = gdi["country"], FUN = mean)`
# and
# `aggregate(x = gdi[["GrossSaving"]], by = gdi["country"], FUN = mean)`

# What is the issue with the latter ?
# Looking at the code, why does the latter still work ?

# Former call: `x` is a one-column data frame
r21 <- aggregate(
  x = gdi["GrossSaving"],
  by = gdi["country"],
  FUN = mean
)

# Latter call: `x` is a plain vector
r22 <- aggregate(
  x = gdi[["GrossSaving"]],
  by = gdi["country"],
  FUN = mean
)

str(gdi["GrossSaving"])   # single brackets keep the data.frame
str(gdi[["GrossSaving"]]) # double brackets extract the underlying vector

# A bare vector carries no column name, so the aggregated column in the
# latter result is not named after the variable:
names(r21)
names(r22)

# The latter call still works because aggregate.default dispatches to the
# time series method when x is a time series, and otherwise converts x to a
# data frame and hands it to the data frame method, in essence:
x <- gdi[["GrossSaving"]]
if (!is.data.frame(x)) {
  x <- as.data.frame(x)
}