# This is accompanying code for # https://jozef.io/r006-merge/ # Original data source: # https://cran.r-project.org/web/packages/nycflights13/index.html # Merges (joins) of two data frames =========================================== # |- Read and prepare data ---------------------------------------------------- dataurl <- "https://jozef.io/post/data/" weather <- readRDS(url(paste0(dataurl, "r006/weather.rds"))) flights <- readRDS(url(paste0(dataurl, "r006/flights.rds"))) mergeCols <- c("time_hour", "origin") head(flights) head(weather) # |- Inner join --------------------------------------------------------------- inner <- merge(flights, weather, by = mergeCols) # |- Left (outer) join -------------------------------------------------------- left <- merge(flights, weather, by = mergeCols, all.x = TRUE) # |- Right (outer) join ------------------------------------------------------- right <- merge(flights, weather, by = mergeCols, all.y = TRUE) # |- Full (outer) join -------------------------------------------------------- full <- merge(flights, weather, by = mergeCols, all = TRUE) # |- Cross Join (Cartesian product) ------------------------------------------- # on first 10 rows only cross <- merge(flights[1:10, ], weather[1:10, ], by = NULL) # |- Natural Join ------------------------------------------------------------- natural <- merge(flights, weather) # Merges (joins) of multiple data frames ====================================== # Load and look at the data --------------------------------------------------- flightsList <- readRDS(url(paste0(dataurl, "r006/nycflights13-list.rds"))) lapply(flightsList, function(x) c(toString(dim(x)), toString(names(x)))) # |- Full (outer) join using nested merge ------------------------------------- multi_full <- merge(merge(merge(merge( flightsList[[1L]], flightsList[[2L]], all = TRUE), flightsList[[3L]], all = TRUE), flightsList[[4L]], all = TRUE), flightsList[[5L]], all = TRUE) # |- Full (outer) join using reduce ------------------------------------------- multi_full <- Reduce( function(x, y, ...) merge(x, y, all = TRUE, ...), flightsList ) # |- Innter join using reduce ------------------------------------------------- multi_inner <- Reduce( function(x, y, ...) merge(x, y, ...), flightsList ) # |- Cleanup ------------------------------------------------------------------ rm(inner, left, right, full, cross, natural, multi_full, multi_inner) gc() # Alternatives to Base R - dplyr ============================================== library(dplyr) # |- dplyr join functions ----------------------------------------------------- inner_dplyr <- inner_join(flights, weather, by = mergeCols) left_dplyr <- left_join(flights, weather, by = mergeCols) right_dplyr <- right_join(flights, weather, by = mergeCols) full_dplyr <- full_join(flights, weather, by = mergeCols) # |- dplyr join functions with %>% -------------------------------------------- inner_dplyr <- flights %>% inner_join(weather, by = mergeCols) left_dplyr <- flights %>% left_join(weather, by = mergeCols) right_dplyr <- flights %>% right_join(weather, by = mergeCols) full_dplyr <- flights %>% full_join(weather, by = mergeCols) # Alternatives to Base R - data.table ========================================= library(data.table) # |- data.table data preparation ---------------------------------------------- weather <- as.data.table(weather) flights <- as.data.table(flights) setkeyv(weather, mergeCols) setkeyv(flights, mergeCols) # |- data.table's merge method ------------------------------------------------ inner_dt <- merge(flights, weather, by = mergeCols) left_dt <- merge(flights, weather, by = mergeCols, all.x = TRUE) right_dt <- merge(flights, weather, by = mergeCols, all.y = TRUE) full_dt <- merge(flights, weather, by = mergeCols, all = TRUE) # |- data.table joins as subsets ---------------------------------------------- inner_dt <- flights[weather, on = mergeCols, nomatch = 0] left_dt <- weather[flights, on = mergeCols] right_dt <- flights[weather, on = mergeCols]