library(dplyr)
library(data.table)
library(collapse)
library(tidyverse)Introduction
This post is inspired in the Atreba’s blog: A data.table and dplyr tour. The objective of this post is to complement Atreba’s one with the syntax of the {collapse} R package.
Basic understanding of the three packages
dplyr
A grammar of data manipulation in R which provides a consistent set of verbs to help you solve the most common data manipulation challenges. It is part of the tidyverse universe. Click here for more information..
data.table
A syntax to operate data manipulation operations, such as subset, group, update, join, etc. It reduces programming and compute time tremendously. Click here for more information.
collapse
A large C/C++ based package for data transformation and statistical computing in R. It aims to facilitate complex data transformations, explorations and computing tasks in R, while making code fast, flexible, parsimonious and programmer friendly. Click here for more information.
Load libraries
Data
set.seed(42)
# Number of rows
n <- 10000
# # Generate fake data
# df <- data.frame(
# id1 = 1:n, # Unique ID
# id2 = sample(1:500, n, replace = TRUE), # Repeating ID
# dt = seq.Date(from = as.Date("2023-01-01"), by = "day", length.out = n), # Dates
# tm = format(seq.POSIXt(from = as.POSIXct("2023-01-01 00:00:00"),
# by = "hour", length.out = n), "%H:%M:%S"), # Time
# ch = sample(c("A", "B", "C", "D"), n, replace = TRUE), # Character
# int = sample(1:100, n, replace = TRUE), # Integer
# log = sample(c(TRUE, FALSE), n, replace = TRUE), # Logical
# realf = runif(n, 1, 100), # Real (float),
# reald = runif(n), # Real ,
# fct = factor(sample(c("X", "Y", "Z"), n, replace = TRUE)) # Factor
# )
set.seed(42)
# Number of rows
n <- 10000
# Generate fake data with some NAs
df <- data.frame(
id1 = 1:n, # Unique ID
id2 = sample(1:500, n, replace = TRUE), # Repeating ID
dt = sample(c(seq.Date(from = as.Date("2023-01-01"), by = "day", length.out = n), NA), n, replace = TRUE), # Dates with NAs
tm = sample(c(format(seq.POSIXt(from = as.POSIXct("2023-01-01 00:00:00"),
by = "hour", length.out = n), "%H:%M:%S"), NA), n, replace = TRUE), # Time with NAs
ch = sample(c("A", "B", "C", "D", NA), n, replace = TRUE, prob = c(0.24, 0.24, 0.24, 0.24, 0.04)), # Character with some NAs
int = sample(c(1:100, NA), n, replace = TRUE), # Integer with some NAs
log = sample(c(TRUE, FALSE, NA), n, replace = TRUE, prob = c(0.49, 0.49, 0.02)), # Logical with some NAs
realf = sample(c(runif(n, 1, 100), NA), n, replace = TRUE), # Real (float) with some NAs
reald = sample(c(runif(n), NA), n, replace = TRUE), # Real with some NAs
fct = factor(sample(c("X", "Y", "Z", NA), n, replace = TRUE, prob = c(0.32, 0.32, 0.32, 0.04))) # Factor with some NAs
)
# Print summary to check distribution of NAs
summary(df)
# Ensure uniqueness
df <- unique(df, by = c("id1", "id2"))
dt <- copy(setDT(df))
tb <- as_tibble(df)Basic use
Filtering rows
Filter rows using indices
# super efficient
df |>
ss(2:5)
# efficient
df |>
fsubset(2:5)dt[2:5]tb |>
slice(2:5)
# or using index like any data.frame
tb[2:5,]
# you need to add the comma. Otherwise, you get a different result
tb[2:5]Discard rows using negative indices
df |>
ss(-c(2:5)) |>
head(4)
df |>
ss(-c(2:5)) |>
head(4)dt[!2:5] |>
head(4)tb |>
slice(-(2:5)) |>
head(4)Filter rows using conditions
df |>
fsubset(ch == "A" & id1 == 6)
df |>
fsubset(ch == x & id1 == 6)
# This works
df |>
fsubset(ch == ch & id1 == 6)
# This does not work
df |>
fsubset(ch == fct & id1 == 6)
# This through error
df |>
fsubset(ch == get(fct) & id1 == 6) |>
try()
# This works
df |>
fsubset(ch == get("fct", envir = -2) & id1 == 6)
# NOTE: is there a better way?dt[ch == "A" & id1 == 6]
dt[ch == x & id1 == 6]
dt[ch ==ch & id1 == 6]
# this does not work
dt[ch == fct & id1 == 6]
dt[ch == eval(fct) & id1 == 6]
# These work but they are verbose
dt[ch == get("fct", envir = parent.frame()) & id1 == 6]
dt[ch == get("fct", envir = -2) & id1 == 6]tb |>
filter(ch == "A" & id1 == 6)
tb |>
filter(ch == x & id1 == 6)
tb |>
filter(ch == ch & id1 == 6)
# does not work
tb |>
filter(ch == fct & id1 == 6)
# works really well
tb |>
filter(ch == !!fct & id1 == 6)Filter unique rows
# Remove duplicate rows
df |>
funique()
# Keeps only one row per unique value in id2
df |>
funique(cols = "int") # selecting column by col name
df |>
funique(cols = 6) # selecting column by indices
df |>
funique(cols = names(df) %in% "int") # with logical condition # Remove duplicate rows
dt |>
unique()
# Keeps only one row per unique value in id2
dt |>
unique(by = "id2") # Remove duplicate rows
tb |>
distinct()
# Keeps only one row per unique id1
tb |> distinct(id1, .keep_all = TRUE) # keep all colDiscard rows with missing values
# Discard rows with any NA value
df |>
na_omit()
# Discard rows with NA value for selected col
df |>
na_omit(cols = "ch")
# More flexible options:
# Remove rows where more than 50% of values are missing
df |>
na_omit(prop = 0.5)# Discard rows with any NA value
dt |>
na_omit()
# Discard rows with NA value for selected col
dt <- dt[!is.na(ch)]# Discard rows with any NA value
tb |>
tidyr::drop_na()
# Discard rows with NA value for selected col
tb |>
tidyr::drop_na(ch)Other filters: slice options
df |>
fslice(n = 3) # First 3 rows
df |>
fslice(n = 3,
how = "last") # Last 3 rows
df |>
fslice(n = 0.1) # Fraction of rows: first 10% of rows
fslice(n = 3,
how = "min",
order.by = int) # 3 obs with lowest int
# TODO: add fslicev()# Frist 3 rows
dt[1:3, ] # First 3 rows, all columns
# Last 3 rows
dt[(.N-2):.N] # .N gives the total number of rows
# Fraction of rows: first 10% of rows
dt[1:(.N * 0.1)]
# 3 obs with lowest int
dt[order(int)][1:3]# First 3 rows
tb |>
slice_head(n = 3)
# Last 3 rows
tb |>
slice_tail(n = 3)
# Fraction of rows: first 10% of rows
tb |>
slice_head(prop = 0.1)
# 3 obs with lowest int
tb |>
slice_min(order_by = int,
n = 3) # all rows
tb |>
slice_min(order_by = int,
n = 3,
with_ties = FALSE) Sort rows
Sort rows by column(s)
df |>
roworder(id1)
df |>
roworder(-id2) # Sort by decreasing order of id2
df |>
roworder(id1, -id2) # Sort by multiple cols dt[order(id2)] # This makes a copy
setorder(dt, id2) # To modify by reference
dt[order(-id2)] # Sort by decreasing order
dt[order(id1, -id2)] # Sort by multiple cols tb |>
arrange(id2)
tb |>
arrange(desc(id2)) # Sort by decreasing order
# Sort by multiple cols
df |>
arrange(id1, desc(id2))Select columns
Select one or more columns
## Select one column ####
# _________________________
# by index
df |>
fselect(2)
df |>
slt(2) # shorthand for fselect
# by name
df |>
fselect(id2) # returns a dataframe
## Select multiple columns ####
# _____________________________
df |>
fselect(id1, id2, fct)
df |>
fselect(id1, ch:fct)## Select one column ####
# _________________________
# by index
dt[[3]] # returns a vector
dt[, 3] # returns a data.table
# by name
dt[, list(id2)] # returns a data.table
dt[, .(id2)] # returns a data.table (. is an alias for list)
dt[, "id2"] # returns a data.table
dt[, id2] # returns a vector
dt[["id2"]] # returns a vector
## Select multiple columns ####
# _____________________________
dt[, .(id1, id2, int)]
dt[, list(id1, id2, int)]
dt[, id2:int] # select columns between id2 and int## Select one column ####
# _________________________
tb |>
select(id2) # returns a tibble
pull(tb, id2, name = ch) # returns a (named) vector
tb[, "id2"] # returns a tibble
tb[["id2"]] # returns a vector
## Select multiple columns ####
# _____________________________
df |>
select(id1, id2, ch)
df |>
select(id1, ch:fct)Exclude columns
# Exclude columns by column names
df |>
fselect(-dt, -tm)
# Using a character vector
cols <- c("dt", "tm")
df |>
fselect(-cols) # does not work Error in -cols: invalid argument to unary operator
df |>
fselect(!cols) # does not work Error in !cols: invalid argument type
# what is a better way to do this?# Exclude columns by column names
dt[, !c("dt", "tm")]
# Using a character vector
cols <- c("dt", "tm")
dt[, ..cols]
# .. prefix means 'one-level up', as cols is outside the parent environment
dt[, !..cols] # or dt[, -..cols]# Exclude columns by column names
tb |>
select( -dt, -tm)
# Using a character vector
cols <- c("dt", "tm")
tb |>
select(all_of(cols))
tb |>
select(-all_of(cols))Other selections - not sure it is relevant, to check
Miscellaneous
Read & write data
Write
# no specific functions for reading and writing data fwrite(dt,
"DT.csv") # write to csv
fwrite(dt,
"DT.txt",
sep = "\t") # write to a tab-delimited filereadr::write_csv(tb,
"tb.csv") # write to csv
readr::write_delim(tb,
"tb.txt",
delim = "\t") # write to a tab-delimited fileRead
# no specific functions for reading and writing data fread("dt.csv") # read csv
# fread("DT.csv", verbose = TRUE) # full details
fread("dt.txt", sep = "\t") # read tab-delimited file
# Read and rbind several files
rbindlist(
lapply(c("dt.csv", "dt.csv"),
fread))readr::read_csv("tb.csv") # read csv
readr::read_delim("tb.txt",
delim = "\t") # read tab-delimited file
# Read and rbind several files
c("tb.csv", "tb.csv") |>
purrr::map_dfr(readr::read_csv)Reshape data
# ---- Long to Wide ----
wide_pivot <- pivot(df,
ids = c("id1", "id2", "dt"), # Columns to keep
values = "int", # Column with values
names = "ch", # Column whose values become new cols
how = "wider") # Reshape to wide format
# ---- Wide to Long ----
long_pivot <- pivot(wide_pivot,
ids = c("id1", "id2", "dt"),
values = NULL,
names = list("ch", "int"),
how = "longer") # ---- Long to Wide ----
wide_dt <- dcast(dt,
id1 + id2 + dt ~ ch,
value.var = "int")
# ---- Wide to Long ----
long_dt <- melt(wide_dt,
id.vars = c("id1", "id2", "dt"),
variable.name = "ch",
value.name = "int")# ---- Long to Wide ----
tb_wide <- tb |>
# rm NAs
filter(!is.na(ch)) |>
pivot_wider(names_from = ch,
values_from = int)
# ---- Wide to Long ----
tb_long <- tb_wide |>
pivot_longer(cols = c("A", "D", "C", "B"),
values_to = "int",
names_to = "ch")Summarise data
Summarise columns
# efficient
df |>
fsummarise(sum_rf = fsum(realf),
sd_rd = fsd(reald))
# shorthand
df |>
smr(sum_rf = fsum(realf),
sd_rd = fsd(reald))dt[, sum(realf)] # returns a vector
dt[, .(sum(realf))] # returns a data.table
dt[, .(sum_rf = sum(realf), # returns a data.table with named columns
sd_rd = sd(reald))]summarise(tb, sum(realf)) # returns a tibble
tb |>
summarise(sum_rf = sum(realf), # returns a tibble
sd_rd = sd(reald))Helper functions
The package includes fsum,fprod, fmedian, fmode, fvar, fsd, fmin,fmax,fnth, ffirst, flast,fnobs, and fndistinct.
The package includes first, last and uniqueN.
The package includes first, last, n, nth, and n_distinct.
Manipulations of columns
# Add one oe several columns (can also use ftransform)
df <- df |>
fmutate(log_rf = log(realf))
df <- df |>
fmutate(log_rd = log(reald),
sqrt_rd = sqrt(reald))
# Create one column and eliminate others
fcompute(df, log2_rf = log_rf*2)
# remove columns
df <- df |>
fselect(-log_rf,-log_rd,-sqrt_rd)# Add one column
dt[, log_rf := log(realf)]
# Add several columns
dt[, ':=' (log_rd = log(reald),
sqrt_rd = sqrt(reald))]
# Create one column and eliminate others
dt[, .(log2_rf = log_rf*2)]
# remove columns
dt[, c("log_rf", "log_rd", "sqrt_rd") := NULL] # Add one or several column
tb <- tb |>
mutate(log_rf = log(realf))
tb <- tb |>
mutate(log_rd = log(reald),
sqrt_rd = sqrt(reald))
# Create one column and eliminate others
transmute(tb, log2_rf = log_rf*2)
# remove columns
tb <- tb |>
select(-log_rf,-log_rd,-sqrt_rd)by
df |>
fgroup_by(ch)|>
fsummarise(sumrf = fsum(realf)) # ordered and indexed results
# Assigned column name
df |>
fgroup_by(abc = tolower(ch))|>
fsummarise(sumrf = fsum(realf))
# Add a column with number of observations for each group
df |>
fgroup_by(ch)|>
fcount(add = TRUE)dt[, .(sumrf = sum(realf)), by = "ch"] # unordered results
# Reordered and indented:
dt[, keyby = ch,
.(sumrf = sum(realf))]
# Assigning column name
dt[, keyby = .(abc = tolower(ch)),
.(sumrf = sum(realf))]
# Add a column with number of observations for each group
dt[, n := .N, by = ch][]
dt[, n := NULL] # remove for consistencytb |>
group_by(ch)|>
summarise(sumrf = sum(realf)) # ordered results
# Assigned column name
tb |>
group_by(abc = tolower(ch))|>
summarise(sumrf = sum(realf))
# Add a column with number of observations for each group
tb |>
group_by(ch)|>
add_tally()
# or...
add_count(tb, ch)Going further
Advanced columns manipulation
# Summarize columns
df |>
fsummarise(across(c("realf", "reald"),
fmean))
# Summarize using a condition
df |>
fsummarise(across(is.numeric, # different from dplyr due to across
fmean))
# Modify all the columns
df |>
fmutate(across(NULL,rev))
# Modify several columns
df |>
fcomputev(vars = c("realf", "reald"), # dropping the other columns
sqrt)
df <- df |>
ftransformv(vars = c("realf", "reald"), # keeping the other columns
sqrt)
df <- df |>
ftransformv(vars = c("realf", "reald"), # reverting for consistency
FUN = function(x){ x^2 })
# Modify columns using a condition
df |>
fcomputev(is.numeric,
FUN = function(x){x - 1})# Summarize columns
dt[, lapply(.SD, mean),
.SDcols = c("realf", "reald")]
# Summarize using a condition
dt[, lapply(.SD, mean),
.SDcols = is.numeric]
# Modify all the columns
dt[, lapply(.SD, rev)]
# Modify several columns
dt[, lapply(.SD, sqrt), # dropping the other columns
.SDcols = realf:reald]
cols <- c("realf", "reald")
dt[, (cols) := lapply(.SD, sqrt), # keeping the other columns
.SDcols = cols]
dt[, (cols) := lapply(.SD, "^", 2L), # reverting for consistency
.SDcols = cols]
# Modify columns using a condition
dt[, .SD - 1,
.SDcols = is.numeric]
rm(cols)# Summarize columns
tb |>
summarise(across(c("realf", "reald"),
mean))
# Summarize using a condition
tb |>
summarise(across(where(is.numeric),
mean))
# Modify all the columns
tb |>
mutate(across(everything(),
rev))
# Modify several columns
tb |>
transmute(across(c("realf", "reald"), # dropping the other columns
sqrt))
tb <- tb |>
mutate(across(all_of(c("realf", "reald")), # keeping the other columns
sqrt))
tb <- tb |>
mutate(across(all_of(c("realf", "reald")), # reverting for consistency
~ "^"(.x, 2L)))
# Modify columns using a condition
tb |>
transmute(across(where(is.numeric),
~ '-'(., 1L)))Chain expressions
df |>
fgroup_by(ch)|>
fsummarise(sumrf = fsum(realf))|>
fsubset(sumrf < 129000)dt[, by = ch,
.(sumrf = sum(realf))][
sumrf < 129000
]tb |>
group_by(ch)|>
summarise(sumrf = sum(realf))|>
filter(sumrf < 129000)Indexing and keys (not sure it applies)
set modifications
# Replace values (recommendation to use set of data.table)
set(df, i = 1L, j = 2L, value = 30L)
# Reorder rows
df <- roworder(df, id2, -id1)
df <- roworder(df, id1) # reversal for consistency
# Modify column names
df <- frename(df, dt = date)
df <- frename(df, date = dt) # reversal for consistency
# reorder columns
df <- colorder(df, id1, id2, ch)# Replace values
set(dt, i = 1L, j = 2L, value = 30L)
# Reorder rows
setorder(dt, id2, -id1)
setorder(dt, id1) # reversal for consistency
# Modify column names
setnames(dt, old = "dt", new = "date")
setnames(dt, old = "date", new = "dt") # reversal for consistency
# reorder columns
setcolorder(dt, c("id1","id2","ch"))# Replace values
tb[1,2] <- 30L
# Reorder rows
tb <- arrange(tb, id2, desc(id1))
tb <- arrange(tb, id1) # reversal for consistency
# Modify column names
tb <- rename(tb, date = dt)
tb <- rename(tb, dt = date) # reversal for consistency
# reorder columns
tb <- relocate(tb, c("id1", "id2", "ch"))Advanced use of by (maybe)
Join/Bind data sets
Bind
x <- data.table(1:3)
y <- data.table(4:6)
z <- data.table(7:9, 0L)
# bind rows
rowbind(x, y, fill = TRUE) # always fills
# bind rows using a list
rowbind(list(x, y), idcol = "id")
# bind columns
base::cbind(x, y)
add_vars(x) <- y # modifies x but keeps data structure and attributesx <- data.table(1:3)
y <- data.table(4:6)
z <- data.table(7:9, 0L)
# bind rows
rbind(x, y, fill = TRUE)
# bind rows using a list
rbindlist(list(x, y), idcol = "id")
# bind columns
base::cbind(x, y)
x <- base::cbind(x, y) # modifies x but column names are not changedx <- data.table(1:3)
y <- data.table(4:6)
z <- data.table(7:9, 0L)
# bind rows
bind_rows(x, y) # always fills
# bind rows using a list
bind_rows(list(x, y), .id = "id")
# bind columns
bind_cols(x, y)
x <- bind_cols(x, y) # modifies x and replace namesEfficient functions (maybe)
Other useful functions in collapse
# quick summary (From STATA summarize and xtsummarize)
qsu(df, cols = c("realf"))