Collapse Cheat sheet – PIP-Bloggers

Introduction

This post is inspired in the Atreba’s blog: A data.table and dplyr tour. The objective of this post is to complement Atreba’s one with the syntax of the {collapse} R package.

Basic understanding of the three packages

dplyr

A grammar of data manipulation in R which provides a consistent set of verbs to help you solve the most common data manipulation challenges. It is part of the tidyverse universe. Click here for more information..

data.table

A syntax to operate data manipulation operations, such as subset, group, update, join, etc. It reduces programming and compute time tremendously. Click here for more information.

collapse

A large C/C++ based package for data transformation and statistical computing in R. It aims to facilitate complex data transformations, explorations and computing tasks in R, while making code fast, flexible, parsimonious and programmer friendly. Click here for more information.

Load libraries

library(dplyr)
library(data.table)
library(collapse)
library(tidyverse)

Data

set.seed(42)

# Number of rows
n <- 10000

# # Generate fake data
# df <- data.frame(
#   id1 = 1:n,  # Unique ID
#   id2 = sample(1:500, n, replace = TRUE),  # Repeating ID
#   dt = seq.Date(from = as.Date("2023-01-01"), by = "day", length.out = n),  # Dates
#   tm = format(seq.POSIXt(from = as.POSIXct("2023-01-01 00:00:00"), 
#                          by = "hour", length.out = n), "%H:%M:%S"),  # Time
#   ch = sample(c("A", "B", "C", "D"), n, replace = TRUE),  # Character
#   int = sample(1:100, n, replace = TRUE),  # Integer
#   log = sample(c(TRUE, FALSE), n, replace = TRUE),  # Logical
#   realf = runif(n, 1, 100),  # Real (float),
#   reald = runif(n),  # Real ,
#   fct = factor(sample(c("X", "Y", "Z"), n, replace = TRUE))  # Factor
# )

set.seed(42)

# Number of rows
n <- 10000

# Generate fake data with some NAs
df <- data.frame(
  id1 = 1:n,  # Unique ID
  id2 = sample(1:500, n, replace = TRUE),  # Repeating ID
  dt = sample(c(seq.Date(from = as.Date("2023-01-01"), by = "day", length.out = n), NA), n, replace = TRUE),  # Dates with NAs
  tm = sample(c(format(seq.POSIXt(from = as.POSIXct("2023-01-01 00:00:00"), 
                                  by = "hour", length.out = n), "%H:%M:%S"), NA), n, replace = TRUE),  # Time with NAs
  ch = sample(c("A", "B", "C", "D", NA), n, replace = TRUE, prob = c(0.24, 0.24, 0.24, 0.24, 0.04)),  # Character with some NAs
  int = sample(c(1:100, NA), n, replace = TRUE),  # Integer with some NAs
  log = sample(c(TRUE, FALSE, NA), n, replace = TRUE, prob = c(0.49, 0.49, 0.02)),  # Logical with some NAs
  realf = sample(c(runif(n, 1, 100), NA), n, replace = TRUE),  # Real (float) with some NAs
  reald = sample(c(runif(n), NA), n, replace = TRUE),  # Real with some NAs
  fct = factor(sample(c("X", "Y", "Z", NA), n, replace = TRUE, prob = c(0.32, 0.32, 0.32, 0.04)))  # Factor with some NAs
)

# Print summary to check distribution of NAs
summary(df)


# Ensure uniqueness
df <- unique(df, by = c("id1", "id2"))
dt <- copy(setDT(df))
tb <- as_tibble(df)

Basic use

Filtering rows

Filter rows using indices

# super efficient
df |> 
  ss(2:5)

# efficient
df |> 
  fsubset(2:5)

dt[2:5]

tb |> 
  slice(2:5)

# or using index like any data.frame
tb[2:5,]

# you need to add the comma. Otherwise, you get a different result
tb[2:5]

Discard rows using negative indices

df |> 
  ss(-c(2:5)) |> 
  head(4)

df |> 
  ss(-c(2:5)) |> 
  head(4)

dt[!2:5] |> 
  head(4)

tb |> 
  slice(-(2:5)) |> 
  head(4)

Filter rows using conditions

df |> 
  fsubset(ch == "A" & id1 == 6)

df |> 
  fsubset(ch == x & id1 == 6)

# This works
df |> 
  fsubset(ch == ch & id1 == 6)

# This does not work
df |> 
  fsubset(ch == fct & id1 == 6)

# This through error
df |> 
  fsubset(ch == get(fct) & id1 == 6) |> 
  try()

# This works
df |> 
  fsubset(ch == get("fct", envir = -2) & id1 == 6) 

# NOTE: is there a better way?

dt[ch == "A" & id1 == 6]

dt[ch == x & id1 == 6]

dt[ch ==ch & id1 == 6]

# this does not work
dt[ch == fct & id1 == 6]


dt[ch == eval(fct) & id1 == 6]

# These work but they are  verbose
dt[ch == get("fct", envir = parent.frame()) & id1 == 6]
dt[ch == get("fct", envir = -2) & id1 == 6]

tb |> 
  filter(ch == "A" & id1 == 6)

tb |> 
  filter(ch == x & id1 == 6)

tb |> 
  filter(ch == ch & id1 == 6)

# does not work
tb |> 
  filter(ch == fct & id1 == 6)

# works really well
tb |> 
  filter(ch == !!fct & id1 == 6)

Filter unique rows

# Remove duplicate rows
df |>
  funique()

# Keeps only one row per unique value in id2
df |>
  funique(cols = "int") # selecting column by col name 
df |>
  funique(cols = 6)     # selecting column by indices
df |>
  funique(cols = names(df) %in% "int") # with logical condition

 # Remove duplicate rows
dt |>
  unique()

# Keeps only one row per unique value in id2
dt |>
  unique(by = "id2")

# Remove duplicate rows
tb |>
  distinct()

# Keeps only one row per unique id1
tb |> distinct(id1, .keep_all = TRUE) # keep all col

Discard rows with missing values

# Discard rows with any NA value
df |>
  na_omit()

# Discard rows with NA value for selected col
df |>
  na_omit(cols = "ch")

# More flexible options:
# Remove rows where more than 50% of values are missing
df |>
  na_omit(prop = 0.5)

# Discard rows with any NA value
dt |>
  na_omit()

# Discard rows with NA value for selected col
dt <- dt[!is.na(ch)]

# Discard rows with any NA value
tb |>
  tidyr::drop_na()

# Discard rows with NA value for selected col
tb |> 
  tidyr::drop_na(ch)

Other filters: slice options

df |>
  fslice(n = 3)                 # First 3 rows
df |>
  fslice(n   = 3, 
         how = "last")          # Last 3 rows
df |>
  fslice(n = 0.1)               # Fraction of rows: first 10% of rows

fslice(n        = 3, 
       how      = "min", 
       order.by = int)          # 3 obs with lowest int

# TODO: add fslicev()

# Frist 3 rows
dt[1:3, ]   # First 3 rows, all columns


# Last 3 rows
dt[(.N-2):.N]  # .N gives the total number of rows

# Fraction of rows: first 10% of rows
dt[1:(.N * 0.1)]


# 3 obs with lowest int
dt[order(int)][1:3]

# First 3 rows
tb |>
  slice_head(n = 3)

# Last 3 rows
tb |>
  slice_tail(n = 3)

# Fraction of rows: first 10% of rows
tb |>
  slice_head(prop = 0.1)

# 3 obs with lowest int
tb |>
  slice_min(order_by = int, 
            n        = 3) # all rows

tb |>
 slice_min(order_by = int, 
           n        = 3, 
           with_ties = FALSE)

Sort rows

Sort rows by column(s)

df |>
  roworder(id1)  

df |>
  roworder(-id2)      # Sort by decreasing order of id2

df |>
  roworder(id1, -id2) # Sort by multiple cols

dt[order(id2)]    # This makes a copy

setorder(dt, id2) # To modify by reference  

dt[order(-id2)]   # Sort by decreasing order

dt[order(id1, -id2)] # Sort by multiple cols

tb |>
  arrange(id2)

tb |>
  arrange(desc(id2)) # Sort by decreasing order

# Sort by multiple cols 
df |>
   arrange(id1, desc(id2))

Select columns

Select one or more columns

## Select one column   ####
# _________________________ 

# by index
df |>
  fselect(2)

df |>
  slt(2) # shorthand for fselect

# by name

df |>
  fselect(id2)  # returns a dataframe 

## Select multiple columns ####
# _____________________________ 

df |>
  fselect(id1, id2, fct)

df |>
  fselect(id1, ch:fct)

## Select one column   ####
# _________________________ 

# by index
dt[[3]]  # returns a vector
dt[, 3]  # returns a data.table

# by name
dt[, list(id2)] # returns a data.table
dt[, .(id2)]    # returns a data.table (. is an alias for list)
dt[, "id2"]     # returns a data.table
dt[, id2]       # returns a vector
dt[["id2"]]     # returns a vector

## Select multiple columns ####
# _____________________________ 

dt[, .(id1, id2, int)]
dt[, list(id1, id2, int)]
dt[, id2:int] # select columns between id2 and int

## Select one column   ####
# _________________________ 

tb |>
  select(id2)               # returns a tibble

pull(tb, id2, name = ch)    # returns a (named) vector
tb[, "id2"]                 # returns a tibble
tb[["id2"]]                 # returns a vector

## Select multiple columns ####
# _____________________________ 

df |>
  select(id1, id2, ch)
df |>
  select(id1, ch:fct)

Exclude columns

# Exclude columns by column names 
df |>
  fselect(-dt, -tm)

# Using a character vector 
cols <- c("dt", "tm")

df |>
  fselect(-cols)  # does not work

Error in -cols: invalid argument to unary operator

df |>
  fselect(!cols) # does not work

Error in !cols: invalid argument type

# what is a better way to do this?

# Exclude columns by column names 
dt[, !c("dt", "tm")]

# Using a character vector 
cols <- c("dt", "tm")

dt[, ..cols] 
# .. prefix means 'one-level up', as cols is outside the parent environment 

dt[, !..cols] # or dt[, -..cols]

# Exclude columns by column names 
tb |>
  select( -dt, -tm)

# Using a character vector 
cols <- c("dt", "tm")

tb |>
  select(all_of(cols))

tb |>
  select(-all_of(cols))

Other selections - not sure it is relevant, to check

Miscellaneous

Read & write data

Write

# no specific functions for reading and writing data

fwrite(dt, 
       "DT.csv")                # write to csv


fwrite(dt, 
       "DT.txt", 
       sep = "\t")              # write to a tab-delimited file

readr::write_csv(tb, 
                 "tb.csv")  # write to csv

readr::write_delim(tb, 
                   "tb.txt", 
                   delim = "\t")  # write to a tab-delimited file

Read

# no specific functions for reading and writing data

fread("dt.csv")   # read csv
# fread("DT.csv", verbose = TRUE) # full details

fread("dt.txt", sep = "\t") # read tab-delimited file

# Read and rbind several files
rbindlist(
  lapply(c("dt.csv", "dt.csv"), 
         fread))

readr::read_csv("tb.csv")  # read csv

readr::read_delim("tb.txt", 
                  delim = "\t")  # read tab-delimited file

# Read and rbind several files
c("tb.csv",  "tb.csv") |>
  purrr::map_dfr(readr::read_csv)

Reshape data

# ---- Long to Wide ----
wide_pivot <- pivot(df, 
                    ids    = c("id1", "id2", "dt"),  # Columns to keep
                    values = "int",              # Column with values
                    names  = "ch",   # Column whose values become new cols
                    how    = "wider")               # Reshape to wide format

# ---- Wide to Long ----
long_pivot <- pivot(wide_pivot, 
                    ids    = c("id1", "id2", "dt"),  
                    values = NULL,  
                    names  = list("ch", "int"),  
                    how    = "longer")

# ---- Long to Wide ----
wide_dt <- dcast(dt, 
                 id1 + id2 + dt ~ ch, 
                 value.var = "int")

# ---- Wide to Long ----
long_dt <- melt(wide_dt, 
                id.vars       = c("id1", "id2", "dt"), 
                variable.name = "ch", 
                value.name    = "int")

# ---- Long to Wide ----
tb_wide <- tb |>
  # rm NAs
  filter(!is.na(ch)) |>
  pivot_wider(names_from  = ch, 
              values_from = int)

# ---- Wide to Long ----
tb_long <- tb_wide |>
  pivot_longer(cols      = c("A", "D", "C", "B"),
               values_to = "int",
               names_to  = "ch")

Summarise data

Summarise columns

# efficient
df |>
  fsummarise(sum_rf = fsum(realf),
             sd_rd = fsd(reald))

# shorthand
df |>
  smr(sum_rf = fsum(realf),
      sd_rd = fsd(reald))

dt[, sum(realf)] # returns a vector

dt[, .(sum(realf))] # returns a data.table

dt[, .(sum_rf = sum(realf), # returns a data.table with named columns
       sd_rd = sd(reald))]

summarise(tb, sum(realf)) # returns a tibble

tb |> 
  summarise(sum_rf = sum(realf), # returns a tibble
            sd_rd = sd(reald))

Helper functions

The package includes fsum,fprod, fmedian, fmode, fvar, fsd, fmin,fmax,fnth, ffirst, flast,fnobs, and fndistinct.

The package includes first, last and uniqueN.

The package includes first, last, n, nth, and n_distinct.

Manipulations of columns

 # Add one oe several columns (can also use ftransform)
df <- df |>
  fmutate(log_rf = log(realf)) 

df <- df |>
  fmutate(log_rd = log(reald),
           sqrt_rd = sqrt(reald))

# Create one column and eliminate others
fcompute(df, log2_rf = log_rf*2) 

 # remove columns
df <- df |>
  fselect(-log_rf,-log_rd,-sqrt_rd)

# Add one column
dt[, log_rf := log(realf)] 

# Add several columns
dt[, ':=' (log_rd = log(reald), 
           sqrt_rd = sqrt(reald))]

# Create one column and eliminate others
dt[, .(log2_rf = log_rf*2)] 

# remove columns
dt[, c("log_rf", "log_rd", "sqrt_rd") := NULL]

# Add one or several column
tb <- tb |> 
  mutate(log_rf = log(realf))

tb <- tb |> 
  mutate(log_rd = log(reald), 
           sqrt_rd = sqrt(reald))

# Create one column and eliminate others
transmute(tb, log2_rf = log_rf*2)

# remove columns
tb <- tb |> 
  select(-log_rf,-log_rd,-sqrt_rd)

by

df |> 
  fgroup_by(ch)|>
  fsummarise(sumrf = fsum(realf)) # ordered and indexed results

# Assigned column name

df |> 
  fgroup_by(abc = tolower(ch))|>
  fsummarise(sumrf = fsum(realf))

# Add a column with number of observations for each group

df |>
  fgroup_by(ch)|>
  fcount(add = TRUE)

dt[, .(sumrf = sum(realf)), by = "ch"] # unordered results

# Reordered and indented:

dt[, keyby = ch,
     .(sumrf = sum(realf))]

# Assigning column name

dt[, keyby = .(abc = tolower(ch)),
     .(sumrf = sum(realf))]

# Add a column with number of observations for each group

dt[, n := .N, by = ch][]
dt[, n := NULL] # remove for consistency

tb |> 
  group_by(ch)|>
  summarise(sumrf = sum(realf)) # ordered results

# Assigned column name

tb |> 
  group_by(abc = tolower(ch))|>
  summarise(sumrf = sum(realf))

# Add a column with number of observations for each group

tb |>
  group_by(ch)|>
  add_tally()

# or...

add_count(tb, ch)

Going further

Advanced columns manipulation

# Summarize columns
df |>
  fsummarise(across(c("realf", "reald"),
            fmean))

# Summarize using a condition
df |>
  fsummarise(across(is.numeric, # different from dplyr due to across
                   fmean))

# Modify all the columns
df |> 
  fmutate(across(NULL,rev))

# Modify several columns
df |> 
  fcomputev(vars = c("realf", "reald"), # dropping the other columns
                  sqrt)

df <- df |>
  ftransformv(vars = c("realf", "reald"), # keeping the other columns
                sqrt)

df <- df |>
  ftransformv(vars = c("realf", "reald"), # reverting for consistency
                FUN = function(x){ x^2 })

# Modify columns using a condition 

df |>
  fcomputev(is.numeric,
           FUN = function(x){x - 1})

# Summarize columns
dt[, lapply(.SD, mean),
   .SDcols = c("realf", "reald")]

# Summarize using a condition
dt[, lapply(.SD, mean),
     .SDcols = is.numeric]

# Modify all the columns
dt[, lapply(.SD, rev)]

# Modify several columns
dt[, lapply(.SD, sqrt), # dropping the other columns
     .SDcols = realf:reald]

cols <- c("realf", "reald")

dt[, (cols) := lapply(.SD, sqrt), # keeping the other columns
     .SDcols = cols]

dt[, (cols) := lapply(.SD, "^", 2L), # reverting for consistency
     .SDcols = cols]

# Modify columns using a condition 
dt[, .SD - 1,
     .SDcols = is.numeric] 

rm(cols)

# Summarize columns
tb |>
  summarise(across(c("realf", "reald"),
            mean))

# Summarize using a condition
tb |>
  summarise(across(where(is.numeric),
                   mean))

# Modify all the columns
tb |> 
  mutate(across(everything(),
                rev))

# Modify several columns
tb |>
  transmute(across(c("realf", "reald"), # dropping the other columns
                  sqrt))

tb <- tb |>
  mutate(across(all_of(c("realf", "reald")), # keeping the other columns
                sqrt))

tb <- tb |>
  mutate(across(all_of(c("realf", "reald")), # reverting for consistency
                ~ "^"(.x, 2L)))

# Modify columns using a condition 

tb |>
  transmute(across(where(is.numeric), 
                   ~ '-'(., 1L)))

Chain expressions

df |>
  fgroup_by(ch)|>
  fsummarise(sumrf = fsum(realf))|>
  fsubset(sumrf < 129000)

dt[, by = ch,
   .(sumrf = sum(realf))][
     sumrf < 129000
   ]

tb |>
  group_by(ch)|>
  summarise(sumrf = sum(realf))|>
  filter(sumrf < 129000)

Indexing and keys (not sure it applies)

set modifications

# Replace values (recommendation to use set of data.table)

set(df, i = 1L, j = 2L, value = 30L) 

# Reorder rows

df <- roworder(df, id2, -id1)

df <- roworder(df, id1) # reversal for consistency

# Modify column names

df <- frename(df, dt = date)

df <- frename(df, date = dt) # reversal for consistency

# reorder columns

df <- colorder(df, id1, id2, ch)

# Replace values

set(dt, i = 1L, j = 2L, value = 30L)

# Reorder rows

setorder(dt, id2, -id1)

setorder(dt, id1) # reversal for consistency

# Modify column names

setnames(dt, old = "dt", new = "date")

setnames(dt, old = "date", new = "dt")  # reversal for consistency

# reorder columns

setcolorder(dt, c("id1","id2","ch"))

# Replace values

tb[1,2] <- 30L

# Reorder rows

tb <- arrange(tb, id2, desc(id1))

tb <- arrange(tb, id1) # reversal for consistency

# Modify column names

tb <- rename(tb, date = dt)

tb <- rename(tb, dt = date) # reversal for consistency

# reorder columns

tb <- relocate(tb, c("id1", "id2", "ch"))

Advanced use of by (maybe)

Join/Bind data sets

Bind

x <- data.table(1:3)
y <- data.table(4:6)
z <- data.table(7:9, 0L)

# bind rows

rowbind(x, y, fill = TRUE) # always fills

# bind rows using a list

rowbind(list(x, y), idcol = "id")

# bind columns

base::cbind(x, y)

add_vars(x) <- y # modifies x but keeps data structure and attributes

x <- data.table(1:3)
y <- data.table(4:6)
z <- data.table(7:9, 0L)

# bind rows

rbind(x, y, fill = TRUE)

# bind rows using a list

rbindlist(list(x, y), idcol = "id")

# bind columns

base::cbind(x, y)

x <- base::cbind(x, y) # modifies x but column names are not changed

x <- data.table(1:3)
y <- data.table(4:6)
z <- data.table(7:9, 0L)

# bind rows

bind_rows(x, y) # always fills

# bind rows using a list

bind_rows(list(x, y), .id = "id")

# bind columns

bind_cols(x, y)

x <- bind_cols(x, y) # modifies x and replace names

Efficient functions (maybe)

Other useful functions in collapse

# quick summary (From STATA summarize and xtsummarize)

qsu(df, cols = c("realf"))