Reference semantics

I use the R data.table package on a daily basis. One of the features that I find very useful is the reference semantics. More information about this can be found in the documentation:

# open the vignette that describes the reference semantics
vignette(topic = "datatable-reference-semantics", package = "data.table")
# attach data.table so that the help page of `:=` can be found
library(data.table)
# open the help page of the assignment-by-reference operator
help(":=")

Here is a small example to show what the reference semantics actually does:

# a small table with a single integer column
dt1 <- data.table::data.table(c1 = seq_len(3L))
data.table::address(dt1)
# "0x560a8f2b92f0"

# a plain assignment does not copy: both names point to the same object
dt2 <- dt1
data.table::address(dt2)
# "0x560a8f2b92f0", i.e. the same address as dt1

# adding a column to dt2 with ":=" therefore also shows up in dt1:
dt2[, c2 := c("a", "b", "c")]
print(dt1, class = TRUE)
#       c1     c2
#    <int> <char>
# 1:     1      a
# 2:     2      b
# 3:     3      c

# Adding a column the data.frame way (with "<-") makes R create a new
# data.table, so dt2 now lives at a new address
dt2$c3 <- c("a", "b", "c")
data.table::address(dt2)
# "0x560a94785120", i.e. different from dt1's address

# dt1 did not receive the new column:
print(dt1, class = TRUE)
#       c1     c2
#    <int> <char>
# 1:     1      a
# 2:     2      b
# 3:     3      c

# whereas dt2 now has one column more than dt1:
print(dt2, class = TRUE)
#       c1     c2     c3
#    <int> <char> <char>
# 1:     1      a      a
# 2:     2      b      b
# 3:     3      c      c

Reference semantics comes in handy when defining functions which transform the input data.table by reference. If the data.table is big, this can make quite a difference, since the data.table is transformed in place, i.e. no copying is done:

# Attach a fixed "authors" column to a data.table by reference, i.e. without
# copying the table.
#
# dt: a data.table; it is modified in place. The three hard-coded names are
#     assigned to the column "authors", so dt is expected to have three rows.
# Returns dt invisibly.
attach_authors_column <- function(dt)
{
  stopifnot(data.table::is.data.table(dt))

  # The local vector is deliberately not called "authors": inside
  # `[.data.table` column names are looked up before local variables, so an
  # existing "authors" column would be assigned to itself instead of being
  # overwritten.
  author_names <- c("David Grossman", "Nichita Stanescu", "Matsuo Basho")
  dt[, authors := author_names]

  # When ":=" is the last data.table call of a function, the next print of dt
  # at the prompt is suppressed (data.table FAQ 2.23); an empty "[]" resets it.
  dt[]
  invisible(dt)
}
# dt1 is modified in place, so the result does not need to be assigned
attach_authors_column(dt = dt1)
print(dt1, class = TRUE)
#       c1     c2          authors
#    <int> <char>           <char>
# 1:     1      a   David Grossman
# 2:     2      b Nichita Stanescu
# 3:     3      c     Matsuo Basho

 

In-place joins with unique keys

What I learned just recently is that one can also join in-place, i.e. without additional copying operations:

# lookup table; converted to a data.table by reference
dt1 <- utils::read.table(
  text = "
      c1 c2 d   e
      a  1  11  A
      b  3  21  B
      a  2  31  C
      ",
  header = TRUE,
  stringsAsFactors = FALSE
)
data.table::setDT(dt1)

# table that will receive new columns; also converted by reference
dt2 <- utils::read.table(
  text = "
      c1 c10 c2    d
      b 3    -10  -1
      c 7    -11  -3
      ",
  header = TRUE,
  stringsAsFactors = FALSE
)
data.table::setDT(dt2)

# a classic left join returns a new table ...
key_column <- "c1"
merge(dt2, dt1, by = key_column, all.x = TRUE)
#    c1 c10 c2.x d.x c2.y d.y    e
# 1:  b   3  -10  -1    3  21    B
# 2:  c   7  -11  -3   NA  NA <NA>

# ... and leaves dt2 untouched
print(x = dt2, class = TRUE)
#        c1   c10    c2     d
#    <char> <int> <int> <int>
# 1:      b     3   -10    -1
# 2:      c     7   -11    -3

# in-place join, unprefixed names: here both d and c2 resolve to dt2b's columns
dt2b <- data.table::copy(dt2)
dt2b[dt1, multiplied := d * c2, on = key_column][]
#    c1 c10  c2  d multiplied
# 1:  b   3 -10 -1         10
# 2:  c   7 -11 -3         NA

# in-place join, both operands explicitly taken from dt2b (prefix "x.")
dt2b <- data.table::copy(dt2)
dt2b[dt1, multiplied := x.d * x.c2, on = key_column][]
#    c1 c10  c2  d multiplied
# 1:  b   3 -10 -1         10
# 2:  c   7 -11 -3         NA

# in-place join, d taken from dt1 (prefix "i.") and c2 taken from dt2b
dt2b <- data.table::copy(dt2)
dt2b[dt1, multiplied := i.d * x.c2, on = key_column][]
#    c1 c10  c2  d multiplied
# 1:  b   3 -10 -1       -210
# 2:  c   7 -11 -3         NA

Notes:

  • the notation i.d refers to column d of the data.table dt1, while x.d means column d of the data.table dt2b
  • pay attention to the uniqueness of the keys (i.e. the columns from the on statement) when doing in-place joins. If the key is not unique, the results might be surprising, as shown below
  • [] is there just to trigger a print of the table

If you want to attach to dt2 all the dt1 columns which are not in dt2, use mget:

# work on a copy so that dt2 itself stays unchanged
dt2b <- data.table::copy(dt2)
cols1 <- names(dt1)
cols2 <- names(dt2b)

# columns present in both tables
common_cols <- intersect(cols2, cols1)
print(common_cols)
# "c1" "c2" "d"

# columns of dt1 that dt2b does not have yet
cols_to_add <- setdiff(cols1, cols2)
print(cols_to_add)
# "e"

# "mget" fetches all the "i."-prefixed dt1 columns at once:
dt2b[dt1, (cols_to_add) := mget(paste0("i.", cols_to_add)), on = "c1"][]
#    c1 c10  c2  d    e
# 1:  b   3 -10 -1    B
# 2:  c   7 -11 -3 <NA>

The power of in-place joins can be seen in functions which transform a data.table:

# Build the small lookup table used by the in-place join examples.
#
# Returns (invisibly, as the result of setDT) a data.table with the columns
# c1, c2, d and e.
get_dt <- function()
{
  raw_text <- "
      c1 c2 d   e
      a  1  11  A
      b  3  21  B
      a  2  31  C
      "
  lookup <- utils::read.table(
    text = raw_text,
    header = TRUE,
    stringsAsFactors = FALSE
  )
  # convert the data.frame to a data.table by reference
  data.table::setDT(lookup)
}


# Attach a "multiplier" column to a data.table by reference, using an in-place
# join with the lookup table returned by get_dt().
#
# dt: a data.table with the columns c1 (join key) and c2; modified in place.
#     multiplier is the lookup table's d ("i.d") times dt's own c2 ("x.c2");
#     rows of dt without a match in the lookup table get NA.
# Returns dt invisibly.
attach_multiplier_column <- function(dt)
{
  key_column <- "c1"
  # you might want to check here that your key is unique in both data.tables:
  # if a row of dt matches several lookup rows, only the last match is kept

  lookup <- get_dt()

  dt[lookup, on = key_column, multiplier := i.d * x.c2]

  # When ":=" is the last data.table call of a function, the next print of dt
  # at the prompt is suppressed (data.table FAQ 2.23); an empty "[]" resets it.
  dt[]
  invisible(dt)
}

# work on a copy of dt2; the function then adds the column in place
dt <- data.table::copy(x = dt2)
attach_multiplier_column(dt = dt)
print(dt)
#    c1 c10  c2  d multiplier
# 1:  b   3 -10 -1       -210
# 2:  c   7 -11 -3         NA

 

In-place joins with non-unique keys

When the keys are not unique:

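  • if a row of the table to be modified matches several rows of the joined table, only the last matched row is kept
  • if several rows of the table to be modified match the same row of the joined table, each of them receives that row's values; note that the table being modified must be a data.table (convert it with setDT first), otherwise `[` dispatches to the data.frame method and fails with an "unused argument (on = ...)" error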
dt1 <- utils::read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "
      c1 c2 d   e
      a  1  11  a
      a  2  31  c
      b  3  21  b
      b  4  41  d
      b  5  61  f
      b  5  60  f
      c  6  51  e
      c  7  52  e
      "
)
# Convert dt1 to a data.table by reference. Without this call dt1 (and every
# copy of it) stays a plain data.frame, and `dt1b[dt2, on = "c1", ...]` below
# fails in `[.data.frame` with "unused argument (on = "c1")".
data.table::setDT(dt1)

data.table::setorder(dt1, c1)

dt2 <- utils::read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "
      c1 c10
      b 4
      c 2
      "
)
data.table::setDT(dt2)

key_column <- "c1"
merge(
  x = dt2,
  y = dt1,
  by = key_column,
  all.x = TRUE
)
#    c1 c10 c2  d e
# 1:  b   4  3 21 b
# 2:  b   4  4 41 d
# 3:  b   4  5 61 f
# 4:  b   4  5 60 f
# 5:  c   2  6 51 e
# 6:  c   2  7 52 e

cols1 <- colnames(dt1)
cols1 <- setdiff(cols1, key_column)
dt2b <- data.table::copy(dt2)
dt2b[dt1, on = "c1", (cols1) := mget(paste0("i.", cols1))][]
# Each row of dt2b matches several rows of dt1; note how only the last
# matching row is kept (e.g. for c1 = b)
#    c1 c10 c2  d e
# 1:  b   4  5 60 f
# 2:  c   2  7 52 e

#-------------------------
merge(
  x = dt1,
  y = dt2,
  by = key_column,
  all.x = TRUE
)
#    c1 c2  d e c10
# 1:  a  1 11 a  NA
# 2:  a  2 31 c  NA
# 3:  b  3 21 b   4
# 4:  b  4 41 d   4
# 5:  b  5 61 f   4
# 6:  b  5 60 f   4
# 7:  c  6 51 e   2
# 8:  c  7 52 e   2

cols2 <- colnames(dt2)
cols2 <- setdiff(cols2, key_column)
dt1b <- data.table::copy(dt1)
dt1b[dt2, on = "c1", (cols2) := mget(paste0("i.", cols2))][]
# Several rows of dt1b match the same row of dt2: each of them receives that
# row's c10, rows without a match get NA. The result equals the merge above.
#    c1 c2  d e c10
# 1:  a  1 11 a  NA
# 2:  a  2 31 c  NA
# 3:  b  3 21 b   4
# 4:  b  4 41 d   4
# 5:  b  5 61 f   4
# 6:  b  5 60 f   4
# 7:  c  6 51 e   2
# 8:  c  7 52 e   2

These examples were created with R version 4.0.4, data.table version 1.14.0.

 

Make a promise. Show up. Do the work. Repeat.