Introduction to keyholder

Evgeni Chasnovski

2023-03-11

keyholder is a package for storing information (keys) about rows of data frame-like objects. The common use cases are tracking rows of data without modifying it, and backing up and restoring information about rows. This is done by creating a class keyed_df which has a special attribute “keys”. Keys are updated according to changes in the rows of the reference data frame.

keyholder is designed to work closely with the dplyr package. All of dplyr's one- and two-table verbs update keys properly.

mtcars_tbl <- mtcars %>% as_tibble()

Set keys

By convention, keys are always converted to a tibble. This way one can use multiple variables as keys by binding them together as columns.

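There are two general ways of creating keys: assigning them directly with keys<-() or assign_keys(), and selecting columns of the data frame with key_by() or one of its scoped variants: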

mtcars_tbl_keyed <- mtcars_tbl
keys(mtcars_tbl_keyed) <- tibble(id = 1:nrow(mtcars_tbl_keyed))

mtcars_tbl %>% assign_keys(tibble(id = 1:nrow(.)))
#> # A keyed object. Keys: id 
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows
mtcars_tbl %>% key_by(vs, am)
#> # A keyed object. Keys: vs, am 
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows

mtcars_tbl %>% key_by(starts_with("c"))
#> # A keyed object. Keys: cyl, carb 
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows

mtcars_tbl %>% key_by(starts_with("c"), .exclude = TRUE)
#> # A keyed object. Keys: cyl, carb 
#> # A tibble: 32 × 9
#>     mpg  disp    hp  drat    wt  qsec    vs    am  gear
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21     160   110  3.9   2.62  16.5     0     1     4
#> 2  21     160   110  3.9   2.88  17.0     0     1     4
#> 3  22.8   108    93  3.85  2.32  18.6     1     1     4
#> # … with 29 more rows

  # Scoped variants
mtcars_tbl %>% key_by_all()
#> # A keyed object. Keys: mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb 
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows

# One can also rename variables before keying by supplying .funs
mtcars_tbl %>% key_by_if(rlang::is_integerish, .funs = toupper)
#> # A keyed object. Keys: CYL, HP, VS, AM, GEAR, CARB 
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows

mtcars_tbl %>% key_by_at(c("vs", "am"))
#> # A keyed object. Keys: vs, am 
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows

To track rows, use use_id(), which creates a special key .id with row numbers as values.

To properly unkey an object, use unkey().

mtcars_tbl_keyed <- mtcars_tbl %>% key_by(vs, am)

# Good
mtcars_tbl_keyed %>% unkey()
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows

# Bad
attr(mtcars_tbl_keyed, "keys") <- NULL
mtcars_tbl_keyed
#> # A keyed object. Keys: there are no keys.
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows

Get keys

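There are three ways of extracting keys: keys() always returns a tibble (with zero columns if there are no keys), raw_keys() returns the “keys” attribute as is (NULL if there are no keys), and pull_key() extracts a single key as a vector: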

mtcars_tbl %>% keys()
#> # A tibble: 32 × 0

mtcars_tbl %>% key_by(vs, am) %>% keys()
#> # A tibble: 32 × 2
#>      vs    am
#>   <dbl> <dbl>
#> 1     0     1
#> 2     0     1
#> 3     1     1
#> # … with 29 more rows
mtcars_tbl %>% raw_keys()
#> NULL

mtcars_tbl %>% key_by(vs, am) %>% raw_keys()
#> # A tibble: 32 × 2
#>      vs    am
#>   <dbl> <dbl>
#> 1     0     1
#> 2     0     1
#> 3     1     1
#> # … with 29 more rows
mtcars_tbl %>% key_by(vs, am) %>% pull_key(vs)
#>  [1] 0 0 1 1 0 1 0 1 1 1 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 0 0 1

Manipulate keys

mtcars_tbl %>% key_by(vs, mpg) %>% remove_keys(vs)
#> # A keyed object. Keys: mpg 
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows

mtcars_tbl %>% key_by(vs, mpg) %>% remove_keys(everything(), .unkey = TRUE)
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows

  # Scoped variants
# Identical to previous one
mtcars_tbl %>% key_by(vs, mpg) %>% remove_keys_all(.unkey = TRUE)
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows

mtcars_tbl %>% key_by(vs, mpg) %>% remove_keys_if(rlang::is_integerish)
#> # A keyed object. Keys: mpg 
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows
mtcars_tbl_keyed <- mtcars_tbl %>%
  key_by(vs, mpg) %>%
  mutate(vs = 1, mpg = 0)
mtcars_tbl_keyed
#> # A keyed object. Keys: vs, mpg 
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     0     6   160   110  3.9   2.62  16.5     1     1     4     4
#> 2     0     6   160   110  3.9   2.88  17.0     1     1     4     4
#> 3     0     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows

mtcars_tbl_keyed %>% restore_keys(vs)
#> # A keyed object. Keys: vs, mpg 
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     0     6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2     0     6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3     0     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows

mtcars_tbl_keyed %>% restore_keys(vs, .remove = TRUE)
#> # A keyed object. Keys: mpg 
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     0     6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2     0     6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3     0     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows

mtcars_tbl_keyed %>% restore_keys(vs, mpg, .unkey = TRUE)
#> # A keyed object. Keys: vs, mpg 
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows

mtcars_tbl_keyed %>% restore_keys(vs, mpg, .remove = TRUE, .unkey = TRUE)
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows

  # Scoped variants
mtcars_tbl_keyed %>% restore_keys_all()
#> # A keyed object. Keys: vs, mpg 
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows

mtcars_tbl_keyed %>% restore_keys_if(rlang::is_integerish, .remove = TRUE)
#> # A keyed object. Keys: mpg 
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     0     6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2     0     6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3     0     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows

One important feature of restore_keys() is that restoring keys overrides the rule of not modifying grouping variables. This follows from the ideology of keys: they contain information about rows, and by restoring them you want that information to be available. Groups are recomputed after restoring.

mtcars_tbl_keyed %>% group_by(vs, mpg)
#> # A keyed object. Keys: vs, mpg 
#> # A tibble: 32 × 11
#> # Groups:   vs, mpg [1]
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1     0     6   160   110  3.9   2.62  16.5     1     1     4     4
#> 2     0     6   160   110  3.9   2.88  17.0     1     1     4     4
#> 3     0     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows

mtcars_tbl_keyed %>% group_by(vs, mpg) %>% restore_keys(vs, mpg)
#> # A keyed object. Keys: vs, mpg 
#> # A tibble: 32 × 11
#> # Groups:   vs, mpg [26]
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows
mtcars_tbl %>% key_by(vs, am) %>% rename_keys(Vs = vs)
#> # A keyed object. Keys: Vs, am 
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows

  # Scoped variants
mtcars_tbl %>% key_by(vs, am) %>% rename_keys_all(.funs = toupper)
#> # A keyed object. Keys: VS, AM 
#> # A tibble: 32 × 11
#>     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1  21       6   160   110  3.9   2.62  16.5     0     1     4     4
#> 2  21       6   160   110  3.9   2.88  17.0     0     1     4     4
#> 3  22.8     4   108    93  3.85  2.32  18.6     1     1     4     1
#> # … with 29 more rows

React to subset

A method for the subsetting function [ is implemented for keyed_df to react to changes in rows: if rows in the reference data frame are rearranged or removed, the same operation is applied to the keys.

mtcars_tbl_subset <- mtcars_tbl %>% key_by(vs, am) %>%
  `[`(c(3, 18, 19), c(2, 8, 9))

mtcars_tbl_subset
#> # A keyed object. Keys: vs, am 
#> # A tibble: 3 × 3
#>     cyl    vs    am
#>   <dbl> <dbl> <dbl>
#> 1     4     1     1
#> 2     4     1     1
#> 3     4     1     1

keys(mtcars_tbl_subset)
#> # A tibble: 3 × 2
#>      vs    am
#>   <dbl> <dbl>
#> 1     1     1
#> 2     1     1
#> 3     1     1

Verbs from dplyr

All one- and two-table verbs from dplyr (including their scoped variants, where present) support keyed_df. Most functions react to changes in rows in the same way as [, but some functions (summarise(), distinct() and do()) unkey the object.

mtcars_tbl_keyed <- mtcars_tbl %>% key_by(vs, am)

mtcars_tbl_keyed %>% select(gear, mpg)
#> # A keyed object. Keys: vs, am 
#> # A tibble: 32 × 2
#>    gear   mpg
#>   <dbl> <dbl>
#> 1     4  21  
#> 2     4  21  
#> 3     4  22.8
#> # … with 29 more rows

mtcars_tbl_keyed %>% summarise(meanMPG = mean(mpg))
#> # A tibble: 1 × 1
#>   meanMPG
#>     <dbl>
#> 1    20.1

mtcars_tbl_keyed %>% filter(vs == 1) %>% keys()
#> # A tibble: 14 × 2
#>      vs    am
#>   <dbl> <dbl>
#> 1     1     1
#> 2     1     0
#> 3     1     0
#> # … with 11 more rows

mtcars_tbl_keyed %>% arrange_at("mpg") %>% keys()
#> # A tibble: 32 × 2
#>      vs    am
#>   <dbl> <dbl>
#> 1     0     0
#> 2     0     0
#> 3     0     0
#> # … with 29 more rows

band_members %>% key_by(name) %>%
  semi_join(band_instruments, by = "name") %>%
  keys()
#> # A tibble: 2 × 1
#>   name 
#>   <chr>
#> 1 John 
#> 2 Paul