README

deduped contains one main function deduped() which speeds up slow, vectorized functions by only performing computations on the unique values of the input and expanding the results at the end. A convenience wrapper, with_deduped(), was added in version 0.3.0 to allow piping an existing expression.

Installation

install.packages("deduped")

if(!requireNamespace("remotes")) install.packages("remotes")

remotes::install_github("orgadish/deduped")

Examples

Setup

library(deduped)
set.seed(0)

slow_tolower <- function(x) {
  for (i in x) {
    Sys.sleep(0.0005)
  }
  tolower(x)
}

deduped(...)


# Create a vector with significant duplication.
set.seed(1)
unique_vec <- sample(LETTERS, 5)
print(unique_vec)
#> [1] "Y" "D" "G" "A" "B"
duplicated_vec <- sample(rep(unique_vec, 100))
length(duplicated_vec)
#> [1] 500

system.time({  x1 <- slow_tolower(duplicated_vec)  })
#>    user  system elapsed 
#>    0.02    0.00    6.73


system.time({  x2 <- deduped(slow_tolower)(duplicated_vec)  })
#>    user  system elapsed 
#>    0.08    0.00    0.17
all.equal(x1, x2)
#> [1] TRUE

# As of version 0.3.0, you can use `with_deduped()`.
all.equal(x1, slow_tolower(duplicated_vec) |> with_deduped())
#> [1] TRUE

deduped(lapply)(...)


set.seed(2)
unique_list <- lapply(1:3, function(j) sample(LETTERS, j, replace = TRUE))
str(unique_list)
#> List of 3
#>  $ : chr "U"
#>  $ : chr [1:2] "O" "F"
#>  $ : chr [1:3] "F" "H" "Q"

# Create a list with significant duplication.
duplicated_list <- sample(rep(unique_list, 50)) 
length(duplicated_list)
#> [1] 150

system.time({  y1 <- lapply(duplicated_list, slow_tolower)  })
#>    user  system elapsed 
#>    0.03    0.00    3.90
system.time({  y2 <- deduped(lapply)(duplicated_list, slow_tolower)  })
#>    user  system elapsed 
#>    0.00    0.00    0.09

all.equal(y1, y2)
#> [1] TRUE

deduped(fs::path_rel)(...)


set.seed(3)
top_path <- "x/y/z/"
unique_paths <- paste0(top_path, LETTERS, "/file.csv")
str(unique_paths)
#>  chr [1:26] "x/y/z/A/file.csv" "x/y/z/B/file.csv" "x/y/z/C/file.csv" ...

# Create a list with significant duplication.
dup_paths <- sample(rep(unique_paths, 500)) 
length(dup_paths)
#> [1] 13000

system.time({  y1 <- fs::path_rel(dup_paths, start=top_path)  })
#>    user  system elapsed 
#>    6.16    0.05    6.35
system.time({  y2 <- deduped(fs::path_rel)(dup_paths, start=top_path)  })
#>    user  system elapsed 
#>    0.01    0.00    0.01

all.equal(y1, y2)
#> [1] TRUE

When to use with_deduped()

with_deduped() is a convenience wrapper for interactive or one-off use. Because it reconstructs the wrapper function on every call, prefer deduped() directly when calling inside a loop:

# Good: wrapper is built once
deduped_slow_tolower <- deduped(slow_tolower)
for (x in list_of_vecs) deduped_slow_tolower(x)

# Avoid: wrapper is rebuilt on every iteration
for (x in list_of_vecs) slow_tolower(x) |> with_deduped()

deduped(..., verbose = TRUE)

For benchmarking or debugging, pass verbose = TRUE to see the reduction achieved.

head(
  deduped(slow_tolower, verbose = TRUE)(duplicated_vec)
)
#> deduped: 500 value(s) reduced to 5 unique (99.0% reduction).
#> [1] "y" "a" "b" "y" "d" "d"

# Also available in `with_deduped()`:
head(
  slow_tolower(duplicated_vec) |> with_deduped(verbose = TRUE)
)
#> deduped: 500 value(s) reduced to 5 unique (99.0% reduction).
#> [1] "y" "a" "b" "y" "d" "d"

# Use `options(deduped.verbose)` to enable for the entire session.
options(deduped.verbose = TRUE)
head(
  deduped(slow_tolower)(duplicated_vec)
)
#> deduped: 500 value(s) reduced to 5 unique (99.0% reduction).
#> [1] "y" "a" "b" "y" "d" "d"

deduped

Installation

Examples

Setup

`deduped(...)`

`deduped(lapply)(...)`

`deduped(fs::path_rel)(...)`

When to use `with_deduped()`

`deduped(..., verbose = TRUE)`

deduped

Installation

Examples

Setup

deduped(...)

deduped(lapply)(...)

deduped(fs::path_rel)(...)

When to use with_deduped()

deduped(..., verbose = TRUE)

`deduped(...)`

`deduped(lapply)(...)`

`deduped(fs::path_rel)(...)`

When to use `with_deduped()`

`deduped(..., verbose = TRUE)`