Title: | Statistical Experiments on Batch Computing Clusters |
Description: | Extends the BatchJobs package to run statistical experiments on batch computing clusters. For further details see the project web page. |
Author: | Bernd Bischl <bernd_bischl@gmx.net>, Michel Lang <michellang@gmail.com>, Olaf Mersmann <olafm@p-value.net> |
Maintainer: | Michel Lang <michellang@gmail.com> |
URL: | https://github.com/tudo-r/BatchExperiments |
BugReports: | https://github.com/tudo-r/BatchExperiments/issues |
License: | BSD_3_clause + file LICENSE |
Depends: | R (≥ 3.0.0), BatchJobs (≥ 1.7) |
Imports: | backports, utils, stats, checkmate (≥ 1.8.5), BBmisc (≥ 1.11), DBI, RSQLite (≥ 2.0), data.table (≥ 1.9.6) |
Suggests: | plyr, randomForest, rpart, testthat |
Version: | 1.4.3 |
RoxygenNote: | 7.1.2 |
NeedsCompilation: | no |
Packaged: | 2022-03-21 10:39:59 UTC; michel |
Repository: | CRAN |
Date/Publication: | 2022-03-21 11:00:02 UTC |
The BatchExperiments package
Description
Extends the BatchJobs package to run statistical experiments on batch computing clusters.
Additional information
- Homepage: https://github.com/tudo-r/BatchExperiments
- Wiki: https://github.com/tudo-r/BatchExperiments/wiki
ExperimentJob
Description
You can access job properties using the job
object which is optionally passed
to dynamic problem functions and algorithms. The object is a named list with the following
elements:
id
[integer(1)
]:Job ID.
prob.id
[character(1)
]:Problem ID.
prob.pars
[list
]:Problem parameters as named list.
algo.id
[character(1)
]:Algorithm ID.
algo.pars
[list
]:Algorithm parameters as named list.
repl
[integer(1)
]:Replication number of this experiment.
seed
[integer(1)
]:Seed set right before algorithm execution.
prob.seed
[integer(1)
]:Seed set right before generation of problem instance.
Add an algorithm to registry.
Description
Adds an algorithm to the registry and stores it on disk.
Usage
addAlgorithm(reg, id, fun, overwrite = FALSE)
Arguments
reg |
[ |
id |
[ |
fun |
[ To retrieve job information from the job object, see the documentation on ExperimentJob. |
overwrite |
[ |
Value
[character(1)
]. Invisibly returns the id.
See Also
Other add:
addExperiments()
,
addProblem()
Add experiments to the registry.
Description
Add experiments for running algorithms on problems to the registry, so they can be executed later.
Usage
addExperiments(
reg,
prob.designs,
algo.designs,
repls = 1L,
skip.defined = FALSE
)
Arguments
reg |
[ |
prob.designs |
[ |
algo.designs |
[ |
repls |
[ |
skip.defined |
[ |
Value
Invisibly returns vector of ids of added experiments.
See Also
Other add:
addAlgorithm()
,
addProblem()
Examples
### EXAMPLE 1 ###
reg = makeExperimentRegistry(id = "example1", file.dir = tempfile())

# Define a problem:
# Subsampling from the iris dataset.
data(iris)
# Dynamic problem function: splits the row indices of the static data into a
# random training set (a fraction 'ratio' of the rows) and the remaining rows
# as test set.
subsample = function(static, ratio) {
  n = nrow(static)
  train = sample(n, floor(n * ratio))
  # seq_len() stays empty for n = 0, whereas seq(0) would yield c(1, 0)
  test = setdiff(seq_len(n), train)
  list(test = test, train = train)
}
addProblem(reg, id = "iris", static = iris,
  dynamic = subsample, seed = 123)

# Define algorithm "tree":
# Decision tree on the iris dataset, modeling Species.
# Returns the table of true vs. predicted classes on the test rows.
tree.wrapper = function(static, dynamic, ...) {
  library(rpart)
  mod = rpart(Species ~ ., data = static[dynamic$train, ], ...)
  pred = predict(mod, newdata = static[dynamic$test, ], type = "class")
  table(static$Species[dynamic$test], pred)
}
addAlgorithm(reg, id = "tree", fun = tree.wrapper)

# Define algorithm "forest":
# Random forest on the iris dataset, modeling Species.
# Returns the table of true vs. predicted classes on the test rows.
forest.wrapper = function(static, dynamic, ...) {
  library(randomForest)
  mod = randomForest(Species ~ ., data = static, subset = dynamic$train, ...)
  pred = predict(mod, newdata = static[dynamic$test, ])
  table(static$Species[dynamic$test], pred)
}
addAlgorithm(reg, id = "forest", fun = forest.wrapper)

# Define problem parameters:
pars = list(ratio = c(0.67, 0.9))
iris.design = makeDesign("iris", exhaustive = pars)

# Define decision tree parameters:
pars = list(minsplit = c(10, 20), cp = c(0.01, 0.1))
tree.design = makeDesign("tree", exhaustive = pars)

# Define random forest parameters:
pars = list(ntree = c(100, 500))
forest.design = makeDesign("forest", exhaustive = pars)

# Add experiments to the registry:
# Use previously defined experimental designs.
addExperiments(reg, prob.designs = iris.design,
  algo.designs = list(tree.design, forest.design),
  repls = 2) # usually you would set repls to 100 or more.

# Optional: Short summary over problems and algorithms.
summarizeExperiments(reg)

# Optional: Test one decision tree job and one expensive (ntree = 500)
# random forest job. Use findExperiments to get the right job ids.
# The queried ntree value must be one of the values in forest.design above,
# otherwise no experiment matches the query.
do.tests = FALSE
if (do.tests) {
  id1 = findExperiments(reg, algo.pattern = "tree")[1]
  id2 = findExperiments(reg, algo.pattern = "forest",
    algo.pars = (ntree == 500))[1]
  testJob(reg, id1)
  testJob(reg, id2)
}

# Submit the jobs to the batch system
submitJobs(reg)

# Calculate the misclassification rate for all (already done) jobs.
# 'res' is the table returned by the algorithm; its diagonal holds the
# correctly classified observations.
reduce = function(job, res) {
  n = sum(res)
  list(mcr = (n - sum(diag(res))) / n)
}
res = reduceResultsExperiments(reg, fun = reduce)
print(res)

# Aggregate results using 'ddply' from package 'plyr':
# Calculate the mean over all replications of identical experiments
# (same problem, same algorithm and same parameters)
library(plyr)
vars = setdiff(names(res), c("repl", "mcr"))
aggr = ddply(res, vars, summarise, mean.mcr = mean(mcr))
print(aggr)
## Not run:
### EXAMPLE 2 ###
# define two simple test functions
testfun1 = function(x) sum(x^2)
testfun2 = function(x) -exp(-sum(abs(x)))

# Define ExperimentRegistry:
reg = makeExperimentRegistry("example02", seed = 123, file.dir = tempfile())

# Add the test functions to the registry:
addProblem(reg, "testfun1", static = testfun1)
addProblem(reg, "testfun2", static = testfun2)

# Use simulated annealing on the test functions.
# No box constraints are passed: optim() supports 'lower'/'upper' only for
# the methods "L-BFGS-B" and "Brent". With any other method it emits a warning
# and switches to "L-BFGS-B", so simulated annealing would never be run.
addAlgorithm(reg, "sann", fun = function(static, dynamic) {
  start = sample(c(-10, 10), 2)
  res = optim(start, fn = static, method = "SANN")
  # keep only the parts of the optim() result that are of interest
  res = res[c("par", "value", "counts", "convergence")]
  res$start = start
  res
})

# add experiments and submit
addExperiments(reg, repls = 10)
submitJobs(reg)

# Gather information from the experiments, in this case the function value
# and whether the algorithm converged:
reduceResultsExperiments(reg, fun = function(job, res) res[c("value", "convergence")])
## End(Not run)
Add a problem to registry.
Description
Adds a problem to the registry and stores it on disk.
Usage
addProblem(
reg,
id,
static = NULL,
dynamic = NULL,
seed = NULL,
overwrite = FALSE
)
Arguments
reg |
Registry. |
id |
[ |
static |
[any] |
dynamic |
[ |
seed |
[ |
overwrite |
[ |
Value
[character(1)
]. Invisibly returns the id.
See Also
Other add:
addAlgorithm()
,
addExperiments()
Find ids of experiments that match a query.
Description
Find job ids by querying problem/algorithm ids, problem/algorithm parameters or replication number.
Usage
findExperiments(
reg,
ids,
prob.pattern,
prob.pars,
algo.pattern,
algo.pars,
repls,
match.substring = TRUE,
regexp = FALSE
)
Arguments
reg |
[ |
ids |
[ |
prob.pattern |
[ |
prob.pars |
[R expression] |
algo.pattern |
[ |
algo.pars |
[R expression] |
repls |
[ |
match.substring |
[ |
regexp |
[ |
Value
[integer
]. Ids for experiments which match the query.
Examples
# Registry with two problems ("one", "two") and one algorithm "A" that is
# run with the parameter n = 1, ..., 4.
reg = makeExperimentRegistry(id = "example1", file.dir = tempfile())
p1 = addProblem(reg, "one", static = 1)
p2 = addProblem(reg, "two", static = 2)
a = addAlgorithm(reg, "A", fun = function(static, n) static + n)
# Spell out 'algo.designs' in full instead of relying on partial argument matching.
addExperiments(reg, algo.designs = makeDesign(a, exhaustive = list(n = 1:4)))

# Query by problem id: "one", then the pattern "o"
# (match.substring defaults to TRUE).
findExperiments(reg, prob.pattern = "one")
findExperiments(reg, prob.pattern = "o")
# Query by an expression on the algorithm parameters.
findExperiments(reg, algo.pars = (n > 2))
Generate dynamic part of problem.
Description
Calls the dynamic problem function on the static problem part and thereby creates the problem instance. The seeding mechanism is identical to execution on the slave.
Usage
generateProblemInstance(reg, id)
Arguments
reg |
[ |
id |
[ |
Value
Dynamic part of problem.
Get algorithm from registry by id.
Description
The requested object is loaded from disk.
Usage
getAlgorithm(reg, id)
Arguments
reg |
[ |
id |
[ |
Value
[Algorithm
].
See Also
Other get:
getAlgorithmIds()
,
getExperimentParts()
,
getJobs.ExperimentRegistry()
,
getProblemIds()
,
getProblem()
Get ids of algorithms in registry.
Description
Get algorithm ids for jobs.
Usage
getAlgorithmIds(reg, ids)
Arguments
reg |
[ |
ids |
[integer] |
Value
[character
].
See Also
Other get:
getAlgorithm()
,
getExperimentParts()
,
getJobs.ExperimentRegistry()
,
getProblemIds()
,
getProblem()
Get all parts required to run a single job.
Description
Get all parts which define an Experiment
.
Usage
getExperimentParts(reg, id)
Arguments
reg |
[ |
id |
[ |
Value
[named list]. Returns the Job, Problem, Instance and Algorithm.
See Also
Other get:
getAlgorithmIds()
,
getAlgorithm()
,
getJobs.ExperimentRegistry()
,
getProblemIds()
,
getProblem()
Group experiments.
Description
Creates a list of factors
to use in functions like tapply
, by
or aggregate
.
Usage
getIndex(
reg,
ids,
by.prob = FALSE,
by.algo = FALSE,
by.repl = FALSE,
by.prob.pars,
by.algo.pars,
enclos = parent.frame()
)
Arguments
reg |
[ |
ids |
[ |
by.prob |
[ |
by.algo |
[ |
by.repl |
[ |
by.prob.pars |
[R expression] |
by.algo.pars |
[R expression] |
enclos |
[ |
Value
[list
]. List of factors.
Examples
# create a registry and add problems and algorithms
reg = makeExperimentRegistry("getIndex", file.dir = tempfile(""))
addProblem(reg, "prob", static = 1)
# "f0" takes no extra parameters, "f1" takes the parameters i and k
addAlgorithm(reg, "f0", function(static, dynamic) static)
addAlgorithm(reg, "f1", function(static, dynamic, i, k) static * i^k)
# one design without parameters for "f0" and an exhaustive design
# over i = 1:5 and k = 1:3 for "f1"
ad = list(makeDesign("f0"), makeDesign("f1", exhaustive = list(i = 1:5, k = 1:3)))
addExperiments(reg, algo.designs = ad)
submitJobs(reg)
# get grouped job ids
ids = getJobIds(reg)
# group all job ids by problem and by algorithm
by(ids, getIndex(reg, by.prob = TRUE, by.algo = TRUE), identity)
# restrict to the experiments matching algorithm pattern "f1" and group them
# by the expression (k == 1) on the algorithm parameters
ids = findExperiments(reg, algo.pattern = "f1")
by(ids, getIndex(reg, ids, by.algo.pars = (k == 1)), identity)
# groupwise reduction
ids = findExperiments(reg, algo.pattern = "f1")
showStatus(reg, ids)
# f adds a job's result 'res' to the aggregate 'aggr' it is handed
f = function(aggr, job, res) aggr + res
# reduce the results separately for each value of k, then for each value of i
by(ids, getIndex(reg, ids, by.algo.pars = k), reduceResults, reg = reg, fun = f)
by(ids, getIndex(reg, ids, by.algo.pars = i), reduceResults, reg = reg, fun = f)
Get jobs (here: experiments) from registry by id.
Description
Constructs an Experiment
for each job id provided.
Usage
## S3 method for class 'ExperimentRegistry'
getJobs(reg, ids, check.ids = TRUE)
Arguments
reg |
[ |
ids |
[ |
check.ids |
[ |
Value
[list of Experiment
].
See Also
Other get:
getAlgorithmIds()
,
getAlgorithm()
,
getExperimentParts()
,
getProblemIds()
,
getProblem()
Get problem from registry by id.
Description
The requested object is loaded from disk.
Usage
getProblem(reg, id)
Arguments
reg |
[ |
id |
[ |
Value
[Problem
].
See Also
Other get:
getAlgorithmIds()
,
getAlgorithm()
,
getExperimentParts()
,
getJobs.ExperimentRegistry()
,
getProblemIds()
Get ids of problems in registry.
Description
Get problem ids for jobs.
Usage
getProblemIds(reg, ids)
Arguments
reg |
[ |
ids |
[integer] |
Value
[character
].
See Also
Other get:
getAlgorithmIds()
,
getAlgorithm()
,
getExperimentParts()
,
getJobs.ExperimentRegistry()
,
getProblem()
Get variable groups of reduced results.
Description
Useful helper for e.g. package plyr and such.
Usage
getResultVars(data, type = "group")
Arguments
data |
[ |
type |
[ |
Value
[character
]. Names of columns.
Examples
# Registry with two problems whose static parts are the constants 1 and 2.
reg = makeExperimentRegistry("BatchExample", seed = 123, file.dir = tempfile())
addProblem(reg, "p1", static = 1)
addProblem(reg, "p2", static = 2)

# Two algorithms, each returning a named numeric value 'y'.
algo.a1 = function(static, dynamic, alpha) c(y = static*alpha)
algo.a2 = function(static, dynamic, alpha, beta) c(y = static*alpha+beta)
addAlgorithm(reg, id = "a1", fun = algo.a1)
addAlgorithm(reg, id = "a2", fun = algo.a2)

# Exhaustive parameter designs for both algorithms, two replications each.
algo.designs = list(
  makeDesign("a1", exhaustive = list(alpha = 1:2)),
  makeDesign("a2", exhaustive = list(alpha = 1:2, beta = 5:6))
)
addExperiments(reg, algo.designs = algo.designs, repls = 2)
submitJobs(reg)

# Collect the results and average 'y' within each group of the "group"
# variables reported by getResultVars.
results = reduceResultsExperiments(reg)
group.vars = getResultVars(results, "group")
library(plyr)
ddply(results, group.vars, summarise, mean_y = mean(y))
Create parameter designs for problems and algorithms.
Description
Create a parameter design for either a problem or an algorithm that you
can use in addExperiments
.
All parameters in design
and exhaustive
must be “primitive”
in the sense that either is.atomic
is TRUE
or is.factor
is TRUE
.
Be aware of R's default behaviour of converting strings into factors if you use the design
parameter. See option stringsAsFactors
in data.frame
to turn this off.
Usage
makeDesign(id, design = data.frame(), exhaustive = list())
Arguments
id |
[ Id of algorithm or problem. |
design |
[ |
exhaustive |
[ |
Value
[Design
].
Examples
## Not run:
# Algorithm "a1" has no parameters, so the id alone defines its design.
design.a1 = makeDesign("a1")

# Problem "p1" with predefined parameter combinations given as a data.frame.
p1.pars = data.frame(alpha = 0:1, beta = c(0.1, 0.2))
design.p1 = makeDesign("p1", design = p1.pars)

# One design per algorithm id, all built from the same exhaustive
# grid of parameters.
grid = list(alpha = 0:1, gamma = 1:10/10)
designs = lapply(c("a1", "a2", "a3"), function(id) makeDesign(id, exhaustive = grid))
## End(Not run)
Construct a registry object for experiments.
Description
Note that if you don't want links in your paths (file.dir
, work.dir
) to get resolved and have
complete control over the way the path is used internally, pass an absolute path which begins with “/”.
Every object is a list that contains the passed arguments of the constructor.
Usage
makeExperimentRegistry(
id = "BatchExperimentRegistry",
file.dir,
sharding = TRUE,
work.dir,
multiple.result.files = FALSE,
seed,
packages = character(0L),
src.dirs = character(0L),
src.files = character(0L),
skip = TRUE
)
Arguments
id |
[ |
file.dir |
[ |
sharding |
[ |
work.dir |
[ |
multiple.result.files |
[ |
seed |
[ |
packages |
[ |
src.dirs |
[ |
src.files |
[ |
skip |
[ |
Value
Reduce results into a data.frame with all relevant information.
Description
Generates a data.frame
with one row per job id. The columns are: ids of problem and algorithm
(named “prob” and “algo”), one column per parameter of problem or algorithm (named by the parameter name),
the replication number (named “repl”) and all columns defined in the function to collect the values.
Note that you cannot rely on the order of the columns.
If a parameter does not have a setting for a certain job / experiment it is set to NA
.
Have a look at getResultVars
if you want to use something like ddply
on the
results.
The rows are ordered as ids
and named with ids
, so one can easily index them.
Usage
reduceResultsExperiments(
reg,
ids,
part = NA_character_,
fun,
...,
strings.as.factors = FALSE,
block.size,
impute.val,
apply.on.missing = FALSE,
progressbar = TRUE
)
Arguments
reg |
[ |
ids |
[ |
part |
[ |
fun |
[ |
... |
[any] |
strings.as.factors |
[ |
block.size |
[ |
impute.val |
[ |
apply.on.missing |
[ |
progressbar |
[ |
Value
[data.frame
]. Aggregated results, containing problem and algorithm parameters and collected values.
Reduce very many results in parallel.
Description
Basically the same as reduceResultsExperiments
but creates a few (hopefully short) jobs
to reduce the results in parallel. The function internally calls batchMapQuick
,
does “busy-waiting” till
all jobs are done and cleans all temporary files up.
The rows are ordered as ids
and named with ids
, so one can easily index them.
Usage
reduceResultsExperimentsParallel(
reg,
ids,
part = NA_character_,
fun,
...,
timeout = 604800L,
njobs = 20L,
strings.as.factors = FALSE,
impute.val,
apply.on.missing = FALSE,
progressbar = TRUE
)
Arguments
reg |
[ |
ids |
[ |
part |
[ |
fun |
[ |
... |
[any] |
timeout |
[ |
njobs |
[ |
strings.as.factors |
[ |
impute.val |
[ |
apply.on.missing |
[ |
progressbar |
[ |
Value
[data.frame
]. Aggregated results, containing problem and algorithm parameters and collected values.
Remove algorithm from registry.
Description
THIS DELETES ALL FILES REGARDING THIS ALGORITHM, INCLUDING ALL JOBS AND RESULTS!
Usage
removeAlgorithm(reg, id, force = FALSE)
Arguments
reg |
[ |
id |
[ |
force |
[ |
Value
Nothing.
See Also
Other remove:
removeExperiments()
,
removeProblem()
Remove jobs from registry.
Description
THIS DELETES ALL FILES REGARDING THE JOBS, INCLUDING RESULTS!
If you really know what you are doing, you may set force
to TRUE
to omit sanity checks on running jobs.
Usage
removeExperiments(reg, ids, force = FALSE)
Arguments
reg |
[ |
ids |
[ |
force |
[ |
Value
Vector of type integer
of removed job ids.
See Also
Other remove:
removeAlgorithm()
,
removeProblem()
Remove problem from registry.
Description
THIS DELETES ALL FILES REGARDING THIS PROBLEM, INCLUDING ALL JOBS AND RESULTS!
Usage
removeProblem(reg, id, force = FALSE)
Arguments
reg |
[ |
id |
[ |
force |
[ |
Value
Nothing.
See Also
Other remove:
removeAlgorithm()
,
removeExperiments()
Summarize selected experiments.
Description
A data.frame is returned that contains summary information
about the selected experiments. The data.frame is constructed
by building the columns “prob, <prob.pars>, algo, <algo.pars>, repl”.
Then only the columns in show
are selected, and the number of experiments for each combination of their values
is counted in a new column “.count”.
Usage
summarizeExperiments(reg, ids, show = c("prob", "algo"))
Arguments
reg |
[ |
ids |
[ |
show |
[ |
Value
[data.frame
].
Examples
# Registry with one problem and two algorithms; both algorithms take 'alpha',
# while 'beta' and 'gamma' are specific to one algorithm each.
reg = makeExperimentRegistry("summarizeExperiments", seed = 123, file.dir = tempfile())
prob.id = addProblem(reg, "p1", static = 1)
algo.id1 = addAlgorithm(reg, id = "a1", fun = function(static, dynamic, alpha, beta) 1)
algo.id2 = addAlgorithm(reg, id = "a2", fun = function(static, dynamic, alpha, gamma) 2)

# addAlgorithm returns the id, which is handed on to makeDesign.
designs = list(
  makeDesign(algo.id1, exhaustive = list(alpha = 1:2, beta = 1:2)),
  makeDesign(algo.id2, exhaustive = list(alpha = 1:2, gamma = 7:8))
)
addExperiments(reg, algo.designs = designs, repls = 2)

# Summary with the default columns ("prob" and "algo").
overview = summarizeExperiments(reg)
print(overview)

# Summary that additionally shows the parameters alpha and gamma.
detailed = summarizeExperiments(reg, show = c("prob", "algo", "alpha", "gamma"))
print(detailed)