3 Benchmarking
In this section a detailed benchmark of {SLmetrics} is conducted. The benchmarks are run on randomly selected functions, which are then compared to their counterparts in {pkg} discussed in Chapter 1. The benchmarks are conducted on three parameters: median execution time, memory usage and gc() calls.
This section is structured as follows: Section 3.1 sets up the infrastructure needed to conduct the benchmark in an unbiased way, Section 3.2 conducts the benchmarks, and Section 3.3 and Section 3.4 discuss and summarize them, respectively.
3.1 The setup
To conduct the benchmarking, two functions are defined: create_regression() and create_factor(). Both functions return actual and predicted values of length 10,000,000.
3.1.1 Regression problems
The benchmarks on regression metrics are conducted on correlated absolute-value <numeric> vectors, with uniformly distributed weights. create_regression() returns a named list, and is defined below:
# regression function
create_regression <- function(n = 1e7) {
  # 1) actual values
  actual <- abs(rnorm(n = n))

  # 2) predicted values
  predicted <- actual + abs(rnorm(n = n))

  # 3) generate weights
  w <- runif(n)

  list(
    actual    = actual,
    predicted = predicted,
    w         = w
  )
}
3.1.2 Classification problems
The benchmarks on classification metrics are conducted on randomly sampled letters c("a", "b", "c"). create_factor() returns a vector of <factor>, and is defined below:
# classification function
create_factor <- function(
    k = 3,
    balanced = TRUE,
    n = 1e7) {
  probs <- NULL

  if (!balanced) {
    probs <- rbeta(
      n      = k,
      shape1 = 10,
      shape2 = 2
    )

    probs[which.min(probs)] <- 0
    probs <- probs / sum(probs)
  }

  factor(
    x = sample(
      1:k,
      size    = n,
      replace = TRUE,
      prob    = probs
    ),
    levels = 1:k,
    labels = letters[1:k]
  )
}
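To illustrate the effect of the balanced argument, a minimal sketch is shown below (not part of the benchmark setup; it uses a smaller n so the class counts can be inspected quickly). With balanced = FALSE one class receives probability zero and the remaining classes are sampled with skewed probabilities:
# sketch: inspect the class distribution of a
# balanced and an unbalanced sample
table(create_factor(n = 1e5))
table(create_factor(n = 1e5, balanced = FALSE))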
3.1.3 Staging the testing ground
The vectors used in the benchmarks are created with the seed 1903 for reproducibility; see below:
# 1) set seed for reproducibility
set.seed(1903)

# 2) create classification problem
fct_actual    <- create_factor()
fct_predicted <- create_factor()

# 3) create regression problem
# 3.1) store results in regression
lst_regression <- create_regression()

# 3.2) assign the values accordingly
num_actual    <- lst_regression$actual
num_predicted <- lst_regression$predicted
num_weights   <- lst_regression$w
3.2 Benchmarking
To conduct the benchmark, {bench} is used. Before the benchmarks are conducted, a benchmark()-wrapper is created. This wrapper conducts m (default: 10) benchmark runs, with 10 iterations for each function passed into benchmark(); to allow for warm-up, the first run is discarded. The wrapper is defined as follows:
benchmark <- function(..., m = 10) {
  library(magrittr)

  # 1) create list for storing values
  performance <- list()

  for (i in 1:m) {
    # 1) run the benchmarks
    results <- bench::mark(
      ...,
      iterations = 10,
      check      = FALSE
    )

    # 2) extract values and calculate medians
    performance$time[[i]]   <- setNames(lapply(results$time, mean), results$expression)
    performance$memory[[i]] <- setNames(lapply(results$memory, function(x) { sum(x$bytes, na.rm = TRUE) }), results$expression)
    performance$n_gc[[i]]   <- setNames(lapply(results$n_gc, sum), results$expression)
  }

  purrr::pmap_dfr(
    list(performance$time, performance$memory, performance$n_gc),
    ~{
      tibble::tibble(
        expression = names(..1),
        time       = unlist(..1),
        memory     = unlist(..2),
        n_gc       = unlist(..3)
      )
    }
  ) %>%
    dplyr::mutate(expression = factor(expression, levels = unique(expression))) %>%
    dplyr::group_by(expression) %>%
    dplyr::filter(dplyr::row_number() > 1) %>%
    dplyr::summarize(
      execution_time = bench::as_bench_time(median(time)),
      memory_usage   = bench::as_bench_bytes(median(memory)),
      gc_calls       = median(n_gc),
      .groups        = "drop"
    )
}
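As a usage sketch (not part of the original benchmark code), the wrapper can be called with named expressions and a reduced number of runs; the full benchmarks follow in Section 3.2.1:
# sketch: a single named expression with two runs instead of ten;
# the first run is discarded as warm-up
benchmark(
  `{SLmetrics}` = SLmetrics::rmse(num_actual, num_predicted),
  m = 2
)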
3.2.1 Regression metrics
First, three regression metrics from {SLmetrics} are benchmarked against each other:
benchmark(
`{RMSE}` = SLmetrics::rmse(num_actual, num_predicted),
`{Pinball Loss}` = SLmetrics::pinball(num_actual, num_predicted),
`{Huber Loss}` = SLmetrics::huberloss(num_actual, num_predicted)
)
Then the RMSE implementation of {SLmetrics} is benchmarked against {MLmetrics}, {yardstick} and {mlr3measures}:
benchmark(
`{SLmetrics}` = SLmetrics::rmse(num_actual, num_predicted),
`{MLmetrics}` = MLmetrics::RMSE(num_actual, num_predicted),
`{yardstick}` = yardstick::rmse_vec(num_actual, num_predicted),
`{mlr3measures}` = mlr3measures::rmse(num_actual, num_predicted)
)
3.2.2 Classification metrics
First, three classification metrics from {SLmetrics} are benchmarked against each other:
benchmark(
`{Confusion Matrix}` = SLmetrics::cmatrix(fct_actual, fct_predicted),
`{Accuracy}` = SLmetrics::accuracy(fct_actual, fct_predicted),
`{F-beta}` = SLmetrics::fbeta(fct_actual, fct_predicted)
)
Then the confusion matrix implementation of {SLmetrics} is benchmarked against {MLmetrics} and {yardstick}:
benchmark(
`{SLmetrics}` = SLmetrics::cmatrix(fct_actual, fct_predicted),
`{MLmetrics}` = MLmetrics::ConfusionMatrix(fct_predicted, fct_actual),
`{yardstick}` = yardstick::conf_mat(table(fct_actual, fct_predicted))
)
3.3 Discussion
Does speed really matter at the millisecond level, and does it justify the raison d'être of {SLmetrics}? The answer is inevitably no. A reduction of a few milliseconds may marginally improve performance, perhaps shaving off minutes or hours in large-scale grid searches or multi-model experiments. While this might slightly reduce cloud expenses, the overall impact is often negligible unless you're operating at an enormous scale or in latency-critical environments.
However, the memory efficiency of {SLmetrics} is where its real value lies. Its near-zero RAM usage allows more memory to be allocated for valuable tasks, such as feeding larger datasets into models. This can directly lead to higher-performing models, as more data generally improves learning outcomes. Furthermore, by optimizing memory usage, {SLmetrics} can reduce infrastructure costs significantly, as less powerful machines or fewer cloud resources may be required to achieve the same — or better — results.
In short, while speed optimization may seem like a more visible metric, it’s the memory efficiency of {SLmetrics} that has a broader, more transformative impact on machine learning workflows, from enabling better model performance to substantial cost reductions.
3.4 Conclusion
The benchmarks conducted in Section 3.2 suggest that {SLmetrics} is a memory-efficient and fast alternative to {MLmetrics}, {yardstick} and {mlr3measures}.
In its worst-performing benchmarks, {SLmetrics} is on par with low-level implementations of equivalent metrics, and it is consistently more memory-efficient across all benchmarks.