下面是一个简单的for循环方法.如果有更快的解决方案而不reshape 你的数据,试图将其转化为矩阵乘法,我会感到惊讶,但reshape 可能会比这一次操作获得的效率yield 更昂贵.
提高速度的一种方法可能是使用data.table
个数据帧而不是基本数据帧.或者可能只使用矩阵--在您的示例中没有显示非数字数据;如果matrix
适用于data1
,而data2
是列顺序正确的向量,那么速度可能会快一些.
for(i in 1:nrow(data2)) {
data1[paste0("w", data2$id[i])] = data1[[data2$id[i]]] * data2$w[i]
}
data1
# id d01 d02 d03 wd01 wd02 wd03
# 1 1 1.5 1.0 2 3 4 3.0
# 2 2 4.0 2.0 4 8 8 6.0
# 3 3 3.0 1.0 3 6 4 4.5
# 4 4 2.0 4.5 2 4 18 3.0
# 5 5 1.0 3.0 5 2 12 7.5
事实证明,我的直觉是错误的,for循环方法比矩阵乘法快得多,但lapply
方法的速度要快得多.这里有一个关于"全尺寸"数据的基准测试(首先是结果,然后是基准测试代码).
bench::mark(
matrix_mult = matrix_mult(m1, w),
for_loop = for_loop(data1, data2),
lapply = lapply_replace(data1, data2),
pivot = pivot(data1, data2),
check = FALSE
)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 matrix_mult 226.35ms 264.33ms 3.78 323.9MB 3.78 2 2 528.65ms <NULL> <Rprofmem [90 × 3]> <bench_tm [2]> <tibble>
# 2 for_loop 47.28ms 52ms 17.4 120.3MB 1.93 9 1 516.81ms <NULL> <Rprofmem [440 × 3]> <bench_tm [9]> <tibble>
# 3 lapply 8.4ms 9.34ms 84.9 81.3MB 5.93 43 3 506.21ms <NULL> <Rprofmem [244 × 3]> <bench_tm [43]> <tibble>
# 4 pivot 1.99s 1.99s 0.503 2.5GB 2.01 1 4 1.99s <NULL> <Rprofmem [737 × 3]> <bench_tm [1]> <tibble>
# Warning message:
# Some expressions had a GC in every iteration; so filtering is disabled.
基准代码:
set.seed(47)
nr = 3e5
nc = 35
data1 = data.frame(id = 1:nr, replicate(nc, runif(nr)))
names(data1)[-1] = sprintf("d%02d", 1:nc)
data2 = data.frame(id = sprintf("d%02d", 1:nc), w = runif(nc))
m1 = as.matrix(data1)
w = data2$w
matrix_mult = function(m1, w) {
res = m1[, -1] %*% diag(w)
colnames(res) = paste0("w", colnames(m1)[-1])
cbind(m1, res)
}
for_loop = function(d1, d2) {
for(i in 1:nrow(d2)) {
d1[paste0("w", d2$id[i])] = d1[[d2$id[i]]] * d2$w[i]
}
d1
}
lapply_replace = function(d1, d2) {
d1[paste0("w",names(d1)[-1])] <- lapply(names(d1)[-1], \(x) {
d1[,match(x, d2$id)+1] * d2[match(x, d2$id), "w"]
})
d1
}
pivot = function(d1, d2) {
d1 %>%
# Unpivot to 'id' | 'name' | 'value'.
pivot_longer(!id) %>%
# Match each 'w' to its appropriate column 'name'.
left_join(d2,
join_by(name == id)
) %>%
mutate(
# Multiply by 'w'...
prod = value * w,
# ...and also prefix the column 'name' with "w".
name = paste0("w", name)
) %>%
# Pivot to 'id' | 'wd01' | ... | 'wd03'.
pivot_wider(
id_cols = id,
values_from = prod,
names_from = name
) %>%
# Use 'id' to associate each 'wd*' with its 'd*'.
right_join(d1, "id")
}