可以基于 event 的取值创建列：若某行的 event 值等于该列名，则为其赋值 1。然后按 person.id 分组，使用 cumsum 对各列做累加。
library(data.table)

# One indicator column per distinct event value, in alphabetical order.
categories <- sort(as.character(unique(current$event)))

# Build a 0/1 indicator vector for each category, then turn each one into
# a running count within every person.id group (adds columns by reference).
current[, (categories) := {
  indicators <- lapply(categories, function(cat) +(event == cat))
  lapply(indicators, cumsum)
}, by = .(person.id)][]
#> person.id event date categoryA categoryB categoryC categoryD
#> 1: 1 categoryA 2020-01-01 1 0 0 0
#> 2: 2 categoryA 2020-03-23 1 0 0 0
#> 3: 3 categoryD 2020-09-09 0 0 0 1
#> 4: 1 categoryA 2020-12-30 2 0 0 0
#> 5: 2 categoryC 2021-06-03 1 0 1 0
#> 6: 3 categoryB 2022-03-22 0 1 0 1
如果您更喜欢 dcast，并且希望保留行的顺序、避免日期冲突，可以尝试在 dcast 之前添加一个行 ID。
# Attach a row id so dcast keeps one output row per input row (preserves
# the original order and protects rows sharing person.id/date), then drop it.
current[, rid := .I]
result <- dcast(current,
                rid + person.id + date + event ~ event,
                fun = length)
result[, rid := NULL]

# The freshly cast indicator columns are exactly those absent from
# `current`; convert them to running counts within each person.
cols <- setdiff(names(result), names(current))
result[, (cols) := lapply(.SD, cumsum), by = person.id, .SDcols = cols][]
#> person.id date event categoryA categoryB categoryC categoryD
#> 1: 1 2020-01-01 categoryA 1 0 0 0
#> 2: 2 2020-03-23 categoryA 1 0 0 0
#> 3: 3 2020-09-09 categoryD 0 0 0 1
#> 4: 1 2020-12-30 categoryA 2 0 0 0
#> 5: 2 2021-06-03 categoryC 1 0 1 0
#> 6: 3 2022-03-22 categoryB 0 1 0 1
不过，我很难相信 dcast 会更快：
library(microbenchmark)
# Benchmark the three approaches on the small example table.
# NOTE(review): all three expressions mutate `current` by reference
# (`:=` adds columns; the dcast input gains `rid`), so later benchmark
# iterations operate on already-modified data. In particular, once
# M_categories has added the category columns to `current`, M_dcast's
# setdiff() selects no columns at all — the timings are not strictly
# comparable. Consider running each expression on copy(current) instead;
# TODO confirm this matches the original intent.
microbenchmark(
M_categories = {
categories <- sort(as.character(unique(current$event)))
current[, (categories) := lapply(
lapply(categories, function(x) event == x),
cumsum), .(person.id)]
},
M_dcast = {
result <- dcast(current[, rid := .I],
rid + person.id + date + event ~ event,
fun = length)[, rid := NULL][]
cols <- setdiff(names(result), names(current))
result[, (cols) := lapply(.SD, cumsum), by = person.id, .SDcols = cols][]
},
GT_dcats = {
result = dcast(current, formula = person.id + date ~ event, fun.aggregate = length)
cols = names(result)[names(result) %like% "category"]
result[, (cols) := lapply(.SD, cumsum), by = person.id, .SDcols = cols]
}
)
Unit: microseconds
expr min lq mean median uq max neval
M_categories 563.4 600.45 655.900 626.15 651.95 1302.0 100
M_dcast 3963.3 4235.40 4627.580 4443.90 4620.25 15476.1 100
GT_dcats 3355.5 3554.40 4030.594 3711.10 3929.75 14002.1 100
对于较大的数据集,dcast
的速度明显较慢:
# Scale the example up by resampling 10,000 rows with replacement.
# NOTE(review): no set.seed() call precedes sample(), so the resampled
# data (and therefore the timings) are not reproducible across runs.
large_data <- current[sample(nrow(current), 10000, replace = TRUE), ]
# NOTE(review): as with the small benchmark, every expression mutates
# `large_data` by reference, so successive iterations see data already
# modified by earlier ones (e.g. M_dcast's setdiff() finds no new columns
# after M_categories has run). Timings should be read with that caveat;
# benchmarking on copy(large_data) would isolate the expressions.
microbenchmark(
M_categories = {
categories <- sort(as.character(unique(large_data$event)))
large_data[, (categories) := lapply(
lapply(categories, function(x) event == x),
cumsum), .(person.id)]
},
M_dcast = {
result <- dcast(large_data[, rid := .I],
rid + person.id + date + event ~ event,
fun = length)[, rid := NULL][]
cols <- setdiff(names(result), names(large_data))
result[, (cols) := lapply(.SD, cumsum), by = person.id, .SDcols = cols][]
},
GT_dcats = {
result = dcast(large_data, formula = person.id + date ~ event, fun.aggregate = length)
cols = names(result)[names(result) %like% "category"]
result[, (cols) := lapply(.SD, cumsum), by = person.id, .SDcols = cols]
}
)
Unit: milliseconds
expr min lq mean median uq max neval
M_categories 1.4199 1.65105 2.23003 1.98575 2.45670 10.9188 100
M_dcast 26.1525 27.53905 28.46625 28.07925 28.64415 36.4706 100
GT_dcats 4.3829 4.94760 5.49732 5.44890 5.78995 15.0705 100
创建于 2024-01-29，使用 reprex v2.0.2 生成