R解析列表并填充data.table优化

发布于04月26日

我有以下R码.它创建了一个包含1600个元素的列表‘data_list’.每个元素都是一个包含两个元素的列表:‘Sample’和‘Input’.创建一个输出数据表‘OUT_DT’，并用‘INPUT’中出现的值的平均值和SD填充.

library(data.table)

# create an example data list with 1600 elements
# create the input data_list
set.seed(123)

n_samples <- 1600
n_regions_at_start <- 50000
chr_sample_at_start <- sample(1:22, n_regions_at_start, replace = TRUE)
start_sample_at_start <- sort(sample(1:100000, n_regions_at_start))
end_sample_at_start <- sort(sample(100001:200000, n_regions_at_start))

data_list <- vector("list", n_samples)

for (i in 1:n_samples) {
  # use this trick to create input_dfs with a varying number of rows. This is an extra complexity compared to the first code
  if (i %% 5 == 0) {
    elements_to_remove = sort(sample(1:n_regions_at_start, 5))
    n_regions <- n_regions_at_start - 5
    CHR <- chr_sample_at_start[-c(elements_to_remove)]
    START <- start_sample_at_start[-c(elements_to_remove)]
    END <- end_sample_at_start[-c(elements_to_remove)]
  } else {
    n_regions <- n_regions_at_start
    CHR <- chr_sample_at_start
    START <- start_sample_at_start
    END <- end_sample_at_start
  }
  
  sample_list <- list(sample = paste0("Sample", i))
  
  input_df <- data.frame (
    CHR = CHR,
    START = START,
    END = END,
    COL1 = rnorm(n_regions),
    COL2 = rnorm(n_regions),
    COL3 = rnorm(n_regions),
    COL4 = rnorm(n_regions),
    COL5 = rnorm(n_regions),
    COL6 = rnorm(n_regions),
    COL7 = rnorm(n_regions),
    COL8 = rnorm(n_regions),
    COL9 = rnorm(n_regions),
    COL10 = rnorm(n_regions),
    COL11 = rnorm(n_regions),
    COL12 = rnorm(n_regions),
    COL13 = rnorm(n_regions)
  )
  data_list[[i]] <- list(sample = sample_list$sample, input = input_df)
}

# get the data for the first 3 columns of out_dt
region_concat_list <- lapply(data_list, function(x) paste(x$input[["CHR"]], x$input[["START"]], x$input[["END"]], sep = "-"))
region_vec_all_unique <- unique(unlist(region_concat_list))

# create the output data.table with first 3 columns
out_dt <- data.table("CHR" = sapply(region_vec_all_unique, function(x) as.character(unlist(strsplit(x, "-"))[1])), "START" = as.numeric(sapply(region_vec_all_unique, function(x) as.character(unlist(strsplit(x, "-"))[2]))), "END" = as.numeric(sapply(region_vec_all_unique, function(x) as.character(unlist(strsplit(x, "-"))[3]))))

# Get column names for means and sds
colnames_mean <- paste0(names(data_list[[1]]$input)[4:16], ".MEAN")
colnames_sd <- paste0(names(data_list[[1]]$input)[4:16], ".SD")

# Calculate means and sds and add them to out_dt
print(Sys.time())
for (i in seq_len(nrow(out_dt))) {
  print(paste("row", i, Sys.time()))
  chr_val <- as.numeric(unname(unlist(out_dt[i, "CHR"])))
  start_val <- as.numeric(unname(unlist(out_dt[i, "START"])))
  end_val <- as.numeric(unname(unlist(out_dt[i, "END"])))
  for (j in 4:16) {
    out_dt[i, colnames_mean[j-3] := mean(unlist(sapply(data_list, function(x) x$input[x$input$CHR == chr_val & x$input$START == start_val & x$input$END == end_val, j])), na.rm = TRUE)]
    out_dt[i, colnames_sd[j-3] :=  sd(unlist(sapply(data_list, function(x) x$input[x$input$CHR == chr_val & x$input$START == start_val & x$input$END == end_val, j])), na.rm = TRUE)]
  }
}
print(Sys.time())

我想要人帮忙

在给定元素数量(1600)和行数(50000)的情况下，在速度方面优化该代码.目前，在我的机器上，i的每次迭代需要15秒.这意味着它将需要208小时才能完成.我知道并行化，我会对使用这种方法的一些解决方案感兴趣，但我也想知道，如果不使用并行化，代码是否仍然可以优化？

dcast( melt( rbindlist(lapply(data_list, "[[", "input_df")), measure = patterns("^COL") )[, .(SD = sd(value), MEAN = mean(value)), .(CHR, START, END, variable)], CHR + START + END ~ variable, value.var = c("MEAN", "SD") )

CHR START END MEAN_COL1 MEAN_COL2 MEAN_COL3 SD_COL1 SD_COL2 SD_COL3 1: 1 33714 1047782 0.141747 0.2738446 0.1534639 0.79823 0.92176 0.59589 2: 1 130552 1120116 0.373040 -0.2715679 0.0181300 1.19567 0.80810 1.06344 3: 1 159027 1154071 -0.150262 -0.0466695 0.0029242 0.91435 0.92095 0.95829 4: 1 167912 1173695 0.141090 -0.3735618 -0.4127391 1.00912 1.05998 0.75489 5: 1 190046 1190635 -0.223469 -0.1353540 -0.5677172 1.16733 1.23969 0.99495 --- 996: 22 920450 1933075 0.046370 0.1025626 -0.3027420 1.03382 1.27190 0.94798 997: 22 947212 1957620 0.247132 0.0573555 -0.3610278 1.10706 1.01420 1.15222 998: 22 959074 1963730 0.237973 0.0089281 -0.1306382 0.78475 0.70369 0.86592 999: 22 959963 1964285 -0.496522 0.4612102 -0.1540762 1.19330 0.93966 0.87134 1000: 22 964800 1965878 -0.055107 0.0729134 -0.0650617 1.23578 0.77902 0.58955

R解析列表并填充data.table优化

推荐答案

R相关问答推荐

如何使用geom_sf在边界显示两种 colored颜色？

行式dppr中的变量列名

带有叠加饼图系列的Highmap

过滤器数据.基于两列的帧行和R中的外部向量

有没有一个R函数允许你从一个数字变量中提取一个数字，而不考虑它的位置(不仅仅是第一个或最后一个数字？

用相同方法得到不同函数的ROC最优截断值

如何提取所有完美匹配的10个核苷酸在一个成对的匹配与生物字符串在R？>

如何删除最后一个可操作对象

在GG图中绘制射线的自动程序

具有重复元素的维恩图

R：如果为NA，则根据条件，使用列名模式将缺少的值替换为另一列中的值

按组跨多列创建伪变量

R基于变量组合创建新的指标列

如何调整一个facet_work()面板内的框图和移动标签之间的水平宽度？

R-如何在ggplot2中显示具有不同x轴值(日期)的多行？

如何在shiny 的应用程序 map 视图宣传单中可视化单点

修复标签重叠和ggploy内的空间

创建两个变量组合的索引矩阵

如何修改Rust中的R字符串并将其赋给新的R变量，并使用extendr保留原始R字符串

如何从矩阵绘制环弦图