我有以下R码.它创建了一个包含1600个元素的列表‘data_list’.每个元素都是一个包含两个元素的列表:‘Sample’和‘Input’.创建一个输出数据表‘OUT_DT’,并用‘INPUT’中出现的值的平均值和SD填充.
library(data.table)
# create an example data list with 1600 elements
# create the input data_list
set.seed(123)
n_samples <- 1600
n_regions_at_start <- 50000
chr_sample_at_start <- sample(1:22, n_regions_at_start, replace = TRUE)
start_sample_at_start <- sort(sample(1:100000, n_regions_at_start))
end_sample_at_start <- sort(sample(100001:200000, n_regions_at_start))
data_list <- vector("list", n_samples)
for (i in 1:n_samples) {
# use this trick to create input_dfs with a varying number of rows. This is an extra complexity compared to the first code
if (i %% 5 == 0) {
elements_to_remove = sort(sample(1:n_regions_at_start, 5))
n_regions <- n_regions_at_start - 5
CHR <- chr_sample_at_start[-c(elements_to_remove)]
START <- start_sample_at_start[-c(elements_to_remove)]
END <- end_sample_at_start[-c(elements_to_remove)]
} else {
n_regions <- n_regions_at_start
CHR <- chr_sample_at_start
START <- start_sample_at_start
END <- end_sample_at_start
}
sample_list <- list(sample = paste0("Sample", i))
input_df <- data.frame (
CHR = CHR,
START = START,
END = END,
COL1 = rnorm(n_regions),
COL2 = rnorm(n_regions),
COL3 = rnorm(n_regions),
COL4 = rnorm(n_regions),
COL5 = rnorm(n_regions),
COL6 = rnorm(n_regions),
COL7 = rnorm(n_regions),
COL8 = rnorm(n_regions),
COL9 = rnorm(n_regions),
COL10 = rnorm(n_regions),
COL11 = rnorm(n_regions),
COL12 = rnorm(n_regions),
COL13 = rnorm(n_regions)
)
data_list[[i]] <- list(sample = sample_list$sample, input = input_df)
}
# get the data for the first 3 columns of out_dt
region_concat_list <- lapply(data_list, function(x) paste(x$input[["CHR"]], x$input[["START"]], x$input[["END"]], sep = "-"))
region_vec_all_unique <- unique(unlist(region_concat_list))
# create the output data.table with first 3 columns
out_dt <- data.table("CHR" = sapply(region_vec_all_unique, function(x) as.character(unlist(strsplit(x, "-"))[1])), "START" = as.numeric(sapply(region_vec_all_unique, function(x) as.character(unlist(strsplit(x, "-"))[2]))), "END" = as.numeric(sapply(region_vec_all_unique, function(x) as.character(unlist(strsplit(x, "-"))[3]))))
# Get column names for means and sds
colnames_mean <- paste0(names(data_list[[1]]$input)[4:16], ".MEAN")
colnames_sd <- paste0(names(data_list[[1]]$input)[4:16], ".SD")
# Calculate means and sds and add them to out_dt
print(Sys.time())
for (i in seq_len(nrow(out_dt))) {
print(paste("row", i, Sys.time()))
chr_val <- as.numeric(unname(unlist(out_dt[i, "CHR"])))
start_val <- as.numeric(unname(unlist(out_dt[i, "START"])))
end_val <- as.numeric(unname(unlist(out_dt[i, "END"])))
for (j in 4:16) {
out_dt[i, colnames_mean[j-3] := mean(unlist(sapply(data_list, function(x) x$input[x$input$CHR == chr_val & x$input$START == start_val & x$input$END == end_val, j])), na.rm = TRUE)]
out_dt[i, colnames_sd[j-3] := sd(unlist(sapply(data_list, function(x) x$input[x$input$CHR == chr_val & x$input$START == start_val & x$input$END == end_val, j])), na.rm = TRUE)]
}
}
print(Sys.time())
我想要人帮忙
- 在给定元素数量(1600)和行数(50000)的情况下,在速度方面优化该代码.目前,在我的机器上,i的每次迭代需要15秒.这意味着它将需要208小时才能完成.我知道并行化,我会对使用这种方法的一些解决方案感兴趣,但我也想知道,如果不使用并行化,代码是否仍然可以优化?