我有这些(不整齐的)数据,其中包含每个患者的药物方案阶段(ip或cp)、药物名称(编码为数字)和多种药物的剂量信息:

df_have

#   id ip_drug1 ip_dose1 ip_drug2 ip_dose2 cp_drug1 cp_dose1 cp_drug2 cp_dose2
# 1 A1        1      300        3      100        6      500        7      100
# 2 A2        1      300        2      200       11      300       NA       NA
# 3 A3        1      500       NA       NA        9      100        5     1500

我想把这些数据整理成长格式:

df_want

#    id phase drug dose
# 1  A1    ip    1  300
# 2  A1    ip    3  100
# 3  A1    cp    6  500
# 4  A1    cp    7  100
# 5  A2    ip    1  300
# 6  A2    ip    2  200
# 7  A2    cp   11  300
# 8  A2    cp   NA   NA
# 9  A3    ip    1  500
# 10 A3    ip   NA   NA
# 11 A3    cp    9  100
# 12 A3    cp    5 1500

通过组合tidyr::pivot_longerdplyr::mutatetidyr::pivot_wider(以及dplyr::select),我能够获得所需的数据帧:

library(tidyr)
library(dplyr)

df_have %>% 
  pivot_longer(cols = -id, 
               names_to = c("phase", "type"),
               names_pattern = "(cp|ip)_(drug|dose)") %>%
  mutate(temp = row_number(), 
         .by = c(id, phase, type)) %>%
  pivot_wider(names_from = type, 
              values_from = value) %>%
  select(-temp)

然而,上面的多步代码在我的非常大的实际数据上非常慢.我想在tidyr/dplyr,ideally in a single 102 step.内更快地完成这一转变,这可能吗?


一百零二

# have
df_have <- data.frame(id = paste0("A", 1:3), 
                 ip_drug1 = 1,
                 ip_dose1 = c(300, 300, 500),
                 ip_drug2 = c(3, 2, NA),
                 ip_dose2 = c(100, 200, NA),
                 cp_drug1 = c(6, 11, 9),
                 cp_dose1 = c(500, 300, 100),
                 cp_drug2 = c(7, NA, 5),
                 cp_dose2 = c(100, NA, 1500))

# want
df_want <- data.frame(id = rep(paste0("A", 1:3), each = 4),
                      phase = rep(rep(c("ip", "cp"), each = 2), times = 3),
                      drug = c(1, 3, 6, 7, 1, 2, 11, NA, 1, NA, 9, 5),
                      dose = c(300, 100, 500, 100, 300, 200, 300, NA, 500, NA, 100, 1500))

推荐答案

这是可能的pivot_longer个步骤.这应该快4.5倍左右:

library(tidyr)

df_have |>
  pivot_longer(-id,
               names_pattern = "(.*?)_(.*?)\\d",
               names_to = c("phase", ".value")) 

我认为关键是在names_to论点中使用.value.从?pivot_longer起:

".Value"表示列名的对应组件 定义包含单元格值的输出列的名称, 完全压倒values_to人.

Benchmark

@ThomasIsCoding的解决方案速度更快(~12倍)

one_pivot <- function() {
  df_have |>
    pivot_longer(-id,
                 names_pattern = "(.*?)_(.*?)\\d",
                 names_to = c("phase", ".value"))
}

current <- function() {
  df_have %>% 
    pivot_longer(cols = -id, 
                 names_to = c("phase", "type"),
                 names_pattern = "(cp|ip)_(drug|dose)") %>%
    mutate(temp = row_number(), 
           .by = c(id, phase, type)) %>%
    pivot_wider(names_from = type, 
                values_from = value) %>%
    select(-temp)
}

base <- function() {
  out <- reshape(
    setNames(
      df_have,
      gsub("(\\D+)_(\\D+)", "\\2_\\1", names(df_have))
    ),
    direction = "long",
    idvar = "id",
    varying = -1,
    sep = "_",
    timevar = "phase"
  )
  
  transform(
    `row.names<-`(out[order(out$id), ], NULL),
    phase = sub("\\d+$", "", phase)
  )
}

bench::mark(
  current(),
  one_pivot(),
  base(),
  relative = TRUE,
  check = FALSE
)
  
  expression    min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory               time       gc      
  <bch:expr>  <dbl>  <dbl>     <dbl>     <dbl>    <dbl> <int> <dbl>   <bch:tm> <list> <list>               <list>     <list>  
1 current()   11.4   11.4       1          Inf     1.04    27     2      463ms <NULL> <Rprofmem [141 × 3]> <bench_tm> <tibble>
2 one_pivot()  2.54   2.67      4.43       Inf     1      124     2      480ms <NULL> <Rprofmem [22 × 3]>  <bench_tm> <tibble>
3 base()       1      1        11.7        NaN     1.52   322     3      473ms <NULL> <Rprofmem [0 × 3]>   <bench_tm> <tibble>

Output

   id    phase  drug  dose
   <chr> <chr> <dbl> <dbl>
 1 A1    ip        1   300
 2 A1    ip        3   100
 3 A1    cp        6   500
 4 A1    cp        7   100
 5 A2    ip        1   300
 6 A2    ip        2   200
 7 A2    cp       11   300
 8 A2    cp       NA    NA
 9 A3    ip        1   500
10 A3    ip       NA    NA
11 A3    cp        9   100
12 A3    cp        5  1500

R相关问答推荐

从具有随机模式的字符串中提取值

在数据表中呈现数学符号

从API中抓取R数据SON

根据收件箱中的特定值提取列名

以R中的正确顺序将日期时间字符列转换为posixct

如何修复R码的置换部分?

根据选中三个复选框中的一个或两个来调整绘图

如何在R中添加截止点到ROC曲线图?

用R ggplot2求上、下三角形中两个变量的矩阵热图

当我添加美学时,geom_point未对齐

QY数据的处理:如何定义QY因素的水平

函数可以跨多个列搜索多个字符串并创建二进制输出变量

将多个列合并为一个列的有效方法是什么?

SHILINY中DT列的条件着色

图中显示错误 colored颜色 的图例geom_sf

使用显式二元谓词子集化sfc对象时出错

列间序列生成器的功能

在子图内和子图之间对齐行数不均匀的表格罗布对

Gggvenn为Venn增加了不存在的价值

使用循环改进功能( struct 简单)