我有一个数据集sdf,我正试图获得汇总统计数据的分组汇总.我使用的是数据汇总.表,但问题是,当分组中缺少某个值,或者假设计数为0时,不会给出该值的统计信息.

yields dput(as.data.frame(sdf):

structure(list(USUBJID = structure(1:67, .Label = c("Dummy-01-0001",
"Dummy-01-0002", "Dummy-01-0004", "Dummy-01-0005", "Dummy-01-0007",
"Dummy-01-0008", "Dummy-01-0010", "Dummy-01-0011", "Dummy-01-0013",
"Dummy-01-0014", "Dummy-01-0016", "Dummy-01-0017", "Dummy-01-0019",
"Dummy-01-0020", "Dummy-01-0022", "Dummy-01-0023", "Dummy-01-0025",
"Dummy-01-0026", "Dummy-01-0028", "Dummy-01-0029", "Dummy-01-0031",
"Dummy-01-0032", "Dummy-01-0034", "Dummy-01-0035", "Dummy-01-0037",
"Dummy-01-0038", "Dummy-01-0040", "Dummy-01-0041", "Dummy-01-0043",
"Dummy-01-0044", "Dummy-01-0046", "Dummy-01-0047", "Dummy-01-0049",
"Dummy-01-0050", "Dummy-01-0052", "Dummy-01-0053", "Dummy-01-0055",
"Dummy-01-0056", "Dummy-01-0058", "Dummy-01-0059", "Dummy-01-0061",
"Dummy-01-0062", "Dummy-01-0064", "Dummy-01-0065", "Dummy-01-0067",
"Dummy-01-0068", "Dummy-01-0070", "Dummy-01-0071", "Dummy-01-0073",
"Dummy-01-0074", "Dummy-01-0076", "Dummy-01-0077", "Dummy-01-0079",
"Dummy-01-0080", "Dummy-01-0082", "Dummy-01-0083", "Dummy-01-0085",
"Dummy-01-0086", "Dummy-01-0088", "Dummy-01-0089", "Dummy-01-0091",
"Dummy-01-0092", "Dummy-01-0094", "Dummy-01-0095", "Dummy-01-0097",
"Dummy-01-0098", "Dummy-01-0100"), class = "factor"), ACTARMCD = structure(c(2L,
3L, 1L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 3L,
1L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 1L, 3L, 3L, 2L, 2L, 1L, 3L,
2L, 1L, 2L, 1L, 3L, 1L, 1L, 2L, 3L, 1L, 3L, 2L, 2L, 1L, 1L, 2L,
2L, 1L, 1L, 1L, 2L, 3L, 3L, 2L, 3L, 2L, 1L, 2L, 3L, 1L, 2L, 1L,
1L, 3L), .Label = c("PLAC", "TRTA", "TRTB"), class = "factor"),
RACE = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("BLACK OR AFRICAN AMERICAN",
"WHITE"), class = "factor"), ETHNIC = structure(c(2L, 2L,
2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L,
1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L,
2L, 2L, 2L, 2L, 2L), .Label = c("HISPANIC OR LATINO", "NOT HISPANIC OR LATINO"
), class = "factor"), AGE = c(41.0102669404517, 38.0451745379877,
34.9705681040383, 42.135523613963, 29.5249828884326, 42.9103353867214,
42.4476386036961, 45.6262833675565, 30.4312114989733, 25.9356605065024,
32.1122518822724, 30.009582477755, 41.2156057494867, 47.7289527720739,
38.6721423682409, 36.4298425735797, 48.331279945243, 39.337440109514,
42.4120465434634, 25.9794661190965, 29.6043805612594, 45.7275838466804,
37.1060917180014, 37.1553730321697, 45.2265571526352, 35.7344284736482,
35.5099247091034, 34.2505133470226, 26.3189596167009, 25.8261464750171,
38.2039698836413, 44.0793976728268, 34.611909650924, 45.5222450376454,
45.4017796030116, 27.1403148528405, 38.4695414099932, 31.419575633128,
23.4743326488706, 31.8904859685147, 26.611909650924, 44.3093771389459,
41.6427104722793, 45.3497604380561, 45.4647501711157, 23.378507871321,
44.1177275838467, 25.4401095140315, 26.1245722108145, 44.0273785078713,
43.7535934291581, 40.5284052019165, 22.9267624914442, 44.6242299794661,
29.2128678986995, 31.8247775496235, 40.0109514031485, 30.072553045859,
38.0889801505818, 31.3812457221081, 22.5982203969884, 32.2354551676934,
29.3223819301848, 29.2758384668036, 33.9739904175222, 47.1211498973306,
33.3415468856947)), row.names = c(NA, -67L), class = c("data.table",
"data.frame"))
sdf <- as.data.table(sdf)


check <- rollup(sdf,
j = c(list(
n = uniqueN(USUBJID),
mean = mean(AGE),
median = median(AGE),
sd = sd(AGE)
)),
by = c("ACTARMCD", "RACE", "ETHNIC"), id = TRUE
)

setorderv(check,
cols = c("ACTARMCD", "RACE", "ETHNIC"), na.last = TRUE
)
grouping ACTARMCD RACE ETHNIC n mean median sd
1: 0 PLAC BLACK OR AFRICAN AMERICAN HISPANIC OR LATINO 1 38.46954 38.46954 0.000000
2: 0 PLAC BLACK OR AFRICAN AMERICAN NOT HISPANIC OR LATINO 5 32.87630 31.89049 8.214756
3: 1 PLAC BLACK OR AFRICAN AMERICAN <NA> 6 33.96388 31.89049 7.682840
4: 0 PLAC WHITE HISPANIC OR LATINO 6 38.21587 39.33744 5.287809
5: 0 PLAC WHITE NOT HISPANIC OR LATINO 10 40.31991 40.52841 6.500323
6: 1 PLAC WHITE <NA> 16 39.62653 40.52841 6.177196
7: 3 PLAC <NA> <NA> 22 37.98254 39.33744 7.102537
8: 0 TRTA BLACK OR AFRICAN AMERICAN HISPANIC OR LATINO 2 28.28679 26.61191 4.737277
9: 1 TRTA BLACK OR AFRICAN AMERICAN <NA> 2 28.28679 26.61191 4.737277
10: 0 TRTA WHITE HISPANIC OR LATINO 7 39.15834 38.08898 6.935487
11: 0 TRTA WHITE NOT HISPANIC OR LATINO 10 35.70969 38.20397 7.874779
12: 1 TRTA WHITE <NA> 17 37.13792 38.20397 7.656136
13: 3 TRTA <NA> <NA> 19 36.47616 38.20397 7.819534
14: 0 TRTB BLACK OR AFRICAN AMERICAN HISPANIC OR LATINO 1 42.91034 42.91034 0.000000
15: 0 TRTB BLACK OR AFRICAN AMERICAN NOT HISPANIC OR LATINO 3 36.32307 34.61191 7.158414
16: 1 TRTB BLACK OR AFRICAN AMERICAN <NA> 4 38.59454 42.91034 6.564692
17: 0 TRTB WHITE HISPANIC OR LATINO 7 42.47524 45.34976 4.268211
18: 0 TRTB WHITE NOT HISPANIC OR LATINO 15 34.06475 34.53799 6.184451
19: 1 TRTB WHITE <NA> 22 37.48440 38.04517 6.860548
20: 3 TRTB <NA> <NA> 26 37.75268 38.04517 6.779733
21: 7 <NA> <NA> <NA> 67 37.44474 38.46954 7.231613

当值为ACTARMCD = PLAC, RACE = BLACK OR AFRICAN AMERICAN, ETHNIC = NON HISPANIC OR LATINO时,没有年龄数据,因此该行被删除.


我希望出现计数为0的分组,计数显示为0,其他统计数据与其他数据一起显示为NA.

grouping ACTARMCD RACE ETHNIC n mean median sd
9: 0 TRTA BLACK OR AFRICAN AMERICAN NOT HISPANIC OR LATINO 0 NA NA NA

推荐答案

一种方法(类似于注释中建议的方法)是在一个查找表上加入三个分组变量的所有唯一组合:

sdf <- sdf[data.table(setNames(
  expand.grid(levels(sdf$ACTARMCD), levels(sdf$RACE),levels(sdf$ETHNIC)),
  c("ACTARMCD", "RACE", "ETHNIC")
)), on=.(ACTARMCD, RACE,ETHNIC)]

Update, using CJ()

也可以使用CJ()创建上述内容,如下所示

setkey(sdf, ACTARMCD,RACE,ETHNIC)
sdf <- sdf[do.call(CJ, sapply(sdf[,.(ACTARMCD,RACE,ETHNIC)],levels))]

然后可以运行生成check的代码,只需进行一些小的修改:

  • 使用na.rm=T
  • 确保在uniqueN()调用中不计算行数,其中USUBJIDNA
check <- rollup(sdf,
                j = c(list(
                  n = uniqueN(USUBJID[!is.na(USUBJID)]),
                  mean = mean(AGE,na.rm=T),
                  median = median(AGE,na.rm=T),
                  sd = sd(AGE,na.rm=T)
                )),
                by = c("ACTARMCD", "RACE", "ETHNIC"), id = TRUE
)
setorderv(check,
          cols = c("ACTARMCD", "RACE", "ETHNIC"), na.last = TRUE
)

输出:

    grouping ACTARMCD                      RACE                 ETHNIC     n     mean   median       sd
       <int>   <fctr>                    <fctr>                 <fctr> <int>    <num>    <num>    <num>
 1:        0     PLAC BLACK OR AFRICAN AMERICAN     HISPANIC OR LATINO     1 38.46954 38.46954       NA
 2:        0     PLAC BLACK OR AFRICAN AMERICAN NOT HISPANIC OR LATINO     5 32.79836 31.89049 9.690775
 3:        1     PLAC BLACK OR AFRICAN AMERICAN                   <NA>     6 33.74355 33.70021 8.971582
 4:        0     PLAC                     WHITE     HISPANIC OR LATINO     6 35.43692 35.72485 7.332993
 5:        0     PLAC                     WHITE NOT HISPANIC OR LATINO    10 37.40452 37.74949 7.936198
 6:        1     PLAC                     WHITE                   <NA>    16 36.66667 37.15400 7.528750
 7:        3     PLAC                      <NA>                   <NA>    22 35.86945 35.24025 7.837526
 8:        0     TRTA BLACK OR AFRICAN AMERICAN     HISPANIC OR LATINO     2 33.31143 33.31143 9.474553
 9:        0     TRTA BLACK OR AFRICAN AMERICAN NOT HISPANIC OR LATINO     0      NaN       NA       NA
10:        1     TRTA BLACK OR AFRICAN AMERICAN                   <NA>     2 33.31143 33.31143 9.474553
11:        0     TRTA                     WHITE     HISPANIC OR LATINO     7 34.42222 37.15537 9.264020
12:        0     TRTA                     WHITE NOT HISPANIC OR LATINO    10 36.15578 39.60712 8.208825
13:        1     TRTA                     WHITE                   <NA>    17 35.44196 38.08898 8.417876
14:        3     TRTA                      <NA>                   <NA>    19 35.21770 38.08898 8.271974
15:        0     TRTB BLACK OR AFRICAN AMERICAN     HISPANIC OR LATINO     1 42.91034 42.91034       NA
16:        0     TRTB BLACK OR AFRICAN AMERICAN NOT HISPANIC OR LATINO     3 36.17522 34.61191 7.476108
17:        1     TRTB BLACK OR AFRICAN AMERICAN                   <NA>     4 37.85900 38.76112 6.971506
18:        0     TRTB                     WHITE     HISPANIC OR LATINO     7 39.69884 38.67214 5.714909
19:        0     TRTB                     WHITE NOT HISPANIC OR LATINO    15 35.14050 33.34155 6.864614
20:        1     TRTB                     WHITE                   <NA>    22 36.59088 36.08214 6.743081
21:        3     TRTB                      <NA>                   <NA>    26 36.78597 36.08214 6.651618
22:        7     <NA>                      <NA>                   <NA>    67 36.04029 36.42984 7.442384
    grouping ACTARMCD                      RACE                 ETHNIC     n     mean   median       sd

R相关问答推荐

R:随机抽取所有可能排列的样本

将收件箱变量传递给ggplot 2函数

将R data.frame转换为json数组(源代码)

按崩溃类别分类的指数

如何在ggplot 2 geom_segment图表中将UTC转换为EET?

当两个图层映射到相同的美学时,隐藏一个图层的图例值

在R中使用数据集名称

计算时间段的ECDF(R)

无法正确设置动态创建的Quarto标注的格式

在R中按行按列范围查找最大值的名称

以字符格式导入的ExcelElectron 表格日期列标题

函数可以跨多个列搜索多个字符串并创建二进制输出变量

在R中创建连续的期间

基于数据集属性将科分配给物种

如何从向量构造一系列双边公式

在R中,如何将误差条放置在堆叠的每个条上?

将列表中的字符串粘贴到R中for循环内的dplyr筛选器中

按组跨多列创建伪变量

以任意顺序提取具有多个可能匹配项的组匹配项

是否有一个R函数可以输出在输入的字符向量中找到的相应正则表达式模式?