下面是我的两个数据帧,df1和df2

# df1: ten ids, each with a text label and a y/n response.
df1 <- data.frame(
  id = c(
    "632592651", "633322173", "634703802", "634927873", "635812953",
    "636004739", "636101211", "636157799", "636263106", "636752420"
  ),
  text = c(
    "asdf", "cat", "dog", "mouse", "elephant",
    "goose", "rat", "mice", "kitty", "kitten"
  ),
  response = c("y", "y", "y", "n", "n", "y", "y", "n", "n", "y")
)

id     text response
1  632592651     asdf        y
2  633322173      cat        y
3  634703802      dog        y
4  634927873    mouse        n
5  635812953 elephant        n
6  636004739    goose        y
7  636101211      rat        y
8  636157799     mice        n
9  636263106    kitty        n
10 636752420   kitten        y

# df2: twenty ids; the first ten also occur in df1. Note that `volume` is
# stored as character strings here.
df2 <- data.frame(
  id = c(
    "632592651", "633322173", "634703802", "634927873", "635812953",
    "636004739", "636101211", "636157799", "636263106", "636752420",
    "636809222", "2004722036", "2004894388", "2005045755", "2005535472",
    "2005630542", "2005788781", "2005809679", "2005838317", "2005866692"
  ),
  text = c(
    "asdf_xyz", "cat", "dog", "mouse", "elephant",
    "goose", "rat", "mice", "kitty", "kitten",
    "tiger_xyz", "lion", "leopard", "ostrich", "kangaroo",
    "platypus", "fish", "reptile", "mammals", "amphibians_xyz"
  ),
  volume = c(
    "1234", "432", "324", "333", "2223",
    "412346", "7456", "3456", "2345", "2345",
    "6", "345", "23", "2", "4778",
    "234", "8675", "3459", "8", "9"
  )
)

 id           text volume
1   632592651       asdf_xyz   1234
2   633322173            cat    432
3   634703802            dog    324
4   634927873          mouse    333
5   635812953       elephant   2223
6   636004739          goose 412346
7   636101211            rat   7456
8   636157799           mice   3456
9   636263106          kitty   2345
10  636752420         kitten   2345
11  636809222      tiger_xyz      6
12 2004722036           lion    345
13 2004894388        leopard     23
14 2005045755        ostrich      2
15 2005535472       kangaroo   4778
16 2005630542       platypus    234
17 2005788781           fish   8675
18 2005809679        reptile   3459
19 2005838317        mammals      8
20 2005866692 amphibians_xyz      9

如何将df2中第11:20行(即在df1中没有匹配id的所有行)的全部值更改为NA,并将第1行的"text"列(即asdf_xyz)更改为NA?

我试过了

library(dplyr)

# Rows of df2 whose id has no counterpart in df1.
df3 <- anti_join(df2, df1, by = "id")

id           text volume
1   636809222      tiger_xyz      6
2  2004722036           lion    345
3  2004894388        leopard     23
4  2005045755        ostrich      2
5  2005535472       kangaroo   4778
6  2005630542       platypus    234
7  2005788781           fish   8675
8  2005809679        reptile   3459
9  2005838317        mammals      8
10 2005866692 amphibians_xyz      9

# Blank out df3 one column at a time: each element that compares unequal to 0
# is overwritten with NA.
df3$id <- replace(df3$id, df3$id != 0, NA)
df3$text <- replace(df3$text, df3$text != 0, NA)
df3$volume <- replace(df3$volume, df3$volume != 0, NA)

(一个接一个地这样做,因为我找不到如何将数据帧的整个值更改为NA的解决方案)

id text volume
1  <NA> <NA>   <NA>
2  <NA> <NA>   <NA>
3  <NA> <NA>   <NA>
4  <NA> <NA>   <NA>
5  <NA> <NA>   <NA>
6  <NA> <NA>   <NA>
7  <NA> <NA>   <NA>
8  <NA> <NA>   <NA>
9  <NA> <NA>   <NA>
10 <NA> <NA>   <NA>

and df4 (solution from How to return row values that match column 'id' in both df1 and df2 but not column 'text' and return NA to the mismatch in column 'text'?)

# Keep only the ids present in both data frames, then blank out `text`
# wherever the two sources disagree.
inner_join(x = df1, y = df2, by = "id") %>%
  # Convert any factor columns to character so the two text columns are
  # compared by value. across(where(...)) replaces the superseded mutate_if().
  mutate(across(where(is.factor), as.character)) %>%
  # `text` exists in both inputs, so the join yields text.x (from df1) and
  # text.y (from df2). A typed NA keeps the result a character column even
  # if every row mismatches.
  mutate(text = ifelse(test = text.x != text.y,
                       yes = NA_character_,
                       no = text.x)) %>%
  select(id, text, response, volume)

id     text response volume
1  632592651     <NA>        y   1234
2  633322173      cat        y    432
3  634703802      dog        y    324
4  634927873    mouse        n    333
5  635812953 elephant        n   2223
6  636004739    goose        y 412346
7  636101211      rat        y   7456
8  636157799     mice        n   3456
9  636263106    kitty        n   2345
10 636752420   kitten        y   2345

但不确定如何用df3和df4替换df2.所需输出如下所示:

id           text volume
1   632592651       NA   1234
2   633322173            cat    432
3   634703802            dog    324
4   634927873          mouse    333
5   635812953       elephant   2223
6   636004739          goose 412346
7   636101211            rat   7456
8   636157799           mice   3456
9   636263106          kitty   2345
10  636752420         kitten   2345
11  NA               NA      NA
12  NA               NA      NA
13  NA               NA      NA
14  NA               NA      NA
15  NA               NA      NA
16  NA               NA      NA
17  NA               NA      NA
18  NA               NA      NA
19  NA               NA      NA
20  NA               NA      NA

有人能帮忙吗?

第2部分:

对于我请求的第二部分,我想从joined_df创建另一个数据帧(称为found_in_df1),它只包含出现在df1中的id.输出示例:

found_in_df1:

#           id     text volume
# 1: 632592651     <NA>   1234
# 2: 633322173      cat    432
# 3: 634703802      dog    324
# 4: 634927873    mouse    333
# 5: 635812953 elephant   2223
# 6: 636004739    goose 412346
# 7: 636101211      rat   7456
# 8: 636157799     mice   3456
# 9: 636263106    kitty   2345
#10: 636752420   kitten   2345

The solution is given in How to return row values that match column 'id' in both df1 and df2 but not column 'text' and return NA to the mismatch in column 'text'? but I'm looking for an alternative approach, i.e., is it possible to write a script to say retrieve from joined_df using df1 to give found_in_df1 since we have df1 and joined_df?

推荐答案

处理冲突的一个可行方案是使用powerjoin包,例如:

library(dplyr)

# df1: ten ids, each with a text label and a y/n response.
df1 <- data.frame(
  id = c(
    "632592651", "633322173", "634703802", "634927873", "635812953",
    "636004739", "636101211", "636157799", "636263106", "636752420"
  ),
  text = c(
    "asdf", "cat", "dog", "mouse", "elephant",
    "goose", "rat", "mice", "kitty", "kitten"
  ),
  response = c("y", "y", "y", "n", "n", "y", "y", "n", "n", "y")
)

# df2: twenty ids (the first ten also occur in df1) with a numeric volume.
df2 <- data.frame(
  id = c(
    "632592651", "633322173", "634703802", "634927873", "635812953",
    "636004739", "636101211", "636157799", "636263106", "636752420",
    "636809222", "2004722036", "2004894388", "2005045755", "2005535472",
    "2005630542", "2005788781", "2005809679", "2005838317", "2005866692"
  ),
  text = c(
    "asdf_xyz", "cat", "dog", "mouse", "elephant",
    "goose", "rat", "mice", "kitty", "kitten",
    "tiger_xyz", "lion", "leopard", "ostrich", "kangaroo",
    "platypus", "fish", "reptile", "mammals", "amphibians_xyz"
  ),
  volume = c(
    1234, 432, 324, 333, 2223, 412346, 7456, 3456, 2345, 2345,
    6, 345, 23, 2, 4778, 234, 8675, 3459, 8, 9
  )
)

# Target result: the first ten rows keep id/volume (text is NA where df1 and
# df2 disagree), and the last ten rows are entirely NA.
expected_outcome <- data.frame(
  id = c(
    "632592651", "633322173", "634703802", "634927873", "635812953",
    "636004739", "636101211", "636157799", "636263106", "636752420",
    rep(NA, 10)
  ),
  text = c(
    NA, "cat", "dog", "mouse", "elephant",
    "goose", "rat", "mice", "kitty", "kitten",
    rep(NA, 10)
  ),
  volume = c(
    1234, 432, 324, 333, 2223, 412346, 7456, 3456, 2345, 2345,
    rep(NA, 10)
  )
)

library(powerjoin)
# Full join on `id`. `text` is the only non-key column present in both
# inputs, so the `conflict` lambda decides its value: keep df1's text (.x)
# when it equals df2's (.y), otherwise NA. The comparison is vectorised, so
# no row-wise (`rw`) wrapper is needed, and NA_character_ matches the
# character type of `text`. Rows found only in df2 have .x = NA and
# therefore also yield NA.
joined_df <- power_full_join(
  df1, df2,
  by = "id",
  conflict = ~ ifelse(.x != .y, NA_character_, .x)
)

# `response` exists only in df1, so a missing `response` marks a row whose id
# was found only in df2. Blank every column of those rows.
# replace() keeps each column's type and attributes, which ifelse() would
# strip (e.g. for factor or Date columns).
# NOTE(review): a df1 row with a genuinely missing `response` would be
# blanked too; the example data has none.
final_df <- joined_df %>%
  mutate(across(everything(), ~ replace(.x, is.na(response), NA))) %>%
  select(id, text, volume)
final_df
#>           id     text volume
#> 1  632592651     <NA>   1234
#> 2  633322173      cat    432
#> 3  634703802      dog    324
#> 4  634927873    mouse    333
#> 5  635812953 elephant   2223
#> 6  636004739    goose 412346
#> 7  636101211      rat   7456
#> 8  636157799     mice   3456
#> 9  636263106    kitty   2345
#> 10 636752420   kitten   2345
#> 11      <NA>     <NA>     NA
#> 12      <NA>     <NA>     NA
#> 13      <NA>     <NA>     NA
#> 14      <NA>     <NA>     NA
#> 15      <NA>     <NA>     NA
#> 16      <NA>     <NA>     NA
#> 17      <NA>     <NA>     NA
#> 18      <NA>     <NA>     NA
#> 19      <NA>     <NA>     NA
#> 20      <NA>     <NA>     NA

# dplyr::all_equal() is deprecated since dplyr 1.1.0; base all.equal()
# performs the same check for these two data frames.
all.equal(final_df, expected_outcome)
#> [1] TRUE

# Part 2
# A left join from df1 keeps only the ids present in df1. As in the full
# join, `text` is the only conflicting column: keep it when both sides
# agree, otherwise set it to a character NA. The comparison is vectorised,
# so no row-wise (`rw`) wrapper is needed.
found_in_df1 <- power_left_join(
  df1, df2,
  by = "id",
  conflict = ~ ifelse(.x != .y, NA_character_, .x)
) %>%
  select(id, text, volume)
found_in_df1
#>           id     text volume
#> 1  632592651     <NA>   1234
#> 2  633322173      cat    432
#> 3  634703802      dog    324
#> 4  634927873    mouse    333
#> 5  635812953 elephant   2223
#> 6  636004739    goose 412346
#> 7  636101211      rat   7456
#> 8  636157799     mice   3456
#> 9  636263106    kitty   2345
#> 10 636752420   kitten   2345

由reprex包(v2.0.1)于2022-07-02创建

R相关问答推荐

按列A中的值进行子集化,并获得列C中对应于R中列B的最大值行的值?使用循环自动化此操作

rvest函数read_html_live()不允许html_elements()正确读取

在交互式图形中从barplot获取值时遇到问题(shiny中的ggplotly)

用单个表达匹配多个替代模式

在数据表中呈现数学符号

使用ggplot将平滑线添加到条形图

在数学中正确显示摄氏度、开氏度或华氏度

dplyr mutate case_when grepl不能在R中正确返回值

单个轮廓重叠条的单独图例

如何直接从R中的风险分数计算c指数?

在ggplot中将突出显示的点放在最前面

将包含卷的底部25%的组拆分为2行

是否可以创建一个ggplot与整洁判断的交互作用

Ggplot2中的重复注记

LOF中的插图短文字幕

仅选择超过9行的CSV文件

R中1到n_1,2到n_2,…,n到n_n的所有组合都是列表中的向量?

在GG图中绘制射线的自动程序

条形图顶部与其错误条形图不对齐

在R中创建连续的期间