我在列Segments
中有文本数据,在列Q_c7_collpsd
中有相应的文本标记组合.Q_c7_collpsd
比Segments
长.我的任务是将Q_c7_collpsd
号文件的长度和内容修剪成Segments
号文件的精确长度和内容.(另一个困难是Segments
包含特殊字符,这些字符在Q_c7_collpsd
中找不到).
这是what I've tried so far;它只起到了部分作用(错误的分配用**...*标记)
library(data.table)
library(stringr)
library(dplyr)
library(tidyr)
df %>%
mutate(
# trim whitespace:
Segments = trimws(Segments),
# correct "n 't" to "n't":
Segments = str_replace_all(Segments, "(?<=n)\\s(?='t)", ""),
# separate clitics from host:
Segments = str_replace_all(Segments, "n't", " n't")) %>%
# split into Segment_type and Utterance:
separate(Segments, into = c("Segment_type", "Utterance"), sep = ":\\s") %>%
# remove special characters:
mutate(Utterance_Clean = str_remove_all(Utterance, "(?![\\s'])\\W")) %>%
# remove rows where Utterance_Clean is digits:
filter(!str_detect(Utterance_Clean, "^\\d+$")) %>%
# create id:
group_by(Q_c7_collpsd) %>%
mutate(id = rleid(Sequ, Utterance)) %>%
group_by(id) %>%
# split into individual word-tag combinations:
separate_rows(Q_c7_collpsd, sep = "\\s") %>%
separate(Q_c7_collpsd, into = c("w", "c7"), sep = "_") %>%
# split into individual words:
separate_rows(Utterance_Clean, sep = "\\s") %>%
group_by(Sequ, id) %>%
# keep only rows where w == Utterance_Clean:
filter(w == Utterance_Clean) %>%
# combine w and c7 tag back again:
mutate(w_c7 = str_c(w, c7, sep = "_")) %>%
# put splitted elemments back together:
summarise(across(c(Segment_type, Utterance), first),
w_c7 = str_c(w_c7, collapse = " "))
Result:个
# A tibble: 6 × 5
# Groups: Sequ [2]
Sequ id Segment_type Utterance w_c7
<dbl> <int> <chr> <chr> <chr>
1 1 1 tcu_noQ °[o::h my God] oh_UH my_APPGE God_NP1 **my_APPGE*
2 1 2 tcu_pol you have n't seen my place?°= **my_APPGE* you_PPY have_VH0 n't_XX seen_VVN my_APPGE place_NN1
3 2 1 tcu_decl [but you] use leggings¿ but_CCB you_PPY use_VV0 leggings_NN2
4 2 2 frg or or_CC
5 2 3 tcu_decl no? no_UH
6 2 4 tcu_noQ [(I do n't know)] I_PPIS1 do_VD0 n't_XX know_VVI
Desired result:个
Sequ id Segment_type Utterance w_c7
<dbl> <int> <chr> <chr> <chr>
1 1 1 tcu_noQ °[o::h my God] oh_UH my_APPGE God_NP1
2 1 2 tcu_pol you have n't seen my place?°= you_PPY have_VH0 n't_XX seen_VVN my_APPGE place_NN1
3 2 1 tcu_decl [but you] use leggings¿ but_CCB you_PPY use_VV0 leggings_NN2
4 2 2 frg or or_CC
5 2 3 tcu_decl no? no_UH
6 2 4 tcu_noQ [(I do n't know)] I_PPIS1 do_VD0 n't_XX know_VVI
Data:个
df <- data.frame(
Sequ = c(1,1,2,2,2,2,2),
Segments = c("tcu_noQ: °[o::h my God] ", "tcu_pol: you haven't seen my place?°=", "tcu_decl: [but you] use leggings¿","frg: or","pause: 0.485","tcu_decl: no?","tcu_noQ: [(I don 't know)]"),
Q_c7_collpsd = c("oh_UH my_APPGE God_NP1 you_PPY have_VH0 n't_XX seen_VVN my_APPGE place_NN1", "oh_UH my_APPGE God_NP1 you_PPY have_VH0 n't_XX seen_VVN my_APPGE place_NN1",
"but_CCB you_PPY use_VV0 leggings_NN2 or_CC no_UH I_PPIS1 do_VD0 n't_XX know_VVI","but_CCB you_PPY use_VV0 leggings_NN2 or_CC no_UH I_PPIS1 do_VD0 n't_XX know_VVI","but_CCB you_PPY use_VV0 leggings_NN2 or_CC no_UH I_PPIS1 do_VD0 n't_XX know_VVI","but_CCB you_PPY use_VV0 leggings_NN2 or_CC no_UH I_PPIS1 do_VD0 n't_XX know_VVI","but_CCB you_PPY use_VV0 leggings_NN2 or_CC no_UH I_PPIS1 do_VD0 n't_XX know_VVI"))