如何在 R 中将自定义函数与 dplyr 的 group_by 一起使用

发布于03月09日

很抱歉又问了一个类似的问题，但我就是搞不懂如何把自定义函数应用到分组数据上并逐组迭代。所以，如果有人能顺便给我推荐一些深入讲解的资料，那就更感谢了。

这样做的目的是:

library(dplyr)

# Example data for a single team: one row whose list-column `cluster` holds the
# xG values that are later passed to calculate_goals_norm().
a_df <- tibble(
  team = "A",
  cluster = list(c(
    0.01, 0.01, 0.09, 0.03, 0.14, 0.28, 0.09, 0.25,
    0.18, 0.17, 0.54, 0.41, 0.16, 0.18, 0.25, 0.02,
    0.2, 0.69, 0.02, 0.01, 0.02, 0.07, 0.07, 0.21
  )),
  tot_matches = 20,
  tot_ck = 121
)

#' Simulate the distribution of goals scored from a set of shot xG values
#'
#' Runs 10000 simulation rounds. In each round the shot list is first
#' normalised from `n_matches` to `norm_matches` matches, then every shot is
#' turned into a goal when a uniform random draw is less than or equal to its
#' xG value.
#'
#' Normalisation rules:
#' * `n_matches == norm_matches`: the shots are used unchanged.
#' * `n_matches < norm_matches`: the shots are padded with a random sample of
#'   themselves up to `round(length / n_matches * norm_matches)` shots.
#' * `n_matches > norm_matches`: a random subset of that size is used.
#'
#' @param shot_xg_list Numeric vector of per-shot xG values (no `NA`s).
#' @param n_matches Number of matches the shots in `shot_xg_list` come from.
#' @param norm_matches Number of matches to normalise the shot count to.
#' @return A tibble with one row per goal count that occurred in the
#'   simulation: `goals` (the count) and `prob` (percentage of the 10000
#'   rounds, rounded to one decimal place), sorted by `goals`.
calculate_goals_norm <- function(shot_xg_list, n_matches, norm_matches) {
  # A missing xG value cannot be simulated; fail up front instead of letting
  # NA leak into the goal counts.
  stopifnot(!anyNA(shot_xg_list))

  # Seed is fixed only so the Stack Overflow example is reproducible.
  set.seed(123)

  n_sims <- 10000
  n_shots <- length(shot_xg_list)

  # Number of shots after scaling to `norm_matches`; constant across rounds,
  # so it is computed once instead of inside the loop.
  n_target <- round(n_shots / n_matches * norm_matches, 0)

  # One simulated round: a shot scores when its uniform draw is <= its xG.
  # runif(k) consumes the RNG stream exactly like k successive runif(1) calls.
  xg_to_goals_sim <- function(shot_xg) {
    as.numeric(sum(runif(length(shot_xg)) <= shot_xg))
  }

  # Choose the normalisation strategy once; the branches are mutually exclusive.
  draw_shots <- if (n_matches < norm_matches) {
    n_extra <- n_target - n_shots
    # Sampling without replacement fails when more extra shots are needed than
    # exist (norm_matches > 2 * n_matches); only then sample with replacement.
    with_replacement <- n_extra > n_shots
    function() {
      c(shot_xg_list,
        sample(shot_xg_list, n_extra, replace = with_replacement))
    }
  } else if (n_matches > norm_matches) {
    function() sample(shot_xg_list, n_target, replace = FALSE)
  } else {
    function() shot_xg_list
  }

  # Preallocate instead of growing the result on every iteration.
  sim_goals <- numeric(n_sims)
  for (i in seq_len(n_sims)) {
    sim_goals[i] <- xg_to_goals_sim(draw_shots())
  }

  # `n / 1000 * 10` is the percentage of the 10000 rounds; the arithmetic is
  # kept verbatim so rounding matches previously published output.
  # transmute() replaces a multi-row summarise(), which is deprecated since
  # dplyr 1.1.0.
  tibble(value = sim_goals) %>%
    count(value) %>%
    transmute(goals = value, prob = round(n / 1000 * 10, 1)) %>%
    arrange(goals)
}

# Run the simulation for the single-team data frame (1 obs. of 4 variables).
calculate_goals_norm(
  shot_xg_list = unlist(a_df[["cluster"]]),
  n_matches = a_df[["tot_matches"]],
  norm_matches = 20
)

# A tibble: 12 x 2
   goals  prob
   <dbl> <dbl>
 1     0   0.6
 2     1   4  
 3     2  12.4
 4     3  20.6
 5     4  23.8
 6     5  19.1
 7     6  11.6
 8     7   5.6
 9     8   1.8
10     9   0.4
11    10   0.1
12    12   0

添加第二支球队以创建 full_df，用来演示自定义函数的预期用法:

# Append team B as a second row so there is more than one group to iterate over.
full_df <- add_row(
  a_df,
  team = "B",
  cluster = list(c(
    0.06, 0.01, 0.11, 0.18, 0.75, 0.04, 0.23, 0.07,
    0.1, 0.05, 0.24, 0.12, 0.28, 0.02, 0.09, 0.16,
    0.64, 0.03, 0.1, 0.19, 0.09, 0.01, 0.02, 0.12,
    0.01, 0.11, 0.18, 0.05, 0.02, 0.8, 0.08
  )),
  tot_matches = 19,
  tot_ck = 83
)

# Desired usage -- should the final call look like this?
# NOTE: this does NOT work as written. `%>%` passes the grouped data frame as
# an extra unnamed argument, but all three parameters of
# calculate_goals_norm() are already supplied by name. In addition, `cluster`
# and `tot_matches` are columns of full_df, not variables visible at the call
# site.
full_df %>% 
  group_by(team) %>% 
  calculate_goals_norm(shot_xg_list = unlist(cluster), n_matches = tot_matches, norm_matches = 20)

上面例子中的自定义函数调用无法正常工作.我知道 apply()/sapply() 通常用于在数据框上进行迭代，但我的知识还不足以知道如何在这里使用它们.

谢谢你的帮助.

library(dplyr)

# Run calculate_goals_norm() once per team. group_modify() hands each group's
# rows to the lambda as `.x`; with() evaluates the call inside that data frame
# so `cluster` and `tot_matches` resolve to the group's columns. The grouping
# column `team` is added back to each result.
out <- full_df %>%
  group_by(team) %>%
  group_modify(
    ~ with(
      .x,
      calculate_goals_norm(
        shot_xg_list = unlist(cluster),
        n_matches = tot_matches,
        norm_matches = 20
      )
    )
  ) %>%
  ungroup()

> as.data.frame(out)
   team goals prob
1     A     0  0.6
2     A     1  4.0
3     A     2 12.4
4     A     3 20.6
5     A     4 23.8
6     A     5 19.0
7     A     6 11.6
8     A     7  5.6
9     A     8  1.8
10    A     9  0.4
11    A    10  0.1
12    A    12  0.0
13    B     0  0.0
14    B     1  0.8
15    B     2  3.7
16    B     3 11.1
17    B     4 18.6
18    B     5 22.4
19    B     6 19.6
20    B     7 12.4
21    B     8  6.9
22    B     9  2.9
23    B    10  1.1
24    B    11  0.3
25    B    12  0.1
26    B    14  0.0

#' Simulate the distribution of goals scored from a set of shot xG values
#'
#' Runs 10000 simulation rounds. In each round the shot list is first
#' normalised from `n_matches` to `norm_matches` matches, then every shot is
#' turned into a goal when a uniform random draw is less than or equal to its
#' xG value.
#'
#' Normalisation rules:
#' * `n_matches == norm_matches`: the shots are used unchanged.
#' * `n_matches < norm_matches`: the shots are padded with a random sample of
#'   themselves up to `round(length / n_matches * norm_matches)` shots.
#' * `n_matches > norm_matches`: a random subset of that size is used.
#'
#' @param shot_xg_list Numeric vector of per-shot xG values (no `NA`s).
#' @param n_matches Number of matches the shots in `shot_xg_list` come from.
#' @param norm_matches Number of matches to normalise the shot count to.
#' @return A tibble with one row per goal count that occurred in the
#'   simulation: `goals` (the count) and `prob` (percentage of the 10000
#'   rounds, rounded to one decimal place), sorted by `goals`.
calculate_goals_norm <- function(shot_xg_list, n_matches, norm_matches) {
  # A missing xG value cannot be simulated; fail up front instead of letting
  # NA leak into the goal counts.
  stopifnot(!anyNA(shot_xg_list))

  # Seed is fixed only so the Stack Overflow example is reproducible.
  set.seed(123)

  n_sims <- 10000
  n_shots <- length(shot_xg_list)

  # Number of shots after scaling to `norm_matches`; constant across rounds,
  # so it is computed once instead of inside the loop.
  n_target <- round(n_shots / n_matches * norm_matches, 0)

  # One simulated round: a shot scores when its uniform draw is <= its xG.
  # runif(k) consumes the RNG stream exactly like k successive runif(1) calls.
  xg_to_goals_sim <- function(shot_xg) {
    as.numeric(sum(runif(length(shot_xg)) <= shot_xg))
  }

  # Choose the normalisation strategy once; the branches are mutually exclusive.
  draw_shots <- if (n_matches < norm_matches) {
    n_extra <- n_target - n_shots
    # Sampling without replacement fails when more extra shots are needed than
    # exist (norm_matches > 2 * n_matches); only then sample with replacement.
    with_replacement <- n_extra > n_shots
    function() {
      c(shot_xg_list,
        sample(shot_xg_list, n_extra, replace = with_replacement))
    }
  } else if (n_matches > norm_matches) {
    function() sample(shot_xg_list, n_target, replace = FALSE)
  } else {
    function() shot_xg_list
  }

  # Preallocate instead of growing the result on every iteration.
  sim_goals <- numeric(n_sims)
  for (i in seq_len(n_sims)) {
    sim_goals[i] <- xg_to_goals_sim(draw_shots())
  }

  # `n / 1000 * 10` is the percentage of the 10000 rounds; the arithmetic is
  # kept verbatim so rounding matches previously published output.
  # transmute() replaces a multi-row summarise(), which is deprecated since
  # dplyr 1.1.0.
  tibble(value = sim_goals) %>%
    count(value) %>%
    transmute(goals = value, prob = round(n / 1000 * 10, 1)) %>%
    arrange(goals)
}

如何在 R 中将自定义函数与 dplyr 的 group_by 一起使用

推荐答案

R相关问答推荐

如何将Rmarkdown中包含图像和文本的行的两个单元格与.PDF输出垂直对齐？

在之前合并的 data.table 中分配新列后 .internal.selfref 无效

pivot_longer：names_to和names_pattern

使用case_when和Mutate搜索多个列以寻找条件

在值和NA的行顺序中寻找中断模式

derrr summarise每个组返回多行？

筛选出以特定顺序患病的个体

tidymodels 中 lightgbm 引擎的 L1 正则化

如何从R ggplot图片中获取SVG字符串？

矩阵的堆叠条形图，条形图上有数字作为标签

线性模型斜率在减少原始数据时提供NA

以相同的方式对每个表进行排序

使用范围和单个数字将数字与字符串进行比较

在R函数中使用加号

当我添加美学时，geom_point未对齐

如何使这些react 表对象相互独立？

变长向量的矢量化和

向R中的数据帧添加一列，该列统计另一列中每个唯一值的二进制观测值的数量

按组跨多列创建伪变量

为什么不能使用lApply在包装函数中调用子集