我实现了我的_rgeom函数,如果X~geom(P)和Y~exp(-−(1 Log P)),则Z=CELING(Y)与X具有相同的分布.使用inverse-transform method,我现在可以将我的函数定义为:

my_rgeom <- function(N, p) {
    U <- runif(N)
    Y <- log(U) / log(1 - p)
    
    return(ceiling(Y))
}

为什么R的实现不使用例如逆变换,因为它要快得多?

将此功能与标准rgeom功能进行基准比较的结果:

library(microbenchmark)
library(ggplot2)
library(gridExtra)

N <- 10^6
prob <- c(0.1, 0.2, 0.3, 0.4, 0.5) 

results_list <- list()

for (p in prob) {
    benchmark_results <- microbenchmark(
        my_rgeom(N, p),
        rgeom(N, p),
        times = 100  
    )
    
    # Store the results in the list
    results_list[[as.character(p)]] <- benchmark_results
}
# Plot the results for each value of p
plots <- lapply(prob, function(p) {
    autoplot(results_list[[as.character(p)]], title = paste("p =", p))
})

# Display the plots
gridExtra::grid.arrange(grobs = plots)

Results of code above

并输出RESULTS_LIST:

results_list 
$`0.1`
Unit: milliseconds
           expr      min       lq      mean    median       uq      max neval
 my_rgeom(N, p)  63.2907  69.1584  76.95829  73.62385  78.8459 172.4273   100
    rgeom(N, p) 115.0211 123.7101 131.01782 127.05655 134.2484 191.6843   100

$`0.2`
Unit: milliseconds
           expr      min       lq      mean    median        uq      max neval
 my_rgeom(N, p)  63.4618  76.8853  87.02115  85.91125  94.29075 195.5412   100
    rgeom(N, p) 117.3709 130.1152 142.02507 135.24175 148.68975 274.5087   100

$`0.3`
Unit: milliseconds
           expr      min        lq      mean   median       uq      max neval
 my_rgeom(N, p)  66.2584  77.31255  85.50857  81.4676  88.8424 178.2553   100
    rgeom(N, p) 115.3894 129.59360 139.56611 135.2741 146.1852 187.7755   100

$`0.4`
Unit: milliseconds
           expr      min       lq      mean   median       uq      max neval
 my_rgeom(N, p)  68.1903  75.1489  83.53151  80.1580  86.8057 180.0446   100
    rgeom(N, p) 116.3780 121.7696 132.27215 126.9434 137.8039 213.7012   100

$`0.5`
Unit: milliseconds
           expr      min        lq      mean   median       uq      max neval
 my_rgeom(N, p)  70.0728  73.63675  82.20571  79.3234  84.6281 178.6583   100
    rgeom(N, p) 116.5077 119.13275 126.49830 124.5081 130.1617 187.6423   100

推荐答案

rgeom的R实现似乎将几何分布视为负二项分布的特例,负二项分布依赖于泊松随机变量.我看不出有什么理由不使用逆变换采样,因为它会更快,并且可以使数值稳定到rgeom.然而,我建议对my_rgeom进行几处修改.

首先,-rexp(N)会比log(runif(N))快.其次,为了更好地处理非常小的值p,log1p(-p)log(1 - p)更可取,正如 comments 中提到的那样.

顺便说一句,rgeom支持从0开始,所以我将使用floor而不是ceiling.

下面的代码演示了上面的内容.

首先定义一些候选函数:

set.seed(1845088186)

rgeom1 <- function(N, p) {
  U <- runif(N)
  Y <- log(U) / log(1 - p)
  return(floor(Y))
}

rgeom2 <- function(N, p) {
  floor(-rexp(N)/log(1 - p))
}

rgeom3 <- function(N, p) {
  floor(-rexp(N)/log1p(-p))
}

rgeom_all <- list(rgeom, rgeom1, rgeom2, rgeom3)

测试它们是否都返回相同的分布.

(p <- runif(1))
#> [1] 0.08180412
x <- lapply(rgeom_all, \(f) f(1e6, p))

suppressWarnings(sapply(x[-1], \(y) ks.test(y, x[[1]])$p.value))
#> [1] 0.7088201 0.7734314 0.7856763
mapply(summary, x)
#>              [,1]      [,2]      [,3]      [,4]
#> Min.      0.00000   0.00000   0.00000   0.00000
#> 1st Qu.   3.00000   3.00000   3.00000   3.00000
#> Median    8.00000   8.00000   8.00000   8.00000
#> Mean     11.23687  11.23102  11.21926  11.21025
#> 3rd Qu.  16.00000  16.00000  16.00000  16.00000
#> Max.    203.00000 187.00000 146.00000 171.00000

比较计时:

p <- runif(1e6)

microbenchmark::microbenchmark(
  rgeom = rgeom(1e6, p),
  rgeom1 = rgeom1(1e6, p),
  rgeom2 = rgeom2(1e6, p),
  rgeom3 = rgeom3(1e6, p)
)
#> Unit: milliseconds
#>    expr     min       lq      mean    median       uq      max neval
#>   rgeom 82.3725 91.00560 100.98802 100.48860 108.7335 146.5782   100
#>  rgeom1 64.3914 74.54830  82.31284  80.25625  87.1611 123.3017   100
#>  rgeom2 55.6838 60.14365  69.41881  66.87925  75.9065 143.6629   100
#>  rgeom3 53.8316 58.36640  65.48733  64.77050  70.4816 102.4296   100

请注意,log1p(-p)log(1 - p)略快.

但是,当变量在整数范围内时,rgeom的行为与其他三种实现的行为不同:

sapply(rgeom_all, \(f) typeof(f(1, 0.5)))
#> [1] "integer" "double"  "double"  "double"
sapply(rgeom_all, \(f) typeof(f(1, .Machine$double.xmin)))
#> [1] "double" "double" "double" "double"

它们在如何处理一些无效输入方面也有所不同:

rgeom(1, -0.5)
#> Warning in rgeom(1, -0.5): NAs produced
#> [1] NA
rgeom1(1, -0.5)
#> [1] -2
rgeom2(1, -0.5)
#> [1] -2
rgeom3(1, -0.5)
#> [1] -1

rgeom(1, 1.1)
#> Warning in rgeom(1, 1.1): NAs produced
#> [1] NA
rgeom1(1, 1.1)
#> Warning in log(1 - p): NaNs produced
#> [1] NaN
rgeom2(1, 1.1)
#> Warning in log(1 - p): NaNs produced
#> [1] NaN
rgeom3(1, 1.1)
#> Warning in log1p(-p): NaNs produced
#> [1] NaN

rgeom(1, NA)
#> Warning in rgeom(1, NA): NAs produced
#> [1] NA
rgeom1(1, NA)
#> [1] NA
rgeom2(1, NA)
#> [1] NA
rgeom3(1, NA)
#> [1] NA

可以修改三个候选函数以反映*rgeom的行为,但会对性能产生一些影响:

(*足够接近--在所有情况下,完全和忠实地反映rgeom的行为将消除甚至扭转性能差异,而不需要求助于C代码.)

rgeom1 <- function(N, p) {
  y <- log(runif(N))/log(1 - p)
  y[p < 0] <- NA_real_
  if (max(y, na.rm = 1) > .Machine$integer.max) floor(y) else as.integer(y)
}

rgeom2 <- function(N, p) {
  y <- -rexp(N)/log(1 - p)
  y[p < 0] <- NA_real_
  if (max(y, na.rm = 1) > .Machine$integer.max) floor(y) else as.integer(y)
}

rgeom3 <- function(N, p) {
  y <- -rexp(N)/log1p(-p)
  y[p < 0] <- NA_real_
  if (max(y, na.rm = 1) > .Machine$integer.max) floor(y) else as.integer(y)
}

测试:

rgeom(1, -0.5)
#> Warning in rgeom(1, -0.5): NAs produced
#> [1] NA
rgeom1(1, -0.5)
#> Warning in max(y, na.rm = 1): no non-missing arguments to max; returning -Inf
#> [1] NA
rgeom2(1, -0.5)
#> Warning in max(y, na.rm = 1): no non-missing arguments to max; returning -Inf
#> [1] NA
rgeom3(1, -0.5)
#> Warning in max(y, na.rm = 1): no non-missing arguments to max; returning -Inf
#> [1] NA

rgeom(1, 1.1)
#> Warning in rgeom(1, 1.1): NAs produced
#> [1] NA
rgeom1(1, 1.1)
#> Warning in log(1 - p): NaNs produced
#> Warning in max(y, na.rm = 1): no non-missing arguments to max; returning -Inf
#> [1] NA
rgeom2(1, 1.1)
#> Warning in log(1 - p): NaNs produced

#> Warning in log(1 - p): no non-missing arguments to max; returning -Inf
#> [1] NA
rgeom3(1, 1.1)
#> Warning in log1p(-p): NaNs produced

#> Warning in log1p(-p): no non-missing arguments to max; returning -Inf
#> [1] NA

rgeom(1, NA)
#> Warning in rgeom(1, NA): NAs produced
#> [1] NA
rgeom1(1, NA)
#> Warning in max(y, na.rm = 1): no non-missing arguments to max; returning -Inf
#> [1] NA
rgeom2(1, NA)
#> Warning in max(y, na.rm = 1): no non-missing arguments to max; returning -Inf
#> [1] NA
rgeom3(1, NA)
#> Warning in max(y, na.rm = 1): no non-missing arguments to max; returning -Inf
#> [1] NA

计时:

p <- runif(1e6, -0.1, 1.1)
p[sample(1e6, 1e3)] <- NA

microbenchmark::microbenchmark(
  rgeom = rgeom(1e6, p),
  rgeom1 = rgeom1(1e6, p),
  rgeom2 = rgeom2(1e6, p),
  rgeom3 = rgeom3(1e6, p)
)

#> Unit: milliseconds
#>    expr     min       lq     mean   median       uq      max neval
#>   rgeom 70.7436 77.90410 85.63081 84.49175 90.66080 121.3149   100
#>  rgeom1 65.1350 75.00100 82.40402 80.18435 90.90025 109.3224   100
#>  rgeom2 57.8535 66.52855 74.24759 73.25985 80.11250 126.4952   100
#>  rgeom3 58.4662 66.12285 73.17567 71.85420 79.00085 112.0432   100

但是,对于非常小的值prgeom1rgeom2表现不佳:

sapply(rgeom_all, \(f) sum(is.finite(f(1e6, .Machine$double.xmin))))
#> Warning in f(1e+06, .Machine$double.xmin): NAs produced
#> [1] 981778      0      0 981709

除了是四个实现中最快的一个之外,rgeom3在所有值p上都表现良好.

R相关问答推荐

替换收件箱的子集(行和列)

在R中,如何创建时间间隔的图表?

如何将y轴上的线定位得彼此更近

用单个表达匹配多个替代模式

有没有方法将琴弦完全捕捉到R中的多边形?

使用Shiny组合和显示复制和粘贴的数据

为什么当我try 在收件箱中使用合并功能时会出现回收错误?

如何使用rmarkdown和kableExtra删除包含折叠行的表的第一列的名称

随机森林回归:下拉列重要性

Highcharter多次钻取不起作用,使用不同方法

在R中替换函数中的特定符号

删除列表中存储的数据帧内和数据帧之间的重复行

未识别时区

如何同时从多个列表中获取名字?

如何对2个列表元素的所有组合进行操作?

如何将一列中的值拆分到R中各自的列中

如何在PrePlot()中将多个元素设置为斜体

防止正则表达式覆盖以前的语句

填充图例什么时候会有点?

R:使用ApexCharge更改标签在饼图中的位置