使用单通道for
环路将是高效的:
cumskewness <- function(x) {
n <- length(x)
if (n == 0L) {
return(x)
} else if (n == 1L) {
return(0)
}
m2 <- m3 <- term1 <- 0
out <- numeric(n)
out[1] <- NaN
m1 <- x[1]
for (i in 2:n) {
n0 <- i - 1
delta <- x[i] - m1
delta_n <- delta/i
m1 <- m1 + delta_n
term1 <- delta*delta_n*n0
m3 <- m3 + term1*delta_n*(n0 - 1) - 3*delta_n*m2
m2 <- m2 + term1
out[i] <- sqrt(i)*m3/m2^1.5
}
out
}
它也很简单,可以写成Rcpp
:
library(Rcpp)
cppFunction('
NumericVector cumskewnessCpp(const NumericVector& x) {
const int n = x.size();
if (n == 0) {
return(x);
} else if (n == 1) {
return(0);
}
int n1;
double m1 = x(0);
double m2, m3, delta, delta_n, term1;
NumericVector out(n);
out(0) = R_NaN;
for (int i = 1; i < n; i++) {
n1 = i + 1;
delta = x(i) - m1;
delta_n = delta/n1;
m1 += delta_n;
term1 = delta*delta_n*i;
m3 += term1*delta_n*(i - 1) - 3*delta_n*m2;
m2 += term1;
out(i) = sqrt(n1)*m3/pow(m2, 1.5);
}
return out;
}
')
测试
cumskewness(x)
#> [1] NaN 0.0000000 0.3818018 0.0000000 0.4082483
cumskewnessCpp(x)
#> [1] NaN 0.000000e+00 3.818018e-01 -2.808667e-17 4.082483e-01
标杆
包括来自@ PandisIsCoding的矢量化解决方案:
cskewness_tic <- function(y) {
# cumulative length of y
k <- seq_along(y)
# cumulative n-th order moments of y
m3 <- cumsum(y^3)
m2 <- cumsum(y^2)
m1 <- cumsum(y)
u <- m1 / k
# expand cubic terms and refactor skewness in terms of num/den
num <- (m3 - 3 * u * m2 + 3 * u^2 * m1 - k * u^3) / k
den <- sqrt((m2 + k * u^2 - 2 * u * m1) / k)^3
num / den
}
set.seed(0)
x <- sample(1e3)
microbenchmark::microbenchmark(
cskewness_tic = cskewness_tic(x),
cumskewness = cumskewness(x),
cumskewnessCpp = cumskewnessCpp(x),
check = "equal",
unit = "relative"
)
#> Unit: relative
#> expr min lq mean median uq max neval
#> cskewness_tic 2.035272 2.07954 2.006118 2.388216 2.282022 0.7228815 100
#> cumskewness 3.930424 3.96835 4.003956 3.939762 3.711879 1.1339946 100
#> cumskewnessCpp 1.000000 1.00000 1.000000 1.000000 1.000000 1.0000000 100
A note of caution using cskewness_tic
灾难性的取消可能会导致高阶矩的精度误差.当标准差相对于平均值较小时,就会发生这种情况.演示:
set.seed(0)
x <- sample(1e3) + 1e8
y1 <- cskewness(x)
y2 <- cumskewness(x)
y3 <- cumskewnessCpp(x)
y4 <- cskewness_tic(x); y4[1] <- NaN
all.equal(y1, y2)
#> [1] TRUE
all.equal(y1, y3)
#> [1] TRUE
all.equal(y1, y4)
#> [1] "Mean relative difference: 270.1872"
原始答案
skewness2 <- function(.x) {
d <- outer(.x, cumsum(.x)/(1:length(.x)), "-")
d[lower.tri(d)] <- 0
sqrt(1:length(.x))*colSums(d^3)/colSums(d^2)^(3/2)
}
x <- c(1,2,4,5,8)
cskewness(x)
#> [1] NaN 0.0000000 0.3818018 0.0000000 0.4082483
skewness2(x)
#> [1] NaN 0.0000000 0.3818018 0.0000000 0.4082483