我使用{ggplot2}
绘制分组变量的平均值和可信区间,并注意到如果我手工计算或使用stat_summary()
计算可信区间,我得到的结果略有不同.
有人知道是什么导致了这种差异吗?
下面是可复制的代码(注意-变量y
故意倾斜以模拟我的实际数据集,但想知道这是否是导致问题的原因).
# Generate data
# Number of observations per group
n_per_group <- 50
# Generate left-skewed data
group1 <- rgamma(n_per_group, shape = 2, scale = 1)
group2 <- rgamma(n_per_group, shape = 3, scale = 1.5)
group3 <- rgamma(n_per_group, shape = 4, scale = 2)
# Combine data into a single data frame
df <- data.frame(
y = rep(c("Group 1", "Group 2", "Group 3"), each = n_per_group),
x = c(group1, group2, group3)
)
# Using stat_summary()
df %>%
ggplot(., aes(x = x, y = y, group = y)) +
stat_summary(fun = "mean", geom = "point") +
stat_summary(fun.data = "mean_se",
geom = "errorbar",
width = 0.1) +
scale_x_continuous(breaks = seq(1, 10, by = 0.5))
# By hand
df %>%
group_by(y) %>%
summarise(mean = mean(x, na.rm = T),
std.dev = sd(x, na.rm = T),
n = n(),
se = std.dev / sqrt(n)) %>%
ggplot(., aes(y = y)) +
geom_errorbar(aes(xmin = mean - 1.96*se,
xmax = mean + 1.96*se),
width = 0.1) +
geom_point(aes(x = mean)) +
scale_x_continuous(breaks = seq(1, 10, by = 0.5))