我有一个CSV,它包含响应变量(称为NTL)和预测值.我正在运行随机森林(RF)回归来预测NTL值.因为我多次执行RF算法,所以我在预测器名称后面添加了一个数字(030,040),以便在执行for循环时更方便(参见下面的代码).CSV的前六行如下所示:

head(block.data)
                 ntl  agbh030  agbh040   blue030   blue040    nir030    nir040   pop030   pop040  road030  road040  tirs030
1 52.94 1.773147 1.778323 0.7179366 0.7178904 0.9911458 0.9909238 33.98462 33.99507 152.4204 153.0130 25.39851
2 49.45 2.895677 2.893922 0.9956004 0.9937335 1.3766788 1.3741488 62.45861 62.34375 435.8899 434.7412 35.85143
3 49.12 3.996930 3.992193 1.0742660 1.0725741 1.4975094 1.4952157 78.47204 78.32093 412.6563 411.8447 37.86271
4 44.37 4.146727 4.142556 1.1628517 1.1606724 1.5424725 1.5395863 64.81855 64.78107 295.8405 295.5381 38.16120
5 39.97 4.324530 4.319309 1.1125427 1.1113752 1.4355043 1.4340464 69.18198 68.98098 289.2147 288.5684 36.89496
6 38.44 3.946339 3.943006 1.0540453 1.0531118 1.4150538 1.4135966 47.66867 47.65901 194.1575 193.8339 34.88610
   tirs040
1 25.39134
2 35.77946
3 37.79913
4 38.09968
5 36.84203
6 34.84804 

如你所见,当我执行第一个循环时,我使用的是名称中有030的预测值,而在第二个循环中,名称中有040的预测值等等.

library("randomForest")
library("doFuture") 

wd <- "test/"

# Load the data
block.data <- read.csv(paste0(wd, "block.data.psf.csv"))
eq1 <- ntl ~ .

# for reproduciblity
set.seed(123)

foreach (i = seq(030, 040, by = 10)) %do% {
  std <- sprintf("%03.0f", i)
  
  popCol <- paste0("pop", std)
  tirsCol <- paste0("tirs", std)
  agbhCol <- paste0("agbh", std)
  roadCol <- paste0("road", std)
  blueCol <- paste0("blue", std)
  nirCol <- paste0("nir", std)

  testVect = c("ntl", 
               popCol,
               tirsCol,
               agbhCol,
               roadCol,
               blueCol,
               nirCol)
  
  subBlockData <- subset(block.data, select = testVect)
  
  # default RF model
  m1 <- randomForest(
    formula = eq1,
    data    = subBlockData)
  
  # number of trees with highest r2
  btree = which.max(m1$rsq)
  
  features <- subBlockData[, 2:7]
  
  m1 <- randomForest(
    formula = ntl ~ .,
    data    = subBlockData,
    ntree = btree,
    mtry = ncol(features)/3,
    importance = FALSE)
  
  m1 # "% Var explained" is the number I want in the csv
}

我想要的是创建一个CSV(称为r2.csv),它将有两列(称为std和r2).第一列将在预测值名称之后包含数字(030,040等),第二列将包含RF的r平方(在我的代码中称为M1).我怎么能做到这一点?

以下是我的CSV:

block.data = structure(list(ntl = c(52.93999863, 49.45000076, 49.11999893, 
44.36999893, 39.97000122, 38.43999863, 38.99000168, 40.88000107, 
37.47999954, 40.65000153, 44.79999924, 50.38000107, 49.38000107, 
68.80000305, 59.88000107), agbh030 = c(1.773146749, 2.89567709, 
3.996930361, 4.146727085, 4.324529648, 3.946338654, 3.430744886, 
2.554599762, 2.052541256, 1.614771366, 1.991117001, 2.35665369, 
0.801872492, 3.310310841, 5.005721092), agbh040 = c(1.778323054, 
2.893922091, 3.992192984, 4.14255619, 4.319309235, 3.943005562, 
3.425927639, 2.553015947, 2.049137354, 1.619293809, 1.997152209, 
2.3637712, 0.81288743, 3.306827068, 4.997349739), blue030 = c(0.717936635, 
0.995600402, 1.074265957, 1.162851691, 1.112542748, 1.05404532, 
0.972470403, 0.864918828, 0.797388136, 0.780449748, 0.717492938, 
0.659686863, 0.544718802, 0.954298615, 1.098036289), blue040 = c(0.717890441, 
0.993733525, 1.072574139, 1.160672426, 1.111375213, 1.053111792, 
0.971249402, 0.863858879, 0.796197593, 0.780405819, 0.717574954, 
0.661465645, 0.546935678, 0.95236063, 1.097277403), nir030 = c(0.99114579, 
1.376678824, 1.49750936, 1.542472482, 1.435504317, 1.415053844, 
1.312831402, 1.290184379, 1.151046991, 1.062780142, 0.961566031, 
0.851697564, 0.746279657, 1.258480549, 1.483649731), nir040 = c(0.990923822, 
1.374148846, 1.495215654, 1.539586306, 1.434046388, 1.41359663, 
1.31127429, 1.288556933, 1.149601102, 1.062715054, 0.961549222, 
0.854078174, 0.749200106, 1.256223083, 1.48290813), pop030 = c(33.98461914, 
62.45861053, 78.47203827, 64.81855011, 69.18198395, 47.66866684, 
85.6471405, 63.21319962, 17.80360603, 57.13486862, 88.91275024, 
58.64352036, 34.46327209, 54.0263176, 61.17818451), pop040 = c(33.99507141, 
62.34374619, 78.32093048, 64.78107452, 68.98097992, 47.65901184, 
85.38433838, 63.14036942, 17.84538078, 57.06893158, 88.97719574, 
58.98400879, 34.64493942, 53.96364212, 61.17742538), road030 = c(152.4203644, 
435.8898926, 412.6563416, 295.8404541, 289.2147217, 194.1574554, 
136.9489441, 213.915741, 195.634903, 275.4249573, 129.0567017, 
110.0097885, 35.32381439, 138.662796, 365.485321), road040 = c(153.013031, 
434.7412109, 411.8447266, 295.538147, 288.5683899, 193.8339233, 
137.1492004, 213.4812012, 196.2715302, 276.1116638, 129.8981934, 
110.5063858, 35.86360931, 139.2144165, 364.8994446), tirs030 = c(25.39851189, 
35.85142899, 37.86270905, 38.16120148, 36.8949585, 34.88610458, 
33.1179924, 29.98550034, 26.51055908, 26.58303452, 25.10464287, 
22.69356346, 18.32171249, 30.8216877, 36.87643814), tirs040 = c(25.39133835, 
35.77946091, 37.79912567, 38.09967804, 36.84203339, 34.84803772, 
33.07061386, 29.94636917, 26.48450851, 26.57940674, 25.10551834, 
22.75337219, 18.39633369, 30.77775002, 36.85960007)), class = "data.frame", row.names = c(NA, 
-15L))

推荐答案

您不需要在循环中两次拟合randomForest模型,因为第一次拟合的模型已经包含了构建随机森林模型的每个回归树的(伪)r平方值,我们只需要索引返回的值,以便提取树解释的解释最大方差的百分比方差.

set.seed(123)
r2.df <- NULL

foreach (i = seq(030, 040, by = 10)) %do% {
  std <- sprintf("%03.0f", i)
  
  popCol <- paste0("pop", std)
  tirsCol <- paste0("tirs", std)
  agbhCol <- paste0("agbh", std)
  roadCol <- paste0("road", std)
  blueCol <- paste0("blue", std)
  nirCol <- paste0("nir", std)
  
  testVect = c("ntl", 
               popCol,
               tirsCol,
               agbhCol,
               roadCol,
               blueCol,
               nirCol)
  
  subBlockData <- subset(block.data, select = testVect)
  
  # default RF model
  m1 <- randomForest(
    formula = eq1,
    data    = subBlockData)
  
  # index of the tree with the highest r2
  btree = which.max(m1$rsq)
  
  features <- subBlockData[, 2:7]      
 
  #m1$rsq[btree] # "% Var explained" is the number I want in the csv
  r2.df <- rbind(r2.df, data.frame(column=i, r2=m1$rsq[btree]))
}

head(r2.df)
#  column         r2
#1     30 -0.1492738
#2     40 -0.4921124
write.csv(r2.df, 'result.csv', row.names = FALSE)

R2在这里是负的,因为模型对数据的拟合很差(比响应变量的平均值更差来解释方差),数据大小非常小(只有15个观测值).

y <- block.data$ntl
n <- length(y)
m <- mean(y)
p <- predict(m1)
plot(y, pch='19', col='black')
lines(y, col='black')
lines(rep(m, n), col='red', lty=2)
points(p, col='green', pch=19)
lines(p, col='green', pch=19)
legend('left', legend=c("ntl", "mean", "RF-pred"), fill = c("black", "red","green"))

enter image description here

R相关问答推荐

在水平条形图中zoom x_轴

DT::可数据的正规表达OR运算符问题

self_函数无法工作--无法子集结束后的列

如何判断某列中由某些行组成的百分比

如何将具有重复名称的收件箱合并到R中的另一列中,而结果不同?

使用lapply的重新定位功能

提取R中值和列名的所有可能组合

通过使用str_detect对具有相似字符串的组进行分组

迭代通过1个长度的字符串长字符R

如何优化向量的以下条件赋值?

如何对2个列表元素的所有组合进行操作?

将二进制数据库转换为频率表

从R中的对数正态分布生成随机数的正确方法

有没有可能用shiny 的书签恢复手风琴面板?

使用不同的定性属性定制主成分分析中点的 colored颜色 和形状

`-`是否也用于数据帧,有时使用引用调用?

快速合并R内的值

抽样变换-REXP与RWEIBUR

位置_道奇在geom_point图中不躲避

从两个数据帧中,有没有办法计算R中一列的唯一值?