一种 Select 是采用嵌套的tibbles/data.frames,其中每个行政单元都有自己的邻居+自己的Tibble.由于dplyr
中的行式操作,从这样的 struct 中收集汇总统计信息非常方便.总是可以 Select 使用unnest()
来加长该嵌套的数据集,而改用group_by()
/summarise()
.
下面的示例使用来自sf
包的nc
个数据集,距离阈值设置为50公里,以更好地匹配这些形状大小.它没有使用spdep
个包,而是只使用了sf
个函数,这意味着我们要使用的列表(sgbp
个稀疏几何二进制谓词列表,标准的sf
个东西)可能具有略有不同的 struct .在这里,邻居被定义为50公里缓冲区和县多边形的交叉点,而不是问题标题中描述的县质心;因此,这可能是另一个需要更改/复习的细节.
Prepare example, test and visualize subsetting for a single county.
library(sf)
#> Linking to GEOS 3.9.3, GDAL 3.5.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(dplyr, warn.conflicts = FALSE)
library(purrr)
library(tidyr)
library(ggplot2)
# example data from sf package examples
nc = read_sf(system.file("shape/nc.shp", package="sf")) %>%
select(NAME, AREA, starts_with("BIR"))
nc
#> Simple feature collection with 100 features and 4 fields
#> Geometry type: MULTIPOLYGON
#> Dimension: XY
#> Bounding box: xmin: -84.32385 ymin: 33.88199 xmax: -75.45698 ymax: 36.58965
#> Geodetic CRS: NAD27
#> # A tibble: 100 × 5
#> NAME AREA BIR74 BIR79 geometry
#> <chr> <dbl> <dbl> <dbl> <MULTIPOLYGON [°]>
#> 1 Ashe 0.114 1091 1364 (((-81.47276 36.23436, -81.54084 36.27251, -81…
#> 2 Alleghany 0.061 487 542 (((-81.23989 36.36536, -81.24069 36.37942, -81…
#> 3 Surry 0.143 3188 3616 (((-80.45634 36.24256, -80.47639 36.25473, -80…
#> 4 Currituck 0.07 508 830 (((-76.00897 36.3196, -76.01735 36.33773, -76.…
#> 5 Northampton 0.153 1421 1606 (((-77.21767 36.24098, -77.23461 36.2146, -77.…
#> 6 Hertford 0.097 1452 1838 (((-76.74506 36.23392, -76.98069 36.23024, -76…
#> 7 Camden 0.062 286 350 (((-76.00897 36.3196, -75.95718 36.19377, -75.…
#> 8 Gates 0.091 420 594 (((-76.56251 36.34057, -76.60424 36.31498, -76…
#> 9 Warren 0.118 968 1190 (((-78.30876 36.26004, -78.28293 36.29188, -78…
#> 10 Stokes 0.124 1612 2038 (((-80.02567 36.25023, -80.45301 36.25709, -80…
#> # ℹ 90 more rows
# test and visualize neighbourhood subsetting for a single county (Lee)
lee = tibble::lst(
polygon = nc[nc$NAME == "Lee","geometry"],
centroid = st_centroid(polygon),
buffer = st_buffer(centroid, 50000),
within = st_filter(nc, buffer)
)
ggplot() +
geom_sf(data = nc) +
geom_sf(data = lee$within, fill = "green") +
geom_sf(data = lee$polygon, fill = "red") +
geom_sf(data = lee$buffer, alpha = .6, fill = "gold") +
geom_sf(data = lee$centroid, size = 1)
Apply that same logic on a dataset
# resulting within_dist type is sgbp, "sparse geometry binary predicate lists",
# a list of sf object row indeces that intersect with the buffer
nc <- nc %>%
mutate(within_dist = st_centroid(geometry) %>%
st_buffer(50000) %>%
st_intersects(geometry)
, .before = "geometry")
nc
#> Simple feature collection with 100 features and 5 fields
#> Geometry type: MULTIPOLYGON
#> Dimension: XY
#> Bounding box: xmin: -84.32385 ymin: 33.88199 xmax: -75.45698 ymax: 36.58965
#> Geodetic CRS: NAD27
#> # A tibble: 100 × 6
#> NAME AREA BIR74 BIR79 within_dist geometry
#> * <chr> <dbl> <dbl> <dbl> <sgbp[,100]> <MULTIPOLYGON [°]>
#> 1 Ashe 0.114 1091 1364 <int [7]> (((-81.47276 36.23436, -81.54084 …
#> 2 Alleghany 0.061 487 542 <int [7]> (((-81.23989 36.36536, -81.24069 …
#> 3 Surry 0.143 3188 3616 <int [9]> (((-80.45634 36.24256, -80.47639 …
#> 4 Currituck 0.07 508 830 <int [8]> (((-76.00897 36.3196, -76.01735 3…
#> 5 Northampton 0.153 1421 1606 <int [9]> (((-77.21767 36.24098, -77.23461 …
#> 6 Hertford 0.097 1452 1838 <int [10]> (((-76.74506 36.23392, -76.98069 …
#> 7 Camden 0.062 286 350 <int [11]> (((-76.00897 36.3196, -75.95718 3…
#> 8 Gates 0.091 420 594 <int [9]> (((-76.56251 36.34057, -76.60424 …
#> 9 Warren 0.118 968 1190 <int [8]> (((-78.30876 36.26004, -78.28293 …
#> 10 Stokes 0.124 1612 2038 <int [8]> (((-80.02567 36.25023, -80.45301 …
#> # ℹ 90 more rows
# within_dist for Lee:
nc$within_dist[nc$NAME == "Lee"]
#> [[1]]
#> [1] 27 29 30 37 47 48 54 60 63 67 82 86
# resolved to NAMEs :
nc$NAME[nc$within_dist[nc$NAME == "Lee"][[1]]]
#> [1] "Alamance" "Orange" "Durham" "Wake" "Randolph"
#> [6] "Chatham" "Johnston" "Lee" "Harnett" "Moore"
#> [11] "Cumberland" "Hoke"
Build a nested tibble
# ignore geometries for now for more compact output
nc_df <- st_drop_geometry(nc)
# build a nested tibble, each county row gets a tibble of neighbours (nb),
# use rowwise grouping to subset nc_df with within_dist of current row
nc_nested <- nc_df %>%
rowwise() %>%
mutate(
nb_idx = paste(within_dist, collapse = ","),
nb_names = paste(nc_df[["NAME"]][within_dist], collapse = ","),
nb = (nc_df[within_dist, ]) %>% select(-within_dist) %>% list()) %>%
ungroup()
nc_nested
#> # A tibble: 100 × 8
#> NAME AREA BIR74 BIR79 within_dist nb_idx nb_names nb
#> <chr> <dbl> <dbl> <dbl> <sgbp[,100]> <chr> <chr> <list>
#> 1 Ashe 0.114 1091 1364 <int [7]> 1,2,3,18,19,22,… Ashe,Al… <tibble>
#> 2 Alleghany 0.061 487 542 <int [7]> 1,2,3,18,19,23,… Ashe,Al… <tibble>
#> 3 Surry 0.143 3188 3616 <int [9]> 1,2,3,10,18,23,… Ashe,Al… <tibble>
#> 4 Currituck 0.07 508 830 <int [8]> 4,7,8,17,20,21,… Curritu… <tibble>
#> 5 Northampton 0.153 1421 1606 <int [9]> 5,6,8,9,16,28,3… Northam… <tibble>
#> 6 Hertford 0.097 1452 1838 <int [10]> 5,6,7,8,16,17,2… Northam… <tibble>
#> 7 Camden 0.062 286 350 <int [11]> 4,6,7,8,17,20,2… Curritu… <tibble>
#> 8 Gates 0.091 420 594 <int [9]> 4,5,6,7,8,17,20… Curritu… <tibble>
#> 9 Warren 0.118 968 1190 <int [8]> 5,9,13,15,16,24… Northam… <tibble>
#> 10 Stokes 0.124 1612 2038 <int [8]> 3,10,12,23,25,2… Surry,S… <tibble>
#> # ℹ 90 more rows
Working with nested tibbles
# we can now calculate summary statistics by accessing nested tibbles in
# nb column with purrr::map*() or lapply();
# or though rowwise grouping:
nc_nested %>%
select(NAME, nb_names, nb) %>%
rowwise() %>%
mutate(nb_bir74_mean = mean(nb$BIR74),
nb_bir74_sum = sum(nb$BIR74),
nb_area_sum = sum(nb$AREA))
#> # A tibble: 100 × 6
#> # Rowwise:
#> NAME nb_names nb nb_bir74_mean nb_bir74_sum nb_area_sum
#> <chr> <chr> <list> <dbl> <dbl> <dbl>
#> 1 Ashe Ashe,Alleghany,S… <tibble> 1946. 13625 0.784
#> 2 Alleghany Ashe,Alleghany,S… <tibble> 2092. 14643 0.839
#> 3 Surry Ashe,Alleghany,S… <tibble> 3111. 27997 1.06
#> 4 Currituck Currituck,Camden… <tibble> 607 4856 0.576
#> 5 Northampton Northampton,Hert… <tibble> 2047. 18420 1.22
#> 6 Hertford Northampton,Hert… <tibble> 1293. 12933 1.05
#> 7 Camden Currituck,Hertfo… <tibble> 784. 8622 0.953
#> 8 Gates Currituck,Northa… <tibble> 920. 8284 0.813
#> 9 Warren Northampton,Warr… <tibble> 2366. 18925 1.08
#> 10 Stokes Surry,Stokes,Roc… <tibble> 5660. 45276 0.998
#> # ℹ 90 more rows
# or we could unnest nb column to lenghten our dataset ...
nc_unnested <- nc_nested %>%
unnest(nb, names_sep = ".") %>%
select(-(within_dist:nb_names))
print(nc_unnested, n = 14)
#> # A tibble: 1,004 × 8
#> NAME AREA BIR74 BIR79 nb.NAME nb.AREA nb.BIR74 nb.BIR79
#> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
#> 1 Ashe 0.114 1091 1364 Ashe 0.114 1091 1364
#> 2 Ashe 0.114 1091 1364 Alleghany 0.061 487 542
#> 3 Ashe 0.114 1091 1364 Surry 0.143 3188 3616
#> 4 Ashe 0.114 1091 1364 Wilkes 0.199 3146 3725
#> 5 Ashe 0.114 1091 1364 Watauga 0.081 1323 1775
#> 6 Ashe 0.114 1091 1364 Avery 0.064 781 977
#> 7 Ashe 0.114 1091 1364 Caldwell 0.122 3609 4249
#> 8 Alleghany 0.061 487 542 Ashe 0.114 1091 1364
#> 9 Alleghany 0.061 487 542 Alleghany 0.061 487 542
#> 10 Alleghany 0.061 487 542 Surry 0.143 3188 3616
#> 11 Alleghany 0.061 487 542 Wilkes 0.199 3146 3725
#> 12 Alleghany 0.061 487 542 Watauga 0.081 1323 1775
#> 13 Alleghany 0.061 487 542 Yadkin 0.086 1269 1568
#> 14 Alleghany 0.061 487 542 Iredell 0.155 4139 5400
#> # ℹ 990 more rows
# ... and use group_by() / summarise() or just summarise(..., .by):
nc_unnested %>%
summarise(nb_bir74_mean = mean(nb.BIR74),
nb_bir74_sum = sum(nb.BIR74),
nb_area_sum = sum(nb.AREA), .by = NAME)
#> # A tibble: 100 × 4
#> NAME nb_bir74_mean nb_bir74_sum nb_area_sum
#> <chr> <dbl> <dbl> <dbl>
#> 1 Ashe 1946. 13625 0.784
#> 2 Alleghany 2092. 14643 0.839
#> 3 Surry 3111. 27997 1.06
#> 4 Currituck 607 4856 0.576
#> 5 Northampton 2047. 18420 1.22
#> 6 Hertford 1293. 12933 1.05
#> 7 Camden 784. 8622 0.953
#> 8 Gates 920. 8284 0.813
#> 9 Warren 2366. 18925 1.08
#> 10 Stokes 5660. 45276 0.998
#> # ℹ 90 more rows
创建于2023-08-01年第reprex v2.0.2页