我的问题是，每次我抓取给定静态站点的不同部分时，如果我不知道确切的页码，就必须手动更改 "pages_to_scrape" 整数向量。所以，我想把这个过程自动化。换句话说，当我事先不知道总页数时，我希望能够自动抓取所有可用的页面。
library(rvest)
library(dplyr)
library(purrr)
library(readr)
library(stringr)
library(tibble)
library(tidyr)
# Listing-page URL template; `%d` is filled with the page number via sprintf().
base_url <- "https://morskidar.bg/products.php?category=pryasna-riba&page=%d"
#' Scrape product names and prices from one listing page.
#'
#' @param page Integer page number substituted into `base_url`.
#' @return A tibble with columns date, location, type, source, product,
#'   unit, price. Returns a zero-row tibble when the page lists no
#'   products, so a caller paginating automatically can detect the last
#'   page instead of crashing.
scrape_prices <- function(page) {
  url <- sprintf(base_url, page)
  page_content <- read_html(url)

  cards <- page_content %>%
    html_elements(".col-sm-6")

  # An out-of-range page renders no product cards. Without this guard,
  # map_dfr() yields a 0x0 tibble and mutate(.before = product) errors
  # because the `product` column does not exist.
  if (length(cards) == 0) {
    return(tibble(
      date = as.Date(character()),
      location = character(),
      type = character(),
      source = character(),
      product = character(),
      unit = character(),
      price = numeric()
    ))
  }

  cards %>%
    map_dfr(~ tibble(
      product = .x %>%
        html_element(".shop-three-products-name a") %>%
        html_text2(),
      price = .x %>%
        html_element(".shop-three-products-price") %>%
        html_text2()
    )) %>%
    mutate(date = Sys.Date(),
           location = "Unknown",
           type = "Unknown",
           source = "Unknown", .before = product) %>%
    # Raw price text looks like "<unit> - <amount>"; split it apart.
    separate_wider_delim(price, delim = " - ", names = c("unit", "price")) %>%
    mutate(price = parse_number(price), unit = str_remove(unit, "\\.")) %>%
    distinct()
}
# Automatically scrape every available page: keep requesting successive
# pages until one comes back empty or fails to parse, instead of
# hard-coding a page range like 1:5.
max_pages <- 100  # safety cap so a site change cannot cause an endless loop
results <- list()
page <- 1
repeat {
  if (page > max_pages) break
  # A page past the end either yields zero rows or errors while parsing;
  # treat both as "no more pages".
  page_df <- tryCatch(scrape_prices(page), error = function(e) NULL)
  if (is.null(page_df) || nrow(page_df) == 0) break
  results[[page]] <- page_df
  page <- page + 1
}
final_df <- bind_rows(results)