这是一个有点复杂的抓取操作,因为您需要单击每个按钮,然后等待结果返回。您可以通过监视网络请求来实现这一点,或者等待每个带有“更多”按钮的部分的长度增加。
以下是后一种方法的简要示例。它可以正常工作,但仍有一些可以清理的地方,留作练习:
const puppeteer = require("puppeteer"); // ^22.6.0

const url = "<Your URL>";

// Selectors shared by the measuring, waiting and scraping steps. They are
// passed into the browser-side callbacks as arguments because those callbacks
// are serialized and cannot close over Node-side variables.
const sectionSelector = ".ipc-page-grid .ipc-page-section";
const moreButtonSelector = ".ipc-see-more__text";

let browser;

/*
 * Scrape flow:
 *   1. Launch the browser and block requests that are not needed.
 *   2. Record the number of <p> items in every section that has a 'more' button.
 *   3. Click every 'more' button.
 *   4. Wait until each recorded section contains more <p> items than before.
 *   5. Collect {title, items: [{href, year}]} per section and print it as JSON.
 */
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  const ua =
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36";
  await page.setUserAgent(ua);

  // Performance optimization: block unnecessary requests
  await page.setRequestInterception(true);
  const allowedResources = ["script", "other", "fetch"];
  page.on("request", req => {
    const requestUrl = req.url();
    const resourceType = req.resourceType();

    // Let a request through when it targets the site itself or is one of the
    // allowed resource types, and it is neither tracker traffic nor an XHR.
    const isWanted =
      (requestUrl.startsWith("https://www.imdb.com") ||
        allowedResources.includes(resourceType)) &&
      !/google|amazon|beacon/.test(requestUrl) &&
      resourceType !== "xhr";

    if (isWanted) {
      req.continue();
    } else {
      req.abort();
    }
  });

  await page.goto(url, {waitUntil: "domcontentloaded"});

  // Retrieve the sections with 'more' buttons as [length, index] pairs
  const lengths = await page.$$eval(
    sectionSelector,
    (els, moreSel) =>
      els
        .map((e, i) => [e, i])
        .filter(([e]) => e.querySelector(moreSel))
        .map(([e, i]) => [e.querySelectorAll("p").length, i]),
    moreButtonSelector
  );

  // Click all of the 'more' buttons
  await page.$$eval(moreButtonSelector, els =>
    els.forEach(el => el.click())
  );

  // Wait until the lengths of each 'more' section increase
  await page.waitForFunction(
    (lengths, sectionSel) => {
      // index -> item count recorded before the click
      const initialByIndex = new Map(
        lengths.map(([count, index]) => [index, count])
      );
      return [...document.querySelectorAll(sectionSel)].every((el, i) => {
        const initial = initialByIndex.get(i);
        // Sections without a 'more' button were never recorded; skip them.
        return (
          initial === undefined ||
          initial < el.querySelectorAll("p").length
        );
      });
    },
    {},
    lengths,
    sectionSelector
  );

  // Scrape the data
  const data = await page.$$eval(sectionSelector, els =>
    els
      .map(el => ({
        title: el.querySelector(".ipc-title")?.textContent.trim(),
        items: [...el.querySelectorAll("p")].map(e => ({
          // A <p> without a link or without any child nodes must not abort
          // the whole scrape, so fall back to null / "" instead of throwing.
          href: e.querySelector("a")?.href ?? null,
          year: e.lastChild?.textContent.trim() ?? "",
        })),
      }))
      .filter(e => e.items.length)
  );

  console.log(JSON.stringify(data, null, 2));
})()
  .catch(err => console.error(err))
  // Always release the browser, even when launch succeeded but scraping failed.
  .finally(() => browser?.close());
部分输出:
[
{
"title": "Edited into",
"items": [
{
"href": "https://www.imdb.com/title/tt0101627?ref_=ttcnn",
"year": "(1991)"
},
{
"href": "https://www.imdb.com/title/tt3233580?ref_=ttcnn",
"year": "(2013)"
}
]
},
{
"title": "Featured in",
"items": [
{
"href": "https://www.imdb.com/title/tt14701700?ref_=ttcnn",
"year": "(TV Episode 1986)"
},
{
"href": "https://www.imdb.com/title/tt1577448?ref_=ttcnn",
"year": "(TV Episode 1986)"
},
{
"href": "https://www.imdb.com/title/tt0093629?ref_=ttcnn",
"year": "(1987)"
},
{
"href": "https://www.imdb.com/title/tt6079512?ref_=ttcnn",
"year": "(TV Episode 1989)"
},
{
"href": "https://www.imdb.com/title/tt0116289?ref_=ttcnn",
"year": "(1996)"
},
{
"href": "https://www.imdb.com/title/tt0834914?ref_=ttcnn",
"year": "(Video 2006)"
},
{
"href": "https://www.imdb.com/title/tt1748981?ref_=ttcnn",
"year": "(TV Episode 2010)"
},
{
"href": "https://www.imdb.com/title/tt4213530?ref_=ttcnn",
"year": "(TV Episode 2014)"
},
// ...