我正在使用下面的代码滚动到 YouTube 页面的底部,它运行正常。我的问题是:页面被滚动到底部之后,我怎样才能用 console.log 输出“已经到达底部”?先谢谢了。

注意:解决方案需要适用于 youtube.com。我已经尝试过获取文档高度并将其与滚动高度进行比较,但不起作用!

const puppeteer = require('puppeteer');

// Number of 'ytd-grid-video-renderer' elements found after each scroll round.
// The last entry is the final count once the bottom has been reached.
let thumbArr = []

/**
 * Scrolls the page down repeatedly until 'ytd-app' stops growing, then logs
 * that the bottom has been reached and returns.
 *
 * Each round scrolls to the current scrollHeight of 'ytd-app' and waits for
 * that height to increase. If it does not increase within `growthTimeoutMs`,
 * the wait rejects with a TimeoutError and the loop treats that as the bottom.
 * A timeout is a heuristic: a slow network can make it fire early.
 *
 * @param {object} page - Puppeteer page (uses evaluate, waitForFunction, $$,
 *   waitForTimeout).
 * @param {object} [options]
 * @param {number} [options.growthTimeoutMs=10000] - How long to wait for the
 *   page to grow after a scroll before concluding the bottom was reached.
 * @returns {Promise<number>} The thumbnail count found when the bottom was
 *   reached (also the last entry pushed to `thumbArr`).
 * @throws Rethrows any error from the wait that is not a TimeoutError.
 */
const scrapeInfiniteScrollItems = async (page, { growthTimeoutMs = 10000 } = {}) => {
  for (;;) {
    const previousHeight = await page.evaluate(
      "document.querySelector('ytd-app').scrollHeight"
    );
    await page.evaluate(() => {
      const youtubeScrollHeight =
        document.querySelector("ytd-app").scrollHeight;
      window.scrollTo(0, youtubeScrollHeight);
    });

    // A bounded wait is what lets the loop end: with no timeout the wait
    // never settles once no more content is loaded.
    let pageGrew = true;
    try {
      await page.waitForFunction(
        `document.querySelector('ytd-app').scrollHeight > ${previousHeight}`, {
          timeout: growthTimeoutMs
        }
      );
    } catch (err) {
      if (err?.name !== 'TimeoutError') {
        throw err;
      }
      pageGrew = false;
    }

    const thumbnailLength = (await page.$$('ytd-grid-video-renderer')).length;
    thumbArr.push(thumbnailLength);

    if (!pageGrew) {
      console.log('bottom has been reached');
      return thumbnailLength;
    }

    await page.waitForTimeout(1000);
  }
};

/**
 * Entry point: launches a visible browser, opens youtube.com and runs the
 * infinite-scroll scraper on it.
 *
 * The browser is closed in `finally` so it is not left running when
 * navigation or scraping fails, and any failure is logged rather than left
 * as an unhandled rejection.
 */
(async () => {
  const browser = await puppeteer.launch({
    headless: false
  });
  try {
    const page = await browser.newPage();
    await page.goto('https://www.youtube.com', {
      waitUntil: 'networkidle2',
    });

    await scrapeInfiniteScrollItems(page);
  } finally {
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

UPDATE:

// clientHeight of 'ytd-app' recorded after every scroll round that loaded
// more content.
let clientHeightArr = []
// Kept only so existing references keep resolving; this function no longer
// writes to it.
let clientHeightArrTracker = []

/**
 * Scrolls the page down until 'ytd-app' stops growing, logs that the bottom
 * has been reached, then closes the browser.
 *
 * The browser is closed only after the loop has finished, so no page call is
 * still pending when the target goes away. Closing it from a timer while the
 * loop was awaiting a page call is what produced
 * "ProtocolError: ... Target closed."
 *
 * @param {object} browser - Puppeteer browser; closed once the bottom is
 *   reached.
 * @param {object} page - Puppeteer page (uses evaluate, waitForFunction,
 *   $$eval, waitForTimeout).
 * @param {object} [options]
 * @param {number} [options.growthTimeoutMs=10000] - How long to wait for the
 *   page to grow after a scroll before concluding the bottom was reached.
 * @returns {Promise<void>}
 * @throws Rethrows any error from the wait that is not a TimeoutError; the
 *   browser is left open in that case.
 */
const scrapeInfiniteScrollItems = async (browser, page, { growthTimeoutMs = 10000 } = {}) => {
  for (;;) {
    const previousHeight = await page.evaluate(
      "document.querySelector('ytd-app').scrollHeight"
    );

    await page.evaluate(() => {
      const youtubeScrollHeight =
        document.querySelector("ytd-app").scrollHeight;
      window.scrollTo(0, youtubeScrollHeight);
    });

    try {
      await page.waitForFunction(
        `document.querySelector('ytd-app').scrollHeight > ${previousHeight}`, {
          timeout: growthTimeoutMs
        },
      );
    } catch (err) {
      if (err?.name !== 'TimeoutError') {
        throw err;
      }
      // No growth within the timeout: treat this as the end of the feed.
      break;
    }

    const clientHeight = await page.$$eval("ytd-app", el => el.map(x => x.clientHeight));
    clientHeightArr.push(clientHeight[0]);
    await page.waitForTimeout(1000);
  }

  console.log('Bottom is reached');
  await browser.close();
};

/**
 * Entry point: launches a visible browser, opens the channel's videos page
 * and runs the infinite-scroll scraper, which receives the browser so it can
 * close it when it is done.
 *
 * Failures are logged instead of being left as an unhandled rejection, and
 * the browser is closed in `finally` so it is not left running when
 * navigation or scraping fails.
 */
(async () => {
  const browser = await puppeteer.launch({
    headless: false
  });
  try {
    const page = await browser.newPage();
    await page.goto('https://www.youtube.com/c/mkbhd/videos', {
      waitUntil: 'networkidle2',
    });

    await scrapeInfiniteScrollItems(browser, page);
  } finally {
    // NOTE(review): the scraper may already have closed the browser; a second
    // close() is assumed to be harmless — confirm for the Puppeteer version
    // in use.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

推荐答案

这里有很多可行的方法。摆弄了一阵之后,我随意选定了以下策略,并认为它作为一个起点已经足够好了:

const puppeteer = require("puppeteer"); // ^16.2.0

// Declared outside the async function so the final .finally() can close the
// browser even when an error is thrown part-way through.
let browser;

/**
 * Collects the title of every video on the channel's videos page by
 * repeatedly scrolling the last "#video-title" element into view until no
 * new ones appear for 10 seconds, then prints the total and the first and
 * last ten titles.
 */
(async () => {
  browser = await puppeteer.launch({headless: true});
  const [page] = await browser.pages();

  // Image requests are aborted; only the title attributes are read below.
  await page.setRequestInterception(true);
  page.on("request", req => {
    if (req.resourceType() === "image") {
      req.abort();
    }
    else {
      req.continue();
    }
  });
  const url = "https://www.youtube.com/c/mkbhd/videos";
  await page.goto(url, {waitUntil: "domcontentloaded"});
  await page.waitForSelector("#video-title");
  let thumbs = [];

  for (;;) {
    try {
      // Wait until the number of titles in the page differs from the number
      // collected so far.
      await page.waitForFunction(
        `${thumbs.length} !==
          document.querySelectorAll("#video-title").length`,
        {timeout: 10000}
      );
    }
    catch (err) {
      // Only a timeout means "nothing new appeared". Any other failure must
      // surface instead of being reported as a complete result.
      if (err?.name !== "TimeoutError") {
        throw err;
      }
      break;
    }

    thumbs = await page.$$eval("#video-title", els => {
      // Scrolling the last title into view triggers loading of the next batch.
      els.at(-1)?.scrollIntoView();
      return els.map(e => e.getAttribute("title"));
    });
  }

  console.log("total length", thumbs.length);
  console.log("first 10 thumbs:", thumbs.slice(0, 10));
  console.log("last 10 thumbs:", thumbs.slice(-10));
})()
  .catch(err => console.error(err))
  .finally(() => browser?.close())
;

输出:

total length 1460
first 10 thumbs: [
  'iPhone 14/Pro Impressions: Welcome to Dynamic Island!',
  'Google Pixel Buds Pro Review: Just Get These!',
  'The Hyundai IONIQ 5: I Get It Now!',
  'Android 13 Hands-On: Top 5 Features!',
  'Dope Tech: The Most Extreme Gaming Monitor!',
  'Samsung Z Fold 4/ Flip 4 Impressions + Watch 5 Pro!',
  'Best Back to School Tech 2022!',
  'OnePlus 10T Impressions: Somebody That You Used to Know',
  'Asus Zenfone 9: The New Compact King!',
  "The iPad's Odd New Feature"
]
last 10 thumbs: [
  'HQ Tutorial: Rocket Dock Application',
  'Camstudio Clarity',
  'Tutorial: Camstudio HQ',
  'HQ Tutorial: Ccleaner',
  '15 Year old Golf Swing Analysis',
  'Fraps HD Test in 1080p (18 WOS)',
  'HP Pavilion dv7t Media Center Remote Overview',
  'High fps LG Voyager footage',
  '14 Year knock-down shot (11 Handicap)',
  '13-Year-Old Golf Swing Analysis'
]

这段代码维护了一个数组,保存到目前为止收集到的所有缩略图(thumbs)。在循环中,我首先判断是否出现了新的缩略图——如果在大约 10 秒内没有新的缩略图出现,就跳出循环,并假定我们已经收集了全部缩略图。否则,将最后一个缩略图滚动到视图中并重新收集所有缩略图,从而扩展结果列表。

“等待 10 秒看是否有新的缩略图,然后退出”从技术上讲是一种竞态条件(race condition)。如果存在延迟,它可能会产生误判(过早认为已到达底部),但作为起点似乎已经足够好了。把超时时间设得过长会推迟脚本在抓取结束后的退出,因此在抓取大量视频很少的频道时并不理想。也许更好的方法是在每次触发滚动之后等待 /browse API 的响应。该响应对象中还包含各种附加数据。

另见Puppeteer - scroll down until you can't anymore

Javascript相关问答推荐

如何在Jest中调用setupFile下列出的文件所需的函数?

使用typeof运算符获取对象值类型-接收字符串而不是数组

如何访问react路由v6加载器函数中的查询参数/搜索参数

无法将nPM simplex-noise包导入在JS项目中工作

使用JavaScript在ionic Web应用程序中更新.pane和.view的背景颜色

Vue:ref不会创建响应式属性

我可以使用CSS有效地实现最大宽度=100%和最大高度=100%,而无需父母有明确的/固定的宽度和高度,替代方法吗?

调用removeEvents不起作用

在react js中使用react-router-dom中的Link组件时报错:赋值的右侧无法被解构

屏幕右侧屏障上的产卵点""

并不是所有的iframe都被更换

Angular 中的类型错误上不存在获取属性

IF语句的计算结果与实际情况相反

将Node.js包发布到GitHub包-错误ENEEDAUTH

在不删除代码的情况下禁用Java弹出功能WordPress

如何用javascript更改元素的宽度和高度?

我正在试着做一个TicTacToe Ai来和我玩.但是,我试着在第一个方块被点击时出现一个X,然后在第二个方块之后出现一个O

JAVASCRIPT SWITCH CASE语句:当表达式为';ALL';

为什么我不能使用其同级元素调用和更新子元素?

将Singleton实例设置为未定义后的Angular 变量引用持久性行为