Python Playwright 无限滚动不起作用

发布于12月16日

我有以下代码。它打开了无头浏览器，我也能看到页面正在滚动，但 parse 方法中的响应对象不包含任何 HTML。当我不使用自动滚动时，这个爬虫（spider）可以正常工作。

该代码仅用于从该网站提取产品名称和产品价格。

import scrapy
import re
from scrapy_playwright.page import PageMethod
from bs4 import BeautifulSoup


def should_abort_request(req):
    """Tell scrapy-playwright whether a browser request should be aborted.

    Image downloads and POST requests are dropped; everything else is
    allowed through.

    Args:
        req: Request-like object exposing ``resource_type`` and ``method``
            string attributes.

    Returns:
        ``True`` if the request should be aborted, ``False`` otherwise.
    """
    # Short-circuits exactly like the original pair of ``if`` statements:
    # ``method`` is only inspected when the request is not an image.
    return req.resource_type == "image" or req.method.lower() == "post"


# JavaScript evaluated in the page to trigger lazy loading of the listing.
#
# Two fixes relative to the earlier version:
#   * the counter limit was declared as `scrolls` but compared as
#     `numScrolls`, so every timer tick raised a ReferenceError and the
#     interval was never cleared;
#   * the script now evaluates to a Promise that resolves after the last
#     scroll, so `page.evaluate` waits for the scrolling to finish instead of
#     returning immediately.
scrolling_script = """
  new Promise((resolve) => {
    const numScrolls = 8
    let scrollCount = 0

    // scroll down and then wait for 5s
    const scrollInterval = setInterval(() => {
      window.scrollTo(0, document.body.scrollHeight)
      scrollCount++

      if (scrollCount === numScrolls) {
        clearInterval(scrollInterval)
        resolve()
      }
    }, 5000)
  })
  """


class AuchanSpider(scrapy.Spider):
    """Spider that extracts product names and prices from an Auchan listing.

    Each start URL is fetched through scrapy-playwright. The page is scrolled
    with ``scrolling_script`` and the spider waits for product cards to be
    present before the rendered HTML is passed to :meth:`parse`.
    """

    name = 'auchan'
    custom_settings = {
        'PLAYWRIGHT_ABORT_REQUEST': should_abort_request,
    }
    start_urls = ['https://zakupy.auchan.pl/shop/list/8029?shType=id']

    # Value of the ``class`` attribute carried by every product card.
    product_classes = '_1E5b _2I59 _1wkJ _3YFw igxN _7Zx6 Eb4X _390_'
    # CSS needs the classes of ONE element chained with dots. Separating them
    # with spaces (as before) builds a descendant selector looking for tags
    # named ``_2I59`` etc., which never matches and makes the wait time out.
    product_selector = '.' + '.'.join(product_classes.split())

    def start_requests(self):
        """Yield one Playwright-backed request per start URL."""
        for url in self.start_urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                meta={
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_page_methods": [
                        PageMethod("evaluate", scrolling_script),
                        PageMethod("wait_for_selector", self.product_selector),
                        PageMethod(
                            "wait_for_selector",
                            f"{self.product_selector}:nth-child(60)",
                        ),
                    ],
                },
                errback=self.close_page,
                cb_kwargs=dict(main_url=url, page_number=0),
            )

    async def parse(self, response, main_url, page_number):
        """Yield ``{'productName', 'price'}`` items for every product card.

        Args:
            response: Response whose ``text`` holds the rendered HTML.
            main_url: URL the request was issued for (currently unused).
            page_number: Page index passed by ``start_requests`` (currently
                unused).
        """
        # The page was requested via ``playwright_include_page``; close it
        # here so that it does not stay open after a successful response.
        page = response.meta.get("playwright_page")
        if page is not None:
            await page.close()

        soup = BeautifulSoup(response.text, 'html.parser')
        product_containers = soup.find_all('div', class_=self.product_classes)
        for product_container in product_containers:
            name_node = product_container.find(class_='_1DGZ')
            price_node = product_container.find(class_='_1-UB _1Evs')
            if name_node is None or price_node is None:
                # A card without a name or a price cannot produce an item;
                # skip it instead of raising AttributeError on ``None``.
                self.logger.debug("Skipping product card without name/price")
                continue

            # Strip every whitespace character from the price text.
            price = re.sub(r"\s+", "", price_node.get_text())
            yield {
                'productName': name_node.get_text(),
                'price': price,
            }

    async def close_page(self, failure):
        """Errback: close the Playwright page of a failed request, if any."""
        page = failure.request.meta.get("playwright_page")
        if page is not None:
            await page.close()

"""Scroll the Auchan listing with the Playwright sync API and collect products.

The script reads the expected number of products from the page, then keeps
pressing PageDown and harvesting the ``._1DGZ`` elements until that many
distinct records (keyed by ``href``) have been collected.
"""
import re
from playwright.sync_api import sync_playwright  # 1.37.0
from time import sleep


with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    page = browser.new_page()
    url = "https://zakupy.auchan.pl/shop/list/8029?shType=id"
    page.goto(url)
    page.click("#onetrust-accept-btn-handler")
    page.click("._3YI0")

    # The trailing digits of this element's text are used as the number of
    # records to collect.
    text = page.locator("._3MDH").text_content().strip()
    expected = int(re.search(r"\d+$", text).group())
    records = {}

    while len(records) < expected:
        page.keyboard.press("PageDown")
        sleep(0.2)  # save a bit of CPU
        items = page.eval_on_selector_all(
            "._1DGZ",
            """els => els.map(e => ({ href: e.href, text: e.textContent, }))""",
        )

        for x in items:
            # assume hrefs are unique
            records[x["href"]] = x

    print(records)
    browser.close()

"""Capture the listing's product API responses while scrolling the page.

Every response whose URL starts with the products API prefix is parsed as
JSON and accumulated; when a response reports that the current page equals
the page count, everything collected is written to ``out.json``.
"""
import json
from playwright.sync_api import sync_playwright
from time import sleep


def scrape(page):
    """Scroll ``page`` until the last products API response has been saved.

    Args:
        page: Playwright sync ``Page`` used to load and scroll the listing.
    """
    url = "https://zakupy.auchan.pl/shop/list/8029?shType=id"
    items = []
    done = False

    def handle_response(response):
        """Collect product API payloads; dump them once the last page is seen."""
        nonlocal done
        api_url = "https://zakupy.auchan.pl/api/v2/cache/products"

        if response.url.startswith(api_url):
            data = response.json()
            items.append(data)

            if data["pageCount"] == data["currentPage"]:
                with open("out.json", "w") as f:
                    json.dump(items, f)
                done = True

    page.on("response", handle_response)
    page.goto(url)
    page.click("#onetrust-accept-btn-handler")
    page.click("._3YI0")

    # Keep scrolling until the response handler flags completion.
    while not done:
        page.keyboard.press("PageDown")
        sleep(0.2)  # save a bit of CPU


with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    scrape(browser.new_page())
    browser.close()

"""Fetch the product list in one API call by rewriting the request URL.

Requests matching the products API pattern are redirected to a URL asking for
500 items per page; the first matching response is written to ``out1.json``.
"""
import json
from playwright.sync_api import sync_playwright
from time import sleep


def scrape(page):
    """Scroll ``page`` until a rewritten products API response has been saved.

    Args:
        page: Playwright sync ``Page`` used to load and scroll the listing.
    """
    url = "https://zakupy.auchan.pl/shop/list/8029?shType=id"
    api_url = "https://zakupy.auchan.pl/api/v2/cache/products"
    new_url = "https://zakupy.auchan.pl/api/v2/cache/products?listId=8029&itemsPerPage=500&page=1&cacheSegmentationCode=019_DEF&hl=pl"
    done = False

    def handle(route, request):
        """Continue the intercepted request against ``new_url`` instead."""
        route.continue_(url=new_url)

    page.route("https://zakupy.auchan.pl/api/v2/cache/products*", handle)

    def handle_response(response):
        """Save the first products API response and flag completion."""
        nonlocal done

        if response.url.startswith(api_url):
            with open("out1.json", "w") as f:
                json.dump(response.json(), f)
            done = True

    page.on("response", handle_response)
    page.goto(url)
    page.click("#onetrust-accept-btn-handler")
    page.click("._3YI0")

    # Keep scrolling until the response handler flags completion.
    while not done:
        page.keyboard.press("PageDown")
        sleep(0.2)  # save a bit of CPU


with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    scrape(browser.new_page())
    browser.close()

Python Playwright 无限滚动不起作用

推荐答案

Python相关问答推荐

为什么图像结果翻转了90度？

使用子字符串动态更新Python DataFrame中的列

Polars：滚动分组，起始索引由另一列设置

使用Python迭代从今天起的未来12个月

如何销毁框架并使其在tkinter中看起来像以前的样子？

有条件地采样我的大型DF的最有效方法

Pandas 填充条件是另一列

为什么我的Python代码在if-else声明中的行之前执行if-else声明中的行？

运行总计基于多列pandas的分组和总和

从dict的列中分钟

如何将Docker内部运行的mariadb与主机上Docker外部运行的Python脚本连接起来

Python解析整数格式说明符的规则？

Pandas：将多级列名改为一级

对象的`call`方法的setattr在Python中不起作用'

利用Selenium和Beautiful Soup实现Web抓取JavaScript表

使用BeautifulSoup抓取所有链接

numpy.unique如何消除重复列？

如何在两列上groupBy，并使用pyspark计算每个分组列的平均总价值

在matplotlib中使用不同大小的标记顶部添加批注

Gekko中基于时间的间隔约束