The task is to fetch data from a site. I have 800 URLs to request, but it takes a very long time. I am using aiohttp. At this stage I have already collected a set of links, and by following each of them I get many more links. I applied aiohttp, but the code is still slow: 390.9560036659241 seconds. Sorry if this is a simple question, but I have almost no experience with asyncio, so I would be grateful for any help. Thanks.

import json
import time
import requests
from bs4 import BeautifulSoup
import datetime
import csv
import asyncio
import aiohttp

iso_data = []
iso_list = []
iso_catalogue = []
iso_standart = []      # collected standards links (used by worker_iso below)
start_time = time.time()


async def get_page_data(session, url):          # get the 256 committee links from the main page

    async with session.get(url=url) as response:
        response_text = await response.text()

        soup = BeautifulSoup(response_text, "lxml")
        iso_link = soup.find("tbody")

        for iso in iso_link.find_all("tr"):
            iso_url = iso.find('a').attrs['href']
            d = iso.find('a').text
            m = iso.find('td', {'data-title': 'Title'}).text

            try:
                level_2 = (f'{d}{m}').strip()
            except:
                level_2 = "nothing"
            iso_links = f'https://www.iso.org{iso_url}'
            iso_list.append(iso_links)
            iso_data.append({'level_1': 'tc', 'level_2': level_2})
        return iso_list


async def collect_data():                            # follow each committee link to get the ~800 subcommittee links
    async with aiohttp.ClientSession() as session:
        for i in iso_list:
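            # each request is awaited before the next one is sent,
            # so these downloads run one at a time -- this is the bottleneck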
            response = await session.get(url=i)
            soup = BeautifulSoup(await response.text(), "lxml")
            row = soup.find_all('td', attrs={'data-title': 'Subcommittee'})
            if row:
                for el in row:
                    a = el.find('a').attrs['href']
                    iso_catalogue.append(f'https://www.iso.org{a}')
            else:
                iso_catalogue.append(i)  # no subcommittee links: keep the page URL itself
        return iso_catalogue


async def gather_data():
    url = "https://www.iso.org/standards-catalogue/browse-by-tc.html"
    async with aiohttp.ClientSession() as session:
        tasks = []

        task = asyncio.create_task(get_page_data(session, url))
        tasks.append(task)

        await asyncio.gather(*tasks)

async def worker_iso(session, q):
    # NOTE: this worker is defined but never scheduled below, so it has no
    # effect on the timing; it was meant to pull URLs from a shared queue
    while True:
        url = await q.get()
        async with session.get(url=url) as response:
            soup = BeautifulSoup(await response.text(), "lxml")
        for i in soup.find_all('tr', {'ng-show': 'pChecked || pChecked == null'}):
            a1 = i.find('a').attrs['href']
            iso_standart.append(f'https://www.iso.org{a1}')

        q.task_done()

def main():

    asyncio.run(gather_data())
    asyncio.run(collect_data())

    cur_time = datetime.datetime.now().strftime("%d_%m_%Y_%H_%M")

    finish_time = time.time() - start_time
    print(f"Spend time: {finish_time}")


if __name__ == "__main__":
    main()

Recommended answer

I slightly modified your example from the question. Right now you open the 256 links from the main page one after another, and that is what takes the time.

In my example I create 16 workers (coroutines) that share a single queue. The workers wait for new values that I put on the queue and process the requests.

Opening and processing the 256 pages takes about 19 seconds on my machine:

import tqdm  # <-- I use this for nice progress bar/timing
import asyncio
import aiohttp
from bs4 import BeautifulSoup

out = []


async def get_soup(session, url):
    async with session.get(url=url) as resp:
        return BeautifulSoup(await resp.text(), "lxml")


async def worker(session, q):
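    # each worker repeatedly pulls a (url, link_name, title) tuple from the
    # shared queue until it is cancelled in main()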
    while True:
        url, link_name, title = await q.get()

        soup = await get_soup(session, url)

        links = soup.select('[data-title="Subcommittee"] a')
        if links:
            for a in links:
                out.append("https://www.iso.org" + a["href"])
        else:
            out.append(url)

        q.task_done()


async def main():

    url = "https://www.iso.org/standards-catalogue/browse-by-tc.html"

    async with aiohttp.ClientSession() as session:
        soup = await get_soup(session, url)

        titles = soup.select('td[data-title="Title"]')
        links = soup.select('td[data-title="Committee"] a')

        committees = []
        for a, t in zip(links, titles):
            committees.append(
                [
                    "https://www.iso.org" + a["href"],
                    a.get_text(strip=True),
                    t.get_text(strip=True),
                ]
            )

        queue = asyncio.Queue(maxsize=16)

        tasks = []

        # create 16 workers that will process data in parallel
        for i in range(16):
            task = asyncio.create_task(worker(session, queue))
            tasks.append(task)

        # put some data to worker queue
        for c in tqdm.tqdm(committees):
            await queue.put(c)

        # wait for all data to be processed
        await queue.join()

        # cancel all worker tasks
        for task in tasks:
            task.cancel()

        # Wait until all worker tasks are cancelled.
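        # return_exceptions=True collects the CancelledErrors instead of re-raising them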
        await asyncio.gather(*tasks, return_exceptions=True)

        print(len(out))


if __name__ == "__main__":
    asyncio.run(main())

Output:

100%|██████████████████████████████████████████████████████████████████| 256/256 [00:19<00:00, 13.18it/s]
653
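
For comparison, the same 16-request concurrency cap can also be expressed with asyncio.Semaphore and a single asyncio.gather call instead of an explicit queue. A minimal sketch, assuming the committee URLs have already been collected into a urls list; the names LIMIT, fetch_links and collect are illustrative, not part of the answer above:

import asyncio
import aiohttp
from bs4 import BeautifulSoup

LIMIT = 16  # assumed cap, matching the 16 workers above


async def fetch_links(session, sem, url):
    # the semaphore keeps at most LIMIT requests in flight at once
    async with sem:
        async with session.get(url=url) as resp:
            text = await resp.text()
    soup = BeautifulSoup(text, "lxml")
    links = soup.select('[data-title="Subcommittee"] a')
    if links:
        return ["https://www.iso.org" + a["href"] for a in links]
    return [url]  # no subcommittees: fall back to the page itself


async def collect(urls):
    sem = asyncio.Semaphore(LIMIT)
    async with aiohttp.ClientSession() as session:
        pages = await asyncio.gather(
            *(fetch_links(session, sem, u) for u in urls))
    # flatten the per-page lists into one flat list of links
    return [link for page in pages for link in page]

The queue version has the advantage that new items can be enqueued while the workers are already running; the gather version is shorter when the full URL list is known up front.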
