任务是从站点获取数据。我有800个URL需要请求，但这需要很长时间。我用的是aiohttp。在这个阶段，我先获取了很多链接，点击每个链接后又得到很多新链接。我应用了aiohttp，但代码仍然很慢：390.9560036659241秒。很抱歉，如果这是一个简单的问题——我对asyncio几乎没有经验，因此如果有人能提供帮助，我将不胜感激。谢谢。
import json
import time
import requests
from bs4 import BeautifulSoup
import datetime
import csv
import asyncio
import aiohttp
iso_data = []
iso_list = []
iso_catalogue = []
iso_links = ''
start_time = time.time()
async def get_page_data(session, url): #get links 256 from main page
url = "https://www.iso.org/standards-catalogue/browse-by-tc.html"
async with session.get(url=url) as response:
response_text = await response.text()
soup = BeautifulSoup(response_text, "lxml")
iso_link = soup.find("tbody")
for iso in iso_link.find_all("tr"):
iso_url = iso.find('a').attrs['href']
d = iso.find('a').text
m = iso.find('td', {'data-title': 'Title'}).text
try:
level_2 = (f'{d}{m}').strip()
except:
level_2 = "nothing"
iso_links = f'https://www.iso.org{iso_url}'
iso_list.append(iso_links)
iso_data.append({'level_1': 'tc', 'level_2': level_2})
return iso_list
async def collect_data(): #get 800 links
async with aiohttp.ClientSession() as session:
for i in iso_list:
response = await session.get(url=i)
soup = BeautifulSoup(await response.text(), "lxml")
row = soup.find_all('td', attrs={'data-title': 'Subcommittee'})
if row:
for el in row:
a = el.find('a').attrs['href']
iso_catalogue.append(f'https://www.iso.org{a}')
else:
iso_catalogue.append(iso_links)
return iso_catalogue
async def gather_data():
url = "https://www.iso.org/standards-catalogue/browse-by-tc.html"
async with aiohttp.ClientSession() as session:
response = await session.get(url=url)
soup = BeautifulSoup(await response.text(), "lxml")
tasks = []
task = asyncio.create_task(get_page_data(session, url))
tasks.append(task)
await asyncio.gather(*tasks)
async def worker_iso(q):
for urls in out:
while True:
response = await q.get(urls)
soup = BeautifulSoup(await response.text(), "lxml")
for i in soup.find_all('tr', {'ng-show': 'pChecked || pChecked == null'}):
a1 = i.find('a').attrs['href']
iso_standarts = f'https://www.iso.org{a1}'
iso_standart.append(iso_standarts)
q.task_done()
def main():
asyncio.run(gather_data())
asyncio.run(collect_data())
cur_time = datetime.datetime.now().strftime("%d_%m_%Y_%H_%M")
finish_time = time.time() - start_time
print(f"Spend time: {finish_time}")
if __name__ == "__main__":
main()