My objective: on the page https://data.eastmoney.com/executive/000001.html, when you scroll down you will find a big table, and I want to turn it into a DataFrame in Python. Is BeautifulSoup enough to do so, or do I have to use Selenium?
Stack Overflow上的一些人说BeautifulSoup无法从互联网上抓取表格数据,所以我尝试了Selenium,这是代码:
# Open the page in Chrome, read one cell (row 2, column 3) of the first
# <table>, and put its text into a one-column DataFrame.
driver = webdriver.Chrome()
try:
    driver.get('https://data.eastmoney.com/executive/000001.html')
    table_element = driver.find_element_by_xpath("//table")
    # The leading "." makes the XPath relative to table_element. A bare "//"
    # is evaluated from the document root even when called on an element,
    # so it could match a cell in a different table.
    item_element = table_element.find_element_by_xpath(".//tr[2]/td[3]")
    item_text = item_element.text
finally:
    # Always shut the browser down, even if a lookup above raised, so no
    # orphaned Chrome/chromedriver process is left behind.
    driver.quit()

df = pd.DataFrame([item_text], columns=["Item"])
print(df)
结果如下:
Traceback (most recent call last):
File "selenium/webdriver/common/service.py", line 76, in start
stdin=PIPE)
File "subprocess.py", line 709, in __init__
restore_signals, start_new_session)
File "subprocess.py", line 1344, in _execute_child
raise child_exception_type(errno_num, err_msg, err_filename)
FileNotFoundError: [Errno 2] No such file or directory: 'chromedriver': 'chromedriver'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/tmp/jqcore/jqboson/jqboson/core/entry.py", line 379, in _run
engine.start()
File "/tmp/jqcore/jqboson/jqboson/core/engine.py", line 231, in start
self._dispatcher.start()
File "/tmp/jqcore/jqboson/jqboson/core/dispatcher.py", line 280, in start
self._run_loop()
File "/tmp/jqcore/jqboson/jqboson/core/dispatcher.py", line 240, in _run_loop
self._loop.run()
File "/tmp/jqcore/jqboson/jqboson/core/loop/loop.py", line 107, in run
self._handle_queue()
File "/tmp/jqcore/jqboson/jqboson/core/loop/loop.py", line 153, in _handle_queue
message.callback(**message.callback_data)
File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_subscriber.py", line 228, in broadcast
consumer.send(market_data)
File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_consumer_manager.py", line 59, in consumer_gen
msg_callback()
File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_consumer_manager.py", line 52, in msg_callback
callback(market_data)
File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_consumer_manager.py", line 122, in wrapper
result = callback(*args, **kwargs)
File "/tmp/jqcore/jqboson/jqboson/core/strategy.py", line 474, in _wrapper
self._context.current_dt
File "/tmp/strategy/user_code.py", line 85, in handle_data
driver = webdriver.Chrome()
File "selenium/webdriver/chrome/webdriver.py", line 73, in __init__
self.service.start()
File "selenium/webdriver/common/service.py", line 83, in start
os.path.basename(self.path), self.start_error_message)
selenium.common.exceptions.WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home
基本上它说"'chromedriver' 可执行文件需要位于PATH中".问题是我正在使用一个名为JoinQuant(www.joinquant.com)的在线回测平台,所以这对Selenium来说很复杂——我必须使用Selenium才能从互联网上抓取这样的数据并将其转化为Python中的DataFrame吗?或者我可以使用BeautifulSoup等其他工具吗?对于BeautifulSoup来说,至少它不存在"驱动程序需要位于PATH中"的问题.
对于BeautifulSoup,我尝试了以下代码:
# Web crawler: download the page over HTTP and turn the rows of its first
# <table> into a DataFrame.
url = 'https://data.eastmoney.com/executive/000001.html'
response = requests.get(url)
html_content = response.text

# Start from an empty result so the DataFrame below can always be built,
# even when the request fails or the page contains no <table>.
data = []

# Only parse the body when the request succeeded.
if response.status_code == 200:
    soup = BeautifulSoup(html_content, 'html.parser')
    # find() returns a single Tag (or None). find_all() returns a ResultSet,
    # which is a list of tags and has no find_all() method of its own.
    table = soup.find('table')
    if table is None:
        # The downloaded HTML has no <table>; presumably the table is filled
        # in by JavaScript after page load -- TODO confirm against the page.
        print("No <table> element found in the downloaded HTML.")
    else:
        # One list of stripped cell texts per <tr>; rows without <td> cells
        # (e.g. header rows) yield an empty list, as before.
        data = [
            [col.text.strip() for col in row.find_all('td')]
            for row in table.find_all('tr')
        ]
else:
    print("Failed to Retrieve the Webpage.")

# Build and show the DataFrame (empty when nothing was scraped).
dataframe = pd.DataFrame(data)
print(dataframe)
这是输出:
Traceback (most recent call last):
File "/tmp/jqcore/jqboson/jqboson/core/entry.py", line 379, in _run
engine.start()
File "/tmp/jqcore/jqboson/jqboson/core/engine.py", line 231, in start
self._dispatcher.start()
File "/tmp/jqcore/jqboson/jqboson/core/dispatcher.py", line 280, in start
self._run_loop()
File "/tmp/jqcore/jqboson/jqboson/core/dispatcher.py", line 240, in _run_loop
self._loop.run()
File "/tmp/jqcore/jqboson/jqboson/core/loop/loop.py", line 107, in run
self._handle_queue()
File "/tmp/jqcore/jqboson/jqboson/core/loop/loop.py", line 153, in _handle_queue
message.callback(**message.callback_data)
File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_subscriber.py", line 228, in broadcast
consumer.send(market_data)
File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_consumer_manager.py", line 59, in consumer_gen
msg_callback()
File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_consumer_manager.py", line 52, in msg_callback
callback(market_data)
File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_consumer_manager.py", line 122, in wrapper
result = callback(*args, **kwargs)
File "/tmp/jqcore/jqboson/jqboson/core/strategy.py", line 474, in _wrapper
self._context.current_dt
File "/tmp/strategy/user_code.py", line 114, in handle_data
rows = table.find_all('tr')
File "bs4/element.py", line 1884, in __getattr__
"ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key
AttributeError: ResultSet object has no attribute 'find_all'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?
但如果把
table = soup.find_all('table')
改成
table = soup.find('table')
结果如下:
Traceback (most recent call last):
File "/tmp/jqcore/jqboson/jqboson/core/entry.py", line 379, in _run
engine.start()
File "/tmp/jqcore/jqboson/jqboson/core/engine.py", line 231, in start
self._dispatcher.start()
File "/tmp/jqcore/jqboson/jqboson/core/dispatcher.py", line 280, in start
self._run_loop()
File "/tmp/jqcore/jqboson/jqboson/core/dispatcher.py", line 240, in _run_loop
self._loop.run()
File "/tmp/jqcore/jqboson/jqboson/core/loop/loop.py", line 107, in run
self._handle_queue()
File "/tmp/jqcore/jqboson/jqboson/core/loop/loop.py", line 153, in _handle_queue
message.callback(**message.callback_data)
File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_subscriber.py", line 228, in broadcast
consumer.send(market_data)
File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_consumer_manager.py", line 59, in consumer_gen
msg_callback()
File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_consumer_manager.py", line 52, in msg_callback
callback(market_data)
File "/tmp/jqcore/jqboson/jqboson/core/mds/market_data_consumer_manager.py", line 122, in wrapper
result = callback(*args, **kwargs)
File "/tmp/jqcore/jqboson/jqboson/core/strategy.py", line 474, in _wrapper
self._context.current_dt
File "/tmp/strategy/user_code.py", line 114, in handle_data
rows = table.find_all('tr')
AttributeError: 'NoneType' object has no attribute 'find_all'
那么总结一下,我应该使用哪一个呢?Selenium还是BeautifulSoup?或者甚至是其他工具?我应该如何解决这个问题?