先用 SoupStrainer 将解析范围过滤到 `section` 元素,然后按 `class_` 选择作者姓名:
from typing import Iterator
import bs4
from requests import Session
# Parse filter handed to BeautifulSoup via parse_only: only <section> tags
# whose class attribute is 'item authors' are kept in the parsed tree.
STRAINER = bs4.SoupStrainer(name='section', class_='item authors')
def fetch_authors(session: Session, article: int, timeout: float = 30.0) -> bs4.ResultSet:
    """Fetch an article page and return its author-name tags.

    Args:
        session: requests session used to perform the GET.
        article: numeric article id inserted into the article-view URL.
        timeout: seconds to wait for the server before giving up; without
            a timeout, requests would wait indefinitely on a stalled server.

    Returns:
        The ``<span class="name">`` tags found inside the strained
        authors section, in document order.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within *timeout*.
    """
    with session.get(
        url=f'https://rpmgf.pt/ojs/index.php/rpmgf/article/view/{article}',
        timeout=timeout,
    ) as resp:
        resp.raise_for_status()
        # parse_only=STRAINER keeps just the authors section in the tree.
        dom = bs4.BeautifulSoup(markup=resp.text, parse_only=STRAINER, features='lxml')

    # The body has been read into resp.text, so the connection can be
    # released before the tree is searched.
    return dom.find_all(name='span', class_='name')
def main() -> None:
    """Print the stripped name of every author of article 13494, one per line."""
    with Session() as http:
        author_tags = fetch_authors(session=http, article=13494)
        for tag in author_tags:
            print(tag.text.strip())


# Run the demo only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Maria João Gonçalves
Clara Fonseca
Inês Pintalhão
Rodrigo Costa
Ana Calafate
Manuel Henriques
或者,如果您还需要每位作者的所属机构(affiliation):
from typing import Iterator
import bs4
from requests import Session
# Parse filter handed to BeautifulSoup via parse_only: only <section> tags
# whose class attribute is 'item authors' are kept in the parsed tree.
STRAINER = bs4.SoupStrainer(name='section', class_='item authors')
def fetch_authors(
    session: Session,
    article: int,
    timeout: float = 30.0,
) -> Iterator[tuple[str, str | None]]:
    """Yield ``(name, affiliation)`` for each author listed on an article page.

    For every ``<span class="name">`` in the strained authors section, the
    following sibling ``<span>`` tags are scanned in order: the first one
    carrying the ``affiliation`` class supplies the affiliation text; if a
    ``name`` span (the next author) is reached first, or the siblings run
    out, the affiliation is ``None``.

    Args:
        session: requests session used to perform the GET.
        article: numeric article id inserted into the article-view URL.
        timeout: seconds to wait for the server before giving up; without
            a timeout, requests would wait indefinitely on a stalled server.

    Yields:
        Tuples of the stripped author name and the stripped affiliation
        text, or ``None`` when no affiliation span belongs to that author.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server does not respond within *timeout*.
    """
    with session.get(
        url=f'https://rpmgf.pt/ojs/index.php/rpmgf/article/view/{article}',
        timeout=timeout,
    ) as resp:
        resp.raise_for_status()
        # NOTE(review): no parser is pinned here, so bs4 picks the best one
        # installed and emits GuessedAtParserWarning; pass features=... to
        # make the choice explicit once the target environment is known.
        dom = bs4.BeautifulSoup(markup=resp.text, parse_only=STRAINER)

    for name in dom.find_all(name='span', class_='name'):
        # Loop-invariant: compute the author's name once per author.
        name_str = name.text.strip()

        # Search through later siblings for a matching affiliation tag.
        for sibling in name.find_next_siblings(name='span'):
            # A span may have no class attribute (or an empty one); treat
            # that as "no classes" instead of indexing into an empty
            # sequence, and test membership so multi-class spans match the
            # same way find_all(class_=...) does.
            classes = sibling.attrs.get('class') or ()
            if 'affiliation' in classes:
                # Affiliation found before the next author: it is ours.
                yield name_str, sibling.text.strip()
                break
            if 'name' in classes:
                # Reached the next author first: no affiliation.
                yield name_str, None
                break
        else:
            # No break: there were no span siblings, or none of them was an
            # affiliation or a following name, so there is no affiliation.
            yield name_str, None
def main() -> None:
    """Print name/affiliation pairs for two sample articles.

    Each article is introduced by a heading line, followed by one
    ``name (affiliation)`` line per author and a trailing blank line.
    """
    demos = (
        ('An article with some authors missing affiliation:', 13545),
        ('An article with authors all having affiliation:', 13494),
    )
    with Session() as session:
        for heading, article_id in demos:
            print(heading)
            for author, institution in fetch_authors(session=session, article=article_id):
                print(f'{author} ({institution})')
            print()


# Run the demo only when executed as a script, not on import.
if __name__ == '__main__':
    main()
An article with some authors missing affiliation:
Andreia Oliveira (Médica)
Rita Paraíso (None)
Paola Lobão (None)
Vanessa Guerreiro (None)
An article with authors all having affiliation:
Maria João Gonçalves (USF Garcia de Orta)
Clara Fonseca (Assistente Graduada de Medicina Geral e Familiar na USF Garcia de Orta, ACeS Porto Ocidental)
Inês Pintalhão (Assistente de Medicina Geral e Familiar da USF Garcia de Orta, ACes Porto Ocidental)
Rodrigo Costa (Interno de Formação Específica de Medicina Geral e Familiar da USF Garcia de Orta, ACeS Porto Ocidental)
Ana Calafate (Assistente de Medicina Geral e Familiar da USF Garcia de Orta, ACeS Porto Ocidental)
Manuel Henriques (Assistente de Medicina Geral e Familiar da USF Garcia de Orta, ACeS Porto Ocidental)