moin_downloader/get_names.py
2025-04-17 22:20:12 +02:00

35 lines
1.1 KiB
Python

from bs4 import BeautifulSoup
def get_names(page, config_data: dict) -> list[tuple[str, str]]:
    """Scrape the "nb list" table of *page* into ``(path, link)`` pairs.

    The HTML returned by ``page.content()`` is parsed and every
    ``<tr class="tbdata">`` row of the first ``<table class="nb list">`` is
    inspected. Rows with fewer than two cells are skipped. For the remaining
    rows the first cell supplies the entry name and link, and the second
    cell supplies the entry id.

    Args:
        page: Object whose ``content()`` method returns the page HTML as a
            string (presumably a browser-automation page object -- confirm
            against the caller).
        config_data: Configuration mapping; only ``config_data["base"]`` is
            read, and it is prepended verbatim to each link's ``href``.

    Returns:
        A list of ``(entry_path, entry_link)`` tuples, where ``entry_path``
        is ``"<id>_<Title_Cased_Name>"`` (spaces in the name replaced by
        underscores) and ``entry_link`` is ``base`` followed by the ``href``.

    Raises:
        KeyError: If ``config_data`` has no ``"base"`` key.
        AssertionError: If the table is not present, or a data row has no
            ``<a href=...>`` in its first cell.
    """
    base = config_data["base"]
    html_content: str = page.content()

    soup = BeautifulSoup(html_content, "html.parser")

    table = soup.find("table", {"class": "nb list"})
    if table is None:
        # Raised explicitly (not via ``assert``) so the check survives
        # ``python -O``; the exception type is kept for existing callers.
        raise AssertionError('no <table class="nb list"> found in page content')

    # html.parser does not synthesise a <tbody>; when the markup has none,
    # search the table itself instead of failing on ``None``.
    tbody = table.find("tbody")
    if tbody is None:
        tbody = table
    rows = tbody.find_all("tr", {"class": "tbdata"})

    list_entries: list[tuple[str, str]] = []
    for row in rows:
        cells = row.find_all("td")
        if len(cells) < 2:
            # Need both the name/link cell and the id cell.
            continue

        name_cell, id_cell = cells[0], cells[1]

        # Require an href so a bare <a> never yields a "<base>None" link.
        link_tag = name_cell.find("a", href=True)
        if link_tag is None:
            raise AssertionError(
                "table row has no <a href=...> in its first cell"
            )

        entry_name: str = name_cell.text.strip().title().replace(" ", "_")
        href = link_tag["href"]
        # Plain concatenation is intentional: callers rely on base + href.
        entry_link: str = f"{base}{href}"
        entry_id: str = id_cell.text.strip()
        entry_path: str = f"{entry_id}_{entry_name}"
        list_entries.append((entry_path, entry_link))

    return list_entries