moin_downloader/get_main.py

import os

from bs4 import BeautifulSoup


def get_main(
    page, config_data: dict, list_entries: tuple[str, str]
) -> list[tuple[str, str]]:
    """Open the entry page, save its HTML, and collect (path, link) pairs for its sub-entries."""
    base = config_data["base"]
    entry_path = list_entries[0]
    entry_url = list_entries[1]

    # Load the entry page and save a copy of its HTML into the entry directory
    page.goto(entry_url)
    os.makedirs(entry_path, mode=0o777, exist_ok=True)
    html_content = page.content()
    with open(os.path.join(entry_path, "main.html"), "w") as f:
        f.write(html_content)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")

    # Find the sections table (class "tb750 rw-table rw-all sections")
    table = soup.find("table", {"class": "tb750 rw-table rw-all sections"})
    assert table is not None

    # Extract rows from the table body (tbody)
    tbody = table.find("tbody")
    rows = tbody.find_all("tr")

    # Extract the first and second column from all rows
    list_sub_entries: list[tuple[str, str]] = []
    for row in rows:
        cells = row.find_all("td", {"class": "tbdata"})
        if len(cells) > 1:
            link_tag = cells[0].find("a")
            assert link_tag is not None
            entry_name = cells[0].text.strip().replace(" ", "_")
            entry_link = f"{base}{link_tag.get('href')}"
            entry_status = cells[1].text.strip()
            sub_entry_path = f"{entry_status}_{entry_name}"
            list_sub_entries.append((sub_entry_path, entry_link))
    return list_sub_entries
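

# Usage sketch (assumption, not part of the original module): one way get_main
# could be driven with Playwright's sync API. The base URL, entry tuple, and
# headless launch below are hypothetical placeholders; the real values would
# come from the downloader's config and the entries discovered upstream.
if __name__ == "__main__":
    from playwright.sync_api import sync_playwright

    config_data = {"base": "https://campus.example.invalid"}  # hypothetical base URL
    entry = ("downloads/main_entry", "https://campus.example.invalid/start")  # hypothetical (path, url)

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        try:
            for sub_path, sub_link in get_main(page, config_data, entry):
                print(sub_path, sub_link)
        finally:
            browser.close()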