moin_downloader/get_main.py

import os

from bs4 import BeautifulSoup


def get_main(
    page, config_data: dict, list_entries: tuple[str, str]
) -> list[tuple[str, str]]:
    """Open the entry page, save its HTML, and collect (path, link) pairs for its sub-entries."""
    base = config_data["base"]
    entry_path = list_entries[0]
    entry_url = list_entries[1]

    # Load the entry page and save a copy of its HTML into the entry directory
    page.goto(entry_url)
    os.makedirs(entry_path, mode=0o777, exist_ok=True)
    html_content = page.content()
    with open(os.path.join(entry_path, "main.html"), "w") as f:
        f.write(html_content)

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")

    # Find the sections table (class "tb750 rw-table rw-all sections")
    table = soup.find("table", {"class": "tb750 rw-table rw-all sections"})
    assert table is not None

    # Extract rows from the table body (tbody)
    tbody = table.find("tbody")
    rows = tbody.find_all("tr")

    # Extract the first and second column from all rows
    list_sub_entries: list[tuple[str, str]] = []
    for row in rows:
        cells = row.find_all("td", {"class": "tbdata"})
        if len(cells) > 1:
            link_tag = cells[0].find("a")
            assert link_tag is not None
            entry_name = cells[0].text.strip().replace(" ", "_")
            entry_link = f"{base}{link_tag.get('href')}"
            entry_status = cells[1].text.strip()
            sub_entry_path = f"{entry_status}_{entry_name}"
            list_sub_entries.append((sub_entry_path, entry_link))
    return list_sub_entries
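

# Usage sketch (assumption, not part of the original module): one way get_main
# could be driven with Playwright's sync API. The base URL, entry tuple, and
# headless launch below are hypothetical placeholders; the real values would
# come from the downloader's config and the entries discovered upstream.
if __name__ == "__main__":
    from playwright.sync_api import sync_playwright

    config_data = {"base": "https://campus.example.invalid"}  # hypothetical base URL
    entry = ("downloads/main_entry", "https://campus.example.invalid/start")  # hypothetical (path, url)

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        try:
            for sub_path, sub_link in get_main(page, config_data, entry):
                print(sub_path, sub_link)
        finally:
            browser.close()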