# moin_downloader/get_names.py

from bs4 import BeautifulSoup


def get_names(page, config_data: dict) -> list[tuple[str, str]]:
    """Collect the entries listed in the page's ``nb list`` table.

    Every data row (``<tr class="tbdata">``) that has at least two cells
    yields one ``(entry_path, entry_link)`` tuple:

    * ``entry_path`` is ``"<second cell text>_<first cell text>"``, where the
      first cell's text is stripped, title-cased and has its spaces replaced
      by underscores.
    * ``entry_link`` is ``config_data["base"]`` immediately followed by the
      ``href`` of the link found in the first cell.

    Args:
        page: Object whose ``content()`` method returns the page HTML as a
            string (presumably a Playwright ``Page`` -- confirm with caller).
        config_data: Configuration mapping. Only the ``"base"`` key is read;
            its value is prepended verbatim to every ``href``.

    Returns:
        One tuple per qualifying row, in document order. Rows with fewer
        than two cells are skipped.

    Raises:
        KeyError: If ``config_data`` has no ``"base"`` key.
        ValueError: If the page contains no ``<table class="nb list">``, or
            if a qualifying row's first cell has no link with an ``href``.
    """
    base = config_data["base"]
    html_content: str = page.content()

    soup = BeautifulSoup(html_content, "html.parser")

    table = soup.find("table", {"class": "nb list"})
    # Explicit raise instead of ``assert``: asserts are removed under ``-O``,
    # which would turn this into an opaque AttributeError further down.
    if table is None:
        raise ValueError('no <table class="nb list"> found in page content')

    # The markup may omit <tbody> and html.parser does not insert one, so
    # fall back to searching the table itself for the data rows.
    tbody = table.find("tbody")
    row_scope = table if tbody is None else tbody

    list_entries: list[tuple[str, str]] = []

    for row in row_scope.find_all("tr", {"class": "tbdata"}):
        cells = row.find_all("td")

        # Both the name/link cell and the id cell are required.
        if len(cells) < 2:
            continue

        entry_name: str = cells[0].text.strip().title().replace(" ", "_")

        # ``href=True`` only matches anchors that actually carry an href, so
        # the link can never be built from a missing attribute ("...None").
        link_tag = cells[0].find("a", href=True)
        if link_tag is None:
            raise ValueError(
                f"row {entry_name!r} has no link with an href in its first cell"
            )

        # Pull the href out first: reusing the enclosing quote character
        # inside an f-string expression is a SyntaxError before Python 3.12.
        href = link_tag["href"]
        entry_link: str = f"{base}{href}"

        entry_id: str = cells[1].text.strip()
        entry_path: str = f"{entry_id}_{entry_name}"
        list_entries.append((entry_path, entry_link))

    return list_entries