# moin_downloader/grep_subpages.py

import os

from bs4 import BeautifulSoup
import requests  # type: ignore


def download_pdfs(pdf_links: list[tuple[str, str]]) -> None:
    # Download each (filename, url) pair, skipping files that already exist.
    for filename_to_save, full_url in pdf_links:
        if not os.path.isfile(filename_to_save):
            try:
                response = requests.get(full_url, stream=True)
                if response.status_code == 200:
                    with open(filename_to_save, "wb") as file:
                        for chunk in response.iter_content(chunk_size=1024):
                            if chunk:  # filter out keep-alive chunks
                                file.write(chunk)
                    print(f"Downloaded {filename_to_save} successfully.")
                else:
                    print(
                        f"Failed to download {full_url}. "
                        f"Status code: {response.status_code}"
                    )
            except Exception as e:
                print(f"An error occurred while downloading {full_url}: {e}")


def grep_subpages(
    page, config_data: dict, list_sub_entries: list[tuple[str, str]], path_name: str
) -> None:
    # For each (folder_name, url) entry: render the page, collect links whose
    # text ends in ".pdf", download them, and save the rendered HTML alongside.
    for entry in list_sub_entries:
        base = config_data["base"]
        page.goto(entry[1])
        page_content = page.content()
        # Parse the rendered HTML content using BeautifulSoup
        soup = BeautifulSoup(page_content, "html.parser")
        pdf_links: list[tuple[str, str]] = []
        for link in soup.find_all("a", href=True):
            if link.text.endswith(".pdf"):
                os.makedirs(
                    os.path.join(path_name, entry[0]), mode=0o777, exist_ok=True
                )
                filename = os.path.join(
                    path_name, entry[0], link.text.strip().replace(" ", "_")
                )
                url = base + link["href"]
                pdf_links.append((filename, url))
        if pdf_links:
            download_pdfs(pdf_links)
        os.makedirs(path_name, mode=0o777, exist_ok=True)
        with open(os.path.join(path_name, entry[0] + ".html"), "w") as f:
            f.write(page_content)
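

# --- Hypothetical usage sketch (not part of the original module) ---
# grep_subpages() only relies on an object exposing goto() and content(); a
# Playwright sync-API Page is used here as one plausible choice. CONFIG,
# SUB_ENTRIES, and the wiki URL below are illustrative assumptions inferred
# from how the arguments are read above: config_data needs a "base" key that
# is prepended to each PDF href, and list_sub_entries pairs an output folder
# name with the URL of the page to scan.
if __name__ == "__main__":
    from playwright.sync_api import sync_playwright

    CONFIG = {"base": "https://wiki.example.org"}  # assumed shape of config_data
    SUB_ENTRIES = [("meetings", "https://wiki.example.org/Meetings")]  # assumed entries

    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        grep_subpages(page, CONFIG, SUB_ENTRIES, "downloads")
        browser.close()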