# moin_downloader/grep_subpages.py

import os

from bs4 import BeautifulSoup
import requests  # type: ignore


def download_pdfs(pdf_links: list[tuple[str, str]]) -> None:
    # Download each (filename, url) pair, skipping files that already exist.
    for filename_to_save, full_url in pdf_links:
        if not os.path.isfile(filename_to_save):
            try:
                response = requests.get(full_url, stream=True)
                if response.status_code == 200:
                    with open(filename_to_save, "wb") as file:
                        for chunk in response.iter_content(chunk_size=1024):
                            if chunk:  # filter out keep-alive chunks
                                file.write(chunk)
                    print(f"Downloaded {filename_to_save} successfully.")
                else:
                    print(
                        f"Failed to download {full_url}. "
                        f"Status code: {response.status_code}"
                    )
            except Exception as e:
                print(f"An error occurred while downloading {full_url}: {e}")


def grep_subpages(
    page, config_data: dict, list_sub_entries: list[tuple[str, str]], path_name: str
) -> None:
    # For each (folder_name, url) entry: render the page, collect links whose
    # text ends in ".pdf", download them, and save the rendered HTML alongside.
    for entry in list_sub_entries:
        base = config_data["base"]
        page.goto(entry[1])
        page_content = page.content()
        # Parse the rendered HTML content using BeautifulSoup
        soup = BeautifulSoup(page_content, "html.parser")
        pdf_links: list[tuple[str, str]] = []
        for link in soup.find_all("a", href=True):
            if link.text.endswith(".pdf"):
                os.makedirs(
                    os.path.join(path_name, entry[0]), mode=0o777, exist_ok=True
                )
                filename = os.path.join(
                    path_name, entry[0], link.text.strip().replace(" ", "_")
                )
                url = base + link["href"]
                pdf_links.append((filename, url))
        if pdf_links:
            download_pdfs(pdf_links)
        os.makedirs(path_name, mode=0o777, exist_ok=True)
        with open(os.path.join(path_name, entry[0] + ".html"), "w") as f:
            f.write(page_content)
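

# --- Hypothetical usage sketch (not part of the original module) ---
# grep_subpages() only relies on an object exposing goto() and content(); a
# Playwright sync-API Page is used here as one plausible choice. CONFIG,
# SUB_ENTRIES, and the wiki URL below are illustrative assumptions inferred
# from how the arguments are read above: config_data needs a "base" key that
# is prepended to each PDF href, and list_sub_entries pairs an output folder
# name with the URL of the page to scan.
if __name__ == "__main__":
    from playwright.sync_api import sync_playwright

    CONFIG = {"base": "https://wiki.example.org"}  # assumed shape of config_data
    SUB_ENTRIES = [("meetings", "https://wiki.example.org/Meetings")]  # assumed entries

    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        grep_subpages(page, CONFIG, SUB_ENTRIES, "downloads")
        browser.close()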