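"""Scrape a list of sub-pages with a browser-automation ``page`` object (e.g. Playwright),
save each page's HTML, and download every PDF it links to using ``requests``."""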
import os

import requests  # type: ignore
from bs4 import BeautifulSoup


def download_pdfs(pdf_links: list[tuple[str, str]]) -> None:
    """Download each (filename, url) pair in pdf_links, skipping files that already exist."""
    for filename_to_save, full_url in pdf_links:
        if not os.path.isfile(filename_to_save):
            try:
                response = requests.get(full_url, stream=True)

                if response.status_code == 200:
                    # Stream the response to disk in 1 KiB chunks.
                    with open(filename_to_save, "wb") as file:
                        for chunk in response.iter_content(chunk_size=1024):
                            if chunk:  # filter out keep-alive chunks
                                file.write(chunk)
                    print(f"Downloaded {filename_to_save} successfully.")
                else:
                    print(
                        f"Failed to download {full_url}. Status code: {response.status_code}"
                    )
            except Exception as e:
                print(f"An error occurred while downloading {full_url}: {e}")


def grep_subpages(
    page, config_data: dict, list_sub_entries: list[tuple[str, str]], path_name: str
) -> None:
    """Visit each (name, url) sub-entry, save its HTML, and download any linked PDFs."""
    base = config_data["base"]  # base URL used to resolve relative PDF hrefs

    for name, url in list_sub_entries:
        page.goto(url)
        page_content = page.content()

        # Parse the rendered HTML with BeautifulSoup.
        soup = BeautifulSoup(page_content, "html.parser")

        # Collect (local filename, absolute URL) pairs for every link whose text ends in ".pdf".
        pdf_links: list[tuple[str, str]] = []
        for link in soup.find_all("a", href=True):
            if link.text.endswith(".pdf"):
                os.makedirs(
                    os.path.join(path_name, name), mode=0o777, exist_ok=True
                )
                filename = os.path.join(
                    path_name, name, link.text.strip().replace(" ", "_")
                )
                pdf_links.append((filename, base + link["href"]))

        if pdf_links:
            download_pdfs(pdf_links)

        # Save the page itself alongside the downloaded PDFs.
        os.makedirs(path_name, mode=0o777, exist_ok=True)
        with open(os.path.join(path_name, name + ".html"), "w") as f:
            f.write(page.content())
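

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original script). It assumes Playwright's
# sync API for the `page` object, plus a hypothetical config dict with a "base" URL
# and a list of (name, url) sub-entries; adapt the names and URLs to your own setup.
if __name__ == "__main__":
    from playwright.sync_api import sync_playwright  # assumed dependency

    config = {"base": "https://example.org"}  # hypothetical base URL
    sub_entries = [("reports", "https://example.org/reports")]  # hypothetical sub-page

    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        grep_subpages(page, config, sub_entries, path_name="downloads")
        browser.close()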