import os

import requests  # type: ignore
from bs4 import BeautifulSoup


def download_pdfs(pdf_links):
    """Download each (filename, url) pair, skipping files that already exist."""
    for filename_to_save, full_url in pdf_links:
        if not os.path.isfile(filename_to_save):
            try:
                response = requests.get(full_url, stream=True)
                if response.status_code == 200:
                    with open(filename_to_save, "wb") as file:
                        for chunk in response.iter_content(chunk_size=1024):
                            if chunk:  # filter out keep-alive new chunks
                                file.write(chunk)
                    print(f"Downloaded {filename_to_save} successfully.")
                else:
                    print(
                        f"Failed to download {full_url}. Status code: {response.status_code}"
                    )
            except Exception as e:
                print(f"An error occurred while downloading {full_url}: {e}")


def grep_subpages(
    page, config_data: dict, list_sub_entries: list[tuple[str, str]], path_name: str
) -> None:
    """Visit each (name, url) sub-entry, download linked PDFs, and save the page HTML.

    `page` is a browser page object (e.g. a Playwright Page) providing goto() and content().
    """
    for entry in list_sub_entries:
        base = config_data["base"]
        page.goto(entry[1])
        page_content = page.content()

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(page_content, "html.parser")

        # Collect (local filename, absolute URL) pairs for every link whose text ends in .pdf
        pdf_links: list[tuple[str, str]] = []
        for link in soup.find_all("a", href=True):
            if link.text.endswith(".pdf"):
                os.makedirs(
                    os.path.join(path_name, entry[0]), mode=0o777, exist_ok=True
                )
                filename = os.path.join(
                    path_name, entry[0], link.text.strip().replace(" ", "_")
                )
                url = base + link["href"]
                pdf_links.append((filename, url))

        if len(pdf_links) > 0:
            download_pdfs(pdf_links)

        # Save the raw HTML of the sub-page alongside the downloaded PDFs
        os.makedirs(path_name, mode=0o777, exist_ok=True)
        with open(os.path.join(path_name, entry[0] + ".html"), "w") as f:
            f.write(page.content())
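

# Minimal usage sketch (an assumption, not part of the original module): it presumes
# the `page` argument is a Playwright sync-API Page, since goto()/content() match that
# interface. The base URL, sub-entry list, and output directory below are hypothetical
# placeholders for illustration only.
if __name__ == "__main__":
    from playwright.sync_api import sync_playwright  # assumed driver for `page`

    config = {"base": "https://example.org"}  # hypothetical base URL
    sub_entries = [("reports", "https://example.org/reports")]  # hypothetical (name, url)

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        grep_subpages(page, config, sub_entries, "downloads")
        browser.close()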