44 lines
1.4 KiB
Python
44 lines
1.4 KiB
Python
from bs4 import BeautifulSoup
|
|
import os
|
|
|
|
|
|
def get_main(
    page, config_data: dict, list_entries: tuple[str, str]
) -> list[tuple[str, str]]:
    """Save a listing page to disk and extract its linked sub-entries.

    Navigates ``page`` to the URL in ``list_entries``, writes the page HTML
    to ``main.html`` inside the entry directory, then parses the table with
    class ``tb750 rw-table rw-all sections`` and collects one result per
    data row.

    Args:
        page: Object providing ``goto(url)`` and ``content()``
            (presumably a Playwright ``Page`` -- confirm against the caller).
        config_data: Configuration mapping; only ``config_data["base"]`` is
            read here, and it is prepended to every extracted ``href``.
        list_entries: ``(directory, url)`` pair: the directory in which
            ``main.html`` is stored and the URL to load.

    Returns:
        One ``(name, link)`` tuple per table row that has at least two
        ``td.tbdata`` cells. ``name`` is ``"<status>_<entry name>"``, built
        from the stripped text of the second cell and the stripped text of
        the first cell with spaces replaced by underscores. ``link`` is
        ``base`` followed by the ``href`` of the first cell's anchor.

    Raises:
        KeyError: If ``config_data`` has no ``"base"`` key.
        AssertionError: If the sections table is not found, or a data row
            has no ``<a>`` tag in its first cell.
    """
    base = config_data["base"]
    target_dir = list_entries[0]

    page.goto(list_entries[1])

    # Fetch the HTML once so the file on disk is exactly what gets parsed.
    html_content = page.content()

    os.makedirs(target_dir, mode=0o777, exist_ok=True)
    main_html_path = os.path.join(target_dir, "main.html")
    # Explicit UTF-8: the platform default codec may be unable to encode
    # characters that appear in the page.
    with open(main_html_path, "w", encoding="utf-8") as f:
        f.write(html_content)

    soup = BeautifulSoup(html_content, "html.parser")

    # Locate the sections table by its full class string.
    table = soup.find("table", {"class": "tb750 rw-table rw-all sections"})
    if table is None:
        # Raised explicitly (rather than via ``assert``) so the check also
        # runs under ``python -O``; the exception type is unchanged.
        raise AssertionError(
            'table with class "tb750 rw-table rw-all sections" not found'
        )

    # html.parser does not synthesize a <tbody>; fall back to the table
    # itself when the markup has none.
    body = table.find("tbody")
    if body is None:
        body = table

    list_sub_entries: list[tuple[str, str]] = []

    for row in body.find_all("tr"):
        cells = row.find_all("td", {"class": "tbdata"})

        # Rows without both a name cell and a status cell are skipped.
        if len(cells) < 2:
            continue

        link_tag = cells[0].find("a")
        if link_tag is None:
            raise AssertionError("data row has no <a> tag in its first cell")

        # Pull the href into a local: nesting the same quote type inside an
        # f-string is a SyntaxError before Python 3.12.
        href = link_tag.get("href")
        entry_name = cells[0].text.strip().replace(" ", "_")
        entry_status = cells[1].text.strip()

        list_sub_entries.append(
            (f"{entry_status}_{entry_name}", f"{base}{href}")
        )

    return list_sub_entries
|