Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ dependencies = [
"celery",
"redis",
"discord-py>=2.7.1",
"beautifulsoup4>=4.15.0",
]

[dependency-groups]
Expand Down
89 changes: 46 additions & 43 deletions src/ingestion/scrapers/html_scraper.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,46 @@
import httpx
import trafilatura

from src.config.logger import get_logger

log = get_logger(__name__)


def scrape(url: str) -> tuple[str, str | None]:
    """Download *url* and return its extracted text together with the page title.

    The result is ``(text, title)``. ``title`` may be ``None``. ``text`` is the
    empty string when trafilatura extracts nothing, so callers should check
    for that and skip the page.
    """
    log.info("scrape_started", url=url)
    with httpx.Client(timeout=30, follow_redirects=True) as http:
        page = http.get(url)
        page.raise_for_status()
    markup = page.text

    # Known limitations (tracked as a separate ticket):
    # - Content inside aria-hidden="true" accordions (common on FAQ pages) is missed.
    # - On some other FAQ pages the answer text is captured but the question
    #   text is not; both should be captured.
    # - Links come out markdown-style and may need cleanup downstream.
    # Improving extraction quality is a tuning ticket, not a blocker for development.
    extract_options = {
        "favor_recall": True,
        "include_links": True,
        "include_tables": True,
        "include_formatting": False,
        "include_comments": False,
        "deduplicate": True,
        "output_format": "txt",
    }
    extracted = trafilatura.extract(markup, **extract_options)
    # Normalise a falsy extraction result to an empty string.
    text = extracted if extracted else ""

    meta = trafilatura.extract_metadata(markup)
    if meta:
        title = meta.title
    else:
        title = None

    log.info("scrape_complete", url=url, chars=len(text), title=title)
    return text, title
import httpx
from bs4 import BeautifulSoup


def _extract_from_html(html: str) -> tuple[str, str | None]:
    """Strip page chrome from *html* and return ``(text, title)``.

    Processing steps:
      1. Discard everything from the ``<!-- close main-wrapper`` marker onward.
      2. Remove clutter elements, first by CSS class, then by tag name.
      3. Append ``[PDF: href]`` or ``[Link: href]`` to the label of every
         remaining link whose href is non-empty and does not start with ``#``.
      4. Collect the remaining text, one stripped string per line.

    Args:
        html: Raw HTML markup of the page.

    Returns:
        ``(text, title)``. ``title`` is the text of the first ``<title>``
        element still present after pruning, or ``None`` when there is none.
    """
    cut_marker = "<!-- close main-wrapper"
    # partition() leaves the markup untouched when the marker is absent.
    soup = BeautifulSoup(html.partition(cut_marker)[0], "html.parser")

    # Remove clutter identified by class.
    clutter_selector = (
        ".footer, .global-nav, .navigation, .topbar, .content__meta, .visuallyhidden"
    )
    for el in soup.select(clutter_selector):
        el.decompose()

    # Remove clutter identified by tag name.
    for tag in soup.find_all(["header", "footer", "nav"]):
        tag.decompose()

    # Annotate links only after the clutter is gone, so removed navigation
    # links are not processed.
    for link in soup.find_all("a"):
        href = link.get("href")
        if not href or href.startswith("#"):
            continue
        kind = "PDF" if href.lower().endswith(".pdf") else "Link"
        # ``link.string`` is None when the anchor has several children (or
        # none), which would render as the literal text "None"; get_text()
        # always yields a string.
        label = link.get_text()
        annotation = f"[{kind}: {href}]"
        link.string = f"{label} {annotation}" if label else annotation

    text = soup.get_text("\n", strip=True)

    # A page without <title> yields None instead of raising IndexError.
    title_tag = soup.find("title")
    title = title_tag.get_text() if title_tag is not None else None

    return text, title


def scrape(url: str) -> tuple[str, str | None]:
    """Download *url* and hand its markup to ``_extract_from_html``.

    Redirects are followed and the request uses a 30 second timeout.
    ``raise_for_status()`` is called on the response before extraction.

    Returns the ``(text, title)`` pair produced by ``_extract_from_html``.
    """
    with httpx.Client(timeout=30, follow_redirects=True) as http:
        reply = http.get(url)
        reply.raise_for_status()

    page_source = reply.text
    return _extract_from_html(page_source)
789 changes: 789 additions & 0 deletions tests/ingestion/fixtures/courseTest.html

Large diffs are not rendered by default.

947 changes: 947 additions & 0 deletions tests/ingestion/fixtures/faqTest.html

Large diffs are not rendered by default.

60 changes: 60 additions & 0 deletions tests/ingestion/scraper_offline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from pathlib import Path

from src.ingestion.scrapers.html_scraper import _extract_from_html

FIXTURES = Path(__file__).parent / "fixtures"

# run with python -m tests.ingestion.scraper_offline


def load_fixture(name: str) -> str:
    """Return the contents of fixture file *name* under ``FIXTURES``, decoded as UTF-8."""
    fixture_path = FIXTURES.joinpath(name)
    return fixture_path.read_text(encoding="utf-8")


def test_bcyber_page():
    """Check extraction of the course page fixture: title, body text, chrome removal, PDF links."""
    text, title = _extract_from_html(load_fixture("courseTest.html"))

    assert title == "Courses and Registration (B.Cyber.) - School of Computer Science"
    assert "Electives and Prohibited Courses" in text

    # Skip-link and navigation/search text must not appear in the extracted text.
    for chrome in ("Skip to Main Content", "Search this website"):
        assert chrome not in text

    # PDF links must be annotated with their target URL.
    expected_snippets = (
        "[PDF:",
        "[PDF: https://carleton.ca/scs/wp-content/uploads/BCyber-Course-Map-202630-3.pdf]",
        "[PDF: https://carleton.ca/scs/wp-content/uploads/FINAL2-BCyber-Course-Map-202530.pdf]",
    )
    for snippet in expected_snippets:
        assert snippet in text


def test_new_student_faq():
    """Check that the FAQ fixture keeps both questions and answers, in order."""
    text, _title = _extract_from_html(load_fixture("faqTest.html"))

    question = "How do I build a timetable?"
    answer = "Log into Carleton Central"
    assert question in text
    assert answer in text

    # The question must come before its answer in the extracted text.
    assert text.index(question) < text.index(answer)

    assert "How do I view my grades or exam schedule" in text

    # A phrase that appears in that question's answer block.
    assert "Information about how to view your grades or exam schedule" in text
    assert "Skip to Main Content" not in text


# Run the checks only when executed as a script
# (python -m tests.ingestion.scraper_offline); importing this module must not
# trigger fixture loading or assertions as a side effect.
if __name__ == "__main__":
    test_new_student_faq()
    test_bcyber_page()
24 changes: 24 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading