Mathematics is not a careful march down a well-cleared highway, but a journey into a strange wilderness, where the explorers often get lost. – W.S. Anglin
"""
web_crawler.py
Author: Máximo Núñez Alarcón
Copyright (c) 2023 Máximo Núñez Alarcón. This work is licensed under the
terms of the MIT License. See LICENSE file for details.
Date: 2023-10-01
This script implements an asynchronous web crawler designed to
(i) efficiently navigate a local web server (such as a Hugo site),
(ii) identify pages containing specific keywords in their titles or meta-descriptions,
and (iii) extract their full content in Markdown format.
Leveraging Python's `asyncio` for concurrent operations, `aiohttp` for
high-performance HTTP requests, and BeautifulSoup for HTML parsing,
this crawler is optimized for speed and resource management.
The core functionality revolves around a breadth-first search (BFS) algorithm that
systematically explores internal links, ensuring comprehensive coverage of the target
domain.
Pages deemed "relevant" are those where a given search phrase (matched
case-insensitively and ignoring whitespace differences) is found in the page's
`<title>` tag or in its `meta name="description"` / `meta property="og:description"` tags.
For each relevant page, the script extracts the title, description, and converts the entire HTML body into clean Markdown, providing a structured
output for further analysis or display.
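The string normalization itself lives in `util.normalize_string`, which is not shown
in this file. A minimal sketch of the assumed behavior (lowercasing and collapsing
whitespace) would be:
```python
import re

def normalize_string(text: str) -> str:
    # Assumed behavior: lowercase and collapse whitespace runs so that,
    # for example, "Touch  Typing" and "touch typing" compare equal.
    return re.sub(r"\s+", " ", text).strip().lower()
```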
Dependencies:
- `aiohttp`: For making asynchronous HTTP requests, enabling concurrent fetching of multiple pages.
- `BeautifulSoup` (bs4): For robust and flexible parsing of HTML and XML documents.
- `python-dotenv`: For securely loading configuration variables (like `BASE_URL` and `REQUEST_TIMEOUT`) from a `.env` file.
- `markdownify`: A utility for converting HTML content into Markdown format.
- `crawl4ai`: An asynchronous web-crawling library, used here to extract page content as Markdown.
Usage:
- Set `BASE_URL` and `REQUEST_TIMEOUT` in a `.env` file (a sample `.env` is shown below).
- Call the main asynchronous function with your desired search phrase to initiate crawling.
Example:
```python
import asyncio
from web_crawler import main # Assuming this script is named web_crawler.py
if __name__ == "__main__":
    asyncio.run(main("Your Search Phrase Here"))
```
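A sample `.env` with placeholder values (`BASE_URL` here assumes a local Hugo dev server):
```
BASE_URL=http://localhost:1313
REQUEST_TIMEOUT=15
```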
"""
import asyncio # Core library for writing concurrent code using the async/await syntax.
import sys # Provides access to system-specific parameters and functions
from urllib.parse import urljoin, urldefrag, urlparse # For URL manipulation
import aiohttp # Asynchronous HTTP client/server framework, essential for non-blocking web requests.
import os # For interacting with the operating system
from bs4 import BeautifulSoup # For parsing HTML and XML documents
from markdownify import markdownify as md # Function for converting HTML strings to Markdown strings
from crawl4ai import AsyncWebCrawler # A specialized asynchronous web crawler class from the 'crawl4ai' library.
from dotenv import load_dotenv # For loading environment variables from .env files
from util import normalize_string # Importing a function to normalize strings
async def fetch_html(session: aiohttp.ClientSession, url: str) -> str | None:
"""
Asynchronously fetches the HTML content from a given URL using an aiohttp client session.
It attempts to decode the content using the server-declared charset, falling back to
Latin-1 if decoding errors occur. Returns None on network errors, timeouts, or if
the content is not HTML.
Parameters:
- session: aiohttp.ClientSession - An active aiohttp client session, optimized for making multiple HTTP requests efficiently by reusing connections.
- url: str - the URL of the web page to fetch.
Returns:
- str | None: HTML content of the page as a string if successful or None if there was an error
"""
load_dotenv() # Load environment variables from .env file
REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", 15)) # Set request timeout, default 15 seconds
try:
# The 'async with' statement ensures the HTTP request is made asynchronously
# and the response object is properly closed, even if errors occur.
# 'session.get(url, timeout=REQUEST_TIMEOUT)' initiates the GET request.
# The 'await' keyword here pauses the execution of this 'fetch_html' coroutine
# until the HTTP response headers are received or a timeout occurs.
# While 'fetch_html' is paused, the asyncio event loop can execute other
# pending tasks, making the overall crawling process non-blocking and efficient.
async with session.get(url, timeout=REQUEST_TIMEOUT) as resp:
# Asynchronously get the URL (opens a connection and sends HTTP headers)
ctype = resp.headers.get("content-type", "") # Get the content type from headers to check if it's HTML
if resp.status != 200 or not ctype.startswith("text/html"):
return None # Return None if the HTTP status is not 200 (OK) or the content is not HTML
# 'await resp.read()' asynchronously reads the full response body into bytes.
# It yields control to the event loop, allowing other tasks
# to run while the data is being downloaded from the network.
raw = await resp.read()
# Determine the correct character encoding. Prioritize server-declared charset,
# then aiohttp's inferred encoding, finally falling back to UTF-8.
encoding = resp.charset or resp.get_encoding() or "utf-8"
try:
# Attempt to decode the raw bytes using the determined encoding and return the HTML content.
return raw.decode(encoding)
except (LookupError, UnicodeDecodeError):
# If the primary decoding fails (e.g., unknown charset or malformed bytes),
# fall back to 'latin-1' and ignore errors to prevent crashes,
# as 'latin-1' can decode any byte sequence.
return raw.decode("latin-1", errors="ignore")
except (aiohttp.ClientError, asyncio.TimeoutError):
# Catch any network-related errors (e.g., connection issues, DNS failures)
# or explicit timeouts during the request. Return None to indicate failure.
return None
def extract_links(html: str, base_url: str) -> set[str]:
"""
Parses an HTML string to extract all valid, same-domain absolute links.
It removes URL fragments (e.g., #section) and resolves relative URLs against the provided base URL.
Parameters:
- html: str - the HTML content to parse and extract links.
- base_url: str - the base URL of the page being parsed to resolve relative links and filter for same-domain links.
Returns:
- set[str]: A unique set of absolute URLs found within the HTML that belong
to the same domain as the `base_url`.
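Illustrative example (the URL is a placeholder for a local dev server):
>>> sorted(extract_links('<a href="/post/#top">Post</a>', "http://localhost:1313/"))
['http://localhost:1313/post/']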
"""
soup = BeautifulSoup(html, "html.parser") # Parse HTML with BeautifulSoup
domain = urlparse(base_url).netloc # Extract the domain from base URL
links = set() # Initialize a set to automatically handle duplicate links
for a in soup.find_all("a", href=True): # Find all anchor tags that have an 'href' attribute.
raw = a["href"] # Get the raw value of the href attribute.
# 'urldefrag' splits a URL into its base part and its fragment identifier (e.g., #section), it returns (url, fragment)
# Strip off any fragment via urldefrag
cleaned, _ = urldefrag(raw) # Remove fragments from the URL
# 'urljoin' combines a base URL with a (potentially relative) URL
# to produce or resolve to a full, absolute URL.
abs_url = urljoin(base_url, cleaned)
if urlparse(abs_url).netloc == domain:
# Keep only absolute URLs whose domain matches the base_url's domain.
links.add(abs_url)
# Add the valid, same-domain absolute URL to our set.
return links
# Return the set of unique, absolute, same-domain links.
# ---------------- Title / description / body helpers -------------------------------
def extract_metadata_and_body_html(html: str) -> tuple[str, str, str]:
"""
Extracts the page title, meta description (standard or OpenGraph) and the raw HTML content of the body tag from an HTML string.
Returns empty strings if missing.
Parameters:
- html: str - the HTML content to parse
Returns:
- tuple[str, str, str]: A tuple containing the extracted page title, description, and raw HTML content of the body tag or empty strings if not found
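Illustrative example on a tiny fragment (real pages are larger; `html.parser`
keeps the explicit `<body>` tag as written):
>>> extract_metadata_and_body_html(
...     '<title> Hi </title><meta name="description" content="Demo">'
...     '<body><p>x</p></body>'
... )
('Hi', 'Demo', '<body><p>x</p></body>')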
"""
soup = BeautifulSoup(html, "html.parser") # Parse HTML with BeautifulSoup
# Uses BeautifulSoup to pull out: title, meta description, and body
# 1. Extract the page title from the TITLE tag
# The .string gets the text content and .strip() removes leading/trailing whitespace.
title = soup.title.string.strip() if soup.title and soup.title.string else ""
# 2. Extract the meta DESCRIPTION (standard or OpenGraph)
desc = ""
meta_name = soup.find("meta", attrs={"name": "description"}) # Find the standard meta description tag.
meta_og = soup.find("meta", attrs={"property": "og:description"}) # Find the OpenGraph description tag.
if meta_name and meta_name.get("content"):
desc = meta_name["content"].strip() # Get content from standard meta description if available.
elif meta_og and meta_og.get("content"):
desc = meta_og["content"].strip() # Get content from OpenGraph meta description if available
# 3. Extract raw HTML content of the BODY tag
body_element = soup.find('body') # Find the BODY element
# Convert the BeautifulSoup Tag object to its string representation (raw HTML).
body_html_string = str(body_element) if body_element else "" # Get body HTML as a string
return title, desc, body_html_string # Return title, description, and body HTML
def phrase_in_title_or_desc(html: str, phrase: str) -> bool:
"""
Checks whether the given phrase appears in the page's title or meta description.
Both the phrase and the extracted text are normalized (case-insensitive,
whitespace-insensitive) before a simple substring test. Uses the
`extract_metadata_and_body_html` helper to obtain the title and description.
Parameters:
- html (str): The HTML content of the page to check.
- phrase (str): The target phrase to search for.
Returns:
- bool: True if the phrase is found in either the title or description, False otherwise.
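Example: assuming `normalize_string` lowercases and collapses whitespace, the
phrase "touch  typing" would match a page titled "Learn Touch Typing Fast".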
"""
# Extract title and description using the helper function.
# We only need the first two return values for this check.
title, desc, _ = extract_metadata_and_body_html(html)
# Normalize both the search phrase and the text to be searched (title/description)
# to ensure case-insensitive and whitespace-insensitive comparison.
needle = normalize_string(phrase)
return needle in normalize_string(title) or needle in normalize_string(desc) # Check for presence in title or description
# ---------------- Main crawling loop ----------------------------------------
async def crawl_website_for_relevance(base_url: str, phrase: str):
"""
Performs a breadth-first crawl of the website starting from base_url.
It identifies pages where the `phrase` appears in the title or description
and collects their metadata and raw HTML body.
Parameters:
- base_url: str - the starting URL for crawling
- phrase: str - the keyword(s) or phrase to search for in page titles and descriptions
Returns:
- list[dict]: A list of dictionaries,
where each dictionary contains the 'url', 'title', 'description', and 'body_html_string' for each relevant page found.
"""
queue: list[str] = [base_url] # Initialize the queue for BFS with the base URL
seen: set[str] = set() # A set to keep track of visited URLs to avoid infinite loops and redundant fetches
relevant_pages_info: list[dict] = [] # A list to store the extracted info about pages that match the criteria
# This creates an asynchronous HTTP client session for persistent HTTP connection.
# Using a single session for multiple requests is crucial for performance in asynchronous web crawling.
# It allows for connection pooling and reuse (connections stay open by default), reducing the overhead of
# establishing new TCP connections for every request. The 'async with' ensures
# the session is properly opened and closed.
async with aiohttp.ClientSession() as session:
while queue:
# The breadth-first crawling loop continues as long as there are URLs in the queue
url = queue.pop(0) # Get the next URL from the queue (breadth-first).
if url in seen: # Skip this URL if already visited
continue
seen.add(url) # Mark the current URL as seen or visited
print(f"Crawling: {url}") # Provide real-time feedback on the crawling process
html = await fetch_html(session, url)
# This is where the asynchronous I/O happens: the 'await' keyword pauses this
# 'crawl_website_for_relevance' coroutine while 'fetch_html(session, url)' waits
# for the network response (which can take time). During that wait, the asyncio
# event loop is free to switch to and execute other pending tasks.
# When the response arrives, execution resumes here: the bytes are read, decoded,
# and returned as an HTML string (or None on failure).
if html is None: # Skip if fetching failed (e.g., network error, non-HTML content)
continue
if phrase_in_title_or_desc(html, phrase): # Check if the extracted title or description contains the target phrase.
title, desc, body_html_string = extract_metadata_and_body_html(html) # Extract all relevant metadata and the body HTML from the fetched page.
relevant_pages_info.append({
"url": url,
"title": title,
"description": desc,
"body_html_string": body_html_string,
})
# If the page is relevant, store its details (add it to the list).
# Enqueue next links found on the current page
for link in extract_links(html, url): # Extract links from the current page
if link not in seen: # Only add unseen links to the queue
queue.append(link) # Add to the queue
return relevant_pages_info # Return the list of relevant pages
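# Design note: the queue above is a plain list, so queue.pop(0) is O(n) per pop.
# For very large sites, collections.deque with popleft() would give O(1) pops, e.g.:
#   from collections import deque
#   queue = deque([base_url]); url = queue.popleft()
# The list-based version is kept here for simplicity.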
async def main(target_query: str):
"""
Main function to initiate the crawling process.
Parameters:
- target_query: str - the phrase to search for in titles/descriptions
"""
load_dotenv() # Load environment variables from .env file
base_url = os.getenv("BASE_URL") # Get the base URL from environment
if not base_url:
raise ValueError("BASE_URL not set in .env file") # Raise error if not set
if not base_url.startswith(("http://", "https://")):
raise ValueError("BASE_URL must start with http:// or https://") # Validate URL format
print(f"Starting full website crawl from: {base_url}") # Log the starting point
# Step 1: Crawl the website to identify all relevant URLs and extract their titles/descriptions
relevant_pages_data = await crawl_website_for_relevance(base_url, target_query)
print(f"\nFound {len(relevant_pages_data)} relevant pages for '{target_query}'") # Log the count of relevant pages
# Step 2: Get full Markdown content for each relevant page using AsyncWebCrawler
async with AsyncWebCrawler() as crawler:
# AsyncWebCrawler is the core class for asynchronous web crawling in Crawl4AI.
# It may set up its own HTTP session pool or other resources,
# ensuring efficient management of network connections during crawling.
# Initialize the web crawler.
for i, page_info in enumerate(relevant_pages_data): # Iterate over relevant pages
url = page_info["url"] # Extract URL
title = page_info["title"] # Extract title
description = page_info["description"] # Extract description
content = page_info["body_html_string"] # Extract body HTML content
print(f"\n{'='*90}") # Print separator
print(f"## MATCH: {url}") # Log the matched URL
print("="*90)
print(f"**Title:** {title or '[none]'}\n") # Log the title
print(f"**Description:** {description or '[none]'}\n") # Log the description
# print(f"**Content:** {content or '[none]'}\n") # Log the body content
try:
# Use AsyncWebCrawler to get the markdown content.
# await crawler.arun(...) fetches the page (again), extracts clean Markdown, and returns it in result.markdown.
result = await crawler.arun(
url=url,
extraction_strategy="markdown" # Specify extraction strategy as markdown
)
print(result.markdown) # Print the extracted markdown
except Exception as e:
print(f"Error crawling {url} for markdown: {str(e)}") # Log any errors
print("="*90 + "\n") # Print separator
asyncio.run() creates a fresh asyncio event loop (a new instance of Python's event loop, the object that manages and schedules the execution of asynchronous code in a single thread), sets it as the current event loop for the thread, runs the top-level main(...) coroutine to completion, and then closes and cleans up the loop, whether the coroutine finishes normally or raises an exception. If you press Ctrl+C, you get a clean shutdown (handled below).
if __name__ == "__main__": # Entry point for the script
try:
# 'asyncio.run()' is the function that starts the asyncio event loop.
# It takes the top-level asynchronous function ('main' with the search phrase in this case)
# and runs it until it completes. This is the bridge from synchronous
# execution to the asynchronous world.
asyncio.run(main("How to learn to type fast"))
except KeyboardInterrupt:
# This block catches a KeyboardInterrupt (typically Ctrl+C), allowing
# the program to exit gracefully instead of crashing.
sys.exit() # Exit the program cleanly