
Automated Hugo Page Creation with Ollama and Python

“Logic will get you from A to B. Imagination will take you everywhere,” Albert Einstein.


While GUIs are user-friendly and ubiquitous, a well-designed CLI often lets you automate complex workflows and enhance productivity. By combining Ollama’s local LLMs with Python utilities like Trafilatura and BeautifulSoup, we can check a webpage’s accessibility (including all of its links and images), scrape its title and main content, and generate Hugo frontmatter and fresh content, all from the command line.

The driver script below ties the workflow together. It relies on a few helper modules (utilollama, util, mymessages, and queryweb) whose functions are imported up front.

from utilollama import create_content  # Importing the create_content function from utilollama
from trafilatura.settings import use_config  # For configuring trafilatura
from colorama import Fore  # For colored terminal output
from util import display_text_color, call_ollama, display_alarm_color  # Utility functions for colored output, alarms, and Ollama calls
import mymessages  # Custom system prompts for the chat system
from queryweb import my_duckduckgo_search, scrape_web_title, scrape_web_summarize, searx_search_fallback  # Web search and scraping helpers
from urllib.parse import urljoin, urlparse

def initialize():
    """Initialize the environment and load necessary configurations.

    This function loads environment variables from a `.env` file located in
    the same directory as the script. It retrieves the model frontmatter and
    model name for content generation.

    Returns:
        tuple: A tuple containing:
            - str: The model frontmatter.
            - str: The model name.
    """
    import os
    from dotenv import load_dotenv

    # Get the directory of the current script
    script_dir = os.path.dirname(os.path.abspath(__file__))
    # Construct the path to the .env file
    dotenv_path = os.path.join(script_dir, '.env')

    # Load environment variables from the .env file
    load_dotenv(dotenv_path, override=True)

    # Retrieve model from environment variables to generate frontmatter
    model_frontmatter = os.getenv("MYMODEL_FRONTMATTER", "")
    # Default model from environment variables to use for content generation
    model = os.getenv("MODEL", "")

    return model_frontmatter, model
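
# For reference, a matching .env file might look like the lines below; the
# model names are illustrative placeholders, not the author's actual
# configuration:
#
#   MYMODEL_FRONTMATTER=llama3.2
#   MODEL=llama3.2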

def check_webpage(url, check_external=False):
    """
    Check the accessibility of a given URL, including its links and images.

    This function verifies if the specified URL is accessible, checks all
    internal links (and external links if specified), and ensures that all
    images on the page are accessible.

    Args:
        url (str): The URL to check.
        check_external (bool): If True, checks all links, including external ones. Defaults to False.

    Returns:
        str: A report summarizing the accessibility of the URL, its links and images.
    """

    import requests
    from requests.adapters import HTTPAdapter
    from urllib3.util.retry import Retry
    from bs4 import BeautifulSoup
    from urllib.parse import urljoin, urlparse

    # Initialize a session with retry logic
    s = requests.Session()
    # Configure retries for HTTP requests
    # Retry on certain status codes and with exponential backoff
    retries = Retry(total=3, backoff_factor=1,
                    status_forcelist=[500, 502, 503, 504])
    # Mount the session to use the retry strategy for both HTTP and HTTPS
    s.mount('http://', HTTPAdapter(max_retries=retries))
    s.mount('https://', HTTPAdapter(max_retries=retries))
    # Step 1: Check if the URL is accessible
    try:
        # Send a GET request (a HEAD response carries no body, and the HTML
        # is needed below to check the page's links and images)
        response = s.get(url, timeout=10)
        # If the response status code is not 200, return an error message
        if response.status_code != 200:
            return f"URL {url} is not accessible. Status code: {response.status_code}"
    except requests.exceptions.RequestException as e:
        # If there is an exception (e.g., network error, timeout), return an error message
        return f"Error accessing {url}: {e}"

    # Step 2: Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')

    # Step 3: Check links
    # Initialize a list to store broken links
    broken_links = []
    # Find all anchor tags with href attributes
    # and check if they are accessible
    for a_tag in soup.find_all('a', href=True):
        # Get the href attribute of the anchor tag
        link = a_tag['href']
        # Convert relative links to absolute links
        # using urljoin to handle relative URLs correctly
        absolute_link = urljoin(url, link)
        # Parse the absolute link to check its scheme and netloc
        parsed = urlparse(absolute_link)
        # Only check HTTP/HTTPS links; skip external links (a different
        # netloc than the page's) unless check_external is True
        if parsed.scheme in ('http', 'https') and (check_external or parsed.netloc == urlparse(url).netloc):
            try:
                # Send a HEAD request to the absolute link, following
                # redirects so that 301/302 responses are not flagged as broken
                link_response = s.head(absolute_link, timeout=5, allow_redirects=True)
                # Any non-200 final status counts as broken
                if link_response.status_code != 200:
                    broken_links.append(absolute_link)
            except requests.exceptions.RequestException:
                # If there is an exception (e.g., network error, timeout), add the link to the broken links list
                broken_links.append(absolute_link)

    # Step 4: Check images
    # Initialize a list to store broken images
    broken_images = []
    # Find all image tags with src attributes and check if they are accessible
    for img_tag in soup.find_all('img', src=True):
        # Get the src attribute of the image tag
        img_src = img_tag['src']
        # Convert relative image URLs to absolute ones based on the page URL
        absolute_img = urljoin(url, img_src)
        parsed_img = urlparse(absolute_img)
        # Check if the image is HTTP/HTTPS
        if parsed_img.scheme in ('http', 'https'):
            try:
                # Send a HEAD request to the absolute image URL, following
                # redirects just as with links
                img_response = s.head(absolute_img, timeout=5, allow_redirects=True)
                # Any non-200 final status means the image is not accessible
                if img_response.status_code != 200:
                    broken_images.append(absolute_img)
            except requests.exceptions.RequestException:
                # If there is an exception (e.g., network error, timeout), add the image to the broken images list
                broken_images.append(absolute_img)

    # Step 5: Generate report
    report = f"URL {url} is accessible.\n"
    # Report the status of links and images
    if broken_links:
        # If there are broken links, add them to the report
        report += f"Broken links found: {', '.join(broken_links)}\n"
    else:
        # If no broken links, indicate that all links are accessible
        report += "All checked links are accessible.\n"
    if broken_images:
        # If there are broken images, add them to the report
        report += f"Broken images found: {', '.join(broken_images)}\n"
    else:
        # If no broken images, indicate that all images are accessible
        report += "All images are accessible.\n"
    # If no broken links or images, indicate that the webpage is accessible
    if not broken_links and not broken_images:
        report += "The webpage is accessible and has no broken links or images.\n"
    else:
        report += "The webpage has broken links or images.\n"
    return report
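
# Example usage (illustrative; any reachable URL will do):
#
#   report = check_webpage("https://example.com", check_external=False)
#   print(report)
#   # URL https://example.com is accessible.
#   # All checked links are accessible.
#   # All images are accessible.
#   # The webpage is accessible and has no broken links or images.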

def main(new_page):
    """
    Create content for a new page by checking its accessibility and scraping its content.

    This function verifies the accessibility of the specified webpage, checks all links
    and images, and scrapes the content to create frontmatter using a specified model.
    It generates new content based on the scraped title and content.

    Args:
        new_page (str): The URL of the webpage to create content for.

    Returns:
        None: The function does not return any value but performs actions based on the webpage status.
    """

    # Load environment variables and initialize configurations
    model_frontmatter, model = initialize() # Initialize the model frontmatter and model name
    report = check_webpage(new_page, check_external=True)  # Check the webpage for accessibility and broken links/images

    if "not accessible" in report:
        display_alarm_color("The webpage is not accessible. Exiting content creation.", Fore.RED)
        return  # Exit if the webpage is not accessible
    elif "Broken links found" in report or "Broken images found" in report:
        display_alarm_color("The webpage has broken links or images.", Fore.YELLOW)
        display_text_color(report, Fore.YELLOW)  # Display the report in yellow if there are broken links or images
    else:
        # If the webpage is accessible and has no broken links or images, proceed with content creation
        display_text_color("Webpage is accessible. Proceeding with content creation.", Fore.BLACK)
        # Display the report in the terminal
        display_text_color(report, Fore.BLACK)

        display_text_color("Starting content creation...", Fore.BLACK)  # Display a message indicating the start of content creation
        display_text_color(f"Creating content: {new_page}.", Fore.BLACK)
        display_text_color(f"Scraping content from: {new_page}. Using model: {model_frontmatter}", Fore.BLACK)

        # Scrape the page title and content
        title = scrape_web_title(new_page)
        content = None  # Initialize content variable
        if title:
            display_text_color(f"Page title: {title}", Fore.BLACK)  # Display the scraped title
            content = scrape_web_summarize(new_page, model_frontmatter)  # Summarize the page content using the frontmatter model
            if content:
                display_text_color("Content scraped successfully.", Fore.BLACK)  # Display a message indicating successful content scraping
                display_text_color("Creating frontmatter...", Fore.BLACK)
                call_ollama(content=content, system_prompt=mymessages.query_frontmatter, model_name=model, role="user", temperature=0.7, max_tokens=20000)  # Ask the model to generate the page's frontmatter
                display_text_color("Frontmatter created successfully.", Fore.BLACK)
                display_text_color("Reporting the validity of the created content.", Fore.BLACK)
                # Call the Ollama model to check the proof of the content
                call_ollama(content=content, system_prompt=mymessages.check_proof, model_name=model, role="user", temperature=0.7, max_tokens=20000)
                display_text_color("Content validity checked successfully.", Fore.BLACK)
                display_text_color("Creating content...", Fore.BLACK)  # Display a message indicating the start of content creation
                create_content(title)  # Call the create_content function to create content for the page
                display_text_color("Content creation completed successfully.", Fore.BLACK)
                # Display a message indicating successful content creation
                display_text_color(f"Content created for: {title}", Fore.BLACK)
            else:
                # If no content was scraped, display a message and the debugging context
                display_text_color("No content found for the page.", Fore.BLACK)
                display_alarm_color(f"{content}, {new_page}, {model_frontmatter}", Fore.BLACK)

        else:
            # If no title was found, display a message
            display_text_color("No title found for the page.", Fore.BLACK)

if __name__ == "__main__":
    new_page = "http://192.168.1.36:1313/code/ollama/"
    main(new_page)
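
The script leans on helper modules (util, queryweb) whose internals are not shown above. As a rough guide, here is a minimal sketch of what call_ollama and scrape_web_title might look like, assuming the official ollama-python and trafilatura packages; it is an illustrative reconstruction under those assumptions, not the actual implementation.

import ollama
import trafilatura

def call_ollama(content, system_prompt, model_name, role="user",
                temperature=0.7, max_tokens=20000):
    """Send content to a local Ollama model, print and return its reply."""
    # Assumed helper: mirrors the call signature used in main() above
    response = ollama.chat(
        model=model_name,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": role, "content": content},
        ],
        # num_predict caps the number of tokens the model may generate
        options={"temperature": temperature, "num_predict": max_tokens},
    )
    text = response["message"]["content"]
    print(text)
    return text

def scrape_web_title(url):
    """Fetch a page with trafilatura and return its title, or None on failure."""
    # Assumed helper: the real queryweb module may add fallbacks and retries
    downloaded = trafilatura.fetch_url(url)
    if downloaded is None:
        return None
    metadata = trafilatura.extract_metadata(downloaded)
    return metadata.title if metadata else None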