> "Logic will get you from A to B. Imagination will take you everywhere." — Albert Einstein

While GUIs are user friendly and ubiquitous, a well-designed CLI often lets you automate complex workflows and enhance productivity. By combining Ollama’s local LLMs with Python utilities like Trafilatura and BeautifulSoup, we can build a command-line tool that checks a webpage, scrapes its content, and generates new content automatically.
from utilollama import create_content # Importing the create_content function from utilollama
from trafilatura.settings import use_config # For configuring trafilatura
from colorama import Fore # For colored terminal output
from util import display_text_color, call_ollama, display_alarm_color # Importing utility function for colored text display
import mymessages # Importing custom messages for the chat system
from queryweb import my_duckduckgo_search, scrape_web_title, scrape_web_summarize, searx_search_fallback # Importing the web search function from queryweb module
from urllib.parse import urljoin, urlparse
def initialize():
    """Load the .env file next to this script and read model settings.

    Environment variables are loaded from a `.env` file located in the
    same directory as this script (independent of the current working
    directory), overriding any pre-existing values.

    Returns:
        tuple: A tuple containing:
            - str: The model frontmatter (``MYMODEL_FRONTMATTER``, "" if unset).
            - str: The model name (``MODEL``, "" if unset).
    """
    import os
    from dotenv import load_dotenv

    # Resolve the .env file relative to this script so the function works
    # no matter where it is invoked from.
    env_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '.env')
    load_dotenv(env_path, override=True)
    # Fall back to empty strings when the variables are not defined.
    return os.getenv("MYMODEL_FRONTMATTER", ""), os.getenv("MODEL", "")
def check_webpage(url, check_external=False):
    """
    Check the accessibility of a given URL, including its links and images.

    This function verifies if the specified URL is accessible, checks all
    internal links (and external links if specified), and ensures that all
    images on the page are accessible.

    Args:
        url (str): The URL to check.
        check_external (bool): If True, checks all links, including external
            ones. Defaults to False.

    Returns:
        str: A report summarizing the accessibility of the URL, its links
        and images.
    """
    import requests
    from requests.adapters import HTTPAdapter
    # Import Retry from urllib3 directly: the requests.packages.* path is a
    # deprecated compatibility alias.
    from urllib3.util.retry import Retry
    from bs4 import BeautifulSoup
    from urllib.parse import urljoin, urlparse

    # Session with retry logic: retry transient server errors with
    # exponential backoff, for both HTTP and HTTPS.
    s = requests.Session()
    retries = Retry(total=3, backoff_factor=1,
                    status_forcelist=[500, 502, 503, 504])
    s.mount('http://', HTTPAdapter(max_retries=retries))
    s.mount('https://', HTTPAdapter(max_retries=retries))

    # Step 1: Fetch the page. A GET (not HEAD) request is required because
    # the response body is parsed below -- HEAD responses carry no body, so
    # the link/image checks would silently operate on an empty document.
    try:
        response = s.get(url, timeout=10)
        if response.status_code != 200:
            return f"URL {url} is not accessible. Status code: {response.status_code}"
    except requests.exceptions.RequestException as e:
        # Network error, DNS failure, timeout, etc.
        return f"Error accessing {url}: {e}"

    # Step 2: Parse the HTML content of the page
    soup = BeautifulSoup(response.text, 'html.parser')
    page_netloc = urlparse(url).netloc

    def _is_broken(target):
        """Return True when a HEAD probe of *target* fails or yields >= 400."""
        try:
            # allow_redirects=True is essential: Session.head defaults to
            # False, which would misreport 301/302 resources as broken.
            probe = s.head(target, timeout=5, allow_redirects=True)
            return probe.status_code >= 400
        except requests.exceptions.RequestException:
            # Unreachable targets count as broken.
            return True

    # Step 3: Check links. Deduplicate so repeated links are probed once;
    # insertion order is preserved for the report.
    broken_links = []
    seen_links = set()
    for a_tag in soup.find_all('a', href=True):
        # urljoin converts relative hrefs to absolute URLs.
        absolute_link = urljoin(url, a_tag['href'])
        if absolute_link in seen_links:
            continue
        seen_links.add(absolute_link)
        parsed = urlparse(absolute_link)
        # Only probe HTTP(S) links; restrict to same-host links unless the
        # caller asked to check external ones too.
        if parsed.scheme in ('http', 'https') and (check_external or parsed.netloc == page_netloc):
            if _is_broken(absolute_link):
                broken_links.append(absolute_link)

    # Step 4: Check images, with the same dedupe-and-probe strategy.
    broken_images = []
    seen_images = set()
    for img_tag in soup.find_all('img', src=True):
        absolute_img = urljoin(url, img_tag['src'])
        if absolute_img in seen_images:
            continue
        seen_images.add(absolute_img)
        if urlparse(absolute_img).scheme in ('http', 'https'):
            if _is_broken(absolute_img):
                broken_images.append(absolute_img)

    # Step 5: Generate report
    report = f"URL {url} is accessible.\n"
    if broken_links:
        report += f"Broken links found: {', '.join(broken_links)}\n"
    else:
        report += "All checked links are accessible.\n"
    if broken_images:
        report += f"Broken images found: {', '.join(broken_images)}\n"
    else:
        report += "All images are accessible.\n"
    if not broken_links and not broken_images:
        report += "The webpage is accessible and has no broken links or images.\n"
    else:
        report += "The webpage has broken links or images.\n"
    return report
def main(new_page):
    """
    Create content for a new page by checking its accessibility and scraping its content.

    This function verifies the accessibility of the specified webpage, checks all
    links and images, and scrapes the content to create frontmatter using a
    specified model. It generates new content based on the scraped title and
    content. Content creation only proceeds when the page is fully accessible
    with no broken links or images.

    Args:
        new_page (str): The URL of the webpage to create content for.

    Returns:
        None: The function does not return any value but performs actions
        based on the webpage status.
    """
    # Load environment variables and resolve the two model names.
    model_frontmatter, model = initialize()
    # Full accessibility report, including external links and images.
    report = check_webpage(new_page, check_external=True)
    if "not accessible" in report:
        display_alarm_color("The webpage is not accessible. Exiting content creation.", Fore.RED)
        return  # Exit if the webpage is not accessible
    elif "Broken links found" in report or "Broken images found" in report:
        # Warn and show the report, but do not create content for a page
        # with broken links or images.
        display_alarm_color("The webpage has broken links or images.", Fore.YELLOW)
        display_text_color(report, Fore.YELLOW)
    else:
        # All clear: proceed with content creation.
        display_text_color("Webpage is accessible. Proceeding with content creation.", Fore.BLACK)
        display_text_color(report, Fore.BLACK)
        display_text_color("Starting content creation...", Fore.BLACK)
        display_text_color(f"Creating content: {new_page}.", Fore.BLACK)
        display_text_color(f"Scraping content from: {new_page}. Using model: {model_frontmatter}", Fore.BLACK)
        # Scrape the page title and content.
        title = scrape_web_title(new_page)
        content = None
        if title:
            display_text_color(f"Page title: {title}", Fore.BLACK)
            # Summarize the page content with the frontmatter model.
            content = scrape_web_summarize(new_page, model_frontmatter)
            if content:
                display_text_color("Content scraped successfully.", Fore.BLACK)
                display_text_color("Creating frontmatter...", Fore.BLACK)
                # Generate frontmatter from the scraped content.
                call_ollama(content=content, system_prompt=mymessages.query_frontmatter, model_name=model, role="user", temperature=0.7, max_tokens=20000)
                # Explicit color added for consistency with the other calls.
                display_text_color("Frontmatter created successfully.", Fore.BLACK)
                display_text_color("Reporting the validity of the created content.", Fore.BLACK)
                # Ask the model to proof-check the scraped content.
                call_ollama(content=content, system_prompt=mymessages.check_proof, model_name=model, role="user", temperature=0.7, max_tokens=20000)
                display_text_color("Content validity checked successfully.", Fore.BLACK)
                display_text_color("Creating content...", Fore.BLACK)
                # Generate the new page content from the scraped title.
                create_content(title)
                display_text_color("Content creation completed successfully.", Fore.BLACK)
                display_text_color(f"Content created for: {title}", Fore.BLACK)
            else:
                # No content was scraped; report the inputs for debugging.
                # Explicit color added for consistency with the other calls.
                display_text_color("No content found for the page.", Fore.BLACK)
                # NOTE(review): Fore.BLACK looks odd for an alarm — confirm intent.
                display_alarm_color(f"{content},{new_page}, {model_frontmatter}", Fore.BLACK)
        else:
            # No title could be scraped from the page.
            display_text_color("No title found for the page.", Fore.BLACK)
if __name__ == "__main__":
    # Entry point: create content for a hard-coded local development page.
    target_url = "http://192.168.1.36:1313/code/ollama/"
    main(target_url)