Behind this mask there is more than just flesh. Beneath this mask there is an idea… and ideas are bulletproof. (Alan Moore, V for Vendetta)

In modern development workflows, augmenting traditional web scraping with generative AI can dramatically improve query relevance and content understanding. In this guide, we will refine search queries with a local LLM, route DuckDuckGo searches through Tor, scrape and summarize each result with Ollama, and fall back to DuckDuckGo's HTML frontend and a self-hosted SearX instance when we get blocked.
# vim queryweb.py
import socket # For checking if Tor is running
import subprocess # For executing shell commands
import requests # For making HTTP requests
from requests.exceptions import ConnectionError, RequestException # Needed by the scraping helpers below
from bs4 import BeautifulSoup # Import BeautifulSoup for parsing HTML and XML documents
from trafilatura.settings import use_config # Importing trafilatura settings for better HTML parsing
import trafilatura # Import trafilatura for web crawling and content extraction
import asyncio # Import asyncio for asynchronous programming
import mymessages # Import custom messages module for predefined message templates
from duckduckgo_search import DDGS # Import DDGS for performing DuckDuckGo searches
import re # Importing regex for pattern matching
import os # For environment variable management
import time # For sleep and backoff functionality
import random # For picking a random User-Agent
from colorama import Fore, Style # Import Fore and Style for colored terminal text output
from util import display_text_color, call_ollama # Importing utility functions for colored text display and Ollama calls
from myscrape import scrape_web_content # Auxiliary web-content scraper
from utilollama import summarize # Ollama-based summarizer
from dotenv import load_dotenv # For loading environment variables from .env files
import stem
from stem import Signal
from stem.control import Controller
script_dir = os.path.dirname(os.path.abspath(__file__))
dotenv_path = os.path.join(script_dir, '.env')
# Load environment variables from .env file
load_dotenv(dotenv_path, override=True)
# Get the model name used for query generation from environment variables
model_query = os.environ.get("MYMODEL_QUERY", "") # Ollama model used to generate the query
max_results = int(os.environ.get("MAX_RESULTS", "2")) # Maximum results from environment
max_retries = int(os.environ.get("MAX_RETRIES", "4")) # Maximum retries from environment
SEARX_URL = os.environ.get("SEARX_URL", "http://127.0.0.1:8080/search") # SearX URL from environment
# TOR_CONTROL_PASS: This variable holds the password required to authenticate connections to the Tor control port.
TOR_CONTROL_PASS = os.environ.get("TOR_CONTROL_PASS", "YOUR_PASSWORD") # Tor control password
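# A matching .env file might look like this (all values are placeholders):
#   MYMODEL_QUERY=qwen3:8b
#   MAX_RESULTS=2
#   MAX_RETRIES=4
#   SEARX_URL=http://127.0.0.1:8080/search
#   TOR_CONTROL_PASS=YOUR-PASSWORD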
def query_generator(model_name: str, messages: list[dict]) -> str:
"""
    Use the model to generate a DuckDuckGo search query based on the last user message.
Args:
model_name: Ollama model name (e.g. 'deepseek-r1:8b')
messages: History of chat messages (each a dict with 'role' & 'content')
Returns:
The text of a DuckDuckGo query.
"""
print(f"Generating query using model: {model_name}") # Debug print statement
# Check if messages list is not empty and ensures that the last message contains a 'content' field.
# If not, it raises a ValueError.
if not messages or 'content' not in messages[-1]:
raise ValueError("`messages` must be a non-empty list of dicts with a 'content' field")
# It retrieves the content of the last user message, which will be used to generate a search query.
user_prompt = messages[-1]['content']
# An instruction string is constructed that tells the model to create a DuckDuckGo query based on the user's prompt.
# This is crafted to specify the expected format of the output. This clarity improves the chances of receiving a useful response.
instruction = (
"/no_think\n"
"You are a DuckDuckGo query generator. \n"
"Input: free-form user text. \n"
"Output: only the best DuckDuckGo search string.\n"
f"User prompt: {user_prompt}\n"
"Search query:"
)
try:
# Call the Ollama API to get the response based on the instruction
raw = call_ollama(instruction, mymessages.query_msg, model_name)
# Debug print statement to confirm response receipt
print("query_generator. Response received from model.")
except Exception as e:
# Handle any errors gracefully
raise RuntimeError(f"Ollama chat failed: {e}") # Raise an error with a message if the chat fails
# Check if the response is a string; if not, fallback to the original user prompt
if not isinstance(raw, str):
return user_prompt
# Strip out any <think>…</think> blocks from the response
    cleaned = re.sub(r'<think>.*?</think>', '', raw, flags=re.DOTALL).strip()
# Split the cleaned response into lines and filter out empty lines
lines = [ln.strip() for ln in cleaned.splitlines() if ln.strip()]
# If there are no valid lines, fallback to user prompt
if not lines:
return user_prompt
# Get the last non-empty line as the candidate query
candidate = lines[-1]
# Remove optional leading "QUERY:" or surrounding quotes from the candidate query
query = re.sub(
r'^(?:QUERY:)?\s*["“]?(.+?)["”]?$',
r'\1',
candidate,
flags=re.IGNORECASE
).strip()
# Display the response from the model
display_text_color(f"Response from model: {query}", Fore.RED)
# Final fallback if query is empty
return query or user_prompt
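# Walking a hypothetical model reply through the clean-up above:
#   raw       : '<think>pondering...</think>\nQUERY: "lightweight linux distros"'
#   cleaned   : 'QUERY: "lightweight linux distros"'   (the <think> block is stripped)
#   candidate : the last non-empty line, the same string here
#   query     : 'lightweight linux distros'            (prefix and quotes removed)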
def ai_web_search(query: str, model_name: str = "deepseek-r1:8b") -> str:
"""
    Generate a refined search query from the user's input and the conversation history, using the specified AI model to improve the search terms.
Args:
query (str): The search query to use
model_name (str): Name of the model to use for generating the query
Returns:
        str: The refined search query, ready for use in further web searches.
"""
# Initialize conversation with system and user prompts
messages = [
mymessages.assistant_msg, # System prompt from mymessages module
mymessages.myuser_msg, # User prompt from mymessages module
]
# Append the user's query to the conversation
messages.append({"role": "user", "content": query})
# It calls query_generator to generate a refined search query based on the conversation history. This utilizes the specified AI model.
query = query_generator(model_name, messages)
# Debug print statement to show the refined query
print(f"Refined search query: {query}")
# Return the final refined search query
return query
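Assuming an Ollama server is running locally with the model already pulled, the refinement step can be exercised on its own (the prompt below is just an illustration):
refined = ai_web_search("which linux distro runs best on an old laptop", model_name="qwen3:8b")
print(refined)  # a compact DuckDuckGo search string instead of the full question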
We route our DuckDuckGo calls through Tor via DDGS for anonymity. Each result URL is handed to scrape_web_content, which fetches the page and falls back through several extraction engines; the extracted text is then summarized with Ollama (except on the trafilatura path).
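scrape_web_content lives in myscrape.py, which is not reproduced here. As a rough sketch of the idea, assuming trafilatura is tried first with a plain requests + BeautifulSoup fallback (the real module may order its engines differently):
import requests
import trafilatura
from bs4 import BeautifulSoup

def scrape_web_content_sketch(url: str):
    """Hypothetical fallback chain: trafilatura first, then BeautifulSoup text."""
    downloaded = trafilatura.fetch_url(url)
    if downloaded:
        text = trafilatura.extract(downloaded)
        if text:
            return text
    # Fallback: plain HTTP fetch plus BeautifulSoup text extraction
    try:
        resp = requests.get(url, timeout=15)
        resp.raise_for_status()
    except requests.RequestException:
        return None
    soup = BeautifulSoup(resp.text, "html.parser")
    return soup.get_text(" ", strip=True)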
When you route DuckDuckGo searches through Tor, you often hit 403 rate-limits on their HTML endpoints. This happens because Tor exit nodes are shared and frequently blacklisted. In this article, we'll rotate Tor circuits with stem's NEWNYM signal, randomize the User-Agent on every attempt, retry with exponential backoff, and fall back to DuckDuckGo's HTML frontend and a self-hosted SearX instance.
Prerequisites: pip install duckduckgo_search stem requests beautifulsoup4. Also install one of the SOCKS-capable backends that requests can use: pip install requests[socks] or, equivalently, pip install PySocks.
Your Tor Browser installation ships with its own torrc file under the "Data" directory (e.g., C:\Users\Owner\Desktop\Tor Browser\Browser\TorBrowser\Data\Tor\torrc). Open it in a text editor and add lines like these to enable the control port:
ControlPort 9051 # Tells Tor to listen on localhost TCP port 9051 for control commands (e.g. NEWNYM).
HashedControlPassword 16:ABCD1234... # replace with your actual hashed password
# Instead of storing your cleartext password, you supply the hash.
# Generate it by running...
.\tor --hash-password "YOUR-PASSWORD"
# Run this command from a shell with Tor in your PATH
# Or from C:\Users\Owner\Desktop\Tor Browser\Browser\TorBrowser\Tor\
After saving, restart Tor Browser so it picks up the new torrc.
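After the restart, a quick sanity check (using the cleartext password you hashed above) confirms the control port is reachable:
from stem.control import Controller

with Controller.from_port(port=9051) as controller:
    controller.authenticate(password="YOUR-PASSWORD")  # cleartext password, not the hash
    print("Control port OK; Tor version:", controller.get_version())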
Use stem to signal a new identity before each attempt:
from stem import Signal
from stem.control import Controller
import random
import requests
from bs4 import BeautifulSoup
# Path to the Tor executable
# The r prefix indicates a raw string, which is useful for Windows paths to avoid issues with backslashes.
TOR_EXE_PATH = r"C:\Users\Owner\Desktop\Tor Browser\Browser\TorBrowser\Tor\tor.exe"
# The localhost address where the SOCKS proxy listens
TOR_SOCKS_HOST = "127.0.0.1"
# The port number for the Tor SOCKS proxy, which is 9050 by default.
TOR_SOCKS_PORT = 9050
# TOR_SOCKS: This variable defines the URL for the SOCKS proxy provided by Tor:
# 1. socks5h indicates that the connection will use the SOCKS5 protocol with hostname resolution through the proxy.
# 2. 127.0.0.1 refers to the localhost, meaning the proxy is running on the same machine as the application.
# 3. 9050 is the default port for the SOCKS proxy in Tor.
# This is where applications can connect to send their traffic through the Tor network, ensuring anonymity.
TOR_SOCKS = "socks5h://127.0.0.1:9050"
# This variable specifies the port used to communicate with the Tor control interface.
# Default Port: 9051 is the standard control port for Tor.
# This port allows clients to send commands to the Tor process, such as requesting a new circuit (via the NEWNYM command) or checking the status of the Tor connection.
TOR_CONTROL_PORT = 9051
# Create a small list of common browser UAs and pick one at random each try:
USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:127.0) Gecko/20100101 Firefox/127.0",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:127.0) Gecko/20100101 Firefox/127.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 Safari/605.1.15"
]
def renew_tor_identity() -> bool:
    """Signal Tor to build a new circuit (NEWNYM), which yields a new exit IP."""
    print("Renewing Tor identity...") # Debug print statement
# Attempt to connect to the Tor control port and request a new identity
# This is done to change the exit node and thus the IP address used for requests.
# The function will retry once if the initial connection fails, allowing for Tor to be started if it wasn't running
attempt = 0
max_attempts = 2
# Loop to handle connection attempts
# It will try to connect to the Tor control port and signal for a new identity.
while attempt < max_attempts:
print(f"Attempt {attempt + 1} to renew Tor identity...")
# Use the stem library to connect to the Tor control port
try:
with Controller.from_port(port=TOR_CONTROL_PORT) as controller:
# Attempt to connect to the Tor control port
# If successful, authenticate with the control password and signal for a new identity
print("Connected to Tor control port.")
# Authenticate with the Tor control port using the predefined password
# This is necessary to send commands like NEWNYM
# The password is set in the torrc file
controller.authenticate(password=TOR_CONTROL_PASS)
controller.signal(Signal.NEWNYM)
print("Requested new Tor identity.")
# Wait for a short period to allow Tor to establish a new circuit
time.sleep(controller.get_newnym_wait())
return True
except stem.SocketError as e:
print(f"Could not connect to Tor on port {TOR_CONTROL_PORT}: {e}")
if attempt == 0:
print("Trying to launch Tor...")
# If the first attempt fails, try to start Tor
# This is useful if Tor wasn't running when the script started
start_tor()
attempt += 1
continue
else:
# If the second attempt also fails, print an error message and return False
print("Failed to launch Tor after retry.")
return False
except stem.connection.AuthenticationFailure as e:
# Handle authentication failure if the control password is incorrect
# This can happen if the password in the script does not match the one set in the torrc file
print(f"Tor authentication failed: {e}")
return False
        except Exception as e:
            # Catch any other unexpected exceptions and print an error message
            print(f"Unexpected error: {e}")
            return False
def random_ua() -> str:
"""Return a random User-Agent header from the predefined list."""
return random.choice(USER_AGENTS)
def tor_is_listening(host: str = TOR_SOCKS_HOST, port: int = TOR_SOCKS_PORT) -> bool:
    """Return True if something is already listening on the Tor SOCKS port.
    Args:
        host, port: Address and port of the Tor SOCKS proxy to check.
    Returns:
        True if a TCP connection succeeds, indicating that Tor is running;
        False if an exception occurs (such as a timeout or connection refusal).
    """
try:
# Attempt to create a connection to the specified Tor socks host and port
with socket.create_connection((host, port), timeout=2):
return True # If successful, Tor is likely running
except Exception:
return False # If an exception occurs, Tor is not running
def start_tor():
"""Launch Tor in the background if it is not already running."""
if tor_is_listening(): # Check if Tor is already active.
return # If listening, exit the function, we are done.
# Check if the Tor executable exists
if not os.path.isfile(TOR_EXE_PATH):
raise RuntimeError(
f"Tor executable not found at {TOR_EXE_PATH}. "
"Install Tor Browser or Expert Bundle."
)
# Start Tor in a detached process (background) using subprocess.Popen,
# redirecting its output to DEVNULL to prevent console clutter.
subprocess.Popen([TOR_EXE_PATH], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
# Implements a loop to wait up to 30 seconds for Tor to open its SOCKS port.
    for _ in range(30):
        if tor_is_listening(): # Each iteration, check whether Tor has opened the SOCKS port
            return # If it has opened the SOCKS port, exit the function
        print("Waiting for Tor to open SOCKS port...") # Debug print before checking again
        time.sleep(1) # Poll once per second so the loop really waits up to ~30 seconds
else:
# Raise an error if the port isn't opened
raise TimeoutError("Tor did not open the SOCKS port within 30s")
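# Optional sanity check, added for illustration: confirm that traffic leaves through
# Tor and that NEWNYM actually rotates the exit IP. api.ipify.org is one public
# "what is my IP" service; any equivalent endpoint works.
def check_tor_exit_ip():
    start_tor()
    tor_proxies = {"http": TOR_SOCKS, "https": TOR_SOCKS}
    ip_before = requests.get("https://api.ipify.org", proxies=tor_proxies, timeout=20).text
    renew_tor_identity()
    ip_after = requests.get("https://api.ipify.org", proxies=tor_proxies, timeout=20).text
    print(f"Exit IP: {ip_before} -> {ip_after}")  # usually, though not always, different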
def duckduckgo_search(query: str, max_results: int = 2, max_retries: int = 4):
    """
    Perform a DuckDuckGo search through Tor and display up to `max_results` hits.
    Args:
        query (str): The search query to use
        max_results (int): Maximum number of results to display
        max_retries (int): Number of attempts before falling back to other engines
    Returns:
        None: Results are printed; each hit is scraped and summarized.
    """
print(f"DuckDuckGo: {query}") # Debug print statement for the query
# Ensure Tor is ready before making the search
start_tor()
    # A fresh DDGS instance routed through Tor is built inside the retry loop below,
    # so each attempt gets its own proxy session and User-Agent.
    # Confirm that the query is valid:
    # it must be a non-empty string, and max_results must be a positive integer.
    # If any check fails, print and return an appropriate error message.
if not query:
print("Error: No search query provided.")
return "No search query provided."
if not isinstance(max_results, int) or max_results <= 0:
print("Error: max_results must be a positive integer.")
return "max_results must be a positive integer."
if not isinstance(query, str):
print("Error: query must be a string.")
return "query must be a string."
delay = 2 # Initial backoff at 2 seconds
# Attempt to perform the search up to `max_retries` times
for attempt in range(1, max_retries + 1):
# Renew Tor exit IP
renew_tor_identity()
# Pick a fresh User-Agent
ua = random_ua()
# Set the User-Agent for the DDGS instance
# This is important to avoid 403 errors from DuckDuckGo, which may block requests
# due to suspicious activity or automated scraping attempts.
ddgs = DDGS(
proxy=TOR_SOCKS,
timeout=20,
headers={"User-Agent": ua}
)
# Attempt the search
try:
display_text_color(f"Attempt {attempt}: '{query}' via Tor exit—UA: {ua}", Fore.YELLOW)
            # Perform the DuckDuckGo search using the DDGS instance.
            # The `text` method performs a text search and returns a list of result dicts;
            # `max_results` limits the number of results returned.
            results = ddgs.text(query, max_results=max_results)
            # An empty result set usually means we are being throttled; retry with a new identity
            if not results:
                raise ValueError("empty result set (possible rate-limit)")
# Display the search results
for idx, result in enumerate(results, start=1):
# Check if the result has a title and href
if not result.get('title') or not result.get('href'):
display_text_color(f"Result {idx} is missing title or href.", Fore.RED)
continue
# Display the result title and URL in cyan color
display_text_color(f"{idx}. {result['title']}\n {result['href']}", Fore.CYAN)
                # Scrape and summarize the web content at this URL using Ollama,
                # then display the summary
                summary = scrape_web_summarize(result['href'])
                if summary:
                    display_text_color(summary, Fore.GREEN)
return # success
except Exception as e:
display_text_color(f"[{attempt}] Unexpected error: {e}", Fore.RED)
# If an exception occurs, wait and retry with exponential backoff
time.sleep(delay)
delay *= 2 # Exponential backoff
# if we get here, all retries failed, then log the failure
print("DuckDuckGo search failed after multiple retries.")
    # Manual HTML fallback:
    # if DDGS still returns 403s, fetch and parse DuckDuckGo's HTML frontend directly.
    duckduckgo_html_fallback(query)
# If we have a SearX instance (e.g., a container in Proxmox with a SearX docker image),
# we can try to use that as a fallback.
searx_search_fallback(query)
def my_duckduckgo_search(query: str, model_name="qwen3:8b"):
"""
    Perform an AI-refined DuckDuckGo search and display the first results.
Args:
query (str): The search query to use
model_name (str): Name of the model to use for generating the query
Returns:
None: It does not return anything, but prints the search results and scrapes the content of each result.
"""
print(f"my_duckduckgo_search: {query}") # Debug print statement
    # Generate an improved search query using AI
    improved_query = ai_web_search(query, model_name) # Leverage AI to refine the query
# Perform the DuckDuckGo search with the improved query
duckduckgo_search(improved_query, max_results, max_retries)
def scrape_web_title(url: str = "") -> str:
"""
Scrape the title of a web page.
Args:
url (str): The URL of the web page to scrape
Returns:
str: The title of the web page or an error message
"""
# Validate the URL format to ensure it starts with http:// or https://
if not url.startswith(("http://", "https://")):
return "Invalid URL format; make sure it starts with http:// or https://"
try:
# Send a GET request to the specified URL with a timeout of 10 seconds
response = requests.get(url, timeout=10)
response.raise_for_status() # Raise an error for bad responses (4xx or 5xx)
except ConnectionError:
return (
f"Unable to connect to {url}. "
"If this is your local Hugo server, ensure you ran:\n"
" hugo server --bind 0.0.0.0\n"
"so that the site is reachable from this script."
)
except RequestException as e:
return f"Error fetching content from {url}: {e}"
# Parse the response content to extract the <title> element
soup = BeautifulSoup(response.content, 'html.parser')
# Extract the title from the <title> tag, if it exists
# If the title tag is not found, it returns None
title = soup.title.string if soup.title and soup.title.string else None
return title.strip() if title else "No title element found on page"
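# Quick smoke test for the helper above (example.com is a stable public page
# whose <title> is "Example Domain"):
#   >>> scrape_web_title("https://example.com")
#   'Example Domain'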
def scrape_web_summarize(url: str, model_name: str = "deepseek-r1:8b") -> str:
"""
Scrape and summarize web content using Ollama.
    Args:
        url (str): The URL of the web page to scrape
        model_name (str): Ollama model name (accepted for future use; not passed to `summarize` here)
    Returns:
        str: A summary of the page content, or an error message
"""
print(f"Scraping content from: {url}") # Debug print statement to track URL being scraped
# Validate the URL format to ensure it starts with http:// or https://
if not url.startswith(("http://", "https://")):
return "Invalid URL format"
try:
# Call a separate, auxiliary function to scrape the web content
content = scrape_web_content(url)
# If the content is None or empty, return an error message
if content is None or not content.strip():
return f"No content found at {url}. It may be blocked or unavailable."
# Summarize the scraped content using the Ollama model
return summarize(content)
except requests.RequestException as e:
# If there's an error fetching the content, return an error message
return f"Error fetching content from {url}: {e}"
def duckduckgo_html_fallback(query: str):
"""
Query DuckDuckGo’s HTML frontend and parse the links manually.
"""
display_text_color("duckduckgo_html_fallback", Fore.YELLOW)
# Ensure Tor is running before making the request
start_tor()
# Set the URL for DuckDuckGo's HTML frontend
# This is the endpoint that serves HTML search results
url = "https://html.duckduckgo.com/html"
# Prepare the payload with the search query
# The payload is a dictionary containing the search query
payload = {"q": query}
# Set the headers to include a random User-Agent
# This is important to avoid 403 errors from DuckDuckGo, which may block requests
headers = {"User-Agent": random_ua()}
# Set the proxies to use the Tor SOCKS proxy
# This ensures that the request is routed through Tor for anonymity
proxies = {"http": TOR_SOCKS, "https": TOR_SOCKS}
try:
# Send a POST request to DuckDuckGo's HTML frontend with the query
# This uses the Tor SOCKS proxy for anonymity
resp = requests.post(url, data=payload, headers=headers, proxies=proxies, timeout=20)
resp.raise_for_status() # if 403 persists, we’ll see an exception
# Parse the response content using BeautifulSoup
# This will allow us to extract the search results from the HTML
soup = BeautifulSoup(resp.text, "html.parser")
# Find the search results in the parsed HTML
# The results are contained in <a> tags with the class "result__a"
# Select the first 5 search results
# This limits the number of results to 5 for brevity
for a in soup.select("a.result__a")[:5]:
            # Extract the title and href from each search result.
            # The title is the link text, and the href is the URL it points to.
title = a.get_text(strip=True)
href = a["href"]
# Scrape and summarize web content using Ollama
summary = scrape_web_summarize(href)
if summary:
# Display the title, href, and summary
display_text_color(f"{title}\n {href}\n {summary}", Fore.GREEN)
except requests.exceptions.HTTPError as e:
# Handle HTTP errors, such as 401 Unauthorized or 403 Forbidden
if resp.status_code == 401:
print("❌ Unauthorized (401): Access is denied for the DuckDuckGo HTML endpoint.")
elif resp.status_code == 403:
print("❌ Forbidden (403): DuckDuckGo is blocking your automated request. Try another IP, time, or proxy.")
else:
print(f"❌ HTTP error: {e}")
except requests.exceptions.ProxyError as e:
# Handle proxy errors
print(f"❌ Proxy connection error: {e}")
except requests.exceptions.RequestException as e:
# Handle other request exceptions
print(f"❌ Network error: {e}")
except Exception as e:
# Handle unexpected exceptions
print(f"❌ Unexpected error: {e}")
DuckDuckGo blocks automated access: the HTML interface at https://html.duckduckgo.com/html is not intended for programmatic scraping, and DuckDuckGo actively blocks bot traffic, especially frequent requests or requests arriving via proxies and anonymizers like Tor.
If you have set up a SearXNG container in your homelab, you have another alternative that does not depend on a third-party search engine's policies.
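One caveat before relying on it: a stock SearXNG instance only honors format=json if the json format is enabled in its settings.yml (under search: formats:); otherwise it answers 403. A quick probe, assuming the instance listens on its default port:
import requests

resp = requests.get("http://127.0.0.1:8080/search",
                    params={"q": "test", "format": "json"},
                    timeout=10)
print(resp.status_code)  # 200 means JSON output is enabled; 403 means it is not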
def searx_search_fallback(query: str, max_results: int = 5, searx_url: str = None):
"""
Query our SearX instance and print parsed links.
query (str): The search query to use. Must be non-empty.
max_results (int): The maximum number of search results to return (default is 5).
searx_url (str): Full URL of your SearX search endpoint. If None, read from SEARX_URL env var or default to http://127.0.0.1:8080/search.
Raises:
ValueError: If query is empty or max_results is not positive.
RuntimeError: On HTTP or network failures after retries.
"""
display_text_color("searx_search_fallback", Fore.YELLOW)
# Validate the inputs
if not isinstance(query, str) or not query.strip():
raise ValueError("query must be a non-empty string")
if not isinstance(max_results, int) or max_results <= 0:
raise ValueError("max_results must be a positive integer")
# Get the SearX URL
# If searx_url is not provided, read it from the environment variable
searx_url = searx_url or SEARX_URL
if not searx_url.startswith(("http://", "https://")):
raise ValueError(f"Invalid SEARX_URL: {searx_url}")
# Ensure Tor is running before making the request
start_tor()
    # Use the validated SearX endpoint (from the argument or the SEARX_URL env var)
    url = searx_url
# Prepare the parameters for the SearX search
# The parameters include the search query, format (JSON), and language
params = {
"q": query, # The query parameter for the search
"format": "json", # Specify that the response should be in JSON format
"language": "en", # Set the language for the search results
"count": max_results # Set the maximum number of search results
}
# Set the headers to include a User-Agent
# This is important to avoid 403 errors from SearX, which may block requests
headers = {
"User-Agent": "Mozilla/5.0 (compatible; MyBot/1.0; +http://yourdomain.example)",
}
# Set the SOCKS proxy for Tor
proxies = {"http": "socks5h://127.0.0.1:9050",
"https": "socks5h://127.0.0.1:9050"}
for attempt in range(1, 4):
try:
# Rotate Tor identity
# This is done to change the exit node and thus the IP address used for requests
            with Controller.from_port(port=TOR_CONTROL_PORT) as ctl:
                # Authenticate with the Tor control port using the configured password
                ctl.authenticate(password=TOR_CONTROL_PASS)
# Signal for a new identity
ctl.signal(Signal.NEWNYM)
# Wait for a short period to allow Tor to establish a new circuit
time.sleep(ctl.get_newnym_wait())
            # Make the SearX request (with a timeout so a hung proxy cannot block forever)
            resp = requests.get(
                url, params=params, headers=headers, proxies=proxies, timeout=20)
# Raise an exception for HTTP errors (4xx or 5xx)
# This will raise an HTTPError if the response status code indicates an error
resp.raise_for_status()
# Parse the JSON response from SearX
# This will convert the response content into a Python dictionary
data = resp.json()
# Check if the response contains results
# If the "results" key is not present, print an error message and return
if "results" not in data:
print("No results found in SearX response.")
return
# Extract the search results from the parsed JSON
# The results are contained in the "results" key of the JSON response
results = data.get("results", [])
            # If the results list is empty, report it and stop
            if not results:
                print("ℹ️ No results returned by SearX.")
                return
# Iterate over the search results
for idx, result in enumerate(results[:max_results], start=1):
# Extract the title and href from each search result
title = result.get("title", "")
# The href is the URL of the search result
# If the "url" key is not present, it defaults to an empty string
href = result.get("url", "")
# We scrape and summarize as before
summary = scrape_web_summarize(href)
if summary:
# Print the result number, title, href, and summary
display_text_color(f"{idx}. {title}\n {href}\n {summary}", Fore.GREEN)
return
        except Exception as e:
            # Log the failure for this attempt, then loop to retry with a fresh circuit
            print(f"SearX search failed (attempt {attempt}): {e}")
            time.sleep(2)
if __name__ == "__main__":
    my_duckduckgo_search("linux distributions", "qwen3:8b") # Example usage of the search function