No website can stand without a strong backbone. And that backbone is technical SEO.
Content is the reason search began in the first place.
Search engine optimization (SEO) checking tools are built to automatically crawl anything from a single web page to a whole website. They simulate the way a search engine crawls your site in order to quickly spot any errors or areas for improvement.
Broken links are links that no longer take the user to their intended destination. They are considered to have a negative impact on a site’s SEO performance, so preventing them is an important part of SEO. Furthermore, pointing your visitors towards a dead end is a terrible experience!
We are going to solve this problem using two Python files.
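Both scripts rely on a handful of third-party packages: the usp module used for sitemap parsing (typically distributed on PyPI as ultimate-sitemap-parser), plus requests, beautifulsoup4, and pyperclip. Assuming you install them with pip, something along these lines should work:
pip install ultimate-sitemap-parser requests beautifulsoup4 pyperclip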
'''
File: brokenLinksWeb.py
Author: Máximo Núñez Alarcón
Description: How to Find Broken Links on any Website with Python
1. Find and parse the website's sitemap to create a list of all its pages.
2. Scrape each page to collect every external link, then build a separate list of unique external links.
3. Request each link in that list to determine whether it is still valid.
4. Create a list matching each broken link with the page on which it appears.
Credits Main Idea: MARTECH WITH ME, How to Find Broken Links on any Website with Python
'''
from usp.tree import sitemap_tree_for_homepage
import requests
from bs4 import BeautifulSoup
import time
from util import check_all_files, printUrl, append_to_file, cleanLog
from urllib.parse import urlparse, urljoin
from util import same_domain  # helper defined in util.py (not shown here); a sketch follows this listing
def getPagesFromSitemap(fullDomain):
"""
Retrieve a list of all pages from the sitemap of a given domain.
Args:
- fullDomain (str): The full domain name.
Returns:
- list: A list of URLs of all pages found in the sitemap.
"""
# Initialize an empty list to store raw page URLs from the sitemap
listPagesRaw = []
# Parse the sitemap of the given domain
tree = sitemap_tree_for_homepage(fullDomain)
# Iterate over all pages in the sitemap and extract their URLs
for page in tree.all_pages():
listPagesRaw.append(page.url)
return listPagesRaw
def getListUniquePages(listPagesRaw):
"""
Go through the list of raw page links and output a list of unique page links.
Args:
- listPagesRaw (list): List of raw page links.
Returns:
- list: List of unique page links.
"""
listPages = []
for page in listPagesRaw:
        if page in listPages:
            pass  # If the page is already in the list, skip it.
        else:
            # Otherwise add it, so that only unique links
            # are included in the final list.
            listPages.append(page)
return listPages
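# Design note: since Python 3.7, dictionaries preserve insertion order, so the same
# de-duplication could be written in one line, e.g. list(dict.fromkeys(listPagesRaw));
# the same applies to getUniqueExternalLinks() below.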
def getExternalLinkListRaw(listPages, yourDomain):
"""
Get the raw list of external links from the provided list of pages (listPages).
Args:
- listPages (list): List of pages to extract external links from.
- yourDomain (str): The domain of the website (e.g., "https://justtothepoint.com/").
Returns:
- list: List of raw external links in the format [source_page, link_url].
"""
externalLinksListRaw = [] # Initialize an empty list to store raw external links
count = 0 # Initialize a counter for tracking progress
length_list = len(listPages) # Get the total number of pages
# Define a user-agent header to mimic a web browser
user_agent = {'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'}
# Iterate over each page URL in the list
for url in listPages:
count = count + 1 # Increment the counter
        response = requests.get(url, headers=user_agent, timeout=10)  # Send an HTTP request to the URL; the timeout avoids hanging indefinitely
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.content, "html.parser")  # Parse the response content with the built-in "html.parser"
        # Find all <a> tags (links) in the parsed HTML
        list_of_links = soup.find_all("a")
# Iterate over each link found on the page
for link in list_of_links:
href = link.get("href") # Get the 'href' attribute from the link
if href and not href.startswith("#"): # Exclude anchor links
if href.startswith('mailto:'):
continue # Skip processing mailto links
if 'twitter.com/intent' in href: # Check if the href attribute contains a Twitter intent URL
continue # Skip Twitter intent links
                absolute_url = urljoin(url, href)  # Ensure absolute URLs
                if same_domain(absolute_url, yourDomain):
                    continue  # Skip internal links; we only want external ones
                externalLinksListRaw.append([url, absolute_url])
return externalLinksListRaw
def getUniqueExternalLinks(externalLinksListRaw):
"""
Retrieves unique external links from the list of raw external links, avoiding duplicates
Args:
- externalLinksListRaw (list): A list containing raw external links.
Returns:
- list: A list containing unique external links extracted from the raw list.
"""
uniqueExternalLinks = [] # Initialize an empty list to store unique external links
if externalLinksListRaw == []: # If the raw list is empty, return an empty list
return uniqueExternalLinks
for link in externalLinksListRaw:
# Each link is ['https://justtothepoint.com/english/listening/', 'http://www.eslvideo.com']
if link[1] in uniqueExternalLinks:
pass # If the link is already in the unique list, skip it
else:
# Add the link to the unique list if it's not already present
uniqueExternalLinks.append(link[1])
# Return the list of unique external links
return uniqueExternalLinks
def identifyBrokenLinks(uniqueExternalLinks):
"""
Identifies broken links from a list of unique external links.
Args:
- uniqueExternalLinks (list): A list containing unique external links.
Returns:
- list: A list containing broken links identified from the input list.
"""
count = 0 # Initialize a counter to keep track of the processed links
length_uniqueExternalLinks = len(uniqueExternalLinks)
# Get the total number of unique links
# User agent string for sending HTTP requests
user_agent = {'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'}
brokenLinksList = [] # Initialize an empty list to store broken links
    for link in uniqueExternalLinks:  # Iterate over each unique external link
count = count + 1
try:
            # Send an HTTP GET request to the link and read its status code.
            # headers: optional dictionary that lets us send custom headers with the request (here, a browser-like User-Agent).
            # timeout: maximum number of seconds to wait for the request to complete.
            statusCode = requests.get(link, headers=user_agent, timeout=10).status_code
            if statusCode in [400, 404, 410, 500, 502, 503]:
                # These status codes mean the request failed:
                # 400 Bad Request, 404 Not Found, 410 Gone, 500 Internal Server Error, 502 Bad Gateway, 503 Service Unavailable.
                brokenLinksList.append(link)
else:
pass # Otherwise, skip to the next link
        except requests.exceptions.RequestException:
            # If the request fails (timeout, connection error, too many redirects, ...), treat the link as broken
brokenLinksList.append(link)
return brokenLinksList # Return the list of broken links
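# Note: identifyBrokenLinks() downloads each link with a full GET request, whereas the
# HEAD-based check in brokenLinksPage.py is lighter on bandwidth; some servers, however,
# answer HEAD requests with an error even when the page exists, so GET tends to give
# fewer false positives.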
def matchBrokenLinks(brokenLinksList, externalLinksListRaw):
    '''
    Matches the broken links back to the raw list of all [source_page, link_url] pairs.
    Args:
    - brokenLinksList: The list of broken links created in the previous step.
    - externalLinksListRaw: The raw list of all internal pages and their external links.
    Returns:
    - list: A list of [source_page, broken_link] pairs.
    '''
brokenLinkLocation = []
# Iterate through each external link in the raw list
for link in externalLinksListRaw:
# Check if the link's URL is in the list of broken links
if link[1] in brokenLinksList:
# If the link is broken, append it with its location page link[0] to the brokenLinkLocation list
brokenLinkLocation.append([link[0],link[1]])
else:
pass # If the link is not broken, do nothing and pass to the next link
# Create a ./assets/brokenlinks.txt file for further processing by brokenLinksPage.py
with open("./assets/brokenlinks.txt", "w") as file:
file.write("Broken Links:\n")
for brokenlink in brokenLinkLocation:
file.write(f"Source page: {brokenlink[0]} - Broken link: {brokenlink[1]}\n")
return brokenLinkLocation
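# Each line written to ./assets/brokenlinks.txt ("Source page: ... - Broken link: ...")
# is the format that brokenLinksPage.py later parses.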
def main():
cleanLog()
time.sleep(3) # Adjust the duration as needed
# Get list of pages from sitemap
listPagesRaw = getPagesFromSitemap("https://justtothepoint.com/")
listPages = getListUniquePages(listPagesRaw)
# Pause for a short duration before making the next request
time.sleep(2) # Adjust the duration as needed
# Get external link list raw
externalLinksListRaw = getExternalLinkListRaw(listPages, "https://justtothepoint.com/")
# Pause again before making the next request
time.sleep(3) # Adjust the duration as needed
uniqueExternalLinks = getUniqueExternalLinks(externalLinksListRaw)
# Pause again before making the next request
time.sleep(3) # Adjust the duration as needed
# Identify broken links
brokenLinksList = identifyBrokenLinks(uniqueExternalLinks)
# Pause again before making the next request
time.sleep(3) # Adjust the duration as needed
brokenLinkLocation = matchBrokenLinks(brokenLinksList, externalLinksListRaw)
# Save the list of unique pages from the website for further processing (check their images are not corrupt or gone)
with open("./assets/listUniquePages.txt", "w") as file:
file.write("listPages:\n")
for page in listPages:
file.write(f"{page}\n")
if __name__ == "__main__":
main()
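The helpers imported from util (cleanLog, same_domain, and so on) are not listed in this article. As an illustration, same_domain could be as simple as the following minimal sketch, which just compares hostnames; the actual util.py may handle subdomains, trailing slashes, or logging differently:
from urllib.parse import urlparse

def same_domain(url, domain):
    """Return True if url points to the same host as domain.
    Hypothetical sketch of the helper imported from util; the real implementation may differ."""
    return urlparse(url).netloc.lower() == urlparse(domain).netloc.lower()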
'''
File: brokenLinksPage.py
Author: Máximo Núñez Alarcón
Description: This script helps the user fix the broken links found by brokenLinksWeb.py, reading them from the file that script generated.
That file contains lines pairing each source page with its broken link, in the following format:
Source page: UrlPageBrokenLink - Broken link: URLBroken (Status code: XXXX)
It utilizes libraries such as requests, BeautifulSoup, and pyperclip to achieve its functionality.
Online tool: https://www.brokenlinkcheck.com/broken-links.php#status
'''
# Import necessary libraries
import requests # Library for making HTTP requests
from bs4 import BeautifulSoup # Library for parsing HTML content
from urllib.parse import urlparse, urljoin # Library for parsing URLs
import re
import webbrowser
import subprocess
import os
import pyperclip
import time # Import the time module for introducing delays
from mymarkdown import url_to_md_path  # maps a live URL to its local Markdown source file; a sketch follows this listing
localport = "40083/"  # Hugo's default port is 1313; change this to the port your local server is actually using
localweb = "http://localhost:" + localport
''' By default, Hugo serves your website locally on port 1313.
However, if the default port is unavailable or already in use by another application:
1. Hugo will automatically select the next available port. Look at the output generated when you start the Hugo server: Web Server is available at http://localhost:XXXXX/
2. You can manually specify a port when starting the server: hugo server -p 8080.
3. Open a terminal and run: ss -tuln or netstat -tuln. You will get a list of all listening TCP and UDP ports (add -p to also see the owning processes).
Then, you can confirm the port by accessing your Hugo site in your favorite web browser.'''
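# Convenience sketch (not part of the original script): if you want to verify programmatically
# that something is listening on the port configured above, a plain socket check is enough.
def port_is_open(port, host="localhost"):
    """Return True if a server (e.g. the local Hugo instance) is listening on host:port."""
    import socket
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.settimeout(1)
        return s.connect_ex((host, int(port))) == 0
# Example: port_is_open(40083) should return True while "hugo server -p 40083" is running.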
def get_broken_links(url):
"""
Retrieve broken links from a given URL.
Args:
- url (str): The URL of the webpage to check for broken links.
Returns:
- list: A list of broken links found on the webpage
"""
broken_links = [] # Initialize list to store broken links
parsed_url = urlparse(url) # Parse the URL to extract scheme and netloc (base URL)
'''
The scheme in a URL specifies the protocol used to access the resource on the server,
e.g., http, https, ftp, mailto, etc. The netloc in a URL specifies the network location of the resource.
It includes the hostname and optionally the port number, separated by a colon (:).
'''
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
try:
        response = requests.get(url, timeout=10)  # Send an HTTP GET request to the specified URL
if response.status_code == 200: # Check if the request was successful (status code 200)
soup = BeautifulSoup(response.content, 'html.parser') # Parse the HTML content of the response
            for link in soup.find_all('a', href=True):  # Find all anchor tags (<a>) with an href attribute
                href = link['href']  # Extract the value of the href attribute
                if href.startswith('mailto:'):
                    continue  # Skip mailto links
                if 'twitter.com/intent' in href:
                    continue  # Skip Twitter intent links
                full_url = urljoin(base_url, href)  # Construct the full URL of the link
                link_response = requests.head(full_url, timeout=10)  # Send a HEAD request to check the link's status code
                if link_response.status_code in [400, 404, 410, 500, 502, 503]:  # Check whether the link is broken
                    broken_links.append(full_url)  # Add the broken link to the list
                # Introduce a delay of 3 seconds between requests
                time.sleep(3)
else:
print(f"Failed to fetch URL: {url}. Status code: {response.status_code}")
except Exception as e:
print(f"Error occurred: {e}")
return broken_links
def print_broken_links(broken_links):
if broken_links:
print("Broken links found:")
for link in broken_links:
print(link)
else:
print("No broken links found.")
def process_broken_link(source_page, broken_link):
"""
Callback function to process broken links.
Args:
- source_page (str): The URL of the page containing the broken link.
- broken_link (str): The URL of the broken link.
"""
print("-------------------------------------------------------")
print(f"Processing broken link from source page: {source_page} - Broken link: {broken_link}")
    # Open the page on the local Hugo server in a new browser tab to inspect the problem
    webbrowser.open_new_tab(source_page.replace("https://justtothepoint.com", localweb))
    # Convert source_page to the path of its local Markdown source file
    url_markdown = url_to_md_path(source_page)
    print("Page: " + url_markdown)
# Open Visual Studio Code with the file to repair or delete the broken link
subprocess.Popen(["code", url_markdown])
    # There is some cleaning up to do: if a "Broken link: " label is still attached,
    # split on it and keep only the URL part.
    print(broken_link)
    if 'link: ' in broken_link:
        broken_link = broken_link.split('link: ')[1]  # Extract the URL part
    print(broken_link)
    # Copy the broken link to the clipboard so it can be pasted into the editor's search box
    pyperclip.copy(broken_link)
    spam = pyperclip.paste()  # Read it back to verify the clipboard was set
    # We process one line of the broken links file at a time
    print("-------------------------------------------------------")
    wait = input("Press Enter to continue")
def process_broken_links_file(file_path, callback_function):
"""
Read a file containing broken links information and call a callback function for each broken link.
The format is as follows: Source page: UrlPageBrokenLink - Broken link: URLBroken (Status code: XXXX)
Args:
- file_path (str): The path to the broken links file.
- callback_function (callable): A function to be called for each broken link. It should accept two arguments: source_page and broken_link.
"""
print("process_broken_links_file")
try:
with open(file_path, 'r') as file:
for line in file:
print(line)
                # Split the line into the source page and the broken link parts
if line.startswith("Source page:") and "Broken link:" in line:
source_page, broken_link = line.split(" - ")
match_source = re.search(r"https://\S+", source_page)
match_broken = re.search(r"https://\S+", broken_link)
if match_source:
source_page = match_source.group()
else:
print("No link found in the string.")
if match_broken:
broken_link = match_broken.group()
else:
print("No link found in the string.")
callback_function(source_page, broken_link)
except FileNotFoundError:
print(f"File not found: {file_path}")
except Exception as e:
print(f"An error occurred: {e}")
def main():
broken_links_file_path = "./assets/brokenlinks.txt"
process_broken_links_file(broken_links_file_path, process_broken_link)
# Entry point of the program
if __name__ == "__main__":
main()
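The url_to_md_path helper imported from mymarkdown is not shown in this article either. For a Hugo site whose permalinks mirror the content tree, it could look roughly like the sketch below; the hugoRoot path and the URL-to-file mapping are assumptions, so adapt them to your own content layout (for instance, some pages may live in index.md files inside a directory):
from urllib.parse import urlparse

hugoRoot = "/home/user/mysite"  # assumed local path to your Hugo project

def url_to_md_path(url):
    """Map a live URL such as https://justtothepoint.com/english/listening/
    to its local Markdown source, e.g. /home/user/mysite/content/english/listening.md.
    Hypothetical sketch; your permalink scheme may differ."""
    path = urlparse(url).path.strip("/")
    return f"{hugoRoot}/content/{path}.md"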
Broken Link Checker is an npm package that finds broken links, missing images, etc. within your HTML.
Installation and use on Ubuntu:
sudo apt install npm # We need NPM, Node Package Manager. It is a package manager for JavaScript. It is primarily used for managing and installing packages and dependencies for Node.js projects.
npm install broken-link-checker -g # Install Broken Link Checker
blc http://yoursite.com -ro > mybrokenSites.txt # This is a typical site-wide check, but I redirect the result to mybrokenSites.txt.
Online Broken Link Checker. It not only validates your site and tells you which web references on your pages are dead, but it also shows you exactly where those stale hyperlinks are located in your HTML code, highlighting the problematic tags.
Free Link Checker (IndiaBook.com) is one of the fastest, most comprehensive and most detailed free link-checking tools for webmasters.