"There are two ways to do great mathematics. The first is to be smarter than everybody else. The second way is to be stupider than everybody else — but persistent." — Raoul Bott.
I love authoring in Markdown as there is almost no friction - just write. This is the first of many articles where we will check all markdown files for broken or oversized images.
I don’t want to reinvent the wheel, so I use markdown-link-check. However, it is far from perfect: I also want to verify that there are no broken links, that the pictures can be displayed correctly, that their sizes are appropriate, and that every frontmatter has all the most important fields. I have therefore written some code that recursively reads all my Markdown files and checks them.
user@pc:~$ sudo npm install -g markdown-link-check # To install the command line tool (markdown-link-check) globally.
user@pc:~$ sudo npm install -g npm@10.6.0 # Update npm itself (the package manager) to version 10.6.0
user@pc:~$ cd /home/username/directoryMarkdown/content
user@pc:~$ find . -name \*.md -print0 | xargs -0 -n1 markdown-link-check # Check links from a local markdown folder (recursive)
"""
File: checkmarkdown.py
Author: Máximo Núñez Alarcón
Description: This code monitors the Markdown files of your Hugo website.
Usage: Run the script (python checkmarkdown.py) to check all the Markdown pages of your Hugo website.
def check_markdown(markdown_file): Check the file with Markdown format.
delete_sync_conflict_files(directory="/home/yourUserName/yourWebsite/content/"): delete files with '*sync-conflict*' in their names within the specified directory.
"""
from util import check_all_files, printUrl, append_to_file, cleanLog
import pathlib
import os, sys
from PIL import Image
from compressImage import compress_image
import requests
from urllib.parse import urlparse
from util import check_all_files
from checkMarkdownUtil import markdown_to_localwebhtml
from checkMarkdownImage import extract_images_from_markdown, check_images_markdown
from checkMarkdownUrl import extract_urls_from_markdown, check_urls_accessible
from checkMarkdownFormat import check_frontmatter, convert_relative_links_to_absolute, check_format_markdown
# Absolute path of the website's content directory (placeholder value; adapt it to your machine).
# NOTE(review): not referenced by any code visible in this file - confirm it is still needed.
mywebsite = "/home/yourUserName/yourWebsite/content/"
def check_markdown(markdown_file):
    """
    Run all the checks on one Markdown file and log the progress.

    In order, it calls convert_relative_links_to_absolute, check_frontmatter,
    check_images_markdown (on the images returned by extract_images_from_markdown)
    and check_urls_accessible (on the links returned by extract_urls_from_markdown).

    Args:
    - markdown_file (str): Path to the Markdown file to check.

    Returns:
    - None. Findings are reported by the helpers and through append_to_file.
    """
    append_to_file("----------------------------------------------")
    append_to_file(f"Processing... {markdown_file}")

    # check_format_markdown(markdown_file) checks and modifies the formatting of a
    # Markdown file (badly formatted images and links). It is left disabled because
    # it is very intensive; uncomment the call here if you need it.
    convert_relative_links_to_absolute(markdown_file)

    # Is the frontmatter OK? A failure is logged, but it must not stop the other checks.
    try:
        check_frontmatter(markdown_file)
    except Exception:
        # "except Exception" rather than a bare "except", so that Ctrl+C
        # (KeyboardInterrupt) and SystemExit can still stop a long run.
        append_to_file(f"Exception processing Markdown format: {markdown_file}")
        print(f"Exception processing Markdown format: {markdown_file}")

    # 1. Check images
    # Extract all its images.
    images = extract_images_from_markdown(markdown_file)
    append_to_file(images)
    # Are the images belonging to the markdown_file OK?
    check_images_markdown(markdown_file, images)

    # 2. Check URLs
    # Extract all its links.
    urls = extract_urls_from_markdown(markdown_file)
    append_to_file(urls)  # Log purposes (debugging).
    # Are all the links belonging to the markdown_file accessible?
    check_urls_accessible(urls, markdown_file)

    # NOTE(review): this line is logged unconditionally, even when a check above
    # reported a problem - confirm that this is the intended meaning of "OK".
    append_to_file(f"{markdown_file} is OK.")
    append_to_file("----------------------------------------------")
def delete_sync_conflict_files(directory="/home/yourUserName/yourWebsite/content/"):
    """
    Remove every file whose name contains 'sync-conflict' under a directory tree.

    Args:
    - directory (str): Root of the tree to scan. Its sub-directories are visited too.

    Each deletion, or failure to delete, is reported on the standard output.
    """
    for folder, _subfolders, filenames in os.walk(directory):
        # Keep only the names that carry the conflict marker.
        conflicts = [name for name in filenames if "sync-conflict" in name]
        for name in conflicts:
            target = os.path.join(folder, name)
            try:
                os.remove(target)
                print(f"Deleted file: {target}")
            except Exception as e:
                # A file that cannot be removed must not abort the whole clean-up.
                print(f"Error deleting file {target}: {e}")
# Script entry point: prepare a clean environment before checking.
if __name__ == "__main__":
    # Clear the terminal. NOTE(review): relies on a "clear" command being available in the shell.
    os.system("clear")
    cleanLog() # Clean the log, so we start with a clean log file.
    delete_sync_conflict_files() # Delete files with 'sync-conflict' in their names within the default directory.
"""
File: checkMarkdownImage.py
Author: Máximo Núñez Alarcón
Description: This code monitors your Hugo's markdown files' images.
def check_image_size(markdown_file, image_path, max_size_kb = 2024): Given a markdown_file and an image's path...
checks the image's size, and determines if it's OK based on a maximum size limit. If its size exceeds the maximum limit, it compresses it.
def check_image_ok(markdown_file, image_path): check if the image can be opened without errors and also checks its size (check_image_size).
def check_images_markdown(markdown_file, images): check if "images" belonging to the Markdown file are present and accessible.
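def extract_images_from_markdown_article(markdown_file): extracts the image paths declared on the "image:" lines of a Markdown file.
def extract_images_from_markdown(markdown_file): extracts the images referenced with the ![alt text](image_url) syntax, plus the "image:" entries when the file is an article list.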
"""
import re, os, sys
import pathlib
from checkMarkdownUtil import value_field_from_markdown, filter_code_shortcodes_blocks, is_markdown_article, markdown_to_localwebhtml
from util import append_to_file, check_manually
from PIL import Image
from compressImage import compress_image
def check_image_size(markdown_file, image_path, max_size_kb = 2024):
    """
    Tell whether an image file fits within a size limit, compressing it when it does not.

    Args:
    - markdown_file: The Markdown file that references the image (used in the messages only).
    - image_path: The path of the image file.
    - max_size_kb: The maximum size allowed, in kilobytes.

    Returns:
    - bool: True when the file size does not exceed max_size_kb. False when it does
      (the image is then passed to compress_image and its path is appended to
      assets/imagestoobig.txt) or when any step raises an exception.
    """
    try:
        # os.path.getsize reports bytes; the limit is expressed in kilobytes.
        size_kb = os.path.getsize(image_path) / 1024
        if size_kb > max_size_kb:
            warning = f"{image_path} with {size_kb:.2f} KB in {markdown_file} exceeds the {max_size_kb} limit."
            append_to_file(warning)
            print(warning)
            # Shrink the oversized image...
            compress_image(image_path)
            # ...and keep a record of every image that went over the limit.
            with open("assets/imagestoobig.txt", 'a') as oversized_log:
                oversized_log.write(str(image_path) + "\n")
            return False
        return True
    except Exception as e:
        # Any failure (size lookup, compression, record keeping) is logged and reported as "not OK".
        failure = f"Error opening image {image_path} in {markdown_file}. Error: {e}"
        append_to_file(failure)
        print(failure)
        return False
def check_image_ok(markdown_file, image_path):
    """
    Check that the image at the given path can be opened, then check its size.

    Args:
    - markdown_file: The Markdown file that references the image. It is only used
      to log and debug where the problem may be.
    - image_path (str or pathlib.Path): The path to the image file.

    Returns:
    - bool: The result of check_image_size when the image opens without errors,
      False when opening it (or checking its size) raises an exception.
    """
    try:
        # Opening is the check itself; the context manager closes the image
        # again to release its resources.
        with Image.open(image_path):
            pass
        return check_image_size(markdown_file, image_path)
    except Exception as e:
        # Callers pass either a pathlib.Path or a plain string, and a string has
        # no absolute() method: normalise first so that reporting cannot fail.
        print(f"Error in {markdown_file}, opening image '{pathlib.Path(image_path).absolute()}': {e}")
        return False
def check_images_markdown(markdown_file, images):
    """
    Locate every image referenced by a Markdown file and check the ones that are found.

    Each image is looked up in up to three places, in this order: joined to the
    Markdown file's own path, under the website's "content" directory, and under
    its "static" directory. The first existing location is passed to
    check_image_ok; an image found nowhere is logged and passed to check_manually.

    Parameters:
    markdown_file (str): The path to the Markdown file.
    images (list): The image paths/URLs extracted from the Markdown file.

    Returns:
    None
    """
    # NOTE(review): the image is joined to the Markdown file's path itself, not to
    # its parent directory - confirm that this is intended.
    markdown_dir = pathlib.Path(markdown_file)
    fallback_roots = (
        "/home/yourUserName/yourWebsite/content",
        # The image could be in /home/yourUserName/yourWebsite/static
        "/home/yourUserName/yourWebsite/static",
    )

    for img in images:
        # First attempt: resolve the image against the Markdown file's path.
        image_path = (markdown_dir / img).resolve()
        found = os.path.exists(image_path)

        # Further attempts: prefix the normalized absolute version of img
        # (os.path.abspath) with each website root, stopping at the first hit.
        for root in fallback_roots:
            if found:
                break
            image_path = root + os.path.abspath(img)
            found = os.path.exists(image_path)

        if found:
            check_image_ok(markdown_file, image_path)
            continue

        # Nowhere to be found: report the last location that was tried.
        append_to_file("Error: Image file not found!")
        append_to_file(f"{markdown_file} : {image_path}")
        print(f"Image Not found in {markdown_file}: {image_path}")
        check_manually(image_path, markdown_file, markdown_to_localwebhtml(markdown_file))
def extract_images_from_markdown_article(markdown_file):
    """
    Extract the image paths declared on the "image:" lines of a Markdown file.

    Parameters:
    markdown_file (str): Path to the Markdown file.

    Returns:
    list: The value of every "image:" line, in file order, with the surrounding
    whitespace stripped.
    """
    # Lines starting (after optional whitespace) with "image:" followed by the image path.
    image_pattern = re.compile(r'^\s*image:\s*(.*)$', re.MULTILINE)

    # Read as UTF-8 explicitly: the files contain non-ASCII text (e.g. author names),
    # and the platform's default encoding is not guaranteed to decode it.
    with open(markdown_file, 'r', encoding='utf-8') as file:
        content = file.read()

    return [match.strip() for match in image_pattern.findall(content)]
def extract_images_from_markdown(markdown_file):
    '''Extract all images from a markdown_file.

    Args:
    - markdown_file (str): The markdown_file from which to retrieve its images.

    Returns:
    - list: The "image:" entries (only when is_markdown_article reports the file as
      an article list), followed by the targets of every ![alt text](image_url)
      found once filter_code_shortcodes_blocks has been applied to the content.
      When the file cannot be read or filtered, a message is printed and the
      images gathered so far are returned.'''
    # Example of a Markdown file of an article (a list):
    #
    #   ---
    #   title: "Apocalipsis"
    #   date: 2022-04-02T07:12:39+02:00
    #   draft: false
    #   featured_image: /en/library/images/clasicos-ingles-8.jpg
    #   language: en
    #   articles:
    #     - title: Apocalipsis. 6=0
    #       link: /library/Apocalipsis1/
    #       image: /en/library/images/Apocalipsis1.png
    #     - title: I'm sorry, Andrew
    #       link: /library/Apocalipsis2/
    #       image: /en/library/images/Apocalipsis2.png
    if is_markdown_article(markdown_file):
        # Start from the images declared on the "image:" lines.
        images = extract_images_from_markdown_article(markdown_file)
    else:
        images = []

    # Image markdown syntax: ![alt text](image_url).
    # The alt text is matched non-greedily; a greedy ".*" would run up to the last
    # "]" of the line and miss every image but the last one on that line.
    img_pattern = r"!\[.*?\]\((.*?)\)"

    try:
        # Read as UTF-8 explicitly instead of relying on the platform's default encoding.
        with open(markdown_file, 'r', encoding='utf-8') as f:
            unfilteredContent = f.read()
        content = filter_code_shortcodes_blocks(unfilteredContent)
        # Add each matched image URL to the list.
        images.extend(re.findall(img_pattern, content))
    except Exception:
        # Best effort: report the problem and return what has been gathered so far.
        # "except Exception" keeps Ctrl+C (KeyboardInterrupt) and SystemExit working.
        print(f"Exception trying to open {markdown_file}")

    return images
# Script entry point: manual run on one sample page; the returned list is discarded.
if __name__ == "__main__":
    extract_images_from_markdown("/home/yourUserName/yourWebsite/content/en/library/iwantmore/index.md")