Check your Markdown pages

"There are two ways to do great mathematics. The first is to be smarter than everybody else. The second way is to be stupider than everybody else — but persistent." — Raoul Bott

I love authoring in Markdown as there is almost no friction - just write. This is the first of many articles where we will check all markdown files for broken or oversized images.

I don’t want to reinvent the wheel, so I use markdown-link-check. However, it is far from perfect: I also want to verify that there are no broken links, that the pictures can be displayed correctly, that their sizes are appropriate, and that the front matter of every page has all the most important fields. So I have written some code that recursively reads all my Markdown files and checks them.

user@pc:~$ sudo npm install -g markdown-link-check # To install the command line tool (markdown-link-check) globally.
user@pc:~$ sudo npm install -g npm@10.6.0 # Update npm itself (the package manager) to version 10.6.0
user@pc:~$ cd /home/username/directoryMarkdown/content
user@pc:~$ find . -name \*.md -print0 | xargs -0 -n1 markdown-link-check # Check links from a local markdown folder (recursive)

Check Markdown pages

"""
File: checkmarkdown.py
Author: Máximo Núñez Alarcón
Description: This code monitors your Hugo's markdown files. 

Usage: Run the script (python checkmarkdown.py) to monitor all the Markdown pages of your Hugo site.
check_markdown(markdown_file): check a single Markdown file (front matter, images and links).
delete_sync_conflict_files(directory="/home/yourUserName/yourWebsite/content/"): delete files with 'sync-conflict' in their names within the specified directory.
"""

from util import check_all_files, printUrl, append_to_file, cleanLog
import pathlib
import os, sys
from PIL import Image
from compressImage import compress_image
import requests
from urllib.parse import urlparse
from util import check_all_files
from checkMarkdownUtil import markdown_to_localwebhtml
from checkMarkdownImage import extract_images_from_markdown, check_images_markdown
from checkMarkdownUrl import extract_urls_from_markdown, check_urls_accessible
from checkMarkdownFormat import check_frontmatter, convert_relative_links_to_absolute, check_format_markdown

# Path of the site's Markdown content directory. The value is a placeholder
# ("yourUserName"/"yourWebsite") to be adapted to the local setup.
# NOTE(review): not referenced by any code visible in this listing — confirm it is still needed.
mywebsite = "/home/yourUserName/yourWebsite/content/"


def check_markdown(markdown_file):
    """
    Run every check on a single Markdown file and log the progress.

    The steps, in order, are: convert relative links to absolute ones,
    check the front matter, extract and check the images, and extract and
    check the URLs. Progress and findings are written with append_to_file.

    Args:
    - markdown_file: Path of the Markdown file to check.

    Returns:
    - None. Results are only reported through the log (and the console).
    """
    separator = "----------------------------------------------"
    append_to_file(separator)
    append_to_file(f"Processing... {markdown_file}")
    # Check and modify the formatting of a Markdown file, specifically handling images and links badly formatted.
    # It is commented because it is very intensive.
    # You may want to uncomment it: check_format_markdown(markdown_file)

    # NOTE(review): judging by its name this helper rewrites links in the file
    # itself — confirm against checkMarkdownFormat before running on content
    # that is not under version control.
    convert_relative_links_to_absolute(markdown_file)

    # Is the front matter OK? A failure here must not stop the remaining
    # checks, so it is logged and processing continues. `Exception` (rather
    # than a bare `except:`) lets Ctrl-C / SystemExit still stop the run.
    try:
        check_frontmatter(markdown_file)
    except Exception as e:
        message = f"Exception processing Markdown format: {markdown_file}. Error: {e}"
        append_to_file(message)
        print(message)

    # 1. Check images
    # Extract all its images and log them (debugging purposes).
    images = extract_images_from_markdown(markdown_file)
    append_to_file(images)

    # Are images belonging to the markdown_file OK?
    check_images_markdown(markdown_file, images)

    # 2. Check URLs
    # Extract all its links and log them (debugging purposes).
    urls = extract_urls_from_markdown(markdown_file)
    append_to_file(urls)

    # Are all links belonging to the markdown_file accessible?
    # The helper's return value is not used here; problems are expected to be
    # reported by the helper itself.
    check_urls_accessible(urls, markdown_file)

    # NOTE(review): this line is written unconditionally, even when one of the
    # checks above reported a problem — it marks the end of processing rather
    # than a verified "OK" state.
    append_to_file(f"{markdown_file} is OK.")
    append_to_file(separator)
        
def delete_sync_conflict_files(directory="/home/yourUserName/yourWebsite/content/"):
    """
    Remove every file whose name contains 'sync-conflict' under a directory tree.

    The tree is walked recursively with os.walk. Each removal (or failure to
    remove) is reported on the console; a failure does not stop the walk.

    Args:
    - directory (str): Root of the directory tree to clean up.
    """
    # Lazily produce the full path of each conflicting file while walking the tree.
    conflict_paths = (
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(directory)
        for name in names
        if "sync-conflict" in name
    )

    for target in conflict_paths:
        try:
            os.remove(target)
            print(f"Deleted file: {target}")
        except Exception as err:
            # Report the problem and carry on with the remaining files.
            print(f"Error deleting file {target}: {err}")


if __name__ == "__main__":	
    # Script entry point: runs only when the file is executed directly.
    # Runs the shell command `clear` (clears the terminal on Unix-like systems).
    os.system("clear")
    cleanLog() # Clean the log, so we start with a clean log file.
    delete_sync_conflict_files() # Delete files with '*sync-conflict*' in their names within the specified directory.
    # NOTE(review): nothing visible here calls check_markdown or check_all_files,
    # so no page is actually checked — the listing may be truncated; confirm.

"""
File: checkMarkdownImage.py
Author: Máximo Núñez Alarcón
Description: This code monitors your Hugo's markdown files' images. 
check_image_size(markdown_file, image_path, max_size_kb = 2024): given a Markdown file and an image's path,
check the image's size against a maximum size limit. If the size exceeds the limit, the image is compressed.
check_image_ok(markdown_file, image_path): check that the image can be opened without errors and also check its size (check_image_size).
check_images_markdown(markdown_file, images): check that the "images" belonging to the Markdown file are present and accessible.
extract_images_from_markdown_article(markdown_file): extract the image paths listed in the "image:" fields of a Markdown file.
extract_images_from_markdown(markdown_file): extract all the images referenced by a Markdown file.
"""

import re, os, sys
import pathlib
from checkMarkdownUtil import value_field_from_markdown, filter_code_shortcodes_blocks, is_markdown_article, markdown_to_localwebhtml
from util import append_to_file, check_manually
from PIL import Image
from compressImage import compress_image

def check_image_size(markdown_file, image_path, max_size_kb = 2024):
    '''Tell whether an image file stays within a size limit, compressing it when it does not.

    The file size is read from disk and converted to kilobytes. When it is
    above the limit, the event is logged and printed, compress_image is
    called on the image, and the path is appended to
    "assets/imagestoobig.txt".

    Args:
    - markdown_file: The markdown file where the image is hosted (used in messages only).
    - image_path: The path of the image.
    - max_size_kb: The maximum size allowed, in kilobytes.

    Returns:
    - True if the image size does not exceed max_size_kb.
    - False if it exceeds the limit, or if any step raised an exception
      (the exception is logged and printed, not propagated).'''

    try:
        # Size on disk, bytes -> kilobytes.
        size_kb = os.path.getsize(image_path) / 1024

        # Guard clause: nothing else to do for images within the limit.
        if not size_kb > max_size_kb:
            return True

        notice = f"{image_path} with {size_kb:.2f} KB in {markdown_file} exceeds the {max_size_kb} limit."
        append_to_file(notice)
        print(notice)

        # Shrink the oversized image, then keep a record of which image was too big.
        compress_image(image_path)
        with open("assets/imagestoobig.txt", 'a') as oversized_log:
            oversized_log.write(str(image_path) + "\n")

        return False
    except Exception as err:
        # Any failure above (size lookup, compression, record keeping) ends up here.
        failure = f"Error opening image {image_path} in {markdown_file}. Error: {err}"
        append_to_file(failure)
        print(failure)
        return False

def check_image_ok(markdown_file, image_path):
    """
    Check that the image at the given path can be opened, then check its size.

    The image is opened with PIL and closed again immediately. Per the Pillow
    documentation, Image.open only identifies the file and does not decode the
    pixel data. If that succeeds, the result of check_image_size is returned.

    Args:
    - markdown_file: The Markdown file the image belongs to; only used in the
      messages, to help locate the problem.
    - image_path (str or pathlib.Path): The path to the image file.

    Returns:
    - bool: False if the image cannot be opened; otherwise whatever
      check_image_size returns (True when the size is within its limit).
    """
    try:
        # The context manager releases the file handle even if opening
        # succeeds but something fails afterwards.
        with Image.open(image_path):
            pass
    except Exception as e:
        # os.path.abspath accepts both str and Path; calling
        # image_path.absolute() here would raise AttributeError for a str
        # path and crash inside the error handler.
        message = f"Error in {markdown_file}, opening image '{os.path.abspath(image_path)}': {e}"
        append_to_file(message)
        print(message)
        return False

    # Kept outside the try block: check_image_size reports its own errors,
    # which should not be mislabelled as "opening image" failures.
    return check_image_size(markdown_file, image_path)

def check_images_markdown(markdown_file, images,
                          content_root="/home/yourUserName/yourWebsite/content",
                          static_root="/home/yourUserName/yourWebsite/static"):
    """
    Check if "images" belonging to the Markdown file are present and accessible.

    Each image is looked up in three places, in this order:
      1. the image path joined to the Markdown file's path and resolved,
      2. content_root + the absolute form of the image path,
      3. static_root + the absolute form of the image path.
    The first location that exists is validated with check_image_ok. If none
    exists, the problem is logged, printed and handed to check_manually.

    Parameters:
        markdown_file (str): The path to the Markdown file.
        images (list): List of image paths/URLs as written in the Markdown file.
        content_root (str): Directory holding the site's Markdown content.
        static_root (str): Directory holding the site's static files.

    Returns:
        None
    """
    # NOTE(review): this is the path of the Markdown *file*, not of its parent
    # directory, so "images/x.png" is looked up under ".../page.md/images/x.png"
    # while "../images/x.png" resolves next to the file. Left unchanged because
    # it may deliberately mirror the URL layout of the generated site — confirm.
    markdown_dir = pathlib.Path(markdown_file)

    # Iterate over each image in the list
    for img in images:
        # Absolute, normalized form of the image path. For a site-root path
        # such as "/en/images/x.png" this is the path itself; a relative path
        # is made absolute against the current working directory.
        site_relative = os.path.abspath(img)

        # All candidates are pathlib.Path objects so that check_image_ok
        # always receives the same type, whichever location matched.
        candidates = (
            (markdown_dir / img).resolve(),
            pathlib.Path(content_root + site_relative),
            pathlib.Path(static_root + site_relative),
        )
        image_path = next((c for c in candidates if os.path.exists(c)), None)

        if image_path is not None:
            # Found (including under static_root, which previously was located
            # but never validated).
            check_image_ok(markdown_file, image_path)
            continue

        # Not found anywhere: report the last location tried.
        missing_path = static_root + site_relative
        append_to_file("Error: Image file not found!")
        append_to_file(f"{markdown_file} : {missing_path}")
        print(f"Image Not found in {markdown_file}: {missing_path}")
        check_manually(missing_path, markdown_file, markdown_to_localwebhtml(markdown_file))

def extract_images_from_markdown_article(markdown_file):
    """
    Extract the image paths declared in "image:" fields of a Markdown file.

    Every line of the file that starts (after optional indentation) with
    "image:" contributes the rest of that line, stripped of surrounding
    whitespace. Lines with an empty "image:" field are skipped. The whole
    file is scanned, not only its front matter, and keys such as
    "featured_image:" do not match.

    Parameters:
        markdown_file (str): Path to the Markdown file (read as UTF-8).

    Returns:
        list: List of paths of images, in order of appearance.
    """
    # [ \t]* instead of \s*: \s also matches newlines, which made an empty
    # "image:" field swallow the line break and capture the NEXT line
    # (e.g. "link: /b/") as if it were an image path.
    image_pattern = re.compile(r'^[ \t]*image:[ \t]*(.*)$', re.MULTILINE)

    # Explicit encoding so non-ASCII front matter (author names, titles)
    # does not depend on the system locale.
    with open(markdown_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # NOTE(review): values are returned as written; surrounding quotes
    # (image: "/x.png") are not removed — confirm whether the content uses them.
    stripped_values = (value.strip() for value in image_pattern.findall(content))
    return [path for path in stripped_values if path]

def extract_images_from_markdown(markdown_file):
    '''Extract all images from a markdown_file.

    Two sources are combined:
    - if is_markdown_article(markdown_file) is true, the "image:" fields
      returned by extract_images_from_markdown_article;
    - every inline image written as ![alt text](image_url) in the file's
      content, after filter_code_shortcodes_blocks has been applied to it.

    Args:
    - markdown_file (str): The markdown_file from which to retrieve its images.

    Returns:
    - list: A list of all its images. If the file cannot be read or filtered,
      the problem is printed and the images collected so far are returned.'''

    # Example of the front matter of an "article" page (a list of articles):
    #
    #   title: "Apocalipsis"
    #   date: 2022-04-02T07:12:39+02:00
    #   draft: false
    #   author: Máximo Núñez Alarcón
    #   description: Don't you have more than enough reading pleasure? ...
    #   featured_image: /en/library/images/clasicos-ingles-8.jpg
    #   keywords: free ebook,martha and the twins,read marta and the twins, ...
    #   language: en
    #   articles:
    #       - title: Apocalipsis. 6=0
    #       link: /library/Apocalipsis1/
    #       image: /en/library/images/Apocalipsis1.png
    #       - title: I'm sorry, Andrew
    #       link: /library/Apocalipsis2/
    #       image: /en/library/images/Apocalipsis2.png
    if is_markdown_article(markdown_file):
        # Start from the images declared in the "image:" fields.
        images = extract_images_from_markdown_article(markdown_file)
    else:
        images = []

    # Regular expression pattern to match image markdown syntax ![alt text](image_url).
    # The alt text is matched non-greedily: a greedy ".*" spans from the first
    # "![" to the last "](" on the line, so "![a](x.png) ![b](y.png)" yielded
    # only y.png and "[![badge](img.svg)](link)" yielded "link".
    # NOTE(review): an optional title, as in ![alt](url "title"), is still
    # captured together with the URL.
    img_pattern = r"!\[.*?\]\((.*?)\)"

    try:
        # Explicit encoding so the result does not depend on the system locale.
        with open(markdown_file, 'r', encoding='utf-8') as f:
            unfiltered_content = f.read()
        content = filter_code_shortcodes_blocks(unfiltered_content)

        # Add each matched image URL to the list
        images.extend(re.findall(img_pattern, content))
    except Exception as e:
        # Best effort: report the problem and return what was collected so far.
        print(f"Exception trying to open {markdown_file}: {e}")
    return images


if __name__ == "__main__":	
    # Manual run on one sample page (placeholder path). The returned list of
    # images is discarded, so only messages printed by the callee are visible.
    extract_images_from_markdown("/home/yourUserName/yourWebsite/content/en/library/iwantmore/index.md")

Maximize your online presence with our exclusive offer: Get a stunning hero banner, the hero you need and deserve, at an unbeatable price! Bew, 689282782, bupparchard@gmail.com

Related Posts