JustToThePoint English Website Version
JustToThePoint en español
Collaborate with us

Check your Markdown pages

There are two ways to do great mathematics. The first is to be smarter than everybody else. The second way is to be stupider than everybody else — but persistent, Raoul Bott.

I love authoring in Markdown as there is almost no friction - just write. This is the first of many articles where we will check all markdown files for broken or oversized images.

I don’t want to reinvent the wheel, so I use markdown-link-check, but it is far from perfect. I also want to verify that there are no broken links, that the pictures can be displayed correctly, that their sizes are appropriate, and that every front matter has all the most important fields, so I have written code that recursively reads all my Markdown files and checks them.

user@pc:~$ sudo npm install -g markdown-link-check # To install the command line tool (markdown-link-check) globally.
user@pc:~$ sudo npm install -g npm@10.6.0 # Update npm itself (the package manager) to version 10.6.0
user@pc:~$ cd /home/username/directoryMarkdown/content
user@pc:~$ find . -name \*.md -print0 | xargs -0 -n1 markdown-link-check # Check links from a local markdown folder (recursive)

Check Markdown pages

"""
File: checkmarkdown.py
Author: Máximo Núñez Alarcón
Description: This code monitors your Hugo's markdown files. 

Usage: Run the script (python checkmarkdown.py) to monitor all the Markdown pages of your Hugo site.
def check_markdown(markdown_file): Check the file with Markdown format.
delete_sync_conflict_files(directory="/home/yourUserName/yourWebsite/content/"): delete files with '*sync-conflict*' in their names within the specified directory.
"""

from util import check_all_files, printUrl, append_to_file, cleanLog
import pathlib
import os, sys
from PIL import Image
from compressImage import compress_image
import requests
from urllib.parse import urlparse
from util import check_all_files
from checkMarkdownUtil import markdown_to_localwebhtml
from checkMarkdownImage import extract_images_from_markdown, check_images_markdown
from checkMarkdownUrl import extract_urls_from_markdown, check_urls_accessible
from checkMarkdownFormat import check_frontmatter, convert_relative_links_to_absolute, check_format_markdown

mywebsite = "/home/yourUserName/yourWebsite/content/"


def check_markdown(markdown_file):
    """
    Run every check on a single Markdown file, logging progress with append_to_file.

    In order, it calls convert_relative_links_to_absolute, check_frontmatter,
    extract_images_from_markdown + check_images_markdown, and
    extract_urls_from_markdown + check_urls_accessible.

    Args:
    - markdown_file (str): Path to the Markdown file to check.

    Returns:
    - None
    """
    append_to_file("----------------------------------------------")
    append_to_file(f"Processing... {markdown_file}")
    # check_format_markdown(markdown_file) checks and modifies the formatting of a Markdown file,
    # specifically handling badly formatted images and links.
    # It is left disabled because it is very intensive; call it here if you need it.

    convert_relative_links_to_absolute(markdown_file)

    # Is the front matter OK? This is best effort: a failure is reported (with its cause)
    # and the remaining checks still run. Catching Exception rather than using a bare
    # "except:" lets Ctrl+C and SystemExit stop the script.
    try:
        check_frontmatter(markdown_file)
    except Exception as e:
        append_to_file(f"Exception processing Markdown format: {markdown_file}. Error: {e}")
        print(f"Exception processing Markdown format: {markdown_file}. Error: {e}")

    # 1. Check images
    # Extract all its images.
    images = extract_images_from_markdown(markdown_file)
    append_to_file(images)

    # Are the images belonging to the markdown_file OK?
    check_images_markdown(markdown_file, images)

    # 2. Check URLs
    # Extract all its links.
    urls = extract_urls_from_markdown(markdown_file)
    append_to_file(urls) # Log purposes (Debugging.)

    # Are all links belonging to the markdown_file accessible?
    # The returned dictionary is not needed here, so it is not stored.
    check_urls_accessible(urls, markdown_file)

    # NOTE(review): this line is written unconditionally, so it marks the end of the
    # processing rather than proving that every check passed.
    append_to_file(f"{markdown_file} is OK.")
    append_to_file("----------------------------------------------")
        
def delete_sync_conflict_files(directory="/home/yourUserName/yourWebsite/content/"):
    """
    Recursively remove every file whose name contains 'sync-conflict'.

    Each removal (or failure to remove) is reported on standard output.
    Directories are never removed, only files.

    Args:
    - directory (str): Root of the tree that is searched for conflict files.

    Returns:
    - None
    """
    for folder, _subfolders, filenames in os.walk(directory):
        # Keep only the names flagged as synchronisation conflicts.
        conflicted = [name for name in filenames if "sync-conflict" in name]
        for name in conflicted:
            target = os.path.join(folder, name)
            try:
                os.remove(target)
                print(f"Deleted file: {target}")
            except Exception as e:
                # A file that cannot be removed must not stop the walk.
                print(f"Error deleting file {target}: {e}")


# Script entry point: the statements below run only when this file is executed directly.
# NOTE(review): nothing visible here calls check_markdown or check_all_files - confirm
# whether that call exists further down or was lost.
if __name__ == "__main__":	
    os.system("clear") # Run the shell's "clear" command before printing anything.
    cleanLog() # Clean the log, so we start with a clean log file.
    delete_sync_conflict_files() # Delete files with 'sync-conflict' in their names, using the function's default directory.
"""
File: checkMarkdownImage.py
Author: Máximo Núñez Alarcón
Description: This code monitors your Hugo's markdown files' images. 
def check_image_size(markdown_file, image_path, max_size_kb = 2024): Given a markdown_file and an image's path...
checks the image's size, and determines if it's OK based on a maximum size limit. If its size exceeds the maximum limit, it compresses it.
def check_image_ok(markdown_file, image_path): check if the image can be opened without errors and also checks its size (check_image_size).
def check_images_markdown(markdown_file, images): check if "images" belonging to the Markdown file are present and accessible.
def extract_images_from_markdown_article(markdown_file): extracts the paths of images from a Markdown file.

"""
import re, os, sys
import pathlib
from checkMarkdownUtil import value_field_from_markdown, filter_code_shortcodes_blocks, is_markdown_article, markdown_to_localwebhtml
from util import append_to_file, check_manually
from PIL import Image
from compressImage import compress_image

def check_image_size(markdown_file, image_path, max_size_kb = 2024):
    ''' Tell whether an image file stays within a size limit, compressing it when it does not.

    An oversized image is reported (log and standard output), handed to compress_image,
    and its path is appended to "assets/imagestoobig.txt".

    Args:
    - markdown_file: The markdown file where the image is used (only used in messages).
    - image_path: The path of the image.
    - max_size_kb: The maximum size allowed, in kilobytes.

    Returns:
    - True if the size of image_path does not exceed max_size_kb.
    - False if it exceeds it, or if any step raised an exception.'''

    try:
        # os.path.getsize reports bytes; the limit is expressed in kilobytes.
        size_kb = os.path.getsize(image_path) / 1024
        oversized = size_kb > max_size_kb

        if oversized:
            notice = f"{image_path} with {size_kb:.2f} KB in {markdown_file} exceeds the {max_size_kb} limit."
            append_to_file(notice)
            print(notice)
            # Shrink the file, then remember that the original was too big.
            compress_image(image_path)
            with open("assets/imagestoobig.txt", 'a') as oversized_log:
                oversized_log.write(str(image_path) + "\n")

            print(f"{image_path} with {size_kb:.2f} KB in {markdown_file} exceeds the {max_size_kb} limit II.")

        return not oversized
    except Exception as e:
        # Any failure (missing file, compression error, log not writable) is reported the same way.
        failure = f"Error opening image {image_path} in {markdown_file}. Error: {e}"
        append_to_file(failure)
        print(failure)
        return False

def check_image_ok(markdown_file, image_path):
    """
    Check that the image at the given path can be opened, then check its size.

    The size check is delegated to check_image_size, which decides whether the image
    is acceptable based on a maximum size limit.

    Args:
    - markdown_file: Only used in the printed message, so that the Markdown file
      where the problem may be can be found.
    - image_path (str or pathlib.Path): The path to the image file.

    Returns:
    - bool: The result of check_image_size if the image can be opened,
      False if opening it (or the size check) raised an exception.
    """

    try:
        # The context manager closes the image again as soon as it has been opened.
        with Image.open(image_path):
            pass
        return check_image_size(markdown_file, image_path)
    except Exception as e:
        # image_path may be a plain string (check_images_markdown builds its fallback
        # paths by string concatenation), so it is wrapped in a Path before asking
        # for the absolute form; calling .absolute() on a str would raise here.
        shown_path = pathlib.Path(image_path).absolute()
        print(f"Error in {markdown_file}, opening image '{shown_path}': {e}")
        return False

def check_images_markdown(markdown_file, images,
                          content_root="/home/nmaximo7/justtothepoint/content",
                          static_root="/home/nmaximo7/justtothepoint/static"):
    """
    Check if "images" belonging to the Markdown file are present and accessible.

    Every image is looked up in three places, in this order:
    1. relative to the directory of the Markdown file (an absolute image path is used as is),
    2. under content_root,
    3. under static_root (Hugo serves the files of static/ on the site root path).
    The first location that exists is passed to check_image_ok. If none exists, the
    problem is reported with append_to_file, print and check_manually.

    Parameters:
        markdown_file (str): The path to the Markdown file.
        images (list): List of image paths, e.g. "/code/images/alexa.png".
        content_root (str): Directory holding the Hugo content files.
        static_root (str): Directory holding the Hugo static files.

    Returns:
        None
    """
    # Example: markdown_file = /home/nmaximo7/justtothepoint/content/code/hugo8.md
    # gives markdown_dir = /home/nmaximo7/justtothepoint/content/code
    markdown_dir = pathlib.Path(markdown_file).parent

    for img in images:
        # os.path.abspath leaves "/code/images/alexa.png" untouched, so it can be
        # appended to a root directory; a relative img is first made absolute
        # against the current working directory.
        rooted = os.path.abspath(img)
        candidates = (
            (markdown_dir / img).resolve(),
            content_root + rooted,
            static_root + rooted,
        )

        image_path = next((c for c in candidates if os.path.exists(c)), None)
        if image_path is not None:
            # This also covers images found under static_root, which previously
            # were located but never checked.
            check_image_ok(markdown_file, image_path)
            continue

        # Not found anywhere: report the last location that was tried.
        missing = candidates[-1]
        append_to_file("Error: Image file not found!")
        append_to_file(f"{markdown_file} : {missing}")
        print(f"Image Not found in {markdown_file}: {missing}")
        check_manually(missing, markdown_file, markdown_to_localwebhtml(markdown_file))

def extract_images_from_markdown_article(markdown_file):
    """
    Collect the values of every "image:" line of a Markdown file.

    A line counts when, after optional leading whitespace, it starts with "image:".
    Everything after the colon is taken as the path, with surrounding whitespace removed.

    Parameters:
        markdown_file (str): Path to the Markdown file.

    Returns:
        list: The image paths, in the order they appear in the file.
    """
    with open(markdown_file, 'r') as handle:
        text = handle.read()

    # MULTILINE makes ^ and $ work on every line rather than on the whole text.
    image_line = re.compile(r'^\s*image:\s*(.*)$', re.MULTILINE)
    return [found.strip() for found in image_line.findall(text)]

def extract_images_from_markdown(markdown_file):
    '''Extract all images from a markdown_file.

    Two sources are combined:
    1. If is_markdown_article says the file is an article, the "image:" fields
       returned by extract_images_from_markdown_article, e.g. in a front matter like
           articles:
               - title: Apocalipsis. 6=0
               link: /library/Apocalipsis1/
               image: /en/library/images/Apocalipsis1.png
    2. Every inline image ![alt text](image_url) found in the content once
       filter_code_shortcodes_blocks has been applied to it.

    Args:
    - markdown_file (str): The markdown_file from which to retrieve its images.

    Returns:
    - list: A list of all its images. If the file cannot be read or filtered, the
      problem is printed and only the images of source 1 are returned.'''
    if is_markdown_article(markdown_file):
        images = extract_images_from_markdown_article(markdown_file)
    else:
        images = []

    # ![alt text](image_url). The alt text may not run past its closing bracket
    # (one level of nested [..] is allowed), so several images on one line are all
    # found and a later [link](url) on the same line is not mistaken for an image.
    img_pattern = r"!\[(?:[^\[\]\n]|\[[^\[\]\n]*\])*\]\((.*?)\)"

    try:
        with open(markdown_file, 'r') as f:
            content = filter_code_shortcodes_blocks(f.read())
    except Exception as e:
        # Best effort: report the cause and keep whatever was already collected.
        print(f"Exception trying to open {markdown_file}: {e}")
    else:
        images.extend(re.findall(img_pattern, content))
    return images


# Manual run: extracts the images of one hard-coded Markdown file when this module is
# executed directly. The returned list is discarded, so nothing is shown unless
# extract_images_from_markdown itself prints an error.
if __name__ == "__main__":	
    extract_images_from_markdown("/home/nmaximo7/justtothepoint/content/en/library/iwantmore/index.md")
'''
File: checkMarkdownUrl.py
Author: Máximo Núñez Alarcón
Description: Python functions to handle URLs in Markdown files:
* replace_url(content): Replace URLs starting with 'https://justtothepoint.com/' or 'http://justtothepoint.com/' with 'http://localhost:1313/'.
* extract_urls_from_markdown_article(markdown_file): Extracts the paths of urls from a Markdown file that is an article.
* extract_urls_from_markdown(markdown_file): Extract all urls from a markdown_file. 
* scrape_website(url, api_key, markdown_file, search_locally = True): scrape a website (by default, locally) using the ScrapingAnt API, free for personal use.
* check_absolute_url(url, markdown_file): check if an absolute URL is accessible (calling scrape_website).
If the URL is not accessible, remove the link from the markdown and let the user check manually what the problem is.
* check_urls_accessible(urls, markdown_file): check if the URLs in the list are accessible. The URLs may not be absolute urls.
It basically constructs absolute urls and call check_absolute_url to check if the urls are accessible.
'''

from checkMarkdownUtil import value_field_from_markdown, filter_code_shortcodes_blocks, is_markdown_article, get_markdown_real_content, markdown_to_localwebhtml, remove_link_from_markdown
import re
import os
import requests
from urllib.parse import urlparse, urljoin
from util import check_manually, append_to_file
import time

def replace_url(content):
    """
    Point every link to the public site at the local Hugo server instead.

    Both 'https://justtothepoint.com/' and 'http://justtothepoint.com/' are rewritten
    to 'http://localhost:1313/'; any other text is left untouched.

    Parameters:
        content (str): The content containing URLs.

    Returns:
        str: The content with URLs replaced.
    """
    site_root = re.compile(r'http://justtothepoint\.com/|https://justtothepoint\.com/')
    return site_root.sub('http://localhost:1313/', content)


def extract_urls_from_markdown_article(markdown_file):
    """
    Collect the values of the "link:", "quizlink:" and "infolink:" lines of a Markdown file.

    These fields appear in the front matter of article pages, for example:
        books2:
            - title: Una mañana en el campanario
            link: /library/Los5MagosL1C1Es/index.html
            image: https://justtothepoint.com/myImages/los-cinco-aprendices-de-mago-1-1.jpg
            quizlink: https://justtothepoint.com/quizzes/quizLos5Aprend1b1Es.html
            infolink: https://es.wikipedia.org/wiki/Nostradamus

    Parameters:
        markdown_file (str): Path to the Markdown file.

    Returns:
        list: All "link:" values first, then all "quizlink:" values, then all
        "infolink:" values, each group in file order.
    """
    with open(markdown_file, 'r') as handle:
        text = handle.read()

    # One pattern per field; each field name must be the first thing on its line
    # after optional whitespace.
    field_patterns = (
        re.compile(r'^\s*link:\s*(.*)$', re.MULTILINE),
        re.compile(r'^\s*quizlink:\s*(.*)$', re.MULTILINE),
        re.compile(r'^\s*infolink:\s*(.*)$', re.MULTILINE),
    )

    # Flatten the per-field results into a single list of urls.
    return [value for pattern in field_patterns for value in pattern.findall(text)]


def extract_images_with_markdown(markdown_file):
    """
    Return every link or image of a Markdown file, Markdown syntax included.

    Unlike the functions that return only the target, each item here is the whole
    "[text](target)" or "![alt](target)" fragment. The fragments are also written
    to the log with append_to_file.

    Parameters:
    - markdown_file (str): Path to the Markdown file to scan.

    Returns:
    - list: The matched Markdown fragments, in file order.
    """
    append_to_file(f"extract_images_from_markdown")

    with open(markdown_file, 'r') as handle:
        text = handle.read()

    # "!*" makes the leading exclamation mark optional, so links match as well as images.
    fragments = re.compile(r"!*\[.*?\]\(.*?\)").findall(text)
    append_to_file(fragments)
    return fragments

def extract_urls_from_markdown(markdown_file):
    '''Extract all urls from a markdown_file, leaving out the targets of images.

    If is_markdown_article says the file is an article, the urls returned by
    extract_urls_from_markdown_article come first. They are followed by the target
    of every [link text](link_url) found in the content returned by
    get_markdown_real_content, except targets that are also used by an image
    ![alt text](image_url) somewhere in that content.

    Args:
    - markdown_file (str): The markdown_file from which to retrieve its urls.

    Returns:
    - list: A list of all its urls.'''
    if is_markdown_article(markdown_file):
        urls = extract_urls_from_markdown_article(markdown_file)
    else:
        urls = []

    # Text between square brackets. It may not run past its closing bracket, so
    # "[a](x) and [b](y)" yields both x and y instead of only y. One level of nested
    # brackets is allowed, which keeps linked images such as [![alt](img)](url)
    # resolving to url.
    bracket_text = r"\[(?:[^\[\]\n]|\[[^\[\]\n]*\])*\]"

    # Regular expression pattern to match URL markdown syntax [link text](link_url)
    url_pattern = bracket_text + r"\((.*?)\)"

    # Regular expression pattern to match image markdown syntax ![alt text](image_url)
    image_pattern = "!" + url_pattern

    # Get the markdown_file real content
    content = get_markdown_real_content(markdown_file)

    # The link pattern also matches the "[alt](target)" part of an image, so image
    # targets are collected first and filtered out. A set keeps the lookup cheap.
    image_targets = set(re.findall(image_pattern, content))
    urls.extend(url for url in re.findall(url_pattern, content) if url not in image_targets)

    return urls

def extract_urls2_from_markdown(markdown_file):
    '''Extract all urls and images from a markdown_file.

    If is_markdown_article says the file is an article, the urls returned by
    extract_urls_from_markdown_article come first. They are followed by the target
    of every [text](target) found in the content returned by
    get_markdown_real_content. No filtering is done, so the target of an image
    ![alt](target) is included too.

    Args:
    - markdown_file (str): The markdown_file from which to retrieve its urls.

    Returns:
    - list: A list of all its urls.'''
    if is_markdown_article(markdown_file):
        urls = extract_urls_from_markdown_article(markdown_file)
    else:
        urls = []

    # Regular expression pattern to match URL markdown syntax [link text](link_url).
    # The link text may not run past its closing bracket (one level of nested [..]
    # is allowed), so a line with several links yields every target, not only the last.
    url_pattern = r"\[(?:[^\[\]\n]|\[[^\[\]\n]*\])*\]\((.*?)\)"

    # Get the markdown_file real content
    content = get_markdown_real_content(markdown_file)

    # Find all matches of the URL pattern in the content
    urls.extend(re.findall(url_pattern, content))

    return urls


def scrape_website(url, api_key, markdown_file, search_locally = True):
    """
    Request a URL and return the HTTP status code that was obtained.

    A direct HEAD request is tried first. If its status code is not 200, the function
    waits three seconds and asks the ScrapingAnt API (free for personal use) for the page.
    Background: proxies are an anti-block measure for web scraping. Normally, your request goes
    directly from your computer to the target server; with a proxy, the proxy server transfers
    your request to the desired website and returns the response.
    Credits: scrapingant.com, Python Requests Proxy Ultimate Guide
    https://scrapingant.com/blog/python-requests-proxy
    ZenRows.com, How to Use a Proxy with Python Requests in 2024. Idea: Rotate IPs with a Free Solution.

    Parameters:
    - url (str): The URL to check.
    - api_key (str): Your ScrapingAnt API key
    - markdown_file (str): The Markdown file that contains the URL; only used in the printed messages.
    - search_locally (bool): If True, replace_url is applied to url first, so links to the
      public site are requested from http://localhost:1313/ instead.

    Returns:
    - int: The status code of the last response received,
      or 505 if one of the requests exceptions handled below was raised.
    """
    if search_locally:
        url = replace_url(url)
    # ZenRows.com, Bypass Error 403 Forbidden in Web Scraping
    # Set a fake user agent, i.e., a string sent by web clients with every request to identify themselves to the web server.
    # We change our headers to look like a regular browser. These headers are used for the direct request.
    headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36', # This is a Chrome User Agent
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'en-GB,en;q=0.9,es-ES;q=0.8,es;q=0.7,th-TH;q=0.6,th;q=0.5,en-US;q=0.4,en-NU;q=0.3' # https://myhttpheader.com
    }
    
    # Shorter header set, used for the request sent to the ScrapingAnt API.
    headers2 = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
    }

    # The API receives the target URL and the key as query parameters.
    # NOTE(review): when search_locally is True the URL sent here may be a localhost one,
    # which an external service presumably cannot reach - confirm this is intended.
    api_url = "https://api.scrapingant.com/v2/general"
    params = {
        'url': url,
        'x-api-key': api_key
    }
    
    try:
        # First, we try a normal request
        response = requests.head(url, headers=headers, allow_redirects=True, timeout=5)

        if (response.status_code!=200):
            time.sleep(3)
            # For instance, a 405 (Method Not Allowed) error typically occurs when the server receives a request method that it does not support for the target URL.
            print(f"{url} in {markdown_file} has an error in our first attempt!")
            # Next, we scrape it using the ScrapingAnt API
            response = requests.get(api_url, params=params, headers=headers2, allow_redirects=True, timeout=5)

        return response.status_code

    # The handlers go from the most specific exceptions to the generic RequestException;
    # every one of them only reports the problem and falls through to "return 505".
    except requests.exceptions.HTTPError as errh:
        print(f"{url} in {markdown_file} has an exception. Http Error: {errh}")
        print ("Http Error:", errh)
    except requests.exceptions.ConnectionError as errc:
        print(f"{url} in {markdown_file} has an exception. Error Connecting: {errc}")
        print ("Error Connecting:", errc)
    except requests.exceptions.Timeout as errt:
        print(f"{url} in {markdown_file} has an exception. Timeout Error: {errt}")
        print ("Timeout Error:", errt)
    except requests.exceptions.RequestException as err:
        print(f"{url} in {markdown_file} has an exception. Something else {err}")
        
    # Reached only after a handled exception: 505 is the value returned when no response was obtained.
    return 505

def check_absolute_url(url, markdown_file):
    """
    Check if an absolute URL is accessible (calling scrape_website).

    The API key handed to scrape_website is read from the SCRAPINGBEE environment variable.
    If the URL does not answer with status 200, the status is printed, the link is removed
    from the Markdown file (remove_link_from_markdown) and check_manually is called so that
    the user can find out what the problem is.

    Parameters:
        url (str): The absolute URL to check.
        markdown_file (str): The path to the markdown file where the link is.

    Returns:
        200 if the URL is accessible, None otherwise.
    """
    status = scrape_website(url, os.getenv("SCRAPINGBEE"), markdown_file)
    if status == 200:
        return 200

    # Anything other than 200 is treated as a broken link.
    print(f"check_absolute_url, {url}, {status}")
    remove_link_from_markdown(markdown_file, url)
    check_manually(url, markdown_file, markdown_to_localwebhtml(markdown_file))
    return None

def check_urls_accessible(urls, markdown_file):
    """
    Check the accessibility of every URL of a list; the URLs may be relative.

    A URL that does not start with http:// or https:// is treated as relative to the
    local server http://localhost:1313/, after adding a trailing slash unless it already
    ends with "/" or "html". The actual check is done by check_absolute_url.
    It is called by check_markdown(markdown_file) in checkmarkdown.py.

    Parameters:
        urls (list): A list of URLs to check.
        markdown_file (str): The path to the markdown file where the urls are.

    Returns:
        dict: Maps each URL (relative ones with the trailing slash that may have been
        added) to the value returned by check_absolute_url for it.
    """
    results = {}

    for link in urls:
        if link.startswith(('http://', 'https://')):
            # Already absolute: use it as is.
            target = link
        else:
            if not link.endswith(("/", "html")):
                link += '/'
            target = urljoin("http://localhost:1313/", link)

        results[link] = check_absolute_url(target, markdown_file)

    return results
'''
File: checkMarkdownUtil.py
Author: Máximo Núñez Alarcón
Description: Useful Python functions to handle Markdown files:
* value_field_from_markdown(markdown_file, field): Get whether the "field" is in the frontmatter...
... and its value of the Markdown file passed as a parameter.
* filter_code_shortcodes_blocks(content): filter out code blocks and shortcodes from Markdown content.
* is_markdown_article(markdown_file): check if the Markdown file given as a parameter represents an article.
An article is a Markdown file that has in its front matter a field of type articles, books, books2 or cards.
* get_markdown_real_content(markdown_file): get the actual content of the Markdown file after filtering out pieces of code and its shortcodes.
* remove_link_from_markdown(markdown_file, url): remove the link "url" from "markdown_file".
* markdown_to_localwebhtml(markdown_file, path_markdown_content = "/home/nmaximo7/justtothepoint/content/", localserver = "http://localhost:1313/"):
Convert the Markdown file path to its corresponding local web URL.
'''

import re
from dotenv import load_dotenv
from util import append_to_file

def value_field_from_markdown(markdown_file, field):
    """
    Look for a "field: value" line in a Markdown file and return its value.

    The first line of the file that starts with "<field>: " is used; the search runs
    over the whole file content.

    Parameters:
        markdown_file (str): Path to the Markdown file.
        field (str): Name of the field to be retrieved.

    Returns:
        tuple: (True, value) when the field is found, where value is the rest of the line;
        (False, None) when it is not.
    """
    # MULTILINE lets ^ match at the beginning of every line.
    field_line = re.compile(rf'^{field}: (.*)', re.MULTILINE)

    with open(markdown_file, 'r') as handle:
        text = handle.read()

    found = field_line.search(text)
    if found is None:
        return False, None
    return True, found.group(1)
        # The field has not been found

def is_markdown_article(markdown_file):
    """
    Tell whether a Markdown file is an "article", i.e. a page that lists other pages.

    The file must start with a front matter (delimited by '---') that:
    - contains the fields title, date, author, description, featured_image and language;
    - contains one of the list fields articles, books, books2 or cards;
    - from that list field onwards, contains "- title:", "link:" and "image:".

    Parameters:
        markdown_file (str): Path to the Markdown file.

    Returns:
        bool: True if the Markdown file represents an article, False otherwise.
    """
    with open(markdown_file, 'r') as handle:
        text = handle.read()

    # No front matter, no article.
    if not text.startswith('---'):
        return False

    # The front matter is what lies between the first two '---' markers.
    front_matter = text.split('---')[1].strip()

    required_fields = ("title", "date", "author", "description", "featured_image", "language")
    if any(f"{name}:" not in front_matter for name in required_fields):
        return False

    # Locate the list field, trying the accepted names in order of preference.
    for list_key in ("articles:", "books:", "books2:", "cards:"):
        start = front_matter.find(list_key)
        if start != -1:
            break
    else:
        return False

    # Every entry of the list is expected to carry a title, a link and an image.
    entries = front_matter[start:]
    return all(marker in entries for marker in ("- title:", "link:", "image:"))
        
def get_markdown_real_content(markdown_file):
    """
    Read a Markdown file and return its text once it has been passed through
    filter_code_shortcodes_blocks (which is meant to drop code blocks and shortcodes).

    Parameters:
        markdown_file (str): Path to the Markdown file.

    Returns:
        str: The filtered Markdown content.
    """
    # Load the whole file first; the filtering works on the complete text
    with open(markdown_file, 'r') as source:
        raw_text = source.read()

    return filter_code_shortcodes_blocks(raw_text)

def remove_link_from_markdown(markdown_file, url):
    """
    Strip the Markdown link syntax pointing to "url" from "markdown_file", keeping the link text.

    Every occurrence of [name](url) is replaced by name. Each modified line is
    reported on standard output, and the file is rewritten in place afterwards.

    Parameters:
        markdown_file (str): Path to the Markdown file.
        url (str): URL for which to remove the link syntax.

    Returns:
        None
    """
    # The URL is matched literally, so its regex metacharacters are escaped first
    escaped_url = re.escape(url)

    # Group 1 captures the link text found between the square brackets
    link_regex = re.compile(rf'\[([^\]]+)\]\({escaped_url}\)')

    rewritten = []
    with open(markdown_file, 'r') as source:
        for original in source:
            # r'\1' keeps only the captured link text; hits counts the replacements made
            replaced, hits = link_regex.subn(r'\1', original)
            if hits == 0:
                # Nothing to remove on this line: keep it untouched
                rewritten.append(original)
                continue

            print("Remove link form markdown")
            print(f"Line being replace: {original}")
            print(f"New line: {replaced}")
            rewritten.append(replaced)

    # Persist the result over the original file
    with open(markdown_file, 'w') as target:
        target.writelines(rewritten)

def markdown_to_localwebhtml(markdown_file, path_markdown_content = "/home/nmaximo7/justtothepoint/content/", localserver = "http://localhost:1313/"):
    """
    Convert the Markdown file path to its corresponding local web URL.

    Parameters:
        markdown_file (str): The path to the Markdown file.
        path_markdown_content (str): The base directory path where the Markdown content is stored, by default is located at "/home/nmaximo7/justtothepoint/content/".
        localserver (str): The local server URL.
        By default, hugo server will create a server running on localhost port 1313, i.e., "http://localhost:1313/".

    Returns:
        str: The local web URL corresponding to the Markdown file, in lower case.
        Files named "index.md" or "_index.md" map to the URL of their directory.
    """

    # Swap the content directory for the server URL, e.g.
    # "/home/nmaximo7/justtothepoint/content/maths/stoketheorem.md" -> "http://localhost:1313/maths/stoketheorem.md"
    webpage = markdown_file.replace(path_markdown_content, localserver)

    # Drop the ".md" extension, but only when it is really there
    if webpage.lower().endswith(".md"):
        webpage = webpage[:-3]

    # Only a file whose whole name is "index" or "_index" stands for its directory;
    # comparing the full name keeps pages such as "reindex.md" intact and
    # avoids leaving a dangling "_" for "_index.md".
    page_name = webpage.rsplit("/", 1)[-1]
    if page_name in ("index", "_index"):
        webpage = webpage[:len(webpage) - len(page_name)]

    return webpage.lower()


def cleanup_image_links(markdown_text):
    """
    Normalise the leading exclamation mark of a Markdown image or link.

    Any run of "!" in front of a [text](target) construct is removed first.
    If the resulting text ends with an image extension followed by ")"
    (.png, .jpg, .jpeg or .gif), exactly one "!" is put back at the very
    beginning so the text reads as an image. Each step is logged through append_to_file.

    Parameters:
    - markdown_text (str): The Markdown text containing images or links.

    Returns:
    - str: The Markdown text with its exclamation marks normalised.
    """
    append_to_file(f"cleanup_image_links: {markdown_text}")

    # Rewrite every "!...![text](target)" as a plain "[text](target)"
    as_plain_links = re.sub(r"!*\[([^\]]*)\]\((.*?)\)", r"[\1](\2)", markdown_text)

    # Text that does not end in an image target stays a plain link
    if not as_plain_links.endswith(('.png)', '.jpg)', '.jpeg)', '.gif)')):
        return as_plain_links

    append_to_file("Image cleanup")
    if as_plain_links.startswith("!"):
        # Make sure that only one exclamation tag is present
        as_image = "!" + as_plain_links.strip("!")
    else:
        # Add the exclamation tag back, since it was removed above
        as_image = "!" + as_plain_links

    append_to_file(as_image)
    return as_image
"""
File: checkMarkdownFormat.py
Author: Máximo Núñez Alarcón
Description: This code checks the format of your Hugo Markdown files.
"""
from checkMarkdownUtil import filter_code_shortcodes_blocks, value_field_from_markdown, markdown_to_localwebhtml, cleanup_image_links
from naturalProcessing import extract_keywords
from util import check_manually, append_to_file
import re, os, yaml
from dotenv import load_dotenv
from checkMarkdownUrl import extract_urls_from_markdown, extract_urls2_from_markdown, extract_images_with_markdown
      
def check_frontmatter(markdown_file):
    """
    Validate the front matter of a Markdown file.

    The checks run in this order, stopping at the first failure:
      1. The first line is the '---' opening delimiter (an empty file fails here).
      2. A later line is the '---' closing delimiter.
      3. The front matter has at least two lines (skipped for "_index.md" files).
      4. The front matter mentions title, date, featured_image, description,
         categories and keywords (skipped for "_index.md" files).

    Every failure is printed and reported through check_manually. When only
    "keywords" is missing, keywords are generated with set_keywords, passed to
    add_keywords, and the generated list is what gets reported; the function
    still returns False in that case.

    Parameters:
        markdown_file (str): Path to the Markdown file.

    Returns:
        bool: True if the front matter passes every check, False otherwise.
    """
    with open(markdown_file, 'r') as file:
        lines = file.readlines()

    # An empty file has no first line at all: report it as a missing opening
    # delimiter instead of failing with an IndexError on lines[0].
    if not lines or lines[0].strip() != '---':
        print("Error: Frontmatter delimiter '---' is missing as the first line.")
        check_manually("Frontmatter delimiter '---' is missing as the first line.", markdown_file, markdown_to_localwebhtml(markdown_file))
        return False

    # Index of the closing "---" delimiter, or None when there is none
    end_frontmatter_index = next(
        (i for i, line in enumerate(lines[1:], start=1) if line.strip() == '---'),
        None,
    )

    if end_frontmatter_index is None:
        check_manually("Closing delimiter '---' is missing", markdown_file, markdown_to_localwebhtml(markdown_file))
        print("Error: Closing '---' delimiter is missing.")
        return False

    # "_index.md" files are exempt from the size and field requirements below
    if markdown_file.endswith("_index.md"):
        return True

    if end_frontmatter_index <= 2:
        print("Error: Frontmatter should have at least two lines.")
        check_manually("Frontmatter should have at least two lines.", markdown_file, markdown_to_localwebhtml(markdown_file))
        return False

    # Look for each expected field name inside the front matter text
    frontmatter_content = ''.join(lines[1:end_frontmatter_index])
    frontmatter_fields = ['title', 'date', 'featured_image', 'description', 'categories', 'keywords']
    for field in frontmatter_fields:
        if field in frontmatter_content:
            continue

        error = f"Error: '{field}' is missing"
        if field == "keywords":
            # Missing keywords are generated from the page content and the
            # generated list replaces the error text in the report.
            keywords = set_keywords(markdown_file)
            add_keywords(markdown_file, keywords)
            error = ','.join(keywords)

        check_manually(error, markdown_file, markdown_to_localwebhtml(markdown_file))
        print(f"Error: '{field}' is missing in the frontmatter in {markdown_file}.")
        return False

    return True

def set_keywords(markdown_file):
    """
    Compute keywords for a Markdown file from its body text.

    The file text is passed through filter_code_shortcodes_blocks, its leading
    front matter is stripped, and the remainder is handed to extract_keywords
    together with the value of the file's "language" field.

    Parameters:
        markdown_file (str): Path to the Markdown file.

    Returns:
        Whatever extract_keywords returns for the body text and language.
    """
    with open(markdown_file, 'r') as source:
        raw_text = source.read()

    # Leave out code blocks and shortcodes before looking for keywords
    filtered_text = filter_code_shortcodes_blocks(raw_text)

    # Drop the leading "---" ... "---" front matter so only the body remains
    body = re.sub(r'^---\n(.*?)\n---\n', '', filtered_text, flags=re.DOTALL)

    # Element [1] of the lookup result is the field's value
    language = value_field_from_markdown(markdown_file, "language")[1]
    return extract_keywords(body, language)

def check_format_markdown(markdown_file):
    """
    Repair badly formatted image links of a Markdown file, in place.

    The image links reported by extract_images_with_markdown are each
    normalised with cleanup_image_links, every occurrence of the original
    link text is swapped for its cleaned-up form, and the file is rewritten.

    Parameters:
    - markdown_file (str): The path to the Markdown file to be checked and modified.

    Returns:
    - None
    """
    append_to_file("check_format_markdown")
    image_links = extract_images_with_markdown(markdown_file)

    # Load the current text of the page
    with open(markdown_file, 'r') as source:
        text = source.read()

    # Swap each link for its normalised version
    for raw_link in image_links:
        text = text.replace(raw_link, cleanup_image_links(raw_link))

    # Save the result over the original file
    with open(markdown_file, 'w') as target:
        target.write(text)

def convert_relative_links_to_absolute(markdown_file):
    """
    Rewrite the links of a Markdown file as site-root-relative paths, in place.

    The links come from extract_urls2_from_markdown and are handled as follows:
      - A link starting with the WEBSITE (or WEBSITE_INSECURE) URL loses that
        prefix, e.g. "https://justtothepoint.com/selfhelp/autoayuda/" becomes
        "/selfhelp/autoayuda/".
      - A link starting with ".." has its first three characters dropped and is
        anchored at the directory of the Markdown file, relative to
        HUGO_CONTENT_DIR, e.g. "../images/myImage.png" in "maths/page.md"
        becomes "/maths/images/myImage.png".
      - Any other link is left untouched.

    WEBSITE, WEBSITE_INSECURE and HUGO_CONTENT_DIR are read from the environment
    after loading the .env file. The file is read once and written back once,
    and only when at least one replacement changed its content.

    Parameters:
    - markdown_file (str): Path to the Markdown file to rewrite.

    Returns:
    - None

    Raises:
    - TypeError: if HUGO_CONTENT_DIR is not set.
    """
    urls = extract_urls2_from_markdown(markdown_file)
    load_dotenv() # Load environment variables from .env file

    # Unset or empty site URLs are skipped: str.startswith(None) raises TypeError
    # and an empty prefix would match every link.
    site_prefixes = [
        prefix
        for prefix in (os.getenv("WEBSITE"), os.getenv("WEBSITE_INSECURE"))
        if prefix
    ]
    hugo_content_dir = os.path.normpath(os.getenv("HUGO_CONTENT_DIR"))

    # Directory where the markdown_file lives, relative to the content root
    markdown_dir_name = os.path.relpath(os.path.dirname(markdown_file), hugo_content_dir)

    # Work out every (old link, new link) pair before touching the file
    replacements = []
    for link in urls:
        site_prefix = next((prefix for prefix in site_prefixes if link.startswith(prefix)), None)
        if site_prefix is not None:
            # NOTE(review): a link equal to the bare site URL becomes an empty
            # target here, as it always did - confirm whether "/" is wanted.
            new_link = link[len(site_prefix):]
        elif link.startswith(".."):
            # link = "../images/myImage.png", link[3:] = "images/myImage.png"
            new_link = '/' + os.path.normpath(os.path.join(markdown_dir_name, link[3:]))
        else:
            continue
        replacements.append((link, new_link))

    if not replacements:
        return

    with open(markdown_file, 'r') as file:
        original_content = file.read()

    # Apply the replacements in link order, each on the result of the previous one
    modified_content = original_content
    for link, new_link in replacements:
        modified_content = modified_content.replace(link, new_link)

    # Do not rewrite the file when nothing changed
    if modified_content != original_content:
        with open(markdown_file, 'w') as file:
            file.write(modified_content)
Bitcoin donation

JustToThePoint Copyright © 2011 - 2025 Anawim. ALL RIGHTS RESERVED. Bilingual e-books, articles, and videos to help your child and your entire family succeed, develop a healthy lifestyle, and have a lot of fun. Social Issues, Join us.

This website uses cookies to improve your navigation experience.
By continuing, you are consenting to our use of cookies, in accordance with our Cookies Policy and Website Terms and Conditions of use.