There are two ways to do great mathematics. The first is to be smarter than everybody else. The second way is to be stupider than everybody else — but persistent. — Raoul Bott.
I love authoring in Markdown as there is almost no friction - just write. This is the first of many articles where we will check all markdown files for broken or oversized images.
I don’t want to reinvent the wheel, so I use markdown-link-check, but it is far from perfect. I want to monitor that there are no broken links, that the pictures display correctly, that their sizes are appropriate, and that their front matters have all the most important fields, so I have created a script that recursively reads all my Markdown files and checks them.
user@pc:~$ sudo npm install -g markdown-link-check # To install the command line tool (markdown-link-check) globally.
user@pc:~$ sudo npm install -g npm@10.6.0 # Update the tool
user@pc:~$ cd /home/username/directoryMarkdown/content
user@pc:~$ find . -name \*.md -print0 | xargs -0 -n1 markdown-link-check # Check links from a local markdown folder (recursive)
"""
File: checkmarkdown.py
Author: Máximo Núñez Alarcón
Description: This code monitors your Hugo's markdown files.
Usage: Run the script (python checkmarkdown.py) to monitor all your markdown pages of your Hugo.
def check_markdown(markdown_file): Check the file with Markdown format.
delete_sync_conflict_files(directory="/home/yourUserName/yourWebsite/content/"): delete files with '*sync-conflict*' in their names within the specified directory.
"""
from util import check_all_files, printUrl, append_to_file, cleanLog
import pathlib
import os, sys
from PIL import Image
from compressImage import compress_image
import requests
from urllib.parse import urlparse
from util import check_all_files
from checkMarkdownUtil import markdown_to_localwebhtml
from checkMarkdownImage import extract_images_from_markdown, check_images_markdown
from checkMarkdownUrl import extract_urls_from_markdown, check_urls_accessible
from checkMarkdownFormat import check_frontmatter, convert_relative_links_to_absolute, check_format_markdown
mywebsite = "/home/yourUserName/yourWebsite/content/"
def check_markdown(markdown_file):
    """
    Check a Markdown file: normalize its links, validate its frontmatter,
    and verify that its images and URLs are present and reachable.

    Args:
    - markdown_file (str): Path to the Markdown file to check.
    """
    append_to_file("----------------------------------------------")
    append_to_file(f"Processing... {markdown_file}")
    # Check and modify the formatting of a Markdown file, specifically handling images and links badly formatted.
    # It is commented because it is very intensive.
    # You may want to uncomment it: check_format_markdown(markdown_file)
    convert_relative_links_to_absolute(markdown_file)
    # Is the frontmatter OK? Narrowed from a bare `except:` so that
    # SystemExit/KeyboardInterrupt are no longer silently swallowed.
    try:
        check_frontmatter(markdown_file)
    except Exception:
        append_to_file(f"Exception processing Markdown format: {markdown_file}")
        print(f"Exception processing Markdown format: {markdown_file}")
    # 1. Check images: extract them, log them, then verify each one.
    images = extract_images_from_markdown(markdown_file)
    append_to_file(images)
    check_images_markdown(markdown_file, images)
    # 2. Check URLs: extract them, log them, then verify accessibility.
    urls = extract_urls_from_markdown(markdown_file)
    append_to_file(urls)  # Log purposes (debugging).
    # The returned accessibility dict was never used, so the call is kept
    # purely for its checking/logging side effects.
    check_urls_accessible(urls, markdown_file)
    # NOTE(review): this is logged even when problems were found above; it
    # marks the end of processing rather than a clean bill of health.
    append_to_file(f"{markdown_file} is OK.")
    append_to_file("----------------------------------------------")
def delete_sync_conflict_files(directory="/home/yourUserName/yourWebsite/content/"):
    """
    Delete every file whose name contains 'sync-conflict' anywhere under the
    given directory (Syncthing leaves such copies behind after sync conflicts).

    Args:
    - directory (str): Root of the tree to search for conflict files.
    """
    # Walk the whole tree; only file names (not directory names) are inspected.
    for current_dir, _subdirs, filenames in os.walk(directory):
        conflicts = (name for name in filenames if "sync-conflict" in name)
        for name in conflicts:
            full_path = os.path.join(current_dir, name)
            try:
                os.remove(full_path)
                print(f"Deleted file: {full_path}")
            except Exception as e:
                # Report but keep going: one undeletable file should not stop the sweep.
                print(f"Error deleting file {full_path}: {e}")
if __name__ == "__main__":
    # Entry point: reset the terminal, start a fresh log, and purge Syncthing
    # conflict copies before any checking is done.
    # NOTE(review): nothing here invokes check_markdown/check_all_files —
    # presumably the full driver lives elsewhere or was trimmed; confirm.
    os.system("clear")
    cleanLog() # Clean the log, so we start with a clean log file.
    delete_sync_conflict_files() # Delete files with '*sync-conflict*' in their names within the specified directory.
"""
File: checkMarkdownImage.py
Author: Máximo Núñez Alarcón
Description: This code monitors your Hugo's markdown files' images.
def check_image_size(markdown_file, image_path, max_size_kb = 2024): Given a markdown_file and an image's path...
checks the image's size, and determines if it's OK based on a maximum size limit. If its size exceeds the maximum limit, it compresses it.
def check_image_ok(markdown_file, image_path): check if the image can be opened without errors and also checks its size (check_image_size).
def check_images_markdown(markdown_file, images): check if "images" belonging to the Markdown file are present and accessible.
def extract_images_from_markdown_article(markdown_file): extracts the paths of images from a Markdown file.
"""
import re, os, sys
import pathlib
from checkMarkdownUtil import value_field_from_markdown, filter_code_shortcodes_blocks, is_markdown_article, markdown_to_localwebhtml
from util import append_to_file, check_manually
from PIL import Image
from compressImage import compress_image
def check_image_size(markdown_file, image_path, max_size_kb = 2024):
    '''Check whether the image at image_path is within the size budget.

    If the file exceeds max_size_kb it is compressed in place (compress_image)
    and its path is recorded in assets/imagestoobig.txt.

    Args:
    - markdown_file: The markdown file where the image is hosted (used for logging).
    - image_path: The path of the image.
    - max_size_kb: The maximum size limit allowed, in kilobytes.

    Returns:
    - bool: True if the image's size does not exceed max_size_kb (and the file
      could be read), False otherwise.'''
    try:
        # Size on disk, converted from bytes to kilobytes.
        image_size_kb = os.path.getsize(image_path) / 1024
        if image_size_kb > max_size_kb:
            append_to_file(f"{image_path} with {image_size_kb:.2f} KB in {markdown_file} exceeds the {max_size_kb} limit.")
            print(f"{image_path} with {image_size_kb:.2f} KB in {markdown_file} exceeds the {max_size_kb} limit.")
            # Shrink the offending image in place.
            compress_image(image_path)
            # Keep a permanent record of originals that were over the limit.
            with open("assets/imagestoobig.txt", 'a') as file:
                file.write(str(image_path) + "\n")
            print(f"{image_path} with {image_size_kb:.2f} KB in {markdown_file} exceeds the {max_size_kb} limit II.")
            return False
        else:
            return True
    except Exception as e:
        # Log the problem (missing file, permission error, ...) and fail the check.
        # Note: the original also had an unreachable trailing `return True`
        # after this handler; it has been removed.
        append_to_file(f"Error opening image {image_path} in {markdown_file}. Error: {e}")
        print(f"Error opening image {image_path} in {markdown_file}. Error: {e}")
        return False
def check_image_ok(markdown_file, image_path):
    """
    Check that the image at the given path can be opened without errors and
    that its size is within limits (via check_image_size).

    Args:
    - markdown_file: The Markdown file referencing the image (used for logging only).
    - image_path (str | pathlib.Path): The path to the image file.

    Returns:
    - bool: True if the image opens cleanly and passes the size check, False otherwise.
    """
    try:
        # Open and immediately close the image just to validate it; a context
        # manager guarantees the file handle is released either way.
        with Image.open(image_path):
            pass
        return check_image_size(markdown_file, image_path)
    except Exception as e:
        # Bug fix: the original called image_path.absolute() here, which raised
        # AttributeError whenever image_path was a plain string (as happens for
        # the fallback paths built in check_images_markdown), masking the real
        # error. os.path.abspath works for both str and Path.
        print(f"Error in {markdown_file}, opening image '{os.path.abspath(image_path)}': {e}")
        return False
def check_images_markdown(markdown_file, images):
    """
    Check if "images" belonging to the Markdown file are present and accessible.

    Each image reference is first resolved relative to the Markdown file's
    directory, then against the site's content/ and static/ trees; if none
    of those exist, the failure is logged and queued for manual review.

    Parameters:
    markdown_file (str): The path to the Markdown file.
    images (list): List of image URLs.

    Returns:
    None
    """
    # e.g. markdown_file = /home/nmaximo7/justtothepoint/content/code/hugo8.md
    markdown_dir = pathlib.Path(markdown_file).parent
    for img in images:
        # First candidate: resolve relative to the Markdown file's folder.
        # (Most img values are absolute site paths such as "/code/images/alexa.png",
        # in which case the join simply yields img itself.)
        resolved = pathlib.Path(markdown_dir / img).resolve()
        if not os.path.exists(resolved):
            # Fall back to the Hugo content/ tree, then the static/ tree
            # (static files are served from the site root by default).
            for site_root in ("/home/nmaximo7/justtothepoint/content",
                              "/home/nmaximo7/justtothepoint/static"):
                resolved = site_root + os.path.abspath(img)
                if os.path.exists(resolved):
                    break
        if os.path.exists(resolved):
            check_image_ok(markdown_file, resolved)
        else:
            append_to_file("Error: Image file not found!")
            append_to_file(f"{markdown_file} : {resolved}")
            print(f"Image Not found in {markdown_file}: {resolved}")
            check_manually(resolved, markdown_file, markdown_to_localwebhtml(markdown_file))
def extract_images_from_markdown_article(markdown_file):
    """
    Extract the image paths declared in an article-style Markdown file.

    Article front matter lists images on lines of the form "image: <path>",
    which is exactly what this function collects.

    Parameters:
    markdown_file (str): Path to the Markdown file.

    Returns:
    list: List of image paths (whitespace-stripped), in file order.
    """
    # Lines starting with "image:" followed by the image path.
    image_pattern = re.compile(r'^\s*image:\s*(.*)$', re.MULTILINE)
    with open(markdown_file, 'r') as file:
        content = file.read()
    # Strip surrounding whitespace from every captured path.
    return [match.strip() for match in image_pattern.findall(content)]
def extract_images_from_markdown(markdown_file):
    '''Extract all images from a markdown_file.

    Collects both the front-matter images of article-style files (via
    extract_images_from_markdown_article) and inline Markdown image
    syntax ![alt](path), ignoring code blocks and shortcodes.

    Args:
    - markdown_file (str): The markdown file from which to retrieve its images.

    Returns:
    - list: A list of all its images.'''
    if is_markdown_article(markdown_file):
        # Article files declare images in their front matter, e.g.
        #   articles:
        #     - title: Apocalipsis. 6=0
        #       link: /library/Apocalipsis1/
        #       image: /en/library/images/Apocalipsis1.png
        images = extract_images_from_markdown_article(markdown_file)
    else:
        images = []
    # Regular expression pattern to match image markdown syntax: ![alt](url)
    img_pattern = r"!\[.*\]\((.*?)\)"
    try:
        with open(markdown_file, 'r') as f:
            unfilteredContent = f.read()
        # Ignore images that sit inside code blocks or shortcodes.
        content = filter_code_shortcodes_blocks(unfilteredContent)
        images.extend(re.findall(img_pattern, content))
    except Exception:
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.
        print(f"Exception trying to open {markdown_file}")
    return images
if __name__ == "__main__":
    # Ad-hoc smoke test: extract the images of one known article.
    # NOTE(review): hard-coded absolute path — only runs on the author's machine.
    extract_images_from_markdown("/home/nmaximo7/justtothepoint/content/en/library/iwantmore/index.md")
'''
File: checkMarkdownUrl.py
Author: Máximo Núñez Alarcón
Description: Python functions to handle URLs in Markdown files:
* replace_url(content): Replace URLs starting with 'https://justtothepoint.com/' or 'http://justtothepoint.com/' with 'http://localhost:1313/'.
* extract_urls_from_markdown_article(markdown_file): Extracts the paths of urls from a Markdown file that is an article.
* extract_urls_from_markdown(markdown_file): Extract all urls from a markdown_file.
* scrape_website(url, api_key, markdown_file, search_locally = True): scrape a website (by default, locally) using the ScrapingAnt API, free for personal use.
* check_absolute_url(url, markdown_file): check if an absolute URL is accessible (calling scrape_website).
If the URL is not accessible, remove the link from the markdown and let the user check manually what the problem is.
* check_urls_accessible(urls, markdown_file): check if the URLs in the list are accessible. The URLs may not be absolute urls.
It basically constructs absolute urls and call check_absolute_url to check if the urls are accessible.
'''
from checkMarkdownUtil import value_field_from_markdown, filter_code_shortcodes_blocks, is_markdown_article, get_markdown_real_content, markdown_to_localwebhtml, remove_link_from_markdown
import re
import os
import requests
from urllib.parse import urlparse, urljoin
from util import check_manually, append_to_file
import time
def replace_url(content):
    """
    Rewrite production URLs to the local dev server: every occurrence of
    'http://justtothepoint.com/' or 'https://justtothepoint.com/' becomes
    'http://localhost:1313/'.

    Parameters:
    content (str): The content containing URLs.

    Returns:
    str: The content with URLs replaced.
    """
    # A single pattern covers both schemes ('https?' = http or https).
    pattern = re.compile(r'https?://justtothepoint\.com/')
    return pattern.sub('http://localhost:1313/', content)
def extract_urls_from_markdown_article(markdown_file):
    """
    Extract the URLs declared in an article-style Markdown file.

    Article front matter lists links on lines of the form
    "link: ...", "quizlink: ..." and "infolink: ...", e.g.:

        books2:
          - title: Una mañana en el campanario
            link: /library/Los5MagosL1C1Es/index.html
            quizlink: https://justtothepoint.com/quizzes/quizLos5Aprend1b1Es.html
            infolink: https://es.wikipedia.org/wiki/Nostradamus

    Parameters:
    markdown_file (str): Path to the Markdown file.

    Returns:
    list: All 'link:' values first, then 'quizlink:', then 'infolink:'.
    """
    with open(markdown_file, 'r') as file:
        content = file.read()
    # One pass per field kind; results are concatenated in a fixed order.
    # ('link' cannot accidentally match 'quizlink'/'infolink' lines because the
    # pattern anchors the field name right after leading whitespace.)
    urls = []
    for field in ('link', 'quizlink', 'infolink'):
        pattern = re.compile(rf'^\s*{field}:\s*(.*)$', re.MULTILINE)
        urls.extend(pattern.findall(content))
    return urls
def extract_images_with_markdown(markdown_file):
    """
    Extract every full Markdown image/link element (e.g. "![alt](url)" or
    "[text](url)") from the given Markdown file.

    Parameters:
    - markdown_file (str): Path to the Markdown file containing images or links.

    Returns:
    - list: A list containing every image/link Markdown element found in the file.
    """
    # Bug fix: the log line previously named the wrong function
    # ("extract_images_from_markdown") and used a pointless f-string.
    append_to_file("extract_images_with_markdown")
    # Read the content of the Markdown file.
    with open(markdown_file, 'r') as file:
        markdown_text = file.read()
    # Matches both links and images: the leading "!" is optional (zero or more).
    image_pattern = r"!*\[.*?\]\(.*?\)"
    image_matches = re.findall(image_pattern, markdown_text)
    append_to_file(image_matches)  # Log purposes (debugging).
    return image_matches
def extract_urls_from_markdown(markdown_file):
    '''Extract all non-image urls from a markdown_file.

    Args:
    - markdown_file (str): The markdown file from which to retrieve its urls.

    Returns:
    - list: A list of all its urls (image targets are excluded).'''
    # Article files additionally declare links in their front matter.
    urls = extract_urls_from_markdown_article(markdown_file) if is_markdown_article(markdown_file) else []
    # ![alt](url) — image syntax; [text](url) — plain link syntax.
    image_pattern = r"\!\[.*\]\((.*?)\)"
    url_pattern = r"\[.*\]\((.*?)\)"
    # Content with code blocks and shortcodes already filtered out.
    content = get_markdown_real_content(markdown_file)
    image_matches = re.findall(image_pattern, content)
    # Keep only the link targets that are not also image targets.
    urls.extend(url for url in re.findall(url_pattern, content) if url not in image_matches)
    return urls
def extract_urls2_from_markdown(markdown_file):
    '''Extract all urls (including image targets) from a markdown_file.

    Args:
    - markdown_file (str): The markdown file from which to retrieve its urls.

    Returns:
    - list: A list of all its urls.'''
    if is_markdown_article(markdown_file):
        # Article front matter declares its own links.
        extracted = extract_urls_from_markdown_article(markdown_file)
    else:
        extracted = []
    # [link text](link_url) — the capture group grabs the target URL.
    link_regex = r"\[.*\]\((.*?)\)"
    # Content with code blocks and shortcodes filtered out.
    real_content = get_markdown_real_content(markdown_file)
    extracted.extend(re.findall(link_regex, real_content))
    return extracted
def scrape_website(url, api_key, markdown_file, search_locally = True):
    """
    Check whether a URL is reachable: first with a direct HEAD request, then,
    on a non-200 answer, through the ScrapingAnt API (free for personal use).

    Proxies are an anti-block measure for web scraping: the request goes
    through a go-between server, which lets you remain anonymous.
    Credits: scrapingant.com, Python Requests Proxy Ultimate Guide
    https://scrapingant.com/blog/python-requests-proxy
    ZenRows.com, How to Use a Proxy with Python Requests in 2024.

    Parameters:
    - url (str): The URL to check.
    - api_key (str): Your ScrapingAnt API key.
    - markdown_file (str): The Markdown file referencing the URL (log messages only).
    - search_locally (bool): If True, production URLs are rewritten to the
      local Hugo server (replace_url) before checking.

    Returns:
    - int: The HTTP status code of the last attempt (NOT the page content,
      despite what the original docstring claimed), or 505 as a sentinel when
      a requests exception was raised.
    """
    if search_locally:
        # Point the request at the local Hugo server instead of production.
        url = replace_url(url)
    # ZenRows.com, Bypass Error 403 Forbidden in Web Scraping
    # Set a fake user agent, i.e., a string sent by web clients with every request to identify themselves to the web server.
    # We change our headers to look like a regular browser.
    headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36', # This is a Chrome User Agent
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'en-GB,en;q=0.9,es-ES;q=0.8,es;q=0.7,th-TH;q=0.6,th;q=0.5,en-US;q=0.4,en-NU;q=0.3' # https://myhttpheader.com
    }
    # Minimal headers for the ScrapingAnt API call itself.
    headers2 = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
    }
    api_url = "https://api.scrapingant.com/v2/general"
    # The target URL and the API key travel as query parameters.
    params = {
        'url': url,
        'x-api-key': api_key
    }
    try:
        # First, we try a normal request (HEAD keeps the check lightweight).
        response = requests.head(url, headers=headers, allow_redirects=True, timeout=5)
        if (response.status_code!=200):
            time.sleep(3)
            # 405, Method Not Allowed error typically occurs when the server receives a request method that it does not support for the target URL.
            print(f"{url} in {markdown_file} has an error in our first attempt!")
            # Next, we scrape it using the ScrapingAnt API.
            # NOTE(review): this fallback runs even when api_key is None — confirm intended.
            response = requests.get(api_url, params=params, headers=headers2, allow_redirects=True, timeout=5)
        return response.status_code
    except requests.exceptions.HTTPError as errh:
        print(f"{url} in {markdown_file} has an exception. Http Error: {errh}")
        print ("Http Error:", errh)
    except requests.exceptions.ConnectionError as errc:
        print(f"{url} in {markdown_file} has an exception. Error Connecting: {errc}")
        print ("Error Connecting:", errc)
    except requests.exceptions.Timeout as errt:
        print(f"{url} in {markdown_file} has an exception. Timeout Error: {errt}")
        print ("Timeout Error:", errt)
    except requests.exceptions.RequestException as err:
        print(f"{url} in {markdown_file} has an exception. Something else {err}")
    # Sentinel for "request failed with an exception" (reached only when one
    # of the handlers above ran, since the try block returns on success).
    return 505
def check_absolute_url(url, markdown_file):
    """
    Check if an absolute URL is accessible (calling scrape_website).
    If the URL is not accessible, remove the link from the markdown and let the
    user check manually what the problem is.

    Parameters:
    url (str): The absolute URL to check.
    markdown_file (str): The path to the markdown file where the link is.

    Returns:
    int | None: 200 when the URL is accessible; otherwise None (the original
    docstring claimed a tuple was returned, but nothing is returned on the
    failure path).
    """
    # NOTE(review): the env var is named SCRAPINGBEE although the scraper used
    # is ScrapingAnt — confirm which service the key actually belongs to.
    api_key = os.getenv("SCRAPINGBEE")
    response = scrape_website(url, api_key, markdown_file)
    if (response==200):
        return 200
    else:
        print(f"check_absolute_url, {url}, {response}")
        # Strip the broken link from the source file and queue it for manual review.
        remove_link_from_markdown(markdown_file, url)
        check_manually(url, markdown_file, markdown_to_localwebhtml(markdown_file))
def check_urls_accessible(urls, markdown_file):
    """
    Check if the URLs in the list are accessible. The URLs may not be absolute:
    relative URLs are resolved against the local Hugo server
    (http://localhost:1313/) before being handed to check_absolute_url.
    It is called by check_markdown(markdown_file) in checkmarkdown.py.

    Parameters:
    urls (list): A list of URLs to check.
    markdown_file (str): The path to the markdown file where the urls are.

    Returns:
    dict: Maps each URL (with a trailing slash appended where applicable)
    to the result of check_absolute_url for it.
    """
    accessibility = {}
    for url in urls:
        if url.startswith(('http://', 'https://')):
            # Already absolute: use as-is.
            target = url
        else:
            # Relative: ensure a trailing slash (unless it points at an
            # .html page), then join with the local server root.
            if not (url.endswith("/") or url.endswith("html")):
                url += '/'
            target = urljoin("http://localhost:1313/", url)
        accessibility[url] = check_absolute_url(target, markdown_file)
    return accessibility
'''
File: checkMarkdownUtil.py
Author: Máximo Núñez Alarcón
Description: Useful Python functions to handle Markdown files:
* value_field_from_markdown(markdown_file, field): Get whether the "field" is in the frontmatter...
... and its value of the Markdown file passed as a parameter.
* filter_code_shortcodes_blocks(content): filter out code blocks and shortcodes from Markdown content.
* is_markdown_article(markdown_file): check if the Markdown file given as a parameter represents an article.
An article is a Markdown file that has in its front matter a field type articles, books2 or cards.
* get_markdown_real_content(markdown_file): get the actual content of the Markdown file after filtering out pieces of code and its shortcodes.
* remove_link_from_markdown(markdown_file, url): remove the link "url" from "markdown_file".
* markdown_to_localwebhtml(markdown_file, path_markdown_content = "/home/nmaximo7/justtothepoint/content/", localserver = "http://localhost:1313/"):
Convert the Markdown file path to its corresponding local web URL.
'''
import re
from dotenv import load_dotenv
from util import append_to_file
def value_field_from_markdown(markdown_file, field):
    """
    Get whether the "field" is in the frontmatter and its value of the Markdown
    file passed as a parameter.

    Parameters:
    markdown_file (str): Path to the Markdown file.
    field (str): Name of the field to be retrieved (treated as a literal name).

    Returns:
    boolean: True if the field is found in the frontmatter, False otherwise.
    str: Value of the field in the front matter, or None if not found.
    """
    # re.escape guards against field names containing regex metacharacters —
    # the original interpolated `field` into the pattern unescaped.
    field_pattern = re.compile(rf'^{re.escape(field)}: (.*)', re.MULTILINE)
    # Read the contents of the Markdown file.
    with open(markdown_file, 'r') as file:
        content = file.read()
    # Search for the first "field: value" line.
    match = field_pattern.search(content)
    if match:
        # Field found: return its captured value (the rest of the line).
        return True, match.group(1)
    # The field has not been found.
    return False, None
def is_markdown_article(markdown_file):
    """
    Check if the Markdown file given as a parameter represents an article.
    An article has a front matter containing the basic metadata fields plus a
    list field (articles, books, books2 or cards) whose entries each carry a
    title, a link and an image.

    Parameters:
    markdown_file (str): Path to the Markdown file.

    Returns:
    bool: True if the Markdown file represents an article, False otherwise.
    """
    with open(markdown_file, 'r') as file:
        content = file.read()
    # Front matter must open the file with a '---' delimiter.
    if not content.startswith('---'):
        return False
    # Everything between the first pair of '---' delimiters.
    front_matter = content.split('---')[1].strip()
    # All the basic metadata fields must be present.
    if any(f"{field}:" not in front_matter
           for field in ("title", "date", "author", "description", "featured_image", "language")):
        return False
    # Locate the first list field that marks an article.
    for marker in ("articles:", "books:", "books2:", "cards:"):
        list_index = front_matter.find(marker)
        if list_index != -1:
            break
    else:
        return False
    # Each entry of the list must declare a title, a link and an image.
    list_content = front_matter[list_index:]
    return ("- title:" in list_content
            and "link:" in list_content
            and "image:" in list_content)
def get_markdown_real_content(markdown_file):
    """
    Get the actual content of the Markdown file after filtering out pieces of
    code and its shortcodes.

    Parameters:
    markdown_file (str): Path to the Markdown file.

    Returns:
    str: The filtered Markdown content.
    """
    with open(markdown_file, 'r') as handle:
        raw_text = handle.read()
    # Strip code blocks and Hugo shortcodes before any link/image analysis.
    return filter_code_shortcodes_blocks(raw_text)
def remove_link_from_markdown(markdown_file, url):
    """
    Remove the broken link "url" — Markdown syntax [name](url) — from
    "markdown_file", keeping only the link text.

    Parameters:
    markdown_file (str): Path to the Markdown file.
    url (str): URL for which to remove the link syntax.

    Returns:
    None
    """
    # Escape special characters in the URL for use in regex.
    escaped_url = re.escape(url)
    # [name](url): group 1 captures the link text so it can be kept.
    link_pattern = rf'\[([^\]]+)\]\({escaped_url}\)'
    modified_lines = []
    # Process the Markdown file line by line.
    with open(markdown_file, 'r') as file:
        for line in file:
            if re.search(link_pattern, line):
                # Replace the whole link element with just its text: r'\1' is
                # the content of the first capturing group, i.e. the [name].
                modified_line = re.sub(link_pattern, r'\1', line)
                # Typo fixes: the original printed "form" and "replace".
                print("Remove link from markdown")
                print(f"Line being replaced: {line}")
                print(f"New line: {modified_line}")
                modified_lines.append(modified_line)
            else:
                # No match: keep the original line.
                modified_lines.append(line)
    # Write the modified lines back to the Markdown file.
    with open(markdown_file, 'w') as file:
        file.writelines(modified_lines)
def markdown_to_localwebhtml(markdown_file, path_markdown_content = "/home/nmaximo7/justtothepoint/content/", localserver = "http://localhost:1313/"):
    """
    Convert the Markdown file path to its corresponding local web URL.

    Parameters:
    markdown_file (str): The path to the Markdown file.
    path_markdown_content (str): The base directory where the Markdown content lives.
    localserver (str): The local server URL (hugo server defaults to
        http://localhost:1313/).

    Returns:
    str: The (lower-cased) local web URL corresponding to the Markdown file.
    """
    # Swap the content root for the local server prefix, e.g.
    # /home/nmaximo7/justtothepoint/content/maths/stoketheorem.md
    #   -> http://localhost:1313/maths/stoketheorem.md
    url = markdown_file.replace(path_markdown_content, localserver)
    # Drop the ".md" extension (last 3 characters).
    url = url[:-3]
    # An "index" page is served at its directory URL.
    if url.endswith("index"):
        url = url[:-5]
    return url.lower()
def cleanup_image_links(markdown_text):
    """
    Clean up the formatting of a Markdown image or link, adding or removing
    the exclamation tag when necessary.

    The text is first normalized to a plain link form; if the target ends
    with a known image extension, exactly one leading "!" is restored so
    the element renders as an image.

    Parameters:
    - markdown_text (str): The Markdown text containing images or links.

    Returns:
    - str: The Markdown text with its image/link formatting normalized.
    """
    append_to_file(f"cleanup_image_links: {markdown_text}")
    # Collapse "![alt](url)" / "!![alt](url)" / "[alt](url)" into "[alt](url)",
    # stripping any leading exclamation tags in the process.
    normalized = re.sub(r"!*\[([^\]]*)\]\((.*?)\)", r"[\1](\2)", markdown_text)
    # Targets ending with one of these extensions must be rendered as images.
    if normalized.endswith(('.png)', '.jpg)', '.jpeg)', '.gif)')):
        append_to_file("Image cleanup")
        if normalized.startswith("!"):
            # Guarantee exactly one exclamation tag.
            result = "!" + normalized.strip("!")
        else:
            # Restore the exclamation tag removed by the substitution above.
            result = "!" + normalized
        append_to_file(result)
        return result
    return normalized
"""
File: checkMarkdownFormat.py
Author: Máximo Núñez Alarcón
Description: This module checks the formatting of your Hugo markdown files.
"""
from checkMarkdownUtil import filter_code_shortcodes_blocks, value_field_from_markdown, markdown_to_localwebhtml, cleanup_image_links
from naturalProcessing import extract_keywords
from util import check_manually, append_to_file
import re, os, yaml
from dotenv import load_dotenv
from checkMarkdownUrl import extract_urls_from_markdown, extract_urls2_from_markdown, extract_images_with_markdown
def check_frontmatter(markdown_file):
    """
    Validate the frontmatter of a Hugo Markdown file.

    Checks that the file opens with a "---" delimiter, that a closing "---"
    delimiter exists, that the frontmatter has at least two lines, and that
    every required field appears. Each problem is printed and queued for
    manual review via check_manually(); a missing "keywords" field is
    additionally auto-generated and inserted.

    Parameters:
    - markdown_file (str): Path to the Markdown file to validate.

    Returns:
    - bool: True if the frontmatter is well-formed, False otherwise.
    """
    with open(markdown_file, 'r') as file:
        lines = file.readlines()
    # Guard against empty files: lines[0] below would raise IndexError.
    if not lines:
        print("Error: Frontmatter delimiter '---' is missing as the first line.")
        check_manually("Frontmatter delimiter '---' is missing as the first line.", markdown_file, markdown_to_localwebhtml(markdown_file))
        return False
    # Check if the first line is "---"
    if lines[0].strip() != '---':
        print("Error: Frontmatter delimiter '---' is missing as the first line.")
        check_manually("Frontmatter delimiter '---' is missing as the first line.", markdown_file, markdown_to_localwebhtml(markdown_file))
        return False
    # Find the second "---" delimiter
    end_frontmatter_index = None
    for i, line in enumerate(lines[1:], start=1):
        if line.strip() == '---':
            end_frontmatter_index = i
            break
    if end_frontmatter_index is None:
        check_manually("Closing delimiter '---' is missing", markdown_file, markdown_to_localwebhtml(markdown_file))
        print("Error: Closing '---' delimiter is missing.")
        return False
    # Section index pages (_index.md) are allowed a minimal frontmatter.
    if end_frontmatter_index <= 2 and not markdown_file.endswith("_index.md"):
        print("Error: Frontmatter should have at least two lines.")
        check_manually("Frontmatter should have at least two lines.", markdown_file, markdown_to_localwebhtml(markdown_file))
        return False
    # Parse the frontmatter content
    frontmatter_content = ''.join(lines[1:end_frontmatter_index])
    frontmatter_fields = ['title', 'date', 'featured_image', 'description', 'categories', 'keywords']
    # NOTE(review): this is a plain substring test, not a YAML parse, so an
    # unrelated field containing the word (e.g. "subtitle" for "title") would
    # satisfy the check — confirm this is acceptable.
    for field in frontmatter_fields:
        if field not in frontmatter_content and not markdown_file.endswith("_index.md"):
            error = f"Error: '{field}' is missing"
            if field == "keywords":
                # Auto-generate keywords from the body and add them to the file
                # instead of merely reporting them as missing.
                keywords = set_keywords(markdown_file)
                add_keywords(markdown_file, keywords)
                error = ','.join(keywords)
            check_manually(error, markdown_file, markdown_to_localwebhtml(markdown_file))
            print(f"Error: '{field}' is missing in the frontmatter in {markdown_file}.")
            return False
    return True
def set_keywords(markdown_file):
    """
    Derive a keyword list for a Markdown file from its body text.

    Reads the file, removes code blocks/shortcodes and the frontmatter,
    then runs keyword extraction using the page's declared language.

    Parameters:
    - markdown_file (str): Path to the Markdown file.

    Returns:
    - The keywords extracted from the page's real content.
    """
    with open(markdown_file, 'r') as source:
        raw_text = source.read()
    # Drop code blocks and Hugo shortcodes so they do not pollute the keywords.
    cleaned = filter_code_shortcodes_blocks(raw_text)
    # Strip the leading "---" frontmatter block to keep only the real content.
    body = re.sub(r'^---\n(.*?)\n---\n', '', cleaned, flags=re.DOTALL)
    # The "language" frontmatter field selects the extraction language.
    language = value_field_from_markdown(markdown_file, "language")[1]
    return extract_keywords(body, language)
def check_format_markdown(markdown_file):
    """
    Check and repair badly formatted image links in a Markdown file.

    Extracts every image link from the file, normalizes its formatting via
    cleanup_image_links, and rewrites the file in place with the cleaned
    links.

    Parameters:
    - markdown_file (str): The path to the Markdown file to be checked and modified.

    Returns:
    - None
    """
    append_to_file("check_format_markdown")
    image_links = extract_images_with_markdown(markdown_file)
    # Load the whole file into memory.
    with open(markdown_file, 'r') as handle:
        content = handle.read()
    # Swap each original link for its normalized form.
    for original in image_links:
        content = content.replace(original, cleanup_image_links(original))
    # Persist the cleaned content back to the same file.
    with open(markdown_file, 'w') as handle:
        handle.write(content)
def convert_relative_links_to_absolute(markdown_file):
    """
    Convert relative links in a Markdown file to root-relative links, in place.

    Links starting with the site's base URL (secure or insecure, taken from
    the WEBSITE / WEBSITE_INSECURE environment variables) are reduced to
    their path component; links starting with ".." are resolved against the
    directory (relative to HUGO_CONTENT_DIR) that contains the Markdown
    file. All other links are left untouched.

    Parameters:
    - markdown_file (str): Path to the Markdown file to rewrite.

    Returns:
    - None
    """
    urls = extract_urls2_from_markdown(markdown_file)
    load_dotenv()  # Load environment variables from .env file
    base_url = os.getenv("WEBSITE")
    base_url_insecure = os.getenv("WEBSITE_INSECURE")
    hugo_content_dir = os.path.normpath(os.getenv("HUGO_CONTENT_DIR"))
    markdown_dir = os.path.dirname(markdown_file)
    # markdown_dir_name is the directory where the markdown_file lives,
    # relative to the Hugo content root (e.g. "maths" for content/maths/x.md).
    markdown_dir_name = os.path.relpath(markdown_dir, hugo_content_dir)
    # Bug fix: read the file ONCE and apply every replacement in memory
    # instead of re-reading and rewriting the file for every single link.
    with open(markdown_file, 'r') as file:
        file_content = file.read()
    modified_content = file_content
    # Iterate over each link and convert relative links to absolute links
    for link in urls:  # e.g., link = "https://justtothepoint.com/selfhelp/autoayuda/", base_url = "https://justtothepoint.com"
        # The "base_url and" / "base_url_insecure and" guards prevent a
        # TypeError when the corresponding environment variable is unset.
        if base_url and link.startswith(base_url):
            new_link = link[len(base_url):]  # new_link = /selfhelp/autoayuda/
        elif base_url_insecure and link.startswith(base_url_insecure):
            new_link = link[len(base_url_insecure):]
        elif link.startswith(".."):
            # Remove ".." and prepend with the directory where the Markdown file lives
            # link = "../images/myImage.png", link[3:] = images/myImage.png
            new_link = '/' + os.path.normpath(os.path.join(markdown_dir_name, link[3:]))
        else:
            # Already root-relative or external: leave it alone.
            continue
        modified_content = modified_content.replace(link, new_link)
    # Write the modified content back to the file only if anything changed.
    if modified_content != file_content:
        with open(markdown_file, 'w') as file:
            file.write(modified_content)