Files
2026-05-21 13:37:53 +08:00

162 lines
5.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Convert *.md files to *.qmd files and pre-process them
# Randomize footnote identifiers in multiple Quarto files to avoid conflicts
# Convert reference-style links to inline links
# Remove line breaks inside corner-bracket quotation marks (「…」)
# Reformat display math equations in Ulysses
# Copyright: © 2024–Present Tom Ben
# License: MIT License
import re
import glob
import os
import random
import string
def get_md_files():
    """Return the numbered Markdown sources under ``contents/``.

    Only files whose name starts with a digit and ends in ``.md`` are
    matched. Paths are relative to the current working directory.

    Returns:
        A new list of matching paths in sorted order, so that repeated runs
        process the files in the same sequence regardless of the order in
        which the file system lists the directory.
    """
    # glob.glob already returns a fresh list; sorted() only fixes the order.
    return sorted(glob.glob("contents/[0-9]*.md"))
def randomize_footnote_identifiers(qmd_content):
    """Replace numeric footnote labels with random 5-character labels.

    Every ``[^N]`` marker whose label consists only of digits (both the
    references and the ``[^N]:`` definitions) is rewritten to use a random
    alphanumeric label. All occurrences of one label map to the same new
    label. Footnotes with non-numeric labels are left untouched.

    Args:
        qmd_content: The document text.

    Returns:
        The text with numeric footnote labels replaced.
    """
    alphabet = string.ascii_letters + string.digits
    numeric_ids = set(re.findall(r'\[\^(\d+)\]', qmd_content))

    # Labels already used anywhere in the document (numeric or not) must not
    # be handed out again, otherwise two distinct footnotes would merge.
    taken = set(re.findall(r'\[\^([^\]\s]+)\]', qmd_content))

    replacements = {}
    # Sorted so that a seeded `random` yields the same mapping on every run.
    for old_id in sorted(numeric_ids):
        new_id = ''.join(random.choices(alphabet, k=5))
        while new_id in taken:
            new_id = ''.join(random.choices(alphabet, k=5))
        taken.add(new_id)
        replacements[old_id] = new_id

    # One pass over the text: a label that has just been written can never be
    # picked up and rewritten again, and the `[^N]:` definitions are covered
    # by the same pattern as the references.
    return re.sub(
        r'\[\^(\d+)\]',
        lambda match: f'[^{replacements[match.group(1)]}]',
        qmd_content,
    )
def convert_reference_to_inline(qmd_content):
    """Turn numbered reference-style links into inline links.

    Definitions of the form ``[N]: target`` are collected and removed from
    the text, and every usage ``[text][N]`` is rewritten to
    ``[text](target)``. If a number is defined more than once, the last
    definition wins. A usage whose number has no definition is left exactly
    as written instead of being turned into a link with an empty target.

    Args:
        qmd_content: The document text.

    Returns:
        The text with definitions removed and resolvable usages inlined.
    """
    # A definition may sit on the very first line, so accept either the start
    # of the text or a preceding newline (the newline is removed together
    # with the definition).
    definition_pattern = re.compile(r'(?:\A|\n)\[(\d+)\]:\s*(.*)')

    # Trailing whitespace after the target would otherwise end up inside the
    # parentheses of the inline link.
    reference_links = {
        key: target.strip()
        for key, target in definition_pattern.findall(qmd_content)
    }

    # Remove the reference link definitions from the text
    qmd_content = definition_pattern.sub('', qmd_content)

    def replace_link(match):
        url = reference_links.get(match.group(2))
        if url is None:
            # Unknown reference: keep the original markup untouched.
            return match.group(0)
        return f'[{match.group(1)}]({url})'

    # Replace reference-style link usages with inline links
    return re.sub(r'\[(.*?)\]\[(\d+)\]', replace_link, qmd_content)
def remove_linebreaks_in_quotes(text):
    """Join lines that were broken inside a 「…」 quotation.

    Each span running from an opening ``「`` to the nearest closing ``」`` has
    its newline characters deleted; everything outside such spans is
    returned unchanged.
    """
    quoted_span = re.compile(r'「[^」]*?」')
    # Splitting on the newline and gluing the pieces back together drops
    # every line break inside the matched quotation.
    return quoted_span.sub(
        lambda hit: ''.join(hit.group(0).split('\n')),
        text,
    )
def reformat_math_equations(content):
    """Rewrite math equations into multi-line ``$$`` blocks.

    Two rewrites are applied in order:

    1. ``$equation$ {#label}`` becomes a ``$$`` block followed by the label.
    2. ``$$equation$$`` that is not immediately followed by ``{#`` becomes a
       ``$$`` block with the equation on its own line.

    In both cases surrounding whitespace is stripped from the equation.

    Args:
        content: The document text.

    Returns:
        The text with the equations reformatted.
    """
    # Reformat display math with labels to block format.
    # The equation body must not contain `$` or a newline: with a plain
    # `.+?` the match could start at an earlier inline equation on the same
    # line (`$a$ and $b$ {#eq}`) or swallow the inner dollars of
    # `$$a$$ {#eq}`, wrapping the wrong text in the block.
    labeled_pattern = r"\$([^$\n]+)\$ *(\{#.+?\})"

    def replace_with_labeled_block(match):
        equation = match.group(1).strip()
        label = match.group(2).strip()
        return f"$$\n{equation}\n$$ {label}"

    content = re.sub(labeled_pattern, replace_with_labeled_block, content)

    # Reformat display math without labels to block format.
    # Matches `$$ ... $$` that is not directly followed by a `{#` label;
    # blocks produced by the step above are matched again but come out
    # unchanged.
    display_pattern = r"(?<!\$)\$\$([^\$]+?)\$\$(?!\{#)"

    def replace_with_display_block(match):
        equation = match.group(1).strip()
        return f"$$\n{equation}\n$$"

    content = re.sub(display_pattern, replace_with_display_block, content)
    return content
def process_file(input_file, output_file):
    """Pre-process one source file and write the result to another file.

    The input is read as UTF-8, passed through a fixed sequence of regex
    clean-ups and the helper transformations below, and written as UTF-8 to
    ``output_file`` (which is overwritten if it exists).

    Args:
        input_file: Path of the file to read.
        output_file: Path of the file to write.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        content = f.read()

    # Remove links with `[@]` and a space before it
    content = re.sub(r"\s*\[@\].*?[\]\)]", "", content)

    # Remove square brackets enclosing the caption
    content = re.sub(r"^\[(.*)\}\]$", r"\n :\1}", content, flags=re.MULTILINE)

    # Merge multiple adjacent citations into one
    content = re.sub(r"\][\(\[].*?;\s*\[", "; ", content)

    # Replace '{{\<...\>}}' with '{{<...>}}'.
    # The body is matched lazily so that several shortcodes on one line are
    # unescaped one by one; a greedy match would span from the first `{{\<`
    # to the last `\>}}` and leave the escapes in between in place.
    content = re.sub(r"\{\{\\<(.*?)\\>}}", r"{{<\1>}}", content)

    # Remove comment blocks to avoid errors of Python filter
    content = re.sub(r"^```{=comment}.*?^```$", "",
                     content, flags=re.DOTALL | re.MULTILINE)

    # Randomize footnote identifiers
    content = randomize_footnote_identifiers(content)

    # Convert reference-style links to inline links
    content = convert_reference_to_inline(content)

    # Remove line breaks in quotes
    content = remove_linebreaks_in_quotes(content)

    # Reformat math equations
    content = reformat_math_equations(content)

    with open(output_file, "w", encoding="utf-8") as f:
        f.write(content)
def main():
    """Pre-process every source in ``contents/`` into ``contents_tmp/``.

    Numbered ``*.md`` files are written as ``*.qmd`` files, then the
    ``*.qmd`` files already present in ``contents/`` are processed under
    their own names. Both kinds end up in ``contents_tmp/``, which is created
    if necessary; a ``.qmd`` source overwrites the output of a ``.md`` source
    with the same base name because it is processed later.

    All paths are relative to the current working directory, which is left
    unchanged.
    """
    source_dir = "contents"
    tmp_dir = "contents_tmp"

    md_files = get_md_files()

    # Create contents_tmp directory if it doesn't exist
    os.makedirs(tmp_dir, exist_ok=True)

    # Convert *.md files to *.qmd files in contents_tmp directory.
    # Only the extension is swapped; a ".md" elsewhere in the file name is
    # kept as it is.
    for md_file in md_files:
        stem, _ = os.path.splitext(os.path.basename(md_file))
        process_file(md_file, os.path.join(tmp_dir, stem + ".qmd"))

    # Process existing .qmd files in contents directory and output to
    # contents_tmp. Globbing with the directory prefix avoids changing the
    # process-wide working directory.
    for qmd_file in glob.glob(os.path.join(source_dir, "*.qmd")):
        output_file = os.path.join(tmp_dir, os.path.basename(qmd_file))
        process_file(qmd_file, output_file)
# Run the conversion only when the file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()