add _extensions

This commit is contained in:
2026-05-21 13:37:53 +08:00
parent 6a9a5fc90e
commit 61bd0bea2f
252 changed files with 33972 additions and 1 deletions
@@ -0,0 +1,170 @@
--[[
capitalize-subtitle – Capitalize the first letter after colons and em dashes in bibliography entries
This filter capitalizes the first letter after colons and em dashes in bibliography
entries, following APA and similar styles that require subtitle capitalization.
It only processes paragraphs within bibliography divs.
It must be run after citeproc.
Copyright: © 2025–present Tom Ben
License: MIT
]]
-- Typographic quotes, corner brackets and dashes that are treated as
-- punctuation in addition to the %s and %p character classes.
local EXTRA_PUNCT = "“”‘’«»‹›„‟「」『』﹁﹂﹃﹄–—‐"
-- Characters accepted inside a word besides letters (the hyphen is escaped
-- because these fragments are spliced into Lua character classes).
local APOSTROPHES, HYPHENS = "'", "%-"
-- Body of a character class: whitespace, punctuation and the extras above.
local AFTER_PUNCT_CLASS = table.concat({ "%s%p", EXTRA_PUNCT })
-- Zero or more word characters, used after a word's first letter.
local WORD_TAIL_PATTERN = table.concat({ "[%a", APOSTROPHES, HYPHENS, "]*" })
-- A run of non-word characters that reaches the end of the string.
local TRIGGER_TAIL_PATTERN = table.concat({ "[^%a", APOSTROPHES, HYPHENS, "]*$" })
--- Tell whether a character counts as punctuation or whitespace.
-- A missing or empty character is reported as punctuation.
-- @param char string to test (callers pass a single character)
-- @return true when char is nil/empty, matches %s or %p, or occurs in EXTRA_PUNCT
local function is_punctuation(char)
  if not (char and char ~= "") then
    return true
  end
  local is_ascii_mark = char:match("[%s%p]") ~= nil
  -- Plain (non-pattern) search so that the character is looked up literally.
  return is_ascii_mark or EXTRA_PUNCT:find(char, 1, true) ~= nil
end
--- Marks that introduce a subtitle when they occur inside a single string.
-- Each entry has:
--   mark: the literal text after which the next lowercase letter is capitalized;
--   skip: optional predicate on the punctuation captured between the mark and
--         the letter; a truthy result leaves the letter unchanged.
local INTERNAL_MARKS = {
  {
    mark = ":",
    -- A colon followed directly by a letter, or by punctuation containing "/",
    -- is left alone so that URLs and protocols are not altered.
    skip = function(punct)
      return punct == "" or punct:find("/")
    end,
  },
  -- Em dash (U+2014), spelled as UTF-8 byte escapes so the character cannot be
  -- lost in transit. The mark must not be empty: a pattern built from an empty
  -- mark matches every lowercase letter in the text.
  { mark = "\226\128\148" },
}
--- Upper-case a lowercase letter that follows one of INTERNAL_MARKS inside a string.
-- Punctuation and whitespace between the mark and the letter are preserved.
-- @param text string to scan
-- @return the string with every qualifying letter capitalized
local function capitalize_internal_marks(text)
  local result = text
  for index = 1, #INTERNAL_MARKS do
    local entry = INTERNAL_MARKS[index]
    local mark, should_skip = entry.mark, entry.skip
    local pattern = mark .. "([" .. AFTER_PUNCT_CLASS .. "]*)([a-z])"
    -- Assigning to a single variable drops gsub's replacement count.
    result = result:gsub(pattern, function(gap, letter)
      -- The skip predicate demands some separation after a colon
      -- (e.g. a space or quote) so URLs and protocols stay untouched.
      if should_skip and should_skip(gap) then
        return mark .. gap .. letter
      end
      return mark .. gap .. letter:upper()
    end)
  end
  return result
end
--- Capitalize the first word of a string when it starts with a lowercase letter.
-- Leading whitespace and punctuation (including EXTRA_PUNCT) are skipped first.
-- @param str string to inspect
-- @return the possibly modified string
-- @return true when the pending capitalization was used up by this string,
--   false when it should carry over to the next one
local function capitalize_first(str)
  if not (str and str ~= "") then
    return str, false
  end
  local prefix, rest = str:match("^([%s%p" .. EXTRA_PUNCT .. "]*)(.*)")
  if not rest or rest == "" then
    -- Nothing but punctuation: keep waiting for a real word.
    return str, false
  end
  if rest:match("^([A-Z]" .. WORD_TAIL_PATTERN .. ")") then
    -- Already capitalized; nothing to change, but the request is satisfied.
    return str, true
  end
  local head, tail = rest:match("^([a-z]" .. WORD_TAIL_PATTERN .. ")(.*)")
  if head == nil then
    -- The string starts with something that is not an ASCII letter: a
    -- non-punctuation start uses up the request without changing the text.
    return str, not is_punctuation(rest:sub(1, 1))
  end
  -- Capitalize only when the word is purely alphabetic (hyphens and
  -- apostrophes allowed) and is followed by punctuation, whitespace or
  -- nothing. This leaves identifiers such as e2105061118 untouched.
  if tail ~= "" and not is_punctuation(tail:sub(1, 1)) then
    return str, false
  end
  return prefix .. head:sub(1, 1):upper() .. head:sub(2) .. tail, true
end
-- Declared before its definition because process_inlines and
-- rebuild_container call each other.
local process_inlines
-- Constructors for inline containers that are rebuilt from their processed
-- content alone, keyed by element tag.
local SIMPLE_WRAPPERS = {}
for _, tag in ipairs({ "Emph", "Strong", "SmallCaps" }) do
  SIMPLE_WRAPPERS[tag] = pandoc[tag]
end
--- Process the content of an inline container and rebuild the container.
-- Handles the SIMPLE_WRAPPERS elements, Span and Quoted.
-- @param elem element to rebuild
-- @param capitalize_next whether the next word should be capitalized
-- @return the rebuilt element, or nil when elem has no content or is of an
--   unsupported type
-- @return the updated capitalize_next flag
local function rebuild_container(elem, capitalize_next)
  local content = elem.content
  if not content then
    return nil, capitalize_next
  end
  local inner, flag = process_inlines(content, capitalize_next)
  local tag = elem.t
  local constructor = SIMPLE_WRAPPERS[tag]
  local rebuilt = nil
  if constructor then
    rebuilt = constructor(inner)
  elseif tag == "Span" then
    -- Spans keep their attributes.
    rebuilt = pandoc.Span(inner, elem.attr)
  elseif tag == "Quoted" then
    rebuilt = pandoc.Quoted(elem.quotetype, inner)
  end
  return rebuilt, flag
end
--- Process a list of inlines recursively, capitalizing the word that follows
--- a colon or an em dash.
-- Marks inside a single Str are handled by capitalize_internal_marks; a mark
-- at the end of a Str sets a flag so that the next word, possibly in a later
-- element or nested container, is capitalized by capitalize_first.
-- @param inlines_list list of inline elements
-- @param capitalize_next whether the first word encountered should be
--   capitalized (defaults to false)
-- @return a new list with the processed inlines
-- @return the capitalize_next flag as it stands after the last element
-- This assigns the local `process_inlines` that is forward-declared earlier
-- in the file, so no `local` keyword is used here.
function process_inlines(inlines_list, capitalize_next)
  -- Em dash (U+2014) as UTF-8 byte escapes. With an empty string in its place
  -- the trailing-mark pattern matches every string, which would request
  -- capitalization after every single word.
  local em_dash = "\226\128\148"
  local colon_at_end = ":" .. TRIGGER_TAIL_PATTERN
  local dash_at_end = em_dash .. TRIGGER_TAIL_PATTERN

  local result = {}
  capitalize_next = capitalize_next or false
  for _, elem in ipairs(inlines_list) do
    local tag = elem.t
    if tag == "Str" then
      local text = capitalize_internal_marks(elem.text)
      if capitalize_next then
        local consumed
        text, consumed = capitalize_first(text)
        -- Keep the request pending when this string held no word to capitalize.
        capitalize_next = not consumed
      end
      -- A colon or em dash followed only by non-word characters up to the end
      -- of the string means the subtitle starts in a following element.
      if text:find(colon_at_end) or text:find(dash_at_end) then
        capitalize_next = true
      end
      result[#result + 1] = pandoc.Str(text)
    elseif tag == "Space" or tag == "SoftBreak" or tag == "LineBreak" then
      -- Whitespace neither uses up nor cancels a pending capitalization.
      result[#result + 1] = elem
    else
      local rebuilt
      rebuilt, capitalize_next = rebuild_container(elem, capitalize_next)
      if rebuilt then
        result[#result + 1] = rebuilt
      else
        -- Elements that cannot be rebuilt are kept as they are and cancel any
        -- pending capitalization.
        result[#result + 1] = elem
        capitalize_next = false
      end
    end
  end
  return result, capitalize_next
end
--- Build a new paragraph whose content has subtitle capitalization applied.
-- @param para paragraph element
-- @return a pandoc.Para holding the processed inlines
local function process_para(para)
  -- The parentheses keep only the first return value (the inline list).
  local inlines = (process_inlines(para.content, false))
  return pandoc.Para(inlines)
end
--- Apply subtitle capitalization to the paragraphs of bibliography divs.
-- A div qualifies when it carries the class "references", "csl-bib-body" or
-- "csl-entry"; every other div is left unchanged.
function Div(div)
  local classes = div.classes
  local is_bibliography = classes:includes("references")
    or classes:includes("csl-bib-body")
    or classes:includes("csl-entry")
  if not is_bibliography then
    return
  end
  return pandoc.walk_block(div, { Para = process_para })
end
@@ -0,0 +1,91 @@
--- Modified from https://github.com/bcdavasconcelos/citetools/blob/main/_extensions/citetools/citation-backlinks.lua
--- Todo: even with link-citations and link-fields turned off, citation-backlinks still adds a single backlink to each reference in the final bibliography div.
--- citation-backlinks.lua adds citation backlinks to the bibliography
-- https://github.com/tarleb/citation-backlinks
--- Copyright: © 2022–2024 John MacFarlane and Albert Krewinkel and Bernardo Vasconcelos
--- License: MIT – see LICENSE for details
-- Let users know when their pandoc version is too old for this filter.
PANDOC_VERSION:must_be_at_least('2.17')
-- cites: maps a citation item identifier to an array of cite identifiers.
-- cite_number: counter used to build the next cite identifier.
local cites, cite_number = {}, 1
--- Pair an element with a format-specific label.
-- @param s label identifier
-- @param el inline element to label
-- @return a list of inlines: a raw label plus the element for LaTeX and
--   Typst output, or the element alone for every other output format
local function with_label(s, el)
  if FORMAT == "latex" then
    return { pandoc.RawInline("latex", "\\label{" .. s .. "}"), el }
  elseif FORMAT == "typst" then
    return { el, pandoc.RawInline("typst", " #label(\"" .. s .. "\")") }
  end
  -- Every remaining format gets the bare element. Returning a list for all
  -- formats (not only docx and html) keeps the caller from handing nil to
  -- pandoc.Span.
  return { el }
end
--- Record where each reference is cited and wrap the citation in a Span.
-- Every Cite element gets a fresh identifier "cite_<n>". That identifier is
-- stored under the id of each citation whose mode is 'NormalCitation'.
-- @param el Cite element
-- @return a Span around the labelled citation; for non-Typst output the Span
--   carries the cite identifier as its attribute
function Cite(el)
  local cite_id = "cite_" .. cite_number
  cite_number = cite_number + 1

  for _, citation in ipairs(el.citations) do
    -- Only citations made in the text are tracked, excluding 'nocite'.
    if citation.mode == 'NormalCitation' then
      local key = citation.id
      local recorded = cites[key]
      if not recorded then
        cites[key] = { cite_id }
      else
        recorded[#recorded + 1] = cite_id
      end
    end
  end

  local labelled = with_label(cite_id, el)
  if FORMAT == "typst" then
    return pandoc.Span(labelled)
  end
  return pandoc.Span(labelled, pandoc.Attr(cite_id))
end
--- Append inlines to the end of a list of blocks.
-- The inlines are added to the last block when it is a Para or Plain;
-- otherwise, including when the list is empty, they are added as a new Plain
-- block.
-- @param blocks list of blocks, modified in place
-- @param inlines list of inlines to append
-- @return the same blocks list
function append_inline(blocks, inlines)
  local last = blocks[#blocks]
  -- `last` is nil for an empty list; guard before reading its tag.
  if last and (last.t == 'Para' or last.t == 'Plain') then
    last.content:extend(inlines)
  else
    blocks[#blocks + 1] = pandoc.Plain(inlines)
  end
  return blocks
end
--- Append backlinks to a bibliography entry.
-- A div is treated as a bibliography entry when its identifier starts with
-- "ref-"; the remainder of the identifier is the citation id used to look up
-- the recorded cite identifiers. Other divs are left untouched.
-- @param el Div element
-- @return the modified div, or nothing when the div is not a bibliography entry
function Div(el)
  -- Anchored at the start so that identifiers merely containing "ref-"
  -- (e.g. "xref-foo") are not mistaken for bibliography entries.
  local citation_id = el.identifier:match("^ref%-(.+)")
  if not citation_id then
    return
  end

  local is_latex = FORMAT == "latex"
  local backlinks = pandoc.Inlines { pandoc.Space(), pandoc.Str("[") }
  if is_latex then
    -- NOTE: this "go back" link makes the list longer than two items, so for
    -- LaTeX the bracketed group is appended even when no cites were recorded.
    table.insert(backlinks,
      pandoc.RawInline("latex", "\\Acrobatmenu{GoBack}{$\\hookleftarrow$}"))
  end

  for i, cite_id in ipairs(cites[citation_id] or {}) do
    local marker
    if is_latex then
      marker = pandoc.RawInline("latex", "\\pageref{" .. cite_id .. "}")
    else
      marker = pandoc.Str(tostring(i))
    end
    -- Separate from whatever already follows the opening bracket.
    if #backlinks > 2 then
      table.insert(backlinks, pandoc.Str(","))
      table.insert(backlinks, pandoc.Space())
    end
    table.insert(backlinks, pandoc.Link(marker, "#" .. cite_id))
  end

  -- Only append when something follows the opening bracket.
  if #backlinks > 2 then
    append_inline(el.content, backlinks .. { pandoc.Str("]") })
  end
  return el
end
@@ -0,0 +1,254 @@
"""
Citation Tools for Academic Writing
This script provides utilities for managing citations in academic writing:
1. Extract citation keys from Markdown files and create a filtered bibliography
2. Copy cited reference files to a specified directory for backup or sharing
Typical usage:
python citation-tools.py --extract
python citation-tools.py --copy
Copyright: © 2025–Present Tom Ben
License: MIT License
"""
import os
import re
import shutil
import argparse
import json
from pathlib import Path
def extract_citation_keys(markdown_file):
    """Return the set of citation keys used in a Markdown file.

    Two forms are recognised: keys that directly follow ``[@`` and standalone
    ``@key`` occurrences not preceded by a letter or digit. Keys starting with
    ``fig-`` or ``tbl-`` are cross-references and are left out.
    """
    with open(markdown_file, 'r', encoding='utf-8') as handle:
        text = handle.read()

    # Key right after an opening bracket: [@key] or [@key1; @key2]
    bracketed = r'\[@([a-zA-Z0-9\-]+)(?:[\s\]\;\,]|$)'
    # Standalone @key
    standalone = r'(?<![a-zA-Z0-9])@([a-zA-Z0-9\-]+)(?:[\s\.\,\;\:\)\]\}]|$)'

    found = set()
    for pattern in (bracketed, standalone):
        found.update(re.findall(pattern, text))

    cross_reference_prefixes = ('fig-', 'tbl-')
    return {key for key in found
            if not key.startswith(cross_reference_prefixes)}
def load_csl_entries(csl_json_file):
    """Read a CSL JSON file and return its list of entries.

    Raises:
        ValueError: if the top-level JSON value is not a list.
    """
    with open(csl_json_file, 'r', encoding='utf-8') as handle:
        entries = json.load(handle)
    if isinstance(entries, list):
        return entries
    raise ValueError(
        f"Expected a list of CSL JSON entries, got {type(entries)} instead.")
def parse_file_field(file_field):
    """Split a semicolon-separated ``file`` field into a list of paths.

    Anything that is empty or not a string yields an empty list; surrounding
    whitespace is stripped and blank segments are dropped.
    """
    if not (file_field and isinstance(file_field, str)):
        return []
    segments = (segment.strip() for segment in file_field.split(';'))
    return [segment for segment in segments if segment]
def build_citation_file_index(entries):
    """Map each entry id to the list of file paths attached to that entry.

    Entries without an id, or whose ``file`` field yields no paths, are
    omitted from the result.
    """
    index = {}
    for entry in entries:
        key = entry.get('id')
        if key:
            attached = parse_file_field(entry.get('file'))
            if attached:
                index[key] = attached
    return index
def extract_csl_json_entries(csl_json_file, citation_keys, remove_fields=None):
    """Serialise the entries whose id is among ``citation_keys``.

    Args:
        csl_json_file: path of the CSL JSON bibliography to read.
        citation_keys: iterable of ids to keep.
        remove_fields: field names dropped from every kept entry; defaults
            to ``['file']``.

    Returns:
        A JSON string (2-space indent, non-ASCII preserved, trailing newline)
        holding the kept entries sorted by id.
    """
    dropped = ['file'] if remove_fields is None else remove_fields
    entries = load_csl_entries(csl_json_file)
    wanted = set(citation_keys)

    selected = [
        {name: value for name, value in entry.items() if name not in dropped}
        for entry in entries
        if entry.get('id') and entry.get('id') in wanted
    ]
    selected.sort(key=lambda item: item.get('id', ''))
    return json.dumps(selected, ensure_ascii=False, indent=2) + '\n'
def copy_cited_files(args):
    """Copy the files attached to cited references into ``args.output_dir``.

    Reads ``args.clean``, ``args.output_dir``, ``args.bib`` and
    ``args.content_dir``. Each copied file is named after its citation key and
    keeps the extension of the source file. A summary is printed at the end.

    Returns:
        The set of citation keys found in the Markdown files.
    """
    out_dir = args.output_dir

    # Start from an empty directory when asked to.
    if args.clean and os.path.exists(out_dir):
        print(f"Cleaning output directory: {out_dir}")
        shutil.rmtree(out_dir)
    os.makedirs(out_dir, exist_ok=True)

    # Citation id -> attached file paths
    attachments = build_citation_file_index(load_csl_entries(args.bib))

    # Collect the keys cited by the numbered Markdown files.
    markdown_files = list(Path(args.content_dir).glob('[0-9]*.md'))
    all_keys = set()
    for md_file in markdown_files:
        all_keys |= extract_citation_keys(md_file)

    copied_count = 0
    missing_keys = []
    not_found_pairs = []
    for key in all_keys:
        paths = attachments.get(key)
        if paths is None:
            missing_keys.append(key)
            continue

        # Prefer the first attachment that exists on disk; fall back to the
        # first listed path for naming and reporting.
        existing_path = next(
            (candidate for candidate in paths if os.path.exists(candidate)),
            None)
        source_path = existing_path or paths[0]
        file_extension = os.path.splitext(source_path)[1]
        dest_path = os.path.join(out_dir, f"{key}{file_extension}")
        try:
            if existing_path and os.path.exists(existing_path):
                shutil.copy2(existing_path, dest_path)
                copied_count += 1
            else:
                not_found_pairs.append((key, source_path))
        except Exception as e:
            print(f"Error copying {key}: {e}")

    missing_count = len(missing_keys)
    file_not_found_count = len(not_found_pairs)

    # Summary
    print(f"Markdown files in content directory: {len(markdown_files)}")
    print(f"Total unique citation keys found: {len(all_keys)}")
    print(f"Files successfully copied: {copied_count}")
    print(f"Citation keys without file paths: {missing_count}")
    print(
        f"Files not found (path exists in bibliography but file missing): {file_not_found_count}")
    if missing_keys:
        print("\nCitation keys without file paths:")
        for key in sorted(missing_keys):
            print(f" - {key}")
    if not_found_pairs:
        print("\nCitation keys where file wasn't found:")
        for key, path in sorted(not_found_pairs):
            print(f" - {key}: {path}")
    return all_keys
def extract_citations(args):
    """Write a CSL JSON file containing only the cited entries.

    Reads ``args.content_dir``, ``args.bib``, ``args.remove_fields`` and
    ``args.output_bib``, and prints a short summary.

    Returns:
        The set of citation keys found in the Markdown files.
    """
    # Numbered Markdown files in the content directory
    markdown_files = list(Path(args.content_dir).glob('[0-9]*.md'))

    # Union of the keys cited in each file
    all_keys = set().union(
        *(extract_citation_keys(md_file) for md_file in markdown_files))

    json_content = extract_csl_json_entries(
        args.bib, all_keys, args.remove_fields)
    with open(args.output_bib, 'w', encoding='utf-8') as handle:
        handle.write(json_content)

    print(f"Markdown files in content directory: {len(markdown_files)}")
    print(f"Total unique citation keys found: {len(all_keys)}")
    print(f"Extracted citations to `{args.output_bib}`")
    return all_keys
def main():
    """Parse the command line and run the requested action.

    ``--extract`` takes precedence over ``--copy``; with neither flag the help
    text is printed.
    """
    # The project root is the parent directory when this script lives in a
    # directory named "_extensions", otherwise the script's own directory.
    here = Path(__file__).parent.resolve()
    project_root = here.parent if here.name == "_extensions" else here

    default_bib = os.path.expanduser(
        "~/Library/CloudStorage/Dropbox/pkm/bibliography.json")
    default_content_dir = str(project_root / "contents")

    parser = argparse.ArgumentParser(
        description='Citation tools for extracting and copying cited references')

    # Declared in display order: action flags, shared options, then the
    # options specific to extracting and to copying.
    option_specs = [
        ('--extract', dict(
            action='store_true',
            help='Extract citations to a filtered CSL JSON file')),
        ('--copy', dict(
            action='store_true',
            help='Copy cited files to a directory')),
        ('--bib', dict(
            default=default_bib,
            help=f'Path to bibliography.json file (default: {default_bib})')),
        ('--content_dir', dict(
            default=default_content_dir,
            help=f'Path to content directory with Markdown files (default: {default_content_dir})')),
        ('--output_bib', dict(
            default=str(project_root / "citebib.json"),
            help=f'Path to output CSL JSON file (default: {project_root}/citebib.json)')),
        ('--remove_fields', dict(
            nargs='+',
            default=['file'],
            help='Fields to remove from CSL JSON entries (default: file)')),
        ('--output_dir', dict(
            default=os.path.expanduser("~/Downloads/cited-docs"),
            help='Path to output directory for copied files (default: ~/Downloads/cited-docs)')),
        ('--clean', dict(
            action='store_true',
            help='Clean the output directory before copying files')),
    ]
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    args = parser.parse_args()

    if args.extract:
        handler = extract_citations
    elif args.copy:
        handler = copy_cited_files
    else:
        handler = None

    if handler is None:
        parser.print_help()
    else:
        handler(args)


if __name__ == "__main__":
    main()
+161
View File
@@ -0,0 +1,161 @@
# Convert *.md files to *.qmd files and pre-process them
# Randomize footnote identifiers in multiple Quarto files to avoid conflicts
# Convert reference-style links to inline links
# Remove line breaks within a straight angle quotation mark
# Reformat display math equations in Ulysses
# Copyright: © 2024–Present Tom Ben
# License: MIT License
import re
import glob
import os
import random
import string
def get_md_files():
    """Return the ``*.md`` files in ``contents/`` whose names start with a digit."""
    return list(glob.glob("contents/[0-9]*.md"))
def randomize_footnote_identifiers(qmd_content):
    """Replace numeric footnote identifiers with random 5-character ones.

    Every ``[^<digits>]`` reference and definition gets a new identifier drawn
    from ASCII letters and digits; the same old number always maps to the same
    new identifier. Footnotes with non-numeric labels are left unchanged.

    Args:
        qmd_content: document text.

    Returns:
        The text with renamed footnote identifiers.
    """
    alphabet = string.ascii_letters + string.digits
    numeric_ref = re.compile(r'\[\^(\d+)\]')

    # Labels already present (numeric or not) must not be handed out again,
    # otherwise two distinct footnotes would end up sharing an identifier.
    taken = set(re.findall(r'\[\^([^\]\s]+)\]', qmd_content))

    mapping = {}
    # Sorted so that the result is reproducible under a seeded ``random``.
    for old_id in sorted(set(numeric_ref.findall(qmd_content))):
        new_id = ''.join(random.choices(alphabet, k=5))
        while new_id in taken:
            new_id = ''.join(random.choices(alphabet, k=5))
        taken.add(new_id)
        mapping[old_id] = new_id

    # A single pass replaces references and definitions alike (a definition is
    # a reference followed by ':') and can never rewrite an identifier that
    # was itself just generated.
    return numeric_ref.sub(
        lambda match: f'[^{mapping[match.group(1)]}]', qmd_content)
def convert_reference_to_inline(qmd_content):
    """Convert numeric reference-style links to inline links.

    Definitions of the form ``[<digits>]: <url>`` that start on a new line are
    collected and removed, and every ``[text][<digits>]`` usage with a known
    number becomes ``[text](<url>)``. A usage whose number has no definition
    is left exactly as written rather than being turned into a link with an
    empty URL.

    Args:
        qmd_content: document text.

    Returns:
        The text with inline links.
    """
    reference_pattern = re.compile(r'\n\[(\d+)\]:\s*(.*)')

    # key -> url; a later definition of the same key overrides an earlier one.
    reference_links = dict(reference_pattern.findall(qmd_content))

    # Drop the definitions themselves.
    qmd_content = reference_pattern.sub('', qmd_content)

    def replace_link(match):
        url = reference_links.get(match.group(2))
        if url is None:
            # Unknown reference: keep the original text so nothing is lost.
            return match.group(0)
        return f'[{match.group(1)}]({url})'

    return re.sub(r'\[(.*?)\]\[(\d+)\]', replace_link, qmd_content)
def remove_linebreaks_in_quotes(text):
    """Delete newlines that occur between ``「`` and the next ``」``.

    Text outside such quoted spans is returned unchanged.
    """
    quoted_span = re.compile(r'「[^」]*?」')
    return quoted_span.sub(
        lambda found: found.group(0).replace('\n', ''), text)
def reformat_math_equations(content):
    """Rewrite display equations into block form (``$$`` on their own lines).

    Two cases are handled:

    * ``$...$ {#label}`` — an equation immediately followed by a label becomes
      ``$$\\n...\\n$$ {#label}``;
    * ``$$...$$`` — becomes ``$$\\n...\\n$$``.

    The labeled pattern does not let the equation body contain ``$`` or a
    newline, so the match always starts at the ``$`` that opens the labeled
    equation instead of at an earlier formula on the same line, and an
    already-doubled ``$$...$$ {#label}`` is handled by the second step rather
    than being wrapped a second time.

    Args:
        content: document text.

    Returns:
        The text with reformatted equations.
    """
    labeled_pattern = r"\$([^$\n]+?)\$ *(\{#.+?\})"

    def replace_with_labeled_block(match):
        equation = match.group(1).strip()
        label = match.group(2).strip()
        return f"$$\n{equation}\n$$ {label}"

    content = re.sub(labeled_pattern, replace_with_labeled_block, content)

    # `$$ ... $$` not directly followed by a label
    display_pattern = r"(?<!\$)\$\$([^\$]+?)\$\$(?!\{#)"

    def replace_with_display_block(match):
        equation = match.group(1).strip()
        return f"$$\n{equation}\n$$"

    return re.sub(display_pattern, replace_with_display_block, content)
def process_file(input_file, output_file):
    """Read ``input_file``, apply all pre-processing steps, write ``output_file``.

    The regular-expression clean-ups run first, in order, followed by the
    footnote, link, quotation and math transformations.
    """
    with open(input_file, "r", encoding="utf-8") as source:
        text = source.read()

    # (pattern, replacement, flags), applied in this order
    substitutions = (
        # Remove links with `[@]` and a space before it
        (r"\s*\[@\].*?[\]\)]", "", 0),
        # Remove square brackets enclosing the caption
        (r"^\[(.*)\}\]$", r"\n :\1}", re.MULTILINE),
        # Merge multiple adjacent citations into one
        (r"\][\(\[].*?;\s*\[", "; ", 0),
        # Replace '{{\<...\>}}' with '{{<...>}}'
        (r"\{\{\\<(.*)\\>}}", r"{{<\1>}}", 0),
        # Remove comment blocks to avoid errors of Python filter
        (r"^```{=comment}.*?^```$", "", re.DOTALL | re.MULTILINE),
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)

    transforms = (
        randomize_footnote_identifiers,
        convert_reference_to_inline,
        remove_linebreaks_in_quotes,
        reformat_math_equations,
    )
    for transform in transforms:
        text = transform(text)

    with open(output_file, "w", encoding="utf-8") as target:
        target.write(text)
def main():
    """Pre-process the sources in ``contents/`` into ``contents_tmp/``.

    Numbered ``*.md`` files are written as ``*.qmd`` files; existing ``*.qmd``
    files are processed under their own names. The working directory is left
    unchanged.
    """
    tmp_dir = "contents_tmp"
    os.makedirs(tmp_dir, exist_ok=True)

    # Markdown sources: swap only the final extension, so a ".md" occurring
    # elsewhere in the file name is preserved.
    for md_file in get_md_files():
        stem = os.path.splitext(os.path.basename(md_file))[0]
        process_file(md_file, os.path.join(tmp_dir, stem + ".qmd"))

    # Existing .qmd sources, addressed by path instead of changing directory.
    for qmd_file in glob.glob(os.path.join("contents", "*.qmd")):
        process_file(qmd_file,
                     os.path.join(tmp_dir, os.path.basename(qmd_file)))


if __name__ == "__main__":
    main()
+17
View File
@@ -0,0 +1,17 @@
--- Get a BibTeX/BibLaTeX or CSL JSON file cited from a large database
--- Source: https://pandoc.org/lua-filters.html#pandoc.utils.references
--- https://fosstodon.org/@pandoc/109549882954402931
--- Copyright: © 2022–Present Albert Krewinkel
--- License: MIT License
-- *Note*: For BibLaTeX, it is needed to change the following entry name:
-- - journal -> journaltitle
-- - address -> location
-- - publisher -> institution (only for thesis)
--- Embed the document's references in its metadata.
-- Stores the result of pandoc.utils.references(doc) in the `references`
-- metadata field, clears the `bibliography` metadata field, and returns the
-- modified document.
function Pandoc(doc)
doc.meta.references = pandoc.utils.references(doc)
doc.meta.bibliography = nil
return doc
end
@@ -0,0 +1,14 @@
--- Remove hyperlinks from DOIs while preserving the DOI text
--- Useful for styles without DOIs but you don't want to edit the bibliography file
--- Copyright: © 2024–Present Tom Ben
--- License: MIT License
--- Unwrap links that point to doi.org, keeping only their text.
-- @param el Link element
-- @return the link's content when its target starts with http(s)://doi.org,
--   otherwise the link itself
function Link(el)
  local is_doi_link = el.target:match("^https?://doi%.org") ~= nil
  if not is_doi_link then
    return el
  end
  -- Returning the content drops the link wrapper but keeps the DOI text.
  return el.content
end