add _extensions

2026-05-21 13:37:53 +08:00
parent 6a9a5fc90e
commit 61bd0bea2f
252 changed files with 33972 additions and 1 deletions
@@ -0,0 +1,170 @@
+--[[
+capitalize-subtitle – Capitalize first letter after colons and em dashes in bibliography entries
+
+This filter capitalizes the first letter after colons and em dashes in bibliography
+entries, following APA and similar styles that require subtitle capitalization.
+
+It only processes paragraphs within bibliography divs
+Must be run after Citeproc
+
+Copyright: © 2025–present Tom Ben
+License: MIT
+]]
+
+-- Typographic quotes, corner brackets, and dashes that count as punctuation
+-- in addition to the ASCII characters matched by %p.
+-- NOTE: Lua patterns work on bytes, so when this string is spliced into a
+-- character class each multi-byte character contributes its individual bytes.
+local EXTRA_PUNCT = "“”‘’«»‹›„‟「」『』﹁﹂﹃﹄–—‐"
+-- Apostrophes allowed inside a word (straight and typographic).
+local APOSTROPHES = "'’"
+-- Hyphens allowed inside a word; the ASCII hyphen is escaped for use in a class.
+local HYPHENS = "%-‐"
+-- Class body for characters that may sit between a colon/em dash and the next word.
+local AFTER_PUNCT_CLASS = "%s%p" .. EXTRA_PUNCT
+-- Matches the remainder of a word: letters, apostrophes, and hyphens.
+local WORD_TAIL_PATTERN = "[%a" .. APOSTROPHES .. HYPHENS .. "]*"
+-- Matches a run of non-word characters up to the end of the string; appended to
+-- a mark to detect a string that ends with that mark.
+local TRIGGER_TAIL_PATTERN = "[^%a" .. APOSTROPHES .. HYPHENS .. "]*$"
+
+--- Tell whether `char` counts as punctuation or whitespace.
+-- A missing or empty value is treated as punctuation (i.e. as a boundary).
+-- Otherwise the value is punctuation when it matches %s or %p, or when it
+-- occurs as a plain substring of EXTRA_PUNCT.
+-- @param char string to classify (callers pass a single byte)
+-- @return boolean
+local function is_punctuation(char)
+    if char and char ~= "" then
+        local is_ascii_punct = char:match("[%s%p]") ~= nil
+        return is_ascii_punct or EXTRA_PUNCT:find(char, 1, true) ~= nil
+    end
+    return true
+end
+
+-- Marks after which the next lowercase letter inside the same string is
+-- capitalized. An optional `skip(punct)` predicate receives the characters
+-- found between the mark and the letter and returns a truthy value when the
+-- letter must be left unchanged.
+local INTERNAL_MARKS = {
+    {
+        mark = ":",
+        -- Skip when nothing separates the colon from the letter, or when a
+        -- slash appears between them (as in "http://").
+        skip = function(punct)
+            return punct == "" or punct:find("/")
+        end,
+    },
+    -- Em dash: no skip rule, the following letter is always capitalized.
+    { mark = "—" },
+}
+
+--- Capitalize the lowercase letter that follows each configured mark inside a
+-- single string.
+-- For every entry of INTERNAL_MARKS the text is scanned for the mark, an
+-- optional run of whitespace/punctuation, and an ASCII lowercase letter. The
+-- letter is upper-cased unless the entry's `skip` predicate accepts the run
+-- (for the colon this avoids URLs and protocols).
+-- @param text string to transform
+-- @return the transformed string
+local function capitalize_internal_marks(text)
+    for _, config in ipairs(INTERNAL_MARKS) do
+        local mark, should_skip = config.mark, config.skip
+        local pattern = mark .. "([" .. AFTER_PUNCT_CLASS .. "]*)([a-z])"
+
+        local function replace(gap, letter)
+            if not (should_skip and should_skip(gap)) then
+                letter = letter:upper()
+            end
+            return mark .. gap .. letter
+        end
+
+        -- gsub also returns a match count; the assignment keeps only the string.
+        text = text:gsub(pattern, replace)
+    end
+
+    return text
+end
+
+--- Capitalize the first word of `str` when it starts with a lowercase letter.
+-- Leading whitespace and punctuation (including EXTRA_PUNCT) are skipped first.
+-- @param str string to transform (may be nil or empty)
+-- @return the (possibly unchanged) string
+-- @return boolean: true when the pending capitalization has been consumed by
+--   this string, false when the caller should keep looking at later strings
+local function capitalize_first(str)
+    if not str or str == "" then
+        return str, false
+    end
+
+    -- Split off leading punctuation/spaces from the first word.
+    local leading, remainder = str:match("^([%s%p" .. EXTRA_PUNCT .. "]*)(.*)")
+    if not remainder or remainder == "" then
+        return str, false
+    end
+
+    -- Word already starts with an uppercase letter: nothing to change.
+    if remainder:match("^([A-Z]" .. WORD_TAIL_PATTERN .. ")") then
+        return str, true
+    end
+
+    local word, suffix = remainder:match("^([a-z]" .. WORD_TAIL_PATTERN .. ")(.*)")
+    if not word then
+        -- Neither upper- nor lowercase ASCII start: consumed unless the first
+        -- byte is itself punctuation.
+        return str, not is_punctuation(remainder:sub(1, 1))
+    end
+
+    -- Only capitalize a purely alphabetic word (hyphens/apostrophes allowed)
+    -- that is followed by punctuation, whitespace, or nothing; this leaves
+    -- identifiers such as e2105061118 untouched.
+    if suffix ~= "" and not is_punctuation(suffix:sub(1, 1)) then
+        return str, false
+    end
+
+    local head, tail = word:sub(1, 1), word:sub(2)
+    return leading .. head:upper() .. tail .. suffix, true
+end
+
+local process_inlines --- forward declaration for mutual recursion
+
+-- Inline containers that are rebuilt by calling their constructor with the
+-- processed content as the only argument, keyed by element tag.
+local SIMPLE_WRAPPERS = {
+    Emph = pandoc.Emph,
+    Strong = pandoc.Strong,
+    SmallCaps = pandoc.SmallCaps,
+}
+
+--- Process the content of a container inline and rebuild the container.
+-- Emph, Strong, SmallCaps, Span, and Quoted elements are rebuilt around their
+-- processed content. Any other element yields nil so the caller keeps the
+-- original element.
+-- @param elem element to rebuild
+-- @param capitalize_next whether a capitalization is pending
+-- @return the rebuilt element, or nil when the element is not rebuilt
+-- @return the updated capitalize_next flag
+local function rebuild_container(elem, capitalize_next)
+    local content = elem.content
+    if not content then
+        return nil, capitalize_next
+    end
+
+    local processed
+    processed, capitalize_next = process_inlines(content, capitalize_next)
+
+    local tag = elem.t
+    local constructor = SIMPLE_WRAPPERS[tag]
+    local rebuilt = nil
+    if constructor then
+        rebuilt = constructor(processed)
+    elseif tag == "Span" then
+        rebuilt = pandoc.Span(processed, elem.attr)
+    elseif tag == "Quoted" then
+        rebuilt = pandoc.Quoted(elem.quotetype, processed)
+    end
+
+    return rebuilt, capitalize_next
+end
+
+--- Walk a list of inlines and capitalize the word after a colon or em dash.
+-- Str elements are transformed; Space, SoftBreak, and LineBreak pass through
+-- without touching the pending flag; other elements are handed to
+-- rebuild_container, and when that yields nothing the original element is
+-- kept and the pending flag is cleared.
+-- @param inlines_list list of inline elements
+-- @param capitalize_next whether a capitalization is pending (defaults to false)
+-- @return table of processed inlines
+-- @return the updated capitalize_next flag
+function process_inlines(inlines_list, capitalize_next)
+    local out = {}
+    capitalize_next = capitalize_next or false
+
+    for _, elem in ipairs(inlines_list) do
+        local tag = elem.t
+        if tag == "Str" then
+            local text = capitalize_internal_marks(elem.text)
+
+            if capitalize_next then
+                local consumed
+                text, consumed = capitalize_first(text)
+                capitalize_next = not consumed
+            end
+
+            -- A string ending with a mark (plus optional non-word characters)
+            -- arms capitalization for the next word.
+            if text:find(":" .. TRIGGER_TAIL_PATTERN) or text:find("—" .. TRIGGER_TAIL_PATTERN) then
+                capitalize_next = true
+            end
+
+            out[#out + 1] = pandoc.Str(text)
+        elseif tag == "Space" or tag == "SoftBreak" or tag == "LineBreak" then
+            out[#out + 1] = elem
+        else
+            local rebuilt
+            rebuilt, capitalize_next = rebuild_container(elem, capitalize_next)
+            if not rebuilt then
+                capitalize_next = false
+            end
+            out[#out + 1] = rebuilt or elem
+        end
+    end
+
+    return out, capitalize_next
+end
+
+--- Build a new Para whose inlines have subtitle capitalization applied.
+-- Processing starts with no capitalization pending.
+-- @param para Para block
+-- @return a new Para block
+local function process_para(para)
+    -- Extra parentheses keep only the first return value (the inline list).
+    return pandoc.Para((process_inlines(para.content, false)))
+end
+
+--- Apply subtitle capitalization to paragraphs inside bibliography divs.
+-- A div qualifies when it has the class "references", "csl-bib-body", or
+-- "csl-entry"; other divs are left untouched (nothing is returned).
+function Div(div)
+    local classes = div.classes
+    local is_bibliography = classes:includes("references")
+        or classes:includes("csl-bib-body")
+        or classes:includes("csl-entry")
+    if not is_bibliography then
+        return
+    end
+    return pandoc.walk_block(div, { Para = process_para })
+end
@@ -0,0 +1,91 @@
+--- Modified from https://github.com/bcdavasconcelos/citetools/blob/main/_extensions/citetools/citation-backlinks.lua
+
+--- Todo: even with link-citations and link-fields turned off, citation-backlinks still adds a single backlink to each reference in the final bibliography div.
+
+--- citation-backlinks.lua – adds citation backlinks to the bibliography
+-- https://github.com/tarleb/citation-backlinks
+--- Copyright: © 2022–2024 John MacFarlane and Albert Krewinkel and Bernardo Vasconcelos
+--- License: MIT – see LICENSE for details
+
+-- Makes sure users know if their pandoc version is too old for this
+-- filter.
+PANDOC_VERSION:must_be_at_least '2.17'
+
+-- cites is a table mapping citation item identifiers
+-- to an array of cite identifiers; it is filled by Cite and read by Div.
+local cites = {}
+
+-- counter for cite identifiers; incremented once per Cite element
+local cite_number = 1
+
+--- Wrap an element with a format-specific label so it can be referenced.
+-- For LaTeX a raw `\label{s}` is placed before the element; for Typst a raw
+-- `#label("s")` is placed after it. Every other output format gets the
+-- element alone, so the result is always a list and never nil (previously
+-- formats other than latex/typst/docx/html produced nil, which the caller
+-- passed on as Span content).
+-- @param s label identifier
+-- @param el inline element to label
+-- @return list of inlines containing `el`
+local function with_label(s, el)
+  if FORMAT == "latex" then
+    return { pandoc.RawInline("latex", "\\label{" .. s .. "}"), el }
+  end
+  if FORMAT == "typst" then
+    return { el, pandoc.RawInline("typst", " #label(\"" .. s .. "\")") }
+  end
+  -- docx, html, and any other format: no raw label is needed or known.
+  return { el }
+end
+
+--- Give each Cite element an identifier and record which items it cites.
+-- Only citations whose mode is 'NormalCitation' are recorded (the original
+-- note says this is meant to keep 'nocite' entries out). The element is
+-- wrapped in a Span; for every format except Typst the Span carries the cite
+-- identifier as its id.
+function Cite(el)
+  local cite_id = "cite_" .. cite_number
+  cite_number = cite_number + 1
+
+  for _, citation in ipairs(el.citations) do
+    if citation.mode == 'NormalCitation' then
+      local ids = cites[citation.id]
+      if not ids then
+        ids = {}
+        cites[citation.id] = ids
+      end
+      ids[#ids + 1] = cite_id
+    end
+  end
+
+  local labelled = with_label(cite_id, el)
+  if FORMAT == "typst" then
+    return pandoc.Span(labelled)
+  end
+  return pandoc.Span(labelled, pandoc.Attr(cite_id))
+end
+
+--- Append inlines to the end of a list of blocks.
+-- When the last block is a Para or Plain the inlines are added to it;
+-- otherwise (including when `blocks` is empty) a new Plain block holding the
+-- inlines is appended.
+-- @param blocks list of blocks, modified in place
+-- @param inlines inlines to append
+-- @return the same `blocks` list
+function append_inline(blocks, inlines)
+  local last = blocks[#blocks]
+  -- `last` is nil for an empty list; guard before reading its tag.
+  if last and (last.t == 'Para' or last.t == 'Plain') then
+    -- append to last block
+    last.content:extend(inlines)
+  else
+    -- append as additional block
+    blocks[#blocks + 1] = pandoc.Plain(inlines)
+  end
+  return blocks
+end
+
+--- Append backlinks to a bibliography entry.
+-- Divs whose identifier contains "ref-" followed by at least one character
+-- are treated as entries; everything after the first "ref-" is used as the
+-- citation id. Other divs are left untouched (nothing is returned).
+function Div(el)
+  local citation_id = el.identifier:match("ref%-(.+)")
+  local tex_return_link = pandoc.RawInline("latex", "\\Acrobatmenu{GoBack}{$\\hookleftarrow$}")
+
+  if citation_id then
+    -- Starts with two items (space and opening bracket); the `> 2` checks
+    -- below test whether anything has been added after them.
+    local backlinks = pandoc.Inlines { pandoc.Space(), pandoc.Str("[") }
+    if FORMAT == "latex" then
+      -- NOTE(review): this makes #backlinks 3 for LaTeX, so the `> 2` checks
+      -- below are always true there, even without recorded cites — confirm
+      -- this is intended.
+      table.insert(backlinks, tex_return_link)
+    end
+
+    for i, cite_id in ipairs(cites[citation_id] or {}) do
+      -- Default marker is the running number; LaTeX uses a page reference.
+      local marker = pandoc.Str(i)
+      if FORMAT == "latex" then
+        marker = pandoc.RawInline("latex", "\\pageref{" .. cite_id .. "}")
+      end
+      -- Separate from whatever precedes this backlink.
+      if #backlinks > 2 then
+        table.insert(backlinks, pandoc.Str(","))
+        table.insert(backlinks, pandoc.Space())
+      end
+      table.insert(backlinks, pandoc.Link(marker, "#" .. cite_id))
+    end
+    -- Only close the bracket and attach the list when it is non-empty.
+    if #backlinks > 2 then
+      append_inline(el.content, backlinks .. { pandoc.Str("]") })
+    end
+    return el
+  end
+end
@@ -0,0 +1,254 @@
+"""
+Citation Tools for Academic Writing
+
+This script provides utilities for managing citations in academic writing:
+1. Extract citation keys from Markdown files and create a filtered bibliography
+2. Copy cited reference files to a specified directory for backup or sharing
+
+Typical usage:
+    python citation-tools.py --extract
+    python citation-tools.py --copy
+
+Copyright: © 2025–Present Tom Ben
+License: MIT License
+"""
+
+import os
+import re
+import shutil
+import argparse
+import json
+from pathlib import Path
+
+
+def extract_citation_keys(markdown_file):
+    """Extract citation keys from a markdown file."""
+    with open(markdown_file, 'r', encoding='utf-8') as f:
+        content = f.read()
+
+    # Pattern 1: [@key] or [@key1; @key2] format
+    pattern1 = r'\[@([a-zA-Z0-9\-]+)(?:[\s\]\;\,]|$)'
+
+    # Pattern 2: standalone @key format
+    pattern2 = r'(?<![a-zA-Z0-9])@([a-zA-Z0-9\-]+)(?:[\s\.\,\;\:\)\]\}]|$)'
+
+    keys1 = re.findall(pattern1, content)
+    keys2 = re.findall(pattern2, content)
+
+    # Combine keys and filter out figure and table references
+    all_keys = set(keys1 + keys2)
+    return {key for key in all_keys if not (
+        key.startswith('fig-') or key.startswith('tbl-'))}
+
+
+def load_csl_entries(csl_json_file):
+    """Load CSL JSON entries from file."""
+    with open(csl_json_file, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+
+    if not isinstance(data, list):
+        raise ValueError(
+            f"Expected a list of CSL JSON entries, got {type(data)} instead.")
+
+    return data
+
+
+def parse_file_field(file_field):
+    """Return a list of file paths from a CSL JSON file field."""
+    if not file_field or not isinstance(file_field, str):
+        return []
+
+    return [path.strip() for path in file_field.split(';') if path.strip()]
+
+
+def build_citation_file_index(entries):
+    """Build a dictionary mapping citation IDs to attached file paths."""
+    index = {}
+
+    for entry in entries:
+        key = entry.get('id')
+        if not key:
+            continue
+        paths = parse_file_field(entry.get('file'))
+        if paths:
+            index[key] = paths
+
+    return index
+
+
+def extract_csl_json_entries(csl_json_file, citation_keys, remove_fields=None):
+    """Extract CSL JSON entries for the given citation keys."""
+    if remove_fields is None:
+        remove_fields = ['file']
+
+    entries = load_csl_entries(csl_json_file)
+    citation_keys = set(citation_keys)
+    filtered_entries = []
+
+    for entry in entries:
+        key = entry.get('id')
+        if key and key in citation_keys:
+            entry_copy = {k: v for k, v in entry.items()
+                          if k not in remove_fields}
+            filtered_entries.append(entry_copy)
+
+    filtered_entries.sort(key=lambda item: item.get('id', ''))
+    return json.dumps(filtered_entries, ensure_ascii=False, indent=2) + '\n'
+
+
+def copy_cited_files(args):
+    """Copy cited files from bibliography to a new folder."""
+    # Clean output directory if requested
+    if args.clean and os.path.exists(args.output_dir):
+        print(f"Cleaning output directory: {args.output_dir}")
+        shutil.rmtree(args.output_dir)
+
+    # Create output directory if it doesn't exist
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    # Parse bibliography (silently)
+    entries = load_csl_entries(args.bib)
+    citation_files = build_citation_file_index(entries)
+
+    # Find all Markdown files in content directory
+    markdown_files = list(Path(args.content_dir).glob('[0-9]*.md'))
+
+    # Extract all citation keys from Markdown files
+    all_keys = set()
+    for md_file in markdown_files:
+        all_keys.update(extract_citation_keys(md_file))
+
+    # Copy files to output directory
+    copied_count = 0
+    missing_count = 0
+    file_not_found_count = 0
+    missing_keys = []
+    not_found_pairs = []
+
+    for key in all_keys:
+        if key in citation_files:
+            paths = citation_files[key]
+            existing_path = next(
+                (path for path in paths if os.path.exists(path)), None)
+            source_path = existing_path or paths[0]
+            _, file_extension = os.path.splitext(source_path)
+            dest_path = os.path.join(args.output_dir, f"{key}{file_extension}")
+
+            try:
+                if existing_path and os.path.exists(existing_path):
+                    shutil.copy2(existing_path, dest_path)
+                    copied_count += 1
+                else:
+                    file_not_found_count += 1
+                    not_found_pairs.append((key, source_path))
+            except Exception as e:
+                print(f"Error copying {key}: {e}")
+        else:
+            missing_count += 1
+            missing_keys.append(key)
+
+    # Print simplified summary
+    print(f"Markdown files in content directory: {len(markdown_files)}")
+    print(f"Total unique citation keys found: {len(all_keys)}")
+    print(f"Files successfully copied: {copied_count}")
+    print(f"Citation keys without file paths: {missing_count}")
+    print(
+        f"Files not found (path exists in bibliography but file missing): {file_not_found_count}")
+
+    if missing_keys:
+        print("\nCitation keys without file paths:")
+        for key in sorted(missing_keys):
+            print(f"  - {key}")
+
+    if not_found_pairs:
+        print("\nCitation keys where file wasn't found:")
+        for key, path in sorted(not_found_pairs):
+            print(f"  - {key}: {path}")
+
+    return all_keys
+
+
+def extract_citations(args):
+    """Extract citations from Markdown files and save them to a CSL JSON file."""
+    # Find all Markdown files in content directory
+    markdown_files = list(Path(args.content_dir).glob('[0-9]*.md'))
+
+    # Extract all citation keys from Markdown files
+    all_keys = set()
+    for md_file in markdown_files:
+        all_keys.update(extract_citation_keys(md_file))
+
+    # Extract CSL JSON entries
+    json_content = extract_csl_json_entries(
+        args.bib, all_keys, args.remove_fields)
+
+    # Write to output file
+    with open(args.output_bib, 'w', encoding='utf-8') as f:
+        f.write(json_content)
+
+    # Print simplified summary
+    print(f"Markdown files in content directory: {len(markdown_files)}")
+    print(f"Total unique citation keys found: {len(all_keys)}")
+    print(f"Extracted citations to `{args.output_bib}`")
+
+    return all_keys
+
+
+def main():
+    """Parse command line arguments and execute the appropriate function.
+
+    ``--extract`` runs ``extract_citations`` and ``--copy`` runs
+    ``copy_cited_files``. When both flags are given only the extraction runs;
+    when neither is given the help text is printed.
+    """
+    # Get script location and project root: the parent directory when the
+    # script lives directly in a directory named "_extensions", otherwise the
+    # script's own directory.
+    script_dir = Path(__file__).parent.resolve()
+    project_root = script_dir.parent if script_dir.name == "_extensions" else script_dir
+
+    parser = argparse.ArgumentParser(
+        description='Citation tools for extracting and copying cited references')
+
+    # Common arguments
+    default_bib = os.path.expanduser(
+        "~/Library/CloudStorage/Dropbox/pkm/bibliography.json")
+    default_content_dir = str(project_root / "contents")
+
+    # Add command flags instead of subcommands
+    parser.add_argument('--extract', action='store_true',
+                        help='Extract citations to a filtered CSL JSON file')
+    parser.add_argument('--copy', action='store_true',
+                        help='Copy cited files to a directory')
+
+    # Common arguments for both commands
+    parser.add_argument('--bib',
+                        default=default_bib,
+                        help=f'Path to bibliography.json file (default: {default_bib})')
+    parser.add_argument('--content_dir',
+                        default=default_content_dir,
+                        help=f'Path to content directory with Markdown files (default: {default_content_dir})')
+
+    # Arguments specific to extract
+    parser.add_argument('--output_bib',
+                        default=str(project_root / "citebib.json"),
+                        help=f'Path to output CSL JSON file (default: {project_root}/citebib.json)')
+    parser.add_argument('--remove_fields',
+                        nargs='+',
+                        default=['file'],
+                        help='Fields to remove from CSL JSON entries (default: file)')
+
+    # Arguments specific to copy
+    parser.add_argument('--output_dir',
+                        default=os.path.expanduser(
+                            "~/Downloads/cited-docs"),
+                        help='Path to output directory for copied files (default: ~/Downloads/cited-docs)')
+    parser.add_argument('--clean',
+                        action='store_true',
+                        help='Clean the output directory before copying files')
+
+    args = parser.parse_args()
+
+    # --extract is checked first, so it wins when both flags are present.
+    if args.extract:
+        extract_citations(args)
+    elif args.copy:
+        copy_cited_files(args)
+    else:
+        parser.print_help()
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,161 @@
+# Convert *.md files to *.qmd files and pre-process them
+# Randomize footnote identifiers in multiple Quarto files to avoid conflicts
+# Convert reference-style links to inline links
+# Remove line breaks inside corner-bracket quotation marks (「…」)
+# Reformat display math equations in Ulysses
+
+# Copyright: © 2024–Present Tom Ben
+# License: MIT License
+
+import re
+import glob
+import os
+import random
+import string
+
+
+def get_md_files():
+    # Get all *.md files
+    return [f for f in glob.glob("contents/[0-9]*.md")]
+
+
+def randomize_footnote_identifiers(qmd_content):
+    # Find all existing footnote identifiers (numbers)
+    existing_ids = set(re.findall(r'\[\^(\d+)\]', qmd_content))
+
+    # Generate a unique random identifier for each existing footnote
+    unique_ids = {}
+    for id in existing_ids:
+        # Generate a random string of 5 characters
+        new_id = ''.join(random.choices(
+            string.ascii_letters + string.digits, k=5))
+        while new_id in unique_ids.values():
+            new_id = ''.join(random.choices(
+                string.ascii_letters + string.digits, k=5))
+        unique_ids[id] = new_id
+
+    # Replace all footnote references and definitions with new identifiers
+    for old_id, new_id in unique_ids.items():
+        qmd_content = re.sub(rf'\[\^{old_id}\]', f'[^{new_id}]', qmd_content)
+        qmd_content = re.sub(rf'\[\^{old_id}\]:', f'[^{new_id}]:', qmd_content)
+
+    return qmd_content
+
+
+def convert_reference_to_inline(qmd_content):
+    # Extract reference links
+    reference_links = {}
+    reference_pattern = re.compile(r'\n\[(\d+)\]:\s*(.*)')
+    for match in reference_pattern.findall(qmd_content):
+        reference_links[match[0]] = match[1]
+
+    # Remove the reference link definitions from the qmd_content
+    qmd_content = reference_pattern.sub('', qmd_content)
+
+    # Replace reference-style link usages with inline links
+    def replace_link(match):
+        text = match.group(1)
+        key = match.group(2)
+        url = reference_links.get(key, '')
+        return f'[{text}]({url})'
+
+    usage_pattern = re.compile(r'\[(.*?)\]\[(\d+)\]')
+    qmd_content = usage_pattern.sub(replace_link, qmd_content)
+
+    return qmd_content
+
+
+def remove_linebreaks_in_quotes(text):
+    # Regular expression pattern to find blocks within single Chinese quotes
+    pattern = r'「[^」]*?」'
+
+    # Function to replace newlines in the found quoted text
+    def replace_newlines(m):
+        # Remove all newlines within the quote block
+        return m.group(0).replace('\n', '')
+
+    # Use re.sub to replace the newline characters in each match
+    cleaned_text = re.sub(pattern, replace_newlines, text)
+
+    return cleaned_text
+
+
+def reformat_math_equations(content):
+    # Reformat display math with labels to block format
+    labeled_pattern = r"\$(.+?)\$ *(\{#.+?\})"
+
+    def replace_with_labeled_block(match):
+        equation = match.group(1).strip()
+        label = match.group(2).strip()
+        return f"$$\n{equation}\n$$ {label}"
+
+    content = re.sub(labeled_pattern, replace_with_labeled_block, content)
+
+    # Reformat display math without labels to block format
+    # Match `$$ ... $$` without label
+    display_pattern = r"(?<!\$)\$\$([^\$]+?)\$\$(?!\{#)"
+
+    def replace_with_display_block(match):
+        equation = match.group(1).strip()
+        return f"$$\n{equation}\n$$"
+
+    content = re.sub(display_pattern, replace_with_display_block, content)
+
+    return content
+
+
+def process_file(input_file, output_file):
+    with open(input_file, "r", encoding="utf-8") as f:
+        content = f.read()
+
+    # Remove links with `[@]` and a space before it
+    content = re.sub(r"\s*\[@\].*?[\]\)]", "", content)
+    # Remove square brackets enclosing the caption
+    content = re.sub(r"^\[(.*)\}\]$", r"\n  :\1}", content, flags=re.MULTILINE)
+    # Merge multiple adjacent citations into one
+    content = re.sub(r"\][\(\[].*?;\s*\[", "; ", content)
+    # Replace '{{\<...\>}}' with '{{<...>}}'
+    content = re.sub(r"\{\{\\<(.*)\\>}}", r"{{<\1>}}", content)
+    # Remove comment blocks to avoid errors of Python filter
+    content = re.sub(r"^```{=comment}.*?^```$", "",
+                     content, flags=re.DOTALL | re.MULTILINE)
+
+    # Randomize footnote identifiers
+    content = randomize_footnote_identifiers(content)
+    # Convert reference-style links to inline links
+    content = convert_reference_to_inline(content)
+    # Remove line breaks in quotes
+    content = remove_linebreaks_in_quotes(content)
+    # Reformat math equations
+    content = reformat_math_equations(content)
+
+    with open(output_file, "w", encoding="utf-8") as f:
+        f.write(content)
+
+
+def main():
+    md_files = get_md_files()
+
+    # Create contents_tmp directory if it doesn't exist
+    tmp_dir = "contents_tmp"
+    if not os.path.exists(tmp_dir):
+        os.makedirs(tmp_dir)
+
+    # Convert *.md files to *.qmd files in contents_tmp directory
+    qmd_files = [os.path.join(tmp_dir, os.path.basename(
+        f).replace(".md", ".qmd")) for f in md_files]
+
+    for md_file, qmd_file in zip(md_files, qmd_files):
+        process_file(md_file, qmd_file)
+
+    # Process existing .qmd files in contents directory and output to contents_tmp
+    os.chdir('contents')
+    existing_qmd_files = glob.glob('*.qmd')
+
+    for qmd_file in existing_qmd_files:
+        output_file = os.path.join('..', tmp_dir, qmd_file)
+        process_file(qmd_file, output_file)
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,17 @@
+--- Get a BibTeX/BibLaTeX or CSL JSON file cited from a large database
+--- Source: https://pandoc.org/lua-filters.html#pandoc.utils.references
+--- https://fosstodon.org/@pandoc/109549882954402931
+
+--- Copyright: © 2022–Present Albert Krewinkel 
+--- License: MIT License
+
+-- *Note*: For BibLaTeX, it is needed to change the following entry name:
+-- - journal -> journaltitle
+-- - address -> location
+-- - publisher -> institution (only for thesis)
+
+--- Replace the document's bibliography reference with the cited entries.
+-- Stores the result of pandoc.utils.references(doc) in the `references`
+-- metadata field and clears the `bibliography` field.
+function Pandoc(doc)
+  doc.meta.references = pandoc.utils.references(doc)
+  doc.meta.bibliography = nil
+  return doc
+end
@@ -0,0 +1,14 @@
+--- Remove hyperlinks from DOIs while preserving the DOI text
+--- Useful for styles that should not hyperlink DOIs, when you don't want to edit the bibliography file
+
+--- Copyright: © 2024–Present Tom Ben
+--- License: MIT License
+
+--- Unwrap links that point at doi.org, keeping only their text.
+-- A link qualifies when its target starts with "http://doi.org" or
+-- "https://doi.org"; every other link is returned unchanged.
+function Link(el)
+    local is_doi_link = el.target:match("^https?://doi%.org") ~= nil
+    if not is_doi_link then
+        return el
+    end
+    -- Returning the content replaces the link with its inlines.
+    return el.content
+end