Files
2026-05-21 13:37:53 +08:00

171 lines
5.5 KiB
Lua
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
--[[
capitalize-subtitle – Capitalize the first letter after colons and em dashes in bibliography entries.
This filter capitalizes the first letter after colons and em dashes in bibliography
entries, following APA and similar styles that require subtitle capitalization.
It only processes paragraphs within bibliography divs.
It must be run after Citeproc.
Copyright: © 2025–present Tom Ben
License: MIT
]]
-- Typographic quotes, CJK corner brackets and dashes that this filter treats as
-- punctuation in addition to ASCII %p. NOTE: Lua patterns work on bytes, so when
-- this string is spliced into a character class each multibyte character
-- contributes its individual UTF-8 bytes to the class.
local EXTRA_PUNCT = "“”‘’«»‹›„‟「」『』﹁﹂﹃﹄–—‐"
-- Characters that may appear inside a word besides letters (see WORD_TAIL_PATTERN).
local APOSTROPHES = "'"
-- Hyphen, escaped with % so it is taken literally inside a character class.
local HYPHENS = "%-"
-- Character-class body (without the surrounding brackets) for whitespace and
-- punctuation that may sit between a trigger mark and the letter to capitalize.
local AFTER_PUNCT_CLASS = "%s%p" .. EXTRA_PUNCT
-- Matches the rest of a word after its first letter: letters, apostrophes, hyphens.
local WORD_TAIL_PATTERN = "[%a" .. APOSTROPHES .. HYPHENS .. "]*"
-- Matches a run of non-word characters reaching the end of the string; appended
-- to a mark by process_inlines to test whether the mark is the last meaningful
-- thing in a token.
local TRIGGER_TAIL_PATTERN = "[^%a" .. APOSTROPHES .. HYPHENS .. "]*$"
--- Tell whether a single character counts as punctuation or whitespace.
-- A missing or empty character is treated as punctuation. Otherwise the
-- character qualifies if it is ASCII whitespace/punctuation or occurs
-- (as a plain substring) in EXTRA_PUNCT.
-- @param char one-character string, or nil
-- @return boolean
local function is_punctuation(char)
  if char and char ~= "" then
    local ascii_hit = char:match("[%s%p]")
    if ascii_hit then
      return true
    end
    local found_at = EXTRA_PUNCT:find(char, 1, true)
    return found_at ~= nil
  end
  return true
end
--- Marks that introduce a subtitle inside a single string token.
-- Each entry has:
--   mark: literal text after which the next lowercase letter is capitalized;
--   skip: optional predicate receiving the whitespace/punctuation run found
--         between the mark and the letter; a true result leaves the letter as is.
local INTERNAL_MARKS = {
  {
    mark = ":",
    -- Require some separation after the colon (e.g. a space or a quote) and
    -- reject runs containing "/" so URLs and protocols are left untouched.
    skip = function(punct)
      return punct == "" or punct:find("/", 1, true) ~= nil
    end,
  },
  -- Em dash (U+2014), spelled as UTF-8 byte escapes so the mark cannot be lost
  -- or confused with a look-alike character. An empty mark here would make the
  -- search pattern match before every lowercase letter.
  { mark = "\226\128\148" },
}
--- Capitalize the lowercase letter that follows each configured mark within one string.
-- For every entry of INTERNAL_MARKS the text is searched for the mark, an
-- optional run of whitespace/punctuation, and a lowercase ASCII letter. The
-- letter is upper-cased unless the entry's `skip` predicate accepts the run.
-- @param text string to scan
-- @return the string with the qualifying letters upper-cased
local function capitalize_internal_marks(text)
  for _, config in ipairs(INTERNAL_MARKS) do
    local mark = config.mark
    -- An empty mark would let the pattern match before every lowercase letter
    -- and upper-case the entire string, so such entries are ignored.
    if mark and mark ~= "" then
      -- The mark is literal text: escape ASCII punctuation so that characters
      -- such as "." or "-" are not interpreted as pattern magic.
      local escaped_mark = mark:gsub("%p", "%%%0")
      local pattern = escaped_mark .. "([" .. AFTER_PUNCT_CLASS .. "]*)([a-z])"
      text = text:gsub(pattern, function(punct, letter)
        if config.skip and config.skip(punct) then
          -- Rejected by the entry's predicate: put the match back unchanged.
          return mark .. punct .. letter
        end
        return mark .. punct .. letter:upper()
      end)
    end
  end
  return text
end
--- Upper-case the first letter of the first word in a string when it is lowercase.
-- Returns two values: the (possibly modified) string, and a boolean telling the
-- caller whether the pending "capitalize next word" request has been used up.
local function capitalize_first(str)
  if not str or str == "" then
    return str, false
  end

  -- Peel off leading whitespace/punctuation (typographic quotes included).
  local prefix, body = str:match("^([%s%p" .. EXTRA_PUNCT .. "]*)(.*)")
  if not body or body == "" then
    -- Nothing but punctuation: keep waiting for a real word.
    return str, false
  end

  -- A word that already starts with a capital needs no change but uses up the request.
  if body:match("^([A-Z]" .. WORD_TAIL_PATTERN .. ")") then
    return str, true
  end

  local head, rest = body:match("^([a-z]" .. WORD_TAIL_PATTERN .. ")(.*)")
  if head == nil then
    -- Does not start with an ASCII letter: the request is used up exactly when
    -- the first character is not punctuation.
    return str, not is_punctuation(body:sub(1, 1))
  end

  -- Capitalize only when the alphabetic word (hyphens/apostrophes allowed) is
  -- followed by punctuation, whitespace or nothing, so identifiers such as
  -- e2105061118 stay untouched.
  if rest ~= "" and not is_punctuation(rest:sub(1, 1)) then
    return str, false
  end

  local initial = head:sub(1, 1):upper()
  local tail = head:sub(2)
  return prefix .. initial .. tail .. rest, true
end
-- Declared ahead of its definition because rebuild_container calls
-- process_inlines, which in turn calls rebuild_container.
local process_inlines --- forward declaration for mutual recursion
-- Inline element tags (elem.t) mapped to the pandoc constructor used to rebuild
-- them; rebuild_container calls each of these with the processed content as the
-- only argument.
local SIMPLE_WRAPPERS = {
Emph = pandoc.Emph,
Strong = pandoc.Strong,
SmallCaps = pandoc.SmallCaps,
}
--- Rebuild an inline container after processing the inlines it holds.
-- Elements without a `content` field are not handled. Elements with content
-- are processed first; a replacement is then built for Emph, Strong,
-- SmallCaps, Span and Quoted, and nil is returned for any other tag.
-- @param elem inline element
-- @param capitalize_next flag passed through to process_inlines
-- @return the rebuilt element (or nil), and the resulting capitalize_next flag
local function rebuild_container(elem, capitalize_next)
  local children = elem.content
  if not children then
    return nil, capitalize_next
  end

  local inner, flag = process_inlines(children, capitalize_next)
  local kind = elem.t
  local constructor = SIMPLE_WRAPPERS[kind]

  local rebuilt = nil
  if constructor then
    rebuilt = constructor(inner)
  elseif kind == "Span" then
    rebuilt = pandoc.Span(inner, elem.attr)
  elseif kind == "Quoted" then
    rebuilt = pandoc.Quoted(elem.quotetype, inner)
  end
  return rebuilt, flag
end
--- Process a list of inlines recursively, capitalizing the word after a colon or em dash.
-- Str elements are rewritten, spaces and breaks are passed through, and every
-- other element is handed to rebuild_container. An element that
-- rebuild_container cannot rebuild is kept as is and cancels any pending
-- capitalization.
-- @param inlines_list list of inline elements
-- @param capitalize_next truthy when the first word met should be capitalized
--        because a trigger mark ended the preceding text; defaults to false
-- @return the processed inline list and the updated capitalize_next flag
function process_inlines(inlines_list, capitalize_next)
  -- Em dash (U+2014) as UTF-8 byte escapes. An empty string must not be used
  -- here: TRIGGER_TAIL_PATTERN alone matches the end of every string, which
  -- would request capitalization after every single token.
  local EM_DASH = "\226\128\148"
  local result = {}
  capitalize_next = capitalize_next or false
  for _, elem in ipairs(inlines_list) do
    local tag = elem.t
    if tag == "Str" then
      -- Handle marks and letters that sit inside the same token first.
      local text = capitalize_internal_marks(elem.text)
      if capitalize_next then
        local consumed
        text, consumed = capitalize_first(text)
        -- Stay armed when the token held no word (e.g. only an opening quote).
        capitalize_next = not consumed
      end
      -- A mark followed only by non-word characters up to the end of the token
      -- means the word to capitalize is in a later element.
      if text:find(":" .. TRIGGER_TAIL_PATTERN) or text:find(EM_DASH .. TRIGGER_TAIL_PATTERN) then
        capitalize_next = true
      end
      result[#result + 1] = pandoc.Str(text)
    elseif tag == "Space" or tag == "SoftBreak" or tag == "LineBreak" then
      -- Whitespace neither uses up nor cancels a pending capitalization.
      result[#result + 1] = elem
    else
      local rebuilt
      rebuilt, capitalize_next = rebuild_container(elem, capitalize_next)
      if rebuilt then
        result[#result + 1] = rebuilt
      else
        result[#result + 1] = elem
        capitalize_next = false
      end
    end
  end
  return result, capitalize_next
end
--- Build a new Para whose inlines have subtitle capitalization applied.
-- Processing starts with no pending capitalization; only the inline list
-- returned by process_inlines is used, the trailing flag is dropped.
local function process_para(para)
  -- Extra parentheses truncate the call to its first result.
  return pandoc.Para((process_inlines(para.content, false)))
end
--- Filter entry point for Div elements: rewrite paragraphs inside bibliography divs.
-- A div carrying any of the bibliography-related classes is walked with
-- process_para applied to its Para blocks; every other div falls through
-- without a return value.
function Div(div)
  local bibliography_classes = { "references", "csl-bib-body", "csl-entry" }
  for _, class_name in ipairs(bibliography_classes) do
    if div.classes:includes(class_name) then
      return pandoc.walk_block(div, { Para = process_para })
    end
  end
end