171 lines
5.5 KiB
Lua
171 lines
5.5 KiB
Lua
--[[
|
||
capitalize-subtitle – Capitalize first letter after colons and em dashes in bibliography entries
|
||
|
||
This filter capitalizes the first letter after colons and em dashes in bibliography
|
||
entries, following APA and similar styles that require subtitle capitalization.
|
||
|
||
It only processes paragraphs within bibliography divs
|
||
Must be run after Citeproc
|
||
|
||
Copyright: © 2025–present Tom Ben
|
||
License: MIT
|
||
]]
|
||
|
||
local EXTRA_PUNCT = "“”‘’«»‹›„‟「」『』﹁﹂﹃﹄–—‐"
|
||
local APOSTROPHES = "'’"
|
||
local HYPHENS = "%-‐"
|
||
local AFTER_PUNCT_CLASS = "%s%p" .. EXTRA_PUNCT
|
||
local WORD_TAIL_PATTERN = "[%a" .. APOSTROPHES .. HYPHENS .. "]*"
|
||
local TRIGGER_TAIL_PATTERN = "[^%a" .. APOSTROPHES .. HYPHENS .. "]*$"
|
||
|
||
local function is_punctuation(char)
|
||
if not char or char == "" then
|
||
return true
|
||
end
|
||
if char:match("[%s%p]") then
|
||
return true
|
||
end
|
||
return EXTRA_PUNCT:find(char, 1, true) ~= nil
|
||
end
|
||
|
||
local INTERNAL_MARKS = {
|
||
{
|
||
mark = ":",
|
||
skip = function(punct)
|
||
return punct == "" or punct:find("/")
|
||
end,
|
||
},
|
||
{ mark = "—" },
|
||
}
|
||
|
||
local function capitalize_internal_marks(text)
|
||
for _, config in ipairs(INTERNAL_MARKS) do
|
||
local pattern = config.mark .. "([" .. AFTER_PUNCT_CLASS .. "]*)([a-z])"
|
||
text = text:gsub(pattern, function(punct, letter)
|
||
-- Require some separation after colon (e.g., space or quote) to avoid URLs and protocols
|
||
if config.skip and config.skip(punct) then
|
||
return config.mark .. punct .. letter
|
||
end
|
||
return config.mark .. punct .. letter:upper()
|
||
end)
|
||
end
|
||
|
||
return text
|
||
end
|
||
|
||
--- Capitalize first letter if it's lowercase.
|
||
--- Returns transformed string and a boolean indicating whether capitalization occurred.
|
||
local function capitalize_first(str)
|
||
if not str or str == "" then
|
||
return str, false
|
||
end
|
||
-- Separate leading punctuation/spaces (including common typographic quotes) from the word
|
||
local leading, remainder = str:match("^([%s%p" .. EXTRA_PUNCT .. "]*)(.*)")
|
||
if not remainder or remainder == "" then
|
||
return str, false
|
||
end
|
||
local already_capitalized = remainder:match("^([A-Z]" .. WORD_TAIL_PATTERN .. ")")
|
||
if already_capitalized then
|
||
return str, true
|
||
end
|
||
local word, suffix = remainder:match("^([a-z]" .. WORD_TAIL_PATTERN .. ")(.*)")
|
||
if not word then
|
||
local first_char = remainder:sub(1, 1)
|
||
if not is_punctuation(first_char) then
|
||
return str, true
|
||
end
|
||
return str, false
|
||
end
|
||
-- Only capitalize when the word is purely alphabetic (with optional hyphen/apostrophe)
|
||
-- and is followed by punctuation/space or nothing. This avoids changing items like e2105061118.
|
||
local next_char = suffix:sub(1, 1)
|
||
if suffix ~= "" and not is_punctuation(next_char) then
|
||
return str, false
|
||
end
|
||
return leading .. word:sub(1, 1):upper() .. word:sub(2) .. suffix, true
|
||
end
|
||
|
||
local process_inlines --- forward declaration for mutual recursion
|
||
|
||
local SIMPLE_WRAPPERS = {
|
||
Emph = pandoc.Emph,
|
||
Strong = pandoc.Strong,
|
||
SmallCaps = pandoc.SmallCaps,
|
||
}
|
||
|
||
local function rebuild_container(elem, capitalize_next)
|
||
if not elem.content then
|
||
return nil, capitalize_next
|
||
end
|
||
|
||
local processed
|
||
processed, capitalize_next = process_inlines(elem.content, capitalize_next)
|
||
|
||
local wrap = SIMPLE_WRAPPERS[elem.t]
|
||
if wrap then
|
||
return wrap(processed), capitalize_next
|
||
elseif elem.t == "Span" then
|
||
return pandoc.Span(processed, elem.attr), capitalize_next
|
||
elseif elem.t == "Quoted" then
|
||
return pandoc.Quoted(elem.quotetype, processed), capitalize_next
|
||
end
|
||
|
||
return nil, capitalize_next
|
||
end
|
||
|
||
--- Process a list of inlines recursively: capitalize word after colon or em dash.
|
||
-- Returns processed inline list and updated capitalize_next flag.
|
||
function process_inlines(inlines_list, capitalize_next)
|
||
local result = {}
|
||
capitalize_next = capitalize_next or false
|
||
|
||
for _, elem in ipairs(inlines_list) do
|
||
if elem.t == "Str" then
|
||
local text = elem.text
|
||
|
||
text = capitalize_internal_marks(text)
|
||
|
||
if capitalize_next then
|
||
local new_text, consumed = capitalize_first(text)
|
||
text = new_text
|
||
capitalize_next = not consumed
|
||
end
|
||
|
||
if text:find(":" .. TRIGGER_TAIL_PATTERN) or text:find("—" .. TRIGGER_TAIL_PATTERN) then
|
||
capitalize_next = true
|
||
end
|
||
|
||
result[#result + 1] = pandoc.Str(text)
|
||
elseif elem.t == "Space" or elem.t == "SoftBreak" or elem.t == "LineBreak" then
|
||
result[#result + 1] = elem
|
||
else
|
||
local rebuilt
|
||
rebuilt, capitalize_next = rebuild_container(elem, capitalize_next)
|
||
if rebuilt then
|
||
result[#result + 1] = rebuilt
|
||
else
|
||
result[#result + 1] = elem
|
||
capitalize_next = false
|
||
end
|
||
end
|
||
end
|
||
|
||
return result, capitalize_next
|
||
end
|
||
|
||
--- Process paragraphs: capitalize word after colon or em dash
|
||
local function process_para(para)
|
||
local processed_content = process_inlines(para.content, false)
|
||
return pandoc.Para(processed_content)
|
||
end
|
||
|
||
--- Only process divs with bibliography classes
|
||
function Div(div)
|
||
-- Check for bibliography-related classes
|
||
if div.classes:includes("references") or
|
||
div.classes:includes("csl-bib-body") or
|
||
div.classes:includes("csl-entry") then
|
||
return pandoc.walk_block(div, { Para = process_para })
|
||
end
|
||
end
|