--- Emulate Pandoc's extension `east_asian_line_breaks` in Quarto.
--- Ignore soft breaks adjacent to Chinese characters.
--- Tracking Quarto issue: https://github.com/quarto-dev/quarto-cli/issues/8520
---
--- Copyright: © 2024–Present Tom Ben
--- License: MIT License

--- Report whether `text` contains at least one Chinese character.
-- A character counts as Chinese when it is a three-byte UTF-8 sequence whose
-- lead byte is 0xE4-0xE9, i.e. a code point in U+4000-U+9FFF (this covers the
-- CJK Unified Ideographs block U+4E00-U+9FFF). CJK punctuation such as U+3002
-- has lead byte 0xE3 and is therefore NOT matched here.
-- @tparam ?string text UTF-8 text to inspect; `nil` is treated as "no match"
-- @treturn boolean `true` if such a sequence occurs anywhere in `text`
function is_chinese(text)
  -- Guard against nil like the sibling predicates do.
  if text == nil then return false end
  -- Compare with nil so callers get a single boolean, not find()'s indices.
  return text:find("[\228-\233][\128-\191][\128-\191]") ~= nil
end

--- Report whether `char` starts with an ASCII byte (0-127).
-- Only the first byte is examined, so for a single UTF-8 character this tells
-- whether the character itself is ASCII.
-- @tparam ?string char character to test; `nil` and `""` yield `false`
-- @treturn boolean
function is_ascii(char)
  if char == nil then return false end
  local first_byte = string.byte(char)
  -- string.byte returns no value for an empty string; comparing nil with a
  -- number would raise an error, so treat that case as "not ASCII".
  if first_byte == nil then return false end
  return first_byte <= 127
end

-- Set of Chinese punctuation marks, keyed by the UTF-8 encoded character.
local chinese_punctuation_set = {}
do
  -- Marks that have no ASCII look-alike can be written literally.
  local cjk_marks = "。“”‘’【】《》〈〉「」『』、"
  for mark in cjk_marks:gmatch("[\0-\x7F\xC2-\xF4][\x80-\xBF]*") do
    chinese_punctuation_set[mark] = true
  end

  -- Fullwidth forms are spelled as byte escapes because they are visually
  -- almost identical to (and easily replaced by) their ASCII counterparts.
  local fullwidth_marks = {
    "\239\188\140", -- U+FF0C fullwidth comma
    "\239\188\129", -- U+FF01 fullwidth exclamation mark
    "\239\188\159", -- U+FF1F fullwidth question mark
    "\239\188\155", -- U+FF1B fullwidth semicolon
    "\239\188\154", -- U+FF1A fullwidth colon
    "\239\188\136", -- U+FF08 fullwidth left parenthesis
    "\239\188\137", -- U+FF09 fullwidth right parenthesis
  }
  for _, mark in ipairs(fullwidth_marks) do
    chinese_punctuation_set[mark] = true
  end
end

--- Report whether `char` is exactly one Chinese punctuation mark.
-- ASCII punctuation such as "," or "(" is not Chinese punctuation and yields
-- `false`, as do the empty string and strings longer than one character.
-- @tparam ?string char a single UTF-8 encoded character; `nil` yields `false`
-- @treturn boolean
function is_chinese_punctuation(char)
  if char == nil then return false end
  -- Exact set lookup: unlike a substring search it cannot match "" or a
  -- fragment/sequence of several marks.
  return chinese_punctuation_set[char] == true
end

--- Report whether `char` contains an ASCII letter or digit.
-- An explicit ASCII class is used instead of `%w`, whose meaning depends on
-- the current C locale and could otherwise classify bytes of a multi-byte
-- UTF-8 sequence as alphanumeric.
-- @tparam ?string char character to test; `nil` yields `false`
-- @treturn boolean
function is_alphanumeric(char)
  if char == nil then return false end
  return char:match("[0-9A-Za-z]") ~= nil
end

-- Lua pattern matching one UTF-8 encoded character: a lead byte followed by
-- any number of continuation bytes.
local UTF8_CHAR_PATTERN = "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"

--- Decide whether a soft break between two characters should be dropped.
-- Rule 1: both neighbours are Chinese characters.
-- Rule 2: the preceding character is Chinese punctuation.
-- Rule 3: the following character is Chinese punctuation.
-- In every other case (e.g. Chinese next to ASCII alphanumerics) the soft
-- break is kept so the usual inter-word space is preserved.
-- @tparam string prev_char last character before the soft break
-- @tparam string next_char first character after the soft break
-- @return a truthy value when the soft break should be removed
local function should_drop_soft_break(prev_char, next_char)
  return (is_chinese(prev_char) and is_chinese(next_char))
    or is_chinese_punctuation(prev_char)
    or is_chinese_punctuation(next_char)
end

-- Filter: within each paragraph, neutralise soft breaks that sit next to
-- Chinese text by replacing them with an empty Str element.
return {
  {
    Para = function(para)
      local inlines = para.content
      for k, v in ipairs(inlines) do
        local before, after = inlines[k - 1], inlines[k + 1]
        if v.t == 'SoftBreak' and before and after then
          -- Only neighbours that expose a non-empty `text` field take part.
          local p_text, n_text = before.text, after.text
          if p_text and n_text and #p_text > 0 and #n_text > 0 then
            -- Last character of the preceding text (pattern anchored at end).
            local prev_char = p_text:match(UTF8_CHAR_PATTERN .. "$")
            -- First character of the following text.
            local next_char = n_text:match(UTF8_CHAR_PATTERN)
            if prev_char and next_char
              and should_drop_soft_break(prev_char, next_char) then
              inlines[k] = pandoc.Str("")
            end
          end
        end
      end
      -- Write the edited list back explicitly instead of relying on
      -- `para.content` returning the same table on every access.
      para.content = inlines
      return para
    end
  }
}