-- su2026rwep/_extensions/drwater/ignore-softbreaks/ignore-softbreaks.lua

--- Emulate Pandoc's extension `east_asian_line_breaks` in Quarto
--- Ignore soft break adjacent to Chinese characters
--- Tracking Quarto issue: https://github.com/quarto-dev/quarto-cli/issues/8520

--- Copyright: © 2024–Present Tom Ben
--- License: MIT License

--- Report whether `text` contains a Chinese character.
-- The check looks for a three-byte UTF-8 sequence whose lead byte is
-- 0xE4-0xE9 (decimal 228-233), i.e. a code point in U+4000-U+9FFF, which
-- includes the CJK Unified Ideographs block. The pattern is unanchored, so
-- a match anywhere in `text` counts.
-- @param text string to inspect; may be nil
-- @return boolean true if such a sequence is found, false otherwise
--   (including when `text` is nil)
function is_chinese(text)
  -- Mirror the sibling predicates: a missing value is simply "not Chinese".
  if text == nil then return false end
  -- Compare against nil so callers get a real boolean, not find() indices.
  return text:find("[\228-\233][\128-\191][\128-\191]") ~= nil
end

--- Report whether `char` starts with an ASCII byte.
-- Only the first byte of `char` is inspected.
-- @param char string holding the character to test; may be nil
-- @return boolean true if the first byte is in the range 0-127; false for
--   nil, for an empty string, or for a first byte above 127
function is_ascii(char)
  if char == nil then return false end
  local ascii_code = string.byte(char)
  -- string.byte returns nil for an empty string; treat that as "not ASCII"
  -- instead of raising a comparison error.
  if ascii_code == nil then return false end
  -- Byte values are never negative, so only the upper bound needs checking.
  return ascii_code <= 127
end

--- Report whether `char` is one of the recognised Chinese punctuation marks.
-- The test is a plain (non-pattern) substring search in a fixed list of
-- marks, so callers should pass a single, complete UTF-8 character.
-- @param char string holding the character to test; may be nil
-- @return boolean true if `char` occurs in the punctuation list; false for
--   nil, for an empty string, or when it does not occur
function is_chinese_punctuation(char)
  -- An empty needle matches at position 1 in a plain search, which would
  -- wrongly classify "" as punctuation, so reject it up front.
  if char == nil or char == "" then return false end
  local punctuation_marks = "，。！？；：“”‘’（）【】《》〈〉「」『』、"
  return string.find(punctuation_marks, char, 1, true) ~= nil
end

--- Report whether `char` contains an alphanumeric character.
-- Uses Lua's `%w` character class, so the match is not limited to the
-- first byte of `char`.
-- @param char string to test; may be nil
-- @return boolean true if a `%w` character is found, false otherwise
--   (including when `char` is nil)
function is_alphanumeric(char)
  if char ~= nil and char:match("[%w]") then
    return true
  end
  return false
end

-- Pattern capturing one UTF-8 encoded character: a lead byte followed by
-- any number of continuation bytes.
local UTF8_CHAR = "([\0-\x7F\xC2-\xF4][\x80-\xBF]*)"

--- Return the last UTF-8 character found in `text`, or nil if none matches.
local function last_utf8_char(text)
  local last
  for ch in text:gmatch(UTF8_CHAR) do
    last = ch
  end
  return last
end

--- Decide whether the soft break sitting between two inlines should go.
-- @param before inline element preceding the soft break (may be nil)
-- @param after inline element following the soft break (may be nil)
-- @return a truthy value when the break should be dropped, falsy otherwise
local function joins_without_space(before, after)
  if not (before and after) then return false end

  -- Only neighbours that carry a non-empty `text` field are considered.
  local left, right = before.text, after.text
  if not (left and right) then return false end
  if #left == 0 or #right == 0 then return false end

  local tail = last_utf8_char(left)    -- character just before the break
  local head = right:match(UTF8_CHAR)  -- character just after the break
  if not (tail and head) then return false end

  -- Drop the break between two Chinese characters, or on either side of
  -- Chinese punctuation. Anything else (for example a Chinese character
  -- next to an ASCII word) keeps its soft break.
  return (is_chinese(tail) and is_chinese(head))
    or is_chinese_punctuation(tail)
    or is_chinese_punctuation(head)
end

return {
  {
    --- Replace unwanted soft breaks in a paragraph with an empty Str.
    -- @param para the Para element being filtered
    -- @return the same Para element, with qualifying soft breaks replaced
    Para = function(para)
      local inlines = para.content
      for idx, inline in ipairs(inlines) do
        if inline.t == 'SoftBreak'
            and joins_without_space(inlines[idx - 1], inlines[idx + 1]) then
          para.content[idx] = pandoc.Str("")
        end
      end
      return para
    end
  }
}