diff --git a/luametalatex-back-pdf.lua b/luametalatex-back-pdf.lua
index 7e03f8a..bb2eba4 100644
--- a/luametalatex-back-pdf.lua
+++ b/luametalatex-back-pdf.lua
@@ -75,10 +75,9 @@ local function write_infodir(p)
   return p:indirect(nil, string.format("<<%s%s>>", infodir, additional))
 end
 
-local function pdf_string(s)
-  -- Emulate other engines here: If looks like an escaped string, treat it as such. Otherwise, add parenthesis.
-  return s:match("^%(.*%)$") or s:match("^<.*>$") or '(' .. s .. ')'
-end
+local pdf_escape = require'luametalatex-pdf-escape'
+local pdf_bytestring = pdf_escape.escape_bytes
+local pdf_text = pdf_escape.escape_text
 
 callback.register("stop_run", function()
   if not pfile then
@@ -185,7 +184,7 @@ local function get_action_attr(p, action, is_link)
   local action_attr = is_link and "/Subtype/Link/A<<" or "<<"
   local file = action.file
   if file then
-    action_attr = action_attr .. '/F' .. pdf_string(file)
+    action_attr = action_attr .. '/F' .. pdf_bytestring(file)
     local newwindow = action.new_window
     if newwindow and newwindow > 0 then
       action_attr = action_attr .. '/NewWindow ' .. (newwindow == 1 and 'true' or 'false')
@@ -199,7 +198,7 @@ local function get_action_attr(p, action, is_link)
     local id = action.id
     if file then
       assert(type(id) == "string")
-      action_attr = action_attr .. "/S/GoToR/D" .. pdf_string(id) .. ">>"
+      action_attr = action_attr .. "/S/GoToR/D" .. pdf_bytestring(id) .. ">>"
     else
       local dest = dests[id]
       if not dest then
@@ -207,7 +206,7 @@ local function get_action_attr(p, action, is_link)
         dests[id] = dest
       end
       if type(id) == "string" then
-        action_attr = action_attr .. "/S/GoTo/D" .. pdf_string(id) .. ">>"
+        action_attr = action_attr .. "/S/GoTo/D" .. pdf_bytestring(id) .. ">>"
       else
         action_attr = string.format("%s/S/GoTo/D %i 0 R>>", action_attr, dest)
       end
@@ -577,11 +576,11 @@ token.luacmd("pdfextension", function(_, imm)
       local level = token.scan_int()
       local open = token.scan_keyword'open'
       local title = token.scan_string()
-      outline:add(pdf_string(title), action, level, open, attr)
+      outline:add(pdf_text(title), action, level, open, attr)
     else
       local count = token.scan_keyword'count' and token.scan_int() or 0
       local title = token.scan_string()
-      outline:add_legacy(pdf_string(title), action, count, attr)
+      outline:add_legacy(pdf_text(title), action, count, attr)
     end
   elseif token.scan_keyword"dest" then
     local id
diff --git a/luametalatex-nodewriter.lua b/luametalatex-nodewriter.lua
index 438f5d9..5e6a331 100644
--- a/luametalatex-nodewriter.lua
+++ b/luametalatex-nodewriter.lua
@@ -324,7 +324,8 @@ function nodehandler.glue(p, n, x, y, outer, origin, level) -- Naturally this is
 end
 function nodehandler.kern() end
 function nodehandler.penalty() end
-local literalescape = lpeg.Cs((lpeg.S'\\()\r'/{['\\'] = '\\\\', ['('] = '\\(', [')'] = '\\)', ['\r'] = '\\r'}+1)^0)
+
+local pdf_escape = require'luametalatex-pdf-escape'.escape_raw
 local match = lpeg.match
 local function do_commands(p, c, f, fid, x, y, outer, ...)
   local fonts = f.fonts
@@ -407,20 +408,20 @@ function nodehandler.glyph(p, n, x, y, ...)
       -- if f.encodingbytes == -3 then
       if false then
         if index < 0x80 then
-          p.pending[#p.pending+1] = match(literalescape, string.pack('>B', index))
+          p.pending[#p.pending+1] = pdf_escape(string.pack('>B', index))
         elseif index < 0x7F80 then
-          p.pending[#p.pending+1] = match(literalescape, string.pack('>H', index+0x7F80))
+          p.pending[#p.pending+1] = pdf_escape(string.pack('>H', index+0x7F80))
         else
-          p.pending[#p.pending+1] = match(literalescape, string.pack('>BH', 0xFF, index-0x7F80))
+          p.pending[#p.pending+1] = pdf_escape(string.pack('>BH', 0xFF, index-0x7F80))
         end
       else
-        p.pending[#p.pending+1] = match(literalescape, string.pack('>H', index))
+        p.pending[#p.pending+1] = pdf_escape(string.pack('>H', index))
       end
       if not p.usedglyphs[index] then
         p.usedglyphs[index] = {index, math.floor(c.width * 1000 / f.size + .5), c.tounicode}
       end
     else
-      p.pending[#p.pending+1] = match(literalescape, string.char(getchar(n)))
+      p.pending[#p.pending+1] = pdf_escape(string.char(getchar(n)))
       if not p.usedglyphs[getchar(n)] then
         p.usedglyphs[getchar(n)] = {getchar(n), math.floor(c.width * 1000 / f.size + .5), c.tounicode}
       end
diff --git a/luametalatex-pdf-escape.lua b/luametalatex-pdf-escape.lua
new file mode 100644
index 0000000..8d87fe1
--- /dev/null
+++ b/luametalatex-pdf-escape.lua
@@ -0,0 +1,85 @@
+local mode = 6
+-- Control how much escaping is done... the mode is a bitset:
+--   Bit 0: Disable auto-detection of pre-escaped input
+--   Bit 1: Convert UTF-8 input to UTF-16
+--   Bit 2: Actually escape unescaped input instead of assuming that it is safe
+--
+-- This currently results in 8 modes. Mode 7 is recommended if you can control
+-- all new code, otherwise Mode 6 might be required. Mode 0 is (mostly) compatible
+-- with other engines.
+--
+-- Also we have three distinct functions which relate to different uses.
+--   escape_text is for text strings and fully respects the mode.
+--   escape_bytes is for non-text byte strings and always acts as if Bit 1 is unset
+--     (after all, UTF-16 doesn't make sense for non-text strings)
+--   escape_raw always acts like mode 5 without the parens: Just escape, without any
+--     other auto-detection or conversion. (This is used for actual content text)
+local function setmode(new)
+  mode = new
+end
+-- Report whether s already looks like a complete PDF string object: a literal
+-- string "(...)" or a hex string "<...>". Always false if Bit 0 of the mode is
+-- set, since auto-detection of pre-escaped input is disabled then.
+local function is_escaped(s)
+  if mode & 1 == 1 then return false end
+  return (s:match("^%(.*%)$") or s:match("^<.*>$")) and true or false
+end
+-- Convert the UTF-8 string s into big-endian UTF-16 bytes with a leading BOM
+-- (0xFE 0xFF). Code points above 0xFFFF become surrogate pairs.
+local function to_utf16(s)
+  local i = 3
+  local bytes = {0xFE, 0xFF}
+  for _, c in utf8.codes(s) do
+    if c < 0x10000 then
+      -- assert(c < 0xD800 or c >= 0xE000)
+      bytes[i] = c >> 8
+      bytes[i+1] = c & 0xFF
+      i = i+2
+    else
+      c = c-0x10000
+      bytes[i] = 0xD8 | ((c>>18) & 3)
+      bytes[i+1] = (c>>10) & 0xFF
+      bytes[i+2] = 0xDC | ((c>>8) & 3)
+      bytes[i+3] = c & 0xFF
+      i = i+4
+    end
+  end
+  return string.char(table.unpack(bytes))
+end
+-- This is pretty much the minimal escaping possible: Only escape bytes which
+-- would otherwise change the meaning of the literal string. These are
+-- backslashes, unbalanced parentheses and carriage returns (an unescaped CR is
+-- read back as LF by PDF consumers). Balanced parentheses are kept as they are.
+local l = lpeg
+local simple_char = 1-l.S'()\\\r'
+local semi_simple_char = simple_char + l.P'\\'/'\\\\' + l.P'\r'/'\\r'
+local nested = l.P{'(' * (semi_simple_char + l.V(1))^0 * ')'}
+local inner = (semi_simple_char + nested + (l.Cc'\\' * l.S'()'))^0 * -1
+local raw = l.Cs(inner)
+local patt = l.Cs(l.Cc'(' * inner * l.Cc')')
+local function escape_bytes(s)
+  if is_escaped(s) then return s end
+  if mode & 4 == 0 then
+    return '(' .. s .. ')'
+  end
+  return patt:match(s)
+end
+local function escape_text(s)
+  if is_escaped(s) then return s end
+  if mode & 2 == 2 then
+    s = to_utf16(s)
+  elseif mode & 4 == 0 then
+    return '(' .. s .. ')'
+  end
+  return patt:match(s)
+end
+local function escape_raw(s)
+  return raw:match(s)
+end
+
+return {
+  escape_raw = escape_raw,
+  escape_bytes = escape_bytes,
+  escape_text = escape_text,
+  setmode = setmode,
+}
diff --git a/luametalatex-pdf-nametree.lua b/luametalatex-pdf-nametree.lua
index 8ddab33..ccf3b98 100644
--- a/luametalatex-pdf-nametree.lua
+++ b/luametalatex-pdf-nametree.lua
@@ -14,10 +14,9 @@ local function write(pdf, tree, escaped, step)
   move(tree, #tree+1, 2*#tree-nextcount, nextcount+1)
   return write(pdf, tree, escaped, step*6)
 end
-local function pdf_string(s)
-  -- Emulate other engines here: If looks like an escaped string, treat it as such. Otherwise, add parenthesis.
-  return s:match("^%(.*%)$") or s:match("^<.*>$") or '(' .. s .. ')'
-end
+
+local pdf_bytestring = require'luametalatex-pdf-escape'.escape_bytes
+
 local serialized = {}
 return function(values, pdf)
   local tree = {}
@@ -35,7 +34,7 @@ return function(values, pdf)
       local key = tree[6*i+j]
       if key then
         local value = values[key]
-        key = pdf_string(key)
+        key = pdf_bytestring(key)
         tree[6*i+j] = key
         serialized[2*j-1] = key
         serialized[2*j] = value
diff --git a/luametalatex-pdf.lua b/luametalatex-pdf.lua
index b971bbc..0d0e88c 100644
--- a/luametalatex-pdf.lua
+++ b/luametalatex-pdf.lua
@@ -14,7 +14,8 @@ local function written(pdf, num)
   if not num or num == assigned then return end
   return num ~= delayed
 end
-local function stream(pdf, num, dict, content, isfile)
+-- raw: Pass on preencoded stream. Currently ignored.
+local function stream(pdf, num, dict, content, isfile, raw)
   if not num then num = pdf:getobj() end
   if pdf[num] ~= assigned then
     error[[Invalid object]]