From 46cada8666b9c5d35cf84dbc562b144ddb13d008 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcel=20Fabian=20Kr=C3=BCger?= Date: Wed, 15 Jul 2020 04:18:38 +0200 Subject: [PATCH] Improve T1 parser --- luametalatex-font-t1.lua | 132 ++++++++++++++++++++++----------------- 1 file changed, 76 insertions(+), 56 deletions(-) diff --git a/luametalatex-font-t1.lua b/luametalatex-font-t1.lua index 6edb07d..4a5baf0 100644 --- a/luametalatex-font-t1.lua +++ b/luametalatex-font-t1.lua @@ -1,6 +1,8 @@ -local white = (lpeg.S'\0\9\10\12\13\32' + '%' * (1 - lpeg.S'\r\n')^0)^1 +local white = (lpeg.S'\0\9\10\12\13\32' + '%' * (1 - lpeg.S'\r\n')^0)^1 -- Whitespace + local regular = 1-lpeg.S'()<>[]{}/%\0\9\10\12\13\32' local lastbase = '123456789abcdefghiklmnopqrstuvwxyz' + local number = lpeg.Cmt(lpeg.R'09'^1/tonumber * '#', function(s, p, base) if base < 2 then return end local pattern @@ -13,6 +15,7 @@ local number = lpeg.Cmt(lpeg.R'09'^1/tonumber * '#', function(s, p, base) return p, num and tonumber(num, base) end) + (lpeg.S'+-'^-1 * ('.' * lpeg.R'09'^1 + lpeg.R'09'^1 * lpeg.P'.'^-1 * lpeg.R'09'^0) * (lpeg.S'eE' * lpeg.S'+-'^-1 * lpeg.R'09'^1)^-1)/tonumber + local literalstring = lpeg.P{'(' * lpeg.Cs(( lpeg.P'\\n'/'\n'+lpeg.P'\\r'/'\r'+lpeg.P'\\t'/'\t'+lpeg.P'\\b'/'\b'+lpeg.P'\\f'/'\f' +'\\'*lpeg.C(lpeg.R'07'*lpeg.R'07'^-2)/function(n)return string.char(tonumber(n, 8))end @@ -20,16 +23,27 @@ local literalstring = lpeg.P{'(' * lpeg.Cs(( +'\\'*lpeg.C(1)/1 +('\n' + ('\r' * lpeg.P'\n'^-1))/'\n' +(1-lpeg.S'()\\')+lpeg.V(1))^0) * ')'} + local hexstring = '<' * lpeg.Cs(( lpeg.C(lpeg.R'09'+lpeg.R'af'+lpeg.R'AF')*(lpeg.C(lpeg.R'09'+lpeg.R'af'+lpeg.R'AF')+lpeg.Cc'0')/function(a,b)return string.char(tonumber(a..b, 16))end)^0) * '>' + local name = lpeg.C(regular^1) local lname = '/' * name / 1 + +local boolean = (lpeg.P'true' + 'false')/{["true"] = true, ["false"] = false} + +-- Everything above this line works pretty reliably and can be understood by reading the PostScript specs. 
+ +-- This is Type1 specific. The only thing which might need adjustment is adding alternative spellings for -|, RD, |-, |, etc. +local binary_bytes = lpeg.Cmt(number*white^-1*(lpeg.P'-| ' + 'RD '), function(s, p, l)return p+l, s:sub(p, p+l-1) end)*white^-1*(lpeg.P"|-"+"|"+"ND"+"NP") +-- Attention: The |-, |, ND, NP already contain an implicit `def` + local function decrypt(key, n, cipher) -- Generally you should never implement your own crypto. So we call a well known, peer reviewed, -- high-quality cryptographic library. --- Ha-Ha, of course we are implementing by ourselves. -- That might be completely unsecure, but given that the encryption keys are well known constants -- documented in the T1 Spec, there is no need to worry about it. - -- Also I do not think any cryptorgraphic library would implement this anyway, it doesn't even + -- Also I do not think any cryptographic library would implement this anyway, it doesn't even -- really deserve the term encryption. local decoded = {string.byte(cipher, 1,-1)} for i=1,#decoded do @@ -40,66 +54,77 @@ local function decrypt(key, n, cipher) return string.char(table.unpack(decoded, n+1)) end --- io.stdout:write(decrypt(55665, 4, string.sub(io.stdin:read'a', 7))) -local boolean = (lpeg.P'true' + 'false')/{["true"] = true, ["false"] = false} -local anytype = {hexstring + literalstring + number + lname + boolean + lpeg.V(2) + name, lpeg.Ct('[' * (white^-1 * lpeg.V(1))^0 * white^-1 * ']' + '{' * (white^-1 * lpeg.V(1))^0 * white^-1 * '}' * white^-1 * lpeg.P"executeonly"^-1)} -local dict = lpeg.Cf(lpeg.Carg(1) * lpeg.Cg(white^-1*lname*white^-1*(anytype)*white^-1*lpeg.P"readonly"^-1*white^-1*lpeg.P"noaccess"^-1*white^-1*(lpeg.P"def"+"ND"+"|-"))^0, rawset) -local encoding = (white+anytype-("dup"*white))^0/0 +local anytype = { + hexstring + + literalstring + + number + + lname + + boolean + + lpeg.V'array' + + name, + array = lpeg.Ct( '[' * (white^-1 * lpeg.V(1))^0 * white^-1 * ']' -- Arrays have two possible syntaxes + + '{' * 
(white^-1 * lpeg.V(1))^0 * white^-1 * '}') * (white * "executeonly")^-1 +} + +local function skip_until(p) + if type(p) == 'string' then p = p * -name end + return (white + anytype - p)^0/0 +end +local skip_to_begin = skip_until'begin' * 'begin' + +local def_like = (lpeg.P'def' + 'ND' + '|-') * -name + +local encoding = '/' * lpeg.C'Encoding' * -name + * skip_until'dup' * lpeg.Cf(lpeg.Ct'' * lpeg.Cg("dup"*white*number*white^-1*lname*white^-1*"put"*white)^0 , rawset) - * lpeg.P"readonly"^-1*white*"def" -local function parse_encoding(offset, str) - local found - found, offset = (encoding*lpeg.Cp()):match(str, offset) - return found, offset -end -local function parse_fontinfo(offset, str) - local found - repeat - found, offset = ((white+(anytype-name))^0/0*name*lpeg.Cp()):match(str, offset) - until found == 'begin' - found, offset = (dict*lpeg.Cp()):match(str, offset, {}) - offset = (white^-1*"end"*white^-1*lpeg.P"readonly"^-1*white^-1*"def"):match(str, offset) - return found, offset -end -local binary_bytes = lpeg.Cmt(number*white^-1*(lpeg.P'-| ' + 'RD '), function(s, p, l)return p+l, s:sub(p, p+l-1) end)*white^-1*(lpeg.P"|-"+"|"+"ND"+"NP") -local charstr = white^-1*lname*(white^-1*(anytype-lname))^0/0*white^-1 + * ("readonly"*white)^-1 * "def" + +local charstr = '/' * lpeg.C'CharStrings' * -name + * skip_until(lname) -- sometimes we get weird stuff in between. 
Just make sure that we don't swallow a charname * lpeg.Cf(lpeg.Ct'' - * lpeg.Cg(lname*white^-1*binary_bytes*white)^0 + * lpeg.Cg(lname*white^-1*binary_bytes*white)^0 -- Remember: binary_bytes includes a `def` , rawset) * lpeg.P"end"*white -local subrs = (white^-1*(anytype-("dup"*white)))^0/0*white^-1 - * lpeg.Cf(lpeg.Ct'' - * lpeg.Cg("dup"*white^-1*number*white^-1*binary_bytes*white)^0 - , rawset) - * (lpeg.P"readonly"*white)^-1 * (lpeg.P"noaccess"*white)^-1*(lpeg.P"def"+"ND"+"|-") + +local subrs = '/' * lpeg.C'Subrs' * -name + * skip_until'dup' + * lpeg.Cf(lpeg.Ct'' + * lpeg.Cg("dup"*white^-1*number*white^-1*binary_bytes*white)^0 + , rawset) + * (lpeg.P"readonly"*white)^-1 * (lpeg.P"noaccess"*white)^-1*(lpeg.P"def"+"ND"+"|-") + +-- lpeg.V(2) == dict_entries +local dict = skip_to_begin * lpeg.V(2) * white^-1 * 'end' * white * ('readonly' * white)^-1 * ('noaccess' * white)^-1 * def_like +local dict_entry = encoding + subrs + + '/' * lpeg.C'FontInfo' * dict + + lname -- key + * white^-1 + * anytype -- value + * ((white + anytype - (def_like + 'dict' + 'array') * -name)/0 * white^-1)^0 -- Sometimes we get Postscript code in between. 
+ * def_like +local dict_entries = lpeg.P{ + lpeg.Cf(lpeg.Carg(1) * lpeg.Cg(white^-1*lpeg.V(3))^0, rawset), + lpeg.Cf(lpeg.Ct'' * lpeg.Cg(white^-1*lpeg.V(3))^0, rawset), + dict_entry, +} local function parse_private(offset, str) local mydict, found - repeat - found, offset = ((white+(anytype-name))^0/0*name*lpeg.Cp()):match(str, offset) - until found == 'begin' - mydict, offset = (dict*lpeg.Cp()):match(str, offset, {}) - found = (white^-1*lname):match(str, offset) - if found == "Subrs" then - mydict.Subrs, offset = (subrs*lpeg.Cp()):match(str, offset) - end + offset = (skip_to_begin * lpeg.Cp()):match(str, offset) + + -- Scan the dictionary + mydict, offset = (dict_entries*lpeg.Cp()):match(str, offset, {}) return mydict, offset end local function continue_maintable(offset, str, mydict) - mydict, offset = (dict*lpeg.Cp()):match(str, offset, mydict) + mydict, offset = (dict_entries*lpeg.Cp()):match(str, offset, mydict) local found = (white^-1*lname):match(str, offset) - if found == "FontInfo" then - mydict.FontInfo, offset = parse_fontinfo(offset, str) - return continue_maintable(offset, str, mydict) - elseif found == "Encoding" then - mydict.Encoding, offset = parse_encoding(offset, str) - return continue_maintable(offset, str, mydict) - elseif found == "Private" then + if found == "Private" then -- Scanned separately because it isn't always ended in a regular way mydict.Private, offset = parse_private(offset, str) return continue_maintable(offset, str, mydict) - elseif found == "CharStrings" then - mydict.CharStrings, offset = (charstr*lpeg.Cp()):match(str, offset) + elseif found == "CharStrings" then -- This could be included in normal scanning, but it is our signal to terminate + found, mydict.CharStrings, offset = (charstr*lpeg.Cp()):match(str, offset) return mydict else local newoffset = ((white+name)^1/0*lpeg.Cp()):match(str, offset) @@ -107,23 +132,18 @@ local function continue_maintable(offset, str, mydict) return continue_maintable(newoffset, str, mydict) 
end end - print(str:sub(offset)) error[[Unable to read Type 1 font]] end local function parse_maintable(offset, str) local found - repeat - found, offset = ((white+(anytype-name))^0/0*name*lpeg.Cp()):match(str, offset) - until found == 'begin' + offset = (skip_to_begin * lpeg.Cp()):match(str, offset) return continue_maintable(offset, str, {}) end return function(filename) local file = io.open(filename, 'rb') - local _, length = string.unpack("