local lpeg = lpeg or require'lpeg'

-- LPeg pattern that rewrites a UTF-16BE byte string (without BOM) as UTF-8.
--
-- The subject is consumed in units of two bytes:
--   * a code unit outside the surrogate range becomes that code point,
--   * a high surrogate followed by a low surrogate becomes the combined
--     supplementary-plane code point,
--   * any other two bytes (an unpaired surrogate) become U+FFFD.
-- The trailing `-1` anchors the match at the end of the subject, so a subject
-- with a dangling odd byte does not match at all (lpeg.match returns nil).
-- On success the single capture is the converted UTF-8 string.
local utf16be_to_utf8 = lpeg.Cs((
  -- Single code unit in the BMP (first byte outside 0xD8-0xDF).
  lpeg.R('\x00\xD7', '\xE0\xFF') * 1 / function(s)
    local high, low = string.byte(s, 1, 2)
    return utf8.char(high << 8 | low)
  end
  -- Surrogate pair: 0xD800-0xDBFF followed by 0xDC00-0xDFFF.
  + lpeg.R'\xD8\xDB' * 1 * lpeg.R'\xDC\xDF' * 1 / function(s)
    local hh, hl, lh, ll = string.byte(s, 1, 4)
    -- Each surrogate carries 10 payload bits. The high surrogate's payload is
    -- (hh & 3) << 8 | hl and forms the upper half of the 20-bit offset, so its
    -- top two bits land at bit 18 and hl at bit 10; the low surrogate's
    -- payload (lh & 3) << 8 | ll fills bits 0-9.
    return utf8.char(((hh & 3) << 18 | hl << 10 | (lh & 3) << 8 | ll) + 0x10000)
  end
  -- Unpaired surrogate: swallow the two bytes and emit U+FFFD instead.
  + lpeg.Cg(2 * lpeg.Cc('\u{FFFD}'))
)^0) * -1

-- Encode one code point as UTF-16BE.
--
-- utf8char: UTF-8 encoded string; only its first code point is read.
-- Returns two bytes for code points below 0x10000 and a four byte
-- surrogate pair for everything else.
local function utf8cp_to_utf16be(utf8char)
  local cp = utf8.codepoint(utf8char)
  if cp >= 0x10000 then
    -- Split the 20-bit offset into the lead and trail surrogate.
    local offset = cp - 0x10000
    local lead = 0xD800 | (offset >> 10)
    local trail = 0xDC00 | (offset & 0x3FF)
    return string.char(lead >> 8, lead & 0xFF, trail >> 8, trail & 0xFF)
  end
  return string.char(cp >> 8, cp & 0xFF)
end

-- Replacement table for the PDFDocEncoding bytes which do not simply map to
-- the code point with the same number. Keys are single byte strings, values
-- are the UTF-8 encoded replacements. 0x7F, 0x9F and 0xAD map to U+FFFD.
local pdfdoc_mapping = {}
do
  -- Register codepoints[1], codepoints[2], ... for the consecutive bytes
  -- first, first + 1, ...
  local function map_run(first, codepoints)
    for index, codepoint in ipairs(codepoints) do
      pdfdoc_mapping[string.char(first + index - 1)] = utf8.char(codepoint)
    end
  end

  map_run(0x18, {
    0x02D8, 0x02C7, 0x02C6, 0x02D9, 0x02DD, 0x02DB, 0x02DA, 0x02DC, -- 0x18-0x1F
  })
  map_run(0x7F, {
    0xFFFD,                                                         -- 0x7F
    0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044, -- 0x80-0x87
    0x2039, 0x203A, 0x2212, 0x2030, 0x201E, 0x201C, 0x201D, 0x2018, -- 0x88-0x8F
    0x2019, 0x201A, 0x2122, 0xFB01, 0xFB02, 0x0141, 0x0152, 0x0160, -- 0x90-0x97
    0x0178, 0x017D, 0x0131, 0x0142, 0x0153, 0x0161, 0x017E, 0xFFFD, -- 0x98-0x9F
    0x20AC,                                                         -- 0xA0
  })
  map_run(0xAD, { 0xFFFD })
end
-- LPeg pattern that rewrites a PDFDocEncoding byte string as UTF-8. The whole
-- subject has to be consumed; the single capture is the converted string.
local pdfdoc_to_utf8
do
  -- Bytes which are copied unchanged (0x0D already lies within 0x00-0x17).
  local verbatim = lpeg.R('\x00\x17', '\x20\x7E')
  -- Bytes whose code point equals the byte value and only need re-encoding.
  local same_codepoint = lpeg.R('\xA1\xAC', '\xAE\xFF') / function(byte_char)
    return utf8.char(string.byte(byte_char))
  end
  -- Everything else is looked up in pdfdoc_mapping.
  local remapped = lpeg.R('\x18\x1F', '\x7F\xA0', '\xAD\xAD') / pdfdoc_mapping

  pdfdoc_to_utf8 = lpeg.Cs((verbatim + same_codepoint + remapped)^0) * -1
end

-- LPeg pattern converting a PDF text string to UTF-8. The alternatives are
-- tried in order against the whole subject:
--   1. the bytes FE FF followed by UTF-16BE data, converted by utf16be_to_utf8,
--   2. a UTF-8 encoded U+FEFF followed by arbitrary bytes, which are captured
--      unchanged,
--   3. anything else is handed to pdfdoc_to_utf8.
-- If an earlier alternative fails (e.g. odd number of bytes after FE FF), the
-- later ones are tried on the complete subject, prefix included.
local text_string_to_utf8 =
    lpeg.P('\xFE\xFF') * utf16be_to_utf8
  + lpeg.P('\u{FEFF}') * lpeg.C(lpeg.P(1)^0) * lpeg.P(-1)
  + pdfdoc_to_utf8

-- WinAnsi replacement table for the bytes 0x80-0xFF. Keys are single byte
-- strings, values are the UTF-8 encoded replacements.
local winansi_mapping = {}
do
  -- Code points for 0x80-0x9F. The bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D
  -- keep their own value as code point.
  local codepoints_80_9F = {
    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, -- 0x80-0x87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, -- 0x88-0x8F
    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, -- 0x90-0x97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, -- 0x98-0x9F
  }
  for index, codepoint in ipairs(codepoints_80_9F) do
    winansi_mapping[string.char(0x7F + index)] = utf8.char(codepoint)
  end
  -- 0xA0-0xFF: the code point is identical to the byte value.
  for byte = 0xA0, 0xFF do
    winansi_mapping[string.char(byte)] = utf8.char(byte)
  end
end
-- LPeg pattern that rewrites a WinAnsi byte string as UTF-8: bytes up to 0x7F
-- are kept, the others are replaced through winansi_mapping. The whole
-- subject has to be consumed; the single capture is the converted string.
local winansi_to_utf8 = lpeg.Cs(
  (lpeg.R'\x00\x7F' + lpeg.R'\x80\xFF' / winansi_mapping)^0
) * -1
-- Same keys as winansi_mapping, but with every replacement re-encoded as
-- UTF-16BE.
local winansi_mapping_utf16be = {}
for byte_char, replacement in pairs(winansi_mapping) do
  winansi_mapping_utf16be[byte_char] = utf8cp_to_utf16be(replacement)
end
-- LPeg pattern that rewrites a WinAnsi byte string as UTF-16BE (no BOM is
-- added). The whole subject has to be consumed; the single capture is the
-- converted string.
local winansi_to_utf16be
do
  -- Bytes from 0x80 on are replaced through the precomputed table.
  local mapped = lpeg.R('\x80\xFF') / winansi_mapping_utf16be
  -- Every remaining byte is kept and gets a zero byte inserted in front.
  local zero_extended = lpeg.Cc('\x00') * lpeg.P(1)

  winansi_to_utf16be = lpeg.Cs((mapped + zero_extended)^0) * -1
end

-- MacRoman replacement table for the bytes 0x80-0xFF. Keys are single byte
-- strings, values are the UTF-8 encoded replacements.
local macroman_mapping = {}
do
  -- Code points in byte order, eight per row, starting at 0x80.
  local codepoints = {
    0x00C4, 0x00C5, 0x00C7, 0x00C9, 0x00D1, 0x00D6, 0x00DC, 0x00E1, -- 0x80-0x87
    0x00E0, 0x00E2, 0x00E4, 0x00E3, 0x00E5, 0x00E7, 0x00E9, 0x00E8, -- 0x88-0x8F
    0x00EA, 0x00EB, 0x00ED, 0x00EC, 0x00EE, 0x00EF, 0x00F1, 0x00F3, -- 0x90-0x97
    0x00F2, 0x00F4, 0x00F6, 0x00F5, 0x00FA, 0x00F9, 0x00FB, 0x00FC, -- 0x98-0x9F
    0x2020, 0x00B0, 0x00A2, 0x00A3, 0x00A7, 0x2022, 0x00B6, 0x00DF, -- 0xA0-0xA7
    0x00AE, 0x00A9, 0x2122, 0x00B4, 0x00A8, 0x2260, 0x00C6, 0x00D8, -- 0xA8-0xAF
    0x221E, 0x00B1, 0x2264, 0x2265, 0x00A5, 0x00B5, 0x2202, 0x2211, -- 0xB0-0xB7
    0x220F, 0x03C0, 0x222B, 0x00AA, 0x00BA, 0x03A9, 0x00E6, 0x00F8, -- 0xB8-0xBF
    0x00BF, 0x00A1, 0x00AC, 0x221A, 0x0192, 0x2248, 0x2206, 0x00AB, -- 0xC0-0xC7
    0x00BB, 0x2026, 0x00A0, 0x00C0, 0x00C3, 0x00D5, 0x0152, 0x0153, -- 0xC8-0xCF
    0x2013, 0x2014, 0x201C, 0x201D, 0x2018, 0x2019, 0x00F7, 0x25CA, -- 0xD0-0xD7
    0x00FF, 0x0178, 0x2044, 0x20AC, 0x2039, 0x203A, 0xFB01, 0xFB02, -- 0xD8-0xDF
    0x2021, 0x00B7, 0x201A, 0x201E, 0x2030, 0x00C2, 0x00CA, 0x00C1, -- 0xE0-0xE7
    0x00CB, 0x00C8, 0x00CD, 0x00CE, 0x00CF, 0x00CC, 0x00D3, 0x00D4, -- 0xE8-0xEF
    -- 0xF0 is the apple; U+1F34F is used here in place of '\u{F8FF}'.
    0x1F34F, 0x00D2, 0x00DA, 0x00DB, 0x00D9, 0x0131, 0x02C6, 0x02DC, -- 0xF0-0xF7
    0x00AF, 0x02D8, 0x02D9, 0x02DA, 0x00B8, 0x02DD, 0x02DB, 0x02C7, -- 0xF8-0xFF
  }
  for index, codepoint in ipairs(codepoints) do
    macroman_mapping[string.char(0x7F + index)] = utf8.char(codepoint)
  end
end
-- LPeg pattern that rewrites a MacRoman byte string as UTF-8: bytes up to
-- 0x7F are kept, the others are replaced through macroman_mapping. The whole
-- subject has to be consumed; the single capture is the converted string.
--
-- Bytes up to 0x7F are already their own UTF-8 encoding, so they are copied
-- by the substitution capture directly instead of being passed through a Lua
-- callback that would return the very same byte.
local macroman_to_utf8 = lpeg.Cs((
    lpeg.R('\x00\x7F')
  + lpeg.R('\x80\xFF') / macroman_mapping
)^0) * -1
-- Same keys as macroman_mapping, but with every replacement re-encoded as
-- UTF-16BE.
local macroman_mapping_utf16be = {}
for byte_char, replacement in pairs(macroman_mapping) do
  macroman_mapping_utf16be[byte_char] = utf8cp_to_utf16be(replacement)
end
-- LPeg pattern that rewrites a MacRoman byte string as UTF-16BE (no BOM is
-- added). The whole subject has to be consumed; the single capture is the
-- converted string.
local macroman_to_utf16be
do
  -- Bytes from 0x80 on are replaced through the precomputed table.
  local mapped = lpeg.R('\x80\xFF') / macroman_mapping_utf16be
  -- Every remaining byte is kept and gets a zero byte inserted in front.
  local zero_extended = lpeg.Cc('\x00') * lpeg.P(1)

  macroman_to_utf16be = lpeg.Cs((mapped + zero_extended)^0) * -1
end

-- Module interface: the conversion patterns, to be used with lpeg.match.
local exports = {}
exports.utf16be_to_utf8 = utf16be_to_utf8
exports.text_string_to_utf8 = text_string_to_utf8
exports.winansi_to_utf8 = winansi_to_utf8
exports.winansi_to_utf16be = winansi_to_utf16be
exports.macroman_to_utf8 = macroman_to_utf8
exports.macroman_to_utf16be = macroman_to_utf16be
return exports