-- Kana Parser lua engine
--
-- Transliterates between latin (romaji), hiragana and katakana.
-- Public entry points (globals): toggleChars, toLatin, toHiragana, toKatakana.
-- All kana literals in this file are UTF-8 encoded; the file must be saved as UTF-8.

local vowels = {'a', 'e', 'i', 'o', 'u'} -- latin vowels
local vowelsK = {'ア', 'エ', 'イ', 'オ', 'ウ'} -- katakana vowels (index-aligned with `vowels`)
local ambigousToN = {'あ', 'え', 'い', 'お', 'う', 'や', 'よ', 'ゆ'} -- characters ambiguous to preceding "n"
local littleTsuWL = {'s', 't', 'k', 'p', 'c'} -- whitelist for little tsu gemination

-- latin -> hiragana; a two-element table lists alternatives (first one is the default)
local transRaw = {
    n = 'ん',
    a = 'あ', e = 'え', i = 'い', o = 'お', u = 'う',
    ba = 'ば', be = 'べ', bi = 'び', bo = 'ぼ', bu = 'ぶ',
    bya = 'びゃ', byo = 'びょ', byu = 'びゅ',
    cha = 'ちゃ', che = 'ちぇ', chi = 'ち', cho = 'ちょ', chu = 'ちゅ',
    da = 'だ', de = 'で', di = 'でぃ', ['do'] = 'ど', du = { 'づ', 'どぅ' },
    dya = 'でゃ', dyo = 'でょ', dyu = 'でゅ',
    fa = 'ふぁ', fe = 'ふぇ', fi = 'ふぃ', fo = 'ふぉ',
    fya = 'ふゃ', fyo = 'ふょ', fyu = 'ふゅ',
    ga = 'が', ge = 'げ', gi = 'ぎ', go = 'ご', gu = 'ぐ',
    gwa = 'ぐぁ', gwe = 'ぐぇ', gwi = 'ぐぃ', gwo = 'ぐぉ',
    gya = 'ぎゃ', gyo = 'ぎょ', gyu = 'ぎゅ',
    ha = 'は', he = 'へ', hi = 'ひ', ho = 'ほ', hu = 'ふ',
    hya = 'ひゃ', hyo = 'ひょ', hyu = 'ひゅ',
    ja = { 'じゃ', 'ぢゃ' }, je = 'じぇ', ji = { 'じ', 'ぢ' },
    jo = { 'じょ', 'ぢょ' }, ju = { 'じゅ', 'ぢゅ' },
    ka = 'か', ke = 'け', ki = 'き', ko = 'こ', ku = 'く',
    kwa = 'くぁ', kwe = 'くぇ', kwi = 'くぃ', kwo = 'くぉ',
    kya = 'きゃ', kyo = 'きょ', kyu = 'きゅ',
    ma = 'ま', me = 'め', mi = 'み', mo = 'も', mu = 'む',
    mya = 'みゃ', myo = 'みょ', myu = 'みゅ',
    na = 'な', ne = 'ね', ni = 'に', no = 'の', nu = 'ぬ',
    nya = 'にゃ', nyo = 'にょ', nyu = 'にゅ',
    pa = 'ぱ', pe = 'ぺ', pi = 'ぴ', po = 'ぽ', pu = 'ぷ',
    pya = 'ぴゃ', pyo = 'ぴょ', pyu = 'ぴゅ',
    ra = 'ら', re = 'れ', ri = 'り', ro = 'ろ', ru = 'る',
    rya = 'りゃ', ryo = 'りょ', ryu = 'りゅ',
    sa = 'さ', se = 'せ', si = 'し', so = 'そ', su = 'す',
    sha = 'しゃ', she = 'しぇ', shi = 'し', sho = 'しょ', shu = 'しゅ',
    ta = 'た', te = 'て', ti = 'てぃ', to = 'と',
    tha = 'てゃ', tho = 'てょ', thu = 'てゅ',
    tsa = 'つぁ', tse = 'つぇ', tsu = 'つ', tsi = 'つぃ', tso = 'つぉ',
    tu = 'つ',
    va = 'ゔぁ', ve = 'ゔぇ', vi = 'ゔぃ', vo = 'ゔぉ', vu = 'ゔぅ',
    vya = 'ゔゃ', vyo = 'ゔょ', vyu = 'ゔゅ',
    wa = 'わ', we = { 'うぇ', 'ゑ' }, wi = 'ゐ', wo = { 'を', 'うぉ' },
    ya = 'や', ye = 'いぇ', yo = 'よ', yu = 'ゆ',
    za = 'ざ', ze = 'ぜ', zo = 'ぞ', zu = 'ず'
}

-- hiragana -> katakana
local transK = {
    ['ん'] = 'ン',
    ['あ'] = 'ア', ['え'] = 'エ', ['い'] = 'イ', ['お'] = 'オ', ['う'] = 'ウ',
    ['ぁ'] = 'ァ', ['ぃ'] = 'ィ', ['ぅ'] = 'ゥ', ['ぇ'] = 'ェ', ['ぉ'] = 'ォ',
    ['ゃ'] = 'ャ', ['ゅ'] = 'ュ', ['ょ'] = 'ョ',
    ['は'] = 'ハ', ['へ'] = 'ヘ', ['ひ'] = 'ヒ', ['ほ'] = 'ホ', ['ふ'] = 'フ',
    ['ば'] = 'バ', ['べ'] = 'ベ', ['び'] = 'ビ', ['ぼ'] = 'ボ', ['ぶ'] = 'ブ',
    ['ぱ'] = 'パ', ['ぺ'] = 'ペ', ['ぴ'] = 'ピ', ['ぽ'] = 'ポ', ['ぷ'] = 'プ',
    ['た'] = 'タ', ['て'] = 'テ', ['ち'] = 'チ', ['と'] = 'ト', ['つ'] = 'ツ',
    ['だ'] = 'ダ', ['で'] = 'デ', ['ぢ'] = 'ヂ', ['ど'] = 'ド', ['づ'] = 'ヅ',
    ['か'] = 'カ', ['け'] = 'ケ', ['き'] = 'キ', ['こ'] = 'コ', ['く'] = 'ク',
    ['が'] = 'ガ', ['げ'] = 'ゲ', ['ぎ'] = 'ギ', ['ご'] = 'ゴ', ['ぐ'] = 'グ',
    ['ま'] = 'マ', ['め'] = 'メ', ['み'] = 'ミ', ['も'] = 'モ', ['む'] = 'ム',
    ['な'] = 'ナ', ['ね'] = 'ネ', ['に'] = 'ニ', ['の'] = 'ノ', ['ぬ'] = 'ヌ',
    ['ら'] = 'ラ', ['れ'] = 'レ', ['り'] = 'リ', ['ろ'] = 'ロ', ['る'] = 'ル',
    ['さ'] = 'サ', ['せ'] = 'セ', ['し'] = 'シ', ['そ'] = 'ソ', ['す'] = 'ス',
    ['ざ'] = 'ザ', ['ぜ'] = 'ゼ', ['じ'] = 'ジ', ['ぞ'] = 'ゾ', ['ず'] = 'ズ',
    ['わ'] = 'ワ', ['ゑ'] = 'ヱ', ['ゐ'] = 'ヰ', ['を'] = 'ヲ',
    ['や'] = 'ヤ', ['よ'] = 'ヨ', ['ゆ'] = 'ユ',
    ['ゔ'] = 'ヴ',
    ['っ'] = 'ッ'
}

-- manual transliteration choices for kana that have more than one latin spelling
-- in transRaw (si/shi, tu/tsu). Without an entry here the reverse mapping would
-- depend on the (unspecified) iteration order of pairs().
local correctionsFromKana = {
    ['し'] = 'shi',
    ['つ'] = 'tsu'
}

local longK = 'ー' -- katakana prolonged sound mark
local isolator = '\'' -- separates "n" from a following vowel / y-syllable
local prolongRules = { -- special rules for prolonging syllables
    o = 'u',
    e = 'i'
}

-- builds a reverse table (value -> key); for a two-alternative table value,
-- both alternatives map back to the key
local function rev(t)
    local res = {}
    for k, v in pairs(t) do
        if (type(v) == 'table') then
            res[v[1]] = k
            res[v[2]] = k
        else
            res[v] = k
        end
    end
    return res
end

-- builds the default translation tables latin <-> kana from transRaw
-- returns: latin -> hiragana (first alternative only), hiragana -> latin,
--          katakana -> hiragana
local function buildDefaultTransTables()
    local tr, rtr = {}, {}
    for k, v in pairs(transRaw) do
        tr[k] = type(v) == 'table' and v[1] or v
    end
    rtr = rev(tr)
    -- apply corrections
    for i, v in pairs(correctionsFromKana) do
        rtr[i] = v
    end
    return tr, rtr, rev(transK)
end

-- decides which vowel should prolong the given vowel;
-- returns nil when c is not a latin vowel
local function prolong(c)
    for i, v in ipairs(vowels) do
        if c == v then
            if prolongRules[c] then
                return prolongRules[c]
            else
                return c
            end
        end
    end
    return nil
end

-- checks if a katakana token is a vowel and returns its latin representation
-- (nil otherwise)
local function getWovelK(c)
    for i, v in ipairs(vowelsK) do
        if c == v then return vowels[i] end
    end
    return nil
end

-- checks if a given symbol is ambiguous to preceding n
local function isAmbiguous(c)
    for i, v in ipairs(ambigousToN) do
        if c == v then return true end
    end
    return false
end

-- init translation tables
local trans, revTrans, revTransK = buildDefaultTransTables()

-- init default transliteration choices (everything default to first alternative)
-- NOTE(review): not referenced anywhere in the visible code
local transChoices = {}

-- checks if two characters are valid candidates for little tsu
-- (identical and on the gemination whitelist)
local function isValidTsuCandidate(a, b)
    if a ~= b then return false end
    for i, v in ipairs(littleTsuWL) do
        if a == v then return true end
    end
    return false
end

-- checks if two characters are a little tsu used correctly and returns the
-- gemination consonant if true (nil otherwise)
local function getGeminationConsonant(a, b)
    if a ~= 'っ' then return nil end -- disregard katakana, only hiragana is processed in romanization
    local tr = revTrans[b]
    if not tr then return nil end -- invalid hiragana character
    local fst = string.sub(tr, 1, 1) -- get first character of the transliteration
    for i, v in ipairs(littleTsuWL) do
        if fst == v then return fst end
    end
    return nil -- invalid gemination
end

-- parses an utf8 string into utf8 chars (tokens)
-- raises an error on a lead byte >= 0xF8; an empty string yields {''}
local function tokenize(utf8str)
    assert(type(utf8str) == 'string')
    local res, seq, val = {}, 0, ''
    for i = 1, #utf8str do
        local c = string.byte(utf8str, i)
        if seq == 0 then
            -- start of a new character: flush the previous one (if any)
            if i ~= 1 then table.insert(res, val) end
            seq = c < 0x80 and 1 or c < 0xE0 and 2 or c < 0xF0 and 3 or c < 0xF8 and 4
                or error('invalid UTF-8 character sequence')
            val = string.char(c)
        else
            val = val .. string.char(c)
        end
        seq = seq - 1
    end
    table.insert(res, val)
    return res
end

-- PUBLIC API SECTION

-- toggles used characters for supplied syllables (whitespace-separated)
-- only syllables that have two alternatives in transRaw are affected
function toggleChars(input)
    local cur, choices = '', {}
    for s in string.gmatch(input, '%S+') do -- split by whitespaces
        cur = trans[s]
        if cur then -- don't process unknown syllables
            choices = transRaw[s]
            if type(choices) == 'table' then -- only process syllables with alternatives
                trans[s] = cur == choices[1] and choices[2] or choices[1] -- toggle between alternatives
            end
        end
    end
end

-- any kana to latin; the result is sent to TeX via tex.print
function toLatin(input)
    if input == '' then return end
    local tbl = tokenize(input)
    local buffer, res = {}, ''
    -- read tokenized input
    -- last is the last valid transliterated vowel, gc is the last gemination consonant
    local tjoin, tfst, last, gc = '', '', 0, ''
    for i, v in ipairs(tbl) do
        if revTransK[v] ~= nil then v = revTransK[v] end -- convert all katakana to hiragana
        table.insert(buffer, v)
        if #buffer == 2 then -- kana can be formed with up to two characters, always keep two in buffer
            tjoin, tfst, gc = revTrans[ buffer[1] .. buffer[2] ], revTrans[ buffer[1] ],
                getGeminationConsonant(buffer[1], buffer[2])
            if tjoin ~= nil then -- double character
                res = res .. tjoin
                buffer, last = {}, string.sub(tjoin, -1)
            elseif gc then -- check for little tsu
                res = res .. gc
                buffer, last = {buffer[2]}, 0
            elseif tfst ~= nil then -- single character
                res = res .. tfst
                if tfst == 'n' and isAmbiguous(buffer[2]) then -- ambiguous character succeeding an "n"
                    res = res .. isolator
                end
                buffer, last = {buffer[2]}, string.sub(tfst, -1)
            elseif buffer[1] == longK and prolong(last) ~= nil then -- prolonging dash
                res = res .. prolong(last)
                buffer, last = {buffer[2]}, 0
            else -- cannot transliterate, output as-is
                res = res .. buffer[1]
                buffer, last = {buffer[2]}, 0
            end
        end
    end
    if #buffer == 1 then -- trailing character
        if revTrans[ buffer[1] ] ~= nil then -- single character
            res = res .. revTrans[ buffer[1] ]
        elseif buffer[1] == longK and prolong(last) ~= nil then -- prolonging dash
            res = res .. prolong(last)
        else -- cannot transliterate, output as-is
            res = res .. buffer[1]
        end
    end
    tex.print(res)
end

-- latin or katakana to hiragana, 'raw' parameter is for internal use, leave it
-- blank to get output to TeX; with a truthy 'raw' the result string is returned
function toHiragana(input, raw)
    if input == '' then return end
    local tbl = tokenize(input)
    local buffer, res = {}, ''
    -- last: last latin vowel seen (0 / nil when not a vowel)
    -- lastsym: last standalone hiragana passed through untouched
    -- lastcnd: pending doubled consonant that may become a little tsu
    local t3, t2, t1, last, lastsym, lastcnd = '', '', '', 0, nil, nil
    for i, v in ipairs(tbl) do
        if revTransK[v] then v = revTransK[v] end -- translate katakana to hiragana on the go
        table.insert(buffer, v)
        if #buffer == 3 then
            t3, t2, t1 = trans[ buffer[1] .. buffer[2] .. buffer[3] ], trans[ buffer[1] .. buffer[2] ],
                trans[ buffer[1] ]
            if t3 ~= nil then -- all three letters yield translation
                if lastcnd then -- add little tsu
                    res = res .. 'っ'
                    lastcnd = nil
                end
                res = res .. t3
                last = buffer[3]
                buffer = {}
            elseif t2 ~= nil then -- first two letters yield translation
                if lastcnd then -- add little tsu
                    res = res .. 'っ'
                    lastcnd = nil
                end
                res = res .. t2
                last = buffer[2]
                buffer = {buffer[3]}
            elseif isValidTsuCandidate(buffer[1], buffer[2]) then -- test little tsu candidates
                if lastcnd then res = res .. lastcnd end -- add last consonant in raw form
                lastcnd = buffer[1] -- set last candidate consonant
                last = 0 -- is not vowel
                buffer = {buffer[2], buffer[3]}
            elseif t1 ~= nil then -- first letter yields translation : a, e, i, o, u, n
                res = res .. t1
                last = buffer[1]
                buffer = {buffer[2], buffer[3]}
            elseif buffer[1] == longK and prolong(last) ~= nil then -- valid prolonger sign
                res = res .. trans[prolong(last)]
                buffer, last = {buffer[2], buffer[3]}, 0
            elseif buffer[1] == isolator then -- isolating apostrophe, consume it
                buffer = {buffer[2], buffer[3]}
            else
                if lastcnd then -- add last consonant in raw form
                    res = res .. lastcnd
                    lastcnd = nil
                end
                -- this code allows for proper conversion of katakana's prolongation dash to hiragana
                t1 = revTrans[ buffer[1] ]
                if t1 then -- symbol is standalone hiragana
                    last = string.sub(t1, -1)
                    lastsym = buffer[1]
                elseif lastsym then -- attempt to merge symbol with previous symbol
                    t1 = revTrans[ lastsym .. buffer[1] ]
                    if t1 then -- symbol is a valid non-standalone hiragana compound
                        last = string.sub(t1, -1)
                    else -- symbol is an invalid non-standalone hiragana compound
                        last = nil
                    end
                    lastsym = nil
                else
                    last, lastsym = 0, nil
                end
                res = res .. buffer[1]
                buffer = {buffer[2], buffer[3]}
            end
        end
    end
    if #buffer == 2 then
        if trans[ buffer[1] .. buffer[2] ] ~= nil then -- first two symbols yield translation
            if lastcnd then res = res .. 'っ' end -- add little tsu
            res = res .. trans[ buffer[1] .. buffer[2] ]
            last = buffer[2]
            buffer = {}
        elseif trans[ buffer[1] ] ~= nil then -- first symbol yields translation
            res = res .. trans[ buffer[1] ]
            last = buffer[1]
            buffer = {buffer[2]}
        elseif buffer[1] == longK and prolong(last) ~= nil then -- valid prolonger
            res = res .. trans[prolong(last)]
            buffer, last = {buffer[2]}, 0
        elseif buffer[1] == isolator then -- consume isolator
            buffer = {buffer[2]}
        else
            if lastcnd then res = res .. lastcnd end -- add last consonant in raw form
            -- this code allows for proper conversion of katakana's prolongation dash to hiragana
            t1 = revTrans[ buffer[1] ]
            if t1 then -- symbol is standalone hiragana
                last = string.sub(t1, -1)
                lastsym = buffer[1]
            elseif lastsym then -- attempt to merge symbol with previous symbol
                t1 = revTrans[ lastsym .. buffer[1] ]
                if t1 then -- symbol is a valid non-standalone hiragana compound
                    last = string.sub(t1, -1)
                else -- symbol is an invalid non-standalone hiragana compound
                    last = nil
                end
                lastsym = nil -- erase last valid symbol
            else
                last, lastsym = 0, nil
            end
            res = res .. buffer[1]
            buffer = {buffer[2]}
        end
    end
    if #buffer == 1 then -- remaining symbol
        if trans[ buffer[1] ] ~= nil then
            res = res .. trans[ buffer[1] ]
        elseif buffer[1] == longK and prolong(last) ~= nil then
            res = res .. trans[prolong(last)]
        elseif buffer[1] ~= isolator then
            res = res .. buffer[1]
        end
    end
    if not raw then
        tex.print(res)
    else
        return res -- for internal use
    end
end

-- latin or hiragana to katakana; the result is sent to TeX via tex.print
function toKatakana(input)
    if input == '' then return end
    local hiraganized = tokenize(toHiragana(input, true)) -- convert everything to hiragana
    -- replace hiragana with katakana
    for i, v in ipairs(hiraganized) do
        if transK[v] ~= nil then hiraganized[i] = transK[v] end
    end
    -- insert prolonging symbols and prepare output
    local prev, nxt, vowel, tprev, tnext, res = hiraganized[1], '', '', '', '', hiraganized[1]
    local merge, toprolong = '', nil -- toprolong: latin vowel that would prolong the previous syllable
    for i = 2, #hiraganized do
        nxt = hiraganized[i]
        vowel = getWovelK(nxt)
        if not toprolong then -- check prev for ending vowel
            tprev = revTransK[prev]
            if tprev then
                tprev = revTrans[tprev]
                if tprev then toprolong = prolong(string.sub(tprev, -1)) end
            end
        end
        if toprolong then -- check nxt for matching prolonger
            if toprolong == vowel then
                nxt = longK
                toprolong = nil
            elseif vowel then
                toprolong = prolong(vowel)
            else
                toprolong = nil
            end
        end
        -- try merging prev and nxt for a single token
        tprev, tnext = revTransK[prev], revTransK[nxt]
        if tprev and tnext then
            merge = revTrans[tprev .. tnext]
            if merge then toprolong = prolong(string.sub(merge, -1)) end
        end
        res = res .. nxt
        prev = nxt
    end
    tex.print(res)
end