-- Kana Parser lua engine

-- Constant lookup data for the transliteration engine.
--
-- All kana literals are real UTF-8 text. tokenize() splits input into UTF-8
-- characters, so every key/value here has to be made of genuine UTF-8 kana;
-- double-encoded (mojibake) literals can never match a token.

local vowels = {'a', 'e', 'i', 'o', 'u'} -- latin vowels
local vowelsK = {'ア', 'エ', 'イ', 'オ', 'ウ'} -- katakana vowels, index-aligned with `vowels`
local ambigousToN = {'あ', 'え', 'い', 'お', 'う', 'や', 'よ', 'ゆ'} -- characters ambiguous to preceding "n"
local littleTsuWL = {'s', 't', 'k', 'p', 'c'} -- whitelist for little tsu gemination
local transRaw = { -- latin -> hiragana; a table value lists two alternatives (the first is the default)
	n = 'ん', a = 'あ', e = 'え', i = 'い', o = 'お', u = 'う',
	ba = 'ば', be = 'べ', bi = 'び', bo = 'ぼ', bu = 'ぶ',
	bya = 'びゃ', byo = 'びょ', byu = 'びゅ',
	cha = 'ちゃ', che = 'ちぇ', chi = 'ち', cho = 'ちょ', chu = 'ちゅ',
	da = 'だ', de = 'で', di = 'でぃ', ['do'] = 'ど', du = { 'づ', 'どぅ' },
	dya = 'でゃ', dyo = 'でょ', dyu = 'でゅ',
	fa = 'ふぁ', fe = 'ふぇ', fi = 'ふぃ', fo = 'ふぉ',
	fya = 'ふゃ', fyo = 'ふょ', fyu = 'ふゅ',
	ga = 'が', ge = 'げ', gi = 'ぎ', go = 'ご', gu = 'ぐ',
	gwa = 'ぐぁ', gwe = 'ぐぇ', gwi = 'ぐぃ', gwo = 'ぐぉ', gya = 'ぎゃ', gyo = 'ぎょ', gyu = 'ぎゅ',
	ha = 'は', he = 'へ', hi = 'ひ', ho = 'ほ', hu = 'ふ',
	hya = 'ひゃ', hyo = 'ひょ', hyu = 'ひゅ',
	ja = { 'じゃ', 'ぢゃ' }, je = 'じぇ', ji = { 'じ', 'ぢ' }, jo = { 'じょ', 'ぢょ' }, ju = { 'じゅ', 'ぢゅ' },
	ka = 'か', ke = 'け', ki = 'き', ko = 'こ', ku = 'く',
	kwa = 'くぁ', kwe = 'くぇ', kwi = 'くぃ', kwo = 'くぉ', kya = 'きゃ', kyo = 'きょ', kyu = 'きゅ',
	ma = 'ま', me = 'め', mi = 'み', mo = 'も', mu = 'む',
	mya = 'みゃ', myo = 'みょ', myu = 'みゅ',
	na = 'な', ne = 'ね', ni = 'に', no = 'の', nu = 'ぬ',
	nya = 'にゃ', nyo = 'にょ', nyu = 'にゅ',
	pa = 'ぱ', pe = 'ぺ', pi = 'ぴ', po = 'ぽ', pu = 'ぷ',
	pya = 'ぴゃ', pyo = 'ぴょ', pyu = 'ぴゅ',
	ra = 'ら', re = 'れ', ri = 'り', ro = 'ろ', ru = 'る',
	rya = 'りゃ', ryo = 'りょ', ryu = 'りゅ',
	sa = 'さ', se = 'せ', si = 'し', so = 'そ', su = 'す',
	sha = 'しゃ', she = 'しぇ', shi = 'し', sho = 'しょ', shu = 'しゅ',
	ta = 'た', te = 'て', ti = 'てぃ', to = 'と',
	tha = 'てゃ', tho = 'てょ', thu = 'てゅ',
	tsa = 'つぁ', tse = 'つぇ', tsu = 'つ', tsi = 'つぃ', tso = 'つぉ',
	tu = 'つ',
	va = 'ゔぁ', ve = 'ゔぇ', vi = 'ゔぃ', vo = 'ゔぉ', vu = 'ゔぅ',
	vya = 'ゔゃ', vyo = 'ゔょ', vyu = 'ゔゅ',
	wa = 'わ', we = { 'うぇ', 'ゑ' }, wi = 'ゐ', wo = { 'を', 'うぉ' },
	ya = 'や', ye = 'いぇ', yo = 'よ', yu = 'ゆ',
	za = 'ざ', ze = 'ぜ', zo = 'ぞ', zu = 'ず'
}
local transK = { -- hiragana -> katakana; must stay one-to-one because it is also reversed
	['ん'] = 'ン', ['あ'] = 'ア', ['え'] = 'エ', ['い'] = 'イ', ['お'] = 'オ', ['う'] = 'ウ',
	['ぁ'] = 'ァ', ['ぃ'] = 'ィ', ['ぅ'] = 'ゥ', ['ぇ'] = 'ェ', ['ぉ'] = 'ォ',
	['ゃ'] = 'ャ', ['ゅ'] = 'ュ', ['ょ'] = 'ョ',
	['は'] = 'ハ', ['へ'] = 'ヘ', ['ひ'] = 'ヒ', ['ほ'] = 'ホ', ['ふ'] = 'フ',
	['ば'] = 'バ', ['べ'] = 'ベ', ['び'] = 'ビ', ['ぼ'] = 'ボ', ['ぶ'] = 'ブ',
	['ぱ'] = 'パ', ['ぺ'] = 'ペ', ['ぴ'] = 'ピ', ['ぽ'] = 'ポ', ['ぷ'] = 'プ',
	['た'] = 'タ', ['て'] = 'テ', ['ち'] = 'チ', ['と'] = 'ト', ['つ'] = 'ツ',
	['だ'] = 'ダ', ['で'] = 'デ', ['ぢ'] = 'ヂ', ['ど'] = 'ド', ['づ'] = 'ヅ',
	['か'] = 'カ', ['け'] = 'ケ', ['き'] = 'キ', ['こ'] = 'コ', ['く'] = 'ク',
	['が'] = 'ガ', ['げ'] = 'ゲ', ['ぎ'] = 'ギ', ['ご'] = 'ゴ', ['ぐ'] = 'グ',
	-- each m-row kana maps to its own katakana (previously め, み and む all mapped to マ)
	['ま'] = 'マ', ['め'] = 'メ', ['み'] = 'ミ', ['も'] = 'モ', ['む'] = 'ム',
	['な'] = 'ナ', ['ね'] = 'ネ', ['に'] = 'ニ', ['の'] = 'ノ', ['ぬ'] = 'ヌ',
	['ら'] = 'ラ', ['れ'] = 'レ', ['り'] = 'リ', ['ろ'] = 'ロ', ['る'] = 'ル',
	['さ'] = 'サ', ['せ'] = 'セ', ['し'] = 'シ', ['そ'] = 'ソ', ['す'] = 'ス',
	['ざ'] = 'ザ', ['ぜ'] = 'ゼ', ['じ'] = 'ジ', ['ぞ'] = 'ゾ', ['ず'] = 'ズ',
	['わ'] = 'ワ', ['ゑ'] = 'ヱ', ['ゐ'] = 'ヰ', ['を'] = 'ヲ',
	['や'] = 'ヤ', ['よ'] = 'ヨ', ['ゆ'] = 'ユ',
	['ゔ'] = 'ヴ', ['っ'] = 'ッ'
}
local correctionsFromKana = { -- manual transliteration choices for kana reachable from several romanizations
	['し'] = 'shi', -- "si" and "shi" both produce this kana
	['つ'] = 'tsu'  -- "tu" and "tsu" both produce this kana; pin it so the result does not depend on pairs() order
}
local longK = 'ー' -- katakana prolonged sound mark
local isolator = '\'' -- separates "n" from a following vowel / y-kana in latin text
local prolongRules = { -- special rules for prolonging syllables
	o = 'u',
	e = 'i'
}

-- Builds a reverse lookup table (value -> key) from `t`.
-- A value may be a plain value or a list of alternatives; every alternative in
-- the list is mapped back to the same key, however many there are.
-- If several keys share a value, the surviving key depends on pairs() order.
local function rev(t)
	local res = {}
	for k, v in pairs(t) do
		if type(v) == 'table' then
			-- ipairs handles any number of alternatives, including a single one
			for _, alt in ipairs(v) do
				res[alt] = k
			end
		else
			res[v] = k
		end
	end
	return res
end

-- Builds the default translation tables from transRaw / transK.
-- Returns three tables:
--   1. latin -> hiragana, using the first alternative of every syllable
--   2. hiragana -> latin, covering every alternative listed in transRaw
--   3. katakana -> hiragana (reverse of transK)
local function buildDefaultTransTables()
	local tr = {}

	for k, v in pairs(transRaw) do
		tr[k] = type(v) == 'table' and v[1] or v
	end

	-- Reverse transRaw itself rather than `tr`: alternate spellings (the second
	-- entry of a syllable's alternative list) must stay romanizable, otherwise
	-- kana produced after toggleChars() could not be converted back to latin.
	local rtr = rev(transRaw)

	-- apply corrections (fixes the choice where several syllables share a kana)
	for kana, latin in pairs(correctionsFromKana) do
		rtr[kana] = latin
	end

	return tr, rtr, rev(transK)
end

-- Decides which vowel prolongs the given latin vowel.
-- Returns the prolonging vowel (taken from prolongRules when a special rule
-- exists, otherwise the vowel itself), or nil when `c` is not a latin vowel.
local function prolong(c)
	local isVowel = false
	for idx = 1, #vowels do
		if vowels[idx] == c then
			isVowel = true
			break
		end
	end

	if not isVowel then return nil end
	return prolongRules[c] or c
end

-- Looks `c` up in the katakana vowel list.
-- Returns the latin vowel stored at the same index of `vowels`,
-- or nil when `c` is not one of the katakana vowels.
local function getWovelK(c)
	local idx = 1
	while vowelsK[idx] ~= nil do
		if vowelsK[idx] == c then
			return vowels[idx]
		end
		idx = idx + 1
	end
	return nil
end

-- Tells whether symbol `c` is listed in ambigousToN, i.e. whether it needs
-- to be isolated from a preceding "n". Always returns a boolean.
local function isAmbiguous(c)
	local found = false
	for _, candidate in ipairs(ambigousToN) do
		if candidate == c then
			found = true
			break
		end
	end
	return found
end

-- init translation tables
--   trans     : latin -> hiragana (currently selected alternative; mutated by toggleChars)
--   revTrans  : hiragana -> latin
--   revTransK : katakana -> hiragana
local trans, revTrans, revTransK = buildDefaultTransTables()

-- init default transliteration choices (everything default to first alternative)
-- NOTE(review): not referenced anywhere else in the visible file
local transChoices = {}

-- Tells whether the latin characters `a` and `b` form a doubled consonant
-- that may be written with a little tsu: both must be equal and the
-- consonant must be on the littleTsuWL whitelist.
local function isValidTsuCandidate(a, b)
	if a == b then
		for idx = 1, #littleTsuWL do
			if littleTsuWL[idx] == a then
				return true
			end
		end
	end
	return false
end

-- Checks whether `a` is a hiragana little tsu used correctly in front of `b`.
-- Returns the gemination consonant (first latin letter of b's romanization)
-- when it is whitelisted in littleTsuWL, otherwise nil.
-- The little tsu literal is real UTF-8 so it can equal a token from tokenize().
local function getGeminationConsonant(a, b)
	if a ~= 'っ' then return nil end -- disregard katakana, only hiragana is processed in romanization
	local tr = revTrans[b]
	if not tr then return nil end -- invalid hiragana character
	local fst = string.sub(tr, 1, 1) -- get first character of the transliteration
	for _, v in ipairs(littleTsuWL) do
		if fst == v then return fst end
	end
	return nil -- invalid gemination
end

-- Splits a UTF-8 string into a list of single-character tokens.
-- The length of each character is derived from its lead byte only;
-- continuation bytes are not validated. A lead byte of 0xF8 or above raises
-- an error. An empty string yields a list holding one empty token.
local function tokenize(utf8str)
	assert(type(utf8str) == 'string')

	local tokens = {}
	local pos, total = 1, #utf8str

	while pos <= total do
		local lead = string.byte(utf8str, pos)
		local width
		if lead < 0x80 then
			width = 1
		elseif lead < 0xE0 then
			width = 2
		elseif lead < 0xF0 then
			width = 3
		elseif lead < 0xF8 then
			width = 4
		else
			error('invalid UTF-8 character sequence')
		end

		-- string.sub clamps at the end of the string, so a truncated trailing
		-- sequence is emitted as-is
		tokens[#tokens + 1] = string.sub(utf8str, pos, pos + width - 1)
		pos = pos + width
	end

	if total == 0 then tokens[1] = '' end -- empty input still produces one (empty) token
	return tokens
end

-- PUBLIC API SECTION

-- Switches the kana used for each of the supplied syllables (whitespace-separated)
-- between the two alternatives listed in transRaw. Unknown syllables and
-- syllables without alternatives are left untouched.
function toggleChars(input)
	for syllable in string.gmatch(input, '%S+') do -- split by whitespaces
		local current = trans[syllable]
		if current then -- don't process unknown syllables
			local alternatives = transRaw[syllable]
			if type(alternatives) == 'table' then -- only process syllables with alternatives
				local pick = alternatives[1]
				if current == alternatives[1] and alternatives[2] then
					pick = alternatives[2]
				end
				trans[syllable] = pick
			end
		end
	end
end

-- Romanizes kana (hiragana or katakana) and prints the result with tex.print.
-- Symbols that cannot be transliterated are passed through unchanged.
-- Returns nothing; an empty input prints nothing.
function toLatin(input)
	if input == '' then return end

	local out = {} -- output fragments, joined once at the end
	local function emit(s) out[#out + 1] = s end

	local pending = {} -- look-ahead window of at most two hiragana symbols
	local lastVowel = 0 -- last letter of the previous transliteration, 0 when there is none

	for _, sym in ipairs(tokenize(input)) do
		pending[#pending + 1] = revTransK[sym] or sym -- convert all katakana to hiragana

		if #pending == 2 then -- a kana is formed by up to two symbols
			local first, second = pending[1], pending[2]
			local pair = revTrans[first .. second]
			local single = revTrans[first]
			local geminate = getGeminationConsonant(first, second)

			if pair ~= nil then -- both symbols form one syllable
				emit(pair)
				pending, lastVowel = {}, string.sub(pair, -1)
			else
				if geminate then -- little tsu doubles the next consonant
					emit(geminate)
					lastVowel = 0
				elseif single ~= nil then -- first symbol stands alone
					emit(single)
					if single == 'n' and isAmbiguous(second) then -- ambiguous symbol after "n"
						emit(isolator)
					end
					lastVowel = string.sub(single, -1)
				elseif first == longK and prolong(lastVowel) ~= nil then -- prolonging dash
					emit(prolong(lastVowel))
					lastVowel = 0
				else -- cannot transliterate, output as-is
					emit(first)
					lastVowel = 0
				end
				pending = {second} -- second symbol still has to be processed
			end
		end
	end

	if #pending == 1 then -- trailing symbol
		local sym = pending[1]
		local single = revTrans[sym]
		if single ~= nil then
			emit(single)
		elseif sym == longK and prolong(lastVowel) ~= nil then
			emit(prolong(lastVowel))
		else
			emit(sym)
		end
	end

	tex.print(table.concat(out))
end

-- Converts latin or katakana text to hiragana.
--   input : string to convert; an empty string makes the function return immediately
--   raw   : internal use; when truthy the result is returned as a string,
--           otherwise it is handed to tex.print and nothing is returned
-- Symbols that cannot be converted are copied to the output unchanged.
function toHiragana(input, raw)
	if input == '' then return end
	local littleTsu = 'っ' -- real UTF-8 hiragana little tsu, emitted for doubled consonants
	local tbl = tokenize(input)
	local buffer, res = {}, ''
	-- last    : last latin letter of the previous syllable (0 / nil when there is none)
	-- lastsym : previous standalone hiragana symbol that was copied through
	-- lastcnd : pending doubled consonant that may turn into a little tsu
	local t3, t2, t1, last, lastsym, lastcnd = '', '', '', 0, nil, nil

	for i, v in ipairs(tbl) do
		if revTransK[v] then v = revTransK[v] end -- translate katakana to hiragana on the go
		table.insert(buffer, v)

		if #buffer == 3 then -- a latin syllable has up to three letters, keep three in the buffer
			t3, t2, t1 = trans[ buffer[1] .. buffer[2] .. buffer[3] ], trans[ buffer[1] .. buffer[2] ], trans[ buffer[1] ]
			if t3 ~= nil then -- all three letters yield translation
				if lastcnd then -- add little tsu
					res = res .. littleTsu
					lastcnd = nil
				end
				res = res .. t3
				last = buffer[3]
				buffer = {}
			elseif t2 ~= nil then -- first two letters yield translation
				if lastcnd then -- add little tsu
					res = res .. littleTsu
					lastcnd = nil
				end
				res = res .. t2
				last = buffer[2]
				buffer = {buffer[3]}
			elseif isValidTsuCandidate(buffer[1], buffer[2]) then -- test little tsu candidates
				if lastcnd then res = res .. lastcnd end -- add last consonant in raw form
				lastcnd = buffer[1] -- set last candidate consonant
				last = 0 -- is not vowel
				buffer = {buffer[2], buffer[3]}
			elseif t1 ~= nil then -- first letter yields translation : a, e, i, o, u, n
				res = res .. t1
				last = buffer[1]
				buffer = {buffer[2], buffer[3]}
			elseif buffer[1] == longK and prolong(last) ~= nil then -- valid prolonger sign
				res = res .. trans[prolong(last)]
				buffer, last = {buffer[2], buffer[3]}, 0
			elseif buffer[1] == isolator then -- isolating apostrophe, consume it
				buffer = {buffer[2], buffer[3]}
			else
				if lastcnd then -- add last consonant in raw form
					res = res .. lastcnd
					lastcnd = nil
				end

				-- this code allows for proper conversion of katakana's prolongation dash to hiragana
				t1 = revTrans[ buffer[1] ]
				if t1 then -- symbol is standalone hiragana
					last = string.sub(t1, -1)
					lastsym = buffer[1]
				elseif lastsym then -- attempt to merge symbol with previous symbol
					t1 = revTrans[ lastsym .. buffer[1] ]
					if t1 then -- symbol is a valid non-standalone hiragana compound
						last = string.sub(t1, -1)
					else -- symbol is an invalid non-standalone hiragana compound
						last = nil -- prolong(nil) yields nil, so no prolongation follows
					end
					lastsym = nil
				else
					last, lastsym = 0, nil
				end
				
				res = res .. buffer[1]
				buffer = {buffer[2], buffer[3]}
			end
		end
	end

	if #buffer == 2 then -- two symbols left over after the input ran out
		if trans[ buffer[1] .. buffer[2] ] ~= nil then -- first two symbols yield translation
			if lastcnd then res = res .. littleTsu end -- add little tsu
			res = res .. trans[ buffer[1] .. buffer[2] ]
			last = buffer[2]
			buffer = {}
		elseif trans[ buffer[1] ] ~= nil then -- first symbol yields translation
			res = res .. trans[ buffer[1] ]
			last = buffer[1]
			buffer = {buffer[2]}
		elseif buffer[1] == longK and prolong(last) ~= nil then -- valid prolonger
			res = res .. trans[prolong(last)]
			buffer, last = {buffer[2]}, 0
		elseif buffer[1] == isolator then -- consume isolator
			buffer = {buffer[2]}
		else
			if lastcnd then res = res .. lastcnd end -- add last consonant in raw form

			-- this code allows for proper conversion of katakana's prolongation dash to hiragana
			t1 = revTrans[ buffer[1] ]
			if t1 then -- symbol is standalone hiragana
				last = string.sub(t1, -1)
				lastsym = buffer[1]
			elseif lastsym then -- attempt to merge symbol with previous symbol
				t1 = revTrans[ lastsym .. buffer[1] ]
				if t1 then -- symbol is a valid non-standalone hiragana compound
					last = string.sub(t1, -1)
				else -- symbol is an invalid non-standalone hiragana compound
					last = nil
				end
				lastsym = nil -- erase last valid symbol
			else
				last, lastsym = 0, nil
			end

			res = res .. buffer[1]
			buffer = {buffer[2]}
		end
	end

	if #buffer == 1 then -- remaining symbol
		if trans[ buffer[1] ] ~= nil then
			res = res .. trans[ buffer[1] ]
		elseif buffer[1] == longK and prolong(last) ~= nil then
			res = res .. trans[prolong(last)]
		elseif buffer[1] ~= isolator then -- a trailing isolator is dropped
			res = res .. buffer[1]
		end
	end

	if not raw then
		tex.print(res)
	else
		return res -- for internal use
	end
end

-- Converts latin or hiragana text to katakana and prints it with tex.print.
-- The input is first converted to hiragana, mapped symbol by symbol through
-- transK, and a vowel that merely prolongs the preceding syllable is replaced
-- by the prolonging dash (longK). Returns nothing; empty input prints nothing.
function toKatakana(input)
	if input == '' then return end

	-- convert everything to hiragana, then swap each known symbol for katakana
	local symbols = tokenize(toHiragana(input, true))
	for idx = 1, #symbols do
		local kata = transK[symbols[idx]]
		if kata ~= nil then
			symbols[idx] = kata
		end
	end

	-- insert prolonging symbols and prepare output
	local out = { symbols[1] }
	local previous = symbols[1]
	local pendingVowel = nil -- latin vowel that would prolong the previous syllable

	for idx = 2, #symbols do
		local current = symbols[idx]
		local currentVowel = getWovelK(current)

		if not pendingVowel then -- derive the prolonging vowel from the previous symbol
			local hira = revTransK[previous]
			local latin = hira and revTrans[hira]
			if latin then
				pendingVowel = prolong(string.sub(latin, -1))
			end
		end

		if pendingVowel then -- check the current symbol for a matching prolonger
			if pendingVowel == currentVowel then
				current = longK
				pendingVowel = nil
			elseif currentVowel then
				pendingVowel = prolong(currentVowel)
			else
				pendingVowel = nil
			end
		end

		-- previous and current may together form a single two-symbol syllable
		local hiraPrev, hiraCur = revTransK[previous], revTransK[current]
		if hiraPrev and hiraCur then
			local merged = revTrans[hiraPrev .. hiraCur]
			if merged then
				pendingVowel = prolong(string.sub(merged, -1))
			end
		end

		out[#out + 1] = current
		previous = current
	end

	tex.print(table.concat(out))
end