-- Kana Parser lua engine

-- latin vowels; prolong() treats only these letters as prolongable
local vowels = {'a', 'e', 'i', 'o', 'u'}
-- katakana vowels, index-aligned with `vowels` (getWovelK returns vowels[i] for vowelsK[i])
local vowelsK = {'ア', 'エ', 'イ', 'オ', 'ウ'}
-- hiragana that would be ambiguous when directly following "n" in latin output;
-- toLatin inserts the isolator between "n" and any of these
local ambigousToN = {'あ', 'え', 'い', 'お', 'う', 'や', 'よ', 'ゆ'}
-- whitelist of latin consonants that may be doubled to express a little tsu (gemination)
local littleTsuWL = {'s', 't', 'k', 'p', 'c'}
-- Master transliteration table. Keys are latin syllables of one to three letters.
-- A value is either a single hiragana string or a table of two alternative spellings;
-- the first alternative is the default and toggleChars() switches between the two.
local transRaw = { -- latin -> hiragana
	n = 'ん', a = 'あ', e = 'え', i = 'い', o = 'お', u = 'う',
	ba = 'ば', be = 'べ', bi = 'び', bo = 'ぼ', bu = 'ぶ',
	bya = 'びゃ', byo = 'びょ', byu = 'びゅ',
	cha = 'ちゃ', che = 'ちぇ', chi = 'ち', cho = 'ちょ', chu = 'ちゅ',
	da = 'だ', de = 'で', di = 'でぃ', ['do'] = 'ど', du = { 'づ', 'どぅ' },
	dya = 'でゃ', dyo = 'でょ', dyu = 'でゅ',
	fa = 'ふぁ', fe = 'ふぇ', fi = 'ふぃ', fo = 'ふぉ',
	fya = 'ふゃ', fyo = 'ふょ', fyu = 'ふゅ',
	ga = 'が', ge = 'げ', gi = 'ぎ', go = 'ご', gu = 'ぐ',
	gwa = 'ぐぁ', gwe = 'ぐぇ', gwi = 'ぐぃ', gwo = 'ぐぉ', gya = 'ぎゃ', gyo = 'ぎょ', gyu = 'ぎゅ',
	ha = 'は', he = 'へ', hi = 'ひ', ho = 'ほ', hu = 'ふ',
	hya = 'ひゃ', hyo = 'ひょ', hyu = 'ひゅ',
	ja = { 'じゃ', 'ぢゃ' }, je = 'じぇ', ji = { 'じ', 'ぢ' }, jo = { 'じょ', 'ぢょ' }, ju = { 'じゅ', 'ぢゅ' },
	ka = 'か', ke = 'け', ki = 'き', ko = 'こ', ku = 'く',
	kwa = 'くぁ', kwe = 'くぇ', kwi = 'くぃ', kwo = 'くぉ', kya = 'きゃ', kyo = 'きょ',	kyu = 'きゅ',
	ma = 'ま', me = 'め', mi = 'み', mo = 'も', mu = 'む',
	mya = 'みゃ', myo = 'みょ', myu = 'みゅ',
	na = 'な', ne = 'ね', ni = 'に', no = 'の', nu = 'ぬ',
	nya = 'にゃ', nyo = 'にょ', nyu = 'にゅ',
	pa = 'ぱ', pe = 'ぺ', pi = 'ぴ', po = 'ぽ', pu = 'ぷ',
	pya = 'ぴゃ', pyo = 'ぴょ', pyu = 'ぴゅ',
	ra = 'ら', re = 'れ', ri = 'り', ro = 'ろ', ru = 'る',
	rya = 'りゃ', ryo = 'りょ', ryu = 'りゅ',
	sa = 'さ', se = 'せ', si = 'し',	so = 'そ', su = 'す',
	sha = 'しゃ', she = 'しぇ', shi = 'し', sho = 'しょ', shu = 'しゅ',
	ta = 'た', te = 'て', ti = 'てぃ', to = 'と',
	tha = 'てゃ', tho = 'てょ', thu = 'てゅ',
	tsa = 'つぁ', tse = 'つぇ', tsu = 'つ', tsi = 'つぃ', tso = 'つぉ',
	tu = 'つ',
	va = 'ゔぁ', ve = 'ゔぇ', vi = 'ゔぃ', vo = 'ゔぉ', vu = 'ゔぅ',
	vya = 'ゔゃ', vyo = 'ゔょ', vyu = 'ゔゅ',
	wa = 'わ', we = { 'うぇ', 'ゑ' }, wi = 'ゐ', wo = { 'を', 'うぉ' },
	ya = 'や', ye = 'いぇ', yo = 'よ', yu = 'ゆ',
	za = 'ざ', ze = 'ぜ', zo = 'ぞ', zu = 'ず'
}
-- Per-character hiragana -> katakana map.
-- The mapping must stay one-to-one: it is also reversed to obtain the
-- katakana -> hiragana table, and a duplicated value would make that reverse lookup ambiguous.
local transK = { -- hiragana -> katakana
	['ん'] = 'ン', ['あ'] = 'ア', ['え'] = 'エ', ['い'] = 'イ', ['お'] = 'オ', ['う'] = 'ウ',
	['ぁ'] = 'ァ', ['ぃ'] = 'ィ', ['ぅ'] = 'ゥ', ['ぇ'] = 'ェ', ['ぉ'] = 'ォ',
	['ゃ'] = 'ャ', ['ゅ'] = 'ュ', ['ょ'] = 'ョ',
	['は'] = 'ハ', ['へ'] = 'ヘ', ['ひ'] = 'ヒ', ['ほ'] = 'ホ', ['ふ'] = 'フ',
	['ば'] = 'バ', ['べ'] = 'ベ', ['び'] = 'ビ', ['ぼ'] = 'ボ', ['ぶ'] = 'ブ',
	['ぱ'] = 'パ', ['ぺ'] = 'ペ', ['ぴ'] = 'ピ', ['ぽ'] = 'ポ', ['ぷ'] = 'プ',
	['た'] = 'タ', ['て'] = 'テ', ['ち'] = 'チ', ['と'] = 'ト', ['つ'] = 'ツ',
	['だ'] = 'ダ', ['で'] = 'デ', ['ぢ'] = 'ヂ', ['ど'] = 'ド', ['づ'] = 'ヅ',
	['か'] = 'カ', ['け'] = 'ケ', ['き'] = 'キ', ['こ'] = 'コ', ['く'] = 'ク',
	['が'] = 'ガ', ['げ'] = 'ゲ', ['ぎ'] = 'ギ', ['ご'] = 'ゴ', ['ぐ'] = 'グ',
	-- each m-row kana maps to its own katakana (previously me, mi and mu all produced the katakana for "ma")
	['ま'] = 'マ', ['め'] = 'メ', ['み'] = 'ミ', ['も'] = 'モ', ['む'] = 'ム',
	['な'] = 'ナ', ['ね'] = 'ネ', ['に'] = 'ニ', ['の'] = 'ノ', ['ぬ'] = 'ヌ',
	['ら'] = 'ラ', ['れ'] = 'レ', ['り'] = 'リ', ['ろ'] = 'ロ', ['る'] = 'ル',
	['さ'] = 'サ', ['せ'] = 'セ', ['し'] = 'シ', ['そ'] = 'ソ', ['す'] = 'ス',
	['ざ'] = 'ザ', ['ぜ'] = 'ゼ', ['じ'] = 'ジ', ['ぞ'] = 'ゾ', ['ず'] = 'ズ',
	['わ'] = 'ワ', ['ゑ'] = 'ヱ', ['ゐ'] = 'ヰ', ['を'] = 'ヲ',
	['や'] = 'ヤ', ['よ'] = 'ヨ', ['ゆ'] = 'ユ',
	['ゔ'] = 'ヴ', ['っ'] = 'ッ'
}
-- Manual transliteration choices for kana that more than one latin spelling produces.
-- Without an entry here the kana -> latin table keeps whichever spelling pairs()
-- happened to visit last, so the romanization would not be stable between runs.
local correctionsFromKana = { -- manual transliteration choices
	['し'] = 'shi', -- produced by both "si" and "shi"
	['つ'] = 'tsu' -- produced by both "tu" and "tsu"
}
local longK = 'ー' -- katakana prolonging dash
local isolator = '\'' -- written after "n" when the next kana would otherwise be read as part of it
local prolongRules = { -- special rules for prolonging syllables
	o = 'u', -- "o" is prolonged by "u"
	e = 'i' -- "e" is prolonged by "i"
}

-- Builds a reverse lookup table (value -> key) from t.
-- A value may be a plain value or a list of alternatives; every alternative in the
-- list becomes a key pointing back at the original key. The list may have any length
-- (the previous version indexed exactly two entries and raised "table index is nil"
-- for a one-element list).
-- If the same value occurs under several keys, which key wins is unspecified because
-- pairs() gives no ordering guarantee.
-- @param t table to reverse
-- @return new table mapping each value / alternative to its key
local function rev(t)
	local res = {}
	for k, v in pairs(t) do
		if type(v) == 'table' then
			for _, alternative in ipairs(v) do
				res[alternative] = k
			end
		else
			res[v] = k
		end
	end
	return res
end

-- Builds the default translation tables from transRaw.
-- @return three tables, in this order:
--   1. latin -> hiragana, using the first alternative wherever transRaw lists several
--   2. hiragana -> latin, covering every alternative in transRaw, with correctionsFromKana applied on top
--   3. katakana -> hiragana (the reverse of transK)
local function buildDefaultTransTables()
	local tr = {}

	for k, v in pairs(transRaw) do
		tr[k] = type(v) == 'table' and v[1] or v
	end

	-- Reverse the raw table, not the flattened one: the non-default alternatives
	-- (the second spellings that toggleChars can switch to) must stay romanizable too.
	local rtr = rev(transRaw)

	-- apply corrections
	for kana, latin in pairs(correctionsFromKana) do
		rtr[kana] = latin
	end

	return tr, rtr, rev(transK)
end

-- Decides which vowel prolongs the given vowel.
-- @param c candidate latin letter
-- @return the prolonging vowel (taken from prolongRules when a rule exists, otherwise c itself),
--         or nil when c is not one of the latin vowels
local function prolong(c)
	for idx = 1, #vowels do
		if vowels[idx] == c then
			return prolongRules[c] or c
		end
	end
	return nil
end

-- Looks a token up among the katakana vowels.
-- @param c token to test
-- @return the latin vowel stored at the same index, or nil when c is not a katakana vowel
local function getWovelK(c)
	for idx = 1, #vowelsK do
		if vowelsK[idx] == c then
			return vowels[idx]
		end
	end
	return nil
end

-- Tells whether a symbol is listed as ambiguous when it follows "n".
-- @param c symbol to test
-- @return true when c is in ambigousToN, false otherwise
local function isAmbiguous(c)
	local found = false
	for idx = 1, #ambigousToN do
		if ambigousToN[idx] == c then
			found = true
			break
		end
	end
	return found
end

-- init translation tables
--   trans     : latin -> hiragana (entries with alternatives are switched by toggleChars)
--   revTrans  : hiragana -> latin
--   revTransK : katakana -> hiragana
local trans, revTrans, revTransK = buildDefaultTransTables()

-- init default transliteration choices (everything default to first alternative)
-- NOTE(review): transChoices is not read or written anywhere in the visible part of this file — confirm it is still needed
local transChoices = {}

-- Tells whether two consecutive characters may stand for a little tsu:
-- they must be identical and the letter must be on the gemination whitelist.
-- @param a first character
-- @param b second character
-- @return true or false
local function isValidTsuCandidate(a, b)
	if a == b then
		for idx = 1, #littleTsuWL do
			if littleTsuWL[idx] == a then return true end
		end
	end
	return false
end

-- Checks whether a is a hiragana little tsu correctly placed in front of b.
-- Only the hiragana form is recognised; callers convert katakana beforehand.
-- @param a character expected to be the little tsu
-- @param b character that follows it
-- @return the consonant to double (first letter of b's transliteration) when that
--         letter is whitelisted; nil in every other case
local function getGeminationConsonant(a, b)
	if a == 'っ' then
		local latin = revTrans[b]
		if latin then -- b is a known hiragana character
			local head = string.sub(latin, 1, 1)
			for idx = 1, #littleTsuWL do
				if littleTsuWL[idx] == head then return head end
			end
		end
	end
	return nil -- not a little tsu, unknown follower, or consonant not allowed to geminate
end

-- Splits a UTF-8 string into a list of characters (tokens).
-- The width of each character is taken from its lead byte only; continuation bytes
-- are not validated, and a sequence cut short by the end of the string is returned as-is.
-- An empty string yields a list holding a single empty token.
-- Raises an error for a non-string argument or a lead byte of 0xF8 and above.
-- @param utf8str string to split
-- @return array of tokens
local function tokenize(utf8str)
	assert(type(utf8str) == 'string')
	local tokens = {}
	local total = #utf8str
	local pos = 1

	while pos <= total do
		local lead = string.byte(utf8str, pos)
		local width
		if lead < 0x80 then
			width = 1
		elseif lead < 0xE0 then
			width = 2
		elseif lead < 0xF0 then
			width = 3
		elseif lead < 0xF8 then
			width = 4
		else
			error('invalid UTF-8 character sequence')
		end
		-- string.sub clamps at the end of the string, so a truncated tail is kept
		tokens[#tokens + 1] = string.sub(utf8str, pos, pos + width - 1)
		pos = pos + width
	end

	if total == 0 then tokens[1] = '' end
	return tokens
end

-- PUBLIC API SECTION

-- Switches the kana used for each supplied latin syllable to its other alternative.
-- Syllables are separated by whitespace; unknown syllables and syllables that have
-- no alternatives in transRaw are left untouched.
-- @param input whitespace-separated list of latin syllables
function toggleChars(input)
	for syllable in string.gmatch(input, '%S+') do
		local current = trans[syllable]
		local alternatives = current and transRaw[syllable]
		if type(alternatives) == 'table' then
			local first, second = alternatives[1], alternatives[2]
			trans[syllable] = current == first and second or first
		end
	end
end

-- Converts any kana (hiragana or katakana) to latin and prints the result with tex.print.
-- Katakana is mapped to hiragana first; characters without a known transliteration are
-- copied to the output unchanged. An empty input string produces no output at all.
-- @param input string to transliterate
function toLatin(input)
	if input == '' then return end

	local out = {} -- output fragments, joined once at the end
	local window = {} -- lookahead of at most two tokens, since one kana may span two characters
	local lastLetter = 0 -- final letter of the previous transliteration; 0 when a dash must not prolong it

	for _, token in ipairs(tokenize(input)) do
		window[#window + 1] = revTransK[token] or token -- convert katakana to hiragana

		if #window == 2 then
			local head, follower = window[1], window[2]
			local pair = revTrans[head .. follower]
			local geminate = getGeminationConsonant(head, follower)
			local single = revTrans[head]

			if pair ~= nil then -- both characters form a single kana
				out[#out + 1] = pair
				window, lastLetter = {}, string.sub(pair, -1)
			elseif geminate then -- little tsu doubles the next consonant
				out[#out + 1] = geminate
				window, lastLetter = { follower }, 0
			elseif single ~= nil then -- first character stands alone
				out[#out + 1] = single
				if single == 'n' and isAmbiguous(follower) then
					out[#out + 1] = isolator -- keep "n" apart from the following kana
				end
				window, lastLetter = { follower }, string.sub(single, -1)
			else
				-- a prolonging dash repeats the previous vowel; anything else passes through as-is
				local stretched = head == longK and prolong(lastLetter) or nil
				out[#out + 1] = stretched or head
				window, lastLetter = { follower }, 0
			end
		end
	end

	if #window == 1 then -- one character left over
		local tail = window[1]
		local single = revTrans[tail]
		if single ~= nil then
			out[#out + 1] = single
		else
			local stretched = tail == longK and prolong(lastLetter) or nil
			out[#out + 1] = stretched or tail
		end
	end

	tex.print(table.concat(out))
end

-- Converts latin or katakana input to hiragana.
-- The input is split into UTF-8 characters and scanned through a window of up to three
-- tokens, because the latin keys of `trans` are one to three letters long. Tokens that
-- cannot be converted are copied to the output unchanged.
-- @param input string to convert; for an empty string the function returns at once with no value
-- @param raw   for internal use; leave it blank to get the output sent to TeX
-- @return the converted string when `raw` is truthy, otherwise nothing (the result goes to tex.print)
function toHiragana(input, raw)
	if input == '' then return end
	local tbl = tokenize(input)
	local buffer, res = {}, ''
	-- t3/t2/t1 : translations of the first three / two / one buffered tokens
	-- last     : latin letter that ended the previous syllable, handed to prolong(); 0 or nil when nothing can be prolonged
	-- lastsym  : most recent standalone hiragana that was copied through unchanged
	-- lastcnd  : first letter of a doubled consonant still waiting to become a little tsu
	local t3, t2, t1, last, lastsym, lastcnd = '', '', '', 0, nil, nil

	for i, v in ipairs(tbl) do
		if revTransK[v] then v = revTransK[v] end -- translate katakana to hiragana on the go
		table.insert(buffer, v)

		if #buffer == 3 then
			t3, t2, t1 = trans[ buffer[1] .. buffer[2] .. buffer[3] ], trans[ buffer[1] .. buffer[2] ], trans[ buffer[1] ]
			if t3 ~= nil then -- all three letters yield translation
				if lastcnd then -- add little tsu
					res = res .. 'っ'
					lastcnd = nil
				end
				res = res .. t3
				last = buffer[3]
				buffer = {}
			elseif t2 ~= nil then -- first two letters yield translation
				if lastcnd then -- add little tsu
					res = res .. 'っ'
					lastcnd = nil
				end
				res = res .. t2
				last = buffer[2]
				buffer = {buffer[3]}
			elseif isValidTsuCandidate(buffer[1], buffer[2]) then -- test little tsu candidates
				if lastcnd then res = res .. lastcnd end -- add last consonant in raw form
				lastcnd = buffer[1] -- set last candidate consonant
				last = 0 -- is not vowel
				buffer = {buffer[2], buffer[3]}
			elseif t1 ~= nil then -- first letter yields translation : a, e, i, o, u, n
				res = res .. t1
				last = buffer[1]
				buffer = {buffer[2], buffer[3]}
			elseif buffer[1] == longK and prolong(last) ~= nil then -- valid prolonger sign
				res = res .. trans[prolong(last)]
				buffer, last = {buffer[2], buffer[3]}, 0
			elseif buffer[1] == isolator then -- isolating apostrophe, consume it
				buffer = {buffer[2], buffer[3]}
			else
				-- the first token cannot be converted: copy it through unchanged
				if lastcnd then -- add last consonant in raw form
					res = res .. lastcnd
					lastcnd = nil
				end

				-- this code allows for proper conversion of katakana's prolongation dash to hiragana
				-- (it tracks the latin vowel of kana that are copied through, so a later dash can be resolved)
				t1 = revTrans[ buffer[1] ]
				if t1 then -- symbol is standalone hiragana
					last = string.sub(t1, -1)
					lastsym = buffer[1]
				elseif lastsym then -- attempt to merge symbol with previous symbol
					t1 = revTrans[ lastsym .. buffer[1] ]
					if t1 then -- symbol is a valid non-standalone hiragana compound
						last = string.sub(t1, -1)
					else -- symbol is an invalid non-standalone hiragana compound
						last = nil
					end
					lastsym = nil
				else
					last, lastsym = 0, nil
				end
				
				res = res .. buffer[1]
				buffer = {buffer[2], buffer[3]}
			end
		end
	end

	-- input exhausted with two tokens still buffered: same decisions as in the loop, minus the three-letter case
	if #buffer == 2 then
		if trans[ buffer[1] .. buffer[2] ] ~= nil then -- first two symbols yield translation
			if lastcnd then res = res .. 'っ' end -- add little tsu
			res = res .. trans[ buffer[1] .. buffer[2] ]
			last = buffer[2]
			buffer = {}
		elseif trans[ buffer[1] ] ~= nil then -- first symbol yields translation
			res = res .. trans[ buffer[1] ]
			last = buffer[1]
			buffer = {buffer[2]}
		elseif buffer[1] == longK and prolong(last) ~= nil then -- valid prolonger
			res = res .. trans[prolong(last)]
			buffer, last = {buffer[2]}, 0
		elseif buffer[1] == isolator then -- consume isolator
			buffer = {buffer[2]}
		else
			if lastcnd then res = res .. lastcnd end -- add last consonant in raw form

			-- this code allows for proper conversion of katakana's prolongation dash to hiragana
			t1 = revTrans[ buffer[1] ]
			if t1 then -- symbol is standalone hiragana
				last = string.sub(t1, -1)
				lastsym = buffer[1]
			elseif lastsym then -- attempt to merge symbol with previous symbol
				t1 = revTrans[ lastsym .. buffer[1] ]
				if t1 then -- symbol is a valid non-standalone hiragana compound
					last = string.sub(t1, -1)
				else -- symbol is an invalid non-standalone hiragana compound
					last = nil
				end
				lastsym = nil -- erase last valid symbol
			else
				last, lastsym = 0, nil
			end

			res = res .. buffer[1]
			buffer = {buffer[2]}
		end
	end

	if #buffer == 1 then -- remaining symbol
		if trans[ buffer[1] ] ~= nil then
			res = res .. trans[ buffer[1] ]
		elseif buffer[1] == longK and prolong(last) ~= nil then
			res = res .. trans[prolong(last)]
		elseif buffer[1] ~= isolator then -- a trailing isolator is dropped, anything else is copied through
			res = res .. buffer[1]
		end
	end

	if not raw then
		tex.print(res)
	else
		return res -- for internal use
	end
end

-- Converts latin or hiragana input to katakana and prints the result with tex.print.
-- The input is first turned into hiragana, each character is then swapped for its
-- katakana counterpart, and a vowel that merely lengthens the preceding syllable is
-- replaced by the prolonging dash. An empty input string produces no output at all.
-- @param input string to convert
function toKatakana(input)
	if input == '' then return end

	-- convert everything to hiragana, then swap in katakana character by character
	local tokens = tokenize(toHiragana(input, true))
	for idx = 1, #tokens do
		local kata = transK[tokens[idx]]
		if kata ~= nil then tokens[idx] = kata end
	end

	local out = { tokens[1] }
	local previous = tokens[1]
	local expected = nil -- latin vowel that would prolong the syllable read so far

	for idx = 2, #tokens do
		local current = tokens[idx]
		local vowel = getWovelK(current)

		if not expected then -- derive the prolonging vowel from the previous token
			local prevHira = revTransK[previous]
			local prevLatin = prevHira and revTrans[prevHira]
			if prevLatin then
				expected = prolong(string.sub(prevLatin, -1))
			end
		end

		if expected then
			if expected == vowel then -- current token only lengthens the syllable
				current = longK
				expected = nil
			elseif vowel then -- a different vowel starts its own prolongable syllable
				expected = prolong(vowel)
			else
				expected = nil
			end
		end

		-- previous and current may together form one kana, whose final vowel then counts
		local prevHira, curHira = revTransK[previous], revTransK[current]
		if prevHira and curHira then
			local merged = revTrans[prevHira .. curHira]
			if merged then
				expected = prolong(string.sub(merged, -1))
			end
		end

		out[#out + 1] = current
		previous = current
	end

	tex.print(table.concat(out))
end