-- Module:Ko-rm

-- based on Wiktionary [[wikt:Module:ko-translit]] [[wikt:Module:ko-pron]] [[wikt:Module:ko-pron/data]] [[wikt:Module:ko]]
local p = {}
local Ugsub = mw.ustring.gsub
local Umatch = mw.ustring.match
local Ufind = mw.ustring.find
local Usub = mw.ustring.sub
local Uchar = mw.ustring.char
local codepoint = mw.ustring.codepoint
local data = mw.loadData('Module:Ko-rm/data')
local lib = require('Module:Feature')
local CleanTT = require('Module:Tt').CleanTT
-- Irregular pronunciation correction (IPC) markers. Each key can be written in
-- the input text as `<key>`; p._main removes the marker and appends the
-- computed syllable position to the argument of the same name (args[key]).
--   offset: number added to that computed position (treated as 0 when absent)
--   alias:  alternative marker spellings that map to the same key
local IPCvals = {
	-- p.romanize: at a flagged position, a following initial ᄅ after final ᆫ is read as ᄂ
	['nn']		= {},
	-- p.romanize: at a flagged position, a following initial ᄋ before ᅵᅣᅧᅭᅲ is read as ᄂ
	['ni']		= {},
	-- p.romanize: at a flagged position, the final is replaced via data.boundary[final .. '-Ø'][1]
	['bcred']	= { offset = -1, alias = {'bd', 'bc'} }
}

--- Template-facing entry point.
-- Reads the invocation arguments through Module:Arguments (parent frame
-- first, with Template:Ko-rm registered as wrapper) and hands them to p._main.
-- @param frame frame object supplied by #invoke
-- @return whatever p._main returns, or '' when the first positional argument is missing
function p.main(frame)
	local getArgs = require('Module:Arguments').getArgs
	local options = {
		parentFirst = true,
		wrappers = {'Template:Ko-rm'}
	}
	local args = getArgs(frame, options)

	if args[1] then
		return p._main(args)
	end
	return ''
end

--- Finds the earliest irregular-pronunciation (IPC) marker in `str`.
-- Every key of IPCvals and each of its aliases is looked up as `<name>`; the
-- occurrence with the smallest character position wins.
-- @param str text to scan
-- @return marker  spelling that was found (canonical key or alias)
-- @return ref     canonical IPCvals key the marker belongs to
-- @return pos     character position of the marker's '<'
-- @return off     numeric offset of the entry (0 when it has none)
--         All four values are nil when no marker is present.
local function find_first_ipc(str)
	local marker, ref, pos, off
	local function consider(tag, name, entry)
		local at = Ufind(str, '<' .. tag .. '>')
		if at ~= nil and (pos == nil or pos > at) then
			marker, ref, pos, off = tag, name, at, tonumber(entry.offset) or 0
		end
	end
	for name, entry in pairs(IPCvals) do
		consider(name, name, entry)
		if entry.alias then
			for _, alias in ipairs(entry.alias) do
				consider(alias, name, entry)
			end
		end
	end
	return marker, ref, pos, off
end

--- Romanizes args[1] and applies capitalization and punctuation clean-up.
-- @param args argument table. Fields read here:
--        [1]        text to romanize (strip markers are resolved first)
--        capi/sent  request capitalization via p.capitalizer (capi is passed
--                   on as its `all` flag)
--        nn/ni/bcred  comma-separated position lists; positions of inline
--                   `<marker>` tags found in the text are appended to them
--                   IN PLACE before p.romanize is called
-- @return the romanized string; '' when args[1] is missing, when the cleaned
--         text contains no Hangul syllable, or when p.romanize returns nil
function p._main(args)
	-- Direct callers may omit the text; behave like p.main instead of erroring.
	if not args[1] then return '' end

	local str = CleanTT(mw.text.unstrip(args[1]))
	-- mw.logObject('input' .. args[1]) --debug
	-- mw.logObject('cleaned' .. str) --debug
	
	-- drop parenthesised runs of Han characters (optionally containing a bold part)
	str = Ugsub(str, '%([一-鿿㐀-䶿𠀀-𮯯𰀀-𱍏]+%)', '')
	str = Ugsub(str, "%([一-鿿㐀-䶿𠀀-𮯯𰀀-𱍏]*'''[一-鿿㐀-䶿𠀀-𮯯𰀀-𱍏]+'''[一-鿿㐀-䶿𠀀-𮯯𰀀-𱍏]*%)", '')
	-- Han characters followed by a parenthesised text: keep only that text
	str = Ugsub(str, '[一-鿿㐀-䶿𠀀-𮯯𰀀-𱍏]+%((.-)%)', '%1')
	str = Ugsub(str, '<sup.-※.-</sup>', '')
	str = Ugsub(str, '<span.-title.->(.-)</span>', '%1')
	str = Ugsub(str, '<[%w%p]+:(.-)>', '%1') --for manual readings
	str = Ugsub(str, '<%->', '-') --for manual hyphenation
	str = Ugsub(str, '< >', ' ') --for manual spacing
	
	-- manual inserting of irregular pronunciation corrections (IPC):
	-- consume markers left to right, one per pass
	while true do
		local marker, ref, pos, off = find_first_ipc(str)
		if not marker then break end

		str = Ugsub(str, '<' .. marker .. '>', '', 1)
		-- Count only Hangul syllables and spaces up to and including the
		-- character that now sits where the marker was.
		local count = mw.ustring.len((Ugsub(Usub(str, 0, pos), '[^가-힣 ]', '')))
		args[ref] = (args[ref] or '-1') .. ',' .. tostring(count + off)
	end

	if not Umatch(str, '[가-힣]') then
		return ''
	end

	--pronunciation exception(s)
	str = Ugsub(str, '여덟', '여덜')
	str = Ugsub(str, 'Ⅰ', '일')
	str = Ugsub(str, 'Ⅱ', '이')
	str = Ugsub(str, 'Ⅲ', '삼')
	str = Ugsub(str, 'Ⅳ', '사')
	str = Ugsub(str, 'Ⅴ', '오')
	str = Ugsub(str, 'Ⅵ', '육')
	str = Ugsub(str, 'Ⅶ', '칠')
	str = Ugsub(str, 'Ⅷ', '팔')
	str = Ugsub(str, 'Ⅸ', '구')
	str = Ugsub(str, 'Ⅹ', '십')

	-- pre-romanization punctuation conversion
	str = Ugsub(str, '[《「『【]', '“')
	str = Ugsub(str, '[》」』】]', '”')

	local revised = p.romanize(str, args)

	if (not revised) then
		return ''
	end
	
	--mw.logObject(revised,'revised') --debug
	
	-- Text containing sentence punctuation: capitalize the first character and
	-- mark ('^') the letter following each sentence end for capitalization.
	if Umatch(revised, '[%.%?%!]') then
		revised = mw.ustring.upper(Usub(revised, 1, 1)) .. Usub(revised, 2, -1)
		revised = Ugsub(revised, "([%.%?%!]) ([a-z%'])", '%1 ^%2')
		revised = Ugsub(revised, "^%'%'%'", "'''^")
	end
	revised = Ugsub(revised, "([a-z])%-%'([a-z])", '%1-%2')
	-- move '^' behind a bold marker so it precedes the letter, then resolve it
	revised = Ugsub(revised, "%^%'%'%'", "'''^")
	revised = Ugsub(revised, '%^%l', mw.ustring.upper)
	revised = Ugsub(revised, '%^', '')
	revised = Ugsub(revised, "%-'''%-", "'''-")
	revised = Ugsub(revised, '%-%-', '-')
	
	--punctuation fixing
	revised = Ugsub(revised, '…', '...')
	revised = Ugsub(revised, '！', '!')
	revised = Ugsub(revised, '？', '?')
	revised = Ugsub(revised, '”([A-Za-z])', '”-%1')
	revised = Ugsub(revised, '(//[^/@]-@@[^/@]-@@//)%-?([A-Za-z])', '%1-%2')
	revised = Ugsub(revised, '[·・]', ' - ')
	revised = Ugsub(revised, '——', '⸺')
	revised = Ugsub(revised, '&mdash;&mdash;', '⸺')
	
	--secondary romanisation system
	-- Each //main@@alt@@// section is collapsed to its capitalized main form;
	-- when the alternative differs (ignoring case) it is exposed as a tooltip.
	-- Sections without an @@…@@ part simply lose their // delimiters.
	while revised:find('^.-//[^/]-//.-$') do
		local pre, dur, dur3, post = string.match(revised, '^(.-)//([^@/]-)@@([^@/]-)@@//(.-)$')
		if dur3 ~= nil then
			--mw.logObject(pre,'pre') mw.logObject(dur,'dur') mw.logObject(post,'post') --debug
			if mw.ustring.lower(dur3) ~= mw.ustring.lower(dur) then
				dur = '<span style="border-bottom-width:1px; border-bottom-style:dotted; border-bottom-color:rgb(128, 128, 128); cursor:help;" title="Spelled: ' .. p.capitalizer(dur3, true):gsub('"','&quot;') .. '">' .. p.capitalizer(dur, true) .. '</span>'
			else
				dur = p.capitalizer(dur, true)
			end
		else
			pre, dur, post = string.match(revised, '^(.-)//([^/]-)//(.-)$')
		end
		revised = pre .. dur .. post
	end
	
	--all case (|capi=1) or sentence case (|sent=1)
	if (args.capi or args.sent) then
		revised = p.capitalizer(revised, (args.capi or nil))
	end
	
	--post-capitalization punctuation fixing
	revised = Ugsub(revised, "”'", '”-')
	revised = Ugsub(revised, '[“”]', '"')
	-- separate letters from adjacent digits with a hyphen
	revised = Ugsub(revised, '([%a])(%d+)', '%1-%2')
	revised = Ugsub(revised, '(%d+)([%a])', '%1-%2')
	revised = Ugsub(revised, '(%d+)-[Pp]x', '%1px') --lazy fix for accidental hyphenation of pixel amounts
	
	return revised
end

--- Romanizes every Hangul run contained in `text_param`.
-- The text is scanned for runs made of Hangul syllables, jamo, hyphens,
-- apostrophes, spaces and the control characters / 「 」 ^. Each run is
-- decomposed with p.decompose_syllable, adjusted by the pronunciation rules
-- below, converted through the tables of Module:Ko-rm/data and substituted
-- back into the text (first occurrence of the run).
-- @param text_param text to romanize
-- @param args argument table; the optional fields nn, ni and bcred are
--        comma-separated lists of positions at which the respective
--        irregular-pronunciation rule applies
-- @return the text with its Hangul runs romanized, or nil (after logging)
--         when a final/initial combination has no entry in data.boundary
function p.romanize(text_param, args)
	-- P[name] is a set keyed by position (numeric when the item parses as a number)
	local P, optional_params = {}, { 'nn', 'ni', 'bcred' }
	for _, pm in ipairs(optional_params) do
		P[pm] = { }
		if args[pm] then
			for pp in lib.gsplit(args[pm], ',', {removeEmpty=true}) do P[pm][tonumber(pp) or pp] = 1 end
		end
	end
	--mw.logObject(P,'P') --debug
	
	-- T_index keeps counting across runs (it is never reset); it is the
	-- position the P sets are checked against. T_next_index is the matching
	-- position of the look-ahead syllable.
	local T_index, T_next_index = 0,0
	-- true while the scan is between an opening and a closing '//'
	local rom3 = false

	-- drop straight double quotes that are followed by another character
	text_param = Ugsub(text_param, '["](.)', '%1')

	for primitive_word in mw.ustring.gmatch(text_param, '[%-ᄀ-ᄒ' .. 'ᅡ-ᅵ' .. 'ᆨ-ᇂ' .. "ㄱ-ㅣ가-힣' /「」%^]+") do
		--mw.logObject(primitive_word,'primitive_word') --debug
		--mw.logObject(text_param,'text_param') --debug
		local the_original = primitive_word
		-- Bold markers (''') are swapped for a single placeholder character so
		-- their positions can be recorded, then removed from the run.
		primitive_word = Ugsub(primitive_word, "'''", 'ß')
		local bold_position, bold_count = {}, 0
		while Umatch(primitive_word, 'ß') do
			bold_position[(mw.ustring.find(primitive_word, 'ß')) + bold_count] = true
			primitive_word = Ugsub(primitive_word, 'ß', '', 1)
			bold_count = bold_count + 1
		end

		local word_set = { primitive_word }

		local word_set_romanisations = {}
		for _, respelling in ipairs(word_set) do
			--mw.logObject(word_set,'word_set') --debug
			--mw.logObject(respelling,'respelling') --debug
			local decomposed_syllables = p.decompose_syllable(respelling)
			--mw.logObject(decomposed_syllables,'decomposed_syllables') --debug
			local romanisation = {}
			-- pieces of the alternative rendering collected while rom3 is set
			local romanisation3 = {}
			local bold_insert_count = 0
			-- index 0 is a virtual "before the first character" step, so the
			-- junction leading into the first syllable is emitted as well
			for index = 0, #decomposed_syllables, 1 do
				if index ~= 0 then T_index = T_index + 1 end
				local this_syllable_text = index ~= 0 and Usub(respelling, index, index) or ''
				local forced = ''
				--mw.logObject(this_syllable_text,'this_syllable_text_I') --debug
				-- Control characters are not part of decomposed_syllables:
				-- collect them in `forced` and shorten `respelling` by one
				-- character each so that its indices line up again.
				while Umatch(this_syllable_text, '[/「」^]') do
					forced = forced .. this_syllable_text
					respelling = Usub(respelling, 2, -1)
					this_syllable_text = index ~= 0 and Usub(respelling, index, index) or ''
				end
				--mw.logObject(forced,'forced') --debug
				-- '//' toggles the alternative-rendering section
				if (forced:find('//') and (not rom3)) then
					rom3 = true
				elseif forced:find('//') then
					rom3 = false
				end
				--mw.logObject(this_syllable_text,'this_syllable_text_F') --debug
				if this_syllable_text == '-' then
					-- a hyphen right after a closed section: flush the
					-- alternative rendering as @@…@@ before the last piece
					if ((not rom3) and #romanisation3 > 0) then
						table.remove(romanisation3)
						table.remove(romanisation3)
						table.insert(romanisation, #romanisation-(#romanisation>0 and 1 or 0),  '@@' .. Ugsub(table.concat(romanisation3), "[^A-Za-z\"]$", '') .. '@@')
						romanisation3 = {}
						table.insert(romanisation, #romanisation-(#romanisation>0 and 1 or 0), forced)
					end
				else
					T_next_index = T_index
					local syllable = decomposed_syllables[index] or { initial = 'Ø', vowel = 'Ø', final = 'X' }
					-- look ahead to the next non-hyphen character
					local next_index = index
					local next_syllable_text
					local saw_hyphen_after = false
					while true do
						next_index = next_index + 1
						T_next_index = T_next_index + 1
						next_syllable_text = next_index > #decomposed_syllables and '' or Usub(respelling, next_index, next_index)
						if next_syllable_text ~= '-' then
							break
						end
						saw_hyphen_after = true
					end
					local next_syllable = decomposed_syllables[next_index] or { initial = 'Ø', vowel = 'Ø', final = 'Ø' }
					syllable.final = data.FSC[syllable.final] or syllable.final

					-- syllable-specific final overrides
					if this_syllable_text == '넓' then
						if Umatch(next_syllable.initial, '[ᄌᄉ]') then
							syllable.final = 'ᆸ'

						elseif next_syllable.initial == 'ᄃ' then
							if Umatch(next_syllable.vowel, '[^ᅡᅵ]') then
								syllable.final = 'ᆸ'
							end
						end
					end
					
					local vowel = data.vowels[syllable.vowel][2]
					
					-- user-flagged irregular pronunciations (see P above);
					-- changes to next_syllable persist into its own iteration
					if P.nn[T_next_index] and Umatch(syllable.final .. next_syllable.initial, 'ᆫᄅ') then
						next_syllable.initial = 'ᄂ'
					end

					if P.ni[T_next_index] and next_syllable.initial == 'ᄋ' and  Umatch(next_syllable.vowel, '[ᅵᅣᅧᅭᅲ]') then
						next_syllable.initial = 'ᄂ'
					end

					if P.bcred[T_index] then
						syllable.final = data.boundary[syllable.final .. '-Ø'][1]
					end

					if index ~= 0 and this_syllable_text == '밟' and not
						Umatch(next_syllable.initial, '[ᄋᄒ]') then
							syllable.final = 'ᆸ'
					end

					if Umatch(this_syllable_text, '[닭뷁삵슭앍줅찱칡탉흙]') and not
						Umatch(next_syllable.initial .. ';' .. next_syllable.vowel, 'ᄋ;[ᅦᅧᅳᅴᅵ]') then
							syllable.final = 'ᆨ'
					end

					-- before 없 the final is reduced to a single consonant
					if next_syllable_text == '없' then
						if Umatch(syllable.final, '[ᆩᆪᆰᆿ]') then
							syllable.final = 'ᆨ'
						elseif Umatch(syllable.final, '[ᆬᆭ]') then
							syllable.final = 'ᆫ'
						elseif Umatch(syllable.final, '[ᆺᆻᆽᆾᇀ]') then
							syllable.final = 'ᆮ'
						elseif Umatch(syllable.final, '[ᆲᆳᆴᆶ]') then
							syllable.final = 'ᆯ'
						elseif syllable.final == 'ᆱ' then
							syllable.final = 'ᆷ'
						elseif Umatch(syllable.final, '[ᆵᆹᇁ]') then
							syllable.final = 'ᆸ'
						end
					end

					-- final/initial rewrites before the vowels ᅵ and ᅧ, skipped
					-- where the bcred rule was requested
					if (not P.bcred[T_index]) then
						if Umatch(syllable.final .. next_syllable.initial, 'ᇀᄋ') then
							if Umatch(next_syllable.vowel, '[ᅵᅧ]') then
								syllable.final = 'ᆾ'
							end

						elseif Umatch(syllable.final .. next_syllable.initial, 'ᆴᄋ') then
							if Umatch(next_syllable.vowel, '[ᅵᅧ]') then
								syllable.final = 'ᆯ'
								next_syllable.initial = 'ᄎ'
							end

						elseif Umatch(syllable.final .. next_syllable.initial, 'ᆮᄋ') then
							if Umatch(next_syllable.vowel, '[ᅵᅧ]') then
								syllable.final = 'ᆽ'
							end

						elseif Umatch(syllable.final .. next_syllable.initial, 'ᆮᄒ') then
							if Umatch(next_syllable.vowel, '[ᅵᅧ]') then
								syllable.final = 'ᆾ'
								next_syllable.initial = 'ᄋ'
							end
						end
					end

					if syllable.final .. next_syllable.initial == 'ᆺᄋ' and not
						Umatch(next_syllable_text, '[아았어었에으은을음읍의이인일임입있]') then
							syllable.final = 'ᆮ'
					end

					-- look up how this final joins the next initial
					local bound = syllable.final .. '-' .. next_syllable.initial
					if (not data.boundary[bound]) then
						mw.log('No boundary data for ' .. bound .. '.')
						return nil
					end
					
					-- entry [2] is "final;initial" for the main rendering,
					-- entry [3] (when present) the one used inside //…//
					local junction = data.boundary[bound][2]
					local junction3 = data.boundary[bound][3] or data.boundary[bound][2]
					--mw.logObject(junction, 'junction') --debug
					--mw.logObject(junction3, 'junction3') --debug
					
					-- re-insert a bold marker recorded for this position,
					-- either before the final part or between final and initial
					if bold_position[index + bold_insert_count + 1] then
						junction = Ugsub(junction, '^.*$', function(matched)
							local a, b = string.match(matched, '^(ng);(.*)$')
							if ((not a) and (not b)) then a, b = string.match(matched, '^(.?%-?);(.*)$') end
							return Umatch(syllable.final .. next_syllable.initial, '^Ø?[ᄀ-ᄒ]$')
								and "'''" .. (a or '') .. ';' .. (b or '')
								or (a or '') .. "'''" .. ';' .. (b or '') end)

						bold_insert_count = bold_insert_count + 1
					end

					local final_cons, initial_cons = Umatch(junction, '^(.*);(.*)$')
					
					--special romanisation
					if rom3 then
						-- start the alternative rendering with the initial
						-- consonant already emitted for this syllable
						if (#romanisation3 == 0 and #romanisation > 0) then
							table.insert(romanisation3, romanisation[#romanisation])
						end
						local final_cons3, initial_cons3 = Umatch(junction3, '^(.*);(.*)$')
						table.insert(romanisation3, vowel)
						table.insert(romanisation3, final_cons3)
						table.insert(romanisation3, (saw_hyphen_after and '-' or ''))
						table.insert(romanisation3, initial_cons3)
					elseif ((not rom3) and #romanisation3 > 0) then
						-- section just closed: drop the trailing hyphen slot and
						-- next-initial piece, then emit the rest as @@…@@
						table.remove(romanisation3)
						table.remove(romanisation3)
						table.insert(romanisation, #romanisation-(#romanisation>0 and 1 or 0),  '@@' .. Ugsub(table.concat(romanisation3), "[^A-Za-z\"]$", '') .. '@@')
						romanisation3 = {}
					end
					
					-- control characters go in front of the previously emitted
					-- initial consonant; then vowel, final, hyphen, next initial
					table.insert(romanisation, #romanisation-(#romanisation>0 and 1 or 0), forced)
					table.insert(romanisation, vowel)
					table.insert(romanisation, final_cons)
					table.insert(romanisation, (saw_hyphen_after and '-' or ''))
					table.insert(romanisation, initial_cons)
					
					--straggler characters at end of word set
					if index == #decomposed_syllables and lib.isNotEmpty(Usub(respelling, index+1, index+1)) then
						local N = Usub(respelling, index+1, #respelling)
						if (N:find('//') and #romanisation3 > 0) then
							table.remove(romanisation3)
							table.remove(romanisation3)
							table.insert(romanisation, #romanisation-(#romanisation>0 and 1 or 0),  '@@' .. Ugsub(table.concat(romanisation3), "[^A-Za-z\"]$", '') .. '@@')
							romanisation3 = {}
							table.insert(romanisation, N)
						else
							romanisation3 = {}
						end
					end
					--[[
					local currRom = { 
						syllable = syllable,
						vowel = vowel,
						final_cons = final_cons,
						initial_cons = initial_cons,
						totalRom = table.concat(romanisation),
						totalRom3 = table.concat(romanisation3)
					} --debug
					mw.logObject(currRom,'currRom') --debug
					--]]
				end
			end

			local temp_romanisation = table.concat(romanisation)
			--mw.logObject(temp_romanisation,'temp_romanisation') --debug
			-- Resolve '…' separators: keep an apostrophe only for letter pairs
			-- listed in data.AI. Two passes, because matches cannot overlap.
			for i = 1, 2 do
				temp_romanisation = Ugsub(temp_romanisation, '(.)…(.)', function(a, b)
					return a .. (data.AI[a .. b] and "'" or '') .. b end)
				temp_romanisation = Ugsub(temp_romanisation, "wo'e", 'woe')
				temp_romanisation = Ugsub(temp_romanisation, "yo'e", 'yoe')
				temp_romanisation = Ugsub(temp_romanisation, "we'o", 'weo')
				temp_romanisation = Ugsub(temp_romanisation, "we'u", 'weu')
				temp_romanisation = Ugsub(temp_romanisation, "ye'u", 'yeu')
				temp_romanisation = Ugsub(temp_romanisation, "yu'i", 'yui')
			end
			
			table.insert(word_set_romanisations, temp_romanisation)
		end

		-- swap the first occurrence of the original run for its romanization
		text_param = Ugsub(
			text_param,
			p.pattern_escape(the_original),
			table.concat(word_set_romanisations, '/'),
			1
		)
	end

	return text_param
end

--- Splits one character into its initial / vowel / final components.
-- A precomposed Hangul syllable (가-힣) is decomposed arithmetically into
-- conjoining jamo; the final is '' when the syllable has none. Any other
-- character is classified by the jamo range it matches, with 'Ø' filling the
-- components it does not provide.
-- @param syllable character to decompose
-- @return table with the fields initial, vowel and final
function p.decompose_jamo(syllable)
	if Umatch(syllable, '[가-힣]') then
		local cp = codepoint(syllable)
		-- NOTE(review): this fallback is positional, unlike every other
		-- return of this function — confirm whether any caller relies on it.
		if (not cp) then return { '', '', '' } end

		local offset = cp - 0xAC00
		local final_index = offset % 28
		local final = ''
		if final_index ~= 0 then
			final = Uchar(0x11A7 + final_index)
		end
		local vowel = Uchar(0x1161 + math.floor((offset % 588) / 28))
		local initial = Uchar(0x1100 + math.floor(offset / 588))
		return { initial = initial, vowel = vowel, final = final }
	end

	-- not a precomposed syllable: classify by jamo block
	if Umatch(syllable, '[ᄀ-ᄒ]') then
		return { initial = syllable, vowel = 'Ø', final = 'Ø' }
	end
	if Umatch(syllable, '[ᅡ-ᅵ]') then
		return { initial = 'Ø', vowel = syllable, final = 'Ø' }
	end
	if Umatch(syllable, '[ᆨ-ᇂ]') then
		return { initial = 'Ø', vowel = 'Ø', final = syllable }
	end
	if Umatch(syllable, '[ㄱ-ㆎ]') then
		return { initial = 'Ø', vowel = 'Ø', final = syllable }
	end
	-- anything else (spaces, punctuation, ...)
	return { initial = 'Ø', vowel = ' ', final = 'X' }
end

--- Escapes Lua pattern magic characters so the text matches literally.
-- @param text string to escape, or a table whose args[1] holds the string
-- @return the text with each of ^ $ ( ) % . [ ] * + - ? prefixed by '%'
function p.pattern_escape(text)
	local source = text
	if type(source) == 'table' then
		source = source.args[1]
	end
	local escaped = Ugsub(source, '([%^$()%%.%[%]*+%-?])', '%%%1')
	return escaped
end

--- Decomposes a word character by character.
-- The control characters / 「 」 ^ are skipped; every other character is
-- passed to p.decompose_jamo.
-- @param word string to decompose
-- @return array of the tables returned by p.decompose_jamo, in input order
function p.decompose_syllable(word)
	local parts = {}
	for char in mw.text.gsplit(word, '') do
		--mw.logObject(char,'syllable') --debug
		local is_control = Umatch(char, '[/「」%^]')
		if not is_control then
			parts[#parts + 1] = p.decompose_jamo(char)
		end
	end
	return parts
end

--- Capitalizes the first character following each "breaker" character.
-- The string is walked character by character with a pending-capitalization
-- flag that starts out set:
--   * a breaker (any character outside the allowed set, except the
--     apostrophe) sets the flag; a space keeps an already set flag;
--   * an underscore met while the flag is set is removed and clears the flag,
--     leaving the following character untouched;
--   * any other character met while the flag is set is upper-cased and
--     clears the flag.
-- @param str string to process; returned unchanged when lib.isNotEmpty(str) is false
-- @param all when not nil, whitespace and commas count as breakers too;
--        when nil they do not
-- @return the processed string
function p.capitalizer(str, all)
	if not lib.isNotEmpty(str) then
		return str
	end

	-- '%-' is the pattern escape for a literal hyphen inside the set; the
	-- string escape '\-' is rejected by Lua versions newer than 5.1.
	local breakers = (all ~= nil) and '[^A-Za-z%-"_#&]' or '[^A-Za-z%-"_,%s#&]'

	local chars = mw.text.split(str, '')
	--mw.logObject(chars,'str') --debug
	local cap = true
	for index = 1, #chars do
		local ch = chars[index]
		if (ch:find(breakers) and ch ~= "'") or (cap and ch == ' ') then
			cap = true
			--mw.logObject(ch,'skipped') --debug
		elseif cap and ch == '_' then
			cap = false
			chars[index] = ''
		elseif cap then
			chars[index] = mw.ustring.upper(ch)
			--mw.logObject(chars[index],'capped') --debug
			cap = false
		end
	end
	return table.concat(chars, '')
end

--- Removes the module's inline control markup from a string.
-- @param str text to clean
-- @return the cleaned text, or '' when lib.isEmpty(str) is true
function p.strip(str)
	if lib.isEmpty(str) then return '' end

	-- every IPC marker spelling (canonical names and aliases)
	local markers = {}
	for name, entry in pairs(IPCvals) do
		markers[name] = true
		if entry.alias then
			for _, alias in ipairs(entry.alias) do
				markers[alias] = true
			end
		end
	end
	-- returning nil leaves a non-marker <word> untouched
	local function drop_ipc(term)
		return markers[term] and '' or nil
	end

	local result = str
	-- unwrap //…// sections (given name specifier)
	result = Ugsub(result, '//(.-)//', '%1')
	-- capitalization marker
	result = Ugsub(result, '%^', '')
	-- single-character separators such as <-> or < >
	result = Ugsub(result, '<.>', '')
	-- manual IPC markers
	result = Ugsub(result, '<(%w+)>', drop_ipc)
	-- capitalization blacklister
	result = Ugsub(result, '_', '')
	-- manual readings <text:reading> are reduced to just the text
	result = Ugsub(result, '<([%w%p]+):.->', '%1')

	return result
end

return p