မော်ဂျူး:scripts/findBestScript

Documentation for this module may be created at မော်ဂျူး:scripts/findBestScript/doc
-- Lua pattern that matches one character of valid UTF-8 text: a lead byte
-- followed by any continuation bytes. The NUL byte is deliberately not matched.
local UTF8_CHAR = "[\1-\127\194-\244][\128-\191]*"

-- Returns the number of UTF-8 characters in `str`, obtained as the match count
-- reported by string.gsub. The original author notes that this is faster than
-- mw.ustring.len.
local function utf8_length(str)
	local _, n = string.gsub(str, UTF8_CHAR, "")
	return n
end

--- Picks, from a list of candidate scripts, the one that best matches `text`.
--
-- Each candidate is asked how many characters of the text it covers (via its
-- `countCharacters` method); a candidate covering the whole text is returned
-- immediately, otherwise the one with the highest non-zero count wins.
-- "Hant", "Hans" and "Hani" receive special treatment (see the comments in the
-- body).
--
-- @param export      table exposing `getByCode(code)`; used here to obtain the "None" script.
-- @param text        string to analyse. HTML entities, whitespace and (normally) punctuation are ignored.
-- @param lang        accepted for interface compatibility; not read by this function.
-- @param scripts     sequence of script objects, each with a `_code` field and a `:countCharacters(text)` method.
--                    The caller's table is not modified; a reordered copy is used internally.
-- @param forceDetect accepted for interface compatibility; not read by this function.
-- @return one of the objects from `scripts`, or `export.getByCode("None")` when nothing matches.
return function (export, text, lang, scripts, forceDetect)
	-- Build a reordered copy of the list in which "Hant", "Hans" and "Hani"
	-- (in that order, when present) come last, because the main loop below
	-- relies on having seen Hant and Hans before it reaches Hani.
	local ordered = {}
	local hant, hans, hani
	for _, script in ipairs(scripts) do
		local code = script._code
		if code == "Hani" then
			hani = script
		elseif code == "Hant" then
			hant = script
		elseif code == "Hans" then
			hans = script
		else
			ordered[#ordered + 1] = script
		end
	end
	if hant then ordered[#ordered + 1] = hant end
	if hans then ordered[#ordered + 1] = hans end
	if hani then ordered[#ordered + 1] = hani end
	scripts = ordered

	-- Remove any HTML entities. The catfix function in [[မော်ဂျူး:utilities]]
	-- adds tagging to a no-break space (&nbsp;), which contains Latin
	-- characters; without this step Latin was returned as the script whenever
	-- "Latn" was one of the language's scripts.
	text = string.gsub(text, "&[a-zA-Z0-9]+;", "")

	-- Strip spacing and punctuation, then measure what is left.
	local reducedText = mw.ustring.gsub(text, "[%s%p]+", "")
	local length = utf8_length(reducedText)

	-- Nothing left: the text was probably pure punctuation, which may itself
	-- be script-specific, so retry while stripping only the spacing.
	if length == 0 then
		reducedText = mw.ustring.gsub(text, "[%s]+", "")
		length = utf8_length(reducedText)

		if length == 0 then
			return export.getByCode("None")
		end
	end

	-- Try every script against the text and remember the one with the most
	-- matching characters.
	local best_count, hant_count, hans_count = 0, 0, 0
	local best_script

	for _, script in ipairs(scripts) do
		local count = script:countCharacters(reducedText)
		local code = script._code

		-- Special case for "Hani" (general Han): it is overridden by "Hant"
		-- (traditional) or "Hans" (simplified) when one of them matched more
		-- characters than the other. "Hani" matches all CJK characters, while
		-- "Hant"/"Hans" only match characters exclusive to their variety, so
		-- without this rule "Hani" would always beat them. Subject to that,
		-- Hant, Hans or Hani is returned as soon as at least one character
		-- matches, even if another script has a higher count.
		if code == "Hani" then
			if hant_count > hans_count then
				return hant
			elseif hans_count > hant_count then
				return hans
			elseif count > 0 then
				return script
			end
		end

		if count >= length then
			-- Every remaining character belongs to this script.
			return script
		elseif code == "Hant" then
			hant_count = count
		elseif code == "Hans" then
			hans_count = count
		end

		if count > best_count then
			best_count, best_script = count, script
		end
	end

	if best_script then
		return best_script
	end

	-- Final check for languages that list "Hant" or "Hans" but not "Hani", yet
	-- still have several scripts (e.g. Macau Pidgin Portuguese): characters
	-- shared by both varieties are not found by the main loop, so a separate
	-- "Hani" count decides whether Han characters are present. Hant is ordered
	-- before Hans above, so `hant or hans` is the first such entry in the list.
	-- The Hani script is only loaded when one of them actually exists.
	local han_fallback = hant or hans
	if han_fallback and require("Module:scripts").getByCode("Hani"):countCharacters(reducedText) > 0 then
		return han_fallback
	end

	-- No matching script was found. Return "None".
	return export.getByCode("None")
end