-- မော်ဂျူး:scripts/findBestScript
-- Documentation for this module may be created at မော်ဂျူး:scripts/findBestScript/doc
-- Lua pattern matching one UTF-8 encoded character: a lead byte followed by
-- any number of continuation bytes.
local UTF8_CHAR = "[\1-\127\194-\244][\128-\191]*"

-- Returns the number of UTF-8 characters in `str`. Counting pattern matches via
-- the second return value of string.gsub is faster than mw.ustring.len.
local function count_chars(str)
	local _, n = string.gsub(str, UTF8_CHAR, "")
	return n
end

--[=[
Picks, from the candidate `scripts`, the script object that best fits `text`.

Parameters:
  export      - table providing `getByCode(code)`; used here to fetch the
                "None" script that is returned when nothing matches.
  text        - string to analyse.
  lang        - not referenced by this implementation.
  scripts     - sequence of script objects. Each must expose a `_code` field
                and a `countCharacters(text)` method (presumably the number of
                characters of `text` belonging to that script - confirm against
                the script class). The caller's table is not modified.
  forceDetect - not referenced by this implementation.

Returns a script object: the first candidate that matches every counted
character, otherwise the candidate with the highest count, subject to the
Hant/Hans/Hani special cases described inline, otherwise the "None" script.
]=]
return function (export, text, lang, scripts, forceDetect)
	-- Build a reordered copy of the candidate list in which "Hant", "Hans" and
	-- "Hani" come last (in that order, if present), as they are a special case.
	local ordered, han_general, han_trad, han_simp = {}
	for _, script in ipairs(scripts) do
		local code = script._code
		if code == "Hani" then
			han_general = script
		elseif code == "Hant" then
			han_trad = script
		elseif code == "Hans" then
			han_simp = script
		else
			ordered[#ordered + 1] = script
		end
	end
	if han_trad then ordered[#ordered + 1] = han_trad end
	if han_simp then ordered[#ordered + 1] = han_simp end
	if han_general then ordered[#ordered + 1] = han_general end

	--[=[
	Remove any HTML entities; the catfix function in [[မော်ဂျူး:utilities]]
	adds tagging containing a no-break space entity ("&nbsp;"), whose name is
	made of Latin characters; hence Latin was returned as the script if "Latn"
	is one of the language's scripts. Numeric references ("&#160;", "&#xA0;")
	are stripped as well, since their digits and hex letters would otherwise
	be counted as text in the same way.
	]=]
	text = (string.gsub(text, "&#?[a-zA-Z0-9]+;", ""))

	-- Remove any spacing or punctuation characters, and get the resultant length.
	local reduced = mw.ustring.gsub(text, "[%s%p]+", "")
	local length = count_chars(reduced)
	-- If the length is 0 then we're probably dealing with a punctuation
	-- character, so only remove spacing characters, in case it is script-specific.
	if length == 0 then
		reduced = mw.ustring.gsub(text, "[%s]+", "")
		length = count_chars(reduced)
		if length == 0 then
			return export.getByCode("None")
		end
	end

	-- Try to match every script against the text, and keep track of the one
	-- with the most matching characters.
	local best_count, trad_count, simp_count = 0, 0, 0
	local best_script
	for _, script in ipairs(ordered) do
		local code = script._code
		local count = script:countCharacters(reduced)
		-- Special case for "Hani" (general Han), which is overridden by "Hant"
		-- (traditional) or "Hans" (simplified) if one of them matched more
		-- characters than the other. Otherwise, "Hani" would be selected in
		-- situations where exclusively traditional or simplified characters are
		-- used in strings with characters used in both varieties: "Hani" will
		-- match all CJK characters, while "Hant" and "Hans" will not match
		-- shared characters. This avoids having to include all shared
		-- characters (>90%) on both the traditional and simplified lists.
		-- Subject to the above, "Hant", "Hans" or "Hani" is returned if it
		-- matches at least one character, even if another script would
		-- otherwise be selected.
		if code == "Hani" then
			-- A non-zero count implies the corresponding script was present
			-- and already visited, because "Hani" is always last in `ordered`.
			if trad_count > simp_count then
				return han_trad
			elseif simp_count > trad_count then
				return han_simp
			elseif count > 0 then
				return script
			end
		end
		if count >= length then
			-- Every counted character belongs to this script: no need to look further.
			return script
		elseif code == "Hant" then
			trad_count = count
		elseif code == "Hans" then
			simp_count = count
		end
		if count > best_count then
			best_count = count
			best_script = script
		end
	end
	if best_script then
		return best_script
	end

	-- Final check for languages that have "Hant" or "Hans" but not "Hani", but
	-- which still have multiple scripts (e.g. Macau Pidgin Portuguese):
	-- characters which are not exclusively traditional or simplified will not
	-- be found by the main check, so a separate "Hani" check is necessary to
	-- see if Han characters are present. If successful, return "Hant" (or
	-- "Hans" when "Hant" is not a candidate). The "Hani" script is only loaded
	-- when one of those candidates exists, and its count is computed once.
	local han_variant = han_trad or han_simp
	if han_variant and require("Module:scripts").getByCode("Hani"):countCharacters(reduced) > 0 then
		return han_variant
	end

	-- No matching script was found. Return "None".
	return export.getByCode("None")
end