Merge pull request #53737 from bruvzg/icu_strip_diacritics
This commit is contained in:
commit
c24bdfb327
@ -1233,6 +1233,13 @@
|
||||
Aligns shaped text to the given tab-stops.
|
||||
</description>
|
||||
</method>
|
||||
<method name="strip_diacritics" qualifiers="const">
|
||||
<return type="String" />
|
||||
<argument index="0" name="string" type="String" />
|
||||
<description>
|
||||
Strips diacritics from the string.
|
||||
</description>
|
||||
</method>
|
||||
<method name="tag_to_name" qualifiers="const">
|
||||
<return type="String" />
|
||||
<argument index="0" name="tag" type="int" />
|
||||
|
@ -4924,6 +4924,39 @@ String TextServerAdvanced::percent_sign(const String &p_language) const {
|
||||
return "%";
|
||||
}
|
||||
|
||||
// Strips diacritics from `p_string` using ICU.
//
// The string is decomposed with the NFKD normalizer, then every code point whose
// canonical combining class is non-zero is dropped. If any ICU call fails, an
// error is logged and the table-based TextServer::strip_diacritics() result is
// returned instead.
String TextServerAdvanced::strip_diacritics(const String &p_string) const {
	// Nothing to strip. Handling this up front also avoids a spurious error:
	// for an empty input the zero-length result fits the zero-capacity pre-flight
	// buffer, so ICU does not report U_BUFFER_OVERFLOW_ERROR and the pre-flight
	// check below would treat the call as a failure.
	if (p_string.length() == 0) {
		return String();
	}

	UErrorCode err = U_ZERO_ERROR;

	// Get NFKD normalizer singleton.
	const UNormalizer2 *unorm = unorm2_getNFKDInstance(&err);
	ERR_FAIL_COND_V_MSG(U_FAILURE(err), TextServer::strip_diacritics(p_string), u_errorName(err));

	// Convert to UTF-16.
	Char16String utf16 = p_string.utf16();

	// Normalize.
	Char16String normalized;
	err = U_ZERO_ERROR;
	// Pre-flight call with a zero-capacity buffer: only the required length is
	// computed, and U_BUFFER_OVERFLOW_ERROR is the expected status.
	int32_t len = unorm2_normalize(unorm, utf16.ptr(), -1, nullptr, 0, &err);
	ERR_FAIL_COND_V_MSG(err != U_BUFFER_OVERFLOW_ERROR, TextServer::strip_diacritics(p_string), u_errorName(err));
	normalized.resize(len);
	err = U_ZERO_ERROR;
	// Second call writes the normalized text into the buffer sized above.
	unorm2_normalize(unorm, utf16.ptr(), -1, normalized.ptrw(), len, &err);
	ERR_FAIL_COND_V_MSG(U_FAILURE(err), TextServer::strip_diacritics(p_string), u_errorName(err));

	// Convert back to UTF-32. The length is passed explicitly, so the buffer
	// does not need to be NUL-terminated.
	String normalized_string = String::utf16(normalized.ptr(), len);

	// Strip combining characters: keep only code points with combining class 0.
	String result;
	for (int i = 0; i < normalized_string.length(); i++) {
		if (u_getCombiningClass(normalized_string[i]) == 0) {
			result += normalized_string[i];
		}
	}
	return result;
}
|
||||
|
||||
TextServerAdvanced::TextServerAdvanced() {
|
||||
_insert_num_systems_lang();
|
||||
_insert_feature_sets();
|
||||
|
@ -50,6 +50,7 @@
|
||||
#include <unicode/udata.h>
|
||||
#include <unicode/uiter.h>
|
||||
#include <unicode/uloc.h>
|
||||
#include <unicode/unorm2.h>
|
||||
#include <unicode/uscript.h>
|
||||
#include <unicode/ustring.h>
|
||||
#include <unicode/utypes.h>
|
||||
@ -501,6 +502,8 @@ public:
|
||||
virtual String parse_number(const String &p_string, const String &p_language = "") const override;
|
||||
virtual String percent_sign(const String &p_language = "") const override;
|
||||
|
||||
virtual String strip_diacritics(const String &p_string) const override;
|
||||
|
||||
TextServerAdvanced();
|
||||
~TextServerAdvanced();
|
||||
};
|
||||
|
@ -42,7 +42,7 @@ void TextServerManager::_bind_methods() {
|
||||
ClassDB::bind_method(D_METHOD("find_interface", "name"), &TextServerManager::find_interface);
|
||||
|
||||
ClassDB::bind_method(D_METHOD("set_primary_interface", "index"), &TextServerManager::set_primary_interface);
|
||||
ClassDB::bind_method(D_METHOD("get_primary_interface"), &TextServerManager::_get_primary_interface);
|
||||
ClassDB::bind_method(D_METHOD("get_primary_interface"), &TextServerManager::get_primary_interface);
|
||||
|
||||
ADD_SIGNAL(MethodInfo("interface_added", PropertyInfo(Variant::STRING_NAME, "interface_name")));
|
||||
ADD_SIGNAL(MethodInfo("interface_removed", PropertyInfo(Variant::STRING_NAME, "interface_name")));
|
||||
@ -118,10 +118,6 @@ Array TextServerManager::get_interfaces() const {
|
||||
return ret;
|
||||
}
|
||||
|
||||
Ref<TextServer> TextServerManager::_get_primary_interface() const {
|
||||
return primary_interface;
|
||||
}
|
||||
|
||||
void TextServerManager::set_primary_interface(const Ref<TextServer> &p_primary_interface) {
|
||||
if (p_primary_interface.is_null()) {
|
||||
print_verbose("TextServer: Clearing primary interface");
|
||||
@ -407,6 +403,8 @@ void TextServer::_bind_methods() {
|
||||
ClassDB::bind_method(D_METHOD("parse_number", "number", "language"), &TextServer::parse_number, DEFVAL(""));
|
||||
ClassDB::bind_method(D_METHOD("percent_sign", "language"), &TextServer::percent_sign, DEFVAL(""));
|
||||
|
||||
ClassDB::bind_method(D_METHOD("strip_diacritics", "string"), &TextServer::strip_diacritics);
|
||||
|
||||
/* Direction */
|
||||
BIND_ENUM_CONSTANT(DIRECTION_AUTO);
|
||||
BIND_ENUM_CONSTANT(DIRECTION_LTR);
|
||||
@ -1317,6 +1315,134 @@ void TextServer::shaped_text_draw_outline(RID p_shaped, RID p_canvas, const Vect
|
||||
}
|
||||
}
|
||||
|
||||
void TextServer::_diacritics_map_add(const String &p_from, char32_t p_to) {
|
||||
for (int i = 0; i < p_from.size(); i++) {
|
||||
diacritics_map[p_from[i]] = p_to;
|
||||
}
|
||||
}
|
||||
|
||||
// Rebuilds `diacritics_map` from scratch: clears it, then registers, for the
// Latin, Greek and Cyrillic groups below, each listed character under its base
// letter via _diacritics_map_add(). Each call maps every character of the first
// argument to the single character given as the second argument.
void TextServer::_init_diacritics_map() {
|
||||
diacritics_map.clear();
|
||||
|
||||
// Latin.
|
||||
_diacritics_map_add(U"ÀÁÂÃÄÅĀĂĄǍǞǠǺȀȂȦḀẠẢẤẦẨẪẬẮẰẲẴẶ", U'A');
|
||||
_diacritics_map_add(U"àáâãäåāăąǎǟǡǻȁȃȧḁẚạảấầẩẫậắằẳẵặ", U'a');
|
||||
_diacritics_map_add(U"ǢǼ", U'Æ');
|
||||
_diacritics_map_add(U"ǣǽ", U'æ');
|
||||
_diacritics_map_add(U"ḂḄḆ", U'B');
|
||||
_diacritics_map_add(U"ḃḅḇ", U'b');
|
||||
_diacritics_map_add(U"ÇĆĈĊČḈ", U'C');
|
||||
_diacritics_map_add(U"çćĉċčḉ", U'c');
|
||||
_diacritics_map_add(U"ĎḊḌḎḐḒ", U'D');
|
||||
_diacritics_map_add(U"ďḋḍḏḑḓ", U'd');
|
||||
_diacritics_map_add(U"ÈÉÊËĒĔĖĘĚȆȨḔḖḘḚḜẸẺẼẾỀỂỄỆ", U'E');
|
||||
_diacritics_map_add(U"èéêëēĕėęěȇȩḕḗḙḛḝẹẻẽếềểễệ", U'e');
|
||||
_diacritics_map_add(U"Ḟ", U'F');
|
||||
_diacritics_map_add(U"ḟ", U'f');
|
||||
_diacritics_map_add(U"ĜĞĠĢǦǴḠ", U'G');
|
||||
_diacritics_map_add(U"ĝğġģǧǵḡ", U'g');
|
||||
_diacritics_map_add(U"ĤȞḢḤḦḨḪ", U'H');
|
||||
_diacritics_map_add(U"ĥȟḣḥḧḩḫẖ", U'h');
|
||||
_diacritics_map_add(U"ÌÍÎÏĨĪĬĮİǏȈȊḬḮỈỊ", U'I');
|
||||
_diacritics_map_add(U"ìíîïĩīĭįıǐȉȋḭḯỉị", U'i');
|
||||
_diacritics_map_add(U"Ĵ", U'J');
|
||||
_diacritics_map_add(U"ĵ", U'j');
|
||||
_diacritics_map_add(U"ĶǨḰḲḴ", U'K');
|
||||
_diacritics_map_add(U"ķĸǩḱḳḵ", U'k');
|
||||
_diacritics_map_add(U"ĹĻĽĿḶḸḺḼ", U'L');
|
||||
_diacritics_map_add(U"ĺļľŀḷḹḻḽ", U'l');
|
||||
_diacritics_map_add(U"ḾṀṂ", U'M');
|
||||
_diacritics_map_add(U"ḿṁṃ", U'm');
|
||||
_diacritics_map_add(U"ÑŃŅŇǸṄṆṈṊ", U'N');
|
||||
// NOTE(review): the fifth entry below appears to be the two-code-point sequence
// U+02BC + 'n' rather than the precomposed U+0149 - confirm against the original
// file encoding.
_diacritics_map_add(U"ñńņňʼnǹṅṇṉṋ", U'n');
|
||||
_diacritics_map_add(U"ÒÓÔÕÖŌŎŐƠǑǪǬȌȎȪȬȮȰṌṎṐṒỌỎỐỒỔỖỘỚỜỞỠỢ", U'O');
|
||||
_diacritics_map_add(U"òóôõöōŏőơǒǫǭȍȏȫȭȯȱṍṏṑṓọỏốồổỗộớờởỡợ", U'o');
|
||||
_diacritics_map_add(U"ṔṖ", U'P');
|
||||
_diacritics_map_add(U"ṗṕ", U'p');
|
||||
_diacritics_map_add(U"ŔŖŘȐȒṘṚṜṞ", U'R');
|
||||
_diacritics_map_add(U"ŕŗřȑȓṙṛṝṟ", U'r');
|
||||
_diacritics_map_add(U"ŚŜŞŠȘṠṢṤṦṨ", U'S');
|
||||
_diacritics_map_add(U"śŝşšſșṡṣṥṧṩẛẜẝ", U's');
|
||||
_diacritics_map_add(U"ŢŤȚṪṬṮṰ", U'T');
|
||||
_diacritics_map_add(U"ţťțṫṭṯṱẗ", U't');
|
||||
_diacritics_map_add(U"ÙÚÛÜŨŪŬŮŰŲƯǓǕǗǙǛȔȖṲṴṶṸṺỤỦỨỪỬỮỰ", U'U');
|
||||
_diacritics_map_add(U"ùúûüũūŭůűųưǔǖǘǚǜȕȗṳṵṷṹṻụủứừửữự", U'u');
|
||||
_diacritics_map_add(U"ṼṾ", U'V');
|
||||
_diacritics_map_add(U"ṽṿ", U'v');
|
||||
_diacritics_map_add(U"ŴẀẂẄẆẈ", U'W');
|
||||
_diacritics_map_add(U"ŵẁẃẅẇẉẘ", U'w');
|
||||
_diacritics_map_add(U"ẊẌ", U'X');
|
||||
_diacritics_map_add(U"ẋẍ", U'x');
|
||||
_diacritics_map_add(U"ÝŶẎỲỴỶỸỾ", U'Y');
|
||||
_diacritics_map_add(U"ýÿŷẏẙỳỵỷỹỿ", U'y');
|
||||
_diacritics_map_add(U"ŹŻŽẐẒẔ", U'Z');
|
||||
_diacritics_map_add(U"źżžẑẓẕ", U'z');
|
||||
|
||||
// Greek.
|
||||
_diacritics_map_add(U"ΆἈἉἊἋἌἍἎἏᾈᾉᾊᾋᾌᾍᾎᾏᾸᾹᾺΆᾼ", U'Α');
|
||||
_diacritics_map_add(U"άἀἁἂἃἄἅἆἇὰάᾀᾁᾂᾃᾄᾅᾆᾇᾰᾱᾲᾳᾴᾶᾷ", U'α');
|
||||
_diacritics_map_add(U"ΈἘἙἚἛἜἝῈΈ", U'Ε');
|
||||
_diacritics_map_add(U"έἐἑἒἓἔἕὲέ", U'ε');
|
||||
_diacritics_map_add(U"ΉἨἩἪἫἬἭἮἯᾘᾙᾚᾛᾜᾝᾞᾟῊΉῌ", U'Η');
|
||||
_diacritics_map_add(U"ήἠἡἢἣἤἥἦἧὴήᾐᾑᾒᾓᾔᾕᾖᾗῂῃῄῆῇ", U'η');
|
||||
_diacritics_map_add(U"ΊΪἸἹἺἻἼἽἾἿῘῙῚΊ", U'Ι');
|
||||
_diacritics_map_add(U"ίΐϊἰἱἲἳἴἵἶἷὶίῐῑῒΐῖῗ", U'ι');
|
||||
_diacritics_map_add(U"ΌὈὉὊὋὌὍῸΌ", U'Ο');
|
||||
_diacritics_map_add(U"όὀὁὂὃὄὅὸό", U'ο');
|
||||
_diacritics_map_add(U"Ῥ", U'Ρ');
|
||||
_diacritics_map_add(U"ῤῥ", U'ρ');
|
||||
_diacritics_map_add(U"ΎΫϓϔὙὛὝὟῨῩῪΎ", U'Υ');
|
||||
_diacritics_map_add(U"ΰϋύὐὑὒὓὔὕὖὗὺύῠῡῢΰῦῧ", U'υ');
|
||||
_diacritics_map_add(U"ΏὨὩὪὫὬὭὮὯᾨᾩᾪᾫᾬᾭᾮᾯῺΏῼ", U'Ω');
|
||||
_diacritics_map_add(U"ώὠὡὢὣὤὥὦὧὼώᾠᾡᾢᾣᾤᾥᾦᾧῲῳῴῶῷ", U'ω');
|
||||
|
||||
// Cyrillic.
|
||||
_diacritics_map_add(U"ӐӒ", U'А');
|
||||
_diacritics_map_add(U"ӑӓ", U'а');
|
||||
_diacritics_map_add(U"ЀЁӖ", U'Е');
|
||||
_diacritics_map_add(U"ѐёӗ", U'е');
|
||||
_diacritics_map_add(U"Ӛ", U'Ә');
|
||||
_diacritics_map_add(U"ӛ", U'ә');
|
||||
_diacritics_map_add(U"Ӝ", U'Ж');
|
||||
_diacritics_map_add(U"ӝ", U'ж');
|
||||
_diacritics_map_add(U"Ӟ", U'З');
|
||||
_diacritics_map_add(U"ӟ", U'з');
|
||||
_diacritics_map_add(U"Ѓ", U'Г');
|
||||
_diacritics_map_add(U"ѓ", U'г');
|
||||
_diacritics_map_add(U"Ї", U'І');
|
||||
_diacritics_map_add(U"ї", U'і');
|
||||
_diacritics_map_add(U"ЍӢӤЙ", U'И');
|
||||
_diacritics_map_add(U"ѝӣӥй", U'и');
|
||||
_diacritics_map_add(U"Ќ", U'К');
|
||||
_diacritics_map_add(U"ќ", U'к');
|
||||
_diacritics_map_add(U"Ӧ", U'О');
|
||||
_diacritics_map_add(U"ӧ", U'о');
|
||||
_diacritics_map_add(U"Ӫ", U'Ө');
|
||||
_diacritics_map_add(U"ӫ", U'ө');
|
||||
_diacritics_map_add(U"Ӭ", U'Э');
|
||||
_diacritics_map_add(U"ӭ", U'э');
|
||||
_diacritics_map_add(U"ЎӮӰӲ", U'У');
|
||||
_diacritics_map_add(U"ўӯӱӳ", U'у');
|
||||
_diacritics_map_add(U"Ӵ", U'Ч');
|
||||
_diacritics_map_add(U"ӵ", U'ч');
|
||||
_diacritics_map_add(U"Ӹ", U'Ы');
|
||||
_diacritics_map_add(U"ӹ", U'ы');
|
||||
}
|
||||
|
||||
// Table-based diacritics stripping.
//
// Returns a copy of `p_string` in which code points in the inclusive range
// U+02B0..U+036F are dropped, characters present in `diacritics_map` are
// replaced by their mapped value, and everything else is copied unchanged.
String TextServer::strip_diacritics(const String &p_string) const {
	String stripped;
	for (int idx = 0; idx < p_string.length(); idx++) {
		const char32_t ch = p_string[idx];
		if (ch >= 0x02B0 && ch <= 0x036F) {
			continue; // Skip combining diacritics.
		}
		// Use the mapped base character when one is registered, otherwise keep the original.
		const char32_t replacement = diacritics_map.has(ch) ? diacritics_map[ch] : ch;
		stripped += replacement;
	}
	return stripped;
}
|
||||
|
||||
Array TextServer::_shaped_text_get_glyphs_wrapper(RID p_shaped) const {
|
||||
Array ret;
|
||||
|
||||
@ -1393,6 +1519,7 @@ Array TextServer::_shaped_text_get_ellipsis_glyphs_wrapper(RID p_shaped) const {
|
||||
}
|
||||
|
||||
TextServer::TextServer() {
|
||||
_init_diacritics_map();
|
||||
}
|
||||
|
||||
TextServer::~TextServer() {
|
||||
|
@ -194,6 +194,10 @@ protected:
|
||||
Vector<Glyph> glyphs_logical;
|
||||
};
|
||||
|
||||
Map<char32_t, char32_t> diacritics_map;
|
||||
void _diacritics_map_add(const String &p_from, char32_t p_to);
|
||||
void _init_diacritics_map();
|
||||
|
||||
static void _bind_methods();
|
||||
|
||||
public:
|
||||
@ -427,6 +431,8 @@ public:
|
||||
virtual String parse_number(const String &p_string, const String &p_language = "") const { return p_string; };
|
||||
virtual String percent_sign(const String &p_language = "") const { return "%"; };
|
||||
|
||||
virtual String strip_diacritics(const String &p_string) const;
|
||||
|
||||
TextServer();
|
||||
~TextServer();
|
||||
};
|
||||
@ -509,7 +515,6 @@ public:
|
||||
_FORCE_INLINE_ Ref<TextServer> get_primary_interface() const {
|
||||
return primary_interface;
|
||||
}
|
||||
Ref<TextServer> _get_primary_interface() const;
|
||||
void set_primary_interface(const Ref<TextServer> &p_primary_interface);
|
||||
|
||||
TextServerManager();
|
||||
|
@ -265,6 +265,29 @@ TEST_SUITE("[[TextServer]") {
|
||||
font.clear();
|
||||
}
|
||||
}
|
||||
|
||||
SUBCASE("[TextServer] Strip Diacritics") {
|
||||
for (int i = 0; i < TextServerManager::get_singleton()->get_interface_count(); i++) {
|
||||
Ref<TextServer> ts = TextServerManager::get_singleton()->get_interface(i);
|
||||
TEST_FAIL_COND(ts.is_null(), "Invalid TS interface.");
|
||||
|
||||
if (ts->has_feature(TextServer::FEATURE_SHAPING)) {
|
||||
CHECK(ts->strip_diacritics(U"ٱلسَّلَامُ عَلَيْكُمْ") == U"ٱلسلام عليكم");
|
||||
}
|
||||
|
||||
CHECK(ts->strip_diacritics(U"pêches épinards tomates fraises") == U"peches epinards tomates fraises");
|
||||
CHECK(ts->strip_diacritics(U"ΆΈΉΊΌΎΏΪΫϓϔ") == U"ΑΕΗΙΟΥΩΙΥΥΥ");
|
||||
CHECK(ts->strip_diacritics(U"άέήίΐϊΰϋόύώ") == U"αεηιιιυυουω");
|
||||
CHECK(ts->strip_diacritics(U"ЀЁЃ ЇЌЍӢӤЙ ЎӮӰӲ ӐӒӖӚӜӞ ӦӪ Ӭ Ӵ Ӹ") == U"ЕЕГ ІКИИИИ УУУУ ААЕӘЖЗ ОӨ Э Ч Ы");
|
||||
CHECK(ts->strip_diacritics(U"ѐёѓ їќѝӣӥй ўӯӱӳ ӑӓӗӛӝӟ ӧӫ ӭ ӵ ӹ") == U"еег ікииии уууу ааеәжз оө э ч ы");
|
||||
CHECK(ts->strip_diacritics(U"ÀÁÂÃÄÅĀĂĄÇĆĈĊČĎÈÉÊËĒĔĖĘĚĜĞĠĢĤÌÍÎÏĨĪĬĮİĴĶĹĻĽÑŃŅŇŊÒÓÔÕÖØŌŎŐƠŔŖŘŚŜŞŠŢŤÙÚÛÜŨŪŬŮŰŲƯŴÝŶŹŻŽ") == U"AAAAAAAAACCCCCDEEEEEEEEEGGGGHIIIIIIIIIJKLLLNNNNŊOOOOOØOOOORRRSSSSTTUUUUUUUUUUUWYYZZZ");
|
||||
CHECK(ts->strip_diacritics(U"àáâãäåāăąçćĉċčďèéêëēĕėęěĝğġģĥìíîïĩīĭįĵķĺļľñńņňŋòóôõöøōŏőơŕŗřśŝşšţťùúûüũūŭůűųưŵýÿŷźżž") == U"aaaaaaaaacccccdeeeeeeeeegggghiiiiiiiijklllnnnnŋoooooøoooorrrssssttuuuuuuuuuuuwyyyzzz");
|
||||
CHECK(ts->strip_diacritics(U"ǍǏȈǑǪǬȌȎȪȬȮȰǓǕǗǙǛȔȖǞǠǺȀȂȦǢǼǦǴǨǸȆȐȒȘȚȞȨ Ḁ ḂḄḆ Ḉ ḊḌḎḐḒ ḔḖḘḚḜ Ḟ Ḡ ḢḤḦḨḪ ḬḮ ḰḲḴ ḶḸḺḼ ḾṀṂ ṄṆṈṊ ṌṎṐṒ ṔṖ ṘṚṜṞ ṠṢṤṦṨ ṪṬṮṰ ṲṴṶṸṺ") == U"AIIOOOOOOOOOUUUUUUUAAAAAAÆÆGGKNERRSTHE A BBB C DDDDD EEEEE F G HHHHH II KKK LLLL MMM NNNN OOOO PP RRRR SSSSS TTTT UUUUU");
|
||||
CHECK(ts->strip_diacritics(U"ǎǐȉȋǒǫǭȍȏȫȭȯȱǔǖǘǚǜȕȗǟǡǻȁȃȧǣǽǧǵǩǹȇȑȓșțȟȩ ḁ ḃḅḇ ḉ ḋḍḏḑḓ ḟ ḡ ḭḯ ḱḳḵ ḷḹḻḽ ḿṁṃ ṅṇṉṋ ṍṏṑṓ ṗṕ ṙṛṝṟ ṡṣṥṧṩ ṫṭṯṱ ṳṵṷṹṻ") == U"aiiiooooooooouuuuuuuaaaaaaææggknerrsthe a bbb c ddddd f g ii kkk llll mmm nnnn oooo pp rrrr sssss tttt uuuuu");
|
||||
CHECK(ts->strip_diacritics(U"ṼṾ ẀẂẄẆẈ ẊẌ Ẏ ẐẒẔ") == U"VV WWWWW XX Y ZZZ");
|
||||
CHECK(ts->strip_diacritics(U"ṽṿ ẁẃẅẇẉ ẋẍ ẏ ẑẓẕ ẖ ẗẘẙẛ") == U"vv wwwww xx y zzz h twys");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}; // namespace TestTextServer
|
||||
|
Loading…
Reference in New Issue
Block a user