Merge pull request #53956 from bruvzg/icu_uax_31
This commit is contained in:
commit
b7346e5025
1456
core/string/char_range.inc
Normal file
1456
core/string/char_range.inc
Normal file
File diff suppressed because it is too large
Load Diff
@ -33,6 +33,26 @@
|
||||
|
||||
#include "core/typedefs.h"
|
||||
|
||||
#include "char_range.inc"
|
||||
|
||||
// Reports whether `c` lies inside one of the inclusive [start, end] ranges
// listed in the `xid_start` table. The table is walked from its first entry
// until an entry whose `start` field is 0, which acts as the terminator.
static _FORCE_INLINE_ bool is_unicode_identifier_start(char32_t c) {
	for (const auto *range = xid_start; range->start != 0; ++range) {
		const bool at_or_after_start = c >= range->start;
		const bool at_or_before_end = c <= range->end;
		if (at_or_after_start && at_or_before_end) {
			return true;
		}
	}
	return false;
}
|
||||
|
||||
// Reports whether `c` lies inside one of the inclusive [start, end] ranges
// listed in the `xid_continue` table. The table is walked from its first entry
// until an entry whose `start` field is 0, which acts as the terminator.
static _FORCE_INLINE_ bool is_unicode_identifier_continue(char32_t c) {
	for (const auto *range = xid_continue; range->start != 0; ++range) {
		const bool at_or_after_start = c >= range->start;
		const bool at_or_before_end = c <= range->end;
		if (at_or_after_start && at_or_before_end) {
			return true;
		}
	}
	return false;
}
|
||||
|
||||
// Reports whether `c` is one of the ASCII capital letters 'A' through 'Z'.
static _FORCE_INLINE_ bool is_ascii_upper_case(char32_t c) {
	const bool below_range = c < 'A';
	const bool above_range = c > 'Z';
	return !(below_range || above_range);
}
|
||||
|
@ -944,6 +944,21 @@
|
||||
Returns [code]true[/code] if locale is right-to-left.
|
||||
</description>
|
||||
</method>
|
||||
<method name="is_valid_identifier" qualifiers="const">
|
||||
<return type="bool" />
|
||||
<argument index="0" name="string" type="String" />
|
||||
<description>
|
||||
Returns [code]true[/code] if [code]string[/code] is a valid identifier.
|
||||
If the text server supports the [constant FEATURE_UNICODE_IDENTIFIERS] feature, a valid identifier must:
|
||||
- Conform to normalization form C.
|
||||
- Begin with a Unicode character of class XID_Start or [code]"_"[/code].
|
||||
- Contain only Unicode characters of class XID_Continue in the other positions.
|
||||
- Use UAX #31 recommended scripts only (mixed scripts are allowed).
|
||||
If the [constant FEATURE_UNICODE_IDENTIFIERS] feature is not supported, a valid identifier must:
|
||||
- Begin with a Unicode character of class XID_Start or [code]"_"[/code].
|
||||
- Contain only Unicode characters of class XID_Continue in the other positions.
|
||||
</description>
|
||||
</method>
|
||||
<method name="load_support_data">
|
||||
<return type="bool" />
|
||||
<argument index="0" name="filename" type="String" />
|
||||
@ -1713,7 +1728,10 @@
|
||||
TextServer supports locale dependent and context sensitive case conversion.
|
||||
</constant>
|
||||
<constant name="FEATURE_USE_SUPPORT_DATA" value="4096" enum="Feature">
|
||||
TextServer require external data file for some features.
|
||||
TextServer requires an external data file for some features, see [method load_support_data].
|
||||
</constant>
|
||||
<constant name="FEATURE_UNICODE_IDENTIFIERS" value="8192" enum="Feature">
|
||||
TextServer supports UAX #31 identifier validation, see [method is_valid_identifier].
|
||||
</constant>
|
||||
<constant name="CONTOUR_CURVE_TAG_ON" value="1" enum="ContourPointTag">
|
||||
Contour point is on the curve.
|
||||
|
@ -941,6 +941,13 @@
|
||||
Returns [code]true[/code] if locale is right-to-left.
|
||||
</description>
|
||||
</method>
|
||||
<method name="is_valid_identifier" qualifiers="virtual const">
|
||||
<return type="bool" />
|
||||
<argument index="0" name="string" type="String" />
|
||||
<description>
|
||||
Returns [code]true[/code] if [code]string[/code] is a valid identifier.
|
||||
</description>
|
||||
</method>
|
||||
<method name="load_support_data" qualifiers="virtual">
|
||||
<return type="bool" />
|
||||
<argument index="0" name="filename" type="String" />
|
||||
|
@ -346,6 +346,7 @@ bool TextServerAdvanced::has_feature(Feature p_feature) const {
|
||||
case FEATURE_FONT_VARIABLE:
|
||||
case FEATURE_CONTEXT_SENSITIVE_CASE_CONVERSION:
|
||||
case FEATURE_USE_SUPPORT_DATA:
|
||||
case FEATURE_UNICODE_IDENTIFIERS:
|
||||
return true;
|
||||
default: {
|
||||
}
|
||||
@ -5757,6 +5758,191 @@ PackedInt32Array TextServerAdvanced::string_get_word_breaks(const String &p_stri
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool TextServerAdvanced::is_valid_identifier(const String &p_string) const {
|
||||
enum UAX31SequenceStatus {
|
||||
SEQ_NOT_STARTED,
|
||||
SEQ_STARTED,
|
||||
SEQ_STARTED_VIR,
|
||||
SEQ_NEAR_END,
|
||||
};
|
||||
|
||||
const char32_t *str = p_string.ptr();
|
||||
int len = p_string.length();
|
||||
|
||||
if (len == 0) {
|
||||
return false; // Empty string.
|
||||
}
|
||||
|
||||
UErrorCode err = U_ZERO_ERROR;
|
||||
Char16String utf16 = p_string.utf16();
|
||||
const UNormalizer2 *norm_c = unorm2_getNFCInstance(&err);
|
||||
if (U_FAILURE(err)) {
|
||||
return false; // Failed to load normalizer.
|
||||
}
|
||||
bool isnurom = unorm2_isNormalized(norm_c, utf16.ptr(), utf16.length(), &err);
|
||||
if (U_FAILURE(err) || !isnurom) {
|
||||
return false; // Do not conform to Normalization Form C.
|
||||
}
|
||||
|
||||
UAX31SequenceStatus A1_sequence_status = SEQ_NOT_STARTED;
|
||||
UScriptCode A1_scr = USCRIPT_INHERITED;
|
||||
UAX31SequenceStatus A2_sequence_status = SEQ_NOT_STARTED;
|
||||
UScriptCode A2_scr = USCRIPT_INHERITED;
|
||||
UAX31SequenceStatus B_sequence_status = SEQ_NOT_STARTED;
|
||||
UScriptCode B_scr = USCRIPT_INHERITED;
|
||||
|
||||
for (int i = 0; i < len; i++) {
|
||||
err = U_ZERO_ERROR;
|
||||
UScriptCode scr = uscript_getScript(str[i], &err);
|
||||
if (U_FAILURE(err)) {
|
||||
return false; // Invalid script.
|
||||
}
|
||||
if (uscript_getUsage(scr) != USCRIPT_USAGE_RECOMMENDED) {
|
||||
return false; // Not a recommended script.
|
||||
}
|
||||
uint8_t cat = u_charType(str[i]);
|
||||
int32_t jt = u_getIntPropertyValue(str[i], UCHAR_JOINING_TYPE);
|
||||
|
||||
// UAX #31 section 2.3 subsections A1, A2 and B, check ZWNJ and ZWJ usage.
|
||||
switch (A1_sequence_status) {
|
||||
case SEQ_NEAR_END: {
|
||||
if ((A1_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A1_scr)) {
|
||||
return false; // Mixed script.
|
||||
}
|
||||
if (jt == U_JT_RIGHT_JOINING || jt == U_JT_DUAL_JOINING) {
|
||||
A1_sequence_status = SEQ_NOT_STARTED; // Valid end of sequence, reset.
|
||||
} else if (jt != U_JT_TRANSPARENT) {
|
||||
return false; // Invalid end of sequence.
|
||||
}
|
||||
} break;
|
||||
case SEQ_STARTED: {
|
||||
if ((A1_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A1_scr)) {
|
||||
A1_sequence_status = SEQ_NOT_STARTED; // Reset.
|
||||
} else {
|
||||
if (jt != U_JT_TRANSPARENT) {
|
||||
if (str[i] == 0x200C /*ZWNJ*/) {
|
||||
A1_sequence_status = SEQ_NEAR_END;
|
||||
continue;
|
||||
} else {
|
||||
A1_sequence_status = SEQ_NOT_STARTED; // Reset.
|
||||
}
|
||||
}
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if (A1_sequence_status == SEQ_NOT_STARTED) {
|
||||
if (jt == U_JT_LEFT_JOINING || jt == U_JT_DUAL_JOINING) {
|
||||
A1_sequence_status = SEQ_STARTED;
|
||||
A1_scr = scr;
|
||||
}
|
||||
};
|
||||
|
||||
switch (A2_sequence_status) {
|
||||
case SEQ_NEAR_END: {
|
||||
if ((A2_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A2_scr)) {
|
||||
return false; // Mixed script.
|
||||
}
|
||||
if (cat == U_UPPERCASE_LETTER || cat == U_LOWERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_MODIFIER_LETTER || cat == U_OTHER_LETTER) {
|
||||
A2_sequence_status = SEQ_NOT_STARTED; // Valid end of sequence, reset.
|
||||
} else if (cat != U_MODIFIER_LETTER || u_getCombiningClass(str[i]) == 0) {
|
||||
return false; // Invalid end of sequence.
|
||||
}
|
||||
} break;
|
||||
case SEQ_STARTED_VIR: {
|
||||
if ((A2_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A2_scr)) {
|
||||
A2_sequence_status = SEQ_NOT_STARTED; // Reset.
|
||||
} else {
|
||||
if (str[i] == 0x200C /*ZWNJ*/) {
|
||||
A2_sequence_status = SEQ_NEAR_END;
|
||||
continue;
|
||||
} else if (cat != U_MODIFIER_LETTER || u_getCombiningClass(str[i]) == 0) {
|
||||
A2_sequence_status = SEQ_NOT_STARTED; // Reset.
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case SEQ_STARTED: {
|
||||
if ((A2_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A2_scr)) {
|
||||
A2_sequence_status = SEQ_NOT_STARTED; // Reset.
|
||||
} else {
|
||||
if (u_getCombiningClass(str[i]) == 9 /*Virama Combining Class*/) {
|
||||
A2_sequence_status = SEQ_STARTED_VIR;
|
||||
} else if (cat != U_MODIFIER_LETTER) {
|
||||
A2_sequence_status = SEQ_NOT_STARTED; // Reset.
|
||||
}
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if (A2_sequence_status == SEQ_NOT_STARTED) {
|
||||
if (cat == U_UPPERCASE_LETTER || cat == U_LOWERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_MODIFIER_LETTER || cat == U_OTHER_LETTER) {
|
||||
A2_sequence_status = SEQ_STARTED;
|
||||
A2_scr = scr;
|
||||
}
|
||||
}
|
||||
|
||||
switch (B_sequence_status) {
|
||||
case SEQ_NEAR_END: {
|
||||
if ((B_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != B_scr)) {
|
||||
return false; // Mixed script.
|
||||
}
|
||||
if (u_getIntPropertyValue(str[i], UCHAR_INDIC_SYLLABIC_CATEGORY) != U_INSC_VOWEL_DEPENDENT) {
|
||||
B_sequence_status = SEQ_NOT_STARTED; // Valid end of sequence, reset.
|
||||
} else {
|
||||
return false; // Invalid end of sequence.
|
||||
}
|
||||
} break;
|
||||
case SEQ_STARTED_VIR: {
|
||||
if ((B_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != B_scr)) {
|
||||
B_sequence_status = SEQ_NOT_STARTED; // Reset.
|
||||
} else {
|
||||
if (str[i] == 0x200D /*ZWJ*/) {
|
||||
B_sequence_status = SEQ_NEAR_END;
|
||||
continue;
|
||||
} else if (cat != U_MODIFIER_LETTER || u_getCombiningClass(str[i]) == 0) {
|
||||
B_sequence_status = SEQ_NOT_STARTED; // Reset.
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case SEQ_STARTED: {
|
||||
if ((B_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != B_scr)) {
|
||||
B_sequence_status = SEQ_NOT_STARTED; // Reset.
|
||||
} else {
|
||||
if (u_getCombiningClass(str[i]) == 9 /*Virama Combining Class*/) {
|
||||
B_sequence_status = SEQ_STARTED_VIR;
|
||||
} else if (cat != U_MODIFIER_LETTER) {
|
||||
B_sequence_status = SEQ_NOT_STARTED; // Reset.
|
||||
}
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if (B_sequence_status == SEQ_NOT_STARTED) {
|
||||
if (cat == U_UPPERCASE_LETTER || cat == U_LOWERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_MODIFIER_LETTER || cat == U_OTHER_LETTER) {
|
||||
B_sequence_status = SEQ_STARTED;
|
||||
B_scr = scr;
|
||||
}
|
||||
}
|
||||
|
||||
if (u_hasBinaryProperty(str[i], UCHAR_PATTERN_SYNTAX) || u_hasBinaryProperty(str[i], UCHAR_PATTERN_WHITE_SPACE) || u_hasBinaryProperty(str[i], UCHAR_NONCHARACTER_CODE_POINT)) {
|
||||
return false; // Not a XID_Start or XID_Continue character.
|
||||
}
|
||||
if (i == 0) {
|
||||
if (!(cat == U_LOWERCASE_LETTER || cat == U_UPPERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_OTHER_LETTER || cat == U_MODIFIER_LETTER || cat == U_LETTER_NUMBER || str[0] == 0x2118 || str[0] == 0x212E || str[0] == 0x309B || str[0] == 0x309C || str[0] == 0x005F)) {
|
||||
return false; // Not a XID_Start character.
|
||||
}
|
||||
} else {
|
||||
if (!(cat == U_LOWERCASE_LETTER || cat == U_UPPERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_OTHER_LETTER || cat == U_MODIFIER_LETTER || cat == U_LETTER_NUMBER || cat == U_NON_SPACING_MARK || cat == U_COMBINING_SPACING_MARK || cat == U_DECIMAL_DIGIT_NUMBER || cat == U_CONNECTOR_PUNCTUATION || str[i] == 0x2118 || str[i] == 0x212E || str[i] == 0x309B || str[i] == 0x309C || str[i] == 0x1369 || str[i] == 0x1371 || str[i] == 0x00B7 || str[i] == 0x0387 || str[i] == 0x19DA || str[i] == 0x0E33 || str[i] == 0x0EB3 || str[i] == 0xFF9E || str[i] == 0xFF9F)) {
|
||||
return false; // Not a XID_Continue character.
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
TextServerAdvanced::TextServerAdvanced() {
|
||||
_insert_num_systems_lang();
|
||||
_insert_feature_sets();
|
||||
|
@ -702,6 +702,7 @@ public:
|
||||
virtual PackedInt32Array string_get_word_breaks(const String &p_string, const String &p_language = "") const override;
|
||||
|
||||
virtual String strip_diacritics(const String &p_string) const override;
|
||||
virtual bool is_valid_identifier(const String &p_string) const override;
|
||||
|
||||
virtual String string_to_upper(const String &p_string, const String &p_language = "") const override;
|
||||
virtual String string_to_lower(const String &p_string, const String &p_language = "") const override;
|
||||
|
@ -297,6 +297,7 @@ void TextServerExtension::_bind_methods() {
|
||||
GDVIRTUAL_BIND(percent_sign, "language");
|
||||
|
||||
GDVIRTUAL_BIND(strip_diacritics, "string");
|
||||
GDVIRTUAL_BIND(is_valid_identifier, "string");
|
||||
|
||||
GDVIRTUAL_BIND(string_get_word_breaks, "string", "language");
|
||||
|
||||
@ -1498,6 +1499,14 @@ String TextServerExtension::percent_sign(const String &p_language) const {
|
||||
return "%";
|
||||
}
|
||||
|
||||
bool TextServerExtension::is_valid_identifier(const String &p_string) const {
|
||||
bool ret;
|
||||
if (GDVIRTUAL_CALL(is_valid_identifier, p_string, ret)) {
|
||||
return ret;
|
||||
}
|
||||
return TextServer::is_valid_identifier(p_string);
|
||||
}
|
||||
|
||||
String TextServerExtension::strip_diacritics(const String &p_string) const {
|
||||
String ret;
|
||||
if (GDVIRTUAL_CALL(strip_diacritics, p_string, ret)) {
|
||||
|
@ -496,6 +496,9 @@ public:
|
||||
virtual PackedInt32Array string_get_word_breaks(const String &p_string, const String &p_language = "") const override;
|
||||
GDVIRTUAL2RC(PackedInt32Array, string_get_word_breaks, const String &, const String &);
|
||||
|
||||
virtual bool is_valid_identifier(const String &p_string) const override;
|
||||
GDVIRTUAL1RC(bool, is_valid_identifier, const String &);
|
||||
|
||||
virtual String string_to_upper(const String &p_string, const String &p_language = "") const override;
|
||||
virtual String string_to_lower(const String &p_string, const String &p_language = "") const override;
|
||||
GDVIRTUAL2RC(String, string_to_upper, const String &, const String &);
|
||||
|
@ -447,6 +447,7 @@ void TextServer::_bind_methods() {
|
||||
ClassDB::bind_method(D_METHOD("string_get_word_breaks", "string", "language"), &TextServer::string_get_word_breaks, DEFVAL(""));
|
||||
|
||||
ClassDB::bind_method(D_METHOD("strip_diacritics", "string"), &TextServer::strip_diacritics);
|
||||
ClassDB::bind_method(D_METHOD("is_valid_identifier", "string"), &TextServer::is_valid_identifier);
|
||||
|
||||
ClassDB::bind_method(D_METHOD("string_to_upper", "string", "language"), &TextServer::string_to_upper, DEFVAL(""));
|
||||
ClassDB::bind_method(D_METHOD("string_to_lower", "string", "language"), &TextServer::string_to_lower, DEFVAL(""));
|
||||
@ -545,6 +546,7 @@ void TextServer::_bind_methods() {
|
||||
BIND_ENUM_CONSTANT(FEATURE_FONT_VARIABLE);
|
||||
BIND_ENUM_CONSTANT(FEATURE_CONTEXT_SENSITIVE_CASE_CONVERSION);
|
||||
BIND_ENUM_CONSTANT(FEATURE_USE_SUPPORT_DATA);
|
||||
BIND_ENUM_CONSTANT(FEATURE_UNICODE_IDENTIFIERS);
|
||||
|
||||
/* FT Contour Point Types */
|
||||
BIND_ENUM_CONSTANT(CONTOUR_CURVE_TAG_ON);
|
||||
@ -1730,6 +1732,26 @@ Array TextServer::_shaped_text_get_ellipsis_glyphs_wrapper(const RID &p_shaped)
|
||||
return ret;
|
||||
}
|
||||
|
||||
bool TextServer::is_valid_identifier(const String &p_string) const {
|
||||
const char32_t *str = p_string.ptr();
|
||||
int len = p_string.length();
|
||||
|
||||
if (len == 0) {
|
||||
return false; // Empty string.
|
||||
}
|
||||
|
||||
if (!is_unicode_identifier_start(str[0])) {
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 1; i < len; i++) {
|
||||
if (!is_unicode_identifier_continue(str[i])) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Constructor; the only work visible here is the call to _init_diacritics_map().
TextServer::TextServer() {
|
||||
_init_diacritics_map();
|
||||
}
|
||||
|
@ -148,6 +148,7 @@ public:
|
||||
FEATURE_FONT_VARIABLE = 1 << 10,
|
||||
FEATURE_CONTEXT_SENSITIVE_CASE_CONVERSION = 1 << 11,
|
||||
FEATURE_USE_SUPPORT_DATA = 1 << 12,
|
||||
FEATURE_UNICODE_IDENTIFIERS = 1 << 13,
|
||||
};
|
||||
|
||||
enum ContourPointTag {
|
||||
@ -464,6 +465,7 @@ public:
|
||||
virtual PackedInt32Array string_get_word_breaks(const String &p_string, const String &p_language = "") const = 0;
|
||||
|
||||
virtual String strip_diacritics(const String &p_string) const;
|
||||
virtual bool is_valid_identifier(const String &p_string) const;
|
||||
|
||||
// Other string operations.
|
||||
virtual String string_to_upper(const String &p_string, const String &p_language = "") const = 0;
|
||||
|
@ -39,7 +39,7 @@
|
||||
|
||||
namespace TestTextServer {
|
||||
|
||||
TEST_SUITE("[[TextServer]") {
|
||||
TEST_SUITE("[TextServer]") {
|
||||
TEST_CASE("[TextServer] Init, font loading and shaping") {
|
||||
SUBCASE("[TextServer] Loading fonts") {
|
||||
for (int i = 0; i < TextServerManager::get_singleton()->get_interface_count(); i++) {
|
||||
@ -492,6 +492,27 @@ TEST_SUITE("[[TextServer]") {
|
||||
}
|
||||
}
|
||||
|
||||
SUBCASE("[TextServer] Unicode identifiers") {
	// Candidate strings paired with the result is_valid_identifier() is expected to give for them.
	struct IdentifierCase {
		const char32_t *text;
		bool expected;
	};
	static const IdentifierCase cases[] = {
		{ U"-30", false },
		{ U"100", false },
		{ U"10.1", false },
		{ U"10,1", false },
		{ U"1e2", false },
		{ U"1e-2", false },
		{ U"1e2e3", false },
		{ U"0xAB", false },
		{ U"AB", true },
		{ U"Test1", true },
		{ U"1Test", false },
		{ U"Test*1", false },
		{ U"test_testeT", true },
		{ U"test_tes teT", false },
		{ U"عَلَيْكُمْ", true },
		{ U"عَلَيْكُمْTest", true },
		{ U"ӒӖӚӜ", true },
		{ U"_test", true },
		{ U"ÂÃÄÅĀĂĄÇĆĈĊ", true },
	};
	// Identifiers that use ZWNJ / ZWJ (UAX #31 section 2.3); only checked on servers
	// that report FEATURE_UNICODE_IDENTIFIERS.
	static const char32_t *joiner_cases[] = {
		U"\u0646\u0627\u0645\u0647\u200C\u0627\u06CC",
		U"\u0D26\u0D43\u0D15\u0D4D\u200C\u0D38\u0D3E\u0D15\u0D4D\u0D37\u0D3F",
		U"\u0DC1\u0DCA\u200D\u0DBB\u0DD3",
	};

	for (int i = 0; i < TextServerManager::get_singleton()->get_interface_count(); i++) {
		Ref<TextServer> ts = TextServerManager::get_singleton()->get_interface(i);
		TEST_FAIL_COND(ts.is_null(), "Invalid TS interface.");

		for (const IdentifierCase &entry : cases) {
			const String candidate = String(entry.text);
			CHECK(ts->is_valid_identifier(candidate) == entry.expected);
		}

		if (!ts->has_feature(TextServer::FEATURE_UNICODE_IDENTIFIERS)) {
			continue;
		}
		for (const char32_t *joiner_text : joiner_cases) {
			CHECK(ts->is_valid_identifier(String(joiner_text)));
		}
	}
}
|
||||
|
||||
SUBCASE("[TextServer] Strip Diacritics") {
|
||||
for (int i = 0; i < TextServerManager::get_singleton()->get_interface_count(); i++) {
|
||||
Ref<TextServer> ts = TextServerManager::get_singleton()->get_interface(i);
|
||||
|
Loading…
Reference in New Issue
Block a user