Merge pull request #53956 from bruvzg/icu_uax_31

This commit is contained in:
Rémi Verschelde 2022-08-02 08:54:19 +02:00 committed by GitHub
commit b7346e5025
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 1747 additions and 2 deletions

1456
core/string/char_range.inc Normal file

File diff suppressed because it is too large Load Diff

View File

@ -33,6 +33,26 @@
#include "core/typedefs.h"
#include "char_range.inc"
// Returns `true` when `c` lies inside one of the inclusive [start, end] ranges
// of the `xid_start` table. Entries are scanned in order until the entry whose
// `start` is 0, which terminates the table.
static _FORCE_INLINE_ bool is_unicode_identifier_start(char32_t c) {
	int idx = 0;
	while (xid_start[idx].start != 0) {
		const bool in_range = (c >= xid_start[idx].start) && (c <= xid_start[idx].end);
		if (in_range) {
			return true;
		}
		idx++;
	}
	// Reached the terminating entry without a match.
	return false;
}
// Returns `true` when `c` lies inside one of the inclusive [start, end] ranges
// of the `xid_continue` table. Entries are scanned in order until the entry
// whose `start` is 0, which terminates the table.
static _FORCE_INLINE_ bool is_unicode_identifier_continue(char32_t c) {
	int idx = 0;
	while (xid_continue[idx].start != 0) {
		const bool in_range = (c >= xid_continue[idx].start) && (c <= xid_continue[idx].end);
		if (in_range) {
			return true;
		}
		idx++;
	}
	// Reached the terminating entry without a match.
	return false;
}
// Returns `true` only for the code points 'A' through 'Z' (inclusive).
static _FORCE_INLINE_ bool is_ascii_upper_case(char32_t c) {
	if (c < 'A') {
		return false;
	}
	return c <= 'Z';
}

View File

@ -944,6 +944,21 @@
Returns [code]true[/code] if locale is right-to-left.
</description>
</method>
<method name="is_valid_identifier" qualifiers="const">
<return type="bool" />
<argument index="0" name="string" type="String" />
<description>
Returns [code]true[/code] if [code]string[/code] is a valid identifier.
If the text server supports the [constant FEATURE_UNICODE_IDENTIFIERS] feature, a valid identifier must:
- Conform to normalization form C.
- Begin with a Unicode character of class XID_Start or [code]"_"[/code].
- Contain only Unicode characters of class XID_Continue in the other positions.
- Use UAX #31 recommended scripts only (mixed scripts are allowed).
If the [constant FEATURE_UNICODE_IDENTIFIERS] feature is not supported, a valid identifier must:
- Begin with a Unicode character of class XID_Start or [code]"_"[/code].
- Contain only Unicode characters of class XID_Continue in the other positions.
</description>
</method>
<method name="load_support_data">
<return type="bool" />
<argument index="0" name="filename" type="String" />
@ -1713,7 +1728,10 @@
TextServer supports locale dependent and context sensitive case conversion.
</constant>
<constant name="FEATURE_USE_SUPPORT_DATA" value="4096" enum="Feature">
TextServer require external data file for some features.
TextServer requires an external data file for some features, see [method load_support_data].
</constant>
<constant name="FEATURE_UNICODE_IDENTIFIERS" value="8192" enum="Feature">
TextServer supports UAX #31 identifier validation, see [method is_valid_identifier].
</constant>
<constant name="CONTOUR_CURVE_TAG_ON" value="1" enum="ContourPointTag">
Contour point is on the curve.

View File

@ -941,6 +941,13 @@
Returns [code]true[/code] if locale is right-to-left.
</description>
</method>
<method name="is_valid_identifier" qualifiers="virtual const">
<return type="bool" />
<argument index="0" name="string" type="String" />
<description>
Returns [code]true[/code] if [code]string[/code] is a valid identifier.
</description>
</method>
<method name="load_support_data" qualifiers="virtual">
<return type="bool" />
<argument index="0" name="filename" type="String" />

View File

@ -346,6 +346,7 @@ bool TextServerAdvanced::has_feature(Feature p_feature) const {
case FEATURE_FONT_VARIABLE:
case FEATURE_CONTEXT_SENSITIVE_CASE_CONVERSION:
case FEATURE_USE_SUPPORT_DATA:
case FEATURE_UNICODE_IDENTIFIERS:
return true;
default: {
}
@ -5757,6 +5758,191 @@ PackedInt32Array TextServerAdvanced::string_get_word_breaks(const String &p_stri
return ret;
}
// Returns `true` if `p_string` is accepted as an identifier.
//
// The checks, in order:
//  1. The string is not empty.
//  2. Its UTF-16 form is in Normalization Form C (checked through ICU).
//  3. Every code point belongs to a script whose ICU usage is "recommended".
//  4. ZWNJ (U+200C) / ZWJ (U+200D) only pass when one of the three context
//     trackers below (A1, A2, B) is in a state that accepts them; otherwise
//     they fall through to the general-category test at the end of the loop
//     body.
//  5. No code point has the Pattern_Syntax, Pattern_White_Space or
//     Noncharacter_Code_Point property.
//  6. The first code point passes the "start" category test, every later one
//     passes the "continue" category test.
//
// NOTE(review): the tracker states are not inspected after the loop, so a
// string that ends immediately after an accepted ZWNJ/ZWJ (tracker still in
// SEQ_NEAR_END) returns `true` — confirm that this is intended.
bool TextServerAdvanced::is_valid_identifier(const String &p_string) const {
// States shared by the three ZWNJ/ZWJ context trackers.
enum UAX31SequenceStatus {
// No candidate sequence is being tracked.
SEQ_NOT_STARTED,
// A character that can open a sequence has been seen.
SEQ_STARTED,
// A combining-class-9 (virama) character followed the opening character (A2 and B only).
SEQ_STARTED_VIR,
// The joiner has been consumed; the next code points must close the sequence.
SEQ_NEAR_END,
};
const char32_t *str = p_string.ptr();
int len = p_string.length();
if (len == 0) {
return false; // Empty string.
}
UErrorCode err = U_ZERO_ERROR;
// The normalization check below is run on the UTF-16 form of the string.
Char16String utf16 = p_string.utf16();
const UNormalizer2 *norm_c = unorm2_getNFCInstance(&err);
if (U_FAILURE(err)) {
return false; // Failed to load normalizer.
}
// `isnurom` holds the result of the NFC check.
bool isnurom = unorm2_isNormalized(norm_c, utf16.ptr(), utf16.length(), &err);
if (U_FAILURE(err) || !isnurom) {
return false; // Do not conform to Normalization Form C.
}
// Each tracker keeps its own state and the script of the character that opened its sequence.
UAX31SequenceStatus A1_sequence_status = SEQ_NOT_STARTED;
UScriptCode A1_scr = USCRIPT_INHERITED;
UAX31SequenceStatus A2_sequence_status = SEQ_NOT_STARTED;
UScriptCode A2_scr = USCRIPT_INHERITED;
UAX31SequenceStatus B_sequence_status = SEQ_NOT_STARTED;
UScriptCode B_scr = USCRIPT_INHERITED;
for (int i = 0; i < len; i++) {
err = U_ZERO_ERROR;
UScriptCode scr = uscript_getScript(str[i], &err);
if (U_FAILURE(err)) {
return false; // Invalid script.
}
if (uscript_getUsage(scr) != USCRIPT_USAGE_RECOMMENDED) {
return false; // Not a recommended script.
}
// General category and Joining_Type of the current code point, reused by all checks below.
uint8_t cat = u_charType(str[i]);
int32_t jt = u_getIntPropertyValue(str[i], UCHAR_JOINING_TYPE);
// UAX #31 section 2.3 subsections A1, A2 and B, check ZWNJ and ZWJ usage.
// Tracker A1: left/dual-joining character, optional transparent characters, ZWNJ,
// optional transparent characters, right/dual-joining character.
switch (A1_sequence_status) {
case SEQ_NEAR_END: {
// After the ZWNJ a script change (both scripts above USCRIPT_INHERITED) is an error.
if ((A1_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A1_scr)) {
return false; // Mixed script.
}
if (jt == U_JT_RIGHT_JOINING || jt == U_JT_DUAL_JOINING) {
A1_sequence_status = SEQ_NOT_STARTED; // Valid end of sequence, reset.
} else if (jt != U_JT_TRANSPARENT) {
return false; // Invalid end of sequence.
}
} break;
case SEQ_STARTED: {
// Before the ZWNJ a script change only abandons the candidate sequence.
if ((A1_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A1_scr)) {
A1_sequence_status = SEQ_NOT_STARTED; // Reset.
} else {
// Transparent characters leave the state unchanged.
if (jt != U_JT_TRANSPARENT) {
if (str[i] == 0x200C /*ZWNJ*/) {
A1_sequence_status = SEQ_NEAR_END;
// The ZWNJ is accepted here: `continue` skips the A2 and B trackers and
// every remaining check for this code point.
continue;
} else {
A1_sequence_status = SEQ_NOT_STARTED; // Reset.
}
}
}
} break;
default:
break;
}
// Also reached right after a reset above, so the current code point can open a new A1 sequence.
if (A1_sequence_status == SEQ_NOT_STARTED) {
if (jt == U_JT_LEFT_JOINING || jt == U_JT_DUAL_JOINING) {
A1_sequence_status = SEQ_STARTED;
A1_scr = scr;
}
};
// Tracker A2: letter, combining-class-9 (virama) character, ZWNJ, letter.
switch (A2_sequence_status) {
case SEQ_NEAR_END: {
if ((A2_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A2_scr)) {
return false; // Mixed script.
}
if (cat == U_UPPERCASE_LETTER || cat == U_LOWERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_MODIFIER_LETTER || cat == U_OTHER_LETTER) {
A2_sequence_status = SEQ_NOT_STARTED; // Valid end of sequence, reset.
// NOTE(review): U_MODIFIER_LETTER is already handled by the branch above, so
// `cat != U_MODIFIER_LETTER` is always true here and every non-letter is rejected.
} else if (cat != U_MODIFIER_LETTER || u_getCombiningClass(str[i]) == 0) {
return false; // Invalid end of sequence.
}
} break;
case SEQ_STARTED_VIR: {
if ((A2_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A2_scr)) {
A2_sequence_status = SEQ_NOT_STARTED; // Reset.
} else {
if (str[i] == 0x200C /*ZWNJ*/) {
A2_sequence_status = SEQ_NEAR_END;
// The ZWNJ is accepted here: `continue` skips the B tracker and every
// remaining check for this code point.
continue;
} else if (cat != U_MODIFIER_LETTER || u_getCombiningClass(str[i]) == 0) {
A2_sequence_status = SEQ_NOT_STARTED; // Reset.
}
}
} break;
case SEQ_STARTED: {
if ((A2_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != A2_scr)) {
A2_sequence_status = SEQ_NOT_STARTED; // Reset.
} else {
if (u_getCombiningClass(str[i]) == 9 /*Virama Combining Class*/) {
A2_sequence_status = SEQ_STARTED_VIR;
} else if (cat != U_MODIFIER_LETTER) {
A2_sequence_status = SEQ_NOT_STARTED; // Reset.
}
}
} break;
default:
break;
}
// Also reached right after a reset above, so the current letter can open a new A2 sequence.
if (A2_sequence_status == SEQ_NOT_STARTED) {
if (cat == U_UPPERCASE_LETTER || cat == U_LOWERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_MODIFIER_LETTER || cat == U_OTHER_LETTER) {
A2_sequence_status = SEQ_STARTED;
A2_scr = scr;
}
}
// Tracker B: letter, combining-class-9 (virama) character, ZWJ, then a code point
// that is not a dependent vowel.
switch (B_sequence_status) {
case SEQ_NEAR_END: {
if ((B_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != B_scr)) {
return false; // Mixed script.
}
// Only a dependent vowel directly after the ZWJ is rejected; anything else closes the sequence.
if (u_getIntPropertyValue(str[i], UCHAR_INDIC_SYLLABIC_CATEGORY) != U_INSC_VOWEL_DEPENDENT) {
B_sequence_status = SEQ_NOT_STARTED; // Valid end of sequence, reset.
} else {
return false; // Invalid end of sequence.
}
} break;
case SEQ_STARTED_VIR: {
if ((B_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != B_scr)) {
B_sequence_status = SEQ_NOT_STARTED; // Reset.
} else {
if (str[i] == 0x200D /*ZWJ*/) {
B_sequence_status = SEQ_NEAR_END;
// The ZWJ is accepted here: `continue` skips every remaining check for this code point.
continue;
} else if (cat != U_MODIFIER_LETTER || u_getCombiningClass(str[i]) == 0) {
B_sequence_status = SEQ_NOT_STARTED; // Reset.
}
}
} break;
case SEQ_STARTED: {
if ((B_scr > USCRIPT_INHERITED) && (scr > USCRIPT_INHERITED) && (scr != B_scr)) {
B_sequence_status = SEQ_NOT_STARTED; // Reset.
} else {
if (u_getCombiningClass(str[i]) == 9 /*Virama Combining Class*/) {
B_sequence_status = SEQ_STARTED_VIR;
} else if (cat != U_MODIFIER_LETTER) {
B_sequence_status = SEQ_NOT_STARTED; // Reset.
}
}
} break;
default:
break;
}
// Also reached right after a reset above, so the current letter can open a new B sequence.
if (B_sequence_status == SEQ_NOT_STARTED) {
if (cat == U_UPPERCASE_LETTER || cat == U_LOWERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_MODIFIER_LETTER || cat == U_OTHER_LETTER) {
B_sequence_status = SEQ_STARTED;
B_scr = scr;
}
}
// From here on: per-code-point identifier class checks (not reached for a ZWNJ/ZWJ accepted above).
if (u_hasBinaryProperty(str[i], UCHAR_PATTERN_SYNTAX) || u_hasBinaryProperty(str[i], UCHAR_PATTERN_WHITE_SPACE) || u_hasBinaryProperty(str[i], UCHAR_NONCHARACTER_CODE_POINT)) {
return false; // Not a XID_Start or XID_Continue character.
}
if (i == 0) {
// First code point: a letter or letter-number category, one of four listed code points
// (presumably the Other_ID_Start set — confirm), or 0x005F ('_').
if (!(cat == U_LOWERCASE_LETTER || cat == U_UPPERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_OTHER_LETTER || cat == U_MODIFIER_LETTER || cat == U_LETTER_NUMBER || str[0] == 0x2118 || str[0] == 0x212E || str[0] == 0x309B || str[0] == 0x309C || str[0] == 0x005F)) {
return false; // Not a XID_Start character.
}
} else {
// Later code points: additionally allow marks, decimal digits, connector punctuation
// and a longer list of individual code points (presumably Other_ID_Continue plus
// compatibility exceptions — confirm).
if (!(cat == U_LOWERCASE_LETTER || cat == U_UPPERCASE_LETTER || cat == U_TITLECASE_LETTER || cat == U_OTHER_LETTER || cat == U_MODIFIER_LETTER || cat == U_LETTER_NUMBER || cat == U_NON_SPACING_MARK || cat == U_COMBINING_SPACING_MARK || cat == U_DECIMAL_DIGIT_NUMBER || cat == U_CONNECTOR_PUNCTUATION || str[i] == 0x2118 || str[i] == 0x212E || str[i] == 0x309B || str[i] == 0x309C || str[i] == 0x1369 || str[i] == 0x1371 || str[i] == 0x00B7 || str[i] == 0x0387 || str[i] == 0x19DA || str[i] == 0x0E33 || str[i] == 0x0EB3 || str[i] == 0xFF9E || str[i] == 0xFF9F)) {
return false; // Not a XID_Continue character.
}
}
}
// Every code point passed; the tracker states are not examined here.
return true;
}
TextServerAdvanced::TextServerAdvanced() {
_insert_num_systems_lang();
_insert_feature_sets();

View File

@ -702,6 +702,7 @@ public:
virtual PackedInt32Array string_get_word_breaks(const String &p_string, const String &p_language = "") const override;
virtual String strip_diacritics(const String &p_string) const override;
virtual bool is_valid_identifier(const String &p_string) const override;
virtual String string_to_upper(const String &p_string, const String &p_language = "") const override;
virtual String string_to_lower(const String &p_string, const String &p_language = "") const override;

View File

@ -297,6 +297,7 @@ void TextServerExtension::_bind_methods() {
GDVIRTUAL_BIND(percent_sign, "language");
GDVIRTUAL_BIND(strip_diacritics, "string");
GDVIRTUAL_BIND(is_valid_identifier, "string");
GDVIRTUAL_BIND(string_get_word_breaks, "string", "language");
@ -1498,6 +1499,14 @@ String TextServerExtension::percent_sign(const String &p_language) const {
return "%";
}
// Forwards identifier validation to the `is_valid_identifier` virtual bound for
// extensions/scripts. When `GDVIRTUAL_CALL` reports that the call was not made,
// the base `TextServer::is_valid_identifier` implementation is used instead.
bool TextServerExtension::is_valid_identifier(const String &p_string) const {
	// Initialized so the out-argument never carries an indeterminate value,
	// whatever the virtual-call macro does with it.
	bool ret = false;
	if (GDVIRTUAL_CALL(is_valid_identifier, p_string, ret)) {
		return ret;
	}
	return TextServer::is_valid_identifier(p_string);
}
String TextServerExtension::strip_diacritics(const String &p_string) const {
String ret;
if (GDVIRTUAL_CALL(strip_diacritics, p_string, ret)) {

View File

@ -496,6 +496,9 @@ public:
virtual PackedInt32Array string_get_word_breaks(const String &p_string, const String &p_language = "") const override;
GDVIRTUAL2RC(PackedInt32Array, string_get_word_breaks, const String &, const String &);
virtual bool is_valid_identifier(const String &p_string) const override;
GDVIRTUAL1RC(bool, is_valid_identifier, const String &);
virtual String string_to_upper(const String &p_string, const String &p_language = "") const override;
virtual String string_to_lower(const String &p_string, const String &p_language = "") const override;
GDVIRTUAL2RC(String, string_to_upper, const String &, const String &);

View File

@ -447,6 +447,7 @@ void TextServer::_bind_methods() {
ClassDB::bind_method(D_METHOD("string_get_word_breaks", "string", "language"), &TextServer::string_get_word_breaks, DEFVAL(""));
ClassDB::bind_method(D_METHOD("strip_diacritics", "string"), &TextServer::strip_diacritics);
ClassDB::bind_method(D_METHOD("is_valid_identifier", "string"), &TextServer::is_valid_identifier);
ClassDB::bind_method(D_METHOD("string_to_upper", "string", "language"), &TextServer::string_to_upper, DEFVAL(""));
ClassDB::bind_method(D_METHOD("string_to_lower", "string", "language"), &TextServer::string_to_lower, DEFVAL(""));
@ -545,6 +546,7 @@ void TextServer::_bind_methods() {
BIND_ENUM_CONSTANT(FEATURE_FONT_VARIABLE);
BIND_ENUM_CONSTANT(FEATURE_CONTEXT_SENSITIVE_CASE_CONVERSION);
BIND_ENUM_CONSTANT(FEATURE_USE_SUPPORT_DATA);
BIND_ENUM_CONSTANT(FEATURE_UNICODE_IDENTIFIERS);
/* FT Contour Point Types */
BIND_ENUM_CONSTANT(CONTOUR_CURVE_TAG_ON);
@ -1730,6 +1732,26 @@ Array TextServer::_shaped_text_get_ellipsis_glyphs_wrapper(const RID &p_shaped)
return ret;
}
// Default identifier check: the string must be non-empty, its first code point
// must satisfy `is_unicode_identifier_start` and every following code point
// must satisfy `is_unicode_identifier_continue`.
bool TextServer::is_valid_identifier(const String &p_string) const {
	const char32_t *chars = p_string.ptr();
	const int length = p_string.length();

	if (length == 0) {
		return false; // Empty string.
	}
	if (!is_unicode_identifier_start(chars[0])) {
		return false;
	}

	// Walk forward while the code points remain valid continuation characters.
	int pos = 1;
	while (pos < length && is_unicode_identifier_continue(chars[pos])) {
		pos++;
	}
	// Valid only if the walk consumed the whole string.
	return pos == length;
}
TextServer::TextServer() {
_init_diacritics_map();
}

View File

@ -148,6 +148,7 @@ public:
FEATURE_FONT_VARIABLE = 1 << 10,
FEATURE_CONTEXT_SENSITIVE_CASE_CONVERSION = 1 << 11,
FEATURE_USE_SUPPORT_DATA = 1 << 12,
FEATURE_UNICODE_IDENTIFIERS = 1 << 13,
};
enum ContourPointTag {
@ -464,6 +465,7 @@ public:
virtual PackedInt32Array string_get_word_breaks(const String &p_string, const String &p_language = "") const = 0;
virtual String strip_diacritics(const String &p_string) const;
virtual bool is_valid_identifier(const String &p_string) const;
// Other string operations.
virtual String string_to_upper(const String &p_string, const String &p_language = "") const = 0;

View File

@ -39,7 +39,7 @@
namespace TestTextServer {
TEST_SUITE("[[TextServer]") {
TEST_SUITE("[TextServer]") {
TEST_CASE("[TextServer] Init, font loading and shaping") {
SUBCASE("[TextServer] Loading fonts") {
for (int i = 0; i < TextServerManager::get_singleton()->get_interface_count(); i++) {
@ -492,6 +492,27 @@ TEST_SUITE("[[TextServer]") {
}
}
SUBCASE("[TextServer] Unicode identifiers") {
	// Candidate strings and, at the same index in `isid`, whether each one is
	// expected to be accepted as an identifier.
	constexpr int sample_count = 19;
	static const char32_t *data[sample_count] = {
		U"-30", U"100", U"10.1", U"10,1", U"1e2", U"1e-2", U"1e2e3", U"0xAB",
		U"AB", U"Test1", U"1Test", U"Test*1", U"test_testeT", U"test_tes teT",
		U"عَلَيْكُمْ", U"عَلَيْكُمْTest", U"ӒӖӚӜ", U"_test", U"ÂÃÄÅĀĂĄÇĆĈĊ"
	};
	static bool isid[sample_count] = {
		false, false, false, false, false, false, false, false,
		true, true, false, false, true, false,
		true, true, true, true, true
	};

	// Run the same expectations against every registered text server.
	for (int i = 0; i < TextServerManager::get_singleton()->get_interface_count(); i++) {
		Ref<TextServer> ts = TextServerManager::get_singleton()->get_interface(i);
		TEST_FAIL_COND(ts.is_null(), "Invalid TS interface.");

		for (int j = 0; j < sample_count; j++) {
			String s = String(data[j]);
			CHECK(ts->is_valid_identifier(s) == isid[j]);
		}

		// Only servers that advertise the feature are checked for joiner handling.
		if (ts->has_feature(TextServer::FEATURE_UNICODE_IDENTIFIERS)) {
			// Test UAX #31 section 2.3 ZW(N)J usage.
			CHECK(ts->is_valid_identifier(U"\u0646\u0627\u0645\u0647\u200C\u0627\u06CC"));
			CHECK(ts->is_valid_identifier(U"\u0D26\u0D43\u0D15\u0D4D\u200C\u0D38\u0D3E\u0D15\u0D4D\u0D37\u0D3F"));
			CHECK(ts->is_valid_identifier(U"\u0DC1\u0DCA\u200D\u0DBB\u0DD3"));
		}
	}
}
SUBCASE("[TextServer] Strip Diacritics") {
for (int i = 0; i < TextServerManager::get_singleton()->get_interface_count(); i++) {
Ref<TextServer> ts = TextServerManager::get_singleton()->get_interface(i);