From 7548e043fce1211e21030bb41c6fe6d6900a7a5a Mon Sep 17 00:00:00 2001 From: George Marques Date: Wed, 18 Jan 2023 22:56:00 -0300 Subject: [PATCH] Add support for Unicode identifiers in GDScript This is using an adapted version of UAX#31 to not rely on the ICU database (which isn't available in builds without TextServerAdvanced). It allows most characters used in diverse scripts but not everything. --- doc/classes/ProjectSettings.xml | 3 + modules/gdscript/gdscript_parser.cpp | 28 +++------ modules/gdscript/gdscript_parser.h | 5 +- modules/gdscript/gdscript_tokenizer.cpp | 62 +++++++++++++++---- modules/gdscript/gdscript_tokenizer.h | 7 +++ modules/gdscript/gdscript_warning.cpp | 5 ++ modules/gdscript/gdscript_warning.h | 1 + .../analyzer/warnings/lambda_unused_arg.out | 2 +- .../errors/identifier_similar_to_keyword.gd | 3 + .../errors/identifier_similar_to_keyword.out | 2 + .../parser/features/unicode_identifiers.gd | 35 +++++++++++ .../parser/features/unicode_identifiers.out | 14 +++++ .../parser/warnings/confusable_identifier.gd | 5 ++ .../parser/warnings/confusable_identifier.out | 6 ++ 14 files changed, 145 insertions(+), 33 deletions(-) create mode 100644 modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.gd create mode 100644 modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.out create mode 100644 modules/gdscript/tests/scripts/parser/features/unicode_identifiers.gd create mode 100644 modules/gdscript/tests/scripts/parser/features/unicode_identifiers.out create mode 100644 modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.gd create mode 100644 modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.out diff --git a/doc/classes/ProjectSettings.xml b/doc/classes/ProjectSettings.xml index de41edc3056..b18f7f43140 100644 --- a/doc/classes/ProjectSettings.xml +++ b/doc/classes/ProjectSettings.xml @@ -384,6 +384,9 @@ When set to [code]warn[/code] or [code]error[/code], produces a warning or an error respectively when an [code]assert[/code] call always evaluates to true. + + When set to [code]warn[/code] or [code]error[/code], produces a warning or an error respectively when an indentifier contains characters that can be confused with something else, like when mixing different alphabets. + When set to [code]warn[/code] or [code]error[/code], produces a warning or an error respectively when a constant is used as a function. diff --git a/modules/gdscript/gdscript_parser.cpp b/modules/gdscript/gdscript_parser.cpp index f5d33063769..4228046ba2d 100644 --- a/modules/gdscript/gdscript_parser.cpp +++ b/modules/gdscript/gdscript_parser.cpp @@ -41,6 +41,7 @@ #include "core/os/os.h" #include "core/string/string_builder.h" #include "gdscript_warning.h" +#include "servers/text_server.h" #endif // DEBUG_ENABLED #ifdef TOOLS_ENABLED @@ -186,24 +187,6 @@ void GDScriptParser::push_error(const String &p_message, const Node *p_origin) { } #ifdef DEBUG_ENABLED -void GDScriptParser::push_warning(const Node *p_source, GDScriptWarning::Code p_code, const String &p_symbol1, const String &p_symbol2, const String &p_symbol3, const String &p_symbol4) { - ERR_FAIL_COND(p_source == nullptr); - Vector symbols; - if (!p_symbol1.is_empty()) { - symbols.push_back(p_symbol1); - } - if (!p_symbol2.is_empty()) { - symbols.push_back(p_symbol2); - } - if (!p_symbol3.is_empty()) { - symbols.push_back(p_symbol3); - } - if (!p_symbol4.is_empty()) { - symbols.push_back(p_symbol4); - } - push_warning(p_source, p_code, symbols); -} - void GDScriptParser::push_warning(const Node *p_source, GDScriptWarning::Code p_code, const Vector &p_symbols) { ERR_FAIL_COND(p_source == nullptr); if (is_ignoring_warnings) { @@ -2251,7 +2234,14 @@ GDScriptParser::ExpressionNode *GDScriptParser::parse_expression(bool p_can_assi } GDScriptParser::IdentifierNode *GDScriptParser::parse_identifier() { - return static_cast(parse_identifier(nullptr, false)); + IdentifierNode *identifier = static_cast(parse_identifier(nullptr, false)); +#ifdef DEBUG_ENABLED + // Check for spoofing here (if available in TextServer) since this isn't called inside expressions. This is only relevant for declarations. + if (identifier && TS->has_feature(TextServer::FEATURE_UNICODE_SECURITY) && TS->spoof_check(identifier->name.operator String())) { + push_warning(identifier, GDScriptWarning::CONFUSABLE_IDENTIFIER, identifier->name.operator String()); + } +#endif + return identifier; } GDScriptParser::ExpressionNode *GDScriptParser::parse_identifier(ExpressionNode *p_previous_operand, bool p_can_assign) { diff --git a/modules/gdscript/gdscript_parser.h b/modules/gdscript/gdscript_parser.h index 0903f62061d..f6d2a8feeeb 100644 --- a/modules/gdscript/gdscript_parser.h +++ b/modules/gdscript/gdscript_parser.h @@ -1361,8 +1361,11 @@ private: void clear(); void push_error(const String &p_message, const Node *p_origin = nullptr); #ifdef DEBUG_ENABLED - void push_warning(const Node *p_source, GDScriptWarning::Code p_code, const String &p_symbol1 = String(), const String &p_symbol2 = String(), const String &p_symbol3 = String(), const String &p_symbol4 = String()); void push_warning(const Node *p_source, GDScriptWarning::Code p_code, const Vector &p_symbols); + template + void push_warning(const Node *p_source, GDScriptWarning::Code p_code, const Symbols &...p_symbols) { + push_warning(p_source, p_code, Vector{ p_symbols... }); + } #endif void make_completion_context(CompletionType p_type, Node *p_node, int p_argument = -1, bool p_force = false); diff --git a/modules/gdscript/gdscript_tokenizer.cpp b/modules/gdscript/gdscript_tokenizer.cpp index e17a804003e..d7f1114fd3b 100644 --- a/modules/gdscript/gdscript_tokenizer.cpp +++ b/modules/gdscript/gdscript_tokenizer.cpp @@ -31,10 +31,14 @@ #include "gdscript_tokenizer.h" #include "core/error/error_macros.h" +#include "core/string/char_utils.h" #ifdef TOOLS_ENABLED #include "editor/editor_settings.h" #endif +#ifdef DEBUG_ENABLED +#include "servers/text_server.h" +#endif static const char *token_names[] = { "Empty", // EMPTY, @@ -435,10 +439,12 @@ GDScriptTokenizer::Token GDScriptTokenizer::check_vcs_marker(char32_t p_test, To } GDScriptTokenizer::Token GDScriptTokenizer::annotation() { - if (!is_ascii_identifier_char(_peek())) { + if (is_unicode_identifier_start(_peek())) { + _advance(); // Consume start character. + } else { push_error("Expected annotation identifier after \"@\"."); } - while (is_ascii_identifier_char(_peek())) { + while (is_unicode_identifier_continue(_peek())) { // Consume all identifier characters. _advance(); } @@ -447,7 +453,6 @@ GDScriptTokenizer::Token GDScriptTokenizer::annotation() { return annotation; } -GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() { #define KEYWORDS(KEYWORD_GROUP, KEYWORD) \ KEYWORD_GROUP('a') \ KEYWORD("as", Token::AS) \ @@ -512,8 +517,21 @@ GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() { #define MIN_KEYWORD_LENGTH 2 #define MAX_KEYWORD_LENGTH 10 - // Consume all alphanumeric characters. - while (is_ascii_identifier_char(_peek())) { +#ifdef DEBUG_ENABLED +void GDScriptTokenizer::make_keyword_list() { +#define KEYWORD_LINE(keyword, token_type) keyword, +#define KEYWORD_GROUP_IGNORE(group) + keyword_list = { + KEYWORDS(KEYWORD_GROUP_IGNORE, KEYWORD_LINE) + }; +#undef KEYWORD_LINE +#undef KEYWORD_GROUP_IGNORE +} +#endif // DEBUG_ENABLED + +GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() { + // Consume all identifier characters. + while (is_unicode_identifier_continue(_peek())) { _advance(); } @@ -565,15 +583,28 @@ GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() { } // Not a keyword, so must be an identifier. - return make_identifier(name); + Token id = make_identifier(name); + +#ifdef DEBUG_ENABLED + // Additional checks for identifiers but only in debug and if it's available in TextServer. + if (TS->has_feature(TextServer::FEATURE_UNICODE_SECURITY)) { + int64_t confusable = TS->is_confusable(name, keyword_list); + if (confusable >= 0) { + push_error(vformat(R"(Identifier "%s" is visually similar to the GDScript keyword "%s" and thus not allowed.)", name, keyword_list[confusable])); + } + } +#endif // DEBUG_ENABLED + + return id; -#undef KEYWORDS -#undef MIN_KEYWORD_LENGTH -#undef MAX_KEYWORD_LENGTH #undef KEYWORD_GROUP_CASE #undef KEYWORD } +#undef MAX_KEYWORD_LENGTH +#undef MIN_KEYWORD_LENGTH +#undef KEYWORDS + void GDScriptTokenizer::newline(bool p_make_token) { // Don't overwrite previous newline, nor create if we want a line continuation. if (p_make_token && !pending_newline && !line_continuation) { @@ -720,7 +751,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::number() { error.rightmost_column = column + 1; push_error(error); has_error = true; - } else if (is_ascii_identifier_char(_peek())) { + } else if (is_unicode_identifier_start(_peek()) || is_unicode_identifier_continue(_peek())) { // Letter at the end of the number. push_error("Invalid numeric notation."); } @@ -1311,7 +1342,7 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() { if (is_digit(c)) { return number(); - } else if (is_ascii_identifier_char(c)) { + } else if (is_unicode_identifier_start(c)) { return potential_identifier(); } @@ -1504,7 +1535,11 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() { } default: - return make_error(vformat(R"(Unknown character "%s".)", String(&c, 1))); + if (is_whitespace(c)) { + return make_error(vformat(R"(Invalid white space character "\\u%X".)", static_cast(c))); + } else { + return make_error(vformat(R"(Unknown character "%s".)", String(&c, 1))); + } } } @@ -1514,4 +1549,7 @@ GDScriptTokenizer::GDScriptTokenizer() { tab_size = EditorSettings::get_singleton()->get_setting("text_editor/behavior/indent/size"); } #endif // TOOLS_ENABLED +#ifdef DEBUG_ENABLED + make_keyword_list(); +#endif // DEBUG_ENABLED } diff --git a/modules/gdscript/gdscript_tokenizer.h b/modules/gdscript/gdscript_tokenizer.h index 9588922122d..608840d3f1d 100644 --- a/modules/gdscript/gdscript_tokenizer.h +++ b/modules/gdscript/gdscript_tokenizer.h @@ -224,6 +224,9 @@ private: char32_t indent_char = '\0'; int position = 0; int length = 0; +#ifdef DEBUG_ENABLED + Vector keyword_list; +#endif // DEBUG_ENABLED #ifdef TOOLS_ENABLED HashMap comments; @@ -239,6 +242,10 @@ private: void _skip_whitespace(); void check_indent(); +#ifdef DEBUG_ENABLED + void make_keyword_list(); +#endif // DEBUG_ENABLED + Token make_error(const String &p_message); void push_error(const String &p_message); void push_error(const Token &p_error); diff --git a/modules/gdscript/gdscript_warning.cpp b/modules/gdscript/gdscript_warning.cpp index 184cecb3164..a6cbb7f6aec 100644 --- a/modules/gdscript/gdscript_warning.cpp +++ b/modules/gdscript/gdscript_warning.cpp @@ -155,6 +155,10 @@ String GDScriptWarning::get_message() const { CHECK_SYMBOLS(2); return vformat(R"(The function '%s()' is a static function but was called from an instance. Instead, it should be directly called from the type: '%s.%s()'.)", symbols[0], symbols[1], symbols[0]); } + case CONFUSABLE_IDENTIFIER: { + CHECK_SYMBOLS(1); + return vformat(R"(The identifier "%s" has misleading characters and might be confused with something else.)", symbols[0]); + } case WARNING_MAX: break; // Can't happen, but silences warning } @@ -219,6 +223,7 @@ String GDScriptWarning::get_name_from_code(Code p_code) { "SHADOWED_GLOBAL_IDENTIFIER", "INT_ASSIGNED_TO_ENUM", "STATIC_CALLED_ON_INSTANCE", + "CONFUSABLE_IDENTIFIER", }; static_assert((sizeof(names) / sizeof(*names)) == WARNING_MAX, "Amount of warning types don't match the amount of warning names."); diff --git a/modules/gdscript/gdscript_warning.h b/modules/gdscript/gdscript_warning.h index e3aee45f339..b485f02b9c5 100644 --- a/modules/gdscript/gdscript_warning.h +++ b/modules/gdscript/gdscript_warning.h @@ -78,6 +78,7 @@ public: SHADOWED_GLOBAL_IDENTIFIER, // A global class or function has the same name as variable. INT_ASSIGNED_TO_ENUM, // An integer value was assigned to an enum-typed variable without casting. STATIC_CALLED_ON_INSTANCE, // A static method was called on an instance of a class instead of on the class itself. + CONFUSABLE_IDENTIFIER, // The identifier contains misleading characters that can be confused. E.g. "usеr" (has Cyrillic "е" instead of Latin "e"). WARNING_MAX, }; diff --git a/modules/gdscript/tests/scripts/analyzer/warnings/lambda_unused_arg.out b/modules/gdscript/tests/scripts/analyzer/warnings/lambda_unused_arg.out index b018091c183..32e230fc801 100644 --- a/modules/gdscript/tests/scripts/analyzer/warnings/lambda_unused_arg.out +++ b/modules/gdscript/tests/scripts/analyzer/warnings/lambda_unused_arg.out @@ -2,4 +2,4 @@ GDTEST_OK >> WARNING >> Line: 2 >> UNUSED_PARAMETER ->> +>> The parameter 'unused' is never used in the function ''. If this is intended, prefix it with an underscore: '_unused' diff --git a/modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.gd b/modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.gd new file mode 100644 index 00000000000..4b1f2840702 --- /dev/null +++ b/modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.gd @@ -0,0 +1,3 @@ +func test(): + var аs # Using Cyrillic "а". + print(аs) diff --git a/modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.out b/modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.out new file mode 100644 index 00000000000..337dec2f4dc --- /dev/null +++ b/modules/gdscript/tests/scripts/parser/errors/identifier_similar_to_keyword.out @@ -0,0 +1,2 @@ +GDTEST_PARSER_ERROR +Identifier "аs" is visually similar to the GDScript keyword "as" and thus not allowed. diff --git a/modules/gdscript/tests/scripts/parser/features/unicode_identifiers.gd b/modules/gdscript/tests/scripts/parser/features/unicode_identifiers.gd new file mode 100644 index 00000000000..523959a016e --- /dev/null +++ b/modules/gdscript/tests/scripts/parser/features/unicode_identifiers.gd @@ -0,0 +1,35 @@ +const π = PI +var ㄥ = π + +func test(): + var փորձարկում = "test" + prints("փորձարկում", փորձարկում) + var امتحان = "test" + prints("امتحان", امتحان) + var পরীক্ষা = "test" + prints("পরীক্ষা", পরীক্ষা) + var тест = "test" + prints("тест", тест) + var जाँच = "test" + prints("जाँच", जाँच) + var 기준 = "test" + prints("기준", 기준) + var 测试 = "test" + prints("测试", 测试) + var テスト = "test" + prints("テスト", テスト) + var 試験 = "test" + prints("試験", 試験) + var പരീക്ഷ = "test" + prints("പരീക്ഷ", പരീക്ഷ) + var ทดสอบ = "test" + prints("ทดสอบ", ทดสอบ) + var δοκιμή = "test" + prints("δοκιμή", δοκιμή) + + const d = 1.1 + _process(d) + print(is_equal_approx(ㄥ, PI + (d * PI))) + +func _process(Δ: float) -> void: + ㄥ += Δ * π diff --git a/modules/gdscript/tests/scripts/parser/features/unicode_identifiers.out b/modules/gdscript/tests/scripts/parser/features/unicode_identifiers.out new file mode 100644 index 00000000000..c071380a8fd --- /dev/null +++ b/modules/gdscript/tests/scripts/parser/features/unicode_identifiers.out @@ -0,0 +1,14 @@ +GDTEST_OK +փորձարկում test +امتحان test +পরীক্ষা test +тест test +जाँच test +기준 test +测试 test +テスト test +試験 test +പരീക്ഷ test +ทดสอบ test +δοκιμή test +true diff --git a/modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.gd b/modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.gd new file mode 100644 index 00000000000..e2caac8ffd9 --- /dev/null +++ b/modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.gd @@ -0,0 +1,5 @@ +func test(): + var port = 0 # Only latin characters. + var pοrt = 1 # The "ο" is Greek omicron. + + prints(port, pοrt) diff --git a/modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.out b/modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.out new file mode 100644 index 00000000000..c4833964438 --- /dev/null +++ b/modules/gdscript/tests/scripts/parser/warnings/confusable_identifier.out @@ -0,0 +1,6 @@ +GDTEST_OK +>> WARNING +>> Line: 3 +>> CONFUSABLE_IDENTIFIER +>> The identifier "pοrt" has misleading characters and might be confused with something else. +0 1