GDScript: Add raw string literals (r-strings)

This commit is contained in:
Danil Alexeev 2023-08-28 13:00:33 +03:00
parent 221884e6bc
commit 2964c7d51c
No known key found for this signature in database
GPG Key ID: 124453E157DA8DC7
13 changed files with 250 additions and 132 deletions

View File

@ -52,6 +52,7 @@ Dictionary GDScriptSyntaxHighlighter::_get_line_syntax_highlighting_impl(int p_l
bool in_keyword = false; bool in_keyword = false;
bool in_word = false; bool in_word = false;
bool in_number = false; bool in_number = false;
bool in_raw_string = false;
bool in_node_path = false; bool in_node_path = false;
bool in_node_ref = false; bool in_node_ref = false;
bool in_annotation = false; bool in_annotation = false;
@ -234,15 +235,33 @@ Dictionary GDScriptSyntaxHighlighter::_get_line_syntax_highlighting_impl(int p_l
} }
if (str[from] == '\\') { if (str[from] == '\\') {
Dictionary escape_char_highlighter_info; if (!in_raw_string) {
escape_char_highlighter_info["color"] = symbol_color; Dictionary escape_char_highlighter_info;
color_map[from] = escape_char_highlighter_info; escape_char_highlighter_info["color"] = symbol_color;
color_map[from] = escape_char_highlighter_info;
}
from++; from++;
Dictionary region_continue_highlighter_info; if (!in_raw_string) {
region_continue_highlighter_info["color"] = region_color; int esc_len = 0;
color_map[from + 1] = region_continue_highlighter_info; if (str[from] == 'u') {
esc_len = 4;
} else if (str[from] == 'U') {
esc_len = 6;
}
for (int k = 0; k < esc_len && from < line_length - 1; k++) {
if (!is_hex_digit(str[from + 1])) {
break;
}
from++;
}
Dictionary region_continue_highlighter_info;
region_continue_highlighter_info["color"] = region_color;
color_map[from + 1] = region_continue_highlighter_info;
}
continue; continue;
} }
@ -489,6 +508,12 @@ Dictionary GDScriptSyntaxHighlighter::_get_line_syntax_highlighting_impl(int p_l
in_member_variable = false; in_member_variable = false;
} }
if (!in_raw_string && in_region == -1 && str[j] == 'r' && j < line_length - 1 && (str[j + 1] == '"' || str[j + 1] == '\'')) {
in_raw_string = true;
} else if (in_raw_string && in_region == -1) {
in_raw_string = false;
}
// Keep symbol color for binary '&&'. In the case of '&&&' use StringName color for the last ampersand. // Keep symbol color for binary '&&'. In the case of '&&&' use StringName color for the last ampersand.
if (!in_string_name && in_region == -1 && str[j] == '&' && !is_binary_op) { if (!in_string_name && in_region == -1 && str[j] == '&' && !is_binary_op) {
if (j >= 2 && str[j - 1] == '&' && str[j - 2] != '&' && prev_is_binary_op) { if (j >= 2 && str[j - 1] == '&' && str[j - 2] != '&' && prev_is_binary_op) {
@ -520,7 +545,9 @@ Dictionary GDScriptSyntaxHighlighter::_get_line_syntax_highlighting_impl(int p_l
in_annotation = false; in_annotation = false;
} }
if (in_node_ref) { if (in_raw_string) {
color = string_color;
} else if (in_node_ref) {
next_type = NODE_REF; next_type = NODE_REF;
color = node_ref_color; color = node_ref_color;
} else if (in_annotation) { } else if (in_annotation) {
@ -692,7 +719,7 @@ void GDScriptSyntaxHighlighter::_update_cache() {
} }
/* Strings */ /* Strings */
const Color string_color = EDITOR_GET("text_editor/theme/highlighting/string_color"); string_color = EDITOR_GET("text_editor/theme/highlighting/string_color");
List<String> strings; List<String> strings;
gdscript->get_string_delimiters(&strings); gdscript->get_string_delimiters(&strings);
for (const String &string : strings) { for (const String &string : strings) {

View File

@ -78,6 +78,7 @@ private:
Color built_in_type_color; Color built_in_type_color;
Color number_color; Color number_color;
Color member_color; Color member_color;
Color string_color;
Color node_path_color; Color node_path_color;
Color node_ref_color; Color node_ref_color;
Color annotation_color; Color annotation_color;

View File

@ -59,6 +59,7 @@ void GDScriptLanguage::get_string_delimiters(List<String> *p_delimiters) const {
p_delimiters->push_back("' '"); p_delimiters->push_back("' '");
p_delimiters->push_back("\"\"\" \"\"\""); p_delimiters->push_back("\"\"\" \"\"\"");
p_delimiters->push_back("''' '''"); p_delimiters->push_back("''' '''");
// NOTE: StringName, NodePath and r-strings are not listed here.
} }
bool GDScriptLanguage::is_using_templates() { bool GDScriptLanguage::is_using_templates() {

View File

@ -857,10 +857,14 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
STRING_NODEPATH, STRING_NODEPATH,
}; };
bool is_raw = false;
bool is_multiline = false; bool is_multiline = false;
StringType type = STRING_REGULAR; StringType type = STRING_REGULAR;
if (_peek(-1) == '&') { if (_peek(-1) == 'r') {
is_raw = true;
_advance();
} else if (_peek(-1) == '&') {
type = STRING_NAME; type = STRING_NAME;
_advance(); _advance();
} else if (_peek(-1) == '^') { } else if (_peek(-1) == '^') {
@ -890,7 +894,12 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
char32_t ch = _peek(); char32_t ch = _peek();
if (ch == 0x200E || ch == 0x200F || (ch >= 0x202A && ch <= 0x202E) || (ch >= 0x2066 && ch <= 0x2069)) { if (ch == 0x200E || ch == 0x200F || (ch >= 0x202A && ch <= 0x202E) || (ch >= 0x2066 && ch <= 0x2069)) {
Token error = make_error("Invisible text direction control character present in the string, escape it (\"\\u" + String::num_int64(ch, 16) + "\") to avoid confusion."); Token error;
if (is_raw) {
error = make_error("Invisible text direction control character present in the string, use regular string literal instead of r-string.");
} else {
error = make_error("Invisible text direction control character present in the string, escape it (\"\\u" + String::num_int64(ch, 16) + "\") to avoid confusion.");
}
error.start_column = column; error.start_column = column;
error.leftmost_column = error.start_column; error.leftmost_column = error.start_column;
error.end_column = column + 1; error.end_column = column + 1;
@ -905,144 +914,164 @@ GDScriptTokenizer::Token GDScriptTokenizer::string() {
return make_error("Unterminated string."); return make_error("Unterminated string.");
} }
// Grab escape character. if (is_raw) {
char32_t code = _peek(); if (_peek() == quote_char) {
_advance(); _advance();
if (_is_at_end()) { if (_is_at_end()) {
return make_error("Unterminated string."); return make_error("Unterminated string.");
} }
result += '\\';
result += quote_char;
} else if (_peek() == '\\') { // For `\\\"`.
_advance();
if (_is_at_end()) {
return make_error("Unterminated string.");
}
result += '\\';
result += '\\';
} else {
result += '\\';
}
} else {
// Grab escape character.
char32_t code = _peek();
_advance();
if (_is_at_end()) {
return make_error("Unterminated string.");
}
char32_t escaped = 0; char32_t escaped = 0;
bool valid_escape = true; bool valid_escape = true;
switch (code) { switch (code) {
case 'a': case 'a':
escaped = '\a'; escaped = '\a';
break; break;
case 'b': case 'b':
escaped = '\b'; escaped = '\b';
break; break;
case 'f': case 'f':
escaped = '\f'; escaped = '\f';
break; break;
case 'n': case 'n':
escaped = '\n'; escaped = '\n';
break; break;
case 'r': case 'r':
escaped = '\r'; escaped = '\r';
break; break;
case 't': case 't':
escaped = '\t'; escaped = '\t';
break; break;
case 'v': case 'v':
escaped = '\v'; escaped = '\v';
break; break;
case '\'': case '\'':
escaped = '\''; escaped = '\'';
break; break;
case '\"': case '\"':
escaped = '\"'; escaped = '\"';
break; break;
case '\\': case '\\':
escaped = '\\'; escaped = '\\';
break; break;
case 'U': case 'U':
case 'u': { case 'u': {
// Hexadecimal sequence. // Hexadecimal sequence.
int hex_len = (code == 'U') ? 6 : 4; int hex_len = (code == 'U') ? 6 : 4;
for (int j = 0; j < hex_len; j++) { for (int j = 0; j < hex_len; j++) {
if (_is_at_end()) { if (_is_at_end()) {
return make_error("Unterminated string."); return make_error("Unterminated string.");
}
char32_t digit = _peek();
char32_t value = 0;
if (is_digit(digit)) {
value = digit - '0';
} else if (digit >= 'a' && digit <= 'f') {
value = digit - 'a';
value += 10;
} else if (digit >= 'A' && digit <= 'F') {
value = digit - 'A';
value += 10;
} else {
// Make error, but keep parsing the string.
Token error = make_error("Invalid hexadecimal digit in unicode escape sequence.");
error.start_column = column;
error.leftmost_column = error.start_column;
error.end_column = column + 1;
error.rightmost_column = error.end_column;
push_error(error);
valid_escape = false;
break;
}
escaped <<= 4;
escaped |= value;
_advance();
} }
} break;
char32_t digit = _peek(); case '\r':
char32_t value = 0; if (_peek() != '\n') {
if (is_digit(digit)) { // Carriage return without newline in string. (???)
value = digit - '0'; // Just add it to the string and keep going.
} else if (digit >= 'a' && digit <= 'f') { result += ch;
value = digit - 'a'; _advance();
value += 10;
} else if (digit >= 'A' && digit <= 'F') {
value = digit - 'A';
value += 10;
} else {
// Make error, but keep parsing the string.
Token error = make_error("Invalid hexadecimal digit in unicode escape sequence.");
error.start_column = column;
error.leftmost_column = error.start_column;
error.end_column = column + 1;
error.rightmost_column = error.end_column;
push_error(error);
valid_escape = false;
break; break;
} }
[[fallthrough]];
escaped <<= 4; case '\n':
escaped |= value; // Escaping newline.
newline(false);
_advance(); valid_escape = false; // Don't add to the string.
}
} break;
case '\r':
if (_peek() != '\n') {
// Carriage return without newline in string. (???)
// Just add it to the string and keep going.
result += ch;
_advance();
break; break;
} default:
[[fallthrough]]; Token error = make_error("Invalid escape in string.");
case '\n':
// Escaping newline.
newline(false);
valid_escape = false; // Don't add to the string.
break;
default:
Token error = make_error("Invalid escape in string.");
error.start_column = column - 2;
error.leftmost_column = error.start_column;
push_error(error);
valid_escape = false;
break;
}
// Parse UTF-16 pair.
if (valid_escape) {
if ((escaped & 0xfffffc00) == 0xd800) {
if (prev == 0) {
prev = escaped;
prev_pos = column - 2;
continue;
} else {
Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
error.start_column = column - 2; error.start_column = column - 2;
error.leftmost_column = error.start_column; error.leftmost_column = error.start_column;
push_error(error); push_error(error);
valid_escape = false; valid_escape = false;
prev = 0; break;
}
// Parse UTF-16 pair.
if (valid_escape) {
if ((escaped & 0xfffffc00) == 0xd800) {
if (prev == 0) {
prev = escaped;
prev_pos = column - 2;
continue;
} else {
Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate.");
error.start_column = column - 2;
error.leftmost_column = error.start_column;
push_error(error);
valid_escape = false;
prev = 0;
}
} else if ((escaped & 0xfffffc00) == 0xdc00) {
if (prev == 0) {
Token error = make_error("Invalid UTF-16 sequence in string, unpaired trail surrogate.");
error.start_column = column - 2;
error.leftmost_column = error.start_column;
push_error(error);
valid_escape = false;
} else {
escaped = (prev << 10UL) + escaped - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
prev = 0;
}
} }
} else if ((escaped & 0xfffffc00) == 0xdc00) { if (prev != 0) {
if (prev == 0) { Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate.");
Token error = make_error("Invalid UTF-16 sequence in string, unpaired trail surrogate"); error.start_column = prev_pos;
error.start_column = column - 2;
error.leftmost_column = error.start_column; error.leftmost_column = error.start_column;
push_error(error); push_error(error);
valid_escape = false;
} else {
escaped = (prev << 10UL) + escaped - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
prev = 0; prev = 0;
} }
} }
if (prev != 0) {
Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate");
error.start_column = prev_pos;
error.leftmost_column = error.start_column;
push_error(error);
prev = 0;
}
}
if (valid_escape) { if (valid_escape) {
result += escaped; result += escaped;
}
} }
} else if (ch == quote_char) { } else if (ch == quote_char) {
if (prev != 0) { if (prev != 0) {
@ -1416,6 +1445,9 @@ GDScriptTokenizer::Token GDScriptTokenizer::scan() {
if (is_digit(c)) { if (is_digit(c)) {
return number(); return number();
} else if (c == 'r' && (_peek() == '"' || _peek() == '\'')) {
// Raw string literals.
return string();
} else if (is_unicode_identifier_start(c)) { } else if (is_unicode_identifier_start(c)) {
return potential_identifier(); return potential_identifier();
} }

View File

@ -0,0 +1,2 @@
# Parser-error fixture: in an r-string a backslash followed by the quote character
# keeps both characters and does not close the literal, so the string below never
# ends. Expected to fail with "Unterminated string.".
func test():
print(r"\")

View File

@ -0,0 +1,2 @@
GDTEST_PARSER_ERROR
Unterminated string.

View File

@ -0,0 +1,2 @@
# Parser-error fixture: the two backslashes are consumed together as a pair, so the
# quote right after them closes the r-string. The final quote then opens a new
# literal that is never closed. Expected to fail with "Unterminated string.".
func test():
print(r"\\"")

View File

@ -0,0 +1,2 @@
GDTEST_PARSER_ERROR
Unterminated string.

View File

@ -0,0 +1,3 @@
# Parser-error fixture: an r-string cannot contain its own quote character without
# a preceding backslash. The double quote that follows the single quote ends the
# literal, which leaves a `]` with no matching opening bracket.
func test():
# v
print(r"['"]*")

View File

@ -0,0 +1,2 @@
GDTEST_PARSER_ERROR
Closing "]" doesn't have an opening counterpart.

View File

@ -0,0 +1,22 @@
# Runtime fixture for raw string literals: every literal below is printed as written,
# i.e. backslash sequences such as \n, \t and \u2023 are not translated and the
# backslash in front of a quote character is kept in the output.
func test():
# Double-quoted r-strings, including backslash-quote and backslash-backslash pairs.
print(r"test ' \' \" \\ \n \t \u2023 test")
print(r"\n\\[\t ]*(\w+)")
print(r"")
print(r"\"")
print(r"\\\"")
print(r"\\")
print(r"\" \\\" \\\\\"")
print(r"\ \\ \\\ \\\\ \\\\\ \\")
# Single-quoted r-strings may contain double quotes directly.
print(r'"')
print(r'"(?:\\.|[^"])*"')
# Triple-quoted r-strings.
print(r"""""")
print(r"""test \t "test"="" " \" \\\" \ \\ \\\ test""")
print(r'''r"""test \t "test"="" " \" \\\" \ \\ \\\ test"""''')
# r-strings spanning several source lines, with and without a trailing backslash.
print(r"\t
\t")
print(r"\t \
\t")
print(r"""\t
\t""")
print(r"""\t \
\t""")

View File

@ -0,0 +1,22 @@
GDTEST_OK
test ' \' \" \\ \n \t \u2023 test
\n\\[\t ]*(\w+)
\"
\\\"
\\
\" \\\" \\\\\"
\ \\ \\\ \\\\ \\\\\ \\
"
"(?:\\.|[^"])*"
test \t "test"="" " \" \\\" \ \\ \\\ test
r"""test \t "test"="" " \" \\\" \ \\ \\\ test"""
\t
\t
\t \
\t
\t
\t
\t \
\t

View File

@ -10,7 +10,7 @@
var regex = RegEx.new() var regex = RegEx.new()
regex.compile("\\w-(\\d+)") regex.compile("\\w-(\\d+)")
[/codeblock] [/codeblock]
The search pattern must be escaped first for GDScript before it is escaped for the expression. For example, [code]compile("\\d+")[/code] would be read by RegEx as [code]\d+[/code]. Similarly, [code]compile("\"(?:\\\\.|[^\"])*\"")[/code] would be read as [code]"(?:\\.|[^"])*"[/code]. The search pattern must be escaped first for GDScript before it is escaped for the expression. For example, [code]compile("\\d+")[/code] would be read by RegEx as [code]\d+[/code]. Similarly, [code]compile("\"(?:\\\\.|[^\"])*\"")[/code] would be read as [code]"(?:\\.|[^"])*"[/code]. In GDScript, you can also use raw string literals (r-strings). For example, [code]compile(r'"(?:\\.|[^"])*"')[/code] would be read the same.
Using [method search], you can find the pattern within the given text. If a pattern is found, [RegExMatch] is returned and you can retrieve details of the results using methods such as [method RegExMatch.get_string] and [method RegExMatch.get_start]. Using [method search], you can find the pattern within the given text. If a pattern is found, [RegExMatch] is returned and you can retrieve details of the results using methods such as [method RegExMatch.get_string] and [method RegExMatch.get_start].
[codeblock] [codeblock]
var regex = RegEx.new() var regex = RegEx.new()