2023-01-05 12:25:55 +00:00
/**************************************************************************/
/* gdscript_tokenizer.cpp */
/**************************************************************************/
/* This file is part of: */
/* GODOT ENGINE */
/* https://godotengine.org */
/**************************************************************************/
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
/* */
/* Permission is hereby granted, free of charge, to any person obtaining */
/* a copy of this software and associated documentation files (the */
/* "Software"), to deal in the Software without restriction, including */
/* without limitation the rights to use, copy, modify, merge, publish, */
/* distribute, sublicense, and/or sell copies of the Software, and to */
/* permit persons to whom the Software is furnished to do so, subject to */
/* the following conditions: */
/* */
/* The above copyright notice and this permission notice shall be */
/* included in all copies or substantial portions of the Software. */
/* */
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/**************************************************************************/
2018-01-04 23:50:27 +00:00
2017-11-16 17:38:18 +00:00
# include "gdscript_tokenizer.h"
2017-08-27 19:07:15 +00:00
2020-11-07 22:33:38 +00:00
# include "core/error/error_macros.h"
2023-01-19 01:56:00 +00:00
# include "core/string/char_utils.h"
2020-05-01 22:14:56 +00:00
2023-01-19 01:56:00 +00:00
# ifdef DEBUG_ENABLED
# include "servers/text_server.h"
# endif
2020-05-01 22:14:56 +00:00
2023-06-13 14:56:21 +00:00
# ifdef TOOLS_ENABLED
# include "editor/editor_settings.h"
# endif
2020-05-01 22:14:56 +00:00
// Display name for every token type. The table is indexed with a token type value
// (see Token::get_name() and GDScriptTokenizer::get_token_name()), and the trailing
// comment on each entry names the enum member that entry is meant to pair with.
// NOTE(review): the enum itself is declared outside this file; entries presumably
// have to stay in exactly the same order as its members.
static const char * token_names [ ] = {
" Empty " , // EMPTY,
// Basic
" Annotation " , // ANNOTATION
" Identifier " , // IDENTIFIER,
" Literal " , // LITERAL,
// Comparison
" < " , // LESS,
" <= " , // LESS_EQUAL,
" > " , // GREATER,
" >= " , // GREATER_EQUAL,
" == " , // EQUAL_EQUAL,
" != " , // BANG_EQUAL,
// Logical
" and " , // AND,
" or " , // OR,
" not " , // NOT,
" && " , // AMPERSAND_AMPERSAND,
" || " , // PIPE_PIPE,
" ! " , // BANG,
// Bitwise
" & " , // AMPERSAND,
" | " , // PIPE,
" ~ " , // TILDE,
" ^ " , // CARET,
" << " , // LESS_LESS,
" >> " , // GREATER_GREATER,
// Math
" + " , // PLUS,
" - " , // MINUS,
" * " , // STAR,
2022-03-07 17:25:21 +00:00
" ** " , // STAR_STAR,
2020-05-01 22:14:56 +00:00
" / " , // SLASH,
" % " , // PERCENT,
// Assignment
" = " , // EQUAL,
" += " , // PLUS_EQUAL,
" -= " , // MINUS_EQUAL,
" *= " , // STAR_EQUAL,
2022-03-07 17:25:21 +00:00
" **= " , // STAR_STAR_EQUAL,
2020-05-01 22:14:56 +00:00
" /= " , // SLASH_EQUAL,
" %= " , // PERCENT_EQUAL,
" <<= " , // LESS_LESS_EQUAL,
" >>= " , // GREATER_GREATER_EQUAL,
" &= " , // AMPERSAND_EQUAL,
" |= " , // PIPE_EQUAL,
" ^= " , // CARET_EQUAL,
// Control flow
" if " , // IF,
" elif " , // ELIF,
" else " , // ELSE,
" for " , // FOR,
" while " , // WHILE,
" break " , // BREAK,
" continue " , // CONTINUE,
" pass " , // PASS,
" return " , // RETURN,
" match " , // MATCH,
// Keywords
" as " , // AS,
" assert " , // ASSERT,
" await " , // AWAIT,
" breakpoint " , // BREAKPOINT,
" class " , // CLASS,
" class_name " , // CLASS_NAME,
" const " , // CONST,
" enum " , // ENUM,
" extends " , // EXTENDS,
" func " , // FUNC,
" in " , // IN,
" is " , // IS,
" namespace " , // NAMESPACE
" preload " , // PRELOAD,
" self " , // SELF,
" signal " , // SIGNAL,
" static " , // STATIC,
" super " , // SUPER,
2020-07-16 01:02:44 +00:00
" trait " , // TRAIT,
2020-05-01 22:14:56 +00:00
" var " , // VAR,
" void " , // VOID,
" yield " , // YIELD,
// Punctuation
" [ " , // BRACKET_OPEN,
" ] " , // BRACKET_CLOSE,
" { " , // BRACE_OPEN,
" } " , // BRACE_CLOSE,
" ( " , // PARENTHESIS_OPEN,
" ) " , // PARENTHESIS_CLOSE,
" , " , // COMMA,
" ; " , // SEMICOLON,
" . " , // PERIOD,
" .. " , // PERIOD_PERIOD,
" : " , // COLON,
" $ " , // DOLLAR,
" -> " , // FORWARD_ARROW,
" _ " , // UNDERSCORE,
// Whitespace
" Newline " , // NEWLINE,
" Indent " , // INDENT,
" Dedent " , // DEDENT,
// Constants
" PI " , // CONST_PI,
" TAU " , // CONST_TAU,
" INF " , // CONST_INF,
" NaN " , // CONST_NAN,
// Error message improvement
" VCS conflict marker " , // VCS_CONFLICT_MARKER,
" ` " , // BACKTICK,
" ? " , // QUESTION_MARK,
// Special
" Error " , // ERROR,
" End of file " , // EOF,
2017-03-05 15:44:50 +00:00
} ;
2014-02-10 01:10:30 +00:00
2020-05-01 22:14:56 +00:00
// Avoid desync: fail the build when the number of names differs from Token::TK_MAX.
// This only checks the count, not that each name sits at the right index.
static_assert ( sizeof ( token_names ) / sizeof ( token_names [ 0 ] ) = = GDScriptTokenizer : : Token : : TK_MAX , " Amount of token names don't match the amount of token types. " ) ;
2017-03-31 17:28:34 +00:00
2020-05-01 22:14:56 +00:00
const char * GDScriptTokenizer : : Token : : get_name ( ) const {
ERR_FAIL_INDEX_V_MSG ( type , TK_MAX , " <error> " , " Using token type out of the enum. " ) ;
return token_names [ type ] ;
2014-02-10 01:10:30 +00:00
}
2023-02-15 14:41:46 +00:00
bool GDScriptTokenizer : : Token : : can_precede_bin_op ( ) const {
switch ( type ) {
case IDENTIFIER :
case LITERAL :
case SELF :
case BRACKET_CLOSE :
case BRACE_CLOSE :
case PARENTHESIS_CLOSE :
case CONST_PI :
case CONST_TAU :
case CONST_INF :
case CONST_NAN :
return true ;
default :
return false ;
}
}
2020-08-17 23:14:46 +00:00
bool GDScriptTokenizer : : Token : : is_identifier ( ) const {
// Note: Most keywords should not be recognized as identifiers.
// These are only exceptions for stuff that already is on the engine's API.
switch ( type ) {
case IDENTIFIER :
case MATCH : // Used in String.match().
2023-02-09 14:17:37 +00:00
// Allow constants to be treated as regular identifiers.
case CONST_PI :
case CONST_INF :
case CONST_NAN :
case CONST_TAU :
2020-08-17 23:14:46 +00:00
return true ;
default :
return false ;
}
}
2020-08-19 13:19:05 +00:00
bool GDScriptTokenizer : : Token : : is_node_name ( ) const {
// This is meant to allow keywords with the $ notation, but not as general identifiers.
switch ( type ) {
case IDENTIFIER :
case AND :
case AS :
case ASSERT :
case AWAIT :
case BREAK :
case BREAKPOINT :
case CLASS_NAME :
case CLASS :
case CONST :
2023-02-09 14:17:37 +00:00
case CONST_PI :
case CONST_INF :
case CONST_NAN :
case CONST_TAU :
2020-08-19 13:19:05 +00:00
case CONTINUE :
case ELIF :
case ELSE :
case ENUM :
case EXTENDS :
case FOR :
case FUNC :
case IF :
case IN :
case IS :
case MATCH :
case NAMESPACE :
case NOT :
case OR :
case PASS :
case PRELOAD :
case RETURN :
case SELF :
case SIGNAL :
case STATIC :
case SUPER :
case TRAIT :
case UNDERSCORE :
case VAR :
case VOID :
case WHILE :
case YIELD :
return true ;
default :
return false ;
}
}
2020-05-01 22:14:56 +00:00
// Looks up the display name for an arbitrary token type in token_names.
// The index guard is handed "<error>" as its fallback value for a type outside [0, Token::TK_MAX).
String GDScriptTokenizer::get_token_name(Token::Type p_token_type) {
	ERR_FAIL_INDEX_V_MSG(p_token_type, Token::TK_MAX, "<error>", "Using token type out of the enum.");
	const char *display_name = token_names[p_token_type];
	return display_name;
}
2020-05-01 22:14:56 +00:00
void GDScriptTokenizer : : set_source_code ( const String & p_source_code ) {
source = p_source_code ;
2020-12-15 12:04:21 +00:00
if ( source . is_empty ( ) ) {
2020-07-27 10:43:20 +00:00
_source = U " " ;
2020-05-01 22:14:56 +00:00
} else {
_source = source . ptr ( ) ;
2017-03-31 17:28:34 +00:00
}
2020-05-01 22:14:56 +00:00
_current = _source ;
line = 1 ;
column = 1 ;
length = p_source_code . length ( ) ;
position = 0 ;
2017-03-31 17:28:34 +00:00
}
2020-05-01 22:14:56 +00:00
void GDScriptTokenizer : : set_cursor_position ( int p_line , int p_column ) {
cursor_line = p_line ;
cursor_column = p_column ;
2014-02-10 01:10:30 +00:00
}
2020-05-01 22:14:56 +00:00
void GDScriptTokenizer : : set_multiline_mode ( bool p_state ) {
multiline_mode = p_state ;
2014-02-10 01:10:30 +00:00
}
2021-03-25 13:36:29 +00:00
void GDScriptTokenizer : : push_expression_indented_block ( ) {
indent_stack_stack . push_back ( indent_stack ) ;
}
void GDScriptTokenizer : : pop_expression_indented_block ( ) {
ERR_FAIL_COND ( indent_stack_stack . size ( ) = = 0 ) ;
indent_stack = indent_stack_stack . back ( ) - > get ( ) ;
indent_stack_stack . pop_back ( ) ;
}
2020-05-01 22:14:56 +00:00
int GDScriptTokenizer : : get_cursor_line ( ) const {
return cursor_line ;
2014-02-10 01:10:30 +00:00
}
2020-05-01 22:14:56 +00:00
int GDScriptTokenizer : : get_cursor_column ( ) const {
return cursor_column ;
2019-04-25 12:43:48 +00:00
}
2020-07-06 15:24:24 +00:00
bool GDScriptTokenizer : : is_past_cursor ( ) const {
if ( line < cursor_line ) {
return false ;
}
if ( line > cursor_line ) {
return true ;
}
if ( column < cursor_column ) {
return false ;
}
return true ;
}
2020-07-27 10:43:20 +00:00
char32_t GDScriptTokenizer : : _advance ( ) {
2020-05-01 22:14:56 +00:00
if ( unlikely ( _is_at_end ( ) ) ) {
return ' \0 ' ;
}
_current + + ;
column + + ;
2020-07-06 15:24:24 +00:00
position + + ;
2020-05-01 22:14:56 +00:00
if ( column > rightmost_column ) {
rightmost_column = column ;
}
if ( unlikely ( _is_at_end ( ) ) ) {
// Add extra newline even if it's not there, to satisfy the parser.
newline ( true ) ;
// Also add needed unindent.
check_indent ( ) ;
}
return _peek ( - 1 ) ;
}
2014-02-10 01:10:30 +00:00
2020-07-27 10:43:20 +00:00
void GDScriptTokenizer : : push_paren ( char32_t p_char ) {
2020-05-01 22:14:56 +00:00
paren_stack . push_back ( p_char ) ;
2014-02-10 01:10:30 +00:00
}
2020-05-14 12:29:06 +00:00
2020-07-27 10:43:20 +00:00
bool GDScriptTokenizer : : pop_paren ( char32_t p_expected ) {
2020-12-15 12:04:21 +00:00
if ( paren_stack . is_empty ( ) ) {
2020-05-01 22:14:56 +00:00
return false ;
}
2020-07-27 10:43:20 +00:00
char32_t actual = paren_stack . back ( ) - > get ( ) ;
2020-05-01 22:14:56 +00:00
paren_stack . pop_back ( ) ;
2014-02-10 01:10:30 +00:00
2020-05-01 22:14:56 +00:00
return actual = = p_expected ;
}
2014-02-10 01:10:30 +00:00
2020-05-01 22:14:56 +00:00
// Removes and returns the most recently pushed error token.
// The back of error_stack is dereferenced without a check, so the stack must not be empty.
GDScriptTokenizer::Token GDScriptTokenizer::pop_error() {
	Token popped = error_stack.back()->get();
	error_stack.pop_back();
	return popped;
}
2020-07-06 15:24:24 +00:00
// Builds a token of the given type that spans from the recorded token start
// (_start / start_line / start_column) to the current scanning position, and
// remembers it as last_token.
//
// For every type except ERROR, and only when a cursor line is set (cursor_line > -1),
// the token is also tagged with where the cursor sits relative to it. Blanks (spaces
// and tabs) directly after the token are counted as part of it for this check.
GDScriptTokenizer::Token GDScriptTokenizer::make_token(Token::Type p_type) {
	Token token(p_type);
	token.start_line = start_line;
	token.end_line = line;
	token.start_column = start_column;
	token.end_column = column;
	token.leftmost_column = leftmost_column;
	token.rightmost_column = rightmost_column;
	token.source = String(_start, _current - _start);

	if (p_type != Token::ERROR && cursor_line > -1) {
		// Also count whitespace after token.
		int offset = 0;
		while (_peek(offset) == ' ' || _peek(offset) == '\t') {
			offset++;
		}
		int last_column = column + offset;

		// Check cursor position in token.
		if (start_line == line) {
			// Single line token.
			if (cursor_line == start_line && cursor_column >= start_column && cursor_column <= last_column) {
				token.cursor_position = cursor_column - start_column;
				if (cursor_column == start_column) {
					token.cursor_place = CURSOR_BEGINNING;
				} else if (cursor_column < column) {
					token.cursor_place = CURSOR_MIDDLE;
				} else {
					// At the token's end or within the trailing blanks.
					token.cursor_place = CURSOR_END;
				}
			}
		} else {
			// Multi line token.
			if (cursor_line == start_line && cursor_column >= start_column) {
				// Is in first line.
				token.cursor_position = cursor_column - start_column;
				if (cursor_column == start_column) {
					token.cursor_place = CURSOR_BEGINNING;
				} else {
					token.cursor_place = CURSOR_MIDDLE;
				}
			} else if (cursor_line == line && cursor_column <= last_column) {
				// Is in last line.
				token.cursor_position = cursor_column - start_column;
				if (cursor_column < column) {
					token.cursor_place = CURSOR_MIDDLE;
				} else {
					token.cursor_place = CURSOR_END;
				}
			} else if (cursor_line > start_line && cursor_line < line) {
				// Is in middle line.
				// CURSOR_MIDDLE is a cursor *place*, not a column offset: it used to be
				// written to cursor_position, which left cursor_place unset for this case.
				token.cursor_place = CURSOR_MIDDLE;
			}
		}
	}

	last_token = token;
	return token;
}
2020-07-06 15:24:24 +00:00
// Builds a LITERAL token for the current span and attaches the given value to it.
GDScriptTokenizer::Token GDScriptTokenizer::make_literal(const Variant &p_literal) {
	Token literal_token = make_token(Token::LITERAL);
	literal_token.literal = p_literal;
	return literal_token;
}
2014-02-10 01:10:30 +00:00
2020-07-06 15:24:24 +00:00
// Builds an IDENTIFIER token for the current span and stores the name as its literal.
GDScriptTokenizer::Token GDScriptTokenizer::make_identifier(const StringName &p_identifier) {
	Token identifier_token = make_token(Token::IDENTIFIER);
	identifier_token.literal = p_identifier;
	return identifier_token;
}
2020-05-01 22:14:56 +00:00
// Builds an ERROR token for the current span and stores the message as its literal.
// The token is only returned; it is not added to the error stack.
GDScriptTokenizer::Token GDScriptTokenizer::make_error(const String &p_message) {
	Token error_token = make_token(Token::ERROR);
	error_token.literal = p_message;
	return error_token;
}
2020-05-01 22:14:56 +00:00
void GDScriptTokenizer : : push_error ( const String & p_message ) {
Token error = make_error ( p_message ) ;
error_stack . push_back ( error ) ;
2014-02-10 01:10:30 +00:00
}
2020-05-01 22:14:56 +00:00
void GDScriptTokenizer : : push_error ( const Token & p_error ) {
error_stack . push_back ( p_error ) ;
}
2014-02-10 01:10:30 +00:00
2020-07-27 10:43:20 +00:00
// Builds the error token for a closing bracket that has no matching opening bracket.
// With an empty bracket stack the message reports a missing opening counterpart;
// otherwise it names the mismatched opening bracket, which is then removed from the stack.
GDScriptTokenizer::Token GDScriptTokenizer::make_paren_error(char32_t p_paren) {
	if (paren_stack.is_empty()) {
		return make_error(vformat("Closing \"%c\" doesn't have an opening counterpart.", p_paren));
	}

	const char32_t opening = paren_stack.back()->get();
	Token mismatch = make_error(vformat("Closing \"%c\" doesn't match the opening \"%c\".", p_paren, opening));
	// Remove opening one anyway.
	paren_stack.pop_back();
	return mismatch;
}
2014-02-10 01:10:30 +00:00
2020-07-27 10:43:20 +00:00
// Decides between a regular doubled-character token and a VCS conflict marker.
// Called once two occurrences of p_test have been matched (the first one already
// consumed by scan()). A run of seven or more becomes a VCS_CONFLICT_MARKER token;
// anything shorter yields a token of p_double_type.
GDScriptTokenizer::Token GDScriptTokenizer::check_vcs_marker(char32_t p_test, Token::Type p_double_type) {
	// Count the run by looking ahead only, since we don't want to consume more than needed.
	int run_length = 2; // Two already matched.
	for (const char32_t *lookahead = _current + 1; *lookahead == p_test; lookahead++) {
		run_length++;
	}

	if (run_length < 7) {
		// It is only a regular double character token, so we consume the second character.
		_advance();
		return make_token(p_double_type);
	}

	// It is a VCS conflict marker: consume every character of the run except the
	// first, which was consumed before this function was called.
	for (int remaining = run_length - 1; remaining > 0; remaining--) {
		_advance();
	}
	return make_token(Token::VCS_CONFLICT_MARKER);
}
2020-05-01 22:14:56 +00:00
// Scans an annotation name and returns an ANNOTATION token whose literal is the
// token's source text as a StringName. A missing identifier start character is
// reported through the error stack, but a token is still produced.
GDScriptTokenizer::Token GDScriptTokenizer::annotation() {
	if (!is_unicode_identifier_start(_peek())) {
		push_error("Expected annotation identifier after \"@\".");
	} else {
		// Consume start character.
		_advance();
	}

	// Consume all identifier characters.
	while (is_unicode_identifier_continue(_peek())) {
		_advance();
	}

	Token result = make_token(Token::ANNOTATION);
	result.literal = StringName(result.source);
	return result;
}
2020-05-01 22:14:56 +00:00
// List of every keyword paired with its token type, written as a macro that takes two
// other macros as arguments: KEYWORD_GROUP(character) is expanded once before the
// keywords starting with that character, and KEYWORD(text, token_type) is expanded once
// per keyword. Each user passes its own pair to turn the list into the code it needs
// (a list of keyword strings in make_keyword_list(), switch cases in potential_identifier()).
# define KEYWORDS(KEYWORD_GROUP, KEYWORD) \
KEYWORD_GROUP ( ' a ' ) \
KEYWORD ( " as " , Token : : AS ) \
KEYWORD ( " and " , Token : : AND ) \
KEYWORD ( " assert " , Token : : ASSERT ) \
KEYWORD ( " await " , Token : : AWAIT ) \
KEYWORD_GROUP ( ' b ' ) \
KEYWORD ( " break " , Token : : BREAK ) \
KEYWORD ( " breakpoint " , Token : : BREAKPOINT ) \
KEYWORD_GROUP ( ' c ' ) \
KEYWORD ( " class " , Token : : CLASS ) \
KEYWORD ( " class_name " , Token : : CLASS_NAME ) \
KEYWORD ( " const " , Token : : CONST ) \
KEYWORD ( " continue " , Token : : CONTINUE ) \
KEYWORD_GROUP ( ' e ' ) \
KEYWORD ( " elif " , Token : : ELIF ) \
KEYWORD ( " else " , Token : : ELSE ) \
KEYWORD ( " enum " , Token : : ENUM ) \
KEYWORD ( " extends " , Token : : EXTENDS ) \
KEYWORD_GROUP ( ' f ' ) \
KEYWORD ( " for " , Token : : FOR ) \
KEYWORD ( " func " , Token : : FUNC ) \
KEYWORD_GROUP ( ' i ' ) \
KEYWORD ( " if " , Token : : IF ) \
KEYWORD ( " in " , Token : : IN ) \
KEYWORD ( " is " , Token : : IS ) \
KEYWORD_GROUP ( ' m ' ) \
KEYWORD ( " match " , Token : : MATCH ) \
KEYWORD_GROUP ( ' n ' ) \
KEYWORD ( " namespace " , Token : : NAMESPACE ) \
KEYWORD ( " not " , Token : : NOT ) \
KEYWORD_GROUP ( ' o ' ) \
KEYWORD ( " or " , Token : : OR ) \
KEYWORD_GROUP ( ' p ' ) \
KEYWORD ( " pass " , Token : : PASS ) \
KEYWORD ( " preload " , Token : : PRELOAD ) \
KEYWORD_GROUP ( ' r ' ) \
KEYWORD ( " return " , Token : : RETURN ) \
KEYWORD_GROUP ( ' s ' ) \
KEYWORD ( " self " , Token : : SELF ) \
KEYWORD ( " signal " , Token : : SIGNAL ) \
KEYWORD ( " static " , Token : : STATIC ) \
KEYWORD ( " super " , Token : : SUPER ) \
2020-07-16 01:02:44 +00:00
KEYWORD_GROUP ( ' t ' ) \
KEYWORD ( " trait " , Token : : TRAIT ) \
2020-05-01 22:14:56 +00:00
KEYWORD_GROUP ( ' v ' ) \
KEYWORD ( " var " , Token : : VAR ) \
KEYWORD ( " void " , Token : : VOID ) \
KEYWORD_GROUP ( ' w ' ) \
KEYWORD ( " while " , Token : : WHILE ) \
KEYWORD_GROUP ( ' y ' ) \
KEYWORD ( " yield " , Token : : YIELD ) \
KEYWORD_GROUP ( ' I ' ) \
KEYWORD ( " INF " , Token : : CONST_INF ) \
KEYWORD_GROUP ( ' N ' ) \
KEYWORD ( " NAN " , Token : : CONST_NAN ) \
KEYWORD_GROUP ( ' P ' ) \
KEYWORD ( " PI " , Token : : CONST_PI ) \
KEYWORD_GROUP ( ' T ' ) \
KEYWORD ( " TAU " , Token : : CONST_TAU )
// Bounds on keyword length. potential_identifier() uses them to skip the keyword lookup
// for names that are too short or too long, and static_asserts that every keyword in
// the list above fits within them.
# define MIN_KEYWORD_LENGTH 2
# define MAX_KEYWORD_LENGTH 10
2023-01-19 01:56:00 +00:00
#ifdef DEBUG_ENABLED
// Fills keyword_list with the text of every entry in KEYWORDS, in the order they are
// listed there. Only compiled in debug builds; the list is used by the confusable
// identifier check in potential_identifier().
void GDScriptTokenizer::make_keyword_list() {
// Keep only the keyword text of each entry and drop the group markers entirely.
#define KEYWORD_TEXT_ONLY(m_keyword, m_token_type) m_keyword,
#define KEYWORD_GROUP_SKIP(m_group)
	keyword_list = {
		KEYWORDS(KEYWORD_GROUP_SKIP, KEYWORD_TEXT_ONLY)
	};
#undef KEYWORD_TEXT_ONLY
#undef KEYWORD_GROUP_SKIP
}
#endif // DEBUG_ENABLED
// Scans a run of identifier characters (the first one is already consumed) and
// classifies it, in this order: a lone underscore, an identifier whose length rules
// out any keyword, a non-ASCII identifier, a keyword from KEYWORDS, one of the
// literals `true` / `null` / `false`, or finally a plain identifier.
GDScriptTokenizer::Token GDScriptTokenizer::potential_identifier() {
	bool only_ascii = _peek(-1) < 128;

	// Consume all identifier characters, noting whether any of them is outside ASCII.
	while (is_unicode_identifier_continue(_peek())) {
		const char32_t consumed = _advance();
		if (consumed >= 128) {
			only_ascii = false;
		}
	}

	const int name_length = _current - _start;

	// Lone underscore.
	if (name_length == 1 && _peek(-1) == '_') {
		return make_token(Token::UNDERSCORE);
	}

	String name(_start, name_length);

	// Cannot be a keyword, as the length doesn't match any.
	if (name_length < MIN_KEYWORD_LENGTH || name_length > MAX_KEYWORD_LENGTH) {
		return make_identifier(name);
	}

	if (!only_ascii) {
		// The token is made before the check below in case the order with push_error matters.
		Token non_ascii_identifier = make_identifier(name);

#ifdef DEBUG_ENABLED
		// Additional checks for identifiers but only in debug and if it's available in TextServer.
		if (TS->has_feature(TextServer::FEATURE_UNICODE_SECURITY)) {
			const int64_t confusable_index = TS->is_confusable(name, keyword_list);
			if (confusable_index >= 0) {
				push_error(vformat(R"(Identifier "%s" is visually similar to the GDScript keyword "%s" and thus not allowed.)", name, keyword_list[confusable_index]));
			}
		}
#endif // DEBUG_ENABLED

		// Cannot be a keyword, as keywords are ASCII only.
		return non_ascii_identifier;
	}

	// Helper macros that expand KEYWORDS into the body of the switch below: each group
	// closes the previous case and opens one for its first character, and each keyword
	// becomes a length + text comparison that returns the matching token.
#define KEYWORD_GROUP_CASE(m_char) \
	break;                         \
	case m_char:
#define KEYWORD(m_keyword, m_token_type)                                                                                 \
	{                                                                                                                    \
		const int keyword_length = sizeof(m_keyword) - 1;                                                                \
		static_assert(keyword_length <= MAX_KEYWORD_LENGTH, "There's a keyword longer than the defined maximum length");  \
		static_assert(keyword_length >= MIN_KEYWORD_LENGTH, "There's a keyword shorter than the defined minimum length"); \
		if (keyword_length == name_length && name == m_keyword) {                                                        \
			return make_token(m_token_type);                                                                             \
		}                                                                                                                \
	}

	// Find if it's a keyword, only comparing against keywords sharing the first character.
	switch (_start[0]) {
		default:
			KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
			break;
	}

	// Check if it's a special literal.
	if (name_length == 4 && name == "true") {
		return make_literal(true);
	}
	if (name_length == 4 && name == "null") {
		return make_literal(Variant());
	}
	if (name_length == 5 && name == "false") {
		return make_literal(false);
	}

	// Not a keyword, so must be an identifier.
	return make_identifier(name);

#undef KEYWORD_GROUP_CASE
#undef KEYWORD
}
2023-01-19 01:56:00 +00:00
// Retire the keyword list macros so they are not visible to the rest of the file.
# undef MAX_KEYWORD_LENGTH
# undef MIN_KEYWORD_LENGTH
# undef KEYWORDS
2020-05-01 22:14:56 +00:00
void GDScriptTokenizer : : newline ( bool p_make_token ) {
2020-09-18 11:35:51 +00:00
// Don't overwrite previous newline, nor create if we want a line continuation.
2020-05-01 22:14:56 +00:00
if ( p_make_token & & ! pending_newline & & ! line_continuation ) {
Token newline ( Token : : NEWLINE ) ;
newline . start_line = line ;
newline . end_line = line ;
newline . start_column = column - 1 ;
newline . end_column = column ;
newline . leftmost_column = newline . start_column ;
newline . rightmost_column = newline . end_column ;
pending_newline = true ;
2023-02-15 14:41:46 +00:00
last_token = newline ;
2020-05-01 22:14:56 +00:00
last_newline = newline ;
}
2014-02-10 01:10:30 +00:00
2020-05-01 22:14:56 +00:00
// Increment line/column counters.
line + + ;
column = 1 ;
leftmost_column = 1 ;
2014-02-10 01:10:30 +00:00
}
2020-05-01 22:14:56 +00:00
// Scans a numeric literal whose first character is already consumed and returns it as
// a LITERAL token. Handles decimal, hexadecimal (`0x`) and binary (`0b`) integers,
// decimal fractions and exponents, and `_` as a digit separator. Problems are pushed
// on the error stack, positioned at the offending column, while scanning continues.
// The literal is an integer unless a decimal point or an exponent was seen.
GDScriptTokenizer::Token GDScriptTokenizer::number() {
	using DigitCheck = bool (*)(char32_t);

	int base = 10;
	bool has_decimal = false;
	bool has_exponent = false;
	bool has_error = false;
	DigitCheck digit_check_func = is_digit;

	// Pushes an error that covers only the single character at the current column.
	auto report_at_column = [this](const String &p_message) {
		Token error = make_error(p_message);
		error.start_column = column;
		error.leftmost_column = column;
		error.end_column = column + 1;
		error.rightmost_column = column + 1;
		push_error(error);
	};

	// Consumes a run of digits accepted by p_is_digit. `_` is allowed inside the run
	// for readability, but two of them in a row are reported.
	auto consume_digit_run = [this, &report_at_column](DigitCheck p_is_digit) {
		bool previous_was_underscore = false;
		while (p_is_digit(_peek()) || is_underscore(_peek())) {
			if (is_underscore(_peek())) {
				if (previous_was_underscore) {
					report_at_column(R"(Multiple underscores cannot be adjacent in a numeric literal.)");
				}
				previous_was_underscore = true;
			} else {
				previous_was_underscore = false;
			}
			_advance();
		}
	};

	// Sign before hexadecimal or binary.
	if ((_peek(-1) == '+' || _peek(-1) == '-') && _peek() == '0') {
		_advance();
	}

	if (_peek(-1) == '.') {
		has_decimal = true;
	} else if (_peek(-1) == '0') {
		if (_peek() == 'x') {
			// Hexadecimal.
			base = 16;
			digit_check_func = is_hex_digit;
			_advance();
		} else if (_peek() == 'b') {
			// Binary.
			base = 2;
			digit_check_func = is_binary_digit;
			_advance();
		}
	}

	// Disallow `0x_` and `0b_`.
	if (base != 10 && is_underscore(_peek())) {
		report_at_column(vformat(R"(Unexpected underscore after "0%c".)", _peek(-1)));
		has_error = true;
	}
	consume_digit_run(digit_check_func);

	// It might be a ".." token (instead of decimal point) so we check if it's not.
	if (_peek() == '.' && _peek(1) != '.') {
		if (base == 10 && !has_decimal) {
			has_decimal = true;
		} else {
			if (base == 10) {
				report_at_column("Cannot use a decimal point twice in a number.");
			} else if (base == 16) {
				report_at_column("Cannot use a decimal point in a hexadecimal number.");
			} else {
				report_at_column("Cannot use a decimal point in a binary number.");
			}
			has_error = true;
		}

		if (!has_error) {
			_advance();

			// Consume decimal digits. Disallow `10._`, but allow `10.`.
			if (is_underscore(_peek())) {
				report_at_column(R"(Unexpected underscore after decimal point.)");
				has_error = true;
			}
			consume_digit_run(is_digit);
		}
	}

	if (base == 10 && (_peek() == 'e' || _peek() == 'E')) {
		has_exponent = true;
		_advance();
		if (_peek() == '+' || _peek() == '-') {
			// Exponent sign.
			_advance();
		}
		// Consume exponent digits.
		if (!is_digit(_peek())) {
			report_at_column(R"(Expected exponent value after "e".)");
		}
		consume_digit_run(is_digit);
	}

	// Detect extra decimal point.
	if (!has_error && has_decimal && _peek() == '.' && _peek(1) != '.') {
		report_at_column("Cannot use a decimal point twice in a number.");
		has_error = true;
	} else if (is_unicode_identifier_start(_peek()) || is_unicode_identifier_continue(_peek())) {
		// Letter at the end of the number.
		push_error("Invalid numeric notation.");
	}

	// Create a string with the whole number, without the digit separators.
	const int len = _current - _start;
	String number = String(_start, len).replace("_", "");

	// Convert to the appropriate literal type.
	if (base == 16) {
		int64_t value = number.hex_to_int();
		return make_literal(value);
	}
	if (base == 2) {
		int64_t value = number.bin_to_int();
		return make_literal(value);
	}
	if (has_decimal || has_exponent) {
		double value = number.to_float();
		return make_literal(value);
	}
	int64_t value = number.to_int();
	return make_literal(value);
}
2020-05-01 22:14:56 +00:00
GDScriptTokenizer : : Token GDScriptTokenizer : : string ( ) {
enum StringType {
STRING_REGULAR ,
STRING_NAME ,
STRING_NODEPATH ,
} ;
2014-02-10 01:10:30 +00:00
2023-08-28 10:00:33 +00:00
bool is_raw = false ;
2020-05-01 22:14:56 +00:00
bool is_multiline = false ;
StringType type = STRING_REGULAR ;
2014-02-10 01:10:30 +00:00
2023-08-28 10:00:33 +00:00
if ( _peek ( - 1 ) = = ' r ' ) {
is_raw = true ;
_advance ( ) ;
} else if ( _peek ( - 1 ) = = ' & ' ) {
2020-05-01 22:14:56 +00:00
type = STRING_NAME ;
_advance ( ) ;
} else if ( _peek ( - 1 ) = = ' ^ ' ) {
type = STRING_NODEPATH ;
2014-02-10 01:10:30 +00:00
_advance ( ) ;
2020-05-14 14:41:43 +00:00
}
2014-02-25 12:31:47 +00:00
2020-07-27 10:43:20 +00:00
char32_t quote_char = _peek ( - 1 ) ;
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
if ( _peek ( ) = = quote_char & & _peek ( 1 ) = = quote_char ) {
is_multiline = true ;
// Consume all quotes.
_advance ( ) ;
_advance ( ) ;
}
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
String result ;
2022-01-30 13:44:07 +00:00
char32_t prev = 0 ;
int prev_pos = 0 ;
2016-03-08 23:00:52 +00:00
2020-05-01 22:14:56 +00:00
for ( ; ; ) {
// Consume actual string.
if ( _is_at_end ( ) ) {
return make_error ( " Unterminated string. " ) ;
2014-02-25 12:31:47 +00:00
}
2020-07-27 10:43:20 +00:00
char32_t ch = _peek ( ) ;
2016-03-08 23:00:52 +00:00
2021-11-11 13:41:59 +00:00
if ( ch = = 0x200E | | ch = = 0x200F | | ( ch > = 0x202A & & ch < = 0x202E ) | | ( ch > = 0x2066 & & ch < = 0x2069 ) ) {
2023-08-28 10:00:33 +00:00
Token error ;
if ( is_raw ) {
error = make_error ( " Invisible text direction control character present in the string, use regular string literal instead of r-string. " ) ;
} else {
error = make_error ( " Invisible text direction control character present in the string, escape it ( \" \\ u " + String : : num_int64 ( ch , 16 ) + " \" ) to avoid confusion. " ) ;
}
2021-11-11 13:41:59 +00:00
error . start_column = column ;
error . leftmost_column = error . start_column ;
error . end_column = column + 1 ;
error . rightmost_column = error . end_column ;
push_error ( error ) ;
}
2020-05-01 22:14:56 +00:00
if ( ch = = ' \\ ' ) {
// Escape pattern.
_advance ( ) ;
if ( _is_at_end ( ) ) {
return make_error ( " Unterminated string. " ) ;
}
2014-02-25 12:31:47 +00:00
2023-08-28 10:00:33 +00:00
if ( is_raw ) {
if ( _peek ( ) = = quote_char ) {
_advance ( ) ;
if ( _is_at_end ( ) ) {
return make_error ( " Unterminated string. " ) ;
}
result + = ' \\ ' ;
result + = quote_char ;
} else if ( _peek ( ) = = ' \\ ' ) { // For `\\\"`.
_advance ( ) ;
if ( _is_at_end ( ) ) {
return make_error ( " Unterminated string. " ) ;
}
result + = ' \\ ' ;
result + = ' \\ ' ;
} else {
result + = ' \\ ' ;
}
} else {
// Grab escape character.
char32_t code = _peek ( ) ;
_advance ( ) ;
if ( _is_at_end ( ) ) {
return make_error ( " Unterminated string. " ) ;
}
2014-02-25 12:31:47 +00:00
2023-08-28 10:00:33 +00:00
char32_t escaped = 0 ;
bool valid_escape = true ;
2014-02-25 12:31:47 +00:00
2023-08-28 10:00:33 +00:00
switch ( code ) {
case ' a ' :
escaped = ' \a ' ;
break ;
case ' b ' :
escaped = ' \b ' ;
break ;
case ' f ' :
escaped = ' \f ' ;
break ;
case ' n ' :
escaped = ' \n ' ;
break ;
case ' r ' :
escaped = ' \r ' ;
break ;
case ' t ' :
escaped = ' \t ' ;
break ;
case ' v ' :
escaped = ' \v ' ;
break ;
case ' \' ' :
escaped = ' \' ' ;
break ;
case ' \" ' :
escaped = ' \" ' ;
break ;
case ' \\ ' :
escaped = ' \\ ' ;
break ;
case ' U ' :
case ' u ' : {
// Hexadecimal sequence.
int hex_len = ( code = = ' U ' ) ? 6 : 4 ;
for ( int j = 0 ; j < hex_len ; j + + ) {
if ( _is_at_end ( ) ) {
return make_error ( " Unterminated string. " ) ;
}
char32_t digit = _peek ( ) ;
char32_t value = 0 ;
if ( is_digit ( digit ) ) {
value = digit - ' 0 ' ;
} else if ( digit > = ' a ' & & digit < = ' f ' ) {
value = digit - ' a ' ;
value + = 10 ;
} else if ( digit > = ' A ' & & digit < = ' F ' ) {
value = digit - ' A ' ;
value + = 10 ;
} else {
// Make error, but keep parsing the string.
Token error = make_error ( " Invalid hexadecimal digit in unicode escape sequence. " ) ;
error . start_column = column ;
error . leftmost_column = error . start_column ;
error . end_column = column + 1 ;
error . rightmost_column = error . end_column ;
push_error ( error ) ;
valid_escape = false ;
break ;
}
escaped < < = 4 ;
escaped | = value ;
_advance ( ) ;
2020-05-01 22:14:56 +00:00
}
2023-08-28 10:00:33 +00:00
} break ;
case ' \r ' :
if ( _peek ( ) ! = ' \n ' ) {
// Carriage return without newline in string. (???)
// Just add it to the string and keep going.
result + = ch ;
_advance ( ) ;
2020-05-01 22:14:56 +00:00
break ;
}
2023-08-28 10:00:33 +00:00
[[fallthrough]] ;
case ' \n ' :
// Escaping newline.
newline ( false ) ;
valid_escape = false ; // Don't add to the string.
2020-05-01 22:14:56 +00:00
break ;
2023-08-28 10:00:33 +00:00
default :
Token error = make_error ( " Invalid escape in string. " ) ;
2022-01-30 13:44:07 +00:00
error . start_column = column - 2 ;
error . leftmost_column = error . start_column ;
push_error ( error ) ;
valid_escape = false ;
2023-08-28 10:00:33 +00:00
break ;
}
// Parse UTF-16 pair.
if ( valid_escape ) {
if ( ( escaped & 0xfffffc00 ) = = 0xd800 ) {
if ( prev = = 0 ) {
prev = escaped ;
prev_pos = column - 2 ;
continue ;
} else {
Token error = make_error ( " Invalid UTF-16 sequence in string, unpaired lead surrogate. " ) ;
error . start_column = column - 2 ;
error . leftmost_column = error . start_column ;
push_error ( error ) ;
valid_escape = false ;
prev = 0 ;
}
} else if ( ( escaped & 0xfffffc00 ) = = 0xdc00 ) {
if ( prev = = 0 ) {
Token error = make_error ( " Invalid UTF-16 sequence in string, unpaired trail surrogate. " ) ;
error . start_column = column - 2 ;
error . leftmost_column = error . start_column ;
push_error ( error ) ;
valid_escape = false ;
} else {
escaped = ( prev < < 10UL ) + escaped - ( ( 0xd800 < < 10UL ) + 0xdc00 - 0x10000 ) ;
prev = 0 ;
}
2022-01-30 13:44:07 +00:00
}
2023-08-28 10:00:33 +00:00
if ( prev ! = 0 ) {
Token error = make_error ( " Invalid UTF-16 sequence in string, unpaired lead surrogate. " ) ;
error . start_column = prev_pos ;
2022-01-30 13:44:07 +00:00
error . leftmost_column = error . start_column ;
push_error ( error ) ;
prev = 0 ;
}
}
2014-02-25 12:31:47 +00:00
2023-08-28 10:00:33 +00:00
if ( valid_escape ) {
result + = escaped ;
}
2020-05-01 22:14:56 +00:00
}
} else if ( ch = = quote_char ) {
2022-01-30 13:44:07 +00:00
if ( prev ! = 0 ) {
Token error = make_error ( " Invalid UTF-16 sequence in string, unpaired lead surrogate " ) ;
error . start_column = prev_pos ;
error . leftmost_column = error . start_column ;
push_error ( error ) ;
prev = 0 ;
}
2020-05-01 22:14:56 +00:00
_advance ( ) ;
if ( is_multiline ) {
if ( _peek ( ) = = quote_char & & _peek ( 1 ) = = quote_char ) {
// Ended the multiline string. Consume all quotes.
_advance ( ) ;
_advance ( ) ;
break ;
2021-03-19 00:24:18 +00:00
} else {
// Not a multiline string termination, add consumed quote.
result + = quote_char ;
2020-05-01 22:14:56 +00:00
}
} else {
// Ended single-line string.
break ;
}
2014-02-25 12:31:47 +00:00
} else {
2022-01-30 13:44:07 +00:00
if ( prev ! = 0 ) {
Token error = make_error ( " Invalid UTF-16 sequence in string, unpaired lead surrogate " ) ;
error . start_column = prev_pos ;
error . leftmost_column = error . start_column ;
push_error ( error ) ;
prev = 0 ;
}
2020-05-01 22:14:56 +00:00
result + = ch ;
_advance ( ) ;
if ( ch = = ' \n ' ) {
newline ( false ) ;
}
2014-02-25 12:31:47 +00:00
}
}
2022-01-30 13:44:07 +00:00
if ( prev ! = 0 ) {
Token error = make_error ( " Invalid UTF-16 sequence in string, unpaired lead surrogate " ) ;
error . start_column = prev_pos ;
error . leftmost_column = error . start_column ;
push_error ( error ) ;
prev = 0 ;
}
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
// Make the literal.
Variant string ;
switch ( type ) {
case STRING_NAME :
string = StringName ( result ) ;
break ;
case STRING_NODEPATH :
string = NodePath ( result ) ;
break ;
case STRING_REGULAR :
string = result ;
break ;
}
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
return make_literal ( string ) ;
2014-02-25 12:31:47 +00:00
}
2020-05-01 22:14:56 +00:00
void GDScriptTokenizer : : check_indent ( ) {
ERR_FAIL_COND_MSG ( column ! = 1 , " Checking tokenizer indentation in the middle of a line. " ) ;
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
if ( _is_at_end ( ) ) {
// Send dedents for every indent level.
pending_indents - = indent_level ( ) ;
indent_stack . clear ( ) ;
return ;
}
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
for ( ; ; ) {
2020-07-27 10:43:20 +00:00
char32_t current_indent_char = _peek ( ) ;
2020-05-01 22:14:56 +00:00
int indent_count = 0 ;
2014-02-25 12:31:47 +00:00
2020-06-01 19:40:29 +00:00
if ( current_indent_char ! = ' ' & & current_indent_char ! = ' \t ' & & current_indent_char ! = ' \r ' & & current_indent_char ! = ' \n ' & & current_indent_char ! = ' # ' ) {
2020-05-01 22:14:56 +00:00
// First character of the line is not whitespace, so we clear all indentation levels.
// Unless we are in a continuation or in multiline mode (inside expression).
if ( line_continuation | | multiline_mode ) {
return ;
}
pending_indents - = indent_level ( ) ;
indent_stack . clear ( ) ;
return ;
2014-02-25 12:31:47 +00:00
}
2020-05-01 22:14:56 +00:00
if ( _peek ( ) = = ' \r ' ) {
_advance ( ) ;
if ( _peek ( ) ! = ' \n ' ) {
push_error ( " Stray carriage return character in source code. " ) ;
2019-04-09 15:08:36 +00:00
}
2020-05-14 14:41:43 +00:00
}
2020-05-01 22:14:56 +00:00
if ( _peek ( ) = = ' \n ' ) {
// Empty line, keep going.
_advance ( ) ;
newline ( false ) ;
continue ;
}
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
// Check indent level.
bool mixed = false ;
while ( ! _is_at_end ( ) ) {
2020-07-27 10:43:20 +00:00
char32_t space = _peek ( ) ;
2020-05-01 22:14:56 +00:00
if ( space = = ' \t ' ) {
// Consider individual tab columns.
column + = tab_size - 1 ;
indent_count + = tab_size ;
} else if ( space = = ' ' ) {
indent_count + = 1 ;
} else {
break ;
}
mixed = mixed | | space ! = current_indent_char ;
_advance ( ) ;
}
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
if ( _is_at_end ( ) ) {
// Reached the end with an empty line, so just dedent as much as needed.
pending_indents - = indent_level ( ) ;
indent_stack . clear ( ) ;
return ;
2020-05-14 14:41:43 +00:00
}
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
if ( _peek ( ) = = ' \r ' ) {
_advance ( ) ;
if ( _peek ( ) ! = ' \n ' ) {
push_error ( " Stray carriage return character in source code. " ) ;
}
2014-02-25 12:31:47 +00:00
}
2020-05-01 22:14:56 +00:00
if ( _peek ( ) = = ' \n ' ) {
// Empty line, keep going.
_advance ( ) ;
newline ( false ) ;
continue ;
2014-02-25 12:31:47 +00:00
}
2020-05-01 22:14:56 +00:00
if ( _peek ( ) = = ' # ' ) {
// Comment. Advance to the next line.
2020-11-29 02:37:57 +00:00
# ifdef TOOLS_ENABLED
String comment ;
while ( _peek ( ) ! = ' \n ' & & ! _is_at_end ( ) ) {
comment + = _advance ( ) ;
}
comments [ line ] = CommentData ( comment , true ) ;
# else
2020-05-01 22:14:56 +00:00
while ( _peek ( ) ! = ' \n ' & & ! _is_at_end ( ) ) {
_advance ( ) ;
}
2020-11-29 02:37:57 +00:00
# endif // TOOLS_ENABLED
2020-05-01 22:14:56 +00:00
if ( _is_at_end ( ) ) {
// Reached the end with an empty line, so just dedent as much as needed.
pending_indents - = indent_level ( ) ;
indent_stack . clear ( ) ;
return ;
}
_advance ( ) ; // Consume '\n'.
newline ( false ) ;
continue ;
2014-02-25 12:31:47 +00:00
}
2023-08-07 09:23:02 +00:00
if ( mixed & & ! line_continuation & & ! multiline_mode ) {
Token error = make_error ( " Mixed use of tabs and spaces for indentation. " ) ;
error . start_line = line ;
error . start_column = 1 ;
error . leftmost_column = 1 ;
error . rightmost_column = column ;
push_error ( error ) ;
}
2020-05-01 22:14:56 +00:00
if ( line_continuation | | multiline_mode ) {
// We cleared up all the whitespace at the beginning of the line.
2023-09-18 20:55:15 +00:00
// If this is a line continuation or we're in multiline mode then we don't want any indentation changes.
2020-05-01 22:14:56 +00:00
return ;
}
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
// Check if indentation character is consistent.
if ( indent_char = = ' \0 ' ) {
// First time indenting, choose character now.
indent_char = current_indent_char ;
} else if ( current_indent_char ! = indent_char ) {
2021-10-28 10:18:24 +00:00
Token error = make_error ( vformat ( " Used %s character for indentation instead of %s as used before in the file. " ,
_get_indent_char_name ( current_indent_char ) , _get_indent_char_name ( indent_char ) ) ) ;
2020-05-01 22:14:56 +00:00
error . start_line = line ;
error . start_column = 1 ;
error . leftmost_column = 1 ;
error . rightmost_column = column ;
push_error ( error ) ;
2020-05-14 14:41:43 +00:00
}
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
// Now we can do actual indentation changes.
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
// Check if indent or dedent.
int previous_indent = 0 ;
if ( indent_level ( ) > 0 ) {
previous_indent = indent_stack . back ( ) - > get ( ) ;
}
if ( indent_count = = previous_indent ) {
// No change in indentation.
return ;
}
if ( indent_count > previous_indent ) {
// Indentation increased.
indent_stack . push_back ( indent_count ) ;
pending_indents + + ;
2014-02-25 12:31:47 +00:00
} else {
2020-05-01 22:14:56 +00:00
// Indentation decreased (dedent).
if ( indent_level ( ) = = 0 ) {
push_error ( " Tokenizer bug: trying to dedent without previous indent. " ) ;
return ;
}
while ( indent_level ( ) > 0 & & indent_stack . back ( ) - > get ( ) > indent_count ) {
indent_stack . pop_back ( ) ;
pending_indents - - ;
}
if ( ( indent_level ( ) > 0 & & indent_stack . back ( ) - > get ( ) ! = indent_count ) | | ( indent_level ( ) = = 0 & & indent_count ! = 0 ) ) {
// Mismatched indentation alignment.
Token error = make_error ( " Unindent doesn't match the previous indentation level. " ) ;
error . start_line = line ;
error . start_column = 1 ;
error . leftmost_column = 1 ;
error . end_column = column + 1 ;
error . rightmost_column = column + 1 ;
push_error ( error ) ;
// Still, we'll be lenient and keep going, so keep this level in the stack.
indent_stack . push_back ( indent_count ) ;
}
2014-02-25 12:31:47 +00:00
}
2020-05-01 22:14:56 +00:00
break ; // Get out of the loop in any case.
2014-02-25 12:31:47 +00:00
}
}
2021-10-28 10:18:24 +00:00
// Returns a human-readable name for an indentation character: "space" for ' '
// and "tab" for '\t'. Any other character is treated as an error condition, in
// which case the C-escaped form of the character is returned instead.
String GDScriptTokenizer::_get_indent_char_name(char32_t ch) {
	ERR_FAIL_COND_V(ch != ' ' && ch != '\t', String(&ch, 1).c_escape());

	return ch == ' ' ? "space" : "tab";
}
2020-05-01 22:14:56 +00:00
void GDScriptTokenizer : : _skip_whitespace ( ) {
if ( pending_indents ! = 0 ) {
// Still have some indent/dedent tokens to give.
return ;
2020-05-14 14:41:43 +00:00
}
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
bool is_bol = column = = 1 ; // Beginning of line.
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
if ( is_bol ) {
check_indent ( ) ;
return ;
}
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
for ( ; ; ) {
2020-07-27 10:43:20 +00:00
char32_t c = _peek ( ) ;
2020-05-01 22:14:56 +00:00
switch ( c ) {
case ' ' :
_advance ( ) ;
break ;
case ' \t ' :
_advance ( ) ;
// Consider individual tab columns.
column + = tab_size - 1 ;
break ;
case ' \r ' :
_advance ( ) ; // Consume either way.
if ( _peek ( ) ! = ' \n ' ) {
push_error ( " Stray carriage return character in source code. " ) ;
return ;
}
break ;
case ' \n ' :
_advance ( ) ;
newline ( ! is_bol ) ; // Don't create new line token if line is empty.
check_indent ( ) ;
break ;
2020-11-29 02:37:57 +00:00
case ' # ' : {
2020-05-01 22:14:56 +00:00
// Comment.
2020-11-29 02:37:57 +00:00
# ifdef TOOLS_ENABLED
String comment ;
while ( _peek ( ) ! = ' \n ' & & ! _is_at_end ( ) ) {
comment + = _advance ( ) ;
}
comments [ line ] = CommentData ( comment , is_bol ) ;
# else
2020-05-01 22:14:56 +00:00
while ( _peek ( ) ! = ' \n ' & & ! _is_at_end ( ) ) {
_advance ( ) ;
}
2020-11-29 02:37:57 +00:00
# endif // TOOLS_ENABLED
2020-05-01 22:14:56 +00:00
if ( _is_at_end ( ) ) {
return ;
}
_advance ( ) ; // Consume '\n'
newline ( ! is_bol ) ;
check_indent ( ) ;
2020-11-29 02:37:57 +00:00
} break ;
2020-05-01 22:14:56 +00:00
default :
return ;
}
}
2014-02-25 12:31:47 +00:00
}
2020-05-01 22:14:56 +00:00
GDScriptTokenizer : : Token GDScriptTokenizer : : scan ( ) {
if ( has_error ( ) ) {
return pop_error ( ) ;
}
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
_skip_whitespace ( ) ;
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
if ( pending_newline ) {
pending_newline = false ;
if ( ! multiline_mode ) {
2021-03-12 13:35:16 +00:00
// Don't return newline tokens on multiline mode.
2020-05-01 22:14:56 +00:00
return last_newline ;
}
}
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
// Check for potential errors after skipping whitespace().
if ( has_error ( ) ) {
return pop_error ( ) ;
}
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
_start = _current ;
start_line = line ;
start_column = column ;
leftmost_column = column ;
rightmost_column = column ;
if ( pending_indents ! = 0 ) {
// Adjust position for indent.
_start - = start_column - 1 ;
start_column = 1 ;
leftmost_column = 1 ;
if ( pending_indents > 0 ) {
// Indents.
pending_indents - - ;
return make_token ( Token : : INDENT ) ;
} else {
// Dedents.
pending_indents + + ;
Token dedent = make_token ( Token : : DEDENT ) ;
dedent . end_column + = 1 ;
dedent . rightmost_column + = 1 ;
return dedent ;
}
2020-05-14 14:41:43 +00:00
}
2020-05-01 22:14:56 +00:00
if ( _is_at_end ( ) ) {
return make_token ( Token : : TK_EOF ) ;
2020-05-14 14:41:43 +00:00
}
2014-02-25 12:31:47 +00:00
2020-07-27 10:43:20 +00:00
const char32_t c = _advance ( ) ;
2020-05-14 12:29:06 +00:00
2020-05-01 22:14:56 +00:00
if ( c = = ' \\ ' ) {
// Line continuation with backslash.
if ( _peek ( ) = = ' \r ' ) {
if ( _peek ( 1 ) ! = ' \n ' ) {
return make_error ( " Unexpected carriage return character. " ) ;
}
_advance ( ) ;
}
if ( _peek ( ) ! = ' \n ' ) {
return make_error ( " Expected new line after \" \\ \" . " ) ;
}
_advance ( ) ;
newline ( false ) ;
line_continuation = true ;
return scan ( ) ; // Recurse to get next token.
2020-05-14 14:41:43 +00:00
}
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
line_continuation = false ;
2020-05-14 12:29:06 +00:00
2022-02-04 08:32:20 +00:00
if ( is_digit ( c ) ) {
2020-05-01 22:14:56 +00:00
return number ( ) ;
2023-08-28 10:00:33 +00:00
} else if ( c = = ' r ' & & ( _peek ( ) = = ' " ' | | _peek ( ) = = ' \' ' ) ) {
// Raw string literals.
return string ( ) ;
2023-01-19 01:56:00 +00:00
} else if ( is_unicode_identifier_start ( c ) ) {
2020-05-01 22:14:56 +00:00
return potential_identifier ( ) ;
}
2020-05-14 12:29:06 +00:00
2020-05-01 22:14:56 +00:00
switch ( c ) {
// String literals.
case ' " ' :
case ' \' ' :
return string ( ) ;
// Annotation.
case ' @ ' :
return annotation ( ) ;
// Single characters.
case ' ~ ' :
return make_token ( Token : : TILDE ) ;
case ' , ' :
return make_token ( Token : : COMMA ) ;
case ' : ' :
return make_token ( Token : : COLON ) ;
case ' ; ' :
return make_token ( Token : : SEMICOLON ) ;
case ' $ ' :
return make_token ( Token : : DOLLAR ) ;
case ' ? ' :
return make_token ( Token : : QUESTION_MARK ) ;
case ' ` ' :
return make_token ( Token : : BACKTICK ) ;
// Parens.
case ' ( ' :
push_paren ( ' ( ' ) ;
return make_token ( Token : : PARENTHESIS_OPEN ) ;
case ' [ ' :
push_paren ( ' [ ' ) ;
return make_token ( Token : : BRACKET_OPEN ) ;
case ' { ' :
push_paren ( ' { ' ) ;
return make_token ( Token : : BRACE_OPEN ) ;
case ' ) ' :
if ( ! pop_paren ( ' ( ' ) ) {
return make_paren_error ( c ) ;
}
return make_token ( Token : : PARENTHESIS_CLOSE ) ;
case ' ] ' :
if ( ! pop_paren ( ' [ ' ) ) {
return make_paren_error ( c ) ;
}
return make_token ( Token : : BRACKET_CLOSE ) ;
case ' } ' :
if ( ! pop_paren ( ' { ' ) ) {
return make_paren_error ( c ) ;
}
return make_token ( Token : : BRACE_CLOSE ) ;
// Double characters.
case ' ! ' :
if ( _peek ( ) = = ' = ' ) {
_advance ( ) ;
return make_token ( Token : : BANG_EQUAL ) ;
} else {
return make_token ( Token : : BANG ) ;
}
case ' . ' :
if ( _peek ( ) = = ' . ' ) {
_advance ( ) ;
return make_token ( Token : : PERIOD_PERIOD ) ;
2022-02-04 08:32:20 +00:00
} else if ( is_digit ( _peek ( ) ) ) {
2020-05-01 22:14:56 +00:00
// Number starting with '.'.
return number ( ) ;
} else {
return make_token ( Token : : PERIOD ) ;
}
case ' + ' :
if ( _peek ( ) = = ' = ' ) {
_advance ( ) ;
return make_token ( Token : : PLUS_EQUAL ) ;
2023-02-15 14:41:46 +00:00
} else if ( is_digit ( _peek ( ) ) & & ! last_token . can_precede_bin_op ( ) ) {
// Number starting with '+'.
return number ( ) ;
2020-05-01 22:14:56 +00:00
} else {
return make_token ( Token : : PLUS ) ;
}
case ' - ' :
if ( _peek ( ) = = ' = ' ) {
_advance ( ) ;
return make_token ( Token : : MINUS_EQUAL ) ;
2023-02-15 14:41:46 +00:00
} else if ( is_digit ( _peek ( ) ) & & ! last_token . can_precede_bin_op ( ) ) {
// Number starting with '-'.
return number ( ) ;
2020-05-01 22:14:56 +00:00
} else if ( _peek ( ) = = ' > ' ) {
_advance ( ) ;
return make_token ( Token : : FORWARD_ARROW ) ;
} else {
return make_token ( Token : : MINUS ) ;
}
case ' * ' :
if ( _peek ( ) = = ' = ' ) {
_advance ( ) ;
return make_token ( Token : : STAR_EQUAL ) ;
2022-03-07 17:25:21 +00:00
} else if ( _peek ( ) = = ' * ' ) {
if ( _peek ( 1 ) = = ' = ' ) {
_advance ( ) ;
_advance ( ) ; // Advance both '*' and '='
return make_token ( Token : : STAR_STAR_EQUAL ) ;
}
_advance ( ) ;
return make_token ( Token : : STAR_STAR ) ;
2020-05-01 22:14:56 +00:00
} else {
return make_token ( Token : : STAR ) ;
}
case ' / ' :
if ( _peek ( ) = = ' = ' ) {
_advance ( ) ;
return make_token ( Token : : SLASH_EQUAL ) ;
} else {
return make_token ( Token : : SLASH ) ;
}
case ' % ' :
if ( _peek ( ) = = ' = ' ) {
_advance ( ) ;
return make_token ( Token : : PERCENT_EQUAL ) ;
} else {
return make_token ( Token : : PERCENT ) ;
}
case ' ^ ' :
if ( _peek ( ) = = ' = ' ) {
_advance ( ) ;
return make_token ( Token : : CARET_EQUAL ) ;
} else if ( _peek ( ) = = ' " ' | | _peek ( ) = = ' \' ' ) {
// Node path
return string ( ) ;
} else {
return make_token ( Token : : CARET ) ;
}
case ' & ' :
if ( _peek ( ) = = ' & ' ) {
_advance ( ) ;
return make_token ( Token : : AMPERSAND_AMPERSAND ) ;
} else if ( _peek ( ) = = ' = ' ) {
_advance ( ) ;
return make_token ( Token : : AMPERSAND_EQUAL ) ;
} else if ( _peek ( ) = = ' " ' | | _peek ( ) = = ' \' ' ) {
// String Name
return string ( ) ;
} else {
return make_token ( Token : : AMPERSAND ) ;
}
case ' | ' :
if ( _peek ( ) = = ' | ' ) {
_advance ( ) ;
return make_token ( Token : : PIPE_PIPE ) ;
} else if ( _peek ( ) = = ' = ' ) {
_advance ( ) ;
return make_token ( Token : : PIPE_EQUAL ) ;
} else {
return make_token ( Token : : PIPE ) ;
}
2020-05-14 12:29:06 +00:00
2020-05-01 22:14:56 +00:00
// Potential VCS conflict markers.
case ' = ' :
if ( _peek ( ) = = ' = ' ) {
return check_vcs_marker ( ' = ' , Token : : EQUAL_EQUAL ) ;
} else {
return make_token ( Token : : EQUAL ) ;
}
case ' < ' :
if ( _peek ( ) = = ' = ' ) {
_advance ( ) ;
return make_token ( Token : : LESS_EQUAL ) ;
} else if ( _peek ( ) = = ' < ' ) {
if ( _peek ( 1 ) = = ' = ' ) {
_advance ( ) ;
_advance ( ) ; // Advance both '<' and '='
return make_token ( Token : : LESS_LESS_EQUAL ) ;
} else {
return check_vcs_marker ( ' < ' , Token : : LESS_LESS ) ;
}
} else {
return make_token ( Token : : LESS ) ;
}
case ' > ' :
if ( _peek ( ) = = ' = ' ) {
_advance ( ) ;
return make_token ( Token : : GREATER_EQUAL ) ;
} else if ( _peek ( ) = = ' > ' ) {
if ( _peek ( 1 ) = = ' = ' ) {
_advance ( ) ;
_advance ( ) ; // Advance both '>' and '='
return make_token ( Token : : GREATER_GREATER_EQUAL ) ;
} else {
return check_vcs_marker ( ' > ' , Token : : GREATER_GREATER ) ;
}
} else {
return make_token ( Token : : GREATER ) ;
}
2014-02-25 12:31:47 +00:00
2020-05-01 22:14:56 +00:00
default :
2023-01-19 01:56:00 +00:00
if ( is_whitespace ( c ) ) {
2023-02-07 07:34:16 +00:00
return make_error ( vformat ( R " (Invalid white space character U+%04X.) " , static_cast<int32_t>(c))) ;
2023-01-19 01:56:00 +00:00
} else {
2023-02-07 07:34:16 +00:00
return make_error ( vformat ( R " (Invalid character " % c " (U+%04X) . ) " , c, static_cast<int32_t>(c))) ;
2023-01-19 01:56:00 +00:00
}
2020-05-01 22:14:56 +00:00
}
2014-02-25 12:31:47 +00:00
}
2020-05-14 12:29:06 +00:00
2020-05-01 22:14:56 +00:00
// Constructs the tokenizer.
//
// In tools builds, when an EditorSettings singleton exists, `tab_size` is taken
// from the editor's indent size setting. In debug builds make_keyword_list()
// is also called.
GDScriptTokenizer::GDScriptTokenizer() {
#ifdef TOOLS_ENABLED
	if (EditorSettings::get_singleton()) {
		tab_size = EditorSettings::get_singleton()->get_setting("text_editor/behavior/indent/size");
	}
#endif // TOOLS_ENABLED
#ifdef DEBUG_ENABLED
	make_keyword_list();
#endif // DEBUG_ENABLED
}