From 439d43932133d32dcabd482f11842072d52b41e1 Mon Sep 17 00:00:00 2001 From: Zher Huei Lee Date: Sun, 23 Oct 2016 01:22:48 +0100 Subject: [PATCH] RegEx re-implemented as a module Re-wrote nrex as a module using godot-specific parts and new features: * Added string substitutions. * Named groups are now supported. * Removed use of mutable variables in RegEx. RegExMatch is returned instead. --- bin/tests/test_string.cpp | 15 +- doc/base/classes.xml | 126 +- drivers/SCsub | 1 - drivers/nrex/README.md | 75 - drivers/nrex/nrex.cpp | 1496 ----------------- drivers/nrex/nrex.hpp | 176 -- drivers/nrex/nrex_config.h | 12 - drivers/nrex/regex.cpp | 142 -- drivers/register_driver_types.cpp | 4 - {drivers/nrex => modules/regex}/SCsub | 2 +- modules/regex/config.py | 8 + modules/regex/regex.cpp | 1465 ++++++++++++++++ modules/regex/regex.h | 114 ++ .../regex/register_types.cpp | 42 +- modules/regex/register_types.h | 31 + 15 files changed, 1733 insertions(+), 1976 deletions(-) delete mode 100644 drivers/nrex/README.md delete mode 100644 drivers/nrex/nrex.cpp delete mode 100644 drivers/nrex/nrex.hpp delete mode 100644 drivers/nrex/nrex_config.h delete mode 100644 drivers/nrex/regex.cpp rename {drivers/nrex => modules/regex}/SCsub (50%) create mode 100644 modules/regex/config.py create mode 100644 modules/regex/regex.cpp create mode 100644 modules/regex/regex.h rename drivers/nrex/regex.h => modules/regex/register_types.cpp (72%) create mode 100644 modules/regex/register_types.h diff --git a/bin/tests/test_string.cpp b/bin/tests/test_string.cpp index 2e8f5c34944..4990c58896c 100644 --- a/bin/tests/test_string.cpp +++ b/bin/tests/test_string.cpp @@ -31,7 +31,6 @@ //#include "math_funcs.h" #include #include "os/os.h" -#include "drivers/nrex/regex.h" #include "core/io/ip_address.h" #include "test_string.h" @@ -462,18 +461,8 @@ bool test_25() { bool test_26() { - OS::get_singleton()->print("\n\nTest 26: RegEx\n"); - RegEx regexp("(.*):(.*)"); - - int res = 
regexp.find("name:password"); - printf("\tmatch: %s\n", (res>=0)?"true":"false"); - - printf("\t%i captures:\n", regexp.get_capture_count()); - for (int i = 0; i=0); + //TODO: Do replacement RegEx test + return true; }; struct test_27_data { diff --git a/doc/base/classes.xml b/doc/base/classes.xml index 5eb021f6c05..cafb1449158 100644 --- a/doc/base/classes.xml +++ b/doc/base/classes.xml @@ -32514,6 +32514,7 @@ would be read as [code]"(?:\\.|[^"])*"[/code] Currently supported features: * Capturing [code]()[/code] and non-capturing [code](?:)[/code] groups + * Named capturing groups [code](?P<name>)[/code] * Any character [code].[/code] * Shorthand character classes [code]\w \W \s \S \d \D[/code] * User-defined character classes such as [code][A-Za-z][/code] @@ -32522,7 +32523,7 @@ * Lazy (non-greedy) quantifiers [code]*?[/code] * Beginning [code]^[/code] and end [code]$[/code] anchors * Alternation [code]|[/code] - * Backreferences [code]\1[/code] and [code]\g{1}[/code] + * Backreferences [code]\1[/code], [code]\g{1}[/code], and [code]\g<name>[/code] * POSIX character classes [code][[:alnum:]][/code] * Lookahead [code](?=)[/code], [code](?!)[/code] and lookbehind [code](?<=)[/code], [code](?<!)[/code] * ASCII [code]\xFF[/code] and Unicode [code]\uFFFF[/code] code points (in a style similar to Python) @@ -32531,7 +32532,7 @@ - This method resets the state of the object, as it was freshly created. Namely, it unassigns the regular expression of this object, and forgets all captures made by the last [method find]. + This method resets the state of the object, as if it were freshly created. Namely, it unassigns the regular expression of this object. @@ -32539,15 +32540,41 @@ - - - Compiles and assign the regular expression pattern to use. The limit on the number of capturing groups can be specified or made unlimited if negative. + Compiles and assigns the regular expression pattern to use. - + + + Returns the number of numeric capturing groups.
+ + + + + + + Returns an array of names of named capturing groups. + + + + + + + Returns the expression used to compile the code. + + + + + + + Returns whether this object has a valid regular expression assigned. + + + + + @@ -32555,45 +32582,96 @@ - This method tries to find the pattern within the string, and returns the position where it was found. It also stores any capturing group (see [method get_capture]) for further retrieval. + Searches the text for the compiled pattern. Returns a [RegExMatch] container of the first matching result if found, otherwise null. The starting point of the search could be specified without moving the string start anchor. - + - + + + + + + + - Returns a captured group. A captured group is the part of a string that matches a part of the pattern delimited by parentheses (unless they are non-capturing parentheses [i](?:)[/i]). + Searches the specified text for the compiled pattern and returns the text with the result replaced. Escapes and backreferences such as [code]\1[/code] and [code]\g<name>[/code] are automatically expanded and resolved. If no change was found the unmodified text is returned instead. - + + + + + + + + + + + + + + + + + Using results from the search, returns the specified string with escapes and backreferences such as [code]\1[/code] and [code]\g<name>[/code] expanded and resolved. + + + + + + + + + Returns the end position of the match in the string. An integer can be specified for numeric groups or a string for named groups. Returns -1 if that group wasn't found or doesn't exist. Defaults to 0 (whole pattern). + + + + + + + Returns an array of the results of the numeric groups. + + + - Returns the number of capturing groups. A captured group is the part of a string that matches a part of the pattern delimited by parentheses (unless they are non-capturing parentheses [i](?:)[/i]). + Returns the number of numeric capturing groups.
+ - + + + + + Returns a dictionary containing the named capturing groups and their results. + + + + + + + Returns an array of names of named capturing groups. + + + - + + Returns the starting position of the match in the string. An integer can be specified for numeric groups or a string for named groups. Returns -1 if that group wasn't found or doesn't exist. Defaults to 0 (whole pattern). - - + + + + - Return a list of all the captures made by the regular expression. - - - - - - - Returns whether this object has a valid regular expression assigned. + Returns the result of the match in the string. An integer can be specified for numeric groups or a string for named groups. Returns -1 if that group wasn't found or doesn't exist. Defaults to 0 (whole pattern). diff --git a/drivers/SCsub b/drivers/SCsub index 1f1509efa87..ab2c991f24a 100644 --- a/drivers/SCsub +++ b/drivers/SCsub @@ -25,7 +25,6 @@ SConscript('gl_context/SCsub'); # Core dependencies SConscript("png/SCsub"); -SConscript("nrex/SCsub"); # Tools override # FIXME: Should likely be integrated in the tools/ codebase diff --git a/drivers/nrex/README.md b/drivers/nrex/README.md deleted file mode 100644 index 7a942b24520..00000000000 --- a/drivers/nrex/README.md +++ /dev/null @@ -1,75 +0,0 @@ -# NREX: Node RegEx - -[![Build Status](https://travis-ci.org/leezh/nrex.svg?branch=master)](https://travis-ci.org/leezh/nrex) - -** Version 0.2 ** - -Small node-based regular expression library. It only does text pattern -matchhing, not replacement.
To use add the files `nrex.hpp`, `nrex.cpp` -and `nrex_config.h` to your project and follow the example: - - nrex regex; - regex.compile("^(fo+)bar$"); - - nrex_result captures[regex.capture_size()]; - if (regex.match("foobar", captures)) - { - std::cout << captures[0].start << std::endl; - std::cout << captures[0].length << std::endl; - } - -More details about its use is documented in `nrex.hpp` - -Currently supported features: - * Capturing `()` and non-capturing `(?:)` groups - * Any character `.` (includes newlines) - * Shorthand caracter classes `\w\W\s\S\d\D` - * POSIX character classes such as `[[:alnum:]]` - * Bracket expressions such as `[A-Za-z]` - * Simple quantifiers `?`, `*` and `+` - * Range quantifiers `{0,1}` - * Lazy (non-greedy) quantifiers `*?` - * Begining `^` and end `$` anchors - * Word boundaries `\b` - * Alternation `|` - * ASCII `\xFF` code points - * Unicode `\uFFFF` code points - * Positive `(?=)` and negative `(?!)` lookahead - * Positive `(?<=)` and negative `(? -#include -#define NREX_ISALPHANUM iswalnum -#define NREX_ISSPACE iswspace -#define NREX_STRLEN wcslen -#else -#include -#include -#define NREX_ISALPHANUM isalnum -#define NREX_ISSPACE isspace -#define NREX_STRLEN strlen -#endif - -#ifdef NREX_THROW_ERROR -#define NREX_COMPILE_ERROR(M) throw nrex_compile_error(M) -#else -#define NREX_COMPILE_ERROR(M) reset(); return false -#endif - -#ifndef NREX_NEW -#define NREX_NEW(X) new X -#define NREX_NEW_ARRAY(X, N) new X[N] -#define NREX_DELETE(X) delete X -#define NREX_DELETE_ARRAY(X) delete[] X -#endif - -template -class nrex_array -{ - private: - T* _data; - unsigned int _reserved; - unsigned int _size; - public: - nrex_array() - : _data(NREX_NEW_ARRAY(T, 2)) - , _reserved(2) - , _size(0) - { - } - - nrex_array(unsigned int reserved) - : _data(NREX_NEW_ARRAY(T, reserved ? reserved : 1)) - , _reserved(reserved ? 
reserved : 1) - , _size(0) - { - } - - ~nrex_array() - { - NREX_DELETE_ARRAY(_data); - } - - unsigned int size() const - { - return _size; - } - - void reserve(unsigned int size) - { - if (size < _size) { - size = _size; - } - if (size == 0) { - size = 1; - } - T* old = _data; - _data = NREX_NEW_ARRAY(T, size); - _reserved = size; - for (unsigned int i = 0; i < _size; ++i) - { - _data[i] = old[i]; - } - NREX_DELETE_ARRAY(old); - } - - void push(T item) - { - if (_size == _reserved) - { - reserve(_reserved * 2); - } - _data[_size] = item; - _size++; - } - - const T& top() const - { - return _data[_size - 1]; - } - - const T& operator[] (unsigned int i) const - { - return _data[i]; - } - - void pop() - { - if (_size > 0) - { - --_size; - } - } -}; - -static int nrex_parse_hex(nrex_char c) -{ - if ('0' <= c && c <= '9') - { - return int(c - '0'); - } - else if ('a' <= c && c <= 'f') - { - return int(c - 'a') + 10; - } - else if ('A' <= c && c <= 'F') - { - return int(c - 'A') + 10; - } - return -1; -} - -static nrex_char nrex_unescape(const nrex_char*& c) -{ - switch (c[1]) - { - case '0': ++c; return '\0'; - case 'a': ++c; return '\a'; - case 'e': ++c; return '\e'; - case 'f': ++c; return '\f'; - case 'n': ++c; return '\n'; - case 'r': ++c; return '\r'; - case 't': ++c; return '\t'; - case 'v': ++c; return '\v'; - case 'b': ++c; return '\b'; - case 'x': - { - int point = 0; - for (int i = 2; i <= 3; ++i) - { - int res = nrex_parse_hex(c[i]); - if (res == -1) - { - return '\0'; - } - point = (point << 4) + res; - } - c = &c[3]; - return nrex_char(point); - } - case 'u': - { - int point = 0; - for (int i = 2; i <= 5; ++i) - { - int res = nrex_parse_hex(c[i]); - if (res == -1) - { - return '\0'; - } - point = (point << 4) + res; - } - c = &c[5]; - return nrex_char(point); - } - } - return (++c)[0]; -} - -struct nrex_search -{ - const nrex_char* str; - nrex_result* captures; - int end; - bool complete; - nrex_array lookahead_pos; - - nrex_char at(int pos) - { - return 
str[pos]; - } - - nrex_search(const nrex_char* str, nrex_result* captures, int lookahead) - : str(str) - , captures(captures) - , end(0) - , lookahead_pos(lookahead) - { - } -}; - -struct nrex_node -{ - nrex_node* next; - nrex_node* previous; - nrex_node* parent; - bool quantifiable; - int length; - - nrex_node(bool quantify = false) - : next(NULL) - , previous(NULL) - , parent(NULL) - , quantifiable(quantify) - , length(-1) - { - } - - virtual ~nrex_node() - { - if (next) - { - NREX_DELETE(next); - } - } - - virtual int test(nrex_search* s, int pos) const - { - return next ? next->test(s, pos) : -1; - } - - virtual int test_parent(nrex_search* s, int pos) const - { - if (next) - { - pos = next->test(s, pos); - } - if (pos >= 0) - { - s->complete = true; - } - if (parent && pos >= 0) - { - pos = parent->test_parent(s, pos); - } - if (pos < 0) - { - s->complete = false; - } - return pos; - } - - void increment_length(int amount, bool subtract = false) - { - if (amount >= 0 && length >= 0) - { - if (!subtract) - { - length += amount; - } - else - { - length -= amount; - } - } - else - { - length = -1; - } - if (parent) - { - parent->increment_length(amount, subtract); - } - } -}; - -enum nrex_group_type -{ - nrex_group_capture, - nrex_group_non_capture, - nrex_group_bracket, - nrex_group_look_ahead, - nrex_group_look_behind, -}; - -struct nrex_node_group : public nrex_node -{ - nrex_group_type type; - int id; - bool negate; - nrex_array childset; - nrex_node* back; - - nrex_node_group(nrex_group_type type, int id = 0) - : nrex_node(true) - , type(type) - , id(id) - , negate(false) - , back(NULL) - { - if (type != nrex_group_bracket) - { - length = 0; - } - else - { - length = 1; - } - if (type == nrex_group_look_ahead || type == nrex_group_look_behind) - { - quantifiable = false; - } - } - - virtual ~nrex_node_group() - { - for (unsigned int i = 0; i < childset.size(); ++i) - { - NREX_DELETE(childset[i]); - } - - } - - int test(nrex_search* s, int pos) const - { - 
int old_start; - if (type == nrex_group_capture) - { - old_start = s->captures[id].start; - s->captures[id].start = pos; - } - for (unsigned int i = 0; i < childset.size(); ++i) - { - s->complete = false; - int offset = 0; - if (type == nrex_group_look_behind) - { - if (pos < length) - { - return -1; - } - offset = length; - } - if (type == nrex_group_look_ahead) - { - s->lookahead_pos.push(pos); - } - int res = childset[i]->test(s, pos - offset); - if (type == nrex_group_look_ahead) - { - s->lookahead_pos.pop(); - } - if (s->complete) - { - return res; - } - if (negate) - { - if (res < 0) - { - res = pos + 1; - } - else - { - return -1; - } - if (i + 1 < childset.size()) - { - continue; - } - } - if (res >= 0) - { - if (type == nrex_group_capture) - { - s->captures[id].length = res - pos; - } - else if (type == nrex_group_look_ahead || type == nrex_group_look_behind) - { - res = pos; - } - return next ? next->test(s, res) : res; - } - } - if (type == nrex_group_capture) - { - s->captures[id].start = old_start; - } - return -1; - } - - virtual int test_parent(nrex_search* s, int pos) const - { - if (type == nrex_group_capture) - { - s->captures[id].length = pos - s->captures[id].start; - } - if (type == nrex_group_look_ahead) - { - pos = s->lookahead_pos[id]; - } - return nrex_node::test_parent(s, pos); - } - - void add_childset() - { - if (childset.size() > 0 && type != nrex_group_bracket) - { - length = -1; - } - back = NULL; - } - - void add_child(nrex_node* node) - { - node->parent = this; - node->previous = back; - if (back && type != nrex_group_bracket) - { - back->next = node; - } - else - { - childset.push(node); - } - if (type != nrex_group_bracket) - { - increment_length(node->length); - } - back = node; - } - - nrex_node* swap_back(nrex_node* node) - { - if (!back) - { - add_child(node); - return NULL; - } - nrex_node* old = back; - if (!old->previous) - { - childset.pop(); - } - if (type != nrex_group_bracket) - { - increment_length(old->length, true); 
- } - back = old->previous; - add_child(node); - return old; - } - - void pop_back() - { - if (back) - { - nrex_node* old = back; - if (!old->previous) - { - childset.pop(); - } - if (type != nrex_group_bracket) - { - increment_length(old->length, true); - } - back = old->previous; - NREX_DELETE(old); - } - } -}; - -struct nrex_node_char : public nrex_node -{ - nrex_char ch; - - nrex_node_char(nrex_char c) - : nrex_node(true) - , ch(c) - { - length = 1; - } - - int test(nrex_search* s, int pos) const - { - if (s->end <= pos || 0 > pos || s->at(pos) != ch) - { - return -1; - } - return next ? next->test(s, pos + 1) : pos + 1; - } -}; - -struct nrex_node_range : public nrex_node -{ - nrex_char start; - nrex_char end; - - nrex_node_range(nrex_char s, nrex_char e) - : nrex_node(true) - , start(s) - , end(e) - { - length = 1; - } - - int test(nrex_search* s, int pos) const - { - if (s->end <= pos || 0 > pos) - { - return -1; - } - nrex_char c = s->at(pos); - if (c < start || end < c) - { - return -1; - } - return next ? 
next->test(s, pos + 1) : pos + 1; - } -}; - -enum nrex_class_type -{ - nrex_class_none, - nrex_class_alnum, - nrex_class_alpha, - nrex_class_blank, - nrex_class_cntrl, - nrex_class_digit, - nrex_class_graph, - nrex_class_lower, - nrex_class_print, - nrex_class_punct, - nrex_class_space, - nrex_class_upper, - nrex_class_xdigit, - nrex_class_word -}; - -static bool nrex_compare_class(const nrex_char** pos, const char* text) -{ - unsigned int i = 0; - for (i = 0; text[i] != '\0'; ++i) - { - if ((*pos)[i] != text[i]) - { - return false; - } - } - if ((*pos)[i++] != ':' || (*pos)[i] != ']') - { - return false; - } - *pos = &(*pos)[i]; - return true; -} - -#define NREX_COMPARE_CLASS(POS, NAME) if (nrex_compare_class(POS, #NAME)) return nrex_class_ ## NAME - -static nrex_class_type nrex_parse_class(const nrex_char** pos) -{ - NREX_COMPARE_CLASS(pos, alnum); - NREX_COMPARE_CLASS(pos, alpha); - NREX_COMPARE_CLASS(pos, blank); - NREX_COMPARE_CLASS(pos, cntrl); - NREX_COMPARE_CLASS(pos, digit); - NREX_COMPARE_CLASS(pos, graph); - NREX_COMPARE_CLASS(pos, lower); - NREX_COMPARE_CLASS(pos, print); - NREX_COMPARE_CLASS(pos, punct); - NREX_COMPARE_CLASS(pos, space); - NREX_COMPARE_CLASS(pos, upper); - NREX_COMPARE_CLASS(pos, xdigit); - NREX_COMPARE_CLASS(pos, word); - return nrex_class_none; -} - -struct nrex_node_class : public nrex_node -{ - nrex_class_type type; - - nrex_node_class(nrex_class_type t) - : nrex_node(true) - , type(t) - { - length = 1; - } - - int test(nrex_search* s, int pos) const - { - if (s->end <= pos || 0 > pos) - { - return -1; - } - if (!test_class(s->at(pos))) - { - return -1; - } - return next ? 
next->test(s, pos + 1) : pos + 1; - } - - bool test_class(nrex_char c) const - { - if ((0 <= c && c <= 0x1F) || c == 0x7F) - { - if (type == nrex_class_cntrl) - { - return true; - } - } - else if (c < 0x7F) - { - if (type == nrex_class_print) - { - return true; - } - else if (type == nrex_class_graph && c != ' ') - { - return true; - } - else if ('0' <= c && c <= '9') - { - switch (type) - { - case nrex_class_alnum: - case nrex_class_digit: - case nrex_class_xdigit: - case nrex_class_word: - return true; - default: - break; - } - } - else if ('A' <= c && c <= 'Z') - { - switch (type) - { - case nrex_class_alnum: - case nrex_class_alpha: - case nrex_class_upper: - case nrex_class_word: - return true; - case nrex_class_xdigit: - if (c <= 'F') - { - return true; - } - default: - break; - } - } - else if ('a' <= c && c <= 'z') - { - switch (type) - { - case nrex_class_alnum: - case nrex_class_alpha: - case nrex_class_lower: - case nrex_class_word: - return true; - case nrex_class_xdigit: - if (c <= 'f') - { - return true; - } - default: - break; - } - } - } - switch (c) - { - case ' ': - case '\t': - if (type == nrex_class_blank) - { - return true; - } - case '\r': - case '\n': - case '\f': - if (type == nrex_class_space) - { - return true; - } - break; - case '_': - if (type == nrex_class_word) - { - return true; - } - case ']': - case '[': - case '!': - case '"': - case '#': - case '$': - case '%': - case '&': - case '\'': - case '(': - case ')': - case '*': - case '+': - case ',': - case '.': - case '/': - case ':': - case ';': - case '<': - case '=': - case '>': - case '?': - case '@': - case '\\': - case '^': - case '`': - case '{': - case '|': - case '}': - case '~': - case '-': - if (type == nrex_class_punct) - { - return true; - } - break; - default: - break; - } - return false; - } -}; - -static bool nrex_is_shorthand(nrex_char repr) -{ - switch (repr) - { - case 'W': - case 'w': - case 'D': - case 'd': - case 'S': - case 's': - return true; - } - return 
false; -} - -struct nrex_node_shorthand : public nrex_node -{ - nrex_char repr; - - nrex_node_shorthand(nrex_char c) - : nrex_node(true) - , repr(c) - { - length = 1; - } - - int test(nrex_search* s, int pos) const - { - if (s->end <= pos || 0 > pos) - { - return -1; - } - bool found = false; - bool invert = false; - nrex_char c = s->at(pos); - switch (repr) - { - case '.': - found = true; - break; - case 'W': - invert = true; - case 'w': - if (c == '_' || NREX_ISALPHANUM(c)) - { - found = true; - } - break; - case 'D': - invert = true; - case 'd': - if ('0' <= c && c <= '9') - { - found = true; - } - break; - case 'S': - invert = true; - case 's': - if (NREX_ISSPACE(c)) - { - found = true; - } - break; - } - if (found == invert) - { - return -1; - } - return next ? next->test(s, pos + 1) : pos + 1; - } -}; - -static bool nrex_is_quantifier(nrex_char repr) -{ - switch (repr) - { - case '?': - case '*': - case '+': - case '{': - return true; - } - return false; -} - -struct nrex_node_quantifier : public nrex_node -{ - int min; - int max; - bool greedy; - nrex_node* child; - - nrex_node_quantifier(int min, int max) - : nrex_node() - , min(min) - , max(max) - , greedy(true) - , child(NULL) - { - } - - virtual ~nrex_node_quantifier() - { - if (child) - { - NREX_DELETE(child); - } - } - - int test(nrex_search* s, int pos) const - { - return test_step(s, pos, 0, pos); - } - - int test_step(nrex_search* s, int pos, int level, int start) const - { - if (pos > s->end) - { - return -1; - } - if (!greedy && level > min) - { - int res = pos; - if (next) - { - res = next->test(s, res); - } - if (s->complete) - { - return res; - } - if (res >= 0 && parent->test_parent(s, res) >= 0) - { - return res; - } - } - if (max >= 0 && level > max) - { - return -1; - } - if (level > 1 && level > min + 1 && pos == start) - { - return -1; - } - int res = pos; - if (level >= 1) - { - res = child->test(s, pos); - if (s->complete) - { - return res; - } - } - if (res >= 0) - { - int res_step = 
test_step(s, res, level + 1, start); - if (res_step >= 0) - { - return res_step; - } - else if (greedy && level >= min) - { - if (next) - { - res = next->test(s, res); - } - if (s->complete) - { - return res; - } - if (res >= 0 && parent->test_parent(s, res) >= 0) - { - return res; - } - } - } - return -1; - } - - virtual int test_parent(nrex_search* s, int pos) const - { - s->complete = false; - return pos; - } -}; - -struct nrex_node_anchor : public nrex_node -{ - bool end; - - nrex_node_anchor(bool end) - : nrex_node() - , end(end) - { - length = 0; - } - - int test(nrex_search* s, int pos) const - { - if (!end && pos != 0) - { - return -1; - } - else if (end && pos != s->end) - { - return -1; - } - return next ? next->test(s, pos) : pos; - } -}; - -struct nrex_node_word_boundary : public nrex_node -{ - bool inverse; - - nrex_node_word_boundary(bool inverse) - : nrex_node() - , inverse(inverse) - { - length = 0; - } - - int test(nrex_search* s, int pos) const - { - bool left = false; - bool right = false; - if (pos != 0) - { - nrex_char c = s->at(pos - 1); - if (c == '_' || NREX_ISALPHANUM(c)) - { - left = true; - } - } - if (pos != s->end) - { - nrex_char c = s->at(pos); - if (c == '_' || NREX_ISALPHANUM(c)) - { - right = true; - } - } - if ((left != right) == inverse) - { - return -1; - } - return next ? next->test(s, pos) : pos; - } -}; - -struct nrex_node_backreference : public nrex_node -{ - int ref; - - nrex_node_backreference(int ref) - : nrex_node(true) - , ref(ref) - { - length = -1; - } - - int test(nrex_search* s, int pos) const - { - nrex_result& r = s->captures[ref]; - for (int i = 0; i < r.length; ++i) - { - if (pos + i >= s->end) - { - return -1; - } - if (s->at(r.start + i) != s->at(pos + i)) - { - return -1; - } - } - return next ? 
next->test(s, pos + r.length) : pos + r.length; - } -}; - -bool nrex_has_lookbehind(nrex_array& stack) -{ - for (unsigned int i = 0; i < stack.size(); i++) - { - if (stack[i]->type == nrex_group_look_behind) - { - return true; - } - } - return false; -} - -nrex::nrex() - : _capturing(0) - , _lookahead_depth(0) - , _root(NULL) -{ -} - -nrex::nrex(const nrex_char* pattern, int captures) - : _capturing(0) - , _lookahead_depth(0) - , _root(NULL) -{ - compile(pattern, captures); -} - -nrex::~nrex() -{ - if (_root) - { - NREX_DELETE(_root); - } -} - -bool nrex::valid() const -{ - return (_root != NULL); -} - -void nrex::reset() -{ - _capturing = 0; - _lookahead_depth = 0; - if (_root) - { - NREX_DELETE(_root); - } - _root = NULL; -} - -int nrex::capture_size() const -{ - if (_root) - { - return _capturing + 1; - } - return 0; -} - -bool nrex::compile(const nrex_char* pattern, int captures) -{ - reset(); - nrex_node_group* root = NREX_NEW(nrex_node_group(nrex_group_capture, _capturing)); - nrex_array stack; - stack.push(root); - unsigned int lookahead_level = 0; - _root = root; - - for (const nrex_char* c = pattern; c[0] != '\0'; ++c) - { - if (c[0] == '(') - { - if (c[1] == '?') - { - if (c[2] == ':') - { - c = &c[2]; - nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_non_capture)); - stack.top()->add_child(group); - stack.push(group); - } - else if (c[2] == '!' || c[2] == '=') - { - c = &c[2]; - nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_look_ahead, lookahead_level++)); - group->negate = (c[0] == '!'); - stack.top()->add_child(group); - stack.push(group); - if (lookahead_level > _lookahead_depth) - { - _lookahead_depth = lookahead_level; - } - } - else if (c[2] == '<' && (c[3] == '!' 
|| c[3] == '=')) - { - c = &c[3]; - nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_look_behind)); - group->negate = (c[0] == '!'); - stack.top()->add_child(group); - stack.push(group); - } - else - { - NREX_COMPILE_ERROR("unrecognised qualifier for group"); - } - } - else if (captures >= 0 && _capturing < captures) - { - nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_capture, ++_capturing)); - stack.top()->add_child(group); - stack.push(group); - } - else - { - nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_non_capture)); - stack.top()->add_child(group); - stack.push(group); - } - } - else if (c[0] == ')') - { - if (stack.size() > 1) - { - if (stack.top()->type == nrex_group_look_ahead) - { - --lookahead_level; - } - stack.pop(); - } - else - { - NREX_COMPILE_ERROR("unexpected ')'"); - } - } - else if (c[0] == '[') - { - nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_bracket)); - stack.top()->add_child(group); - if (c[1] == '^') - { - group->negate = true; - ++c; - } - bool first_child = true; - nrex_char previous_child; - bool previous_child_single = false; - while (true) - { - group->add_childset(); - ++c; - if (c[0] == '\0') - { - NREX_COMPILE_ERROR("unclosed bracket expression '['"); - } - if (c[0] == '[' && c[1] == ':') - { - const nrex_char* d = &c[2]; - nrex_class_type cls = nrex_parse_class(&d); - if (cls != nrex_class_none) - { - c = d; - group->add_child(NREX_NEW(nrex_node_class(cls))); - previous_child_single = false; - } - else - { - group->add_child(NREX_NEW(nrex_node_char('['))); - previous_child = '['; - previous_child_single = true; - } - } - else if (c[0] == ']' && !first_child) - { - break; - } - else if (c[0] == '\\') - { - if (nrex_is_shorthand(c[1])) - { - group->add_child(NREX_NEW(nrex_node_shorthand(c[1]))); - ++c; - previous_child_single = false; - } - else - { - const nrex_char* d = c; - nrex_char unescaped = nrex_unescape(d); - if (c == d) - { - NREX_COMPILE_ERROR("invalid escape 
token"); - } - group->add_child(NREX_NEW(nrex_node_char(unescaped))); - c = d; - previous_child = unescaped; - previous_child_single = true; - } - } - else if (previous_child_single && c[0] == '-') - { - bool is_range = false; - nrex_char next; - if (c[1] != '\0' && c[1] != ']') - { - if (c[1] == '\\') - { - const nrex_char* d = ++c; - next = nrex_unescape(d); - if (c == d) - { - NREX_COMPILE_ERROR("invalid escape token in range"); - } - } - else - { - next = c[1]; - ++c; - } - is_range = true; - } - if (is_range) - { - if (next < previous_child) - { - NREX_COMPILE_ERROR("text range out of order"); - } - group->pop_back(); - group->add_child(NREX_NEW(nrex_node_range(previous_child, next))); - previous_child_single = false; - } - else - { - group->add_child(NREX_NEW(nrex_node_char(c[0]))); - previous_child = c[0]; - previous_child_single = true; - } - } - else - { - group->add_child(NREX_NEW(nrex_node_char(c[0]))); - previous_child = c[0]; - previous_child_single = true; - } - first_child = false; - } - } - else if (nrex_is_quantifier(c[0])) - { - int min = 0; - int max = -1; - bool valid_quantifier = true; - if (c[0] == '?') - { - min = 0; - max = 1; - } - else if (c[0] == '+') - { - min = 1; - max = -1; - } - else if (c[0] == '*') - { - min = 0; - max = -1; - } - else if (c[0] == '{') - { - bool max_set = false; - const nrex_char* d = c; - while (true) - { - ++d; - if (d[0] == '\0') - { - valid_quantifier = false; - break; - } - else if (d[0] == '}') - { - break; - } - else if (d[0] == ',') - { - max_set = true; - continue; - } - else if (d[0] < '0' || '9' < d[0]) - { - valid_quantifier = false; - break; - } - if (max_set) - { - if (max < 0) - { - max = int(d[0] - '0'); - } - else - { - max = max * 10 + int(d[0] - '0'); - } - } - else - { - min = min * 10 + int(d[0] - '0'); - } - } - if (!max_set) - { - max = min; - } - if (valid_quantifier) - { - c = d; - } - } - if (valid_quantifier) - { - if (stack.top()->back == NULL || !stack.top()->back->quantifiable) - { - 
NREX_COMPILE_ERROR("element not quantifiable"); - } - nrex_node_quantifier* quant = NREX_NEW(nrex_node_quantifier(min, max)); - if (min == max) - { - if (stack.top()->back->length >= 0) - { - quant->length = max * stack.top()->back->length; - } - } - else - { - if (nrex_has_lookbehind(stack)) - { - NREX_COMPILE_ERROR("variable length quantifiers inside lookbehind not supported"); - } - } - quant->child = stack.top()->swap_back(quant); - quant->child->previous = NULL; - quant->child->next = NULL; - quant->child->parent = quant; - if (c[1] == '?') - { - quant->greedy = false; - ++c; - } - } - else - { - stack.top()->add_child(NREX_NEW(nrex_node_char(c[0]))); - } - } - else if (c[0] == '|') - { - if (nrex_has_lookbehind(stack)) - { - NREX_COMPILE_ERROR("alternations inside lookbehind not supported"); - } - stack.top()->add_childset(); - } - else if (c[0] == '^' || c[0] == '$') - { - stack.top()->add_child(NREX_NEW(nrex_node_anchor((c[0] == '$')))); - } - else if (c[0] == '.') - { - stack.top()->add_child(NREX_NEW(nrex_node_shorthand('.'))); - } - else if (c[0] == '\\') - { - if (nrex_is_shorthand(c[1])) - { - stack.top()->add_child(NREX_NEW(nrex_node_shorthand(c[1]))); - ++c; - } - else if (('1' <= c[1] && c[1] <= '9') || (c[1] == 'g' && c[2] == '{')) - { - int ref = 0; - bool unclosed = false; - if (c[1] == 'g') - { - unclosed = true; - c = &c[2]; - } - while ('0' <= c[1] && c[1] <= '9') - { - ref = ref * 10 + int(c[1] - '0'); - ++c; - } - if (c[1] == '}') - { - unclosed = false; - ++c; - } - if (ref > _capturing || ref <= 0 || unclosed) - { - NREX_COMPILE_ERROR("backreference to non-existent capture"); - } - if (nrex_has_lookbehind(stack)) - { - NREX_COMPILE_ERROR("backreferences inside lookbehind not supported"); - } - stack.top()->add_child(NREX_NEW(nrex_node_backreference(ref))); - } - else if (c[1] == 'b' || c[1] == 'B') - { - stack.top()->add_child(NREX_NEW(nrex_node_word_boundary(c[1] == 'B'))); - ++c; - } - else - { - const nrex_char* d = c; - nrex_char 
unescaped = nrex_unescape(d); - if (c == d) - { - NREX_COMPILE_ERROR("invalid escape token"); - } - stack.top()->add_child(NREX_NEW(nrex_node_char(unescaped))); - c = d; - } - } - else - { - stack.top()->add_child(NREX_NEW(nrex_node_char(c[0]))); - } - } - if (stack.size() > 1) - { - NREX_COMPILE_ERROR("unclosed group '('"); - } - return true; -} - -bool nrex::match(const nrex_char* str, nrex_result* captures, int offset, int end) const -{ - if (!_root) - { - return false; - } - nrex_search s(str, captures, _lookahead_depth); - if (end >= offset) - { - s.end = end; - } - else - { - s.end = NREX_STRLEN(str); - } - for (int i = offset; i <= s.end; ++i) - { - for (int c = 0; c <= _capturing; ++c) - { - captures[c].start = 0; - captures[c].length = 0; - } - if (_root->test(&s, i) >= 0) - { - return true; - } - } - return false; -} diff --git a/drivers/nrex/nrex.hpp b/drivers/nrex/nrex.hpp deleted file mode 100644 index d30b7d01029..00000000000 --- a/drivers/nrex/nrex.hpp +++ /dev/null @@ -1,176 +0,0 @@ -// NREX: Node RegEx -// Version 0.2 -// -// Copyright (c) 2015-2016, Zher Huei Lee -// All rights reserved. -// -// This software is provided 'as-is', without any express or implied -// warranty. In no event will the authors be held liable for any damages -// arising from the use of this software. -// -// Permission is granted to anyone to use this software for any purpose, -// including commercial applications, and to alter it and redistribute it -// freely, subject to the following restrictions: -// -// 1. The origin of this software must not be misrepresented; you must not -// claim that you wrote the original software. If you use this software -// in a product, an acknowledgment in the product documentation would -// be appreciated but is not required. -// -// 2. Altered source versions must be plainly marked as such, and must not -// be misrepresented as being the original software. -// -// 3. 
This notice may not be removed or altered from any source -// distribution. -// - -#ifndef NREX_HPP -#define NREX_HPP - -#include "nrex_config.h" - -#ifdef NREX_UNICODE -typedef wchar_t nrex_char; -#else -typedef char nrex_char; -#endif - -/*! - * \brief Struct to contain the range of a capture result - * - * The range provided is relative to the begining of the searched string. - * - * \see nrex_node::match() - */ -struct nrex_result -{ - public: - int start; /*!< Start of text range */ - int length; /*!< Length of text range */ -}; - -class nrex_node; - -/*! - * \brief Holds the compiled regex pattern - */ -class nrex -{ - private: - unsigned int _capturing; - unsigned int _lookahead_depth; - nrex_node* _root; - public: - - /*! - * \brief Initialises an empty regex container - */ - nrex(); - - /*! - * \brief Initialises and compiles the regex pattern - * - * This calls nrex::compile() with the same arguments. To check whether - * the compilation was successfull, use nrex::valid(). - * - * If the NREX_THROW_ERROR was defined it would automatically throw a - * runtime error nrex_compile_error if it encounters a problem when - * parsing the pattern. - * - * \param pattern The regex pattern - * \param captures The maximum number of capture groups to allow. Any - * extra would be converted to non-capturing groups. - * If negative, no limit would be imposed. Defaults - * to 9. - * - * \see nrex::compile() - */ - nrex(const nrex_char* pattern, int captures = 9); - - ~nrex(); - - /*! - * \brief Removes the compiled regex and frees up the memory - */ - void reset(); - - /*! - * \brief Checks if there is a compiled regex being stored - * \return True if present, False if not present - */ - bool valid() const; - - /*! - * \brief Provides number of captures the compiled regex uses - * - * This is used to provide the array size of the captures needed for - * nrex::match() to work. 
The size is actually the number of capture - * groups + one for the matching of the entire pattern. This can be - * capped using the extra argument given in nrex::compile() - * (default 10). - * - * \return The number of captures - */ - int capture_size() const; - - /*! - * \brief Compiles the provided regex pattern - * - * This automatically removes the existing compiled regex if already - * present. - * - * If the NREX_THROW_ERROR was defined it would automatically throw a - * runtime error nrex_compile_error if it encounters a problem when - * parsing the pattern. - * - * \param pattern The regex pattern - * \param captures The maximum number of capture groups to allow. Any - * extra would be converted to non-capturing groups. - * If negative, no limit would be imposed. Defaults - * to 9. - * \return True if the pattern was succesfully compiled - */ - bool compile(const nrex_char* pattern, int captures = 9); - - /*! - * \brief Uses the pattern to search through the provided string - * \param str The text to search through. It only needs to be - * null terminated if the end point is not provided. - * This also determines the starting anchor. - * \param captures The array of results to store the capture results. - * The size of that array needs to be the same as the - * size given in nrex::capture_size(). As it matches - * the function fills the array with the results. 0 is - * the result for the entire pattern, 1 and above - * corresponds to the regex capture group if present. - * \param offset The starting point of the search. This does not move - * the starting anchor. Defaults to 0. - * \param end The end point of the search. This also determines - * the ending anchor. If a number less than the offset - * is provided, the search would be done until null - * termination. Defaults to -1. - * \return True if a match was found. False otherwise. 
- */ - bool match(const nrex_char* str, nrex_result* captures, int offset = 0, int end = -1) const; -}; - -#ifdef NREX_THROW_ERROR - -#include - -class nrex_compile_error : std::runtime_error -{ - public: - nrex_compile_error(const char* message) - : std::runtime_error(message) - { - } - - ~nrex_compile_error() throw() - { - } -}; - -#endif - -#endif // NREX_HPP diff --git a/drivers/nrex/nrex_config.h b/drivers/nrex/nrex_config.h deleted file mode 100644 index 540f34f8b49..00000000000 --- a/drivers/nrex/nrex_config.h +++ /dev/null @@ -1,12 +0,0 @@ -// Godot-specific configuration -// To use this, replace nrex_config.h - -#include "core/os/memory.h" - -#define NREX_UNICODE -//#define NREX_THROW_ERROR - -#define NREX_NEW(X) memnew(X) -#define NREX_NEW_ARRAY(X, N) memnew_arr(X, N) -#define NREX_DELETE(X) memdelete(X) -#define NREX_DELETE_ARRAY(X) memdelete_arr(X) diff --git a/drivers/nrex/regex.cpp b/drivers/nrex/regex.cpp deleted file mode 100644 index 7bf14d14ad2..00000000000 --- a/drivers/nrex/regex.cpp +++ /dev/null @@ -1,142 +0,0 @@ -/*************************************************************************/ -/* regex.cpp */ -/*************************************************************************/ -/* This file is part of: */ -/* GODOT ENGINE */ -/* http://www.godotengine.org */ -/*************************************************************************/ -/* Copyright (c) 2007-2016 Juan Linietsky, Ariel Manzur. 
*/ -/* */ -/* Permission is hereby granted, free of charge, to any person obtaining */ -/* a copy of this software and associated documentation files (the */ -/* "Software"), to deal in the Software without restriction, including */ -/* without limitation the rights to use, copy, modify, merge, publish, */ -/* distribute, sublicense, and/or sell copies of the Software, and to */ -/* permit persons to whom the Software is furnished to do so, subject to */ -/* the following conditions: */ -/* */ -/* The above copyright notice and this permission notice shall be */ -/* included in all copies or substantial portions of the Software. */ -/* */ -/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ -/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ -/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ -/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ -/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ -/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ -/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -/*************************************************************************/ -#include "regex.h" -#include "nrex.hpp" -#include "core/os/memory.h" - -void RegEx::_bind_methods() { - - ObjectTypeDB::bind_method(_MD("compile","pattern", "capture"),&RegEx::compile, DEFVAL(9)); - ObjectTypeDB::bind_method(_MD("find","text","start","end"),&RegEx::find, DEFVAL(0), DEFVAL(-1)); - ObjectTypeDB::bind_method(_MD("clear"),&RegEx::clear); - ObjectTypeDB::bind_method(_MD("is_valid"),&RegEx::is_valid); - ObjectTypeDB::bind_method(_MD("get_capture_count"),&RegEx::get_capture_count); - ObjectTypeDB::bind_method(_MD("get_capture","capture"),&RegEx::get_capture); - ObjectTypeDB::bind_method(_MD("get_capture_start","capture"),&RegEx::get_capture_start); - ObjectTypeDB::bind_method(_MD("get_captures"),&RegEx::_bind_get_captures); - -}; - -StringArray RegEx::_bind_get_captures() const { - - StringArray ret; - int count = get_capture_count(); - for (int i=0; i(); } void unregister_core_driver_types() { diff --git a/drivers/nrex/SCsub b/modules/regex/SCsub similarity index 50% rename from drivers/nrex/SCsub rename to modules/regex/SCsub index ee39fd26317..08824067616 100644 --- a/drivers/nrex/SCsub +++ b/modules/regex/SCsub @@ -2,6 +2,6 @@ Import('env') -env.add_source_files(env.drivers_sources, "*.cpp") +env.add_source_files(env.modules_sources, "*.cpp") Export('env') diff --git a/modules/regex/config.py b/modules/regex/config.py new file mode 100644 index 00000000000..667b5d8ba65 --- /dev/null +++ b/modules/regex/config.py @@ -0,0 +1,8 @@ +#!/usr/bin/env python + +def can_build(platform): + return True + +def configure(env): + pass + diff --git a/modules/regex/regex.cpp b/modules/regex/regex.cpp new file mode 100644 index 00000000000..8f26d764c42 --- /dev/null +++ b/modules/regex/regex.cpp @@ -0,0 +1,1465 @@ +/*************************************************************************/ +/* regex.cpp */ +/*************************************************************************/ +/* 
This file is part of: */ +/* GODOT ENGINE */ +/* http://www.godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2016 Juan Linietsky, Ariel Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/*************************************************************************/ + +#include "regex.h" +#include +#include + +static int RegEx_hex2int(const CharType c) +{ + if ('0' <= c && c <= '9') + return int(c - '0'); + else if ('a' <= c && c <= 'f') + return int(c - 'a') + 10; + else if ('A' <= c && c <= 'F') + return int(c - 'A') + 10; + return -1; +} + +struct RegExSearch { + + Ref match; + const CharType* str; + int end; + int eof; + + // For standard quantifier behaviour, test_parent is used to check the + // rest of the pattern. 
If the pattern matches, to prevent the parent + // from testing again, the complete flag is used as a shortcut out. + bool complete; + + // With lookahead, the position needs to rewind to its starting position + // when test_parent is used. Due to functional programming, this state + // has to be kept as a parameter. + Vector lookahead_pos; + + CharType at(int p_pos) { + return str[p_pos]; + } + + RegExSearch(Ref& p_match, int p_end, int p_lookahead) : match(p_match) { + + str = p_match->string.c_str(); + end = p_end; + eof = p_match->string.length(); + complete = false; + lookahead_pos.resize(p_lookahead); + } + +}; + +struct RegExNode { + + RegExNode* next; + RegExNode* previous; + RegExNode* parent; + bool quantifiable; + int length; + + RegExNode() { + + next = NULL; + previous = NULL; + parent = NULL; + quantifiable = false; + length = -1; + } + + virtual ~RegExNode() { + + if (next) + memdelete(next); + } + + virtual int test(RegExSearch& s, int pos) const { + + return next ? next->test(s, pos) : -1; + } + + virtual int test_parent(RegExSearch& s, int pos) const { + + if (next) + pos = next->test(s, pos); + + if (pos >= 0) { + s.complete = true; + if (parent) + pos = parent->test_parent(s, pos); + } + + if (pos < 0) + s.complete = false; + + return pos; + } + + void increment_length(int amount, bool subtract = false) { + + if (amount >= 0 && length >= 0) { + if (!subtract) + length += amount; + else + length -= amount; + } else { + length = -1; + } + + if (parent) + parent->increment_length(amount, subtract); + + } + +}; + +struct RegExNodeChar : public RegExNode { + + CharType ch; + + RegExNodeChar(CharType p_char) { + + length = 1; + quantifiable = true; + ch = p_char; + } + + virtual int test(RegExSearch& s, int pos) const { + + if (s.end <= pos || 0 > pos || s.at(pos) != ch) + return -1; + + return next ? 
next->test(s, pos + 1) : pos + 1; + } + + static CharType parse_escape(const CharType*& c) { + + int point = 0; + switch (c[1]) { + case 'x': + for (int i = 2; i <= 3; ++i) { + int res = RegEx_hex2int(c[i]); + if (res == -1) + return '\0'; + point = (point << 4) + res; + } + c = &c[3]; + return CharType(point); + case 'u': + for (int i = 2; i <= 5; ++i) { + int res = RegEx_hex2int(c[i]); + if (res == -1) + return '\0'; + point = (point << 4) + res; + } + c = &c[5]; + return CharType(point); + case '0': ++c; return '\0'; + case 'a': ++c; return '\a'; + case 'e': ++c; return '\e'; + case 'f': ++c; return '\f'; + case 'n': ++c; return '\n'; + case 'r': ++c; return '\r'; + case 't': ++c; return '\t'; + case 'v': ++c; return '\v'; + case 'b': ++c; return '\b'; + default: break; + } + return (++c)[0]; + } +}; + +struct RegExNodeRange : public RegExNode { + + CharType start; + CharType end; + + RegExNodeRange(CharType p_start, CharType p_end) { + + length = 1; + quantifiable = true; + start = p_start; + end = p_end; + } + + virtual int test(RegExSearch& s, int pos) const { + + if (s.end <= pos || 0 > pos) + return -1; + + CharType c = s.at(pos); + if (c < start || end < c) + return -1; + + return next ? next->test(s, pos + 1) : pos + 1; + } +}; + +struct RegExNodeShorthand : public RegExNode { + + CharType repr; + + RegExNodeShorthand(CharType p_repr) { + + length = 1; + quantifiable = true; + repr = p_repr; + } + + virtual int test(RegExSearch& s, int pos) const { + + if (s.end <= pos || 0 > pos) + return -1; + + bool found = false; + bool invert = false; + CharType c = s.at(pos); + switch (repr) { + case '.': + found = true; + break; + case 'W': + invert = true; + case 'w': + found = (c == '_' || iswalnum(c) != 0); + break; + case 'D': + invert = true; + case 'd': + found = ('0' <= c && c <= '9'); + break; + case 'S': + invert = true; + case 's': + found = (iswspace(c) != 0); + break; + default: + break; + } + + if (found == invert) + return -1; + + return next ? 
next->test(s, pos + 1) : pos + 1;
+	}
+};
+
+// Node that matches a single character against one named POSIX class
+// (the name written between "[:" and ":]" inside a bracket expression).
+struct RegExNodeClass : public RegExNode {
+
+	enum Type {
+		Type_none,
+		Type_alnum,
+		Type_alpha,
+		Type_ascii,
+		Type_blank,
+		Type_cntrl,
+		Type_digit,
+		Type_graph,
+		Type_lower,
+		Type_print,
+		Type_punct,
+		Type_space,
+		Type_upper,
+		Type_xdigit,
+		Type_word
+	};
+
+	Type type;
+
+	// Returns true when c belongs to the class selected by `type`.
+	// Every class is defined over ASCII code points only; Type_none (and
+	// any unknown value) never matches.
+	bool test_class(CharType c) const {
+
+		static Vector<CharType> REGEX_NODE_SPACE = String(" \t\r\n\f");
+		static Vector<CharType> REGEX_NODE_PUNCT = String("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~");
+
+		switch (type) {
+			case Type_alnum:
+				if ('0' <= c && c <= '9') return true;
+				if ('a' <= c && c <= 'z') return true;
+				if ('A' <= c && c <= 'Z') return true;
+				return false;
+			case Type_alpha:
+				if ('a' <= c && c <= 'z') return true;
+				if ('A' <= c && c <= 'Z') return true;
+				return false;
+			case Type_ascii:
+				return (0x00 <= c && c <= 0x7F);
+			case Type_blank:
+				return (c == ' ' || c == '\t');
+			case Type_cntrl:
+				return ((0x00 <= c && c <= 0x1F) || c == 0x7F);
+			case Type_digit:
+				return ('0' <= c && c <= '9');
+			case Type_graph:
+				return (0x20 < c && c < 0x7F);
+			case Type_lower:
+				return ('a' <= c && c <= 'z');
+			case Type_print:
+				// Printable ASCII runs from 0x20 (space) to 0x7E. The upper
+				// bound has to be 0x7F; with 0x1F on both sides the range
+				// is empty and [:print:] could never match.
+				return (0x1F < c && c < 0x7F);
+			case Type_punct:
+				return (REGEX_NODE_PUNCT.find(c) >= 0);
+			case Type_space:
+				return (REGEX_NODE_SPACE.find(c) >= 0);
+			case Type_upper:
+				return ('A' <= c && c <= 'Z');
+			case Type_xdigit:
+				if ('0' <= c && c <= '9') return true;
+				if ('a' <= c && c <= 'f') return true;
+				if ('A' <= c && c <= 'F') return true;
+				return false;
+			case Type_word:
+				if ('0' <= c && c <= '9') return true;
+				if ('a' <= c && c <= 'z') return true;
+				if ('A' <= c && c <= 'Z') return true;
+				return (c == '_');
+			default:
+				return false;
+		}
+		return false;
+	}
+
+	// A class node always consumes exactly one character and may be
+	// followed by a quantifier.
+	RegExNodeClass(Type p_type) {
+
+		length = 1;
+		quantifiable = true;
+		type = p_type;
+	}
+
+	// Fails (-1) when pos is outside [0, s.end) or the character at pos is
+	// not in the class; otherwise continues with the following position.
+	virtual int test(RegExSearch& s, int pos) const {
+
+		if (s.end <= pos || 0 > pos)
+			return -1;
+
+		if (!test_class(s.at(pos)))
+			return -1;
+
+		return next ?
next->test(s, pos + 1) : pos + 1; + } + +#define REGEX_CMP_CLASS(POS, NAME) if (cmp_class(POS, #NAME)) return Type_ ## NAME + + static Type parse_type(const CharType*& p_pos) { + + REGEX_CMP_CLASS(p_pos, alnum); + REGEX_CMP_CLASS(p_pos, alpha); + REGEX_CMP_CLASS(p_pos, ascii); + REGEX_CMP_CLASS(p_pos, blank); + REGEX_CMP_CLASS(p_pos, cntrl); + REGEX_CMP_CLASS(p_pos, digit); + REGEX_CMP_CLASS(p_pos, graph); + REGEX_CMP_CLASS(p_pos, lower); + REGEX_CMP_CLASS(p_pos, print); + REGEX_CMP_CLASS(p_pos, punct); + REGEX_CMP_CLASS(p_pos, space); + REGEX_CMP_CLASS(p_pos, upper); + REGEX_CMP_CLASS(p_pos, xdigit); + REGEX_CMP_CLASS(p_pos, word); + return Type_none; + } + + static bool cmp_class(const CharType*& p_pos, const char* p_text) { + + unsigned int i = 0; + for (i = 0; p_text[i] != '\0'; ++i) + if (p_pos[i] != p_text[i]) + return false; + + if (p_pos[i++] != ':' || p_pos[i] != ']') + return false; + + p_pos = &p_pos[i]; + return true; + } +}; + +struct RegExNodeAnchorStart : public RegExNode { + + RegExNodeAnchorStart() { + + length = 0; + } + + virtual int test(RegExSearch& s, int pos) const { + + if (pos != 0) + return -1; + + return next ? next->test(s, pos) : pos; + } +}; + +struct RegExNodeAnchorEnd : public RegExNode { + + RegExNodeAnchorEnd() { + + length = 0; + } + + virtual int test(RegExSearch& s, int pos) const { + + if (pos != s.eof) + return -1; + + return next ? next->test(s, pos) : pos; + } +}; + +struct RegExNodeWordBoundary : public RegExNode { + + bool inverse; + + RegExNodeWordBoundary(bool p_inverse) { + + length = 0; + inverse = p_inverse; + } + + virtual int test(RegExSearch& s, int pos) const { + + bool left = false; + bool right = false; + + if (pos != 0) { + CharType c = s.at(pos - 1); + if (c == '_' || iswalnum(c)) + left = true; + } + + if (pos != s.eof) { + CharType c = s.at(pos); + if (c == '_' || iswalnum(c)) + right = true; + } + + if ((left == right) != inverse) + return -1; + + return next ? 
next->test(s, pos) : pos; + } +}; + +struct RegExNodeQuantifier : public RegExNode { + + int min; + int max; + bool greedy; + RegExNode* child; + + RegExNodeQuantifier(int p_min, int p_max) { + + min = p_min; + max = p_max; + greedy = true; + child = NULL; + } + + ~RegExNodeQuantifier() { + + if (child) + memdelete(child); + } + + virtual int test(RegExSearch& s, int pos) const { + + return test_step(s, pos, 0, pos); + } + + virtual int test_parent(RegExSearch& s, int pos) const { + + s.complete = false; + return pos; + } + + int test_step(RegExSearch& s, int pos, int level, int start) const { + + if (pos > s.end) + return -1; + + if (!greedy && level > min) { + int res = next ? next->test(s, pos) : pos; + if (s.complete) + return res; + + if (res >= 0 && parent->test_parent(s, res) >= 0) + return res; + } + + if (max >= 0 && level > max) + return -1; + + int res = pos; + if (level >= 1) { + if (level > min + 1 && pos == start) + return -1; + + res = child->test(s, pos); + if (s.complete) + return res; + } + + if (res >= 0) { + + int res_step = test_step(s, res, level + 1, start); + if (res_step >= 0) + return res_step; + + if (greedy && level >= min) { + if (next) + res = next->test(s, res); + if (s.complete) + return res; + + if (res >= 0 && parent->test_parent(s, res) >= 0) + return res; + } + } + return -1; + } +}; + +struct RegExNodeBackReference : public RegExNode { + + int id; + + RegExNodeBackReference(int p_id) { + + length = -1; + quantifiable = true; + id = p_id; + } + + virtual int test(RegExSearch& s, int pos) const { + + RegExMatch::Group& ref = s.match->captures[id]; + for (int i = 0; i < ref.length; ++i) { + + if (pos + i >= s.end) + return -1; + + if (s.at(ref.start + i) != s.at(pos + i)) + return -1; + } + return next ? 
next->test(s, pos + ref.length) : pos + ref.length; + } +}; + + +struct RegExNodeGroup : public RegExNode { + + bool inverse; + bool reset_pos; + Vector childset; + RegExNode* back; + + RegExNodeGroup() { + + length = 0; + quantifiable = true; + inverse = false; + reset_pos = false; + back = NULL; + } + + virtual ~RegExNodeGroup() { + + for (int i = 0; i < childset.size(); ++i) + memdelete(childset[i]); + } + + virtual int test(RegExSearch& s, int pos) const { + + for (int i = 0; i < childset.size(); ++i) { + + s.complete = false; + + int res = childset[i]->test(s, pos); + + if (s.complete) + return res; + + if (inverse) { + if (res < 0) + res = pos + 1; + else + return -1; + + if (i + 1 < childset.size()) + continue; + } + + if (res >= 0) { + if (reset_pos) + res = pos; + return next ? next->test(s, res) : res; + } + } + return -1; + } + + void add_child(RegExNode* node) { + + node->parent = this; + node->previous = back; + + if (back) + back->next = node; + else + childset.push_back(node); + + increment_length(node->length); + + back = node; + } + + void add_childset() { + + if (childset.size() > 0) + length = -1; + back = NULL; + } + + RegExNode* swap_back(RegExNode* node) { + + RegExNode* old = back; + + if (old) { + if (!old->previous) + childset.remove(childset.size() - 1); + back = old->previous; + increment_length(old->length, true); + } + + add_child(node); + + return old; + } +}; + +struct RegExNodeCapturing : public RegExNodeGroup { + + int id; + + RegExNodeCapturing(int p_id = 0) { + + id = p_id; + } + + virtual int test(RegExSearch& s, int pos) const { + + RegExMatch::Group& ref = s.match->captures[id]; + int old_start = ref.start; + ref.start = pos; + + int res = RegExNodeGroup::test(s, pos); + + if (res >= 0) { + if (!s.complete) + ref.length = res - pos; + } else { + ref.start = old_start; + } + + return res; + } + + virtual int test_parent(RegExSearch& s, int pos) const { + + RegExMatch::Group& ref = s.match->captures[id]; + ref.length = pos - 
ref.start; + return RegExNode::test_parent(s, pos); + } + + static Variant parse_name(const CharType*& c, bool p_allow_numeric) { + + if (c[1] == '0') { + return -1; + } else if ('1' <= c[1] && c[1] <= '9') { + if (!p_allow_numeric) + return -1; + int res = (++c)[0] - '0'; + while ('0' <= c[1] && c[1] <= '9') + res = res * 10 + int((++c)[0] - '0'); + if ((++c)[0] != '>') + return -1; + return res; + } else if (iswalnum(c[1])) { + String res(++c, 1); + while (iswalnum(c[1])) + res += String(++c, 1); + if ((++c)[0] != '>') + return -1; + return res; + } + return -1; + } +}; + +struct RegExNodeLookAhead : public RegExNodeGroup { + + int id; + + RegExNodeLookAhead(bool p_inverse, int p_id = 0) { + + quantifiable = false; + inverse = p_inverse; + reset_pos = true; + id = p_id; + } + + virtual int test(RegExSearch& s, int pos) const { + + s.lookahead_pos[id] = pos; + return RegExNodeGroup::test(s, pos); + } + + virtual int test_parent(RegExSearch& s, int pos) const { + + return RegExNode::test_parent(s, s.lookahead_pos[id]); + } +}; + +struct RegExNodeLookBehind : public RegExNodeGroup { + + RegExNodeLookBehind(bool p_inverse, int p_id = 0) { + + quantifiable = false; + inverse = p_inverse; + reset_pos = true; + } + + virtual int test(RegExSearch& s, int pos) const { + + if (pos < length) + return -1; + return RegExNodeGroup::test(s, pos - length); + } +}; + +struct RegExNodeBracket : public RegExNode { + + bool inverse; + Vector children; + + RegExNodeBracket() { + + length = 1; + quantifiable = true; + inverse = false; + } + + virtual ~RegExNodeBracket() { + + for (int i = 0; i < children.size(); ++i) + memdelete(children[i]); + } + + virtual int test(RegExSearch& s, int pos) const { + + for (int i = 0; i < children.size(); ++i) { + + int res = children[i]->test(s, pos); + + if (inverse) { + if (res < 0) + res = pos + 1; + else + return -1; + + if (i + 1 < children.size()) + continue; + } + + if (res >= 0) + return next ? 
next->test(s, res) : res; + } + return -1; + } + + void add_child(RegExNode* node) { + + node->parent = this; + children.push_back(node); + } + + void pop_back() { + + memdelete(children[children.size() - 1]); + children.remove(children.size() - 1); + } +}; + +#define REGEX_EXPAND_FAIL(MSG)\ +{\ + ERR_PRINT(MSG);\ + return String();\ +} + +String RegExMatch::expand(const String& p_template) const { + + String res; + for (const CharType* c = p_template.c_str(); *c != '\0'; ++c) { + if (c[0] == '\\') { + if (('1' <= c[1] && c[1] <= '9') || (c[1] == 'g' && c[2] == '{')) { + + int ref = 0; + bool unclosed = false; + + if (c[1] == 'g') { + unclosed = true; + c = &c[2]; + } + + while ('0' <= c[1] && c[1] <= '9') { + ref = ref * 10 + int(c[1] - '0'); + ++c; + } + + if (unclosed) { + if (c[1] != '}') + REGEX_EXPAND_FAIL("unclosed backreference '{'"); + ++c; + } + + res += get_string(ref); + + } else if (c[1] =='g' && c[2] == '<') { + + const CharType* d = &c[2]; + + Variant name = RegExNodeCapturing::parse_name(d, true); + if (name == Variant(-1)) + REGEX_EXPAND_FAIL("unrecognised character for group name"); + + c = d; + + res += get_string(name); + + } else { + + const CharType* d = c; + CharType ch = RegExNodeChar::parse_escape(d); + if (c == d) + REGEX_EXPAND_FAIL("invalid escape token"); + res += String(&ch, 1); + c = d; + } + } else { + res += String(c, 1); + } + } + return res; +} + +int RegExMatch::get_group_count() const { + + int count = 0; + for (int i = 1; i < captures.size(); ++i) + if (captures[i].name.get_type() == Variant::INT) + ++count; + return count; +} + +Array RegExMatch::get_group_array() const { + + Array res; + for (int i = 1; i < captures.size(); ++i) { + const RegExMatch::Group& capture = captures[i]; + if (capture.name.get_type() != Variant::INT) + continue; + + if (capture.start >= 0) + res.push_back(string.substr(capture.start, capture.length)); + else + res.push_back(String()); + } + return res; +} + +Array RegExMatch::get_names() const { + + 
Array res; + for (int i = 1; i < captures.size(); ++i) + if (captures[i].name.get_type() == Variant::STRING) + res.push_back(captures[i].name); + return res; +} + +Dictionary RegExMatch::get_name_dict() const { + + Dictionary res; + for (int i = 1; i < captures.size(); ++i) { + const RegExMatch::Group& capture = captures[i]; + if (capture.name.get_type() != Variant::STRING) + continue; + + if (capture.start >= 0) + res[capture.name] = string.substr(capture.start, capture.length); + else + res[capture.name] = String(); + } + return res; +} + +String RegExMatch::get_string(const Variant& p_name) const { + + for (int i = 0; i < captures.size(); ++i) { + + const RegExMatch::Group& capture = captures[i]; + + if (capture.name != p_name) + continue; + + if (capture.start == -1) + return String(); + + return string.substr(capture.start, capture.length); + } + return String(); +} + +int RegExMatch::get_start(const Variant& p_name) const { + + for (int i = 0; i < captures.size(); ++i) + if (captures[i].name == p_name) + return captures[i].start; + return -1; +} + +int RegExMatch::get_end(const Variant& p_name) const { + + for (int i = 0; i < captures.size(); ++i) + if (captures[i].name == p_name) + return captures[i].start + captures[i].length; + return -1; +} + +RegExMatch::RegExMatch() { + +} + +static bool RegEx_is_shorthand(CharType ch) { + + switch (ch) { + case 'w': + case 'W': + case 'd': + case 'D': + case 's': + case 'S': + return true; + default: + break; + } + return false; +} + +#define REGEX_COMPILE_FAIL(MSG)\ +{\ + ERR_PRINT(MSG);\ + clear();\ + return FAILED;\ +} + +Error RegEx::compile(const String& p_pattern) { + + if (pattern == p_pattern) + return OK; + + clear(); + pattern = p_pattern; + group_names.push_back(0); + RegExNodeGroup* root_group = memnew(RegExNodeCapturing(0)); + root = root_group; + Vector stack; + stack.push_back(root_group); + int lookahead_level = 0; + int numeric_groups = 0; + const int numeric_max = 9; + + for (const CharType* c = 
p_pattern.c_str(); *c != '\0'; ++c) { + + switch (c[0]) { + case '(': + if (c[1] == '?') { + + RegExNodeGroup* group = NULL; + switch (c[2]) { + case ':': + c = &c[2]; + group = memnew(RegExNodeGroup()); + break; + case '!': + case '=': + group = memnew(RegExNodeLookAhead((c[2] == '!'), lookahead_level++)); + if (lookahead_depth < lookahead_level) + lookahead_depth = lookahead_level; + c = &c[2]; + break; + case '<': + if (c[3] == '!' || c[3] == '=') { + group = memnew(RegExNodeLookBehind((c[3] == '!'), lookahead_level++)); + c = &c[3]; + } + break; + case 'P': + if (c[3] == '<') { + const CharType* d = &c[3]; + Variant name = RegExNodeCapturing::parse_name(d, false); + if (name == Variant(-1)) + REGEX_COMPILE_FAIL("unrecognised character for group name"); + group = memnew(RegExNodeCapturing(group_names.size())); + group_names.push_back(name); + c = d; + } + default: + break; + } + if (!group) + REGEX_COMPILE_FAIL("unrecognised qualifier for group"); + stack[0]->add_child(group); + stack.insert(0, group); + + } else if (numeric_groups < numeric_max) { + + RegExNodeCapturing* group = memnew(RegExNodeCapturing(group_names.size())); + group_names.push_back(++numeric_groups); + stack[0]->add_child(group); + stack.insert(0, group); + + } else { + + RegExNodeGroup* group = memnew(RegExNodeGroup()); + stack[0]->add_child(group); + stack.insert(0, group); + } + break; + case ')': + if (stack.size() == 1) + REGEX_COMPILE_FAIL("unexpected ')'"); + stack.remove(0); + break; + case '\\': + if (('1' <= c[1] && c[1] <= '9') || (c[1] == 'g' && c[2] == '{')) { + + int ref = 0; + bool unclosed = false; + + if (c[1] == 'g') { + unclosed = true; + c = &c[2]; + } + + while ('0' <= c[1] && c[1] <= '9') { + ref = ref * 10 + int(c[1] - '0'); + ++c; + } + + if (unclosed) { + if (c[1] != '}') + REGEX_COMPILE_FAIL("unclosed backreference '{'"); + ++c; + } + + if (ref > numeric_groups || ref <= 0) + REGEX_COMPILE_FAIL("backreference not found"); + + for (int i = 0; i < stack.size(); ++i) + 
if (dynamic_cast<RegExNodeLookBehind*>(stack[i]))
+						REGEX_COMPILE_FAIL("backreferences inside lookbehind not supported");
+
+				// Translate the user-visible group number into the position
+				// of that group inside group_names. Capturing nodes and the
+				// captures of a match use that same position as their id, and
+				// it differs from the number as soon as a named group was
+				// declared before the numbered one.
+				for (int i = 0; i < group_names.size(); ++i) {
+					if (group_names[i].get_type() == Variant::INT && int(group_names[i]) == ref) {
+						ref = i;
+						break;
+					}
+				}
+
+				stack[0]->add_child(memnew(RegExNodeBackReference(ref)));
+
+			// This must be an `else if`: after a numeric backreference has
+			// been consumed, c points at its last character and re-entering
+			// the chain would treat the next pattern character as an escape.
+			} else if (c[1] =='g' && c[2] == '<') {
+
+				const CharType* d = &c[2];
+
+				Variant name = RegExNodeCapturing::parse_name(d, true);
+				if (name == Variant(-1))
+					REGEX_COMPILE_FAIL("unrecognised character for group name");
+
+				c = d;
+
+				for (int i = 0; i < stack.size(); ++i)
+					if (dynamic_cast<RegExNodeLookBehind*>(stack[i]))
+						REGEX_COMPILE_FAIL("backreferences inside lookbehind not supported");
+
+				// Look the requested name (a String, or an int for the
+				// \g<1> form) up among the groups declared so far. Entry 0 is
+				// the whole match and is never a valid target.
+				int ref = -1;
+
+				for (int i = 1; i < group_names.size(); ++i) {
+					if (group_names[i] == name) {
+						ref = i;
+						break;
+					}
+				}
+
+				if (ref == -1)
+					REGEX_COMPILE_FAIL("backreference not found");
+
+				stack[0]->add_child(memnew(RegExNodeBackReference(ref)));
+
+			} else if (c[1] == 'b' || c[1] == 'B') {
+
+				stack[0]->add_child(memnew(RegExNodeWordBoundary(*(++c) == 'B')));
+
+			} else if (RegEx_is_shorthand(c[1])) {
+
+				stack[0]->add_child(memnew(RegExNodeShorthand(*(++c))));
+
+			} else {
+
+				// Any other escape is a literal character; parse_escape
+				// leaves d untouched when the escape is malformed.
+				const CharType* d = c;
+				CharType ch = RegExNodeChar::parse_escape(d);
+				if (c == d)
+					REGEX_COMPILE_FAIL("invalid escape token");
+				stack[0]->add_child(memnew(RegExNodeChar(ch)));
+				c = d;
+
+			}
+			break;
+		case '[':
+			{
+				RegExNodeBracket* bracket = memnew(RegExNodeBracket());
+				stack[0]->add_child(bracket);
+				if (c[1] == '^') {
+					bracket->inverse = true;
+					++c;
+				}
+				bool first_child = true;
+				CharType previous_child;
+				bool previous_child_single = false;
+				while (true) {
+					++c;
+					// A ']' directly after the opening bracket is a literal,
+					// so it only closes the expression from the second item on.
+					if (!first_child && c[0] == ']') {
+
+						break;
+
+					} else if (c[0] == '\0') {
+
+						REGEX_COMPILE_FAIL("unclosed bracket expression '['");
+
+					} else if (c[0] == '\\') {
+
+						if (RegEx_is_shorthand(c[1])) {
+							
bracket->add_child(memnew(RegExNodeShorthand(*(++c))));
+							// A shorthand cannot be the start of a range, so a
+							// following '-' must not reuse an older character.
+							previous_child_single = false;
+						} else {
+							const CharType* d = c;
+							CharType ch = RegExNodeChar::parse_escape(d);
+							if (c == d)
+								REGEX_COMPILE_FAIL("invalid escape token");
+							bracket->add_child(memnew(RegExNodeChar(ch)));
+							c = d;
+							previous_child = ch;
+							previous_child_single = true;
+						}
+
+					// POSIX class, written "[:name:]" inside the bracket
+					// expression; it starts with '[', not ']'.
+					} else if (c[0] == '[' && c[1] == ':') {
+
+						const CharType* d = &c[2];
+						RegExNodeClass::Type type = RegExNodeClass::parse_type(d);
+						if (type != RegExNodeClass::Type_none) {
+
+							// parse_type leaves d on the closing ']' of ":]".
+							bracket->add_child(memnew(RegExNodeClass(type)));
+							c = d;
+							previous_child_single = false;
+
+						} else {
+
+							// Not a known class name: the '[' is a literal.
+							bracket->add_child(memnew(RegExNodeChar('[')));
+							previous_child = '[';
+							previous_child_single = true;
+						}
+					} else if (previous_child_single && c[0] == '-') {
+
+						if (c[1] != '\0' && c[1] != ']') {
+
+							CharType next;
+
+							if (c[1] == '\\') {
+								const CharType* d = ++c;
+								next = RegExNodeChar::parse_escape(d);
+								if (c == d)
+									REGEX_COMPILE_FAIL("invalid escape token");
+								// Skip the whole escape; otherwise its
+								// remaining characters are read as literals.
+								c = d;
+							} else {
+								next = *(++c);
+							}
+
+							if (next < previous_child)
+								REGEX_COMPILE_FAIL("text range out of order");
+
+							// Replace the single character added last with
+							// the range it turned out to start.
+							bracket->pop_back();
+							bracket->add_child(memnew(RegExNodeRange(previous_child, next)));
+							previous_child_single = false;
+						} else {
+
+							// A trailing '-' is a literal dash.
+							bracket->add_child(memnew(RegExNodeChar('-')));
+							previous_child = '-';
+							previous_child_single = true;
+						}
+					} else {
+
+						bracket->add_child(memnew(RegExNodeChar(c[0])));
+						previous_child = c[0];
+						previous_child_single = true;
+					}
+					first_child = false;
+				}
+			}
+			break;
+		case '|':
+			for (int i = 0; i < stack.size(); ++i)
+				if (dynamic_cast<RegExNodeLookBehind*>(stack[i]))
+					REGEX_COMPILE_FAIL("alternations inside lookbehind not supported");
+			stack[0]->add_childset();
+			break;
+		case '^':
+			stack[0]->add_child(memnew(RegExNodeAnchorStart()));
+			break;
+		case '$':
+			stack[0]->add_child(memnew(RegExNodeAnchorEnd()));
+			break;
+		case '.':
+			stack[0]->add_child(memnew(RegExNodeShorthand('.')));
+			break;
+		case '?':
+		case '*':
+		case '+':
+		case '{':
+			{
+				// min_val/max_val describe the repetition count; a max_val
+				// of -1 means "no upper limit".
+				int min_val = 0;
+				int max_val = -1;
+				bool valid = true;
+				const
CharType* d = c; + bool max_set = true; + switch (c[0]) { + case '?': + min_val = 0; + max_val = 1; + break; + case '*': + min_val = 0; + max_val = -1; + break; + case '+': + min_val = 1; + max_val = -1; + break; + case '{': + max_set = false; + while (valid) { + ++d; + if (d[0] == '}') { + break; + } else if (d[0] == ',') { + max_set = true; + } else if ('0' <= d[0] && d[0] <= '9') { + if (max_set) { + if (max_val < 0) + max_val = int(d[0] - '0'); + else + max_val = max_val * 10 + int(d[0] - '0'); + } else { + min_val = min_val * 10 + int(d[0] - '0'); + } + } else { + valid = false; + } + } + break; + default: + break; + } + + if (!max_set) + max_val = min_val; + + if (valid) { + + c = d; + + if (stack[0]->back == NULL || !stack[0]->back->quantifiable) + REGEX_COMPILE_FAIL("element not quantifiable"); + + if (min_val != max_val) + for (int i = 0; i < stack.size(); ++i) + if (dynamic_cast(stack[i])) + REGEX_COMPILE_FAIL("variable length quantifiers inside lookbehind not supported"); + + RegExNodeQuantifier* quant = memnew(RegExNodeQuantifier(min_val, max_val)); + quant->child = stack[0]->swap_back(quant); + quant->child->previous = NULL; + quant->child->parent = quant; + + if (min_val == max_val && quant->child->length >= 0) + quant->length = max_val * quant->child->length; + + if (c[1] == '?') { + quant->greedy = false; + ++c; + } + break; + } + } + default: + stack[0]->add_child(memnew(RegExNodeChar(c[0]))); + break; + } + } + if (stack.size() > 1) + REGEX_COMPILE_FAIL("unclosed group '('"); + return OK; +} + +Ref RegEx::search(const String& p_text, int p_start, int p_end) const { + + Ref res = memnew(RegExMatch()); + + for (int i = 0; i < group_names.size(); ++i) { + RegExMatch::Group group; + group.name = group_names[i]; + res->captures.push_back(group); + } + + res->string = p_text; + + if (p_end < p_start || p_end > p_text.length()) + p_end = p_text.length(); + + RegExSearch s(res, p_end, lookahead_depth); + + for (int i = p_start; i <= s.end; ++i) { + for 
(int c = 0; c < group_names.size(); ++c) {
+			res->captures[c].start = -1;
+			res->captures[c].length = 0;
+		}
+		if (root->test(s, i) >= 0)
+			break;
+	}
+
+	if (res->captures[0].start >= 0)
+		return res;
+	return NULL; // no match: callers receive a null reference
+}
+
+String RegEx::sub(const String& p_text, const String& p_template, int p_start, int p_end) const {
+	// Replaces the first match search() finds between p_start and p_end with the expanded template.
+	Ref<RegExMatch> m = search(p_text, p_start, p_end);
+	if (m.is_null())
+		return p_text; // search() returns NULL when nothing matched; hand the text back unchanged
+	// captures[0] is the overall match; search() only returns a match whose start is >= 0.
+	const RegExMatch::Group& s = m->captures[0];
+	String res = p_text.substr(0, s.start) + m->expand(p_template);
+	int end = s.start + s.length;
+	if (end < p_text.length())
+		res += p_text.substr(end, p_text.length() - end); // keep whatever follows the match
+	return res;
+}
+
+void RegEx::clear() {
+	// Frees the node tree and resets every member to its freshly constructed value.
+	if (root)
+		memdelete(root);
+	root = NULL; // no dangling pointer: is_valid() and ~RegEx() both inspect root
+	pattern.clear();
+	group_names.clear();
+	lookahead_depth = 0;
+}
+
+bool RegEx::is_valid() const {
+	// True while a node tree is held in root.
+	return (root != NULL);
+}
+
+String RegEx::get_pattern() const {
+	// Returns the stored pattern string (emptied by clear()).
+	return pattern;
+}
+
+int RegEx::get_group_count() const {
+	// Counts the entries of group_names that hold an int, skipping entry 0.
+	int count = 0;
+	for (int i = 1; i < group_names.size(); ++i)
+		if (group_names[i].get_type() == Variant::INT)
+			++count;
+	return count;
+}
+
+Array RegEx::get_names() const {
+	// Collects the entries of group_names that hold a String, skipping entry 0.
+	Array res;
+	for (int i = 1; i < group_names.size(); ++i)
+		if (group_names[i].get_type() == Variant::STRING)
+			res.push_back(group_names[i]);
+	return res;
+}
+
+RegEx::RegEx() {
+	// Starts out with no node tree.
+	root = NULL;
+	lookahead_depth = 0;
+}
+
+RegEx::RegEx(const String& p_pattern) {
+	lookahead_depth = 0; // same defaults as RegEx(), so the member is never left uninitialised
+	root = NULL;
+	compile(p_pattern);
+}
+
+RegEx::~RegEx() {
+	// Releases the node tree if one is still held.
+	if (root)
+		memdelete(root);
+}
+
+void RegExMatch::_bind_methods() {
+	// Registers the RegExMatch methods with ObjectTypeDB; the "name" argument defaults to 0.
+	ObjectTypeDB::bind_method(_MD("expand","template"),&RegExMatch::expand);
+	ObjectTypeDB::bind_method(_MD("get_group_count"),&RegExMatch::get_group_count);
+	ObjectTypeDB::bind_method(_MD("get_group_array"),&RegExMatch::get_group_array);
+	ObjectTypeDB::bind_method(_MD("get_names"),&RegExMatch::get_names);
+	ObjectTypeDB::bind_method(_MD("get_name_dict"),&RegExMatch::get_name_dict);
+	ObjectTypeDB::bind_method(_MD("get_string","name"),&RegExMatch::get_string, DEFVAL(0));
+	ObjectTypeDB::bind_method(_MD("get_start","name"),&RegExMatch::get_start, DEFVAL(0));
+	ObjectTypeDB::bind_method(_MD("get_end","name"),&RegExMatch::get_end, DEFVAL(0));
+}
+
+void RegEx::_bind_methods() {
+	// Registers the RegEx methods with ObjectTypeDB; search/sub default to start = 0 and end = -1.
+	ObjectTypeDB::bind_method(_MD("clear"),&RegEx::clear);
+	ObjectTypeDB::bind_method(_MD("compile","pattern"),&RegEx::compile);
+	ObjectTypeDB::bind_method(_MD("search","text","start","end"),&RegEx::search, DEFVAL(0), DEFVAL(-1));
+	ObjectTypeDB::bind_method(_MD("sub","text","template","start","end"),&RegEx::sub, DEFVAL(0), DEFVAL(-1));
+	ObjectTypeDB::bind_method(_MD("is_valid"),&RegEx::is_valid);
+	ObjectTypeDB::bind_method(_MD("get_pattern"),&RegEx::get_pattern);
+	ObjectTypeDB::bind_method(_MD("get_group_count"),&RegEx::get_group_count);
+	ObjectTypeDB::bind_method(_MD("get_names"),&RegEx::get_names);
+}
+
diff --git a/modules/regex/regex.h b/modules/regex/regex.h
new file mode 100644
index 00000000000..283368c34fc
--- /dev/null
+++ b/modules/regex/regex.h
@@ -0,0 +1,114 @@
+/*************************************************************************/
+/* regex.h */
+/*************************************************************************/
+/* This file is part of: */
+/* GODOT ENGINE */
+/* http://www.godotengine.org */
+/*************************************************************************/
+/* Copyright (c) 2007-2016 Juan Linietsky, Ariel Manzur.
*/ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/
+/*************************************************************************/
+
+#ifndef REGEX_H
+#define REGEX_H
+
+#include "core/vector.h"
+#include "core/ustring.h"
+#include "core/dictionary.h"
+#include "core/reference.h"
+#include "core/resource.h"
+
+class RegExNode;
+// Result of RegEx::search(): the searched text plus one capture record per group.
+class RegExMatch : public Reference {
+
+	OBJ_TYPE(RegExMatch, Reference);
+	// One capture slot; name holds an int for numbered groups or a String for named ones.
+	struct Group {
+		Variant name;
+		int start; // reset to -1 by RegEx::search() before each attempt
+		int length;
+	};
+	// Slot 0 describes the overall match; slots are created in RegEx::group_names order.
+	Vector<Group> captures;
+	String string; // the text that was searched
+
+	friend class RegEx;
+	friend class RegExSearch;
+	friend class RegExNodeCapturing;
+	friend class RegExNodeBackReference;
+
+protected:
+
+	static void _bind_methods();
+
+public:
+	// NOTE(review): expand() presumably substitutes group references in p_template - confirm in regex.cpp.
+	String expand(const String& p_template) const;
+
+	int get_group_count() const;
+	Array get_group_array() const;
+
+	Array get_names() const;
+	Dictionary get_name_dict() const;
+	// NOTE(review): p_name presumably accepts a group index or a group name - confirm in regex.cpp.
+	String get_string(const Variant& p_name) const;
+	int get_start(const Variant& p_name) const;
+	int get_end(const Variant& p_name) const;
+
+	RegExMatch();
+
+};
+// Regular expression object; search() and sub() are const and report results via RegExMatch.
+class RegEx : public Reference {
+
+	OBJ_TYPE(RegEx, Reference);
+	// root is the node tree deleted by clear() and ~RegEx(); NULL means is_valid() is false.
+	RegExNode* root;
+	Vector<Variant> group_names; // int entries are numbered groups, String entries are named ones
+	String pattern;
+	int lookahead_depth; // handed to RegExSearch by search()
+
+protected:
+
+	static void _bind_methods();
+
+public:
+	// clear() frees the node tree and empties the pattern and group names.
+	void clear();
+	Error compile(const String& p_pattern);
+	// search() yields a null Ref when nothing matches; sub() rewrites only the first match.
+	Ref<RegExMatch> search(const String& p_text, int p_start = 0, int p_end = -1) const;
+	String sub(const String& p_text, const String& p_template, int p_start = 0, int p_end = -1) const;
+	// get_group_count() counts int-named groups; get_names() lists String-named ones.
+	bool is_valid() const;
+	String get_pattern() const;
+	int get_group_count() const;
+	Array get_names() const;
+
+	RegEx();
+	RegEx(const String& p_pattern);
+	~RegEx();
+
+};
+
+#endif // REGEX_H
+
diff --git a/drivers/nrex/regex.h b/modules/regex/register_types.cpp
similarity index 72%
rename from drivers/nrex/regex.h
rename to modules/regex/register_types.cpp
index 74495442c70..050cf3efff0 100644
--- a/drivers/nrex/regex.h
+++ b/modules/regex/register_types.cpp
@@ -1,5 +1,5 @@
/*************************************************************************/ -/* regex.h */ +/* register_types.cpp */ /*************************************************************************/ /* This file is part of: */ /* GODOT ENGINE */ @@ -26,40 +26,18 @@ /* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /*************************************************************************/ -#ifndef REGEX_H -#define REGEX_H -#include "ustring.h" -#include "vector.h" -#include "core/reference.h" -#include "nrex.hpp" +#include "register_types.h" +#include "object_type_db.h" +#include "regex.h" -class RegEx : public Reference { +void register_regex_types() { - OBJ_TYPE(RegEx, Reference); + ObjectTypeDB::register_type(); + ObjectTypeDB::register_type(); +} - mutable String text; - mutable Vector captures; - nrex exp; +void unregister_regex_types() { -protected: +} - static void _bind_methods(); - StringArray _bind_get_captures() const; - -public: - - void clear(); - bool is_valid() const; - int get_capture_count() const; - int get_capture_start(int capture) const; - String get_capture(int capture) const; - Error compile(const String& p_pattern, int capture = 9); - int find(const String& p_text, int p_start = 0, int p_end = -1) const; - - RegEx(); - RegEx(const String& p_pattern); - ~RegEx(); -}; - -#endif // REGEX_H diff --git a/modules/regex/register_types.h b/modules/regex/register_types.h new file mode 100644 index 00000000000..df3b508e144 --- /dev/null +++ b/modules/regex/register_types.h @@ -0,0 +1,31 @@ +/*************************************************************************/ +/* register_types.h */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* http://www.godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2016 Juan Linietsky, Ariel 
Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/*************************************************************************/ + +void register_regex_types(); +void unregister_regex_types();