Updated nrex to v0.2

* Fixed capturing groups matching to invalid results
 * Fixed parents of recursive quantifiers not expanding properly
 * Fixed LookAhead sometimes adding to result
This commit is contained in:
Zher Huei Lee 2016-04-08 13:29:37 +01:00
parent d454e64f42
commit 6207d56b95
3 changed files with 113 additions and 43 deletions

View File

@ -1,6 +1,8 @@
# NREX: Node RegEx # NREX: Node RegEx
Version 0.1 [![Build Status](https://travis-ci.org/leezh/nrex.svg?branch=master)](https://travis-ci.org/leezh/nrex)
** Version 0.2 **
Small node-based regular expression library. It only does text pattern Small node-based regular expression library. It only does text pattern
matching, not replacement. To use add the files `nrex.hpp`, `nrex.cpp` matching, not replacement. To use add the files `nrex.hpp`, `nrex.cpp`
@ -38,7 +40,7 @@ Currently supported features:
## License ## License
Copyright (c) 2015, Zher Huei Lee Copyright (c) 2015-2016, Zher Huei Lee
All rights reserved. All rights reserved.
This software is provided 'as-is', without any express or implied This software is provided 'as-is', without any express or implied
@ -59,3 +61,15 @@ freely, subject to the following restrictions:
3. This notice may not be removed or altered from any source 3. This notice may not be removed or altered from any source
distribution. distribution.
# Changes
## Version 0.2 (2016-08-04)
* Fixed capturing groups matching to invalid results
* Fixed parents of recursive quantifiers not expanding properly
* Fixed LookAhead sometimes adding to result
* More verbose unit testing
## Version 0.1 (2015-12-04)
* Initial release

View File

@ -1,7 +1,7 @@
// NREX: Node RegEx // NREX: Node RegEx
// Version 0.1 // Version 0.2
// //
// Copyright (c) 2015, Zher Huei Lee // Copyright (c) 2015-2016, Zher Huei Lee
// All rights reserved. // All rights reserved.
// //
// This software is provided 'as-is', without any express or implied // This software is provided 'as-is', without any express or implied
@ -68,6 +68,13 @@ class nrex_array
{ {
} }
// Constructs an empty array whose backing storage is allocated up front.
// `size` is the number of T elements requested from NREX_NEW_ARRAY and is
// recorded in _reserved; _size starts at 0, so no elements are stored yet.
// NOTE(review): presumably _reserved is the capacity that push()/reserve()
// compare against before growing -- confirm against the rest of the class.
nrex_array(unsigned int size)
: _data(NREX_NEW_ARRAY(T, size))
, _reserved(size)
, _size(0)
{
}
~nrex_array() ~nrex_array()
{ {
NREX_DELETE_ARRAY(_data); NREX_DELETE_ARRAY(_data);
@ -100,7 +107,7 @@ class nrex_array
_size++; _size++;
} }
T& top() const T& top() const
{ {
return _data[_size - 1]; return _data[_size - 1];
} }
@ -189,17 +196,19 @@ struct nrex_search
nrex_result* captures; nrex_result* captures;
int end; int end;
bool complete; bool complete;
nrex_array<int> lookahead_pos;
nrex_char at(int pos) nrex_char at(int pos)
{ {
return str[pos]; return str[pos];
} }
nrex_search(const nrex_char* str, nrex_result* captures) nrex_search(const nrex_char* str, nrex_result* captures, int lookahead)
: str(str) : str(str)
, captures(captures) , captures(captures)
, end(0) , end(0)
{ {
lookahead_pos.reserve(lookahead);
} }
}; };
@ -239,13 +248,17 @@ struct nrex_node
{ {
pos = next->test(s, pos); pos = next->test(s, pos);
} }
if (pos >= 0)
{
s->complete = true;
}
if (parent && pos >= 0) if (parent && pos >= 0)
{ {
pos = parent->test_parent(s, pos); pos = parent->test_parent(s, pos);
} }
if (pos >= 0) if (pos < 0)
{ {
s->complete = true; s->complete = false;
} }
return pos; return pos;
} }
@ -274,25 +287,31 @@ struct nrex_node
} }
}; };
// Kind of group represented by a nrex_node_group. Replaces the former
// negative "mode" constants so that the group's numeric id can be stored
// separately from its kind.
enum nrex_group_type
{
// `(`...`)` group whose match is written to the captures array at the
// group's id (start on entry, length once the group has matched).
nrex_group_capture,
// `(?:`...`)`, or a plain `(` opened after the capture limit was reached;
// groups its children without writing to the captures array.
nrex_group_non_capture,
// Group created by the compiler when it meets `[`.
nrex_group_bracket,
// `(?=`...`)` / `(?!`...`)`; the group's id is its look-ahead nesting
// level and indexes the saved position in nrex_search::lookahead_pos.
nrex_group_look_ahead,
// `(?<=`...`)` / `(?<!`...`)`; tested starting `length` characters
// before the current position.
// No trailing comma after the last enumerator: it is ill-formed before
// C++11 and the rest of the library avoids C++11-only syntax.
nrex_group_look_behind
};
struct nrex_node_group : public nrex_node struct nrex_node_group : public nrex_node
{ {
static const int NonCapture = -1; nrex_group_type type;
static const int Bracket = -2; int id;
static const int LookAhead = -3;
static const int LookBehind = -4;
int mode;
bool negate; bool negate;
nrex_array<nrex_node*> childset; nrex_array<nrex_node*> childset;
nrex_node* back; nrex_node* back;
nrex_node_group(int mode) nrex_node_group(nrex_group_type type, int id = 0)
: nrex_node(true) : nrex_node(true)
, mode(mode) , type(type)
, id(id)
, negate(false) , negate(false)
, back(NULL) , back(NULL)
{ {
if (mode != Bracket) if (type != nrex_group_bracket)
{ {
length = 0; length = 0;
} }
@ -300,7 +319,7 @@ struct nrex_node_group : public nrex_node
{ {
length = 1; length = 1;
} }
if (mode == LookAhead || mode == LookBehind) if (type == nrex_group_look_ahead || type == nrex_group_look_behind)
{ {
quantifiable = false; quantifiable = false;
} }
@ -317,15 +336,17 @@ struct nrex_node_group : public nrex_node
int test(nrex_search* s, int pos) const int test(nrex_search* s, int pos) const
{ {
if (mode >= 0) int old_start;
if (type == nrex_group_capture)
{ {
s->captures[mode].start = pos; old_start = s->captures[id].start;
s->captures[id].start = pos;
} }
for (unsigned int i = 0; i < childset.size(); ++i) for (unsigned int i = 0; i < childset.size(); ++i)
{ {
s->complete = false; s->complete = false;
int offset = 0; int offset = 0;
if (mode == LookBehind) if (type == nrex_group_look_behind)
{ {
if (pos < length) if (pos < length)
{ {
@ -333,7 +354,15 @@ struct nrex_node_group : public nrex_node
} }
offset = length; offset = length;
} }
if (type == nrex_group_look_ahead)
{
s->lookahead_pos.push(pos);
}
int res = childset[i]->test(s, pos - offset); int res = childset[i]->test(s, pos - offset);
if (type == nrex_group_look_ahead)
{
s->lookahead_pos.pop();
}
if (s->complete) if (s->complete)
{ {
return res; return res;
@ -355,32 +384,40 @@ struct nrex_node_group : public nrex_node
} }
if (res >= 0) if (res >= 0)
{ {
if (mode >= 0) if (type == nrex_group_capture)
{ {
s->captures[mode].length = res - pos; s->captures[id].length = res - pos;
} }
else if (mode == LookAhead || mode == LookBehind) else if (type == nrex_group_look_ahead || type == nrex_group_look_behind)
{ {
res = pos; res = pos;
} }
return next ? next->test(s, res) : res; return next ? next->test(s, res) : res;
} }
} }
if (type == nrex_group_capture)
{
s->captures[id].start = old_start;
}
return -1; return -1;
} }
virtual int test_parent(nrex_search* s, int pos) const virtual int test_parent(nrex_search* s, int pos) const
{ {
if (mode >= 0) if (type == nrex_group_capture)
{ {
s->captures[mode].length = pos - s->captures[mode].start; s->captures[id].length = pos - s->captures[id].start;
}
if (type == nrex_group_look_ahead)
{
pos = s->lookahead_pos[id];
} }
return nrex_node::test_parent(s, pos); return nrex_node::test_parent(s, pos);
} }
void add_childset() void add_childset()
{ {
if (childset.size() > 0 && mode != Bracket) if (childset.size() > 0 && type != nrex_group_bracket)
{ {
length = -1; length = -1;
} }
@ -391,7 +428,7 @@ struct nrex_node_group : public nrex_node
{ {
node->parent = this; node->parent = this;
node->previous = back; node->previous = back;
if (back && mode != Bracket) if (back && type != nrex_group_bracket)
{ {
back->next = node; back->next = node;
} }
@ -399,7 +436,7 @@ struct nrex_node_group : public nrex_node
{ {
childset.push(node); childset.push(node);
} }
if (mode != Bracket) if (type != nrex_group_bracket)
{ {
increment_length(node->length); increment_length(node->length);
} }
@ -418,7 +455,7 @@ struct nrex_node_group : public nrex_node
{ {
childset.pop(); childset.pop();
} }
if (mode != Bracket) if (type != nrex_group_bracket)
{ {
increment_length(old->length, true); increment_length(old->length, true);
} }
@ -436,7 +473,7 @@ struct nrex_node_group : public nrex_node
{ {
childset.pop(); childset.pop();
} }
if (mode != Bracket) if (type != nrex_group_bracket)
{ {
increment_length(old->length, true); increment_length(old->length, true);
} }
@ -887,6 +924,12 @@ struct nrex_node_quantifier : public nrex_node
} }
return -1; return -1;
} }
// Parent-chain hook for a quantifier node. It does not forward to
// nrex_node::test_parent; it clears the search's `complete` flag and
// returns `pos` unchanged, so the walk up the parent chain ends here.
// NOTE(review): presumably this lets the quantifier's own test() loop keep
// control of repetition instead of a child marking the search complete --
// confirm against nrex_node_quantifier::test().
virtual int test_parent(nrex_search* s, int pos) const
{
s->complete = false;
return pos;
}
}; };
struct nrex_node_anchor : public nrex_node struct nrex_node_anchor : public nrex_node
@ -986,7 +1029,7 @@ bool nrex_has_lookbehind(nrex_array<nrex_node_group*>& stack)
{ {
for (unsigned int i = 0; i < stack.size(); i++) for (unsigned int i = 0; i < stack.size(); i++)
{ {
if (stack[i]->mode == nrex_node_group::LookBehind) if (stack[i]->type == nrex_group_look_behind)
{ {
return true; return true;
} }
@ -996,12 +1039,14 @@ bool nrex_has_lookbehind(nrex_array<nrex_node_group*>& stack)
nrex::nrex() nrex::nrex()
: _capturing(0) : _capturing(0)
, _lookahead_depth(0)
, _root(NULL) , _root(NULL)
{ {
} }
nrex::nrex(const nrex_char* pattern, int captures) nrex::nrex(const nrex_char* pattern, int captures)
: _capturing(0) : _capturing(0)
, _lookahead_depth(0)
, _root(NULL) , _root(NULL)
{ {
compile(pattern, captures); compile(pattern, captures);
@ -1023,6 +1068,7 @@ bool nrex::valid() const
void nrex::reset() void nrex::reset()
{ {
_capturing = 0; _capturing = 0;
_lookahead_depth = 0;
if (_root) if (_root)
{ {
NREX_DELETE(_root); NREX_DELETE(_root);
@ -1042,9 +1088,10 @@ int nrex::capture_size() const
bool nrex::compile(const nrex_char* pattern, int captures) bool nrex::compile(const nrex_char* pattern, int captures)
{ {
reset(); reset();
nrex_node_group* root = NREX_NEW(nrex_node_group(_capturing)); nrex_node_group* root = NREX_NEW(nrex_node_group(nrex_group_capture, _capturing));
nrex_array<nrex_node_group*> stack; nrex_array<nrex_node_group*> stack;
stack.push(root); stack.push(root);
unsigned int lookahead_level = 0;
_root = root; _root = root;
for (const nrex_char* c = pattern; c[0] != '\0'; ++c) for (const nrex_char* c = pattern; c[0] != '\0'; ++c)
@ -1056,22 +1103,26 @@ bool nrex::compile(const nrex_char* pattern, int captures)
if (c[2] == ':') if (c[2] == ':')
{ {
c = &c[2]; c = &c[2];
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::NonCapture)); nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_non_capture));
stack.top()->add_child(group); stack.top()->add_child(group);
stack.push(group); stack.push(group);
} }
else if (c[2] == '!' || c[2] == '=') else if (c[2] == '!' || c[2] == '=')
{ {
c = &c[2]; c = &c[2];
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::LookAhead)); nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_look_ahead, lookahead_level++));
group->negate = (c[0] == '!'); group->negate = (c[0] == '!');
stack.top()->add_child(group); stack.top()->add_child(group);
stack.push(group); stack.push(group);
if (lookahead_level > _lookahead_depth)
{
_lookahead_depth = lookahead_level;
}
} }
else if (c[2] == '<' && (c[3] == '!' || c[3] == '=')) else if (c[2] == '<' && (c[3] == '!' || c[3] == '='))
{ {
c = &c[3]; c = &c[3];
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::LookBehind)); nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_look_behind));
group->negate = (c[0] == '!'); group->negate = (c[0] == '!');
stack.top()->add_child(group); stack.top()->add_child(group);
stack.push(group); stack.push(group);
@ -1083,13 +1134,13 @@ bool nrex::compile(const nrex_char* pattern, int captures)
} }
else if (captures >= 0 && _capturing < captures) else if (captures >= 0 && _capturing < captures)
{ {
nrex_node_group* group = NREX_NEW(nrex_node_group(++_capturing)); nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_capture, ++_capturing));
stack.top()->add_child(group); stack.top()->add_child(group);
stack.push(group); stack.push(group);
} }
else else
{ {
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::NonCapture)); nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_non_capture));
stack.top()->add_child(group); stack.top()->add_child(group);
stack.push(group); stack.push(group);
} }
@ -1098,6 +1149,10 @@ bool nrex::compile(const nrex_char* pattern, int captures)
{ {
if (stack.size() > 1) if (stack.size() > 1)
{ {
if (stack.top()->type == nrex_group_look_ahead)
{
--lookahead_level;
}
stack.pop(); stack.pop();
} }
else else
@ -1107,7 +1162,7 @@ bool nrex::compile(const nrex_char* pattern, int captures)
} }
else if (c[0] == '[') else if (c[0] == '[')
{ {
nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::Bracket)); nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_group_bracket));
stack.top()->add_child(group); stack.top()->add_child(group);
if (c[1] == '^') if (c[1] == '^')
{ {
@ -1410,7 +1465,7 @@ bool nrex::match(const nrex_char* str, nrex_result* captures, int offset, int en
{ {
return false; return false;
} }
nrex_search s(str, captures); nrex_search s(str, captures, _lookahead_depth);
if (end >= offset) if (end >= offset)
{ {
s.end = end; s.end = end;

View File

@ -1,7 +1,7 @@
// NREX: Node RegEx // NREX: Node RegEx
// Version 0.1 // Version 0.2
// //
// Copyright (c) 2015, Zher Huei Lee // Copyright (c) 2015-2016, Zher Huei Lee
// All rights reserved. // All rights reserved.
// //
// This software is provided 'as-is', without any express or implied // This software is provided 'as-is', without any express or implied
@ -57,7 +57,8 @@ class nrex_node;
class nrex class nrex
{ {
private: private:
int _capturing; unsigned int _capturing;
unsigned int _lookahead_depth;
nrex_node* _root; nrex_node* _root;
public: public: