168 lines
7.3 KiB
C++
168 lines
7.3 KiB
C++
|
// © 2016 and later: Unicode, Inc. and others.
|
||
|
// License & terms of use: http://www.unicode.org/copyright.html
|
||
|
//
|
||
|
// rbbiscan.h
|
||
|
//
|
||
|
// Copyright (C) 2002-2016, International Business Machines Corporation and others.
|
||
|
// All Rights Reserved.
|
||
|
//
|
||
|
// This file contains declarations for class RBBIRuleScanner
|
||
|
//
|
||
|
|
||
|
|
||
|
#ifndef RBBISCAN_H
|
||
|
#define RBBISCAN_H
|
||
|
|
||
|
#include "unicode/utypes.h"
|
||
|
#include "unicode/uobject.h"
|
||
|
#include "unicode/rbbi.h"
|
||
|
#include "unicode/uniset.h"
|
||
|
#include "unicode/parseerr.h"
|
||
|
#include "uhash.h"
|
||
|
#include "uvector.h"
|
||
|
#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
|
||
|
// looks up references to $variables within a set.
|
||
|
#include "rbbinode.h"
|
||
|
#include "rbbirpt.h"
|
||
|
|
||
|
U_NAMESPACE_BEGIN
|
||
|
|
||
|
class RBBIRuleBuilder;
|
||
|
class RBBISymbolTable;
|
||
|
|
||
|
|
||
|
//--------------------------------------------------------------------------------
|
||
|
//
|
||
|
// class RBBIRuleScanner does the lowest level, character-at-a-time
|
||
|
// scanning of break iterator rules.
|
||
|
//
|
||
|
// The output of the scanner is parse trees for
|
||
|
// the rule expressions and a list of all Unicode Sets
|
||
|
// encountered.
|
||
|
//
|
||
|
//--------------------------------------------------------------------------------
|
||
|
|
||
|
class RBBIRuleScanner : public UMemory {
|
||
|
public:
|
||
|
|
||
|
enum {
|
||
|
kStackSize = 100 // The size of the state stack for
|
||
|
}; // rules parsing. Corresponds roughly
|
||
|
// to the depth of parentheses nesting
|
||
|
// that is allowed in the rules.
|
||
|
|
||
|
struct RBBIRuleChar {
|
||
|
UChar32 fChar;
|
||
|
UBool fEscaped;
|
||
|
RBBIRuleChar() : fChar(0), fEscaped(false) {}
|
||
|
};
|
||
|
|
||
|
RBBIRuleScanner(RBBIRuleBuilder *rb);
|
||
|
|
||
|
|
||
|
virtual ~RBBIRuleScanner();
|
||
|
|
||
|
void nextChar(RBBIRuleChar &c); // Get the next char from the input stream.
|
||
|
// Return false if at end.
|
||
|
|
||
|
UBool push(const RBBIRuleChar &c); // Push (unget) one character.
|
||
|
// Only a single character may be pushed.
|
||
|
|
||
|
void parse(); // Parse the rules, generating two parse
|
||
|
// trees, one each for the forward and
|
||
|
// reverse rules,
|
||
|
// and a list of UnicodeSets encountered.
|
||
|
|
||
|
int32_t numRules(); // Return the number of rules that have been seen.
|
||
|
|
||
|
/**
|
||
|
* Return a rules string without unnecessary
|
||
|
* characters.
|
||
|
*/
|
||
|
static UnicodeString stripRules(const UnicodeString &rules);
|
||
|
private:
|
||
|
|
||
|
UBool doParseActions(int32_t a);
|
||
|
void error(UErrorCode e); // error reporting convenience function.
|
||
|
void fixOpStack(RBBINode::OpPrecedence p);
|
||
|
// a character.
|
||
|
void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL);
|
||
|
|
||
|
UChar32 nextCharLL();
|
||
|
#ifdef RBBI_DEBUG
|
||
|
void printNodeStack(const char *title);
|
||
|
#endif
|
||
|
RBBINode *pushNewNode(RBBINode::NodeType t);
|
||
|
void scanSet();
|
||
|
|
||
|
|
||
|
RBBIRuleBuilder *fRB; // The rule builder that we are part of.
|
||
|
|
||
|
int32_t fScanIndex; // Index of current character being processed
|
||
|
// in the rule input string.
|
||
|
int32_t fNextIndex; // Index of the next character, which
|
||
|
// is the first character not yet scanned.
|
||
|
UBool fQuoteMode; // Scan is in a 'quoted region'
|
||
|
int32_t fLineNum; // Line number in input file.
|
||
|
int32_t fCharNum; // Char position within the line.
|
||
|
UChar32 fLastChar; // Previous char, needed to count CR-LF
|
||
|
// as a single line, not two.
|
||
|
|
||
|
RBBIRuleChar fC; // Current char for parse state machine
|
||
|
// processing.
|
||
|
UnicodeString fVarName; // $variableName, valid when we've just
|
||
|
// scanned one.
|
||
|
|
||
|
RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule
|
||
|
// parsing. index by p[state][char-class]
|
||
|
|
||
|
uint16_t fStack[kStackSize]; // State stack, holds state pushes
|
||
|
int32_t fStackPtr; // and pops as specified in the state
|
||
|
// transition rules.
|
||
|
|
||
|
RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created
|
||
|
// during the parse of a rule
|
||
|
int32_t fNodeStackPtr;
|
||
|
|
||
|
|
||
|
UBool fReverseRule; // True if the rule currently being scanned
|
||
|
// is a reverse direction rule (if it
|
||
|
// starts with a '!')
|
||
|
|
||
|
UBool fLookAheadRule; // True if the rule includes a '/'
|
||
|
// somewhere within it.
|
||
|
|
||
|
UBool fNoChainInRule; // True if the current rule starts with a '^'.
|
||
|
|
||
|
RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of
|
||
|
// $variable symbols.
|
||
|
|
||
|
UHashtable *fSetTable; // UnicocodeSet hash table, holds indexes to
|
||
|
// the sets created while parsing rules.
|
||
|
// The key is the string used for creating
|
||
|
// the set.
|
||
|
|
||
|
UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during
|
||
|
// the scanning of RBBI rules. The
|
||
|
// indicies for these are assigned by the
|
||
|
// perl script that builds the state tables.
|
||
|
// See rbbirpt.h.
|
||
|
|
||
|
int32_t fRuleNum; // Counts each rule as it is scanned.
|
||
|
|
||
|
int32_t fOptionStart; // Input index of start of a !!option
|
||
|
// keyword, while being scanned.
|
||
|
|
||
|
UnicodeSet *gRuleSet_rule_char;
|
||
|
UnicodeSet *gRuleSet_white_space;
|
||
|
UnicodeSet *gRuleSet_name_char;
|
||
|
UnicodeSet *gRuleSet_name_start_char;
|
||
|
|
||
|
RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class
|
||
|
RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class
|
||
|
};
|
||
|
|
||
|
U_NAMESPACE_END
|
||
|
|
||
|
#endif
|