// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html // // file: rbbirb.cpp // // Copyright (C) 2002-2011, International Business Machines Corporation and others. // All Rights Reserved. // // This file contains the RBBIRuleBuilder class implementation. This is the main class for // building (compiling) break rules into the tables required by the runtime // RBBI engine. // #include "unicode/utypes.h" #if !UCONFIG_NO_BREAK_ITERATION #include "unicode/brkiter.h" #include "unicode/rbbi.h" #include "unicode/ubrk.h" #include "unicode/unistr.h" #include "unicode/uniset.h" #include "unicode/uchar.h" #include "unicode/uchriter.h" #include "unicode/ustring.h" #include "unicode/parsepos.h" #include "unicode/parseerr.h" #include "cmemory.h" #include "cstring.h" #include "rbbirb.h" #include "rbbinode.h" #include "rbbiscan.h" #include "rbbisetb.h" #include "rbbitblb.h" #include "rbbidata.h" #include "uassert.h" U_NAMESPACE_BEGIN //---------------------------------------------------------------------------------------- // // Constructor. // //---------------------------------------------------------------------------------------- RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules, UParseError *parseErr, UErrorCode &status) : fRules(rules), fStrippedRules(rules) { fStatus = &status; // status is checked below fParseError = parseErr; fDebugEnv = nullptr; #ifdef RBBI_DEBUG fDebugEnv = getenv("U_RBBIDEBUG"); #endif fForwardTree = nullptr; fReverseTree = nullptr; fSafeFwdTree = nullptr; fSafeRevTree = nullptr; fDefaultTree = &fForwardTree; fForwardTable = nullptr; fRuleStatusVals = nullptr; fChainRules = false; fLookAheadHardBreak = false; fUSetNodes = nullptr; fRuleStatusVals = nullptr; fScanner = nullptr; fSetBuilder = nullptr; if (parseErr) { uprv_memset(parseErr, 0, sizeof(UParseError)); } if (U_FAILURE(status)) { return; } fUSetNodes = new UVector(status); // bcos status gets overwritten here fRuleStatusVals = new UVector(status); fScanner = new RBBIRuleScanner(this); fSetBuilder = new RBBISetBuilder(this); if (U_FAILURE(status)) { return; } if (fSetBuilder == nullptr || fScanner == nullptr || fUSetNodes == nullptr || fRuleStatusVals == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } } //---------------------------------------------------------------------------------------- // // Destructor // //---------------------------------------------------------------------------------------- RBBIRuleBuilder::~RBBIRuleBuilder() { int i; for (i=0; ; i++) { RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i); if (n==nullptr) { break; } delete n; } delete fUSetNodes; delete fSetBuilder; delete fForwardTable; delete fForwardTree; delete fReverseTree; delete fSafeFwdTree; delete fSafeRevTree; delete fScanner; delete fRuleStatusVals; } //---------------------------------------------------------------------------------------- // // flattenData() - Collect up the compiled RBBI rule data and put it into // the format for saving in ICU data files, // which is also the format needed by the RBBI runtime engine. // //---------------------------------------------------------------------------------------- static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;} RBBIDataHeader *RBBIRuleBuilder::flattenData() { int32_t i; if (U_FAILURE(*fStatus)) { return nullptr; } // Remove whitespace from the rules to make it smaller. // The rule parser has already removed comments. fStrippedRules = fScanner->stripRules(fStrippedRules); // Calculate the size of each section in the data. // Sizes here are padded up to a multiple of 8 for better memory alignment. // Sections sizes actually stored in the header are for the actual data // without the padding. // int32_t headerSize = align8(sizeof(RBBIDataHeader)); int32_t forwardTableSize = align8(fForwardTable->getTableSize()); int32_t reverseTableSize = align8(fForwardTable->getSafeTableSize()); int32_t trieSize = align8(fSetBuilder->getTrieSize()); int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t)); int32_t rulesLengthInUTF8 = 0; u_strToUTF8WithSub(nullptr, 0, &rulesLengthInUTF8, fStrippedRules.getBuffer(), fStrippedRules.length(), 0xfffd, nullptr, fStatus); *fStatus = U_ZERO_ERROR; int32_t rulesSize = align8((rulesLengthInUTF8+1)); int32_t totalSize = headerSize + forwardTableSize + reverseTableSize + statusTableSize + trieSize + rulesSize; #ifdef RBBI_DEBUG if (fDebugEnv && uprv_strstr(fDebugEnv, "size")) { RBBIDebugPrintf("Header Size: %8d\n", headerSize); RBBIDebugPrintf("Forward Table Size: %8d\n", forwardTableSize); RBBIDebugPrintf("Reverse Table Size: %8d\n", reverseTableSize); RBBIDebugPrintf("Trie Size: %8d\n", trieSize); RBBIDebugPrintf("Status Table Size: %8d\n", statusTableSize); RBBIDebugPrintf("Rules Size: %8d\n", rulesSize); RBBIDebugPrintf("-----------------------------\n"); RBBIDebugPrintf("Total Size: %8d\n", totalSize); } #endif RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize); if (data == nullptr) { *fStatus = U_MEMORY_ALLOCATION_ERROR; return nullptr; } uprv_memset(data, 0, totalSize); data->fMagic = 0xb1a0; data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0]; data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1]; data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2]; data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3]; data->fLength = totalSize; data->fCatCount = fSetBuilder->getNumCharCategories(); data->fFTable = headerSize; data->fFTableLen = forwardTableSize; data->fRTable = data->fFTable + data->fFTableLen; data->fRTableLen = reverseTableSize; data->fTrie = data->fRTable + data->fRTableLen; data->fTrieLen = trieSize; data->fStatusTable = data->fTrie + data->fTrieLen; data->fStatusTableLen= statusTableSize; data->fRuleSource = data->fStatusTable + statusTableSize; data->fRuleSourceLen = rulesLengthInUTF8; uprv_memset(data->fReserved, 0, sizeof(data->fReserved)); fForwardTable->exportTable((uint8_t *)data + data->fFTable); fForwardTable->exportSafeTable((uint8_t *)data + data->fRTable); fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie); int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable); for (i=0; isize(); i++) { ruleStatusTable[i] = fRuleStatusVals->elementAti(i); } u_strToUTF8WithSub((char *)data+data->fRuleSource, rulesSize, &rulesLengthInUTF8, fStrippedRules.getBuffer(), fStrippedRules.length(), 0xfffd, nullptr, fStatus); if (U_FAILURE(*fStatus)) { return nullptr; } return data; } //---------------------------------------------------------------------------------------- // // createRuleBasedBreakIterator construct from source rules that are passed in // in a UnicodeString // //---------------------------------------------------------------------------------------- BreakIterator * RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules, UParseError *parseError, UErrorCode &status) { // // Read the input rules, generate a parse tree, symbol table, // and list of all Unicode Sets referenced by the rules. // RBBIRuleBuilder builder(rules, parseError, status); if (U_FAILURE(status)) { // status checked here bcos build below doesn't return nullptr; } RBBIDataHeader *data = builder.build(status); if (U_FAILURE(status)) { return nullptr; } // // Create a break iterator from the compiled rules. // (Identical to creation from stored pre-compiled rules) // // status is checked after init in construction. RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status); if (U_FAILURE(status)) { delete This; This = nullptr; } else if(This == nullptr) { // test for nullptr status = U_MEMORY_ALLOCATION_ERROR; } return This; } RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) { if (U_FAILURE(status)) { return nullptr; } fScanner->parse(); if (U_FAILURE(status)) { return nullptr; } // // UnicodeSet processing. // Munge the Unicode Sets to create an initial set of character categories. // fSetBuilder->buildRanges(); // // Generate the DFA state transition table. // fForwardTable = new RBBITableBuilder(this, &fForwardTree, status); if (fForwardTable == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } fForwardTable->buildForwardTable(); // State table and character category optimization. // Merge equivalent rows and columns. // Note that this process alters the initial set of character categories, // causing the representation of UnicodeSets in the parse tree to become invalid. optimizeTables(); fForwardTable->buildSafeReverseTable(status); #ifdef RBBI_DEBUG if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) { fForwardTable->printStates(); fForwardTable->printRuleStatusTable(); fForwardTable->printReverseTable(); } #endif // Generate the mapping tables (TRIE) from input code points to // the character categories. // fSetBuilder->buildTrie(); // // Package up the compiled data into a memory image // in the run-time format. // RBBIDataHeader *data = flattenData(); // returns nullptr if error if (U_FAILURE(status)) { return nullptr; } return data; } void RBBIRuleBuilder::optimizeTables() { bool didSomething; do { didSomething = false; // Begin looking for duplicates with char class 3. // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively, // and should not have other categories merged into them. IntPair duplPair = {3, 0}; while (fForwardTable->findDuplCharClassFrom(&duplPair)) { fSetBuilder->mergeCategories(duplPair); fForwardTable->removeColumn(duplPair.second); didSomething = true; } while (fForwardTable->removeDuplicateStates() > 0) { didSomething = true; } } while (didSomething); } U_NAMESPACE_END #endif /* #if !UCONFIG_NO_BREAK_ITERATION */