505 lines
15 KiB
C++
505 lines
15 KiB
C++
// © 2016 and later: Unicode, Inc. and others.
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
/*
|
|
*******************************************************************************
|
|
* Copyright (C) 2010-2012, International Business Machines
|
|
* Corporation and others. All Rights Reserved.
|
|
*******************************************************************************
|
|
* file name: bytestriebuilder.cpp
|
|
* encoding: UTF-8
|
|
* tab size: 8 (not used)
|
|
* indentation:4
|
|
*
|
|
* created on: 2010sep25
|
|
* created by: Markus W. Scherer
|
|
*/
|
|
|
|
#include "unicode/utypes.h"
|
|
#include "unicode/bytestrie.h"
|
|
#include "unicode/bytestriebuilder.h"
|
|
#include "unicode/stringpiece.h"
|
|
#include "charstr.h"
|
|
#include "cmemory.h"
|
|
#include "uhash.h"
|
|
#include "uarrsort.h"
|
|
#include "uassert.h"
|
|
#include "ustr_imp.h"
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
/*
 * Note: This builder implementation stores (bytes, value) pairs with full copies
 * of the byte sequences, until the BytesTrie is built.
 * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
 */
|
|
|
|
class BytesTrieElement : public UMemory {
|
|
public:
|
|
// Use compiler's default constructor, initializes nothing.
|
|
|
|
void setTo(StringPiece s, int32_t val, CharString &strings, UErrorCode &errorCode);
|
|
|
|
StringPiece getString(const CharString &strings) const {
|
|
int32_t offset=stringOffset;
|
|
int32_t length;
|
|
if(offset>=0) {
|
|
length=(uint8_t)strings[offset++];
|
|
} else {
|
|
offset=~offset;
|
|
length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
|
|
offset+=2;
|
|
}
|
|
return StringPiece(strings.data()+offset, length);
|
|
}
|
|
int32_t getStringLength(const CharString &strings) const {
|
|
int32_t offset=stringOffset;
|
|
if(offset>=0) {
|
|
return (uint8_t)strings[offset];
|
|
} else {
|
|
offset=~offset;
|
|
return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
|
|
}
|
|
}
|
|
|
|
char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
|
|
|
|
int32_t getValue() const { return value; }
|
|
|
|
int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
|
|
|
|
private:
|
|
const char *data(const CharString &strings) const {
|
|
int32_t offset=stringOffset;
|
|
if(offset>=0) {
|
|
++offset;
|
|
} else {
|
|
offset=~offset+2;
|
|
}
|
|
return strings.data()+offset;
|
|
}
|
|
|
|
// If the stringOffset is non-negative, then the first strings byte contains
|
|
// the string length.
|
|
// If the stringOffset is negative, then the first two strings bytes contain
|
|
// the string length (big-endian), and the offset needs to be bit-inverted.
|
|
// (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
|
|
int32_t stringOffset;
|
|
int32_t value;
|
|
};
|
|
|
|
void
|
|
BytesTrieElement::setTo(StringPiece s, int32_t val,
|
|
CharString &strings, UErrorCode &errorCode) {
|
|
if(U_FAILURE(errorCode)) {
|
|
return;
|
|
}
|
|
int32_t length=s.length();
|
|
if(length>0xffff) {
|
|
// Too long: We store the length in 1 or 2 bytes.
|
|
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
|
return;
|
|
}
|
|
int32_t offset=strings.length();
|
|
if(length>0xff) {
|
|
offset=~offset;
|
|
strings.append((char)(length>>8), errorCode);
|
|
}
|
|
strings.append((char)length, errorCode);
|
|
stringOffset=offset;
|
|
value=val;
|
|
strings.append(s, errorCode);
|
|
}
|
|
|
|
/**
 * Compares this element's string with other's: first by the bytes of their
 * common-length prefix (uprv_memcmp), then by length.
 * Returns negative, zero or positive accordingly.
 */
int32_t
BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
    // TODO: add StringPiece::compare(), see ticket #8187
    const StringPiece lhs=getString(strings);
    const StringPiece rhs=other.getString(strings);
    const int32_t lengthDelta=lhs.length()-rhs.length();
    // Only the bytes that both strings have can be compared directly.
    const int32_t sharedLength= lengthDelta>0 ? rhs.length() : lhs.length();
    const int32_t byteOrder=uprv_memcmp(lhs.data(), rhs.data(), sharedLength);
    if(byteOrder!=0) {
        return byteOrder;
    }
    return lengthDelta;
}
|
|
|
|
/**
 * Creates an empty builder. Allocates the string storage unless errorCode
 * already indicates a failure; sets U_MEMORY_ALLOCATION_ERROR if that allocation fails.
 */
BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
        : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0),
          bytes(NULL), bytesCapacity(0), bytesLength(0) {
    if(U_SUCCESS(errorCode)) {
        strings=new CharString();
        if(strings==NULL) {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
        }
    }
}
|
|
|
|
/** Releases the serialized bytes, the element array and the string storage. */
BytesTrieBuilder::~BytesTrieBuilder() {
    uprv_free(bytes);
    delete[] elements;
    delete strings;
}
|
|
|
|
/**
 * Adds one (byte string, value) pair.
 * Sets U_NO_WRITE_PERMISSION if bytes have already been built (bytesLength>0),
 * and U_MEMORY_ALLOCATION_ERROR if the element array cannot grow.
 * Returns *this in all cases.
 */
BytesTrieBuilder &
BytesTrieBuilder::add(StringPiece s, int32_t value, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return *this;
    }
    if(bytesLength>0) {
        // Cannot add elements after building.
        errorCode=U_NO_WRITE_PERMISSION;
        return *this;
    }
    if(elementsLength==elementsCapacity) {
        // Full: start with 1024 elements, then quadruple.
        const int32_t grownCapacity= elementsCapacity==0 ? 1024 : 4*elementsCapacity;
        BytesTrieElement *grown=new BytesTrieElement[grownCapacity];
        if(grown==NULL) {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            return *this;  // error instead of dereferencing null
        }
        if(elementsLength>0) {
            uprv_memcpy(grown, elements, (size_t)elementsLength*sizeof(BytesTrieElement));
        }
        delete[] elements;
        elements=grown;
        elementsCapacity=grownCapacity;
    }
    BytesTrieElement &slot=elements[elementsLength];
    ++elementsLength;
    slot.setTo(s, value, *strings, errorCode);
    return *this;
}
|
|
|
|
U_CDECL_BEGIN
|
|
|
|
/**
 * Comparator callback for uprv_sortArray(): context is the CharString holding
 * the element strings, left and right point to BytesTrieElement objects.
 */
static int32_t U_CALLCONV
compareElementStrings(const void *context, const void *left, const void *right) {
    const BytesTrieElement &lhs=*static_cast<const BytesTrieElement *>(left);
    const BytesTrieElement &rhs=*static_cast<const BytesTrieElement *>(right);
    const CharString &storage=*static_cast<const CharString *>(context);
    return lhs.compareStringTo(rhs, storage);
}
|
|
|
|
U_CDECL_END
|
|
|
|
/**
 * Builds the byte-serialized trie and wraps it in a new BytesTrie.
 * On success the new trie takes over the byte array, so the builder drops its
 * pointer and capacity. Returns NULL on failure (errorCode is set).
 */
BytesTrie *
BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    buildBytes(buildOption, errorCode);
    if(U_FAILURE(errorCode)) {
        return NULL;
    }
    // The trie data occupies the last bytesLength bytes of the array.
    BytesTrie *trie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
    if(trie==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    bytes=NULL;  // The new trie now owns the array.
    bytesCapacity=0;
    return trie;
}
|
|
|
|
/**
 * Builds the byte-serialized trie and returns a view of it that points into
 * the builder's own byte array. Returns an empty StringPiece on failure.
 */
StringPiece
BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
    buildBytes(buildOption, errorCode);
    if(U_FAILURE(errorCode)) {
        return StringPiece();
    }
    // The trie data occupies the last bytesLength bytes of the array.
    return StringPiece(bytes+(bytesCapacity-bytesLength), bytesLength);
}
|
|
|
|
void
|
|
BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
|
|
if(U_FAILURE(errorCode)) {
|
|
return;
|
|
}
|
|
if(bytes!=NULL && bytesLength>0) {
|
|
// Already built.
|
|
return;
|
|
}
|
|
if(bytesLength==0) {
|
|
if(elementsLength==0) {
|
|
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
|
return;
|
|
}
|
|
uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
|
|
compareElementStrings, strings,
|
|
FALSE, // need not be a stable sort
|
|
&errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
return;
|
|
}
|
|
// Duplicate strings are not allowed.
|
|
StringPiece prev=elements[0].getString(*strings);
|
|
for(int32_t i=1; i<elementsLength; ++i) {
|
|
StringPiece current=elements[i].getString(*strings);
|
|
if(prev==current) {
|
|
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
|
return;
|
|
}
|
|
prev=current;
|
|
}
|
|
}
|
|
// Create and byte-serialize the trie for the elements.
|
|
bytesLength=0;
|
|
int32_t capacity=strings->length();
|
|
if(capacity<1024) {
|
|
capacity=1024;
|
|
}
|
|
if(bytesCapacity<capacity) {
|
|
uprv_free(bytes);
|
|
bytes=static_cast<char *>(uprv_malloc(capacity));
|
|
if(bytes==NULL) {
|
|
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
|
bytesCapacity=0;
|
|
return;
|
|
}
|
|
bytesCapacity=capacity;
|
|
}
|
|
StringTrieBuilder::build(buildOption, elementsLength, errorCode);
|
|
if(bytes==NULL) {
|
|
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
|
}
|
|
}
|
|
|
|
/**
 * Removes all elements and forgets any serialized bytes so that the builder
 * can be reused. Allocated capacities are kept. Returns *this.
 */
BytesTrieBuilder &
BytesTrieBuilder::clear() {
    bytesLength=0;
    elementsLength=0;
    strings->clear();
    return *this;
}
|
|
|
|
/** Returns the length in bytes of the i-th element's string. */
int32_t
BytesTrieBuilder::getElementStringLength(int32_t i) const {
    const BytesTrieElement &element=elements[i];
    return element.getStringLength(*strings);
}
|
|
|
|
/** Returns the byte at byteIndex of the i-th element's string, as an unsigned value. */
UChar
BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
    const char unit=elements[i].charAt(byteIndex, *strings);
    return static_cast<uint8_t>(unit);
}
|
|
|
|
/** Returns the value that was added together with the i-th element's string. */
int32_t
BytesTrieBuilder::getElementValue(int32_t i) const {
    const BytesTrieElement &element=elements[i];
    return element.getValue();
}
|
|
|
|
/**
 * Starting after byteIndex, returns the first index at which the strings of
 * elements first and last differ, or the length of the first element's string
 * if they agree up to there.
 */
int32_t
BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
    const BytesTrieElement &head=elements[first];
    const BytesTrieElement &tail=elements[last];
    const int32_t headLength=head.getStringLength(*strings);
    for(++byteIndex; byteIndex<headLength; ++byteIndex) {
        if(head.charAt(byteIndex, *strings)!=tail.charAt(byteIndex, *strings)) {
            break;
        }
    }
    return byteIndex;
}
|
|
|
|
/**
 * Counts the runs of equal bytes at byteIndex among elements [start, limit).
 * The element at start is always examined, so the result is at least 1.
 */
int32_t
BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
    int32_t runs=0;  // Number of different bytes at byteIndex.
    int32_t pos=start;
    do {
        const char unit=elements[pos].charAt(byteIndex, *strings);
        ++runs;
        // Skip the rest of the run that shares this byte.
        for(++pos; pos<limit; ++pos) {
            if(elements[pos].charAt(byteIndex, *strings)!=unit) {
                break;
            }
        }
    } while(pos<limit);
    return runs;
}
|
|
|
|
/**
 * Starting at element i, skips count runs of elements that share the same
 * byte at byteIndex and returns the index of the element after them.
 * There is no upper bound check; the caller must ensure that a differing
 * element follows each skipped run.
 */
int32_t
BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
    do {
        const char unit=elements[i].charAt(byteIndex, *strings);
        do {
            ++i;
        } while(elements[i].charAt(byteIndex, *strings)==unit);
    } while(--count>0);
    return i;
}
|
|
|
|
/**
 * Starting at element i, returns the index of the first element whose byte at
 * byteIndex differs from byte. There is no upper bound check.
 */
int32_t
BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const {
    const char target=static_cast<char>(byte);
    for(; elements[i].charAt(byteIndex, *strings)==target; ++i) {}
    return i;
}
|
|
|
|
/**
 * Creates a linear-match node for len bytes starting at bytes, followed by nextNode.
 * Mixes a hash of the bytes into the hash set up by the base class;
 * the arithmetic is done in uint32_t.
 */
BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
        : LinearMatchNode(len, nextNode), s(bytes) {
    const uint32_t baseHash=static_cast<uint32_t>(hash);
    const uint32_t bytesHash=static_cast<uint32_t>(ustr_hashCharsN(bytes, len));
    hash=static_cast<int32_t>(baseHash*37u+bytesHash);
}
|
|
|
|
/**
 * Two nodes are equal if they are the same object, or if the base-class
 * comparison succeeds and their matched bytes are identical.
 */
UBool
BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
    if(this!=&other) {
        if(!LinearMatchNode::operator==(other)) {
            return FALSE;
        }
        // Cast only after the base-class comparison has accepted other.
        const BTLinearMatchNode &that=(const BTLinearMatchNode &)other;
        return uprv_memcmp(s, that.s, length)==0;
    }
    return TRUE;
}
|
|
|
|
void
|
|
BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
|
|
BytesTrieBuilder &b=(BytesTrieBuilder &)builder;
|
|
next->write(builder);
|
|
b.write(s, length);
|
|
offset=b.write(b.getMinLinearMatch()+length-1);
|
|
}
|
|
|
|
/**
 * Creates a linear-match node for length bytes of the i-th element's string,
 * starting at byteIndex, followed by nextNode.
 * The node points into the builder's string storage rather than copying the bytes.
 */
StringTrieBuilder::Node *
BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
                                        Node *nextNode) const {
    const StringPiece elementBytes=elements[i].getString(*strings);
    const char *matchStart=elementBytes.data()+byteIndex;
    return new BTLinearMatchNode(matchStart, length, nextNode);
}
|
|
|
|
/**
 * Makes sure that the bytes array can hold length bytes.
 * Grows by repeated doubling until the capacity exceeds length; the existing
 * bytesLength bytes are kept at the end of the new array.
 * Returns FALSE if there is no array (an earlier allocation failed) or if
 * growing fails, in which case the old array is freed and bytes becomes NULL.
 */
UBool
BytesTrieBuilder::ensureCapacity(int32_t length) {
    if(bytes==NULL) {
        return FALSE;  // previous memory allocation had failed
    }
    if(length<=bytesCapacity) {
        return TRUE;
    }
    int32_t grownCapacity=bytesCapacity*2;
    while(grownCapacity<=length) {
        grownCapacity*=2;
    }
    char *grown=static_cast<char *>(uprv_malloc(grownCapacity));
    if(grown==NULL) {
        // unable to allocate memory
        uprv_free(bytes);
        bytes=NULL;
        bytesCapacity=0;
        return FALSE;
    }
    // The data is stored at the end of the array, so copy end-aligned.
    uprv_memcpy(grown+(grownCapacity-bytesLength),
                bytes+(bytesCapacity-bytesLength), bytesLength);
    uprv_free(bytes);
    bytes=grown;
    bytesCapacity=grownCapacity;
    return TRUE;
}
|
|
|
|
/**
 * Prepends one byte to the serialized data, which grows from the end of the
 * array toward its start. Returns the resulting bytesLength; if capacity
 * cannot be ensured, nothing is written and the unchanged length is returned.
 */
int32_t
BytesTrieBuilder::write(int32_t byte) {
    const int32_t grownLength=bytesLength+1;
    if(!ensureCapacity(grownLength)) {
        return bytesLength;
    }
    bytesLength=grownLength;
    bytes[bytesCapacity-bytesLength]=static_cast<char>(byte);
    return bytesLength;
}
|
|
|
|
/**
 * Prepends length bytes from b to the serialized data, which grows from the
 * end of the array toward its start. Returns the resulting bytesLength; if
 * capacity cannot be ensured, nothing is written and the unchanged length is returned.
 */
int32_t
BytesTrieBuilder::write(const char *b, int32_t length) {
    const int32_t grownLength=bytesLength+length;
    if(!ensureCapacity(grownLength)) {
        return bytesLength;
    }
    bytesLength=grownLength;
    uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
    return bytesLength;
}
|
|
|
|
/**
 * Writes length bytes of the i-th element's string, starting at byteIndex.
 * Returns the resulting bytesLength.
 */
int32_t
BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
    const StringPiece elementBytes=elements[i].getString(*strings);
    return write(elementBytes.data()+byteIndex, length);
}
|
|
|
|
/**
 * Writes the value i in 1 to 5 bytes, with the isFinal flag in the low bit of
 * the lead byte (the lead byte is shifted left by one to make room).
 * Negative values and values above 0xffffff use the five-byte form.
 * Returns the resulting bytesLength.
 */
int32_t
BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
    if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
        return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
    }
    char encoded[5];
    int32_t count;
    if(i<0 || i>0xffffff) {
        // Lead byte plus all four value bytes, big-endian.
        encoded[0]=(char)BytesTrie::kFiveByteValueLead;
        encoded[1]=(char)((uint32_t)i>>24);
        encoded[2]=(char)((uint32_t)i>>16);
        encoded[3]=(char)((uint32_t)i>>8);
        encoded[4]=(char)i;
        count=5;
    } else if(i<=BytesTrie::kMaxTwoByteValue) {
        // The lead byte carries the upper value bits.
        encoded[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8));
        encoded[1]=(char)i;
        count=2;
    } else if(i<=BytesTrie::kMaxThreeByteValue) {
        // The lead byte carries the upper value bits.
        encoded[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16));
        encoded[1]=(char)(i>>8);
        encoded[2]=(char)i;
        count=3;
    } else {
        // Fixed lead byte plus three value bytes.
        encoded[0]=(char)BytesTrie::kFourByteValueLead;
        encoded[1]=(char)(i>>16);
        encoded[2]=(char)(i>>8);
        encoded[3]=(char)i;
        count=4;
    }
    encoded[0]=(char)((encoded[0]<<1)|isFinal);
    return write(encoded, count);
}
|
|
|
|
/**
 * Writes the node byte and then, if hasValue, the non-final value.
 * Returns the bytesLength after the last write.
 */
int32_t
BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
    const int32_t nodeOffset=write(node);
    if(!hasValue) {
        return nodeOffset;
    }
    return writeValueAndFinal(value, FALSE);
}
|
|
|
|
/**
 * Writes the distance from the current position to jumpTarget as a jump delta
 * in 1 to 5 bytes, and returns the resulting bytesLength.
 *
 * Layout by size of the delta i:
 *   one byte:    i
 *   two bytes:   lead+(i>>8), i
 *   three bytes: lead+(i>>16), i>>8, i
 *   four bytes:  kFourByteDeltaLead, i>>16, i>>8, i
 *   five bytes:  kFiveByteDeltaLead, i>>24, i>>16, i>>8, i
 *
 * @param jumpTarget a bytesLength value returned by an earlier write;
 *                   must not be greater than the current bytesLength
 */
int32_t
BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
    int32_t i=bytesLength-jumpTarget;
    U_ASSERT(i>=0);
    if(i<=BytesTrie::kMaxOneByteDelta) {
        return write(i);
    }
    char intBytes[5];
    // length is the index at which the final (least significant) byte will be stored.
    int32_t length;
    if(i<=BytesTrie::kMaxTwoByteDelta) {
        intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8));
        length=1;
    } else {
        if(i<=BytesTrie::kMaxThreeByteDelta) {
            intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16));
            length=2;
        } else {
            if(i<=0xffffff) {
                intBytes[0]=(char)BytesTrie::kFourByteDeltaLead;
                length=3;
            } else {
                intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead;
                intBytes[1]=(char)(i>>24);
                length=4;
            }
            // Index relative to length: a fixed index 1 would overwrite the
            // i>>24 byte and leave later bytes unset.
            intBytes[length-2]=(char)(i>>16);
        }
        // Same here: this byte goes right before the final byte.
        intBytes[length-1]=(char)(i>>8);
    }
    intBytes[length++]=(char)i;
    return write(intBytes, length);
}
|
|
|
|
U_NAMESPACE_END
|