406 lines
15 KiB
C++
406 lines
15 KiB
C++
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* norm2allmodes.h
*
* created on: 2014sep07
* created by: Markus W. Scherer
*/
|
|
|
|
#ifndef __NORM2ALLMODES_H__
|
|
#define __NORM2ALLMODES_H__
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#if !UCONFIG_NO_NORMALIZATION
|
|
|
|
#include "unicode/edits.h"
|
|
#include "unicode/normalizer2.h"
|
|
#include "unicode/stringoptions.h"
|
|
#include "unicode/unistr.h"
|
|
#include "cpputils.h"
|
|
#include "normalizer2impl.h"
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
// Intermediate class:
|
|
// Has Normalizer2Impl and does boilerplate argument checking and setup.
|
|
class Normalizer2WithImpl : public Normalizer2 {
|
|
public:
|
|
Normalizer2WithImpl(const Normalizer2Impl &ni) : impl(ni) {}
|
|
virtual ~Normalizer2WithImpl();
|
|
|
|
// normalize
|
|
virtual UnicodeString &
|
|
normalize(const UnicodeString &src,
|
|
UnicodeString &dest,
|
|
UErrorCode &errorCode) const override {
|
|
if(U_FAILURE(errorCode)) {
|
|
dest.setToBogus();
|
|
return dest;
|
|
}
|
|
const char16_t *sArray=src.getBuffer();
|
|
if(&dest==&src || sArray==nullptr) {
|
|
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
|
dest.setToBogus();
|
|
return dest;
|
|
}
|
|
dest.remove();
|
|
ReorderingBuffer buffer(impl, dest);
|
|
if(buffer.init(src.length(), errorCode)) {
|
|
normalize(sArray, sArray+src.length(), buffer, errorCode);
|
|
}
|
|
return dest;
|
|
}
|
|
virtual void
|
|
normalize(const char16_t *src, const char16_t *limit,
|
|
ReorderingBuffer &buffer, UErrorCode &errorCode) const = 0;
|
|
|
|
// normalize and append
|
|
virtual UnicodeString &
|
|
normalizeSecondAndAppend(UnicodeString &first,
|
|
const UnicodeString &second,
|
|
UErrorCode &errorCode) const override {
|
|
return normalizeSecondAndAppend(first, second, true, errorCode);
|
|
}
|
|
virtual UnicodeString &
|
|
append(UnicodeString &first,
|
|
const UnicodeString &second,
|
|
UErrorCode &errorCode) const override {
|
|
return normalizeSecondAndAppend(first, second, false, errorCode);
|
|
}
|
|
UnicodeString &
|
|
normalizeSecondAndAppend(UnicodeString &first,
|
|
const UnicodeString &second,
|
|
UBool doNormalize,
|
|
UErrorCode &errorCode) const {
|
|
uprv_checkCanGetBuffer(first, errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
return first;
|
|
}
|
|
const char16_t *secondArray=second.getBuffer();
|
|
if(&first==&second || secondArray==nullptr) {
|
|
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
|
return first;
|
|
}
|
|
int32_t firstLength=first.length();
|
|
UnicodeString safeMiddle;
|
|
{
|
|
ReorderingBuffer buffer(impl, first);
|
|
if(buffer.init(firstLength+second.length(), errorCode)) {
|
|
normalizeAndAppend(secondArray, secondArray+second.length(), doNormalize,
|
|
safeMiddle, buffer, errorCode);
|
|
}
|
|
} // The ReorderingBuffer destructor finalizes the first string.
|
|
if(U_FAILURE(errorCode)) {
|
|
// Restore the modified suffix of the first string.
|
|
first.replace(firstLength-safeMiddle.length(), 0x7fffffff, safeMiddle);
|
|
}
|
|
return first;
|
|
}
|
|
virtual void
|
|
normalizeAndAppend(const char16_t *src, const char16_t *limit, UBool doNormalize,
|
|
UnicodeString &safeMiddle,
|
|
ReorderingBuffer &buffer, UErrorCode &errorCode) const = 0;
|
|
virtual UBool
|
|
getDecomposition(UChar32 c, UnicodeString &decomposition) const override {
|
|
char16_t buffer[4];
|
|
int32_t length;
|
|
const char16_t *d=impl.getDecomposition(c, buffer, length);
|
|
if(d==nullptr) {
|
|
return false;
|
|
}
|
|
if(d==buffer) {
|
|
decomposition.setTo(buffer, length); // copy the string (Jamos from Hangul syllable c)
|
|
} else {
|
|
decomposition.setTo(false, d, length); // read-only alias
|
|
}
|
|
return true;
|
|
}
|
|
virtual UBool
|
|
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override {
|
|
char16_t buffer[30];
|
|
int32_t length;
|
|
const char16_t *d=impl.getRawDecomposition(c, buffer, length);
|
|
if(d==nullptr) {
|
|
return false;
|
|
}
|
|
if(d==buffer) {
|
|
decomposition.setTo(buffer, length); // copy the string (algorithmic decomposition)
|
|
} else {
|
|
decomposition.setTo(false, d, length); // read-only alias
|
|
}
|
|
return true;
|
|
}
|
|
virtual UChar32
|
|
composePair(UChar32 a, UChar32 b) const override {
|
|
return impl.composePair(a, b);
|
|
}
|
|
|
|
virtual uint8_t
|
|
getCombiningClass(UChar32 c) const override {
|
|
return impl.getCC(impl.getNorm16(c));
|
|
}
|
|
|
|
// quick checks
|
|
virtual UBool
|
|
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override {
|
|
if(U_FAILURE(errorCode)) {
|
|
return false;
|
|
}
|
|
const char16_t *sArray=s.getBuffer();
|
|
if(sArray==nullptr) {
|
|
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
|
return false;
|
|
}
|
|
const char16_t *sLimit=sArray+s.length();
|
|
return sLimit==spanQuickCheckYes(sArray, sLimit, errorCode);
|
|
}
|
|
virtual UNormalizationCheckResult
|
|
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override {
|
|
return Normalizer2WithImpl::isNormalized(s, errorCode) ? UNORM_YES : UNORM_NO;
|
|
}
|
|
virtual int32_t
|
|
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override {
|
|
if(U_FAILURE(errorCode)) {
|
|
return 0;
|
|
}
|
|
const char16_t *sArray=s.getBuffer();
|
|
if(sArray==nullptr) {
|
|
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
|
return 0;
|
|
}
|
|
return (int32_t)(spanQuickCheckYes(sArray, sArray+s.length(), errorCode)-sArray);
|
|
}
|
|
virtual const char16_t *
|
|
spanQuickCheckYes(const char16_t *src, const char16_t *limit, UErrorCode &errorCode) const = 0;
|
|
|
|
virtual UNormalizationCheckResult getQuickCheck(UChar32) const {
|
|
return UNORM_YES;
|
|
}
|
|
|
|
const Normalizer2Impl &impl;
|
|
};
|
|
|
|
class DecomposeNormalizer2 : public Normalizer2WithImpl {
|
|
public:
|
|
DecomposeNormalizer2(const Normalizer2Impl &ni) : Normalizer2WithImpl(ni) {}
|
|
virtual ~DecomposeNormalizer2();
|
|
|
|
private:
|
|
virtual void
|
|
normalize(const char16_t *src, const char16_t *limit,
|
|
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
|
|
impl.decompose(src, limit, &buffer, errorCode);
|
|
}
|
|
using Normalizer2WithImpl::normalize; // Avoid warning about hiding base class function.
|
|
virtual void
|
|
normalizeAndAppend(const char16_t *src, const char16_t *limit, UBool doNormalize,
|
|
UnicodeString &safeMiddle,
|
|
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
|
|
impl.decomposeAndAppend(src, limit, doNormalize, safeMiddle, buffer, errorCode);
|
|
}
|
|
|
|
void
|
|
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
|
|
Edits *edits, UErrorCode &errorCode) const override {
|
|
if (U_FAILURE(errorCode)) {
|
|
return;
|
|
}
|
|
if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
|
|
edits->reset();
|
|
}
|
|
const uint8_t *s = reinterpret_cast<const uint8_t *>(src.data());
|
|
impl.decomposeUTF8(options, s, s + src.length(), &sink, edits, errorCode);
|
|
sink.Flush();
|
|
}
|
|
virtual UBool
|
|
isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const override {
|
|
if(U_FAILURE(errorCode)) {
|
|
return false;
|
|
}
|
|
const uint8_t *s = reinterpret_cast<const uint8_t *>(sp.data());
|
|
const uint8_t *sLimit = s + sp.length();
|
|
return sLimit == impl.decomposeUTF8(0, s, sLimit, nullptr, nullptr, errorCode);
|
|
}
|
|
|
|
virtual const char16_t *
|
|
spanQuickCheckYes(const char16_t *src, const char16_t *limit, UErrorCode &errorCode) const override {
|
|
return impl.decompose(src, limit, nullptr, errorCode);
|
|
}
|
|
using Normalizer2WithImpl::spanQuickCheckYes; // Avoid warning about hiding base class function.
|
|
virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const override {
|
|
return impl.isDecompYes(impl.getNorm16(c)) ? UNORM_YES : UNORM_NO;
|
|
}
|
|
virtual UBool hasBoundaryBefore(UChar32 c) const override {
|
|
return impl.hasDecompBoundaryBefore(c);
|
|
}
|
|
virtual UBool hasBoundaryAfter(UChar32 c) const override {
|
|
return impl.hasDecompBoundaryAfter(c);
|
|
}
|
|
virtual UBool isInert(UChar32 c) const override {
|
|
return impl.isDecompInert(c);
|
|
}
|
|
};
|
|
|
|
class ComposeNormalizer2 : public Normalizer2WithImpl {
|
|
public:
|
|
ComposeNormalizer2(const Normalizer2Impl &ni, UBool fcc) :
|
|
Normalizer2WithImpl(ni), onlyContiguous(fcc) {}
|
|
virtual ~ComposeNormalizer2();
|
|
|
|
private:
|
|
virtual void
|
|
normalize(const char16_t *src, const char16_t *limit,
|
|
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
|
|
impl.compose(src, limit, onlyContiguous, true, buffer, errorCode);
|
|
}
|
|
using Normalizer2WithImpl::normalize; // Avoid warning about hiding base class function.
|
|
|
|
void
|
|
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
|
|
Edits *edits, UErrorCode &errorCode) const override {
|
|
if (U_FAILURE(errorCode)) {
|
|
return;
|
|
}
|
|
if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
|
|
edits->reset();
|
|
}
|
|
const uint8_t *s = reinterpret_cast<const uint8_t *>(src.data());
|
|
impl.composeUTF8(options, onlyContiguous, s, s + src.length(),
|
|
&sink, edits, errorCode);
|
|
sink.Flush();
|
|
}
|
|
|
|
virtual void
|
|
normalizeAndAppend(const char16_t *src, const char16_t *limit, UBool doNormalize,
|
|
UnicodeString &safeMiddle,
|
|
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
|
|
impl.composeAndAppend(src, limit, doNormalize, onlyContiguous, safeMiddle, buffer, errorCode);
|
|
}
|
|
|
|
virtual UBool
|
|
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override {
|
|
if(U_FAILURE(errorCode)) {
|
|
return false;
|
|
}
|
|
const char16_t *sArray=s.getBuffer();
|
|
if(sArray==nullptr) {
|
|
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
|
return false;
|
|
}
|
|
UnicodeString temp;
|
|
ReorderingBuffer buffer(impl, temp);
|
|
if(!buffer.init(5, errorCode)) { // small destCapacity for substring normalization
|
|
return false;
|
|
}
|
|
return impl.compose(sArray, sArray+s.length(), onlyContiguous, false, buffer, errorCode);
|
|
}
|
|
virtual UBool
|
|
isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const override {
|
|
if(U_FAILURE(errorCode)) {
|
|
return false;
|
|
}
|
|
const uint8_t *s = reinterpret_cast<const uint8_t *>(sp.data());
|
|
return impl.composeUTF8(0, onlyContiguous, s, s + sp.length(), nullptr, nullptr, errorCode);
|
|
}
|
|
virtual UNormalizationCheckResult
|
|
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override {
|
|
if(U_FAILURE(errorCode)) {
|
|
return UNORM_MAYBE;
|
|
}
|
|
const char16_t *sArray=s.getBuffer();
|
|
if(sArray==nullptr) {
|
|
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
|
return UNORM_MAYBE;
|
|
}
|
|
UNormalizationCheckResult qcResult=UNORM_YES;
|
|
impl.composeQuickCheck(sArray, sArray+s.length(), onlyContiguous, &qcResult);
|
|
return qcResult;
|
|
}
|
|
virtual const char16_t *
|
|
spanQuickCheckYes(const char16_t *src, const char16_t *limit, UErrorCode &) const override {
|
|
return impl.composeQuickCheck(src, limit, onlyContiguous, nullptr);
|
|
}
|
|
using Normalizer2WithImpl::spanQuickCheckYes; // Avoid warning about hiding base class function.
|
|
virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const override {
|
|
return impl.getCompQuickCheck(impl.getNorm16(c));
|
|
}
|
|
virtual UBool hasBoundaryBefore(UChar32 c) const override {
|
|
return impl.hasCompBoundaryBefore(c);
|
|
}
|
|
virtual UBool hasBoundaryAfter(UChar32 c) const override {
|
|
return impl.hasCompBoundaryAfter(c, onlyContiguous);
|
|
}
|
|
virtual UBool isInert(UChar32 c) const override {
|
|
return impl.isCompInert(c, onlyContiguous);
|
|
}
|
|
|
|
const UBool onlyContiguous;
|
|
};
|
|
|
|
class FCDNormalizer2 : public Normalizer2WithImpl {
|
|
public:
|
|
FCDNormalizer2(const Normalizer2Impl &ni) : Normalizer2WithImpl(ni) {}
|
|
virtual ~FCDNormalizer2();
|
|
|
|
private:
|
|
virtual void
|
|
normalize(const char16_t *src, const char16_t *limit,
|
|
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
|
|
impl.makeFCD(src, limit, &buffer, errorCode);
|
|
}
|
|
using Normalizer2WithImpl::normalize; // Avoid warning about hiding base class function.
|
|
virtual void
|
|
normalizeAndAppend(const char16_t *src, const char16_t *limit, UBool doNormalize,
|
|
UnicodeString &safeMiddle,
|
|
ReorderingBuffer &buffer, UErrorCode &errorCode) const override {
|
|
impl.makeFCDAndAppend(src, limit, doNormalize, safeMiddle, buffer, errorCode);
|
|
}
|
|
virtual const char16_t *
|
|
spanQuickCheckYes(const char16_t *src, const char16_t *limit, UErrorCode &errorCode) const override {
|
|
return impl.makeFCD(src, limit, nullptr, errorCode);
|
|
}
|
|
using Normalizer2WithImpl::spanQuickCheckYes; // Avoid warning about hiding base class function.
|
|
virtual UBool hasBoundaryBefore(UChar32 c) const override {
|
|
return impl.hasFCDBoundaryBefore(c);
|
|
}
|
|
virtual UBool hasBoundaryAfter(UChar32 c) const override {
|
|
return impl.hasFCDBoundaryAfter(c);
|
|
}
|
|
virtual UBool isInert(UChar32 c) const override {
|
|
return impl.isFCDInert(c);
|
|
}
|
|
};
|
|
|
|
struct Norm2AllModes : public UMemory {
|
|
Norm2AllModes(Normalizer2Impl *i)
|
|
: impl(i), comp(*i, false), decomp(*i), fcd(*i), fcc(*i, true) {}
|
|
~Norm2AllModes();
|
|
|
|
static Norm2AllModes *createInstance(Normalizer2Impl *impl, UErrorCode &errorCode);
|
|
static Norm2AllModes *createNFCInstance(UErrorCode &errorCode);
|
|
static Norm2AllModes *createInstance(const char *packageName,
|
|
const char *name,
|
|
UErrorCode &errorCode);
|
|
|
|
static const Norm2AllModes *getNFCInstance(UErrorCode &errorCode);
|
|
static const Norm2AllModes *getNFKCInstance(UErrorCode &errorCode);
|
|
static const Norm2AllModes *getNFKC_CFInstance(UErrorCode &errorCode);
|
|
|
|
Normalizer2Impl *impl;
|
|
ComposeNormalizer2 comp;
|
|
DecomposeNormalizer2 decomp;
|
|
FCDNormalizer2 fcd;
|
|
ComposeNormalizer2 fcc;
|
|
};
|
|
|
|
U_NAMESPACE_END
|
|
|
|
#endif // !UCONFIG_NO_NORMALIZATION
|
|
#endif // __NORM2ALLMODES_H__
|