487 lines
18 KiB
C++
487 lines
18 KiB
C++
// © 2016 and later: Unicode, Inc. and others.
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
|
/*
|
|
*****************************************************************************
|
|
*
|
|
* Copyright (C) 1998-2016, International Business Machines
|
|
* Corporation and others. All Rights Reserved.
|
|
*
|
|
*****************************************************************************
|
|
*
|
|
* ucnv_err.c
|
|
* Implements error behaviour functions called by T_UConverter_{from,to}Unicode
|
|
*
|
|
*
|
|
* Change history:
|
|
*
|
|
* 06/29/2000 helena Major rewrite of the callback APIs.
|
|
*/
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#if !UCONFIG_NO_CONVERSION
|
|
|
|
#include "unicode/ucnv_err.h"
|
|
#include "unicode/ucnv_cb.h"
|
|
#include "ucnv_cnv.h"
|
|
#include "cmemory.h"
|
|
#include "unicode/ucnv.h"
|
|
#include "ustrfmt.h"
|
|
|
|
#define VALUE_STRING_LENGTH 48
|
|
/*Magic # 32 = 4(number of char in value string) * 8(max number of bytes per char for any converter) */
|
|
#define UNICODE_PERCENT_SIGN_CODEPOINT 0x0025
|
|
#define UNICODE_U_CODEPOINT 0x0055
|
|
#define UNICODE_X_CODEPOINT 0x0058
|
|
#define UNICODE_RS_CODEPOINT 0x005C
|
|
#define UNICODE_U_LOW_CODEPOINT 0x0075
|
|
#define UNICODE_X_LOW_CODEPOINT 0x0078
|
|
#define UNICODE_AMP_CODEPOINT 0x0026
|
|
#define UNICODE_HASH_CODEPOINT 0x0023
|
|
#define UNICODE_SEMICOLON_CODEPOINT 0x003B
|
|
#define UNICODE_PLUS_CODEPOINT 0x002B
|
|
#define UNICODE_LEFT_CURLY_CODEPOINT 0x007B
|
|
#define UNICODE_RIGHT_CURLY_CODEPOINT 0x007D
|
|
#define UNICODE_SPACE_CODEPOINT 0x0020
|
|
#define UCNV_PRV_ESCAPE_ICU 0
|
|
#define UCNV_PRV_ESCAPE_C 'C'
|
|
#define UCNV_PRV_ESCAPE_XML_DEC 'D'
|
|
#define UCNV_PRV_ESCAPE_XML_HEX 'X'
|
|
#define UCNV_PRV_ESCAPE_JAVA 'J'
|
|
#define UCNV_PRV_ESCAPE_UNICODE 'U'
|
|
#define UCNV_PRV_ESCAPE_CSS2 'S'
|
|
#define UCNV_PRV_STOP_ON_ILLEGAL 'i'
|
|
|
|
/*
|
|
* IS_DEFAULT_IGNORABLE_CODE_POINT
|
|
* This is to check if a code point has the default ignorable unicode property.
|
|
* As such, this list needs to be updated if the ignorable code point list ever
|
|
* changes.
|
|
* To avoid dependency on other code, this list is hard coded here.
|
|
* When an ignorable code point is found and is unmappable, the default callbacks
|
|
* will ignore them.
|
|
* For a list of the default ignorable code points, use this link:
|
|
* https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%3ADI%3A%5D&abb=on&g=&i=
|
|
*
|
|
* This list should be sync with the one in CharsetCallback.java
|
|
*/
|
|
#define IS_DEFAULT_IGNORABLE_CODE_POINT(c) ( \
|
|
(c == 0x00AD) || \
|
|
(c == 0x034F) || \
|
|
(c == 0x061C) || \
|
|
(c == 0x115F) || \
|
|
(c == 0x1160) || \
|
|
(0x17B4 <= c && c <= 0x17B5) || \
|
|
(0x180B <= c && c <= 0x180F) || \
|
|
(0x200B <= c && c <= 0x200F) || \
|
|
(0x202A <= c && c <= 0x202E) || \
|
|
(0x2060 <= c && c <= 0x206F) || \
|
|
(c == 0x3164) || \
|
|
(0xFE00 <= c && c <= 0xFE0F) || \
|
|
(c == 0xFEFF) || \
|
|
(c == 0xFFA0) || \
|
|
(0xFFF0 <= c && c <= 0xFFF8) || \
|
|
(0x1BCA0 <= c && c <= 0x1BCA3) || \
|
|
(0x1D173 <= c && c <= 0x1D17A) || \
|
|
(0xE0000 <= c && c <= 0xE0FFF))
|
|
|
|
|
|
/*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
|
|
U_CAPI void U_EXPORT2
|
|
UCNV_FROM_U_CALLBACK_STOP (
|
|
const void *context,
|
|
UConverterFromUnicodeArgs *fromUArgs,
|
|
const char16_t* codeUnits,
|
|
int32_t length,
|
|
UChar32 codePoint,
|
|
UConverterCallbackReason reason,
|
|
UErrorCode * err)
|
|
{
|
|
(void)context;
|
|
(void)fromUArgs;
|
|
(void)codeUnits;
|
|
(void)length;
|
|
if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
|
|
{
|
|
/*
|
|
* Skip if the codepoint has unicode property of default ignorable.
|
|
*/
|
|
*err = U_ZERO_ERROR;
|
|
}
|
|
/* the caller must have set the error code accordingly */
|
|
return;
|
|
}
|
|
|
|
|
|
/*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
|
|
U_CAPI void U_EXPORT2
|
|
UCNV_TO_U_CALLBACK_STOP (
|
|
const void *context,
|
|
UConverterToUnicodeArgs *toUArgs,
|
|
const char* codePoints,
|
|
int32_t length,
|
|
UConverterCallbackReason reason,
|
|
UErrorCode * err)
|
|
{
|
|
/* the caller must have set the error code accordingly */
|
|
(void)context; (void)toUArgs; (void)codePoints; (void)length; (void)reason; (void)err;
|
|
return;
|
|
}
|
|
|
|
U_CAPI void U_EXPORT2
|
|
UCNV_FROM_U_CALLBACK_SKIP (
|
|
const void *context,
|
|
UConverterFromUnicodeArgs *fromUArgs,
|
|
const char16_t* codeUnits,
|
|
int32_t length,
|
|
UChar32 codePoint,
|
|
UConverterCallbackReason reason,
|
|
UErrorCode * err)
|
|
{
|
|
(void)fromUArgs;
|
|
(void)codeUnits;
|
|
(void)length;
|
|
if (reason <= UCNV_IRREGULAR)
|
|
{
|
|
if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
|
|
{
|
|
/*
|
|
* Skip if the codepoint has unicode property of default ignorable.
|
|
*/
|
|
*err = U_ZERO_ERROR;
|
|
}
|
|
else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
|
|
{
|
|
*err = U_ZERO_ERROR;
|
|
}
|
|
/* else the caller must have set the error code accordingly. */
|
|
}
|
|
/* else ignore the reset, close and clone calls. */
|
|
}
|
|
|
|
U_CAPI void U_EXPORT2
|
|
UCNV_FROM_U_CALLBACK_SUBSTITUTE (
|
|
const void *context,
|
|
UConverterFromUnicodeArgs *fromArgs,
|
|
const char16_t* codeUnits,
|
|
int32_t length,
|
|
UChar32 codePoint,
|
|
UConverterCallbackReason reason,
|
|
UErrorCode * err)
|
|
{
|
|
(void)codeUnits;
|
|
(void)length;
|
|
if (reason <= UCNV_IRREGULAR)
|
|
{
|
|
if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
|
|
{
|
|
/*
|
|
* Skip if the codepoint has unicode property of default ignorable.
|
|
*/
|
|
*err = U_ZERO_ERROR;
|
|
}
|
|
else if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
|
|
{
|
|
*err = U_ZERO_ERROR;
|
|
ucnv_cbFromUWriteSub(fromArgs, 0, err);
|
|
}
|
|
/* else the caller must have set the error code accordingly. */
|
|
}
|
|
/* else ignore the reset, close and clone calls. */
|
|
}
|
|
|
|
/*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
|
|
*uses a clean copy (resetted) of the converter, to convert that unicode
|
|
*escape sequence to the target codepage (if conversion failure happens then
|
|
*we revert to substituting with subchar)
|
|
*/
|
|
U_CAPI void U_EXPORT2
|
|
UCNV_FROM_U_CALLBACK_ESCAPE (
|
|
const void *context,
|
|
UConverterFromUnicodeArgs *fromArgs,
|
|
const char16_t *codeUnits,
|
|
int32_t length,
|
|
UChar32 codePoint,
|
|
UConverterCallbackReason reason,
|
|
UErrorCode * err)
|
|
{
|
|
|
|
char16_t valueString[VALUE_STRING_LENGTH];
|
|
int32_t valueStringLength = 0;
|
|
int32_t i = 0;
|
|
|
|
const char16_t *myValueSource = nullptr;
|
|
UErrorCode err2 = U_ZERO_ERROR;
|
|
UConverterFromUCallback original = nullptr;
|
|
const void *originalContext;
|
|
|
|
UConverterFromUCallback ignoredCallback = nullptr;
|
|
const void *ignoredContext;
|
|
|
|
if (reason > UCNV_IRREGULAR)
|
|
{
|
|
return;
|
|
}
|
|
else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
|
|
{
|
|
/*
|
|
* Skip if the codepoint has unicode property of default ignorable.
|
|
*/
|
|
*err = U_ZERO_ERROR;
|
|
return;
|
|
}
|
|
|
|
ucnv_setFromUCallBack (fromArgs->converter,
|
|
(UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
|
|
nullptr,
|
|
&original,
|
|
&originalContext,
|
|
&err2);
|
|
|
|
if (U_FAILURE (err2))
|
|
{
|
|
*err = err2;
|
|
return;
|
|
}
|
|
if(context==nullptr)
|
|
{
|
|
while (i < length)
|
|
{
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
|
|
valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
switch(*((char*)context))
|
|
{
|
|
case UCNV_PRV_ESCAPE_JAVA:
|
|
while (i < length)
|
|
{
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */
|
|
valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
|
|
}
|
|
break;
|
|
|
|
case UCNV_PRV_ESCAPE_C:
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */
|
|
|
|
if(length==2){
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
|
|
valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8);
|
|
|
|
}
|
|
else{
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_U_LOW_CODEPOINT; /* adding u */
|
|
valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
|
|
}
|
|
break;
|
|
|
|
case UCNV_PRV_ESCAPE_XML_DEC:
|
|
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */
|
|
if(length==2){
|
|
valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0);
|
|
}
|
|
else{
|
|
valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0);
|
|
}
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
|
|
break;
|
|
|
|
case UCNV_PRV_ESCAPE_XML_HEX:
|
|
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
|
|
if(length==2){
|
|
valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
|
|
}
|
|
else{
|
|
valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0);
|
|
}
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
|
|
break;
|
|
|
|
case UCNV_PRV_ESCAPE_UNICODE:
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_LEFT_CURLY_CODEPOINT; /* adding { */
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_PLUS_CODEPOINT; /* adding + */
|
|
if (length == 2) {
|
|
valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4);
|
|
} else {
|
|
valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
|
|
}
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_RIGHT_CURLY_CODEPOINT; /* adding } */
|
|
break;
|
|
|
|
case UCNV_PRV_ESCAPE_CSS2:
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */
|
|
valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
|
|
/* Always add space character, because the next character might be whitespace,
|
|
which would erroneously be considered the termination of the escape sequence. */
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_SPACE_CODEPOINT;
|
|
break;
|
|
|
|
default:
|
|
while (i < length)
|
|
{
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
|
|
valueString[valueStringLength++] = (char16_t) UNICODE_U_CODEPOINT; /* adding U */
|
|
valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
|
|
}
|
|
}
|
|
}
|
|
myValueSource = valueString;
|
|
|
|
/* reset the error */
|
|
*err = U_ZERO_ERROR;
|
|
|
|
ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err);
|
|
|
|
ucnv_setFromUCallBack (fromArgs->converter,
|
|
original,
|
|
originalContext,
|
|
&ignoredCallback,
|
|
&ignoredContext,
|
|
&err2);
|
|
if (U_FAILURE (err2))
|
|
{
|
|
*err = err2;
|
|
return;
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
|
|
U_CAPI void U_EXPORT2
|
|
UCNV_TO_U_CALLBACK_SKIP (
|
|
const void *context,
|
|
UConverterToUnicodeArgs *toArgs,
|
|
const char* codeUnits,
|
|
int32_t length,
|
|
UConverterCallbackReason reason,
|
|
UErrorCode * err)
|
|
{
|
|
(void)toArgs;
|
|
(void)codeUnits;
|
|
(void)length;
|
|
if (reason <= UCNV_IRREGULAR)
|
|
{
|
|
if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
|
|
{
|
|
*err = U_ZERO_ERROR;
|
|
}
|
|
/* else the caller must have set the error code accordingly. */
|
|
}
|
|
/* else ignore the reset, close and clone calls. */
|
|
}
|
|
|
|
U_CAPI void U_EXPORT2
|
|
UCNV_TO_U_CALLBACK_SUBSTITUTE (
|
|
const void *context,
|
|
UConverterToUnicodeArgs *toArgs,
|
|
const char* codeUnits,
|
|
int32_t length,
|
|
UConverterCallbackReason reason,
|
|
UErrorCode * err)
|
|
{
|
|
(void)codeUnits;
|
|
(void)length;
|
|
if (reason <= UCNV_IRREGULAR)
|
|
{
|
|
if (context == nullptr || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
|
|
{
|
|
*err = U_ZERO_ERROR;
|
|
ucnv_cbToUWriteSub(toArgs,0,err);
|
|
}
|
|
/* else the caller must have set the error code accordingly. */
|
|
}
|
|
/* else ignore the reset, close and clone calls. */
|
|
}
|
|
|
|
/*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
|
|
*and uses that as the substitution sequence
|
|
*/
|
|
U_CAPI void U_EXPORT2
|
|
UCNV_TO_U_CALLBACK_ESCAPE (
|
|
const void *context,
|
|
UConverterToUnicodeArgs *toArgs,
|
|
const char* codeUnits,
|
|
int32_t length,
|
|
UConverterCallbackReason reason,
|
|
UErrorCode * err)
|
|
{
|
|
char16_t uniValueString[VALUE_STRING_LENGTH];
|
|
int32_t valueStringLength = 0;
|
|
int32_t i = 0;
|
|
|
|
if (reason > UCNV_IRREGULAR)
|
|
{
|
|
return;
|
|
}
|
|
|
|
if(context==nullptr)
|
|
{
|
|
while (i < length)
|
|
{
|
|
uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
|
|
uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT; /* adding X */
|
|
valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
|
|
}
|
|
}
|
|
else
|
|
{
|
|
switch(*((char*)context))
|
|
{
|
|
case UCNV_PRV_ESCAPE_XML_DEC:
|
|
while (i < length)
|
|
{
|
|
uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */
|
|
uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */
|
|
valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0);
|
|
uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
|
|
}
|
|
break;
|
|
|
|
case UCNV_PRV_ESCAPE_XML_HEX:
|
|
while (i < length)
|
|
{
|
|
uniValueString[valueStringLength++] = (char16_t) UNICODE_AMP_CODEPOINT; /* adding & */
|
|
uniValueString[valueStringLength++] = (char16_t) UNICODE_HASH_CODEPOINT; /* adding # */
|
|
uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
|
|
valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0);
|
|
uniValueString[valueStringLength++] = (char16_t) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
|
|
}
|
|
break;
|
|
case UCNV_PRV_ESCAPE_C:
|
|
while (i < length)
|
|
{
|
|
uniValueString[valueStringLength++] = (char16_t) UNICODE_RS_CODEPOINT; /* adding \ */
|
|
uniValueString[valueStringLength++] = (char16_t) UNICODE_X_LOW_CODEPOINT; /* adding x */
|
|
valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2);
|
|
}
|
|
break;
|
|
default:
|
|
while (i < length)
|
|
{
|
|
uniValueString[valueStringLength++] = (char16_t) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
|
|
uniValueString[valueStringLength++] = (char16_t) UNICODE_X_CODEPOINT; /* adding X */
|
|
uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
|
|
valueStringLength += 2;
|
|
}
|
|
}
|
|
}
|
|
/* reset the error */
|
|
*err = U_ZERO_ERROR;
|
|
|
|
ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err);
|
|
}
|
|
|
|
#endif
|