/*************************************************
*      Perl-Compatible Regular Expressions       *
*************************************************/

/* PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.

                       Written by Philip Hazel
     Original API code Copyright (c) 1997-2012 University of Cambridge
          New API code Copyright (c) 2016-2021 University of Cambridge

-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.

    * Neither the name of the University of Cambridge nor the names of its
      contributors may be used to endorse or promote products derived from
      this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/

/* This module contains the function for checking a script run. */

/* Pull in the build configuration first, when one exists, so that its settings
are visible to the internal header that follows. */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "pcre2_internal.h"

/*************************************************
*                Check script run                *
*************************************************/

/* A script run is conceptually a sequence of characters all in the same
Unicode script. However, it isn't quite that simple. There are special rules
for scripts that are commonly used together, and also special rules for digits.
This function implements the appropriate checks, which is possible only when
PCRE2 is compiled with Unicode support. The function returns TRUE if there is
no Unicode support; however, it should never be called in that circumstance
because an error is given by pcre2_compile() if a script run is called for in a
version of PCRE2 compiled without Unicode support.

Arguments:
  ptr        points to the first character
  endptr     points after the last character
  utf        TRUE if in UTF mode

Returns:     TRUE if this is a valid script run
*/

/* These are the states of the script-run checking process. They record what is
currently required of the remaining characters in the string. */

enum { SCRIPT_UNSET,          /* Requirement as yet unknown */
       SCRIPT_MAP,            /* Bitmap contains acceptable scripts */
       SCRIPT_HANPENDING,     /* Have had only Han characters */
       SCRIPT_HANHIRAKATA,    /* Expect Han, Hiragana, or Katakana */
       SCRIPT_HANBOPOMOFO,    /* Expect Han or Bopomofo */
       SCRIPT_HANHANGUL       /* Expect Han or Hangul */
     };

/* Sizes, in 32-bit words, of the two kinds of script bitmap used when checking
a script run. UCD_MAPSIZE is the size of a map that only has bits for scripts
less than ucp_Unknown (those that appear in script extension lists);
FULL_MAPSIZE is the size of a map that has a bit for every script. */

#define UCD_MAPSIZE (ucp_Unknown/32 + 1)
#define FULL_MAPSIZE (ucp_Script_Count/32 + 1)

BOOL
|
|
PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
|
|
{
|
|
#ifdef SUPPORT_UNICODE
|
|
uint32_t require_state = SCRIPT_UNSET;
|
|
uint32_t require_map[FULL_MAPSIZE];
|
|
uint32_t map[FULL_MAPSIZE];
|
|
uint32_t require_digitset = 0;
|
|
uint32_t c;
|
|
|
|
#if PCRE2_CODE_UNIT_WIDTH == 32
|
|
(void)utf; /* Avoid compiler warning */
|
|
#endif
|
|
|
|
/* Any string containing fewer than 2 characters is a valid script run. */
|
|
|
|
if (ptr >= endptr) return TRUE;
|
|
GETCHARINCTEST(c, ptr);
|
|
if (ptr >= endptr) return TRUE;
|
|
|
|
/* Initialize the require map. This is a full-size bitmap that has a bit for
|
|
every script, as opposed to the maps in ucd_script_sets, which only have bits
|
|
for scripts less than ucp_Unknown - those that appear in script extension
|
|
lists. */
|
|
|
|
for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;
|
|
|
|
/* Scan strings of two or more characters, checking the Unicode characteristics
|
|
of each code point. There is special code for scripts that can be combined with
|
|
characters from the Han Chinese script. This may be used in conjunction with
|
|
four other scripts in these combinations:
|
|
|
|
. Han with Hiragana and Katakana is allowed (for Japanese).
|
|
. Han with Bopomofo is allowed (for Taiwanese Mandarin).
|
|
. Han with Hangul is allowed (for Korean).
|
|
|
|
If the first significant character's script is one of the four, the required
|
|
script type is immediately known. However, if the first significant
|
|
character's script is Han, we have to keep checking for a non-Han character.
|
|
Hence the SCRIPT_HANPENDING state. */
|
|
|
|
for (;;)
|
|
{
|
|
const ucd_record *ucd = GET_UCD(c);
|
|
uint32_t script = ucd->script;
|
|
|
|
/* If the script is Unknown, the string is not a valid script run. Such
|
|
characters can only form script runs of length one (see test above). */
|
|
|
|
if (script == ucp_Unknown) return FALSE;
|
|
|
|
/* A character without any script extensions whose script is Inherited or
|
|
Common is always accepted with any script. If there are extensions, the
|
|
following processing happens for all scripts. */
|
|
|
|
if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common))
|
|
{
|
|
BOOL OK;
|
|
|
|
/* Set up a full-sized map for this character that can include bits for all
|
|
scripts. Copy the scriptx map for this character (which covers those
|
|
scripts that appear in script extension lists), set the remaining values to
|
|
zero, and then, except for Common or Inherited, add this script's bit to
|
|
the map. */
|
|
|
|
memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t));
|
|
memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
|
|
if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);
|
|
|
|
/* Handle the different checking states */
|
|
|
|
switch(require_state)
|
|
{
|
|
/* First significant character - it might follow Common or Inherited
|
|
characters that do not have any script extensions. */
|
|
|
|
case SCRIPT_UNSET:
|
|
switch(script)
|
|
{
|
|
case ucp_Han:
|
|
require_state = SCRIPT_HANPENDING;
|
|
break;
|
|
|
|
case ucp_Hiragana:
|
|
case ucp_Katakana:
|
|
require_state = SCRIPT_HANHIRAKATA;
|
|
break;
|
|
|
|
case ucp_Bopomofo:
|
|
require_state = SCRIPT_HANBOPOMOFO;
|
|
break;
|
|
|
|
case ucp_Hangul:
|
|
require_state = SCRIPT_HANHANGUL;
|
|
break;
|
|
|
|
default:
|
|
memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
|
|
require_state = SCRIPT_MAP;
|
|
break;
|
|
}
|
|
break;
|
|
|
|
/* The first significant character was Han. An inspection of the Unicode
|
|
11.0.0 files shows that there are the following types of Script Extension
|
|
list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
|
|
scripts:
|
|
|
|
. Bopomofo + Han
|
|
. Han + Hiragana + Katakana
|
|
. Hiragana + Katakana
|
|
. Bopopmofo + Hangul + Han + Hiragana + Katakana
|
|
|
|
The following code tries to make sense of this. */
|
|
|
|
#define FOUND_BOPOMOFO 1
|
|
#define FOUND_HIRAGANA 2
|
|
#define FOUND_KATAKANA 4
|
|
#define FOUND_HANGUL 8
|
|
|
|
case SCRIPT_HANPENDING:
|
|
if (script != ucp_Han) /* Another Han does nothing */
|
|
{
|
|
uint32_t chspecial = 0;
|
|
|
|
if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
|
|
if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
|
|
if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
|
|
if (MAPBIT(map, ucp_Hangul) != 0) chspecial |= FOUND_HANGUL;
|
|
|
|
if (chspecial == 0) return FALSE; /* Not allowed with Han */
|
|
|
|
if (chspecial == FOUND_BOPOMOFO)
|
|
require_state = SCRIPT_HANBOPOMOFO;
|
|
else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
|
|
require_state = SCRIPT_HANHIRAKATA;
|
|
|
|
/* Otherwise this character must be allowed with all of them, so remain
|
|
in the pending state. */
|
|
}
|
|
break;
|
|
|
|
/* Previously encountered one of the "with Han" scripts. Check that
|
|
this character is appropriate. */
|
|
|
|
case SCRIPT_HANHIRAKATA:
|
|
if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
|
|
MAPBIT(map, ucp_Katakana) == 0) return FALSE;
|
|
break;
|
|
|
|
case SCRIPT_HANBOPOMOFO:
|
|
if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;
|
|
break;
|
|
|
|
case SCRIPT_HANHANGUL:
|
|
if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;
|
|
break;
|
|
|
|
/* Previously encountered one or more characters that are allowed with a
|
|
list of scripts. */
|
|
|
|
case SCRIPT_MAP:
|
|
OK = FALSE;
|
|
|
|
for (int i = 0; i < FULL_MAPSIZE; i++)
|
|
{
|
|
if ((require_map[i] & map[i]) != 0)
|
|
{
|
|
OK = TRUE;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!OK) return FALSE;
|
|
|
|
/* The rest of the string must be in this script, but we have to
|
|
allow for the Han complications. */
|
|
|
|
switch(script)
|
|
{
|
|
case ucp_Han:
|
|
require_state = SCRIPT_HANPENDING;
|
|
break;
|
|
|
|
case ucp_Hiragana:
|
|
case ucp_Katakana:
|
|
require_state = SCRIPT_HANHIRAKATA;
|
|
break;
|
|
|
|
case ucp_Bopomofo:
|
|
require_state = SCRIPT_HANBOPOMOFO;
|
|
break;
|
|
|
|
case ucp_Hangul:
|
|
require_state = SCRIPT_HANHANGUL;
|
|
break;
|
|
|
|
/* Compute the intersection of the required list of scripts and the
|
|
allowed scripts for this character. */
|
|
|
|
default:
|
|
for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
|
|
break;
|
|
}
|
|
|
|
break;
|
|
}
|
|
} /* End checking character's script and extensions. */
|
|
|
|
/* The character is in an acceptable script. We must now ensure that all
|
|
decimal digits in the string come from the same set. Some scripts (e.g.
|
|
Common, Arabic) have more than one set of decimal digits. This code does
|
|
not allow mixing sets, even within the same script. The vector called
|
|
PRIV(ucd_digit_sets)[] contains, in its first element, the number of
|
|
following elements, and then, in ascending order, the code points of the
|
|
'9' characters in every set of 10 digits. Each set is identified by the
|
|
offset in the vector of its '9' character. An initial check of the first
|
|
value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
|
|
|
|
if (ucd->chartype == ucp_Nd)
|
|
{
|
|
uint32_t digitset;
|
|
|
|
if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
|
|
{
|
|
int mid;
|
|
int bot = 1;
|
|
int top = PRIV(ucd_digit_sets)[0];
|
|
for (;;)
|
|
{
|
|
if (top <= bot + 1) /* <= rather than == is paranoia */
|
|
{
|
|
digitset = top;
|
|
break;
|
|
}
|
|
mid = (top + bot) / 2;
|
|
if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
|
|
}
|
|
}
|
|
|
|
/* A required value of 0 means "unset". */
|
|
|
|
if (require_digitset == 0) require_digitset = digitset;
|
|
else if (digitset != require_digitset) return FALSE;
|
|
} /* End digit handling */
|
|
|
|
/* If we haven't yet got to the end, pick up the next character. */
|
|
|
|
if (ptr >= endptr) return TRUE;
|
|
GETCHARINCTEST(c, ptr);
|
|
} /* End checking loop */
|
|
|
|
#else /* NOT SUPPORT_UNICODE */
|
|
(void)ptr;
|
|
(void)endptr;
|
|
(void)utf;
|
|
return TRUE;
|
|
#endif /* SUPPORT_UNICODE */
|
|
}
|
|
|
|
/* End of pcre2_script_run.c */
|