From 2595da57ce20413c4f8225fe52e34cba53c4aec5 Mon Sep 17 00:00:00 2001 From: Craig Bester Date: Thu, 19 Oct 2023 01:15:26 +0200 Subject: [PATCH] Decode hexadecimal character references --- instant-xml/src/impls.rs | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/instant-xml/src/impls.rs b/instant-xml/src/impls.rs index f675f17..1cdecdf 100644 --- a/instant-xml/src/impls.rs +++ b/instant-xml/src/impls.rs @@ -558,7 +558,7 @@ pub(crate) fn decode(input: &str) -> Result<Cow<'_, str>, Error> { for (i, &b) in input.as_bytes().iter().enumerate() { // use a state machine to find entities state = match (state, b) { - (DecodeState::Normal, b'&') => DecodeState::Entity([0; 4], 0), + (DecodeState::Normal, b'&') => DecodeState::Entity([0; 6], 0), (DecodeState::Normal, _) => DecodeState::Normal, (DecodeState::Entity(chars, len), b';') => { let decoded = match &chars[..len] { @@ -567,14 +567,26 @@ pub(crate) fn decode(input: &str) -> Result<Cow<'_, str>, Error> { [b'g', b't'] => '>', [b'l', b't'] => '<', [b'q', b'u', b'o', b't'] => '"', + [b'#', b'x' | b'X', hex @ ..] => { + // Hexadecimal character reference e.g. "&#x7c;" -> '|' + str::from_utf8(hex) + .ok() + .and_then(|hex_str| u32::from_str_radix(hex_str, 16).ok()) + .and_then(char::from_u32) + .filter(valid_xml_character) + .ok_or_else(|| { + Error::InvalidEntity( + String::from_utf8_lossy(&chars[..len]).into_owned(), + ) + })? + } [b'#', decimal @ ..] => { // Decimal character reference e.g.
"&#1234;" -> 'Ӓ' str::from_utf8(decimal) .ok() .and_then(|decimal_str| u32::from_str(decimal_str).ok()) .and_then(char::from_u32) - // Valid character ranges per https://www.w3.org/TR/xml/#NT-Char - .filter(|c| matches!(c, '\u{9}' | '\u{A}' | '\u{D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..='\u{10FFFF}')) + .filter(valid_xml_character) .ok_or_else(|| { Error::InvalidEntity( String::from_utf8_lossy(&chars[..len]).into_owned(), @@ -599,8 +611,8 @@ pub(crate) fn decode(input: &str) -> Result<Cow<'_, str>, Error> { DecodeState::Normal } (DecodeState::Entity(mut chars, len), b) => { - if len >= 4 { - let mut bytes = Vec::with_capacity(5); + if len >= 6 { + let mut bytes = Vec::with_capacity(7); bytes.extend(&chars[..len]); bytes.push(b); return Err(Error::InvalidEntity( @@ -634,7 +646,12 @@ pub(crate) fn decode(input: &str) -> Result<Cow<'_, str>, Error> { #[derive(Debug)] enum DecodeState { Normal, - Entity([u8; 4], usize), + Entity([u8; 6], usize), +} + +/// Valid character ranges per https://www.w3.org/TR/xml/#NT-Char +fn valid_xml_character(c: &char) -> bool { + matches!(c, '\u{9}' | '\u{A}' | '\u{D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..='\u{10FFFF}') } impl<'xml, T: FromXml<'xml>> FromXml<'xml> for Vec<T> { @@ -885,10 +902,20 @@ mod tests { assert_eq!(decode("&amp; foo").unwrap(), "& foo"); assert_eq!(decode("foo &amp;").unwrap(), "foo &"); assert_eq!(decode("cbdtéda&amp;sü").unwrap(), "cbdtéda&sü"); + // Decimal character references assert_eq!(decode("&#1234;").unwrap(), "Ӓ"); assert_eq!(decode("foo &#9; bar").unwrap(), "foo \t bar"); assert_eq!(decode("foo &#124; bar").unwrap(), "foo | bar"); assert_eq!(decode("foo &#1234; bar").unwrap(), "foo Ӓ bar"); + // Hexadecimal character references + assert_eq!(decode("&#xc4;").unwrap(), "Ä"); + assert_eq!(decode("&#XC4;").unwrap(), "Ä"); + assert_eq!(decode("foo &#x9; bar").unwrap(), "foo \t bar"); + assert_eq!(decode("foo &#x7c; bar").unwrap(), "foo | bar"); + assert_eq!(decode("foo &#xc4; bar").unwrap(), "foo Ä bar"); + assert_eq!(decode("foo &#XC4;
bar").unwrap(), "foo Ä bar"); + assert_eq!(decode("foo &#x10de; bar").unwrap(), "foo პ bar"); + assert!(decode("&").is_err()); assert!(decode("&#").is_err()); assert!(decode("&#;").is_err());