diff --git a/instant-xml/src/de.rs b/instant-xml/src/de.rs index 4164662..76cd47a 100644 --- a/instant-xml/src/de.rs +++ b/instant-xml/src/de.rs @@ -1,9 +1,10 @@ use std::borrow::Cow; use std::collections::{BTreeMap, VecDeque}; +use std::str::{self, FromStr}; use xmlparser::{ElementEnd, Token, Tokenizer}; -use crate::impls::{decode, CowStrAccumulator}; +use crate::impls::CowStrAccumulator; use crate::{Error, Id}; pub struct Deserializer<'cx, 'xml> { @@ -383,6 +384,108 @@ pub fn borrow_cow_slice_u8<'xml>( Ok(()) } +pub(crate) fn decode(input: &str) -> Result, Error> { + let mut result = String::with_capacity(input.len()); + let (mut state, mut last_end) = (DecodeState::Normal, 0); + for (i, &b) in input.as_bytes().iter().enumerate() { + // use a state machine to find entities + state = match (state, b) { + (DecodeState::Normal, b'&') => DecodeState::Entity([0; 6], 0), + (DecodeState::Normal, _) => DecodeState::Normal, + (DecodeState::Entity(chars, len), b';') => { + let decoded = match &chars[..len] { + [b'a', b'm', b'p'] => '&', + [b'a', b'p', b'o', b's'] => '\'', + [b'g', b't'] => '>', + [b'l', b't'] => '<', + [b'q', b'u', b'o', b't'] => '"', + [b'#', b'x' | b'X', hex @ ..] => { + // Hexadecimal character reference e.g. "|" -> '|' + str::from_utf8(hex) + .ok() + .and_then(|hex_str| u32::from_str_radix(hex_str, 16).ok()) + .and_then(char::from_u32) + .filter(valid_xml_character) + .ok_or_else(|| { + Error::InvalidEntity( + String::from_utf8_lossy(&chars[..len]).into_owned(), + ) + })? + } + [b'#', decimal @ ..] => { + // Decimal character reference e.g. "Ӓ" -> 'Ӓ' + str::from_utf8(decimal) + .ok() + .and_then(|decimal_str| u32::from_str(decimal_str).ok()) + .and_then(char::from_u32) + .filter(valid_xml_character) + .ok_or_else(|| { + Error::InvalidEntity( + String::from_utf8_lossy(&chars[..len]).into_owned(), + ) + })? + } + _ => { + return Err(Error::InvalidEntity( + String::from_utf8_lossy(&chars[..len]).into_owned(), + )) + } + }; + + let start = i - (len + 1); // current position - (length of entity characters + 1 for '&') + if last_end < start { + // Unwrap should be safe: `last_end` and `start` must be at character boundaries. + result.push_str(input.get(last_end..start).unwrap()); + } + + last_end = i + 1; + result.push(decoded); + DecodeState::Normal + } + (DecodeState::Entity(mut chars, len), b) => { + if len >= 6 { + let mut bytes = Vec::with_capacity(7); + bytes.extend(&chars[..len]); + bytes.push(b); + return Err(Error::InvalidEntity( + String::from_utf8_lossy(&bytes).into_owned(), + )); + } + + chars[len] = b; + DecodeState::Entity(chars, len + 1) + } + }; + } + + // Unterminated entity (& without ;) at end of input + if let DecodeState::Entity(chars, len) = state { + return Err(Error::InvalidEntity( + String::from_utf8_lossy(&chars[..len]).into_owned(), + )); + } + + Ok(match result.is_empty() { + true => Cow::Borrowed(input), + false => { + // Unwrap should be safe: `last_end` and `input.len()` must be at character boundaries. + result.push_str(input.get(last_end..input.len()).unwrap()); + Cow::Owned(result) + } + }) +} + +#[derive(Debug)] +enum DecodeState { + Normal, + Entity([u8; 6], usize), +} + +/// Valid character ranges per https://www.w3.org/TR/xml/#NT-Char +fn valid_xml_character(c: &char) -> bool { + matches!(c, '\u{9}' | '\u{A}' | '\u{D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..='\u{10FFFF}') +} + #[derive(Debug)] pub enum Node<'xml> { Attribute(Attribute<'xml>), @@ -418,3 +521,52 @@ pub struct Attribute<'xml> { pub local: &'xml str, pub value: &'xml str, } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_decode() { + decode_ok("foo", "foo"); + decode_ok("foo & bar", "foo & bar"); + decode_ok("foo < bar", "foo < bar"); + decode_ok("foo > bar", "foo > bar"); + decode_ok("foo " bar", "foo \" bar"); + decode_ok("foo ' bar", "foo ' bar"); + decode_ok("foo &lt; bar", "foo < bar"); + decode_ok("& foo", "& foo"); + decode_ok("foo &", "foo &"); + decode_ok("cbdtéda&sü", "cbdtéda&sü"); + // Decimal character references + decode_ok("Ӓ", "Ӓ"); + decode_ok("foo bar", "foo \t bar"); + decode_ok("foo | bar", "foo | bar"); + decode_ok("foo Ӓ bar", "foo Ӓ bar"); + // Hexadecimal character references + decode_ok("Ä", "Ä"); + decode_ok("Ä", "Ä"); + decode_ok("foo bar", "foo \t bar"); + decode_ok("foo | bar", "foo | bar"); + decode_ok("foo Ä bar", "foo Ä bar"); + decode_ok("foo Ä bar", "foo Ä bar"); + decode_ok("foo პ bar", "foo პ bar"); + + decode_err("&"); + decode_err("&#"); + decode_err("&#;"); + decode_err("foo&"); + decode_err("&bar"); + decode_err("&foo;"); + decode_err("&foobar;"); + decode_err("cbdtéd&ü"); + } + + fn decode_ok(input: &str, expected: &'static str) { + assert_eq!(super::decode(input).unwrap(), expected, "{input:?}"); + } + + fn decode_err(input: &str) { + assert!(super::decode(input).is_err(), "{input:?}"); + } +} diff --git a/instant-xml/src/impls.rs b/instant-xml/src/impls.rs index 6f4d159..10371aa 100644 --- a/instant-xml/src/impls.rs +++ b/instant-xml/src/impls.rs @@ -8,6 +8,7 @@ use std::{any::type_name, marker::PhantomData}; #[cfg(feature = "chrono")] use chrono::{DateTime, NaiveDate, Utc}; +use crate::de::decode; use crate::{Accumulate, Deserializer, Error, FromXml, Id, Kind, Serializer, ToXml}; // Deserializer @@ -513,108 +514,6 @@ fn encode(input: &str) -> Result, Error> { Ok(Cow::Owned(result)) } -pub(crate) fn decode(input: &str) -> Result, Error> { - let mut result = String::with_capacity(input.len()); - let (mut state, mut last_end) = (DecodeState::Normal, 0); - for (i, &b) in input.as_bytes().iter().enumerate() { - // use a state machine to find entities - state = match (state, b) { - (DecodeState::Normal, b'&') => DecodeState::Entity([0; 6], 0), - (DecodeState::Normal, _) => DecodeState::Normal, - (DecodeState::Entity(chars, len), b';') => { - let decoded = match &chars[..len] { - [b'a', b'm', b'p'] => '&', - [b'a', b'p', b'o', b's'] => '\'', - [b'g', b't'] => '>', - [b'l', b't'] => '<', - [b'q', b'u', b'o', b't'] => '"', - [b'#', b'x' | b'X', hex @ ..] => { - // Hexadecimal character reference e.g. "|" -> '|' - str::from_utf8(hex) - .ok() - .and_then(|hex_str| u32::from_str_radix(hex_str, 16).ok()) - .and_then(char::from_u32) - .filter(valid_xml_character) - .ok_or_else(|| { - Error::InvalidEntity( - String::from_utf8_lossy(&chars[..len]).into_owned(), - ) - })? - } - [b'#', decimal @ ..] => { - // Decimal character reference e.g. "Ӓ" -> 'Ӓ' - str::from_utf8(decimal) - .ok() - .and_then(|decimal_str| u32::from_str(decimal_str).ok()) - .and_then(char::from_u32) - .filter(valid_xml_character) - .ok_or_else(|| { - Error::InvalidEntity( - String::from_utf8_lossy(&chars[..len]).into_owned(), - ) - })? - } - _ => { - return Err(Error::InvalidEntity( - String::from_utf8_lossy(&chars[..len]).into_owned(), - )) - } - }; - - let start = i - (len + 1); // current position - (length of entity characters + 1 for '&') - if last_end < start { - // Unwrap should be safe: `last_end` and `start` must be at character boundaries. - result.push_str(input.get(last_end..start).unwrap()); - } - - last_end = i + 1; - result.push(decoded); - DecodeState::Normal - } - (DecodeState::Entity(mut chars, len), b) => { - if len >= 6 { - let mut bytes = Vec::with_capacity(7); - bytes.extend(&chars[..len]); - bytes.push(b); - return Err(Error::InvalidEntity( - String::from_utf8_lossy(&bytes).into_owned(), - )); - } - - chars[len] = b; - DecodeState::Entity(chars, len + 1) - } - }; - } - - // Unterminated entity (& without ;) at end of input - if let DecodeState::Entity(chars, len) = state { - return Err(Error::InvalidEntity( - String::from_utf8_lossy(&chars[..len]).into_owned(), - )); - } - - Ok(match result.is_empty() { - true => Cow::Borrowed(input), - false => { - // Unwrap should be safe: `last_end` and `input.len()` must be at character boundaries. - result.push_str(input.get(last_end..input.len()).unwrap()); - Cow::Owned(result) - } - }) -} - -#[derive(Debug)] -enum DecodeState { - Normal, - Entity([u8; 6], usize), -} - -/// Valid character ranges per https://www.w3.org/TR/xml/#NT-Char -fn valid_xml_character(c: &char) -> bool { - matches!(c, '\u{9}' | '\u{A}' | '\u{D}' | '\u{20}'..='\u{D7FF}' | '\u{E000}'..='\u{FFFD}' | '\u{10000}'..='\u{10FFFF}') -} - impl<'xml, T: FromXml<'xml>> FromXml<'xml> for Vec { #[inline] fn matches(id: Id<'_>, field: Option>) -> bool { @@ -851,50 +750,6 @@ impl<'xml> FromXml<'xml> for IpAddr { mod tests { use super::*; - #[test] - fn test_decode() { - decode_ok("foo", "foo"); - decode_ok("foo & bar", "foo & bar"); - decode_ok("foo < bar", "foo < bar"); - decode_ok("foo > bar", "foo > bar"); - decode_ok("foo " bar", "foo \" bar"); - decode_ok("foo ' bar", "foo ' bar"); - decode_ok("foo &lt; bar", "foo < bar"); - decode_ok("& foo", "& foo"); - decode_ok("foo &", "foo &"); - decode_ok("cbdtéda&sü", "cbdtéda&sü"); - // Decimal character references - decode_ok("Ӓ", "Ӓ"); - decode_ok("foo bar", "foo \t bar"); - decode_ok("foo | bar", "foo | bar"); - decode_ok("foo Ӓ bar", "foo Ӓ bar"); - // Hexadecimal character references - decode_ok("Ä", "Ä"); - decode_ok("Ä", "Ä"); - decode_ok("foo bar", "foo \t bar"); - decode_ok("foo | bar", "foo | bar"); - decode_ok("foo Ä bar", "foo Ä bar"); - decode_ok("foo Ä bar", "foo Ä bar"); - decode_ok("foo პ bar", "foo პ bar"); - - decode_err("&"); - decode_err("&#"); - decode_err("&#;"); - decode_err("foo&"); - decode_err("&bar"); - decode_err("&foo;"); - decode_err("&foobar;"); - decode_err("cbdtéd&ü"); - } - - fn decode_ok(input: &str, expected: &'static str) { - assert_eq!(super::decode(input).unwrap(), expected, "{input:?}"); - } - - fn decode_err(input: &str) { - assert!(super::decode(input).is_err(), "{input:?}"); - } - #[test] fn encode_unicode() { let input = "Iñtërnâ&tiônàlizætiøn";