Accept browser-sent unencoded query characters.

Closes #941.

Co-authored-by: Vladimir Ignatev <ya.na.pochte@gmail.com>
This commit is contained in:
Sergio Benitez 2020-11-02 14:10:39 -08:00
parent 949bb01e2d
commit b5e4dded8a
4 changed files with 132 additions and 123 deletions

View File

@ -3,7 +3,7 @@ use pear::input::{Extent, Rewind};
use pear::macros::{parser, switch, parse_current_marker, parse_error, parse_try};
use crate::uri::{Uri, Origin, Authority, Absolute, Host};
use crate::parse::uri::tables::{is_reg_name_char, is_pchar, is_pchar_or_rchar};
use crate::parse::uri::tables::{is_reg_name_char, is_pchar, is_qchar, is_rchar};
use crate::parse::uri::RawInput;
type Result<'a, T> = pear::input::Result<T, RawInput<'a>>;
@ -15,12 +15,14 @@ pub fn uri<'a>(input: &mut RawInput<'a>) -> Result<'a, Uri<'a>> {
1 => switch! {
eat(b'*') => Uri::Asterisk,
eat(b'/') => Uri::Origin(Origin::new::<_, &str>("/", None)),
eat(b'%') => parse_error!("'%' is not a valid URI")?,
_ => unsafe {
// the `is_reg_name_char` guarantees ASCII
let host = Host::Raw(take_n_if(1, is_reg_name_char)?);
Uri::Authority(Authority::raw(input.start.into(), None, host, None))
}
},
// NOTE: We accept '%' even when it isn't followed by two hex digits.
_ => switch! {
peek(b'/') => Uri::Origin(origin()?),
_ => absolute_or_authority()?
@ -30,30 +32,31 @@ pub fn uri<'a>(input: &mut RawInput<'a>) -> Result<'a, Uri<'a>> {
#[parser]
pub fn origin<'a>(input: &mut RawInput<'a>) -> Result<'a, Origin<'a>> {
(peek(b'/')?, path_and_query(is_pchar)?).1
(peek(b'/')?, path_and_query(is_pchar, is_qchar)?).1
}
#[parser]
pub fn rocket_route_origin<'a>(input: &mut RawInput<'a>) -> Result<'a, Origin<'a>> {
(peek(b'/')?, path_and_query(is_pchar_or_rchar)?).1
fn is_pchar_or_rchar(c: &u8) -> bool { is_pchar(c) || is_rchar(c) }
fn is_qchar_or_rchar(c: &u8) -> bool { is_qchar(c) || is_rchar(c) }
(peek(b'/')?, path_and_query(is_pchar_or_rchar, is_qchar_or_rchar)?).1
}
#[parser]
fn path_and_query<'a, F>(input: &mut RawInput<'a>, is_good_char: F) -> Result<'a, Origin<'a>>
where F: Fn(&u8) -> bool + Copy
fn path_and_query<'a, F, Q>(
input: &mut RawInput<'a>,
is_path_char: F,
is_query_char: Q
) -> Result<'a, Origin<'a>>
where F: Fn(&u8) -> bool + Copy, Q: Fn(&u8) -> bool + Copy
{
let path = take_while(is_good_char)?;
// FIXME: this works on nightly but not stable! `Span` issues?
// let query = parse_try!(eat(b'?') => take_while(|c| is_good_char(c) || *c == b'?')?);
let query = switch! {
eat(b'?') => Some(take_while(|c| is_good_char(c) || *c == b'?')?),
_ => None
};
let path = take_while(is_path_char)?;
let query = parse_try!(eat(b'?') => take_while(is_query_char)?);
if path.is_empty() && query.is_none() {
parse_error!("expected path or query, found neither")?
} else {
// We know the string is ASCII because of the `is_good_char` checks above.
// We know the string is ASCII because of the `is_path_char` and `is_query_char` checks above.
Ok(unsafe {Origin::raw(input.start.into(), path.into(), query.map(|q| q.into())) })
}
}
@ -115,10 +118,10 @@ fn absolute<'a>(
}
};
let path_and_query = parse_try!(path_and_query(is_pchar));
let path_and_query = parse_try!(path_and_query(is_pchar, is_qchar));
(Some(authority), path_and_query)
},
eat(b':') => (None, Some(path_and_query(is_pchar)?)),
eat(b':') => (None, Some(path_and_query(is_pchar, is_qchar)?)),
_ => parse_error!("expected ':' but none was found")?
};

View File

@ -1,118 +1,78 @@
pub(crate) const PATH_CHARS: [u8; 256] = [
// 0 1 2 3 4 5 6 7 8 9
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
0, 0, 0, b'!', 0, 0, b'$', b'%', b'&', b'\'', // 3x
b'(', b')', b'*', b'+', b',', b'-', b'.', b'/', b'0', b'1', // 4x
b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b':', b';', // 5x
0, b'=', 0, 0, b'@', b'A', b'B', b'C', b'D', b'E', // 6x
b'F', b'G', b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O', // 7x
b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W', b'X', b'Y', // 8x
b'Z', 0, 0, 0, 0, b'_', 0, b'a', b'b', b'c', // 9x
b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', // 10x
b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', // 11x
b'x', b'y', b'z', 0, 0, 0, b'~', 0, 0, 0, // 12x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 13x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 14x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 15x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 17x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 19x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 21x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 22x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 23x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 24x
0, 0, 0, 0, 0, 0, // 25x
/// Builds a 256-entry lookup table from the given byte sets.
///
/// For every byte `b` found in any slice of `sets`, the entry at index `b` is
/// set to `b` itself; every other entry is left as `0`. A zero entry therefore
/// means "not a member".
///
/// Plain index-driven `while` loops are used so the function can stay `const`
/// and be evaluated when the table constants are initialized.
const fn char_table(sets: &[&[u8]]) -> [u8; 256] {
    let mut lookup = [0u8; 256];

    let mut set_idx = 0;
    while set_idx < sets.len() {
        let members: &[u8] = sets[set_idx];

        let mut pos = 0;
        while pos < members.len() {
            let byte: u8 = members[pos];
            // Store the byte at its own index so membership is `entry != 0`.
            lookup[byte as usize] = byte;
            pos += 1;
        }

        set_idx += 1;
    }

    lookup
}
/// ASCII letters (upper- then lowercase), digits, and `-`, `.`, `_`, `~`.
const UNRESERVED: &[u8] =
    b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~";
#[inline(always)]
pub fn is_pchar(&c: &u8) -> bool {
PATH_CHARS[c as usize] != 0
}
pub(crate) const ROUTE_CHARS: [u8; 256] = [
// 0 1 2 3 4 5 6 7 8 9
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 5x
b'<', 0, b'>', 0, 0, 0, 0, 0, 0, 0, // 6x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 7x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 11x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 12x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 13x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 14x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 15x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 17x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 19x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 21x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 22x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 23x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 24x
0, 0, 0, 0, 0, 0, // 25x
/// The `%` sign followed by the ASCII hex digits (`A-F`, `a-f`, `0-9`).
const PCT_ENCODED: &[u8] = b"%ABCDEFabcdef0123456789";
#[inline(always)]
pub fn is_rchar(&c: &u8) -> bool {
ROUTE_CHARS[c as usize] != 0
}
#[inline(always)]
pub fn is_pchar_or_rchar(c: &u8) -> bool {
is_pchar(c) || is_rchar(c)
}
const REG_CHARS: [u8; 256] = [
// 0 1 2 3 4 5 6 7 8 9
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
0, 0, 0, b'!', 0, 0, b'$', 0, b'&', b'\'', // 3x
b'(', b')', b'*', b'+', b',', b'-', b'.', 0, b'0', b'1', // 4x
b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', 0, b';', // 5x
0, b'=', 0, 0, 0, b'A', b'B', b'C', b'D', b'E', // 6x
b'F', b'G', b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O', // 7x
b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W', b'X', b'Y', // 8x
b'Z', 0, 0, 0, 0, b'_', 0, b'a', b'b', b'c', // 9x
b'd', b'e', b'f', b'g', b'h', b'i', b'j', b'k', b'l', b'm', // 10x
b'n', b'o', b'p', b'q', b'r', b's', b't', b'u', b'v', b'w', // 11x
b'x', b'y', b'z', 0, 0, 0, b'~', 0, 0, 0, // 12x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 13x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 14x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 15x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 17x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 18x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 19x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 21x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 22x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 23x
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 24x
0, 0, 0, 0, 0, 0 // 25x
/// The delimiter bytes `!`, `$`, `&`, `'`, `(`, `)`, `*`, `+`, `,`, `;`, `=`.
const SUB_DELIMS: &[u8] = b"!$&'()*+,;=";
/// Lookup table of bytes accepted in a path: everything in `UNRESERVED`,
/// `PCT_ENCODED`, and `SUB_DELIMS`, plus `:`, `@`, and `/`.
pub const PATH_CHARS: [u8; 256] =
    char_table(&[UNRESERVED, PCT_ENCODED, SUB_DELIMS, b":@/"]);
/// Lookup table containing only `<` and `>`.
const ROUTE_CHARS: [u8; 256] = char_table(&[b"<>"]);
/// Lookup table of bytes accepted in a query: everything in `PATH_CHARS`,
/// plus `/` and `?`, plus a handful of bytes that are accepted unencoded.
const QUERY_CHARS: [u8; 256] = char_table(&[
    &PATH_CHARS,
    b"/?",
    // NOTE: RFC 7230/3986 do _not_ permit the bytes below. Browsers routinely
    // send them unencoded anyway, so we accept them to handle real-world
    // requests.
    b"{}[]\\^`|",
]);
/// Lookup table built from the `UNRESERVED`, `PCT_ENCODED`, and `SUB_DELIMS`
/// byte sets; consulted by `is_reg_name_char`.
const REG_NAME_CHARS: [u8; 256] = char_table(&[
UNRESERVED, PCT_ENCODED, SUB_DELIMS
]);
#[inline(always)]
pub fn is_reg_name_char(&c: &u8) -> bool {
REG_CHARS[c as usize] != 0
}
/// Returns `true` if `c` has a nonzero entry in `PATH_CHARS`.
pub const fn is_pchar(&c: &u8) -> bool { PATH_CHARS[c as usize] != 0 }
/// Returns `true` if `c` has a nonzero entry in `ROUTE_CHARS`.
#[inline(always)]
pub const fn is_rchar(&c: &u8) -> bool { ROUTE_CHARS[c as usize] != 0 }
/// Returns `true` if `c` has a nonzero entry in `QUERY_CHARS`.
#[inline(always)]
pub const fn is_qchar(&c: &u8) -> bool { QUERY_CHARS[c as usize] != 0 }
/// Returns `true` if `c` has a nonzero entry in `REG_NAME_CHARS`.
#[inline(always)]
pub const fn is_reg_name_char(&c: &u8) -> bool { REG_NAME_CHARS[c as usize] != 0 }
#[cfg(test)]
mod tests {
fn test_char_table(table: &[u8]) {
for (i, &v) in table.iter().enumerate() {
if v != 0 && v != 1 {
if v != 0 {
assert_eq!(i, v as usize);
}
}
@ -121,6 +81,8 @@ mod tests {
#[test]
fn check_tables() {
test_char_table(&super::PATH_CHARS[..]);
test_char_table(&super::REG_CHARS[..]);
test_char_table(&super::QUERY_CHARS[..]);
test_char_table(&super::ROUTE_CHARS[..]);
test_char_table(&super::REG_NAME_CHARS[..]);
}
}

View File

@ -37,6 +37,19 @@ macro_rules! assert_no_parse {
($($from:expr),+,) => (assert_no_parse!($($from),+))
}
/// Asserts that every given input parses successfully with `from_str`.
///
/// Inputs are tried in order. On the first failure the offending input is
/// printed and the macro panics with the parse error. A trailing comma after
/// the last input is allowed.
macro_rules! assert_parse {
    ($($input:expr),+ $(,)?) => ({
        $(
            match from_str($input) {
                Ok(_) => {}
                Err(error) => {
                    println!("{:?} failed to parse", $input);
                    panic!("{}", error);
                }
            }
        )+
    });
}
macro_rules! assert_displays_eq {
($($string:expr),+) => (
$(
@ -89,6 +102,25 @@ fn bad_parses() {
assert_no_parse!("://z7:77777777777777777777777777777`77777777777");
}
// Sample inputs whose paths or queries contain bytes such as `{`, `}`, `[`,
// `]`, `|`, `\`, `^`, a backtick, `@`, `+`, and `'`; all of them must parse.
#[test]
fn test_parse_issue_924_samples() {
assert_parse!("/path?param={value}",
"/path/?param={value}",
"/some/path/?param={forgot-to-replace-placeholder}",
"/path?param={value}&onemore={value}",
"/some/path/?tags=[]", "/some/path/?tags=[rocket,is,perfect]",
"/some/path/?tags=[rocket|is\\perfect^`]&users={arenot}",
"/rocket/@user/",
"/rocket/@user/?tags=[rocket,%F0%9F%98%8B]",
"/rocket/?username=@sergio&tags=[rocket,%F0%9F%98%8B]",
"/rocket/?Key+With+Spaces=value+too",
"/rocket/?Key+With+\'",
// `>` is accepted when percent-encoded as `%3E`...
"/rocket/?query=%3E5",
);
// ...but an unencoded `>` in the query, and a `#`, must still be rejected.
assert_no_parse!("/rocket/?query=>5", "/?#foo");
}
#[test]
fn single_byte() {
assert_parse_eq!(
@ -116,6 +148,7 @@ fn origin() {
"/hi%20there?a=b&c=d" => uri_origin("/hi%20there", Some("a=b&c=d")),
"/c/d/fa/b/c?abc" => uri_origin("/c/d/fa/b/c", Some("abc")),
"/xn--ls8h?emoji=poop" => uri_origin("/xn--ls8h", Some("emoji=poop")),
"/?t=[rocket|is\\here^`]&{ok}" => uri_origin("/", Some("t=[rocket|is\\here^`]&{ok}")),
);
}

View File

@ -28,10 +28,21 @@ use crate::uri::encoding::{percent_encode, DEFAULT_ENCODE_SET};
/// ## Parsing
///
/// The `Uri` type implements a full, zero-allocation, zero-copy [RFC 7230]
/// compliant parser. To parse an `&str` into a `Uri`, use the [`Uri::parse()`]
/// method. Alternatively, you may also use the `TryFrom<&str>` and
/// `TryFrom<String>` trait implementation. To inspect the parsed type, match on
/// the resulting `enum` and use the methods of the internal structure.
/// compliant "request target" parser with limited liberties for real-world
/// deviations. In particular, the parser deviates as follows:
///
/// * It accepts a `%` character that is not followed by two hex digits,
/// unless that `%` is the only character in the URI.
///
/// * It accepts the following additional unencoded characters in query parts,
/// to match real-world browser behavior:
///
/// `{`, `}`, `[`, `]`, `\`, `^`, <code>&#96;</code>, `|`
///
/// To parse an `&str` into a `Uri`, use [`Uri::parse()`]. Alternatively, you
/// may also use the `TryFrom<&str>` and `TryFrom<String>` trait implementations.
/// To inspect the parsed type, match on the resulting `enum` and use the
/// methods of the internal structure.
///
/// [RFC 7230]: https://tools.ietf.org/html/rfc7230
///