From 40048a581158053ee89ced4c7cc21804b828b271 Mon Sep 17 00:00:00 2001 From: Ali MJ Al-Nasrawy Date: Mon, 28 Mar 2022 23:58:35 +0300 Subject: [PATCH] rework actix_router::Quoter (#2709) Co-authored-by: Rob Ede --- actix-router/Cargo.toml | 5 + actix-router/benches/quoter.rs | 52 +++++++ actix-router/src/de.rs | 2 +- actix-router/src/quoter.rs | 268 ++++++++++++--------------------- actix-router/src/url.rs | 2 +- 5 files changed, 157 insertions(+), 172 deletions(-) create mode 100644 actix-router/benches/quoter.rs diff --git a/actix-router/Cargo.toml b/actix-router/Cargo.toml index 6fcef125d..76f39f631 100644 --- a/actix-router/Cargo.toml +++ b/actix-router/Cargo.toml @@ -32,7 +32,12 @@ criterion = { version = "0.3", features = ["html_reports"] } firestorm = { version = "0.5", features = ["enable_system_time"] } http = "0.2.5" serde = { version = "1", features = ["derive"] } +percent-encoding = "2.1" [[bench]] name = "router" harness = false + +[[bench]] +name = "quoter" +harness = false diff --git a/actix-router/benches/quoter.rs b/actix-router/benches/quoter.rs new file mode 100644 index 000000000..c18f1620e --- /dev/null +++ b/actix-router/benches/quoter.rs @@ -0,0 +1,52 @@ +use criterion::{black_box, criterion_group, criterion_main, Criterion}; + +use std::borrow::Cow; + +fn compare_quoters(c: &mut Criterion) { + let mut group = c.benchmark_group("Compare Quoters"); + + let quoter = actix_router::Quoter::new(b"", b""); + let path_quoted = (0..=0x7f) + .map(|c| format!("%{:02X}", c)) + .collect::(); + let path_unquoted = ('\u{00}'..='\u{7f}').collect::(); + + group.bench_function("quoter_unquoted", |b| { + b.iter(|| { + for _ in 0..10 { + black_box(quoter.requote(path_unquoted.as_bytes())); + } + }); + }); + + group.bench_function("percent_encode_unquoted", |b| { + b.iter(|| { + for _ in 0..10 { + let decode = percent_encoding::percent_decode(path_unquoted.as_bytes()); + black_box(Into::>::into(decode)); + } + }); + }); + + group.bench_function("quoter_quoted", |b| { + b.iter(|| { + for _ in 0..10 { + black_box(quoter.requote(path_quoted.as_bytes())); + } + }); + }); + + group.bench_function("percent_encode_quoted", |b| { + b.iter(|| { + for _ in 0..10 { + let decode = percent_encoding::percent_decode(path_quoted.as_bytes()); + black_box(Into::>::into(decode)); + } + }); + }); + + group.finish(); +} + +criterion_group!(benches, compare_quoters); +criterion_main!(benches); diff --git a/actix-router/src/de.rs b/actix-router/src/de.rs index efafd08db..55fcdc912 100644 --- a/actix-router/src/de.rs +++ b/actix-router/src/de.rs @@ -7,7 +7,7 @@ use crate::path::{Path, PathIter}; use crate::{Quoter, ResourcePath}; thread_local! { - static FULL_QUOTER: Quoter = Quoter::new(b"+/%", b""); + static FULL_QUOTER: Quoter = Quoter::new(b"", b""); } macro_rules! unsupported_type { diff --git a/actix-router/src/quoter.rs b/actix-router/src/quoter.rs index 8a1e99e1d..6c929d3ac 100644 --- a/actix-router/src/quoter.rs +++ b/actix-router/src/quoter.rs @@ -1,132 +1,89 @@ -#[allow(dead_code)] -const GEN_DELIMS: &[u8] = b":/?#[]@"; - -#[allow(dead_code)] -const SUB_DELIMS_WITHOUT_QS: &[u8] = b"!$'()*,"; - -#[allow(dead_code)] -const SUB_DELIMS: &[u8] = b"!$'()*,+?=;"; - -#[allow(dead_code)] -const RESERVED: &[u8] = b":/?#[]@!$'()*,+?=;"; - -#[allow(dead_code)] -const UNRESERVED: &[u8] = b"abcdefghijklmnopqrstuvwxyz - ABCDEFGHIJKLMNOPQRSTUVWXYZ - 1234567890 - -._~"; - -const ALLOWED: &[u8] = b"abcdefghijklmnopqrstuvwxyz - ABCDEFGHIJKLMNOPQRSTUVWXYZ - 1234567890 - -._~ - !$'()*,"; - -const QS: &[u8] = b"+&=;b"; - -/// A quoter +/// Partial percent-decoding. +/// +/// Performs percent-decoding on a slice but can selectively skip decoding certain sequences. +/// +/// # Examples +/// ``` +/// # use actix_router::Quoter; +/// // + is set as a protected character and will not be decoded... +/// let q = Quoter::new(&[], b"+"); +/// +/// // ...but the other encoded characters (like the hyphen below) will. +/// assert_eq!(q.requote(b"/a%2Db%2Bc").unwrap(), b"/a-b%2Bc"); +/// ``` pub struct Quoter { - /// Simple bit-map of safe values in the 0-127 ASCII range. - safe_table: [u8; 16], - /// Simple bit-map of protected values in the 0-127 ASCII range. - protected_table: [u8; 16], + protected_table: AsciiBitmap, } impl Quoter { - pub fn new(safe: &[u8], protected: &[u8]) -> Quoter { - let mut quoter = Quoter { - safe_table: [0; 16], - protected_table: [0; 16], - }; - - // prepare safe table - for ch in 0..128 { - if ALLOWED.contains(&ch) { - set_bit(&mut quoter.safe_table, ch); - } - - if QS.contains(&ch) { - set_bit(&mut quoter.safe_table, ch); - } - } - - for &ch in safe { - set_bit(&mut quoter.safe_table, ch) - } + /// Constructs a new `Quoter` instance given a set of protected ASCII bytes. + /// + /// The first argument is ignored but is kept for backward compatibility. + /// + /// # Panics + /// Panics if any of the `protected` bytes are not in the 0-127 ASCII range. + pub fn new(_: &[u8], protected: &[u8]) -> Quoter { + let mut protected_table = AsciiBitmap::default(); // prepare protected table for &ch in protected { - set_bit(&mut quoter.safe_table, ch); - set_bit(&mut quoter.protected_table, ch); + protected_table.set_bit(ch); } - quoter + Quoter { protected_table } } - /// Decodes safe percent-encoded sequences from `val`. - /// - /// Returns `None` when no modification to the original byte string was required. - /// - /// Non-ASCII bytes are accepted as valid input. - /// - /// Behavior for invalid/incomplete percent-encoding sequences is unspecified and may include - /// removing the invalid sequence from the output or passing it as-is. - pub fn requote(&self, val: &[u8]) -> Option> { - let mut has_pct = 0; - let mut pct = [b'%', 0, 0]; - let mut idx = 0; - let mut cloned: Option> = None; - - let len = val.len(); - - while idx < len { - let ch = val[idx]; - - if has_pct != 0 { - pct[has_pct] = val[idx]; - has_pct += 1; - - if has_pct == 3 { - has_pct = 0; - let buf = cloned.as_mut().unwrap(); - - if let Some(ch) = hex_pair_to_char(pct[1], pct[2]) { - if ch < 128 { - if bit_at(&self.protected_table, ch) { - buf.extend_from_slice(&pct); - idx += 1; - continue; - } - - if bit_at(&self.safe_table, ch) { - buf.push(ch); - idx += 1; - continue; - } - } - - buf.push(ch); - } else { - buf.extend_from_slice(&pct[..]); - } + /// Decodes the next escape sequence, if any, and advances `val`. + #[inline(always)] + fn decode_next<'a>(&self, val: &mut &'a [u8]) -> Option<(&'a [u8], u8)> { + for i in 0..val.len() { + if let (prev, [b'%', p1, p2, rem @ ..]) = val.split_at(i) { + if let Some(ch) = hex_pair_to_char(*p1, *p2) + // ignore protected ascii bytes + .filter(|&ch| !(ch < 128 && self.protected_table.bit_at(ch))) + { + *val = rem; + return Some((prev, ch)); } - } else if ch == b'%' { - has_pct = 1; - - if cloned.is_none() { - let mut c = Vec::with_capacity(len); - c.extend_from_slice(&val[..idx]); - cloned = Some(c); - } - } else if let Some(ref mut cloned) = cloned { - cloned.push(ch) } - - idx += 1; } - cloned + None + } + + /// Partially percent-decodes the given bytes. + /// + /// Escape sequences of the protected set are *not* decoded. + /// + /// Returns `None` when no modification to the original bytes was required. + /// + /// Invalid/incomplete percent-encoding sequences are passed unmodified. + pub fn requote(&self, val: &[u8]) -> Option> { + let mut remaining = val; + + // early return indicates that no percent-encoded sequences exist and we can skip allocation + let (pre, decoded_char) = self.decode_next(&mut remaining)?; + + // decoded output will always be shorter than the input + let mut decoded = Vec::::with_capacity(val.len()); + + // push first segment and decoded char + decoded.extend_from_slice(pre); + decoded.push(decoded_char); + + // decode and push rest of segments and decoded chars + while let Some((prev, ch)) = self.decode_next(&mut remaining) { + // this ugly conditional achieves +50% perf in cases where this is a tight loop. + if !prev.is_empty() { + decoded.extend_from_slice(prev); + } + decoded.push(ch); + } + + decoded.extend_from_slice(remaining); + + Some(decoded) } pub(crate) fn requote_str_lossy(&self, val: &str) -> Option { @@ -135,24 +92,6 @@ impl Quoter { } } -/// Converts an ASCII character in the hex-encoded set (`0-9`, `A-F`, `a-f`) to its integer -/// representation from `0x0`–`0xF`. -/// -/// - `0x30 ('0') => 0x0` -/// - `0x39 ('9') => 0x9` -/// - `0x41 ('a') => 0xA` -/// - `0x61 ('A') => 0xA` -/// - `0x46 ('f') => 0xF` -/// - `0x66 ('F') => 0xF` -fn from_ascii_hex(v: u8) -> Option { - match v { - b'0'..=b'9' => Some(v - 0x30), // ord('0') == 0x30 - b'A'..=b'F' => Some(v - 0x41 + 10), // ord('A') == 0x41 - b'a'..=b'f' => Some(v - 0x61 + 10), // ord('a') == 0x61 - _ => None, - } -} - /// Decode a ASCII hex-encoded pair to an integer. /// /// Returns `None` if either portion of the decoded pair does not evaluate to a valid hex value. @@ -160,64 +99,52 @@ fn from_ascii_hex(v: u8) -> Option { /// - `0x33 ('3'), 0x30 ('0') => 0x30 ('0')` /// - `0x34 ('4'), 0x31 ('1') => 0x41 ('A')` /// - `0x36 ('6'), 0x31 ('1') => 0x61 ('a')` +#[inline(always)] fn hex_pair_to_char(d1: u8, d2: u8) -> Option { - let (d_high, d_low) = (from_ascii_hex(d1)?, from_ascii_hex(d2)?); + let d_high = char::from(d1).to_digit(16)?; + let d_low = char::from(d2).to_digit(16)?; // left shift high nibble by 4 bits - Some(d_high << 4 | d_low) + Some((d_high as u8) << 4 | (d_low as u8)) } -/// Sets bit in given bit-map to 1=true. -/// -/// # Panics -/// Panics if `ch` index is out of bounds. -fn set_bit(array: &mut [u8], ch: u8) { - array[(ch >> 3) as usize] |= 0b1 << (ch & 0b111) +#[derive(Debug, Default, Clone)] +struct AsciiBitmap { + array: [u8; 16], } -/// Returns true if bit to true in given bit-map. -/// -/// # Panics -/// Panics if `ch` index is out of bounds. -fn bit_at(array: &[u8], ch: u8) -> bool { - array[(ch >> 3) as usize] & (0b1 << (ch & 0b111)) != 0 +impl AsciiBitmap { + /// Sets bit in given bit-map to 1=true. + /// + /// # Panics + /// Panics if `ch` index is out of bounds. + fn set_bit(&mut self, ch: u8) { + self.array[(ch >> 3) as usize] |= 0b1 << (ch & 0b111) + } + + /// Returns true if bit to true in given bit-map. + /// + /// # Panics + /// Panics if `ch` index is out of bounds. + fn bit_at(&self, ch: u8) -> bool { + self.array[(ch >> 3) as usize] & (0b1 << (ch & 0b111)) != 0 + } } #[cfg(test)] mod tests { use super::*; - #[test] - fn hex_encoding() { - let hex = b"0123456789abcdefABCDEF"; - - for i in 0..256 { - let c = i as u8; - if hex.contains(&c) { - assert!(from_ascii_hex(c).is_some()) - } else { - assert!(from_ascii_hex(c).is_none()) - } - } - - let expected = [ - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, - ]; - for i in 0..hex.len() { - assert_eq!(from_ascii_hex(hex[i]).unwrap(), expected[i]); - } - } - #[test] fn custom_quoter() { let q = Quoter::new(b"", b"+"); assert_eq!(q.requote(b"/a%25c").unwrap(), b"/a%c"); - assert_eq!(q.requote(b"/a%2Bc").unwrap(), b"/a%2Bc"); + assert_eq!(q.requote(b"/a%2Bc"), None); let q = Quoter::new(b"%+", b"/"); assert_eq!(q.requote(b"/a%25b%2Bc").unwrap(), b"/a%b+c"); - assert_eq!(q.requote(b"/a%2fb").unwrap(), b"/a%2fb"); - assert_eq!(q.requote(b"/a%2Fb").unwrap(), b"/a%2Fb"); + assert_eq!(q.requote(b"/a%2fb"), None); + assert_eq!(q.requote(b"/a%2Fb"), None); assert_eq!(q.requote(b"/a%0Ab").unwrap(), b"/a\nb"); assert_eq!(q.requote(b"/a%FE\xffb").unwrap(), b"/a\xfe\xffb"); assert_eq!(q.requote(b"/a\xfe\xffb"), None); @@ -233,7 +160,8 @@ mod tests { #[test] fn invalid_sequences() { let q = Quoter::new(b"%+", b"/"); - assert_eq!(q.requote(b"/a%2x%2X%%").unwrap(), b"/a%2x%2X"); + assert_eq!(q.requote(b"/a%2x%2X%%"), None); + assert_eq!(q.requote(b"/a%20%2X%%").unwrap(), b"/a %2X%%"); } #[test] diff --git a/actix-router/src/url.rs b/actix-router/src/url.rs index e7dda3fca..8ac033861 100644 --- a/actix-router/src/url.rs +++ b/actix-router/src/url.rs @@ -3,7 +3,7 @@ use crate::ResourcePath; use crate::Quoter; thread_local! { - static DEFAULT_QUOTER: Quoter = Quoter::new(b"@:", b"%/+"); + static DEFAULT_QUOTER: Quoter = Quoter::new(b"", b"%/+"); } #[derive(Debug, Clone, Default)]