escapist-0.0.2/.cargo_vcs_info.json0000644000000001360000000000100126460ustar { "git": { "sha1": "2576f573f3e8d36d472eb349135cb81837861c3b" }, "path_in_vcs": "" }escapist-0.0.2/Cargo.toml0000644000000017650000000000100106550ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" name = "escapist" version = "0.0.2" authors = ["Garen J. Torikian "] include = [ "src/**/*", "Cargo.toml", "README.md", "LICENSE", ] description = "This library is a minimal library for escaping HTML and href attributes; it can also unescape HTML." readme = "README.md" keywords = [ "html", "houdini", "escape", "unescape", ] categories = ["encoding"] license = "MIT" repository = "https://github.com/gjtorikian/escapist" [dependencies.entities] version = "1.0.1" escapist-0.0.2/Cargo.toml.orig000064400000000000000000000007651046102023000143350ustar 00000000000000[package] name = "escapist" version = "0.0.2" authors = ["Garen J. Torikian "] edition = "2021" repository = "https://github.com/gjtorikian/escapist" keywords = ["html", "houdini", "escape", "unescape"] categories = ["encoding"] description = "This library is a minimal library for escaping HTML and href attributes; it can also unescape HTML." readme = "README.md" license = "MIT" include = ["src/**/*", "Cargo.toml", "README.md", "LICENSE"] [dependencies] entities = "1.0.1" escapist-0.0.2/README.md000064400000000000000000000003321046102023000127130ustar 00000000000000# escapist Extremely minimal HTML/`href` escaping/unescaping. Emphasis on minimal. I wouldn't really use this if I were you. 
// -----------------------------------------------------------------------------
// `escape` module: src/escape/href.rs, src/escape/html.rs and src/escape/mod.rs,
// rendered as one inline module tree. Per the crate README these functions are
// essentially a port of the Houdini (https://github.com/vmg/houdini) escapers.
// -----------------------------------------------------------------------------
mod escape {
    // ---- src/escape/href.rs -------------------------------------------------
    mod href {
        use crate::StrWrite;
        use std::io;
        use std::str::from_utf8;

        /// Per-byte classification for ASCII input: `1` means the byte is
        /// written through unchanged by [`escape_href`], `0` means it is
        /// replaced by an escape sequence.
        #[rustfmt::skip]
        static HREF_SAFE: [u8; 128] = [
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
            0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
        ];

        /// Upper-case hexadecimal digits used for `%XX` escapes.
        static HEX_CHARS: &[u8] = b"0123456789ABCDEF";
        /// Replacement written for `&`.
        static AMP_ESCAPE: &str = "&amp;";
        /// Replacement written for `'`.
        static SINGLE_QUOTE_ESCAPE: &str = "&#x27;";

        /// Writes an href to the buffer, escaping href unsafe bytes.
        ///
        /// `&` and `'` are written as HTML entities; every other byte that is
        /// not marked safe in `HREF_SAFE` (including every byte `>= 0x80`) is
        /// written as an upper-case `%XX` percent escape.
        ///
        /// # Errors
        ///
        /// Returns the first error reported by `w`.
        pub fn escape_href<W>(mut w: W, s: &str) -> io::Result<()>
        where
            W: StrWrite,
        {
            let bytes = s.as_bytes();
            // Start of the pending run of safe bytes not yet written out.
            let mut mark = 0;

            for (i, &c) in bytes.iter().enumerate() {
                if c >= 0x80 || HREF_SAFE[c as usize] == 0 {
                    // Flush the safe run that precedes this byte. Every byte
                    // in `mark..i` is safe ASCII, so both ends are char
                    // boundaries of `s`.
                    if mark < i {
                        w.write_str(&s[mark..i])?;
                    }
                    match c {
                        b'&' => w.write_str(AMP_ESCAPE)?,
                        b'\'' => w.write_str(SINGLE_QUOTE_ESCAPE)?,
                        _ => {
                            let buf = [
                                b'%',
                                HEX_CHARS[usize::from(c >> 4)],
                                HEX_CHARS[usize::from(c & 0xF)],
                            ];
                            let escaped =
                                from_utf8(&buf).expect("a percent escape is pure ASCII");
                            w.write_str(escaped)?;
                        }
                    }
                    // Exactly one input byte was consumed by the escape.
                    mark = i + 1;
                }
            }
            w.write_str(&s[mark..])
        }
    }

    // ---- src/escape/html.rs -------------------------------------------------
    mod html {
        use std::io;

        use crate::StrWrite;

        /// Builds the byte -> index-into-`HTML_ESCAPES` table. Index `0`
        /// means "no escaping needed".
        const fn create_html_escape_table() -> [u8; 256] {
            let mut table = [0; 256];
            table[b'"' as usize] = 1;
            table[b'&' as usize] = 2;
            table[b'<' as usize] = 3;
            table[b'>' as usize] = 4;
            table
        }

        static HTML_ESCAPE_TABLE: [u8; 256] = create_html_escape_table();

        /// Replacement strings, indexed by the values in `HTML_ESCAPE_TABLE`.
        static HTML_ESCAPES: [&str; 5] = ["", "&quot;", "&amp;", "&lt;", "&gt;"];

        /// Writes the given string to the Write sink, replacing special HTML bytes
        /// (<, >, &, ") by escape sequences.
        ///
        /// # Errors
        ///
        /// Returns the first error reported by `w`.
        pub fn escape_html<W: StrWrite>(w: W, s: &str) -> io::Result<()> {
            #[cfg(all(target_arch = "x86_64", feature = "simd"))]
            {
                simd::escape_html(w, s)
            }
            #[cfg(not(all(target_arch = "x86_64", feature = "simd")))]
            {
                escape_html_scalar(w, s)
            }
        }

        /// Portable implementation of [`escape_html`]: scans for the next
        /// special byte, flushes the text before it, then writes its
        /// replacement.
        fn escape_html_scalar<W: StrWrite>(mut w: W, s: &str) -> io::Result<()> {
            let bytes = s.as_bytes();
            // Start of the pending run of bytes not yet written out.
            let mut mark = 0;
            let mut i = 0;

            while let Some(pos) = bytes[i..]
                .iter()
                .position(|&c| HTML_ESCAPE_TABLE[c as usize] != 0)
            {
                i += pos;
                let escape = HTML_ESCAPE_TABLE[bytes[i] as usize];
                let escape_seq = HTML_ESCAPES[escape as usize];
                w.write_str(&s[mark..i])?;
                w.write_str(escape_seq)?;
                i += 1;
                mark = i; // all escaped characters are ASCII
            }
            w.write_str(&s[mark..])
        }

        #[cfg(all(target_arch = "x86_64", feature = "simd"))]
        mod simd {
            use super::StrWrite;
            use std::arch::x86_64::*;
            use std::io;
            use std::mem::size_of;

            const VECTOR_SIZE: usize = size_of::<__m128i>();

            /// SSSE3-accelerated variant of `escape_html`; falls back to the
            /// scalar implementation when it cannot be used.
            pub(super) fn escape_html<W: StrWrite>(mut w: W, s: &str) -> io::Result<()> {
                // The SIMD accelerated code uses the PSHUFB instruction, which is part
                // of the SSSE3 instruction set. Further, we can only use this code if
                // the buffer is at least one VECTOR_SIZE in length to prevent reading
                // out of bounds. If either of these conditions is not met, we fall back
                // to scalar code.
                if is_x86_feature_detected!("ssse3") && s.len() >= VECTOR_SIZE {
                    let bytes = s.as_bytes();
                    let mut mark = 0;

                    // SAFETY: SSSE3 support and `bytes.len() >= VECTOR_SIZE` were
                    // checked just above, which is what `foreach_special_simd`
                    // requires. The callback only receives indices of bytes inside
                    // `bytes` that are one of the ASCII specials, so `i` is in
                    // bounds and `mark..i` / `mark..` lie on char boundaries.
                    unsafe {
                        foreach_special_simd(bytes, 0, |i| {
                            let escape_ix = *bytes.get_unchecked(i) as usize;
                            let replacement =
                                super::HTML_ESCAPES[super::HTML_ESCAPE_TABLE[escape_ix] as usize];
                            w.write_str(s.get_unchecked(mark..i))?;
                            mark = i + 1; // all escaped characters are ASCII
                            w.write_str(replacement)
                        })?;
                        w.write_str(s.get_unchecked(mark..))
                    }
                } else {
                    super::escape_html_scalar(w, s)
                }
            }

            /// Creates the lookup table for use in `compute_mask`.
            const fn create_lookup() -> [u8; 16] {
                let mut table = [0; 16];
                table[(b'<' & 0x0f) as usize] = b'<';
                table[(b'>' & 0x0f) as usize] = b'>';
                table[(b'&' & 0x0f) as usize] = b'&';
                table[(b'"' & 0x0f) as usize] = b'"';
                table[0] = 0b0111_1111;
                table
            }

            /// Computes a byte mask at given offset in the byte buffer. Its first 16 (least
            /// significant) bits correspond to whether there is an HTML special byte
            /// (&, <, ", >) at the 16 bytes `bytes[offset..]`. For example, the mask
            /// `(1 << 3)` states that there is an HTML byte at `offset + 3`.
            ///
            /// # Safety
            ///
            /// It is only safe to call this function when
            /// `bytes.len() >= offset + VECTOR_SIZE` and the CPU supports SSSE3.
            #[target_feature(enable = "ssse3")]
            unsafe fn compute_mask(bytes: &[u8], offset: usize) -> i32 {
                debug_assert!(bytes.len() >= offset + VECTOR_SIZE);

                let table = create_lookup();
                let lookup = _mm_loadu_si128(table.as_ptr() as *const __m128i);
                let raw_ptr = bytes.as_ptr().add(offset) as *const __m128i;

                // Load the vector from memory.
                let vector = _mm_loadu_si128(raw_ptr);
                // We take the least significant 4 bits of every byte and use them as
                // indices to map into the lookup vector.
                // Note that shuffle maps bytes with their most significant bit set to
                // lookup[0]. Bytes that share their lower nibble with an HTML special
                // byte get mapped to that corresponding special byte. Note that all HTML
                // special bytes have distinct lower nibbles. Other bytes either get
                // mapped to 0 or 127.
                let expected = _mm_shuffle_epi8(lookup, vector);
                // We compare the original vector to the mapped output. Bytes that shared
                // a lower nibble with an HTML special byte match *only* if they are that
                // special byte. Bytes that have either a 0 lower nibble or their most
                // significant bit set were mapped to 127 and will hence never match. All
                // other bytes have non-zero lower nibbles but were mapped to 0 and will
                // therefore also not match.
                let matches = _mm_cmpeq_epi8(expected, vector);

                // Translate matches to a bitmask, where every 1 corresponds to a HTML
                // special character and a 0 is a non-HTML byte.
                _mm_movemask_epi8(matches)
            }

            /// Calls the given function with the index of every byte in the given byteslice
            /// that is either ", &, <, or > and for no other byte.
            ///
            /// # Safety
            ///
            /// Make sure to only call this when `bytes.len() >= 16` and the CPU supports
            /// SSSE3; undefined behaviour may occur otherwise.
            #[target_feature(enable = "ssse3")]
            unsafe fn foreach_special_simd<F>(
                bytes: &[u8],
                mut offset: usize,
                mut callback: F,
            ) -> io::Result<()>
            where
                F: FnMut(usize) -> io::Result<()>,
            {
                // The strategy here is to walk the byte buffer in chunks of VECTOR_SIZE (16)
                // bytes at a time starting at the given offset. For each chunk, we compute
                // a bitmask indicating whether the corresponding byte is a HTML special
                // byte. We then iterate over all the 1 bits in this mask and call the
                // callback function with the corresponding index in the buffer.
                // When the number of HTML special bytes in the buffer is relatively low,
                // this allows us to quickly go through the buffer without a lookup for
                // every single byte.

                debug_assert!(bytes.len() >= VECTOR_SIZE);
                let upperbound = bytes.len() - VECTOR_SIZE;
                while offset < upperbound {
                    let mut mask = compute_mask(bytes, offset);
                    while mask != 0 {
                        let ix = mask.trailing_zeros();
                        callback(offset + ix as usize)?;
                        // Clear the lowest set bit.
                        mask ^= mask & -mask;
                    }
                    offset += VECTOR_SIZE;
                }

                // Final iteration. We align the read with the end of the slice and
                // shift off the bytes at start we have already scanned.
                let mut mask = compute_mask(bytes, upperbound);
                mask >>= offset - upperbound;
                while mask != 0 {
                    let ix = mask.trailing_zeros();
                    callback(offset + ix as usize)?;
                    mask ^= mask & -mask;
                }
                Ok(())
            }
        }
    }

    // ---- src/escape/mod.rs --------------------------------------------------
    use std::fmt::{Arguments, Write as FmtWrite};
    use std::io::{self, ErrorKind, Write};

    /// This wrapper exists because we can't have both a blanket implementation
    /// for all types implementing `Write` and types of the form `&mut W` where
    /// `W: StrWrite`. Since we need the latter a lot, we choose to wrap
    /// `Write` types.
    #[derive(Debug)]
    pub struct WriteWrapper<W>(pub W);

    /// Trait that allows writing string slices. This is basically an extension
    /// of `std::io::Write` in order to include `String`.
    pub trait StrWrite {
        /// Writes the whole string slice to the sink.
        fn write_str(&mut self, s: &str) -> io::Result<()>;

        /// Writes pre-compiled format arguments to the sink.
        fn write_fmt(&mut self, args: Arguments<'_>) -> io::Result<()>;
    }

    impl<W> StrWrite for WriteWrapper<W>
    where
        W: Write,
    {
        #[inline]
        fn write_str(&mut self, s: &str) -> io::Result<()> {
            self.0.write_all(s.as_bytes())
        }

        #[inline]
        fn write_fmt(&mut self, args: Arguments<'_>) -> io::Result<()> {
            self.0.write_fmt(args)
        }
    }

    impl StrWrite for String {
        #[inline]
        fn write_str(&mut self, s: &str) -> io::Result<()> {
            self.push_str(s);
            Ok(())
        }

        #[inline]
        fn write_fmt(&mut self, args: Arguments<'_>) -> io::Result<()> {
            // Surface a formatting failure as an `io::Error` of kind `Other`,
            // keeping the original `fmt::Error` as its source.
            FmtWrite::write_fmt(self, args).map_err(|e| io::Error::new(ErrorKind::Other, e))
        }
    }

    impl<W> StrWrite for &'_ mut W
    where
        W: StrWrite,
    {
        #[inline]
        fn write_str(&mut self, s: &str) -> io::Result<()> {
            (**self).write_str(s)
        }

        #[inline]
        fn write_fmt(&mut self, args: Arguments<'_>) -> io::Result<()> {
            (**self).write_fmt(args)
        }
    }

    pub use self::href::*;
    pub use self::html::*;
}

pub use escape::*;
This is currently only available on nightly but it // consistently improves performance by 10-15%. #[cfg(feature = "nightly")] use core::intrinsics::{likely, unlikely}; // On stable we can use #[cold] to get a equivalent effect: this attributes // suggests that the function is unlikely to be called #[cfg(not(feature = "nightly"))] #[inline] #[cold] fn cold() {} #[cfg(not(feature = "nightly"))] #[inline] fn likely(b: bool) -> bool { if !b { cold(); } b } #[cfg(not(feature = "nightly"))] #[inline] fn unlikely(b: bool) -> bool { if b { cold(); } b } pub use escape::*; pub use unescape::*; escapist-0.0.2/src/unescape/html.rs000064400000000000000000000023701046102023000153440ustar 00000000000000use crate::{likely, unescape, unlikely}; use super::CMARK_CTYPE_CLASS; // pub fn isspace(ch: u8) -> bool { // CMARK_CTYPE_CLASS[ch as usize] == 1 // } // pub fn ispunct(ch: u8) -> bool { // CMARK_CTYPE_CLASS[ch as usize] == 2 // } pub fn isdigit(ch: u8) -> bool { CMARK_CTYPE_CLASS[ch as usize] == 3 } // pub fn isalpha(ch: u8) -> bool { // CMARK_CTYPE_CLASS[ch as usize] == 4 // } // pub fn isalnum(ch: u8) -> bool { // CMARK_CTYPE_CLASS[ch as usize] == 3 || CMARK_CTYPE_CLASS[ch as usize] == 4 // } pub fn unescape_html(src: &[u8]) -> Vec { let size = src.len(); let mut i = 0; let mut v = Vec::with_capacity(size); while i < size { let org = i; while i < size && src[i] != b'&' { i += 1; } if likely(i > org) { if unlikely(org == 0) && i >= size { return src.to_vec(); } v.extend_from_slice(&src[org..i]); } // escaping if i >= size { return v; } i += 1; match unescape(&src[i..]) { Some((chs, size)) => { v.extend_from_slice(&chs); i += size; } None => v.push(b'&'), } } v } escapist-0.0.2/src/unescape/mod.rs000064400000000000000000000065541046102023000151670ustar 00000000000000mod html; use entities::ENTITIES; use std::cmp::min; use std::str::from_utf8_unchecked; pub const ENTITY_MIN_LENGTH: usize = 2; pub const ENTITY_MAX_LENGTH: usize = 32; #[rustfmt::skip] const CMARK_CTYPE_CLASS: [u8; 
256] = [ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ /* 0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, /* 1 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 3 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, /* 4 */ 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* 5 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, /* 6 */ 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* 7 */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 0, /* 8 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 9 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* b */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* c */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* d */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* e */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; pub fn unescape(text: &[u8]) -> Option<(Vec, usize)> { let mut has_semicolon = 0; if text.len() >= 3 && text[0] == b'#' { let mut codepoint: u32 = 0; let mut i = 0; let num_digits = if isdigit(text[1]) { i = 1; while i < text.len() && isdigit(text[i]) { codepoint = (codepoint * 10) + (text[i] as u32 - '0' as u32); codepoint = min(codepoint, 0x11_0000); i += 1; } i - 1 } else if text[1] == b'x' || text[1] == b'X' { i = 2; while i < text.len() && isxdigit(&text[i]) { codepoint = (codepoint * 16) + ((text[i] as u32 | 32) % 39 - 9); codepoint = min(codepoint, 0x11_0000); i += 1; } i - 2 } else { 0 }; if num_digits >= 1 && num_digits <= 8 && i < text.len() { if codepoint == 0 || (codepoint >= 0xD800 && codepoint <= 0xE000) || codepoint >= 0x110000 { codepoint = 0xFFFD; } if text[i] == b';' { has_semicolon = 1; } return Some(( char::from_u32(codepoint) .unwrap_or('\u{FFFD}') .to_string() .into_bytes(), i + has_semicolon, )); } } let size = min(text.len(), ENTITY_MAX_LENGTH); for i in ENTITY_MIN_LENGTH..size { if text[i] == b' ' { return None; 
} if text[i] == b';' { return lookup(&text[..i]).map(|e| (e.to_vec(), i + 1)); } } None } fn lookup(text: &[u8]) -> Option<&[u8]> { let entity_str = format!("&{};", unsafe { from_utf8_unchecked(text) }); let entity = ENTITIES.iter().find(|e| e.entity == entity_str); match entity { Some(e) => Some(e.characters.as_bytes()), None => None, } } fn isxdigit(ch: &u8) -> bool { (*ch >= b'0' && *ch <= b'9') || (*ch >= b'a' && *ch <= b'f') || (*ch >= b'A' && *ch <= b'F') } pub use html::*;