v_escape-base-0.1.0/.cargo_vcs_info.json0000644000000001421046102023000135460ustar { "git": { "sha1": "5c4dc01a5ae8c5cc460e97e0e58299bb90b6cb57" }, "path_in_vcs": "base" }v_escape-base-0.1.0/Cargo.lock0000644000000002351046102023000115240ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 4 [[package]] name = "v_escape-base" version = "0.1.0" v_escape-base-0.1.0/Cargo.toml0000644000000026061046102023000115530ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2024" name = "v_escape-base" version = "0.1.0" authors = ["Juan Aguilar Santillana "] build = false autolib = false autobins = false autoexamples = false autotests = false autobenches = false description = "Base runtime for v_escape: SIMD-accelerated string escaping primitives." 
documentation = "https://docs.rs/v_escape-base" readme = "README.md" keywords = [ "escape", "simd", "html", "json", "latex", ] license = "MIT/Apache-2.0" repository = "https://github.com/zzau13/v_escape" resolver = "2" [package.metadata.docs.rs] features = [ "std", "alloc", "string", "fmt", "bytes", ] [features] alloc = [] bytes = [] default = [ "std", "bytes", "string", "fmt", ] fmt = [] std = ["alloc"] string = [] [lib] name = "v_escape_base" path = "src/lib.rs" [[test]] name = "lib" path = "tests/lib.rs" [[test]] name = "sys_info" path = "tests/sys_info.rs" v_escape-base-0.1.0/Cargo.toml.orig000064400000000000000000000023571046102023000152150ustar 00000000000000[package] name = "v_escape-base" version = "0.1.0" edition = "2024" authors = ["Juan Aguilar Santillana "] description = "Base runtime for v_escape: SIMD-accelerated string escaping primitives." documentation = "https://docs.rs/v_escape-base" keywords = ["escape", "simd", "html", "json", "latex"] license = "MIT/Apache-2.0" readme = "README.md" repository = "https://github.com/zzau13/v_escape" [features] default = ["std", "bytes", "string", "fmt"] # The 'std' feature permits the memchr crate to use the standard library. This # permits this crate to use runtime CPU feature detection to automatically # accelerate searching via vector instructions. Without the standard library, # this automatic detection is not possible. std = ["alloc"] # The 'alloc' feature enables some APIs that require allocation, such as # 'Finder::into_owned'. Note that this feature does not enable runtime CPU # feature detection. That still requires 'std'. alloc = [] # The 'string' feature enables the `escape_string` function. string = [] # The 'fmt' feature enables the `escape_fmt` function. fmt = [] # The 'bytes' feature enables the `escape_bytes` function. 
bytes = [] [package.metadata.docs.rs] features = ["std", "alloc", "string", "fmt", "bytes"] v_escape-base-0.1.0/LICENSE-APACHE000064400000000000000000000251301046102023000142440ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2019 Rust-iendo Barcelona Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. v_escape-base-0.1.0/LICENSE-MIT000064400000000000000000000020501046102023000137500ustar 00000000000000Copyright (c) 2019 Rust-iendo Barcelona Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. v_escape-base-0.1.0/README.md000064400000000000000000000016461046102023000136050ustar 00000000000000# [![Documentation](https://docs.rs/v_escape-base/badge.svg)](https://docs.rs/v_escape-base/) [![Latest version](https://img.shields.io/crates/v/v_escape-base.svg)](https://crates.io/crates/v_escape-base) # v_escape-base Base crate for v_escape. This crate provides the core SIMD-optimized escape functionality used by the v_escape ecosystem. ## Features - `std`: Enables the `std` library features and runtime CPU feature detection - `alloc`: Enables allocation-based APIs - `string`: Enables the `escape_string` function - `fmt`: Enables the `escape_fmt` function - `bytes`: Enables the `escape_bytes` function ## Documentation - Minimum supported Rust version: 1.85.0 or later ## License Licensed under either of - Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) - MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) at your option. v_escape-base-0.1.0/src/arch/aarch64.rs000064400000000000000000000031261046102023000156230ustar 00000000000000use core::arch::aarch64::int8x16_t; use crate::{Escapes, EscapesBuilder, Vector, generic::Generic, writer::Writer}; type NeonVector = int8x16_t; /// A function that performs escape operations using NEON SIMD vectorization. /// /// # Parameters /// - `haystack`: The input string to be escaped. /// - `writer`: The writer function. /// /// # Returns /// A result indicating success or failure of the escape operation. 
#[inline(always)] pub fn escape(haystack: &str, writer: impl Writer) -> Result<(), R> { let len = haystack.len(); if len < NeonVector::BYTES { return as Escapes>::byte_byte_escape(haystack, writer); } Generic::new(E::new::()).escape(haystack, writer) } /// A macro for creating a escape functions /// /// # Parameters /// - `$builder`: The type [`crate::EscapesBuilder`] of the builder #[macro_export] macro_rules! escape_builder { ($builder:ty) => { $crate::struct_string!($crate::builder_string!( escape_string, $crate::arch::aarch64::escape, escape, $builder )); $crate::struct_bytes!($crate::builder_bytes!( escape_bytes, $crate::arch::aarch64::escape, escape, $builder )); $crate::struct_display!( escape_fmt, escape_fmt_internal, $crate::builder_fmt!( escape_fmt_internal, $crate::arch::aarch64::escape, escape, $builder ), $builder ); }; } v_escape-base-0.1.0/src/arch/fallback.rs000064400000000000000000000016361046102023000161360ustar 00000000000000use crate::{Escapes, EscapesBuilder, writer::Writer}; /// A function that performs escape operations using fallback implementation. /// /// # Parameters /// - `haystack`: The input string to be escaped. /// - `writer`: The writer function. /// /// # Returns /// A result indicating success or failure of the escape operation. #[inline(always)] pub fn escape_fallback( haystack: &str, writer: impl Writer, ) -> Result<(), R> { // TODO: implement "1.21 Scanning for zero bytes" from Matters Computational by J. 
Arndt // https://www.researchgate.net/publication/267072412_Matters_Computational_Ideas_Algorithms_Source_Code // Is not possible with range of bytes, not exist operations for this or are very expensive // So we need to use different approach // But this is fallback implementation, so it's not priority E::Escapes::<()>::byte_byte_escape(haystack, writer) } v_escape-base-0.1.0/src/arch/mod.rs000064400000000000000000000026761046102023000151630ustar 00000000000000/// A module for x86_64 escape functions #[cfg(target_arch = "x86_64")] #[macro_use] pub mod x86_64; /// A module for aarch64 escape functions #[cfg(target_arch = "aarch64")] #[macro_use] pub mod aarch64; /// A module for wasm32 escape functions #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] #[macro_use] pub mod wasm32; /// A module for fallback escape functions pub mod fallback; /// A macro for creating a escape functions /// /// # Parameters /// - `$builder`: The type [`crate::EscapesBuilder`] of the builder #[cfg(not(any( target_arch = "x86_64", target_arch = "aarch64", all(target_arch = "wasm32", target_feature = "simd128") )))] #[macro_export] macro_rules! escape_builder { ($builder:ty) => { $crate::struct_string!($crate::builder_string!( escape_string, $crate::arch::fallback::escape_fallback, escape_fallback, $builder )); $crate::struct_bytes!($crate::builder_bytes!( escape_bytes, $crate::arch::fallback::escape_fallback, escape_fallback, $builder )); $crate::struct_display!( escape_fmt, escape_fmt_internal, $crate::builder_fmt!( escape_fmt_internal, $crate::arch::fallback::escape_fallback, escape_fallback, $builder ), $builder ); }; } v_escape-base-0.1.0/src/arch/wasm32.rs000064400000000000000000000032411046102023000155050ustar 00000000000000use core::arch::wasm32::v128; use crate::{Escapes, EscapesBuilder, Vector, generic::Generic, writer::Writer}; type WasmVector = v128; /// A function that performs escape operations using Wasm SIMD vectorization. 
/// /// # Parameters /// - `haystack`: The input string to be escaped. /// - `writer`: The writer function. /// /// # Returns /// A result indicating success or failure of the escape operation. #[inline(always)] pub fn escape(haystack: &str, writer: impl Writer) -> Result<(), R> { let len = haystack.len(); if len < WasmVector::BYTES { return as Escapes>::byte_byte_escape(haystack, writer); } // # Safety // E::new::() is unsafe because it operates simd instructions. Generic::new(E::new::()).escape(haystack, writer) } /// A macro for creating a escape functions /// /// # Parameters /// - `$builder`: The type [`crate::EscapesBuilder`] of the builder #[macro_export] macro_rules! escape_builder { ($builder:ty) => { $crate::struct_string!($crate::builder_string!( escape_string, $crate::arch::wasm32::escape, escape, $builder )); $crate::struct_bytes!($crate::builder_bytes!( escape_bytes, $crate::arch::wasm32::escape, escape, $builder )); $crate::struct_display!( escape_fmt, escape_fmt_internal, $crate::builder_fmt!( escape_fmt_internal, $crate::arch::wasm32::escape, escape, $builder ), $builder ); }; } v_escape-base-0.1.0/src/arch/x86_64/avx.rs000064400000000000000000000030551046102023000161300ustar 00000000000000use core::arch::x86_64::{__m128i, __m256i}; use crate::{Escapes, EscapesBuilder, Vector, generic::Generic, writer::Writer}; // Adapted from https://github.com/BurntSushi/memchr/blob/master/src/arch/x86_64/avx2/memchr.rs /// Returns true if AVX2 is available in the current environment. pub fn is_available() -> bool { #[cfg(not(target_feature = "sse2"))] { false } #[cfg(target_feature = "sse2")] { #[cfg(target_feature = "avx2")] { true } #[cfg(not(target_feature = "avx2"))] { #[cfg(feature = "std")] { std::is_x86_feature_detected!("avx2") } #[cfg(not(feature = "std"))] { false } } } } type AvxVector = __m256i; type SseVector = __m128i; /// A function that performs escape operations using AVX and SSE vectorization. 
/// /// # Parameters /// - `haystack`: The input string to be escaped. /// - `writer`: The writer function. /// /// # Returns /// A result indicating success or failure of the escape operation. #[inline(always)] pub fn escape(haystack: &str, mut writer: impl Writer) -> Result<(), R> { let len = haystack.len(); if len < AvxVector::BYTES { if len < SseVector::BYTES { return as Escapes>::byte_byte_escape(haystack, &mut writer); } return Generic::new(E::new::()).escape(haystack, writer); } Generic::new(E::new::()).escape(haystack, writer) } v_escape-base-0.1.0/src/arch/x86_64/mod.rs000064400000000000000000000075101046102023000161110ustar 00000000000000/// A module for AVX escape functions pub mod avx; /// A module for SSE escape functions pub mod sse; /// A macro for creating a escape functions /// /// # Parameters /// - `$name`: The name of the function. /// - `$writer_builder`: The function to use for the builder. /// - `$builder`: The type of the builder. /// - `$buffer`: The type of the buffer. #[doc(hidden)] #[macro_export] macro_rules! ifun { ( $name:ident, $writer_builder:path, $builder:ty, $buffer:ty $(,$retty:ty)? ) => { pub fn $name(haystack: &str, buffer: &mut $buffer) $(-> $retty)? { use core::sync::atomic::{AtomicPtr, Ordering}; type Fn = *mut (); type RealFn = fn(haystack: &str, buffer: &mut $buffer) $(-> $retty)?; static FN: AtomicPtr<()> = AtomicPtr::new(detect as Fn); #[cfg(target_feature = "sse2")] #[target_feature(enable = "sse2", enable = "avx2")] $writer_builder!(escape_avx2, $crate::arch::x86_64::avx::escape, escape, $builder); #[cfg(target_feature = "sse2")] #[target_feature(enable = "sse2")] $writer_builder!(escape_sse2, $crate::arch::x86_64::sse::escape, escape, $builder); $writer_builder!(escape_fallback, $crate::arch::fallback::escape_fallback, escape_fallback, $builder); unsafe fn detect(haystack: &str, buffer: &mut $buffer) $(-> $retty)? 
{ let fun = { #[cfg(not(target_feature = "sse2"))] { escape_fallback } #[cfg(target_feature = "sse2")] { if $crate::arch::x86_64::avx::is_available() { escape_avx2 } else if $crate::arch::x86_64::sse::is_available() { escape_sse2 } else { escape_fallback } } }; FN.store(fun as Fn, Ordering::Relaxed); // SAFETY: The only thing we need to uphold here is the // `#[target_feature]` requirements. Since we check is_available // above before using the corresponding implementation, we are // guaranteed to only call code that is supported on the current // CPU. fun(haystack, buffer) } // SAFETY: By virtue of the caller contract, RealFn is a function // pointer, which is always safe to transmute with a *mut (). Also, // since we use $memchrty::is_available, it is guaranteed to be safe // to call $memchrty::$memchrfind. unsafe { let fun = FN.load(Ordering::Relaxed); core::mem::transmute::(fun)( haystack, buffer ) } } }; } /// A macro for creating a escape functions /// /// # Parameters /// - `$builder`: The type [`crate::EscapesBuilder`] of the builder #[macro_export] macro_rules! escape_builder { ($builder:ty) => { $crate::struct_display!( escape_fmt, escape_fmt_internal, $crate::ifun!( escape_fmt_internal, $crate::builder_fmt, $builder, core::fmt::Formatter<'_>, core::fmt::Result ), $builder ); $crate::struct_string!($crate::ifun!( escape_string, $crate::builder_string, $builder, String )); $crate::struct_bytes!($crate::ifun!( escape_bytes, $crate::builder_bytes, $builder, Vec )); }; } v_escape-base-0.1.0/src/arch/x86_64/sse.rs000064400000000000000000000017061046102023000161250ustar 00000000000000use core::arch::x86_64::__m128i; use crate::{Escapes, EscapesBuilder, Vector, generic::Generic, writer::Writer}; /// Returns true if SSE2 is available in the current environment. 
pub fn is_available() -> bool { #[cfg(target_feature = "sse2")] { true } #[cfg(not(target_feature = "sse2"))] { false } } type SseVector = __m128i; /// A function that performs escape operations using SSE vectorization. /// /// # Parameters /// - `haystack`: The input string to be escaped. /// - `writer`: The writer function. /// /// # Returns /// A result indicating success or failure of the escape operation. #[inline(always)] pub fn escape(haystack: &str, writer: impl Writer) -> Result<(), R> { let len = haystack.len(); if len < SseVector::BYTES { return as Escapes>::byte_byte_escape(haystack, writer); } Generic::new(E::new::()).escape(haystack, writer) } v_escape-base-0.1.0/src/escapes.rs000064400000000000000000000104641046102023000151040ustar 00000000000000use core::{fmt, str}; use crate::{ Vector, writer::{Writer, write, write_slice}, }; /// A builder trait for creating instances of types that implement the `Escapes` trait. /// /// # Type Parameters /// - `V`: The vector type implementing the `Vector` trait. pub trait EscapesBuilder { /// The `Escapes` type for a given vector type. /// /// # Type Parameters /// - `V`: The vector type implementing the `Vector` trait. type Escapes: Escapes; /// Creates a new instance of the `Escapes` type. /// /// # Returns /// An instance of a type that implements the `Escapes` trait. fn new() -> Self::Escapes; } /// A trait that abstracts masking functions for escape sequences. /// /// # Type Parameters /// - `V`: The vector type implementing the `Vector` trait. pub trait Escapes: Copy + fmt::Debug { /// The length of the escape sequence. const ESCAPE_LEN: usize; /// Indicates whether the escape sequence may produce false positives. const FALSE_POSITIVE: bool; /// The vector type used for masking operations. type Vector: Vector; /// Applies a mask to the given vector `v` to identify escape sequences. /// /// # Parameters /// - `v`: The vector to apply the mask to. /// /// # Returns /// A vector with the mask applied. 
fn masking(&self, v: Self::Vector) -> Self::Vector; /// Returns the escape sequence for a given position in the escaped array. /// /// # Parameters /// - `c`: The position of the character. /// /// # Returns /// A static string slice representing the escape sequence. fn escape(c: usize) -> &'static str; /// Returns the position of a character in the escaped array. /// /// # Parameters /// - `c`: The character to find the position for. /// /// # Returns /// The position of the character. fn position(c: u8) -> usize; /// Escapes a string by applying escape sequences and writing the result using a writer. /// /// # Parameters /// - `haystack`: The input string to be processed for escape sequences. /// - `writer`: A mutable writer function to handle the escaped output. /// /// # Returns /// A `Result` indicating the success or failure of the escape operation. #[inline(always)] fn byte_byte_escape(haystack: &str, mut writer: impl Writer) -> Result<(), R> { let len = haystack.len(); let start = haystack.as_ptr(); unsafe { Self::byte_byte_escape_raw(start, start.add(len), &mut writer) } } /// Escapes a range of bytes by applying escape sequences and writing the result using a writer. /// /// # Parameters /// - `haystack`: A pointer to the start of the byte range to be escaped. /// - `end`: A pointer to the end of the byte range to be escaped. /// - `writer`: A mutable writer function to handle the escaped output. /// /// # Returns /// A `Result` indicating the success or failure of the escape operation. /// /// # Safety /// This function is unsafe because it operates on raw pointers and assumes /// that the memory between `haystack` and `end` is valid and properly aligned. 
#[inline(always)] unsafe fn byte_byte_escape_raw( start: *const u8, end: *const u8, writer: &mut impl Writer, ) -> Result<(), R> { unsafe { let mut written = start; let mut cur = start; while cur < end { let c = *cur; // TODO: improve performance if Self::byte_byte_compare(c) { if written < cur { write_slice(written, cur, writer)?; } let escaped = Self::escape(Self::position(c)); write(escaped, writer)?; written = cur.add(1); } cur = cur.add(1); } if written < end { write_slice(written, end, writer)?; } Ok(()) } } /// Compares a byte to determine if it should be escaped. /// /// # Parameters /// - `c`: The byte to compare. /// /// # Returns /// `true` if the byte should be escaped, `false` otherwise. fn byte_byte_compare(c: u8) -> bool; } v_escape-base-0.1.0/src/ext.rs000064400000000000000000000022241046102023000142540ustar 00000000000000#![allow(dead_code)] // Adapted from https://github.com/BurntSushi/memchr/blob/master/src/ext.rs /// A trait for adding some helper routines to pointers. pub(crate) trait Pointer { /// Returns the distance, in units of `T`, between `self` and `origin`. /// /// # Safety /// /// Same as `ptr::offset_from` in addition to `self >= origin`. unsafe fn distance(self, origin: Self) -> usize; /// Casts this pointer to `usize`. /// /// Callers should not convert the `usize` back to a pointer if at all /// possible. (And if you believe it's necessary, open an issue to discuss /// why. Otherwise, it has the potential to violate pointer provenance.) /// The purpose of this function is just to be able to do arithmetic, i.e., /// computing offsets or alignments. fn to_usize(self) -> usize; } impl Pointer for *const T { unsafe fn distance(self, origin: *const T) -> usize { unsafe { // TODO: Replace with `ptr::sub_ptr` once stabilized. 
usize::try_from(self.offset_from(origin)).unwrap_unchecked() } } fn to_usize(self) -> usize { self as usize } } v_escape-base-0.1.0/src/generic.rs000064400000000000000000000253631046102023000151010ustar 00000000000000#![allow(dead_code)] // Adapted from https://github.com/BurntSushi/memchr/blob/master/src/arch/generic/memchr.rs use crate::{ Escapes, Vector, ext::Pointer, vector::MoveMask, writer::{Writer, write, write_slice}, }; /// A generic structure for handling escape sequences in a vectorized manner. /// /// # Type Parameters /// - `E`: The escape type implementing the `Escapes` trait. #[derive(Clone, Copy, Debug)] pub(crate) struct Generic { escapes: E, } impl Generic where E: Escapes, { /// The number of bytes processed per iteration in the search loop. const LOOP_SIZE: usize = 4 * E::Vector::BYTES; /// Creates a new `Generic` instance with the given escape handler. /// /// # Parameters /// - `escapes`: The escape handler to be used. #[inline(always)] pub(crate) fn new(escapes: E) -> Generic { Generic { escapes } } /// Escapes the input string by applying the escape sequences defined in the `Escapes` trait. /// /// # Parameters /// - `haystack`: The input string to be processed for escape sequences. /// - `writer`: The function to write the escaped output. /// /// # Returns /// A `Result` indicating the success or failure of the escape operation. #[inline(always)] pub(crate) fn escape( &mut self, haystack: &str, mut writer: impl Writer, ) -> Result<(), R> { let len = haystack.len(); let cur = haystack.as_ptr(); unsafe { self.escape_raw(cur, cur.add(len), &mut writer) } } /// Escapes the input data between the `start` and `end` pointers. /// /// # Parameters /// - `start`: The starting pointer of the data to be escaped. /// - `end`: The ending pointer of the data to be escaped. /// - `writer`: The function to write the escaped output. /// /// # Returns /// A `Result` indicating the success or failure of the escape operation. 
///
/// # Safety
/// The caller must guarantee all of the following:
/// - `start..end` is a single readable byte range with `end >= start`
///   (`Pointer::distance` is used on the pair).
/// - The range is at least `E::Vector::BYTES` bytes long. This is only checked
///   by a `debug_assert!`; the first and last loads each read a full vector
///   from inside the range regardless of build profile.
/// - No alignment is required of `start`: the head and tail use unaligned
///   loads and only the middle part uses aligned loads.
/// - The bytes are handed to `write_slice`, so presumably they must be valid
///   UTF-8 and escapes must fall on character boundaries (confirm against
///   `write_slice`'s contract).
#[inline(always)]
pub(crate) unsafe fn escape_raw(
    &mut self,
    start: *const u8,
    end: *const u8,
    writer: &mut impl Writer,
) -> Result<(), R> {
    unsafe {
        let len = end.distance(start);
        // `written` trails the scan: everything in `start..written` has
        // already been emitted, either verbatim or as an escape.
        let mut written = start;

        debug_assert!(
            len >= E::Vector::BYTES,
            "haystack has length {}, but must be at least {}",
            len,
            E::Vector::BYTES
        );

        // Number of bytes from `start` up to the next vector-aligned address.
        // With `ALIGN == BYTES - 1` (the SIMD implementations in vector.rs)
        // this is in `1..=BYTES`, so an already aligned `start` yields a full
        // vector. The no-op `()` vector has `BYTES == 0`, which yields 0.
        let align = E::Vector::BYTES - (start.to_usize() & E::Vector::ALIGN);
        if align > 0 {
            // Unaligned head: load a full vector but only act on the first
            // `align` lanes; the remaining lanes are re-read by the aligned
            // code below.
            let x = E::Vector::load_unaligned(start);
            let mask = self.escapes.masking(x).movemask();
            self.write_mask_unaligned(mask, start, align, &mut written, writer)?;
        }

        // Set `cur` to the first V-aligned pointer greater than `start`.
        let mut cur = start.add(align);
        debug_assert!(cur > start && end.sub(E::Vector::BYTES) >= start);
        // The length guard keeps `end.sub(LOOP_SIZE)` inside the range.
        if len >= Self::LOOP_SIZE {
            while cur <= end.sub(Self::LOOP_SIZE) {
                debug_assert_eq!(0, cur.to_usize() % E::Vector::BYTES);

                // Four aligned vectors per iteration (LOOP_SIZE = 4 * BYTES).
                let a = E::Vector::load_aligned(cur);
                let b = E::Vector::load_aligned(cur.add(E::Vector::BYTES));
                let c = E::Vector::load_aligned(cur.add(2 * E::Vector::BYTES));
                let d = E::Vector::load_aligned(cur.add(3 * E::Vector::BYTES));
                let eqa = self.escapes.masking(a);
                let eqb = self.escapes.masking(b);
                let eqc = self.escapes.masking(c);
                let eqd = self.escapes.masking(d);
                // OR the four results together so the common "nothing to
                // escape" case costs a single test.
                let or1 = eqa.or(eqb);
                let or2 = eqc.or(eqd);
                let or3 = or1.or(or2);
                if or3.movemask_will_have_non_zero() {
                    // At least one lane matched: process the four vectors in
                    // address order so output stays in input order.
                    self.write_mask(eqa.movemask(), cur, &mut written, writer)?;
                    self.write_mask(
                        eqb.movemask(),
                        cur.add(E::Vector::BYTES),
                        &mut written,
                        writer,
                    )?;
                    self.write_mask(
                        eqc.movemask(),
                        cur.add(E::Vector::BYTES * 2),
                        &mut written,
                        writer,
                    )?;
                    self.write_mask(
                        eqd.movemask(),
                        cur.add(E::Vector::BYTES * 3),
                        &mut written,
                        writer,
                    )?;
                }
                cur = cur.add(Self::LOOP_SIZE);
            }
        }
        // Handle any leftovers after the aligned loop above.
        while cur <= end.sub(E::Vector::BYTES) {
            debug_assert!(end.distance(cur) >= E::Vector::BYTES);

            let v = E::Vector::load_aligned(cur);
            let mask = self.escapes.masking(v).movemask();
            self.write_mask(mask, cur, &mut written, writer)?;
            cur = cur.add(E::Vector::BYTES);
        }
        // Handle any remaining bytes that are less than a full vector's worth.
        if cur < end {
            debug_assert!(end.distance(cur) < E::Vector::BYTES);
            // Re-read the last full vector of the range (it overlaps bytes
            // that were already processed) and shift the first `rest` lanes
            // out of the mask so that lane 0 of the mask corresponds to `cur`.
            let rest = (E::Vector::BYTES - end.distance(cur)) as u32;
            // Shadows the parameter: start of the overlapping final load.
            let start = cur.sub(E::Vector::BYTES - end.distance(cur));
            debug_assert_eq!(end.distance(start), E::Vector::BYTES);
            let x = E::Vector::load_unaligned(start);
            let mask = self.escapes.masking(x).movemask().shr(rest);
            self.write_mask(mask, cur, &mut written, writer)?;
        }

        // Flush the unescaped text that follows the last escape (or the whole
        // input when nothing matched).
        if written < end {
            write_slice(written, end, writer)?;
        }

        Ok(())
    }
}

/// Handles one set lane of `mask`: emits the pending unescaped text and the
/// escape for the byte at `cur + offset`, then drops that lane from the mask.
///
/// # Parameters
/// - `mask`: The mask indicating which bytes need to be escaped.
/// - `cur`: The pointer the mask's lane 0 corresponds to.
/// - `offset`: The lane (byte offset from `cur`) being handled.
/// - `written`: A mutable reference to the pointer indicating the last written position.
/// - `writer`: The function to write the escaped output.
///
/// # Returns
/// A `Result` containing the updated mask after clearing the least significant bit.
///
/// # Safety
/// This function is unsafe because it operates on raw pointers and assumes
/// that the memory is valid.
#[inline(always)]
unsafe fn write_step(
    mask: <::Vector as Vector>::Mask,
    cur: *const u8,
    offset: usize,
    written: &mut *const u8,
    writer: &mut impl Writer,
) -> Result<<::Vector as Vector>::Mask, R> {
    unsafe {
        // Address of the byte flagged by the lowest set lane of the mask.
        let hit = cur.add(offset);
        let index = E::position(*hit);
        // With a matcher that may report false positives, an index that is
        // not below `E::ESCAPE_LEN` is treated as a spurious hit: nothing is
        // emitted and `written` stays where it is.
        let spurious = E::FALSE_POSITIVE && index >= E::ESCAPE_LEN;
        if !spurious {
            // Emit the untouched text between the previous escape and this
            // byte, then the replacement, and step past the escaped byte.
            if *written < hit {
                write_slice(*written, hit, writer)?;
            }
            write(E::escape(index), writer)?;
            *written = hit.add(1);
        }
        // The lane is consumed whether or not it produced output.
        Ok(mask.clear_least_significant_bit())
    }
}

/// Walks the set lanes of `mask` from lowest to highest and runs
/// `write_step` on each one whose offset is below `limit`.
///
/// # Parameters
/// - `mask`: The mask indicating which bytes need to be escaped.
/// - `cur`: The pointer the mask's lane 0 corresponds to.
/// - `limit`: Exclusive upper bound on the lane offsets to process; lanes at
///   or beyond it are left alone.
/// - `written`: A mutable reference to the pointer indicating the last written position.
/// - `writer`: The function to write the escaped output.
///
/// # Returns
/// A `Result` indicating the success or failure of the write operation.
///
/// # Safety
/// This function is unsafe because it operates on raw pointers and assumes
/// that the memory is valid.
#[inline(always)]
unsafe fn write_mask_helper(
    &mut self,
    mut mask: <::Vector as Vector>::Mask,
    cur: *const u8,
    limit: usize,
    written: &mut *const u8,
    writer: &mut impl Writer,
) -> Result<(), R> {
    unsafe {
        // `first_offset` is only queried while at least one lane is set.
        while mask.has_non_zero() {
            let offset = mask.first_offset();
            if offset >= limit {
                break;
            }
            mask = Self::write_step(mask, cur, offset, written, writer)?;
        }
        Ok(())
    }
}

/// Writes the escape mask for unaligned data.
///
/// # Parameters
/// - `mask`: The mask indicating which bytes need to be escaped.
/// - `cur`: The current pointer in the data.
/// - `align`: The alignment offset.
/// - `written`: A mutable reference to the pointer indicating the last written position.
/// - `writer`: The function to write the escaped output.
/// /// # Returns /// A `Result` indicating the success or failure of the write operation. /// /// # Safety /// This function is unsafe because it operates on raw pointers and assumes /// that the memory is valid. #[inline(always)] unsafe fn write_mask_unaligned( &mut self, mask: <::Vector as Vector>::Mask, cur: *const u8, align: usize, written: &mut *const u8, writer: &mut impl Writer, ) -> Result<(), R> { unsafe { self.write_mask_helper(mask, cur, align, written, writer) } } /// Writes the escape mask for aligned data. /// /// # Parameters /// - `mask`: The mask indicating which bytes need to be escaped. /// - `cur`: The current pointer in the data. /// - `written`: A mutable reference to the pointer indicating the last written position. /// - `writer`: The function to write the escaped output. /// /// # Returns /// A `Result` indicating the success or failure of the write operation. /// /// # Safety /// This function is unsafe because it operates on raw pointers and assumes /// that the memory is valid. #[inline(always)] unsafe fn write_mask( &mut self, mask: <::Vector as Vector>::Mask, cur: *const u8, written: &mut *const u8, writer: &mut impl Writer, ) -> Result<(), R> { unsafe { self.write_mask_helper(mask, cur, usize::MAX, written, writer) } } } v_escape-base-0.1.0/src/lib.rs000064400000000000000000000042001046102023000142160ustar 00000000000000//! A crate for escaping strings //! //! # Features //! //! - `std`: Enable standard library features //! - `alloc`: Enable alloc crate features //! - `string`: Enable `escape_string` function //! - `fmt`: Enable `escape_fmt` function //! //! # Examples //! //! ```rust //! use v_escape_base::{escape_builder, Escapes, EscapesBuilder, Vector}; //! //! #[derive(Debug, Clone, Copy)] //! struct Equal { //! a: V, //! } //! //! struct Builder; //! impl EscapesBuilder for Builder { //! type Escapes = Equal; //! //! fn new() -> Self::Escapes { //! Equal { a: V::splat(b'a') } //! } //! } //! //! impl Escapes for Equal { //! 
const ESCAPE_LEN: usize = 1; //! //! const FALSE_POSITIVE: bool = false; //! //! type Vector = V; //! //! #[inline(always)] //! fn masking(&self, vector2: V) -> V { //! self.a.cmpeq(vector2) //! } //! //! #[inline(always)] //! fn escape(_: usize) -> &'static str { //! "foo" //! } //! //! #[inline(always)] //! fn position(_: u8) -> usize { //! 0 //! } //! //! #[inline(always)] //! fn byte_byte_compare(c: u8) -> bool { //! c == b'a' //! } //! } //! //! escape_builder!(Builder); //! //! let mut buffer = String::new(); //! let haystack = "a".repeat(64); //! # #[cfg(feature = "string")] //! escape_string(&haystack, &mut buffer); //! # #[cfg(feature = "string")] //! assert_eq!(buffer, "foo".repeat(64)); //! //! let haystack = "a".repeat(64); //! # #[cfg(feature = "fmt")] //! assert_eq!(escape_fmt(&haystack).to_string(), "foo".repeat(64)); //! ``` #![deny(missing_docs)] #![no_std] /// A module for standard library #[cfg(any(test, feature = "std"))] extern crate std; /// A module for alloc crate #[cfg(any(test, feature = "alloc"))] extern crate alloc; /// A module for architecture-specific escape functions #[macro_use] pub mod arch; /// A module for escapes mod escapes; /// A module for extensions mod ext; /// A module for generic escape functions mod generic; mod vector; #[macro_use] mod writer; pub use escapes::{Escapes, EscapesBuilder}; pub use vector::Vector; v_escape-base-0.1.0/src/vector.rs000064400000000000000000000463041046102023000147650ustar 00000000000000// Adapted from https://github.com/BurntSushi/memchr/blob/master/src/vector.rs /// A trait for describing vector operations used by vectorized searchers. /// /// The trait is highly constrained to low level vector operations needed. /// In general, it was invented mostly to be generic over x86's __m128i and /// __m256i types. At time of writing, it also supports wasm and aarch64 /// 128-bit vector types as well. 
///
/// # Safety
///
/// All methods are not safe since they are intended to be implemented using
/// vendor intrinsics, which are also not safe. Callers must ensure that the
/// appropriate target features are enabled in the calling function, and that
/// the current CPU supports them. All implementations should avoid marking the
/// routines with `#[target_feature]` and instead mark them as
/// `#[inline(always)]` to ensure they get appropriately inlined.
/// (`inline(always)` cannot be used with `target_feature`.)
pub trait Vector: Copy + core::fmt::Debug {
    /// The number of bytes in the vector. That is, this is the size of the
    /// vector in memory.
    const BYTES: usize;
    /// The bits that must be zero in order for a `*const u8` pointer to be
    /// correctly aligned to read vector values.
    ///
    /// The SIMD implementations in this file set it to `BYTES - 1`.
    const ALIGN: usize;

    /// The type of the value returned by `Vector::movemask`.
    ///
    /// This supports abstracting over the specific representation used in
    /// order to accommodate different representations in different ISAs.
    type Mask: MoveMask;

    /// Create a vector with 8-bit lanes with the given byte repeated into each
    /// lane.
    fn splat(byte: u8) -> Self;

    /// Read a vector-size number of bytes from the given pointer. The pointer
    /// must be aligned to the size of the vector.
    ///
    /// # Safety
    ///
    /// Callers must guarantee that at least `BYTES` bytes are readable from
    /// `data` and that `data` is aligned to a `BYTES` boundary.
    unsafe fn load_aligned(data: *const u8) -> Self;

    /// Read a vector-size number of bytes from the given pointer. The pointer
    /// does not need to be aligned.
    ///
    /// # Safety
    ///
    /// Callers must guarantee that at least `BYTES` bytes are readable from
    /// `data`.
    unsafe fn load_unaligned(data: *const u8) -> Self;

    /// Convert the vector to a mask.
    fn movemask(self) -> Self::Mask;

    /// Compare two vectors for equality, lane by lane.
    fn cmpeq(self, vector2: Self) -> Self;

    /// Bitwise OR of two vectors.
    fn or(self, vector2: Self) -> Self;

    /// Add two vectors, lane by lane.
    fn add(self, vector2: Self) -> Self;

    /// Compare two vectors for greater than, lane by lane.
    ///
    /// The implementations in this file use the signed 8-bit comparison
    /// intrinsics.
    fn gt(self, vector2: Self) -> Self;

    /// Returns true if and only if `Self::movemask` would return a mask that
    /// contains at least one non-zero bit.
    ///
    /// The default goes through `movemask`; implementations may override it
    /// with something cheaper.
    #[inline(always)]
    fn movemask_will_have_non_zero(self) -> bool {
        self.movemask().has_non_zero()
    }
}

/// A trait that abstracts over a vector-to-scalar operation called
/// "move mask."
///
/// On x86-64, this is `_mm_movemask_epi8` for SSE2 and `_mm256_movemask_epi8`
/// for AVX2. It takes a vector of `u8` lanes and returns a scalar where the
/// `i`th bit is set if and only if the most significant bit in the `i`th lane
/// of the vector is set. The simd128 ISA for wasm32 also supports this
/// exact same operation natively.
///
/// ... But aarch64 doesn't. So we have to fake it with more instructions and
/// a slightly different representation. We could do extra work to unify the
/// representations, but that would add cost in the hot path. So instead, we
/// abstract over the specific representation with this trait and define only
/// the operations we actually need.
pub trait MoveMask: Copy + core::fmt::Debug {
    /// Returns true if and only if this mask has a non-zero bit anywhere.
    fn has_non_zero(self) -> bool;

    /// Returns the mask shifted right by `rhs` lanes, so that lane `rhs` of
    /// `self` becomes lane 0 of the result.
    fn shr(self, rhs: u32) -> Self;

    /// Returns a mask that is equivalent to `self` but with the least
    /// significant 1-bit set to 0.
    fn clear_least_significant_bit(self) -> Self;

    /// Returns the offset of the first non-zero lane this mask represents.
    fn first_offset(self) -> usize;
}

/// This is a "sensible" movemask implementation where each bit represents
/// whether the most significant bit is set in each corresponding lane of a
/// vector. This is used on x86-64 and wasm, but such a mask is more expensive
/// to get on aarch64 so we use something a little different.
/// /// We call this "sensible" because this is what we get using native sse/avx /// movemask instructions. But neon has no such native equivalent. #[cfg(any( target_arch = "x86_64", all(target_arch = "wasm32", target_feature = "simd128") ))] #[derive(Clone, Copy)] pub struct SensibleMoveMask(u32); #[cfg(any( target_arch = "x86_64", all(target_arch = "wasm32", target_feature = "simd128") ))] impl core::fmt::Debug for SensibleMoveMask { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "{:b}", self.0) } } #[cfg(any( target_arch = "x86_64", all(target_arch = "wasm32", target_feature = "simd128") ))] impl SensibleMoveMask { /// Get the mask in a form suitable for computing offsets. /// /// Basically, this normalizes to little endian. On big endian, this swaps /// the bytes. // TODO: Endianness does NOT affect the result of bitwise operations // (like <<, >>, &) or methods like `.trailing_zeros()` on the integer // returned by a SIMD movemask. The bit order in the movemask result is // defined by the SIMD instruction set (e.g., bit 0 corresponds to lane // 0, bit 1 to lane 1, etc.), regardless of how bytes are stored in // memory. So shifting the mask or counting trailing zeros is safe and // portable. #[inline(always)] fn get_for_offset(self) -> u32 { #[cfg(target_endian = "big")] { self.0.swap_bytes() } #[cfg(target_endian = "little")] { self.0 } } } #[cfg(any( target_arch = "x86_64", all(target_arch = "wasm32", target_feature = "simd128") ))] impl MoveMask for SensibleMoveMask { #[inline(always)] fn has_non_zero(self) -> bool { self.0 != 0 } #[inline(always)] fn clear_least_significant_bit(self) -> SensibleMoveMask { SensibleMoveMask(self.0 & (self.0 - 1)) } #[inline(always)] fn first_offset(self) -> usize { // We are dealing with little endian here (and if we aren't, we swap // the bytes so we are in practice), where the most significant byte // is at a higher address. 
That means the least significant bit that // is set corresponds to the position of our first matching byte. // That position corresponds to the number of zeros after the least // significant bit. self.get_for_offset().trailing_zeros() as usize } fn shr(self, rhs: u32) -> Self { // Endianness is not relevant here because the mask always uses // first_offset to compute the offset. SensibleMoveMask(self.0.wrapping_shr(rhs)) } } /// Noop implementation for types that don't support vectorization. impl Vector for () { const BYTES: usize = 0; const ALIGN: usize = 0; type Mask = (); #[inline(always)] fn splat(_byte: u8) -> Self { unreachable!() } #[inline(always)] unsafe fn load_aligned(_data: *const u8) -> Self { unreachable!() } #[inline(always)] unsafe fn load_unaligned(_data: *const u8) -> Self { unreachable!() } #[inline(always)] fn movemask(self) -> Self::Mask { unreachable!() } #[inline(always)] fn cmpeq(self, _vector2: Self) -> Self { unreachable!() } #[inline(always)] fn or(self, _vector2: Self) -> Self { unreachable!() } #[inline(always)] fn add(self, _vector2: Self) -> Self { unreachable!() } #[inline(always)] fn gt(self, _vector2: Self) -> Self { unreachable!() } } /// Noop implementation for types that don't support vectorization. 
impl MoveMask for () {
    // Counterpart of the `()` vector: none of these may ever be reached.
    #[inline(always)]
    fn has_non_zero(self) -> bool {
        unreachable!()
    }

    #[inline(always)]
    fn shr(self, _rhs: u32) -> Self {
        unreachable!()
    }

    #[inline(always)]
    fn clear_least_significant_bit(self) -> Self {
        unreachable!()
    }

    #[inline(always)]
    fn first_offset(self) -> usize {
        unreachable!()
    }
}

#[cfg(target_arch = "x86_64")]
mod x86sse2 {
    use core::arch::x86_64::*;

    use super::{SensibleMoveMask, Vector};

    // 128-bit vectors backed by the SSE2 intrinsics.
    impl Vector for __m128i {
        const BYTES: usize = 16;
        const ALIGN: usize = Self::BYTES - 1;
        type Mask = SensibleMoveMask;

        #[inline(always)]
        fn splat(byte: u8) -> Self {
            unsafe { _mm_set1_epi8(byte as i8) }
        }

        #[inline(always)]
        unsafe fn load_aligned(data: *const u8) -> Self {
            unsafe { _mm_load_si128(data as *const __m128i) }
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> Self {
            unsafe { _mm_loadu_si128(data as *const __m128i) }
        }

        #[inline(always)]
        fn movemask(self) -> Self::Mask {
            SensibleMoveMask(unsafe { _mm_movemask_epi8(self) } as u32)
        }

        #[inline(always)]
        fn cmpeq(self, vector2: Self) -> Self {
            unsafe { _mm_cmpeq_epi8(self, vector2) }
        }

        #[inline(always)]
        fn or(self, vector2: Self) -> Self {
            unsafe { _mm_or_si128(self, vector2) }
        }

        #[inline(always)]
        fn add(self, vector2: Self) -> Self {
            unsafe { _mm_add_epi8(self, vector2) }
        }

        // Signed per-lane comparison.
        #[inline(always)]
        fn gt(self, vector2: Self) -> Self {
            unsafe { _mm_cmpgt_epi8(self, vector2) }
        }
    }
}

#[cfg(target_arch = "x86_64")]
mod x86avx2 {
    use core::arch::x86_64::*;

    use super::{SensibleMoveMask, Vector};

    // 256-bit vectors backed by the AVX2 intrinsics. As the `Vector` trait
    // documents, the caller is responsible for having AVX2 enabled; every
    // method is `#[inline(always)]` so that the intrinsic ends up inside that
    // feature-enabled caller instead of behind a call to a function compiled
    // without the feature.
    impl Vector for __m256i {
        const BYTES: usize = 32;
        const ALIGN: usize = Self::BYTES - 1;
        type Mask = SensibleMoveMask;

        #[inline(always)]
        fn splat(byte: u8) -> Self {
            unsafe { _mm256_set1_epi8(byte as i8) }
        }

        #[inline(always)]
        unsafe fn load_aligned(data: *const u8) -> Self {
            unsafe { _mm256_load_si256(data as *const __m256i) }
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> Self {
            unsafe { _mm256_loadu_si256(data as *const __m256i) }
        }

        #[inline(always)]
        fn movemask(self) -> Self::Mask {
            SensibleMoveMask(unsafe { _mm256_movemask_epi8(self) } as u32)
        }

        #[inline(always)]
        fn cmpeq(self, vector2: Self) -> Self {
            unsafe { _mm256_cmpeq_epi8(self, vector2) }
        }

        #[inline(always)]
        fn or(self, vector2: Self) -> Self {
            unsafe { _mm256_or_si256(self, vector2) }
        }

        // Previously missing `#[inline(always)]`, unlike the SSE2 twin.
        #[inline(always)]
        fn add(self, vector2: Self) -> Self {
            unsafe { _mm256_add_epi8(self, vector2) }
        }

        // Signed per-lane comparison. Previously missing `#[inline(always)]`.
        #[inline(always)]
        fn gt(self, vector2: Self) -> Self {
            unsafe { _mm256_cmpgt_epi8(self, vector2) }
        }
    }
}

#[cfg(target_arch = "aarch64")]
mod aarch64neon {
    use core::arch::aarch64::*;

    use super::{MoveMask, Vector};

    impl Vector for int8x16_t {
        const BYTES: usize = 16;
        const ALIGN: usize = Self::BYTES - 1;
        type Mask = NeonMoveMask;

        #[inline(always)]
        fn splat(byte: u8) -> Self {
            unsafe { vdupq_n_s8(byte as i8) }
        }

        #[inline(always)]
        unsafe fn load_aligned(data: *const u8) -> Self {
            // I've tried `data.cast::().read()` instead, but
            // couldn't observe any benchmark differences.
            unsafe { Self::load_unaligned(data) }
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> Self {
            unsafe { vld1q_s8(data as *const i8) }
        }

        #[inline(always)]
        fn movemask(self) -> NeonMoveMask {
            // "Shift right narrow": squeeze the 16 byte lanes into a 64-bit
            // scalar with 4 bits per lane, then keep one bit of each nibble.
            let asu16s = unsafe { vreinterpretq_u16_s8(self) };
            let mask = unsafe { vshrn_n_u16(asu16s, 4) };
            let asu64 = unsafe { vreinterpret_u64_u8(mask) };
            let scalar64 = unsafe { vget_lane_u64(asu64, 0) };
            NeonMoveMask(scalar64 & 0x8888888888888888)
        }

        #[inline(always)]
        fn cmpeq(self, vector2: Self) -> Self {
            unsafe { vreinterpretq_s8_u8(vceqq_s8(self, vector2)) }
        }

        #[inline(always)]
        fn or(self, vector2: Self) -> Self {
            unsafe { vorrq_s8(self, vector2) }
        }

        /// This is the only interesting implementation of this routine.
        /// Basically, instead of doing the "shift right narrow" dance, we use
        /// adjacent folding max to determine whether there are any non-zero
        /// bytes in our mask. If there are, *then* we'll do the "shift right
        /// narrow" dance. In benchmarks, this does lead to slightly better
        /// throughput, but the win doesn't appear huge.
#[inline(always)]
        fn movemask_will_have_non_zero(self) -> bool {
            // Pairwise max folds the 16 lanes into the low 8 bytes, so the
            // low 64-bit lane is non-zero iff any byte of `self` is non-zero.
            let self_ = unsafe { vreinterpretq_u8_s8(self) };
            let low = unsafe { vreinterpretq_u64_u8(vpmaxq_u8(self_, self_)) };
            unsafe { vgetq_lane_u64(low, 0) != 0 }
        }

        #[inline(always)]
        fn add(self, vector2: Self) -> Self {
            unsafe { vaddq_s8(self, vector2) }
        }

        // Signed per-lane comparison.
        #[inline(always)]
        fn gt(self, vector2: Self) -> Self {
            unsafe { vreinterpretq_s8_u8(vcgtq_s8(self, vector2)) }
        }
    }

    /// Neon doesn't have a `movemask` that works like the one in x86-64, so we
    /// wind up using a different method[1]. The different method also produces
    /// a mask, but 4 bits are set in the neon case instead of a single bit set
    /// in the x86-64 case. We do an extra step to zero out 3 of the 4 bits,
    /// but we still wind up with at least 3 zeroes between each set bit. This
    /// generally means that we need to do some division by 4 before extracting
    /// offsets.
    ///
    /// In fact, the existence of this type is the entire reason that we have
    /// the `MoveMask` trait in the first place. This basically lets us keep
    /// the different representations of masks without being forced to unify
    /// them into a single representation, which could result in extra and
    /// unnecessary work.
    ///
    /// [1]: https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
    #[derive(Clone, Copy)]
    pub struct NeonMoveMask(u64);

    impl core::fmt::Debug for NeonMoveMask {
        /// Prints the raw 64-bit mask in binary (four digits per lane).
        fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
            write!(f, "{:b}", self.0)
        }
    }

    impl NeonMoveMask {
        /// Get the mask in a form suitable for computing offsets.
        ///
        /// Basically, this normalizes to little endian. On big endian, this
        /// swaps the bytes.
        // TODO: Endianness does NOT affect the result of bitwise operations
        // (like <<, >>, &) or methods like `.trailing_zeros()` on the integer
        // returned by a SIMD movemask. The bit order in the movemask result is
        // defined by the SIMD instruction set (e.g., bit 0 corresponds to lane
        // 0, bit 1 to lane 1, etc.), regardless of how bytes are stored in
        // memory. So shifting the mask or counting trailing zeros is safe and
        // portable.
        #[inline(always)]
        fn get_for_offset(self) -> u64 {
            #[cfg(target_endian = "big")]
            {
                self.0.swap_bytes()
            }
            #[cfg(target_endian = "little")]
            {
                self.0
            }
        }
    }

    impl MoveMask for NeonMoveMask {
        #[inline(always)]
        fn has_non_zero(self) -> bool {
            self.0 != 0
        }

        #[inline(always)]
        fn shr(self, rhs: u32) -> Self {
            // Mask is 64 bits instead of 16 bits (for a 128 bit vector)
            // so every position has 4 bits. We need to multiply the shift
            // amount by 4 to shift the bits correctly.
            // Endianness is not relevant here because the mask always uses
            // first_offset to compute the offset and shift operations always
            // respect the value.
            NeonMoveMask(self.0.wrapping_shr(rhs << 2))
        }

        // Only one bit per lane survives the `0x8888…` filter in `movemask`,
        // so clearing the lowest set bit removes exactly one lane. Must only
        // be called on a non-zero mask (`self.0 - 1` underflows for zero).
        #[inline(always)]
        fn clear_least_significant_bit(self) -> NeonMoveMask {
            NeonMoveMask(self.0 & (self.0 - 1))
        }

        #[inline(always)]
        fn first_offset(self) -> usize {
            // We are dealing with little endian here (and if we aren't,
            // we swap the bytes so we are in practice), where the most
            // significant byte is at a higher address. That means the least
            // significant bit that is set corresponds to the position of our
            // first matching byte. That position corresponds to the number of
            // zeros after the least significant bit.
            //
            // Note that unlike `SensibleMoveMask`, this mask has its bits
            // spread out over 64 bits instead of 16 bits (for a 128 bit
            // vector). Namely, where as x86-64 will turn
            //
            //   0x00 0xFF 0x00 0x00 0xFF
            //
            // into 10010, our neon approach will turn it into
            //
            //   10000000000010000000
            //
            // And this happens because neon doesn't have a native `movemask`
            // instruction, so we kind of fake it[1]. Thus, we divide the
            // number of trailing zeros by 4 to get the "real" offset.
// // [1]: https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon (self.get_for_offset().trailing_zeros() >> 2) as usize } } } #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] mod wasm_simd128 { use core::arch::wasm32::*; use super::{SensibleMoveMask, Vector}; impl Vector for v128 { const BYTES: usize = 16; const ALIGN: usize = Self::BYTES - 1; type Mask = SensibleMoveMask; #[inline(always)] fn splat(byte: u8) -> Self { u8x16_splat(byte) } #[inline(always)] unsafe fn load_aligned(data: *const u8) -> Self { unsafe { *data.cast() } } #[inline(always)] unsafe fn load_unaligned(data: *const u8) -> Self { unsafe { v128_load(data.cast()) } } #[inline(always)] fn movemask(self) -> SensibleMoveMask { SensibleMoveMask(u8x16_bitmask(self).into()) } #[inline(always)] fn cmpeq(self, vector2: Self) -> Self { i8x16_eq(self, vector2) } #[inline(always)] fn or(self, vector2: Self) -> Self { v128_or(self, vector2) } fn add(self, vector2: Self) -> Self { i8x16_add(self, vector2) } fn gt(self, vector2: Self) -> Self { i8x16_gt(self, vector2) } } } v_escape-base-0.1.0/src/writer.rs000064400000000000000000000127561046102023000150030ustar 00000000000000use core::{slice, str}; use crate::ext::Pointer; /// A trait for writing strings, defined as a function that takes a string slice /// and returns a `Result`. /// /// # Type Parameters /// - `R`: The result type for the writer function. // TODO: Maybe Writer, R> = FnMut(&str) -> Result // struct Foo; // impl core::ops::Add for Foo { // type Output = Foo; // #[inline(always)] // fn add(self, _rhs: Self) -> Self::Output { // self // } // } // impl From for () { // #[inline(always)] // fn from(_: Foo) -> Self { // () // } // } // And implement for &mut dyn core::fmt::Write and &mut dyn core::io::Write pub trait Writer: FnMut(&str) -> Result<(), R> {} impl Result<(), R>> Writer for T {} /// Writes a string slice using the writer function. 
///
/// # Parameters
/// - `src`: The string slice to be written.
/// - `writer`: The function to write the string slice.
///
/// # Returns
/// Whatever the writer returns for `src`.
#[inline(always)]
pub(crate) fn write(src: &str, writer: &mut impl Writer) -> Result<(), R> {
    (writer)(src)
}

/// Writes the bytes in `start..end` as a string using the writer function.
///
/// # Parameters
/// - `start`: The starting pointer of the byte slice.
/// - `end`: The ending pointer (one past the last byte) of the byte slice.
/// - `writer`: The function to write the string slice.
///
/// # Returns
/// A `Result` indicating the success or failure of the write operation.
///
/// # Safety
/// The caller must guarantee that `start..end` is a readable range with
/// `end >= start` and that its bytes are valid UTF-8: the range is turned
/// into a `&str` with `str::from_utf8_unchecked`, without any validation.
#[inline(always)]
pub(crate) unsafe fn write_slice(
    start: *const u8,
    end: *const u8,
    writer: &mut impl Writer,
) -> Result<(), R> {
    unsafe {
        write(
            str::from_utf8_unchecked(slice::from_raw_parts(start, end.distance(start))),
            writer,
        )
    }
}

/// A macro for creating a private function that escapes a string into a
/// `core::fmt::Formatter`.
///
/// # Parameters
/// - `$name`: The name of the generated function.
/// - `$fn`: The path of the escape function, brought into scope with `use`.
/// - `$fn_name`: The name under which `$fn` is called after the `use`.
/// - `$builder`: The builder type passed as the first generic argument.
#[doc(hidden)]
#[macro_export]
#[cfg(feature = "fmt")]
macro_rules! builder_fmt {
    ($name:ident, $fn:path, $fn_name:ident, $builder:ty) => {
        fn $name(haystack: &str, buffer: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
            use $fn;
            // Forward every chunk to the formatter; its error is propagated.
            let writer = |s: &str| buffer.write_str(s);
            $fn_name::<$builder, _>(haystack, writer)
        }
    };
}

/// A macro for creating a public function that returns an `impl Display`
/// which escapes the wrapped string when it is formatted.
///
/// # Parameters
/// - `$name`: The name of the generated function.
/// - `$internal`: The function called as `$internal(haystack, formatter)`
///   from the `Display` implementation.
/// - `$body`: An expression expanded as a statement right before the call to
///   `$internal`.
/// - `$builder`: The type of the builder (accepted but not used by the
///   expansion).
#[doc(hidden)]
#[macro_export]
#[cfg(feature = "fmt")]
macro_rules! struct_display {
    ($name:ident, $internal:ident, $body:expr, $builder:ty) => {
        pub fn $name<'a>(haystack: &'a str) -> impl core::fmt::Display + 'a {
            // Borrowing wrapper; the escaping work happens lazily in `fmt`.
            struct __SDisplay<'a>(&'a str);
            impl<'a> core::fmt::Display for __SDisplay<'a> {
                fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
                    $body;
                    $internal(self.0, f)
                }
            }
            __SDisplay(haystack)
        }
    };
}

// Without the `fmt` feature the macro still exists but expands to nothing.
#[cfg(not(feature = "fmt"))]
#[macro_export]
#[doc(hidden)]
macro_rules! struct_display {
    ($($tt:tt)*) => {};
}

/// A macro for creating a public function that appends the escaped string to
/// a `String`.
///
/// # Parameters
/// - `$name`: The name of the generated function.
/// - `$fn`: The path of the escape function, brought into scope with `use`.
/// - `$fn_name`: The name under which `$fn` is called after the `use`.
/// - `$builder`: The builder type passed as the first generic argument.
#[doc(hidden)]
#[macro_export]
#[cfg(feature = "string")]
macro_rules! builder_string {
    ($name:ident, $fn:path, $fn_name:ident, $builder:ty) => {
        pub fn $name(haystack: &str, buffer: &mut String) {
            use $fn;
            // This writer never fails, so the result is discarded below.
            let writer = |s: &str| {
                buffer.push_str(s);
                Ok::<(), ()>(())
            };
            let _ = $fn_name::<$builder, _>(haystack, writer);
        }
    };
}

// Passes its tokens through when the `string` feature is enabled ...
#[doc(hidden)]
#[macro_export]
#[cfg(feature = "string")]
macro_rules! struct_string {
    ($($tt:tt)*) => {
        $($tt)*;
    };
}

// ... and swallows them when it is not.
#[cfg(not(feature = "string"))]
#[macro_export]
#[doc(hidden)]
macro_rules! struct_string {
    ($($tt:tt)*) => {};
}

/// A macro for creating a public function that appends the escaped string, as
/// bytes, to a `Vec`.
///
/// # Parameters
/// - `$name`: The name of the generated function.
/// - `$fn`: The path of the escape function, brought into scope with `use`.
/// - `$fn_name`: The name under which `$fn` is called after the `use`.
/// - `$builder`: The builder type passed as the first generic argument.
#[doc(hidden)]
#[macro_export]
#[cfg(feature = "bytes")]
macro_rules!
builder_bytes { ($name:ident, $fn:path, $fn_name:ident, $builder:ty) => { pub fn $name(haystack: &str, buffer: &mut Vec) { use $fn; let writer = |s: &str| { buffer.extend_from_slice(s.as_bytes()); Ok::<(), ()>(()) }; let _ = $fn_name::<$builder, _>(haystack, writer); } }; } #[doc(hidden)] #[macro_export] #[cfg(feature = "bytes")] macro_rules! struct_bytes { ($($tt:tt)*) => { $($tt)*; }; } #[cfg(not(feature = "bytes"))] #[macro_export] #[doc(hidden)] macro_rules! struct_bytes { ($($tt:tt)*) => {}; } v_escape-base-0.1.0/tests/lib.rs000064400000000000000000000641121046102023000146010ustar 00000000000000#![cfg(all(feature = "string", feature = "fmt", feature = "bytes"))] use v_escape_base::{Escapes, EscapesBuilder, Vector, escape_builder}; mod no_false_positive { use super::*; #[derive(Debug, Clone, Copy)] struct Equal { a: V, } struct Builder; impl EscapesBuilder for Builder { type Escapes = Equal; fn new() -> Self::Escapes { Equal { a: V::splat(b'a') } } } impl Escapes for Equal { const ESCAPE_LEN: usize = 1; const FALSE_POSITIVE: bool = false; type Vector = V; #[inline(always)] fn masking(&self, vector2: V) -> V { self.a.cmpeq(vector2) } #[inline(always)] fn escape(_: usize) -> &'static str { "foo" } #[inline(always)] fn position(_: u8) -> usize { 0 } #[inline(always)] fn byte_byte_compare(c: u8) -> bool { c == b'a' } } escape_builder!(Builder); #[test] fn test_escape_bytes() { let mut buffer = String::new(); let haystack = "a".repeat(64); escape_string(&haystack, &mut buffer); assert_eq!(buffer, "foo".repeat(64)); } #[test] fn test_escape_bytes_vec() { let mut buffer = Vec::new(); let haystack = "a".repeat(64); escape_bytes(&haystack, &mut buffer); assert_eq!(buffer, "foo".repeat(64).as_bytes()); } #[test] fn test_escape_fmt() { let haystack = "a".repeat(64); let result = escape_fmt(&haystack).to_string(); assert_eq!(result, "foo".repeat(64)); } // Test empty string #[test] fn test_empty_string() { let mut buffer = String::new(); escape_string("", &mut buffer); 
assert_eq!(buffer, ""); let result = escape_fmt("").to_string(); assert_eq!(result, ""); } #[test] fn test_empty_bytes() { let mut buffer = Vec::new(); escape_bytes("", &mut buffer); assert_eq!(buffer, b""); } // Test string with no escapes #[test] fn test_no_escapes() { let mut buffer = String::new(); let haystack = "hello world"; escape_string(haystack, &mut buffer); assert_eq!(buffer, haystack); let result = escape_fmt(haystack).to_string(); assert_eq!(result, haystack); } #[test] fn test_no_escapes_bytes() { let mut buffer = Vec::new(); let haystack = "hello world"; escape_bytes(haystack, &mut buffer); assert_eq!(buffer, haystack.as_bytes()); } // Test single character #[test] fn test_single_character() { let mut buffer = String::new(); escape_string("a", &mut buffer); assert_eq!(buffer, "foo"); let result = escape_fmt("a").to_string(); assert_eq!(result, "foo"); } #[test] fn test_single_character_bytes() { let mut buffer = Vec::new(); escape_bytes("a", &mut buffer); assert_eq!(buffer, b"foo"); } // Test mixed content #[test] fn test_mixed_content() { let mut buffer = String::new(); let haystack = "hello a world a test"; escape_string(haystack, &mut buffer); assert_eq!(buffer, "hello foo world foo test"); let result = escape_fmt(haystack).to_string(); assert_eq!(result, "hello foo world foo test"); } #[test] fn test_mixed_content_bytes() { let mut buffer = Vec::new(); let haystack = "hello a world a test"; escape_bytes(haystack, &mut buffer); assert_eq!(buffer, "hello foo world foo test".as_bytes()); } // Test consecutive escapes #[test] fn test_consecutive_escapes() { let mut buffer = String::new(); let haystack = "aaa"; escape_string(haystack, &mut buffer); assert_eq!(buffer, "foofoofoo"); let result = escape_fmt(haystack).to_string(); assert_eq!(result, "foofoofoo"); } #[test] fn test_consecutive_escapes_bytes() { let mut buffer = Vec::new(); let haystack = "aaa"; escape_bytes(haystack, &mut buffer); assert_eq!(buffer, b"foofoofoo"); } // Test escape at 
beginning #[test] fn test_escape_at_beginning() { let mut buffer = String::new(); let haystack = "ahello"; escape_string(haystack, &mut buffer); assert_eq!(buffer, "foohello"); let result = escape_fmt(haystack).to_string(); assert_eq!(result, "foohello"); } #[test] fn test_escape_at_beginning_bytes() { let mut buffer = Vec::new(); let haystack = "ahello"; escape_bytes(haystack, &mut buffer); assert_eq!(buffer, b"foohello"); } // Test escape at end #[test] fn test_escape_at_end() { let mut buffer = String::new(); let haystack = "helloa"; escape_string(haystack, &mut buffer); assert_eq!(buffer, "hellofoo"); let result = escape_fmt(haystack).to_string(); assert_eq!(result, "hellofoo"); } #[test] fn test_escape_at_end_bytes() { let mut buffer = Vec::new(); let haystack = "helloa"; escape_bytes(haystack, &mut buffer); assert_eq!(buffer, b"hellofoo"); } // Test large string #[test] fn test_large_string() { let mut buffer = String::new(); let haystack = "a".repeat(1000); escape_string(&haystack, &mut buffer); assert_eq!(buffer, "foo".repeat(1000)); let result = escape_fmt(&haystack).to_string(); assert_eq!(result, "foo".repeat(1000)); } #[test] fn test_large_bytes() { let mut buffer = Vec::new(); let haystack = "a".repeat(1000); escape_bytes(&haystack, &mut buffer); assert_eq!(buffer, "foo".repeat(1000).as_bytes()); } // Test with Cow types #[test] fn test_cow_types() { use std::borrow::Cow; let mut buffer = String::new(); let cow_owned = Cow::Owned("a".to_string()); escape_string(&cow_owned, &mut buffer); assert_eq!(buffer, "foo"); let result = escape_fmt(&cow_owned).to_string(); assert_eq!(result, "foo"); let cow_borrowed = Cow::Borrowed("a"); let mut buffer2 = String::new(); escape_string(&cow_borrowed, &mut buffer2); assert_eq!(buffer2, "foo"); let result2 = escape_fmt(&cow_borrowed).to_string(); assert_eq!(result2, "foo"); } #[test] fn test_cow_types_bytes() { use std::borrow::Cow; let mut buffer = Vec::new(); let cow_owned = Cow::Owned("a".to_string()); 
escape_bytes(&cow_owned, &mut buffer); assert_eq!(buffer, b"foo"); let cow_borrowed = Cow::Borrowed("a"); let mut buffer2 = Vec::new(); escape_bytes(&cow_borrowed, &mut buffer2); assert_eq!(buffer2, b"foo"); } // Test byte-by-byte escape functionality #[test] fn test_byte_byte_escape() { let mut buffer = String::new(); let writer = |s: &str| { buffer.push_str(s); Ok::<(), ()>(()) }; let result = Equal::<()>::byte_byte_escape("a", writer); assert!(result.is_ok()); assert_eq!(buffer, "foo"); } // Test position and escape functions #[test] fn test_position_and_escape() { assert_eq!(Equal::<()>::position(b'a'), 0); assert_eq!(Equal::<()>::escape(0), "foo"); } // Test byte_byte_compare #[test] fn test_byte_byte_compare() { assert!(Equal::<()>::byte_byte_compare(b'a')); assert!(!Equal::<()>::byte_byte_compare(b'b')); assert!(!Equal::<()>::byte_byte_compare(b'z')); } // Test with different string types #[test] fn test_different_string_types() { let mut buffer = String::new(); // String let string = String::from("a"); escape_string(&string, &mut buffer); assert_eq!(buffer, "foo"); // &str let mut buffer2 = String::new(); let str_ref = "a"; escape_string(str_ref, &mut buffer2); assert_eq!(buffer2, "foo"); // Box let mut buffer3 = String::new(); let boxed_str = "a".to_string().into_boxed_str(); escape_string(&boxed_str, &mut buffer3); assert_eq!(buffer3, "foo"); } #[test] fn test_different_string_types_bytes() { // String let mut buffer = Vec::new(); let string = String::from("a"); escape_bytes(&string, &mut buffer); assert_eq!(buffer, b"foo"); // &str let mut buffer2 = Vec::new(); let str_ref = "a"; escape_bytes(str_ref, &mut buffer2); assert_eq!(buffer2, b"foo"); // Box let mut buffer3 = Vec::new(); let boxed_str = "a".to_string().into_boxed_str(); escape_bytes(&boxed_str, &mut buffer3); assert_eq!(buffer3, b"foo"); } // Test Display trait implementation #[test] fn test_display_trait() { use std::fmt::Write; let haystack = "a"; let display = escape_fmt(haystack); let mut 
buffer = String::new(); write!(&mut buffer, "{}", display).unwrap(); assert_eq!(buffer, "foo"); } // Test with unicode characters (non-ASCII) #[test] fn test_unicode_characters() { let mut buffer = String::new(); let haystack = "a🚀a"; escape_string(haystack, &mut buffer); assert_eq!(buffer, "foo🚀foo"); let result = escape_fmt(haystack).to_string(); assert_eq!(result, "foo🚀foo"); } #[test] fn test_unicode_characters_bytes() { let mut buffer = Vec::new(); let haystack = "a🚀a"; escape_bytes(haystack, &mut buffer); assert_eq!(buffer, "foo🚀foo".as_bytes()); } // Test with emoji and special characters #[test] fn test_emoji_and_special_chars() { let mut buffer = String::new(); let haystack = "a🎉🌟a"; escape_string(haystack, &mut buffer); assert_eq!(buffer, "foo🎉🌟foo"); let result = escape_fmt(haystack).to_string(); assert_eq!(result, "foo🎉🌟foo"); } #[test] fn test_emoji_and_special_chars_bytes() { let mut buffer = Vec::new(); let haystack = "a🎉🌟a"; escape_bytes(haystack, &mut buffer); assert_eq!(buffer, "foo🎉🌟foo".as_bytes()); } // Test with very long strings to test vectorization #[test] fn test_very_long_strings() { let mut buffer = String::new(); let haystack = "a".repeat(10000); escape_string(&haystack, &mut buffer); assert_eq!(buffer, "foo".repeat(10000)); let result = escape_fmt(&haystack).to_string(); assert_eq!(result, "foo".repeat(10000)); } #[test] fn test_very_long_bytes() { let mut buffer = Vec::new(); let haystack = "a".repeat(10000); escape_bytes(&haystack, &mut buffer); assert_eq!(buffer, "foo".repeat(10000).as_bytes()); } // Test with strings that are exactly vector size #[test] fn test_vector_sized_strings() { // Test with different sizes that might trigger different vectorization paths for size in [16, 32, 64, 128] { let mut buffer = String::new(); let haystack = "a".repeat(size); escape_string(&haystack, &mut buffer); assert_eq!(buffer, "foo".repeat(size)); let result = escape_fmt(&haystack).to_string(); assert_eq!(result, "foo".repeat(size)); } } 
// Remaining tests of `mod no_false_positive`, which is opened earlier in this file.
    #[test]
    fn test_vector_sized_bytes() {
        // Test with different sizes that might trigger different vectorization paths
        for size in [16, 32, 64, 128] {
            let mut buffer = Vec::new();
            let haystack = "a".repeat(size);
            escape_bytes(&haystack, &mut buffer);
            assert_eq!(buffer, "foo".repeat(size).as_bytes());
        }
    }

    // Test with strings that are smaller than vector size
    #[test]
    fn test_small_strings() {
        for size in 1..16 {
            let mut buffer = String::new();
            let haystack = "a".repeat(size);
            escape_string(&haystack, &mut buffer);
            assert_eq!(buffer, "foo".repeat(size));

            let result = escape_fmt(&haystack).to_string();
            assert_eq!(result, "foo".repeat(size));
        }
    }

    #[test]
    fn test_small_bytes() {
        for size in 1..16 {
            let mut buffer = Vec::new();
            let haystack = "a".repeat(size);
            escape_bytes(&haystack, &mut buffer);
            assert_eq!(buffer, "foo".repeat(size).as_bytes());
        }
    }

    // Test with strings that have escapes at specific positions
    #[test]
    fn test_escapes_at_specific_positions() {
        let test_cases = [
            ("a", "foo"),
            ("aa", "foofoo"),
            ("aaa", "foofoofoo"),
            ("a a", "foo foo"),
            (" a ", " foo "),
            ("a\na", "foo\nfoo"),
            ("a\ta", "foo\tfoo"),
        ];

        for (input, expected) in test_cases {
            let mut buffer = String::new();
            escape_string(input, &mut buffer);
            assert_eq!(buffer, expected, "Failed for input: {:?}", input);

            let result = escape_fmt(input).to_string();
            assert_eq!(result, expected, "Failed for input: {:?}", input);
        }
    }

    #[test]
    fn test_escapes_at_specific_positions_bytes() {
        let test_cases = [
            ("a", "foo"),
            ("aa", "foofoo"),
            ("aaa", "foofoofoo"),
            ("a a", "foo foo"),
            (" a ", " foo "),
            ("a\na", "foo\nfoo"),
            ("a\ta", "foo\tfoo"),
        ];

        for (input, expected) in test_cases {
            let mut buffer = Vec::new();
            escape_bytes(input, &mut buffer);
            assert_eq!(buffer, expected.as_bytes(), "Failed for input: {:?}", input);
        }
    }

    // Test with strings containing only non-escape characters
    #[test]
    fn test_only_non_escape_chars() {
        let test_strings = ["hello", "world", "test", "12345", "!@#$%", "🚀🌟🎉"];

        for test_str in test_strings {
            let mut buffer = String::new();
            escape_string(test_str, &mut buffer);
            assert_eq!(buffer, test_str, "Failed for input: {:?}", test_str);

            let result = escape_fmt(test_str).to_string();
            assert_eq!(result, test_str, "Failed for input: {:?}", test_str);
        }
    }

    #[test]
    fn test_only_non_escape_chars_bytes() {
        let test_strings = ["hello", "world", "test", "12345", "!@#$%", "🚀🌟🎉"];

        for test_str in test_strings {
            let mut buffer = Vec::new();
            escape_bytes(test_str, &mut buffer);
            assert_eq!(
                buffer,
                test_str.as_bytes(),
                "Failed for input: {:?}",
                test_str
            );
        }
    }

    // Test with strings containing only escape characters
    #[test]
    fn test_only_escape_chars() {
        let mut buffer = String::new();
        let haystack = "a".repeat(10);
        escape_string(&haystack, &mut buffer);
        assert_eq!(buffer, "foo".repeat(10));

        let result = escape_fmt(&haystack).to_string();
        assert_eq!(result, "foo".repeat(10));
    }

    #[test]
    fn test_only_escape_chars_bytes() {
        let mut buffer = Vec::new();
        let haystack = "a".repeat(10);
        escape_bytes(&haystack, &mut buffer);
        assert_eq!(buffer, "foo".repeat(10).as_bytes());
    }

    // Test with strings that have escapes at the boundary of vector operations
    #[test]
    fn test_boundary_escapes() {
        // Test with strings that might trigger different vectorization paths
        let sizes = [15, 16, 17, 31, 32, 33, 63, 64, 65];

        for size in sizes {
            let mut buffer = String::new();
            let haystack = "a".repeat(size);
            escape_string(&haystack, &mut buffer);
            assert_eq!(buffer, "foo".repeat(size), "Failed for size: {}", size);

            let result = escape_fmt(&haystack).to_string();
            assert_eq!(result, "foo".repeat(size), "Failed for size: {}", size);
        }
    }

    #[test]
    fn test_boundary_escapes_bytes() {
        // Test with strings that might trigger different vectorization paths
        let sizes = [15, 16, 17, 31, 32, 33, 63, 64, 65];

        for size in sizes {
            let mut buffer = Vec::new();
            let haystack = "a".repeat(size);
            escape_bytes(&haystack, &mut buffer);
            assert_eq!(
                buffer,
                "foo".repeat(size).as_bytes(),
                "Failed for size: {}",
                size
            );
        }
    }

    // Test with strings that have escapes at the very end
    #[test]
    fn test_escapes_at_end() {
        let test_cases = [
            ("a", "foo"),
            ("sa", "sfoo"),
            ("cba", "cbfoo"),
            ("dcba", "dcbfoo"),
            ("edcba", "edcbfoo"),
        ];

        for (input, expected) in test_cases {
            let mut buffer = String::new();
            escape_string(input, &mut buffer);
            assert_eq!(buffer, expected, "Failed for input: {:?}", input);

            let result = escape_fmt(input).to_string();
            assert_eq!(result, expected, "Failed for input: {:?}", input);
        }
    }

    #[test]
    fn test_escapes_at_end_bytes() {
        let test_cases = [
            ("a", "foo"),
            ("sa", "sfoo"),
            ("cba", "cbfoo"),
            ("dcba", "dcbfoo"),
            ("edcba", "edcbfoo"),
        ];

        for (input, expected) in test_cases {
            let mut buffer = Vec::new();
            escape_bytes(input, &mut buffer);
            assert_eq!(buffer, expected.as_bytes(), "Failed for input: {:?}", input);
        }
    }

    // Test with strings that have escapes at the very beginning
    #[test]
    fn test_escapes_at_beginning() {
        let test_cases = [
            ("a", "foo"),
            ("ab", "foob"),
            ("abc", "foobc"),
            ("abcd", "foobcd"),
        ];

        for (input, expected) in test_cases {
            let mut buffer = String::new();
            escape_string(input, &mut buffer);
            assert_eq!(buffer, expected, "Failed for input: {:?}", input);

            let result = escape_fmt(input).to_string();
            assert_eq!(result, expected, "Failed for input: {:?}", input);
        }
    }

    #[test]
    fn test_escapes_at_beginning_bytes() {
        let test_cases = [
            ("a", "foo"),
            ("ab", "foob"),
            ("abc", "foobc"),
            ("abcd", "foobcd"),
        ];

        for (input, expected) in test_cases {
            let mut buffer = Vec::new();
            escape_bytes(input, &mut buffer);
            assert_eq!(buffer, expected.as_bytes(), "Failed for input: {:?}", input);
        }
    }

    // Test with strings that have escapes in the middle.
    // Every input has non-escaped bytes on BOTH sides of the `a`, so the
    // escape is neither the first nor the last byte of the haystack.
    #[test]
    fn test_escapes_in_middle() {
        let test_cases = [
            ("sas", "sfoos"),
            ("cbabc", "cbfoobc"),
            ("dcbabcd", "dcbfoobcd"),
            ("edcbabcde", "edcbfoobcde"),
        ];

        for (input, expected) in test_cases {
            let mut buffer = String::new();
            escape_string(input, &mut buffer);
            assert_eq!(buffer, expected, "Failed for input: {:?}", input);

            let result = escape_fmt(input).to_string();
            assert_eq!(result, expected, "Failed for input: {:?}", input);
        }
    }

    #[test]
    fn test_escapes_in_middle_bytes() {
        let test_cases = [
            ("sas", "sfoos"),
            ("cbabc", "cbfoobc"),
            ("dcbabcd", "dcbfoobcd"),
            ("edcbabcde", "edcbfoobcde"),
        ];

        for (input, expected) in test_cases {
            let mut buffer = Vec::new();
            escape_bytes(input, &mut buffer);
            assert_eq!(buffer, expected.as_bytes(), "Failed for input: {:?}", input);
        }
    }

    // Test with strings that have multiple escapes in various positions
    #[test]
    fn test_multiple_escapes_various_positions() {
        let test_cases = [
            ("aa", "foofoo"),
            ("aaa", "foofoofoo"),
            ("a a", "foo foo"),
            ("a a a", "foo foo foo"),
            (" a a ", " foo foo "),
            ("a\na\na", "foo\nfoo\nfoo"),
        ];

        for (input, expected) in test_cases {
            let mut buffer = String::new();
            escape_string(input, &mut buffer);
            assert_eq!(buffer, expected, "Failed for input: {:?}", input);

            let result = escape_fmt(input).to_string();
            assert_eq!(result, expected, "Failed for input: {:?}", input);
        }
    }

    #[test]
    fn test_multiple_escapes_various_positions_bytes() {
        let test_cases = [
            ("aa", "foofoo"),
            ("aaa", "foofoofoo"),
            ("a a", "foo foo"),
            ("a a a", "foo foo foo"),
            (" a a ", " foo foo "),
            ("a\na\na", "foo\nfoo\nfoo"),
        ];

        for (input, expected) in test_cases {
            let mut buffer = Vec::new();
            escape_bytes(input, &mut buffer);
            assert_eq!(buffer, expected.as_bytes(), "Failed for input: {:?}", input);
        }
    }
}

// HTML-style escaper for `"`, `&`, `'`, `/`, `<` and `>`. It declares
// `FALSE_POSITIVE = true`; `byte_byte_compare` is the exact per-byte check.
mod false_positive {
    use super::*;

    // Maps a byte to its index in `V_ESCAPE_QUOTES`; 6 (== `V_ESCAPE_LEN`)
    // marks a byte that is not escaped.
    static V_ESCAPE_CHARS: [u8; 256] = {
        let mut chars = [6u8; 256];
        chars[b'\"' as usize] = 0;
        chars[b'&' as usize] = 1;
        chars[b'\'' as usize] = 2;
        chars[b'/' as usize] = 3;
        chars[b'<' as usize] = 4;
        chars[b'>' as usize] = 5;
        chars
    };

    // Replacement text for each escaped byte, in `V_ESCAPE_CHARS` index order.
    static V_ESCAPE_QUOTES: [&str; 6usize] =
        ["&quot;", "&amp;", "&#x27;", "&#x2f;", "&lt;", "&gt;"];

    const V_ESCAPE_LEN: usize = 6usize;

    // NOTE(review): `V` is used below but never declared in this copy of the
    // file. The generic parameter lists (presumably `<V: Vector>` on the
    // struct and the impls) look like they were lost in extraction; confirm
    // against the upstream test file before building.
    #[derive(Debug, Clone, Copy)]
    struct Escape {
        translation_a: V,
        below_a: V,
        translation_b: V,
        below_b: V,
        c: V,
    }

    struct Builder;

    impl EscapesBuilder for Builder {
        type Escapes = Escape;

        // Splat constants consumed by `masking`. Assuming `add` wraps and `gt`
        // is a signed per-lane compare (neither is visible here; confirm):
        //   byte + 88 > 121  selects bytes 34..=39 (`"` through `'`),
        //   byte + 65 > 124  selects bytes 60..=62 (`<` through `>`),
        //   byte == 47       selects `/`.
        fn new() -> Self::Escapes {
            Self::Escapes {
                translation_a: V::splat(88i8 as u8),
                below_a: V::splat(121i8 as u8),
                translation_b: V::splat(65i8 as u8),
                below_b: V::splat(124i8 as u8),
                c: V::splat(47i8 as u8),
            }
        }
    }

    impl Escapes for Escape {
        const ESCAPE_LEN: usize = 6usize;
        const FALSE_POSITIVE: bool = true;

        type Vector = V;

        // Union of the two translated range checks and the equality check.
        fn masking(&self, vector2: V) -> V {
            vector2
                .add(self.translation_a)
                .gt(self.below_a)
                .or(vector2.add(self.translation_b).gt(self.below_b))
                .or(vector2.cmpeq(self.c))
        }

        // Replacement text for escape index `i`.
        fn escape(i: usize) -> &'static str {
            V_ESCAPE_QUOTES[i]
        }

        // Escape index for byte `i` (`V_ESCAPE_LEN` when the byte is not escaped).
        fn position(i: u8) -> usize {
            V_ESCAPE_CHARS[i as usize] as usize
        }

        // Exact scalar check: true only for the six escaped bytes.
        fn byte_byte_compare(c: u8) -> bool {
            (V_ESCAPE_CHARS[c as usize] as usize) < V_ESCAPE_LEN
        }
    }

    // Provides the `escape_string` and `escape_bytes` functions called by the
    // tests below.
    escape_builder!(Builder);

    // A long run of unescaped bytes framed by two escaped ones.
    #[test]
    fn test_false_positive() {
        let mut buffer = String::new();
        let haystack = ">".to_string() + &"foobar".repeat(100) + "<";
        escape_string(&haystack, &mut buffer);
        assert_eq!(
            buffer,
            "&gt;".to_string() + &"foobar".repeat(100) + "&lt;"
        );
    }

    #[test]
    fn test_false_positive_bytes() {
        let mut buffer = Vec::new();
        let haystack = ">".to_string() + &"foobar".repeat(100) + "<";
        escape_bytes(&haystack, &mut buffer);
        assert_eq!(
            buffer,
            ("&gt;".to_string() + &"foobar".repeat(100) + "&lt;").as_bytes()
        );
    }

    #[test]
    fn test_html_escape_mapping() {
        // Test that each character maps to the correct HTML escape sequence
        let test_cases = [
            ('"', "&quot;"),
            ('&', "&amp;"),
            ('\'', "&#x27;"),
            ('/', "&#x2f;"),
            ('<', "&lt;"),
            ('>', "&gt;"),
        ];

        for (input_char, expected_escape) in test_cases {
            let input = input_char.to_string();

            let mut buffer = String::new();
            escape_string(&input, &mut buffer);
            assert_eq!(
                buffer, expected_escape,
                "Failed for character: {:?}",
                input_char
            );

            let mut buffer_bytes = Vec::new();
            escape_bytes(&input, &mut buffer_bytes);
            assert_eq!(
                buffer_bytes,
                expected_escape.as_bytes(),
                "Failed for character: {:?}",
                input_char
            );
        }
    }

    #[test]
    fn test_html_escape_mapping_verify_indices() {
        // Verify that the V_ESCAPE_CHARS indices correctly map to V_ESCAPE_QUOTES
        assert_eq!(V_ESCAPE_CHARS[b'"' as usize], 0);
        assert_eq!(V_ESCAPE_CHARS[b'&' as usize], 1);
        assert_eq!(V_ESCAPE_CHARS[b'\'' as usize], 2);
        assert_eq!(V_ESCAPE_CHARS[b'/' as usize], 3);
        assert_eq!(V_ESCAPE_CHARS[b'<' as usize], 4);
        assert_eq!(V_ESCAPE_CHARS[b'>' as usize], 5);

        // Verify that the escape function returns the correct strings
        assert_eq!(Escape::<()>::escape(0), "&quot;");
        assert_eq!(Escape::<()>::escape(1), "&amp;");
        assert_eq!(Escape::<()>::escape(2), "&#x27;");
        assert_eq!(Escape::<()>::escape(3), "&#x2f;");
        assert_eq!(Escape::<()>::escape(4), "&lt;");
        assert_eq!(Escape::<()>::escape(5), "&gt;");
    }

    #[test]
    fn test_complete_html_escaping() {
        // Test a string containing all HTML special characters
        let input = r#"<script>alert("Hello & 'World'")</script>"#;
        let expected = r#"&lt;script&gt;alert(&quot;Hello &amp; &#x27;World&#x27;&quot;)&lt;&#x2f;script&gt;"#;

        let mut buffer = String::new();
        escape_string(input, &mut buffer);
        assert_eq!(buffer, expected);

        let mut buffer_bytes = Vec::new();
        escape_bytes(input, &mut buffer_bytes);
        assert_eq!(buffer_bytes, expected.as_bytes());
    }
}
v_escape-base-0.1.0/tests/sys_info.rs000064400000000000000000000013021046102023000156540ustar 00000000000000// For debugging, particularly in CI, print out the byte order
// and architecture of the current target.
#[test]
fn sys_info() {
    // Blank line so the report is visually separated from the harness output.
    std::eprintln!();

    #[cfg(target_arch = "x86_64")]
    std::eprintln!("RUNNING ON x86_64");
    #[cfg(target_arch = "aarch64")]
    std::eprintln!("RUNNING ON aarch64");
    #[cfg(target_arch = "wasm32")]
    std::eprintln!("RUNNING ON wasm32");
    // Fallback for every architecture not named above.
    #[cfg(not(any(
        target_arch = "x86_64",
        target_arch = "aarch64",
        target_arch = "wasm32"
    )))]
    std::eprintln!("RUNNING ON Unknown architecture");

    #[cfg(target_endian = "little")]
    std::eprintln!("LITTLE ENDIAN");
    #[cfg(target_endian = "big")]
    std::eprintln!("BIG ENDIAN");
}