newline-converter-0.3.0/.cargo_vcs_info.json0000644000000001660000000000100145050ustar { "git": { "sha1": "a190f105f43a5963b698d24490b05deb4893145a" }, "path_in_vcs": "crates/newline-converter" }newline-converter-0.3.0/.gitignore000064400000000000000000000000071046102023000152570ustar 00000000000000target/newline-converter-0.3.0/Cargo.toml0000644000000023320000000000100125000ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2018" name = "newline-converter" version = "0.3.0" authors = ["Michal Borejszo "] description = "Newline byte converter library" readme = "README.md" keywords = [ "newline", "linebreak", "conversion", ] categories = [ "encoding", "text-processing", ] license = "MIT" repository = "https://github.com/spitfire05/rnc/tree/master/crates/newline-converter" [lib] bench = false [[bench]] name = "bench" harness = false [dependencies.unicode-segmentation] version = "1.10" [dev-dependencies.criterion] version = "0.4.0" [dev-dependencies.fancy-regex] version = "0.10" [dev-dependencies.lazy-regex] version = "2.3" [dev-dependencies.once_cell] version = "1.15" [dev-dependencies.quickcheck] version = "1" newline-converter-0.3.0/Cargo.toml.orig000064400000000000000000000011551046102023000161630ustar 00000000000000[package] name = "newline-converter" version = "0.3.0" authors = ["Michal Borejszo "] edition = "2018" license = "MIT" description = "Newline byte converter library" repository = "https://github.com/spitfire05/rnc/tree/master/crates/newline-converter" readme = "README.md" keywords = ["newline", "linebreak", "conversion"] categories = ["encoding", "text-processing"] [dependencies] unicode-segmentation = "1.10" [dev-dependencies] criterion = "0.4.0" lazy-regex = "2.3" once_cell = "1.15" fancy-regex = "0.10" quickcheck = "1" [lib] bench = false [[bench]] name = "bench" harness = false newline-converter-0.3.0/README.md000064400000000000000000000031771046102023000145610ustar 00000000000000# newline-converter `newline-converter` is a simple library used for converting the newline characters in strings between Windows `\r\n` and Unix `\n` style. It mainly serves as a backend for [Rust Newline converter](https://github.com/spitfire05/rnc) CLI tool. [![Crates.io](https://img.shields.io/crates/v/newline-converter)](https://crates.io/crates/newline-converter) ## Comparision of newline-wrangling methods ### newline-converter (this crate) - ✅ Properly handles edge-cases like lone `\r` characters. For example, `\r\n` sequences won't become `\r\r\n` after `unix2dos` call: ```rust use newline_converter::unix2dos; assert_eq!( unix2dos("\nfoo\r\nbar\n"), "\r\nfoo\rbar\n" ); ``` - ✅ Is the fastest when input data is small (few bytes of text with line breaks). - ❌ Is the slowest (or second slowest in case of `unix2dos`) when dealing with larger data sets (ex. 100 paragraphs of [Lorem Ipsum](https://www.lipsum.com/)). ### `string.replace` - ❌ Does not handle edge cases properly in `unix2dos`. - ✅ Good performance on larger data sets. ### [regex](https://crates.io/crates/regex) crate `Regex::replace_all` - ❌ Does not handle edge cases properly in `unix2dos`, because of lack of support for look around. - ✅ The best performance with larger data sets. ### [fancy-regex](https://crates.io/crates/fancy-regex) crate `Regex::replace_all` - ✅ Properly handles edge cases. - ❌ `unix2dos` has worst performance of all implementations, by an order of magnitude (because of look around used). Look into `benches/bench.rs` for the comparision benchmarks. ## MSRV Minimum Supported Rust Version is `1.38.0`. newline-converter-0.3.0/benches/bench.rs000064400000000000000000000067771046102023000163470ustar 00000000000000use std::borrow::Cow; use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; use lazy_regex::{lazy_regex, Lazy}; use newline_converter::{dos2unix, unix2dos}; fn dos2unix_string_replace + ?Sized>(input: &T) -> String { input.as_ref().replace("\r\n", "\n") } fn unix2dos_string_replace + ?Sized>(input: &T) -> String { input.as_ref().replace("\n", "\r\n") } static RE_DOS: lazy_regex::Lazy = lazy_regex!("\r\n"); static RE_UNIX: lazy_regex::Lazy = lazy_regex!("\n"); static RE_UNIX_FANCY: Lazy = Lazy::new(|| fancy_regex::Regex::new("(?!\r)\n").unwrap()); fn dos2unix_regex + ?Sized>(input: &T) -> Cow { RE_DOS.replace_all(input.as_ref(), "\n") } fn unix2dos_regex + ?Sized>(input: &T) -> Cow { RE_UNIX.replace_all(input.as_ref(), "\r\n") } fn unix2dos_regex_fancy + ?Sized>(input: &T) -> Cow { RE_UNIX_FANCY.replace_all(input.as_ref(), "\r\n") } const DOS_INPUT: &str = "\r\nfoo\r\nbar\r\n"; const UNIX_INPUT: &str = "\nfoo\nbar\n"; fn bench_dos2unix(c: &mut Criterion) { let mut group = c.benchmark_group("dos2unix"); let i = DOS_INPUT; group.bench_with_input(BenchmarkId::new("newline-converter", ""), i, |b, i| { b.iter(|| dos2unix(i)) }); group.bench_with_input(BenchmarkId::new("string.replace", ""), i, |b, i| { b.iter(|| dos2unix_string_replace(i)) }); group.bench_with_input(BenchmarkId::new("regex", ""), i, |b, i| { b.iter(|| dos2unix_regex(i)) }); group.finish(); } fn bench_dos2unix_noop(c: &mut Criterion) { let mut group = c.benchmark_group("dos2unix_noop"); let i = UNIX_INPUT; group.bench_with_input(BenchmarkId::new("newline-converter", ""), i, |b, i| { b.iter(|| dos2unix(i)) }); group.bench_with_input(BenchmarkId::new("string.replace", ""), i, |b, i| { b.iter(|| dos2unix_string_replace(i)) }); group.bench_with_input(BenchmarkId::new("regex", ""), i, |b, i| { b.iter(|| dos2unix_regex(i)) }); group.finish(); } fn bench_unix2dos(c: &mut Criterion) { let mut group = c.benchmark_group("unix2dos"); let i = UNIX_INPUT; group.bench_with_input(BenchmarkId::new("newline-converter", ""), i, |b, i| { b.iter(|| unix2dos(i)) }); group.bench_with_input(BenchmarkId::new("string.replace", ""), i, |b, i| { b.iter(|| unix2dos_string_replace(i)) }); group.bench_with_input(BenchmarkId::new("regex", ""), i, |b, i| { b.iter(|| unix2dos_regex(i)) }); group.bench_with_input(BenchmarkId::new("fancy_regex", ""), i, |b, i| { b.iter(|| unix2dos_regex_fancy(i)) }); group.finish(); } fn bench_unix2dos_noop(c: &mut Criterion) { let mut group = c.benchmark_group("unix2dos_noop"); let i = DOS_INPUT; group.bench_with_input(BenchmarkId::new("newline-converter", ""), i, |b, i| { b.iter(|| unix2dos(i)) }); group.bench_with_input(BenchmarkId::new("string.replace", ""), i, |b, i| { b.iter(|| unix2dos_string_replace(i)) }); group.bench_with_input(BenchmarkId::new("regex", ""), i, |b, i| { b.iter(|| unix2dos_regex(i)) }); group.bench_with_input(BenchmarkId::new("fancy_regex", ""), i, |b, i| { b.iter(|| unix2dos_regex_fancy(i)) }); group.finish(); } criterion_group!( benches, bench_dos2unix, bench_dos2unix_noop, bench_unix2dos, bench_unix2dos_noop ); criterion_main!(benches); newline-converter-0.3.0/changelog.md000064400000000000000000000003731046102023000155460ustar 00000000000000# 0.3.0 - Add extension trait interface. # 0.2.2 - Fix panic or undesired behavior when dealing with non-ASCII input. # 0.2.1 - Minor optimization of `unix2dos`. # 0.2.0 - Changed `dos2unix` and `unix2dos` functions to accept `AsRef` as input.newline-converter-0.3.0/src/lib.rs000064400000000000000000000213111046102023000151730ustar 00000000000000//! A library for newline character converting. //! //! # Examples //! //! Using the extension trait: //! //! ``` //! use newline_converter::AsRefStrExt; //! assert_eq!("foo\r\nbar", "foo\nbar".to_dos()); //! ``` //! //! ``` //! use newline_converter::AsRefStrExt; //! assert_eq!("foo\nbar", "foo\r\nbar".to_unix()); //! ``` //! //! Using conversion functions directly: //! //! ``` //! assert_eq!("foo\r\nbar", newline_converter::unix2dos("foo\nbar")); //! ``` //! //! ``` //! assert_eq!("foo\nbar", newline_converter::dos2unix("foo\r\nbar")); //! ``` //! //! The conversion functions are **lazy** - they don't perform any allocations if the input is already in correct format. #![deny(missing_docs)] #![deny(clippy::unwrap_used)] #![deny(clippy::expect_used)] use std::borrow::Cow; use unicode_segmentation::UnicodeSegmentation; const UNPACK_MSG: &str = "Grapheme should always be found -- Please file a bug report"; /// Converts DOS-style line endings (`\r\n`) to UNIX-style (`\n`). /// /// The input string may already be in correct format, so this function /// returns `Cow`, to avoid unnecessary allocation and copying. /// /// # Examples /// ``` /// assert_eq!(newline_converter::dos2unix("\r\nfoo\r\nbar\r\n"), "\nfoo\nbar\n"); /// ``` /// /// Lone `\r` bytes will be preserved: /// ``` /// assert_eq!( /// newline_converter::dos2unix("\nfoo\rbar\r\n"), /// "\nfoo\rbar\n" /// ); /// ``` pub fn dos2unix + ?Sized>(input: &T) -> Cow { let mut iter = input.as_ref().chars().peekable(); let input = input.as_ref(); let mut output: Option = None; while let Some(current) = iter.next() { if '\r' == current { if let Some('\n') = iter.peek() { // drop it if output.is_none() { let n = input.chars().filter(|x| *x == '\r').count(); let mut buffer = String::with_capacity(input.len() - n); let i = input .grapheme_indices(true) .find(|(_, x)| *x == "\r\n") .map_or_else(|| unreachable!("{}", UNPACK_MSG), |(i, _)| i); let (past, _) = input.split_at(i); buffer.push_str(past); output = Some(buffer); } continue; } } if let Some(o) = output.as_mut() { o.push(current); } } match output { None => Cow::Borrowed(input), Some(o) => Cow::Owned(o), } } #[allow(clippy::match_like_matches_macro)] // MSRV 1.38, matches! macro available in 1.42 /// Converts UNIX-style line endings (`\n`) to DOS-style (`\r\n`). /// /// The input string may already be in correct format, so this function /// returns `Cow`, to avoid unnecessary allocation and copying. /// /// # Examples /// ``` /// assert_eq!(newline_converter::unix2dos("\nfoo\nbar\n"), "\r\nfoo\r\nbar\r\n"); /// ``` /// /// Already present DOS line breaks are respected: /// ``` /// assert_eq!(newline_converter::unix2dos("\nfoo\r\nbar\n"), "\r\nfoo\r\nbar\r\n"); /// ``` pub fn unix2dos + ?Sized>(input: &T) -> Cow { let mut output: Option = None; let mut last_char: Option = None; let input = input.as_ref(); for (i, current) in input.chars().enumerate() { if '\n' == current && (i == 0 || match last_char { Some('\r') => false, _ => true, }) { if output.is_none() { let n = input.chars().filter(|x| *x == '\n').count(); let mut buffer = String::with_capacity(input.len() + n); let i = input .grapheme_indices(true) .find(|(_, x)| *x == "\n") .map_or_else(|| unreachable!("{}", UNPACK_MSG), |(i, _)| i); let (past, _) = input.split_at(i); buffer.push_str(past); output = Some(buffer); } match output.as_mut() { Some(o) => o.push('\r'), None => unreachable!(), } } last_char = Some(current); if let Some(o) = output.as_mut() { o.push(current); } } match output { Some(o) => Cow::Owned(o), None => Cow::Borrowed(input), } } /// Extension trait for converting between DOS and UNIX linebreaks. pub trait AsRefStrExt { /// Converts linebreaks to DOS (`\r\n`). See [`unix2dos`] for more info. /// /// # Examples /// /// ``` /// use newline_converter::AsRefStrExt; /// assert_eq!("foo\r\nbar", "foo\nbar".to_dos()); /// ``` fn to_dos(&self) -> Cow; /// Converts linebreaks to UNIX (`\n`). See [`dos2unix`] for more info. /// /// # Examples /// /// ``` /// use newline_converter::AsRefStrExt; /// assert_eq!("foo\nbar", "foo\r\nbar".to_unix()); /// ``` fn to_unix(&self) -> Cow; } impl AsRefStrExt for T where T: AsRef, { fn to_dos(&self) -> Cow { unix2dos(self) } fn to_unix(&self) -> Cow { dos2unix(self) } } #[cfg(test)] mod tests { use super::*; use quickcheck::{quickcheck, TestResult}; #[test] fn middle() { assert_eq!(dos2unix("foo\r\nbar"), "foo\nbar".to_dos().to_unix()); assert_eq!(unix2dos("foo\nbar"), "foo\r\nbar"); } #[test] fn beginning() { assert_eq!(dos2unix("\r\nfoobar"), "\nfoobar"); assert_eq!(unix2dos("\nfoobar"), "\r\nfoobar"); } #[test] fn end() { assert_eq!(dos2unix("foobar\r\n"), "foobar\n"); assert_eq!(unix2dos("foobar\n"), "foobar\r\n"); } #[test] fn all() { assert_eq!(dos2unix("\r\nfoo\r\nbar\r\n"), "\nfoo\nbar\n"); assert_eq!(unix2dos("\nfoo\nbar\n"), "\r\nfoo\r\nbar\r\n"); } #[test] fn advanced() { assert_eq!(unix2dos("\rfoo\r\nbar\n"), "\rfoo\r\nbar\r\n"); assert_eq!(dos2unix("\nfoo\rbar\r\n"), "\nfoo\rbar\n"); } #[test] fn not_mutated_dos2unix() { let converted = dos2unix("\nfoo\nbar\n"); assert_eq!(converted, Cow::Borrowed("\nfoo\nbar\n") as Cow); } #[test] fn mutated_dos2unix() { let converted = dos2unix("\r\nfoo\r\nbar\r\n"); assert_eq!( converted, Cow::Owned(String::from("\nfoo\nbar\n")) as Cow ); } #[test] fn not_mutated_unix2dos() { let converted = unix2dos("\r\nfoo\r\nbar\r\n"); assert_eq!(converted, Cow::Borrowed("\r\nfoo\r\nbar\r\n") as Cow); } #[test] fn mutated_unix2dos() { let converted = unix2dos("\nfoo\nbar\n"); assert_eq!( converted, Cow::Owned(String::from("\r\nfoo\r\nbar\r\n")) as Cow ); } #[test] fn non_ascii_characters_unix2dos() { assert_eq!( unix2dos("Zażółć\ngęślą\njaźń\n"), "Zażółć\r\ngęślą\r\njaźń\r\n" ); } #[test] fn non_ascii_characters_dos2unix() { assert_eq!( dos2unix("Zażółć\r\ngęślą\r\njaźń\r\n"), "Zażółć\ngęślą\njaźń\n" ); } #[test] // https://github.com/spitfire05/rnc/issues/14 fn panics_in_0_2_1_unix2dos() { assert_eq!(unix2dos("ä\n"), "ä\r\n"); } #[test] // https://github.com/spitfire05/rnc/issues/14 fn panics_in_0_2_1_dos2unix() { assert_eq!(dos2unix("ä\r\n"), "ä\n"); } #[test] fn just_linebreak_dos2unix() { assert_eq!(dos2unix("\r\n"), "\n"); } #[test] fn just_linebreak_unix2dos() { assert_eq!(unix2dos("\n"), "\r\n"); } quickcheck! { fn dos_unix_dos(data: String) -> TestResult { if data.contains("\r\n") { return TestResult::discard(); } TestResult::from_bool(data.replace('\n', "\r\n") == unix2dos(&dos2unix(&data))) } fn unix_dos_unix(data: String) -> bool { data.replace("\r\n", "\n") == dos2unix(&unix2dos(&data)) } fn unix_contains_no_crlf(data: String) -> bool { !dos2unix(&data).contains("\r\n") } fn dos_has_no_lf_without_cr(data: String) -> bool { let dos = unix2dos(&data); let crlf = dos.graphemes(true).filter(|x| *x == "\r\n").count(); let lf = dos.chars().filter(|x| *x == '\n').count(); lf == crlf } fn to_unix_equals_dos2unix(data: String) -> bool { dos2unix(&data) == data.to_unix() } fn to_dos_equals_unix2dos(data: String) -> bool { unix2dos(&data) == data.to_dos() } } }