hifijson-0.2.3/.cargo_vcs_info.json0000644000000001360000000000100126470ustar { "git": { "sha1": "d54ca41c9b9923c55f865369e412067f0dd53e96" }, "path_in_vcs": "" }hifijson-0.2.3/.github/workflows/rust.yml000064400000000000000000000007521046102023000165600ustar 00000000000000name: Rust on: push: branches: [ "main" ] pull_request: branches: [ "main" ] env: CARGO_TERM_COLOR: always jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - name: Build run: cargo build --verbose - name: Build without default features run: cargo build --no-default-features - name: Build with Serde run: cargo build --features serde - name: Run tests run: cargo test --features serde --verbose hifijson-0.2.3/.gitignore000064400000000000000000000000101046102023000134160ustar 00000000000000/target hifijson-0.2.3/Cargo.lock0000644000000026210000000000100106230ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 3 [[package]] name = "hifijson" version = "0.2.3" dependencies = [ "memmap2", "serde", "serde_json", ] [[package]] name = "itoa" version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fad582f4b9e86b6caa621cabeb0963332d92eea04729ab12892c2533951e6440" [[package]] name = "libc" version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "memmap2" version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322" dependencies = [ "libc", ] [[package]] name = "ryu" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde" [[package]] name = "serde" version = "1.0.152" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" [[package]] name = "serde_json" version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7434af0dc1cbd59268aa98b4c22c131c0584d2232f6fb166efb993e2832e896a" dependencies = [ "itoa", "ryu", "serde", ] hifijson-0.2.3/Cargo.toml0000644000000027050000000000100106510ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" rust-version = "1.56" name = "hifijson" version = "0.2.3" authors = ["Michael Färber "] build = false autolib = false autobins = false autoexamples = false autotests = false autobenches = false description = "High-fidelity JSON lexer and parser" readme = "README.md" keywords = ["json"] categories = ["parser-implementations"] license = "MIT" repository = "https://github.com/01mf02/hifijson" [features] alloc = [] default = [ "std", "alloc", ] std = [] [lib] name = "hifijson" path = "src/lib.rs" [[example]] name = "bench" path = "examples/bench.rs" [[example]] name = "cat" path = "examples/cat.rs" [[example]] name = "count" path = "examples/count.rs" [[test]] name = "serde" path = "tests/serde.rs" [[test]] name = "tests" path = "tests/tests.rs" [dependencies.serde] version = "1.0.152" optional = true [dev-dependencies.memmap2] version = "0.9" [dev-dependencies.serde_json] version = "1.0.92" features = ["arbitrary_precision"] hifijson-0.2.3/Cargo.toml.orig000064400000000000000000000010521046102023000143240ustar 00000000000000[package] name = "hifijson" version = "0.2.3" edition = "2021" authors = ["Michael Färber "] 
description = "High-fidelity JSON lexer and parser" repository = "https://github.com/01mf02/hifijson" license = "MIT" keywords = ["json"] categories = ["parser-implementations"] rust-version = "1.56" [features] default = ["std", "alloc"] std = [] alloc = [] [dependencies] serde = { version = "1.0.152", optional = true } [dev-dependencies] memmap2 = "0.9" serde_json = { version = "1.0.92", features = ["arbitrary_precision"] } hifijson-0.2.3/README.md000064400000000000000000000155441046102023000127270ustar 00000000000000# High-fidelity JSON lexer and parser ![Build status](https://github.com/01mf02/hifijson/workflows/Rust/badge.svg) [![Crates.io](https://img.shields.io/crates/v/hifijson.svg)](https://crates.io/crates/hifijson) [![Documentation](https://docs.rs/hifijson/badge.svg)](https://docs.rs/hifijson) [![Rust 1.56+](https://img.shields.io/badge/rust-1.56+-orange.svg)](https://www.rust-lang.org) `hifijson` is a Rust crate that provides a high-fidelity JSON lexer and parser. In this context, high-fidelity means that unlike many other parsers, `hifijson` aims to preserve input data very faithfully, in particular numbers. * Zero dependencies: Not even `alloc` is obligatory! * `no_std`: Can be used on embedded systems without standard library. * Reading from slices and from byte iterators: This is important if you are writing an application that should read from files as well as from standard input, for example. * Performance * Portability * Mostly zero-copy deserialisation: Due to the presence of escaped characters in JSON strings, full zero-copy deserialisation of JSON data is not possible. However, `hifijson` attempts to minimise allocations in presence of strings. * Deserialisation via `serde` ## Comparison to `serde_json` `serde_json` is currently the most popular JSON parser written in Rust. 
However, there are some deficiencies of `serde_json`: * Numbers can be parsed with arbitrary precision (via the feature flag `arbitrary_precision`), but they cannot be deserialised (by implementing the `Deserialize` trait) to anything other than a `serde_json::Value` [#896](https://github.com/serde-rs/json/issues/896). Instead, one has to deserialise to `serde_json::Value`, then convert that to something else, which costs time. * When using `arbitrary_precision`, `serde_json` incorrectly parses or rejects certain input; for example, it incorrectly parses `{"$serde_json::private::Number": "1.0"}` as number 1.0 and incorrectly rejects `{"$serde_json::private::Number": "foo"}`. I consider both of these to be bugs, but although they are known, the `serde_json` maintainers are ["fine sticking with this behaviour"](https://github.com/serde-rs/json/issues/826#issuecomment-1019360407). * The behaviour of `serde_json` can be customised to some degree via feature flags. However, this is a relatively inflexible solution; for example, you can specify whether to preserve the order of keys in objects by using the `preserve_order` feature flag, but what happens when you have an object that contains the same key several times, for example `{"a": 1, "a": 2}`? Currently, `serde_json` parses this as `{"a": 2}`, silently discarding information. What if you would like to fail in this case? Well, you can just implement `Deserialize` yourself. Except ... that you cannot, if you are using `arbitrary_precision`. Ouch. You should probably use `serde_json` if you want to serialise / deserialise your existing Rust datatypes. However, if you want to process arbitrary JSON coming from the external world, require some control over what kind of input you read, or just care about fast build times and minimal dependencies, then `hifijson` might be for you.
There is also [`serde-json-core`] for embedded usage of JSON; however, this crate supports neither arbitrary-precision numbers, nor reading from byte iterators, nor escape sequences in strings. ### Performance [`cargo run --release --example bench`](examples/bench.rs) measures the time that `serde_json` and `hifijson` take to parse large JSON data to their respective `Value` types. For better comparability, I enabled `serde_json`'s `arbitrary_precision` flag, which parses numbers to strings like `hifijson`. Still, this is somewhat of an apples-to-oranges comparison because a `serde_json` `Value` uses `String` for numbers and strings where a `hifijson` `Value` uses `&str` for numbers and `Cow` for strings. This gives `hifijson` an advantage for the "pi" and "hello" benchmarks, but a disadvantage for the "hello-world" benchmark. | Benchmark | Size | `serde_json` | `hifijson` | | ----------- | ------: | -----------: | ---------: | | null | 47 MiB | 549 ms | 736 ms | | pi | 66 MiB | 2484 ms | 1383 ms | | hello | 76 MiB | 1762 ms | 1334 ms | | hello-world | 143 MiB | 1786 ms | 2933 ms | | arr | 28 MiB | 970 ms | 1056 ms | | tree | 39 MiB | 2221 ms | 2822 ms | The results are mixed: While `hifijson` is faster on numbers and strings not containing escape sequences, it is slower on keywords (`null`, `true`, `false`) and deeply nested arrays. Also note that `serde_json` parses numbers much faster without `arbitrary_precision`. Suggestions on how to improve `hifijson`'s performance are welcome. :) ## Lexer Writing a JSON *parser* is remarkably easy --- the hard part is actually lexing. This is why `hifijson` provides you first and foremost with a lexer, which you can then use to build a parser yourself. Yes, you. You can do it. `hifijson` tries to give you some basic abstractions to help you. For example, the default parser is implemented in less than 40 lines of code.
## Default parser [Parsing JSON is a minefield](http://seriot.ch/projects/parsing_json.html), because the JSON standard is underspecified or downright contradictory in certain aspects. For this reason, a parser has to make certain decisions which inputs to accept and which to reject. `hifijson` comes with a default parser that might be good enough for many use cases. This parser makes the following choices: * Validation of strings: The parser validates that strings are valid UTF-8. * Concatenation of JSON values: Many JSON processing tools accept multiple root JSON values in a JSON file. For example, `[] 42 true {"a": "b"}`. However, defining formally what these tools actually accept or reject is not simple. For example, `serde_json` accepts `[]"a"`, but it rejects `42"a"`. The default behaviour of this parser is to accept any concatenation of `JSON-text` (as defined in [RFC 8259]) that can be somehow reconstructed. This allows for weird-looking things like `nulltruefalse`, `1.0"a"`, but some values cannot be reconstructed, such as `1.042.0`, because this may be either a concatenation of `1.0` and `42.0` or a concatenation of `1.04` and `2.0`. In that sense, `hifijson` attempts to implement a policy that is as permissive and easily describable as possible. Furthermore, the parser passes all tests of the [JSON parsing test suite](https://github.com/nst/JSONTestSuite). ## Fuzzing To run the fuzzer, [install `cargo-fuzz`](https://rust-fuzz.github.io/book/cargo-fuzz/setup.html). Then, if you do not wish to use the nightly Rust compiler as default, run the fuzzer by `cargo +nightly fuzz run all`. [`serde-json-core`]: https://github.com/rust-embedded-community/serde-json-core [RFC 8259]: https://www.rfc-editor.org/rfc/rfc8259 hifijson-0.2.3/examples/bench.rs000064400000000000000000000032571046102023000147110ustar 00000000000000use std::time::Instant; /// Append a binary tree of depth `d` to `s`. 
fn binary(d: usize, s: &mut String) { s.push('['); if d > 0 { binary(d - 1, s); s.push(','); binary(d - 1, s); } s.push(']'); } /// Create a JSON array that contains `n` repetitions of `s`. fn many(s: &str, n: usize) -> String { let mut json = "[".to_string(); json.push_str(s); for _ in 1..n { json.push(','); json.push_str(s); } json.push(']'); json } fn main() { let mut tree = String::new(); binary(23, &mut tree); const N: usize = 10_000_000; println!("Benchmark | Size | `serde_json` | `hifijson`"); println!("- | -: | -: | -:"); for (name, json) in [ ("null", many("null", N)), ("pi", many("3.1415", N)), ("hello", many(r#""hello""#, N)), ("hello-world", many(r#""hello\nworld""#, N)), ("arr", many("[]", N)), ("tree", tree), ] { print!("{name}"); print!(" | {} MiB", json.len() / 1024 / 1024); let now = Instant::now(); serde(json.as_bytes()); print!(" | {} ms", now.elapsed().as_millis()); let now = Instant::now(); hifi(json.as_bytes()); print!(" | {} ms", now.elapsed().as_millis()); println!(); } } fn serde(s: &[u8]) { serde_json::from_slice::(s).unwrap(); } fn hifi(s: &[u8]) { use hifijson::token::Lex; let mut lexer = hifijson::SliceLexer::new(s); //lexer.exactly_one(hifijson::ignore::parse).unwrap(); lexer.exactly_one(hifijson::value::parse_unbounded).unwrap(); //hifijson::serde::exactly_one::(&mut lexer).unwrap(); } hifijson-0.2.3/examples/cat.rs000064400000000000000000000146101046102023000143740ustar 00000000000000//! JSON validator & pretty-printer. 
use core::ops::Deref; use hifijson::{str, value, Error, Expect, IterLexer, LexAlloc, LexWrite, SliceLexer, Token}; use std::{fs, io}; #[derive(Default)] struct Cli { parse: bool, many: bool, silent: bool, path: Option, } #[derive(Debug, Default)] struct PathElem { ints: Vec, strs: Vec, } impl, Str: Deref> TryFrom> for PathElem { type Error = (); fn try_from(v: value::Value) -> Result { let mut elem = Self::default(); use value::Value::*; match v { Array(arr) => { for x in arr { match x { Number((n, parts)) if parts.is_int() => elem.ints.push(n.parse().unwrap()), String(s) => elem.strs.push(s.to_string()), _ => todo!(), } } } _ => todo!(), } Ok(elem) } } fn process(cli: &Cli, lexer: &mut L) -> Result<(), Error> { if cli.parse { if cli.many { let vs = core::iter::from_fn(|| Some(value::parse_unbounded(lexer.ws_token()?, lexer))); for v in vs { let v = v?; if !cli.silent { println!("{}", v) }; } } else { let v = lexer.exactly_one(value::parse_unbounded)?; if !cli.silent { println!("{}", v) }; } } else { let mut seen = false; while let Some(token) = lexer.ws_token() { if seen && !cli.many { Err(Expect::Eof)? } if cli.silent { lex(token, lexer, &|_| ())?; } else { let path: Vec<_> = cli.path.as_deref().map(parse_path).unwrap_or(Vec::new()); use std::io::Write; filter(&path, token, lexer, &|b| io::stdout().write_all(b).unwrap())?; } seen = true; } if !cli.many && !seen { Err(Expect::Value)? 
} } Ok(()) } fn filter( path: &[PathElem], token: Token, lexer: &mut L, print: &impl Fn(&[u8]), ) -> Result<(), Error> { let (elem, rest) = if let Some(path) = path.split_first() { path } else { lex(token, lexer, print)?; println!(); return Ok(()); }; match token { Token::LSquare => { let mut idx = 0; lexer.seq(Token::RSquare, |token, lexer| { let out = if elem.ints.is_empty() || elem.ints.contains(&idx) { filter(rest, token, lexer, print) } else { hifijson::ignore::parse(token, lexer) }; idx += 1; out })?; } Token::LCurly => { let mut idx = 0; lexer.seq(Token::RCurly, |token, lexer| { idx += 1; let key = lexer.str_colon(token, |lexer| lexer.str_string().map_err(Error::Str))?; let token = lexer.ws_token().ok_or(Expect::Value)?; if elem.strs.is_empty() || elem.strs.iter().any(|s| s == key.deref()) { filter(rest, token, lexer, print) } else { hifijson::ignore::parse(token, lexer) } })?; } _ => todo!(), } Ok(()) } fn lex(token: Token, lexer: &mut L, print: &impl Fn(&[u8])) -> Result<(), Error> { match token { Token::Null => print(b"null"), Token::True => print(b"true"), Token::False => print(b"false"), Token::DigitOrMinus => { let mut num = Default::default(); let _pos = lexer.num_bytes(&mut num)?; print(&num) } Token::Quote => lex_string(lexer, print)?, Token::LSquare => { print(b"["); let mut first = true; lexer.seq(Token::RSquare, |token, lexer| { if !core::mem::take(&mut first) { print(b","); } lex(token, lexer, print) })?; print(b"]"); } Token::LCurly => { print(b"{"); let mut first = true; lexer.seq(Token::RCurly, |token, lexer| { if !core::mem::take(&mut first) { print(b","); } lexer.str_colon(token, |lexer| lex_string(lexer, print).map_err(Error::Str))?; print(b":"); lex(lexer.ws_token().ok_or(Expect::Value)?, lexer, print) })?; print(b"}") } _ => Err(Expect::Value)?, } Ok(()) } fn lex_string(lexer: &mut L, print: &impl Fn(&[u8])) -> Result<(), str::Error> { print(b"\""); let mut bytes = L::Bytes::default(); lexer.str_bytes(&mut bytes)?; print(&bytes); 
print(b"\""); Ok(()) } fn process_file(cli: &Cli, path: &str) -> io::Result<()> { let file = fs::File::open(path)?; let mmap = unsafe { memmap2::Mmap::map(&file) }?; process(cli, &mut SliceLexer::new(&mmap)).unwrap(); Ok(()) } fn process_stdin(cli: &Cli) -> io::Result<()> { use io::Read; process(cli, &mut IterLexer::new(io::stdin().bytes())).unwrap(); Ok(()) } /// Parse something like `[1]["a", "b"][]` to a path. /// /// This is interpreted similarly to jq `.[1].["a", "b"].[]`. fn parse_path(path: &str) -> Vec { use hifijson::token::Lex; let lexer = &mut SliceLexer::new(path.as_bytes()); core::iter::from_fn(|| Some(value::parse_unbounded(lexer.ws_token()?, lexer))) .map(|e| PathElem::try_from(e.unwrap()).unwrap()) .collect() } fn main() -> io::Result<()> { let mut cli = Cli::default(); let mut files = Vec::new(); let mut args = std::env::args().skip(1); while let Some(arg) = args.next() { match &*arg { "--parse" => cli.parse = true, "--many" => cli.many = true, "--silent" => cli.silent = true, "--path" => cli.path = args.next(), _ => files.push(arg), } } match &files[..] { [] => process_stdin(&cli), args => args.iter().try_for_each(|a| process_file(&cli, a)), } } hifijson-0.2.3/examples/count.rs000064400000000000000000000027611046102023000147610ustar 00000000000000use hifijson::*; /// Count the number of parsed values. 
fn count(token: Token, lexer: &mut impl Lex) -> Result { match token { Token::Null | Token::True | Token::False => Ok(1), Token::DigitOrMinus => Ok(lexer.num_ignore().map(|_| 1)?), Token::Quote => Ok(lexer.str_ignore().map(|_| 1)?), Token::LSquare => { let mut sum = 1; lexer.seq(Token::RSquare, |token, lexer| { sum += count(token, lexer)?; Ok::<_, hifijson::Error>(()) })?; Ok(sum) } Token::LCurly => { let mut sum = 1; lexer.seq(Token::RCurly, |token, lexer| { lexer.str_colon(token, |lexer| lexer.str_ignore().map_err(Error::Str))?; sum += count(lexer.ws_token().ok_or(Expect::Value)?, lexer)?; Ok::<_, hifijson::Error>(()) })?; Ok(sum) } _ => Err(hifijson::Expect::Value)?, } } fn process(mut lexer: impl Lex) -> Result { lexer.exactly_one(|token, lexer| count(token, lexer)) } fn main() { let filename = std::env::args().nth(1); let n = if let Some(filename) = filename { let file = std::fs::read_to_string(filename).expect("read file"); process(hifijson::SliceLexer::new(file.as_bytes())) } else { use std::io::Read; process(hifijson::IterLexer::new(std::io::stdin().bytes())) }; println!("{:?}", n) } hifijson-0.2.3/src/escape.rs000064400000000000000000000107251046102023000140410ustar 00000000000000//! Escape sequences. use crate::Read; use core::fmt; /// Escape sequence, such as `\n` or `\u00d6`. pub enum Escape { /// `\"` QuotationMark, /// `\\` ReverseSolidus, /// `\/` Solidus, /// `\b` Backspace, /// `\f` FormFeed, /// `\n` LineFeed, /// `\t` Tab, /// `\r` CarriageReturn, /// `\uHHHH`, where `HHHH` is a hexadecimal number Unicode(u16), } impl Escape { /// Try to interpret an ASCII character as first character of an escape sequence. 
pub fn try_from(c: u8) -> Option { use Escape::*; Some(match c { b'"' => QuotationMark, b'\\' => ReverseSolidus, b'/' => Solidus, b'b' => Backspace, b'f' => FormFeed, b'n' => LineFeed, b'r' => CarriageReturn, b't' => Tab, b'u' => Unicode(0), _ => return None, }) } fn as_char(&self) -> Result { use Escape::*; Ok(match self { QuotationMark => '"', ReverseSolidus => '\\', Solidus => '/', Backspace => 'b', FormFeed => 'f', LineFeed => 'n', CarriageReturn => 'r', Tab => 't', Unicode(u) => return Err(*u), }) } /// Return escape sequence as UTF-16. pub fn as_u16(&self) -> u16 { use Escape::*; match self { QuotationMark => 0x0022, ReverseSolidus => 0x005C, Solidus => 0x002F, Backspace => 0x0008, FormFeed => 0x000C, LineFeed => 0x000A, CarriageReturn => 0x000D, Tab => 0x0009, Unicode(u) => *u, } } } impl fmt::Display for Escape { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self.as_char() { Ok(c) => write!(f, "\\{}", c), Err(u) => write!(f, "\\u{:04x}", u), } } } pub(crate) fn decode_hex(val: u8) -> Option { match val { b'0'..=b'9' => Some(val - b'0'), b'a'..=b'f' => Some(val - b'a' + 10), b'A'..=b'F' => Some(val - b'A' + 10), _ => None, } } /// Escape sequence lexing error. #[derive(Debug, PartialEq, Eq)] pub enum Error { /// `\x` or `\U` UnknownKind, /// `\u000X` InvalidHex, /// `\uDC37` InvalidChar(u32), /// `\uD801` ExpectedLowSurrogate, /// `\` or `\u00` Eof, } impl core::fmt::Display for Error { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { use Error::*; match self { UnknownKind => "unknown escape sequence type".fmt(f), InvalidHex => "invalid hexadecimal sequence".fmt(f), InvalidChar(c) => write!(f, "invalid character with index {}", c), ExpectedLowSurrogate => "expected low surrogate".fmt(f), Eof => "unterminated escape sequence".fmt(f), } } } /// Escape sequence lexing. /// /// This does not require any allocation. pub trait Lex: Read { /// Convert a read escape sequence to a char, potentially reading more. 
fn escape_char(&mut self, escape: Escape) -> Result { let escape = match escape { Escape::Unicode(high @ (0xD800..=0xDBFF)) => { if self.read() != Some(b'\\') { return Err(Error::ExpectedLowSurrogate); } if let Escape::Unicode(low @ (0xDC00..=0xDFFF)) = self.escape()? { ((high - 0xD800) as u32 * 0x400 + (low - 0xDC00) as u32) + 0x10000 } else { return Err(Error::ExpectedLowSurrogate); } } e => e.as_u16() as u32, }; char::from_u32(escape).ok_or(Error::InvalidChar(escape)) } /// Read an escape sequence such as `\n` or `\u0009` (without leading `\`). fn escape(&mut self) -> Result { let typ = self.read().ok_or(Error::Eof)?; let escape = Escape::try_from(typ).ok_or(Error::UnknownKind)?; if matches!(escape, Escape::Unicode(_)) { let mut hex = 0; for _ in 0..4 { let h = self.read().ok_or(Error::Eof)?; let h = decode_hex(h).ok_or(Error::InvalidHex)?; hex = (hex << 4) + (h as u16); } Ok(Escape::Unicode(hex)) } else { Ok(escape) } } } impl Lex for T where T: Read {} hifijson-0.2.3/src/ignore.rs000064400000000000000000000012671046102023000140650ustar 00000000000000//! Discarding values. use crate::{Error, Expect, Lex, Token}; /// Parse and discard a value. pub fn parse(token: Token, lexer: &mut L) -> Result<(), Error> { match token { Token::Null | Token::True | Token::False => Ok(()), Token::DigitOrMinus => Ok(lexer.num_ignore().map(|_| ())?), Token::Quote => Ok(lexer.str_ignore()?), Token::LSquare => lexer.seq(Token::RSquare, parse), Token::LCurly => lexer.seq(Token::RCurly, |token, lexer| { lexer.str_colon(token, |lexer| lexer.str_ignore().map_err(Error::Str))?; parse(lexer.ws_token().ok_or(Expect::Value)?, lexer) }), _ => Err(Expect::Value)?, } } hifijson-0.2.3/src/lib.rs000064400000000000000000000314361046102023000133510ustar 00000000000000//! High-fidelity JSON lexer and parser. //! //! //! # Introduction //! //! JSON is a data format that is underspecified and sometimes contradictory. //! As reference, I recommend the excellent article "[Parsing JSON is a Minefield]". 
//! In particular, it is ambiguous how to parse strings and numbers. //! For example, JSON does not impose any restriction on the maximal size of numbers, //! but in reality, most JSON parsers use a lossy representation, //! for example 64-bit floating point. //! This is allowed by the JSON specification; however, //! at the same time, if we are allowed to fix arbitrary maximal sizes, //! then a parser that fails on every input is a valid parser! //! I hope that I could convince you at this point that this is all quite a mess. //! The best I can do to help you around this mess is to give *you* //! a tool to deal with this mess in the way that suits you most. //! hifijson is this tool. //! //! [Parsing JSON is a Minefield]: http://seriot.ch/projects/parsing_json.html //! //! What makes hifijson so flexible is that unlike most other JSON parsers, //! it exposes its basic building blocks, called [*lexers*](#lexers), //! that allow you to build your own parsers on top of them. //! //! Because hifijson exposes a variety of lexers and parsers, //! you can combine them in a way that allows you to achieve your desired behaviour, //! without having to write everything from scratch. //! For example, suppose that your input data does not contain escape sequences (`\n`, `\uxxxx`); //! then you can use the [`str::LexWrite::str_bytes`] function that is //! guaranteed to never allocate memory when lexing from a slice, //! making it suitable for usage in embedded environments. //! Or suppose that you are reading an object `{"title": ..., "reviews": ...}`, //! and you do not feel like caring about reviews today. //! Then you can simply skip reading the value for reviews by using [`ignore::parse`]. //! Going wild and stretching the syntax a bit, you can also make //! a parser that accepts any value (instead of only strings as mandated by JSON) as object key. //! Or, if you just want to have a complete JSON value, then //! you can use [`value::parse_unbounded`]. //! 
The choice is yours. //! //! In summary, hifijson aims to give you the tools to interpret JSON data //! flexibly and performantly. //! //! ## Lexers //! //! A *lexer* is a program that breaks up input into small units, and //! a parser combines these small units to transform them into the desired shape. //! For example, consider the JSON input `[null, {}]`. //! A lexer could break this into the units `[`, `null`, `,`, `{`, `}`, and `]`. //! //! JSON parsing consists of about three mostly independent lexing tasks: //! strings, numbers, and everything else (in decreasing order of difficulty). //! The "everything else" part is what I call a [token](Token). //! That includes: //! * `[`, `]` (start and end of an array) //! * `{`, `}` (start and end of an object) //! * `,` //! * `:` //! * `null` //! * `true` //! * `false` //! * `"` (start of a string) //! * `-` and any digit from 0 to 9 (start of a number) //! //! So every token has a maximal length (namely length 5). //! //! What about strings and numbers? //! The token lexer is not responsible for lexing them, but it //! returns special tokens to indicate that it has encountered the *start* of a string or number. //! Once we get such a token, we can use one of the many string/number lexers, //! which differ most prominently in their memory allocation behaviour. //! For example, //! * [`str::Lex::str_ignore`] discards a string, //! * [`str::LexWrite::str_bytes`] reads a string, but does not interpret escape sequences, and //! * [`str::LexAlloc::str_string`] reads a string and interprets escape sequences. //! //! In particular, //! lexers that implement the [`Lex`] trait *never* allocate memory; //! lexers that implement the [`LexWrite`] trait only allocate memory when lexing from iterators, //! and //! lexers that implement the [`LexAlloc`] trait may allocate memory when lexing from both //! iterators and slices. //! //! ## Slices and Iterators //! //!
One important feature of hifijson is that it allows reading from both //! [slices](SliceLexer) and [iterators](IterLexer) over bytes. //! This is useful when your application should support reading from both //! files and streams (such as standard input). //! //! ## Feature Flags //! //! If you build hifijson without the feature flag `alloc`, you disable any allocation. //! If you build hifijson with the feature flag `serde`, //! then you can use hifijson to deserialise JSON to data types implementing `serde::Deserialize`. //! //! //! # Examples //! //! ## Parsing strings to values //! //! Let us consider a very simple usage: //! Parsing a JSON value from a string. //! For this, we first have to create a lexer from the string, //! then call the value parser on the lexer: //! //! ~~~ //! // our input JSON that we want to parse //! let json = br#"[null, true, false, "hello", 0, 3.1415, [1, 2], {"x": 1, "y": 2}]"#; //! //! // the lexer on our input -- just creating it does not actually run it yet //! let mut lexer = hifijson::SliceLexer::new(json); //! //! use hifijson::token::Lex; //! // now we are going -- we try to //! // obtain exactly one JSON value from the lexer and //! // parse it to a value, allowing for arbitrarily deep (unbounded) nesting //! let value = lexer.exactly_one(|token, lexer| hifijson::value::parse_unbounded(token, lexer)); //! let value = value.expect("parse"); //! //! // yay, we got an array! //! assert!(matches!(value, hifijson::value::Value::Array(_))); //! assert_eq!( //! value.to_string(), //! // printing a value yields a compact representation with minimal spaces //! r#"[null,true,false,"hello",0,3.1415,[1,2],{"x":1,"y":2}]"# //! ); //! ~~~ //! //! ## Parsing files and streams //! //! The following example reads JSON from a file if an argument is given, //! otherwise from standard input: //! //! ~~~ no_run //! /// Parse a single JSON value and print it. //! /// //!
/// Note that the `LexAlloc` trait indicates that this lexer allocates memory. //! fn process(mut lexer: impl hifijson::LexAlloc) { //! let value = lexer.exactly_one(|token, lexer| hifijson::value::parse_unbounded(token, lexer)); //! let value = value.expect("parse"); //! println!("{}", value); //! } //! //! let filename = std::env::args().nth(1); //! if let Some(filename) = filename { //! let file = std::fs::read(filename).expect("read file"); //! process(hifijson::SliceLexer::new(&file)) //! } else { //! use std::io::Read; //! process(hifijson::IterLexer::new(std::io::stdin().bytes())) //! } //! ~~~ //! //! We just made a pretty printer (stretching the definition of pretty pretty far). //! //! ## Operating on the lexer //! //! Often, it is better for performance to operate directly on the tokens that the lexer yields //! rather than parsing everything into a value and then processing the value. //! For instance, the following example counts the number of values in the input JSON. //! Unlike the previous examples, it requires only constant memory! //! //! ~~~ //! use hifijson::{Token, Error}; //! //! /// Recursively count the number of values in the value starting with `token`. //! /// //! /// The `Lex` trait indicates that this lexer does *not* allocate memory. //! fn count(token: Token, lexer: &mut impl hifijson::Lex) -> Result<usize, Error> { //! match token { //! // the JSON values "null", "true", and "false" //! Token::Null | Token::True | Token::False => Ok(1), //! //! // the lexer reads only the first character of numbers and strings, //! // therefore, we have to consume the rest ourselves //! Token::DigitOrMinus => Ok(lexer.num_ignore().map(|_| 1)?), //! Token::Quote => Ok(lexer.str_ignore().map(|_| 1)?), //! //! // start of array ('[') //! Token::LSquare => { //! // an array is a value itself, so start with 1 //! let mut sum = 1; //! // perform the following for every item of the array //! lexer.seq(Token::RSquare, |token, lexer| { //! sum += count(token, lexer)?; //!
// Generate `impl From<$source> for $target` whose `from` simply applies
// `$convert` to the incoming value.
//
// `$convert` may be anything callable with one `$source` argument,
// for example an enum variant constructor or a closure.
macro_rules! impl_from {
    ($source:ty, $target:ty, $convert:expr) => {
        impl From<$source> for $target {
            fn from(value: $source) -> Self {
                $convert(value)
            }
        }
    };
}
/// JSON lexer that reads from a shared byte slice.
pub struct SliceLexer<'a> {
    slice: &'a [u8],
}

impl<'a> SliceLexer<'a> {
    /// Construct a lexer over the given bytes.
    ///
    /// Memory mapping is a fast way to get the contents of a file as `&[u8]`;
    /// see for example the [memmap2](https://docs.rs/memmap2) crate.
    pub fn new(slice: &'a [u8]) -> Self {
        SliceLexer { slice }
    }

    /// Return the input that has not been consumed yet,
    /// as a subslice of the original data.
    ///
    /// This is useful to locate the position at which an error occurred.
    pub fn as_slice(&self) -> &'a [u8] {
        self.slice
    }
}
/// Position of `.` and `e`/`E` in the string representation of a number.
///
/// Because a number cannot start with `.` or `e`/`E`,
/// these positions must always be greater than zero.
#[derive(Debug, Default, PartialEq, Eq)]
pub struct Parts {
    /// position of the dot
    pub dot: Option<NonZeroUsize>,
    /// position of the exponent character (`e`/`E`)
    pub exp: Option<NonZeroUsize>,
}

impl Parts {
    /// Return true if the number contains neither a dot nor an exponent.
    pub fn is_int(&self) -> bool {
        matches!((self.dot, self.exp), (None, None))
    }
}
fn digits_foreach(&mut self, mut f: impl FnMut(u8)) { while let Some(digit @ (b'0'..=b'9')) = self.peek_next() { f(*digit); self.read_next() } } /// Return number of digits read and fail if no digit was encountered. fn digits1_ignore(&mut self) -> Result { let mut len = 0; self.digits_foreach(|_| len += 1); NonZeroUsize::new(len).ok_or(Error::ExpectedDigit) } /// Run function for every digit, fail if no digit encountered. fn digits1_foreach(&mut self, mut f: impl FnMut(u8)) -> Result { let mut len = 0; self.digits_foreach(|d| { f(d); len += 1 }); NonZeroUsize::new(len).ok_or(Error::ExpectedDigit) } /// Run function for each character of a number. fn num_foreach(&mut self, mut f: impl FnMut(u8)) -> Result { let mut pos = 0; let mut parts = Parts::default(); if let Some(b'-') = self.peek_next() { f(b'-'); self.read_next(); pos += 1; } match self.peek_next() { Some(b'0') => { f(b'0'); self.read_next(); pos += 1; } Some(digit @ b'1'..=b'9') => { f(*digit); self.read_next(); pos += 1; self.digits_foreach(|digit| { f(digit); pos += 1 }) } _ => return Err(Error::ExpectedDigit), } loop { match self.peek_next() { Some(b'.') if parts.is_int() => { parts.dot = Some(NonZeroUsize::new(pos).unwrap()); f(b'.'); self.read_next(); pos += 1 + self.digits1_foreach(&mut f)?.get(); } Some(exp @ (b'e' | b'E')) if parts.exp.is_none() => { parts.exp = Some(NonZeroUsize::new(pos).unwrap()); f(*exp); self.read_next(); if let Some(sign @ (b'+' | b'-')) = self.peek_next() { f(*sign); self.read_next(); pos += 1; } pos += 1 + self.digits1_foreach(&mut f)?.get(); } _ => return Ok(parts), } } } /// Lex a number and ignore its contents, saving only its parts. fn num_ignore(&mut self) -> Result { self.num_foreach(|_| ()) } } impl Lex for T where T: Read {} /// Number lexing, keeping the number. pub trait LexWrite: Lex + Write { /// String type to save numbers as. type Num: core::ops::Deref; /// Write a number to bytes and save its parts. 
fn num_bytes(&mut self, bytes: &mut Self::Bytes) -> Result; /// Read a number to a string and save its parts. fn num_string(&mut self) -> Result<(Self::Num, Parts), Error>; } fn digits(s: &[u8]) -> usize { s.iter() .position(|c| !matches!(c, b'0'..=b'9')) .unwrap_or(s.len()) } impl<'a> LexWrite for crate::SliceLexer<'a> { type Num = &'a str; fn num_bytes(&mut self, bytes: &mut Self::Bytes) -> Result { let mut pos = usize::from(self.slice[0] == b'-'); let mut parts = Parts::default(); let digits1 = |s| NonZeroUsize::new(digits(s)).ok_or(Error::ExpectedDigit); pos += if self.slice.get(pos) == Some(&b'0') { 1 } else { digits1(&self.slice[pos..])?.get() }; loop { match self.slice.get(pos) { Some(b'.') if parts.dot.is_none() && parts.exp.is_none() => { parts.dot = Some(NonZeroUsize::new(pos).unwrap()); pos += 1; pos += digits1(&self.slice[pos..])?.get() } Some(b'e' | b'E') if parts.exp.is_none() => { parts.exp = Some(NonZeroUsize::new(pos).unwrap()); pos += 1; if matches!(self.slice.get(pos), Some(b'+' | b'-')) { pos += 1; } pos += digits1(&self.slice[pos..])?.get() } None | Some(_) => { *bytes = &self.slice[..pos]; self.slice = &self.slice[pos..]; return Ok(parts); } } } } fn num_string(&mut self) -> Result<(Self::Num, Parts), Error> { let mut num = Default::default(); let pos = self.num_bytes(&mut num)?; // SAFETY: conversion to UTF-8 always succeeds because // lex_number validates everything it writes to num Ok((core::str::from_utf8(num).unwrap(), pos)) } } #[cfg(feature = "alloc")] impl>> crate::IterLexer { fn digits(&mut self, num: &mut ::Bytes) -> Result<(), Error> { let mut some_digit = false; while let Some(digit @ (b'0'..=b'9')) = self.last { some_digit = true; num.push(digit); self.last = self.read(); } if some_digit && self.error.is_none() { Ok(()) } else { Err(Error::ExpectedDigit) } } } #[cfg(feature = "alloc")] impl>> LexWrite for crate::IterLexer { type Num = alloc::string::String; fn num_bytes(&mut self, num: &mut Self::Bytes) -> Result { let mut parts = 
/// Low-level input operations.
///
/// The method names suggest a model in which an implementation keeps a
/// "buffer" holding the next byte of input:
/// methods ending in `_next` operate on that buffered byte.
pub trait Read {
    /// Return `true` if the given byte sequence is a prefix of the input.
    ///
    /// NOTE(review): how much input is consumed when this returns `false`
    /// differs between the implementations in this file -- verify before relying on it.
    fn strip_prefix<const N: usize>(&mut self, s: [u8; N]) -> bool;

    /// Run a function on current input until a certain condition is fulfilled.
    ///
    /// `f` is called for every byte for which `stop` yields false;
    /// it is not called for the byte that makes `stop` yield true.
    fn foreach_until(&mut self, mut f: impl FnMut(u8), mut stop: impl FnMut(u8) -> bool) {
        self.skip_until(|c| {
            stop(c) || {
                f(c);
                false
            }
        })
    }

    /// Ignore input until `stop` yields true.
    ///
    /// NOTE(review): unlike [`Read::skip_next_until`], this presumably does not
    /// test the byte currently held in the buffer -- confirm against the implementations.
    fn skip_until(&mut self, stop: impl FnMut(u8) -> bool);

    /// Ignore input until `stop` yields true.
    fn skip_next_until(&mut self, stop: impl FnMut(u8) -> bool);

    /// Read a byte, do not put it into buffer.
    fn read(&mut self) -> Option<u8>;

    /// Read a byte and put it into buffer.
    fn read_next(&mut self);

    /// Peek at the byte from the buffer.
    fn peek_next(&self) -> Option<&u8>;

    /// Take the byte from the buffer.
    fn take_next(&mut self) -> Option<u8>;
}
~~~ use crate::{Expect, Lex, LexAlloc, Token}; use alloc::string::{String, ToString}; use core::fmt; use serde::de::{self, DeserializeSeed, Visitor}; use serde::Deserialize; /// Deserialisation error. #[derive(Debug)] pub enum Error { /// parse error Parse(crate::Error), /// error produced by serde Custom(String), /// `2e1000` (we were not able to fit a number into its type) Number(String), } impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { use Error::*; match self { Parse(e) => e.fmt(f), Custom(e) => e.fmt(f), Number(n) => write!(f, "number overflow: {}", n), } } } impl_from!(crate::Error, Error, Error::Parse); impl_from!(Expect, Error, |e| Error::Parse(crate::Error::Token(e))); impl std::error::Error for Error {} type Result = core::result::Result; impl de::Error for Error { fn custom(e: T) -> Self { Self::Custom(e.to_string()) } } struct TokenLexer { token: Token, lexer: L, } fn parse_number(n: &str) -> Result { n.parse().map_err(|_| Error::Number(n.to_string())) } macro_rules! deserialize_number { ($deserialize:ident, $visit:ident) => { fn $deserialize>(self, visitor: V) -> Result { let (n, _parts) = self.lexer.num_string().map_err(crate::Error::Num)?; visitor.$visit(parse_number(&n)?) } }; } impl<'de, 'a, L: LexAlloc + 'de> de::Deserializer<'de> for TokenLexer<&'a mut L> { type Error = Error; fn deserialize_any(self, visitor: V) -> Result where V: Visitor<'de>, { use crate::Error::{Num, Str}; match self.token { Token::Null => visitor.visit_unit(), Token::True => visitor.visit_bool(true), Token::False => visitor.visit_bool(false), Token::Quote => visitor.visit_str(&self.lexer.str_string().map_err(Str)?), Token::DigitOrMinus => { let (n, parts) = self.lexer.num_string().map_err(Num)?; if parts.is_int() { if n.starts_with('-') { visitor.visit_i64(parse_number(&n)?) } else { visitor.visit_u64(parse_number(&n)?) } } else { visitor.visit_f64(parse_number(&n)?) 
} } Token::LSquare => visitor.visit_seq(CommaSeparated::new(self.lexer)), Token::LCurly => visitor.visit_map(CommaSeparated::new(self.lexer)), _ => Err(Expect::Value)?, } } serde::forward_to_deserialize_any! { bool char str string bytes byte_buf option unit unit_struct newtype_struct seq tuple tuple_struct map struct enum identifier ignored_any } deserialize_number!(deserialize_u8, visit_u8); deserialize_number!(deserialize_u16, visit_u16); deserialize_number!(deserialize_u32, visit_u32); deserialize_number!(deserialize_u64, visit_u64); deserialize_number!(deserialize_u128, visit_u128); deserialize_number!(deserialize_i8, visit_i8); deserialize_number!(deserialize_i16, visit_i16); deserialize_number!(deserialize_i32, visit_i32); deserialize_number!(deserialize_i64, visit_i64); deserialize_number!(deserialize_i128, visit_i128); deserialize_number!(deserialize_f32, visit_f32); deserialize_number!(deserialize_f64, visit_f64); } struct CommaSeparated<'a, L> { lexer: &'a mut L, first: bool, } impl<'a, L> CommaSeparated<'a, L> { fn new(lexer: &'a mut L) -> Self { CommaSeparated { lexer, first: true } } } impl<'a, L: Lex> CommaSeparated<'a, L> { // Comma is required before every element except the first. fn comma(&mut self, token: &mut Token) -> Result<()> { if !core::mem::take(&mut self.first) { if *token != Token::Comma { Err(Expect::CommaOrEnd)? 
} else { *token = self.lexer.ws_token().ok_or(Expect::Value)?; } } Ok(()) } } impl<'de, 'a, L: LexAlloc + 'de> de::SeqAccess<'de> for CommaSeparated<'a, L> { type Error = Error; fn next_element_seed(&mut self, seed: T) -> Result> where T: DeserializeSeed<'de>, { let token = self.lexer.ws_token(); let mut token = token.ok_or(Expect::ValueOrEnd)?; if token == Token::RSquare { return Ok(None); }; self.comma(&mut token)?; let lexer = &mut *self.lexer; seed.deserialize(TokenLexer { token, lexer }).map(Some) } } impl<'de, 'a, L: LexAlloc + 'de> de::MapAccess<'de> for CommaSeparated<'a, L> { type Error = Error; fn next_key_seed(&mut self, seed: K) -> Result> where K: DeserializeSeed<'de>, { let token = self.lexer.ws_token(); let mut token = token.ok_or(Expect::ValueOrEnd)?; if token == Token::RCurly { return Ok(None); }; self.comma(&mut token)?; if token != Token::Quote { Err(Expect::String)? } let lexer = &mut *self.lexer; seed.deserialize(TokenLexer { token, lexer }).map(Some) } fn next_value_seed(&mut self, seed: V) -> Result where V: DeserializeSeed<'de>, { let lexer = &mut *self.lexer; let colon = lexer.ws_token().filter(|t| *t == Token::Colon); colon.ok_or(Expect::Colon)?; let token = lexer.ws_token().ok_or(Expect::Value)?; seed.deserialize(TokenLexer { token, lexer }) } } /// Deserialise a single value. pub fn exactly_one<'a, T: Deserialize<'a>, L: LexAlloc + 'a>(lexer: &mut L) -> Result { lexer.exactly_one(|token, lexer| T::deserialize(TokenLexer { token, lexer })) } hifijson-0.2.3/src/str.rs000064400000000000000000000231251046102023000134070ustar 00000000000000//! Strings. //! //! Converting JSON strings to Rust strings can require allocation, because //! escape sequences (such as `\n` or `\\`) in the JSON input //! have to be converted to Rust characters. //! For example, //! `\n` is mapped to the new line character, and //! `\\` is mapped to a single backslash. //! //! To provide flexibility, this module provides //! 
/// Wrapper type to facilitate printing strings as JSON.
pub struct Display<Str>(Str);

impl<Str> Display<Str> {
    /// Create a new string to be printed as JSON string.
    pub fn new(s: Str) -> Self {
        Self(s)
    }
}

impl<Str: Deref<Target = str>> fmt::Display for Display<Str> {
    /// Print the string surrounded by quotes, escaping
    /// backslashes, quotes, and all control characters (below U+0020),
    /// which may not occur unescaped in a JSON string.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        '"'.fmt(f)?;
        for c in self.0.chars() {
            match c {
                // for these characters, Rust's escape sequences coincide with JSON's
                '\\' | '"' | '\n' | '\r' | '\t' => c.escape_default().try_for_each(|c| c.fmt(f)),
                // all remaining control characters, up to and including U+001F
                c if (c as u32) < 0x20 => write!(f, "\\u{:04x}", c as u16),
                c => c.fmt(f),
            }?
        }
        '"'.fmt(f)
    }
}
/// /// These errors do never occur when parsing strings via /// [`Lex::str_ignore`] or [`LexWrite::str_bytes`]. /// However, they can occur when parsing strings via /// [`LexAlloc::str_string`]. pub fn is_unicode_error(&self) -> bool { use escape::Error::*; matches!( self, Self::Utf8(_) | Self::Escape(InvalidChar(_) | ExpectedLowSurrogate) ) } } impl_from!(escape::Error, Error, Error::Escape); impl core::fmt::Display for Error { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { use Error::*; match self { Control => "invalid string control character".fmt(f), Escape(e) => e.fmt(f), Eof => "unterminated string".fmt(f), Utf8(e) => e.fmt(f), } } } /// String lexing state machine. #[derive(Default)] struct State { /// Are we in an escape sequence, and if so, /// are we in a unicode escape sequence, and if so, /// at which position in the hex code are we? escape: Option>, /// Did we encounter an error so far? error: Option, } impl State { /// Process the next character of a string, /// return whether the string is finished or an error occurred. fn process(&mut self, c: u8) -> bool { // are we in an escape sequence (started by '\')? if let Some(unicode) = &mut self.escape { // are we in a Unicode escape sequence (started by "\u")? if let Some(hex_pos) = unicode { if escape::decode_hex(c).is_none() { self.error = Some(escape::Error::InvalidHex.into()); } else if *hex_pos < 3 { *hex_pos += 1 } else { self.escape = None; } } else { // we are in a non-Unicode escape sequence, // let us see which kind of sequence ... 
match Escape::try_from(c) { Some(Escape::Unicode(_)) => *unicode = Some(0), Some(_) => self.escape = None, None => self.error = Some(escape::Error::UnknownKind.into()), } } } else { // we are not in any escape sequence match c { b'"' => return true, b'\\' => self.escape = Some(None), 0..=0x1F => self.error = Some(Error::Control), _ => return false, }; } self.error.is_some() } /// Ensure that once `process` has returned `true`, the string has actually terminated. fn finish(self, mut next: impl FnMut() -> Option) -> Result<(), Error> { match self.error { Some(e) => Err(e), None if self.escape.is_some() => Err(escape::Error::Eof)?, None if next() != Some(b'"') => Err(Error::Eof), None => Ok(()), } } } /// String lexing that does never allocate. pub trait Lex: escape::Lex { /// Read a string without saving it. fn str_ignore(&mut self) -> Result<(), Error> { self.str_foreach(|_| ()) } /// Run a function for every character of the string. fn str_foreach(&mut self, f: impl FnMut(u8)) -> Result<(), Error> { let mut state = State::default(); self.foreach_until(f, |c| state.process(c)); state.finish(|| self.take_next()) } } impl Lex for T where T: escape::Lex {} /// String lexing that allocates only when lexing from iterators. pub trait LexWrite: escape::Lex + Read + Write { /// Read a string to bytes, copying escape sequences one-to-one. fn str_bytes(&mut self, bytes: &mut Self::Bytes) -> Result<(), Error> { let mut state = State::default(); self.write_until(bytes, |c| state.process(c)); state.finish(|| self.take_next()) } /// Lex a string by executing `on_string` on every string and `on_bytes` on every escape sequence. 
fn str_fold, T>( &mut self, mut out: T, on_string: impl Fn(&mut Self::Bytes, &mut T) -> Result<(), E>, on_escape: impl Fn(&mut Self, Escape, &mut T) -> Result<(), E>, ) -> Result { fn string_end(c: u8) -> bool { matches!(c, b'\\' | b'"' | 0..=0x1F) } let mut bytes = Self::Bytes::default(); self.write_until(&mut bytes, string_end); on_string(&mut bytes, &mut out)?; match self.take_next().ok_or(Error::Eof)? { b'\\' => (), b'"' => return Ok(out), 0..=0x1F => return Err(Error::Control)?, _ => unreachable!(), } loop { let escape = self.escape().map_err(Error::Escape)?; on_escape(self, escape, &mut out)?; self.write_until(&mut bytes, string_end); on_string(&mut bytes, &mut out)?; match self.take_next().ok_or(Error::Eof)? { b'\\' => continue, b'"' => return Ok(out), 0..=0x1F => return Err(Error::Control)?, _ => unreachable!(), } } } } impl LexWrite for T where T: Read + Write {} /// String lexing that always allocates when lexing from iterators and /// allocates when lexing from slices that contain escape sequences. pub trait LexAlloc: LexWrite { /// The type of string that we are lexing into. type Str: Deref; /// Lex a JSON string to a Rust string. fn str_string(&mut self) -> Result; } #[cfg(feature = "alloc")] impl<'a> LexAlloc for crate::SliceLexer<'a> { type Str = alloc::borrow::Cow<'a, str>; fn str_string(&mut self) -> Result { use alloc::borrow::Cow; let on_string = |bytes: &mut Self::Bytes, out: &mut Self::Str| { match core::str::from_utf8(bytes).map_err(Error::Utf8)? 
/// What we expected to get, but did not get.
///
/// The documentation of each variant shows inputs that yield it.
#[derive(Debug, PartialEq, Eq)]
pub enum Expect {
    /// ` ` or `]` or `,`
    Value,
    /// `[` or `{`
    ValueOrEnd,
    /// `[1` or `[1 2`
    CommaOrEnd,
    /// `{0: 1}`
    String,
    /// `{"a" 1}`
    Colon,
    /// `true false` (when parsing exactly one value)
    Eof,
}

impl core::fmt::Display for Expect {
    /// Print a short description of what was expected.
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        let description = match self {
            Expect::Value => "value",
            Expect::ValueOrEnd => "value or end of sequence",
            Expect::CommaOrEnd => "comma or end of sequence",
            Expect::String => "string",
            Expect::Colon => "colon",
            Expect::Eof => "end of file",
        };
        f.pad(description)
    }
}
/// JSON lexer token.
#[derive(Debug, PartialEq, Eq)]
pub enum Token {
    /// `null`
    Null,
    /// `true`
    True,
    /// `false`
    False,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `[`
    LSquare,
    /// `]`
    RSquare,
    /// `{`
    LCurly,
    /// `}`
    RCurly,
    /// `"`
    Quote,
    /// a digit (0-9) or a minus (`-`)
    DigitOrMinus,
    /// anything else
    Error,
}

impl core::fmt::Display for Token {
    /// Print the token as it occurs in JSON where this is possible,
    /// otherwise print a description of the token.
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        let text = match self {
            Token::Null => "null",
            Token::True => "true",
            Token::False => "false",
            Token::Comma => ",",
            Token::Colon => ":",
            Token::LSquare => "[",
            Token::RSquare => "]",
            Token::LCurly => "{",
            Token::RCurly => "}",
            Token::Quote => "\"",
            Token::DigitOrMinus => "number",
            Token::Error => "unknown token",
        };
        f.pad(text)
    }
}

impl Token {
    /// Return `Ok(())` if `self` equals `token`, else return `Err(err)`.
    pub fn equals_or<E>(&self, token: Token, err: E) -> Result<(), E> {
        if *self == token {
            Ok(())
        } else {
            Err(err)
        }
    }
}
/// Convert a character to a token, such as '`:`' to `Token::Colon`.
///
/// When the token consists of several characters, such as
/// `null`, `true`, or `false`,
/// also consume the following characters.
///
/// `c` is the byte that the caller has peeked at, see `ws_token`.
/// For single-character tokens, including unknown ones that yield `Token::Error`,
/// the byte is taken from the lexer before returning.
/// For the start of a number (`Token::DigitOrMinus`), nothing is consumed here.
fn token(&mut self, c: u8) -> Token {
    let token = match c {
        // it is important to `return` here in order not to read a byte,
        // like we do for the regular, single-character tokens
        // (`exact` takes the first byte itself before matching the remainder)
        b'n' => return self.exact([b'u', b'l', b'l'], Token::Null),
        b't' => return self.exact([b'r', b'u', b'e'], Token::True),
        b'f' => return self.exact([b'a', b'l', b's', b'e'], Token::False),
        // the first character of a number is left in place
        b'0'..=b'9' | b'-' => return Token::DigitOrMinus,
        b'"' => Token::Quote,
        b'[' => Token::LSquare,
        b']' => Token::RSquare,
        b'{' => Token::LCurly,
        b'}' => Token::RCurly,
        b',' => Token::Comma,
        b':' => Token::Colon,
        _ => Token::Error,
    };
    // consume the single character that makes up the token
    self.take_next();
    token
}
use crate::{num, str, token, Error, LexAlloc, Token};
use alloc::vec::Vec;
use core::fmt;
use core::ops::Deref;

/// JSON value.
///
/// `Num` is the type used to store the text of numbers and
/// `Str` is the type used to store strings and object keys.
#[derive(Debug)]
pub enum Value<Num, Str> {
    /// `null`
    Null,
    /// `true` or `false`
    Bool(bool),
    /// string representation of a number with positional information
    Number((Num, num::Parts)),
    /// string
    String(Str),
    /// array
    Array(Vec<Self>),
    /// mapping from strings to values
    Object(Vec<(Str, Self)>),
}

/// Structural equality between values that may use different
/// number and string representations.
impl<NumL: PartialEq<NumR>, NumR, StrL: PartialEq<StrR>, StrR> PartialEq<Value<NumR, StrR>>
    for Value<NumL, StrL>
{
    fn eq(&self, other: &Value<NumR, StrR>) -> bool {
        use Value::*;
        match (self, other) {
            (Null, Null) => true,
            (Bool(l), Bool(r)) => l == r,
            (Number((nl, pl)), Number((nr, pr))) => nl == nr && pl == pr,
            (String(l), String(r)) => l == r,
            (Array(l), Array(r)) => l == r,
            // The entries are tuples of *different* types on both sides, for which
            // there is no `PartialEq` impl, so compare them component-wise.
            // Entries are compared in order: key order matters.
            (Object(l), Object(r)) => {
                l.len() == r.len()
                    && l.iter()
                        .zip(r)
                        .all(|((kl, vl), (kr, vr))| kl == kr && vl == vr)
            }
            _ => false,
        }
    }
}

/// Print the value as compact JSON, without any whitespace between tokens.
impl<Num: Deref<Target = str>, Str: Deref<Target = str>> fmt::Display for Value<Num, Str> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use Value::*;
        match self {
            Null => "null".fmt(f),
            Bool(b) => b.fmt(f),
            // numbers are printed using their stored text
            Number((n, _)) => n.fmt(f),
            String(s) => str::Display::new(&**s).fmt(f),
            Array(a) => {
                "[".fmt(f)?;
                let mut iter = a.iter();
                // the first element is not preceded by a comma, all others are
                if let Some(first) = iter.next() {
                    write!(f, "{}", first)?;
                }
                iter.try_for_each(|v| write!(f, ",{}", v))?;
                "]".fmt(f)
            }
            Object(o) => {
                "{".fmt(f)?;
                let mut iter = o.iter().map(|(k, v)| (str::Display::new(&**k), v));
                // the first entry is not preceded by a comma, all others are
                if let Some((k, v)) = iter.next() {
                    write!(f, "{}:{}", k, v)?;
                }
                iter.try_for_each(|(k, v)| write!(f, ",{}:{}", k, v))?;
                "}".fmt(f)
            }
        }
    }
}

/// Parse a value, using `f` to parse recursive values inside arrays / objects.
///
/// `token` is the first token of the value, which has already been lexed.
/// Any token that cannot start a value yields `token::Expect::Value`.
// NOTE(review): generic parameter/argument lists appear to have been stripped
// from this text during extraction (`L` is used without a visible declaration,
// and return types read `Result, Error>`). Tokens are left exactly as found --
// restore the missing `<...>` spans from upstream before compiling.
fn parse(
    token: Token,
    lexer: &mut L,
    f: impl Fn(Token, &mut L) -> Result, Error>,
) -> Result, Error> {
    match token {
        Token::Null => Ok(Value::Null),
        Token::True => Ok(Value::Bool(true)),
        Token::False => Ok(Value::Bool(false)),
        Token::DigitOrMinus => Ok(Value::Number(lexer.num_string()?)),
        Token::Quote => Ok(Value::String(lexer.str_string()?)),
        Token::LSquare => Ok(Value::Array({
            let mut arr = Vec::new();
            // every array element is parsed by `f`, not by `parse` itself
            lexer.seq(Token::RSquare, |token, lexer| {
                arr.push(f(token, lexer)?);
                Ok::<_, Error>(())
            })?;
            arr
        })),
        Token::LCurly => Ok(Value::Object({
            let mut obj = Vec::new();
            lexer.seq(Token::RCurly, |token, lexer| {
                // key string followed by a colon ...
                let key =
                    lexer.str_colon(token, |lexer| lexer.str_string().map_err(Error::Str))?;
                // ... followed by a value that is parsed by `f`
                let value = f(lexer.ws_token().ok_or(token::Expect::Value)?, lexer)?;
                obj.push((key, value));
                Ok::<_, Error>(())
            })?;
            obj
        })),
        _ => Err(token::Expect::Value)?,
    }
}

/// Parse a value, not limiting the recursion depth.
///
/// To prevent stack overflows, consider using [`parse_bounded`].
pub fn parse_unbounded(
    token: Token,
    lexer: &mut L,
) -> Result, Error> {
    // nested values are parsed by this very function
    parse(token, lexer, parse_unbounded)
}

/// Parse a value, limiting the recursion to `depth`.
///
/// This serves to prevent stack overflows.
///
/// Every call uses up one level of `depth`, and values nested inside
/// arrays / objects are parsed with the remaining depth.
/// When `depth` is 0, this returns `Error::Depth` without parsing anything.
pub fn parse_bounded(
    depth: usize,
    token: Token,
    lexer: &mut L,
) -> Result, Error> {
    let d = depth.checked_sub(1).ok_or(Error::Depth)?;
    parse(token, lexer, |token, lexer| parse_bounded(d, token, lexer))
}
hifijson-0.2.3/src/write.rs000064400000000000000000000022441046102023000137300ustar 00000000000000
pub trait Write {
    // NOTE(review): a `<...>` span after `Deref` looks stripped by extraction
    // (presumably a `Target` binding) -- confirm against upstream.
    type Bytes: core::ops::Deref + Default;

    /// Write input to `bytes` until `stop` yields true.
    ///
    /// This function does not return a new [`Self::Bytes`] to avoid allocations.
    fn write_until(&mut self, bytes: &mut Self::Bytes, stop: impl FnMut(u8) -> bool);
}

impl<'a> Write for crate::SliceLexer<'a> {
    type Bytes = &'a [u8];

    // Sets `bytes` to the prefix of the remaining input before the first byte
    // for which `stop` is true (the whole remaining input if there is none),
    // and advances the lexer past that prefix. No data is copied, and
    // the byte that stopped the scan remains in the input.
    fn write_until(&mut self, bytes: &mut &'a [u8], mut stop: impl FnMut(u8) -> bool) {
        let pos = self.slice.iter().position(|c| stop(*c));
        let pos = pos.unwrap_or(self.slice.len());
        *bytes = &self.slice[..pos];
        self.slice = &self.slice[pos..]
    }
}

// NOTE(review): the impl header and the `Vec` element type below have lost
// their `<...>` spans during extraction -- restore from upstream.
#[cfg(feature = "alloc")]
impl>> Write for crate::IterLexer {
    type Bytes = alloc::vec::Vec;

    // Clears `bytes`, then appends every byte that is read until `stop` is true.
    fn write_until(&mut self, bytes: &mut Self::Bytes, mut stop: impl FnMut(u8) -> bool) {
        use crate::Read;

        bytes.clear();
        while let Some(c) = self.read() {
            if stop(c) {
                // keep the stopping byte in `self.last` instead of dropping it
                // (presumably so that the next peek/read sees it -- confirm in `IterLexer`)
                self.last = Some(c);
                return;
            } else {
                bytes.push(c)
            }
        }
        // `read` yielded no further byte
        self.last = None
    }
}
hifijson-0.2.3/tests/serde.rs000064400000000000000000000027031046102023000142530ustar 00000000000000
#![cfg(feature = "serde")]

use serde::Deserialize;

// Deserialise exactly one JSON value from a byte slice.
// NOTE(review): the type arguments of the return type (`Result`) and of the
// turbofish calls below (`from_slice::(..)`, `from_slice::>(..)`,
// `HashMap::::new()`) were stripped by extraction and cannot be determined
// from this text -- restore from upstream.
pub fn from_slice<'a, T: Deserialize<'a>>(s: &'a [u8]) -> Result {
    hifijson::serde::exactly_one(&mut hifijson::SliceLexer::new(s))
}

#[test]
fn basic() {
    assert_eq!((), from_slice(b"null").unwrap());
    assert_eq!(true, from_slice(b"true").unwrap());
    assert_eq!(false, from_slice(b"false").unwrap());
}

#[test]
fn numbers() {
    assert_eq!(0, from_slice(b"0").unwrap());
    assert_eq!(42, from_slice(b"42").unwrap());
    assert_eq!(3.1415, from_slice(b"3.1415").unwrap());
    assert_eq!(-42, from_slice(b"-42").unwrap());
}

#[test]
fn strings() {
    assert_eq!("asdf", from_slice::(br#""asdf""#).unwrap());
}

#[test]
fn arrays() {
    assert_eq!(Vec::<()>::new(), from_slice::>(b"[]").unwrap());
    assert_eq!(vec![0], from_slice::>(b"[0]").unwrap());
    assert_eq!(vec![0, 1], from_slice::>(b"[0, 1]").unwrap());
    assert_eq!(vec![0, -1], from_slice::>(b"[0, -1]").unwrap());
    // integer literals deserialised into floating-point elements
    assert_eq!(vec![0.0, 1.0], from_slice::>(b"[0, 1]").unwrap());
}

#[test]
fn objects() {
    use std::collections::HashMap;
    let a = HashMap::from([("a".to_string(), 1)]);
    let b = HashMap::from([("a".to_string(), 1), ("b".to_string(), 2)]);
    assert_eq!(HashMap::::new(), from_slice(b"{}").unwrap());
    assert_eq!(a, from_slice(br#"{"a": 1}"#).unwrap());
    assert_eq!(b, from_slice(br#"{"a": 1, "b": 2}"#).unwrap());
}
hifijson-0.2.3/tests/tests.rs000064400000000000000000000151661046102023000143220ustar 00000000000000
use core::num::NonZeroUsize;
use hifijson::token::{Lex, Token};
use hifijson::value::{self, Value};
use hifijson::{escape, ignore, num, str, Error, Expect, IterLexer, LexAlloc, SliceLexer};

// NOTE(review): the helpers below have lost their generic parameter lists and
// the type arguments of `Value`, `Option`, `Iterator`, `Vec`, and `Result`
// during extraction -- restore from upstream.

/// Construct a boolean value.
fn bol(b: bool) -> Value {
    Value::Bool(b)
}

/// Construct a number from its text and the optional positions of dot and exponent.
///
/// Panics if a given position is 0, because positions are stored as `NonZeroUsize`.
fn num(n: Num, dot: Option, exp: Option) -> Value {
    let dot = dot.map(|i| NonZeroUsize::new(i).unwrap());
    let exp = exp.map(|i| NonZeroUsize::new(i).unwrap());
    Value::Number((n, hifijson::num::Parts { dot, exp }))
}

/// Construct a number that has neither a dot nor an exponent.
fn int(i: Num) -> Value {
    num(i, None, None)
}

/// Construct an array from a fixed-size array of values.
fn arr(v: [Value; N]) -> Value {
    Value::Array(v.into())
}

/// Construct an object from a fixed-size array of key-value pairs.
fn obj(v: [(Str, Value); N]) -> Value {
    Value::Object(v.into())
}

/// Iterate over the bytes of `slice`, wrapping each in `Ok`, as input for `IterLexer`.
fn iter_of_slice(slice: &[u8]) -> impl Iterator> + '_ {
    slice.iter().copied().map(Ok)
}

/// Assert that `slice` parses to `v` with both `SliceLexer` and `IterLexer`.
///
/// Before that, check that `ignore::parse` accepts `slice` with both lexers.
fn parses_to(slice: &[u8], v: Value<&str, &str>) -> Result<(), Error> {
    SliceLexer::new(slice).exactly_one(ignore::parse)?;
    IterLexer::new(iter_of_slice(slice)).exactly_one(ignore::parse)?;

    let parsed = SliceLexer::new(slice).exactly_one(value::parse_unbounded)?;
    assert_eq!(parsed, v);

    let parsed = IterLexer::new(iter_of_slice(slice)).exactly_one(value::parse_unbounded)?;
    assert_eq!(parsed, v);

    Ok(())
}

/// Like `parses_to`, but parse `slice` with `parse_binary_string` and compare with the bytes `v`.
fn parses_to_binary_string(slice: &[u8], v: &[u8]) -> Result<(), Error> {
    SliceLexer::new(slice).exactly_one(ignore::parse)?;
    IterLexer::new(iter_of_slice(slice)).exactly_one(ignore::parse)?;

    let parsed = SliceLexer::new(slice).exactly_one(parse_binary_string)?;
    assert_eq!(parsed, v);

    let parsed = IterLexer::new(iter_of_slice(slice)).exactly_one(parse_binary_string)?;
    assert_eq!(parsed, v);

    Ok(())
}

/// Parse a JSON string to bytes instead of to a `str`.
///
/// Unescaped string contents are copied verbatim, whereas
/// escape sequences are decoded to characters and stored as UTF-8.
fn parse_binary_string(token: Token, lexer: &mut L) -> Result, Error> {
    if token != hifijson::Token::Quote {
        Err(Error::Token(Expect::String))?
    }

    // called with the unescaped bytes of the string
    let on_string = |bytes: &mut L::Bytes, out: &mut Vec| {
        out.extend_from_slice(bytes);
        Ok(())
    };
    // the last closure is called for every escape sequence
    lexer.str_fold(Vec::new(), on_string, |lexer, escape, out| {
        let c = lexer.escape_char(escape).map_err(str::Error::Escape)?;
        out.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes());
        Ok(())
    })
}

/// Assert that both `ignore::parse` and `value::parse_unbounded` fail on `slice` with `e`.
fn fails_with(slice: &[u8], e: Error) {
    let parsed = SliceLexer::new(slice).exactly_one(ignore::parse);
    assert_eq!(parsed.unwrap_err(), e);

    let parsed = IterLexer::new(iter_of_slice(slice)).exactly_one(ignore::parse);
    assert_eq!(parsed.unwrap_err(), e);

    parse_fails_with(slice, e)
}

/// Assert that `value::parse_unbounded` fails on `slice` with `e`, for both lexers.
///
/// Unlike `fails_with`, this does not run `ignore::parse`.
fn parse_fails_with(slice: &[u8], e: Error) {
    let parsed = SliceLexer::new(slice).exactly_one(value::parse_unbounded);
    assert_eq!(parsed.unwrap_err(), e);

    let parsed = IterLexer::new(iter_of_slice(slice)).exactly_one(value::parse_unbounded);
    assert_eq!(parsed.unwrap_err(), e);
}

#[test]
fn basic() -> Result<(), Error> {
    parses_to(b"null", Value::Null)?;
    parses_to(b"false", Value::Bool(false))?;
    parses_to(b"true", Value::Bool(true))?;

    // truncated keywords and unknown characters
    fails_with(b"nul", Expect::Value.into());
    fails_with(b"fal", Expect::Value.into());
    fails_with(b"t", Expect::Value.into());
    fails_with(b"a", Expect::Value.into());

    // more than one value
    fails_with(b"true false", Expect::Eof.into());

    Ok(())
}

#[test]
fn numbers() -> Result<(), Error> {
    parses_to(b"0", num("0", None, None))?;
    parses_to(b"42", num("42", None, None))?;
    parses_to(b"-0", num("-0", None, None))?;
    parses_to(b"-42", num("-42", None, None))?;

    parses_to(b"3.14", num("3.14", Some(1), None))?;

    // speed of light in m/s
    parses_to(b"299e6", num("299e6", None, Some(3)))?;
    // now a bit more precise
    parses_to(b"299.792e6", num("299.792e6", Some(3), Some(7)))?;

    fails_with(b"-", num::Error::ExpectedDigit.into());

    Ok(())
}

#[test]
fn strings() -> Result<(), Error> {
    // greetings to Japan
    parses_to(r#""Hello 日本""#.as_bytes(), Value::String("Hello 日本"))?;

    // single-character escape sequences
    parses_to(
        br#""\"\\\/\b\f\n\r\t""#,
        Value::String("\"\\/\u{8}\u{c}\n\r\t"),
    )?;

    // UTF-16 surrogate pairs
    parses_to(br#""\uD801\uDC37""#, Value::String("𐐷"))?;
    // the smallest value representable with a surrogate pair
    parses_to(br#""\ud800\udc00""#, Value::String("𐀀"))?;
    // the largest value representable with a surrogate pair
    parses_to(br#""\udbff\udfff""#, Value::String("􏿿"))?;

    parses_to(br#""aa\nbb\ncc""#, Value::String("aa\nbb\ncc"))?;

    // wrap an escape error into the error type that parsing returns
    let escape = |e| Error::Str(str::Error::Escape(e));

    fails_with(br#""\x""#, escape(escape::Error::UnknownKind));
    fails_with(br#""\U""#, escape(escape::Error::UnknownKind));
    fails_with(br#""\"#, escape(escape::Error::Eof));
    fails_with(br#""\u00"#, escape(escape::Error::Eof));

    fails_with("\"\u{0}\"".as_bytes(), str::Error::Control.into());
    // corresponds to ASCII code 31 in decimal notation
    fails_with("\"\u{1F}\"".as_bytes(), str::Error::Control.into());

    // string without closing quote
    fails_with(br#""abcd"#, str::Error::Eof.into());

    // these use `parse_fails_with`, so only value parsing is asserted to fail here
    parse_fails_with(br#""\uDC37""#, escape(escape::Error::InvalidChar(0xdc37)));
    parse_fails_with(br#""\uD801""#, escape(escape::Error::ExpectedLowSurrogate));

    // 34 is the ASCII code of `"`; the bytes after it are not valid UTF-8
    let s = [34, 159, 146, 150];
    let err = core::str::from_utf8(&s[1..]).unwrap_err();
    parse_fails_with(&s, str::Error::Utf8(err).into());

    Ok(())
}

#[test]
fn arrays() -> Result<(), Error> {
    parses_to(b"[]", arr([]))?;
    parses_to(b"[false, true]", arr([bol(false), bol(true)]))?;
    parses_to(b"[0, 1]", arr([int("0"), int("1")]))?;
    parses_to(b"[[]]", arr([arr([])]))?;

    // unterminated arrays and a missing comma
    fails_with(b"[", Expect::ValueOrEnd.into());
    fails_with(b"[1", Expect::CommaOrEnd.into());
    fails_with(b"[1 2", Expect::CommaOrEnd.into());
    fails_with(b"[1,", Expect::Value.into());

    Ok(())
}

#[test]
fn objects() -> Result<(), Error> {
    parses_to(b"{}", obj([]))?;
    parses_to(br#"{"a": 0}"#, obj([("a", int("0"))]))?;
    parses_to(
        br#"{"a": 0, "b": 1}"#,
        obj([("a", int("0")), ("b", int("1"))]),
    )?;

    // unterminated objects, a non-string key, and a missing colon
    fails_with(b"{", Expect::ValueOrEnd.into());
    fails_with(b"{0", Expect::String.into());
    fails_with(br#"{"a" 1"#, Expect::Colon.into());
    fails_with(br#"{"a": 1"#, Expect::CommaOrEnd.into());
    fails_with(br#"{"a": 1,"#, Expect::Value.into());

    Ok(())
}

#[test]
fn binary_strings() -> Result<(), Error> {
    parses_to_binary_string(br#""aaa\nbbb\nccc""#, b"aaa\nbbb\nccc")?;
    // bytes that are not valid UTF-8 are passed through unchanged
    parses_to_binary_string(b"\"aaa\xffbbb\xffccc\"", b"aaa\xffbbb\xffccc")?;
    // the escape sequence `\u2200` and the raw bytes `e2 88 80` yield the same character
    parses_to_binary_string(
        b"\"aaa\\u2200\xe2\x88\x80ccc\"",
        "aaa\u{2200}\u{2200}ccc".as_bytes(),
    )?;

    Ok(())
}