calamine-0.34.0/.cargo_vcs_info.json0000644000000001361046102023000127130ustar { "git": { "sha1": "1fb5fac11213f458c7efa5779e6f63d72c49653b" }, "path_in_vcs": "" }calamine-0.34.0/.github/ISSUE_TEMPLATE/bug_report.yml000064400000000000000000000047361046102023000201110ustar 00000000000000name: 🐞 Bug description: File a bug/issue in calamine title: "Bug: add a description here" labels: [bug] body: - type: markdown attributes: value: Please fill in the title above and the sections below to submit your bug report. - type: textarea attributes: label: Current behavior description: A description of what you're experiencing. validations: required: true - type: textarea attributes: label: Expected behavior description: A description of what you expected to happen. validations: required: true - type: textarea attributes: label: Sample code to reproduce description: Please add a small, complete, sample program that demonstrates your issue. value: | ``` use calamine::{open_workbook, Error, Reader, Xlsx}; fn main() -> Result<(), Error> { let test_file = "somefile.xlsx"; let mut workbook: Xlsx<_> = open_workbook(test_file)?; let sheet_range = workbook.worksheet_range("Sheet1")?; let mut iter = sheet_range.deserialize()?; if let Some(result) = iter.next() { let (label, value): (String, f64) = result?; assert_eq!(label, "celsius"); assert_eq!(value, 22.2222); Ok(()) } else { Err(From::from("Expected at least one record but got none")) } } ``` render: rust validations: required: true - type: textarea attributes: label: Test file description: > Please attach the test file that you used in the sample code above. If you are unable to share the original file, please create a minimal test file that reproduces the issue. Tip: You can attach files by clicking this area to highlight it and then dragging files in. 
validations: required: true - type: textarea attributes: label: Environment description: | Add any relevant version or system information: value: | - calamine version: - Cargo.toml dependency line for calamine: - rustc version: - Excel/OpenOffice/LibreOffice version: - OS: render: text validations: required: false - type: checkboxes attributes: label: Checklist description: > Ensure that the following have been included. options: - label: I have added a complete sample program that compiles in Rust. - label: I have added a test file. required: false calamine-0.34.0/.github/ISSUE_TEMPLATE/feature_request.yml000064400000000000000000000011351046102023000211320ustar 00000000000000name: 🎯 Feature Request description: Request a new feature in calamine title: "feature request: add a description here" labels: [feature request] body: - type: markdown attributes: value: | Use the dialog box below to request a new feature in calamine. - type: textarea attributes: label: Feature Request description: | Describe the new feature that you would like to see in calamine. It is important to explain what your personal use case is. Consider adding a screenshot or a sample file to help explain the request. validations: required: true calamine-0.34.0/.github/ISSUE_TEMPLATE/question.yml000064400000000000000000000016021046102023000175750ustar 00000000000000name: ❓ Question description: Ask a question about calamine title: "question: add a description here" labels: [question] body: - type: markdown attributes: value: > General questions on how to do something with the calamine crate should be asked on [StackOverflow](http://stackoverflow.com/questions/tagged/calamine) with a tag of "calamine". This has a better chance of getting several answers and also helps others who might have similar questions in the future. If you don't get an answer on StackOverflow, you can come back here and refer to your question there. If your question is "Is this a bug" then please file a bug report instead. 
Other questions can be asked below. - type: textarea attributes: label: Question description: Ask a question that doesn't belong on StackOverflow validations: required: true calamine-0.34.0/.github/workflows/rust.yml000064400000000000000000000031531046102023000166000ustar 00000000000000name: Rust on: [ push, pull_request ] jobs: rust: runs-on: ubuntu-24.04 strategy: matrix: toolchain: - "1.83" # MSRV. - stable - beta - nightly steps: - uses: actions/checkout@v4 - name: ensure dependencies if: ${{ startsWith(matrix.toolchain, '1.') && fromJson(matrix.toolchain) < 1.85 }} run: CARGO_RESOLVER_INCOMPATIBLE_RUST_VERSIONS="fallback" cargo update - name: Install Rust toolchain uses: dtolnay/rust-toolchain@master with: toolchain: ${{ matrix.toolchain }} - name: Toolchain version run: | cargo -vV rustc -vV - name: Build run: | cargo build - name: Run tests run: | cargo test --all-features - name: Install rustfmt, clippy uses: dtolnay/rust-toolchain@master if: ${{ matrix.toolchain == 'stable' }} with: toolchain: ${{ matrix.toolchain }} components: rustfmt, clippy - name: Format checks if: ${{ matrix.toolchain == 'stable' }} run: | cargo fmt -- --check - name: Clippy checks if: ${{ matrix.toolchain == 'stable' }} run: | cargo clippy --all-targets --all-features -- -Dwarnings wasm: runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v4 - name: Install Rust toolchain uses: dtolnay/rust-toolchain@stable with: targets: wasm32-unknown-unknown - name: Check wasm32 compilation run: | cargo check --target wasm32-unknown-unknown cargo check --target wasm32-unknown-unknown --all-features calamine-0.34.0/.github/workflows/typos.yml000064400000000000000000000005021046102023000167540ustar 00000000000000name: Typos Check on: pull_request: workflow_dispatch: jobs: run: name: Spell Check with Typos runs-on: ubuntu-latest steps: - name: Checkout Actions Repository uses: actions/checkout@v4 - name: Check spelling uses: crate-ci/typos@master with: config: ./.typos.toml 
calamine-0.34.0/.gitignore000064400000000000000000000001401046102023000134440ustar 00000000000000target Cargo.lock *.bk .vim /.idea/ fuzz.xlsx .idea nyc.rs .DS_Store .vscode/ examples/doc_*.rs calamine-0.34.0/.typos.toml000064400000000000000000000001671046102023000136160ustar 00000000000000[default.extend-words] typ = "typ" ODF = "ODF" PERMUT = "PERMUT" AVERAGEIFS = "AVERAGEIFS" Shs = "Shs" TTEST = "TTEST" calamine-0.34.0/Cargo.lock0000644000000706141046102023000106760ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 4 [[package]] name = "adler2" version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" [[package]] name = "aho-corasick" version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" dependencies = [ "memchr", ] [[package]] name = "alloca" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5a7d05ea6aea7e9e64d25b9156ba2fee3fdd659e34e41063cd2fc7cd020d7f4" dependencies = [ "cc", ] [[package]] name = "anes" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" version = "0.6.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3ae563653d1938f79b1ab1b5e668c87c76a9930414574a6583a7b7e11a8e6192" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", "is_terminal_polyfill", "utf8parse", ] [[package]] name = "anstyle" version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" [[package]] name = "anstyle-parse" version = "0.2.7" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" dependencies = [ "windows-sys", ] [[package]] name = "anstyle-wincon" version = "3.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" dependencies = [ "anstyle", "once_cell_polyfill", "windows-sys", ] [[package]] name = "atoi_simd" version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ad17c7c205c2c28b527b9845eeb91cf1b4d008b438f98ce0e628227a822758e" dependencies = [ "debug_unsafe", ] [[package]] name = "autocfg" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "block-buffer" version = "0.10.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" dependencies = [ "generic-array", ] [[package]] name = "bumpalo" version = "3.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" [[package]] name = "byteorder" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "calamine" version = "0.34.0" dependencies = [ "atoi_simd", "byteorder", "chrono", "codepage", "criterion", "encoding_rs", "env_logger", "fast-float2", "glob", "log", "quick-xml", "rstest", "serde", "serde_derive", "sha2", "zip", ] [[package]] name = "cast" version = "0.3.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" version = "1.2.56" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" dependencies = [ "find-msvc-tools", "shlex", ] [[package]] name = "cfg-if" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" [[package]] name = "chrono" version = "0.4.41" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c469d952047f47f91b68d1cba3f10d63c11d73e4636f24f08daf0278abf01c4d" dependencies = [ "num-traits", "serde", ] [[package]] name = "ciborium" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" dependencies = [ "ciborium-io", "ciborium-ll", "serde", ] [[package]] name = "ciborium-io" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" [[package]] name = "ciborium-ll" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" dependencies = [ "ciborium-io", "half", ] [[package]] name = "clap" version = "4.5.60" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" version = "4.5.60" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" dependencies = [ "anstyle", "clap_lex", ] [[package]] name = "clap_lex" version = "1.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831" [[package]] name = "codepage" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4" dependencies = [ "encoding_rs", ] [[package]] name = "colorchoice" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" [[package]] name = "cpufeatures" version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" dependencies = [ "libc", ] [[package]] name = "crc32fast" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" dependencies = [ "cfg-if", ] [[package]] name = "criterion" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "950046b2aa2492f9a536f5f4f9a3de7b9e2476e575e05bd6c333371add4d98f3" dependencies = [ "alloca", "anes", "cast", "ciborium", "clap", "criterion-plot", "itertools", "num-traits", "oorandom", "page_size", "plotters", "rayon", "regex", "serde", "serde_json", "tinytemplate", "walkdir", ] [[package]] name = "criterion-plot" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8d80a2f4f5b554395e47b5d8305bc3d27813bacb73493eb1001e8f76dae29ea" dependencies = [ "cast", "itertools", ] [[package]] name = "crossbeam-deque" version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" dependencies = [ "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" version = "0.9.18" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ "crossbeam-utils", ] [[package]] name = "crossbeam-utils" version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crunchy" version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" [[package]] name = "crypto-common" version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" dependencies = [ "generic-array", "typenum", ] [[package]] name = "debug_unsafe" version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85d3cef41d236720ed453e102153a53e4cc3d2fde848c0078a50cf249e8e3e5b" [[package]] name = "digest" version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", ] [[package]] name = "either" version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" [[package]] name = "encoding_rs" version = "0.8.35" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" dependencies = [ "cfg-if", ] [[package]] name = "env_filter" version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "186e05a59d4c50738528153b83b0b0194d3a29507dfec16eccd4b342903397d0" dependencies = [ "log", "regex", ] [[package]] name = "env_logger" version = "0.11.8" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "13c863f0904021b108aa8b2f55046443e6b1ebde8fd4a15c399893aae4fa069f" dependencies = [ "anstream", "anstyle", "env_filter", "jiff", "log", ] [[package]] name = "equivalent" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "fast-float2" version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55" [[package]] name = "find-msvc-tools" version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" [[package]] name = "flate2" version = "1.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4a3d7db9596fecd151c5f638c0ee5d5bd487b6e0ea232e5dc96d5250f6f94b1d" dependencies = [ "crc32fast", "libz-rs-sys", "miniz_oxide", ] [[package]] name = "generic-array" version = "0.14.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ "typenum", "version_check", ] [[package]] name = "glob" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" [[package]] name = "half" version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ "cfg-if", "crunchy", "zerocopy", ] [[package]] name = "hashbrown" version = "0.15.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5" [[package]] name = "indexmap" version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"fe4cd85333e22411419a0bcae1297d25e58c9443848b11dc6a86fefe8c78a661" dependencies = [ "equivalent", "hashbrown", ] [[package]] name = "is_terminal_polyfill" version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] name = "itertools" version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" dependencies = [ "either", ] [[package]] name = "itoa" version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" [[package]] name = "jiff" version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be1f93b8b1eb69c77f24bbb0afdf66f54b632ee39af40ca21c4365a1d7347e49" dependencies = [ "jiff-static", "log", "portable-atomic", "portable-atomic-util", "serde", ] [[package]] name = "jiff-static" version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "03343451ff899767262ec32146f6d559dd759fdadf42ff0e227c7c48f72594b4" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "js-sys" version = "0.3.91" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" dependencies = [ "once_cell", "wasm-bindgen", ] [[package]] name = "libc" version = "0.2.174" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" [[package]] name = "libz-rs-sys" version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "172a788537a2221661b480fee8dc5f96c580eb34fa88764d3205dc356c7e4221" dependencies = [ "zlib-rs", ] [[package]] name = "log" version = "0.4.27" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" [[package]] name = "memchr" version = "2.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" [[package]] name = "miniz_oxide" version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ "adler2", ] [[package]] name = "num-traits" version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", ] [[package]] name = "once_cell" version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "once_cell_polyfill" version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" [[package]] name = "oorandom" version = "11.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" [[package]] name = "page_size" version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30d5b2194ed13191c1999ae0704b7839fb18384fa22e49b57eeaa97d79ce40da" dependencies = [ "libc", "winapi", ] [[package]] name = "plotters" version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" dependencies = [ "num-traits", "plotters-backend", "plotters-svg", "wasm-bindgen", "web-sys", ] [[package]] name = "plotters-backend" version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" [[package]] name = "plotters-svg" version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" dependencies = [ "plotters-backend", ] [[package]] name = "portable-atomic" version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" [[package]] name = "portable-atomic-util" version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" dependencies = [ "portable-atomic", ] [[package]] name = "proc-macro2" version = "1.0.95" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" dependencies = [ "unicode-ident", ] [[package]] name = "quick-xml" version = "0.39.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "958f21e8e7ceb5a1aa7fa87fab28e7c75976e0bfe7e23ff069e0a260f894067d" dependencies = [ "encoding_rs", "memchr", ] [[package]] name = "quote" version = "1.0.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" dependencies = [ "proc-macro2", ] [[package]] name = "rayon" version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" dependencies = [ "either", "rayon-core", ] [[package]] name = "rayon-core" version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" dependencies = [ "crossbeam-deque", "crossbeam-utils", ] [[package]] name = "regex" version = "1.11.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", "regex-automata", "regex-syntax", ] [[package]] name = "regex-automata" version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", "regex-syntax", ] [[package]] name = "regex-syntax" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "relative-path" version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2" [[package]] name = "rstest" version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f5a3193c063baaa2a95a33f03035c8a72b83d97a54916055ba22d35ed3839d49" dependencies = [ "rstest_macros", ] [[package]] name = "rstest_macros" version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c845311f0ff7951c5506121a9ad75aec44d083c31583b2ea5a30bcb0b0abba0" dependencies = [ "cfg-if", "glob", "proc-macro2", "quote", "regex", "relative-path", "rustc_version", "syn", "unicode-ident", ] [[package]] name = "rustc_version" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" dependencies = [ "semver", ] [[package]] name = "rustversion" version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "ryu" version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" [[package]] name = "same-file" version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" dependencies = [ "winapi-util", ] [[package]] name = "semver" version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" [[package]] name = "serde" version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" version = "1.0.219" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "serde_json" version = "1.0.143" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d401abef1d108fbd9cbaebc3e46611f4b1021f714a0597a71f41ee463f5f4a5a" dependencies = [ "itoa", "memchr", "ryu", "serde", ] [[package]] name = "sha2" version = "0.10.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", "cpufeatures", "digest", ] [[package]] name = "shlex" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "simd-adler32" version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d66dc143e6b11c1eddc06d5c423cfc97062865baf299914ab64caa38182078fe" [[package]] name = "syn" version = "2.0.104" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "tinytemplate" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" dependencies = [ "serde", "serde_json", ] [[package]] name = "typed-path" version = "0.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3015e6ce46d5ad8751e4a772543a30c7511468070e98e64e20165f8f81155b64" [[package]] name = "typenum" version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" [[package]] name = "unicode-ident" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" [[package]] name = "utf8parse" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "version_check" version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "walkdir" version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" dependencies = [ "same-file", "winapi-util", ] [[package]] name = "wasm-bindgen" version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" dependencies = [ "cfg-if", "once_cell", "rustversion", "wasm-bindgen-macro", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-macro" version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" dependencies = [ "quote", "wasm-bindgen-macro-support", ] [[package]] name = "wasm-bindgen-macro-support" version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" dependencies = [ "bumpalo", "proc-macro2", "quote", "syn", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" dependencies = [ "unicode-ident", ] [[package]] name = "web-sys" version = "0.3.91" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9" dependencies = [ "js-sys", "wasm-bindgen", ] [[package]] name = "winapi" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" dependencies = [ "winapi-i686-pc-windows-gnu", "winapi-x86_64-pc-windows-gnu", ] [[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ "windows-sys", ] [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" [[package]] name = "windows-link" version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" [[package]] name = "windows-sys" version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" dependencies = [ "windows-targets", ] [[package]] name = "windows-targets" version = "0.53.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" dependencies = [ "windows-link", "windows_aarch64_gnullvm", "windows_aarch64_msvc", "windows_i686_gnu", "windows_i686_gnullvm", "windows_i686_msvc", "windows_x86_64_gnu", "windows_x86_64_gnullvm", "windows_x86_64_msvc", ] [[package]] name = "windows_aarch64_gnullvm" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" [[package]] name = "windows_aarch64_msvc" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" [[package]] name = "windows_i686_gnu" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" [[package]] name = "windows_i686_gnullvm" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" [[package]] name = "windows_i686_msvc" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" [[package]] name = "windows_x86_64_gnu" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" [[package]] name = "windows_x86_64_gnullvm" version = 
"0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" [[package]] name = "windows_x86_64_msvc" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" [[package]] name = "zerocopy" version = "0.8.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a789c6e490b576db9f7e6b6d661bcc9799f7c0ac8352f56ea20193b2681532e5" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" version = "0.8.40" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f65c489a7071a749c849713807783f70672b28094011623e200cb86dcb835953" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "zip" version = "7.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "268bf6f9ceb991e07155234071501490bb41fd1e39c6a588106dad10ae2a5804" dependencies = [ "crc32fast", "flate2", "indexmap", "memchr", "typed-path", "zopfli", ] [[package]] name = "zlib-rs" version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "626bd9fa9734751fc50d6060752170984d7053f5a39061f524cda68023d4db8a" [[package]] name = "zopfli" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edfc5ee405f504cd4984ecc6f14d02d55cfda60fa4b689434ef4102aae150cd7" dependencies = [ "bumpalo", "crc32fast", "log", "simd-adler32", ] calamine-0.34.0/Cargo.toml0000644000000045501046102023000107150ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. 
# # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" rust-version = "1.83" name = "calamine" version = "0.34.0" authors = ["Johann Tuffe "] build = false exclude = ["tests/**/*"] autolib = false autobins = false autoexamples = false autotests = false autobenches = false description = "An Excel/OpenDocument Spreadsheet reader and deserializer in pure Rust" documentation = "https://docs.rs/calamine" readme = "README.md" keywords = [ "excel", "ods", "xls", "xlsx", "xlsb", ] categories = [ "encoding", "parsing", "text-processing", ] license = "MIT" repository = "https://github.com/tafia/calamine" [package.metadata.docs.rs] features = [ "chrono", "picture", ] rustdoc-args = [ "--cfg", "docsrs", ] [features] chrono = ["dep:chrono"] dates = ["chrono"] default = [] picture = [] [lib] name = "calamine" path = "src/lib.rs" [[example]] name = "excel_to_csv" path = "examples/excel_to_csv.rs" [[example]] name = "search_errors" path = "examples/search_errors.rs" [[bench]] name = "basic" path = "benches/basic.rs" harness = false [dependencies.atoi_simd] version = "0.17" [dependencies.byteorder] version = "1.5" [dependencies.chrono] version = "0.4" features = ["serde"] optional = true default-features = false [dependencies.codepage] version = "0.1" [dependencies.encoding_rs] version = "0.8" [dependencies.fast-float2] version = "0.2" [dependencies.log] version = "0.4" [dependencies.quick-xml] version = "0.39" features = ["encoding"] [dependencies.serde] version = "1.0" [dependencies.zip] version = "7.0" features = ["deflate"] default-features = false [dev-dependencies.criterion] version = "0.8" features = ["html_reports"] [dev-dependencies.env_logger] version = "0.11" [dev-dependencies.glob] version = "0.3" [dev-dependencies.rstest] version = "0.26" default-features = false [dev-dependencies.serde_derive] version = "1.0" 
[dev-dependencies.sha2] version = "0.10" calamine-0.34.0/Cargo.toml.orig000064400000000000000000000027661046102023000143630ustar 00000000000000[package] name = "calamine" version = "0.34.0" authors = ["Johann Tuffe "] repository = "https://github.com/tafia/calamine" documentation = "https://docs.rs/calamine" description = "An Excel/OpenDocument Spreadsheet reader and deserializer in pure Rust" license = "MIT" readme = "README.md" keywords = ["excel", "ods", "xls", "xlsx", "xlsb"] categories = ["encoding", "parsing", "text-processing"] exclude = ["tests/**/*"] edition = "2021" rust-version = "1.83" # For quick-xml + encoding_rs. [dependencies] log = "0.4" serde = "1.0" codepage = "0.1" atoi_simd = "0.17" byteorder = "1.5" encoding_rs = "0.8" fast-float2 = "0.2" zip = { version = "7.0", default-features = false, features = ["deflate"] } quick-xml = { version = "0.39", features = ["encoding"] } # Optional dependencies. chrono = { version = "0.4", features = ["serde"], optional = true, default-features = false } [dev-dependencies] glob = "0.3" sha2 = "0.10" env_logger = "0.11" serde_derive = "1.0" rstest = { version = "0.26", default-features = false } criterion = { version = "0.8", features = ["html_reports"] } [[bench]] name = "basic" harness = false [features] default = [] # `chrono`: Adds support for Chrono dates and times. chrono = ["dep:chrono"] # `picture`: Adds support for reading raw data for pictures in spreadsheets. picture = [] # `dates`: Backwards compatible synonym for the `chrono` feature. dates = ["chrono"] [package.metadata.docs.rs] features = ["chrono", "picture"] rustdoc-args = ["--cfg", "docsrs"] calamine-0.34.0/Changelog.md000064400000000000000000000500751046102023000137010ustar 00000000000000# Changelog This is the changelog/release notes for the `calamine` crate. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
## [0.34.0] - 2026-03-07 ### Changed - Updated benchmarking to use the `criterion.rs` crate. This removed the requirement for using `+nightly` for `cargo bench`. [PR #620]. [PR #620]: https://github.com/tafia/calamine/pull/620 - Updated dependencies for release 0.34.0: - `quick-xml`: 0.38 -> 0.39. [PR #617] [PR #617]: https://github.com/tafia/calamine/pull/617 - Refactored CFB to resolve special case in the caller of `fn get_chain`. [PR #615]. [PR #615]: https://github.com/tafia/calamine/pull/615 - Performance improvements: - Cell reader buffer reuse and optimised maps. [PR #611] - Optimised XLS sector chain reads. [PR #609] - Ensured `fast_float2` is used at all float-parsing call sites. [PR #608] - Cached path and attr lookups, reused buffers, minimised allocations. [PR #606] [PR #606]: https://github.com/tafia/calamine/pull/606 [PR #608]: https://github.com/tafia/calamine/pull/608 [PR #609]: https://github.com/tafia/calamine/pull/609 [PR #611]: https://github.com/tafia/calamine/pull/611 ### Fixed - Fixed capitalized workbook/book stream handling in XLS files. [PR #618]. [PR #618]: https://github.com/tafia/calamine/pull/618 - Fixed issue where BIFF5 `Lbl` and `ExternSheet` records were incorrectly parsed as BIFF8. [PR #613]. [PR #613]: https://github.com/tafia/calamine/pull/613 - Fixed VBA `check_variable_record` to handle optional records. [PR #614]. [PR #614]: https://github.com/tafia/calamine/pull/614 - Fixed premature error when BIFF version is not Biff8 in XLS files. [PR #619]. [PR #619]: https://github.com/tafia/calamine/pull/619 - Fixed issue where XLSX cells with an empty `<v>` value returned a string with index 0 instead of an Empty value. [Issue #607]. [Issue #607]: https://github.com/tafia/calamine/issues/607 - Fixed millisecond rounding issue where 1000 milliseconds weren't rounded up to the next second. [Issue #602], [PR #605].
[Issue #602]: https://github.com/tafia/calamine/issues/602 [PR #605]: https://github.com/tafia/calamine/pull/605 ## [0.33.0] - 2026-02-04 ### Added - Added support for reading data from Pivot Tables, which involves reading data from the internal Pivot Cache. [PR #559]. [PR #559]: https://github.com/tafia/calamine/pull/559 ### Changed - Update dependencies for release 0.33.0: - `zip`: 4.2.0 -> 7.0. - `atoi_simd`: 0.16 -> 0.17 ### Fixed - Fixed potential memory exhaustion issue in ODS files that could be triggered via repeated empty rows/columns. The fix adds limits to prevent memory exhaustion from malicious ODS files that declare billions of repeated cells via `table:number-rows-repeated` and `table:number-columns-repeated` attributes. The change adds the following protection layers: - Add cap for columns per row at `MAX_COLUMNS` (16,384). - Add cap for total row repeats at `MAX_ROWS` (1,048,576). - Add cap for total cells at `MAX_CELLS` (100 million) in `get_range()`. These limits match XLSX's existing row/column limits and prevent a 7KB malicious file from attempting to allocate memory for 17+ billion cells. When `MAX_CELLS` is exceeded, return `OdsError::CellLimitExceeded` instead of silently returning an empty range. This ensures callers are properly informed of truncation rather than receiving silent data loss. [Issue #594], [PR #596]. [Issue #594]: https://github.com/tafia/calamine/issues/594 [PR #596]: https://github.com/tafia/calamine/pull/596 - Fixed an issue where XLSX files with tables that had the internal `insertRow` attribute set returned a `Dimensions` object where the end row was less than the start row. This caused an assert/panic when trying to create a `Range` object to return the table range. [Issue #589].
[Issue #589]: https://github.com/tafia/calamine/issues/589 - Fixed an issue with XLSX files where worksheet tables used the unusual, but valid, absolute reference system like `"/xl/tables/table1.xml"` instead of the common Excel generated relative system `"../tables/table1.xml"`. [Issue #587]. [Issue #587]: https://github.com/tafia/calamine/issues/587 ## [0.32.0] - 2025-11-20 ### Changed - Refactored VBA reading functions to be on-demand for better performance. - Simplified `vba_project()` function return type from `Option<Result<VbaProject>>` to `Result<Option<VbaProject>>` for more idiomatic error handling. This is a breaking change. ### Fixed - Fixed out-of-memory vulnerabilities in XLS file parsing by bounding allocations. - Fixed and extended support for XLSX shared formulas with handling of ranges, absolute references, and column/row ranges in XLSX files. - Fixed XLSX issue with missing shared string sub-elements. Also improved error messages for shared string parsing issues. - Fixed acceptance of XLS `XLUnicodeRichExtendedString` records without reserved tags. - Fixed various edge cases in XLS handling that could lead to parsing errors. ## [0.31.0] - 2025-09-27 ### Changed - Upgraded `quick-xml` to v0.38. This was a significant change in `quick-xml` relative to v0.37 and required changes in `calamine` to entity handling. It also fixes EOL handling which may lead to regressions in `calamine` applications if they expected to see `"\r\n"` in strings instead of the correct (for XML and Excel) `"\n"`. For most users these will be inconsequential changes but please take note before upgrading production code. - Renamed the `"dates"` feature flag to `"chrono"` since there are now some native date handling features without `"chrono"`. The `"chrono"` flag is more specific and accurate. The `"dates"` flag is still supported as before for backward compatibility. This change also made some datatype methods related to date/times available in the `"default"` feature set.
They were previously hidden unnecessarily behind the "dates"/"chrono" flag. ### Added - Added a conversion function to `ExcelDateTime` to convert the inner serial Excel datetime to standard year, month, date, hour, minute, second and millisecond components. Works for 1900 and 1904 epochs. ### Fixed - Fixed issue where Excel xlsx shared formula failed if it contained Unicode characters. [Issue #553]. [Issue #553]: https://github.com/tafia/calamine/issues/553 - Fixed issue where Excel XML escapes in strings weren't unescaped. For example `"_x000D_" -> "\r"`. [Issue #469]. [Issue #469]: https://github.com/tafia/calamine/issues/469 ## [0.30.1] - 2025-09-06 ### Added - Added `Debug` and `Clone` to `Table` for easier debugging. [PR #547]. [PR #547]: https://github.com/tafia/calamine/issues/547 ### Fixed - Fixed issue [Issue #548] for xls files where the `SST` record had an incorrect number of unique strings. [Issue #548]: https://github.com/tafia/calamine/issues/548 ## [0.30.0] - 2025-08-07 ### Changed - Unpinned the `zip.rs` dependency from v4.2.0 to allow cargo to choose the correct version for the user's rustc version. The Rust MSRV was bumped to v1.75.0 (which it should have been for `zip.rs` compatibility in previous releases). See the discussion at [Issue #527]. [Issue #527]: https://github.com/tafia/calamine/issues/527 ## [0.29.0] - 2025-07-17 ### Added - Add additional documentation and examples for the `Range`, `Cell`, `XlsxError` and `Table` structs, and `Xlsx` Table and Merge methods. Issue #459 ### Changed - Pin zip.rs to v4.2.0. The current latest release of `zip.rs`, v4.3.0, requires a MSRV of v1.85.0. This release pins `zip.rs` to v4.2.0 to allow users to maintain a MSRV of v1.73.0 for at least one more release. It is likely that `calamine` v0.30.0 or later will move back to the latest `zip.rs` v4.x and require rustc v1.85.0.
### Fixed - Fixed issue where XLSX files had Windows style directory separators for internal paths instead of the required Unix style separators. Issue #530. - Fixed several XLS parsing issues which could lead to out-of-memory errors. PR #525. - Fixed numeric underflow in `Xlsx::from_sparse()` and also ensured that the associated `Range` of cells would be in row-column order. PR #524. ## [0.28.0] - 2025-06-19 ### Changed - Bump zip to 4.0. ## [0.27.0] - 2025-04-22 ### Added - (xls): add one more `Error` variant related to formatting. ### Changed - Bump dependencies. - (xls): Invalid formats parsing. - Always parse string cell as string. - Pin zip crate to 2.5. - (xlsx): check 'closing' tag name with more prefixes. ## [0.26.1] - 2024-10-10 ### Fixed - Sparse cells expect 0 index rows, even when using `header_row`. ## [0.26.0] - 2024-10-08 ### Added - Ability to merge cells from xls and xlsx. - Options to keep first empty rows for xlsx. - Support consecutive repeated empty cells for ods. - New `header_row` config. ### Changed - Bump MSRV to 1.73. - Fix broken links in README. - Enable dates and pictures features in `docs.rs` build. - Fix broken fuzzer. ## [0.25.0] - 2024-05-25 ### Added - Add `is_error` and `get_error` methods to the `DataType` trait. - Add deserializer helper functions. - Support getting merged region. - `Range::headers` method. - Expose some `Dimensions` methods. ### Changed - Use `OnceLock` instead of `once_cell` crate (MSRV: 1.71). ### Fixed - Use case insensitive comparison when searching for file in xlsx. - Do not panic when reading cell format with invalid index. ## [0.24.0] - 2024-02-08 ### Added - Introduce a `DataType` trait implemented by both `Data` and `DataRef`. - `Data` and `DataType` now return `Some(0{.0})` and `Some(1{.0})` rather than `None` when `.as_i64` or `.as_f64` is used on a Bool value. - Detect xlsb/ods password protected files. - Introduce `is_x` methods for date and time variants. 
### Changed - **BREAKING**: rename `DataType` enum to `Data` and `DataTypeRef` to `DataRef`. - DateTime(f64) to DateTime(ExcelDateTime). ### Fixed - Getting tables names from xlsx workbooks without `_rels` files. ## [0.23.1] - 2023-12-19 ### Fixed - `worksheet_formula` not returning all formula. ## [0.23.0] - 2023-12-12 ### Added - New `DataTypeRef` available from `worksheet_range_ref` to reduce memory usage. - Detect if workbook is password protected. ### Changed - Add benchmark plot. ### Fixed - Truncated text in xls. ## [0.22.1] - 2023-10-08 ### Added - Support label cells for xls. ### Changed - Update GitHub actions. - Clippy. - Preallocate several buffers. ### Fixed - Regression on `Range::get`. - Spelling of formula error type. ## [0.22.0] - 2023-09-04 ### Added - Add support of sheet type and visibility. - Implement blank string handling. ### Changed - Improve `de_opt_f64` example. - Remove datetime notice from README. - Clippy. - Bump MSRV to 1.63 (breaking). - Set edition to 2021. ## [0.21.2] - 2023-06-25 ### Fixed - Formula with string not displaying properly. ## [0.21.1] - 2023-06-17 ### Changed - Bump MSRV to 1.60.0 due to log dependencies. ### Fixed - Xls: formula values ignored. - Xls: formula (string) not displayed properly. ## [0.21.0] - 2023-06-13 ### Added - Support for duration. ### Changed - Add MSRV. ### Fixed - (xlsx) support `r` attribute. - Support `PROJECTCOMPATVERSION` in vba. - Incorrect date parsing due to excel bug. ## [0.20.0] - 2023-05-29 ### Added - (all) parse format/style information to infer cell as datetime. - (ods) support number-columns-repeated attribute. ### Changed - Bump dependencies. - Multiple clippy refactorings. ## [0.19.2] - 2023-02-09 ### Added - Extract picture data by turning `picture` feature on. ## [0.19.1] - 2022-10-20 ### Fixed - Wrong range len calculation. - Date precision. ## [0.19.0] - 2022-10-20 ### Added - Always return sheet names in lexicographic order (`BTreeMap`). 
### Changed - Bump dependencies (quick-xml in particular and chrono). - Remove travis. ### Fixed - Several decoding issues in xls and xlsb. - Wrong decimal parsing. ## [0.18.0] - 2021-02-23 ### Added - Improve conversions from raw data to primitives. - Replace macro matches! by match expression to reduce MSRV. ### Changed - Fix two typos in README. ### Fixed - Allow empty value cells in xlsx. - Obscure xls parsing errors (#195). ## [0.17.0] - 2021-02-03 ### Added - Use `chunks_exact` instead of chunks where possible. - Detect date/time formatted cells in XLSX. - Brute force file detection if extension is not known. - Support xlsx sheet sizes beyond `u32::MAX`. ### Changed - Add regression tests that fail with miri. - Ensure doctest functions actually run. - Run cargo fmt to fix travis tests. ### Fixed - Make `to_u32`, `read_slice` safe and sound. - Security issue #199. - Fix Float reading for XLSB. ## [0.16.2] - 2020-09-26 ### Changed - Add `deserialize_with` example in README. - Correct MBSC to MBCS in vba.rs (misspelled before). - Use 2018 edition paths. ### Fixed - Skip phonetic run. - Fix XLS float parsing error. - Add the ability to read formula values from XLSB. - Support integral date types. ## [0.16.1] - 2019-11-20 ### Added - Make `Metadata.sheets` (and `Reader.sheet_names`) always return names in workbook order. ### Changed - Fix warnings in tests. ## [0.16.0] - 2019-10-11 ### Added - Deprecate failure and impl `std::error::Error` for all errors. - Add `dates` feature to enrich `DataType` with date conversions functions. ## [0.15.6] - 2019-08-24 ### Added - Update dependencies. ## [0.15.5] - 2019-07-15 ### Fixed - Wrong bound comparisons. ## [0.15.4] - 2019-04-11 ### Added - Improve deserializer. - Bump dependencies. ## [0.15.3] - 2018-12-14 ### Added - Add several new convenient fn to `DataType`. - Add a `Range::range` fn to get sub-ranges. - Add a new `Range::cells` iterator. - Implement `DoubleEndedIterator` when possible. 
- Add a `Range::get` fn (similar to slice's). ### Changed - Add some missing `size_hint` impl in iterators. - Add some `ExactSizeIterator`. ## [0.15.2] - 2018-12-14 ### Added - Consider empty cell as empty str if deserializing to str or String. ## [0.15.1] - 2018-12-13 ### Fixed - Xls - allow sectors ending after eof (truncate them!). ## [0.15.0] - 2018-12-13 ### Added - Codepage/`encoding_rs` for codepage mapping. ## [0.14.10] - 2018-11-23 ### Fixed - Serde map do not stop at first empty value. ## [0.14.9] - 2018-11-23 ### Fixed - Do not return map keys for empty cells. Fixes not working `#[serde(default)]`. ## [0.14.8] - 2018-11-23 ### Added - Bump dependencies. - Add a `RangeDeserializerBuilder::with_headers` fn to improve serde deserializer. ## [0.14.7] - 2018-10-23 ### Added - Ods, support *text:s* and *text:p*. ## [0.14.6] - 2018-09-20 ### Fixed - Support `MulRk` for xls files. ## [0.14.5] - 2018-08-28 ### Changed - Bump dependencies. ### Fixed - Properly parse richtext ods files. ## [0.14.4] - 2018-08-28 ### Added - Ods: display sheet names in order. ## [0.14.3] - 2018-08-09 ### Added - Handle 'covered cells' which are behind merge-cells in ODS. ## [0.14.2] - 2018-08-03 ### Changed - Bump dependencies. ### Fixed - Boolean detection and missing repeated cells in ODS. ## [0.14.1] - 2018-05-08 ### Fixed - Possibility of index out of bound in `get_value` and eventually in Index<(usize, usize)>. ## [0.14.0] - 2018-04-27 ### Added - Have Range `start`/`end` return None if the range is actually empty. - Have `Range::get_value` return an Option if the index is out of range. ## [0.13.1] - 2018-03-23 ### Added - Make `Range::from_sparse` public. ### Changed - Bump dependencies. ## [0.13.0] - 2018-01-27 ### Added - Migrate from error-chain to failure. - More documentation on error. - Bump dependencies (calamine, `encoding_rs` and zip). - Process any Read not only Files. ### Changed - Simplify Reader trait (enable direct Xlsx read etc ...). 
- Always initialize at creation. - Fix various typos. ## [0.12.1] - 2017-11-27 ### Added - Update dependencies. ## [0.12.0] - 2017-10-27 ### Added - Add serde deserialization. ## [0.11.8] - 2017-08-22 ### Changed - Update dependencies, in particular quick-xml 0.9.1. ## [0.11.7] - 2017-07-08 ### Changed - Bump dependencies. ### Fixed - Add a bound check when decoding cfb. ## [0.11.6] - 2017-07-05 ### Changed - Bump dependencies. - Ignore .bk files. ## [0.11.5] - 2017-05-12 ### Changed - Bump dependencies. ## [0.11.4] - 2017-05-08 ### Changed - Update to quick-xml 0.7.3 and `encoding_rs` 0.6.6. ## [0.11.3] - 2017-05-05 ### Added - Implement `Display` for `DataType` and `CellTypeError`. - Add a `CellType` alias trait. ## [0.11.2] - 2017-05-04 ### Changed - Update to quick-xml 0.7.1. ## [0.11.1] - 2017-05-03 ### Changed - Update `encoding_rs` to 0.6.2. - Add benches and avoid clearing a buffer supposed to be reused. ## [0.11.0] - 2017-04-27 ### Added - Add support for formula parsing/decoding. ### Changed - Make `Range` generic over its content. ### Fixed - Convert codepage 21010 as codepage 1200. - Support `EUC_KR` encoding. ## [0.10.2] - 2017-04-18 ### Fixed - Error while using a singlebyte encoding for xls files (`read_dbcs`). ## [0.10.1] - 2017-04-18 ### Fixed - Error while using a singlebyte encoding for xls files (`short_strings`). ## [0.10.0] - 2017-04-14 ### Added - Support defined names for named ranges. ### Changed - Better internal logic. ## [0.9.0] - 2017-04-12 ### Added - Add Index/IndexMut for Range. ### Changed - Rename `Excel` in `Sheets` to accommodate `OpenDocument`. ## [0.8.0] - 2017-04-12 ### Added - Add basic support for `OpenDocument` spreadsheets. - Force rustfmt on travis checks. ### Changed - Apply rustfmt. ## [0.7.0] - 2017-03-23 ### Changed - Update dependencies. - Rustfmt. ### Fixed - Extend appveyor paths to be able to use curl. - Extract richtext reading from `read_shared_strings` to `read_string`. 
- Enable namespaced xmls when parsing xlsx files. ## [0.6.0] - 2017-03-06 ### Changed - Bump dependencies. - Move from rust-encoding to `encoding_rs` (faster), loses some decoders. ## [0.5.1] - 2017-03-06 ### Changed - Bump to quick-xml 0.6.0 (supposedly faster). ## [0.5.0] - 2017-02-07 ### Added - Xlsx - support 'inlineStr' elements (`<is>` nodes). ### Changed - Rustfmt the code. - Bump dependencies (error-chain 0.8.1, quick-xml 0.5.0). ### Fixed - Xlsx - support sheetnames prefixed with 'xl/' or '/xl/'. ## [0.4.0] - 2017-01-09 ### Added - Adds a new `worksheet_range_by_index` function. - Adds new `ErrorKind`s. ### Changed - Replace `try!` with `?` operator. - Simplify `search_error` example by using a `run()` function. ## [0.3.3] - 2017-01-09 ### Changed - Update dependencies (error-chain and byteorder). ## [0.3.2] - 2016-11-27 ### Changed - Update dependencies. ## [0.3.1] - 2016-11-17 ### Changed - (xls) preload vba only instead of sheets only. - (vba) consume cfb in constructor and do not store cfb. ## [0.3.0] - 2016-11-16 ### Added - (all) better `Range` initialization via `Range::from_sparse`. - (all) several new fn in `Range` (`used_cells`, `start`, `end` ...). ### Changed - Adds a `range_eq!` macro in tests. ## [0.2.1] - 2016-11-15 ### Added - (xls) early exit if workbook is password protected. ### Fixed - (xls) allow directory start to empty sector if version = 3. - (vba) support all project codepage encodings. - (xls) better decoding based on codepage. - (xlsb) simplify setting values and early exit when stepping into an invalid `BrtRowHdr`. - (xlsb) fix record length calculation. ## [0.2.0] - 2016-11-14 ### Added - Add new methods for `Range`: `width`, `height`, `is_empty`. ### Changed - Less `unwrap`s, no unused imports. - Range bounds are now (`start`, `end`) instead of (`position`, `size`). ### Fixed - (all) allow range to resize when we try to set a value out of bounds.
## [0.1.3] - 2016-11-11 ### Fixed - (xls) better management of continue record for `rich_extended_strings`. ## [0.1.2] - 2016-11-11 ### Fixed - (all) return error when trying to set out of bound values in `Range`. - (xls) do a proper encoding when reading cells (force 2 bytes unicode instead of utf8). - (xls) support continue records. - (all) allow empty rows iterator. ## [0.1.1] - 2016-11-09 ### Fixed - Remove some development `println!`. ## [0.1.0] - 2016-11-09 ### Changed - First release. calamine-0.34.0/LICENSE-MIT.md000064400000000000000000000020721046102023000135150ustar 00000000000000The MIT License (MIT) Copyright (c) 2016 Johann Tuffe Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. calamine-0.34.0/README.md000064400000000000000000000322011046102023000127360ustar 00000000000000# calamine An Excel/OpenDocument Spreadsheets file reader/deserializer, in pure Rust. 
[![GitHub CI Rust tests](https://github.com/tafia/calamine/workflows/Rust/badge.svg)](https://github.com/tafia/calamine/actions) [![Build status](https://ci.appveyor.com/api/projects/status/njpnhq54h5hxsgel/branch/master?svg=true)](https://ci.appveyor.com/project/tafia/calamine/branch/master) [Documentation](https://docs.rs/calamine/) ## Description **calamine** is a pure Rust library to read and deserialize any spreadsheet file: - excel like (`xls`, `xlsx`, `xlsm`, `xlsb`, `xla`, `xlam`) - opendocument spreadsheets (`ods`) As long as your files are *simple enough*, this library should just work. ## Examples ### Serde deserialization It is as simple as: ```rust use calamine::{open_workbook, Error, Xlsx, Reader, RangeDeserializerBuilder}; fn example() -> Result<(), Error> { let path = format!("{}/tests/temperature.xlsx", env!("CARGO_MANIFEST_DIR")); let mut workbook: Xlsx<_> = open_workbook(path)?; let range = workbook.worksheet_range("Sheet1")?; let mut iter = RangeDeserializerBuilder::new().from_range(&range)?; if let Some(result) = iter.next() { let (label, value): (String, f64) = result?; assert_eq!(label, "celsius"); assert_eq!(value, 22.2222); Ok(()) } else { Err(From::from("expected at least one record but got none")) } } ``` Calamine provides helper functions to deal with invalid type values. For instance, to deserialize a column which should contain floats but may also contain invalid values (i.e. 
strings), you can use the [`deserialize_as_f64_or_none`](https://docs.rs/calamine/latest/calamine/fn.deserialize_as_f64_or_none.html) helper function with Serde's [`deserialize_with`](https://serde.rs/field-attrs.html) field attribute: ```rust use calamine::{deserialize_as_f64_or_none, open_workbook, RangeDeserializerBuilder, Reader, Xlsx}; use serde::Deserialize; #[derive(Deserialize)] struct Record { metric: String, #[serde(deserialize_with = "deserialize_as_f64_or_none")] value: Option<f64>, } fn main() -> Result<(), Box<dyn std::error::Error>> { let path = format!("{}/tests/excel.xlsx", env!("CARGO_MANIFEST_DIR")); let mut excel: Xlsx<_> = open_workbook(path)?; let range = excel .worksheet_range("Sheet1") .map_err(|_| calamine::Error::Msg("Cannot find Sheet1"))?; let iter_records = RangeDeserializerBuilder::with_headers(&["metric", "value"]).from_range(&range)?; for result in iter_records { let record: Record = result?; println!("metric={:?}, value={:?}", record.metric, record.value); } Ok(()) } ``` The [`deserialize_as_f64_or_none`](https://docs.rs/calamine/latest/calamine/fn.deserialize_as_f64_or_none.html) function discards all invalid values. If instead you would like to return them as `String`s, you can use the similar [`deserialize_as_f64_or_string`](https://docs.rs/calamine/latest/calamine/fn.deserialize_as_f64_or_string.html) function. ### Reader: Simple ```rust use calamine::{Reader, Xlsx, open_workbook}; let mut excel: Xlsx<_> = open_workbook("file.xlsx").unwrap(); if let Ok(r) = excel.worksheet_range("Sheet1") { for row in r.rows() { println!("row={:?}, row[0]={:?}", row, row[0]); } } ``` ### Reader: With header row ```rs use calamine::{HeaderRow, Reader, Xlsx, open_workbook}; let mut excel: Xlsx<_> = open_workbook("file.xlsx").unwrap(); let sheet1 = excel .with_header_row(HeaderRow::Row(3)) .worksheet_range("Sheet1") .unwrap(); ``` Note that `xlsx` and `xlsb` files support lazy loading, so specifying a header row takes effect immediately when reading a sheet range.
In contrast, for `xls` and `ods` files, all sheets are loaded at once when opening the workbook with default settings. As a result, setting the header row only applies afterward and does not provide any performance benefits. ### Reader: More complex Let's assume - the file type (xls, xlsx ...) cannot be known at static time - we need to get all data from the workbook - we need to parse the vba - we need to see the defined names - and the formula! ```rust use calamine::{Reader, open_workbook_auto, Xlsx, DataType}; // opens a new workbook let path = ...; // we do not know the file type let mut workbook = open_workbook_auto(path).expect("Cannot open file"); // Read whole worksheet data and provide some statistics if let Some(Ok(range)) = workbook.worksheet_range("Sheet1") { let total_cells = range.get_size().0 * range.get_size().1; let non_empty_cells: usize = range.used_cells().count(); println!("Found {} cells in 'Sheet1', including {} non empty cells", total_cells, non_empty_cells); // alternatively, we can manually filter rows assert_eq!(non_empty_cells, range.rows() .flat_map(|r| r.iter().filter(|&c| c != &DataType::Empty)).count()); } // Check if the workbook has a vba project if let Ok(Some(vba)) = workbook.vba_project() { let module1 = vba.get_module("Module 1").unwrap(); println!("Module 1 code:"); println!("{}", module1); for r in vba.get_references() { if r.is_missing() { println!("Reference {} is broken or not accessible", r.name); } } } // You can also get defined names definition (string representation only) for name in workbook.defined_names() { println!("name: {}, formula: {}", name.0, name.1); } // Now get all formula! 
let sheets = workbook.sheet_names().to_owned(); for s in sheets { println!("found {} formula in '{}'", workbook .worksheet_formula(&s) .expect("error while getting formula") .rows().flat_map(|r| r.iter().filter(|f| !f.is_empty())) .count(), s); } ``` ## Crate Features The following is a list of the optional features supported by the `calamine` crate. They are all off by default. - `chrono`: Adds support for Chrono date/time types to the API. - `dates`: A deprecated, backwards-compatible synonym for the `chrono` feature. - `picture`: Adds support for reading raw data for pictures in spreadsheets. A `calamine` feature can be enabled in your `Cargo.toml` file as follows: ```bash cargo add calamine -F chrono ``` ### Others Browse the [examples](https://github.com/tafia/calamine/tree/master/examples) directory. ## Performance As `calamine` is read-only, the comparisons will only involve reading an Excel `xlsx` file and then iterating over the rows. Along with `calamine`, three other libraries were chosen, from three different languages: - [`excelize`](https://github.com/qax-os/excelize) written in `go` - [`ClosedXML`](https://github.com/ClosedXML/ClosedXML) written in `C#` - [`openpyxl`](https://foss.heptapod.net/openpyxl/openpyxl) written in `python` The benchmarks were done using this [dataset](https://raw.githubusercontent.com/wiki/jqnatividad/qsv/files/NYC_311_SR_2010-2020-sample-1M.7z), a `186MB` `xlsx` file when the `csv` is converted. The plotting data was collected with the [`sysinfo`](https://github.com/GuillaumeGomez/sysinfo) crate, at a sample interval of `200ms`. The program samples the reported values for the running process and records them.
The programs are all structured to follow the same constructs: `calamine`: ```rust use calamine::{open_workbook, Reader, Xlsx}; fn main() { // Open workbook let mut excel: Xlsx<_> = open_workbook("NYC_311_SR_2010-2020-sample-1M.xlsx").expect("failed to find file"); // Get worksheet let sheet = excel .worksheet_range("NYC_311_SR_2010-2020-sample-1M") .unwrap() .unwrap(); // iterate over rows for _row in sheet.rows() {} } ``` `excelize`: ```go package main import ( "fmt" "github.com/xuri/excelize/v2" ) func main() { // Open workbook file, err := excelize.OpenFile(`NYC_311_SR_2010-2020-sample-1M.xlsx`) if err != nil { fmt.Println(err) return } defer func() { // Close the spreadsheet. if err := file.Close(); err != nil { fmt.Println(err) } }() // Select worksheet rows, err := file.Rows("NYC_311_SR_2010-2020-sample-1M") if err != nil { fmt.Println(err) return } // Iterate over rows for rows.Next() { } } ``` `ClosedXML`: ```csharp using ClosedXML.Excel; internal class Program { private static void Main(string[] args) { // Open workbook using var workbook = new XLWorkbook("NYC_311_SR_2010-2020-sample-1M.xlsx"); // Get Worksheet // "NYC_311_SR_2010-2020-sample-1M" var worksheet = workbook.Worksheet(1); // Iterate over rows foreach (var row in worksheet.Rows()) { } } } ``` `openpyxl`: ```python from openpyxl import load_workbook # Open workbook wb = load_workbook( filename=r'NYC_311_SR_2010-2020-sample-1M.xlsx', read_only=True) # Get worksheet ws = wb['NYC_311_SR_2010-2020-sample-1M'] # Iterate over rows for row in ws.rows: _ = row # Close the workbook after reading wb.close() ``` ### Benchmarks The benchmarking was done using [`hyperfine`](https://github.com/sharkdp/hyperfine) with `--warmup 3` on an `AMD RYZEN 9 5900X @ 4.0GHz` running `Windows 11`. Both `calamine` and `ClosedXML` were built in release mode. 
```bash 0.22.1 calamine.exe Time (mean ± σ): 25.278 s ± 0.424 s [User: 24.852 s, System: 0.470 s] Range (min … max): 24.980 s … 26.369 s 10 runs v2.8.0 excelize.exe Time (mean ± σ): 44.254 s ± 0.574 s [User: 46.071 s, System: 7.754 s] Range (min … max): 42.947 s … 44.911 s 10 runs 0.102.1 closedxml.exe Time (mean ± σ): 178.343 s ± 3.673 s [User: 177.442 s, System: 2.612 s] Range (min … max): 173.232 s … 185.086 s 10 runs 3.0.10 openpyxl.py Time (mean ± σ): 238.554 s ± 1.062 s [User: 238.016 s, System: 0.661 s] Range (min … max): 236.798 s … 240.167 s 10 runs ``` `calamine` is 1.75x faster than `excelize`, 7.05x faster than `ClosedXML`, and 9.43x faster than `openpyxl`. The spreadsheet has a range of 1,000,001 rows and 41 columns, for a total of 41,000,041 cells in the range. Of those, 28,056,975 cells had values. Going off of that number: - `calamine` => 1,122,279 cells per second - `excelize` => 633,998 cells per second - `ClosedXML` => 157,320 cells per second - `openpyxl` => 117,612 cells per second ### Plots #### Disk Read ![bytes_from_disk](https://github.com/RoloEdits/calamine/assets/12489689/fcca1147-d73f-4d1c-b273-e7e4c183ab29) As stated, the filesize on disk is `186MB`: - `calamine` => `186MB` - `ClosedXML` => `208MB`. - `openpyxl` => `192MB`. - `excelize` => `1.5GB`. When asking one of the maintainers of `excelize`, I got this [response](https://github.com/qax-os/excelize/issues/1695#issuecomment-1772239230): > To avoid high memory usage for reading large files, this library allows user-specific UnzipXMLSizeLimit options when opening the workbook, to set the memory limit on the unzipping worksheet and shared string table in bytes, worksheet XML will be extracted to the system temporary directory when the file size is over this value, so you can see that data written in reading mode, and you can change the default for that to avoid this behavior.
> > \- xuri #### Disk Write ![bytes_to_disk](https://github.com/RoloEdits/calamine/assets/12489689/befa9893-7658-41a7-8cbd-b0ce5a7d9341) As seen in the previous section, `excelize` is writing to disk to save memory. The others don't employ that kind of mechanism. #### Memory ![mem_usage](https://github.com/RoloEdits/calamine/assets/12489689/c83fdf6b-1442-4e22-8eca-84cbc1db4a26) ![virt_mem_usage](https://github.com/RoloEdits/calamine/assets/12489689/840a96ed-33d7-44f7-8276-80bb7a02557f) > [!NOTE] > `ClosedXML` was reporting a constant `2.5TB` of virtual memory usage, so it was excluded from the chart. The stepping and falling for `calamine` comes from the growth of `Vec`s and the freeing of memory right after, with the memory usage dropping down again. The sudden jump at the end is when the sheet is being read into memory. The others, being garbage collected, have a more linear climb all the way through. #### CPU ![cpu_usage](https://github.com/RoloEdits/calamine/assets/12489689/c3aa55a8-b008-48ee-ba04-c08bd91c1f6f) Very noisy chart, but `excelize`'s spikes must be from the GC? ## Unsupported Many (most) parts of the specifications are not implemented; the focus has been put on reading cell **values** and **vba** code. The main unsupported items are: - no support for writing excel files, this is a read-only library - no support for reading extra content, such as formatting, excel parameter, encrypted components etc ... - no support for reading VB for opendocuments ## Credits Thanks to [xlsx-js](https://github.com/SheetJS/js-xlsx) developers! This library is by far the simplest open source implementation I could find and helps make sense of the official documentation. Thanks also to all the contributors! ## License MIT calamine-0.34.0/examples/excel_to_csv.rs000064400000000000000000000056211046102023000163260ustar 00000000000000// SPDX-License-Identifier: MIT // // Copyright 2016-2025, Johann Tuffe. //!
An example for using the `calamine` crate to convert an Excel file to CSV. //! //! Converts XLSX, XLSM, XLSB, and XLS files. The filename and sheet name must //! be specified as command line arguments. The output CSV will be written to a //! file with the same name as the input file, but with a `.csv` extension. use std::env; use std::fs::File; use std::io::{BufWriter, Write}; use std::path::PathBuf; use calamine::{open_workbook_auto, Data, Range, Reader}; // usage: cargo run --example excel_to_csv file.xls[xmb] sheet_name // // Where: // - `file.xls[xmb]` is the Excel file to convert. Required. // - `sheet_name` is the name of the sheet to convert. Required. // // The output will be written to a file with the same name as the input file, // including the path, but with a `.csv` extension. // fn main() { let excel_file = env::args() .nth(1) .expect("Please provide an excel file to convert"); let sheet_name = env::args() .nth(2) .expect("Expecting a sheet name as second argument"); let excel_path = PathBuf::from(excel_file); match excel_path.extension().and_then(|s| s.to_str()) { Some("xlsx") | Some("xlsm") | Some("xlsb") | Some("xls") => (), _ => panic!("Expecting an excel file"), } let csv_path = excel_path.with_extension("csv"); let mut csv_file = BufWriter::new(File::create(csv_path).unwrap()); let mut workbook = open_workbook_auto(&excel_path).unwrap(); let range = workbook.worksheet_range(&sheet_name).unwrap(); write_to_csv(&mut csv_file, &range).unwrap(); } // Write the Excel data as strings to a CSV file. Uses a semicolon (`;`) as the // field separator. // // Note, this is a simplified version of CSV and doesn't handle quoting of // separators or other special cases. See the `csv.rs` crate for a more robust // solution. 
fn write_to_csv(output_file: &mut W, range: &Range) -> std::io::Result<()> { let max_column = range.get_size().1 - 1; for rows in range.rows() { for (col_number, cell_data) in rows.iter().enumerate() { match cell_data { Data::Empty => Ok(()), Data::Int(i) => write!(output_file, "{i}"), Data::Bool(b) => write!(output_file, "{b}"), Data::Error(e) => write!(output_file, "{e:?}"), Data::Float(f) => write!(output_file, "{f}"), Data::DateTime(d) => write!(output_file, "{}", d.as_f64()), Data::String(s) | Data::DateTimeIso(s) | Data::DurationIso(s) => { write!(output_file, "{s}") } }?; // Write the field separator except for the last column. if col_number != max_column { write!(output_file, ";")?; } } write!(output_file, "\r\n")?; } Ok(()) } calamine-0.34.0/examples/search_errors.rs000064400000000000000000000077261046102023000165220ustar 00000000000000// SPDX-License-Identifier: MIT // // Copyright 2016-2025, Johann Tuffe. //! An example for using the `calamine` crate to to search a directory for Excel //! files and check them for errors. //! //! Recursively searches for XLSX, XLSM, XLSB, and XLS Excel files and parses //! them to check for any `calamine` errors. Also checks for and counts the //! number of missing VBA references or the number of cells with Excel errors. //! use glob::{glob, GlobError}; use std::env; use std::fs::File; use std::io::{BufWriter, Write}; use std::path::PathBuf; use calamine::{open_workbook_auto, Data, Error, Reader}; #[derive(Debug)] #[allow(dead_code)] // Simple error type to handle the various errors that can occur. enum FileStatus { VbaError(Error), RangeError(Error), WorkbookOpenError(Error), Glob(GlobError), } // usage: cargo run --example search_errors [dir] // // Where: // // - `[dir]` is the directory to search for Excel files. Defaults to `.`. // // The analysis is written to an output file called `{dir}_errors.csv`. If no // directory is specified, it defaults to the current directory. 
The output file // will contain the file path, the number of missing VBA references, and the // number of cells with errors for each Excel file found. Alternatively, if an // error occurs while processing a file, it will be logged in the output. // fn main() -> Result<(), FileStatus> { let search_dir = env::args().nth(1).unwrap_or_else(|| ".".to_string()); let file_pattern = format!("{search_dir}/**/*.xl*"); let mut file_count = 0; // Strip/convert any directory characters to create an output filename. let mut output_filename = file_pattern .chars() .take_while(|c| *c != '*') .filter_map(|c| match c { ':' => None, '/' | '\\' | ' ' => Some('_'), c => Some(c), }) .collect::(); // Append "_errors.csv" to the output filename. output_filename.push_str("errors.csv"); // Use a default output filename for the default search directory. if search_dir == "." { output_filename = "errors.csv".to_string(); } let mut output_file = BufWriter::new(File::create(&output_filename).unwrap()); // Iterate through any Excel files that were found. for file in glob(&file_pattern).unwrap() { file_count += 1; let file = file.map_err(FileStatus::Glob)?; match analyze(&file) { Ok((missing_vba_refs, cell_errors)) => { writeln!( output_file, "{file:?}: Missing VBA refs = {missing_vba_refs:?}. Cell errors = {cell_errors}." ) } Err(e) => writeln!(output_file, "{file:?}: Error = {e:?}."), } .unwrap_or_else(|e| println!("{e:?}")); } println!("Analyzed {file_count} excel files. See '{output_filename}' for analysis."); Ok(()) } // Function to analyze a single Excel file for errors, missing VBA references // and cell errors. fn analyze(file: &PathBuf) -> Result<(Option, usize), FileStatus> { let mut workbook = open_workbook_auto(file).map_err(FileStatus::WorkbookOpenError)?; let mut num_cell_errors = 0; let mut num_missing_vba_refs = None; // Check if the workbook has a VBA project and count missing references. if let Some(vba) = workbook.vba_project().map_err(FileStatus::VbaError)? 
{ num_missing_vba_refs = Some( vba.get_references() .iter() .filter(|r| r.is_missing()) .count(), ); } // Iterate through all sheets and count cell errors. for sheet_name in workbook.sheet_names() { let range = workbook .worksheet_range(&sheet_name) .map_err(FileStatus::RangeError)?; num_cell_errors += range .rows() .flat_map(|r| r.iter().filter(|c| matches!(**c, Data::Error(_)))) .count(); } Ok((num_missing_vba_refs, num_cell_errors)) } calamine-0.34.0/src/auto.rs000064400000000000000000000141261046102023000135720ustar 00000000000000// SPDX-License-Identifier: MIT // // Copyright 2016-2025, Johann Tuffe. //! A module to convert file extension to reader use crate::errors::Error; use crate::vba::VbaProject; use crate::{ open_workbook, open_workbook_from_rs, Data, DataRef, HeaderRow, Metadata, Ods, Range, Reader, ReaderRef, Xls, Xlsb, Xlsx, }; use std::fs::File; use std::io::BufReader; use std::path::Path; /// A wrapper over all sheets when the file type is not known at static time pub enum Sheets { /// Xls reader Xls(Xls), /// Xlsx reader Xlsx(Xlsx), /// Xlsb reader Xlsb(Xlsb), /// Ods reader Ods(Ods), } /// Opens a workbook and define the file type at runtime. /// /// Whenever possible use the statically known `open_workbook` function instead pub fn open_workbook_auto

(path: P) -> Result>, Error> where P: AsRef, { let path = path.as_ref(); Ok(match path.extension().and_then(|e| e.to_str()) { Some("xls" | "xla") => Sheets::Xls(open_workbook(path).map_err(Error::Xls)?), Some("xlsx" | "xlsm" | "xlam") => Sheets::Xlsx(open_workbook(path).map_err(Error::Xlsx)?), Some("xlsb") => Sheets::Xlsb(open_workbook(path).map_err(Error::Xlsb)?), Some("ods") => Sheets::Ods(open_workbook(path).map_err(Error::Ods)?), _ => { if let Ok(ret) = open_workbook::, _>(path) { return Ok(Sheets::Xls(ret)); } else if let Ok(ret) = open_workbook::, _>(path) { return Ok(Sheets::Xlsx(ret)); } else if let Ok(ret) = open_workbook::, _>(path) { return Ok(Sheets::Xlsb(ret)); } else if let Ok(ret) = open_workbook::, _>(path) { return Ok(Sheets::Ods(ret)); } else { return Err(Error::Msg("Cannot detect file format")); }; } }) } /// Opens a workbook from the given bytes. /// /// Whenever possible use the statically known `open_workbook_from_rs` function instead pub fn open_workbook_auto_from_rs(data: RS) -> Result, Error> where RS: std::io::Read + std::io::Seek + Clone, { if let Ok(ret) = open_workbook_from_rs::, RS>(data.clone()) { Ok(Sheets::Xls(ret)) } else if let Ok(ret) = open_workbook_from_rs::, RS>(data.clone()) { Ok(Sheets::Xlsx(ret)) } else if let Ok(ret) = open_workbook_from_rs::, RS>(data.clone()) { Ok(Sheets::Xlsb(ret)) } else if let Ok(ret) = open_workbook_from_rs::, RS>(data) { Ok(Sheets::Ods(ret)) } else { Err(Error::Msg("Cannot detect file format")) } } impl Reader for Sheets where RS: std::io::Read + std::io::Seek, { type Error = Error; /// Creates a new instance. 
fn new(_reader: RS) -> Result { Err(Error::Msg("Sheets must be created from a Path")) } fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self { match self { Sheets::Xls(e) => { e.with_header_row(header_row); } Sheets::Xlsx(e) => { e.with_header_row(header_row); } Sheets::Xlsb(e) => { e.with_header_row(header_row); } Sheets::Ods(e) => { e.with_header_row(header_row); } } self } /// Gets `VbaProject` fn vba_project(&mut self) -> Result, Self::Error> { match self { Sheets::Xls(e) => e.vba_project().map_err(Error::Xls), Sheets::Xlsx(e) => e.vba_project().map_err(Error::Xlsx), Sheets::Xlsb(e) => e.vba_project().map_err(Error::Xlsb), Sheets::Ods(e) => e.vba_project().map_err(Error::Ods), } } /// Initialize fn metadata(&self) -> &Metadata { match self { Sheets::Xls(e) => e.metadata(), Sheets::Xlsx(e) => e.metadata(), Sheets::Xlsb(e) => e.metadata(), Sheets::Ods(e) => e.metadata(), } } /// Read worksheet data in corresponding worksheet path fn worksheet_range(&mut self, name: &str) -> Result, Self::Error> { match self { Sheets::Xls(e) => e.worksheet_range(name).map_err(Error::Xls), Sheets::Xlsx(e) => e.worksheet_range(name).map_err(Error::Xlsx), Sheets::Xlsb(e) => e.worksheet_range(name).map_err(Error::Xlsb), Sheets::Ods(e) => e.worksheet_range(name).map_err(Error::Ods), } } /// Read worksheet formula in corresponding worksheet path fn worksheet_formula(&mut self, name: &str) -> Result, Self::Error> { match self { Sheets::Xls(e) => e.worksheet_formula(name).map_err(Error::Xls), Sheets::Xlsx(e) => e.worksheet_formula(name).map_err(Error::Xlsx), Sheets::Xlsb(e) => e.worksheet_formula(name).map_err(Error::Xlsb), Sheets::Ods(e) => e.worksheet_formula(name).map_err(Error::Ods), } } fn worksheets(&mut self) -> Vec<(String, Range)> { match self { Sheets::Xls(e) => e.worksheets(), Sheets::Xlsx(e) => e.worksheets(), Sheets::Xlsb(e) => e.worksheets(), Sheets::Ods(e) => e.worksheets(), } } #[cfg(feature = "picture")] fn pictures(&self) -> Option)>> { match self { 
Sheets::Xls(e) => e.pictures(), Sheets::Xlsx(e) => e.pictures(), Sheets::Xlsb(e) => e.pictures(), Sheets::Ods(e) => e.pictures(), } } } impl ReaderRef for Sheets where RS: std::io::Read + std::io::Seek, { fn worksheet_range_ref<'a>( &'a mut self, name: &str, ) -> Result>, Self::Error> { match self { Sheets::Xlsx(e) => e.worksheet_range_ref(name).map_err(Error::Xlsx), Sheets::Xlsb(e) => e.worksheet_range_ref(name).map_err(Error::Xlsb), Sheets::Xls(_) => unimplemented!(), Sheets::Ods(_) => unimplemented!(), } } } calamine-0.34.0/src/cfb.rs000064400000000000000000000347421046102023000133620ustar 00000000000000// SPDX-License-Identifier: MIT // // Copyright 2016-2025, Johann Tuffe. //! Compound File Binary format MS-CFB use std::borrow::Cow; use std::cmp::min; use std::io::Read; use log::debug; use encoding_rs::{Encoding, UTF_16LE, UTF_8}; use crate::utils::*; const RESERVED_SECTORS: u32 = 0xFFFF_FFFA; const DIFSECT: u32 = 0xFFFF_FFFC; // const FATSECT: u32 = 0xFFFF_FFFD; const ENDOFCHAIN: u32 = 0xFFFF_FFFE; //const FREESECT: u32 = 0xFFFF_FFFF; /// A Cfb specific error enum #[derive(Debug)] pub enum CfbError { Io(std::io::Error), Ole, EmptyRootDir, StreamNotFound(String), Invalid { name: &'static str, expected: &'static str, found: u16, }, CodePageNotFound(u16), SectorBeyondEof(u32), } impl From for CfbError { fn from(e: std::io::Error) -> CfbError { CfbError::Io(e) } } impl std::fmt::Display for CfbError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { CfbError::Io(e) => write!(f, "I/O error: {e}"), CfbError::Ole => write!(f, "Invalid OLE signature (not an office document?)"), CfbError::EmptyRootDir => write!(f, "Empty Root directory"), CfbError::StreamNotFound(e) => write!(f, "Cannot find {e} stream"), CfbError::Invalid { name, expected, found, } => write!(f, "Invalid {name}, expecting {expected} found {found:X}"), CfbError::CodePageNotFound(e) => write!(f, "Codepage {e:X} not found"), CfbError::SectorBeyondEof(i) => write!(f, 
"Sector {i} points past end of file"), } } } impl std::error::Error for CfbError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { CfbError::Io(e) => Some(e), _ => None, } } } /// A struct for managing Compound File Binary format #[derive(Debug, Clone)] pub struct Cfb { directories: Vec, sectors: Sectors, fats: Vec, mini_sectors: Sectors, mini_fats: Vec, } impl Cfb { /// Create a new `Cfb` /// /// Starts reading project metadata (header, directories, sectors and minisectors). pub fn new(mut reader: &mut R, len: usize) -> Result { // load header let (h, mut difat) = Header::from_reader(&mut reader)?; let sector_size = h.sector_size as usize; let maxid = len / sector_size; let mut sectors = Sectors::new(sector_size, Vec::with_capacity(1024), maxid as u32); // load fat and dif sectors debug!("load difat {h:?}"); let mut sector_id = h.difat_start; while sector_id < RESERVED_SECTORS { difat.extend(to_u32(sectors.get(sector_id, reader)?)); sector_id = difat.pop().unwrap(); //TODO: check if in infinite loop } // load the FATs debug!("load fat (len {})", h.fat_len); let mut fats = Vec::with_capacity(h.fat_len); for id in difat.into_iter().filter(|id| *id < DIFSECT) { fats.extend(to_u32(sectors.get(id, reader)?)); } // get the list of directory sectors debug!("load directories"); // we want to read the full chain, so pass no size restriction let dirs = sectors.get_chain(h.dir_start, &fats, reader, usize::MAX)?; let dirs = dirs .chunks(128) .map(|c| Directory::from_slice(c, h.sector_size)) .collect::>(); if dirs.is_empty() || (h.version != 3 && dirs[0].start == ENDOFCHAIN) { return Err(CfbError::EmptyRootDir); } // load the mini streams debug!("load minis {dirs:?}"); let (mini_fats, ministream) = if h.mini_fat_len > 0 { let ministream = sectors.get_chain(dirs[0].start, &fats, reader, dirs[0].len)?; let minifat = sectors.get_chain( h.mini_fat_start, &fats, reader, h.mini_fat_len * h.sector_size as usize, )?; let minifat = 
to_u32(&minifat).collect(); (minifat, ministream) } else { (Vec::new(), Vec::new()) }; Ok(Cfb { directories: dirs, sectors, fats, mini_sectors: Sectors::new(64, ministream, (maxid * 8) as u32), mini_fats, }) } /// Checks if directory exists pub fn has_directory(&self, name: &str) -> bool { self.directories.iter().any(|d| &*d.name == name) } /// Gets a stream by name out of directories pub fn get_stream(&mut self, name: &str, r: &mut R) -> Result, CfbError> { match self.directories.iter().find(|d| &*d.name == name) { None => Err(CfbError::StreamNotFound(name.to_string())), Some(d) => { if d.len < 4096 { // TODO: Study the possibility to return a `VecArray` (stack allocated) self.mini_sectors .get_chain(d.start, &self.mini_fats, r, d.len) } else { self.sectors.get_chain(d.start, &self.fats, r, d.len) } } } } } /// A hidden struct which defines cfb files structure #[derive(Debug)] struct Header { version: u16, sector_size: SectorSize, dir_start: u32, fat_len: usize, mini_fat_len: usize, mini_fat_start: u32, difat_start: u32, } #[derive(Clone, Copy, Debug, PartialEq, Eq)] enum SectorSize { Small = 512, Large = 4096, } impl Header { fn from_reader(f: &mut R) -> Result<(Header, Vec), CfbError> { let mut buf = [0u8; 512]; f.read_exact(&mut buf)?; // check ole signature let signature = buf .get(0..8) .map(|slice| u64::from_le_bytes(slice.try_into().unwrap())); if signature != Some(0xE11A_B1A1_E011_CFD0) { return Err(CfbError::Ole); } let version = read_u16(&buf[26..28]); let sector_size = match read_u16(&buf[30..32]) { 0x0009 => SectorSize::Small, 0x000C => { // sector size is 4096 bytes, but header is 512 bytes, // so the remaining sector bytes have to be read let mut buf_end = [0u8; 4096 - 512]; f.read_exact(&mut buf_end)?; SectorSize::Large } s => { return Err(CfbError::Invalid { name: "sector shift", expected: "0x09 or 0x0C", found: s, }); } }; if read_u16(&buf[32..34]) != 0x0006 { return Err(CfbError::Invalid { name: "minisector shift", expected: "0x06", found: 
read_u16(&buf[32..34]), }); }

        let fat_len = read_usize(&buf[44..48]);
        let dir_start = read_u32(&buf[48..52]);
        let mini_fat_start = read_u32(&buf[60..64]);
        let mini_fat_len = read_usize(&buf[64..68]);
        let difat_start = read_u32(&buf[68..72]);
        // The number of DIFAT sectors is the 4-byte header field at offset 72
        // (MS-CFB compound file header). The previous `62..76` range started in
        // the middle of the mini FAT fields, so the capacity hint below was
        // derived from unrelated bytes (assuming `read_usize` decodes the
        // leading bytes of the slice -- TODO confirm against `crate::utils`).
        let difat_len = read_usize(&buf[72..76]);
        let mut difat = Vec::with_capacity(difat_len);
        // The remainder of the 512-byte header holds the first DIFAT entries.
        difat.extend(to_u32(&buf[76..512]));

        Ok((
            Header {
                version,
                sector_size,
                dir_start,
                fat_len,
                mini_fat_len,
                mini_fat_start,
                difat_start,
            },
            difat,
        ))
    }
}

/// A struct corresponding to the elementary block of memory
///
/// `data` will persist in memory to ensure the file is read once
#[derive(Debug, Clone)]
struct Sectors {
    data: Vec<u8>,
    size: usize,
    maxid: u32,
}

impl Sectors {
    /// Wraps already-loaded sector `data` made of `size`-byte sectors; ids
    /// greater than `maxid` are rejected by [`Sectors::get`].
    fn new(size: usize, data: Vec<u8>, maxid: u32) -> Sectors {
        Sectors { data, size, maxid }
    }

    /// Returns the bytes of sector `id`, reading from `r` into `data` when the
    /// sector is not buffered yet.
    ///
    /// If `r` reaches EOF before the sector is complete, only the bytes that
    /// were actually read are returned (possibly none).
    ///
    /// # Errors
    ///
    /// `CfbError::SectorBeyondEof` when `id` is greater than `maxid`, or
    /// `CfbError::Io` when reading from `r` fails.
    fn get<R: Read>(&mut self, id: u32, r: &mut R) -> Result<&[u8], CfbError> {
        if self.maxid < id {
            return Err(CfbError::SectorBeyondEof(id));
        }

        let start = id as usize * self.size;
        let end = start + self.size;
        if end > self.data.len() {
            let mut len = self.data.len();
            self.data.resize(end, 0);
            // read_exact or stop if EOF
            while len < end {
                let read = r.read(&mut self.data[len..end])?;
                if read == 0 {
                    // Clamp the start: when EOF is hit before the requested
                    // sector even begins (`len < start`), `start..len` would be
                    // an inverted range and panic. Return an empty slice
                    // instead.
                    return Ok(&self.data[start.min(len)..len]);
                }
                len += read;
            }
        }
        Ok(&self.data[start..end])
    }

    /// Collects up to `len` bytes by following the sector chain that starts at
    /// `sector_id`, using `fats` to find each next sector.
    ///
    /// The walk stops at `ENDOFCHAIN` or as soon as `len` bytes were gathered.
    ///
    /// # Errors
    ///
    /// Propagates errors from [`Sectors::get`] and returns
    /// `CfbError::SectorBeyondEof` when a sector id has no entry in `fats`
    /// (malformed file) rather than panicking on the index.
    fn get_chain<R: Read>(
        &mut self,
        mut sector_id: u32,
        fats: &[u32],
        r: &mut R,
        len: usize,
    ) -> Result<Vec<u8>, CfbError> {
        let mut chain = Vec::new();
        while sector_id != ENDOFCHAIN {
            let sector = self.get(sector_id, r)?;
            // Never copy more than what is still missing to reach `len`.
            let remaining = len - chain.len();
            let to_copy = sector.len().min(remaining);
            chain.extend_from_slice(&sector[..to_copy]);
            if chain.len() >= len {
                break;
            }
            // NOTE(review): a cyclic FAT combined with `len == usize::MAX` is
            // not detected here and would keep growing `chain`.
            sector_id = match fats.get(sector_id as usize) {
                Some(next) => *next,
                None => return Err(CfbError::SectorBeyondEof(sector_id)),
            };
        }
        Ok(chain)
    }
}

/// A struct representing sector organizations, behaves similarly to a tree
#[derive(Debug, Clone)]
struct Directory {
    name: String,
    start: u32,
    len: usize,
}

impl Directory {
    fn from_slice(buf: &[u8], sector_size: SectorSize) -> Directory {
        let mut name = UTF_16LE.decode(&buf[..64]).0.into_owned();
        if let Some(l) =
name.as_bytes().iter().position(|b| *b == 0) { name.truncate(l); } let start = read_u32(&buf[116..120]); let len: usize = if sector_size == SectorSize::Small { read_u32(&buf[120..124]).try_into().unwrap() } else { read_u64(&buf[120..128]).try_into().unwrap() }; Directory { start, len, name } } } /// Decompresses stream pub fn decompress_stream(s: &[u8]) -> Result, CfbError> { const POWER_2: [usize; 16] = [ 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9, 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15, ]; debug!("decompress stream"); let mut res = Vec::new(); if s[0] != 0x01 { return Err(CfbError::Invalid { name: "signature", expected: "0x01", found: s[0] as u16, }); } let mut i = 1; while i < s.len() { let chunk_header = read_u16(&s[i..]); i += 2; // each 'chunk' is 4096 wide, let's reserve that space let start = res.len(); res.reserve(4096); let chunk_size = chunk_header & 0x0FFF; let chunk_signature = (chunk_header & 0x7000) >> 12; let chunk_flag = (chunk_header & 0x8000) >> 15; assert_eq!(chunk_signature, 0b011, "i={}, len={}", i, s.len()); if chunk_flag == 0 { // uncompressed res.extend_from_slice(&s[i..i + 4096]); i += 4096; } else { let mut chunk_len = 0; let mut buf = [0u8; 4096]; 'chunk: loop { if i >= s.len() { break; } let bit_flags = s[i]; i += 1; chunk_len += 1; for bit_index in 0..8 { if chunk_len > chunk_size { break 'chunk; } if (bit_flags & (1 << bit_index)) == 0 { // literal token res.push(s[i]); i += 1; chunk_len += 1; } else { // copy token let token = read_u16(&s[i..]); i += 2; chunk_len += 2; let decomp_len = res.len() - start; let bit_count = (4..16).find(|i| POWER_2[*i] >= decomp_len).unwrap(); let len_mask = 0xFFFF >> bit_count; let mut len = (token & len_mask) as usize + 3; let offset = ((token & !len_mask) >> (16 - bit_count)) as usize + 1; while len > offset { buf[..offset].copy_from_slice(&res[res.len() - offset..]); res.extend_from_slice(&buf[..offset]); len -= offset; } buf[..len] 
.copy_from_slice(&res[res.len() - offset..res.len() - offset + len]); res.extend_from_slice(&buf[..len]); } } } } } Ok(res) } #[derive(Clone, Debug, PartialEq, Eq)] pub struct XlsEncoding { encoding: &'static Encoding, } impl XlsEncoding { pub fn from_codepage(codepage: u16) -> Result { let e = codepage::to_encoding(codepage).ok_or(CfbError::CodePageNotFound(codepage))?; Ok(XlsEncoding { encoding: e }) } fn high_byte(&self, high_byte: Option) -> Option { high_byte.or_else(|| { if self.encoding == UTF_8 || self.encoding.is_single_byte() { None } else { Some(false) } }) } pub fn decode_to( &self, stream: &[u8], len: usize, s: &mut String, high_byte: Option, ) -> (usize, usize) { let (l, ub, bytes) = match self.high_byte(high_byte) { None => { let l = min(stream.len(), len); (l, l, Cow::Borrowed(&stream[..l])) } Some(false) => { let l = min(stream.len(), len); // add 0x00 high bytes to unicodes let mut bytes = vec![0; l * 2]; for (i, sce) in stream.iter().take(l).enumerate() { bytes[2 * i] = *sce; } (l, l, Cow::Owned(bytes)) } Some(true) => { let l = min(stream.len() / 2, len); (l, 2 * l, Cow::Borrowed(&stream[..2 * l])) } }; s.push_str(&self.encoding.decode(&bytes).0); (l, ub) } pub fn decode_all(&self, stream: &[u8]) -> String { self.encoding.decode(stream).0.into_owned() } } calamine-0.34.0/src/changelog.rs000064400000000000000000000002231046102023000145420ustar 00000000000000// SPDX-License-Identifier: MIT // // Copyright 2016-2025, Johann Tuffe. //! Changelog for `calamine`. #![doc = include_str!("../Changelog.md")] calamine-0.34.0/src/datatype.rs000064400000000000000000001526141046102023000144420ustar 00000000000000// SPDX-License-Identifier: MIT // // Copyright 2016-2025, Johann Tuffe. use std::fmt; #[cfg(feature = "chrono")] use std::sync::OnceLock; use serde::de::Visitor; use serde::Deserialize; use super::CellErrorType; // Constants used in Excel date calculations. 
const DAY_SECONDS: f64 = 24.0 * 60.0 * 60.; const HOUR_SECONDS: u64 = 60 * 60; const MINUTE_SECONDS: u64 = 60; const YEAR_DAYS: u64 = 365; const YEAR_DAYS_4: u64 = YEAR_DAYS * 4 + 1; const YEAR_DAYS_100: u64 = YEAR_DAYS * 100 + 25; const YEAR_DAYS_400: u64 = YEAR_DAYS * 400 + 97; #[cfg(feature = "chrono")] static EXCEL_EPOCH: OnceLock = OnceLock::new(); #[cfg(feature = "chrono")] // https://learn.microsoft.com/en-us/office/troubleshoot/excel/1900-and-1904-date-system const EXCEL_1900_1904_DIFF: f64 = 1462.; #[cfg(feature = "chrono")] const MS_MULTIPLIER: f64 = 24f64 * 60f64 * 60f64 * 1e+3f64; /// An enum to represent all different data types that can appear as /// a value in a worksheet cell #[derive(Debug, Clone, PartialEq, Default)] pub enum Data { /// Signed integer Int(i64), /// Float Float(f64), /// String String(String), /// Boolean Bool(bool), /// Date or Time DateTime(ExcelDateTime), /// Date, Time or Date/Time in ISO 8601 DateTimeIso(String), /// Duration in ISO 8601 DurationIso(String), /// Error Error(CellErrorType), /// Empty cell #[default] Empty, } /// An enum to represent all different data types that can appear as /// a value in a worksheet cell impl DataType for Data { fn is_empty(&self) -> bool { *self == Data::Empty } fn is_int(&self) -> bool { matches!(*self, Data::Int(_)) } fn is_float(&self) -> bool { matches!(*self, Data::Float(_)) } fn is_bool(&self) -> bool { matches!(*self, Data::Bool(_)) } fn is_string(&self) -> bool { matches!(*self, Data::String(_)) } fn is_duration_iso(&self) -> bool { matches!(*self, Data::DurationIso(_)) } fn is_datetime(&self) -> bool { matches!(*self, Data::DateTime(_)) } fn is_datetime_iso(&self) -> bool { matches!(*self, Data::DateTimeIso(_)) } fn is_error(&self) -> bool { matches!(*self, Data::Error(_)) } fn get_int(&self) -> Option { if let Data::Int(v) = self { Some(*v) } else { None } } fn get_float(&self) -> Option { if let Data::Float(v) = self { Some(*v) } else { None } } fn get_bool(&self) -> Option { if 
let Data::Bool(v) = self { Some(*v) } else { None } } fn get_string(&self) -> Option<&str> { if let Data::String(v) = self { Some(&**v) } else { None } } fn get_datetime(&self) -> Option { match self { Data::DateTime(v) => Some(*v), _ => None, } } fn get_datetime_iso(&self) -> Option<&str> { match self { Data::DateTimeIso(v) => Some(&**v), _ => None, } } fn get_duration_iso(&self) -> Option<&str> { match self { Data::DurationIso(v) => Some(&**v), _ => None, } } fn get_error(&self) -> Option<&CellErrorType> { match self { Data::Error(e) => Some(e), _ => None, } } fn as_string(&self) -> Option { match self { Data::Float(v) => Some(v.to_string()), Data::Int(v) => Some(v.to_string()), Data::String(v) => Some(v.clone()), _ => None, } } fn as_i64(&self) -> Option { match self { Data::Int(v) => Some(*v), Data::Float(v) => Some(*v as i64), Data::Bool(v) => Some(*v as i64), Data::String(v) => atoi_simd::parse::(v.as_bytes()).ok(), _ => None, } } fn as_f64(&self) -> Option { match self { Data::Int(v) => Some(*v as f64), Data::Float(v) => Some(*v), Data::Bool(v) => Some((*v as i32).into()), Data::String(v) => fast_float2::parse(v).ok(), _ => None, } } } impl PartialEq<&str> for Data { fn eq(&self, other: &&str) -> bool { matches!(self, Data::String(s) if s == other) } } impl PartialEq for Data { fn eq(&self, other: &str) -> bool { matches!(self, Data::String(s) if s == other) } } impl PartialEq for Data { fn eq(&self, other: &f64) -> bool { matches!(self, Data::Float(s) if *s == *other) } } impl PartialEq for Data { fn eq(&self, other: &bool) -> bool { matches!(self, Data::Bool(s) if *s == *other) } } impl PartialEq for Data { fn eq(&self, other: &i64) -> bool { matches!(self, Data::Int(s) if *s == *other) } } impl fmt::Display for Data { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::result::Result<(), fmt::Error> { match self { Data::Int(e) => write!(f, "{e}"), Data::Float(e) => write!(f, "{e}"), Data::String(e) => write!(f, "{e}"), Data::Bool(e) => write!(f, "{e}"), 
Data::DateTime(e) => write!(f, "{e}"), Data::DateTimeIso(e) => write!(f, "{e}"), Data::DurationIso(e) => write!(f, "{e}"), Data::Error(e) => write!(f, "{e}"), Data::Empty => Ok(()), } } } impl<'de> Deserialize<'de> for Data { #[inline] fn deserialize(deserializer: D) -> Result where D: serde::Deserializer<'de>, { struct DataVisitor; impl<'de> Visitor<'de> for DataVisitor { type Value = Data; fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { formatter.write_str("any valid JSON value") } #[inline] fn visit_bool(self, value: bool) -> Result { Ok(Data::Bool(value)) } #[inline] fn visit_i64(self, value: i64) -> Result { Ok(Data::Int(value)) } #[inline] fn visit_u64(self, value: u64) -> Result { Ok(Data::Int(value as i64)) } #[inline] fn visit_f64(self, value: f64) -> Result { Ok(Data::Float(value)) } #[inline] fn visit_str(self, value: &str) -> Result where E: serde::de::Error, { self.visit_string(String::from(value)) } #[inline] fn visit_string(self, value: String) -> Result { Ok(Data::String(value)) } #[inline] fn visit_none(self) -> Result { Ok(Data::Empty) } #[inline] fn visit_some(self, deserializer: D) -> Result where D: serde::Deserializer<'de>, { Deserialize::deserialize(deserializer) } #[inline] fn visit_unit(self) -> Result { Ok(Data::Empty) } } deserializer.deserialize_any(DataVisitor) } } macro_rules! 
define_from { ($variant:path, $ty:ty) => { impl From<$ty> for Data { fn from(v: $ty) -> Self { $variant(v) } } }; } define_from!(Data::Int, i64); define_from!(Data::Float, f64); define_from!(Data::String, String); define_from!(Data::Bool, bool); define_from!(Data::Error, CellErrorType); impl<'a> From<&'a str> for Data { fn from(v: &'a str) -> Self { Data::String(String::from(v)) } } impl From<()> for Data { fn from(_: ()) -> Self { Data::Empty } } impl From> for Data where Data: From, { fn from(v: Option) -> Self { match v { Some(v) => From::from(v), None => Data::Empty, } } } /// An enum to represent all different data types that can appear as /// a value in a worksheet cell #[derive(Debug, Clone, PartialEq, Default)] pub enum DataRef<'a> { /// Signed integer Int(i64), /// Float Float(f64), /// String String(String), /// Shared String SharedString(&'a str), /// Boolean Bool(bool), /// Date or Time DateTime(ExcelDateTime), /// Date, Time or Date/Time in ISO 8601 DateTimeIso(String), /// Duration in ISO 8601 DurationIso(String), /// Error Error(CellErrorType), /// Empty cell #[default] Empty, } impl DataType for DataRef<'_> { fn is_empty(&self) -> bool { *self == DataRef::Empty } fn is_int(&self) -> bool { matches!(*self, DataRef::Int(_)) } fn is_float(&self) -> bool { matches!(*self, DataRef::Float(_)) } fn is_bool(&self) -> bool { matches!(*self, DataRef::Bool(_)) } fn is_string(&self) -> bool { matches!(*self, DataRef::String(_) | DataRef::SharedString(_)) } fn is_duration_iso(&self) -> bool { matches!(*self, DataRef::DurationIso(_)) } fn is_datetime(&self) -> bool { matches!(*self, DataRef::DateTime(_)) } fn is_datetime_iso(&self) -> bool { matches!(*self, DataRef::DateTimeIso(_)) } fn is_error(&self) -> bool { matches!(*self, DataRef::Error(_)) } fn get_int(&self) -> Option { if let DataRef::Int(v) = self { Some(*v) } else { None } } fn get_float(&self) -> Option { if let DataRef::Float(v) = self { Some(*v) } else { None } } fn get_bool(&self) -> Option { if 
let DataRef::Bool(v) = self { Some(*v) } else { None } } fn get_string(&self) -> Option<&str> { match self { DataRef::String(v) => Some(&**v), DataRef::SharedString(v) => Some(v), _ => None, } } fn get_datetime(&self) -> Option { match self { DataRef::DateTime(v) => Some(*v), _ => None, } } fn get_datetime_iso(&self) -> Option<&str> { match self { DataRef::DateTimeIso(v) => Some(&**v), _ => None, } } fn get_duration_iso(&self) -> Option<&str> { match self { DataRef::DurationIso(v) => Some(&**v), _ => None, } } fn get_error(&self) -> Option<&CellErrorType> { match self { DataRef::Error(e) => Some(e), _ => None, } } fn as_string(&self) -> Option { match self { DataRef::Float(v) => Some(v.to_string()), DataRef::Int(v) => Some(v.to_string()), DataRef::String(v) => Some(v.clone()), DataRef::SharedString(v) => Some(v.to_string()), _ => None, } } fn as_i64(&self) -> Option { match self { DataRef::Int(v) => Some(*v), DataRef::Float(v) => Some(*v as i64), DataRef::Bool(v) => Some(*v as i64), DataRef::String(v) => atoi_simd::parse::(v.as_bytes()).ok(), DataRef::SharedString(v) => atoi_simd::parse::(v.as_bytes()).ok(), _ => None, } } fn as_f64(&self) -> Option { match self { DataRef::Int(v) => Some(*v as f64), DataRef::Float(v) => Some(*v), DataRef::Bool(v) => Some((*v as i32).into()), DataRef::String(v) => fast_float2::parse(v).ok(), DataRef::SharedString(v) => fast_float2::parse(v).ok(), _ => None, } } } impl PartialEq<&str> for DataRef<'_> { fn eq(&self, other: &&str) -> bool { matches!(self, DataRef::String(s) if s == other) } } impl PartialEq for DataRef<'_> { fn eq(&self, other: &str) -> bool { matches!(self, DataRef::String(s) if s == other) } } impl PartialEq for DataRef<'_> { fn eq(&self, other: &f64) -> bool { matches!(self, DataRef::Float(s) if *s == *other) } } impl PartialEq for DataRef<'_> { fn eq(&self, other: &bool) -> bool { matches!(self, DataRef::Bool(s) if *s == *other) } } impl PartialEq for DataRef<'_> { fn eq(&self, other: &i64) -> bool { matches!(self, 
DataRef::Int(s) if *s == *other) } } /// A trait to represent all different data types that can appear as /// a value in a worksheet cell pub trait DataType { /// Assess if datatype is empty fn is_empty(&self) -> bool; /// Assess if datatype is a int fn is_int(&self) -> bool; /// Assess if datatype is a float fn is_float(&self) -> bool; /// Assess if datatype is a bool fn is_bool(&self) -> bool; /// Assess if datatype is a string fn is_string(&self) -> bool; /// Assess if datatype is a `CellErrorType` fn is_error(&self) -> bool; /// Assess if datatype is an ISO8601 duration fn is_duration_iso(&self) -> bool; /// Assess if datatype is a datetime fn is_datetime(&self) -> bool; /// Assess if datatype is an ISO8601 datetime fn is_datetime_iso(&self) -> bool; /// Try getting int value fn get_int(&self) -> Option; /// Try getting float value fn get_float(&self) -> Option; /// Try getting bool value fn get_bool(&self) -> Option; /// Try getting string value fn get_string(&self) -> Option<&str>; /// Try getting datetime value fn get_datetime(&self) -> Option; /// Try getting datetime ISO8601 value fn get_datetime_iso(&self) -> Option<&str>; /// Try getting duration ISO8601 value fn get_duration_iso(&self) -> Option<&str>; /// Try getting Error value fn get_error(&self) -> Option<&CellErrorType>; /// Try converting data type into a string fn as_string(&self) -> Option; /// Try converting data type into an int fn as_i64(&self) -> Option; /// Try converting data type into a float fn as_f64(&self) -> Option; /// Try converting data type into a date #[cfg(feature = "chrono")] #[cfg_attr(docsrs, doc(cfg(feature = "chrono")))] fn as_date(&self) -> Option { use std::str::FromStr; if self.is_datetime_iso() { self.as_datetime().map(|dt| dt.date()).or_else(|| { self.get_datetime_iso() .and_then(|s| chrono::NaiveDate::from_str(s).ok()) }) } else { self.as_datetime().map(|dt| dt.date()) } } /// Try converting data type into a time #[cfg(feature = "chrono")] #[cfg_attr(docsrs, 
doc(cfg(feature = "chrono")))] fn as_time(&self) -> Option { use std::str::FromStr; if self.is_datetime_iso() { self.as_datetime().map(|dt| dt.time()).or_else(|| { self.get_datetime_iso() .and_then(|s| chrono::NaiveTime::from_str(s).ok()) }) } else if self.is_duration_iso() { self.get_duration_iso() .and_then(|s| chrono::NaiveTime::parse_from_str(s, "PT%HH%MM%S%.fS").ok()) } else { self.as_datetime().map(|dt| dt.time()) } } /// Try converting data type into a duration #[cfg(feature = "chrono")] #[cfg_attr(docsrs, doc(cfg(feature = "chrono")))] fn as_duration(&self) -> Option { use chrono::Timelike; if self.is_datetime() { self.get_datetime().and_then(|dt| dt.as_duration()) } else if self.is_duration_iso() { // need replace in the future to something like chrono::Duration::from_str() // https://github.com/chronotope/chrono/issues/579 self.as_time().map(|t| { chrono::Duration::nanoseconds( t.num_seconds_from_midnight() as i64 * 1_000_000_000 + t.nanosecond() as i64, ) }) } else { None } } // Try converting data type into a datetime. #[cfg(feature = "chrono")] fn as_datetime(&self) -> Option { use std::str::FromStr; if self.is_int() || self.is_float() { self.as_f64() .map(|f| ExcelDateTime::from_value_only(f).as_datetime()) } else if self.is_datetime() { self.get_datetime().map(|d| d.as_datetime()) } else if self.is_datetime_iso() { self.get_datetime_iso() .map(|s| chrono::NaiveDateTime::from_str(s).ok()) } else { None } .flatten() } } impl<'a> From> for Data { fn from(value: DataRef<'a>) -> Self { match value { DataRef::Int(v) => Data::Int(v), DataRef::Float(v) => Data::Float(v), DataRef::String(v) => Data::String(v), DataRef::SharedString(v) => Data::String(v.into()), DataRef::Bool(v) => Data::Bool(v), DataRef::DateTime(v) => Data::DateTime(v), DataRef::DateTimeIso(v) => Data::DateTimeIso(v), DataRef::DurationIso(v) => Data::DurationIso(v), DataRef::Error(v) => Data::Error(v), DataRef::Empty => Data::Empty, } } } /// Excel datetime type. 
Possible: date, time, datetime, duration. /// At this time we can only determine datetime (date and time are datetime too) and duration. #[derive(Debug, Clone, Copy, PartialEq)] pub enum ExcelDateTimeType { /// `DateTime` DateTime, /// `TimeDelta` (Duration) TimeDelta, } /// Structure for Excel date and time representation. #[derive(Debug, Clone, Copy, PartialEq)] pub struct ExcelDateTime { value: f64, datetime_type: ExcelDateTimeType, is_1904: bool, } impl ExcelDateTime { /// Creates a new `ExcelDateTime` pub fn new(value: f64, datetime_type: ExcelDateTimeType, is_1904: bool) -> Self { ExcelDateTime { value, datetime_type, is_1904, } } // Is used only for converting excel value to chrono. #[cfg(feature = "chrono")] fn from_value_only(value: f64) -> Self { ExcelDateTime { value, ..Default::default() } } /// True if excel datetime has duration format (`[hh]:mm:ss`, for example) pub fn is_duration(&self) -> bool { matches!(self.datetime_type, ExcelDateTimeType::TimeDelta) } /// True if excel datetime has datetime format (not duration) pub fn is_datetime(&self) -> bool { matches!(self.datetime_type, ExcelDateTimeType::DateTime) } /// Converting data type into a float pub fn as_f64(&self) -> f64 { self.value } /// Convert an Excel serial datetime to standard date components. /// /// Datetimes in Excel are serial dates with days counted from an epoch /// (usually 1900-01-01) and where the time is a percentage/decimal of the /// milliseconds in the day. Both the date and time are stored in the same /// f64 value. For example, 2025/10/13 12:00:00 is stored as 45943.5. /// /// This function returns a tuple of (year, month, day, hour, minutes, /// seconds, milliseconds). It works for serial dates in both the 1900 and /// 1904 epochs. /// /// This function always returns a date, even if the serial value is outside /// of Excel's range of `0.0 <= datetime < 10000.0`. 
It also returns, as /// Excel does, the invalid date 1900/02/29 due to the [Excel 1900 leap year /// bug](https://en.wikipedia.org/wiki/Leap_year_problem#Occurrences). /// /// Excel only supports millisecond precision and it also doesn't use or /// encode timezone information in any way. /// /// # Examples /// /// An example of converting an Excel date/time to standard components. /// /// ``` /// use calamine::{ExcelDateTime, ExcelDateTimeType}; /// /// // Create an Excel datetime from the serial value 45943.541 which is /// // equivalent to the date "2025/10/13 12:59:02.400". /// let excel_datetime = ExcelDateTime::new( /// 45943.541, /// ExcelDateTimeType::DateTime, /// false, // Using 1900 epoch (not 1904). /// ); /// /// // Convert to standard date/time components. /// let (year, month, day, hour, min, sec, milli) = excel_datetime.to_ymd_hms_milli(); /// /// assert_eq!(year, 2025); /// assert_eq!(month, 10); /// assert_eq!(day, 13); /// assert_eq!(hour, 12); /// assert_eq!(min, 59); /// assert_eq!(sec, 2); /// assert_eq!(milli, 400); /// ``` /// pub fn to_ymd_hms_milli(&self) -> (u16, u8, u8, u8, u8, u8, u16) { Self::excel_to_standard_datetime(self.value, self.is_1904) } /// Try converting data type into a duration. #[cfg(feature = "chrono")] #[cfg_attr(docsrs, doc(cfg(feature = "chrono")))] pub fn as_duration(&self) -> Option { let ms = self.value * MS_MULTIPLIER; Some(chrono::Duration::milliseconds(ms.round() as i64)) } /// Try converting data type into a datetime. 
#[cfg(feature = "chrono")] #[cfg_attr(docsrs, doc(cfg(feature = "chrono")))] pub fn as_datetime(&self) -> Option { let excel_epoch = EXCEL_EPOCH.get_or_init(|| { chrono::NaiveDate::from_ymd_opt(1899, 12, 30) .unwrap() .and_time(chrono::NaiveTime::MIN) }); let f = if self.is_1904 { self.value + EXCEL_1900_1904_DIFF } else { self.value }; let f = if f >= 60.0 { f } else { f + 1.0 }; let ms = f * MS_MULTIPLIER; let excel_duration = chrono::Duration::milliseconds(ms.round() as i64); excel_epoch.checked_add_signed(excel_duration) } // Convert an Excel serial datetime to its date components. // // Datetimes in Excel are serial dates with days counted from an epoch and // where the time is a percentage/decimal of the milliseconds in the day. // Both the date and time are stored in the same f64 value. // // The calculation back to standard date and time components is deceptively // tricky since simple division doesn't work due to the 4/100/400 year leap // day changes. The basic approach is to divide the range into 400 year // blocks, 100 year blocks, 4 year blocks and 1 year blocks to calculate the // year (relative to the epoch). The remaining days and seconds are used to // calculate the year day and time. To make the leap year calculations // easier we move the effective epoch back to 1600-01-01 which is the // closest 400 year epoch before 1900/1904. // // In addition we need to handle both a 1900 and 1904 epoch and we need to // account for the Excel date bug where it treats 1900 as a leap year. // // Works in the range 1899-12-31/1904-01-01 to 9999-12-31. // // Leap seconds and the timezone aren't taken into account since Excel // doesn't handle them. // fn excel_to_standard_datetime( excel_datetime: f64, is_1904: bool, ) -> (u16, u8, u8, u8, u8, u8, u16) { let mut months = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]; // Convert the seconds to a whole number of days. 
let mut days = excel_datetime.floor() as u64; // Move the epoch to 1600-01-01 to make the leap calculations easier. if is_1904 { // 1904 epoch dates. days += 111_033; } else if days > YEAR_DAYS { // 1900 epoch years other than 1900. days += 109_571; } else { // Adjust for the Excel 1900 leap year bug. days += 109_572; } // Get the number of 400 year blocks. let year_days_400 = days / YEAR_DAYS_400; let mut days = days % YEAR_DAYS_400; // Get the number of 100 year blocks. There are 2 kinds: those starting // from a %400 year with an extra leap day (36,525 days) and those // starting from other 100 year intervals with 1 day less (36,524 days). let year_days_100; if days < YEAR_DAYS_100 { year_days_100 = days / YEAR_DAYS_100; days %= YEAR_DAYS_100; } else { year_days_100 = 1 + (days - YEAR_DAYS_100) / (YEAR_DAYS_100 - 1); days = (days - YEAR_DAYS_100) % (YEAR_DAYS_100 - 1); } // Get the number of 4 year blocks. There are 2 kinds: a 4 year block // with a leap day (1461 days) and a 4 year block starting from non-leap // %100 years without a leap day (1460 days). We also need to account // for whether a 1461 day block was preceded by a 1460 day block at the // start of the 100 year block. let year_days_4; let mut non_leap_year_block = false; if year_days_100 == 0 { // Any 4 year block in a 36,525 day 100 year block. Has extra leap. year_days_4 = days / YEAR_DAYS_4; days %= YEAR_DAYS_4; } else if days < YEAR_DAYS_4 { // A 4 year block at the start of a 36,524 day 100 year block. year_days_4 = days / (YEAR_DAYS_4 - 1); days %= YEAR_DAYS_4 - 1; non_leap_year_block = true; } else { // A non-initial 4 year block in a 36,524 day 100 year block. year_days_4 = 1 + (days - (YEAR_DAYS_4 - 1)) / YEAR_DAYS_4; days = (days - (YEAR_DAYS_4 - 1)) % YEAR_DAYS_4; } // Get the number of 1 year blocks. We need to account for leap years // and non-leap years and whether the non-leap occurs after a leap year. 
let year_days_1; if non_leap_year_block { // A non-leap block not preceded by a leap block. year_days_1 = days / YEAR_DAYS; days %= YEAR_DAYS; } else if days < YEAR_DAYS + 1 { // A leap year block. year_days_1 = days / (YEAR_DAYS + 1); days %= YEAR_DAYS + 1; } else { // A non-leap block preceded by a leap block. year_days_1 = 1 + (days - (YEAR_DAYS + 1)) / YEAR_DAYS; days = (days - (YEAR_DAYS + 1)) % YEAR_DAYS; } // Calculate the year as the number of blocks*days since the epoch. let year = 1600 + year_days_400 * 400 + year_days_100 * 100 + year_days_4 * 4 + year_days_1; // Convert from 0 indexed to 1 indexed days. days += 1; // Adjust February day count for leap years. if Self::is_leap_year(year) { months[1] = 29; } // Handle edge cases due to Excel erroneously treating 1900 as a leap year. if !is_1904 && year == 1900 { months[1] = 29; // Adjust last day of 1900. if excel_datetime.trunc() == 366.0 { days += 1; } } // Calculate the relevant month based on the sequential number of days. let mut month = 1; for month_days in months { if days > month_days { days -= month_days; month += 1; } else { break; } } // The final remainder is the day of the month. let day = days; // Get the time part of the Excel datetime. let time = excel_datetime.fract(); let mut milli = ((time * DAY_SECONDS).fract() * 1000.0).round() as u64; let mut day_as_seconds = (time * DAY_SECONDS) as u64; // Handle millisecond overflow due to rounding. if milli == 1000 { day_as_seconds += 1; milli = 0; } // Calculate the hours, minutes and seconds in the day. let hour = day_as_seconds / HOUR_SECONDS; let min = (day_as_seconds - hour * HOUR_SECONDS) / MINUTE_SECONDS; let sec = (day_as_seconds - hour * HOUR_SECONDS - min * MINUTE_SECONDS) % MINUTE_SECONDS; // Return the date and time components. ( year as u16, month as u8, day as u8, hour as u8, min as u8, sec as u8, milli as u16, ) } // Check if a year is a leap year. 
fn is_leap_year(year: u64) -> bool { year % 4 == 0 && (year % 100 != 0 || year % 400 == 0) } } impl Default for ExcelDateTime { fn default() -> Self { ExcelDateTime { value: 0., datetime_type: ExcelDateTimeType::DateTime, is_1904: false, } } } impl fmt::Display for ExcelDateTime { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::result::Result<(), fmt::Error> { write!(f, "{}", self.value) } } #[cfg(all(test, feature = "chrono"))] mod date_tests { use super::*; #[test] fn test_dates() { use chrono::{Duration, NaiveDate, NaiveDateTime, NaiveTime}; #[allow(clippy::excessive_precision)] let unix_epoch = Data::Float(25569.); assert_eq!( unix_epoch.as_datetime(), Some(NaiveDateTime::new( NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(), NaiveTime::from_hms_opt(0, 0, 0).unwrap(), )) ); // test for https://github.com/tafia/calamine/issues/251 let unix_epoch_precision = Data::Float(44484.7916666667); assert_eq!( unix_epoch_precision.as_datetime(), Some(NaiveDateTime::new( NaiveDate::from_ymd_opt(2021, 10, 15).unwrap(), NaiveTime::from_hms_opt(19, 0, 0).unwrap(), )) ); // test rounding #[allow(clippy::excessive_precision)] let date = Data::Float(0.18737500000000001); assert_eq!( date.as_time(), Some(NaiveTime::from_hms_milli_opt(4, 29, 49, 200).unwrap()) ); #[allow(clippy::excessive_precision)] let date = Data::Float(0.25951736111111101); assert_eq!( date.as_time(), Some(NaiveTime::from_hms_milli_opt(6, 13, 42, 300).unwrap()) ); // test overflow assert_eq!(Data::Float(1e20).as_time(), None); #[allow(clippy::excessive_precision)] let unix_epoch_15h30m = Data::Float(25569.645833333333333); let chrono_dt = NaiveDateTime::new( NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(), NaiveTime::from_hms_opt(15, 30, 0).unwrap(), ); let micro = Duration::microseconds(1); assert!(unix_epoch_15h30m.as_datetime().unwrap() - chrono_dt < micro); } #[test] fn test_int_dates() { use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; let unix_epoch = Data::Int(25569); assert_eq!( unix_epoch.as_datetime(), 
Some(NaiveDateTime::new( NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(), NaiveTime::from_hms_opt(0, 0, 0).unwrap(), )) ); let time = Data::Int(44060); assert_eq!( time.as_datetime(), Some(NaiveDateTime::new( NaiveDate::from_ymd_opt(2020, 8, 17).unwrap(), NaiveTime::from_hms_opt(0, 0, 0).unwrap(), )) ); } } #[cfg(test)] mod tests { use super::*; #[test] fn test_partial_eq() { assert_eq!(Data::String("value".to_string()), "value"); assert_eq!(Data::String("value".to_string()), "value"[..]); assert_eq!(Data::Float(100.0), 100.0f64); assert_eq!(Data::Bool(true), true); assert_eq!(Data::Int(100), 100i64); } #[test] fn test_as_i64_with_bools() { assert_eq!(Data::Bool(true).as_i64(), Some(1)); assert_eq!(Data::Bool(false).as_i64(), Some(0)); assert_eq!(DataRef::Bool(true).as_i64(), Some(1)); assert_eq!(DataRef::Bool(false).as_i64(), Some(0)); } #[test] fn test_as_f64_with_bools() { assert_eq!(Data::Bool(true).as_f64(), Some(1.0)); assert_eq!(Data::Bool(false).as_f64(), Some(0.0)); assert_eq!(DataRef::Bool(true).as_f64(), Some(1.0)); assert_eq!(DataRef::Bool(false).as_f64(), Some(0.0)); } #[test] fn test_datetimes_1900_epoch() { #[allow(clippy::excessive_precision)] let test_data = vec![ (0.0, (1899, 12, 31, 0, 0, 0, 0)), (0.99998842592, (1899, 12, 31, 23, 59, 59, 0)), (30188.010650613425, (1982, 8, 25, 0, 15, 20, 213)), (60376.011670023145, (2065, 4, 19, 0, 16, 48, 290)), (90565.038488958337, (2147, 12, 15, 0, 55, 25, 446)), (120753.04359827546, (2230, 8, 10, 1, 2, 46, 891)), (150942.04462496529, (2313, 4, 6, 1, 4, 15, 597)), (181130.04838991899, (2395, 11, 30, 1, 9, 40, 889)), (211318.04968240741, (2478, 7, 25, 1, 11, 32, 560)), (241507.06272186342, (2561, 3, 21, 1, 30, 19, 169)), (271695.07529606484, (2643, 11, 15, 1, 48, 25, 580)), (301884.08578609955, (2726, 7, 12, 2, 3, 31, 919)), (332072.09111094906, (2809, 3, 6, 2, 11, 11, 986)), (362261.10042934027, (2891, 10, 31, 2, 24, 37, 95)), (392449.10772245371, (2974, 6, 26, 2, 35, 7, 220)), (422637.11472348380, (3057, 2, 19, 
2, 45, 12, 109)), (452826.12962951389, (3139, 10, 17, 3, 6, 39, 990)), (483014.13065105322, (3222, 6, 11, 3, 8, 8, 251)), (513203.13834000000, (3305, 2, 5, 3, 19, 12, 576)), (543391.14563164348, (3387, 10, 1, 3, 29, 42, 574)), (573579.15105107636, (3470, 5, 27, 3, 37, 30, 813)), (603768.17683137732, (3553, 1, 21, 4, 14, 38, 231)), (633956.17810832174, (3635, 9, 16, 4, 16, 28, 559)), (664145.17914608796, (3718, 5, 13, 4, 17, 58, 222)), (694333.18173372687, (3801, 1, 6, 4, 21, 41, 794)), (724522.20596981479, (3883, 9, 2, 4, 56, 35, 792)), (754710.22586672450, (3966, 4, 28, 5, 25, 14, 885)), (784898.22645513888, (4048, 12, 21, 5, 26, 5, 724)), (815087.24078782403, (4131, 8, 18, 5, 46, 44, 68)), (845275.24167987274, (4214, 4, 13, 5, 48, 1, 141)), (875464.24574438657, (4296, 12, 7, 5, 53, 52, 315)), (905652.26028449077, (4379, 8, 3, 6, 14, 48, 580)), (935840.28212659725, (4462, 3, 28, 6, 46, 15, 738)), (966029.31343063654, (4544, 11, 22, 7, 31, 20, 407)), (996217.33233511576, (4627, 7, 19, 7, 58, 33, 754)), (1026406.3386936343, (4710, 3, 15, 8, 7, 43, 130)), (1056594.3536005903, (4792, 11, 7, 8, 29, 11, 91)), (1086783.3807329629, (4875, 7, 4, 9, 8, 15, 328)), (1116971.3963169097, (4958, 2, 27, 9, 30, 41, 781)), (1147159.3986627546, (5040, 10, 23, 9, 34, 4, 462)), (1177348.4009715857, (5123, 6, 20, 9, 37, 23, 945)), (1207536.4013501736, (5206, 2, 12, 9, 37, 56, 655)), (1237725.4063915510, (5288, 10, 8, 9, 45, 12, 230)), (1267913.4126710880, (5371, 6, 4, 9, 54, 14, 782)), (1298101.4127558796, (5454, 1, 28, 9, 54, 22, 108)), (1328290.4177795255, (5536, 9, 24, 10, 1, 36, 151)), (1358478.5068125231, (5619, 5, 20, 12, 9, 48, 602)), (1388667.5237100578, (5702, 1, 14, 12, 34, 8, 549)), (1418855.5389640625, (5784, 9, 8, 12, 56, 6, 495)), (1449044.5409515856, (5867, 5, 6, 12, 58, 58, 217)), (1479232.5416002662, (5949, 12, 30, 12, 59, 54, 263)), (1509420.5657561459, (6032, 8, 24, 13, 34, 41, 331)), (1539609.5822754744, (6115, 4, 21, 13, 58, 28, 601)), (1569797.5849178126, (6197, 
12, 14, 14, 2, 16, 899)), (1599986.6085352316, (6280, 8, 10, 14, 36, 17, 444)), (1630174.6096927200, (6363, 4, 6, 14, 37, 57, 451)), (1660363.6234115392, (6445, 11, 30, 14, 57, 42, 757)), (1690551.6325035533, (6528, 7, 26, 15, 10, 48, 307)), (1720739.6351839120, (6611, 3, 22, 15, 14, 39, 890)), (1750928.6387498612, (6693, 11, 15, 15, 19, 47, 988)), (1781116.6697262037, (6776, 7, 11, 16, 4, 24, 344)), (1811305.6822216667, (6859, 3, 7, 16, 22, 23, 952)), (1841493.6874536921, (6941, 10, 31, 16, 29, 55, 999)), (1871681.7071789235, (7024, 6, 26, 16, 58, 20, 259)), (1901870.7111390624, (7107, 2, 21, 17, 4, 2, 415)), (1932058.7211762732, (7189, 10, 16, 17, 18, 29, 630)), (1962247.7412190163, (7272, 6, 11, 17, 47, 21, 323)), (1992435.7454845603, (7355, 2, 5, 17, 53, 29, 866)), (2022624.7456143056, (7437, 10, 2, 17, 53, 41, 76)), (2052812.7465977315, (7520, 5, 28, 17, 55, 6, 44)), (2083000.7602910995, (7603, 1, 21, 18, 14, 49, 151)), (2113189.7623349307, (7685, 9, 16, 18, 17, 45, 738)), (2143377.7708298611, (7768, 5, 12, 18, 29, 59, 700)), (2173566.7731624190, (7851, 1, 7, 18, 33, 21, 233)), (2203754.8016744559, (7933, 9, 2, 19, 14, 24, 673)), (2233942.8036205554, (8016, 4, 27, 19, 17, 12, 816)), (2264131.8080603937, (8098, 12, 22, 19, 23, 36, 418)), (2294319.8239109721, (8181, 8, 17, 19, 46, 25, 908)), (2324508.8387420601, (8264, 4, 13, 20, 7, 47, 314)), (2354696.8552963310, (8346, 12, 8, 20, 31, 37, 603)), (2384885.8610853008, (8429, 8, 3, 20, 39, 57, 770)), (2415073.8682530904, (8512, 3, 29, 20, 50, 17, 67)), (2445261.8770581828, (8594, 11, 22, 21, 2, 57, 827)), (2475450.8910360998, (8677, 7, 19, 21, 23, 5, 519)), (2505638.8991848612, (8760, 3, 14, 21, 34, 49, 572)), (2535827.9021521294, (8842, 11, 8, 21, 39, 5, 944)), (2566015.9022965971, (8925, 7, 4, 21, 39, 18, 426)), (2596203.9070343636, (9008, 2, 28, 21, 46, 7, 769)), (2626392.9152275696, (9090, 10, 24, 21, 57, 55, 662)), (2656580.9299968979, (9173, 6, 19, 22, 19, 11, 732)), (2686769.9332335186, (9256, 2, 13, 22, 
23, 51, 376)), (2716957.9360968866, (9338, 10, 9, 22, 27, 58, 771)), (2747146.9468795368, (9421, 6, 5, 22, 43, 30, 392)), (2777334.9502990046, (9504, 1, 30, 22, 48, 25, 834)), (2807522.9540709145, (9586, 9, 24, 22, 53, 51, 727)), (2837711.9673210187, (9669, 5, 20, 23, 12, 56, 536)), (2867899.9693762613, (9752, 1, 14, 23, 15, 54, 109)), (2898088.9702850925, (9834, 9, 10, 23, 17, 12, 632)), (2958465.9999884260, (9999, 12, 31, 23, 59, 59, 0)), ]; for test in test_data { let (excel_serial_datetime, expected) = test; let datetime = ExcelDateTime::new(excel_serial_datetime, ExcelDateTimeType::DateTime, false); let got = datetime.to_ymd_hms_milli(); assert_eq!(expected, got); } } #[test] fn test_dates_only_1900_epoch() { let test_data = vec![ (0.0, (1899, 12, 31)), (1.0, (1900, 1, 1)), (58.0, (1900, 2, 27)), (59.0, (1900, 2, 28)), (60.0, (1900, 2, 29)), (61.0, (1900, 3, 1)), (62.0, (1900, 3, 2)), (71.0, (1900, 3, 11)), (99.0, (1900, 4, 8)), (256.0, (1900, 9, 12)), (364.0, (1900, 12, 29)), (365.0, (1900, 12, 30)), (366.0, (1900, 12, 31)), (367.0, (1901, 1, 1)), (489.0, (1901, 5, 3)), (652.0, (1901, 10, 13)), (777.0, (1902, 2, 15)), (888.0, (1902, 6, 6)), (999.0, (1902, 9, 25)), (1001.0, (1902, 9, 27)), (1212.0, (1903, 4, 26)), (1313.0, (1903, 8, 5)), (1461.0, (1903, 12, 31)), (1462.0, (1904, 1, 1)), (1520.0, (1904, 2, 28)), (1521.0, (1904, 2, 29)), (1522.0, (1904, 3, 1)), (2615.0, (1907, 2, 27)), (2616.0, (1907, 2, 28)), (2617.0, (1907, 3, 1)), (2618.0, (1907, 3, 2)), (2619.0, (1907, 3, 3)), (2620.0, (1907, 3, 4)), (2621.0, (1907, 3, 5)), (2622.0, (1907, 3, 6)), (36161.0, (1999, 1, 1)), (36191.0, (1999, 1, 31)), (36192.0, (1999, 2, 1)), (36219.0, (1999, 2, 28)), (36220.0, (1999, 3, 1)), (36250.0, (1999, 3, 31)), (36251.0, (1999, 4, 1)), (36280.0, (1999, 4, 30)), (36281.0, (1999, 5, 1)), (36311.0, (1999, 5, 31)), (36312.0, (1999, 6, 1)), (36341.0, (1999, 6, 30)), (36342.0, (1999, 7, 1)), (36372.0, (1999, 7, 31)), (36373.0, (1999, 8, 1)), (36403.0, (1999, 8, 31)), (36404.0, 
(1999, 9, 1)), (36433.0, (1999, 9, 30)), (36434.0, (1999, 10, 1)), (36464.0, (1999, 10, 31)), (36465.0, (1999, 11, 1)), (36494.0, (1999, 11, 30)), (36495.0, (1999, 12, 1)), (36525.0, (1999, 12, 31)), (36526.0, (2000, 1, 1)), (36556.0, (2000, 1, 31)), (36557.0, (2000, 2, 1)), (36585.0, (2000, 2, 29)), (36586.0, (2000, 3, 1)), (36616.0, (2000, 3, 31)), (36617.0, (2000, 4, 1)), (36646.0, (2000, 4, 30)), (36647.0, (2000, 5, 1)), (36677.0, (2000, 5, 31)), (36678.0, (2000, 6, 1)), (36707.0, (2000, 6, 30)), (36708.0, (2000, 7, 1)), (36738.0, (2000, 7, 31)), (36739.0, (2000, 8, 1)), (36769.0, (2000, 8, 31)), (36770.0, (2000, 9, 1)), (36799.0, (2000, 9, 30)), (36800.0, (2000, 10, 1)), (36830.0, (2000, 10, 31)), (36831.0, (2000, 11, 1)), (36860.0, (2000, 11, 30)), (36861.0, (2000, 12, 1)), (36891.0, (2000, 12, 31)), (36892.0, (2001, 1, 1)), (36922.0, (2001, 1, 31)), (36923.0, (2001, 2, 1)), (36950.0, (2001, 2, 28)), (36951.0, (2001, 3, 1)), (36981.0, (2001, 3, 31)), (36982.0, (2001, 4, 1)), (37011.0, (2001, 4, 30)), (37012.0, (2001, 5, 1)), (37042.0, (2001, 5, 31)), (37043.0, (2001, 6, 1)), (37072.0, (2001, 6, 30)), (37073.0, (2001, 7, 1)), (37103.0, (2001, 7, 31)), (37104.0, (2001, 8, 1)), (37134.0, (2001, 8, 31)), (37135.0, (2001, 9, 1)), (37164.0, (2001, 9, 30)), (37165.0, (2001, 10, 1)), (37195.0, (2001, 10, 31)), (37196.0, (2001, 11, 1)), (37225.0, (2001, 11, 30)), (37226.0, (2001, 12, 1)), (37256.0, (2001, 12, 31)), (182623.0, (2400, 1, 1)), (182653.0, (2400, 1, 31)), (182654.0, (2400, 2, 1)), (182682.0, (2400, 2, 29)), (182683.0, (2400, 3, 1)), (182713.0, (2400, 3, 31)), (182714.0, (2400, 4, 1)), (182743.0, (2400, 4, 30)), (182744.0, (2400, 5, 1)), (182774.0, (2400, 5, 31)), (182775.0, (2400, 6, 1)), (182804.0, (2400, 6, 30)), (182805.0, (2400, 7, 1)), (182835.0, (2400, 7, 31)), (182836.0, (2400, 8, 1)), (182866.0, (2400, 8, 31)), (182867.0, (2400, 9, 1)), (182896.0, (2400, 9, 30)), (182897.0, (2400, 10, 1)), (182927.0, (2400, 10, 31)), (182928.0, (2400, 11, 1)), 
(182957.0, (2400, 11, 30)), (182958.0, (2400, 12, 1)), (182988.0, (2400, 12, 31)), (767011.0, (4000, 1, 1)), (767041.0, (4000, 1, 31)), (767042.0, (4000, 2, 1)), (767070.0, (4000, 2, 29)), (767071.0, (4000, 3, 1)), (767101.0, (4000, 3, 31)), (767102.0, (4000, 4, 1)), (767131.0, (4000, 4, 30)), (767132.0, (4000, 5, 1)), (767162.0, (4000, 5, 31)), (767163.0, (4000, 6, 1)), (767192.0, (4000, 6, 30)), (767193.0, (4000, 7, 1)), (767223.0, (4000, 7, 31)), (767224.0, (4000, 8, 1)), (767254.0, (4000, 8, 31)), (767255.0, (4000, 9, 1)), (767284.0, (4000, 9, 30)), (767285.0, (4000, 10, 1)), (767315.0, (4000, 10, 31)), (767316.0, (4000, 11, 1)), (767345.0, (4000, 11, 30)), (767346.0, (4000, 12, 1)), (767376.0, (4000, 12, 31)), (884254.0, (4321, 1, 1)), (884284.0, (4321, 1, 31)), (884285.0, (4321, 2, 1)), (884312.0, (4321, 2, 28)), (884313.0, (4321, 3, 1)), (884343.0, (4321, 3, 31)), (884344.0, (4321, 4, 1)), (884373.0, (4321, 4, 30)), (884374.0, (4321, 5, 1)), (884404.0, (4321, 5, 31)), (884405.0, (4321, 6, 1)), (884434.0, (4321, 6, 30)), (884435.0, (4321, 7, 1)), (884465.0, (4321, 7, 31)), (884466.0, (4321, 8, 1)), (884496.0, (4321, 8, 31)), (884497.0, (4321, 9, 1)), (884526.0, (4321, 9, 30)), (884527.0, (4321, 10, 1)), (884557.0, (4321, 10, 31)), (884558.0, (4321, 11, 1)), (884587.0, (4321, 11, 30)), (884588.0, (4321, 12, 1)), (884618.0, (4321, 12, 31)), (2958101.0, (9999, 1, 1)), (2958131.0, (9999, 1, 31)), (2958132.0, (9999, 2, 1)), (2958159.0, (9999, 2, 28)), (2958160.0, (9999, 3, 1)), (2958190.0, (9999, 3, 31)), (2958191.0, (9999, 4, 1)), (2958220.0, (9999, 4, 30)), (2958221.0, (9999, 5, 1)), (2958251.0, (9999, 5, 31)), (2958252.0, (9999, 6, 1)), (2958281.0, (9999, 6, 30)), (2958282.0, (9999, 7, 1)), (2958312.0, (9999, 7, 31)), (2958313.0, (9999, 8, 1)), (2958343.0, (9999, 8, 31)), (2958344.0, (9999, 9, 1)), (2958373.0, (9999, 9, 30)), (2958374.0, (9999, 10, 1)), (2958404.0, (9999, 10, 31)), (2958405.0, (9999, 11, 1)), (2958434.0, (9999, 11, 30)), (2958435.0, (9999, 12, 
1)), (2958465.0, (9999, 12, 31)), ]; for test in test_data { let (excel_serial_datetime, expected) = test; let datetime = ExcelDateTime::new(excel_serial_datetime, ExcelDateTimeType::DateTime, false); let got = datetime.to_ymd_hms_milli(); let got = (got.0, got.1, got.2); // Date parts only. assert_eq!(expected, got); } } #[test] fn test_dates_only_1904_epoch() { let test_data = vec![(0.0, (1904, 1, 1))]; for test in test_data { let (excel_serial_datetime, expected) = test; let datetime = ExcelDateTime::new(excel_serial_datetime, ExcelDateTimeType::DateTime, true); let got = datetime.to_ymd_hms_milli(); let got = (got.0, got.1, got.2); // Date parts only. assert_eq!(expected, got); } } #[test] fn test_times_only_both_epochs() { #[allow(clippy::excessive_precision)] let test_data = vec![ (0.0, (0, 0, 0, 0)), (1.0650613425925924e-2, (0, 15, 20, 213)), (1.1670023148148148e-2, (0, 16, 48, 290)), (3.8488958333333337e-2, (0, 55, 25, 446)), (4.3598275462962965e-2, (1, 2, 46, 891)), (4.4624965277777782e-2, (1, 4, 15, 597)), (4.8389918981481483e-2, (1, 9, 40, 889)), (4.9682407407407404e-2, (1, 11, 32, 560)), (6.2721863425925936e-2, (1, 30, 19, 169)), (7.5296064814814809e-2, (1, 48, 25, 580)), (8.5786099537037031e-2, (2, 3, 31, 919)), (9.1110949074074077e-2, (2, 11, 11, 986)), (0.10042934027777778, (2, 24, 37, 95)), (0.10772245370370370, (2, 35, 7, 220)), (0.11472348379629631, (2, 45, 12, 109)), (0.12962951388888888, (3, 6, 39, 990)), (0.13065105324074075, (3, 8, 8, 251)), (0.13833999999999999, (3, 19, 12, 576)), (0.14563164351851851, (3, 29, 42, 574)), (0.15105107638888890, (3, 37, 30, 813)), (0.17683137731481480, (4, 14, 38, 231)), (0.17810832175925925, (4, 16, 28, 559)), (0.17914608796296297, (4, 17, 58, 222)), (0.18173372685185185, (4, 21, 41, 794)), (0.20596981481481480, (4, 56, 35, 792)), (0.22586672453703704, (5, 25, 14, 885)), (0.22645513888888891, (5, 26, 5, 724)), (0.24078782407407406, (5, 46, 44, 68)), (0.24167987268518520, (5, 48, 1, 141)), (0.24574438657407408, 
(5, 53, 52, 315)), (0.26028449074074073, (6, 14, 48, 580)), (0.28212659722222222, (6, 46, 15, 738)), (0.31343063657407405, (7, 31, 20, 407)), (0.33233511574074076, (7, 58, 33, 754)), (0.33869363425925925, (8, 7, 43, 130)), (0.35360059027777774, (8, 29, 11, 91)), (0.38073296296296300, (9, 8, 15, 328)), (0.39631690972222228, (9, 30, 41, 781)), (0.39866275462962958, (9, 34, 4, 462)), (0.40097158564814817, (9, 37, 23, 945)), (0.40135017361111114, (9, 37, 56, 655)), (0.40639155092592594, (9, 45, 12, 230)), (0.41267108796296298, (9, 54, 14, 782)), (0.41275587962962962, (9, 54, 22, 108)), (0.41777952546296299, (10, 1, 36, 151)), (0.50681252314814818, (12, 9, 48, 602)), (0.52371005787037039, (12, 34, 8, 549)), (0.53896406249999995, (12, 56, 6, 495)), (0.54095158564814816, (12, 58, 58, 217)), (0.54160026620370372, (12, 59, 54, 263)), (0.56575614583333333, (13, 34, 41, 331)), (0.58227547453703699, (13, 58, 28, 601)), (0.58491781249999997, (14, 2, 16, 899)), (0.60853523148148148, (14, 36, 17, 444)), (0.60969271990740748, (14, 37, 57, 451)), (0.62341153935185190, (14, 57, 42, 757)), (0.63250355324074070, (15, 10, 48, 307)), (0.63518391203703706, (15, 14, 39, 890)), (0.63874986111111109, (15, 19, 47, 988)), (0.66972620370370362, (16, 4, 24, 344)), (0.68222166666666662, (16, 22, 23, 952)), (0.68745369212962970, (16, 29, 55, 999)), (0.70717892361111112, (16, 58, 20, 259)), (0.71113906250000003, (17, 4, 2, 415)), (0.72117627314814825, (17, 18, 29, 630)), (0.74121901620370367, (17, 47, 21, 323)), (0.74548456018518516, (17, 53, 29, 866)), (0.74561430555555563, (17, 53, 41, 76)), (0.74659773148148145, (17, 55, 6, 44)), (0.76029109953703700, (18, 14, 49, 151)), (0.76233493055555546, (18, 17, 45, 738)), (0.77082986111111118, (18, 29, 59, 700)), (0.77316241898148153, (18, 33, 21, 233)), (0.80167445601851861, (19, 14, 24, 673)), (0.80362055555555545, (19, 17, 12, 816)), (0.80806039351851855, (19, 23, 36, 418)), (0.82391097222222232, (19, 46, 25, 908)), (0.83874206018518516, (20, 7, 47, 
314)), (0.85529633101851854, (20, 31, 37, 603)), (0.86108530092592594, (20, 39, 57, 770)), (0.86825309027777775, (20, 50, 17, 67)), (0.87705818287037041, (21, 2, 57, 827)), (0.89103609953703700, (21, 23, 5, 519)), (0.89918486111111118, (21, 34, 49, 572)), (0.90215212962962965, (21, 39, 5, 944)), (0.90229659722222222, (21, 39, 18, 426)), (0.90703436342592603, (21, 46, 7, 769)), (0.91522756944444439, (21, 57, 55, 662)), (0.92999689814814823, (22, 19, 11, 732)), (0.93323351851851843, (22, 23, 51, 376)), (0.93609688657407408, (22, 27, 58, 771)), (0.94687953703703709, (22, 43, 30, 392)), (0.95029900462962968, (22, 48, 25, 834)), (0.95407091435185187, (22, 53, 51, 727)), (0.96732101851851848, (23, 12, 56, 536)), (0.96937626157407408, (23, 15, 54, 109)), (0.97028509259259266, (23, 17, 12, 632)), (0.99999998842592586, (23, 59, 59, 999)), ]; for test in test_data { let (excel_serial_datetime, expected) = test; // 1900 epoch. let datetime = ExcelDateTime::new(excel_serial_datetime, ExcelDateTimeType::DateTime, false); let got = datetime.to_ymd_hms_milli(); let got = (got.3, got.4, got.5, got.6); // Time parts only. assert_eq!(expected, got); // 1904 epoch. let datetime = ExcelDateTime::new(excel_serial_datetime, ExcelDateTimeType::DateTime, true); let got = datetime.to_ymd_hms_milli(); let got = (got.3, got.4, got.5, got.6); // Time parts only. assert_eq!(expected, got); } } } calamine-0.34.0/src/de.rs000064400000000000000000000627751046102023000132270ustar 00000000000000// SPDX-License-Identifier: MIT // // Copyright 2016-2025, Johann Tuffe. 
use serde::de::value::BorrowedStrDeserializer;
use serde::de::{self, DeserializeOwned, DeserializeSeed, SeqAccess, Visitor};
use serde::{forward_to_deserialize_any, Deserialize, Deserializer};
use std::marker::PhantomData;
use std::{fmt, slice, str};

use super::{CellErrorType, CellType, Data, Range, Rows};

/// A cell deserialization specific error enum
#[derive(Debug)]
pub enum DeError {
    /// Cell out of range
    CellOutOfRange {
        /// Position tried
        try_pos: (u32, u32),
        /// Minimum position
        min_pos: (u32, u32),
    },
    /// The cell value is an error
    CellError {
        /// Cell value error
        err: CellErrorType,
        /// Cell position
        pos: (u32, u32),
    },
    /// Unexpected end of row
    UnexpectedEndOfRow {
        /// Cell position
        pos: (u32, u32),
    },
    /// Required header not found
    HeaderNotFound(String),
    /// Serde specific error
    Custom(String),
}

impl fmt::Display for DeError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
        match self {
            DeError::CellOutOfRange { try_pos, min_pos } => write!(
                f,
                "there is no cell at position '{try_pos:?}'. Minimum position is '{min_pos:?}'"
            ),
            DeError::CellError { pos, err } => {
                write!(f, "Cell error at position '{pos:?}': {err}")
            }
            DeError::UnexpectedEndOfRow { pos } => {
                write!(f, "Unexpected end of row at position '{pos:?}'")
            }
            DeError::HeaderNotFound(header) => {
                write!(f, "Cannot find header named '{header}'")
            }
            DeError::Custom(s) => write!(f, "{s}"),
        }
    }
}

impl std::error::Error for DeError {
    // None of the variants wraps another `std::error::Error`, so there is no source.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        None
    }
}

impl de::Error for DeError {
    /// Wraps any displayable serde message into [`DeError::Custom`].
    fn custom<T: fmt::Display>(msg: T) -> Self {
        DeError::Custom(msg.to_string())
    }
}

/// How the first row of a range is interpreted by the deserializer.
#[derive(Clone)]
pub enum Headers<'h, H> {
    /// No header row: every row is data.
    None,
    /// The first row is a header row and every column is kept.
    All,
    /// The first row is a header row; only the listed headers are kept,
    /// in the order given.
    Custom(&'h [H]),
}

/// Builds a `Range` deserializer with some configuration options.
///
/// This can be used to optionally parse the first row as a header. Once built,
/// a `RangeDeserializer` cannot be changed.
#[derive(Clone)] pub struct RangeDeserializerBuilder<'h, H> { headers: Headers<'h, H>, } impl Default for RangeDeserializerBuilder<'static, &'static str> { fn default() -> Self { RangeDeserializerBuilder { headers: Headers::All, } } } impl RangeDeserializerBuilder<'static, &'static str> { /// Constructs a new builder for configuring `Range` deserialization. pub fn new() -> Self { Default::default() } /// Decide whether to treat the first row as a special header row. /// /// # Example /// /// ``` /// # use calamine::{Data, Error, open_workbook, Xlsx, Reader, RangeDeserializerBuilder}; /// fn main() -> Result<(), Error> { /// let path = format!("{}/tests/temperature.xlsx", env!("CARGO_MANIFEST_DIR")); /// let mut workbook: Xlsx<_> = open_workbook(path)?; /// let range = workbook.worksheet_range("Sheet1")?; /// /// let mut iter = RangeDeserializerBuilder::new() /// .has_headers(false) /// .from_range(&range)?; /// /// if let Some(result) = iter.next() { /// let row: Vec = result?; /// assert_eq!(row, [Data::from("label"), Data::from("value")]); /// } else { /// return Err(From::from("expected at least three records but got none")); /// } /// /// if let Some(result) = iter.next() { /// let row: Vec = result?; /// assert_eq!(row, [Data::from("celsius"), Data::from(22.2222)]); /// } else { /// return Err(From::from("expected at least three records but got one")); /// } /// /// Ok(()) /// } /// ``` pub fn has_headers(&mut self, yes: bool) -> &mut Self { if yes { self.headers = Headers::All; } else { self.headers = Headers::None; } self } } impl<'h, H: AsRef + Clone + 'h> RangeDeserializerBuilder<'h, H> { /// Build a `RangeDeserializer` from this configuration and keep only selected headers. 
/// /// # Example /// /// ``` /// # use calamine::{open_workbook, Error, Xlsx, Reader, RangeDeserializerBuilder}; /// fn main() -> Result<(), Error> { /// let path = format!("{}/tests/temperature.xlsx", env!("CARGO_MANIFEST_DIR")); /// let mut workbook: Xlsx<_> = open_workbook(path)?; /// let range = workbook.worksheet_range("Sheet1")?; /// let mut iter = RangeDeserializerBuilder::with_headers(&["value", "label"]).from_range(&range)?; /// /// if let Some(result) = iter.next() { /// let (value, label): (f64, String) = result?; /// assert_eq!(label, "celsius"); /// assert_eq!(value, 22.2222); /// /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub fn with_headers(headers: &'h [H]) -> Self { RangeDeserializerBuilder { headers: Headers::Custom(headers), } } /// Build a `RangeDeserializer` from this configuration. /// /// # Example /// /// ``` /// # use calamine::{open_workbook, Error, Xlsx, Reader, RangeDeserializerBuilder}; /// fn main() -> Result<(), Error> { /// let path = format!("{}/tests/temperature.xlsx", env!("CARGO_MANIFEST_DIR")); /// let mut workbook: Xlsx<_> = open_workbook(path)?; /// let range = workbook.worksheet_range("Sheet1")?; /// let mut iter = RangeDeserializerBuilder::new().from_range(&range)?; /// /// if let Some(result) = iter.next() { /// let (label, value): (String, f64) = result?; /// assert_eq!(label, "celsius"); /// assert_eq!(value, 22.2222); /// /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub fn from_range<'cell, T, D>( &self, range: &'cell Range, ) -> Result, DeError> where T: ToCellDeserializer<'cell>, D: DeserializeOwned, { RangeDeserializer::new(self, range) } } impl<'h> RangeDeserializerBuilder<'h, &str> { /// Build a `RangeDeserializer` from this configuration and keep only selected headers /// from the specified deserialization struct. 
/// /// # Example /// /// ``` /// # use calamine::{open_workbook, Error, RangeDeserializerBuilder, Reader, Xlsx}; /// # use serde_derive::Deserialize; /// #[derive(Deserialize)] /// struct Record { /// label: String, /// value: f64, /// } /// /// fn main() -> Result<(), Error> { /// let path = format!("{}/tests/temperature.xlsx", env!("CARGO_MANIFEST_DIR")); /// let mut workbook: Xlsx<_> = open_workbook(path)?; /// let range = workbook.worksheet_range("Sheet1")?; /// let mut iter = /// RangeDeserializerBuilder::with_deserialize_headers::().from_range(&range)?; /// /// if let Some(result) = iter.next() { /// let record: Record = result?; /// assert_eq!(record.label, "celsius"); /// assert_eq!(record.value, 22.2222); /// /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub fn with_deserialize_headers<'de, T>() -> Self where T: Deserialize<'de>, { struct StructFieldsDeserializer<'h> { fields: &'h mut Option<&'static [&'static str]>, } impl<'de, 'h> Deserializer<'de> for StructFieldsDeserializer<'h> { type Error = de::value::Error; fn deserialize_any(self, _visitor: V) -> Result where V: Visitor<'de>, { Err(de::Error::custom("I'm just here for the fields")) } fn deserialize_struct( self, _name: &'static str, fields: &'static [&'static str], _visitor: V, ) -> Result where V: Visitor<'de>, { *self.fields = Some(fields); // get the names of the deserialized fields Err(de::Error::custom("I'm just here for the fields")) } serde::forward_to_deserialize_any! { bool i8 i16 i32 i64 u8 u16 u32 u64 f32 f64 char str string bytes byte_buf option unit unit_struct newtype_struct seq tuple tuple_struct map enum identifier ignored_any } } let mut serialized_names = None; let _ = T::deserialize(StructFieldsDeserializer { fields: &mut serialized_names, }); let headers = serialized_names.unwrap_or_default(); Self::with_headers(headers) } } /// A configured `Range` deserializer. 
/// /// # Example /// /// ``` /// # use calamine::{open_workbook, Error, Xlsx, Reader, RangeDeserializerBuilder}; /// fn main() -> Result<(), Error> { /// let path = format!("{}/tests/temperature.xlsx", env!("CARGO_MANIFEST_DIR")); /// let mut workbook: Xlsx<_> = open_workbook(path)?; /// let range = workbook.worksheet_range("Sheet1")?; /// /// let mut iter = RangeDeserializerBuilder::new().from_range(&range)?; /// /// if let Some(result) = iter.next() { /// let (label, value): (String, f64) = result?; /// assert_eq!(label, "celsius"); /// assert_eq!(value, 22.2222); /// Ok(()) /// } else { /// Err(From::from("expected at least one record but got none")) /// } /// } /// ``` pub struct RangeDeserializer<'cell, T, D> where T: ToCellDeserializer<'cell>, D: DeserializeOwned, { column_indexes: Vec, headers: Option>, rows: Rows<'cell, T>, current_pos: (u32, u32), end_pos: (u32, u32), _priv: PhantomData, } impl<'cell, T, D> RangeDeserializer<'cell, T, D> where T: ToCellDeserializer<'cell>, D: DeserializeOwned, { fn new<'h, H: AsRef + Clone + 'h>( builder: &RangeDeserializerBuilder<'h, H>, range: &'cell Range, ) -> Result { let mut rows = range.rows(); let mut current_pos = range.start().unwrap_or((0, 0)); let end_pos = range.end().unwrap_or((0, 0)); let (column_indexes, headers) = match builder.headers { Headers::None => ((0..range.width()).collect(), None), Headers::All => { if let Some(row) = rows.next() { let all_indexes = (0..row.len()).collect::>(); let all_headers = { let de = RowDeserializer::new(&all_indexes, None, row, current_pos); current_pos.0 += 1; Deserialize::deserialize(de)? 
}; (all_indexes, Some(all_headers)) } else { (Vec::new(), None) } } Headers::Custom(headers) => { if let Some(row) = rows.next() { let all_indexes = (0..row.len()).collect::>(); let de = RowDeserializer::new(&all_indexes, None, row, current_pos); current_pos.0 += 1; let all_headers: Vec = Deserialize::deserialize(de)?; let custom_indexes = headers .iter() .map(|h| h.as_ref().trim()) .map(|h| { all_headers .iter() .position(|header| header.trim() == h) .ok_or_else(|| DeError::HeaderNotFound(h.to_owned())) }) .collect::, DeError>>()?; (custom_indexes, Some(all_headers)) } else { (Vec::new(), None) } } }; Ok(RangeDeserializer { column_indexes, headers, rows, current_pos, end_pos, _priv: PhantomData, }) } } impl<'cell, T, D> Iterator for RangeDeserializer<'cell, T, D> where T: ToCellDeserializer<'cell>, D: DeserializeOwned, { type Item = Result; fn next(&mut self) -> Option { let RangeDeserializer { column_indexes, headers, rows, mut current_pos, .. } = self; if let Some(row) = rows.next() { current_pos.0 += 1; let headers = headers.as_ref().map(|h| &**h); let de = RowDeserializer::new(column_indexes, headers, row, current_pos); Some(Deserialize::deserialize(de)) } else { None } } fn size_hint(&self) -> (usize, Option) { let remaining = (self.end_pos.0 - self.current_pos.0) as usize; (remaining, Some(remaining)) } } struct RowDeserializer<'header, 'cell, T> { cells: &'cell [T], headers: Option<&'header [String]>, iter: slice::Iter<'header, usize>, // iterator over column indexes peek: Option, pos: (u32, u32), } impl<'header, 'cell, T> RowDeserializer<'header, 'cell, T> where T: 'cell + ToCellDeserializer<'cell>, { fn new( column_indexes: &'header [usize], headers: Option<&'header [String]>, cells: &'cell [T], pos: (u32, u32), ) -> Self { RowDeserializer { iter: column_indexes.iter(), headers, cells, pos, peek: None, } } fn has_headers(&self) -> bool { self.headers.is_some() } } impl<'de, 'header, 'cell, T> serde::Deserializer<'de> for RowDeserializer<'header, 'cell, T> 
where 'header: 'de, 'cell: 'de, T: 'cell + ToCellDeserializer<'cell>, { type Error = DeError; fn deserialize_any(self, visitor: V) -> Result where V: Visitor<'de>, { visitor.visit_seq(self) } fn deserialize_map>(self, visitor: V) -> Result { if self.has_headers() { visitor.visit_map(self) } else { visitor.visit_seq(self) } } fn deserialize_struct>( self, _name: &'static str, _cells: &'static [&'static str], visitor: V, ) -> Result { if self.has_headers() { visitor.visit_map(self) } else { visitor.visit_seq(self) } } forward_to_deserialize_any! { bool i8 i16 i32 i64 u8 u16 u32 u64 f32 f64 char str string bytes byte_buf option unit unit_struct newtype_struct seq tuple tuple_struct enum identifier ignored_any } } impl<'de, 'header, 'cell, T> SeqAccess<'de> for RowDeserializer<'header, 'cell, T> where 'header: 'de, 'cell: 'de, T: ToCellDeserializer<'cell>, { type Error = DeError; fn next_element_seed(&mut self, seed: D) -> Result, Self::Error> where D: DeserializeSeed<'de>, { match self.iter.next().map(|i| &self.cells[*i]) { Some(value) => { let de = value.to_cell_deserializer(self.pos); seed.deserialize(de).map(Some) } None => Ok(None), } } fn size_hint(&self) -> Option { match self.iter.size_hint() { (lower, Some(upper)) if lower == upper => Some(upper), _ => None, } } } impl<'de, 'header: 'de, 'cell: 'de, T> de::MapAccess<'de> for RowDeserializer<'header, 'cell, T> where 'header: 'de, 'cell: 'de, T: ToCellDeserializer<'cell>, { type Error = DeError; fn next_key_seed>( &mut self, seed: K, ) -> Result, Self::Error> { let headers = self .headers .expect("Cannot map-deserialize range without headers"); for i in self.iter.by_ref() { if !self.cells[*i].is_empty() { self.peek = Some(*i); let de = BorrowedStrDeserializer::::new(&headers[*i]); return seed.deserialize(de).map(Some); } } Ok(None) } fn next_value_seed>( &mut self, seed: K, ) -> Result { let cell = self .peek .take() .map(|i| &self.cells[i]) .ok_or(DeError::UnexpectedEndOfRow { pos: self.pos })?; let de = 
cell.to_cell_deserializer(self.pos); seed.deserialize(de) } } /// Constructs a deserializer for a `CellType`. pub trait ToCellDeserializer<'a>: CellType { /// The deserializer. type Deserializer: for<'de> serde::Deserializer<'de, Error = DeError>; /// Construct a `CellType` deserializer at the specified position. fn to_cell_deserializer(&'a self, pos: (u32, u32)) -> Self::Deserializer; /// Assess if the cell is empty. fn is_empty(&self) -> bool; } impl<'a> ToCellDeserializer<'a> for Data { type Deserializer = DataDeserializer<'a>; fn to_cell_deserializer(&'a self, pos: (u32, u32)) -> DataDeserializer<'a> { DataDeserializer { data_type: self, pos, } } #[inline] fn is_empty(&self) -> bool { matches!(self, Data::Empty) } } macro_rules! deserialize_num { ($typ:ty, $method:ident, $visit:ident) => { fn $method(self, visitor: V) -> Result where V: Visitor<'de>, { match self.data_type { Data::Float(v) => visitor.$visit(*v as $typ), Data::Int(v) => visitor.$visit(*v as $typ), Data::String(s) => { let v = s.parse().map_err(|_| { DeError::Custom(format!("Expecting {}, got '{}'", stringify!($typ), s)) })?; visitor.$visit(v) } Data::Error(err) => Err(DeError::CellError { err: err.clone(), pos: self.pos, }), d => Err(DeError::Custom(format!( "Expecting {}, got {:?}", stringify!($typ), d ))), } } }; } /// A deserializer for the `Data` type. 
pub struct DataDeserializer<'a> {
    data_type: &'a Data,
    pos: (u32, u32),
}

impl<'a, 'de> serde::Deserializer<'de> for DataDeserializer<'a> {
    type Error = DeError;

    fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
    where
        V: Visitor<'de>,
    {
        match self.data_type {
            Data::String(v) => visitor.visit_str(v),
            Data::Float(v) => visitor.visit_f64(*v),
            Data::Bool(v) => visitor.visit_bool(*v),
            Data::Int(v) => visitor.visit_i64(*v),
            Data::Empty => visitor.visit_unit(),
            Data::DateTime(v) => visitor.visit_f64(v.as_f64()),
            Data::DateTimeIso(v) => visitor.visit_str(v),
            Data::DurationIso(v) => visitor.visit_str(v),
            Data::Error(err) => Err(DeError::CellError {
                err: err.clone(),
                pos: self.pos,
            }),
        }
    }

    // Every non-error cell can be rendered as a string; empty cells become "".
    fn deserialize_str<V>(self, visitor: V) -> Result<V::Value, Self::Error>
    where
        V: Visitor<'de>,
    {
        match self.data_type {
            Data::String(v) => visitor.visit_str(v),
            Data::Empty => visitor.visit_str(""),
            Data::Float(v) => visitor.visit_str(&v.to_string()),
            Data::Int(v) => visitor.visit_str(&v.to_string()),
            Data::Bool(v) => visitor.visit_str(&v.to_string()),
            Data::DateTime(v) => visitor.visit_str(&v.to_string()),
            Data::DateTimeIso(v) => visitor.visit_str(v),
            Data::DurationIso(v) => visitor.visit_str(v),
            Data::Error(err) => Err(DeError::CellError {
                err: err.clone(),
                pos: self.pos,
            }),
        }
    }

    fn deserialize_bytes<V>(self, visitor: V) -> Result<V::Value, Self::Error>
    where
        V: Visitor<'de>,
    {
        match self.data_type {
            Data::String(v) => visitor.visit_bytes(v.as_bytes()),
            Data::Empty => visitor.visit_bytes(&[]),
            Data::Error(err) => Err(DeError::CellError {
                err: err.clone(),
                pos: self.pos,
            }),
            d => Err(DeError::Custom(format!("Expecting bytes, got {d:?}"))),
        }
    }

    fn deserialize_byte_buf<V>(self, visitor: V) -> Result<V::Value, Self::Error>
    where
        V: Visitor<'de>,
    {
        self.deserialize_bytes(visitor)
    }

    fn deserialize_string<V>(self, visitor: V) -> Result<V::Value, Self::Error>
    where
        V: Visitor<'de>,
    {
        self.deserialize_str(visitor)
    }

    // Booleans accept the usual spellings as strings; numbers are true when
    // non-zero and empty cells are false.
    fn deserialize_bool<V>(self, visitor: V) -> Result<V::Value, Self::Error>
    where
        V: Visitor<'de>,
    {
        match self.data_type {
            Data::Bool(v) => visitor.visit_bool(*v),
            Data::String(v) => match &**v {
                "TRUE" | "true" | "True" => visitor.visit_bool(true),
                "FALSE" | "false" | "False" => visitor.visit_bool(false),
                d => Err(DeError::Custom(format!("Expecting bool, got '{d}'"))),
            },
            Data::Empty => visitor.visit_bool(false),
            Data::Float(v) => visitor.visit_bool(*v != 0.),
            Data::Int(v) => visitor.visit_bool(*v != 0),
            Data::DateTime(v) => visitor.visit_bool(v.as_f64() != 0.),
            Data::DateTimeIso(_) => visitor.visit_bool(true),
            Data::DurationIso(_) => visitor.visit_bool(true),
            Data::Error(err) => Err(DeError::CellError {
                err: err.clone(),
                pos: self.pos,
            }),
        }
    }

    fn deserialize_char<V>(self, visitor: V) -> Result<V::Value, Self::Error>
    where
        V: Visitor<'de>,
    {
        match self.data_type {
            Data::String(s) => {
                // Accept exactly one `char`. Counting characters rather than
                // bytes lets single non-ASCII characters through.
                let mut chars = s.chars();
                match (chars.next(), chars.next()) {
                    (Some(c), None) => visitor.visit_char(c),
                    _ => Err(DeError::Custom(format!(
                        "Expecting char, got {:?}",
                        self.data_type
                    ))),
                }
            }
            Data::Error(err) => Err(DeError::CellError {
                err: err.clone(),
                pos: self.pos,
            }),
            d => Err(DeError::Custom(format!("Expecting char, got {d:?}"))),
        }
    }

    fn deserialize_unit<V>(self, visitor: V) -> Result<V::Value, Self::Error>
    where
        V: Visitor<'de>,
    {
        match self.data_type {
            Data::Empty => visitor.visit_unit(),
            Data::Error(err) => Err(DeError::CellError {
                err: err.clone(),
                pos: self.pos,
            }),
            d => Err(DeError::Custom(format!("Expecting unit, got {d:?}"))),
        }
    }

    // Only an empty cell maps to `None`; anything else (errors included) is
    // forwarded to the inner type's deserialization.
    fn deserialize_option<V>(self, visitor: V) -> Result<V::Value, Self::Error>
    where
        V: Visitor<'de>,
    {
        match self.data_type {
            Data::Empty => visitor.visit_none(),
            _ => visitor.visit_some(self),
        }
    }

    fn deserialize_newtype_struct<V>(
        self,
        _name: &'static str,
        visitor: V,
    ) -> Result<V::Value, Self::Error>
    where
        V: Visitor<'de>,
    {
        visitor.visit_newtype_struct(self)
    }

    // Enums are matched by variant name from a string cell.
    fn deserialize_enum<V>(
        self,
        _name: &'static str,
        _variants: &'static [&'static str],
        visitor: V,
    ) -> Result<V::Value, Self::Error>
    where
        V: Visitor<'de>,
    {
        use serde::de::IntoDeserializer;

        match self.data_type {
            Data::String(s) => visitor.visit_enum(s.as_str().into_deserializer()),
            Data::Error(err) => Err(DeError::CellError {
                err: err.clone(),
                pos: self.pos,
            }),
            d => Err(DeError::Custom(format!("Expecting enum, got {d:?}"))),
        }
    }

    deserialize_num!(i64, deserialize_i64, visit_i64);
    deserialize_num!(i32, deserialize_i32, visit_i32);
    deserialize_num!(i16, deserialize_i16, visit_i16);
    deserialize_num!(i8, deserialize_i8, visit_i8);
    deserialize_num!(u64, deserialize_u64, visit_u64);
    deserialize_num!(u32, deserialize_u32, visit_u32);
    deserialize_num!(u16, deserialize_u16, visit_u16);
    deserialize_num!(u8, deserialize_u8, visit_u8);
    deserialize_num!(f64, deserialize_f64, visit_f64);
    deserialize_num!(f32, deserialize_f32, visit_f32);

    forward_to_deserialize_any! {
        unit_struct seq tuple tuple_struct map struct identifier ignored_any
    }
}

#[cfg(test)]
mod tests {
    #[test]
    fn test_deserialize_enum() {
        use crate::ToCellDeserializer;
        use serde::Deserialize;

        #[derive(Debug, serde_derive::Deserialize, PartialEq)]
        enum Content {
            Foo,
        }

        assert_eq!(
            Content::deserialize(
                super::Data::String("Foo".to_string()).to_cell_deserializer((0, 0))
            )
            .unwrap(),
            Content::Foo
        );
    }

    #[test]
    fn test_deserialize_char() {
        use crate::ToCellDeserializer;
        use serde::Deserialize;

        // One character that needs two bytes in UTF-8 is still a single `char`.
        let cell = super::Data::String("\u{e9}".to_string());
        assert_eq!(
            char::deserialize(cell.to_cell_deserializer((0, 0))).unwrap(),
            '\u{e9}'
        );

        // Two characters and the empty string are rejected.
        let cell = super::Data::String("ab".to_string());
        assert!(char::deserialize(cell.to_cell_deserializer((0, 0))).is_err());
        let cell = super::Data::String(String::new());
        assert!(char::deserialize(cell.to_cell_deserializer((0, 0))).is_err());
    }
}
calamine-0.34.0/src/errors.rs000064400000000000000000000040451046102023000141350ustar 00000000000000// SPDX-License-Identifier: MIT
//
// Copyright 2016-2025, Johann Tuffe.

//!
A module to provide a convenient wrapper around all error types /// A struct to handle any error and a message #[derive(Debug)] pub enum Error { /// IO error Io(std::io::Error), /// Ods specific error Ods(crate::ods::OdsError), /// xls specific error Xls(crate::xls::XlsError), /// xlsb specific error Xlsb(crate::xlsb::XlsbError), /// xlsx specific error Xlsx(crate::xlsx::XlsxError), /// vba specific error Vba(crate::vba::VbaError), /// cfb specific error De(crate::de::DeError), /// General error message Msg(&'static str), } from_err!(std::io::Error, Error, Io); from_err!(crate::ods::OdsError, Error, Ods); from_err!(crate::xls::XlsError, Error, Xls); from_err!(crate::xlsb::XlsbError, Error, Xlsb); from_err!(crate::xlsx::XlsxError, Error, Xlsx); from_err!(crate::vba::VbaError, Error, Vba); from_err!(crate::de::DeError, Error, De); from_err!(&'static str, Error, Msg); impl std::fmt::Display for Error { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Error::Io(e) => write!(f, "I/O error: {e}"), Error::Ods(e) => write!(f, "Ods error: {e}"), Error::Xls(e) => write!(f, "Xls error: {e}"), Error::Xlsx(e) => write!(f, "Xlsx error: {e}"), Error::Xlsb(e) => write!(f, "Xlsb error: {e}"), Error::Vba(e) => write!(f, "Vba error: {e}"), Error::De(e) => write!(f, "Deserializer error: {e}"), Error::Msg(msg) => write!(f, "{msg}"), } } } impl std::error::Error for Error { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { Error::Io(e) => Some(e), Error::Ods(e) => Some(e), Error::Xls(e) => Some(e), Error::Xlsb(e) => Some(e), Error::Xlsx(e) => Some(e), Error::Vba(e) => Some(e), Error::De(e) => Some(e), Error::Msg(_) => None, } } } calamine-0.34.0/src/formats.rs000064400000000000000000000150141046102023000142720ustar 00000000000000// SPDX-License-Identifier: MIT // // Copyright 2016-2025, Johann Tuffe. 
use crate::datatype::{Data, DataRef, ExcelDateTime, ExcelDateTimeType}; #[derive(Debug, Clone, Copy, PartialEq)] pub enum CellFormat { Other, DateTime, TimeDelta, } /// Check excel number format is datetime pub fn detect_custom_number_format(format: &str) -> CellFormat { let mut escaped = false; let mut is_quote = false; let mut brackets = 0u8; let mut prev = ' '; let mut hms = false; let mut ap = false; for s in format.chars() { match (s, escaped, is_quote, ap, brackets) { (_, true, ..) => escaped = false, // if escaped, ignore ('_' | '\\', ..) => escaped = true, ('"', _, true, _, _) => is_quote = false, (_, _, true, _, _) => (), ('"', _, _, _, _) => is_quote = true, (';', ..) => return CellFormat::Other, // first format only ('[', ..) => brackets += 1, (']', .., 1) if hms => return CellFormat::TimeDelta, // if closing (']', ..) => brackets = brackets.saturating_sub(1), ('a' | 'A', _, _, false, 0) => ap = true, ('p' | 'm' | '/' | 'P' | 'M', _, _, true, 0) => return CellFormat::DateTime, ('d' | 'm' | 'h' | 'y' | 's' | 'D' | 'M' | 'H' | 'Y' | 'S', _, _, false, 0) => { return CellFormat::DateTime } _ => { if hms && s.eq_ignore_ascii_case(&prev) { // ok ... 
} else { hms = prev == '[' && matches!(s, 'm' | 'h' | 's' | 'M' | 'H' | 'S'); } } } prev = s; } CellFormat::Other } pub fn builtin_format_by_id(id: &[u8]) -> CellFormat { match id { // mm-dd-yy b"14" | // d-mmm-yy b"15" | // d-mmm b"16" | // mmm-yy b"17" | // h:mm AM/PM b"18" | // h:mm:ss AM/PM b"19" | // h:mm b"20" | // h:mm:ss b"21" | // m/d/yy h:mm b"22" | // mm:ss b"45" | // mmss.0 b"47" => CellFormat::DateTime, // [h]:mm:ss b"46" => CellFormat::TimeDelta, _ => CellFormat::Other } } /// Check if code corresponds to builtin date format /// /// See `is_builtin_date_format_id` pub fn builtin_format_by_code(code: u16) -> CellFormat { match code { 14..=22 | 45 | 47 => CellFormat::DateTime, 46 => CellFormat::TimeDelta, _ => CellFormat::Other, } } // convert i64 to date, if format == Date pub fn format_excel_i64(value: i64, format: Option<&CellFormat>, is_1904: bool) -> Data { match format { Some(CellFormat::DateTime) => Data::DateTime(ExcelDateTime::new( value as f64, ExcelDateTimeType::DateTime, is_1904, )), Some(CellFormat::TimeDelta) => Data::DateTime(ExcelDateTime::new( value as f64, ExcelDateTimeType::TimeDelta, is_1904, )), _ => Data::Int(value), } } // convert f64 to date, if format == Date #[inline] pub fn format_excel_f64_ref( value: f64, format: Option<&CellFormat>, is_1904: bool, ) -> DataRef<'static> { match format { Some(CellFormat::DateTime) => DataRef::DateTime(ExcelDateTime::new( value, ExcelDateTimeType::DateTime, is_1904, )), Some(CellFormat::TimeDelta) => DataRef::DateTime(ExcelDateTime::new( value, ExcelDateTimeType::TimeDelta, is_1904, )), _ => DataRef::Float(value), } } // convert f64 to date, if format == Date pub fn format_excel_f64(value: f64, format: Option<&CellFormat>, is_1904: bool) -> Data { format_excel_f64_ref(value, format, is_1904).into() } /// Ported from openpyxl, MIT License /// https://foss.heptapod.net/openpyxl/openpyxl/-/blob/a5e197c530aaa49814fd1d993dd776edcec35105/openpyxl/styles/tests/test_number_style.py #[test] fn 
test_is_date_format() { assert_eq!( detect_custom_number_format("DD/MM/YY"), CellFormat::DateTime ); assert_eq!( detect_custom_number_format("H:MM:SS;@"), CellFormat::DateTime ); assert_eq!( detect_custom_number_format("#,##0\\ [$\\u20bd-46D]"), CellFormat::Other ); assert_eq!( detect_custom_number_format("m\"M\"d\"D\";@"), CellFormat::DateTime ); assert_eq!( detect_custom_number_format("[h]:mm:ss"), CellFormat::TimeDelta ); assert_eq!( detect_custom_number_format("\"Y: \"0.00\"m\";\"Y: \"-0.00\"m\";\"Y: m\";@"), CellFormat::Other ); assert_eq!( detect_custom_number_format("#,##0\\ [$''u20bd-46D]"), CellFormat::Other ); assert_eq!( detect_custom_number_format("\"$\"#,##0_);[Red](\"$\"#,##0)"), CellFormat::Other ); assert_eq!( detect_custom_number_format("[$-404]e\"\\xfc\"m\"\\xfc\"d\"\\xfc\""), CellFormat::DateTime ); assert_eq!( detect_custom_number_format("0_ ;[Red]\\-0\\ "), CellFormat::Other ); assert_eq!(detect_custom_number_format("\\Y000000"), CellFormat::Other); assert_eq!( detect_custom_number_format("#,##0.0####\" YMD\""), CellFormat::Other ); assert_eq!(detect_custom_number_format("[h]"), CellFormat::TimeDelta); assert_eq!(detect_custom_number_format("[ss]"), CellFormat::TimeDelta); assert_eq!( detect_custom_number_format("[s].000"), CellFormat::TimeDelta ); assert_eq!(detect_custom_number_format("[m]"), CellFormat::TimeDelta); assert_eq!(detect_custom_number_format("[mm]"), CellFormat::TimeDelta); assert_eq!( detect_custom_number_format("[Blue]\\+[h]:mm;[Red]\\-[h]:mm;[Green][h]:mm"), CellFormat::TimeDelta ); assert_eq!( detect_custom_number_format("[>=100][Magenta][s].00"), CellFormat::TimeDelta ); assert_eq!( detect_custom_number_format("[h]:mm;[=0]\\-"), CellFormat::TimeDelta ); assert_eq!( detect_custom_number_format("[>=100][Magenta].00"), CellFormat::Other ); assert_eq!( detect_custom_number_format("[>=100][Magenta]General"), CellFormat::Other ); assert_eq!( detect_custom_number_format("ha/p\\\\m"), CellFormat::DateTime ); assert_eq!( 
detect_custom_number_format("#,##0.00\\ _M\"H\"_);[Red]#,##0.00\\ _M\"S\"_)"), CellFormat::Other ); } calamine-0.34.0/src/lib.rs000064400000000000000000002164351046102023000133770ustar 00000000000000// SPDX-License-Identifier: MIT // // Copyright 2016-2025, Johann Tuffe. #![cfg_attr(docsrs, feature(doc_cfg))] //! Rust Excel/`OpenDocument` reader //! //! # Status //! //! **calamine** is a pure Rust library to read Excel and `OpenDocument` Spreadsheet files. //! //! Read both cell values and vba project. //! //! # Examples //! ``` //! use calamine::{Reader, open_workbook, Xlsx, Data}; //! //! // opens a new workbook //! # let path = format!("{}/tests/issue3.xlsm", env!("CARGO_MANIFEST_DIR")); //! let mut workbook: Xlsx<_> = open_workbook(path).expect("Cannot open file"); //! //! // Read whole worksheet data and provide some statistics //! if let Ok(range) = workbook.worksheet_range("Sheet1") { //! let total_cells = range.get_size().0 * range.get_size().1; //! let non_empty_cells: usize = range.used_cells().count(); //! println!("Found {total_cells} cells in 'Sheet1', including {non_empty_cells} non empty cells"); //! // alternatively, we can manually filter rows //! assert_eq!(non_empty_cells, range.rows() //! .flat_map(|r| r.iter().filter(|&c| c != &Data::Empty)).count()); //! } //! //! // Check if the workbook has a vba project //! if let Ok(Some(vba)) = workbook.vba_project() { //! let module1 = vba.get_module("Module 1").unwrap(); //! println!("Module 1 code:"); //! println!("{module1}"); //! for r in vba.get_references() { //! if r.is_missing() { //! println!("Reference {} is broken or not accessible", r.name); //! } //! } //! } //! //! // You can also get defined names definition (string representation only) //! for name in workbook.defined_names() { //! println!("name: {}, formula: {}", name.0, name.1); //! } //! //! // Now get all formula! //! let sheets = workbook.sheet_names().to_owned(); //! for s in sheets { //! println!("found {} formula in '{}'", //! 
workbook //! .worksheet_formula(&s) //! .expect("error while getting formula") //! .rows().flat_map(|r| r.iter().filter(|f| !f.is_empty())) //! .count(), //! s); //! } //! ``` //! //! //! # Crate Features //! //! The following is a list of the optional features supported by the `calamine` //! crate. They are all off by default. //! //! - `chrono`: Adds support for Chrono date/time types to the API. //! - `dates`: A deprecated backwards compatible synonym for the `chrono` feature. //! - `picture`: Adds support for reading raw data for pictures in spreadsheets. //! //! A `calamine` feature can be enabled in your `Cargo.toml` file as follows: //! //! ```bash //! cargo add calamine -F chrono //! ``` #[macro_use] mod utils; mod auto; mod cfb; mod datatype; mod formats; mod ods; mod xls; mod xlsb; mod xlsx; mod de; mod errors; pub mod changelog; pub mod vba; use serde::de::{Deserialize, DeserializeOwned, Deserializer}; use std::cmp::{max, min}; use std::fmt; use std::fs::File; use std::io::{BufReader, Read, Seek}; use std::ops::{Index, IndexMut}; use std::path::Path; pub use crate::auto::{open_workbook_auto, open_workbook_auto_from_rs, Sheets}; pub use crate::datatype::{Data, DataRef, DataType, ExcelDateTime, ExcelDateTimeType}; pub use crate::de::{DeError, RangeDeserializer, RangeDeserializerBuilder, ToCellDeserializer}; pub use crate::errors::Error; pub use crate::ods::{Ods, OdsError}; pub use crate::xls::{Xls, XlsError, XlsOptions}; pub use crate::xlsb::{Xlsb, XlsbError}; pub use crate::xlsx::{Xlsx, XlsxError}; use crate::vba::VbaProject; // https://msdn.microsoft.com/en-us/library/office/ff839168.aspx /// An enum to represent all different errors that can appear as /// a value in a worksheet cell #[derive(Debug, Clone, PartialEq)] pub enum CellErrorType { /// Division by 0 error Div0, /// Unavailable value error NA, /// Invalid name error Name, /// Null value error Null, /// Number error Num, /// Invalid cell reference error Ref, /// Value error Value, /// Getting 
data GettingData, } impl fmt::Display for CellErrorType { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { match *self { CellErrorType::Div0 => write!(f, "#DIV/0!"), CellErrorType::NA => write!(f, "#N/A"), CellErrorType::Name => write!(f, "#NAME?"), CellErrorType::Null => write!(f, "#NULL!"), CellErrorType::Num => write!(f, "#NUM!"), CellErrorType::Ref => write!(f, "#REF!"), CellErrorType::Value => write!(f, "#VALUE!"), CellErrorType::GettingData => write!(f, "#DATA!"), } } } /// Dimensions info #[derive(Debug, Default, PartialEq, Eq, Hash, Ord, PartialOrd, Copy, Clone)] pub struct Dimensions { /// start: (row, col) pub start: (u32, u32), /// end: (row, col) pub end: (u32, u32), } #[allow(clippy::len_without_is_empty)] impl Dimensions { /// create dimensions info with start position and end position pub fn new(start: (u32, u32), end: (u32, u32)) -> Self { Self { start, end } } /// check if a position is in it pub fn contains(&self, row: u32, col: u32) -> bool { row >= self.start.0 && row <= self.end.0 && col >= self.start.1 && col <= self.end.1 } /// len pub fn len(&self) -> u64 { (self.end.0 - self.start.0 + 1) as u64 * (self.end.1 - self.start.1 + 1) as u64 } } /// Common file metadata /// /// Depending on file type, some extra information may be stored /// in the Reader implementations #[derive(Debug, Default)] pub struct Metadata { sheets: Vec, /// Map of sheet names/sheet path within zip archive names: Vec<(String, String)>, } /// Type of sheet. /// /// Only Excel formats support this. Default value for ODS is /// `SheetType::WorkSheet`. /// /// The property is defined in the following specifications: /// /// - [ECMA-376 Part 1] 12.3.2, 12.3.7 and 12.3.24. /// - [MS-XLS `BoundSheet`]. /// - [MS-XLSB `ST_SheetType`]. 
/// /// [ECMA-376 Part 1]: https://www.ecma-international.org/publications-and-standards/standards/ecma-376/ /// [MS-XLS `BoundSheet`]: https://learn.microsoft.com/en-us/openspecs/office_file_formats/ms-xls/b9ec509a-235d-424e-871d-f8e721106501 /// [MS-XLS `BrtBundleSh`]: https://learn.microsoft.com/en-us/openspecs/office_file_formats/ms-xlsb/1edadf56-b5cd-4109-abe7-76651bbe2722 /// #[derive(Debug, Clone, Copy, PartialEq)] pub enum SheetType { /// A worksheet. WorkSheet, /// A dialog sheet. DialogSheet, /// A macro sheet. MacroSheet, /// A chartsheet. ChartSheet, /// A VBA module. Vba, } /// Type of visible sheet. /// /// The property is defined in the following specifications: /// /// - [ECMA-376 Part 1] 18.18.68 `ST_SheetState` (Sheet Visibility Types). /// - [MS-XLS `BoundSheet`]. /// - [MS-XLSB `ST_SheetState`]. /// - [OpenDocument v1.2] 19.471 `style:display`. /// /// [ECMA-376 Part 1]: https://www.ecma-international.org/publications-and-standards/standards/ecma-376/ /// [OpenDocument v1.2]: https://docs.oasis-open.org/office/v1.2/os/OpenDocument-v1.2-os-part1.html#property-table_display /// [MS-XLS `BoundSheet`]: https://learn.microsoft.com/en-us/openspecs/office_file_formats/ms-xls/b9ec509a-235d-424e-871d-f8e721106501 /// [MS-XLSB `ST_SheetState`]: https://learn.microsoft.com/en-us/openspecs/office_file_formats/ms-xlsb/74cb1d22-b931-4bf8-997d-17517e2416e9 /// #[derive(Debug, Clone, Copy, PartialEq)] pub enum SheetVisible { /// Visible Visible, /// Hidden Hidden, /// The sheet is hidden and cannot be displayed using the user interface. It is supported only by Excel formats. VeryHidden, } /// Metadata of sheet #[derive(Debug, Clone, PartialEq)] pub struct Sheet { /// Name pub name: String, /// Type /// Only Excel formats support this. Default value for ODS is `SheetType::WorkSheet`. 
pub typ: SheetType, /// Visible pub visible: SheetVisible, } /// Row to use as header /// By default, the first non-empty row is used as header #[derive(Debug, Default, Clone, Copy)] #[non_exhaustive] pub enum HeaderRow { /// First non-empty row #[default] FirstNonEmptyRow, /// Index of the header row Row(u32), } // FIXME `Reader` must only be seek `Seek` for `Xls::xls`. Because of the present API this limits // the kinds of readers (other) data in formats can be read from. /// A trait to share spreadsheets reader functions across different `FileType`s pub trait Reader: Sized where RS: Read + Seek, { /// Error specific to file type type Error: std::fmt::Debug + From; /// Creates a new instance. fn new(reader: RS) -> Result; /// Set header row (i.e. first row to be read) /// If `header_row` is `None`, the first non-empty row will be used as header row fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self; /// Gets `VbaProject` fn vba_project(&mut self) -> Result, Self::Error>; /// Initialize fn metadata(&self) -> &Metadata; /// Read worksheet data in corresponding worksheet path fn worksheet_range(&mut self, name: &str) -> Result, Self::Error>; /// Fetch all worksheet data & paths fn worksheets(&mut self) -> Vec<(String, Range)>; /// Read worksheet formula in corresponding worksheet path fn worksheet_formula(&mut self, _: &str) -> Result, Self::Error>; /// Get all sheet names of this workbook, in workbook order /// /// # Examples /// ``` /// use calamine::{Xlsx, open_workbook, Reader}; /// /// # let path = format!("{}/tests/issue3.xlsm", env!("CARGO_MANIFEST_DIR")); /// let mut workbook: Xlsx<_> = open_workbook(path).unwrap(); /// println!("Sheets: {:#?}", workbook.sheet_names()); /// ``` fn sheet_names(&self) -> Vec { self.metadata() .sheets .iter() .map(|s| s.name.to_owned()) .collect() } /// Fetch all sheets metadata fn sheets_metadata(&self) -> &[Sheet] { &self.metadata().sheets } /// Get all defined names (Ranges names etc) fn defined_names(&self) 
-> &[(String, String)] { &self.metadata().names } /// Get the nth worksheet. Shortcut for getting the nth /// worksheet name, then the corresponding worksheet. fn worksheet_range_at(&mut self, n: usize) -> Option, Self::Error>> { let name = self.sheet_names().get(n)?.to_string(); Some(self.worksheet_range(&name)) } /// Get the raw data of the pictures in a workbook. /// /// Returns a vector of tuples containing the file extension and a buffer of /// the image data. /// /// # Examples /// /// An example of getting the raw data of pictures in an spreadsheet file. /// /// ``` /// # use calamine::{open_workbook, Error, Reader, Xlsx}; /// # /// # fn main() -> Result<(), Error> { /// # let path = "tests/picture.xlsx"; /// # /// // Open the workbook. /// let workbook: Xlsx<_> = open_workbook(path)?; /// /// // Get the data for each picture. /// if let Some(pics) = workbook.pictures() { /// for (ext, data) in pics { /// println!("Type: '{}', Size: {} bytes", ext, data.len()); /// } /// } /// # /// # Ok(()) /// # } /// # /// ``` /// /// Output: /// /// ```text /// Type: 'jpg', Size: 20762 bytes /// Type: 'png', Size: 23453 bytes /// ``` /// #[cfg(feature = "picture")] #[cfg_attr(docsrs, doc(cfg(feature = "picture")))] fn pictures(&self) -> Option)>>; } /// A trait to share spreadsheets reader functions across different `FileType`s pub trait ReaderRef: Reader where RS: Read + Seek, { /// Get worksheet range where shared string values are only borrowed. /// /// This is implemented only for [`calamine::Xlsx`](crate::Xlsx) and [`calamine::Xlsb`](crate::Xlsb), as Xls and Ods formats /// do not support lazy iteration. fn worksheet_range_ref<'a>(&'a mut self, name: &str) -> Result>, Self::Error>; /// Get the nth worksheet range where shared string values are only borrowed. Shortcut for getting the nth /// worksheet name, then the corresponding worksheet. 
/// /// This is implemented only for [`calamine::Xlsx`](crate::Xlsx) and [`calamine::Xlsb`](crate::Xlsb), as Xls and Ods formats /// do not support lazy iteration. fn worksheet_range_at_ref( &mut self, n: usize, ) -> Option>, Self::Error>> { let name = self.sheet_names().get(n)?.to_string(); Some(self.worksheet_range_ref(&name)) } } /// Convenient function to open a file with a `BufReader`. pub fn open_workbook(path: P) -> Result where R: Reader>, P: AsRef, { let file = BufReader::new(File::open(path)?); R::new(file) } /// Convenient function to open a file with a `BufReader`. pub fn open_workbook_from_rs(rs: RS) -> Result where RS: Read + Seek, R: Reader, { R::new(rs) } /// A trait to constrain cells pub trait CellType: Default + Clone + PartialEq {} impl CellType for Data {} impl<'a> CellType for DataRef<'a> {} impl CellType for String {} impl CellType for usize {} // for tests // ----------------------------------------------------------------------- // The `Cell` struct. // ----------------------------------------------------------------------- /// A struct to hold a cell position and value. /// /// A `Cell` is a fundamental worksheet type that is used to create a [`Range`]. /// It contains a position and a value. /// /// # Examples /// /// An example of creating a range of `Cell`s and iterating over them. /// /// ``` /// use calamine::{Cell, Data, Range}; /// /// let cells = vec![ /// Cell::new((1, 1), Data::Int(1)), /// Cell::new((1, 2), Data::Int(2)), /// Cell::new((3, 1), Data::Int(3)), /// ]; /// /// // Create a Range from the cells. /// let range = Range::from_sparse(cells); /// /// // Iterate over the cells in the range. /// for (row, col, data) in range.cells() { /// println!("({row}, {col}): {data}"); /// } /// /// ``` /// /// Output: /// /// ```text /// (0, 0): 1 /// (0, 1): 2 /// (1, 0): /// (1, 1): /// (2, 0): 3 /// (2, 1): /// ``` /// #[derive(Debug, Clone)] pub struct Cell { // The position for the cell (row, column). 
pos: (u32, u32), // The [`CellType`] value of the cell. val: T, } impl Cell { /// Creates a new `Cell` instance. /// /// # Parameters /// /// - `position`: A tuple representing the cell's position in the form of /// `(row, column)`. /// - `value`: The value of the cell, which must implement the [`CellType`] /// trait. /// /// # Examples /// /// An example of creating a new `Cell` instance. /// /// ``` /// use calamine::{Cell, Data}; /// /// let cell = Cell::new((1, 2), Data::Int(42)); /// /// assert_eq!(&Data::Int(42), cell.get_value()); /// ``` /// pub fn new(position: (u32, u32), value: T) -> Cell { Cell { pos: position, val: value, } } /// Gets `Cell` position. /// /// # Examples /// /// An example of getting a `Cell` position `(row, column)`. /// /// ``` /// use calamine::{Cell, Data}; /// /// let cell = Cell::new((1, 2), Data::Int(42)); /// /// assert_eq!((1, 2), cell.get_position()); /// ``` /// pub fn get_position(&self) -> (u32, u32) { self.pos } /// Gets `Cell` value. /// /// # Examples /// /// An example of getting a `Cell` value. /// /// ``` /// use calamine::{Cell, Data}; /// /// let cell = Cell::new((1, 2), Data::Int(42)); /// /// assert_eq!(&Data::Int(42), cell.get_value()); /// ``` /// pub fn get_value(&self) -> &T { &self.val } } // ----------------------------------------------------------------------- // The `Range` struct. // ----------------------------------------------------------------------- /// A struct which represents an area of cells and the data within it. /// /// Ranges are used by `calamine` to represent an area of data in a worksheet. A /// `Range` is a rectangular area of cells defined by its start and end /// positions. /// /// A `Range` is constructed with **absolute positions** in the form of `(row, /// column)`. The start position for the absolute positioning is the cell `(0, /// 0)` or `A1`. For the example range "B3:C6", shown below, the start position /// is `(2, 1)` and the end position is `(5, 2)`. 
Within the range, the cells /// are indexed with **relative positions** where `(0, 0)` is the start cell. In /// the example below the relative positions for the start and end cells are /// `(0, 0)` and `(3, 1)` respectively. /// /// ```text /// ______________________________________________________________________________ /// | || | | | | /// | || A | B | C | D | /// |_________||________________|________________|________________|________________| /// | 1 || | | | | /// |_________||________________|________________|________________|________________| /// | 2 || | | | | /// |_________||________________|________________|________________|________________| /// | 3 || | (2, 1), (0, 0) | | | /// |_________||________________|________________|________________|________________| /// | 4 || | | | | /// |_________||________________|________________|________________|________________| /// | 5 || | | | | /// |_________||________________|________________|________________|________________| /// | 6 || | | (5,2), (3, 1) | | /// |_________||________________|________________|________________|________________| /// | 7 || | | | | /// |_________||________________|________________|________________|________________| /// |_ ___________________________________________________________________| /// \ Sheet1 / /// ------ /// ``` /// /// A `Range` contains a vector of cells of of generic type `T` which implement /// the [`CellType`] trait. The values are stored in a row-major order. /// #[derive(Debug, Default, Clone, PartialEq, Eq)] pub struct Range { start: (u32, u32), end: (u32, u32), inner: Vec, } impl Range { /// Creates a new `Range` with default values. /// /// Create a new [`Range`] with the given start and end positions. The /// positions are in worksheet absolute coordinates, i.e. `(0, 0)` is cell `A1`. /// /// The range is populated with default values of type `T`. /// /// When possible, use the more efficient [`Range::from_sparse()`] /// constructor. 
///
/// # Parameters
///
/// - `start`: The zero indexed (row, column) tuple.
/// - `end`: The zero indexed (row, column) tuple.
///
/// # Panics
///
/// Panics if `start.0 > end.0` or `start.1 > end.1`, or if the number of
/// cells does not fit in `usize`.
///
///
/// # Examples
///
/// An example of creating a new calamine `Range`.
///
/// ```
/// use calamine::{Data, Range};
///
/// // Create a 8x1 Range.
/// let range: Range<Data> = Range::new((2, 2), (9, 2));
///
/// assert_eq!(range.width(), 1);
/// assert_eq!(range.height(), 8);
/// assert_eq!(range.cells().count(), 8);
/// assert_eq!(range.used_cells().count(), 0);
/// ```
///
///
#[inline]
pub fn new(start: (u32, u32), end: (u32, u32)) -> Range<T> {
    // Compare each coordinate separately: the tuple comparison
    // `start <= end` is lexicographic and accepts e.g. (0, 5)..(1, 0),
    // whose column span would underflow below.
    assert!(
        start.0 <= end.0 && start.1 <= end.1,
        "invalid range bounds"
    );
    // Widen to `usize` before adding 1 and multiplying so that large
    // sheets cannot overflow `u32` arithmetic.
    let rows = ((end.0 - start.0) as usize).checked_add(1);
    let cols = ((end.1 - start.1) as usize).checked_add(1);
    let len = rows
        .zip(cols)
        .and_then(|(r, c)| r.checked_mul(c))
        .expect("range size overflows usize");
    Range {
        start,
        end,
        inner: vec![T::default(); len],
    }
}

/// Creates a new empty `Range`.
///
/// Creates a new [`Range`] with start and end positions both set to `(0,
/// 0)` and with an empty inner vector. An empty range can be expanded by
/// adding data.
///
/// # Examples
///
/// An example of creating a new empty calamine `Range`.
///
/// ```
/// use calamine::{Data, Range};
///
/// let range: Range<Data> = Range::empty();
///
/// assert!(range.is_empty());
/// ```
///
#[inline]
pub fn empty() -> Range<T> {
    Range {
        start: (0, 0),
        end: (0, 0),
        inner: Vec::new(),
    }
}

/// Get top left cell position of a `Range`.
///
/// Get the top left cell position of a range in absolute `(row, column)`
/// coordinates.
///
/// Returns `None` if the range is empty.
///
/// # Examples
///
/// An example of getting the start position of a calamine `Range`.
///
/// ```
/// use calamine::{Data, Range};
///
/// let range: Range<Data> = Range::new((2, 3), (9, 3));
///
/// assert_eq!(range.start(), Some((2, 3)));
/// ```
///
#[inline]
pub fn start(&self) -> Option<(u32, u32)> {
    if self.is_empty() {
        None
    } else {
        Some(self.start)
    }
}

/// Get bottom right cell position of a `Range`.
///
/// Get the bottom right cell position of a range in absolute `(row,
/// column)` coordinates.
///
/// Returns `None` if the range is empty.
///
/// # Examples
///
/// An example of getting the end position of a calamine `Range`.
///
/// ```
/// use calamine::{Data, Range};
///
/// let range: Range<Data> = Range::new((2, 3), (9, 3));
///
/// assert_eq!(range.end(), Some((9, 3)));
/// ```
///
#[inline]
pub fn end(&self) -> Option<(u32, u32)> {
    if self.is_empty() {
        None
    } else {
        Some(self.end)
    }
}

/// Get the column width of a `Range`.
///
/// The width is defined as the number of columns between the start and end
/// positions (inclusive). An empty range has a width of 0.
///
/// # Examples
///
/// An example of getting the column width of a calamine `Range`.
///
/// ```
/// use calamine::{Data, Range};
///
/// let range: Range<Data> = Range::new((2, 3), (9, 3));
///
/// assert_eq!(range.width(), 1);
/// ```
///
#[inline]
pub fn width(&self) -> usize {
    if self.is_empty() {
        0
    } else {
        // Add 1 after widening so a span of `u32::MAX` cannot overflow.
        (self.end.1 - self.start.1) as usize + 1
    }
}

/// Get the row height of a `Range`.
///
/// The height is defined as the number of rows between the start and end
/// positions (inclusive). An empty range has a height of 0.
///
/// # Examples
///
/// An example of getting the row height of a calamine `Range`.
///
/// ```
/// use calamine::{Data, Range};
///
/// let range: Range<Data> = Range::new((2, 3), (9, 3));
///
/// assert_eq!(range.height(), 8);
/// ```
///
#[inline]
pub fn height(&self) -> usize {
    if self.is_empty() {
        0
    } else {
        // Add 1 after widening so a span of `u32::MAX` cannot overflow.
        (self.end.0 - self.start.0) as usize + 1
    }
}

/// Get size of a `Range` in (height, width) format.
///
/// # Examples
///
/// An example of getting the (height, width) size of a calamine `Range`.
///
/// ```
/// use calamine::{Data, Range};
///
/// let range: Range<Data> = Range::new((2, 3), (9, 3));
///
/// assert_eq!(range.get_size(), (8, 1));
/// ```
///
#[inline]
pub fn get_size(&self) -> (usize, usize) {
    (self.height(), self.width())
}

/// Check if a `Range` is empty.
///
/// # Examples
///
/// An example of checking if a calamine `Range` is empty.
/// /// ``` /// use calamine::{Data, Range}; /// /// let range: Range = Range::empty(); /// /// assert!(range.is_empty()); /// ``` /// #[inline] pub fn is_empty(&self) -> bool { self.inner.is_empty() } /// Creates a `Range` from a sparse vector of cells. /// /// The `Range::from_sparse()` constructor can be used to create a Range /// from a vector of [`Cell`] data. This is slightly more efficient than /// creating a range with [`Range::new()`] and then setting the values. /// /// # Parameters /// /// - `cells`: A vector of [`Cell`] elements. /// /// # Examples /// /// An example of creating a new calamine `Range` for a sparse vector of /// Cells. /// /// ``` /// use calamine::{Cell, Data, Range}; /// /// let cells = vec![ /// Cell::new((2, 2), Data::Int(1)), /// Cell::new((5, 2), Data::Int(1)), /// Cell::new((9, 2), Data::Int(1)), /// ]; /// /// let range = Range::from_sparse(cells); /// /// assert_eq!(range.width(), 1); /// assert_eq!(range.height(), 8); /// assert_eq!(range.cells().count(), 8); /// assert_eq!(range.used_cells().count(), 3); /// ``` /// pub fn from_sparse(cells: Vec>) -> Range { if cells.is_empty() { return Range::empty(); } // cells do not always appear in (row, col) order // search bounds let mut row_start = u32::MAX; let mut row_end = 0; let mut col_start = u32::MAX; let mut col_end = 0; for (r, c) in cells.iter().map(|c| c.pos) { row_start = min(r, row_start); row_end = max(r, row_end); col_start = min(c, col_start); col_end = max(c, col_end); } let cols = (col_end - col_start + 1) as usize; let rows = (row_end - row_start + 1) as usize; let len = cols.saturating_mul(rows); let mut v = vec![T::default(); len]; v.shrink_to_fit(); for c in cells { let row = (c.pos.0 - row_start) as usize; let col = (c.pos.1 - col_start) as usize; let idx = row.saturating_mul(cols) + col; if let Some(v) = v.get_mut(idx) { *v = c.val; } } Range { start: (row_start, col_start), end: (row_end, col_end), inner: v, } } /// Set a value at an absolute position in a 
`Range`. /// /// This method sets a value in the range at the given absolute position /// (relative to `A1`). /// /// Try to avoid this method as much as possible and prefer initializing the /// `Range` with the [`Range::from_sparse()`] constructor. /// /// # Parameters /// /// - `absolute_position`: The absolute position, relative to `A1`, in the /// form of `(row, column)`. It must be greater than or equal to the start /// position of the range. If the position is greater than the end of the range /// the structure will be resized to accommodate the new end position. /// /// # Panics /// /// If `absolute_position.0 < self.start.0 || absolute_position.1 < self.start.1` /// /// # Examples /// /// An example of setting a value in a calamine `Range`. /// /// ``` /// use calamine::{Data, Range}; /// /// let mut range = Range::new((0, 0), (5, 2)); /// /// // The initial range is empty. /// assert_eq!(range.get_value((2, 1)), Some(&Data::Empty)); /// /// // Set a value at a specific position. /// range.set_value((2, 1), Data::Float(1.0)); /// /// // The value at the specified position should now be set. /// assert_eq!(range.get_value((2, 1)), Some(&Data::Float(1.0))); /// ``` /// pub fn set_value(&mut self, absolute_position: (u32, u32), value: T) { assert!( self.start.0 <= absolute_position.0 && self.start.1 <= absolute_position.1, "absolute_position out of bounds" ); // check if we need to change range dimension (strangely happens sometimes ...) 
match ( self.end.0 < absolute_position.0, self.end.1 < absolute_position.1, ) { (false, false) => (), // regular case, position within bounds (true, false) => { let len = (absolute_position.0 - self.end.0 + 1) as usize * self.width(); self.inner.extend_from_slice(&vec![T::default(); len]); self.end.0 = absolute_position.0; } // missing some rows (e, true) => { let height = if e { (absolute_position.0 - self.start.0 + 1) as usize } else { self.height() }; let width = (absolute_position.1 - self.start.1 + 1) as usize; let old_width = self.width(); let mut data = Vec::with_capacity(width * height); let empty = vec![T::default(); width - old_width]; for sce in self.inner.chunks(old_width) { data.extend_from_slice(sce); data.extend_from_slice(&empty); } data.extend_from_slice(&vec![T::default(); width * (height - self.height())]); if e { self.end = absolute_position; } else { self.end.1 = absolute_position.1; } self.inner = data; } // missing some columns } let pos = ( absolute_position.0 - self.start.0, absolute_position.1 - self.start.1, ); let idx = pos.0 as usize * self.width() + pos.1 as usize; self.inner[idx] = value; } /// Get a value at an absolute position in a `Range`. /// /// If the `absolute_position` is out of range, returns `None`, otherwise /// returns the cell value. The coordinate format is `(row, column)` /// relative to `A1`. /// /// For relative positions see the [`Range::get()`] method. /// /// # Parameters /// /// - `absolute_position`: The absolute position, relative to `A1`, in the /// form of `(row, column)`. /// /// # Examples /// /// An example of getting a value in a calamine `Range`. /// /// ``` /// use calamine::{Data, Range}; /// /// let range = Range::new((1, 1), (5, 5)); /// /// // Get the value for a cell in the range. /// assert_eq!(range.get_value((2, 2)), Some(&Data::Empty)); /// /// // Get the value for a cell outside the range. 
/// assert_eq!(range.get_value((0, 0)), None); /// ``` /// pub fn get_value(&self, absolute_position: (u32, u32)) -> Option<&T> { let p = absolute_position; if p.0 >= self.start.0 && p.0 <= self.end.0 && p.1 >= self.start.1 && p.1 <= self.end.1 { return self.get(( (absolute_position.0 - self.start.0) as usize, (absolute_position.1 - self.start.1) as usize, )); } None } /// Get a value at a relative position in a `Range`. /// /// If the `relative_position` is out of range, returns `None`, otherwise /// returns the cell value. The coordinate format is `(row, column)` /// relative to `(0, 0)` in the range. /// /// For absolute cell positioning see the [`Range::get_value()`] method. /// /// # Parameters /// /// - `relative_position`: The position relative to the index `(0, 0)` in /// the range. /// /// # Examples /// /// An example of getting a value in a calamine `Range`, using relative /// positioning. /// /// ``` /// use calamine::{Data, Range}; /// /// let mut range = Range::new((1, 1), (5, 5)); /// /// // Set a cell value using the cell absolute position. /// range.set_value((2, 3), Data::Int(123)); /// /// // Get the value using the range relative position. /// assert_eq!(range.get((1, 2)), Some(&Data::Int(123))); /// ``` /// pub fn get(&self, relative_position: (usize, usize)) -> Option<&T> { let (row, col) = relative_position; let (height, width) = self.get_size(); if col >= width || row >= height { None } else { self.inner.get(row * width + col) } } /// Get an iterator over the rows of a `Range`. /// /// # Examples /// /// An example of using a `Row` iterator with a calamine `Range`. /// /// ``` /// use calamine::{Cell, Data, Range}; /// /// let cells = vec![ /// Cell::new((1, 1), Data::Int(1)), /// Cell::new((1, 2), Data::Int(2)), /// Cell::new((3, 1), Data::Int(3)), /// ]; /// /// // Create a Range from the cells. /// let range = Range::from_sparse(cells); /// /// // Iterate over the rows of the range. 
/// for (row_num, row) in range.rows().enumerate() {
    ///     for (col_num, data) in row.iter().enumerate() {
    ///         // Print the data in each cell of the row.
    ///         println!("({row_num}, {col_num}): {data}");
    ///     }
    /// }
    ///
    /// ```
    ///
    /// Output in relative coordinates:
    ///
    /// ```text
    /// (0, 0): 1
    /// (0, 1): 2
    /// (1, 0):
    /// (1, 1):
    /// (2, 0): 3
    /// (2, 1):
    /// ```
    ///
    pub fn rows(&self) -> Rows<'_, T> {
        // A range without any cells yields an iterator that wraps `None`;
        // otherwise the flat storage is split into row-sized chunks.
        let inner = if self.inner.is_empty() {
            None
        } else {
            Some(self.inner.chunks(self.width()))
        };
        Rows { inner }
    }

    /// Get an iterator over the used cells in a `Range`.
    ///
    /// This method returns an iterator over the used cells in a range. The
    /// "used" cells are defined as the cells that have a value other than the
    /// default value for `T`. The iterator returns tuples of `(row, column,
    /// value)` for each used cell. The row and column are relative/index values
    /// rather than absolute cell positions.
    ///
    /// # Examples
    ///
    /// An example of iterating over the used cells in a calamine `Range`.
    ///
    /// ```
    /// use calamine::{Cell, Data, Range};
    ///
    /// let cells = vec![
    ///     Cell::new((1, 1), Data::Int(1)),
    ///     Cell::new((1, 2), Data::Int(2)),
    ///     Cell::new((3, 1), Data::Int(3)),
    /// ];
    ///
    /// // Create a Range from the cells.
    /// let range = Range::from_sparse(cells);
    ///
    /// // Iterate over the used cells in the range.
    /// for (row, col, data) in range.used_cells() {
    ///     println!("({row}, {col}): {data}");
    /// }
    /// ```
    ///
    /// Output:
    ///
    /// ```text
    /// (0, 0): 1
    /// (0, 1): 2
    /// (2, 0): 3
    /// ```
    ///
    pub fn used_cells(&self) -> UsedCells<'_, T> {
        let width = self.width();
        let inner = self.inner.iter().enumerate();
        UsedCells { width, inner }
    }

    /// Get an iterator over all the cells in a `Range`.
    ///
    /// This method returns an iterator over all the cells in a range, including
    /// those that are empty. The iterator returns tuples of `(row, column,
    /// value)` for each cell. The row and column are relative/index values
    /// rather than absolute cell positions.
///
    /// # Examples
    ///
    /// An example of iterating over all the cells in a calamine `Range`.
    ///
    /// ```
    /// use calamine::{Cell, Data, Range};
    ///
    /// let cells = vec![
    ///     Cell::new((1, 1), Data::Int(1)),
    ///     Cell::new((1, 2), Data::Int(2)),
    ///     Cell::new((3, 1), Data::Int(3)),
    /// ];
    ///
    /// // Create a Range from the cells.
    /// let range = Range::from_sparse(cells);
    ///
    /// // Iterate over the cells in the range.
    /// for (row, col, data) in range.cells() {
    ///     println!("({row}, {col}): {data}");
    /// }
    /// ```
    ///
    /// Output:
    ///
    /// ```text
    /// (0, 0): 1
    /// (0, 1): 2
    /// (1, 0):
    /// (1, 1):
    /// (2, 0): 3
    /// (2, 1):
    /// ```
    ///
    pub fn cells(&self) -> Cells<'_, T> {
        let width = self.width();
        let inner = self.inner.iter().enumerate();
        Cells { width, inner }
    }

    /// Build a `RangeDeserializer` for a `Range`.
    ///
    /// This method returns a [`RangeDeserializer`] that can be used to
    /// deserialize the data in the range.
    ///
    /// # Errors
    ///
    /// - [`DeError`] if the range cannot be deserialized.
    ///
    /// # Examples
    ///
    /// An example of creating a deserializer for a calamine `Range`.
    ///
    /// The sample Excel file `temperature.xlsx` contains a single sheet named
    /// "Sheet1" with the following data:
    ///
    /// ```text
    ///  ____________________________________________
    /// |         ||                |                |
    /// |         ||       A        |       B        |
    /// |_________||________________|________________|
    /// |    1    || label          | value          |
    /// |_________||________________|________________|
    /// |    2    || celsius        | 22.2222        |
    /// |_________||________________|________________|
    /// |    3    || fahrenheit     | 72             |
    /// |_________||________________|________________|
    /// |_          _________________________________|
    ///   \ Sheet1 /
    ///     ------
    /// ```
    ///
    /// ```
    /// use calamine::{open_workbook, Error, Reader, Xlsx};
    ///
    /// fn main() -> Result<(), Error> {
    ///     let path = "tests/temperature.xlsx";
    ///
    ///     // Open the workbook.
    ///     let mut workbook: Xlsx<_> = open_workbook(path)?;
    ///
    ///     // Get the data range from the first sheet.
///     let sheet_range = workbook.worksheet_range("Sheet1")?;
    ///
    ///     // Get an iterator over data in the range.
    ///     let mut iter = sheet_range.deserialize()?;
    ///
    ///     // Get the next record in the range. The first row is assumed to be the
    ///     // header.
    ///     if let Some(result) = iter.next() {
    ///         let (label, value): (String, f64) = result?;
    ///
    ///         assert_eq!(label, "celsius");
    ///         assert_eq!(value, 22.2222);
    ///
    ///         Ok(())
    ///     } else {
    ///         Err(From::from("Expected at least one record but got none"))
    ///     }
    /// }
    /// ```
    ///
    pub fn deserialize<'a, D>(&'a self) -> Result<RangeDeserializer<'a, T, D>, DeError>
    where
        T: ToCellDeserializer<'a>,
        D: DeserializeOwned,
    {
        // Delegates to a builder created with `RangeDeserializerBuilder::new()`.
        RangeDeserializerBuilder::new().from_range(self)
    }

    /// Build a new `Range` out of the current range.
    ///
    /// This method returns a new `Range` with cloned data. In general it is
    /// used to get a subset of an existing range. However, if the new range is
    /// larger than the existing range the new cells will be filled with default
    /// values.
    ///
    /// # Examples
    ///
    /// An example of getting a sub range of a calamine `Range`.
    ///
    /// ```
    /// use calamine::{Data, Range};
    ///
    /// // Create a range with some values.
    /// let mut a = Range::new((1, 1), (3, 3));
    /// a.set_value((1, 1), Data::Bool(true));
    /// a.set_value((2, 2), Data::Bool(true));
    /// a.set_value((3, 3), Data::Bool(true));
    ///
    /// // Get a sub range of the main range.
    /// let b = a.range((1, 1), (2, 2));
    /// assert_eq!(b.get_value((1, 1)), Some(&Data::Bool(true)));
    /// assert_eq!(b.get_value((2, 2)), Some(&Data::Bool(true)));
    ///
    /// // Get a larger range with default values.
/// let c = a.range((0, 0), (5, 5)); /// assert_eq!(c.get_value((0, 0)), Some(&Data::Empty)); /// assert_eq!(c.get_value((3, 3)), Some(&Data::Bool(true))); /// assert_eq!(c.get_value((5, 5)), Some(&Data::Empty)); /// ``` /// pub fn range(&self, start: (u32, u32), end: (u32, u32)) -> Range { let mut other = Range::new(start, end); let (self_start_row, self_start_col) = self.start; let (self_end_row, self_end_col) = self.end; let (other_start_row, other_start_col) = other.start; let (other_end_row, other_end_col) = other.end; // copy data from self to other let start_row = max(self_start_row, other_start_row); let end_row = min(self_end_row, other_end_row); let start_col = max(self_start_col, other_start_col); let end_col = min(self_end_col, other_end_col); if start_row > end_row || start_col > end_col { return other; } let self_width = self.width(); let other_width = other.width(); // change referential // // we want to copy range: start_row..(end_row + 1) // In self referential it is (start_row - self_start_row)..(end_row + 1 - self_start_row) let self_row_start = (start_row - self_start_row) as usize; let self_row_end = (end_row + 1 - self_start_row) as usize; let self_col_start = (start_col - self_start_col) as usize; let self_col_end = (end_col + 1 - self_start_col) as usize; let other_row_start = (start_row - other_start_row) as usize; let other_row_end = (end_row + 1 - other_start_row) as usize; let other_col_start = (start_col - other_start_col) as usize; let other_col_end = (end_col + 1 - other_start_col) as usize; { let self_rows = self .inner .chunks(self_width) .take(self_row_end) .skip(self_row_start); let other_rows = other .inner .chunks_mut(other_width) .take(other_row_end) .skip(other_row_start); for (self_row, other_row) in self_rows.zip(other_rows) { let self_cols = &self_row[self_col_start..self_col_end]; let other_cols = &mut other_row[other_col_start..other_col_end]; other_cols.clone_from_slice(self_cols); } } other } } impl Range { /// Get 
headers for a `Range`. /// /// This method returns the first row of the range as an optional vector of /// strings. The data type `T` in the range must support the [`ToString`] /// trait. /// /// # Examples /// /// An example of getting the header row of a calamine `Range`. /// /// ``` /// use calamine::{Data, Range}; /// /// // Create a range with some values. /// let mut range = Range::new((0, 0), (5, 2)); /// range.set_value((0, 0), Data::String(String::from("a"))); /// range.set_value((0, 1), Data::Int(1)); /// range.set_value((0, 2), Data::Bool(true)); /// /// // Get the headers of the range. /// let headers = range.headers(); /// /// assert_eq!( /// headers, /// Some(vec![ /// String::from("a"), /// String::from("1"), /// String::from("true") /// ]) /// ); /// ``` /// pub fn headers(&self) -> Option> { self.rows() .next() .map(|row| row.iter().map(ToString::to_string).collect()) } } /// Implementation of the `Index` trait for `Range` rows. /// /// # Examples /// /// An example of row indexing for a calamine `Range`. /// /// ``` /// use calamine::{Data, Range}; /// /// // Create a range with a value. /// let mut range = Range::new((1, 1), (3, 3)); /// range.set_value((2, 2), Data::Int(123)); /// /// // Get the second row via indexing. /// assert_eq!(range[1], [Data::Empty, Data::Int(123), Data::Empty]); /// ``` /// impl Index for Range { type Output = [T]; fn index(&self, index: usize) -> &[T] { let width = self.width(); &self.inner[index * width..(index + 1) * width] } } /// Implementation of the `Index` trait for `Range` cells. /// /// # Examples /// /// An example of cell indexing for a calamine `Range`. /// /// ``` /// use calamine::{Data, Range}; /// /// // Create a range with a value. /// let mut range = Range::new((1, 1), (3, 3)); /// range.set_value((2, 2), Data::Int(123)); /// /// // Get the value via cell indexing. 
/// assert_eq!(range[(1, 1)], Data::Int(123));
/// ```
///
impl<T: CellType> Index<(usize, usize)> for Range<T> {
    type Output = T;

    fn index(&self, index: (usize, usize)) -> &T {
        let (height, width) = self.get_size();
        // Check row and column separately: an over-wide column could
        // otherwise alias a cell in a later row of the flat storage.
        assert!(index.1 < width && index.0 < height, "index out of bounds");
        &self.inner[index.0 * width + index.1]
    }
}

/// Implementation of the `IndexMut` trait for `Range` rows.
impl<T: CellType> IndexMut<usize> for Range<T> {
    fn index_mut(&mut self, index: usize) -> &mut [T] {
        let width = self.width();
        &mut self.inner[index * width..(index + 1) * width]
    }
}

/// Implementation of the `IndexMut` trait for `Range` cells.
///
/// # Examples
///
/// An example of mutable cell indexing for a calamine `Range`.
///
/// ```
/// use calamine::{Data, Range};
///
/// // Create a new empty range.
/// let mut range = Range::new((1, 1), (3, 3));
///
/// // Set a value in the range using cell indexing.
/// range[(1, 1)] = Data::Int(123);
///
/// // Test the value was set correctly.
/// assert_eq!(range.get((1, 1)), Some(&Data::Int(123)));
/// ```
///
impl<T: CellType> IndexMut<(usize, usize)> for Range<T> {
    fn index_mut(&mut self, index: (usize, usize)) -> &mut T {
        let (height, width) = self.get_size();
        // Same per-axis bounds check as the immutable cell index.
        assert!(index.1 < width && index.0 < height, "index out of bounds");
        &mut self.inner[index.0 * width + index.1]
    }
}

// -----------------------------------------------------------------------
// Range Iterators.
// -----------------------------------------------------------------------

/// A struct to iterate over all `Cell`s in a `Range`.
///
/// # Examples
///
/// An example iterating over the cells in a calamine range using the `Cells`
/// iterator returned by [`Range::cells()`].
///
/// ```
/// use calamine::{Cell, Data, Range};
///
/// let cells = vec![
///     Cell::new((1, 1), Data::Int(1)),
///     Cell::new((1, 2), Data::Int(2)),
///     Cell::new((3, 1), Data::Int(3)),
/// ];
///
/// // Create a Range from the cells.
/// let range = Range::from_sparse(cells);
///
/// // Use the Cells iterator returned by Range::cells().
/// for (row, col, data) in range.cells() {
///     println!("({row}, {col}): {data}");
/// }
///
/// ```
///
/// Output:
///
/// ```text
/// (0, 0): 1
/// (0, 1): 2
/// (1, 0):
/// (1, 1):
/// (2, 0): 3
/// (2, 1):
/// ```
///
#[derive(Clone, Debug)]
pub struct Cells<'a, T: CellType> {
    // Number of columns; splits a flat index into `(row, column)`.
    width: usize,
    // Enumerated iterator over the flat cell storage.
    inner: std::iter::Enumerate<std::slice::Iter<'a, T>>,
}

impl<'a, T: 'a + CellType> Iterator for Cells<'a, T> {
    type Item = (usize, usize, &'a T);

    fn next(&mut self) -> Option<Self::Item> {
        self.inner.next().map(|(i, v)| {
            let row = i / self.width;
            let col = i % self.width;
            (row, col, v)
        })
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        self.inner.size_hint()
    }
}

impl<'a, T: 'a + CellType> DoubleEndedIterator for Cells<'a, T> {
    fn next_back(&mut self) -> Option<Self::Item> {
        self.inner.next_back().map(|(i, v)| {
            let row = i / self.width;
            let col = i % self.width;
            (row, col, v)
        })
    }
}

impl<'a, T: 'a + CellType> ExactSizeIterator for Cells<'a, T> {}

/// A struct to iterate over all the used `Cell`s in a `Range`.
///
/// # Examples
///
/// An example iterating over the used cells in a calamine range using the
/// `UsedCells` iterator returned by [`Range::used_cells()`].
///
/// ```
/// use calamine::{Cell, Data, Range};
///
/// let cells = vec![
///     Cell::new((1, 1), Data::Int(1)),
///     Cell::new((1, 2), Data::Int(2)),
///     Cell::new((3, 1), Data::Int(3)),
/// ];
///
/// // Create a Range from the cells.
/// let range = Range::from_sparse(cells);
///
/// // Use the UsedCells iterator returned by Range::used_cells().
/// for (row, col, data) in range.used_cells() {
///     println!("({row}, {col}): {data}");
/// }
///
/// ```
///
/// Output:
///
/// ```text
/// (0, 0): 1
/// (0, 1): 2
/// (2, 0): 3
/// ```
///
#[derive(Clone, Debug)]
pub struct UsedCells<'a, T: CellType> {
    // Number of columns; splits a flat index into `(row, column)`.
    width: usize,
    // Enumerated iterator over the flat cell storage.
    inner: std::iter::Enumerate<std::slice::Iter<'a, T>>,
}

impl<'a, T: 'a + CellType> Iterator for UsedCells<'a, T> {
    type Item = (usize, usize, &'a T);

    fn next(&mut self) -> Option<Self::Item> {
        // Skip cells that compare equal to the default value of `T`.
        self.inner
            .by_ref()
            .find(|&(_, v)| v != &T::default())
            .map(|(i, v)| {
                let row = i / self.width;
                let col = i % self.width;
                (row, col, v)
            })
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // Any number of the remaining cells may be skipped, so only the
        // upper bound of the underlying iterator carries over.
        let (_, up) = self.inner.size_hint();
        (0, up)
    }
}

impl<'a, T: 'a + CellType> DoubleEndedIterator for UsedCells<'a, T> {
    fn next_back(&mut self) -> Option<Self::Item> {
        self.inner
            .by_ref()
            .rfind(|&(_, v)| v != &T::default())
            .map(|(i, v)| {
                let row = i / self.width;
                let col = i % self.width;
                (row, col, v)
            })
    }
}

/// A struct to iterate over all the rows in a `Range`.
///
/// # Examples
///
/// An example iterating over the rows in a calamine range using the `Rows`
/// iterator returned by [`Range::rows()`].
///
/// ```
/// use calamine::{Cell, Data, Range};
///
/// let cells = vec![
///     Cell::new((1, 1), Data::Int(1)),
///     Cell::new((1, 2), Data::Int(2)),
///     Cell::new((3, 1), Data::Int(3)),
/// ];
///
/// // Create a Range from the cells.
/// let range = Range::from_sparse(cells);
///
/// // Use the Rows iterator returned by Range::rows().
/// for (row_num, row) in range.rows().enumerate() {
///     for (col_num, data) in row.iter().enumerate() {
///         // Print the data in each cell of the row.
///         println!("({row_num}, {col_num}): {data}");
///     }
/// }
/// ```
///
/// Output in relative coordinates:
///
/// ```text
/// (0, 0): 1
/// (0, 1): 2
/// (1, 0):
/// (1, 1):
/// (2, 0): 3
/// (2, 1):
/// ```
///
#[derive(Clone, Debug)]
pub struct Rows<'a, T: CellType> {
    // `None` when the parent range holds no cells; otherwise row-sized chunks
    // of the flat cell storage.
    inner: Option<std::slice::Chunks<'a, T>>,
}

impl<'a, T: 'a + CellType> Iterator for Rows<'a, T> {
    type Item = &'a [T];

    fn next(&mut self) -> Option<Self::Item> {
        self.inner.as_mut().and_then(std::iter::Iterator::next)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // With no underlying chunk iterator there are exactly zero rows.
        self.inner
            .as_ref()
            .map_or((0, Some(0)), std::iter::Iterator::size_hint)
    }
}

impl<'a, T: 'a + CellType> DoubleEndedIterator for Rows<'a, T> {
    fn next_back(&mut self) -> Option<Self::Item> {
        self.inner
            .as_mut()
            .and_then(std::iter::DoubleEndedIterator::next_back)
    }
}

impl<'a, T: 'a + CellType> ExactSizeIterator for Rows<'a, T> {}

// -----------------------------------------------------------------------
// The `Table` struct.
// -----------------------------------------------------------------------

/// The `Table` struct represents an Excel worksheet table.
///
/// Tables in Excel are a way of grouping a range of cells into a single entity
/// that has common formatting or that can be referenced in formulas. In
/// `calamine`, tables can be read and converted to a data [`Range`] for further
/// processing.
///
/// Calamine does not automatically load Table data from a workbook to avoid
/// unnecessary overhead. Instead you must explicitly load the Table data using
/// the [`Xlsx::load_tables()`](crate::Xlsx::load_tables) method. Once the
/// tables have been loaded the following methods can be used to extract and
/// work with individual tables:
///
/// - [`Xlsx::table_by_name()`](crate::Xlsx::table_by_name).
/// - [`Xlsx::table_by_name_ref()`](crate::Xlsx::table_by_name_ref).
/// - [`Xlsx::table_names()`](crate::Xlsx::table_names).
/// - [`Xlsx::table_names_in_sheet()`](crate::Xlsx::table_names_in_sheet).
/// /// Note, these methods are only available for the [`Xlsx`] struct since Tables /// are a feature of the xlsx/xlsb format. They are not currently implemented /// for [`Xlsb`]. /// /// Once you have a `Table` instance, you can access its properties and data /// using the methods below. /// /// # Examples /// /// An example of reading the data from an Excel worksheet Table using the /// `calamine` crate. /// /// The sample Excel file `inventory-table.xlsx` contains a single sheet named /// "Sheet1" with the following data laid out in a worksheet Table called /// "Table1": /// /// ```text /// _____________________________________________________________ /// | || | | | /// | || A | B | C | /// |_________||________________|________________|________________| /// | 1 || Item | Type | Quantity | /// |_________||________________|________________|________________| /// | 2 || 1 | Apple | 50 | /// |_________||________________|________________|________________| /// | 3 || 2 | Banana | 200 | /// |_________||________________|________________|________________| /// | 4 || 3 | Orange | 60 | /// |_________||________________|________________|________________| /// | 5 || 4 | Pear | 100 | /// |_________||________________|________________|________________| /// |_ __________________________________________________| /// \ Sheet1 / /// ------ /// ``` /// /// ``` /// use calamine::{open_workbook, Error, Xlsx}; /// /// fn main() -> Result<(), Error> { /// let path = format!("{}/tests/inventory-table.xlsx", env!("CARGO_MANIFEST_DIR")); /// /// // Open the workbook. /// let mut workbook: Xlsx<_> = open_workbook(path)?; /// /// // Load the tables in the workbook. /// workbook.load_tables()?; /// /// // Get the table by name. /// let table = workbook.table_by_name("Table1")?; /// /// // Check the table's name. /// let table_name = table.name(); /// assert_eq!(table_name, "Table1"); /// /// // Check that it came from Sheet1. 
///     let sheet_name = table.sheet_name();
///     assert_eq!(sheet_name, "Sheet1");
///
///     // Get the table column headers.
///     let columns_headers = table.columns();
///     assert_eq!(columns_headers, vec!["Item", "Type", "Quantity"]);
///
///     // Get the table data range (without the headers).
///     let data = table.data();
///
///     // Iterate over the rows of the data range.
///     for (row_num, row) in data.rows().enumerate() {
///         for (col_num, data) in row.iter().enumerate() {
///             // Print the data in each cell of the row.
///             println!("({row_num}, {col_num}): {data}");
///         }
///         println!();
///     }
///
///     Ok(())
/// }
/// ```
///
/// Output in relative coordinates:
///
/// ```text
/// (0, 0): 1
/// (0, 1): Apple
/// (0, 2): 50
///
/// (1, 0): 2
/// (1, 1): Banana
/// (1, 2): 200
///
/// (2, 0): 3
/// (2, 1): Orange
/// (2, 2): 60
///
/// (3, 0): 4
/// (3, 1): Pear
/// (3, 2): 100
/// ```
///
#[derive(Debug, Clone)]
pub struct Table<T> {
    pub(crate) name: String,
    pub(crate) sheet_name: String,
    pub(crate) columns: Vec<String>,
    pub(crate) data: Range<T>,
}

impl<T> Table<T> {
    /// Get the name of the table.
    ///
    /// Tables in Excel have sequentially assigned names like "Table1",
    /// "Table2", etc. but can also have user assigned names.
    ///
    /// # Examples
    ///
    /// An example of getting the name of an Excel worksheet Table.
    ///
    /// ```
    /// use calamine::{open_workbook, Error, Xlsx};
    ///
    /// fn main() -> Result<(), Error> {
    ///     let path = format!("{}/tests/inventory-table.xlsx", env!("CARGO_MANIFEST_DIR"));
    ///
    ///     // Open the workbook.
    ///     let mut workbook: Xlsx<_> = open_workbook(path)?;
    ///
    ///     // Load the tables in the workbook.
    ///     workbook.load_tables()?;
    ///
    ///     // Get the table by name.
    ///     let table = workbook.table_by_name("Table1")?;
    ///
    ///     // Check the table's name.
    ///     let table_name = table.name();
    ///     assert_eq!(table_name, "Table1");
    ///
    ///     Ok(())
    /// }
    /// ```
    ///
    pub fn name(&self) -> &str {
        &self.name
    }

    /// Get the name of the parent worksheet for a table.
    ///
    /// This method returns the name of the parent worksheet that contains the
    /// table.
    ///
    /// # Examples
    ///
    /// An example of getting the parent worksheet name for an Excel worksheet
    /// Table.
    ///
    /// ```
    /// use calamine::{open_workbook, Error, Xlsx};
    ///
    /// fn main() -> Result<(), Error> {
    ///     let path = format!("{}/tests/inventory-table.xlsx", env!("CARGO_MANIFEST_DIR"));
    ///
    ///     // Open the workbook.
    ///     let mut workbook: Xlsx<_> = open_workbook(path)?;
    ///
    ///     // Load the tables in the workbook.
    ///     workbook.load_tables()?;
    ///
    ///     // Get the table by name.
    ///     let table = workbook.table_by_name("Table1")?;
    ///
    ///     // Check that it came from Sheet1.
    ///     let sheet_name = table.sheet_name();
    ///     assert_eq!(sheet_name, "Sheet1");
    ///
    ///     Ok(())
    /// }
    /// ```
    ///
    pub fn sheet_name(&self) -> &str {
        &self.sheet_name
    }

    /// Get the header names of the table columns.
    ///
    /// This method returns a slice of strings representing the names of the
    /// column headers in the table.
    ///
    /// In Excel table headers can be hidden but the table will still have
    /// column header names.
    ///
    /// # Examples
    ///
    /// An example of getting the column headers for an Excel worksheet Table.
    ///
    /// ```
    /// use calamine::{open_workbook, Error, Xlsx};
    ///
    /// fn main() -> Result<(), Error> {
    ///     let path = format!("{}/tests/inventory-table.xlsx", env!("CARGO_MANIFEST_DIR"));
    ///
    ///     // Open the workbook.
    ///     let mut workbook: Xlsx<_> = open_workbook(path)?;
    ///
    ///     // Load the tables in the workbook.
    ///     workbook.load_tables()?;
    ///
    ///     // Get the table by name.
    ///     let table = workbook.table_by_name("Table1")?;
    ///
    ///     // Get the table column headers.
    ///     let columns_headers = table.columns();
    ///     assert_eq!(columns_headers, vec!["Item", "Type", "Quantity"]);
    ///
    ///     Ok(())
    /// }
    /// ```
    ///
    pub fn columns(&self) -> &[String] {
        &self.columns
    }

    /// Get a range representing the data from the table.
    ///
    /// This method returns a reference to the data [`Range`] of the table.
    ///
    /// Note that the data range excludes the column headers.
    ///
    /// # Examples
    ///
    /// An example of getting the data range of an Excel worksheet Table.
    ///
    /// ```
    /// use calamine::{open_workbook, Data, Error, Xlsx};
    ///
    /// fn main() -> Result<(), Error> {
    ///     let path = format!("{}/tests/inventory-table.xlsx", env!("CARGO_MANIFEST_DIR"));
    ///
    ///     // Open the workbook.
    ///     let mut workbook: Xlsx<_> = open_workbook(path)?;
    ///
    ///     // Load the tables in the workbook.
    ///     workbook.load_tables()?;
    ///
    ///     // Get the table by name.
    ///     let table = workbook.table_by_name("Table1")?;
    ///
    ///     // Get the data range of the table.
    ///     let data_range = table.data();
    ///
    ///     // Check one of the values in the data range. Note the relative
    ///     // positioning within the range returned by the `get()` method.
    ///     assert_eq!(
    ///         data_range.get((0, 1)),
    ///         Some(&Data::String("Apple".to_string()))
    ///     );
    ///
    ///     Ok(())
    /// }
    /// ```
    ///
    pub fn data(&self) -> &Range<T> {
        &self.data
    }
}

/// Convert a `Table` into a `Range`.
///
/// # Examples
///
/// An example of getting the data range of an Excel worksheet Table via the
/// `From/Into` trait.
///
/// ```
/// use calamine::{open_workbook, Data, Error, Range, Xlsx};
///
/// fn main() -> Result<(), Error> {
///     let path = format!("{}/tests/inventory-table.xlsx", env!("CARGO_MANIFEST_DIR"));
///
///     // Open the workbook.
///     let mut workbook: Xlsx<_> = open_workbook(path)?;
///
///     // Load the tables in the workbook.
///     workbook.load_tables()?;
///
///     // Get the table by name.
///     let table = workbook.table_by_name("Table1")?;
///
///     // Convert the table into a data range using the `From/Into` trait.
/// let data_range: Range = table.into(); /// /// // Check one of the values in the data range. Note the relative /// // positioning within the range returned by the `get()` method. /// assert_eq!( /// data_range.get((0, 1)), /// Some(&Data::String("Apple".to_string())) /// ); /// /// Ok(()) /// } /// ``` /// impl From> for Range { fn from(table: Table) -> Range { table.data } } /// A helper function to deserialize cell values as `i64`. /// /// This is useful when cells may also contain invalid values (i.e. strings). It /// applies the [`as_i64`](crate::datatype::DataType::as_i64) method to the cell /// value, and returns `Ok(Some(value_as_i64))` if successful or `Ok(None)` if /// unsuccessful, therefore never failing. /// /// This function is intended to be used with Serde's /// [`deserialize_with`](https://serde.rs/field-attrs.html#deserialize_with) /// field attribute. /// pub fn deserialize_as_i64_or_none<'de, D>(deserializer: D) -> Result, D::Error> where D: Deserializer<'de>, { let data = Data::deserialize(deserializer)?; Ok(data.as_i64()) } /// A helper function to deserialize cell values as `i64`. /// /// This is useful when cells may also contain invalid values (i.e. strings). It /// applies the [`as_i64`](crate::datatype::DataType::as_i64) method to the cell /// value, and returns `Ok(Ok(value_as_i64))` if successful or /// `Ok(Err(value_to_string))` if unsuccessful, therefore never failing. /// /// This function is intended to be used with Serde's /// [`deserialize_with`](https://serde.rs/field-attrs.html#deserialize_with) /// field attribute. /// pub fn deserialize_as_i64_or_string<'de, D>( deserializer: D, ) -> Result, D::Error> where D: Deserializer<'de>, { let data = Data::deserialize(deserializer)?; Ok(data.as_i64().ok_or_else(|| data.to_string())) } /// A helper function to deserialize cell values as `f64`. /// /// This is useful when cells may also contain invalid values (i.e. strings). 
It /// applies the [`as_f64`](crate::datatype::DataType::as_f64) method to the cell /// value, and returns `Ok(Some(value_as_f64))` if successful or `Ok(None)` if /// unsuccessful, therefore never failing. /// /// This function is intended to be used with Serde's /// [`deserialize_with`](https://serde.rs/field-attrs.html#deserialize_with) /// field attribute. /// pub fn deserialize_as_f64_or_none<'de, D>(deserializer: D) -> Result, D::Error> where D: Deserializer<'de>, { let data = Data::deserialize(deserializer)?; Ok(data.as_f64()) } /// A helper function to deserialize cell values as `f64`. /// /// This is useful when cells may also contain invalid values (i.e. strings). It /// applies the [`as_f64`](crate::datatype::DataType::as_f64) method to the cell /// value, and returns `Ok(Ok(value_as_f64))` if successful or /// `Ok(Err(value_to_string))` if unsuccessful, therefore never failing. /// /// This function is intended to be used with Serde's /// [`deserialize_with`](https://serde.rs/field-attrs.html#deserialize_with) /// field attribute. /// pub fn deserialize_as_f64_or_string<'de, D>( deserializer: D, ) -> Result, D::Error> where D: Deserializer<'de>, { let data = Data::deserialize(deserializer)?; Ok(data.as_f64().ok_or_else(|| data.to_string())) } /// A helper function to deserialize cell values as [`chrono::NaiveDate`]. /// /// This is useful when cells may also contain invalid values (i.e. strings). It /// applies the [`as_date()`](crate::Data::as_date) method to the cell value, /// and returns `Ok(Some(value_as_date))` if successful or `Ok(None)` if /// unsuccessful, therefore never failing. /// /// This function is intended to be used with Serde's /// [`deserialize_with`](https://serde.rs/field-attrs.html#deserialize_with) /// field attribute. 
///
/// [`chrono::NaiveDate`]: https://docs.rs/chrono/latest/chrono/naive/struct.NaiveDate.html
///
#[cfg(feature = "chrono")]
#[cfg_attr(docsrs, doc(cfg(feature = "chrono")))]
pub fn deserialize_as_date_or_none<'de, D>(
    deserializer: D,
) -> Result<Option<chrono::NaiveDate>, D::Error>
where
    D: Deserializer<'de>,
{
    let data = Data::deserialize(deserializer)?;
    Ok(data.as_date())
}

/// A helper function to deserialize cell values as [`chrono::NaiveDate`].
///
/// This is useful when cells may also contain invalid values (i.e. strings). It
/// applies the [`as_date()`](crate::Data::as_date) method to the cell value,
/// and returns `Ok(Ok(value_as_date))` if successful or
/// `Ok(Err(value_to_string))` if unsuccessful, therefore never failing.
///
/// This function is intended to be used with Serde's
/// [`deserialize_with`](https://serde.rs/field-attrs.html#deserialize_with)
/// field attribute.
///
/// [`chrono::NaiveDate`]: https://docs.rs/chrono/latest/chrono/naive/struct.NaiveDate.html
///
#[cfg(feature = "chrono")]
#[cfg_attr(docsrs, doc(cfg(feature = "chrono")))]
pub fn deserialize_as_date_or_string<'de, D>(
    deserializer: D,
) -> Result<Result<chrono::NaiveDate, String>, D::Error>
where
    D: Deserializer<'de>,
{
    let data = Data::deserialize(deserializer)?;
    // On conversion failure the inner `Err` carries the cell's string form.
    Ok(data.as_date().ok_or_else(|| data.to_string()))
}

/// A helper function to deserialize cell values as [`chrono::NaiveTime`].
///
/// This is useful when cells may also contain invalid values (i.e. strings). It
/// applies the [`as_time()`](crate::Data::as_time) method to the cell value,
/// and returns `Ok(Some(value_as_time))` if successful or `Ok(None)` if
/// unsuccessful, therefore never failing.
///
/// This function is intended to be used with Serde's
/// [`deserialize_with`](https://serde.rs/field-attrs.html#deserialize_with)
/// field attribute.
///
/// [`chrono::NaiveTime`]:
///     https://docs.rs/chrono/latest/chrono/naive/struct.NaiveTime.html
///
#[cfg(feature = "chrono")]
#[cfg_attr(docsrs, doc(cfg(feature = "chrono")))]
pub fn deserialize_as_time_or_none<'de, D>(
    deserializer: D,
) -> Result<Option<chrono::NaiveTime>, D::Error>
where
    D: Deserializer<'de>,
{
    let data = Data::deserialize(deserializer)?;
    Ok(data.as_time())
}

/// A helper function to deserialize cell values as [`chrono::NaiveTime`].
///
/// This is useful when cells may also contain invalid values (i.e. strings). It
/// applies the [`as_time()`](crate::Data::as_time) method to the cell value,
/// and returns `Ok(Ok(value_as_time))` if successful or
/// `Ok(Err(value_to_string))` if unsuccessful, therefore never failing.
///
/// This function is intended to be used with Serde's
/// [`deserialize_with`](https://serde.rs/field-attrs.html#deserialize_with)
/// field attribute.
///
/// [`chrono::NaiveTime`]:
///     https://docs.rs/chrono/latest/chrono/naive/struct.NaiveTime.html
///
#[cfg(feature = "chrono")]
#[cfg_attr(docsrs, doc(cfg(feature = "chrono")))]
pub fn deserialize_as_time_or_string<'de, D>(
    deserializer: D,
) -> Result<Result<chrono::NaiveTime, String>, D::Error>
where
    D: Deserializer<'de>,
{
    let data = Data::deserialize(deserializer)?;
    // On conversion failure the inner `Err` carries the cell's string form.
    Ok(data.as_time().ok_or_else(|| data.to_string()))
}

/// A helper function to deserialize cell values as [`chrono::Duration`].
///
/// This is useful when cells may also contain invalid values (i.e. strings). It
/// applies the [`as_duration()`](crate::Data::as_duration) method to the cell
/// value, and returns `Ok(Some(value_as_duration))` if successful or `Ok(None)`
/// if unsuccessful, therefore never failing.
///
/// This function is intended to be used with Serde's
/// [`deserialize_with`](https://serde.rs/field-attrs.html#deserialize_with)
/// field attribute.
///
/// [`chrono::Duration`]:
///     https://docs.rs/chrono/latest/chrono/struct.Duration.html
///
#[cfg(feature = "chrono")]
#[cfg_attr(docsrs, doc(cfg(feature = "chrono")))]
pub fn deserialize_as_duration_or_none<'de, D>(
    deserializer: D,
) -> Result<Option<chrono::Duration>, D::Error>
where
    D: Deserializer<'de>,
{
    let data = Data::deserialize(deserializer)?;
    Ok(data.as_duration())
}

/// A helper function to deserialize cell values as [`chrono::Duration`].
///
/// This is useful when cells may also contain invalid values (i.e. strings). It
/// applies the [`as_duration()`](crate::Data::as_duration) method to the cell
/// value, and returns `Ok(Ok(value_as_duration))` if successful or
/// `Ok(Err(value_to_string))` if unsuccessful, therefore never failing.
///
/// This function is intended to be used with Serde's
/// [`deserialize_with`](https://serde.rs/field-attrs.html#deserialize_with)
/// field attribute.
///
/// [`chrono::Duration`]:
///     https://docs.rs/chrono/latest/chrono/struct.Duration.html
///
#[cfg(feature = "chrono")]
#[cfg_attr(docsrs, doc(cfg(feature = "chrono")))]
pub fn deserialize_as_duration_or_string<'de, D>(
    deserializer: D,
) -> Result<Result<chrono::Duration, String>, D::Error>
where
    D: Deserializer<'de>,
{
    let data = Data::deserialize(deserializer)?;
    // On conversion failure the inner `Err` carries the cell's string form.
    Ok(data.as_duration().ok_or_else(|| data.to_string()))
}

/// A helper function to deserialize cell values as [`chrono::NaiveDateTime`].
///
/// This is useful when cells may also contain invalid values (i.e. strings). It
/// applies the [`as_datetime()`](crate::Data::as_datetime) method to the cell
/// value, and returns `Ok(Some(value_as_datetime))` if successful or `Ok(None)`
/// if unsuccessful, therefore never failing.
///
/// This function is intended to be used with Serde's
/// [`deserialize_with`](https://serde.rs/field-attrs.html#deserialize_with)
/// field attribute.
/// Deserializes a cell into a [`chrono::NaiveDateTime`], keeping the cell's
/// text when it is not a datetime.
///
/// Meant for columns that may also hold invalid values (e.g. strings). The
/// cell is converted with [`as_datetime()`](crate::Data::as_datetime): on
/// success the result is `Ok(Ok(value_as_datetime))`, otherwise it is
/// `Ok(Err(value_to_string))`. The conversion step itself therefore never
/// fails; only an error raised by the deserializer is propagated.
///
/// This function is intended to be used with Serde's
/// [`deserialize_with`](https://serde.rs/field-attrs.html#deserialize_with)
/// field attribute.
///
/// [`chrono::NaiveDateTime`]: https://docs.rs/chrono/latest/chrono/naive/struct.NaiveDateTime.html
///
#[cfg(feature = "chrono")]
#[cfg_attr(docsrs, doc(cfg(feature = "chrono")))]
pub fn deserialize_as_datetime_or_string<'de, D>(
    deserializer: D,
) -> Result<Result<chrono::NaiveDateTime, String>, D::Error>
where
    D: Deserializer<'de>,
{
    let cell = Data::deserialize(deserializer)?;
    // `map_or_else` keeps the string rendering lazy: it only runs on failure.
    Ok(cell.as_datetime().map_or_else(|| Err(cell.to_string()), Ok))
}
/// /// [ODF 1.2]: http://docs.oasis-open.org/office/v1.2/OpenDocument-v1.2.pdf /// use std::collections::{BTreeMap, HashMap}; use std::io::{BufReader, Read, Seek}; use log::warn; use quick_xml::events::attributes::Attributes; use quick_xml::events::Event; use quick_xml::name::QName; use quick_xml::Reader as XmlReader; use zip::read::{ZipArchive, ZipFile}; use zip::result::ZipError; use crate::utils::unescape_entity_to_buffer; use crate::vba::VbaProject; use crate::{Data, DataType, HeaderRow, Metadata, Range, Reader, Sheet, SheetType, SheetVisible}; use std::marker::PhantomData; const MIMETYPE: &[u8] = b"application/vnd.oasis.opendocument.spreadsheet"; /// Maximum number of rows allowed in an ODS file (matches XLSX limit). const MAX_ROWS: u32 = 1_048_576; /// Maximum number of columns allowed in an ODS file (matches XLSX limit). const MAX_COLUMNS: u32 = 16_384; /// Maximum number of cells to prevent memory exhaustion from malicious files. const MAX_CELLS: usize = 100_000_000; type OdsReader<'a, RS> = XmlReader>>; /// An enum for ods specific errors #[derive(Debug)] pub enum OdsError { /// Io error Io(std::io::Error), /// Zip error Zip(zip::result::ZipError), /// Xml error Xml(quick_xml::Error), /// Xml attribute error XmlAttr(quick_xml::events::attributes::AttrError), /// Error while parsing string Parse(std::string::ParseError), /// Error while parsing integer ParseInt(std::num::ParseIntError), /// Error while parsing float ParseFloat(std::num::ParseFloatError), /// Error while parsing bool ParseBool(std::str::ParseBoolError), /// Invalid MIME InvalidMime(Vec), /// File not found FileNotFound(&'static str), /// Unexpected end of file Eof(&'static str), /// Unexpected error Mismatch { /// Expected expected: &'static str, /// Found found: String, }, /// Workbook is password protected Password, /// Worksheet not found WorksheetNotFound(String), /// XML attribute error AttrError(quick_xml::events::attributes::AttrError), /// XML encoding error 
EncodingError(quick_xml::encoding::EncodingError), /// File exceeds maximum cell count CellLimitExceeded { /// Number of cells requested requested: usize, /// Maximum allowed cells max: usize, }, } /// Ods reader options #[derive(Debug, Default)] #[non_exhaustive] struct OdsOptions { pub header_row: HeaderRow, } from_err!(std::io::Error, OdsError, Io); from_err!(zip::result::ZipError, OdsError, Zip); from_err!(quick_xml::Error, OdsError, Xml); from_err!(std::str::ParseBoolError, OdsError, ParseBool); from_err!(std::num::ParseFloatError, OdsError, ParseFloat); from_err!(std::num::ParseIntError, OdsError, ParseInt); from_err!(quick_xml::events::attributes::AttrError, OdsError, XmlAttr); from_err!(quick_xml::encoding::EncodingError, OdsError, Xml); impl std::fmt::Display for OdsError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { OdsError::Io(e) => write!(f, "I/O error: {e}"), OdsError::Zip(e) => write!(f, "Zip error: {e:?}"), OdsError::Xml(e) => write!(f, "Xml error: {e}"), OdsError::XmlAttr(e) => write!(f, "Xml attribute error: {e}"), OdsError::Parse(e) => write!(f, "Parse string error: {e}"), OdsError::ParseInt(e) => write!(f, "Parse integer error: {e}"), OdsError::ParseFloat(e) => write!(f, "Parse float error: {e}"), OdsError::ParseBool(e) => write!(f, "Parse bool error: {e}"), OdsError::InvalidMime(mime) => write!(f, "Invalid MIME type: {mime:?}"), OdsError::FileNotFound(file) => write!(f, "'{file}' file not found in archive"), OdsError::Eof(node) => write!(f, "Expecting '{node}' node, found end of xml file"), OdsError::Mismatch { expected, found } => { write!(f, "Expecting '{expected}', found '{found}'") } OdsError::Password => write!(f, "Workbook is password protected"), OdsError::WorksheetNotFound(name) => write!(f, "Worksheet '{name}' not found"), OdsError::AttrError(e) => write!(f, "XML attribute Error: {e}"), OdsError::EncodingError(e) => write!(f, "XML encoding Error: {e}"), OdsError::CellLimitExceeded { requested, max 
} => { write!( f, "Cell limit exceeded ({requested} cells requested, max {max})" ) } } } } impl std::error::Error for OdsError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { OdsError::Io(e) => Some(e), OdsError::Zip(e) => Some(e), OdsError::Xml(e) => Some(e), OdsError::Parse(e) => Some(e), OdsError::ParseInt(e) => Some(e), OdsError::ParseFloat(e) => Some(e), OdsError::AttrError(e) => Some(e), OdsError::EncodingError(e) => Some(e), _ => None, } } } /// An `OpenDocument` Spreadsheet document parser /// /// # Reference /// /// OASIS Open Document Format for Office Application 1.2 ([ODF 1.2]). /// /// [ODF 1.2]: http://docs.oasis-open.org/office/v1.2/OpenDocument-v1.2.pdf /// pub struct Ods { sheets: BTreeMap, Range)>, metadata: Metadata, marker: PhantomData, #[cfg(feature = "picture")] pictures: Option)>>, /// Reader options options: OdsOptions, } impl Reader for Ods where RS: Read + Seek, { type Error = OdsError; fn new(reader: RS) -> Result { let mut zip = ZipArchive::new(reader)?; // check mimetype match zip.by_name("mimetype") { Ok(mut f) => { let mut buf = [0u8; 46]; f.read_exact(&mut buf)?; if &buf[..] 
/// Returns the workbook metadata (the sheet list and the defined names)
/// that was collected from `content.xml` when the workbook was opened.
fn metadata(&self) -> &Metadata {
    &self.metadata
}
.0 .to_owned(); match self.options.header_row { HeaderRow::FirstNonEmptyRow => Ok(sheet), HeaderRow::Row(header_row_idx) => { // If `header_row` is a row index, adjust the range if let (Some(start), Some(end)) = (sheet.start(), sheet.end()) { Ok(sheet.range((header_row_idx, start.1), end)) } else { Ok(sheet) } } } } fn worksheets(&mut self) -> Vec<(String, Range)> { self.sheets .iter() .map(|(name, (range, _formula))| (name.to_owned(), range.clone())) .collect() } /// Read worksheet data in corresponding worksheet path fn worksheet_formula(&mut self, name: &str) -> Result, OdsError> { self.sheets .get(name) .ok_or_else(|| OdsError::WorksheetNotFound(name.into())) .map(|r| r.1.to_owned()) } #[cfg(feature = "picture")] fn pictures(&self) -> Option)>> { self.pictures.to_owned() } } struct Content { sheets: BTreeMap, Range)>, sheets_metadata: Vec, defined_names: Vec<(String, String)>, } /// Check password protection fn check_for_password_protected(zip: &mut ZipArchive) -> Result<(), OdsError> { let mut reader = match zip.by_name("META-INF/manifest.xml") { Ok(f) => { let mut r = XmlReader::from_reader(BufReader::new(f)); let config = r.config_mut(); config.check_end_names = false; config.trim_text(false); config.check_comments = false; config.expand_empty_elements = true; r } Err(ZipError::FileNotFound) => return Err(OdsError::FileNotFound("META-INF/manifest.xml")), Err(e) => return Err(OdsError::Zip(e)), }; let mut buf = Vec::new(); let mut inner = Vec::new(); loop { match reader.read_event_into(&mut buf) { Ok(Event::Start(e)) if e.name() == QName(b"manifest:file-entry") => { loop { match reader.read_event_into(&mut inner) { Ok(Event::Start(e)) if e.name() == QName(b"manifest:encryption-data") => { return Err(OdsError::Password) } Ok(Event::Eof) => break, Err(e) => return Err(OdsError::Xml(e)), _ => (), } } inner.clear(); } Ok(Event::Eof) => break, Err(e) => return Err(OdsError::Xml(e)), _ => (), } buf.clear(); } Ok(()) } /// Parses content.xml and store the result in 
`self.content` fn parse_content(mut zip: ZipArchive) -> Result { let mut reader = match zip.by_name("content.xml") { Ok(f) => { let mut r = XmlReader::from_reader(BufReader::new(f)); let config = r.config_mut(); config.check_end_names = false; config.trim_text(false); config.check_comments = false; config.expand_empty_elements = true; r } Err(ZipError::FileNotFound) => return Err(OdsError::FileNotFound("content.xml")), Err(e) => return Err(OdsError::Zip(e)), }; let mut buf = Vec::with_capacity(1024); let mut sheets = BTreeMap::new(); let mut defined_names = Vec::new(); let mut sheets_metadata = Vec::new(); let mut styles = HashMap::new(); let mut style_name: Option = None; loop { match reader.read_event_into(&mut buf) { Ok(Event::Start(e)) if e.name() == QName(b"style:style") => { style_name = e .try_get_attribute(b"style:name")? .map(|a| a.decode_and_unescape_value(reader.decoder())) .transpose()? .map(|x| x.to_string()); } Ok(Event::Start(e)) if style_name.is_some() && e.name() == QName(b"style:table-properties") => { let visible = match e.try_get_attribute(b"table:display")? { Some(a) => { if a.decode_and_unescape_value(reader.decoder())?.parse()? { SheetVisible::Visible } else { SheetVisible::Hidden } } None => SheetVisible::Visible, }; styles.insert(style_name.clone(), visible); } Ok(Event::Start(e)) if e.name() == QName(b"table:table") => { let visible = styles .get( &e.try_get_attribute(b"table:style-name")? .map(|a| a.decode_and_unescape_value(reader.decoder())) .transpose()? 
.map(|x| x.to_string()), ) .cloned() .unwrap_or(SheetVisible::Visible); if let Some(a) = e .attributes() .filter_map(|a| a.ok()) .find(|a| a.key == QName(b"table:name")) { let name = a.decode_and_unescape_value(reader.decoder())?.to_string(); let (range, formulas) = read_table(&mut reader)?; sheets_metadata.push(Sheet { name: name.clone(), typ: SheetType::WorkSheet, visible, }); sheets.insert(name, (range, formulas)); } } Ok(Event::Start(e)) if e.name() == QName(b"table:named-expressions") => { defined_names = read_named_expressions(&mut reader)?; } Ok(Event::Eof) => break, Err(e) => return Err(OdsError::Xml(e)), _ => (), } buf.clear(); } Ok(Content { sheets, sheets_metadata, defined_names, }) } fn read_table(reader: &mut OdsReader<'_, RS>) -> Result<(Range, Range), OdsError> where RS: Read + Seek, { let mut cells = Vec::new(); let mut rows_repeats = Vec::new(); let mut formulas = Vec::new(); let mut cols = Vec::new(); let mut buf = Vec::with_capacity(1024); let mut row_buf = Vec::with_capacity(1024); let mut cell_buf = Vec::with_capacity(1024); let mut total_rows = 0; cols.push(0); loop { match reader.read_event_into(&mut buf) { Ok(Event::Start(e)) if e.name() == QName(b"table:table-row") => { let row_repeats = match e.try_get_attribute(b"table:number-rows-repeated")? 
{ Some(c) => c.decode_and_unescape_value(reader.decoder())?.parse()?, None => 1, }; // Cap row_repeats so total rows don't exceed MAX_ROWS let remaining_rows = (MAX_ROWS as usize).saturating_sub(total_rows); let capped_repeats = row_repeats.min(remaining_rows); if capped_repeats < row_repeats { warn!( "ods row repeat count capped ({row_repeats} -> {capped_repeats}, max rows {MAX_ROWS})" ); } total_rows = total_rows.saturating_add(capped_repeats); read_row( reader, &mut row_buf, &mut cell_buf, &mut cells, &mut formulas, )?; cols.push(cells.len()); rows_repeats.push(capped_repeats); } Ok(Event::End(e)) if e.name() == QName(b"table:table") => break, Err(e) => return Err(OdsError::Xml(e)), Ok(_) => (), } buf.clear(); } Ok(( get_range(cells, &cols, &rows_repeats)?, get_range(formulas, &cols, &rows_repeats)?, )) } fn is_empty_row(row: &[T]) -> bool { row.iter().all(|x| x == &T::default()) } fn get_range( mut cells: Vec, cols: &[usize], rows_repeats: &[usize], ) -> Result, OdsError> { // find smallest area with non empty Cells let mut row_min = None; let mut row_max = 0; let mut col_min = usize::MAX; let mut col_max = 0; let mut first_empty_rows_repeated = 0; { for (i, w) in cols.windows(2).enumerate() { let row = &cells[w[0]..w[1]]; if let Some(p) = row.iter().position(|c| c != &T::default()) { if row_min.is_none() { row_min = Some(i); first_empty_rows_repeated = rows_repeats.iter().take(i).sum::().saturating_sub(i); } row_max = i; if p < col_min { col_min = p; } if let Some(p) = row.iter().rposition(|c| c != &T::default()) { if p > col_max { col_max = p; } } } } } let Some(row_min) = row_min else { return Ok(Range::default()); }; // rebuild cells into its smallest non empty area let row_width = col_max + 1 - col_min; let cells_len = (row_max + 1 - row_min) * row_width; { let mut new_cells = Vec::with_capacity(cells_len.min(MAX_CELLS)); let empty_cells = vec![T::default(); col_max + 1]; let mut empty_row_repeats = 0_usize; let mut consecutive_empty_rows = 0_usize; for 
(w, row_repeats) in cols .windows(2) .skip(row_min) .take(row_max + 1) .zip(rows_repeats.iter().skip(row_min).take(row_max + 1)) { let row = &cells[w[0]..w[1]]; let row_repeats = *row_repeats; if is_empty_row(row) { empty_row_repeats = empty_row_repeats.saturating_add(row_repeats); consecutive_empty_rows += 1; continue; } if empty_row_repeats > 0 { // Check if expanding empty rows would exceed MAX_CELLS let cells_to_add = empty_row_repeats.saturating_mul(row_width); if new_cells.len().saturating_add(cells_to_add) > MAX_CELLS { return Err(OdsError::CellLimitExceeded { requested: new_cells.len().saturating_add(cells_to_add), max: MAX_CELLS, }); } row_max = row_max + empty_row_repeats - consecutive_empty_rows; for _ in 0..empty_row_repeats { new_cells.extend_from_slice(&empty_cells[col_min..]); } empty_row_repeats = 0; consecutive_empty_rows = 0; } if row_repeats > 1 { row_max = row_max + row_repeats - 1; } // Check if expanding this row would exceed MAX_CELLS let cells_to_add = row_repeats.saturating_mul(row_width); if new_cells.len().saturating_add(cells_to_add) > MAX_CELLS { return Err(OdsError::CellLimitExceeded { requested: new_cells.len().saturating_add(cells_to_add), max: MAX_CELLS, }); } for _ in 0..row_repeats { match row.len().cmp(&(col_max + 1)) { std::cmp::Ordering::Less => { new_cells.extend_from_slice(&row[col_min..]); new_cells.extend_from_slice(&empty_cells[row.len()..]); } std::cmp::Ordering::Equal => { new_cells.extend_from_slice(&row[col_min..]); } std::cmp::Ordering::Greater => { new_cells.extend_from_slice(&row[col_min..=col_max]); } } } } cells = new_cells; } let row_min = row_min + first_empty_rows_repeated; let row_max = row_max + first_empty_rows_repeated; Ok(Range { start: (row_min as u32, col_min as u32), end: (row_max as u32, col_max as u32), inner: cells, }) } fn read_row( reader: &mut OdsReader<'_, RS>, row_buf: &mut Vec, cell_buf: &mut Vec, cells: &mut Vec, formulas: &mut Vec, ) -> Result<(), OdsError> where RS: Read + Seek, { let mut 
empty_col_repeats = 0; let row_start = cells.len(); loop { row_buf.clear(); match reader.read_event_into(row_buf) { Ok(Event::Start(e)) if e.name() == QName(b"table:table-cell") || e.name() == QName(b"table:covered-table-cell") => { let mut repeats = 1; for a in e.attributes() { let a = a?; if a.key == QName(b"table:number-columns-repeated") { repeats = reader.decoder().decode(&a.value)?.parse()?; break; } } let (value, formula, is_closed) = get_datatype(reader, e.attributes(), cell_buf)?; // Cap empty_col_repeats to not exceed MAX_COLUMNS let current_cols = cells.len() - row_start; let remaining = (MAX_COLUMNS as usize).saturating_sub(current_cols); let capped_empty = empty_col_repeats.min(remaining); if capped_empty < empty_col_repeats { warn!( "ods column repeat count capped ({empty_col_repeats} -> {capped_empty}, max columns {MAX_COLUMNS})" ); } for _ in 0..capped_empty { cells.push(Data::Empty); formulas.push("".to_string()); } empty_col_repeats = 0; // Cap repeats to not exceed MAX_COLUMNS let current_cols = cells.len() - row_start; let remaining = (MAX_COLUMNS as usize).saturating_sub(current_cols); let capped_repeats = repeats.min(remaining); if capped_repeats < repeats { warn!( "ods column repeat count capped ({repeats} -> {capped_repeats}, max columns {MAX_COLUMNS})" ); } if value.is_empty() && formula.is_empty() { empty_col_repeats = capped_repeats; } else { for _ in 0..capped_repeats { cells.push(value.clone()); formulas.push(formula.clone()); } } if !is_closed { reader.read_to_end_into(e.name(), cell_buf)?; } } Ok(Event::End(e)) if e.name() == QName(b"table:table-row") => break, Err(e) => return Err(OdsError::Xml(e)), Ok(e) => { return Err(OdsError::Mismatch { expected: "table-cell", found: format!("{e:?}"), }); } } } Ok(()) } /// Converts table-cell element into a `Data` /// /// ODF 1.2-19.385 fn get_datatype( reader: &mut OdsReader<'_, RS>, atts: Attributes<'_>, buf: &mut Vec, ) -> Result<(Data, String, bool), OdsError> where RS: Read + Seek, { let 
mut is_string = false; let mut is_value_set = false; let mut val = Data::Empty; let mut formula = String::new(); for a in atts { let a = a?; match a.key { QName(b"office:value") if !is_value_set => { let v = reader.decoder().decode(&a.value)?; val = Data::Float( fast_float2::parse(v.as_bytes()) .map_err(|_| OdsError::ParseFloat(v.parse::().unwrap_err()))?, ); is_value_set = true; } QName(b"office:string-value" | b"office:date-value" | b"office:time-value") if !is_value_set => { let attr = a.decode_and_unescape_value(reader.decoder())?.to_string(); val = match a.key { QName(b"office:date-value") => Data::DateTimeIso(attr), QName(b"office:time-value") => Data::DurationIso(attr), _ => Data::String(attr), }; is_value_set = true; } QName(b"office:boolean-value") if !is_value_set => { let b = &*a.value == b"TRUE" || &*a.value == b"true"; val = Data::Bool(b); is_value_set = true; } QName(b"office:value-type") if !is_value_set => is_string = &*a.value == b"string", QName(b"table:formula") => { formula = a.decode_and_unescape_value(reader.decoder())?.to_string(); } _ => (), } } if !is_value_set && is_string { // If the value type is string and the office:string-value attribute // is not present, the element content defines the value. 
let mut s = String::new(); let mut first_paragraph = true; loop { buf.clear(); match reader.read_event_into(buf) { Ok(Event::Text(t)) => { s.push_str(&t.xml10_content()?); } Ok(Event::GeneralRef(e)) => { unescape_entity_to_buffer(&e, &mut s)?; } Ok(Event::End(e)) if e.name() == QName(b"table:table-cell") || e.name() == QName(b"table:covered-table-cell") => { return Ok((Data::String(s), formula, true)); } Ok(Event::Start(e)) if e.name() == QName(b"office:annotation") => loop { match reader.read_event_into(buf) { Ok(Event::End(e)) if e.name() == QName(b"office:annotation") => { break; } Err(e) => return Err(OdsError::Xml(e)), _ => (), } }, Ok(Event::Start(e)) if e.name() == QName(b"text:p") => { if first_paragraph { first_paragraph = false; } else { s.push('\n'); } } Ok(Event::Start(e)) if e.name() == QName(b"text:s") => { let count = match e.try_get_attribute("text:c")? { Some(c) => c.decode_and_unescape_value(reader.decoder())?.parse()?, None => 1, }; for _ in 0..count { s.push(' '); } } Err(e) => return Err(OdsError::Xml(e)), Ok(Event::Eof) => return Err(OdsError::Eof("table:table-cell")), _ => (), } } } else { Ok((val, formula, false)) } } fn read_named_expressions( reader: &mut OdsReader<'_, RS>, ) -> Result, OdsError> where RS: Read + Seek, { let mut defined_names = Vec::new(); let mut buf = Vec::with_capacity(512); loop { buf.clear(); match reader.read_event_into(&mut buf) { Ok(Event::Start(e)) if e.name() == QName(b"table:named-range") || e.name() == QName(b"table:named-expression") => { let mut name = String::new(); let mut formula = String::new(); for a in e.attributes() { let a = a?; match a.key { QName(b"table:name") => { name = a.decode_and_unescape_value(reader.decoder())?.to_string(); } QName(b"table:cell-range-address" | b"table:expression") => { formula = a.decode_and_unescape_value(reader.decoder())?.to_string(); } _ => (), } } defined_names.push((name, formula)); } Ok(Event::End(e)) if e.name() == QName(b"table:named-range") || e.name() == 
// Collects the embedded pictures of the document.
//
// Every archive entry whose name starts with `Pictures` and whose text after
// the last `.` is one of the known picture extensions is read fully into
// memory. Returns `(extension, bytes)` pairs, or `None` when no picture was
// found.
#[cfg(feature = "picture")]
#[allow(clippy::type_complexity)]
fn read_pictures<RS: Read + Seek>(
    zip: &mut ZipArchive<RS>,
) -> Result<Option<Vec<(String, Vec<u8>)>>, OdsError> {
    const PICTURE_EXTENSIONS: [&str; 12] = [
        "emf", "wmf", "pict", "jpeg", "jpg", "png", "dib", "gif", "tiff", "eps", "bmp", "wpg",
    ];

    let mut pictures = Vec::new();
    for index in 0..zip.len() {
        let mut entry = zip.by_index(index)?;

        // Decide from the entry name alone whether this is a picture. The
        // extension is copied out so the name borrow ends before reading.
        let extension = {
            let entry_name = entry.name();
            // no Thumbnails
            if !entry_name.starts_with("Pictures") {
                continue;
            }
            match entry_name.split('.').next_back() {
                Some(ext) if PICTURE_EXTENSIONS.contains(&ext) => ext.to_string(),
                _ => continue,
            }
        };

        let mut bytes: Vec<u8> = Vec::new();
        entry.read_to_end(&mut bytes)?;
        pictures.push((extension, bytes));
    }

    Ok((!pictures.is_empty()).then_some(pictures))
}
/// Push literal column into a String buffer
///
/// `col` is a zero-based column index. Columns are named in bijective
/// base 26: `0` is `A`, `25` is `Z`, `26` is `AA`, `701` is `ZZ`, `702` is
/// `AAA`, and so on. The letters are appended to `buf`; existing content is
/// left untouched.
pub fn push_column(col: u32, buf: &mut String) {
    // Seven letters are enough for any `u32` (26 + 26^2 + ... + 26^7 > 2^32).
    let mut letters = [0u8; 7];
    let mut len = 0;

    // Work on the one-based index, widened so that `u32::MAX + 1` cannot
    // overflow. Each round emits the least significant letter; subtracting one
    // first is what makes the numbering bijective (there is no zero digit).
    let mut remaining = u64::from(col) + 1;
    while remaining > 0 {
        remaining -= 1;
        letters[len] = b'A' + (remaining % 26) as u8;
        len += 1;
        remaining /= 26;
    }

    // Letters were produced least significant first.
    buf.extend(letters[..len].iter().rev().map(|&b| char::from(b)));
}
{ buffer.push(unescaped_char); return Ok(()); } // Raise a quick_xml::Error if we failed to unescape the entity. The // location information isn't useful to the end user in this context so we // return `0..0`. Err(quick_xml::Error::Escape( quick_xml::escape::EscapeError::UnrecognizedEntity(0..0, format!("&{decoded};")), )) } // Function to unescape Excel XML escapes in a string. Excel encodes a character // like "\r" as "_x000D_". In turn it escapes the literal string "_x000D_" as // "_x005F_x000D_". pub(crate) fn unescape_xml(original: &str) -> Cow<'_, str> { if !original.contains("_x00") { return Cow::Borrowed(original); } let bytes = original.as_bytes(); let mut escaped_string = String::with_capacity(original.len()); let mut i = 0; let mut has_changes = false; while i < bytes.len() { // Look for an escape sequence like: "_x00HH_" if i + UNICODE_ESCAPE_LENGTH <= bytes.len() && bytes[i] == b'_' && bytes.get(i..i + 4) == Some(b"_x00") && bytes[i + 6] == b'_' { // Extract and validate the hex digits. if let Ok(hex_str) = std::str::from_utf8(&bytes[i + 4..i + 6]) { if let Ok(hex_value) = u8::from_str_radix(hex_str, 16) { // Valid escape sequence, convert to a character. escaped_string.push(hex_value as char); // Skip the processed escape sequence. i += UNICODE_ESCAPE_LENGTH; has_changes = true; continue; } } } // Not an escape sequence, so we copy the current UTF-8 character. let remaining = &original[i..]; match remaining.chars().next() { Some(ch) => { escaped_string.push(ch); i += ch.len_utf8(); } None => break, } } if has_changes { Cow::Owned(escaped_string) } else { Cow::Borrowed(original) } } /// Build a lookup cache mapping lowercased normalized paths to original ZIP entry names. 
/// Resolves `path` to the original ZIP entry name recorded in `cache`.
///
/// The lookup key is the ASCII-lowercased `path`. When the cache has no entry
/// for that key, `path` itself is returned unchanged.
pub fn cached_zip_path<'a>(cache: &'a HashMap<String, String>, path: &'a str) -> &'a str {
    match cache.get(path.to_ascii_lowercase().as_str()) {
        Some(original_name) => original_name,
        None => path,
    }
}
"ADD.MENU", "ADD.COMMAND", "ENABLE.COMMAND", "CHECK.COMMAND", "RENAME.COMMAND", "SHOW.BAR", "DELETE.MENU", "DELETE.COMMAND", "GET.CHART.ITEM", "DIALOG.BOX", "CLEAN", "MDETERM", "MINVERSE", "MMULT", "FILES", "IPMT", "PPMT", "COUNTA", "CANCEL.KEY", "FOR", "WHILE", "BREAK", "NEXT", "INITIATE", "REQUEST", "POKE", "EXECUTE", "TERMINATE", "RESTART", "HELP", "GET.BAR", "PRODUCT", "FACT", "GET.CELL", "GET.WORKSPACE", "GET.WINDOW", "GET.DOCUMENT", "DPRODUCT", "ISNONTEXT", "GET.NOTE", "NOTE", "STDEVP", "VARP", "DSTDEVP", "DVARP", "TRUNC", "ISLOGICAL", "DCOUNTA", "DELETE.BAR", "UNREGISTER", "", "", "USDOLLAR", "FINDB", "SEARCHB", "REPLACEB", "LEFTB", "RIGHTB", "MIDB", "LENB", "ROUNDUP", "ROUNDDOWN", "ASC", "DBCS", "RANK", "", "", "ADDRESS", "DAYS360", "TODAY", "VDB", "ELSE", "ELSE.IF", "END.IF", "FOR.CELL", "MEDIAN", "SUMPRODUCT", "SINH", "COSH", "TANH", "ASINH", "ACOSH", "ATANH", "DGET", "CREATE.OBJECT", "VOLATILE", "LAST.ERROR", "CUSTOM.UNDO", "CUSTOM.REPEAT", "FORMULA.CONVERT", "GET.LINK.INFO", "TEXT.BOX", "INFO", "GROUP", "GET.OBJECT", "DB", "PAUSE", "", "", "RESUME", "FREQUENCY", "ADD.TOOLBAR", "DELETE.TOOLBAR", "User", "RESET.TOOLBAR", "EVALUATE", "GET.TOOLBAR", "GET.TOOL", "SPELLING.CHECK", "ERROR.TYPE", "APP.TITLE", "WINDOW.TITLE", "SAVE.TOOLBAR", "ENABLE.TOOL", "PRESS.TOOL", "REGISTER.ID", "GET.WORKBOOK", "AVEDEV", "BETADIST", "GAMMALN", "BETAINV", "BINOMDIST", "CHIDIST", "CHIINV", "COMBIN", "CONFIDENCE", "CRITBINOM", "EVEN", "EXPONDIST", "FDIST", "FINV", "FISHER", "FISHERINV", "FLOOR", "GAMMADIST", "GAMMAINV", "CEILING", "HYPGEOMDIST", "LOGNORMDIST", "LOGINV", "NEGBINOMDIST", "NORMDIST", "NORMSDIST", "NORMINV", "NORMSINV", "STANDARDIZE", "ODD", "PERMUT", "POISSON", "TDIST", "WEIBULL", "SUMXMY2", "SUMX2MY2", "SUMX2PY2", "CHITEST", "CORREL", "COVAR", "FORECAST", "FTEST", "INTERCEPT", "PEARSON", "RSQ", "STEYX", "SLOPE", "TTEST", "PROB", "DEVSQ", "GEOMEAN", "HARMEAN", "SUMSQ", "KURT", "SKEW", "ZTEST", "LARGE", "SMALL", "QUARTILE", "PERCENTILE", "PERCENTRANK", "MODE", 
"TRIMMEAN", "TINV", "", "MOVIE.COMMAND", "GET.MOVIE", "CONCATENATE", "POWER", "PIVOT.ADD.DATA", "GET.PIVOT.TABLE", "GET.PIVOT.FIELD", "GET.PIVOT.ITEM", "RADIANS", "DEGREES", "SUBTOTAL", "SUMIF", "COUNTIF", "COUNTBLANK", "SCENARIO.GET", "OPTIONS.LISTS.GET", "ISPMT", "DATEDIF", "DATESTRING", "NUMBERSTRING", "ROMAN", "OPEN.DIALOG", "SAVE.DIALOG", "VIEW.GET", "GETPIVOTDATA", "HYPERLINK", "PHONETIC", "AVERAGEA", "MAXA", "MINA", "STDEVPA", "VARPA", "STDEVA", "VARA", "BAHTTEXT", "THAIDAYOFWEEK", "THAIDIGIT", "THAIMONTHOFYEAR", "THAINUMSOUND", "THAINUMSTRING", "THAISTRINGLENGTH", "ISTHAIDIGIT", "ROUNDBAHTDOWN", "ROUNDBAHTUP", "THAIYEAR", "RTD", "CUBEVALUE", "CUBEMEMBER", "CUBEMEMBERPROPERTY", "CUBERANKEDMEMBER", "HEX2BIN", "HEX2DEC", "HEX2OCT", "DEC2BIN", "DEC2HEX", "DEC2OCT", "OCT2BIN", "OCT2HEX", "OCT2DEC", "BIN2DEC", "BIN2OCT", "BIN2HEX", "IMSUB", "IMDIV", "IMPOWER", "IMABS", "IMSQRT", "IMLN", "IMLOG2", "IMLOG10", "IMSIN", "IMCOS", "IMEXP", "IMARGUMENT", "IMCONJUGATE", "IMAGINARY", "IMREAL", "COMPLEX", "IMSUM", "IMPRODUCT", "SERIESSUM", "FACTDOUBLE", "SQRTPI", "QUOTIENT", "DELTA", "GESTEP", "ISEVEN", "ISODD", "MROUND", "ERF", "ERFC", "BESSELJ", "BESSELK", "BESSELY", "BESSELI", "XIRR", "XNPV", "PRICEMAT", "YIELDMAT", "INTRATE", "RECEIVED", "DISC", "PRICEDISC", "YIELDDISC", "TBILLEQ", "TBILLPRICE", "TBILLYIELD", "PRICE", "YIELD", "DOLLARDE", "DOLLARFR", "NOMINAL", "EFFECT", "CUMPRINC", "CUMIPMT", "EDATE", "EOMONTH", "YEARFRAC", "COUPDAYBS", "COUPDAYS", "COUPDAYSNC", "COUPNCD", "COUPNUM", "COUPPCD", "DURATION", "MDURATION", "ODDLPRICE", "ODDLYIELD", "ODDFPRICE", "ODDFYIELD", "RANDBETWEEN", "WEEKNUM", "AMORDEGRC", "AMORLINC", "CONVERT", // "SHEETJS", "ACCRINT", "ACCRINTM", "WORKDAY", "NETWORKDAYS", "GCD", "MULTINOMIAL", "LCM", "FVSCHEDULE", "CUBEKPIMEMBER", "CUBESET", "CUBESETCOUNT", "IFERROR", "COUNTIFS", "SUMIFS", "AVERAGEIF", "AVERAGEIFS", ]; pub const FTAB_ARGC: [u8; FTAB_LEN] = [ 255, // "COUNT", 3, // "IF", 1, // "ISNA", 1, // "ISERROR", 255, // "SUM", 255, // 
"AVERAGE", 255, // "MIN", 255, // "MAX", 1, // "ROW", 1, // "COLUMN", 0, // "NA", 254, // "NPV", 255, // "STDEV", 2, // "DOLLAR", 3, // "FIXED", 1, // "SIN", 1, // "COS", 1, // "TAN", 1, // "ATAN", 0, // "PI", 1, // "SQRT", 1, // "EXP", 1, // "LN", 1, // "LOG10", 1, // "ABS", 1, // "INT", 1, // "SIGN", 2, // "ROUND", 3, // "LOOKUP", 4, // "INDEX", 2, // "REPT", 3, // "MID", 1, // "LEN", 1, // "VALUE", 0, // "TRUE", 0, // "FALSE", 255, // "AND", 255, // "OR", 1, // "NOT", 2, // "MOD", 3, // "DCOUNT", 3, // "DSUM", 3, // "DAVERAGE", 3, // "DMIN", 3, // "DMAX", 3, // "DSTDEV", 255, // "VAR", 3, // "DVAR", 2, // "TEXT", 4, // "LINEST", 4, // "TREND", 4, // "LOGEST", 4, // "GROWTH", 1, // "GOTO", 1, // "HALT", 1, // "RETURN", 5, // "PV", 5, // "FV", 5, // "NPER", 5, // "PMT", 6, // "RATE", 3, // "MIRR", 2, // "IRR", 0, // "RAND", 3, // "MATCH", 3, // "DATE", 3, // "TIME", 1, // "DAY", 1, // "MONTH", 1, // "YEAR", 2, // "WEEKDAY", 1, // "HOUR", 1, // "MINUTE", 1, // "SECOND", 0, // "NOW", 1, // "AREAS", 1, // "ROWS", 1, // "COLUMNS", 5, // "OFFSET", 2, // "ABSREF", 2, // "RELREF", 3, // "ARGUMENT", 3, // "SEARCH", 1, // "TRANSPOSE", 2, // "ERROR", 0, // "STEP", 1, // "TYPE", 1, // "ECHO", 2, // "SET.NAME", 0, // "CALLER", 1, // "DEREF", 2, // "WINDOWS", 2, // "SERIES", 2, // "DOCUMENTS", 0, // "ACTIVE.CELL", 0, // "SELECTION", 1, // "RESULT", 2, // "ATAN2", 1, // "ASIN", 1, // "ACOS", 255, // "CHOOSE", 4, // "HLOOKUP", 4, // "VLOOKUP", 2, // "LINKS", 7, // "INPUT", 1, // "ISREF", 1, // "GET.FORMULA", 2, // "GET.NAME", 2, // "SET.VALUE", 2, // "LOG", 4, // "EXEC", 1, // "CHAR", 1, // "LOWER", 1, // "UPPER", 1, // "PROPER", 2, // "LEFT", 2, // "RIGHT", 2, // "EXACT", 1, // "TRIM", 4, // "REPLACE", 4, // "SUBSTITUTE", 1, // "CODE", 3, // "NAMES", 1, // "DIRECTORY", 3, // "FIND", 2, // "CELL", 1, // "ISERR", 1, // "ISTEXT", 1, // "ISNUMBER", 1, // "ISBLANK", 1, // "T", 1, // "N", 2, // "FOPEN", 1, // "FCLOSE", 1, // "FSIZE", 1, // "FREADLN", 2, // "FREAD", 2, // "FWRITELN", 
2, // "FWRITE", 2, // "FPOS", 1, // "DATEVALUE", 1, // "TIMEVALUE", 3, // "SLN", 4, // "SYD", 5, // "DDB", 3, // "GET.DEF", 2, // "REFTEXT", 2, // "TEXTREF", 2, // "INDIRECT", 255, // "REGISTER", 255, // "CALL", 1, // "ADD.BAR", 4, // "ADD.MENU", 5, // "ADD.COMMAND", 5, // "ENABLE.COMMAND", 5, // "CHECK.COMMAND", 5, // "RENAME.COMMAND", 1, // "SHOW.BAR", 3, // "DELETE.MENU", 4, // "DELETE.COMMAND", 3, // "GET.CHART.ITEM", 1, // "DIALOG.BOX", 1, // "CLEAN", 1, // "MDETERM", 1, // "MINVERSE", 1, // "MMULT", 2, // "FILES", 6, // "IPMT", 6, // "PPMT", 255, // "COUNTA", 2, // "CANCEL.KEY", 4, // "FOR", 1, // "WHILE", 0, // "BREAK", 0, // "NEXT", 2, // "INITIATE", 2, // "REQUEST", 3, // "POKE", 2, // "EXECUTE", 1, // "TERMINATE", 1, // "RESTART", 1, // "HELP", 4, // "GET.BAR", 255, // "PRODUCT", 1, // "FACT", 2, // "GET.CELL", 1, // "GET.WORKSPACE", 2, // "GET.WINDOW", 2, // "GET.DOCUMENT", 3, // "DPRODUCT", 1, // "ISNONTEXT", 3, // "GET.NOTE", 4, // "NOTE", 255, // "STDEVP", 255, // "VARP", 3, // "DSTDEVP", 3, // "DVARP", 2, // "TRUNC", 1, // "ISLOGICAL", 3, // "DCOUNTA", 1, // "DELETE.BAR", 1, // "UNREGISTER", 0, // "", 0, // "", 2, // "USDOLLAR", 3, // "FINDB", 3, // "SEARCHB", 4, // "REPLACEB", 2, // "LEFTB", 2, // "RIGHTB", 3, // "MIDB", 3, // "LENB", 2, // "ROUNDUP", 2, // "ROUNDDOWN", 1, // "ASC", 1, // "DBCS", 3, // "RANK", 0, // "", 0, // "", 5, // "ADDRESS", 3, // "DAYS360", 0, // "TODAY", 7, // "VDB", 0, // "ELSE", 1, // "ELSE.IF", 0, // "END.IF", 3, // "FOR.CELL", 255, // "MEDIAN", 255, // "SUMPRODUCT", 1, // "SINH", 1, // "COSH", 1, // "TANH", 1, // "ASINH", 1, // "ACOSH", 1, // "ATANH", 3, // "DGET", 11, // "CREATE.OBJECT", 1, // "VOLATILE", 0, // "LAST.ERROR", 2, // "CUSTOM.UNDO", 3, // "CUSTOM.REPEAT", 5, // "FORMULA.CONVERT", 4, // "GET.LINK.INFO", 4, // "TEXT.BOX", 1, // "INFO", 0, // "GROUP", 5, // "GET.OBJECT", 5, // "DB", 1, // "PAUSE", 0, // "", 0, // "", 1, // "RESUME", 2, // "FREQUENCY", 2, // "ADD.TOOLBAR", 1, // "DELETE.TOOLBAR", 255, // "User", 
1, // "RESET.TOOLBAR", 1, // "EVALUATE", 2, // "GET.TOOLBAR", 3, // "GET.TOOL", 3, // "SPELLING.CHECK", 1, // "ERROR.TYPE", 1, // "APP.TITLE", 1, // "WINDOW.TITLE", 2, // "SAVE.TOOLBAR", 3, // "ENABLE.TOOL", 3, // "PRESS.TOOL", 3, // "REGISTER.ID", 2, // "GET.WORKBOOK", 255, // "AVEDEV", 5, // "BETADIST", 1, // "GAMMALN", 5, // "BETAINV", 4, // "BINOMDIST", 2, // "CHIDIST", 2, // "CHIINV", 2, // "COMBIN", 3, // "CONFIDENCE", 3, // "CRITBINOM", 1, // "EVEN", 3, // "EXPONDIST", 3, // "FDIST", 3, // "FINV", 1, // "FISHER", 1, // "FISHERINV", 2, // "FLOOR", 4, // "GAMMADIST", 3, // "GAMMAINV", 2, // "CEILING", 4, // "HYPGEOMDIST", 3, // "LOGNORMDIST", 3, // "LOGINV", 3, // "NEGBINOMDIST", 4, // "NORMDIST", 1, // "NORMSDIST", 3, // "NORMINV", 1, // "NORMSINV", 3, // "STANDARDIZE", 1, // "ODD", 2, // "PERMUT", 3, // "POISSON", 3, // "TDIST", 4, // "WEIBULL", 2, // "SUMXMY2", 2, // "SUMX2MY2", 2, // "SUMX2PY2", 2, // "CHITEST", 2, // "CORREL", 2, // "COVAR", 3, // "FORECAST", 2, // "FTEST", 2, // "INTERCEPT", 2, // "PEARSON", 2, // "RSQ", 2, // "STEYX", 2, // "SLOPE", 4, // "TTEST", 4, // "PROB", 255, // "DEVSQ", 255, // "GEOMEAN", 255, // "HARMEAN", 255, // "SUMSQ", 255, // "KURT", 255, // "SKEW", 3, // "ZTEST", 2, // "LARGE", 2, // "SMALL", 2, // "QUARTILE", 2, // "PERCENTILE", 3, // "PERCENTRANK", 255, // "MODE", 2, // "TRIMMEAN", 2, // "TINV", 4, // "", 4, // "MOVIE.COMMAND", 3, // "GET.MOVIE", 255, // "CONCATENATE", 2, // "POWER", 9, // "PIVOT.ADD.DATA", 2, // "GET.PIVOT.TABLE", 3, // "GET.PIVOT.FIELD", 4, // "GET.PIVOT.ITEM", 1, // "RADIANS", 1, // "DEGREES", 255, // "SUBTOTAL", 3, // "SUMIF", 2, // "COUNTIF", 1, // "COUNTBLANK", 2, // "SCENARIO.GET", 1, // "OPTIONS.LISTS.GET", 4, // "ISPMT", 3, // "DATEDIF", 1, // "DATESTRING", 2, // "NUMBERSTRING", 2, // "ROMAN", 4, // "OPEN.DIALOG", 5, // "SAVE.DIALOG", 2, // "VIEW.GET", 128, // "GETPIVOTDATA", 2, // "HYPERLINK", 1, // "PHONETIC", 255, // "AVERAGEA", 255, // "MAXA", 255, // "MINA", 255, // "STDEVPA", 255, // 
"VARPA", 255, // "STDEVA", 255, // "VARA", 1, // "BAHTTEXT", 1, // "THAIDAYOFWEEK", 1, // "THAIDIGIT", 1, // "THAIMONTHOFYEAR", 1, // "THAINUMSOUND", 1, // "THAINUMSTRING", 1, // "THAISTRINGLENGTH", 1, // "ISTHAIDIGIT", 1, // "ROUNDBAHTDOWN", 1, // "ROUNDBAHTUP", 1, // "THAIYEAR", 255, // "RTD", 255, // "CUBEVALUE", 3, // "CUBEMEMBER", 3, // "CUBEMEMBERPROPERTY", 4, // "CUBERANKEDMEMBER", 2, // "HEX2BIN", 1, // "HEX2DEC", 2, // "HEX2OCT", 2, // "DEC2BIN", 2, // "DEC2HEX", 2, // "DEC2OCT", 2, // "OCT2BIN", 2, // "OCT2HEX", 1, // "OCT2DEC", 1, // "BIN2DEC", 2, // "BIN2OCT", 2, // "BIN2HEX", 2, // "IMSUB", 2, // "IMDIV", 2, // "IMPOWER", 1, // "IMABS", 1, // "IMSQRT", 1, // "IMLN", 1, // "IMLOG2", 1, // "IMLOG10", 1, // "IMSIN", 1, // "IMCOS", 1, // "IMEXP", 1, // "IMARGUMENT", 1, // "IMCONJUGATE", 1, // "IMAGINARY", 1, // "IMREAL", 3, // "COMPLEX", 255, // "IMSUM", 255, // "IMPRODUCT", 4, // "SERIESSUM", 1, // "FACTDOUBLE", 1, // "SQRTPI", 2, // "QUOTIENT", 2, // "DELTA", 2, // "GESTEP", 1, // "ISEVEN", 1, // "ISODD", 2, // "MROUND", 2, // "ERF", 1, // "ERFC", 2, // "BESSELJ", 2, // "BESSELK", 2, // "BESSELY", 2, // "BESSELI", 3, // "XIRR", 3, // "XNPV", 6, // "PRICEMAT", 6, // "YIELDMAT", 5, // "INTRATE", 5, // "RECEIVED", 5, // "DISC", 5, // "PRICEDISC", 5, // "YIELDDISC", 3, // "TBILLEQ", 3, // "TBILLPRICE", 3, // "TBILLYIELD", 7, // "PRICE", 7, // "YIELD", 2, // "DOLLARDE", 2, // "DOLLARFR", 2, // "NOMINAL", 2, // "EFFECT", 6, // "CUMPRINC", 6, // "CUMIPMT", 2, // "EDATE", 2, // "EOMONTH", 3, // "YEARFRAC", 4, // "COUPDAYBS", 4, // "COUPDAYS", 4, // "COUPDAYSNC", 4, // "COUPNCD", 4, // "COUPNUM", 4, // "COUPPCD", 6, // "DURATION", 6, // "MDURATION", 8, // "ODDLPRICE", 8, // "ODDLYIELD", 8, // "ODDFPRICE", 8, // "ODDFYIELD", 2, // "RANDBETWEEN", 2, // "WEEKNUM", 7, // "AMORDEGRC", 7, // "AMORLINC", 8, // "CONVERT", // 1, // "SHEETJS", 8, // "ACCRINT", 5, // "ACCRINTM", 3, // "WORKDAY", 3, // "NETWORKDAYS", 255, // "GCD", 255, // "MULTINOMIAL", 255, // "LCM", 2, // 
"FVSCHEDULE", 4, // "CUBEKPIMEMBER", 5, // "CUBESET", 1, // "CUBESETCOUNT", 2, // "IFERROR", 128, // "COUNTIFS", 129, // "SUMIFS", 3, // "AVERAGEIF", 129, // "AVERAGEIFS" ]; #[cfg(test)] mod tests { use super::*; #[test] fn sound_to_u32() { let data = b"ABCDEFGH"; assert_eq!( to_u32(data).collect::>(), [u32::from_le_bytes(*b"ABCD"), u32::from_le_bytes(*b"EFGH")] ); } #[test] fn unescape_entity() { // Tests for standard named XML entities. let mut buffer = String::new(); let entity = BytesRef::new("amp"); unescape_entity_to_buffer(&entity, &mut buffer).unwrap(); assert_eq!(buffer, "&"); buffer.clear(); let entity = BytesRef::new("lt"); unescape_entity_to_buffer(&entity, &mut buffer).unwrap(); assert_eq!(buffer, "<"); // Tests for numeric character references. buffer.clear(); let entity = BytesRef::new("#xA"); let result = unescape_entity_to_buffer(&entity, &mut buffer); assert_eq!(buffer, "\n"); assert!(result.is_ok()); buffer.clear(); let entity = BytesRef::new("#x41"); let result = unescape_entity_to_buffer(&entity, &mut buffer); assert!(result.is_ok()); assert_eq!(buffer, "A"); buffer.clear(); let entity = BytesRef::new("#65"); // Decimal. let result = unescape_entity_to_buffer(&entity, &mut buffer); assert!(result.is_ok()); assert_eq!(buffer, "A"); // Test for failure on empty entity. buffer.clear(); let entity = BytesRef::new(""); let result = unescape_entity_to_buffer(&entity, &mut buffer); assert!(result.is_err()); // Test the error message. 
let entity = BytesRef::new("not_a_real_entity"); let result = unescape_entity_to_buffer(&entity, &mut buffer); assert!(result.is_err()); if let Err(quick_xml::Error::Escape(quick_xml::escape::EscapeError::UnrecognizedEntity( _, msg, ))) = result { assert!(msg.contains("not_a_real_entity")); } } #[test] fn xml_with_escapes() { let test_cases = vec![ ("_", "_"), ("_x", "_x"), ("_x0", "_x0"), ("_x00", "_x00"), ("_x005F_", "_"), ("_x000D_", "\r"), ("_x000", "_x000"), ("_x001F_", "\x1F"), ("_x000D", "_x000D"), ("_x00ZZ_", "_x00ZZ_"), ("_x_x_x", "_x_x_x"), ("_x00Β½_", "_x00Β½_"), ("_x000G_", "_x000G_"), ("_x597G_", "_x597G_"), ("πŸ˜€_x000D_πŸ˜€", "πŸ˜€\rπŸ˜€"), ("_x005F_x0000_", "_x0000_"), ("_x005F_x000a_", "_x000a_"), ("_x005F_x000A_", "_x000A_"), ("_x005F_x005F_", "_x005F_"), ("_x005F_x597D_", "_x597D_"), ("_x005F_x597d_", "_x597d_"), ("__x005F_x0000__", "__x0000__"), ("Hello_x000D_World", "Hello\rWorld"), ("Control_x001F_Char", "Control\x1FChar"), ("Hello_x000D_World_x000D_", "Hello\rWorld\r"), ("Just_a_normal_string", "Just_a_normal_string"), ("Hello_x000D_World_x000D__x000D_", "Hello\rWorld\r\r"), ("Multiple_x000D__x000D__x000D_Chars", "Multiple\r\r\rChars"), ]; for (input, expected) in test_cases { assert_eq!(unescape_xml(input), expected); } } } calamine-0.34.0/src/vba.rs000064400000000000000000000365761046102023000134070ustar 00000000000000// SPDX-License-Identifier: MIT // // Copyright 2016-2025, Johann Tuffe. //! Parse vbaProject.bin file //! //! Retranscription from [`OfficeParser`]. //! //! 
[`OfficeParser`]: https://github.com/unixfreak0037/officeparser/blob/master/officeparser.py use std::collections::BTreeMap; use std::io::Read; use std::path::PathBuf; use byteorder::{LittleEndian, ReadBytesExt}; use log::{debug, log_enabled, warn, Level}; use crate::cfb::{Cfb, XlsEncoding}; use crate::utils::read_u16; /// A VBA specific error enum #[derive(Debug)] pub enum VbaError { /// Error comes from a cfb parsing Cfb(crate::cfb::CfbError), /// Io error Io(std::io::Error), /// Cannot find module ModuleNotFound(String), /// Generic unknown u16 value Unknown { /// error type typ: &'static str, /// value found val: u16, }, /// Invalid libid format LibId, /// Invalid record id InvalidRecordId { /// expected record id expected: u16, /// record if found found: u16, }, } from_err!(crate::cfb::CfbError, VbaError, Cfb); from_err!(std::io::Error, VbaError, Io); impl std::fmt::Display for VbaError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { VbaError::Io(e) => write!(f, "I/O error: {e}"), VbaError::Cfb(e) => write!(f, "Cfb error: {e}"), VbaError::ModuleNotFound(e) => write!(f, "Cannot find module '{e}'"), VbaError::Unknown { typ, val } => write!(f, "Unknown {typ} '{val:X}'"), VbaError::LibId => write!(f, "Unexpected libid format"), VbaError::InvalidRecordId { expected, found } => write!( f, "Invalid record id: expecting {expected:X} found {found:X}" ), } } } impl std::error::Error for VbaError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { VbaError::Io(e) => Some(e), VbaError::Cfb(e) => Some(e), _ => None, } } } /// A struct for managing VBA reading #[derive(Clone, Debug, PartialEq, Eq)] pub struct VbaProject { references: Vec, modules: BTreeMap>, encoding: XlsEncoding, } impl VbaProject { /// Create a new `VbaProject` out of the vbaProject.bin `ZipFile` or xls file /// /// Starts reading project metadata (header, directories, sectors and minisectors). 
pub fn new(r: &mut R, len: usize) -> Result { let mut cfb = Cfb::new(r, len)?; VbaProject::from_cfb(r, &mut cfb) } /// Creates a new `VbaProject` out of a Compound File Binary and the corresponding reader pub(crate) fn from_cfb(r: &mut R, cfb: &mut Cfb) -> Result { // dir stream let stream = cfb.get_stream("dir", r)?; let stream = crate::cfb::decompress_stream(&stream)?; let stream = &mut &*stream; // read dir information record (not used) let encoding = read_dir_information(stream)?; // array of REFERENCE records let refs = Reference::from_stream(stream, &encoding)?; // modules let mods: Vec = read_modules(stream, &encoding)?; // read all modules let modules: BTreeMap> = mods .into_iter() .map(|m| { cfb.get_stream(&m.stream_name, r).and_then(|s| { crate::cfb::decompress_stream(&s[m.text_offset..]).map(move |s| (m.name, s)) }) }) .collect::>()?; Ok(VbaProject { references: refs, modules, encoding, }) } /// Gets the list of `Reference`s pub fn get_references(&self) -> &[Reference] { &self.references } /// Gets the list of `Module` names pub fn get_module_names(&self) -> Vec<&str> { self.modules.keys().map(|k| &**k).collect() } /// Reads module content and tries to convert to utf8 /// /// While it works most of the time, the modules are MBCS encoding and the conversion /// may fail. 
If this is the case you should revert to `read_module_raw` as there is /// no built in decoding provided in this crate /// /// # Examples /// ``` /// use calamine::{Reader, open_workbook, Xlsx}; /// /// # let path = format!("{}/tests/vba.xlsm", env!("CARGO_MANIFEST_DIR")); /// let mut xl: Xlsx<_> = open_workbook(path).expect("Cannot find excel file"); /// if let Ok(Some(vba)) = xl.vba_project() { /// let modules = vba.get_module_names().into_iter() /// .map(|s| s.to_string()).collect::>(); /// for m in modules { /// println!("Module {m}:"); /// println!("{}", vba.get_module(&m) /// .unwrap_or_else(|_| panic!("cannot read {m:?} module"))); /// } /// } /// ``` pub fn get_module(&self, name: &str) -> Result { debug!("read module {name}"); let data = self.get_module_raw(name)?; Ok(self.encoding.decode_all(data)) } /// Reads module content (MBCS encoded) and output it as-is (binary output) pub fn get_module_raw(&self, name: &str) -> Result<&[u8], VbaError> { match self.modules.get(name) { Some(m) => Ok(&**m), None => Err(VbaError::ModuleNotFound(name.into())), } } } /// A vba reference #[derive(Debug, Clone, Hash, Eq, PartialEq)] pub struct Reference { /// name pub name: String, /// description pub description: String, /// location of the reference pub path: PathBuf, } impl Reference { /// Check if the reference location is accessible pub fn is_missing(&self) -> bool { !self.path.exists() } /// Gets the list of references from the `dir_stream` relevant part fn from_stream(stream: &mut &[u8], encoding: &XlsEncoding) -> Result, VbaError> { debug!("read all references metadata"); let mut references = Vec::new(); let mut reference = Reference { name: "".to_string(), description: "".to_string(), path: "".into(), }; loop { let check = stream.read_u16::(); match check? 
{ 0x000F => { // termination of references array if !reference.name.is_empty() { references.push(reference); } break; } 0x0016 => { // REFERENCENAME if !reference.name.is_empty() { references.push(reference); } let name = read_variable_record(stream, 1)?; let name = encoding.decode_all(name); reference = Reference { name: name.clone(), description: name, path: "".into(), }; check_variable_record(0x003E, stream)?; // unicode } 0x0033 => { // REFERENCEORIGINAL (followed by REFERENCECONTROL) reference.set_libid(stream, encoding)?; } 0x002F => { // REFERENCECONTROL *stream = &stream[4..]; // SizeTwiddled: len of total ref control reference.set_libid(stream, encoding)?; *stream = &stream[6..]; match stream.read_u16::()? { 0x0016 => { // optional name record extended read_variable_record(stream, 1)?; // name extended check_variable_record(0x003E, stream)?; // name extended unicode check_record(0x0030, stream)?; } 0x0030 => (), e => { return Err(VbaError::Unknown { typ: "token in reference control", val: e, }); } } *stream = &stream[4..]; reference.set_libid(stream, encoding)?; *stream = &stream[26..]; } 0x000D => { // REFERENCEREGISTERED *stream = &stream[4..]; reference.set_libid(stream, encoding)?; *stream = &stream[6..]; } 0x000E => { // REFERENCEPROJECT *stream = &stream[4..]; let absolute = read_variable_record(stream, 1)?; // project libid absolute { let absolute = encoding.decode_all(absolute); reference.path = if let Some(stripped) = absolute.strip_prefix("*\\C") { stripped.into() } else { absolute.into() }; } read_variable_record(stream, 1)?; // project libid relative *stream = &stream[6..]; } c => { return Err(VbaError::Unknown { typ: "check id", val: c, }); } } } debug!("references: {references:#?}"); Ok(references) } fn set_libid(&mut self, stream: &mut &[u8], encoding: &XlsEncoding) -> Result<(), VbaError> { let libid = read_variable_record(stream, 1)?; //libid twiddled if libid.is_empty() || libid.ends_with(b"##") { return Ok(()); } let libid = 
encoding.decode_all(libid); let mut parts = libid.rsplit('#'); match (parts.next(), parts.next()) { (Some(desc), Some(path)) => { self.description = desc.into(); // use original path if already set if !path.is_empty() && self.path.as_os_str().is_empty() { self.path = path.into(); } Ok(()) } _ => Err(VbaError::LibId), } } } /// A vba module #[derive(Debug, Clone, Default)] struct Module { /// module name as it appears in vba project name: String, stream_name: String, text_offset: usize, } fn read_dir_information(stream: &mut &[u8]) -> Result { debug!("read dir header"); // PROJECTSYSKIND *stream = &stream[10..]; // PROJECTCOMPATVERSION (optional) if read_u16(&stream[0..2]) == 0x004A { *stream = &stream[10..]; } // PROJECTLCID and PROJECTLCIDINVOKE Records *stream = &stream[20..]; // PROJECT Codepage let encoding = XlsEncoding::from_codepage(read_u16(&stream[6..8]))?; *stream = &stream[8..]; // PROJECTNAME Record check_variable_record(0x0004, stream)?; // PROJECTDOCSTRING Record check_variable_record(0x0005, stream)?; check_variable_record(0x0040, stream)?; // unicode // PROJECTHELPFILEPATH Record - MS-OVBA 2.3.4.2.1.7 check_variable_record(0x0006, stream)?; check_variable_record(0x003D, stream)?; // PROJECTHELPCONTEXT PROJECTLIBFLAGS and PROJECTVERSION Records *stream = &stream[32..]; // PROJECTCONSTANTS Record check_variable_record(0x000C, stream)?; check_variable_record(0x003C, stream)?; // unicode Ok(encoding) } fn read_modules(stream: &mut &[u8], encoding: &XlsEncoding) -> Result, VbaError> { debug!("read all modules metadata"); *stream = &stream[4..]; let module_len = stream.read_u16::()? 
as usize; // PROJECTCOOKIE record: Id (2 bytes), Size (4 bytes), Cookie (2 bytes) *stream = &stream[8..]; let mut modules = Vec::with_capacity(module_len); for _ in 0..module_len { let name = check_variable_record(0x0019, stream)?; // NameRecord let name = encoding.decode_all(name); check_variable_record(0x0047, stream)?; // NameUnicodeRecord let stream_name = check_variable_record(0x001A, stream)?; let stream_name = encoding.decode_all(stream_name); check_variable_record(0x0032, stream)?; // StreamNameUnicode check_variable_record(0x001C, stream)?; // StreamNameRecord check_variable_record(0x0048, stream)?; // DocStringUnicode check_record(0x0031, stream)?; // OffsetRecord (10 bytes) *stream = &stream[4..]; let offset = stream.read_u32::()? as usize; check_record(0x001E, stream)?; // HelpContextRecord (10 bytes) *stream = &stream[8..]; check_record(0x002C, stream)?; // CookieRecord (8 bytes) *stream = &stream[6..]; // TypeRecord (6 bytes) match stream.read_u16::()? { 0x0021 /* procedural module */ | 0x0022 /* document, class or designer module */ => (), e => return Err(VbaError::Unknown { typ: "module typ", val: e }), } loop { *stream = &stream[4..]; // reserved (4 bytes) match stream.read_u16::() { Ok(0x0025 | 0x0028) => (), // ReadOnlyRecord | PrivateRecord (6 bytes each) Ok(0x002B) => break, // Terminator (2 bytes) Ok(e) => { return Err(VbaError::Unknown { typ: "record id", val: e, }) } Err(e) => return Err(VbaError::Io(e)), } } *stream = &stream[4..]; // reserved (4 bytes) modules.push(Module { name, stream_name, text_offset: offset, }); } Ok(modules) } /// Reads a variable length record /// /// `mult` is a multiplier of the length (e.g 2 when parsing `XLWideString`) fn read_variable_record<'a>(r: &mut &'a [u8], mult: usize) -> Result<&'a [u8], VbaError> { let len = r.read_u32::()? 
as usize * mult; let (read, next) = r.split_at(len); *r = next; Ok(read) } /// Check that next record matches `id` and returns a variable length record fn check_variable_record<'a>(id: u16, r: &mut &'a [u8]) -> Result<&'a [u8], VbaError> { if id == 0x0047 && (r.len() < 2 || read_u16(&r[..2]) != id) { Ok(&[]) // NameUnicodeRecord (0x0047) is optional } else { // id must match; consume and read variable-length payload check_record(id, r)?; let record = read_variable_record(r, 1)?; if log_enabled!(Level::Warn) && record.len() > 100_000 { warn!( "record id {} has a suspicious/huge length of {} (hex: {:x})", id, record.len(), record.len() as u32 ); } Ok(record) } } /// Check that next record matches `id` fn check_record(id: u16, r: &mut &[u8]) -> Result<(), VbaError> { debug!("check record {id:x}"); let record_id = r.read_u16::()?; if record_id == id { Ok(()) } else { Err(VbaError::InvalidRecordId { expected: id, found: record_id, }) } } calamine-0.34.0/src/xls.rs000064400000000000000000001646071046102023000134420ustar 00000000000000// SPDX-License-Identifier: MIT // // Copyright 2016-2025, Johann Tuffe. 
use std::cmp::min; use std::collections::BTreeMap; use std::fmt::{self, Write}; use std::io::{Read, Seek, SeekFrom}; use log::debug; use crate::cfb::{Cfb, XlsEncoding}; use crate::formats::{ builtin_format_by_code, detect_custom_number_format, format_excel_f64, format_excel_i64, CellFormat, }; #[cfg(feature = "picture")] use crate::utils::read_usize; use crate::utils::{push_column, read_f64, read_i16, read_i32, read_u16, read_u32}; use crate::vba::VbaProject; use crate::{ Cell, CellErrorType, Data, Dimensions, HeaderRow, Metadata, Range, Reader, Sheet, SheetType, SheetVisible, }; #[derive(Debug)] /// An enum to handle Xls specific errors pub enum XlsError { /// Io error Io(std::io::Error), /// Cfb error Cfb(crate::cfb::CfbError), /// Vba error Vba(crate::vba::VbaError), /// Cannot parse formula, stack is too short StackLen, /// Unrecognized data Unrecognized { /// data type typ: &'static str, /// value found val: u8, }, /// Workbook is password protected Password, /// Invalid length Len { /// expected length expected: usize, /// found length found: usize, /// length type typ: &'static str, }, /// Continue Record is too short ContinueRecordTooShort, /// End of stream EoStream(&'static str), /// Invalid Formula InvalidFormula { /// stack size stack_size: usize, }, /// Invalid or unknown iftab IfTab(usize), /// Invalid etpg Etpg(u8), /// No vba project NoVba, /// Invalid OfficeArt Record. 
#[cfg(feature = "picture")] #[cfg_attr(docsrs, doc(cfg(feature = "picture")))] Art(&'static str), /// Worksheet not found WorksheetNotFound(String), /// Invalid iFmt value InvalidFormat { /// iFmt value, See 2.4.126 Format ifmt: u16, }, } from_err!(std::io::Error, XlsError, Io); from_err!(crate::cfb::CfbError, XlsError, Cfb); from_err!(crate::vba::VbaError, XlsError, Vba); impl std::fmt::Display for XlsError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { XlsError::Io(e) => write!(f, "I/O error: {e}"), XlsError::Cfb(e) => write!(f, "Cfb error: {e}"), XlsError::Vba(e) => write!(f, "Vba error: {e}"), XlsError::StackLen => write!(f, "Invalid stack length"), XlsError::Unrecognized { typ, val } => write!(f, "Unrecognized {typ}: 0x{val:0X}"), XlsError::Password => write!(f, "Workbook is password protected"), XlsError::Len { expected, found, typ, } => write!( f, "Invalid {typ} length, expected at least {expected}, found {found}", ), XlsError::ContinueRecordTooShort => write!( f, "Continued record too short while reading extended string" ), XlsError::EoStream(s) => write!(f, "End of stream '{s}'"), XlsError::InvalidFormula { stack_size } => { write!(f, "Invalid formula (stack size: {stack_size})") } XlsError::IfTab(iftab) => write!(f, "Invalid iftab {iftab:X}"), XlsError::Etpg(etpg) => write!(f, "Invalid etpg {etpg:X}"), XlsError::NoVba => write!(f, "No VBA project"), #[cfg(feature = "picture")] XlsError::Art(s) => write!(f, "Invalid art record '{s}'"), XlsError::WorksheetNotFound(name) => write!(f, "Worksheet '{name}' not found"), XlsError::InvalidFormat { ifmt } => write!(f, "Invalid ifmt value: '{ifmt}'"), } } } impl std::error::Error for XlsError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { XlsError::Io(e) => Some(e), XlsError::Cfb(e) => Some(e), XlsError::Vba(e) => Some(e), _ => None, } } } /// Options to perform specialized parsing. 
#[derive(Debug, Clone, Default)] #[non_exhaustive] pub struct XlsOptions { /// Force a spreadsheet to be interpreted using a particular code page. /// /// XLS files can contain [code page] identifiers. If this identifier is missing or incorrect, /// strings in the parsed spreadsheet may be decoded incorrectly. Setting this field causes /// `calamine::Xls` to interpret strings using the specified code page, which may allow such /// spreadsheets to be decoded properly. /// /// [code page]: https://docs.microsoft.com/en-us/windows/win32/intl/code-page-identifiers pub force_codepage: Option, /// Row to use as header pub header_row: HeaderRow, } struct SheetData { range: Range, formula: Range, merge_cells: Vec, } /// A struct representing an old xls format file (CFB) pub struct Xls { sheets: BTreeMap, metadata: Metadata, cfb: Cfb, reader: RS, options: XlsOptions, formats: Vec, is_1904: bool, #[cfg(feature = "picture")] pictures: Option)>>, } fn cfb(reader: &mut RS) -> Result { let offset_end = reader.seek(SeekFrom::End(0))? as usize; reader.seek(SeekFrom::Start(0))?; let cfb = Cfb::new(reader, offset_end)?; Ok(cfb) } impl Xls { /// Creates a new instance using `Options` to inform parsing. /// /// ``` /// use calamine::{Xls,XlsOptions}; /// # use std::io::Cursor; /// # const BYTES: &[u8] = b""; /// /// # fn run() -> Result>, calamine::XlsError> { /// # let reader = std::io::Cursor::new(BYTES); /// let mut options = XlsOptions::default(); /// // ...set options... 
/// let workbook = Xls::new_with_options(reader, options)?; /// # Ok(workbook) } /// # fn main() { assert!(run().is_err()); } /// ``` pub fn new_with_options(mut reader: RS, options: XlsOptions) -> Result { let cfb = cfb(&mut reader)?; debug!("cfb loaded"); let mut xls = Xls { sheets: BTreeMap::new(), cfb, reader, metadata: Metadata::default(), options, is_1904: false, formats: Vec::new(), #[cfg(feature = "picture")] pictures: None, }; xls.parse_workbook()?; debug!("xls parsed"); Ok(xls) } /// Gets the worksheet merge cell dimensions pub fn worksheet_merge_cells(&self, name: &str) -> Option> { self.sheets.get(name).map(|r| r.merge_cells.clone()) } /// Get the nth worksheet. Shortcut for getting the nth /// sheet name, then the corresponding worksheet. pub fn worksheet_merge_cells_at(&self, n: usize) -> Option> { let sheet = self.metadata().sheets.get(n)?; self.worksheet_merge_cells(&sheet.name) } } impl Reader for Xls { type Error = XlsError; fn new(reader: RS) -> Result { Self::new_with_options(reader, XlsOptions::default()) } fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self { self.options.header_row = header_row; self } fn vba_project(&mut self) -> Result, XlsError> { // Reads vba once for all (better than reading all worksheets once for all) if !self.cfb.has_directory("_VBA_PROJECT_CUR") { return Ok(None); } let vba = VbaProject::from_cfb(&mut self.reader, &mut self.cfb)?; Ok(Some(vba)) } /// Parses Workbook stream, no need for the relationships variable fn metadata(&self) -> &Metadata { &self.metadata } fn worksheet_range(&mut self, name: &str) -> Result, XlsError> { let sheet = self .sheets .get(name) .map(|r| r.range.clone()) .ok_or_else(|| XlsError::WorksheetNotFound(name.into()))?; match self.options.header_row { HeaderRow::FirstNonEmptyRow => Ok(sheet), HeaderRow::Row(header_row_idx) => { // If `header_row` is a row index, adjust the range if let (Some(start), Some(end)) = (sheet.start(), sheet.end()) { Ok(sheet.range((header_row_idx, 
start.1), end)) } else { Ok(sheet) } } } } fn worksheets(&mut self) -> Vec<(String, Range)> { self.sheets .iter() .map(|(name, sheet)| (name.to_owned(), sheet.range.clone())) .collect() } fn worksheet_formula(&mut self, name: &str) -> Result, XlsError> { self.sheets .get(name) .ok_or_else(|| XlsError::WorksheetNotFound(name.into())) .map(|r| r.formula.clone()) } #[cfg(feature = "picture")] fn pictures(&self) -> Option)>> { self.pictures.to_owned() } } #[derive(Debug, Clone, Copy)] struct Xti { _isup_book: u16, itab_first: i16, _itab_last: i16, } impl Xls { fn parse_workbook(&mut self) -> Result<(), XlsError> { use super::cfb::CfbError::StreamNotFound; // gets workbook and worksheets stream, or early exit let stream = match self .cfb .get_stream("Workbook", &mut self.reader) .or_else(|_| self.cfb.get_stream("Book", &mut self.reader)) .or_else(|_| self.cfb.get_stream("WORKBOOK", &mut self.reader)) .or_else(|_| self.cfb.get_stream("BOOK", &mut self.reader)) { Ok(s) => s, Err(StreamNotFound(_)) => return Err(StreamNotFound("Workbook".to_string()).into()), Err(e) => return Err(e.into()), }; let mut sheet_names = Vec::new(); let mut strings = Vec::new(); let mut defined_names = Vec::new(); let mut xtis = Vec::new(); let mut formats = BTreeMap::new(); let mut xfs = Vec::new(); let mut biff = Biff::Biff8; // Binary Interchange File Format (BIFF) version let codepage = self.options.force_codepage.unwrap_or(1200); let mut encoding = XlsEncoding::from_codepage(codepage)?; #[cfg(feature = "picture")] let mut draw_group: Vec = Vec::new(); { let wb = &stream; let records = RecordIter { stream: wb }; for record in records { // Record docs/specs - // https://learn.microsoft.com/en-us/openspecs/office_file_formats/ms-xls/6fba0383-0d7a-4c7a-afe9-642ff70cbd36 let mut r = record?; match r.typ { // FilePass (MS-XLS 2.4.117) 0x002F if read_u16(r.data) != 0 => return Err(XlsError::Password), // CodePage (MS-XLS 2.4.52) 0x0042 => { if self.options.force_codepage.is_none() { encoding = 
XlsEncoding::from_codepage(read_u16(r.data))?; } } // RRTabId (MS-XLS 2.4.241) 0x013D => { let sheet_len = r.data.len() / 2; sheet_names.reserve(sheet_len); self.metadata.sheets.reserve(sheet_len); } // DateMode (MS-XLS 2.4.77) 0x0022 => { if read_u16(r.data) == 1 { self.is_1904 = true; } } // Format (MS-XLS 2.4.126) 0x041E => match parse_format(&mut r, &encoding, biff) { Ok((idx, format)) => { formats.insert(idx, format); } Err(e) => log::warn!("{e}"), }, // XF (MS-XLS 2.4.353) 0x00E0 => { xfs.push(parse_xf(&r)?); } // BoundSheet8 (MS-XLS 2.4.28) 0x0085 => { let (pos, sheet) = parse_sheet_metadata(&mut r, &encoding, biff)?; self.metadata.sheets.push(sheet.clone()); sheet_names.push((pos, sheet.name)); } // BOF (MS-XLS 2.4.21) 0x0809 => { let bof = parse_bof(&mut r)?; biff = bof.biff; } // Lbl (MS-XLS 2.4.150) 0x0018 => defined_names.push(parse_lbl(&r, &encoding, biff)?), // ExternSheet (MS-XLS 2.4.106) 0x0017 => xtis.extend(parse_extern_sheet(&r, biff)), // SST (MS-XLS 2.4.265) 0x00FC => strings = parse_sst(&mut r, &encoding)?, #[cfg(feature = "picture")] // MsoDrawingGroup (MS-XLS 2.4.171) 0x00EB => { draw_group.extend(r.data); draw_group.extend(r.cont.iter().flat_map(|v| *v)); } // EOF (MS-XLS 2.4.103) 0x000A => break, _ => (), } } } // Before BIFF8, formula tokens embed sheet indices directly (itabFirst), // so create identity-mapped XTI entries to reuse the BIFF8 lookup code. 
if matches!(biff, Biff::Biff2 | Biff::Biff3 | Biff::Biff4 | Biff::Biff5) && xtis.is_empty() { xtis = (0..sheet_names.len()) .map(|i| Xti { _isup_book: 0, itab_first: i as i16, _itab_last: i as i16, }) .collect(); } self.formats = xfs .into_iter() .map(|fmt| match formats.get(&fmt) { Some(s) => *s, _ => builtin_format_by_code(fmt), }) .collect(); debug!("formats: {:?}", self.formats); let defined_names = defined_names .into_iter() .map(|(name, (i, mut f))| { if let Some(i) = i { let sh = xtis .get(i) .and_then(|xti| sheet_names.get(xti.itab_first as usize)) .map_or("#REF", |sh| &sh.1); f = format!("{sh}!{f}"); } (name, f) }) .collect::>(); debug!("defined_names: {defined_names:?}"); let mut sheets = BTreeMap::new(); let fmla_sheet_names = sheet_names .iter() .map(|(_, n)| n.clone()) .collect::>(); for (pos, name) in sheet_names { let sh = &stream[pos..]; let records = RecordIter { stream: sh }; let mut cells = Vec::new(); let mut formulas = Vec::new(); let mut fmla_pos = (0, 0); let mut merge_cells = Vec::new(); for record in records { let r = record?; match r.typ { // 512: Dimensions 0x0200 => { let Dimensions { start, end } = parse_dimensions(r.data)?; let rows = (end.0 - start.0 + 1) as usize; let cols = (end.1 - start.1 + 1) as usize; cells.reserve(rows.saturating_mul(cols)); } //0x0201 => cells.push(parse_blank(r.data)?), // 513: Blank 0x0203 => cells.push(parse_number(r.data, &self.formats, self.is_1904)?), // 515: Number 0x0204 => cells.push(parse_label(r.data, &encoding, biff)?), // 516: Label [MS-XLS 2.4.148] 0x0205 => cells.push(parse_bool_err(r.data)?), // 517: BoolErr 0x0207 => { // 519 String (formula value) let val = Data::String(parse_string(r.data, &encoding, biff)?); cells.push(Cell::new(fmla_pos, val)); } 0x027E => cells.push(parse_rk(r.data, &self.formats, self.is_1904)?), // 638: Rk 0x00FD => cells.extend(parse_label_sst(r.data, &strings)?), // LabelSst 0x00BD => parse_mul_rk(r.data, &mut cells, &self.formats, self.is_1904)?, // 189: MulRk 0x00E5 
=> parse_merge_cells(r.data, &mut merge_cells)?, // 229: Merge Cells 0x000A => break, // 10: EOF, 0x0006 => { // 6: Formula if r.data.len() < 20 { return Err(XlsError::Len { expected: 20, found: r.data.len(), typ: "Formula", }); } let row = read_u16(r.data); let col = read_u16(&r.data[2..]); fmla_pos = (row as u32, col as u32); if let Some(val) = parse_formula_value(&r.data[6..14])? { // If the value is a string // it will appear in 0x0207 record coming next cells.push(Cell::new(fmla_pos, val)); } let fmla = parse_formula( &r.data[20..], &fmla_sheet_names, &defined_names, &xtis, &encoding, ) .unwrap_or_else(|e| { debug!("{e}"); format!( "Unrecognised formula \ for cell ({row}, {col}): {e:?}" ) }); formulas.push(Cell::new(fmla_pos, fmla)); } _ => (), } } let range = Range::from_sparse(cells); let formula = Range::from_sparse(formulas); sheets.insert( name, SheetData { range, formula, merge_cells, }, ); } self.sheets = sheets; self.metadata.names = defined_names; #[cfg(feature = "picture")] if !draw_group.is_empty() { let pics = parse_pictures(&draw_group)?; if !pics.is_empty() { self.pictures = Some(pics); } } Ok(()) } } /// struct Bof { /// Binary Interchange File Format biff: Biff, } /// #[derive(Clone, Copy, Debug, PartialEq, Eq)] enum Biff { Biff2, Biff3, Biff4, Biff5, Biff8, // Used by MS-XLSB Workbook(2.1.7.61) or Worksheet(2.1.7.61) which are not supported yet. 
// Biff12, } /// BOF [MS-XLS] 2.4.21 fn parse_bof(r: &mut Record<'_>) -> Result { let mut dt = 0; let biff_version = read_u16(&r.data[..2]); if r.data.len() >= 4 { dt = read_u16(&r.data[2..]); } let biff = match biff_version { 0x0200 | 0x0002 | 0x0007 => Biff::Biff2, 0x0300 => Biff::Biff3, 0x0400 => Biff::Biff4, 0x0500 => Biff::Biff5, 0x0600 => Biff::Biff8, 0 => { if dt == 0x1000 { Biff::Biff5 } else { Biff::Biff8 } } _ => Biff::Biff8, }; Ok(Bof { biff }) } /// `BoundSheet8` [MS-XLS 2.4.28] fn parse_sheet_metadata( r: &mut Record<'_>, encoding: &XlsEncoding, biff: Biff, ) -> Result<(usize, Sheet), XlsError> { let pos = read_u32(r.data) as usize; let visible = match r.data[4] & 0b0011_1111 { 0x00 => SheetVisible::Visible, 0x01 => SheetVisible::Hidden, 0x02 => SheetVisible::VeryHidden, e => { return Err(XlsError::Unrecognized { typ: "BoundSheet8:hsState", val: e, }); } }; let typ = match r.data[5] { 0x00 => SheetType::WorkSheet, 0x01 => SheetType::MacroSheet, 0x02 => SheetType::ChartSheet, 0x06 => SheetType::Vba, e => { return Err(XlsError::Unrecognized { typ: "BoundSheet8:dt", val: e, }); } }; r.data = &r.data[6..]; let mut name = parse_short_string(r, encoding, biff)?; name.retain(|c| c != '\0'); Ok((pos, Sheet { name, typ, visible })) } fn parse_lbl( r: &Record<'_>, encoding: &XlsEncoding, biff: Biff, ) -> Result<(String, (Option, String)), XlsError> { let cch = r.data[3] as usize; let cce = read_u16(&r.data[4..]) as usize; let mut name = String::with_capacity(cch); match biff { Biff::Biff2 | Biff::Biff3 | Biff::Biff4 | Biff::Biff5 => { // BIFF5 and earlier: plain byte string, no flags byte encoding.decode_to(&r.data[14..], cch, &mut name, None); } Biff::Biff8 => read_unicode_string_no_cch(encoding, &r.data[14..], &cch, &mut name), } let rgce = &r.data[r.data.len() - cce..]; let formula = parse_defined_names(rgce, biff)?; Ok((name, formula)) } fn parse_extern_sheet(r: &Record<'_>, biff: Biff) -> Vec { match biff { Biff::Biff8 => { // Single record with cXTI count 
+ array of 6-byte XTI structs let cxti = read_u16(r.data) as usize; r.data[2..] .chunks_exact(6) .take(cxti) .map(|xti| Xti { _isup_book: read_u16(&xti[..2]), itab_first: read_i16(&xti[2..4]), _itab_last: read_i16(&xti[4..]), }) .collect() } // BIFF5 and earlier: individual sheet name references; formula // tokens embed sheet indices directly, so no XTI table needed. Biff::Biff2 | Biff::Biff3 | Biff::Biff4 | Biff::Biff5 => Vec::new(), } } fn parse_number(r: &[u8], formats: &[CellFormat], is_1904: bool) -> Result, XlsError> { if r.len() < 14 { return Err(XlsError::Len { typ: "number", expected: 14, found: r.len(), }); } let row = read_u16(r) as u32; let col = read_u16(&r[2..]) as u32; let v = read_f64(&r[6..]); let format = formats.get(read_u16(&r[4..]) as usize); Ok(Cell::new((row, col), format_excel_f64(v, format, is_1904))) } fn parse_bool_err(r: &[u8]) -> Result, XlsError> { if r.len() < 8 { return Err(XlsError::Len { typ: "BoolErr", expected: 8, found: r.len(), }); } let row = read_u16(r); let col = read_u16(&r[2..]); let pos = (row as u32, col as u32); match r[7] { 0x00 => Ok(Cell::new(pos, Data::Bool(r[6] != 0))), 0x01 => Ok(Cell::new(pos, parse_err(r[6])?)), e => Err(XlsError::Unrecognized { typ: "fError", val: e, }), } } fn parse_err(e: u8) -> Result { match e { 0x00 => Ok(Data::Error(CellErrorType::Null)), 0x07 => Ok(Data::Error(CellErrorType::Div0)), 0x0F => Ok(Data::Error(CellErrorType::Value)), 0x17 => Ok(Data::Error(CellErrorType::Ref)), 0x1D => Ok(Data::Error(CellErrorType::Name)), 0x24 => Ok(Data::Error(CellErrorType::Num)), 0x2A => Ok(Data::Error(CellErrorType::NA)), 0x2B => Ok(Data::Error(CellErrorType::GettingData)), e => Err(XlsError::Unrecognized { typ: "error", val: e, }), } } fn parse_rk(r: &[u8], formats: &[CellFormat], is_1904: bool) -> Result, XlsError> { if r.len() < 10 { return Err(XlsError::Len { typ: "rk", expected: 10, found: r.len(), }); } let row = read_u16(r); let col = read_u16(&r[2..]); Ok(Cell::new( (row as u32, col as u32), 
rk_num(&r[4..10], formats, is_1904), )) } fn parse_merge_cells(r: &[u8], merge_cells: &mut Vec) -> Result<(), XlsError> { let count = read_u16(r); for i in 0..count { let offset: usize = (2 + i * 8).into(); let rf = read_u16(&r[offset..]); let rl = read_u16(&r[offset + 2..]); let cf = read_u16(&r[offset + 4..]); let cl = read_u16(&r[offset + 6..]); merge_cells.push(Dimensions { start: (rf.into(), cf.into()), end: (rl.into(), cl.into()), }); } Ok(()) } fn parse_mul_rk( r: &[u8], cells: &mut Vec>, formats: &[CellFormat], is_1904: bool, ) -> Result<(), XlsError> { if r.len() < 6 { return Err(XlsError::Len { typ: "rk", expected: 6, found: r.len(), }); } let row = read_u16(r); let col_first = read_u16(&r[2..]); let col_last = read_u16(&r[r.len() - 2..]); if r.len() != 6 + 6 * (col_last - col_first + 1) as usize { return Err(XlsError::Len { typ: "rk", expected: 6 + 6 * (col_last - col_first + 1) as usize, found: r.len(), }); } let mut col = col_first as u32; for rk in r[4..r.len() - 2].chunks(6) { cells.push(Cell::new((row as u32, col), rk_num(rk, formats, is_1904))); col += 1; } Ok(()) } fn rk_num(rk: &[u8], formats: &[CellFormat], is_1904: bool) -> Data { let d100 = (rk[2] & 1) != 0; let is_int = (rk[2] & 2) != 0; let format = formats.get(read_u16(rk) as usize); let mut v = [0u8; 8]; v[4..].copy_from_slice(&rk[2..]); v[4] &= 0xFC; if is_int { let v = (read_i32(&v[4..8]) >> 2) as i64; if d100 && v % 100 != 0 { format_excel_f64(v as f64 / 100.0, format, is_1904) } else { format_excel_i64(if d100 { v / 100 } else { v }, format, is_1904) } } else { let v = read_f64(&v); format_excel_f64(if d100 { v / 100.0 } else { v }, format, is_1904) } } /// `ShortXLUnicodeString` [MS-XLS 2.5.240] fn parse_short_string( r: &mut Record<'_>, encoding: &XlsEncoding, biff: Biff, ) -> Result { let biff8 = matches!(biff, Biff::Biff8); if r.data.is_empty() || r.data.len() < 2 && biff8 { return Err(XlsError::Len { typ: "short string", expected: if biff8 { 2 } else { 1 }, found: r.data.len(), 
}); } let cch = r.data[0] as usize; r.data = &r.data[1..]; let mut high_byte = None; if biff8 { high_byte = Some(r.data[0] & 0x1 != 0); r.data = &r.data[1..]; } let mut s = String::with_capacity(cch); encoding.decode_to(r.data, cch, &mut s, high_byte); Ok(s) } /// `XLUnicodeString` [MS-XLS 2.5.294] fn parse_string(r: &[u8], encoding: &XlsEncoding, biff: Biff) -> Result { let (mut high_byte, expected) = match biff { Biff::Biff2 | Biff::Biff3 | Biff::Biff4 | Biff::Biff5 => (None, 2), Biff::Biff8 => (Some(false), 3), }; if r.len() < expected { if 2 == r.len() && read_u16(r) == 0 { // tests/OOM_alloc2.xls return Ok(String::new()); } return Err(XlsError::Len { typ: "string", expected, found: r.len(), }); } // delay populating Some(_) variant until length checks guarantee r[2] can't crash high_byte = high_byte.map(|_| r[2] & 0x1 != 0); let cch = read_u16(r) as usize; let mut s = String::with_capacity(cch); encoding.decode_to(&r[expected..], cch, &mut s, high_byte); Ok(s) } fn parse_label(r: &[u8], encoding: &XlsEncoding, biff: Biff) -> Result, XlsError> { if r.len() < 6 { return Err(XlsError::Len { typ: "label", expected: 6, found: r.len(), }); } let row = read_u16(r); let col = read_u16(&r[2..]); let _ixfe = read_u16(&r[4..]); Ok(Cell::new( (row as u32, col as u32), Data::String(parse_string(&r[6..], encoding, biff)?), )) } fn parse_label_sst(r: &[u8], strings: &[String]) -> Result>, XlsError> { if r.len() < 10 { return Err(XlsError::Len { typ: "label sst", expected: 10, found: r.len(), }); } let row = read_u16(r); let col = read_u16(&r[2..]); let i = read_u32(&r[6..]) as usize; if let Some(s) = strings.get(i) { if !s.is_empty() { return Ok(Some(Cell::new( (row as u32, col as u32), Data::String(s.clone()), ))); } } Ok(None) } fn parse_dimensions(r: &[u8]) -> Result { let (rf, rl, mut cf, cl) = match r.len() { 10 => ( read_u16(&r[0..2]) as u32, read_u16(&r[2..4]) as u32, read_u16(&r[4..6]) as u32, read_u16(&r[6..8]) as u32, ), 14 => ( read_u32(&r[0..4]), 
read_u32(&r[4..8]), read_u16(&r[8..10]) as u32, read_u16(&r[10..12]) as u32, ), _ => { return Err(XlsError::Len { typ: "dimensions", expected: 14, found: r.len(), }); } }; // 2.5.53 ColU must be <= 0xFF, if larger, reasonable to assume // starts at 0 // tests/OOM_alloc2.xls if 0xFF < cf || cl < cf { cf = 0; } if 1 <= rl && 1 <= cl { Ok(Dimensions { start: (rf, cf), end: (rl - 1, cl - 1), }) } else { Ok(Dimensions { start: (rf, cf), end: (rf, cf), }) } } // Parse the Excel xls Shared String Table (SST). See [MS-XLS] 2.4.265. // // https://learn.microsoft.com/en-us/openspecs/office_file_formats/ms-xls/b6231b92-d32e-4626-badd-c3310a672bab fn parse_sst(r: &mut Record<'_>, encoding: &XlsEncoding) -> Result, XlsError> { if r.data.len() < 8 { return Err(XlsError::Len { typ: "sst", expected: 8, found: r.data.len(), }); } let mut sst = vec![]; // Skip cstTotal and cstUnique headers in SST record. r.data = &r.data[8..]; while !r.data.is_empty() || r.continue_record() { sst.push(read_rich_extended_string(r, encoding)?); } Ok(sst) } /// Decode XF (extract only ifmt - Format identifier) /// /// See: fn parse_xf(r: &Record<'_>) -> Result { if r.data.len() < 4 { return Err(XlsError::Len { typ: "xf", expected: 4, found: r.data.len(), }); } Ok(read_u16(&r.data[2..])) } /// Decode Format [MS-XLS 2.4.126] /// /// See: fn parse_format( r: &mut Record<'_>, encoding: &XlsEncoding, biff: Biff, ) -> Result<(u16, CellFormat), XlsError> { if r.data.len() < 2 { return Err(XlsError::Len { typ: "format", expected: 2, found: r.data.len(), }); } let ifmt = read_u16(r.data); match ifmt { 5..=8 | 23..=26 | 41..=44 | 63..=66 | 164..=382 => (), _ => return Err(XlsError::InvalidFormat { ifmt }), } let s = parse_string(&r.data[2..], encoding, biff)?; Ok((ifmt, detect_custom_number_format(&s))) } /// Decode `XLUnicodeRichExtendedString` [MS-XLS 2.5.293]. 
/// /// See: fn read_rich_extended_string( r: &mut Record<'_>, encoding: &XlsEncoding, ) -> Result { if r.data.is_empty() { // spec violation: at very least cch and flags should be present return Ok(String::new()); } if r.data.len() < 3 { return Err(XlsError::Len { typ: "rich extended string", expected: 3, found: r.data.len(), }); } let cch = read_u16(r.data) as usize; let flags = r.data[2]; r.data = &r.data[3..]; let high_byte = flags & 0x1 != 0; // how many FormatRun in rgRun data block let mut c_run = 0; // how many bytes in ExtRst data block let mut cb_ext_rst = 0; // if flag fRichSt exists, read cRun and forward. if flags & 0x8 != 0 { c_run = read_u16(r.data) as usize; r.data = &r.data[2..]; } // if flag fExtSt exists, read cbExtRst and forward. if flags & 0x4 != 0 { cb_ext_rst = read_i32(r.data) as usize; r.data = &r.data[4..]; } // read rgb data block for the string we want let s = read_dbcs(encoding, cch, r, high_byte)?; // skip rgRun data block. Note: each FormatRun contain 4 bytes. r.skip(c_run * 4)?; // skip ExtRst data block. 
r.skip(cb_ext_rst)?; Ok(s) } fn read_dbcs( encoding: &XlsEncoding, mut len: usize, r: &mut Record<'_>, mut high_byte: bool, ) -> Result { let mut s = String::with_capacity(len); while len > 0 { let (l, at) = encoding.decode_to(r.data, len, &mut s, Some(high_byte)); r.data = &r.data[at..]; len -= l; if len > 0 { if r.continue_record() { high_byte = r.data[0] & 0x1 != 0; r.data = &r.data[1..]; } else { return Err(XlsError::EoStream("dbcs")); } } } Ok(s) } fn read_unicode_string_no_cch(encoding: &XlsEncoding, buf: &[u8], len: &usize, s: &mut String) { encoding.decode_to(&buf[1..=*len], *len, s, Some(buf[0] & 0x1 != 0)); } struct Record<'a> { typ: u16, data: &'a [u8], cont: Vec<&'a [u8]>, } impl<'a> Record<'a> { fn continue_record(&mut self) -> bool { if self.cont.is_empty() { false } else { self.data = self.cont.remove(0); true } } fn skip(&mut self, mut len: usize) -> Result<(), XlsError> { while len > 0 { if self.data.is_empty() && !self.continue_record() { return Err(XlsError::ContinueRecordTooShort); } let l = min(len, self.data.len()); let (_, next) = self.data.split_at(l); self.data = next; len -= l; } Ok(()) } } // Simple Debug impl to dump record data in hex format. 
impl fmt::Debug for Record<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { writeln!( f, "\nRecord = 0x{:04X}, Length = 0x{:04X}, {}", self.typ, self.data.len(), self.data.len() )?; let mut iter = self.data.chunks(16); for chunk in iter.by_ref() { for byte in chunk { write!(f, "{byte:02X} ")?; } writeln!(f)?; } Ok(()) } } struct RecordIter<'a> { stream: &'a [u8], } impl<'a> Iterator for RecordIter<'a> { type Item = Result, XlsError>; fn next(&mut self) -> Option { if self.stream.len() < 4 { return if self.stream.is_empty() { None } else { Some(Err(XlsError::EoStream("record type and length"))) }; } let t = read_u16(self.stream); let mut len = read_u16(&self.stream[2..]) as usize; if self.stream.len() < len + 4 { return Some(Err(XlsError::EoStream("record length"))); } let (data, next) = self.stream.split_at(len + 4); self.stream = next; let d = &data[4..]; // Append next record data if it is a Continue record let mut cont = Vec::new(); if next.len() > 4 && read_u16(next) == 0x003C { while self.stream.len() > 4 && read_u16(self.stream) == 0x003C { len = read_u16(&self.stream[2..]) as usize; if self.stream.len() < len + 4 { return Some(Err(XlsError::EoStream("continue record length"))); } let sp = self.stream.split_at(len + 4); cont.push(&sp.0[4..]); self.stream = sp.1; } } Some(Ok(Record { typ: t, data: d, cont, })) } } /// Write an absolute cell reference (eg: `$B$3`) into `f`. fn write_absolute_cell_ref(f: &mut String, col: u32, row: u32) { f.push('$'); push_column(col, f); write!(f, "${}", row + 1).unwrap(); } /// Formula parsing /// /// Does not implement ALL possibilities, only Area are parsed fn parse_defined_names(rgce: &[u8], biff: Biff) -> Result<(Option, String), XlsError> { if rgce.is_empty() { // TODO: do something better here ... 
return Ok((None, "empty rgce".to_string())); } let ptg = rgce[0]; let res = match ptg { 0x3a | 0x5a | 0x7a => { // PtgRef3d let (sheet_idx, row, col) = match biff { Biff::Biff2 | Biff::Biff3 | Biff::Biff4 | Biff::Biff5 => { // ixals(2) + reserved(8) + itabFirst(2) + itabLast(2) + row(2) + col(1) ( read_u16(&rgce[11..13]) as usize, read_u16(&rgce[15..17]) as u32 & 0x3FFF, rgce[17] as u32, ) } Biff::Biff8 => { // ixti(2) + row(2) + col(2) ( read_u16(&rgce[1..3]) as usize, read_u16(&rgce[3..5]) as u32, read_u16(&rgce[5..7]) as u32, ) } }; // TODO: check with relative columns let mut f = String::new(); write_absolute_cell_ref(&mut f, col, row); (Some(sheet_idx), f) } 0x3b | 0x5b | 0x7b => { // PtgArea3d let (sheet_idx, row_first, row_last, col_first, col_last) = match biff { Biff::Biff2 | Biff::Biff3 | Biff::Biff4 | Biff::Biff5 => { // ixals(2) + reserved(8) + itabFirst(2) + itabLast(2) + rowFirst(2) + rowLast(2) + colFirst(1) + colLast(1) ( read_u16(&rgce[11..13]) as usize, read_u16(&rgce[15..17]) as u32 & 0x3FFF, read_u16(&rgce[17..19]) as u32 & 0x3FFF, rgce[19] as u32, rgce[20] as u32, ) } Biff::Biff8 => { // ixti(2) + rowFirst(2) + rowLast(2) + colFirst(2) + colLast(2) ( read_u16(&rgce[1..3]) as usize, read_u16(&rgce[3..5]) as u32, read_u16(&rgce[5..7]) as u32, read_u16(&rgce[7..9]) as u32, read_u16(&rgce[9..11]) as u32, ) } }; // TODO: check with relative columns let mut f = String::new(); write_absolute_cell_ref(&mut f, col_first, row_first); f.push(':'); write_absolute_cell_ref(&mut f, col_last, row_last); (Some(sheet_idx), f) } 0x3c | 0x5c | 0x7c | 0x3d | 0x5d | 0x7d => { // PtgAreaErr3d or PtgRefErr3d let sheet_idx = match biff { Biff::Biff8 => read_u16(&rgce[1..3]) as usize, Biff::Biff2 | Biff::Biff3 | Biff::Biff4 | Biff::Biff5 => { read_u16(&rgce[11..13]) as usize } }; (Some(sheet_idx), "#REF!".to_string()) } _ => (None, format!("Unsupported ptg: {ptg:x}")), }; Ok(res) } /// Formula parsing /// /// `CellParsedFormula` [MS-XLS 2.5.198.3] fn parse_formula( mut 
rgce: &[u8], sheets: &[String], names: &[(String, String)], xtis: &[Xti], encoding: &XlsEncoding, ) -> Result { let mut stack = Vec::new(); let mut formula = String::with_capacity(rgce.len()); let cce = read_u16(rgce) as usize; rgce = &rgce[2..2 + cce]; while !rgce.is_empty() { let ptg = rgce[0]; rgce = &rgce[1..]; match ptg { 0x3a | 0x5a | 0x7a => { // PtgRef3d let ixti = read_u16(&rgce[0..2]); let rowu = read_u16(&rgce[2..]); let colu = read_u16(&rgce[4..]); let sh = xtis .get(ixti as usize) .and_then(|xti| sheets.get(xti.itab_first as usize)) .map_or("#REF", |sh| sh); stack.push(formula.len()); formula.push_str(sh); formula.push('!'); let col = colu << 2; // first 14 bits only if colu & 2 != 0 { formula.push('$'); } push_column(col as u32, &mut formula); if colu & 1 != 0 { formula.push('$'); } write!(&mut formula, "{}", rowu + 1).unwrap(); rgce = &rgce[6..]; } 0x3b | 0x5b | 0x7b => { // PtgArea3d let ixti = read_u16(&rgce[0..2]); stack.push(formula.len()); formula.push_str(sheets.get(ixti as usize).map_or("#REF", |s| &**s)); formula.push('!'); // TODO: check with relative columns formula.push('$'); push_column(read_u16(&rgce[6..8]) as u32, &mut formula); write!(&mut formula, "${}:$", read_u16(&rgce[2..4]) as u32 + 1).unwrap(); push_column(read_u16(&rgce[8..10]) as u32, &mut formula); write!(&mut formula, "${}", read_u16(&rgce[4..6]) as u32 + 1).unwrap(); rgce = &rgce[10..]; } 0x3c | 0x5c | 0x7c => { // PtfRefErr3d let ixti = read_u16(&rgce[0..2]); stack.push(formula.len()); formula.push_str(sheets.get(ixti as usize).map_or("#REF", |s| &**s)); formula.push('!'); formula.push_str("#REF!"); rgce = &rgce[6..]; } 0x3d | 0x5d | 0x7d => { // PtgAreaErr3d let ixti = read_u16(&rgce[0..2]); stack.push(formula.len()); formula.push_str(sheets.get(ixti as usize).map_or("#REF", |s| &**s)); formula.push('!'); formula.push_str("#REF!"); rgce = &rgce[10..]; } 0x01 => { // PtgExp: array/shared formula, ignore debug!("ignoring PtgExp array/shared formula"); 
stack.push(formula.len()); rgce = &rgce[4..]; } 0x03..=0x11 => { // binary operation let e2 = stack.pop().ok_or(XlsError::StackLen)?; // imaginary 'e1' will actually already be the start of the binary op let op = match ptg { 0x03 => "+", 0x04 => "-", 0x05 => "*", 0x06 => "/", 0x07 => "^", 0x08 => "&", 0x09 => "<", 0x0A => "<=", 0x0B => "=", 0x0C => ">", 0x0D => ">=", 0x0E => "<>", 0x0F => " ", 0x10 => ",", 0x11 => ":", _ => unreachable!(), }; let e2 = formula.split_off(e2); write!(&mut formula, "{op}{e2}").unwrap(); } 0x12 => { let e = stack.last().ok_or(XlsError::StackLen)?; formula.insert(*e, '+'); } 0x13 => { let e = stack.last().ok_or(XlsError::StackLen)?; formula.insert(*e, '-'); } 0x14 => { formula.push('%'); } 0x15 => { let e = stack.last().ok_or(XlsError::StackLen)?; formula.insert(*e, '('); formula.push(')'); } 0x16 => { stack.push(formula.len()); } 0x17 => { stack.push(formula.len()); formula.push('\"'); let cch = rgce[0] as usize; read_unicode_string_no_cch(encoding, &rgce[1..], &cch, &mut formula); formula.push('\"'); rgce = &rgce[2 + cch..]; } 0x18 => { rgce = &rgce[5..]; } 0x19 => { let etpg = rgce[0]; rgce = &rgce[1..]; match etpg { 0x01 | 0x02 | 0x08 | 0x20 | 0x21 => rgce = &rgce[2..], 0x04 => { // PtgAttrChoose let n = read_u16(&rgce[..2]) as usize + 1; rgce = &rgce[2 + 2 * n..]; // ignore } 0x10 => { rgce = &rgce[2..]; let e = *stack.last().ok_or(XlsError::StackLen)?; let e = formula.split_off(e); write!(&mut formula, "SUM({e})").unwrap(); } 0x40 | 0x41 => { // PtfAttrSpace let e = *stack.last().ok_or(XlsError::StackLen)?; let space = match rgce[0] { 0x00 | 0x02 | 0x04 | 0x06 => ' ', 0x01 | 0x03 | 0x05 => '\r', val => { return Err(XlsError::Unrecognized { typ: "PtgAttrSpaceType", val, }); } }; let cch = rgce[1]; for _ in 0..cch { formula.insert(e, space); } rgce = &rgce[2..]; } e => return Err(XlsError::Etpg(e)), } } 0x1C => { stack.push(formula.len()); let err = rgce[0]; rgce = &rgce[1..]; match err { 0x00 => formula.push_str("#NULL!"), 0x07 => 
formula.push_str("#DIV/0!"), 0x0F => formula.push_str("#VALUE!"), 0x17 => formula.push_str("#REF!"), 0x1D => formula.push_str("#NAME?"), 0x24 => formula.push_str("#NUM!"), 0x2A => formula.push_str("#N/A"), 0x2B => formula.push_str("#GETTING_DATA"), e => { return Err(XlsError::Unrecognized { typ: "BErr", val: e, }); } } } 0x1D => { stack.push(formula.len()); formula.push_str(if rgce[0] == 0 { "FALSE" } else { "TRUE" }); rgce = &rgce[1..]; } 0x1E => { stack.push(formula.len()); write!(&mut formula, "{}", read_u16(rgce)).unwrap(); rgce = &rgce[2..]; } 0x1F => { stack.push(formula.len()); write!(&mut formula, "{}", read_f64(rgce)).unwrap(); rgce = &rgce[8..]; } 0x20 | 0x40 | 0x60 => { // PtgArray: ignore stack.push(formula.len()); formula.push_str("{PtgArray}"); rgce = &rgce[7..]; } 0x21 | 0x22 | 0x41 | 0x42 | 0x61 | 0x62 => { let (iftab, argc) = match ptg { 0x22 | 0x42 | 0x62 => { let iftab = read_u16(&rgce[1..]) as usize; let argc = rgce[0] as usize; rgce = &rgce[3..]; (iftab, argc) } _ => { let iftab = read_u16(rgce) as usize; if iftab > crate::utils::FTAB_LEN { return Err(XlsError::IfTab(iftab)); } rgce = &rgce[2..]; let argc = crate::utils::FTAB_ARGC[iftab] as usize; (iftab, argc) } }; if stack.len() < argc { return Err(XlsError::StackLen); } if argc > 0 { let args_start = stack.len() - argc; let mut args = stack.split_off(args_start); let start = args[0]; for s in &mut args { *s -= start; } let fargs = formula.split_off(start); stack.push(formula.len()); args.push(fargs.len()); formula.push_str( crate::utils::FTAB .get(iftab) .ok_or(XlsError::IfTab(iftab))?, ); formula.push('('); for w in args.windows(2) { formula.push_str(&fargs[w[0]..w[1]]); formula.push(','); } formula.pop(); formula.push(')'); } else { stack.push(formula.len()); formula.push_str(crate::utils::FTAB[iftab]); formula.push_str("()"); } } 0x23 | 0x43 | 0x63 => { let iname = read_u32(rgce) as usize - 1; // one-based stack.push(formula.len()); formula.push_str(names.get(iname).map_or("#REF!", |n| 
&*n.0)); rgce = &rgce[4..]; } 0x24 | 0x44 | 0x64 => { stack.push(formula.len()); let row = read_u16(rgce) + 1; let col = read_u16(&[rgce[2], rgce[3] & 0x3F]); if rgce[3] & 0x80 != 0x80 { formula.push('$'); } push_column(col as u32, &mut formula); if rgce[3] & 0x40 != 0x40 { formula.push('$'); } formula.push_str(&format!("{row}")); rgce = &rgce[4..]; } 0x25 | 0x45 | 0x65 => { stack.push(formula.len()); formula.push('$'); push_column(read_u16(&rgce[4..6]) as u32, &mut formula); write!(&mut formula, "${}:$", read_u16(&rgce[0..2]) as u32 + 1).unwrap(); push_column(read_u16(&rgce[6..8]) as u32, &mut formula); write!(&mut formula, "${}", read_u16(&rgce[2..4]) as u32 + 1).unwrap(); rgce = &rgce[8..]; } 0x2A | 0x4A | 0x6A => { stack.push(formula.len()); formula.push_str("#REF!"); rgce = &rgce[4..]; } 0x2B | 0x4B | 0x6B => { stack.push(formula.len()); formula.push_str("#REF!"); rgce = &rgce[8..]; } 0x39 | 0x59 => { // PfgNameX stack.push(formula.len()); formula.push_str("[PtgNameX]"); rgce = &rgce[6..]; } _ => { return Err(XlsError::Unrecognized { typ: "ptg", val: ptg, }); } } } if stack.len() == 1 { Ok(formula) } else { Err(XlsError::InvalidFormula { stack_size: stack.len(), }) } } /// `FormulaValue` [MS-XLS 2.5.133] fn parse_formula_value(r: &[u8]) -> Result, XlsError> { match *r { // String, value should be in next record [0x00, .., 0xFF, 0xFF] => Ok(None), [0x01, _, b, .., 0xFF, 0xFF] => Ok(Some(Data::Bool(b != 0))), [0x02, _, e, .., 0xFF, 0xFF] => parse_err(e).map(Some), // ignore, return blank string value [0x03, _, .., 0xFF, 0xFF] => Ok(Some(Data::String("".to_string()))), [e, .., 0xFF, 0xFF] => Err(XlsError::Unrecognized { typ: "error", val: e, }), _ => Ok(Some(Data::Float(read_f64(r)))), } } // OfficeArtRecord [MS-ODRAW 1.3.1]. 
#[cfg(feature = "picture")] struct ArtRecord<'a> { instance: u16, typ: u16, data: &'a [u8], } #[cfg(feature = "picture")] struct ArtRecordIter<'a> { stream: &'a [u8], } #[cfg(feature = "picture")] impl<'a> Iterator for ArtRecordIter<'a> { type Item = Result, XlsError>; fn next(&mut self) -> Option { if self.stream.len() < 8 { return if self.stream.is_empty() { None } else { Some(Err(XlsError::EoStream("art record header"))) }; } let ver_ins = read_u16(self.stream); let instance = ver_ins >> 4; let typ = read_u16(&self.stream[2..]); if typ < 0xF000 { return Some(Err(XlsError::Art("type range 0xF000 - 0xFFFF"))); } let len = read_usize(&self.stream[4..]); if self.stream.len() < len + 8 { return Some(Err(XlsError::EoStream("art record length"))); } let (d, next) = self.stream.split_at(len + 8); self.stream = next; let data = &d[8..]; Some(Ok(ArtRecord { instance, typ, data, })) } } // Parsing pictures. #[cfg(feature = "picture")] fn parse_pictures(stream: &[u8]) -> Result)>, XlsError> { let mut pics = Vec::new(); let records = ArtRecordIter { stream }; for record in records { let r = record?; match r.typ { // OfficeArtDggContainer [MS-ODRAW 2.2.12] // OfficeArtBStoreContainer [MS-ODRAW 2.2.20] 0xF000 | 0xF001 => pics.extend(parse_pictures(r.data)?), // OfficeArtFBSE [MS-ODRAW 2.2.32] 0xF007 => { let skip = 36 + r.data[33] as usize; pics.extend(parse_pictures(&r.data[skip..])?); } // OfficeArtBlip [MS-ODRAW 2.2.23] 0xF01A | 0xF01B | 0xF01C | 0xF01D | 0xF01E | 0xF01F | 0xF029 | 0xF02A => { let ext_skip = match r.typ { // OfficeArtBlipEMF [MS-ODRAW 2.2.24] 0xF01A => { let skip = match r.instance { 0x3D4 => 50usize, 0x3D5 => 66, _ => unreachable!(), }; Ok(("emf", skip)) } // OfficeArtBlipWMF [MS-ODRAW 2.2.25] 0xF01B => { let skip = match r.instance { 0x216 => 50usize, 0x217 => 66, _ => unreachable!(), }; Ok(("wmf", skip)) } // OfficeArtBlipPICT [MS-ODRAW 2.2.26] 0xF01C => { let skip = match r.instance { 0x542 => 50usize, 0x543 => 66, _ => unreachable!(), }; Ok(("pict", 
skip)) } // OfficeArtBlipJPEG [MS-ODRAW 2.2.27] 0xF01D | 0xF02A => { let skip = match r.instance { 0x46A | 0x6E2 => 17usize, 0x46B | 0x6E3 => 33, _ => unreachable!(), }; Ok(("jpg", skip)) } // OfficeArtBlipPNG [MS-ODRAW 2.2.28] 0xF01E => { let skip = match r.instance { 0x6E0 => 17usize, 0x6E1 => 33, _ => unreachable!(), }; Ok(("png", skip)) } // OfficeArtBlipDIB [MS-ODRAW 2.2.29] 0xF01F => { let skip = match r.instance { 0x7A8 => 17usize, 0x7A9 => 33, _ => unreachable!(), }; Ok(("dib", skip)) } // OfficeArtBlipTIFF [MS-ODRAW 2.2.30] 0xF029 => { let skip = match r.instance { 0x6E4 => 17usize, 0x6E5 => 33, _ => unreachable!(), }; Ok(("tiff", skip)) } _ => Err(XlsError::Art("picture type not support")), }; let ext_skip = ext_skip?; pics.push((ext_skip.0.to_string(), Vec::from(&r.data[ext_skip.1..]))); } _ => {} } } Ok(pics) } #[cfg(test)] mod tests { use super::*; #[test] fn test_parse_string() { let enc = XlsEncoding::from_codepage(1252).unwrap(); parse_string(&[0, 1], &enc, Biff::Biff8).unwrap_err(); } } calamine-0.34.0/src/xlsb/cells_reader.rs000064400000000000000000000171471046102023000162240ustar 00000000000000// SPDX-License-Identifier: MIT // // Copyright 2016-2025, Johann Tuffe. 
use std::io::{Read, Seek}; use crate::{ datatype::DataRef, formats::{format_excel_f64_ref, CellFormat}, utils::{read_f64, read_i32, read_u32, read_usize}, Cell, CellErrorType, Dimensions, XlsbError, }; use super::{cell_format, parse_formula, wide_str, RecordIter}; /// A cells reader for xlsb files pub struct XlsbCellsReader<'a, RS> where RS: Read + Seek, { iter: RecordIter<'a, RS>, formats: &'a [CellFormat], strings: &'a [String], extern_sheets: &'a [String], metadata_names: &'a [(String, String)], typ: u16, row: u32, is_1904: bool, dimensions: Dimensions, buf: Vec, } impl<'a, RS> XlsbCellsReader<'a, RS> where RS: Read + Seek, { pub(crate) fn new( mut iter: RecordIter<'a, RS>, formats: &'a [CellFormat], strings: &'a [String], extern_sheets: &'a [String], metadata_names: &'a [(String, String)], is_1904: bool, ) -> Result { let mut buf = Vec::with_capacity(1024); // BrtWsDim let _ = iter.next_skip_blocks( 0x0094, &[ (0x0081, None), // BrtBeginSheet (0x0093, None), // BrtWsProp ], &mut buf, )?; let dimensions = parse_dimensions(&buf[..16]); // BrtBeginSheetData let _ = iter.next_skip_blocks( 0x0091, &[ (0x0085, Some(0x0086)), // Views (0x0025, Some(0x0026)), // AC blocks (0x01E5, None), // BrtWsFmtInfo (0x0186, Some(0x0187)), // Col Infos ], &mut buf, )?; Ok(XlsbCellsReader { iter, formats, is_1904, strings, extern_sheets, metadata_names, dimensions, typ: 0, row: 0, buf, }) } pub fn dimensions(&self) -> Dimensions { self.dimensions } pub fn next_cell(&mut self) -> Result>>, XlsbError> { // loop until end of sheet let value = loop { self.buf.clear(); self.typ = self.iter.read_type()?; let _ = self.iter.fill_buffer(&mut self.buf)?; let value = match self.typ { // 0x0001 => continue, // Data::Empty, // BrtCellBlank 0x0002 => { // BrtCellRk MS-XLSB 2.5.122 let d100 = (self.buf[8] & 1) != 0; let is_int = (self.buf[8] & 2) != 0; self.buf[8] &= 0xFC; if is_int { let v = (read_i32(&self.buf[8..12]) >> 2) as i64; if d100 { let v = (v as f64) / 100.0; format_excel_f64_ref( v, 
cell_format(self.formats, &self.buf), self.is_1904, ) } else { DataRef::Int(v) } } else { let mut v = [0u8; 8]; v[4..].copy_from_slice(&self.buf[8..12]); let v = read_f64(&v); let v = if d100 { v / 100.0 } else { v }; format_excel_f64_ref(v, cell_format(self.formats, &self.buf), self.is_1904) } } 0x0003 => { let error = match self.buf[8] { 0x00 => CellErrorType::Null, 0x07 => CellErrorType::Div0, 0x0F => CellErrorType::Value, 0x17 => CellErrorType::Ref, 0x1D => CellErrorType::Name, 0x24 => CellErrorType::Num, 0x2A => CellErrorType::NA, 0x2B => CellErrorType::GettingData, c => return Err(XlsbError::CellError(c)), }; // BrtCellError DataRef::Error(error) } 0x0004 | 0x000A => DataRef::Bool(self.buf[8] != 0), // BrtCellBool or BrtFmlaBool 0x0005 | 0x0009 => { let v = read_f64(&self.buf[8..16]); format_excel_f64_ref(v, cell_format(self.formats, &self.buf), self.is_1904) } // BrtCellReal or BrtFmlaNum 0x0006 | 0x0008 => DataRef::String(wide_str(&self.buf[8..], &mut 0)?.into_owned()), // BrtCellSt or BrtFmlaString 0x0007 => { // BrtCellIsst let isst = read_usize(&self.buf[8..12]); DataRef::SharedString(&self.strings[isst]) } 0x0000 => { // BrtRowHdr self.row = read_u32(&self.buf); if self.row > 0x0010_0000 { return Ok(None); // invalid row } continue; } 0x0092 => return Ok(None), // BrtEndSheetData _ => continue, // anything else, ignore and try next, without changing idx }; break value; }; let col = read_u32(&self.buf); Ok(Some(Cell::new((self.row, col), value))) } pub fn next_formula(&mut self) -> Result>, XlsbError> { let value = loop { self.typ = self.iter.read_type()?; let _ = self.iter.fill_buffer(&mut self.buf)?; let value = match self.typ { // 0x0001 => continue, // Data::Empty, // BrtCellBlank 0x0008 => { // BrtFmlaString let cch = read_u32(&self.buf[8..]) as usize; let formula = &self.buf[14 + cch * 2..]; let cce = read_u32(formula) as usize; let rgce = &formula[4..4 + cce]; parse_formula(rgce, self.extern_sheets, self.metadata_names)? 
} 0x0009 => { // BrtFmlaNum let formula = &self.buf[18..]; let cce = read_u32(formula) as usize; let rgce = &formula[4..4 + cce]; parse_formula(rgce, self.extern_sheets, self.metadata_names)? } 0x000A | 0x000B => { // BrtFmlaBool | BrtFmlaError let formula = &self.buf[11..]; let cce = read_u32(formula) as usize; let rgce = &formula[4..4 + cce]; parse_formula(rgce, self.extern_sheets, self.metadata_names)? } 0x0000 => { // BrtRowHdr self.row = read_u32(&self.buf); if self.row > 0x0010_0000 { return Ok(None); // invalid row } continue; } 0x0092 => return Ok(None), // BrtEndSheetData _ => continue, // anything else, ignore and try next, without changing idx }; break value; }; let col = read_u32(&self.buf); Ok(Some(Cell::new((self.row, col), value))) } } fn parse_dimensions(buf: &[u8]) -> Dimensions { Dimensions { start: (read_u32(&buf[0..4]), read_u32(&buf[8..12])), end: (read_u32(&buf[4..8]), read_u32(&buf[12..16])), } } calamine-0.34.0/src/xlsb/mod.rs000064400000000000000000001124501046102023000143500ustar 00000000000000// SPDX-License-Identifier: MIT // // Copyright 2016-2025, Johann Tuffe. 
mod cells_reader;

pub use cells_reader::XlsbCellsReader;

use std::borrow::Cow;
use std::collections::HashMap;
use std::io::{BufReader, Read, Seek};

use log::debug;

use encoding_rs::UTF_16LE;
use quick_xml::events::attributes::Attribute;
use quick_xml::events::Event;
use quick_xml::name::QName;
use quick_xml::Reader as XmlReader;
use zip::read::{ZipArchive, ZipFile};
use zip::result::ZipError;

use crate::datatype::DataRef;
use crate::formats::{builtin_format_by_code, detect_custom_number_format, CellFormat};
use crate::utils::{
    build_zip_path_cache, cached_zip_path, push_column, read_f64, read_i32, read_u16, read_u32,
    read_usize,
};
use crate::vba::VbaProject;
use crate::{
    Cell, Data, HeaderRow, Metadata, Range, Reader, ReaderRef, Sheet, SheetType, SheetVisible,
};

/// Errors that can occur while reading an xlsb workbook.
#[derive(Debug)]
pub enum XlsbError {
    /// I/O error while reading from the underlying reader.
    Io(std::io::Error),
    /// Error reported by the zip archive layer.
    Zip(zip::result::ZipError),
    /// Error reported while parsing an XML part.
    Xml(quick_xml::Error),
    /// Error reported while parsing an XML attribute.
    XmlAttr(quick_xml::events::attributes::AttrError),
    /// Error reported while reading the VBA project.
    Vba(crate::vba::VbaError),
    /// A value did not match what was expected.
    Mismatch {
        /// Description of what was expected.
        expected: &'static str,
        /// Value that was found instead (displayed in hexadecimal).
        found: u16,
    },
    /// A part was not found in the archive; holds the requested path.
    FileNotFound(String),
    /// Invalid formula: the operand stack is shorter than a token requires,
    /// or does not hold exactly one entry once parsing is finished.
    StackLen,
    /// Unsupported type; holds the type code (displayed in hexadecimal).
    UnsupportedType(u16),
    /// Unsupported `etpg` byte following a `0x18` or `0x19` formula token.
    Etpg(u8),
    /// Unsupported function table index (`iftab`) in a formula.
    IfTab(usize),
    /// Unsupported error code (`BErr`) in a formula error token.
    BErr(u8),
    /// Unsupported formula token (Ptg).
    Ptg(u8),
    /// Unsupported error code in a cell error record.
    CellError(u8),
    /// The declared length of a wide string exceeds the available buffer.
    WideStr {
        /// Number of bytes the string requires, including its 4-byte length
        /// prefix.
        ws_len: usize,
        /// Number of bytes actually available.
        buf_len: usize,
    },
    /// A field held a value that is not recognized.
    Unrecognized {
        /// Name of the field being decoded.
        typ: &'static str,
        /// Value that was found.
        val: String,
    },
    /// Workbook is password protected.
    Password,
    /// No worksheet with the given name exists; holds the requested name.
    WorksheetNotFound(String),
    /// Error reported while decoding the text of an XML part.
    Encoding(quick_xml::encoding::EncodingError),
}

from_err!(std::io::Error, XlsbError, Io);
from_err!(zip::result::ZipError, XlsbError, Zip);
from_err!(quick_xml::Error, XlsbError, Xml);
from_err!(quick_xml::events::attributes::AttrError, XlsbError, XmlAttr); from_err!(quick_xml::encoding::EncodingError, XlsbError, Encoding); from_err!(crate::vba::VbaError, XlsbError, Vba); impl std::fmt::Display for XlsbError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { XlsbError::Io(e) => write!(f, "I/O error: {e}"), XlsbError::Zip(e) => write!(f, "Zip error: {e}"), XlsbError::Xml(e) => write!(f, "Xml error: {e}"), XlsbError::XmlAttr(e) => write!(f, "Xml attribute error: {e}"), XlsbError::Vba(e) => write!(f, "Vba error: {e}"), XlsbError::Mismatch { expected, found } => { write!(f, "Expecting {expected}, got {found:X}") } XlsbError::FileNotFound(file) => write!(f, "File not found: '{file}'"), XlsbError::StackLen => write!(f, "Invalid stack length"), XlsbError::UnsupportedType(t) => write!(f, "Unsupported type {t:X}"), XlsbError::Etpg(t) => write!(f, "Unsupported etpg {t:X}"), XlsbError::IfTab(t) => write!(f, "Unsupported iftab {t:X}"), XlsbError::BErr(t) => write!(f, "Unsupported BErr {t:X}"), XlsbError::Ptg(t) => write!(f, "Unsupported Ptf {t:X}"), XlsbError::CellError(t) => write!(f, "Unsupported Cell Error code {t:X}"), XlsbError::WideStr { ws_len, buf_len } => write!( f, "Wide str length exceeds buffer length ({ws_len} > {buf_len})", ), XlsbError::Unrecognized { typ, val } => { write!(f, "Unrecognized {typ}: {val}") } XlsbError::Password => write!(f, "Workbook is password protected"), XlsbError::WorksheetNotFound(name) => write!(f, "Worksheet '{name}' not found"), XlsbError::Encoding(e) => write!(f, "XML encoding error: {e}"), } } } impl std::error::Error for XlsbError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { XlsbError::Io(e) => Some(e), XlsbError::Zip(e) => Some(e), XlsbError::Xml(e) => Some(e), XlsbError::Vba(e) => Some(e), _ => None, } } } /// Xlsb reader options #[derive(Debug, Default)] #[non_exhaustive] struct XlsbOptions { pub header_row: HeaderRow, } /// A Xlsb reader pub 
struct Xlsb { zip: ZipArchive, extern_sheets: Vec, sheets: Vec<(String, String)>, strings: Vec, /// Cell (number) formats formats: Vec, is_1904: bool, metadata: Metadata, #[cfg(feature = "picture")] pictures: Option)>>, options: XlsbOptions, zip_path_cache: HashMap, } impl Xlsb { /// MS-XLSB fn read_relationships(&mut self) -> Result, String>, XlsbError> { let mut relationships = HashMap::new(); match self.zip.by_name("xl/_rels/workbook.bin.rels") { Ok(f) => { let mut xml = XmlReader::from_reader(BufReader::new(f)); let config = xml.config_mut(); config.check_end_names = false; config.trim_text(false); config.check_comments = false; config.expand_empty_elements = true; let mut buf: Vec = Vec::with_capacity(64); loop { match xml.read_event_into(&mut buf) { Ok(Event::Start(e)) if e.name() == QName(b"Relationship") => { let mut id = None; let mut target = None; for a in e.attributes() { match a? { Attribute { key: QName(b"Id"), value: v, } => { id = Some(v.to_vec()); } Attribute { key: QName(b"Target"), value: v, } => { target = Some( xml.decoder() .decode(&v) .map_err(XlsbError::Encoding)? .into_owned(), ); } _ => (), } } if let (Some(id), Some(target)) = (id, target) { relationships.insert(id, target); } } Ok(Event::Eof) => break, Err(e) => return Err(XlsbError::Xml(e)), _ => (), } buf.clear(); } } Err(ZipError::FileNotFound) => (), Err(e) => return Err(XlsbError::Zip(e)), } Ok(relationships) } /// MS-XLSB 2.1.7.50 Styles fn read_styles(&mut self) -> Result<(), XlsbError> { let mut iter = match RecordIter::from_zip(&mut self.zip, "xl/styles.bin", &self.zip_path_cache) { Ok(iter) => iter, Err(_) => return Ok(()), // it is fine if path does not exists }; let mut buf = Vec::with_capacity(1024); let mut number_formats = HashMap::new(); loop { match iter.read_type()? 
{ 0x0267 => { // BrtBeginFmts let _len = iter.fill_buffer(&mut buf)?; let len = read_usize(&buf); for _ in 0..len { let _ = iter.next_skip_blocks(0x002C, &[], &mut buf)?; // BrtFmt let fmt_code = read_u16(&buf); let fmt_str = wide_str(&buf[2..], &mut 0)?; number_formats .insert(fmt_code, detect_custom_number_format(fmt_str.as_ref())); } } 0x0269 => { // BrtBeginCellXFs let _len = iter.fill_buffer(&mut buf)?; let len = read_usize(&buf); for _ in 0..len { let _ = iter.next_skip_blocks(0x002F, &[], &mut buf)?; // BrtXF let fmt_code = read_u16(&buf[2..4]); match builtin_format_by_code(fmt_code) { CellFormat::DateTime => self.formats.push(CellFormat::DateTime), CellFormat::TimeDelta => self.formats.push(CellFormat::TimeDelta), CellFormat::Other => { self.formats.push( number_formats .get(&fmt_code) .copied() .unwrap_or(CellFormat::Other), ); } } } // BrtBeginCellXFs is always present and always after BrtBeginFmts break; } _ => (), } buf.clear(); } Ok(()) } /// MS-XLSB 2.1.7.45 fn read_shared_strings(&mut self) -> Result<(), XlsbError> { let mut iter = match RecordIter::from_zip(&mut self.zip, "xl/sharedStrings.bin", &self.zip_path_cache) { Ok(iter) => iter, Err(_) => return Ok(()), // it is fine if path does not exists }; let mut buf = Vec::with_capacity(1024); let _ = iter.next_skip_blocks(0x009F, &[], &mut buf)?; // BrtBeginSst let len = read_usize(&buf[4..8]); // BrtSSTItems for _ in 0..len { let _ = iter.next_skip_blocks( 0x0013, &[ (0x0023, Some(0x0024)), // future ], &mut buf, )?; // BrtSSTItem self.strings.push(wide_str(&buf[1..], &mut 0)?.into_owned()); } Ok(()) } /// MS-XLSB 2.1.7.61 fn read_workbook(&mut self, relationships: &HashMap, String>) -> Result<(), XlsbError> { let mut iter = RecordIter::from_zip(&mut self.zip, "xl/workbook.bin", &self.zip_path_cache)?; let mut buf = Vec::with_capacity(1024); loop { match iter.read_type()? 
{ 0x0099 => { let _ = iter.fill_buffer(&mut buf)?; self.is_1904 = &buf[0] & 0x1 != 0; } // BrtWbProp 0x009C => { // BrtBundleSh let len = iter.fill_buffer(&mut buf)?; let rel_len = read_u32(&buf[8..len]); if rel_len != 0xFFFF_FFFF { let rel_len = rel_len as usize * 2; let relid = &buf[12..12 + rel_len]; // converts utf16le to utf8 for HashMap search let relid = UTF_16LE.decode(relid).0; let path = format!("xl/{}", relationships[relid.as_bytes()]); // ST_SheetState let visible = match read_u32(&buf) { 0 => SheetVisible::Visible, 1 => SheetVisible::Hidden, 2 => SheetVisible::VeryHidden, v => { return Err(XlsbError::Unrecognized { typ: "BoundSheet8:hsState", val: v.to_string(), }) } }; let typ = match path.split('/').nth(1) { Some("worksheets") => SheetType::WorkSheet, Some("chartsheets") => SheetType::ChartSheet, Some("dialogsheets") => SheetType::DialogSheet, _ => { return Err(XlsbError::Unrecognized { typ: "BoundSheet8:dt", val: path.to_string(), }) } }; let name = wide_str(&buf[12 + rel_len..len], &mut 0)?; self.metadata.sheets.push(Sheet { name: name.to_string(), typ, visible, }); self.sheets.push((name.into_owned(), path)); } } 0x0090 => break, // BrtEndBundleShs _ => (), } buf.clear(); } // BrtName let mut defined_names = Vec::new(); loop { let typ = iter.read_type()?; match typ { 0x016A => { // BrtExternSheet let _len = iter.fill_buffer(&mut buf)?; let cxti = read_u32(&buf[..4]) as usize; if cxti < 1_000_000 { self.extern_sheets.reserve(cxti); } let sheets = &self.sheets; let extern_sheets = buf[4..] 
.chunks(12) .map(|xti| { match read_i32(&xti[4..8]) { -2 => "#ThisWorkbook", -1 => "#InvalidWorkSheet", p if p >= 0 && (p as usize) < sheets.len() => &sheets[p as usize].0, _ => "#Unknown", } .to_string() }) .take(cxti) .collect(); self.extern_sheets = extern_sheets; } 0x0027 => { // BrtName let len = iter.fill_buffer(&mut buf)?; let mut str_len = 0; let name = wide_str(&buf[9..len], &mut str_len)?.into_owned(); let rgce_len = read_u32(&buf[9 + str_len..]) as usize; let rgce = &buf[13 + str_len..13 + str_len + rgce_len]; let formula = parse_formula(rgce, &self.extern_sheets, &defined_names)?; defined_names.push((name, formula)); } 0x009D | 0x0225 | 0x018D | 0x0180 | 0x009A | 0x0252 | 0x0229 | 0x009B | 0x0084 => { // record supposed to happen AFTER BrtNames self.metadata.names = defined_names; return Ok(()); } _ => debug!("Unsupported type {typ:X}"), } } } /// Get a cells reader for a given worksheet pub fn worksheet_cells_reader<'a>( &'a mut self, name: &str, ) -> Result, XlsbError> { let path = match self.sheets.iter().find(|&(n, _)| n == name) { Some((_, path)) => path.clone(), None => return Err(XlsbError::WorksheetNotFound(name.into())), }; let iter = RecordIter::from_zip(&mut self.zip, &path, &self.zip_path_cache)?; XlsbCellsReader::new( iter, &self.formats, &self.strings, &self.extern_sheets, &self.metadata.names, self.is_1904, ) } #[cfg(feature = "picture")] fn read_pictures(&mut self) -> Result<(), XlsbError> { let mut pics = Vec::new(); for i in 0..self.zip.len() { let mut zfile = self.zip.by_index(i)?; let zname = zfile.name(); if zname.starts_with("xl/media") { if let Some(ext) = zname.split('.').next_back() { if [ "emf", "wmf", "pict", "jpeg", "jpg", "png", "dib", "gif", "tiff", "eps", "bmp", "wpg", ] .contains(&ext) { let ext = ext.to_string(); let mut buf: Vec = Vec::new(); zfile.read_to_end(&mut buf)?; pics.push((ext, buf)); } } } } if !pics.is_empty() { self.pictures = Some(pics); } Ok(()) } } impl Reader for Xlsb { type Error = XlsbError; fn 
new(mut reader: RS) -> Result { check_for_password_protected(&mut reader)?; let zip = ZipArchive::new(reader)?; let zip_path_cache = build_zip_path_cache(&zip); let mut xlsb = Xlsb { zip, sheets: Vec::new(), strings: Vec::new(), extern_sheets: Vec::new(), formats: Vec::new(), is_1904: false, metadata: Metadata::default(), #[cfg(feature = "picture")] pictures: None, options: XlsbOptions::default(), zip_path_cache, }; xlsb.read_shared_strings()?; xlsb.read_styles()?; let relationships = xlsb.read_relationships()?; xlsb.read_workbook(&relationships)?; #[cfg(feature = "picture")] xlsb.read_pictures()?; Ok(xlsb) } fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self { self.options.header_row = header_row; self } fn vba_project(&mut self) -> Result, XlsbError> { let Some(mut f) = self.zip.by_name("xl/vbaProject.bin").ok() else { return Ok(None); }; let len = f.size() as usize; let vba = VbaProject::new(&mut f, len)?; Ok(Some(vba)) } fn metadata(&self) -> &Metadata { &self.metadata } /// MS-XLSB 2.1.7.62 fn worksheet_range(&mut self, name: &str) -> Result, XlsbError> { let rge = self.worksheet_range_ref(name)?; let inner = rge.inner.into_iter().map(|v| v.into()).collect(); Ok(Range { start: rge.start, end: rge.end, inner, }) } /// MS-XLSB 2.1.7.62 fn worksheet_formula(&mut self, name: &str) -> Result, XlsbError> { let mut cells_reader = self.worksheet_cells_reader(name)?; let mut cells = Vec::with_capacity(cells_reader.dimensions().len().min(1_000_000) as _); while let Some(cell) = cells_reader.next_formula()? 
{ if !cell.val.is_empty() { cells.push(cell); } } Ok(Range::from_sparse(cells)) } /// MS-XLSB 2.1.7.62 fn worksheets(&mut self) -> Vec<(String, Range)> { let sheets = self .sheets .iter() .map(|(name, _)| name.clone()) .collect::>(); sheets .into_iter() .filter_map(|name| { let ws = self.worksheet_range(&name).ok()?; Some((name, ws)) }) .collect() } #[cfg(feature = "picture")] fn pictures(&self) -> Option)>> { self.pictures.to_owned() } } impl ReaderRef for Xlsb { fn worksheet_range_ref<'a>(&'a mut self, name: &str) -> Result>, XlsbError> { let header_row = self.options.header_row; let mut cell_reader = self.worksheet_cells_reader(name)?; let len = cell_reader.dimensions().len(); let mut cells = Vec::new(); if len < 100_000 { cells.reserve(len as usize); } match header_row { HeaderRow::FirstNonEmptyRow => { // the header row is the row of the first non-empty cell loop { match cell_reader.next_cell() { Ok(Some(Cell { val: DataRef::Empty, .. })) => (), Ok(Some(cell)) => cells.push(cell), Ok(None) => break, Err(e) => return Err(e), } } } HeaderRow::Row(header_row_idx) => { // If `header_row` is a row index, we only add non-empty cells after this index. loop { match cell_reader.next_cell() { Ok(Some(Cell { val: DataRef::Empty, .. })) => (), Ok(Some(cell)) => { if cell.pos.0 >= header_row_idx { cells.push(cell); } } Ok(None) => break, Err(e) => return Err(e), } } // If `header_row` is set and the first non-empty cell is not at the `header_row`, we add // an empty cell at the beginning with row `header_row` and same column as the first non-empty cell. 
if cells.first().is_some_and(|c| c.pos.0 != header_row_idx) { cells.insert( 0, Cell { pos: ( header_row_idx, cells.first().expect("cells should not be empty").pos.1, ), val: DataRef::Empty, }, ); } } } Ok(Range::from_sparse(cells)) } } pub(crate) struct RecordIter<'a, RS> where RS: Read + Seek, { b: [u8; 1], r: BufReader>, } impl<'a, RS> RecordIter<'a, RS> where RS: Read + Seek, { fn from_zip( zip: &'a mut ZipArchive, path: &str, cache: &HashMap, ) -> Result, XlsbError> { let zip_path = cached_zip_path(cache, path); match zip.by_name(zip_path) { Ok(f) => Ok(RecordIter { r: BufReader::new(f), b: [0], }), Err(ZipError::FileNotFound) => Err(XlsbError::FileNotFound(path.into())), Err(e) => Err(XlsbError::Zip(e)), } } fn read_u8(&mut self) -> Result { self.r.read_exact(&mut self.b)?; Ok(self.b[0]) } /// Read next type, until we have no future record fn read_type(&mut self) -> Result { let b = self.read_u8()?; let typ = if (b & 0x80) == 0x80 { (b & 0x7F) as u16 + (((self.read_u8()? & 0x7F) as u16) << 7) } else { b as u16 }; Ok(typ) } fn fill_buffer(&mut self, buf: &mut Vec) -> Result { let mut b = self.read_u8()?; let mut len = (b & 0x7F) as usize; for i in 1..4 { if (b & 0x80) == 0 { break; } b = self.read_u8()?; len += ((b & 0x7F) as usize) << (7 * i); } if buf.len() < len { buf.resize(len, 0); } self.r.read_exact(&mut buf[..len])?; Ok(len) } /// Reads next type, and discard blocks between `start` and `end` fn next_skip_blocks( &mut self, record_type: u16, bounds: &[(u16, Option)], buf: &mut Vec, ) -> Result { loop { let typ = self.read_type()?; let len = self.fill_buffer(buf)?; if typ == record_type { return Ok(len); } if let Some(end) = bounds.iter().find(|b| b.0 == typ).and_then(|b| b.1) { while self.read_type()? 
!= end { let _ = self.fill_buffer(buf)?; } let _ = self.fill_buffer(buf)?; } } } } fn wide_str<'a>(buf: &'a [u8], str_len: &mut usize) -> Result, XlsbError> { let len = read_u32(buf) as usize; if buf.len() < 4 + len * 2 { return Err(XlsbError::WideStr { ws_len: 4 + len * 2, buf_len: buf.len(), }); } *str_len = 4 + len * 2; let s = &buf[4..*str_len]; Ok(UTF_16LE.decode(s).0) } /// Formula parsing /// /// [MS-XLSB 2.2.2] /// [MS-XLSB 2.5.97] /// /// See Ptg [2.5.97.16] fn parse_formula( mut rgce: &[u8], sheets: &[String], names: &[(String, String)], ) -> Result { if rgce.is_empty() { return Ok(String::new()); } let mut stack = Vec::new(); let mut formula = String::with_capacity(rgce.len()); while !rgce.is_empty() { let ptg = rgce[0]; rgce = &rgce[1..]; match ptg { 0x3a | 0x5a | 0x7a => { // PtgRef3d let ixti = read_u16(&rgce[0..2]); stack.push(formula.len()); formula.push_str(&sheets[ixti as usize]); formula.push('!'); // TODO: check with relative columns formula.push('$'); push_column(read_u16(&rgce[6..8]) as u32, &mut formula); formula.push('$'); formula.push_str(&format!("{}", read_u32(&rgce[2..6]) + 1)); rgce = &rgce[8..]; } 0x3b | 0x5b | 0x7b => { // PtgArea3d let ixti = read_u16(&rgce[0..2]); stack.push(formula.len()); formula.push_str(&sheets[ixti as usize]); formula.push('!'); // TODO: check with relative columns formula.push('$'); push_column(read_u16(&rgce[10..12]) as u32, &mut formula); formula.push('$'); formula.push_str(&format!("{}", read_u32(&rgce[2..6]) + 1)); formula.push(':'); formula.push('$'); push_column(read_u16(&rgce[12..14]) as u32, &mut formula); formula.push('$'); formula.push_str(&format!("{}", read_u32(&rgce[6..10]) + 1)); rgce = &rgce[14..]; } 0x3c | 0x5c | 0x7c => { // PtfRefErr3d let ixti = read_u16(&rgce[0..2]); stack.push(formula.len()); formula.push_str(&sheets[ixti as usize]); formula.push('!'); formula.push_str("#REF!"); rgce = &rgce[8..]; } 0x3d | 0x5d | 0x7d => { // PtgAreaErr3d let ixti = read_u16(&rgce[0..2]); 
stack.push(formula.len()); formula.push_str(&sheets[ixti as usize]); formula.push('!'); formula.push_str("#REF!"); rgce = &rgce[14..]; } 0x01 => { // PtgExp: array/shared formula, ignore debug!("ignoring PtgExp array/shared formula"); stack.push(formula.len()); rgce = &rgce[4..]; } 0x03..=0x11 => { // binary operation let e2 = stack.pop().ok_or(XlsbError::StackLen)?; let e2 = formula.split_off(e2); // imaginary 'e1' will actually already be the start of the binary op let op = match ptg { 0x03 => "+", 0x04 => "-", 0x05 => "*", 0x06 => "/", 0x07 => "^", 0x08 => "&", 0x09 => "<", 0x0A => "<=", 0x0B => "=", 0x0C => ">", 0x0D => ">=", 0x0E => "<>", 0x0F => " ", 0x10 => ",", 0x11 => ":", _ => unreachable!(), }; formula.push_str(op); formula.push_str(&e2); } 0x12 => { let e = stack.last().ok_or(XlsbError::StackLen)?; formula.insert(*e, '+'); } 0x13 => { let e = stack.last().ok_or(XlsbError::StackLen)?; formula.insert(*e, '-'); } 0x14 => { formula.push('%'); } 0x15 => { let e = stack.last().ok_or(XlsbError::StackLen)?; formula.insert(*e, '('); formula.push(')'); } 0x16 => { stack.push(formula.len()); } 0x17 => { stack.push(formula.len()); formula.push('\"'); let cch = read_u16(&rgce[0..2]) as usize; formula.push_str(&UTF_16LE.decode(&rgce[2..2 + 2 * cch]).0); formula.push('\"'); rgce = &rgce[2 + 2 * cch..]; } 0x18 => { stack.push(formula.len()); let eptg = rgce[0]; rgce = &rgce[1..]; match eptg { 0x19 => rgce = &rgce[12..], 0x1D => rgce = &rgce[4..], e => return Err(XlsbError::Etpg(e)), } } 0x19 => { let eptg = rgce[0]; rgce = &rgce[1..]; match eptg { 0x01 | 0x02 | 0x08 | 0x20 | 0x21 | 0x40 | 0x41 | 0x80 => rgce = &rgce[2..], 0x04 => rgce = &rgce[10..], 0x10 => { rgce = &rgce[2..]; let e = stack.last().ok_or(XlsbError::StackLen)?; let e = formula.split_off(*e); formula.push_str("SUM("); formula.push_str(&e); formula.push(')'); } e => return Err(XlsbError::Etpg(e)), } } 0x1C => { stack.push(formula.len()); let err = rgce[0]; rgce = &rgce[1..]; match err { 0x00 => 
formula.push_str("#NULL!"), 0x07 => formula.push_str("#DIV/0!"), 0x0F => formula.push_str("#VALUE!"), 0x17 => formula.push_str("#REF!"), 0x1D => formula.push_str("#NAME?"), 0x24 => formula.push_str("#NUM!"), 0x2A => formula.push_str("#N/A"), 0x2B => formula.push_str("#GETTING_DATA"), e => return Err(XlsbError::BErr(e)), } } 0x1D => { stack.push(formula.len()); formula.push_str(if rgce[0] == 0 { "FALSE" } else { "TRUE" }); rgce = &rgce[1..]; } 0x1E => { stack.push(formula.len()); formula.push_str(&format!("{}", read_u16(rgce))); rgce = &rgce[2..]; } 0x1F => { stack.push(formula.len()); formula.push_str(&format!("{}", read_f64(rgce))); rgce = &rgce[8..]; } 0x20 | 0x40 | 0x60 => { // PtgArray: ignore stack.push(formula.len()); rgce = &rgce[14..]; } 0x21 | 0x22 | 0x41 | 0x42 | 0x61 | 0x62 => { let (iftab, argc) = match ptg { 0x22 | 0x42 | 0x62 => { let iftab = read_u16(&rgce[1..]) as usize; let argc = rgce[0] as usize; rgce = &rgce[3..]; (iftab, argc) } _ => { let iftab = read_u16(rgce) as usize; if iftab > crate::utils::FTAB_LEN { return Err(XlsbError::IfTab(iftab)); } rgce = &rgce[2..]; let argc = crate::utils::FTAB_ARGC[iftab] as usize; (iftab, argc) } }; if stack.len() < argc { return Err(XlsbError::StackLen); } if argc > 0 { let args_start = stack.len() - argc; let mut args = stack.split_off(args_start); let start = args[0]; for s in &mut args { *s -= start; } let fargs = formula.split_off(start); stack.push(formula.len()); args.push(fargs.len()); formula.push_str(crate::utils::FTAB[iftab]); formula.push('('); for w in args.windows(2) { formula.push_str(&fargs[w[0]..w[1]]); formula.push(','); } formula.pop(); formula.push(')'); } else { stack.push(formula.len()); formula.push_str(crate::utils::FTAB[iftab]); formula.push_str("()"); } } 0x23 | 0x43 | 0x63 => { let iname = read_u32(rgce) as usize - 1; // one-based stack.push(formula.len()); if let Some(name) = names.get(iname) { formula.push_str(&name.0); } rgce = &rgce[4..]; } 0x24 | 0x44 | 0x64 => { let row = 
read_u32(rgce) + 1; let col = [rgce[4], rgce[5] & 0x3F]; let col = read_u16(&col); stack.push(formula.len()); if rgce[5] & 0x80 != 0x80 { formula.push('$'); } push_column(col as u32, &mut formula); if rgce[5] & 0x40 != 0x40 { formula.push('$'); } formula.push_str(&format!("{row}")); rgce = &rgce[6..]; } 0x25 | 0x45 | 0x65 => { stack.push(formula.len()); formula.push('$'); push_column(read_u16(&rgce[8..10]) as u32, &mut formula); formula.push('$'); formula.push_str(&format!("{}", read_u32(&rgce[0..4]) + 1)); formula.push(':'); formula.push('$'); push_column(read_u16(&rgce[10..12]) as u32, &mut formula); formula.push('$'); formula.push_str(&format!("{}", read_u32(&rgce[4..8]) + 1)); rgce = &rgce[12..]; } 0x2A | 0x4A | 0x6A => { stack.push(formula.len()); formula.push_str("#REF!"); rgce = &rgce[6..]; } 0x2B | 0x4B | 0x6B => { stack.push(formula.len()); formula.push_str("#REF!"); rgce = &rgce[12..]; } 0x29 | 0x49 | 0x69 => { let cce = read_u16(rgce) as usize; rgce = &rgce[2..]; let f = parse_formula(&rgce[..cce], sheets, names)?; stack.push(formula.len()); formula.push_str(&f); rgce = &rgce[cce..]; } 0x39 | 0x59 | 0x79 => { // TODO: external workbook ... ignore this formula ... stack.push(formula.len()); formula.push_str("EXTERNAL_WB_NAME"); rgce = &rgce[6..]; } _ => return Err(XlsbError::Ptg(ptg)), } } if stack.len() == 1 { Ok(formula) } else { Err(XlsbError::StackLen) } } fn cell_format<'a>(formats: &'a [CellFormat], buf: &[u8]) -> Option<&'a CellFormat> { // Parses a Cell (MS-XLSB 2.5.9) and determines if it references a Date format // iStyleRef is stored as a 24bit integer starting at the fifth byte let style_ref = u32::from_le_bytes([buf[4], buf[5], buf[6], 0]); formats.get(style_ref as usize) } fn check_for_password_protected(reader: &mut RS) -> Result<(), XlsbError> { let offset_end = reader.seek(std::io::SeekFrom::End(0))? 
as usize; reader.seek(std::io::SeekFrom::Start(0))?; if let Ok(cfb) = crate::cfb::Cfb::new(reader, offset_end) { if cfb.has_directory("EncryptedPackage") { return Err(XlsbError::Password); } } Ok(()) } calamine-0.34.0/src/xlsx/cells_reader.rs000064400000000000000000000454731046102023000162550ustar 00000000000000// SPDX-License-Identifier: MIT // // Copyright 2016-2025, Johann Tuffe. use quick_xml::{ events::{attributes::Attribute, BytesStart, Event}, name::QName, }; use std::{ borrow::{Borrow, Cow}, collections::HashMap, io::{Read, Seek}, }; use super::{ get_attribute, get_dimension, get_row, get_row_column, read_string_with_bufs, replace_cell_names, Dimensions, XlReader, }; use crate::{ datatype::DataRef, formats::{format_excel_f64_ref, CellFormat}, utils::unescape_entity_to_buffer, Cell, XlsxError, }; type FormulaMap = HashMap<(u32, u32), (i64, i64)>; /// Workbook-level context used when reading cell values. struct WorkbookContext<'a> { strings: &'a [String], formats: &'a [CellFormat], is_1904: bool, } /// Reusable scratch buffers for cell value parsing (avoid per-cell allocations). struct ValueBufs { xml: Vec, value: String, str_inner: Vec, } impl ValueBufs { fn new() -> Self { Self { xml: Vec::with_capacity(1024), value: String::with_capacity(64), str_inner: Vec::with_capacity(1024), } } } /// An xlsx Cell Iterator pub struct XlsxCellReader<'a, RS> where RS: Read + Seek, { xml: XlReader<'a, RS>, strings: &'a [String], formats: &'a [CellFormat], is_1904: bool, dimensions: Dimensions, row_index: u32, col_index: u32, buf: Vec, cell_buf: Vec, value_bufs: ValueBufs, formulas: Vec>, } impl<'a, RS> XlsxCellReader<'a, RS> where RS: Read + Seek, { pub fn new( mut xml: XlReader<'a, RS>, strings: &'a [String], formats: &'a [CellFormat], is_1904: bool, ) -> Result { let mut buf = Vec::with_capacity(1024); let mut dimensions = Dimensions::default(); let mut sh_type = None; 'xml: loop { buf.clear(); match xml.read_event_into(&mut buf).map_err(XlsxError::Xml)? 
{ Event::Start(e) => match e.local_name().as_ref() { b"dimension" => { for a in e.attributes() { if let Attribute { key: QName(b"ref"), value: rdim, } = a? { dimensions = get_dimension(&rdim)?; continue 'xml; } } return Err(XlsxError::UnexpectedNode("dimension")); } b"sheetData" => break, typ => { if sh_type.is_none() { sh_type = Some(xml.decoder().decode(typ)?.to_string()); } } }, Event::Eof => { if let Some(typ) = sh_type { return Err(XlsxError::NotAWorksheet(typ)); } else { return Err(XlsxError::XmlEof("worksheet")); } } _ => (), } } Ok(Self { xml, strings, formats, is_1904, dimensions, row_index: 0, col_index: 0, buf: Vec::with_capacity(1024), cell_buf: Vec::with_capacity(1024), value_bufs: ValueBufs::new(), formulas: Vec::with_capacity(1024), }) } pub fn dimensions(&self) -> Dimensions { self.dimensions } pub fn next_cell(&mut self) -> Result>>, XlsxError> { loop { self.buf.clear(); match self.xml.read_event_into(&mut self.buf) { Ok(Event::Start(row_element)) if row_element.local_name().as_ref() == b"row" => { let attribute = get_attribute(row_element.attributes(), QName(b"r"))?; if let Some(range) = attribute { let row = get_row(range)?; self.row_index = row; } } Ok(Event::End(row_element)) if row_element.local_name().as_ref() == b"row" => { self.row_index += 1; self.col_index = 0; } Ok(Event::Start(c_element)) if c_element.local_name().as_ref() == b"c" => { // Extract all needed attributes in one pass (avoids calling // `get_attribute` multiple times as each re-iterates). 
let mut pos_attr = None; let mut style_attr = None; let mut type_attr = None; for a in c_element.attributes() { let a = a.map_err(XlsxError::XmlAttr)?; let Cow::Borrowed(val) = a.value else { continue; }; match a.key { QName(b"r") => pos_attr = Some(val), QName(b"s") => style_attr = Some(val), QName(b"t") => type_attr = Some(val), _ => {} } } let pos = if let Some(range) = pos_attr { let (row, col) = get_row_column(range)?; self.col_index = col; (row, col) } else { (self.row_index, self.col_index) }; let mut value = DataRef::Empty; loop { self.cell_buf.clear(); match self.xml.read_event_into(&mut self.cell_buf) { Ok(Event::Start(e)) => { let ctx = WorkbookContext { strings: self.strings, formats: self.formats, is_1904: self.is_1904, }; value = read_value( &ctx, &mut self.xml, &e, style_attr, type_attr, &mut self.value_bufs, )?; } Ok(Event::End(e)) if e.local_name().as_ref() == b"c" => break, Ok(Event::Eof) => return Err(XlsxError::XmlEof("c")), Err(e) => return Err(XlsxError::Xml(e)), _ => (), } } self.col_index += 1; return Ok(Some(Cell::new(pos, value))); } Ok(Event::End(e)) if e.local_name().as_ref() == b"sheetData" => { return Ok(None); } Ok(Event::Eof) => return Err(XlsxError::XmlEof("sheetData")), Err(e) => return Err(XlsxError::Xml(e)), _ => (), } } } pub fn next_formula(&mut self) -> Result>, XlsxError> { loop { self.buf.clear(); match self.xml.read_event_into(&mut self.buf) { Ok(Event::Start(row_element)) if row_element.local_name().as_ref() == b"row" => { let attribute = get_attribute(row_element.attributes(), QName(b"r"))?; if let Some(range) = attribute { let row = get_row(range)?; self.row_index = row; } } Ok(Event::End(row_element)) if row_element.local_name().as_ref() == b"row" => { self.row_index += 1; self.col_index = 0; } Ok(Event::Start(c_element)) if c_element.local_name().as_ref() == b"c" => { let attribute = get_attribute(c_element.attributes(), QName(b"r"))?; let pos = if let Some(range) = attribute { let (row, col) = get_row_column(range)?; 
self.col_index = col; (row, col) } else { (self.row_index, self.col_index) }; let mut value = None; loop { self.cell_buf.clear(); match self.xml.read_event_into(&mut self.cell_buf) { Ok(Event::Start(e)) => { let formula = read_formula(&mut self.xml, &e)?; if let Some(f) = formula.borrow() { value = Some(f.clone()); } if let Ok(Some(b"shared")) = get_attribute(e.attributes(), QName(b"t")) { // shared formula let mut offset_map: HashMap<(u32, u32), (i64, i64)> = HashMap::new(); // shared index let shared_index = match get_attribute(e.attributes(), QName(b"si"))? { Some(res) => match atoi_simd::parse::(res) { Ok(res) => res, Err(_) => { return Err(XlsxError::Unexpected( "si attribute must be a number", )); } }, None => { return Err(XlsxError::Unexpected( "si attribute is mandatory if it is shared", )); } }; // shared reference match get_attribute(e.attributes(), QName(b"ref"))? { Some(res) => { // original reference formula let reference = get_dimension(res)?; for row in reference.start.0..=reference.end.0 { for column in reference.start.1..=reference.end.1 { offset_map.insert( (row, column), ( row as i64 - pos.0 as i64, column as i64 - pos.1 as i64, ), ); } } if let Some(f) = formula.borrow() { if self.formulas.len() <= shared_index { self.formulas.resize(shared_index + 1, None); } self.formulas[shared_index] = Some((f.clone(), offset_map)); } value = formula; } None => { // calculated formula if let Some(Some((f, offset_map))) = self.formulas.get(shared_index) { if let Some(offset) = offset_map.get(&pos) { value = Some(replace_cell_names(f, *offset)?); } } } } } } Ok(Event::End(e)) if e.local_name().as_ref() == b"c" => break, Ok(Event::Eof) => return Err(XlsxError::XmlEof("c")), Err(e) => return Err(XlsxError::Xml(e)), _ => (), } } self.col_index += 1; return Ok(Some(Cell::new(pos, value.unwrap_or_default()))); } Ok(Event::End(e)) if e.local_name().as_ref() == b"sheetData" => { return Ok(None); } Ok(Event::Eof) => return Err(XlsxError::XmlEof("sheetData")), Err(e) 
=> return Err(XlsxError::Xml(e)), _ => (), } } } } /// Reads a cell value using pre-extracted `s` and `t` attributes /// (avoids repeating attribute iteration on the `` element). fn read_value<'s, RS>( ctx: &WorkbookContext<'s>, xml: &mut XlReader<'_, RS>, e: &BytesStart<'_>, style_attr: Option<&[u8]>, type_attr: Option<&[u8]>, bufs: &mut ValueBufs, ) -> Result, XlsxError> where RS: Read + Seek, { Ok(match e.local_name().as_ref() { b"is" => { // inlineStr read_string_with_bufs(xml, e.name(), &mut bufs.xml, &mut bufs.str_inner)? .map_or(DataRef::Empty, DataRef::String) } b"v" => { // value bufs.value.clear(); loop { bufs.xml.clear(); match xml.read_event_into(&mut bufs.xml)? { Event::Text(t) => bufs.value.push_str(&t.xml10_content()?), Event::GeneralRef(e) => unescape_entity_to_buffer(&e, &mut bufs.value)?, Event::End(end) if end.name() == e.name() => break, Event::Eof => return Err(XlsxError::XmlEof("v")), _ => (), } } read_v(ctx, &mut bufs.value, style_attr, type_attr)? } b"f" => { bufs.xml.clear(); xml.read_to_end_into(e.name(), &mut bufs.xml)?; DataRef::Empty } _n => return Err(XlsxError::UnexpectedNode("v, f, or is")), }) } /// Read the contents of a `` cell using pre-extracted `s` and `t` attributes. /// Takes `v` by mutable reference to allow for buffer reuse across cells. fn read_v<'s>( ctx: &WorkbookContext<'s>, v: &mut String, style_attr: Option<&[u8]>, type_attr: Option<&[u8]>, ) -> Result, XlsxError> { let cell_format = match style_attr { Some(style) => { let id = atoi_simd::parse::(style).unwrap_or(0); ctx.formats.get(id) } _ => Some(&CellFormat::Other), }; match type_attr { Some(b"s") => { if v.is_empty() { return Ok(DataRef::Empty); } // Cell value is an index into the shared string table. 
// NOTE(review): the turbofish arguments of `atoi_simd::parse` were stripped from this copy of
// the file; `<usize>` is reconstructed from how `idx` is used below -- confirm against the
// pinned `atoi_simd` version before building.
let idx = atoi_simd::parse::<usize>(v.as_bytes()).unwrap_or(0);
            match ctx.strings.get(idx) {
                Some(shared_string) => Ok(DataRef::SharedString(shared_string)),
                None => Err(XlsxError::Unexpected(
                    "Cell string index not found in shared strings table",
                )),
            }
        }
        Some(b"b") => {
            // boolean
            Ok(DataRef::Bool(v.as_str() != "0"))
        }
        Some(b"e") => {
            // error
            Ok(DataRef::Error(v.parse()?))
        }
        Some(b"d") => {
            // date (needs owned String)
            Ok(DataRef::DateTimeIso(std::mem::take(v)))
        }
        Some(b"str") => {
            // string (needs owned String)
            Ok(DataRef::String(std::mem::take(v)))
        }
        Some(b"n") => {
            // n - number
            if v.is_empty() {
                Ok(DataRef::Empty)
            } else {
                let parsed: Result<f64, _> = fast_float2::parse(v.as_bytes());
                parsed
                    // The error variant carries std's `ParseFloatError`, so re-parse with std
                    // to obtain one. If std accepts text the fast parser rejected, use that
                    // value instead of calling `unwrap_err()` on an `Ok` and panicking.
                    .or_else(|_| v.parse::<f64>().map_err(XlsxError::ParseFloat))
                    .map(|n| format_excel_f64_ref(n, cell_format, ctx.is_1904))
            }
        }
        None => {
            // If type is not known, we try to parse as Float for utility, but fall back to
            // String if this fails.
            let parsed: Result<f64, _> = fast_float2::parse(v.as_bytes());
            parsed
                .map(|n| format_excel_f64_ref(n, cell_format, ctx.is_1904))
                // `or_else` (not `or`): the fallback must only run on failure. An eagerly
                // evaluated `std::mem::take(v)` would empty the caller's reusable buffer on
                // every successfully parsed number and throw its allocation away.
                .or_else(|_| Ok(DataRef::String(std::mem::take(v))))
        }
        Some(b"is") => {
            // this case should be handled in outer loop over cell elements, in which
            // case read_inline_str is called instead. Case included here for completeness.
            Err(XlsxError::Unexpected(
                "called read_value on a cell of type inlineStr",
            ))
        }
        Some(t) => {
            // Unknown `t` attribute: report it verbatim (empty if it is not valid UTF-8).
            let t = std::str::from_utf8(t).unwrap_or("").to_string();
            Err(XlsxError::CellTAttribute(t))
        }
    }
}

/// Reads the formula text of one child element of a `c` (cell) element.
///
/// `is` and `v` children are skipped and yield `Ok(None)`; an `f` child yields its
/// accumulated text with entity references unescaped. Any other element is an
/// [`XlsxError::UnexpectedNode`] error.
fn read_formula<RS>(
    xml: &mut XlReader<'_, RS>,
    e: &BytesStart<'_>,
) -> Result<Option<String>, XlsxError>
where
    RS: Read + Seek,
{
    match e.local_name().as_ref() {
        b"is" | b"v" => {
            xml.read_to_end_into(e.name(), &mut Vec::new())?;
            Ok(None)
        }
        b"f" => {
            let mut f_buf = Vec::with_capacity(512);
            let mut f = String::new();
            loop {
                match xml.read_event_into(&mut f_buf)?
{ Event::Text(t) => f.push_str(&t.xml10_content()?), Event::GeneralRef(e) => unescape_entity_to_buffer(&e, &mut f)?, Event::End(end) if end.name() == e.name() => break, Event::Eof => return Err(XlsxError::XmlEof("f")), _ => (), } f_buf.clear(); } Ok(Some(f)) } _ => Err(XlsxError::UnexpectedNode("v, f, or is")), } } calamine-0.34.0/src/xlsx/mod.rs000064400000000000000000003604761046102023000144130ustar 00000000000000// SPDX-License-Identifier: MIT // // Copyright 2016-2025, Johann Tuffe. #![warn(missing_docs)] mod cells_reader; use std::borrow::Cow; use std::collections::HashMap; use std::io::BufReader; use std::io::{Read, Seek}; use std::str::FromStr; use log::warn; use quick_xml::events::attributes::{AttrError, Attribute, Attributes}; use quick_xml::events::BytesStart; use quick_xml::events::Event; use quick_xml::name::QName; use quick_xml::Decoder; use quick_xml::Reader as XmlReader; use zip::read::{ZipArchive, ZipFile}; use zip::result::ZipError; use crate::datatype::DataRef; use crate::formats::{builtin_format_by_id, detect_custom_number_format, CellFormat}; use crate::utils::{ build_zip_path_cache, cached_zip_path, unescape_entity_to_buffer, unescape_xml, }; use crate::vba::VbaProject; use crate::{ Cell, CellErrorType, Data, Dimensions, HeaderRow, Metadata, Range, Reader, ReaderRef, Sheet, SheetType, SheetVisible, Table, }; pub use cells_reader::XlsxCellReader; pub(crate) type XlReader<'a, RS> = XmlReader>>; /// Maximum number of rows allowed in an XLSX file. pub const MAX_ROWS: u32 = 1_048_576; /// Maximum number of columns allowed in an XLSX file. pub const MAX_COLUMNS: u32 = 16_384; /// An enum for Xlsx specific errors. #[derive(Debug)] pub enum XlsxError { /// A wrapper for a variety of [`std::io::Error`] errors such as file /// permissions when reading an XLSX file. This can be caused by a /// non-existent file or parent directory or, commonly on Windows, if the /// file is already open in Excel. 
Io(std::io::Error), /// A wrapper for a variety of [`zip::result::ZipError`] errors from /// [`zip::ZipWriter`]. These relate to errors arising from reading the XLSX /// file zip container. Zip(zip::result::ZipError), /// A general error when reading a VBA project from an XLSX file. Vba(crate::vba::VbaError), /// A wrapper for a variety of [`quick_xml::Error`] XML parsing errors, but /// most commonly for missing data in the target file. Xml(quick_xml::Error), /// A wrapper for a variety of [`quick_xml::events::attributes::AttrError`] /// errors related to attributes in XML elements. XmlAttr(quick_xml::events::attributes::AttrError), /// A wrapper for a variety of [`std::string::ParseError`] errors when /// parsing strings. Parse(std::string::ParseError), /// A wrapper for a variety of [`std::num::ParseFloatError`] errors when /// parsing floats. ParseFloat(std::num::ParseFloatError), /// A wrapper for a variety of [`std::num::ParseIntError`] errors when /// parsing integers. ParseInt(std::num::ParseIntError), /// Unexpected end of XML file, usually when an end tag is missing. XmlEof(&'static str), /// Unexpected node in XML. UnexpectedNode(&'static str), /// XML file not found in XLSX container. FileNotFound(String), /// Relationship file not found in XLSX container. RelationshipNotFound, /// Non alphanumeric character found when parsing `A1` style range string. Alphanumeric(u8), /// Error when parsing the column name in a `A1` style range string. NumericColumn(u8), /// Missing column name when parsing an `A1` style range string. RangeWithoutColumnComponent, /// Missing row number when parsing an `A1` style range string. RangeWithoutRowComponent, /// Column number exceeds maximum allowed columns. ColumnNumberOverflow, /// Row number exceeds maximum allowed rows. RowNumberOverflow, /// Error when parsing dimensions of a worksheet. DimensionCount(usize), /// Unknown cell type (`t`) attribute. CellTAttribute(String), /// Unexpected XML element or attribute error. 
Unexpected(&'static str), /// Unrecognized worksheet type or state. Unrecognized { /// The data type. typ: &'static str, /// The value found. val: String, }, /// Unrecognized cell error type. CellError(String), /// Workbook is password protected. Password, /// Specified worksheet was not found. WorksheetNotFound(String), /// Specified worksheet Table was not found. TableNotFound(String), /// The specified sheet is not a worksheet. NotAWorksheet(String), /// A wrapper for a variety of [`quick_xml::encoding::EncodingError`] /// encoding errors. Encoding(quick_xml::encoding::EncodingError), /// Specified Pivot Table was not found on worksheet. PivotTableNotFound(String), } from_err!(std::io::Error, XlsxError, Io); from_err!(zip::result::ZipError, XlsxError, Zip); from_err!(crate::vba::VbaError, XlsxError, Vba); from_err!(quick_xml::Error, XlsxError, Xml); from_err!(std::num::ParseFloatError, XlsxError, ParseFloat); from_err!(std::num::ParseIntError, XlsxError, ParseInt); from_err!(quick_xml::encoding::EncodingError, XlsxError, Encoding); from_err!(quick_xml::events::attributes::AttrError, XlsxError, XmlAttr); impl std::fmt::Display for XlsxError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { XlsxError::Io(e) => write!(f, "I/O error: {e}"), XlsxError::Zip(e) => write!(f, "Zip error: {e}"), XlsxError::Xml(e) => write!(f, "Xml error: {e}"), XlsxError::XmlAttr(e) => write!(f, "Xml attribute error: {e}"), XlsxError::Vba(e) => write!(f, "Vba error: {e}"), XlsxError::Parse(e) => write!(f, "Parse string error: {e}"), XlsxError::ParseInt(e) => write!(f, "Parse integer error: {e}"), XlsxError::ParseFloat(e) => write!(f, "Parse float error: {e}"), XlsxError::XmlEof(e) => write!(f, "Unexpected end of xml, expecting ''"), XlsxError::UnexpectedNode(e) => write!(f, "Expecting '{e}' node"), XlsxError::FileNotFound(e) => write!(f, "File not found '{e}'"), XlsxError::RelationshipNotFound => write!(f, "Relationship not found"), 
XlsxError::Alphanumeric(e) => {
                write!(f, "Expecting alphanumeric character, got {e:X}")
            }
            XlsxError::NumericColumn(e) => write!(
                f,
                "Numeric character is not allowed for column name, got {e}",
            ),
            XlsxError::DimensionCount(e) => {
                write!(f, "Range dimension must be lower than 2. Got {e}")
            }
            XlsxError::CellTAttribute(e) => write!(f, "Unknown cell 't' attribute: {e:?}"),
            XlsxError::RangeWithoutColumnComponent => {
                write!(f, "Range is missing the expected column component.")
            }
            XlsxError::RangeWithoutRowComponent => {
                write!(f, "Range is missing the expected row component.")
            }
            XlsxError::ColumnNumberOverflow => write!(f, "column number overflow"),
            XlsxError::RowNumberOverflow => write!(f, "row number overflow"),
            XlsxError::Unexpected(e) => write!(f, "{e}"),
            XlsxError::Unrecognized { typ, val } => write!(f, "Unrecognized {typ}: {val}"),
            XlsxError::CellError(e) => write!(f, "Unsupported cell error value '{e}'"),
            XlsxError::WorksheetNotFound(n) => write!(f, "Worksheet '{n}' not found"),
            XlsxError::Password => write!(f, "Workbook is password protected"),
            XlsxError::TableNotFound(n) => write!(f, "Table '{n}' not found"),
            XlsxError::NotAWorksheet(typ) => write!(f, "Expecting a worksheet, got {typ}"),
            XlsxError::Encoding(e) => write!(f, "XML encoding error: {e}"),
            XlsxError::PivotTableNotFound(pt) => {
                write!(f, "Pivot Table '{pt}' was not found on worksheet")
            }
        }
    }
}

impl std::error::Error for XlsxError {
    /// Returns the wrapped lower-level error for the variants that carry one, and
    /// `None` for the variants that only carry a description.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            XlsxError::Io(e) => Some(e),
            XlsxError::Zip(e) => Some(e),
            XlsxError::Xml(e) => Some(e),
            // Attribute errors wrap an underlying error just like `Xml`; expose it so
            // error-chain walkers see the cause instead of stopping at this level.
            XlsxError::XmlAttr(e) => Some(e),
            XlsxError::Vba(e) => Some(e),
            XlsxError::Parse(e) => Some(e),
            XlsxError::ParseInt(e) => Some(e),
            XlsxError::ParseFloat(e) => Some(e),
            XlsxError::Encoding(e) => Some(e),
            _ => None,
        }
    }
}

// Parses the literal error text stored in a cell into a `CellErrorType`. Any
// unrecognized text is reported as `XlsxError::CellError` carrying that text.
impl FromStr for CellErrorType {
    type Err = XlsxError;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "#DIV/0!" => Ok(CellErrorType::Div0),
            "#N/A" => Ok(CellErrorType::NA),
            "#NAME?" => Ok(CellErrorType::Name),
            "#NULL!"
=> Ok(CellErrorType::Null), "#NUM!" => Ok(CellErrorType::Num), "#REF!" => Ok(CellErrorType::Ref), "#VALUE!" => Ok(CellErrorType::Value), _ => Err(XlsxError::CellError(s.into())), } } } type Tables = Option, Dimensions)>>; /// A struct representing xml zipped excel file /// Xlsx, Xlsm, Xlam pub struct Xlsx { zip: ZipArchive, /// Shared strings strings: Vec, /// Sheets paths sheets: Vec<(String, String)>, /// Tables: Name, Sheet, Columns, Data dimensions tables: Tables, /// Cell (number) formats formats: Vec, /// 1904 datetime system is_1904: bool, /// Metadata metadata: Metadata, /// Pictures #[cfg(feature = "picture")] pictures: Option)>>, /// Merged Regions: Name, Sheet, Merged Dimensions merged_regions: Option>, /// Reader options options: XlsxOptions, /// Cached ZIP path lookups (lowercased normalized β†’ original) zip_path_cache: HashMap, } /// Xlsx reader options #[derive(Debug, Default)] #[non_exhaustive] struct XlsxOptions { pub header_row: HeaderRow, } impl Xlsx { fn read_shared_strings(&mut self) -> Result<(), XlsxError> { let mut xml = match xml_reader(&mut self.zip, "xl/sharedStrings.xml", &self.zip_path_cache) { None => return Ok(()), Some(x) => x?, }; let mut buf = Vec::with_capacity(1024); loop { buf.clear(); match xml.read_event_into(&mut buf) { Ok(Event::Start(e)) if e.local_name().as_ref() == b"sst" => { if let Ok(Some(count)) = get_attribute(e.attributes(), QName(b"uniqueCount")) { if let Ok(n) = atoi_simd::parse::(count) { self.strings.reserve(n); } } break; } Ok(Event::Eof) => return Err(XlsxError::XmlEof("sst")), Err(e) => return Err(XlsxError::Xml(e)), _ => (), } } let mut str_buf = Vec::with_capacity(1024); let mut str_val_buf = Vec::with_capacity(1024); loop { buf.clear(); match xml.read_event_into(&mut buf) { Ok(Event::Start(e)) if e.local_name().as_ref() == b"si" => { if let Some(s) = read_string_with_bufs(&mut xml, e.name(), &mut str_buf, &mut str_val_buf)? 
{ self.strings.push(s); } } Ok(Event::End(e)) if e.local_name().as_ref() == b"sst" => break, Ok(Event::Eof) => return Err(XlsxError::XmlEof("sst")), Err(e) => return Err(XlsxError::Xml(e)), _ => (), } } Ok(()) } fn read_styles(&mut self) -> Result<(), XlsxError> { let mut xml = match xml_reader(&mut self.zip, "xl/styles.xml", &self.zip_path_cache) { None => return Ok(()), Some(x) => x?, }; let mut number_formats = HashMap::new(); let mut buf = Vec::with_capacity(1024); let mut inner_buf = Vec::with_capacity(1024); loop { buf.clear(); match xml.read_event_into(&mut buf) { Ok(Event::Start(e)) if e.local_name().as_ref() == b"numFmts" => loop { inner_buf.clear(); match xml.read_event_into(&mut inner_buf) { Ok(Event::Start(e)) if e.local_name().as_ref() == b"numFmt" => { let mut id = Vec::new(); let mut format = String::new(); for a in e.attributes() { match a? { Attribute { key: QName(b"numFmtId"), value: v, } => id.extend_from_slice(&v), Attribute { key: QName(b"formatCode"), value: v, } => format = xml.decoder().decode(&v)?.into_owned(), _ => (), } } if !format.is_empty() { number_formats.insert(id, format); } } Ok(Event::End(e)) if e.local_name().as_ref() == b"numFmts" => break, Ok(Event::Eof) => return Err(XlsxError::XmlEof("numFmts")), Err(e) => return Err(XlsxError::Xml(e)), _ => (), } }, Ok(Event::Start(e)) if e.local_name().as_ref() == b"cellXfs" => loop { inner_buf.clear(); match xml.read_event_into(&mut inner_buf) { Ok(Event::Start(e)) if e.local_name().as_ref() == b"xf" => { self.formats.push( e.attributes() .filter_map(|a| a.ok()) .find(|a| a.key == QName(b"numFmtId")) .map_or(CellFormat::Other, |a| { match number_formats.get(&*a.value) { Some(fmt) => detect_custom_number_format(fmt), None => builtin_format_by_id(&a.value), } }), ); } Ok(Event::End(e)) if e.local_name().as_ref() == b"cellXfs" => break, Ok(Event::Eof) => return Err(XlsxError::XmlEof("cellXfs")), Err(e) => return Err(XlsxError::Xml(e)), _ => (), } }, Ok(Event::End(e)) if 
e.local_name().as_ref() == b"styleSheet" => break, Ok(Event::Eof) => return Err(XlsxError::XmlEof("styleSheet")), Err(e) => return Err(XlsxError::Xml(e)), _ => (), } } Ok(()) } fn read_workbook(&mut self, relationships: &HashMap, String>) -> Result<(), XlsxError> { let mut xml = match xml_reader(&mut self.zip, "xl/workbook.xml", &self.zip_path_cache) { None => return Ok(()), Some(x) => x?, }; let mut defined_names = Vec::new(); let mut buf = Vec::with_capacity(1024); let mut val_buf = Vec::with_capacity(1024); loop { buf.clear(); match xml.read_event_into(&mut buf) { Ok(Event::Start(e)) if e.local_name().as_ref() == b"sheet" => { let mut name = String::new(); let mut path = String::new(); let mut visible = SheetVisible::Visible; for a in e.attributes() { let a = a?; match a { Attribute { key: QName(b"name"), .. } => { name = a.decode_and_unescape_value(xml.decoder())?.to_string(); } Attribute { key: QName(b"state"), .. } => { visible = match a.decode_and_unescape_value(xml.decoder())?.as_ref() { "visible" => SheetVisible::Visible, "hidden" => SheetVisible::Hidden, "veryHidden" => SheetVisible::VeryHidden, v => { return Err(XlsxError::Unrecognized { typ: "sheet:state", val: v.to_string(), }) } } } Attribute { key: QName(b"r:id" | b"relationships:id"), value: v, } => { let r = &relationships .get(&*v) .ok_or(XlsxError::RelationshipNotFound)?[..]; // target may have prepended "/xl/" or "xl/" path; // strip if present path = if r.starts_with("/xl/") { r[1..].to_string() } else if r.starts_with("xl/") { r.to_string() } else { format!("xl/{r}") }; } _ => (), } } let typ = match path.split('/').nth(1) { Some("worksheets") => SheetType::WorkSheet, Some("chartsheets") => SheetType::ChartSheet, Some("dialogsheets") => SheetType::DialogSheet, _ => { return Err(XlsxError::Unrecognized { typ: "sheet:type", val: path.to_string(), }) } }; self.metadata.sheets.push(Sheet { name: name.to_string(), typ, visible, }); self.sheets.push((name, path)); } Ok(Event::Start(e)) if 
e.local_name().as_ref() == b"workbookPr" => {
                    // Matched on the local name, like every other element in this reader,
                    // so a namespace-prefixed `workbookPr` still sets the 1904 date flag.
                    self.is_1904 = match e.try_get_attribute("date1904")? {
                        Some(c) => ["1", "true"].contains(
                            &c.decode_and_unescape_value(xml.decoder())
                                .map_err(XlsxError::Xml)?
                                .as_ref(),
                        ),
                        None => false,
                    };
                }
                Ok(Event::Start(e)) if e.local_name().as_ref() == b"definedName" => {
                    // Entries without a readable `name` attribute are skipped.
                    if let Some(a) = e
                        .attributes()
                        .filter_map(std::result::Result::ok)
                        .find(|a| a.key == QName(b"name"))
                    {
                        let name = a.decode_and_unescape_value(xml.decoder())?.to_string();
                        let mut value = String::new();
                        loop {
                            // Clear per event, as the other read loops do, so the scratch
                            // buffer does not accumulate every event of the element.
                            val_buf.clear();
                            match xml.read_event_into(&mut val_buf)? {
                                Event::Text(t) => value.push_str(&t.xml10_content()?),
                                Event::GeneralRef(e) => unescape_entity_to_buffer(&e, &mut value)?,
                                Event::End(end) if end.name() == e.name() => break,
                                Event::Eof => return Err(XlsxError::XmlEof("workbook")),
                                _ => (),
                            }
                        }
                        defined_names.push((name, value));
                    }
                }
                Ok(Event::End(e)) if e.local_name().as_ref() == b"workbook" => break,
                Ok(Event::Eof) => return Err(XlsxError::XmlEof("workbook")),
                Err(e) => return Err(XlsxError::Xml(e)),
                _ => (),
            }
        }
        self.metadata.names = defined_names;
        Ok(())
    }

    // Reads `xl/_rels/workbook.xml.rels` and returns a map from each relationship's
    // raw `Id` bytes to its decoded `Target`. A missing file is an error.
    fn read_relationships(&mut self) -> Result<HashMap<Vec<u8>, String>, XlsxError> {
        let mut xml = match xml_reader(
            &mut self.zip,
            "xl/_rels/workbook.xml.rels",
            &self.zip_path_cache,
        ) {
            None => {
                return Err(XlsxError::FileNotFound(
                    "xl/_rels/workbook.xml.rels".to_string(),
                ));
            }
            Some(x) => x?,
        };
        let mut relationships = HashMap::new();
        let mut buf = Vec::with_capacity(64);
        loop {
            buf.clear();
            match xml.read_event_into(&mut buf) {
                Ok(Event::Start(e)) if e.local_name().as_ref() == b"Relationship" => {
                    let mut id = Vec::new();
                    let mut target = String::new();
                    for a in e.attributes() {
                        match a?
{ Attribute { key: QName(b"Id"), value: v, } => id.extend_from_slice(&v), Attribute { key: QName(b"Target"), value: v, } => target = xml.decoder().decode(&v)?.into_owned(), _ => (), } } relationships.insert(id, target); } Ok(Event::End(e)) if e.local_name().as_ref() == b"Relationships" => break, Ok(Event::Eof) => return Err(XlsxError::XmlEof("Relationships")), Err(e) => return Err(XlsxError::Xml(e)), _ => (), } } Ok(relationships) } // sheets must be added before this is called!! fn read_table_metadata(&mut self) -> Result<(), XlsxError> { let mut new_tables = Vec::new(); for (sheet_name, sheet_path) in &self.sheets { let last_folder_index = sheet_path.rfind('/').expect("should be in a folder"); let (base_folder, file_name) = sheet_path.split_at(last_folder_index); let rel_path = format!("{base_folder}/_rels{file_name}.rels"); let mut table_locations = Vec::new(); let mut buf = Vec::with_capacity(64); // we need another mutable borrow of self.zip later so we enclose this borrow within braces { let mut xml = match xml_reader(&mut self.zip, &rel_path, &self.zip_path_cache) { None => continue, Some(x) => x?, }; loop { buf.clear(); match xml.read_event_into(&mut buf) { Ok(Event::Start(e)) if e.local_name().as_ref() == b"Relationship" => { let mut id = Vec::new(); let mut target = String::new(); let mut table_type = false; for a in e.attributes() { match a? { Attribute { key: QName(b"Id"), value: v, } => id.extend_from_slice(&v), Attribute { key: QName(b"Target"), value: v, } => target = xml.decoder().decode(&v)?.into_owned(), Attribute { key: QName(b"Type"), value: v, } => table_type = *v == b"http://schemas.openxmlformats.org/officeDocument/2006/relationships/table"[..], _ => (), } } if table_type { if target.starts_with("../") { // Relative path. 
let new_index = base_folder.rfind('/').expect("Must be a parent folder"); let full_path = format!("{}{}", &base_folder[..new_index], &target[2..]); table_locations.push(full_path); } else if let Some(stripped) = target.strip_prefix('/') { // Absolute path. table_locations.push(stripped.to_string()); } else if !target.is_empty() { // Assume absolute path without leading slash. table_locations.push(target); } } } Ok(Event::End(e)) if e.local_name().as_ref() == b"Relationships" => break, Ok(Event::Eof) => return Err(XlsxError::XmlEof("Relationships")), Err(e) => return Err(XlsxError::Xml(e)), _ => (), } } } for table_file in table_locations { let mut xml = match xml_reader(&mut self.zip, &table_file, &self.zip_path_cache) { None => continue, Some(x) => x?, }; let mut column_names = Vec::new(); let mut table_meta = InnerTableMetadata::new(); loop { buf.clear(); match xml.read_event_into(&mut buf) { Ok(Event::Start(e)) if e.local_name().as_ref() == b"table" => { for a in e.attributes() { match a? 
{
                                    Attribute {
                                        key: QName(b"displayName"),
                                        value: v,
                                    } => {
                                        table_meta.display_name =
                                            xml.decoder().decode(&v)?.into_owned();
                                    }
                                    Attribute {
                                        key: QName(b"ref"),
                                        value: v,
                                    } => {
                                        table_meta.ref_cells =
                                            xml.decoder().decode(&v)?.into_owned();
                                    }
                                    Attribute {
                                        key: QName(b"headerRowCount"),
                                        value: v,
                                    } => {
                                        table_meta.header_row_count =
                                            xml.decoder().decode(&v)?.parse()?;
                                    }
                                    Attribute {
                                        key: QName(b"totalsRowCount"),
                                        value: v,
                                    } => {
                                        table_meta.totals_row_count =
                                            xml.decoder().decode(&v)?.parse()?;
                                    }
                                    _ => (),
                                }
                            }
                        }
                        Ok(Event::Start(e)) if e.local_name().as_ref() == b"tableColumn" => {
                            for a in e.attributes().flatten() {
                                if let Attribute {
                                    key: QName(b"name"),
                                    value: v,
                                } = a
                                {
                                    column_names.push(xml.decoder().decode(&v)?.into_owned());
                                }
                            }
                        }
                        Ok(Event::End(e)) if e.local_name().as_ref() == b"table" => break,
                        Ok(Event::Eof) => return Err(XlsxError::XmlEof("Table")),
                        Err(e) => return Err(XlsxError::Xml(e)),
                        _ => (),
                    }
                }
                // `ref` covers the whole table; shrink it to the data rows only.
                let mut dims = get_dimension(table_meta.ref_cells.as_bytes())?;
                // Skip the header rows at the top of the range.
                if table_meta.header_row_count != 0 {
                    dims.start.0 += table_meta.header_row_count;
                }
                // Drop the totals rows at the bottom of the range. This must use the
                // totals count: subtracting the header count gives the wrong end row
                // whenever the two counts differ (e.g. a table with totals but no header).
                if table_meta.totals_row_count != 0 {
                    dims.end.0 -= table_meta.totals_row_count;
                }
                new_tables.push((
                    table_meta.display_name,
                    sheet_name.clone(),
                    column_names,
                    dims,
                ));
            }
        }
        self.tables = Some(new_tables);
        Ok(())
    }

    // Read pictures.
    //
    // Scans every archive entry whose name starts with `xl/media` and, when its
    // extension is one of the listed image types, stores `(extension, raw bytes)`.
    // `self.pictures` is only set when at least one picture was found.
    #[cfg(feature = "picture")]
    fn read_pictures(&mut self) -> Result<(), XlsxError> {
        let mut pics = Vec::new();
        for i in 0..self.zip.len() {
            let mut zfile = self.zip.by_index(i)?;
            let zname = zfile.name();
            if zname.starts_with("xl/media") {
                // The extension is whatever follows the last `.` in the entry name.
                if let Some(ext) = zname.split('.').next_back() {
                    if [
                        "emf", "wmf", "pict", "jpeg", "jpg", "png", "dib", "gif", "tiff", "eps",
                        "bmp", "wpg",
                    ]
                    .contains(&ext)
                    {
                        let ext = ext.to_string();
                        let mut buf: Vec<u8> = Vec::new();
                        zfile.read_to_end(&mut buf)?;
                        pics.push((ext, buf));
                    }
                }
            }
        }
        if !pics.is_empty() {
            self.pictures = Some(pics);
        }
        Ok(())
    }

    /// Get all Pivot Tables in a workbook.
/// /// # Note /// /// This function is required before working with Pivot Table Data due to reliance on metadata in `PivotTableRef`. pub fn pivot_tables(&mut self) -> Result where RS: Read + Seek, { let mut pivot_tables = PivotTables::new(); for (sheet_name, sheet_path) in self.sheets.iter() { for pivot_path in find_pivot_table_paths_from_sheet(&mut self.zip, sheet_path, &self.zip_path_cache)? .iter() { let name = find_pivot_name_from_pivot_path( &mut self.zip, pivot_path, &self.zip_path_cache, )?; let definition_cache_path = find_pivot_cache_definitions_from_pivot( &mut self.zip, pivot_path, &self.zip_path_cache, )?; let record_cache_path = find_pivot_cache_records_from_definitions( &mut self.zip, &definition_cache_path, &self.zip_path_cache, )?; pivot_tables.push(PivotTableRef::new( name, sheet_name.to_string(), record_cache_path, definition_cache_path, )); } } Ok(pivot_tables) } /// Get an iterator over a pivot table's cached data. /// /// Invalid Pivot Table names will return None. /// /// # Examples /// /// An example of retrieving pivot data for a Pivot Table named PivotTable1 on sheet PivotSheet1. /// /// ``` /// use calamine::{open_workbook, Error, Xlsx}; /// /// fn main() -> Result<(), Error> { /// /// let path = "tests/pivots.xlsx"; /// /// // Open the workbook. /// let mut workbook: Xlsx<_> = open_workbook(path)?; /// /// // Must retrieve necessary metadata before reading Pivot Table data. /// let pivot_tables = workbook.pivot_tables()?; /// /// // Get the Pivot Table data by referencing the pivot table name and the worksheet it resides. /// for row in workbook.pivot_table_data(&pivot_tables, "PivotSheet1", "PivotTable1")? { /// // Do something. 
/// } /// /// Ok(()) /// } /// ``` /// pub fn pivot_table_data( &'_ mut self, pivot_tables: &PivotTables, sheet_name: &str, pivot_table_name: &str, ) -> Result, XlsxError> { match pivot_tables.0.iter().find(|pivot_table| { pivot_table.name() == pivot_table_name && pivot_table.sheet() == sheet_name }) { Some(pt_ref) => get_pivot_cache_iter(self, pt_ref), None => Err(XlsxError::PivotTableNotFound(pivot_table_name.to_string())), } } // sheets must be added before this is called!! fn read_merged_regions(&mut self) -> Result<(), XlsxError> { let mut regions = Vec::new(); for (sheet_name, sheet_path) in &self.sheets { // we need another mutable borrow of self.zip later so we enclose this borrow within braces { let mut xml = match xml_reader(&mut self.zip, sheet_path, &self.zip_path_cache) { None => continue, Some(x) => x?, }; let mut buf = Vec::new(); loop { buf.clear(); match xml.read_event_into(&mut buf) { Ok(Event::Start(e)) if e.local_name() == QName(b"mergeCell").into() => { if let Some(attr) = get_attribute(e.attributes(), QName(b"ref"))? { let dimension = get_dimension(attr)?; regions.push(( sheet_name.to_string(), sheet_path.to_string(), dimension, )); } } Ok(Event::Eof) => break, Err(e) => return Err(XlsxError::Xml(e)), _ => (), } } } } self.merged_regions = Some(regions); Ok(()) } #[inline] fn get_table_meta(&self, table_name: &str) -> Result { let match_table_meta = self .tables .as_ref() .expect("Tables must be loaded before they are referenced") .iter() .find(|(table, ..)| table == table_name) .ok_or_else(|| XlsxError::TableNotFound(table_name.into()))?; let name = match_table_meta.0.to_owned(); let sheet_name = match_table_meta.1.clone(); let columns = match_table_meta.2.clone(); let dimensions = Dimensions { start: match_table_meta.3.start, end: match_table_meta.3.end, }; Ok(TableMetadata { name, sheet_name, columns, dimensions, }) } /// Load the merged regions in the workbook. 
/// /// A merged region in Excel is a range of cells that have been merged to /// act as a single cell. It is often used to create headers or titles that /// span multiple columns or rows. /// /// This method must be called before accessing the merged regions using the /// methods: /// /// - [`Xlsx::merged_regions()`]. /// - [`Xlsx::merged_regions_by_sheet()`]. /// /// These methods are explained below. /// /// # Errors /// /// - [`XlsxError::Xml`]. /// pub fn load_merged_regions(&mut self) -> Result<(), XlsxError> { if self.merged_regions.is_none() { self.read_merged_regions() } else { Ok(()) } } /// Get the merged regions for all the worksheets in a workbook. /// /// The function returns a ref to a vector of tuples containing the sheet /// name, the sheet path, and the [`Dimensions`] of the merged region. The /// middle element of the tuple can generally be ignored. /// /// The [`Xlsx::load_merged_regions()`] method must be called before calling /// this method. /// /// # Examples /// /// An example of getting all the merged regions in an Excel workbook. /// /// ``` /// use calamine::{open_workbook, Error, Xlsx}; /// /// fn main() -> Result<(), Error> { /// let path = "tests/merged_range.xlsx"; /// /// // Open the workbook. /// let mut workbook: Xlsx<_> = open_workbook(path)?; /// /// // Load the merged regions in the workbook. /// workbook.load_merged_regions()?; /// /// // Get all the merged regions in the workbook. /// let merged_regions = workbook.merged_regions(); /// /// // Print the sheet name and dimensions of each merged region. 
///     for (sheet_name, _, dimensions) in merged_regions {
///         println!("{sheet_name}: {dimensions:?}");
///     }
///
///     Ok(())
/// }
///
/// ```
///
/// Output:
///
/// ```text
/// Sheet1: Dimensions { start: (0, 7), end: (1, 7) }
/// Sheet1: Dimensions { start: (0, 0), end: (1, 0) }
/// Sheet1: Dimensions { start: (0, 1), end: (1, 1) }
/// Sheet1: Dimensions { start: (0, 2), end: (1, 3) }
/// Sheet1: Dimensions { start: (2, 2), end: (2, 3) }
/// Sheet1: Dimensions { start: (3, 2), end: (3, 3) }
/// Sheet1: Dimensions { start: (0, 4), end: (1, 4) }
/// Sheet1: Dimensions { start: (0, 5), end: (1, 5) }
/// Sheet1: Dimensions { start: (0, 6), end: (1, 6) }
/// Sheet2: Dimensions { start: (0, 0), end: (3, 0) }
/// Sheet2: Dimensions { start: (2, 2), end: (3, 3) }
/// Sheet2: Dimensions { start: (0, 5), end: (3, 7) }
/// Sheet2: Dimensions { start: (0, 1), end: (1, 1) }
/// Sheet2: Dimensions { start: (0, 2), end: (1, 3) }
/// Sheet2: Dimensions { start: (0, 4), end: (1, 4) }
/// ```
///
/// # Panics
///
/// Panics if the merged regions have not been loaded via
/// [`Xlsx::load_merged_regions()`].
///
pub fn merged_regions(&self) -> &Vec<(String, String, Dimensions)> {
    self.merged_regions
        .as_ref()
        .expect("Merged Regions must be loaded before they are referenced")
}

/// Get the merged regions in a workbook by the sheet name.
///
/// The function returns a vector of tuples containing the sheet name, the
/// sheet path, and the [`Dimensions`] of the merged region. The first two
/// elements of the tuple can generally be ignored.
///
/// The [`Xlsx::load_merged_regions()`] method must be called before calling
/// this method.
///
/// # Parameters
///
/// - `name`: The name of the worksheet to get the merged regions from.
///
/// # Examples
///
/// An example of getting the merged regions in an Excel workbook, by individual
/// worksheet.
///
/// ```
/// use calamine::{open_workbook, Error, Reader, Xlsx};
///
/// fn main() -> Result<(), Error> {
///     let path = "tests/merged_range.xlsx";
///
///     // Open the workbook.
///     let mut workbook: Xlsx<_> = open_workbook(path)?;
///
///     // Get the names of all the sheets in the workbook.
///     let sheet_names = workbook.sheet_names();
///
///     // Load the merged regions in the workbook.
///     workbook.load_merged_regions()?;
///
///     for sheet_name in &sheet_names {
///         println!("{sheet_name}: ");
///
///         // Get the merged regions in the current sheet.
///         let merged_regions = workbook.merged_regions_by_sheet(sheet_name);
///
///         for (_, _, dimensions) in &merged_regions {
///             // Print the dimensions of each merged region.
///             println!(" {dimensions:?}");
///         }
///     }
///
///     Ok(())
/// }
///
/// ```
///
/// Output:
///
///
/// ```text
/// Sheet1:
/// Dimensions { start: (0, 7), end: (1, 7) }
/// Dimensions { start: (0, 0), end: (1, 0) }
/// Dimensions { start: (0, 1), end: (1, 1) }
/// Dimensions { start: (0, 2), end: (1, 3) }
/// Dimensions { start: (2, 2), end: (2, 3) }
/// Dimensions { start: (3, 2), end: (3, 3) }
/// Dimensions { start: (0, 4), end: (1, 4) }
/// Dimensions { start: (0, 5), end: (1, 5) }
/// Dimensions { start: (0, 6), end: (1, 6) }
/// Sheet2:
/// Dimensions { start: (0, 0), end: (3, 0) }
/// Dimensions { start: (2, 2), end: (3, 3) }
/// Dimensions { start: (0, 5), end: (3, 7) }
/// Dimensions { start: (0, 1), end: (1, 1) }
/// Dimensions { start: (0, 2), end: (1, 3) }
/// Dimensions { start: (0, 4), end: (1, 4) }
/// ```
///
pub fn merged_regions_by_sheet(&self, name: &str) -> Vec<(&String, &String, &Dimensions)> {
    let all_regions = self.merged_regions();
    all_regions
        .iter()
        // Keep only the regions recorded for the requested sheet.
        .filter(|region| region.0 == name)
        // Hand out references to the stored fields rather than copies.
        .map(|region| (&region.0, &region.1, &region.2))
        .collect()
}

/// Load the worksheet tables from the XLSX file.
///
/// Tables in Excel are a way of grouping a range of cells into a single
/// entity that has common formatting or that can be referenced in formulas.
/// In `calamine`, tables can be read as a [`Table`] object and converted to
/// a data [`Range`] for further processing.
///
/// Calamine does not automatically load table data from a workbook to avoid
/// unnecessary overhead. Instead you must explicitly load the table data
/// using the `Xlsx::load_tables()` method. Once the tables have been loaded
/// the following methods can be used to extract and work with individual
/// tables:
///
/// - [`Xlsx::table_by_name()`].
/// - [`Xlsx::table_by_name_ref()`].
/// - [`Xlsx::table_names()`].
/// - [`Xlsx::table_names_in_sheet()`].
///
/// These methods are explained below. See also the [`Table`] struct for
/// additional methods that can be used when working with tables.
///
/// # Errors
///
/// - [`XlsxError::XmlAttr`].
/// - [`XlsxError::XmlEof`].
/// - [`XlsxError::Xml`].
///
///
pub fn load_tables(&mut self) -> Result<(), XlsxError> {
    // Table metadata is cached after the first successful read.
    if self.tables.is_some() {
        return Ok(());
    }
    self.read_table_metadata()
}

/// Get the names of all the tables in the workbook.
///
/// Read all the table names in the workbook. This can be used in
/// conjunction with [`Xlsx::table_by_name()`] to iterate over the tables in
/// the workbook.
///
/// # Panics
///
/// Panics if tables have not been loaded via [`Xlsx::load_tables()`].
///
/// # Examples
///
/// An example of getting the names of all the tables in an Excel workbook.
///
/// ```
/// use calamine::{open_workbook, Error, Xlsx};
///
/// fn main() -> Result<(), Error> {
///     let path = "tests/table-multiple.xlsx";
///
///     // Open the workbook.
///     let mut workbook: Xlsx<_> = open_workbook(path)?;
///
///     // Load the tables in the workbook.
///     workbook.load_tables()?;
///
///     // Get all the table names in the workbook.
///     let table_names = workbook.table_names();
///
///     // Check the table names.
///     assert_eq!(
///         table_names,
///         vec!["Inventory", "Pricing", "Sales_Bob", "Sales_Alice"]
///     );
///
///     Ok(())
/// }
/// ```
///
pub fn table_names(&self) -> Vec<&String> {
    let tables = self
        .tables
        .as_ref()
        .expect("Tables must be loaded before they are referenced");
    // The table name is the first field of each stored entry.
    tables.iter().map(|table| &table.0).collect()
}

/// Get the names of all the tables in a worksheet.
///
/// Read all the table names in a worksheet. This can be used in conjunction
/// with [`Xlsx::table_by_name()`] to iterate over the tables in the
/// worksheet.
///
/// # Parameters
///
/// - `sheet_name`: The name of the worksheet to get the table names from.
///
/// # Panics
///
/// Panics if tables have not been loaded via [`Xlsx::load_tables()`].
///
/// # Examples
///
/// An example of getting the names of all the tables in an Excel workbook,
/// sheet by sheet.
///
/// ```
/// use calamine::{open_workbook, Error, Reader, Xlsx};
///
/// fn main() -> Result<(), Error> {
///     let path = "tests/table-multiple.xlsx";
///
///     // Open the workbook.
///     let mut workbook: Xlsx<_> = open_workbook(path)?;
///
///     // Get the names of all the sheets in the workbook.
///     let sheet_names = workbook.sheet_names();
///
///     // Load the tables in the workbook.
///     workbook.load_tables()?;
///
///     for sheet_name in &sheet_names {
///         // Get the table names in the current sheet.
///         let table_names = workbook.table_names_in_sheet(sheet_name);
///
///         // Print the associated table names.
///         println!("{sheet_name} contains tables: {table_names:?}");
///     }
///
///     Ok(())
/// }
/// ```
///
/// Output:
///
/// ```text
/// Sheet1 contains tables: ["Inventory"]
/// Sheet2 contains tables: ["Pricing"]
/// Sheet3 contains tables: ["Sales_Bob", "Sales_Alice"]
/// ```
///
pub fn table_names_in_sheet(&self, sheet_name: &str) -> Vec<&String> {
    let tables = self
        .tables
        .as_ref()
        .expect("Tables must be loaded before they are referenced");
    tables
        .iter()
        // Field 1 of each entry is the owning sheet, field 0 the table name.
        .filter(|table| table.1 == sheet_name)
        .map(|table| &table.0)
        .collect()
}

/// Get a worksheet table by name.
///
/// This method retrieves a [`Table`] from the workbook by its name. The
/// table will contain an owned copy of the worksheet data in the table
/// range.
///
/// # Parameters
///
/// - `table_name`: The name of the table to retrieve.
///
/// # Errors
///
/// - [`XlsxError::TableNotFound`].
/// - [`XlsxError::NotAWorksheet`].
///
/// # Panics
///
/// Panics if tables have not been loaded via [`Xlsx::load_tables()`].
///
/// # Examples
///
/// An example of getting an Excel worksheet table by its name. The file in
/// this example contains 4 tables spread across 3 worksheets. This example
/// gets an owned copy of the worksheet data in the table area.
///
/// ```
/// use calamine::{open_workbook, Data, Error, Xlsx};
///
/// fn main() -> Result<(), Error> {
///     let path = "tests/table-multiple.xlsx";
///
///     // Open the workbook.
///     let mut workbook: Xlsx<_> = open_workbook(path)?;
///
///     // Load the tables in the workbook.
///     workbook.load_tables()?;
///
///     // Get the table by name.
///     let table = workbook.table_by_name("Inventory")?;
///
///     // Get the data range of the table. The data type is `&Range<Data>`.
///     let data_range = table.data();
///
///     // Do something with the data using the `Range` APIs. In this case
///     // we will just check for a cell value.
///     assert_eq!(
///         data_range.get((0, 1)),
///         Some(&Data::String("Apple".to_string()))
///     );
///
///     Ok(())
/// }
/// ```
///
pub fn table_by_name(&mut self, table_name: &str) -> Result<Table<Data>, XlsxError> {
    let TableMetadata {
        name,
        sheet_name,
        columns,
        dimensions,
    } = self.get_table_meta(table_name)?;
    let Dimensions { start, end } = dimensions;
    // Read the whole sheet as owned data, then cut out the table's rectangle.
    let range = self.worksheet_range(&sheet_name)?;
    let tbl_rng = range.range(start, end);
    Ok(Table {
        name,
        sheet_name,
        columns,
        data: tbl_rng,
    })
}

/// Get a worksheet table by name, with referenced data.
///
/// This method retrieves a [`Table`] from the workbook by its name. The
/// table will contain a borrowed/referenced copy of the worksheet data in
/// the table range. This is more efficient than [`Xlsx::table_by_name()`]
/// for large tables.
///
/// # Parameters
///
/// - `table_name`: The name of the table to retrieve.
///
/// # Errors
///
/// - [`XlsxError::TableNotFound`].
/// - [`XlsxError::NotAWorksheet`].
///
/// # Panics
///
/// Panics if tables have not been loaded via [`Xlsx::load_tables()`].
///
/// # Examples
///
/// An example of getting an Excel worksheet table by its name. The file in
/// this example contains 4 tables spread across 3 worksheets. This example
/// gets a borrowed/referenced copy of the worksheet data in the table area.
///
/// ```
/// use calamine::{open_workbook, DataRef, Error, Xlsx};
///
/// fn main() -> Result<(), Error> {
///     let path = "tests/table-multiple.xlsx";
///
///     // Open the workbook.
///     let mut workbook: Xlsx<_> = open_workbook(path)?;
///
///     // Load the tables in the workbook.
///     workbook.load_tables()?;
///
///     // Get the table by name.
///     let table = workbook.table_by_name_ref("Inventory")?;
///
///     // Get the data range of the table. The data type is `&Range<DataRef<'_>>`.
///     let data_range = table.data();
///
///     // Do something with the data using the `Range` APIs. In this case
///     // we will just check for a cell value.
///     assert_eq!(
///         data_range.get((0, 1)),
///         Some(&DataRef::SharedString("Apple"))
///     );
///
///     Ok(())
/// }
/// ```
///
pub fn table_by_name_ref(&mut self, table_name: &str) -> Result<Table<DataRef<'_>>, XlsxError> {
    let TableMetadata {
        name,
        sheet_name,
        columns,
        dimensions,
    } = self.get_table_meta(table_name)?;
    let Dimensions { start, end } = dimensions;
    // Read the whole sheet as borrowed data, then cut out the table's rectangle.
    let range = self.worksheet_range_ref(&sheet_name)?;
    let tbl_rng = range.range(start, end);
    Ok(Table {
        name,
        sheet_name,
        columns,
        data: tbl_rng,
    })
}

/// Get the merged cells/regions in a workbook by the sheet name.
///
/// Merged cells in Excel are a range of cells that have been merged to act
/// as a single cell. It is often used to create headers or titles that span
/// multiple columns or rows.
///
/// The function returns a vector of [`Dimensions`] of the merged region.
/// This is wrapped in a [`Result`] and an [`Option`].
///
/// # Parameters
///
/// - `sheet_name`: The name of the worksheet to get the merged regions
///   from.
///
/// # Errors
///
/// - [`XlsxError::Xml`].
///
/// # Examples
///
/// An example of getting the merged regions/cells in an Excel workbook, by
/// individual worksheet.
///
/// ```
/// use calamine::{open_workbook, Error, Reader, Xlsx};
///
/// fn main() -> Result<(), Error> {
///     let path = "tests/merged_range.xlsx";
///
///     // Open the workbook.
///     let mut workbook: Xlsx<_> = open_workbook(path)?;
///
///     // Get the names of all the sheets in the workbook.
///     let sheet_names = workbook.sheet_names();
///
///     for sheet_name in &sheet_names {
///         println!("{sheet_name}: ");
///
///         // Get the merged cells in the current sheet.
///         let merge_cells = workbook.worksheet_merge_cells(sheet_name);
///
///         if let Some(dimensions) = merge_cells {
///             let dimensions = dimensions?;
///
///             // Print the dimensions of each merged region.
///             for dimension in &dimensions {
///                 println!(" {dimension:?}");
///             }
///         }
///     }
///
///     Ok(())
/// }
///
/// ```
///
/// Output:
///
/// ```text
/// Sheet1:
/// Dimensions { start: (0, 7), end: (1, 7) }
/// Dimensions { start: (0, 0), end: (1, 0) }
/// Dimensions { start: (0, 1), end: (1, 1) }
/// Dimensions { start: (0, 2), end: (1, 3) }
/// Dimensions { start: (2, 2), end: (2, 3) }
/// Dimensions { start: (3, 2), end: (3, 3) }
/// Dimensions { start: (0, 4), end: (1, 4) }
/// Dimensions { start: (0, 5), end: (1, 5) }
/// Dimensions { start: (0, 6), end: (1, 6) }
/// Sheet2:
/// Dimensions { start: (0, 0), end: (3, 0) }
/// Dimensions { start: (2, 2), end: (3, 3) }
/// Dimensions { start: (0, 5), end: (3, 7) }
/// Dimensions { start: (0, 1), end: (1, 1) }
/// Dimensions { start: (0, 2), end: (1, 3) }
/// Dimensions { start: (0, 4), end: (1, 4) }
/// ```
///
pub fn worksheet_merge_cells(
    &mut self,
    name: &str,
) -> Option<Result<Vec<Dimensions>, XlsxError>> {
    // `None` when the sheet name is unknown or its XML part is missing.
    let (_, path) = self.sheets.iter().find(|(n, _)| n == name)?;
    let xml = xml_reader(&mut self.zip, path, &self.zip_path_cache);

    xml.map(|xml| {
        let mut xml = xml?;
        let mut buffer = Vec::new();

        loop {
            buffer.clear();

            match xml.read_event_into(&mut buffer) {
                Ok(Event::Start(event)) if event.local_name().as_ref() == b"mergeCells" => {
                    // A sheet has a single `mergeCells` container. Report a
                    // malformed container to the caller instead of silently
                    // returning an empty list.
                    return read_merge_cells(&mut xml);
                }
                // No `mergeCells` element: the sheet has no merged regions.
                Ok(Event::Eof) => return Ok(Vec::new()),
                Err(e) => return Err(XlsxError::Xml(e)),
                _ => (),
            }
        }
    })
}

/// Get the merged cells/regions in a workbook by the sheet index.
///
/// Merged cells in Excel are a range of cells that have been merged to act
/// as a single cell. It is often used to create headers or titles that span
/// multiple columns or rows.
///
/// The function returns a vector of [`Dimensions`] of the merged region.
/// This is wrapped in a [`Result`] and an [`Option`].
///
/// # Parameters
///
/// - `sheet_index`: The zero index of the worksheet to get the merged
///   regions from.
///
/// # Errors
///
/// - [`XlsxError::Xml`].
///
/// # Examples
///
/// An example of getting the merged regions/cells in an Excel workbook, by
/// worksheet index.
///
/// ```
/// use calamine::{open_workbook, Error, Xlsx};
///
/// fn main() -> Result<(), Error> {
///     let path = "tests/merged_range.xlsx";
///
///     // Open the workbook.
///     let mut workbook: Xlsx<_> = open_workbook(path)?;
///
///     // Get the merged cells in the first worksheet.
///     let merge_cells = workbook.worksheet_merge_cells_at(0);
///
///     if let Some(dimensions) = merge_cells {
///         let dimensions = dimensions?;
///
///         // Print the dimensions of each merged region.
///         for dimension in &dimensions {
///             println!("{dimension:?}");
///         }
///     }
///
///     Ok(())
/// }
///
/// ```
///
/// Output:
///
/// ```text
/// Dimensions { start: (0, 7), end: (1, 7) }
/// Dimensions { start: (0, 0), end: (1, 0) }
/// Dimensions { start: (0, 1), end: (1, 1) }
/// Dimensions { start: (0, 2), end: (1, 3) }
/// Dimensions { start: (2, 2), end: (2, 3) }
/// Dimensions { start: (3, 2), end: (3, 3) }
/// Dimensions { start: (0, 4), end: (1, 4) }
/// Dimensions { start: (0, 5), end: (1, 5) }
/// Dimensions { start: (0, 6), end: (1, 6) }
/// ```
///
pub fn worksheet_merge_cells_at(
    &mut self,
    sheet_index: usize,
) -> Option<Result<Vec<Dimensions>, XlsxError>> {
    // Resolve the index to a sheet name; `None` when the index is out of range.
    let name = self
        .metadata()
        .sheets
        .get(sheet_index)
        .map(|sheet| sheet.name.clone())?;
    self.worksheet_merge_cells(&name)
}
}

/// Owned description of a table, as returned by `get_table_meta`.
struct TableMetadata {
    name: String,
    sheet_name: String,
    columns: Vec<String>,
    dimensions: Dimensions,
}

/// Raw table attributes collected while parsing a table part.
struct InnerTableMetadata {
    display_name: String,
    ref_cells: String,
    header_row_count: u32,
    totals_row_count: u32,
}

impl InnerTableMetadata {
    /// Create empty metadata with one header row and no totals row.
    fn new() -> Self {
        Self {
            display_name: String::new(),
            ref_cells: String::new(),
            header_row_count: 1,
            totals_row_count: 0,
        }
    }
}

impl<RS: Read + Seek> Xlsx<RS> {
    /// Get a reader over all used cells in the given worksheet cell reader
    ///
    /// # Errors
    ///
    /// - [`XlsxError::WorksheetNotFound`] if no sheet has this name or its
    ///   XML part is missing from the archive.
    pub fn worksheet_cells_reader<'a>(
        &'a mut self,
        name: &str,
    ) -> Result<XlsxCellReader<'a, RS>, XlsxError> {
        let (_, path) = self
            .sheets
            .iter()
            .find(|&(n, _)| n == name)
            .ok_or_else(|| XlsxError::WorksheetNotFound(name.into()))?;
        // Outer `?`: part not found in the archive; inner `?`: zip error.
        let xml = xml_reader(&mut self.zip, path, &self.zip_path_cache)
            .ok_or_else(|| XlsxError::WorksheetNotFound(name.into()))??;
        let is_1904 = self.is_1904;
        let strings = &self.strings;
        let formats = &self.formats;
        XlsxCellReader::new(xml, strings, formats, is_1904)
    }
}

impl<RS: Read + Seek> Reader<RS> for Xlsx<RS> {
    type Error = XlsxError;

    /// Open a workbook: reject encrypted files, then read the shared strings,
    /// styles, relationships and workbook parts.
    fn new(mut reader: RS) -> Result<Self, XlsxError> {
        check_for_password_protected(&mut reader)?;

        let zip = ZipArchive::new(reader)?;
        let zip_path_cache = build_zip_path_cache(&zip);

        let mut xlsx = Xlsx {
            zip,
            strings: Vec::new(),
            formats: Vec::new(),
            is_1904: false,
            sheets: Vec::new(),
            tables: None,
            metadata: Metadata::default(),
            #[cfg(feature = "picture")]
            pictures: None,
            merged_regions: None,
            options: XlsxOptions::default(),
            zip_path_cache,
        };
        xlsx.read_shared_strings()?;
        xlsx.read_styles()?;
        let relationships = xlsx.read_relationships()?;
        xlsx.read_workbook(&relationships)?;
        #[cfg(feature = "picture")]
        xlsx.read_pictures()?;

        Ok(xlsx)
    }

    fn with_header_row(&mut self, header_row: HeaderRow) -> &mut Self {
        self.options.header_row = header_row;
        self
    }

    /// Read `xl/vbaProject.bin`, returning `Ok(None)` when it cannot be opened.
    fn vba_project(&mut self) -> Result<Option<VbaProject>, XlsxError> {
        let Ok(mut f) = self.zip.by_name("xl/vbaProject.bin") else {
            return Ok(None);
        };
        let len = f.size() as usize;
        let vba = VbaProject::new(&mut f, len)?;
        Ok(Some(vba))
    }

    fn metadata(&self) -> &Metadata {
        &self.metadata
    }

    /// Read a worksheet as owned data by converting the borrowed range.
    fn worksheet_range(&mut self, name: &str) -> Result<Range<Data>, XlsxError> {
        let rge = self.worksheet_range_ref(name)?;
        let inner = rge.inner.into_iter().map(|v| v.into()).collect();
        Ok(Range {
            start: rge.start,
            end: rge.end,
            inner,
        })
    }

    /// Read the non-empty formulas of a worksheet.
    fn worksheet_formula(&mut self, name: &str) -> Result<Range<String>, XlsxError> {
        let mut cell_reader = match self.worksheet_cells_reader(name) {
            Ok(reader) => reader,
            Err(XlsxError::NotAWorksheet(typ)) => {
                warn!("'{typ}' not a worksheet");
                return Ok(Range::default());
            }
            Err(e) => return Err(e),
        };
        let len = cell_reader.dimensions().len();
        let mut cells = Vec::new();
        // Only pre-allocate for moderately sized sheets.
        if len < 100_000 {
            cells.reserve(len as usize);
        }
        while let Some(cell) = cell_reader.next_formula()? {
            if !cell.val.is_empty() {
                cells.push(cell);
            }
        }
        Ok(Range::from_sparse(cells))
    }

    /// Read every sheet that can be loaded as a range; sheets that fail to
    /// load are left out.
    fn worksheets(&mut self) -> Vec<(String, Range<Data>)> {
        // Copy the names first so `self` can be borrowed mutably below.
        let names = self
            .sheets
            .iter()
            .map(|(n, _)| n.clone())
            .collect::<Vec<_>>();
        names
            .into_iter()
            .filter_map(|n| {
                let rge = self.worksheet_range(&n).ok()?;
                Some((n, rge))
            })
            .collect()
    }

    #[cfg(feature = "picture")]
    fn pictures(&self) -> Option<Vec<(String, Vec<u8>)>> {
        self.pictures.to_owned()
    }
}

impl<RS: Read + Seek> ReaderRef<RS> for Xlsx<RS> {
    /// Read a worksheet as borrowed data, honouring the configured header row.
    fn worksheet_range_ref<'a>(&'a mut self, name: &str) -> Result<Range<DataRef<'a>>, XlsxError> {
        let header_row = self.options.header_row;
        let mut cell_reader = match self.worksheet_cells_reader(name) {
            Ok(reader) => reader,
            Err(XlsxError::NotAWorksheet(typ)) => {
                log::warn!("'{typ}' not a valid worksheet");
                return Ok(Range::default());
            }
            Err(e) => return Err(e),
        };
        let len = cell_reader.dimensions().len();
        let mut cells = Vec::new();
        // Only pre-allocate for moderately sized sheets.
        if len < 100_000 {
            cells.reserve(len as usize);
        }

        match header_row {
            HeaderRow::FirstNonEmptyRow => {
                // the header row is the row of the first non-empty cell
                while let Some(cell) = cell_reader.next_cell()? {
                    if !matches!(cell.val, DataRef::Empty) {
                        cells.push(cell);
                    }
                }
            }
            HeaderRow::Row(header_row_idx) => {
                // If `header_row` is a row index, we only add non-empty cells after this index.
                while let Some(cell) = cell_reader.next_cell()? {
                    if !matches!(cell.val, DataRef::Empty) && cell.pos.0 >= header_row_idx {
                        cells.push(cell);
                    }
                }

                // If `header_row` is set and the first non-empty cell is not at the `header_row`, we add
                // an empty cell at the beginning with row `header_row` and same column as the first non-empty cell.
                let padding_col = cells
                    .first()
                    .filter(|first| first.pos.0 != header_row_idx)
                    .map(|first| first.pos.1);
                if let Some(col) = padding_col {
                    cells.insert(
                        0,
                        Cell {
                            pos: (header_row_idx, col),
                            val: DataRef::Empty,
                        },
                    );
                }
            }
        }

        Ok(Range::from_sparse(cells))
    }
}

/// Open the archive member at `path` as an XML reader.
///
/// Returns `None` when the member does not exist and `Some(Err(_))` for any
/// other zip error.
// NOTE(review): the key/value types of `cache` were lost in extraction;
// `HashMap<String, String>` is assumed here - confirm against
// `build_zip_path_cache`/`cached_zip_path`.
fn xml_reader<'a, RS: Read + Seek>(
    zip: &'a mut ZipArchive<RS>,
    path: &str,
    cache: &HashMap<String, String>,
) -> Option<Result<XlReader<'a, RS>, XlsxError>> {
    let zip_path = cached_zip_path(cache, path);

    match zip.by_name(zip_path) {
        Ok(f) => {
            let mut r = XmlReader::from_reader(BufReader::new(f));
            let config = r.config_mut();
            config.check_end_names = false;
            config.trim_text(false);
            config.check_comments = false;
            config.expand_empty_elements = true;
            Some(Ok(r))
        }
        Err(ZipError::FileNotFound) => None,
        Err(e) => Some(Err(e.into())),
    }
}

/// search through an Element's attributes for the named one
///
/// Only values that borrow from the input are returned.
pub(crate) fn get_attribute<'a>(
    atts: Attributes<'a>,
    n: QName,
) -> Result<Option<&'a [u8]>, XlsxError> {
    for a in atts {
        match a {
            Ok(Attribute {
                key,
                value: Cow::Borrowed(value),
            }) if key == n => return Ok(Some(value)),
            Err(e) => return Err(XlsxError::XmlAttr(e)),
            _ => {} // ignore other attributes
        }
    }
    Ok(None)
}

/// converts a text representation (e.g. "A6:G67") of a dimension into integers
/// - top left (row, column),
/// - bottom right (row, column)
///
/// A single cell (e.g. "A6") yields identical start and end positions.
pub(crate) fn get_dimension(dimension: &[u8]) -> Result<Dimensions, XlsxError> {
    let parts: Vec<_> = dimension
        .split(|c| *c == b':')
        .map(get_row_column)
        .collect::<Result<Vec<_>, XlsxError>>()?;

    match parts.as_slice() {
        [] => Err(XlsxError::DimensionCount(0)),
        [cell] => Ok(Dimensions {
            start: *cell,
            end: *cell,
        }),
        [start, end] => {
            // Saturate so a reversed range (e.g. "B5:A1") cannot underflow.
            let rows = end.0.saturating_sub(start.0);
            let columns = end.1.saturating_sub(start.1);
            if rows > MAX_ROWS {
                warn!("xlsx has more than maximum number of rows ({rows} > {MAX_ROWS})");
            }
            if columns > MAX_COLUMNS {
                warn!("xlsx has more than maximum number of columns ({columns} > {MAX_COLUMNS})");
            }
            Ok(Dimensions {
                start: *start,
                end: *end,
            })
        }
        other => Err(XlsxError::DimensionCount(other.len())),
    }
}

/// Converts a text range name into its position (row, column) (0 based index).
/// If the row or column component in the range is missing, an Error is returned.
pub(crate) fn get_row_column(range: &[u8]) -> Result<(u32, u32), XlsxError> {
    let (row, col) = get_row_and_optional_column(range)?;
    let col = col.ok_or(XlsxError::RangeWithoutColumnComponent)?;
    Ok((row, col))
}

/// Converts a text row name into its position (0 based index).
/// If the row component in the range is missing, an Error is returned.
/// If the text row name also contains a column component, it is ignored.
pub(crate) fn get_row(range: &[u8]) -> Result<u32, XlsxError> {
    get_row_and_optional_column(range).map(|(row, _)| row)
}

/// Converts a text range name into its position (row, column) (0 based index).
/// If the row component in the range is missing, an Error is returned.
/// If the column component in the range is missing, `None` is returned for the column.
///
/// A row or column number that does not fit in a `u32` is reported as
/// `RowNumberOverflow` / `ColumnNumberOverflow` rather than overflowing.
fn get_row_and_optional_column(range: &[u8]) -> Result<(u32, Option<u32>), XlsxError> {
    let (mut row, mut col) = (0u32, 0u32);
    // Place value of the next digit/letter; `None` once it exceeds `u32`.
    let mut pow: Option<u32> = Some(1);
    let mut readrow = true;

    // The name is read right to left: row digits first, then column letters.
    for &c in range.iter().rev() {
        match c {
            b'0'..=b'9' => {
                if !readrow {
                    return Err(XlsxError::NumericColumn(c));
                }
                let digit = u32::from(c - b'0');
                // A zero adds nothing, so leading zeros never overflow.
                if digit != 0 {
                    row = pow
                        .and_then(|p| p.checked_mul(digit))
                        .and_then(|v| row.checked_add(v))
                        .ok_or(XlsxError::RowNumberOverflow)?;
                }
                pow = pow.and_then(|p| p.checked_mul(10));
            }
            b'A'..=b'Z' | b'a'..=b'z' => {
                if readrow {
                    // The first letter ends the row part, which must be non-zero.
                    if row == 0 {
                        return Err(XlsxError::RangeWithoutRowComponent);
                    }
                    pow = Some(1);
                    readrow = false;
                }
                // Letters are bijective base-26 digits: A = 1 ... Z = 26.
                let letter = u32::from(c.to_ascii_uppercase() - b'A') + 1;
                col = pow
                    .and_then(|p| p.checked_mul(letter))
                    .and_then(|v| col.checked_add(v))
                    .ok_or(XlsxError::ColumnNumberOverflow)?;
                pow = pow.and_then(|p| p.checked_mul(26));
            }
            _ => return Err(XlsxError::Alphanumeric(c)),
        }
    }

    // Convert from 1-based names to 0-based indices.
    let row = row
        .checked_sub(1)
        .ok_or(XlsxError::RangeWithoutRowComponent)?;
    Ok((row, col.checked_sub(1)))
}

/// Attempts to read either a simple or richtext string, reusing caller-provided
/// buffers to avoid per-call allocations.
pub(crate) fn read_string_with_bufs( xml: &mut XlReader<'_, RS>, closing: QName, xml_buf: &mut Vec, text_buf: &mut Vec, ) -> Result, XlsxError> where RS: Read + Seek, { let mut rich_buffer: Option = None; let mut is_phonetic_text = false; loop { xml_buf.clear(); match xml.read_event_into(xml_buf) { Ok(Event::Start(e)) if e.local_name().as_ref() == b"r" => { if rich_buffer.is_none() { // use a buffer since richtext has multiples and for the same cell rich_buffer = Some(String::new()); } } Ok(Event::Start(e)) if e.local_name().as_ref() == b"rPh" => { is_phonetic_text = true; } Ok(Event::End(e)) if e.name() == closing => { if rich_buffer.is_none() { // An empty element, without or other // subelements, is treated as a valid empty string in Excel. rich_buffer = Some(String::new()); } return Ok(rich_buffer); } Ok(Event::End(e)) if e.local_name().as_ref() == b"rPh" => { is_phonetic_text = false; } Ok(Event::Start(e)) if e.local_name().as_ref() == b"t" && !is_phonetic_text => { text_buf.clear(); let mut value = String::new(); loop { match xml.read_event_into(text_buf)? { Event::Text(t) => value.push_str(&unescape_xml(&t.xml10_content()?)), Event::GeneralRef(e) => unescape_entity_to_buffer(&e, &mut value)?, Event::End(end) if end.name() == e.name() => break, Event::Eof => return Err(XlsxError::XmlEof("t")), _ => (), } } if let Some(s) = &mut rich_buffer { s.push_str(&value); } else { // consume any remaining events up to expected closing tag xml.read_to_end_into(closing, text_buf)?; return Ok(Some(value)); } } Ok(Event::Eof) => return Err(XlsxError::XmlEof("")), Err(e) => return Err(XlsxError::Xml(e)), _ => (), } } } fn check_for_password_protected(reader: &mut RS) -> Result<(), XlsxError> { let offset_end = reader.seek(std::io::SeekFrom::End(0))? 
as usize; reader.seek(std::io::SeekFrom::Start(0))?; if let Ok(cfb) = crate::cfb::Cfb::new(reader, offset_end) { if cfb.has_directory("EncryptedPackage") { return Err(XlsxError::Password); } } Ok(()) } fn read_merge_cells(xml: &mut XlReader<'_, RS>) -> Result, XlsxError> where RS: Read + Seek, { let mut merge_cells = Vec::new(); loop { let mut buffer = Vec::new(); match xml.read_event_into(&mut buffer) { Ok(Event::Start(event)) if event.local_name().as_ref() == b"mergeCell" => { for attribute in event.attributes() { let attribute = attribute?; if attribute.key == QName(b"ref") { let dimensions = get_dimension(&attribute.value)?; merge_cells.push(dimensions); break; } } } Ok(Event::End(event)) if event.local_name().as_ref() == b"mergeCells" => { break; } Ok(Event::Eof) => return Err(XlsxError::XmlEof("")), Err(e) => return Err(XlsxError::Xml(e)), _ => (), } } Ok(merge_cells) } #[derive(Debug, Copy, Clone)] enum Reference { Cell { row: u32, col: u32, absolute_row: bool, absolute_col: bool, }, Row { row: u32, absolute: bool, }, Column { col: u32, absolute: bool, }, } impl Reference { // Create a cell reference with validation. fn cell(row: u32, col: u32, absolute_row: bool, absolute_col: bool) -> Result { let reference = Reference::Cell { row, col, absolute_row, absolute_col, }; reference.validate()?; Ok(reference) } // Create a column reference with validation. fn column(col: u32, absolute: bool) -> Result { let reference = Reference::Column { col, absolute }; reference.validate()?; Ok(reference) } // Create a row reference with validation. fn row(row: u32, absolute: bool) -> Result { let reference = Reference::Row { row, absolute }; reference.validate()?; Ok(reference) } // Parse a reference (e.g., "A1", "$A1", "A$1", "$A$1", "E", "$E", "5", "$5"). 
fn parse(name: &[u8]) -> Result { let mut iter = name.iter().peekable(); let mut col: u32 = 0; let mut row: u32 = 0; let mut absolute_col = false; let mut absolute_row = false; while let Some(&c) = iter.next() { match (c, iter.peek()) { (b'$', Some(b'A'..=b'Z' | b'a'..=b'z')) => { if row > 0 || col > 0 { return Err(XlsxError::Alphanumeric(c)); } absolute_col = true; } (b'$', Some(b'0'..=b'9')) => { if row > 0 { return Err(XlsxError::Alphanumeric(c)); } absolute_row = true; } (b'$', _) => return Err(XlsxError::Alphanumeric(c)), (c @ (b'A'..=b'Z' | b'a'..=b'z'), _) => { if row > 0 { return Err(XlsxError::Alphanumeric(c)); } col = col .wrapping_mul(26) .wrapping_add((c.to_ascii_uppercase() - b'A') as u32 + 1); } (c @ b'0'..=b'9', _) => { row = row.wrapping_mul(10).wrapping_add((c - b'0') as u32); } _ => return Err(XlsxError::Alphanumeric(c)), } } match (col.checked_sub(1), row.checked_sub(1)) { (Some(col), Some(row)) => Reference::cell(row, col, absolute_row, absolute_col), (Some(col), None) => Reference::column(col, absolute_col), (None, Some(row)) => Reference::row(row, absolute_row), (None, None) => Err(XlsxError::Unexpected("Empty reference")), } } // Apply offset to create a new reference with validation. 
fn offset(self, offset: (i64, i64)) -> Result { let result = match self { Reference::Cell { row, col, absolute_row, absolute_col, } => { let new_col = if absolute_col { col } else { (col as i64 + offset.1) as u32 }; let new_row = if absolute_row { row } else { (row as i64 + offset.0) as u32 }; Reference::Cell { row: new_row, col: new_col, absolute_row, absolute_col, } } Reference::Column { col, absolute } => { let new_col = if absolute { col } else { (col as i64 + offset.1) as u32 }; Reference::Column { col: new_col, absolute, } } Reference::Row { row, absolute } => { let new_row = if absolute { row } else { (row as i64 + offset.0) as u32 }; Reference::Row { row: new_row, absolute, } } }; result.validate()?; Ok(result) } // Validate that row/column values are in bounds. fn validate(&self) -> Result<(), XlsxError> { match self { Reference::Cell { row, col, .. } => { if *col >= MAX_COLUMNS { return Err(XlsxError::ColumnNumberOverflow); } if *row >= MAX_ROWS { return Err(XlsxError::RowNumberOverflow); } Ok(()) } Reference::Column { col, .. } => { if *col >= MAX_COLUMNS { return Err(XlsxError::ColumnNumberOverflow); } Ok(()) } Reference::Row { row, .. } => { if *row >= MAX_ROWS { return Err(XlsxError::RowNumberOverflow); } Ok(()) } } } // Format a reference to bytes. fn format(&self, buf: &mut Vec) -> Result<(), XlsxError> { match self { Reference::Cell { row, col, absolute_row, absolute_col, } => { if *absolute_col { buf.push(b'$'); } column_number_to_name(*col, buf)?; if *absolute_row { buf.push(b'$'); } buf.extend((row + 1).to_string().into_bytes()); Ok(()) } Reference::Column { col, absolute } => { if *absolute { buf.push(b'$'); } column_number_to_name(*col, buf) } Reference::Row { row, absolute } => { if *absolute { buf.push(b'$'); } buf.extend((row + 1).to_string().into_bytes()); Ok(()) } } } } // Advance a reference by the offset (e.g., "A1", "E:F", "5:6", "A1:B5"). 
fn offset_range(range: &[u8], offset: (i64, i64), buf: &mut Vec) -> Result<(), XlsxError> { let colon_pos = range.iter().position(|&b| b == b':'); match colon_pos { None => { let reference = Reference::parse(range)?; if !matches!(reference, Reference::Cell { .. }) { return Err(XlsxError::Unexpected("Single reference type must be cell")); } let offset_ref = reference.offset(offset)?; offset_ref.format(buf) } Some(idx) => { let start = &range[..idx]; let end = &range[idx + 1..]; let start_ref = Reference::parse(start)?; let end_ref = Reference::parse(end)?; if std::mem::discriminant(&start_ref) != std::mem::discriminant(&end_ref) { return Err(XlsxError::Unexpected("Range type mismatch")); } let start_offset = start_ref.offset(offset)?; let end_offset = end_ref.offset(offset)?; start_offset.format(buf)?; buf.push(b':'); end_offset.format(buf) } } } // Advance all valid cell names in the string by the offset. fn replace_cell_names(s: &str, offset: (i64, i64)) -> Result { let bytes = s.as_bytes(); let mut res: Vec = Vec::new(); let mut in_quote = false; let mut token_start = 0; let mut token_end = 0; for (i, &c) in bytes.iter().enumerate() { if !in_quote && (c.is_ascii_alphanumeric() || c == b'$' || c == b':') { token_end = i + 1; } else { if token_start < token_end && offset_range(&bytes[token_start..token_end], offset, &mut res).is_err() { res.extend(&bytes[token_start..token_end]); } res.push(c); token_start = i + 1; token_end = i + 1; if c == b'"' { in_quote = !in_quote; } } } if token_start < token_end && offset_range(&bytes[token_start..token_end], offset, &mut res).is_err() { res.extend(&bytes[token_start..token_end]); } match String::from_utf8(res) { Ok(s) => Ok(s), Err(_) => Err(XlsxError::Unexpected("fail to convert cell name")), } } /// Convert the integer to Excelsheet column title. /// If the column number not in 1~16384, an Error is returned. 
pub(crate) fn column_number_to_name(num: u32, buf: &mut Vec) -> Result<(), XlsxError> { if num >= MAX_COLUMNS { return Err(XlsxError::ColumnNumberOverflow); } let start = buf.len(); let mut num = num + 1; while num > 0 { let integer = ((num - 1) % 26 + 65) as u8; buf.push(integer); num = (num - 1) / 26; } buf[start..].reverse(); Ok(()) } // Data type of the record's value. enum Tag { // String S, // Number (Float or Int) N, // Missing M, // Error E, // Bool B, // Date D, } type Value = Option>; /// Check if tag is an item within a PivotCache Record, which does not require a Definitions lookup. fn item_tag(e: &BytesStart) -> Option { match e.local_name().as_ref() { b"s" => Some(Tag::S), b"n" => Some(Tag::N), b"m" => Some(Tag::M), b"e" => Some(Tag::E), b"b" => Some(Tag::B), b"d" => Some(Tag::D), _ => None, } } fn item_value(e: &BytesStart) -> Result { for a in e.attributes() { if let Attribute { key: QName(b"v"), value, } = a? { return Ok(Some(Box::from(value))); } } Ok(None) } // Get the target location of the pivot table's pivot cache definitions. fn find_pivot_cache_definitions_from_pivot( zip: &mut zip::ZipArchive, path: &str, cache: &HashMap, ) -> Result where RS: Read + Seek, { let (base_folder, file_name) = path.rsplit_once('/').expect("should be in a folder"); let rel_path = format!("{base_folder}/_rels/{file_name}.rels"); let mut xml = match xml_reader(zip, &rel_path, cache) { None => return Err(XlsxError::FileNotFound(rel_path.to_owned())), Some(x) => x?, }; let mut definitions_path = None; let mut buf = Vec::with_capacity(64); loop { buf.clear(); match xml.read_event_into(&mut buf) { Ok(Event::Start(e)) if e.local_name().as_ref() == b"Relationship" => { let mut target = String::new(); let mut is_pivot_cache_definitions_type = false; for a in e.attributes() { match a? 
{ Attribute { key: QName(b"Target"), value: v, } => target = xml.decoder().decode(&v)?.into_owned(), Attribute { key: QName(b"Type"), value: v, } => is_pivot_cache_definitions_type = *v == b"http://schemas.openxmlformats.org/officeDocument/2006/relationships/pivotCacheDefinition"[..], _ => (), } } match (is_pivot_cache_definitions_type, definitions_path.is_some()) { (true, false) => { if let Some(target) = target.strip_prefix("../") { // this is an incomplete implementation, but should be good enough for excel let (parent, _) = base_folder .rsplit_once('/') .expect("Must be a parent folder"); definitions_path.replace(format!("{parent}/{target}")); } else if !target.is_empty() { definitions_path.replace(target); } } (true, true) => return Err(XlsxError::Unexpected( "multiple pivot cache definition relationships found for one pivot table", )), _ => {} } } Ok(Event::End(e)) if e.local_name().as_ref() == b"Relationships" => break, Ok(Event::Eof) => return Err(XlsxError::XmlEof("Relationships")), Err(e) => return Err(XlsxError::Xml(e)), _ => (), } } match definitions_path { Some(path) => Ok(path), None => Err(XlsxError::Unexpected( "no pivot cache definition found for pivot table", )), } } // Get the target location of the pivot cache record file. 
fn find_pivot_cache_records_from_definitions( zip: &mut zip::ZipArchive, path: &str, cache: &HashMap, ) -> Result where RS: Read + Seek, { let (base_folder, file_name) = path.rsplit_once('/').expect("should be in a folder"); let rel_path = format!("{base_folder}/_rels/{file_name}.rels"); let mut xml = match xml_reader(zip, rel_path.as_ref(), cache) { None => return Err(XlsxError::FileNotFound(rel_path.to_owned())), Some(x) => x?, }; let mut record_path = None; let mut buf = Vec::with_capacity(64); loop { buf.clear(); match xml.read_event_into(&mut buf) { Ok(Event::Start(e)) if e.local_name().as_ref() == b"Relationship" => { let mut target = String::new(); let mut is_pivot_cache_record_type = false; for a in e.attributes() { match a? { Attribute { key: QName(b"Target"), value: v, } => target = xml.decoder().decode(&v)?.into_owned(), Attribute { key: QName(b"Type"), value: v, } => is_pivot_cache_record_type = *v == b"http://schemas.openxmlformats.org/officeDocument/2006/relationships/pivotCacheRecords"[..], _ => (), } } match (is_pivot_cache_record_type, record_path.is_some()) { (true, false) => { if target.starts_with("xl/pivotCache") { record_path.replace(target); } else if !target.is_empty() { record_path.replace(format!("xl/pivotCache/{target}")); } } (true, true) => { return Err(XlsxError::Unexpected( "multiple pivot cache record relationships found for one pivot table", )) } _ => {} } } Ok(Event::End(e)) if e.local_name().as_ref() == b"Relationships" => break, Ok(Event::Eof) => return Err(XlsxError::XmlEof("Relationships")), Err(e) => return Err(XlsxError::Xml(e)), _ => (), } } match record_path { Some(path) => Ok(path), None => Err(XlsxError::Unexpected( "no pivot cache records found for pivot table", )), } } // Return a vec of pivot table paths (ie xl/pivotTables/pivot1.xml) for a given sheet name. 
fn find_pivot_table_paths_from_sheet( zip: &mut zip::ZipArchive, sheet_path: &str, cache: &HashMap, ) -> Result, XlsxError> where RS: Read + Seek, { let mut pivots_on_sheet = vec![]; let mut buf = Vec::with_capacity(64); let last_folder_index = sheet_path.rfind('/').expect("should be in a folder"); let (base_folder, file_name) = sheet_path.split_at(last_folder_index); let rel_path = format!("{base_folder}/_rels{file_name}.rels"); let mut xml = match xml_reader(zip, &rel_path, cache) { // Some sheets may not have relationships - okay for path to not exist. None => return Ok(vec![]), Some(x) => x?, }; loop { buf.clear(); match xml.read_event_into(&mut buf) { Ok(Event::Start(e)) if e.local_name().as_ref() == b"Relationship" => { let mut target = String::new(); let mut is_pivot_table_type = false; for a in e.attributes() { match a? { Attribute { key: QName(b"Target"), value: v, } => target = xml.decoder().decode(&v)?.into_owned(), Attribute { key: QName(b"Type"), value: v, } => is_pivot_table_type = *v == b"http://schemas.openxmlformats.org/officeDocument/2006/relationships/pivotTable"[..], _ => (), } } if is_pivot_table_type { if let Some(target) = target.strip_prefix("../") { // this is an incomplete implementation, but should be good enough for excel let (parent, _) = base_folder .rsplit_once('/') .expect("Must be a parent folder"); pivots_on_sheet.push(format!("{parent}/{target}")); } else if !target.is_empty() { pivots_on_sheet.push(target); } } } Ok(Event::End(e)) if e.local_name().as_ref() == b"Relationships" => break, Ok(Event::Eof) => return Err(XlsxError::XmlEof("Relationships")), Err(e) => return Err(XlsxError::Xml(e)), _ => (), } } Ok(pivots_on_sheet) } // Takes a pivot table path (ie xl/pivotTables/pivot1.xml) and returns the name. 
// Scans the pivot table part for its `pivotTableDefinition` element and
// returns the value of that element's `name` attribute.
//
// NOTE(review): the key/value types of `cache` are assumed to be
// `String` -> `String`; they are not visible in this part of the file, so
// confirm against `build_zip_path_cache`.
fn find_pivot_name_from_pivot_path<RS>(
    zip: &mut zip::ZipArchive<RS>,
    pivot_path: &str,
    cache: &HashMap<String, String>,
) -> Result<String, XlsxError>
where
    RS: Read + Seek,
{
    let Some(reader) = xml_reader(zip, pivot_path, cache) else {
        return Err(XlsxError::FileNotFound(pivot_path.to_string()));
    };
    let mut xml = reader?;

    let mut pivot_name: Option<String> = None;
    let mut buf = Vec::with_capacity(64);

    loop {
        buf.clear();
        let event = xml.read_event_into(&mut buf);
        match event {
            Ok(Event::Start(e)) if e.local_name().as_ref() == b"pivotTableDefinition" => {
                for attribute in e.attributes() {
                    let Attribute {
                        key: QName(b"name"),
                        value: v,
                    } = attribute?
                    else {
                        continue;
                    };
                    // A second `name` attribute is treated as an error.
                    if pivot_name.is_some() {
                        return Err(XlsxError::Unexpected(
                            "multiple name entries for one pivot table path",
                        ));
                    }
                    pivot_name = Some(xml.decoder().decode(&v)?.into_owned());
                }
            }
            Ok(Event::End(e)) if e.local_name().as_ref() == b"pivotTableDefinition" => break,
            Ok(Event::Eof) => return Err(XlsxError::XmlEof("pivotTableDefinition")),
            Err(e) => return Err(XlsxError::Xml(e)),
            _ => (),
        }
    }

    pivot_name.ok_or(XlsxError::Unexpected("no name for pivot table"))
}

/// Parse an item within a PivotCache Record into its appropriate [`Data`] type.
fn parse_item(item: &(Tag, Value), decoder: &Decoder) -> Data { let Some(val) = item.1.as_deref() else { return Data::Empty; }; match item.0 { Tag::M => Data::Empty, Tag::S => { if let Ok(val) = decoder.decode(val.as_ref()) { Data::String(val.to_string()) } else { Data::Error(CellErrorType::GettingData) } } Tag::N => { if val.contains(&b'.') { match bytes_to_f64(val, decoder) { Some(val) => Data::Float(val), None => Data::Error(CellErrorType::GettingData), } } else { match bytes_to_i64(val, decoder) { Some(val) => Data::Int(val), None => Data::Error(CellErrorType::GettingData), } } } Tag::D => { if let Ok(val) = decoder.decode(val) { Data::DateTimeIso(val.into()) } else { Data::Error(CellErrorType::GettingData) } } Tag::B => { { // boolean tags only support W3C XML Schema match val { b"0" | b"false" => Data::Bool(false), b"1" | b"true" => Data::Bool(true), _ => Data::Error(CellErrorType::GettingData), } } } Tag::E => Data::Error(CellErrorType::Ref), } } // Parse failures are handled with None and left to `Self::parse_item` to address. fn bytes_to_i64(val: &[u8], decoder: &Decoder) -> Option { if let Ok(val) = decoder.decode(val) { atoi_simd::parse::(val.as_bytes()).ok() } else { None } } // Parse failures are handled with None and left to `parse_item` to address. fn bytes_to_f64(val: &[u8], decoder: &Decoder) -> Option { if let Ok(val) = decoder.decode(val) { fast_float2::parse(val.as_bytes()).ok() } else { None } } #[derive(Default)] pub struct PivotTables(Vec); impl PivotTables { fn new() -> Self { Self(vec![]) } fn push(&mut self, pivot_table: PivotTableRef) { self.0.push(pivot_table); } /// Helper function to identify pivot tables by name and worksheet. /// /// # Returns /// /// ```text /// Vec<(&str, &str)> /// β”‚ β”‚ /// β”‚ └─── Pivot Table name /// β”‚ /// └──── Worksheet name /// ``` /// /// # Note /// /// Pivot table names are unique per worksheet, not per workbook. 
/// /// # Examples /// /// An example of retrieving pivot cache data for a Pivot Table named "PivotTable1" /// on worksheet "PivotSheet1". /// /// ``` /// use calamine::{open_workbook, Error, Xlsx}; /// /// fn main() -> Result<(), Error> { /// /// // Open the workbook. /// let mut workbook: Xlsx<_> = open_workbook("tests/pivots.xlsx")?; /// // Must retrieve necessary metadata before reading Pivot Table data. /// let pivot_tables = workbook.pivot_tables()?; /// /// // "PivotTable1" is found on both sheets: "PivotSheet1" & "PivotSheet3" so /// // we must include the sheet name in our filter ~ see note on uniqueness. /// let names_and_sheets = pivot_tables.get_pivot_tables_by_name_and_sheet() /// .into_iter() /// .filter_map(|pt| { /// if pt.0.eq("PivotSheet1") && pt.1.eq("PivotTable1") { /// Some(pt) /// } else { /// None /// } /// }) /// .collect::>(); /// /// assert_eq!(names_and_sheets.len(), 1); /// /// Ok(()) /// /// } /// ``` /// pub fn get_pivot_tables_by_name_and_sheet(&self) -> Vec<(&str, &str)> { self.0.iter().map(|pt| (pt.sheet(), pt.name())).collect() } /// Get the names of all pivot tables for a given worksheet. /// /// Worksheets that do not contain any pivot tables will return None. Worksheet names /// /// # Examples /// /// An example of getting all pivot tables for a provided sheet. /// /// ``` /// use calamine::{open_workbook, Error, Xlsx}; /// /// fn main() -> Result<(), Error> { /// /// let path = "tests/pivots.xlsx"; /// /// // Open the workbook. /// let mut workbook: Xlsx<_> = open_workbook(path)?; /// /// // Must retrieve necessary metadata before reading Pivot Table data. /// let pivot_tables = workbook.pivot_tables()?; /// /// // Get the pivot table names in the workbook for a given sheet. /// let pivot_table_names = pivot_tables.pivot_tables_by_sheet("PivotSheet1"); /// /// // Check the pivot table names (ordering not guaranteed). 
/// assert_eq!(pivot_table_names, vec!["PivotTable1"]); /// /// Ok(()) /// } /// ``` /// pub fn pivot_tables_by_sheet(&self, sheet_name: &str) -> Vec<&str> { self.0 .iter() .filter_map(|val| { if val.sheet() == sheet_name { Some(val.name()) } else { None } }) .collect::>() } } struct PivotTableRef { name: String, sheet: String, records: String, definitions: String, } impl PivotTableRef { fn new(name: String, sheet: String, records: String, definitions: String) -> Self { Self { name, sheet, records, definitions, } } fn name(&self) -> &str { self.name.as_ref() } fn sheet(&self) -> &str { self.sheet.as_ref() } fn records(&self) -> &str { self.records.as_ref() } fn definitions(&self) -> &str { self.definitions.as_ref() } } pub struct PivotCacheIter<'a, RS: Read + Seek + 'a> { definitions: HashMap>, field_names: Vec, reader: XlReader<'a, RS>, } fn get_pivot_cache_iter<'a, RS: Read + Seek + 'a>( xl: &'a mut crate::Xlsx, pivot_table: &PivotTableRef, ) -> Result, XlsxError> { let definitions = pivot_table.definitions(); let records = pivot_table.records(); let mut fields: Vec> = vec![]; let mut definition_map = HashMap::new(); let mut field_names = vec![]; // Converting into an iterator requires first reading a pivotCacheDefinitions.xml file // to get lookup values used in pivotCacheRecords.xml file. { let mut xml = match xml_reader(&mut xl.zip, definitions, &xl.zip_path_cache) { None => { return Err(XlsxError::FileNotFound(format!( "File not found: {}", definitions ))) } Some(x) => x?, }; let mut buf = Vec::with_capacity(64); // building list of field names and definitions from some pivotCacheDefinitions.xml file loop { buf.clear(); match xml.read_event_into(&mut buf) { Ok(Event::Start(e)) if e.local_name().as_ref() == b"cacheField" => { for a in e.attributes() { match a? 
{ Attribute { key: QName(b"name"), value, } => { field_names.push(xml.decoder().decode(value.as_ref())?.to_string()); fields.push(vec![]); } Attribute { key: QName(b"formula"), value: _value, } => { field_names.pop(); fields.pop(); } _ => { // do nothing } } } } // Exclude grouped fields from results. // This does not represent the underlying data and should be removed. Ok(Event::Start(e)) if e.local_name().as_ref() == b"groupItems" => { field_names.pop(); fields.pop(); } Ok(Event::Eof) => break, Err(e) => { panic!("{e}") } Ok(Event::Start(e)) => { if let Some(tag) = item_tag(&e) { if let Some(field) = fields.last_mut() { field.push((tag, item_value(&e)?)); } } } Ok(_) => {} } } // add the definitions to the definition map with a key on field name for (field, name) in fields.into_iter().zip(field_names.iter()) { definition_map.insert(name.to_string(), field); } } xml_reader(&mut xl.zip, records, &xl.zip_path_cache).map_or_else( || { Err(XlsxError::FileNotFound(format!( "File not found: {records}" ))) }, |record_reader| { Ok(PivotCacheIter::new( definition_map, field_names, record_reader?, )) }, ) } impl<'a, RS: Read + Seek + 'a> PivotCacheIter<'a, RS> { fn new( definitions: HashMap>, field_names: Vec, reader: XlReader<'a, RS>, ) -> Self { Self { definitions, field_names, reader, } } } // Iterates over , the tag for a row, found in the PivotCacheRecords.xml file. // PivotCacheIter must also hold some lookup values / metadata to support the content within . 
// // https://learn.microsoft.com/en-us/dotnet/api/documentformat.openxml.spreadsheet.pivotcacherecord?view=openxml-3.0.1 impl<'a, RS: Read + Seek + 'a> Iterator for PivotCacheIter<'a, RS> { type Item = Result, XlsxError>; fn next(&mut self) -> Option { let mut row = vec![]; let mut col_number = 0; let mut buf = Vec::with_capacity(64); loop { buf.clear(); match self.reader.read_event_into(&mut buf) { Ok(Event::Start(e)) if e.local_name().as_ref() == b"x" => { for a in e.attributes() { if let Ok(Attribute { key: QName(b"v"), value, }) = a { let value_position = match self.reader.decoder().decode(value.as_ref()) { Ok(val) => match val.parse::() { Ok(val) => val, Err(e) => { return Some(Err(XlsxError::ParseInt(e))); } }, Err(e) => return Some(Err(XlsxError::Encoding(e))), }; let column_name = &self.field_names[col_number]; row.push(parse_item( &self.definitions[column_name][value_position], &self.reader.decoder(), )); break; } } col_number += 1; } Ok(Event::End(e)) if e.local_name().as_ref() == b"r" => return Some(Ok(row)), Ok(Event::Start(e)) if e.local_name().as_ref() == b"pivotCacheRecords" => { return Some(Ok(self .field_names .iter() .map(|fields| Data::String(fields.to_string())) .collect())) } Ok(Event::Eof) => return None, Err(e) => return Some(Err(XlsxError::Xml(e))), Ok(Event::Start(e)) => { if let Some(tag) = item_tag(&e) { if let Ok(value) = item_value(&e) { row.push(parse_item(&(tag, value), &self.reader.decoder())); col_number += 1; } } } Ok(_) => {} } } } } // ----------------------------------------------------------------------- // Unit tests for Xlsx. 
// -----------------------------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use zip::write::SimpleFileOptions;
    use zip::ZipWriter;

    // Cell names and dimension strings parse to 0-based (row, column) pairs.
    #[test]
    fn test_dimensions() {
        assert_eq!(get_row_column(b"A1").unwrap(), (0, 0));
        assert_eq!(get_row_column(b"C107").unwrap(), (106, 2));
        assert_eq!(
            get_dimension(b"C2:D35").unwrap(),
            Dimensions {
                start: (1, 2),
                end: (34, 3)
            }
        );
        assert_eq!(
            get_dimension(b"A1:XFD1048576").unwrap(),
            Dimensions {
                start: (0, 0),
                end: (1_048_575, 16_383),
            }
        );
    }

    // `len()` of a dimension is its number of cells.
    #[test]
    fn test_dimension_length() {
        assert_eq!(get_dimension(b"A1:Z99").unwrap().len(), 2_574);
        assert_eq!(
            get_dimension(b"A1:XFD1048576").unwrap().len(),
            17_179_869_184
        );
    }

    // Every error literal maps to its `CellErrorType` variant.
    #[test]
    fn test_parse_error() {
        assert_eq!(
            CellErrorType::from_str("#DIV/0!").unwrap(),
            CellErrorType::Div0
        );
        assert_eq!(CellErrorType::from_str("#N/A").unwrap(), CellErrorType::NA);
        assert_eq!(
            CellErrorType::from_str("#NAME?").unwrap(),
            CellErrorType::Name
        );
        assert_eq!(
            CellErrorType::from_str("#NULL!").unwrap(),
            CellErrorType::Null
        );
        assert_eq!(
            CellErrorType::from_str("#NUM!").unwrap(),
            CellErrorType::Num
        );
        assert_eq!(
            CellErrorType::from_str("#REF!").unwrap(),
            CellErrorType::Ref
        );
        assert_eq!(
            CellErrorType::from_str("#VALUE!").unwrap(),
            CellErrorType::Value
        );
    }

    // Column indices are 0-based: 0 is "A" and `MAX_COLUMNS - 1` is "XFD".
    #[test]
    fn test_column_number_to_name() {
        let check = |num, expected: &[u8]| {
            let mut buf = Vec::new();
            column_number_to_name(num, &mut buf).unwrap();
            assert_eq!(buf, expected);
        };
        check(0, b"A");
        check(25, b"Z");
        check(26, b"AA");
        check(27, b"AB");
        check(MAX_COLUMNS - 1, b"XFD");
    }

    // Parsing of cell, column and row references, with and without "$".
    #[test]
    fn test_parse_reference() {
        let check_cell =
            |input: &[u8], row, col, abs_row, abs_col| match Reference::parse(input).unwrap() {
                Reference::Cell {
                    row: r,
                    col: c,
                    absolute_row: ar,
                    absolute_col: ac,
                } => {
                    assert_eq!((r, c, ar, ac), (row, col, abs_row, abs_col));
                }
                _ => panic!("Expected Cell reference"),
            };
        let check_column = |input: &[u8], col, abs| match Reference::parse(input).unwrap() {
            Reference::Column {
                col: c,
                absolute: a,
            } => {
                assert_eq!((c, a), (col, abs));
            }
            _ => panic!("Expected Column reference"),
        };
        let check_row = |input: &[u8], row, abs| match Reference::parse(input).unwrap() {
            Reference::Row {
                row: r,
                absolute: a,
            } => {
                assert_eq!((r, a), (row, abs));
            }
            _ => panic!("Expected Row reference"),
        };
        // Cell references
        check_cell(b"A1", 0, 0, false, false);
        check_cell(b"$A1", 0, 0, false, true);
        check_cell(b"A$1", 0, 0, true, false);
        check_cell(b"$A$1", 0, 0, true, true);
        check_cell(b"XFD1048576", MAX_ROWS - 1, MAX_COLUMNS - 1, false, false);
        // Column references
        check_column(b"A", 0, false);
        check_column(b"$A", 0, true);
        check_column(b"XFD", MAX_COLUMNS - 1, false);
        // Row references
        check_row(b"1", 0, false);
        check_row(b"$1", 0, true);
        check_row(b"1048576", MAX_ROWS - 1, false);
    }

    // Formatting is the inverse of parsing for the same set of references.
    #[test]
    fn test_format_reference() {
        let check_cell = |row, col, abs_row, abs_col, expected: &[u8]| {
            let mut buf = Vec::new();
            Reference::Cell {
                row,
                col,
                absolute_row: abs_row,
                absolute_col: abs_col,
            }
            .format(&mut buf)
            .unwrap();
            assert_eq!(buf, expected);
        };
        let check_column = |col, absolute, expected: &[u8]| {
            let mut buf = Vec::new();
            Reference::Column { col, absolute }
                .format(&mut buf)
                .unwrap();
            assert_eq!(buf, expected);
        };
        let check_row = |row, absolute, expected: &[u8]| {
            let mut buf = Vec::new();
            Reference::Row { row, absolute }.format(&mut buf).unwrap();
            assert_eq!(buf, expected);
        };
        // Cell references
        check_cell(0, 0, false, false, b"A1");
        check_cell(0, 0, false, true, b"$A1");
        check_cell(0, 0, true, false, b"A$1");
        check_cell(0, 0, true, true, b"$A$1");
        check_cell(MAX_ROWS - 1, MAX_COLUMNS - 1, false, false, b"XFD1048576");
        // Column references
        check_column(0, false, b"A");
        check_column(0, true, b"$A");
        check_column(MAX_COLUMNS - 1, false, b"XFD");
        // Row references
        check_row(0, false, b"1");
        check_row(0, true, b"$1");
        check_row(MAX_ROWS - 1, false, b"1048576");
    }

    // Offsetting a reference past the last row or column must fail.
    #[test]
    fn test_format_reference_overflow() {
        let check_err = |reference: Reference, offset| {
            let result = reference.offset(offset);
            assert!(
                matches!(
                    result,
                    Err(XlsxError::ColumnNumberOverflow) | Err(XlsxError::RowNumberOverflow)
                ),
                "expected overflow error, got {:?}",
                result
            );
        };
        // Cell reference offset pushes column out of bounds
        check_err(
            Reference::Cell {
                row: 0,
                col: MAX_COLUMNS - 1,
                absolute_row: false,
                absolute_col: false,
            },
            (0, 1),
        );
        // Cell reference offset pushes row out of bounds
        check_err(
            Reference::Cell {
                row: MAX_ROWS - 1,
                col: 0,
                absolute_row: false,
                absolute_col: false,
            },
            (1, 0),
        );
        // Column reference offset pushes out of bounds
        check_err(
            Reference::Column {
                col: MAX_COLUMNS - 1,
                absolute: false,
            },
            (0, 1),
        );
        // Row reference offset pushes out of bounds
        check_err(
            Reference::Row {
                row: MAX_ROWS - 1,
                absolute: false,
            },
            (1, 0),
        );
    }

    // Offsets are (rows, columns); absolute components stay fixed and a lone
    // column or row reference is rejected without writing to the buffer.
    #[test]
    fn test_offset_range() {
        let check = |input: &[u8], offset, expected: &[u8]| {
            let mut buf = Vec::new();
            offset_range(input, offset, &mut buf).unwrap();
            assert_eq!(buf, expected);
        };
        let check_err = |input: &[u8], offset| {
            let mut buf = Vec::new();
            let res = offset_range(input, offset, &mut buf);
            assert!(res.is_err());
            assert_eq!(buf.len(), 0)
        };
        // Cell references
        check(b"A1", (1, 1), b"B2");
        check(b"$A1", (1, 1), b"$A2");
        check(b"A$1", (1, 1), b"B$1");
        check(b"$A$1", (1, 1), b"$A$1");
        // Column references
        check_err(b"E", (0, 1));
        check_err(b"$E", (0, 1));
        // Row references
        check_err(b"5", (1, 0));
        check_err(b"$5", (1, 0));
        // Cell ranges
        check(b"A1:B2", (1, 1), b"B2:C3");
        check(b"$A$1:$B$2", (1, 1), b"$A$1:$B$2");
        // Column ranges
        check(b"E:F", (0, 1), b"F:G");
        check(b"$E:$F", (0, 1), b"$E:$F");
        check(b"E:F", (1, 0), b"E:F");
        // Row ranges
        check(b"5:6", (1, 0), b"6:7");
        check(b"$5:$6", (1, 0), b"$5:$6");
        check(b"5:6", (0, 1), b"5:6");
    }

    // Malformed references are syntax errors; out-of-range ones report the
    // matching overflow error.
    #[test]
    fn test_parse_reference_overflow() {
        let check_col_err = |input: &[u8]| {
            assert!(matches!(
                Reference::parse(input),
                Err(XlsxError::ColumnNumberOverflow)
            ));
        };
        let check_row_err = |input: &[u8]| {
            assert!(matches!(
                Reference::parse(input),
                Err(XlsxError::RowNumberOverflow)
            ));
        };
        let check_syntax_err = |input: &[u8]| {
            assert!(matches!(
                Reference::parse(input),
                Err(XlsxError::Alphanumeric(_))
            ));
        };
        // Invalid syntax
        check_syntax_err(b"A$A1");
        check_syntax_err(b"A1$2");
        check_syntax_err(b"$$A1");
        check_syntax_err(b"$A$$1");
        check_syntax_err(b"A$$1");
        check_syntax_err(b"1A");
        check_syntax_err(b"1A1");
        check_syntax_err(b"A1B2");
        // Cell references
        check_col_err(b"XFE1");
        check_col_err(b"AAAA1");
        check_row_err(b"A1048577");
        check_row_err(b"A99999999999999999999");
        check_col_err(b"$XFE$1");
        // Column references
        check_col_err(b"XFE");
        check_col_err(b"$XFE");
        // Row references
        check_row_err(b"1048577");
        check_row_err(b"$1048577");
    }

    // Overflow is reported both for references that start out of bounds and
    // for valid ones pushed out of bounds by the offset.
    #[test]
    fn test_offset_range_overflow() {
        let check_col_err = |input: &[u8], offset| {
            let mut buf = Vec::new();
            assert!(matches!(
                offset_range(input, offset, &mut buf),
                Err(XlsxError::ColumnNumberOverflow)
            ));
        };
        let check_row_err = |input: &[u8], offset| {
            let mut buf = Vec::new();
            assert!(matches!(
                offset_range(input, offset, &mut buf),
                Err(XlsxError::RowNumberOverflow)
            ));
        };
        // Original reference is out of bounds
        check_col_err(b"XFE1", (0, 0));
        check_col_err(b"$XFE$1", (0, 0));
        check_row_err(b"A1048577", (0, 0));
        check_row_err(b"$A$1048577", (0, 0));
        check_col_err(b"XFE:XFE", (0, 0));
        check_row_err(b"1048577:1048577", (0, 0));
        // Offset pushes valid cell out of bounds
        check_col_err(b"XFD1", (0, 1));
        check_row_err(b"A1048576", (1, 0));
        check_row_err(b"XFD1048576", (1, 0));
        check_col_err(b"XFD1048576", (0, 1));
        // Offset pushes valid range out of bounds
        check_col_err(b"XFD:XFD", (0, 1));
        check_row_err(b"1048576:1048576", (1, 0));
    }

    // Only valid references outside double quotes are shifted; everything
    // else is copied through unchanged.
    #[test]
    fn test_replace_cell_names() {
        assert_eq!(replace_cell_names("A1", (1, 0)).unwrap(), "A2".to_owned());
        assert_eq!(
            replace_cell_names("CONCATENATE(A1, \"a\")", (1, 0)).unwrap(),
            "CONCATENATE(A2, \"a\")".to_owned()
        );
        assert_eq!(
            replace_cell_names(
                "A1 is a cell, B1 is another, also C107, but XFE123 is not and \"A3\" in quote wont change.",
                (1, 0)
            )
            .unwrap(),
            "A2 is a cell, B2 is another, also C108, but XFE123 is not and \"A3\" in quote wont change.".to_owned()
        );
        assert_eq!(
            replace_cell_names("ν•œκΈ€ A1 γƒ†γ‚Ήγƒˆ", (0, 1)).unwrap(),
            "ν•œκΈ€ B1 γƒ†γ‚Ήγƒˆ".to_owned()
        );
        assert_eq!(
            replace_cell_names("ABC\"asd\"123", (1, 0)).unwrap(),
            "ABC\"asd\"123".to_owned()
        );
        // Column ranges
        assert_eq!(
            replace_cell_names("SUM(E:F)", (0, 1)).unwrap(),
            "SUM(F:G)".to_owned()
        );
        assert_eq!(
            replace_cell_names("SUM($E:$F)", (0, 1)).unwrap(),
            "SUM($E:$F)".to_owned()
        );
        assert_eq!(
            replace_cell_names("SUM($E:F)", (0, 1)).unwrap(),
            "SUM($E:G)".to_owned()
        );
        // Row ranges
        assert_eq!(
            replace_cell_names("SUM(5:6)", (1, 0)).unwrap(),
            "SUM(6:7)".to_owned()
        );
        assert_eq!(
            replace_cell_names("SUM($5:$6)", (1, 0)).unwrap(),
            "SUM($5:$6)".to_owned()
        );
        assert_eq!(
            replace_cell_names("SUM($5:6)", (1, 0)).unwrap(),
            "SUM($5:7)".to_owned()
        );
        // Mixed with cell references
        assert_eq!(
            replace_cell_names("SUM(A1:A5,E:F)", (0, 1)).unwrap(),
            "SUM(B1:B5,F:G)".to_owned()
        );
        // Invalid syntax
        assert_eq!(
            replace_cell_names(
                "Valid: A1 Invalid: A1B1 A1$ $$A1 $A$$1 A$$1 A:1 1:A 1 A A1:1 A1:B A$A1 A1$2 $1 $A Valid: C1:D1",
                (1, 1)
            )
            .unwrap(),
            "Valid: B2 Invalid: A1B1 A1$ $$A1 $A$$1 A$$1 A:1 1:A 1 A A1:1 A1:B A$A1 A1$2 $1 $A Valid: D2:E2"
                .to_owned()
        );
    }

    // Builds an in-memory zip holding only `xl/sharedStrings.xml` and checks
    // that three shared strings are read back from it.
    #[test]
    fn test_read_shared_strings_with_namespaced_si_name() {
        let shared_strings_data = br#" String 1 String 2 String 3 "#;
        let mut buf = [0; 1000];
        let mut zip_writer = ZipWriter::new(std::io::Cursor::new(&mut buf[..]));
        let options =
            SimpleFileOptions::default().compression_method(zip::CompressionMethod::Stored);
        zip_writer
            .start_file("xl/sharedStrings.xml", options)
            .unwrap();
        zip_writer.write_all(shared_strings_data).unwrap();
        let zip_size = zip_writer.finish().unwrap().position() as usize;
        let zip = ZipArchive::new(std::io::Cursor::new(&buf[..zip_size])).unwrap();
        let zip_path_cache = build_zip_path_cache(&zip);
        let mut xlsx = Xlsx {
            zip,
            strings: vec![],
            sheets: vec![],
            tables: None,
            formats: vec![],
            is_1904: false,
            metadata: Metadata::default(),
            #[cfg(feature = "picture")]
            pictures: None,
            merged_regions: None,
            options: XlsxOptions::default(),
            zip_path_cache,
        };
        assert!(xlsx.read_shared_strings().is_ok());
        assert_eq!(3, xlsx.strings.len());
        assert_eq!("String 1", &xlsx.strings[0]);
        assert_eq!("String 2", &xlsx.strings[1]);
        assert_eq!("String 3", &xlsx.strings[2]);
    }
}