askalono-0.5.0/.cargo_vcs_info.json0000644000000001360000000000100126450ustar { "git": { "sha1": "988b57f1768103e88cef0289d82d8f671541b102" }, "path_in_vcs": "" }askalono-0.5.0/Cargo.lock0000644000000352250000000000100106270ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 3 [[package]] name = "adler2" version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" [[package]] name = "aho-corasick" version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" dependencies = [ "memchr", ] [[package]] name = "anstream" version = "0.6.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", "is_terminal_polyfill", "utf8parse", ] [[package]] name = "anstyle" version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" [[package]] name = "anstyle-parse" version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eb47de1e80c2b463c735db5b217a0ddc39d612e7ac9e2e96a5aed1f57616c1cb" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d36fc52c7f6c869915e99412912f22093507da8d9e942ceaf66fe4b7c14422a" dependencies = [ "windows-sys", ] [[package]] name = "anstyle-wincon" version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8" dependencies = [ "anstyle", "windows-sys", ] [[package]] name = "anyhow" version = 
"1.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37bf3594c4c988a53154954629820791dde498571819ae4ca50ca811e060cc95" [[package]] name = "askalono" version = "0.5.0" dependencies = [ "anyhow", "env_logger", "flate2", "lazy_static", "log", "rayon", "regex", "rmp-serde", "serde", "serde_json", "unicode-normalization", "zstd", ] [[package]] name = "autocfg" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "byteorder" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "cc" version = "1.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2e7962b54006dcfcc61cb72735f4d89bb97061dd6a7ed882ec6b8ee53714c6f" dependencies = [ "jobserver", "libc", "shlex", ] [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "colorchoice" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0" [[package]] name = "crc32fast" version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" dependencies = [ "cfg-if", ] [[package]] name = "crossbeam-deque" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ "crossbeam-utils", ] [[package]] name = "crossbeam-utils" version = "0.8.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80" [[package]] name = "either" version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" [[package]] name = "env_filter" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f2c92ceda6ceec50f43169f9ee8424fe2db276791afde7b2cd8bc084cb376ab" dependencies = [ "log", "regex", ] [[package]] name = "env_logger" version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e13fa619b91fb2381732789fc5de83b45675e882f66623b7d8cb4f643017018d" dependencies = [ "anstream", "anstyle", "env_filter", "humantime", "log", ] [[package]] name = "flate2" version = "1.0.34" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1b589b4dc103969ad3cf85c950899926ec64300a1a46d76c03a6072957036f0" dependencies = [ "crc32fast", "miniz_oxide", ] [[package]] name = "humantime" version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "is_terminal_polyfill" version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] name = "itoa" version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jobserver" version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" dependencies = [ "libc", ] [[package]] name = "lazy_static" version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "libc" version = "0.2.161" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" [[package]] name = "log" version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" [[package]] name = "memchr" version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "miniz_oxide" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" dependencies = [ "adler2", ] [[package]] name = "num-traits" version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", ] [[package]] name = "paste" version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "pkg-config" version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2" [[package]] name = "proc-macro2" version = "1.0.88" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c3a7fc5db1e57d5a779a352c8cdb57b29aa4c40cc69c3a68a7fedc815fbf2f9" dependencies = [ "unicode-ident", ] [[package]] name = "quote" version = "1.0.37" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] [[package]] name = "rayon" version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" dependencies = [ "either", "rayon-core", ] [[package]] name = "rayon-core" version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" dependencies = [ "crossbeam-deque", "crossbeam-utils", ] [[package]] name = "regex" version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8" dependencies = [ "aho-corasick", "memchr", "regex-automata", "regex-syntax", ] [[package]] name = "regex-automata" version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" dependencies = [ "aho-corasick", "memchr", "regex-syntax", ] [[package]] name = "regex-syntax" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "rmp" version = "0.8.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "228ed7c16fa39782c3b3468e974aec2795e9089153cd08ee2e9aefb3613334c4" dependencies = [ "byteorder", "num-traits", "paste", ] [[package]] name = "rmp-serde" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "52e599a477cf9840e92f2cde9a7189e67b42c57532749bf90aea6ec10facd4db" dependencies = [ "byteorder", "rmp", "serde", ] [[package]] name = "ryu" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" [[package]] name = "serde" version = "1.0.210" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" version = "1.0.210" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "serde_json" version = "1.0.132" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" dependencies = [ "itoa", "memchr", "ryu", "serde", ] [[package]] name = "shlex" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "syn" version = "2.0.82" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "83540f837a8afc019423a8edb95b52a8effe46957ee402287f4292fae35be021" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "tinyvec" version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" dependencies = [ "tinyvec_macros", ] [[package]] name = "tinyvec_macros" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "unicode-ident" version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" [[package]] name = "unicode-normalization" version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" dependencies = [ "tinyvec", ] [[package]] name = "utf8parse" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "windows-sys" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ "windows-targets", ] [[package]] name = "windows-targets" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", "windows_i686_gnu", "windows_i686_gnullvm", "windows_i686_msvc", "windows_x86_64_gnu", "windows_x86_64_gnullvm", "windows_x86_64_msvc", ] [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" [[package]] name = "windows_i686_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" [[package]] name = "windows_x86_64_gnu" version = 
"0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "zstd" version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" dependencies = [ "zstd-safe", ] [[package]] name = "zstd-safe" version = "7.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "54a3ab4db68cea366acc5c897c7b4d4d1b8994a9cd6e6f841f8964566a419059" dependencies = [ "zstd-sys", ] [[package]] name = "zstd-sys" version = "2.0.13+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" dependencies = [ "cc", "pkg-config", ] askalono-0.5.0/Cargo.toml0000644000000036710000000000100106520ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. 
[package] edition = "2021" name = "askalono" version = "0.5.0" authors = ["Jacob Peddicord "] build = false include = [ "/Cargo.toml", "/examples/**/*", "/LICENSE", "/NOTICE", "/README.md", "/src/**/*", ] autobins = false autoexamples = false autotests = false autobenches = false description = "a library to detect the contents of license files" readme = "README.md" license = "Apache-2.0" repository = "https://github.com/jpeddicord/askalono" [lib] name = "askalono" path = "src/lib.rs" [[example]] name = "annotate-text" path = "examples/annotate-text.rs" [[example]] name = "basic" path = "examples/basic.rs" [dependencies.anyhow] version = "1.0.44" [dependencies.lazy_static] version = "1.3.0" [dependencies.log] version = "0.4.6" [dependencies.regex] version = "1.1.7" [dependencies.rmp-serde] version = "1.1.1" [dependencies.serde] version = "1.0.92" features = ["derive"] [dependencies.serde_json] version = "1.0.39" optional = true [dependencies.unicode-normalization] version = "0.1.8" [dev-dependencies.env_logger] version = "0.11" [features] default = ["spdx"] gzip = ["flate2"] spdx = ["serde_json"] [target.'cfg(not(target_arch = "wasm32"))'.dependencies.flate2] version = "1.0.14" optional = true [target.'cfg(not(target_arch = "wasm32"))'.dependencies.rayon] version = "1.2" [target.'cfg(not(target_arch = "wasm32"))'.dependencies.zstd] version = "0.13" [target.'cfg(target_arch = "wasm32")'.dependencies.flate2] version = "1.0.14" askalono-0.5.0/Cargo.toml.orig000064400000000000000000000023731046102023000143310ustar 00000000000000[package] name = "askalono" version = "0.5.0" edition = "2021" description = "a library to detect the contents of license files" license = "Apache-2.0" repository = "https://github.com/jpeddicord/askalono" authors = ["Jacob Peddicord "] readme = "README.md" include = [ "/Cargo.toml", "/examples/**/*", "/LICENSE", "/NOTICE", "/README.md", "/src/**/*", ] [dependencies] anyhow = "1.0.44" lazy_static = "1.3.0" log = "0.4.6" regex = "1.1.7" rmp-serde 
= "1.1.1" serde = { version = "1.0.92", features = ["derive"] } unicode-normalization = "0.1.8" # spdx deps serde_json = { version = "1.0.39", optional = true } # gzip via flate2 is available via feature flag if you prefer that over # zstd. it's also enabled for wasm32 builds, as zstd doesn't yet compile for # that target. in order to build a gzip cache, you'll need the flag enabled # on your build machine too. [target.'cfg(not(target_arch = "wasm32"))'.dependencies] rayon = "1.2" zstd = "0.13" flate2 = { version = "1.0.14", optional = true } [target.'cfg(target_arch = "wasm32")'.dependencies] flate2 = "1.0.14" [dev-dependencies] env_logger = "0.11" [lib] name = "askalono" path = "src/lib.rs" [features] default = ["spdx"] gzip = ["flate2"] spdx = ["serde_json"] askalono-0.5.0/LICENSE000064400000000000000000000261361046102023000124520ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. askalono-0.5.0/NOTICE000064400000000000000000000011161046102023000123400ustar 00000000000000askalono Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. This tool does not provide legal advice and it is not a lawyer. It endeavors to match your input to a database of similar license texts, and tell you what it thinks is a close match. But, it can't tell you that the given license is authoritative over a project. Nor can it tell you what to do with a license once it's identified. You are not entitled to rely on the accuracy of the output of this tool, and should seek independent legal advice for any licensing questions that may arise from using this tool.askalono-0.5.0/README.md000064400000000000000000000123061046102023000127160ustar 00000000000000# askalono askalono is a library and command-line tool to help detect license texts. It's designed to be fast, accurate, and to support a wide variety of license texts. [![askalono crate](https://img.shields.io/crates/v/askalono.svg)](https://crates.io/crates/askalono) [![documentation](https://docs.rs/askalono/badge.svg)](https://docs.rs/askalono) ## Notice This tool does not provide legal advice and it is not a lawyer. It endeavors to match your input to a database of similar license texts, and tell you what it thinks is a close match. But, it can't tell you that the given license is authoritative over a project. 
Nor can it tell you what to do with a license once it's identified. You are not entitled to rely on the accuracy of the output of this tool, and should seek independent legal advice for any licensing questions that may arise from using this tool. ## Usage ### On the command line Pre-built binaries are available on the [Releases section](https://github.com/jpeddicord/askalono/releases) on GitHub. Rust developers may also grab a copy by running `cargo install askalono-cli`. Basic usage: askalono id <filename> where `<filename>` is a file (not folder) containing license text to analyze. In many projects, this file is called `LICENSE` or `COPYING`. askalono will analyze the text and output what it thinks it is. If askalono can't identify a file, it may simply be a license it just doesn't know. But, if it's actually source code with a file header (or footer, or anything in between) it may be able to dig deeper. To try this, pass the `--optimize` flag: askalono id --optimize <filename> If you'd like to discover license files within a directory tree, askalono offers a `crawl` action: askalono crawl <directory> ### As a library At the moment, `Store` and `LicenseContent` are exposed for usage. The best way to get an idea of how to use askalono as a library in its early state is to look at the [example](./examples/basic.rs). Some examples are also available in the [documentation](https://docs.rs/askalono). ## Details ### Implementation **tl;dr**: Sørensen–Dice scoring, multi-threading, compressed cache file At its core, askalono builds up bigrams (word pairs) of input text, and compares that with other license texts it knows about to see how similar they are. It scores each match with a [Sørensen–Dice](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient) coefficient and looks for the highest result. There is some minimal preprocessing happening before matching, but there are no hand-maintained regular expressions or curations used to determine a match. In detail, the matching process: 1.
Reads in input text 1. Normalizes everything it reasonably can -- Unicode characters, whitespace, quoting styles, etc. are all whittled down to something common. * Lines that tend to change a lot in licenses, like "Copyright 20XX Some Person", are additionally removed. 1. Tokenizes normalized text into a set of bigrams. 1. In parallel, the bigram set is compared with all of the other sets askalono knows about. 1. The resulting list is sorted, the top match identified, and result returned. To optimize startup, askalono builds up a database of license texts (applying the same normalization techniques described above), and persists this data to a MessagePack'd & zstd compressed cache file. This cache is loaded at startup, and is optionally embedded in the binary itself. ### Name It means "shallot" in Esperanto. You could try to derive a hidden meaning from it, but the real reason is really just that onions are delicious and Esperanto is an interesting language. In the author's opinion. (Sed la verkisto ne estas bonega Esperantisto, do bonvolu konversacii en la angla sur ĉi tiu projekto.) ### How is this different from other solutions? There are several other excellent projects in this space, including [licensee](https://github.com/benbalter/licensee), [LiD](https://source.codeaurora.org/external/qostg/lid/), and [ScanCode](https://github.com/nexB/scancode-toolkit). These projects attempt to get a larger picture of a project's licensing, and can look at other sources of metadata to try to find answers. Both of these inspired the creation of askalono, first as a curiosity, then as a serious project. askalono focuses on the problem of matching text itself -- it's often the piece that is difficult to optimize for speed and accuracy. askalono could be seen as a piece of plumbing in a larger system. 
The askalono command line application includes other goodies, such as a directory crawler, but these are largely for quick once-off use before diving in with more systematic solutions. (If you're looking for such a solution, take a look at the projects I just mentioned!) ### Where do the licenses come from? License data is sourced directly from SPDX: https://github.com/spdx/license-list-data askalono can parse the "json" format included in that repository to generate its cache. At this time, askalono is not taking requests for additional licenses in its default dataset -- its dataset is SPDX's own. ## Contributing Contributions are very welcome! See [CONTRIBUTING](CONTRIBUTING.md) for more info. ## License This library is licensed under the [Apache 2.0 License](LICENSE). askalono-0.5.0/examples/annotate-text.rs000064400000000000000000000037231046102023000164210ustar 00000000000000// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 use askalono::*; use std::collections::HashMap; use std::fs::File; use std::io::{stdin, Read}; enum Annotation { Begin(String), End, } fn main() { let args: Vec<_> = std::env::args().collect(); if args.len() != 2 { eprintln!("usage: annotate-text cache.bin.zstd < input.txt > output.html"); std::process::exit(1); } let cache = &args[1]; let store = Store::from_cache(File::open(cache).expect("couldn't read cache file")) .expect("error parsing cache"); let mut buf = String::new(); stdin() .read_to_string(&mut buf) .expect("couldn't read stdin"); let strategy = ScanStrategy::new(&store) .mode(ScanMode::TopDown) .confidence_threshold(0.80); let results = strategy .scan(&TextData::new(&buf)) .expect("scan didn't complete successfully"); let mut annotations = HashMap::with_capacity(results.containing.len() * 2); for result in &results.containing { annotations.insert( result.line_range.0, Annotation::Begin(result.license.name.to_owned()), ); annotations.insert(result.line_range.1, 
Annotation::End); } println!(""); println!("
{:#?}
", results); println!("
");
    for (i, line) in buf.lines().enumerate() {
        if annotations.contains_key(&i) {
            let a = annotations.get(&i).unwrap();
            match a {
                Annotation::Begin(license) => {
                    print!(
                        r#"
"#, license ); } Annotation::End => { print!("
"); } } } println!("{}", line); } println!("
"); } askalono-0.5.0/examples/basic.rs000064400000000000000000000020771046102023000147100ustar 00000000000000// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 use askalono::{Store, TextData}; use std::path::Path; // Note: this example is stupid slow because it loads and parses licenses // each run instead of persisting to a cache file. Expect runs to take ~20s; // 19.99s of that to be loading license data. fn main() { // create a new license text store let mut store = Store::new(); // load up data from SPDX JSON files, opting to not embed full text println!("Loading SPDX data, this may take a while..."); store .load_spdx( Path::new(concat!( env!("CARGO_MANIFEST_DIR"), "/datasets/modules/spdx-license-list-data/json/details" )), false, ) .unwrap(); // load input text println!("Parsing input text"); let input: TextData = include_str!("../LICENSE").into(); // do the heavy lifting println!("Scoring licenses"); let matched = store.analyze(&input); println!("{:?}", matched); } askalono-0.5.0/src/lib.rs000064400000000000000000000011551046102023000133420ustar 00000000000000// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 //! askalono is a crate that is Quite Good at detecting licenses from text. //! //! To get started, have a look at the `Store` struct, or one of the examples //! in the `examples` directory. #![warn(missing_docs)] #![allow(clippy::match_bool, clippy::useless_format)] mod license; mod ngram; mod preproc; mod store; mod strategy; pub use crate::{ license::{LicenseType, TextData}, store::{Match, Store}, strategy::{ContainedResult, IdentifiedLicense, ScanMode, ScanResult, ScanStrategy}, }; askalono-0.5.0/src/license.rs000064400000000000000000000347171046102023000142300ustar 00000000000000// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 use std::{collections::HashMap, fmt}; use serde::{Deserialize, Serialize}; use crate::{ ngram::NgramSet, preproc::{apply_aggressive, apply_normalizers}, }; /// The type of a license entry (typically in a `Store`). #[derive(Clone, Copy, PartialEq, Debug, Serialize, Deserialize)] #[serde(rename_all = "lowercase")] pub enum LicenseType { /// The canonical text of the license. Original, /// A license header. There may be more than one in a `Store`. Header, /// An alternate form of a license. This is intended to be used for /// alternate _formats_ of a license, not for variants where the text has /// different meaning. Not currently used in askalono's SPDX dataset. Alternate, } impl fmt::Display for LicenseType { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "{}", match *self { LicenseType::Original => "original text", LicenseType::Header => "license header", LicenseType::Alternate => "alternate text", } ) } } /// A structure representing compiled text/matching data. /// /// This is the key structure used to compare two texts against one another. It /// handles pre-processing the text to n-grams, scoring, and optimizing the /// result to try to identify specific details about a match. /// /// # Examples /// /// Basic scoring of two texts: /// /// ``` /// use askalono::TextData; /// /// let license = TextData::from("My First License"); /// let sample = TextData::from("copyright 20xx me irl\n\n // my first license"); /// assert_eq!(sample.match_score(&license), 1.0); /// ``` /// /// The above example is a perfect match, as identifiable copyright statements /// are stripped out during pre-processing. 
/// /// Building on that, TextData is able to tell you _where_ in the text a /// license is located: /// /// ``` /// # use std::error::Error; /// # use askalono::TextData; /// # fn main() -> Result<(), Box> { /// # let license = TextData::from("My First License"); /// let sample = TextData::from("copyright 20xx me irl\n// My First License\nfn hello() {\n ..."); /// let (optimized, score) = sample.optimize_bounds(&license); /// assert_eq!((1, 2), optimized.lines_view()); /// assert!(score > 0.99f32, "license within text matches"); /// # Ok(()) /// # } /// ``` #[derive(Serialize, Deserialize, Clone, Debug)] pub struct TextData { match_data: NgramSet, lines_view: (usize, usize), lines_normalized: Option>, text_processed: Option, } const TEXTDATA_TEXT_ERROR: &str = "TextData does not have original text"; impl TextData { /// Create a new TextData structure from a string. /// /// The given text will be normalized, then smashed down into n-grams for /// matching. By default, the normalized text is stored inside the /// structure for future diagnostics. This is necessary for optimizing a /// match and for diffing against other texts. If you don't want this extra /// data, you can call `without_text` throw it out. Generally, as a user of /// this library you want to keep the text data, but askalono will throw it /// away in its own `Store` as it's not needed. pub fn new(text: &str) -> TextData { let normalized = apply_normalizers(text); let normalized_joined = normalized.join("\n"); let processed = apply_aggressive(&normalized_joined); let match_data = NgramSet::from_str(&processed, 2); TextData { match_data, lines_view: (0, normalized.len()), lines_normalized: Some(normalized), text_processed: Some(processed), } } /// Consume this `TextData`, returning one without normalized/processed /// text stored. /// /// Unless you know you don't want the text, you probably don't want to use /// this. Other methods on `TextData` require that text is present. 
pub fn without_text(self) -> Self { TextData { match_data: self.match_data, lines_view: (0, 0), lines_normalized: None, text_processed: None, } } /// Get the bounds of the active line view. /// /// This represents the "active" region of lines that matches are generated /// from. The bounds are a 0-indexed `(start, end)` tuple, with inclusive /// start and exclusive end indicies. See `optimize_bounds`. /// /// This is largely for informational purposes; other methods in /// `TextView`, such as `lines` and `match_score`, will already account for /// the line range. However, it's useful to call it after running /// `optimize_bounds` to discover where the input text was discovered. pub fn lines_view(&self) -> (usize, usize) { self.lines_view } /// Clone this `TextView`, creating a copy with the given view. /// /// This will re-generate match data for the given view. It's used in /// `optimize_bounds` to shrink/expand the view of the text to discover /// bounds. /// /// Other methods on `TextView` respect this boundary, so it's not needed /// outside this struct. pub fn with_view(&self, start: usize, end: usize) -> Self { let view = &self.lines_normalized.as_ref().expect(TEXTDATA_TEXT_ERROR)[start..end]; let view_joined = view.join("\n"); let processed = apply_aggressive(&view_joined); TextData { match_data: NgramSet::from_str(&processed, 2), lines_view: (start, end), lines_normalized: self.lines_normalized.clone(), text_processed: Some(processed), } } /// "Erase" the current lines in view and restore the view to its original /// bounds. /// /// For example, consider a file with two licenses in it. One was identified /// (and located) with `optimize_bounds`. Now you want to find the other: /// white-out the matched lines, and re-run the overall search to find a /// new high score. pub fn white_out(&self) -> Self { // note that we're not using the view here... 
let lines = self.lines_normalized.as_ref().expect(TEXTDATA_TEXT_ERROR); // ...because it's used here to exclude lines let new_normalized: Vec = lines .iter() .enumerate() .map(|(i, line)| { if i >= self.lines_view.0 && i < self.lines_view.1 { "".to_string() } else { line.clone() } }) .collect(); let processed = apply_aggressive(&new_normalized.join("\n")); TextData { match_data: NgramSet::from_str(&processed, 2), lines_view: (0, new_normalized.len()), lines_normalized: Some(new_normalized), text_processed: Some(processed), } } /// Get a slice of the normalized lines in this `TextData`. pub fn lines(&self) -> &[String] { &self.lines_normalized.as_ref().expect(TEXTDATA_TEXT_ERROR) [self.lines_view.0..self.lines_view.1] } #[doc(hidden)] pub fn text_processed(&self) -> Option<&str> { self.text_processed.as_ref().map(String::as_ref) } /// Compare this `TextData` with another, returning a similarity score. /// /// This is what's used during analysis to rank licenses. pub fn match_score(&self, other: &TextData) -> f32 { self.match_data.dice(&other.match_data) } #[cfg(feature = "spdx")] pub(crate) fn eq_data(&self, other: &Self) -> bool { self.match_data.eq(&other.match_data) } /// Attempt to optimize a known match to locate possible line ranges. /// /// Returns a new `TextData` struct and a score. The returned struct is a /// clone of `self`, with its view set to the best match against `other`. /// /// This will respect any views set on the TextData (an optimized result /// won't go outside the original view). /// /// Note that this won't be 100% optimal if there are blank lines /// surrounding the actual match, since successive blank lines in a range /// will likely have the same score. /// /// You should check the value of `lines_view` on the returned struct to /// find the line ranges. 
pub fn optimize_bounds(&self, other: &TextData) -> (Self, f32) { assert!(self.lines_normalized.is_some(), "{}", TEXTDATA_TEXT_ERROR); let view = self.lines_view; // optimize the ending bounds of the text match let (end_optimized, _) = self.search_optimize( &|end| self.with_view(view.0, end).match_score(other), &|end| self.with_view(view.0, end), ); let new_end = end_optimized.lines_view.1; // then optimize the starting bounds let (optimized, score) = end_optimized.search_optimize( &|start| end_optimized.with_view(start, new_end).match_score(other), &|start| end_optimized.with_view(start, new_end), ); (optimized, score) } fn search_optimize( &self, score: &dyn Fn(usize) -> f32, value: &dyn Fn(usize) -> Self, ) -> (Self, f32) { // cache score checks, since they're kinda expensive let mut memo: HashMap = HashMap::new(); let mut check_score = |index: usize| -> f32 { *memo.entry(index).or_insert_with(|| score(index)) }; fn search(score: &mut dyn FnMut(usize) -> f32, left: usize, right: usize) -> (usize, f32) { if right - left <= 3 { // find the index of the highest score in the remaining items return (left..=right) .map(|x| (x, score(x))) .fold((0usize, 0f32), |acc, x| if x.1 >= acc.1 { x } else { acc }); } let low = (left * 2 + right) / 3; let high = (left + right * 2) / 3; let score_low = score(low); let score_high = score(high); if score_low > score_high { search(score, left, high - 1) } else { search(score, low + 1, right) } } let optimal = search(&mut check_score, self.lines_view.0, self.lines_view.1); (value(optimal.0), optimal.1) } } impl<'a> From<&'a str> for TextData { fn from(text: &'a str) -> Self { Self::new(text) } } impl From for TextData { fn from(text: String) -> Self { Self::new(&text) } } #[cfg(test)] mod tests { use super::*; // psst: // cargo test -- --nocapture #[test] fn optimize_bounds() { let license_text = "this is a license text\nor it pretends to be one\nit's just a test"; let sample_text = "this is a license text\nor it pretends to be 
one\nit's just a test\nwords\n\nhere is some\ncode\nhello();\n\n//a comment too"; let license = TextData::from(license_text).without_text(); let sample = TextData::from(sample_text); let (optimized, _) = sample.optimize_bounds(&license); println!("{:?}", optimized.lines_view); println!("{:?}", optimized.lines_normalized); assert_eq!((0, 3), optimized.lines_view); // add more to the string, try again (avoid int trunc screwups) let sample_text = format!("{}\none more line", sample_text); let sample = TextData::from(sample_text.as_str()); let (optimized, _) = sample.optimize_bounds(&license); println!("{:?}", optimized.lines_view); println!("{:?}", optimized.lines_normalized); assert_eq!((0, 3), optimized.lines_view); // add to the beginning too let sample_text = format!("some content\nat\n\nthe beginning\n{}", sample_text); let sample = TextData::from(sample_text.as_str()); let (optimized, _) = sample.optimize_bounds(&license); println!("{:?}", optimized.lines_view); println!("{:?}", optimized.lines_normalized); // end bounds at 7 and 8 have the same score, since they're empty lines (not // counted). askalono is not smart enough to trim this as close as it // can. 
assert!( (4, 7) == optimized.lines_view || (4, 8) == optimized.lines_view, "bounds are (4, 7) or (4, 8)" ); } // if a view is set on the text data, optimize_bounds must not find text // outside of that range #[test] fn optimize_doesnt_grow_view() { let sample_text = "0\n1\n2\naaa aaa\naaa\naaa\naaa\n7\n8"; let license_text = "aaa aaa aaa aaa aaa"; let sample = TextData::from(sample_text); let license = TextData::from(license_text).without_text(); // sanity: the optimized bounds should be at (3, 7) let (optimized, _) = sample.optimize_bounds(&license); assert_eq!((3, 7), optimized.lines_view); // this should still work let sample = sample.with_view(3, 7); let (optimized, _) = sample.optimize_bounds(&license); assert_eq!((3, 7), optimized.lines_view); // but if we shrink the view further, it shouldn't be outside that range let sample = sample.with_view(4, 6); let (optimized, _) = sample.optimize_bounds(&license); assert_eq!((4, 6), optimized.lines_view); // restoring the view should still be OK too let sample = sample.with_view(0, 9); let (optimized, _) = sample.optimize_bounds(&license); assert_eq!((3, 7), optimized.lines_view); } // ensure we don't choke on small TextData matches #[test] fn match_small() { let a = TextData::from("a b"); let b = TextData::from("a\nlong\nlicense\nfile\n\n\n\n\nabcdefg"); let x = a.match_score(&b); let y = b.match_score(&a); assert_eq!(x, y); } // don't choke on empty TextData either #[test] fn match_empty() { let a = TextData::from(""); let b = TextData::from("a\nlong\nlicense\nfile\n\n\n\n\nabcdefg"); let x = a.match_score(&b); let y = b.match_score(&a); assert_eq!(x, y); } #[test] fn view_and_white_out() { let a = TextData::from("aaa\nbbb\nccc\nddd"); assert_eq!(Some("aaa bbb ccc ddd"), a.text_processed()); let b = a.with_view(1, 3); assert_eq!(2, b.lines().len()); assert_eq!(Some("bbb ccc"), b.text_processed()); let c = b.white_out(); assert_eq!(Some("aaa ddd"), c.text_processed()); } } 
askalono-0.5.0/src/ngram.rs000064400000000000000000000066761046102023000137150ustar 00000000000000// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 use std::{ cmp::min, collections::{hash_map::Iter, HashMap, VecDeque}, }; use serde::{Deserialize, Serialize}; #[derive(PartialEq, Clone, Debug, Serialize, Deserialize)] pub struct NgramSet { map: HashMap, // once Rust supports it, it'd be nice to make this // a type parameter & specialize n: u8, size: usize, } impl NgramSet { pub fn new(n: u8) -> NgramSet { NgramSet { map: HashMap::new(), n, size: 0, } } pub fn from_str(s: &str, n: u8) -> NgramSet { let mut set = NgramSet::new(n); set.analyze(s); set } pub fn analyze(&mut self, s: &str) { let words = s.split(' '); let mut deque: VecDeque<&str> = VecDeque::with_capacity(self.n as usize); for w in words { deque.push_back(w); if deque.len() == self.n as usize { let parts = deque.iter().cloned().collect::>(); self.add_gram(parts.join(" ")); deque.pop_front(); } } } fn add_gram(&mut self, gram: String) { let n = self.map.entry(gram).or_insert(0); *n += 1; self.size += 1; } pub fn get(&self, gram: &str) -> u32 { if let Some(count) = self.map.get(gram) { *count } else { 0 } } pub fn len(&self) -> usize { self.size } pub fn is_empty(&self) -> bool { self.size == 0 } pub fn dice(&self, other: &NgramSet) -> f32 { // no sense comparing sets of different sizes if other.n != self.n { return 0f32; } // there's obviously no match if either are empty strings; // if we don't check here we could end up with NaN below // when both are empty if self.is_empty() || other.is_empty() { return 0f32; } // choose the smaller map to iterate let (x, y) = if self.len() < other.len() { (self, other) } else { (other, self) }; let mut matches = 0; for (gram, count) in x { matches += min(*count, y.get(gram)); } (2.0 * matches as f32) / ((self.len() + other.len()) as f32) } } impl<'a> IntoIterator for &'a NgramSet { type Item = (&'a String, 
&'a u32); type IntoIter = Iter<'a, String, u32>; fn into_iter(self) -> Self::IntoIter { self.map.iter() } } #[cfg(test)] mod tests { use super::*; // this is a pretty banal test, but it's a starting point :P #[test] fn can_construct() { let set = NgramSet::new(2); assert_eq!(set.size, 0); assert_eq!(set.n, 2); } #[test] fn no_nan() { let a = NgramSet::from_str("", 2); let b = NgramSet::from_str("", 2); let score = a.dice(&b); assert!(!score.is_nan()); } #[test] fn same_size() { let a = NgramSet::from_str("", 2); let b = NgramSet::from_str("", 3); let score = a.dice(&b); assert_eq!(0f32, score); } #[test] fn identical() { let a = NgramSet::from_str("one two three apple banana", 2); let b = NgramSet::from_str("one two three apple banana", 2); let score = a.dice(&b); assert_eq!(1f32, score); } } askalono-0.5.0/src/preproc.rs000064400000000000000000000314551046102023000142540ustar 00000000000000// Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 use std::borrow::Cow; use std::collections::HashMap; use lazy_static::lazy_static; use log::debug; use regex::{Regex, Replacer}; use unicode_normalization::UnicodeNormalization; type PreprocFn = dyn Fn(Cow) -> Cow; trait CowRegex { fn replace_all_cow<'a, R: Replacer>(&self, text: Cow<'a, str>, replace: R) -> Cow<'a, str>; } impl CowRegex for Regex { fn replace_all_cow<'a, R: Replacer>(&self, text: Cow<'a, str>, replace: R) -> Cow<'a, str> { match text { Cow::Borrowed(find) => self.replace_all(find, replace), Cow::Owned(find) => Cow::Owned(self.replace_all(&find, replace).into_owned()), } } } /// A list of preprocessors that normalize text without removing anything /// substantial. These operate on one line at a time. 
pub const PREPROC_NORMALIZE: [&PreprocFn; 6] = [ &normalize_unicode, &remove_junk, &blackbox_urls, &normalize_horizontal_whitespace, &normalize_punctuation, &trim, ]; /// A list of preprocessors that more aggressively normalize/mangle text /// to make for friendlier matching. May remove statements and lines, and /// more heavily normalize punctuation. pub const PREPROC_AGGRESSIVE: [&PreprocFn; 8] = [ &remove_common_tokens, &normalize_vertical_whitespace, &remove_punctuation, &lowercaseify, &remove_title_line, &remove_copyright_statements, &collapse_whitespace, &trim, ]; pub fn apply_normalizers(text: &str) -> Vec { let mut lines = Vec::new(); for line in text.split('\n') { let mut out: Cow = line.into(); for preproc in &PREPROC_NORMALIZE { out = preproc(out); } lines.push(out.into()); } debug!("Normalized to:\n{:?}\n---", lines); lines } pub fn apply_aggressive(text: &str) -> String { let mut out = text.into(); for preproc in &PREPROC_AGGRESSIVE { out = preproc(out); } debug!("Aggressively normalized to:\n{}\n---", &out); out.into() } // Line-by-line normalizers fn normalize_unicode(input: Cow) -> Cow { input.nfc().collect::().into() } fn remove_junk(input: Cow) -> Cow { lazy_static! { static ref RX: Regex = Regex::new(r"[^\w\s\pP]+").unwrap(); } RX.replace_all_cow(input, "") } fn blackbox_urls(input: Cow) -> Cow { lazy_static! { static ref RX: Regex = Regex::new(r"https?://\S+").unwrap(); } RX.replace_all_cow(input, "http://blackboxed/url") } fn normalize_horizontal_whitespace(input: Cow) -> Cow { lazy_static! { // including slashes here as well static ref RX: Regex = Regex::new(r"(?x)[ \t\p{Zs} \\ / \| \x2044 ]+").unwrap(); } RX.replace_all_cow(input, " ") } fn normalize_punctuation(input: Cow) -> Cow { lazy_static! 
/// Strip leading and trailing whitespace.
///
/// Borrowed input stays borrowed. Owned input is returned as-is when there
/// is nothing to trim, so no new allocation is made in that case.
fn trim(input: Cow<str>) -> Cow<str> {
    match input {
        Cow::Borrowed(text) => Cow::Borrowed(text.trim()),
        Cow::Owned(text) => {
            let trimmed = text.trim();
            if trimmed.len() == text.len() {
                // nothing was removed; hand the original string back
                Cow::Owned(text)
            } else {
                Cow::Owned(trimmed.to_owned())
            }
        }
    }
}
fn trim_byte_adjusted(s: &str, idx: usize) -> &str { if idx >= s.len() { return s; } if let Some(sub) = s.get(..idx) { sub } else { // Inspect bytes before index let trailing_continuation = s.as_bytes()[..idx] .iter() .rev() // Multibyte characters are encoded in UTF-8 in the following manner: // first byte | rest of bytes // 1..10xxxxx 10xxxxxx // ^^^^ number of ones is equal to number of bytes in codepoint // Number of 10xxxxxx bytes in codepoint is at most 3 in valid UTF-8-encoded string, // so this loop actually runs a little iterations .take_while(|&byte| byte & 0b1100_0000 == 0b1000_0000) .count(); // Subtract 1 to take the first byte in codepoint into account &s[..idx - trailing_continuation - 1] } } fn lcs_substr<'a>(f_line: &'a str, s_line: &'a str) -> &'a str { // find the length of common prefix in byte representations of strings let prefix_len = f_line .as_bytes() .iter() .zip(s_line.as_bytes()) .take_while(|(&f, &s)| f == s) .count(); trim_byte_adjusted(f_line, prefix_len).trim() } fn remove_common_tokens(input: Cow) -> Cow { let lines: Vec<&str> = input.split('\n').collect(); let mut l_iter = lines.iter(); let mut prefix_counts = HashMap::<_, u32>::new(); // pass 1: iterate through the text to record common prefixes if let Some(first) = l_iter.next() { let mut pair = ("", first); let line_pairs = std::iter::from_fn(|| { pair = (pair.1, l_iter.next()?); Some(pair) }); for (a, b) in line_pairs { let common = lcs_substr(a, b); // why start at 1, then immediately add 1? // lcs_substr compares two lines! // this doesn't need to be exact, just consistent. 
if common.len() > 3 { *prefix_counts.entry(common).or_insert(1) += 1; } } } // look at the most common observed prefix let most_common = match prefix_counts.iter().max_by_key(|&(_k, v)| v) { Some((prefix, _count)) => prefix, None => return input, }; // reconcile the count with other longer prefixes that may be stored let common_count = prefix_counts .iter() .filter_map(|(s, count)| Some(count).filter(|_| s.starts_with(most_common))) .sum::(); // the common string must be at least 80% of the text let prefix_threshold = (0.8f32 * lines.len() as f32) as _; if common_count < prefix_threshold { return input; } // pass 2: remove that substring lines .iter() .map(|line| { if let Some(stripped) = line.strip_prefix(most_common) { stripped } else { line } .trim() }) .collect::>() .join("\n") .into() } fn normalize_vertical_whitespace(input: Cow) -> Cow { lazy_static! { static ref RX_MISC: Regex = Regex::new(r"[\r\n\v\f]").unwrap(); static ref RX_NUM: Regex = Regex::new(r"\n{3,}").unwrap(); } let mut out = input; out = RX_MISC.replace_all_cow(out, "\n"); out = RX_NUM.replace_all_cow(out, "\n\n"); out } fn remove_punctuation(input: Cow) -> Cow { lazy_static! { static ref RX: Regex = Regex::new(r"[^\w\s]+").unwrap(); } RX.replace_all_cow(input, "") } fn lowercaseify(input: Cow) -> Cow { input.to_lowercase().into() } fn remove_title_line(input: Cow) -> Cow { lazy_static! { static ref RX: Regex = Regex::new(r"^.*license( version \S+)?( copyright.*)?\n\n").unwrap(); } RX.replace_all_cow(input, "") } fn remove_copyright_statements(input: Cow) -> Cow { lazy_static! 
{ static ref RX: Regex = Regex::new( r"(?mx) ( # either a new paragraph, or the beginning of the text + empty lines (\n\n|\A\n*) # any number of lines starting with 'copyright' followed by a new paragraph (^\x20*copyright.*?$)+ \n\n ) | ( # or the very first line if it has 'copyright' in it \A.*copyright.*$ ) | ( # or any lines that really look like a copyright statement ^copyright (\s+(c|\d+))+ .*?$ ) " ) .unwrap(); } RX.replace_all_cow(input, "\n\n") } fn collapse_whitespace(input: Cow) -> Cow { lazy_static! { static ref RX: Regex = Regex::new(r"\s+").unwrap(); } RX.replace_all_cow(input, " ") } #[cfg(test)] mod tests { use super::*; #[test] fn trim_byte_adjusted_respects_multibyte_characters() { let input = "RustКраб橙蟹🦀"; let expected = [ "", "R", "Ru", "Rus", "Rust", "Rust", "RustК", "RustК", "RustКр", "RustКр", "RustКра", "RustКра", "RustКраб", "RustКраб", "RustКраб", "RustКраб橙", "RustКраб橙", "RustКраб橙", "RustКраб橙蟹", "RustКраб橙蟹", "RustКраб橙蟹", "RustКраб橙蟹", "RustКраб橙蟹🦀", ]; for (i, &outcome) in expected.iter().enumerate() { assert_eq!(outcome, trim_byte_adjusted(input, i)) } } #[test] fn greatest_substring_removal() { // the funky string syntax \n\ is to add a newline but skip the // leading whitespace in the source code let text = "%%Copyright: Copyright\n\ %%Copyright: All rights reserved.\n\ %%Copyright: Redistribution and use in source and binary forms, with or\n\ %%Copyright: without modification, are permitted provided that the\n\ %%Copyright: following conditions are met:\n\ \n\ abcd"; let new_text = remove_common_tokens(text.into()); println!("{}", new_text); assert!( !new_text.contains("%%Copyright"), "new text shouldn't contain the common substring" ); } #[test] fn greatest_substring_removal_keep_inner() { let text = "this string should still have\n\ this word -> this <- in it even though\n\ this is still the most common word"; let new_text = remove_common_tokens(text.into()); println!("-- {}", new_text); // the "this" at the start of the line 
can be discarded... assert!(!new_text.contains("\nthis")); // ...but the "this" in the middle of sentences shouldn't be assert!(new_text.contains("this")); let text = "aaaa bbbb cccc dddd\n\ eeee ffff aaaa gggg\n\ hhhh iiii jjjj"; let new_text = remove_common_tokens(text.into()); println!("-- {}", new_text); assert!(new_text.contains("aaaa")); // similar to above test } #[test] fn greatest_substring_removal_42() { // https://github.com/jpeddicord/askalono/issues/42 let text = "AAAAAA line 1\n\ AAAAAA another line here\n\ AAAAAA yet another line here\n\ AAAAAA how long will this go on\n\ AAAAAA another line here\n\ AAAAAA more\n\ AAAAAA one more\n\ AAAAAA two more\n\ AAAAAA three more\n\ AAAAAA four more\n\ AAAAAA five more\n\ AAAAAA six more\n\ \n\ preserve\n\ keep"; let new_text = remove_common_tokens(text.into()); println!("{}", new_text); assert!(new_text.contains("preserve")); assert!(new_text.contains("keep")); assert!(!new_text.contains("AAAAAA")); } #[test] fn normalize_no_line_mangle() { let text = "some license copyright 2012 person \tlicense\r text \t goes here"; let text_lines = text.lines().count(); let normalized = apply_normalizers(text); let normalized_lines = normalized.len(); assert_eq!( text_lines, normalized_lines, "normalizers shouldnt change line counts" ); } } askalono-0.5.0/src/store/analyze.rs000064400000000000000000000111471046102023000153750ustar 00000000000000// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 use std::{cmp::Ordering, fmt}; use crate::{ license::LicenseType, license::TextData, store::base::{LicenseEntry, Store}, }; /// Information about text that was compared against licenses in the store. /// /// This only contains information about the overall match; to uncover more /// data you can run methods like `optimize_bounds` on `TextData`. /// /// Its lifetime is tied to the lifetime of the `Store` it was generated from. 
#[derive(Clone)]
pub struct Match<'a> {
    /// Confidence score of the match, ranging from 0 to 1.
    pub score: f32,
    /// The name of the closest matching license in the `Store`. This will
    /// always be something that exists in the store, regardless of the score.
    pub name: &'a str,
    /// The type of the license that matched. Useful to know if the match was
    /// the complete text, a header, or something else.
    pub license_type: LicenseType,
    /// A reference to the license data that matched inside the `Store`. May be
    /// useful for diagnostic purposes or to further optimize the result.
    pub data: &'a TextData,
}

/// A lighter version of Match to be used during analysis.
/// Reduces the need for cloning a bunch of fields.
struct PartialMatch<'a> {
    pub name: &'a str,
    pub score: f32,
    pub license_type: LicenseType,
    pub data: &'a TextData,
}

// Ordering looks at the score only, so candidates can be ranked by confidence.
impl<'a> PartialOrd for PartialMatch<'a> {
    fn partial_cmp(&self, other: &PartialMatch<'_>) -> Option<Ordering> {
        self.score.partial_cmp(&other.score)
    }
}

// Equality additionally requires the same license name and type; the
// referenced `data` is not compared.
impl<'a> PartialEq for PartialMatch<'a> {
    fn eq(&self, other: &PartialMatch<'_>) -> bool {
        self.score == other.score
            && self.name == other.name
            && self.license_type == other.license_type
    }
}

// Hand-written so the (potentially large) `data` field is left out.
impl<'a> fmt::Debug for Match<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(
            f,
            "Match {{ score: {}, name: {}, license_type: {:?} }}",
            self.score, self.name, self.license_type
        )
    }
}

impl Store {
    /// Compare the given `TextData` against all licenses in the `Store`.
    ///
    /// This parallelizes the search as much as it can to find the best match.
    /// Once a match is obtained, it can be optimized further; see methods on
    /// `TextData` for more information.
pub fn analyze<'a>(&'a self, text: &TextData) -> Match<'a> { let mut res: Vec>; let analyze_fold = |mut acc: Vec>, (name, data): (&'a String, &'a LicenseEntry)| { acc.push(PartialMatch { score: data.original.match_score(text), name, license_type: LicenseType::Original, data: &data.original, }); data.alternates.iter().for_each(|alt| { acc.push(PartialMatch { score: alt.match_score(text), name, license_type: LicenseType::Alternate, data: alt, }) }); data.headers.iter().for_each(|head| { acc.push(PartialMatch { score: head.match_score(text), name, license_type: LicenseType::Header, data: head, }) }); acc }; // parallel analysis #[cfg(not(target_arch = "wasm32"))] { use rayon::prelude::*; res = self .licenses .par_iter() .fold(Vec::new, analyze_fold) .reduce( Vec::new, |mut a: Vec>, b: Vec>| { a.extend(b); a }, ); res.par_sort_unstable_by(|a, b| b.partial_cmp(a).unwrap()); } // single-threaded analysis #[cfg(target_arch = "wasm32")] { res = self .licenses .iter() // len of licenses isn't strictly correct, but it'll do .fold(Vec::with_capacity(self.licenses.len()), analyze_fold); res.sort_unstable_by(|a, b| b.partial_cmp(a).unwrap()); } let m = &res[0]; Match { score: m.score, name: m.name, license_type: m.license_type, data: m.data, } } } askalono-0.5.0/src/store/base.rs000064400000000000000000000103541046102023000146430ustar 00000000000000// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 use std::collections::HashMap; use anyhow::{format_err, Error}; use serde::{Deserialize, Serialize}; use crate::{license::LicenseType, license::TextData}; #[derive(Serialize, Deserialize)] pub(crate) struct LicenseEntry { pub original: TextData, pub aliases: Vec, pub headers: Vec, pub alternates: Vec, } /// A representation of a collection of known licenses. /// /// This struct is generally what you want to start with if you're looking to /// match text against a database of licenses. 
Load a cache from disk using /// `from_cache`, then use the `analyze` function to determine what a text most /// closely matches. /// /// # Examples /// /// ```rust,should_panic /// # use std::fs::File; /// # use std::error::Error; /// use askalono::{Store, TextData}; /// /// # fn main() -> Result<(), Box> { /// let store = Store::from_cache(File::open("askalono-cache.bin.zstd")?)?; /// let result = store.analyze(&TextData::from("what's this")); /// # Ok(()) /// # } /// ``` #[derive(Default, Serialize, Deserialize)] pub struct Store { pub(crate) licenses: HashMap, } impl LicenseEntry { pub fn new(original: TextData) -> LicenseEntry { LicenseEntry { original, aliases: Vec::new(), alternates: Vec::new(), headers: Vec::new(), } } } impl Store { /// Create a new `Store`. /// /// More often, you probably want to use `from_cache` instead of creating /// an empty store. pub fn new() -> Store { Store { licenses: HashMap::new(), } } /// Get the number of licenses in the store. /// /// This only counts licenses by name -- headers, aliases, and alternates /// aren't included in the count. pub fn len(&self) -> usize { self.licenses.len() } /// Check if the store is empty. pub fn is_empty(&self) -> bool { self.licenses.is_empty() } /// Get all licenses by name via iterator. pub fn licenses(&self) -> impl Iterator { self.licenses.keys() } /// Get a license's standard TextData by name. pub fn get_original(&self, name: &str) -> Option<&TextData> { Some(&self.licenses.get(name)?.original) } /// Add a single license to the store. /// /// If the license with the given name already existed, it and all of its /// variants will be replaced. pub fn add_license(&mut self, name: String, data: TextData) { let entry = LicenseEntry::new(data); self.licenses.insert(name, entry); } /// Add a variant (a header or alternate formatting) of a given license to /// the store. /// /// The license must already exist. 
This function cannot be used to replace /// the original/canonical text of the license. pub fn add_variant( &mut self, name: &str, variant: LicenseType, data: TextData, ) -> Result<(), Error> { let entry = self .licenses .get_mut(name) .ok_or_else(|| format_err!("license {} not present in store", name))?; match variant { LicenseType::Alternate => { entry.alternates.push(data); } LicenseType::Header => { entry.headers.push(data); } _ => { return Err(format_err!("variant type not applicable for add_variant")); } }; Ok(()) } /// Get the list of aliases for a given license. pub fn aliases(&self, name: &str) -> Result<&Vec, Error> { let entry = self .licenses .get(name) .ok_or_else(|| format_err!("license {} not present in store", name))?; Ok(&entry.aliases) } /// Set the list of aliases for a given license. pub fn set_aliases(&mut self, name: &str, aliases: Vec) -> Result<(), Error> { let entry = self .licenses .get_mut(name) .ok_or_else(|| format_err!("license {} not present in store", name))?; entry.aliases = aliases; Ok(()) } } askalono-0.5.0/src/store/cache.rs000064400000000000000000000045371046102023000150020ustar 00000000000000// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 use std::{io::copy, io::prelude::*}; use anyhow::Error; use log::info; use rmp_serde::Serializer; use serde::Serialize; use crate::store::base::Store; const CACHE_VERSION: &[u8] = b"askalono-04"; impl Store { /// Create a store from a cache file. /// /// This method is highly useful for quickly loading a cache, as creating /// one from text data is rather slow. This method can typically load /// the full SPDX set from disk in 200-300 ms. The cache will be /// sanity-checked to ensure it was generated with a similar version of /// askalono. 
pub fn from_cache(mut readable: R) -> Result where R: Read + Sized, { let mut header = [0u8; 11]; readable.read_exact(&mut header)?; if header != CACHE_VERSION { anyhow::bail!( "cache version mismatch; expected '{}', found '{}'", String::from_utf8_lossy(CACHE_VERSION), String::from_utf8_lossy(&header) ); } #[cfg(not(feature = "gzip"))] let dec = zstd::Decoder::new(readable)?; #[cfg(feature = "gzip")] let dec = flate2::read::GzDecoder::new(readable); let store = rmp_serde::decode::from_read(dec)?; Ok(store) } /// Serialize the current store. /// /// The output will be a MessagePack'd gzip'd or zstd'd binary stream that should be /// written to disk. pub fn to_cache(&self, mut writable: W) -> Result<(), Error> where W: Write + Sized, { let buf = { // This currently sits around 3.7MiB, so go up to 4 to fit comfortably let mut buf = Vec::with_capacity(4 * 1024 * 1024); let mut serializer = Serializer::new(&mut buf); self.serialize(&mut serializer)?; buf }; info!("Pre-compressed output is {} bytes", buf.len()); writable.write_all(CACHE_VERSION)?; #[cfg(not(feature = "gzip"))] let mut enc = zstd::Encoder::new(writable, 21)?; #[cfg(feature = "gzip")] let mut enc = flate2::write::GzEncoder::new(writable, flate2::Compression::default()); copy(&mut buf.as_slice(), &mut enc)?; enc.finish()?; Ok(()) } } askalono-0.5.0/src/store/mod.rs000064400000000000000000000003471046102023000145110ustar 00000000000000// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 mod analyze; mod base; mod cache; #[cfg(feature = "spdx")] mod spdx; pub use self::{analyze::Match, base::Store}; askalono-0.5.0/src/store/spdx.rs000064400000000000000000000071561046102023000147150ustar 00000000000000// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 use std::{ ffi::OsStr, fs::{read_dir, File}, io::prelude::*, path::Path, }; use anyhow::{format_err, Error}; use log::{debug, info}; use crate::{ license::TextData, store::base::{LicenseEntry, Store}, }; impl Store { /// Fill the store with SPDX JSON data. /// /// This function is very specific to the format of SPDX's /// `license-list-data` repository. It reads all JSON files in the /// `json/details` directory and creates entries inside the store for /// matching. /// /// This is intended to be used during build of askalono, so it's not /// available unless the `spdx` feature is enabled. /// /// `include_texts`, if true, will keep normalized license text data inside /// the store. This yields a larger store when serialized, but has the /// benefit of allowing you to diff your result against what askalono has /// stored. pub fn load_spdx(&mut self, dir: &Path, include_texts: bool) -> Result<(), Error> { use serde_json::{from_str, Value}; // locate all json files in the directory let mut paths: Vec<_> = read_dir(dir)? 
.filter_map(|e| e.ok()) .map(|e| e.path()) .filter(|p| p.is_file() && p.extension().unwrap_or_else(|| OsStr::new("")) == "json") .collect(); // sort without extensions; otherwise dashes and dots muck it up paths.sort_by(|a, b| a.file_stem().unwrap().cmp(b.file_stem().unwrap())); for path in paths { let mut f = File::open(path)?; let mut data = String::new(); f.read_to_string(&mut data)?; let val: Value = from_str(&data)?; let name = val["licenseId"] .as_str() .ok_or_else(|| format_err!("missing licenseId"))?; let deprecated = val["isDeprecatedLicenseId"] .as_bool() .ok_or_else(|| format_err!("missing isDeprecatedLicenseId"))?; if deprecated { debug!("Skipping {} (deprecated)", name); continue; } let text = val["licenseText"] .as_str() .ok_or_else(|| format_err!("missing licenseText"))?; let header = val["standardLicenseHeader"].as_str(); info!("Processing {}", name); let content = match include_texts { true => TextData::new(text), false => TextData::new(text).without_text(), }; // check if an identical license is already present let mut already_existed = false; self.licenses.iter_mut().for_each(|(key, ref mut value)| { if value.original.eq_data(&content) { value.aliases.push(name.to_string()); info!("{} already stored; added as an alias for {}", name, key); already_existed = true; } }); if already_existed { continue; } let license = self .licenses .entry(name.to_owned()) .or_insert_with(|| LicenseEntry::new(content)); if let Some(header_text) = header { let header_data = match include_texts { false => TextData::new(header_text), true => TextData::new(header_text).without_text(), }; license.headers = vec![header_data]; } } Ok(()) } } askalono-0.5.0/src/strategy.rs000064400000000000000000000453751046102023000144520ustar 00000000000000// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: Apache-2.0 use std::borrow::Cow; use std::fmt; use anyhow::Error; use log::{info, trace}; use serde::Serialize; use crate::{ license::{LicenseType, TextData}, store::{Match, Store}, }; /// A struct describing a license that was identified, as well as its type. #[derive(Serialize, Clone)] pub struct IdentifiedLicense<'a> { /// The identifier of the license. pub name: &'a str, /// The type of the license that was matched. pub kind: LicenseType, /// A reference to the license data inside the store. pub data: &'a TextData, } impl<'a> fmt::Debug for IdentifiedLicense<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("IdentifiedLicense") .field("name", &self.name) .field("kind", &self.kind) .finish() } } /// Information about scanned content. /// /// Produced by `ScanStrategy.scan`. #[derive(Serialize, Debug)] pub struct ScanResult<'a> { /// The confidence of the match from 0.0 to 1.0. pub score: f32, /// The identified license of the overall text, or None if nothing met the /// confidence threshold. pub license: Option>, /// Any licenses discovered inside the text, if `optimize` was enabled. pub containing: Vec>, } /// A struct describing a single license identified within a larger text. #[derive(Serialize, Debug, Clone)] pub struct ContainedResult<'a> { /// The confidence of the match within the line range from 0.0 to 1.0. pub score: f32, /// The license identified in this portion of the text. pub license: IdentifiedLicense<'a>, /// A 0-indexed (inclusive, exclusive) range of line numbers identifying /// where in the overall text a license was identified. /// /// See `TextData.lines_view()` for more information. pub line_range: (usize, usize), } /// A `ScanStrategy` can be used as a high-level wrapped over a `Store`'s /// analysis logic. /// /// A strategy configured here can be run repeatedly to scan a document for /// multiple licenses, or to automatically optimize to locate texts within a /// larger text. 
/// /// # Examples /// /// ```rust,should_panic /// # use std::error::Error; /// use askalono::{ScanStrategy, Store}; /// /// # fn main() -> Result<(), Box> { /// let store = Store::new(); /// // [...] /// let strategy = ScanStrategy::new(&store) /// .confidence_threshold(0.9) /// .optimize(true); /// let results = strategy.scan(&"my text to scan".into())?; /// # Ok(()) /// # } /// ``` pub struct ScanStrategy<'a> { store: &'a Store, mode: ScanMode, confidence_threshold: f32, shallow_limit: f32, optimize: bool, max_passes: u16, step_size: usize, } /// Available scanning strategy modes. pub enum ScanMode { /// Elimination is a general-purpose strategy that iteratively locates the /// highest license match in a file, then the next, and so on until not /// finding any more strong matches. Elimination, /// TopDown is a strategy intended for use with attribution documents, or /// text files containing multiple licenses (and not much else). It's more /// accurate than Elimination, but significantly slower. TopDown, } impl<'a> ScanStrategy<'a> { /// Construct a new scanning strategy tied to the given `Store`. /// /// By default, the strategy has conservative defaults and won't perform /// any deeper investigaton into the contents of files. pub fn new(store: &'a Store) -> ScanStrategy<'a> { Self { store, mode: ScanMode::Elimination, confidence_threshold: 0.9, shallow_limit: 0.99, optimize: false, max_passes: 10, step_size: 5, } } /// Set the scanning mode. /// /// See ScanMode for a description of options. The default mode is /// Elimination, which is a fast, good general-purpose matcher. pub fn mode(mut self, mode: ScanMode) -> Self { self.mode = mode; self } /// Set the confidence threshold for this strategy. /// /// The overall license match must meet this number in order to be /// reported. Additionally, if contained licenses are reported in the scan /// (when `optimize` is enabled), they'll also need to meet this bar. 
///
/// Set this to 1.0 for only exact matches, and 0.0 to report even the
/// weakest match.
pub fn confidence_threshold(self, confidence_threshold: f32) -> Self {
    Self {
        confidence_threshold,
        ..self
    }
}

/// Set a fast-exit parameter that allows the strategy to skip the rest of
/// a scan for strong matches.
///
/// This should be set higher than the confidence threshold; ideally close
/// to 1.0. If the overall match score is above this limit, the scanner
/// will return early and not bother performing deeper checks.
///
/// This is really only useful in conjunction with `optimize`. A value of
/// 0.0 will fast-return on any match meeting the confidence threshold,
/// while a value of 1.0 will only stop on a perfect match.
pub fn shallow_limit(self, shallow_limit: f32) -> Self {
    Self {
        shallow_limit,
        ..self
    }
}

/// Indicate whether a deeper scan should be performed.
///
/// This is ignored if the shallow limit is met. It's not enabled by
/// default, however, so if you want deeper results you should set
/// `shallow_limit` fairly high and enable this.
pub fn optimize(self, optimize: bool) -> Self {
    Self { optimize, ..self }
}

/// The maximum number of identifications to perform before exiting a scan
/// of a single text.
///
/// This is largely to prevent misconfigurations and infinite loop
/// scenarios, but if you have a document with a large number of licenses
/// then you may want to tune this to a value above the number of licenses
/// you expect to be identified.
pub fn max_passes(self, max_passes: u16) -> Self {
    Self { max_passes, ..self }
}

/// Configure the scanning interval (in lines) for TopDown mode.
///
/// A smaller step size will be more accurate at a significant cost of
/// speed.
pub fn step_size(self, step_size: usize) -> Self {
    Self { step_size, ..self }
}

/// Scan the given text content using this strategy's configured
/// preferences.
///
/// Returns a `ScanResult` containing all discovered information.
pub fn scan(&self, text: &TextData) -> Result<ScanResult<'_>, Error> {
    match self.mode {
        ScanMode::Elimination => Ok(self.scan_elimination(text)),
        ScanMode::TopDown => Ok(self.scan_topdown(text)),
    }
}

/// Elimination scan: analyze the whole text once, then (when `optimize` is
/// on) repeatedly narrow to the best-matching region, record it, white it
/// out and re-analyze, for at most `max_passes` rounds.
fn scan_elimination(&self, text: &TextData) -> ScanResult<'_> {
    let mut analysis = self.store.analyze(text);
    let score = analysis.score;
    let mut license = None;
    let mut containing = Vec::new();
    info!("Elimination top-level analysis: {:?}", analysis);

    // meets confidence threshold? record that
    // (`>=`, so a threshold of 1.0 still reports exact matches; this also
    // agrees with the checks in the optimize loop and in TopDown mode)
    if analysis.score >= self.confidence_threshold {
        license = Some(IdentifiedLicense {
            name: analysis.name,
            kind: analysis.license_type,
            data: analysis.data,
        });

        // at or above the shallow limit -> exit
        // (`>=`, so a limit of 1.0 stops on a perfect match as documented)
        if analysis.score >= self.shallow_limit {
            return ScanResult {
                score,
                license,
                containing,
            };
        }
    }

    if self.optimize {
        // repeatedly try to dig deeper
        // this loop effectively iterates once for each license it finds
        let mut current_text: Cow<'_, TextData> = Cow::Borrowed(text);
        for _n in 0..self.max_passes {
            let (optimized, optimized_score) = current_text.optimize_bounds(analysis.data);

            // stop if we didn't find anything acceptable
            if optimized_score < self.confidence_threshold {
                break;
            }

            // otherwise, save it
            info!(
                "Optimized to {} lines ({}, {})",
                optimized_score,
                optimized.lines_view().0,
                optimized.lines_view().1
            );
            containing.push(ContainedResult {
                score: optimized_score,
                license: IdentifiedLicense {
                    name: analysis.name,
                    kind: analysis.license_type,
                    data: analysis.data,
                },
                line_range: optimized.lines_view(),
            });

            // and white-out + reanalyze for next iteration
            current_text = Cow::Owned(optimized.white_out());
            analysis = self.store.analyze(&current_text);
        }
    }

    ScanResult {
        score,
        license,
        containing,
    }
}

/// TopDown scan: walk the text from the first line, collecting one contained
/// license after another until no further match is found.
///
/// The result carries no overall score or license, only `containing`.
fn scan_topdown(&self, text: &TextData) -> ScanResult<'_> {
    let (_, text_end) = text.lines_view();
    let mut containing = Vec::new();

    // find licenses working down thru the text's lines
    let mut current_start = 0usize;
    while current_start < text_end {
        let contained = match self.topdown_find_contained_license(text, current_start) {
            Some(c) => c,
            None => break,
        };

        // NOTE(review): `line_range` is documented as (inclusive, exclusive),
        // so the `+ 1` appears to skip the line right after the match.
        // Left unchanged; confirm against `TextData::lines_view`.
        current_start = contained.line_range.1 + 1;
        containing.push(contained);
    }

    ScanResult {
        score: 0.0,
        license: None,
        containing,
    }
}

/// Find the next license at or after line `starting_at`.
///
/// A window is slid over the text in `step_size` increments to find rough
/// bounds that meet the confidence threshold; those bounds are then refined
/// with `optimize_bounds`. Returns `None` when nothing meets the threshold.
fn topdown_find_contained_license(
    &self,
    text: &TextData,
    starting_at: usize,
) -> Option<ContainedResult<'_>> {
    let (_, text_end) = text.lines_view();
    let mut found: (usize, usize, Option<Match<'_>>) = (0, 0, None);

    trace!(
        "topdown_find_contained_license starting at line {}",
        starting_at
    );

    // `step_by` panics on a zero step; treat a step size of 0 as 1.
    let step = self.step_size.max(1);

    // speed: only start tracking once conf is met, and bail out after
    let mut hit_threshold = false;

    // move the start of window...
    'start: for start in (starting_at..text_end).step_by(step) {
        // ...and also the end of window to find high scores.
        for end in (start..=text_end).step_by(step) {
            let view = text.with_view(start, end);
            let analysis = self.store.analyze(&view);

            // just getting a feel for the data at this point, not yet
            // optimizing the view.

            // entering threshold: save the starting location
            if !hit_threshold && analysis.score >= self.confidence_threshold {
                hit_threshold = true;
                trace!(
                    "hit_threshold at ({}, {}) with score {}",
                    start,
                    end,
                    analysis.score
                );
            }

            if hit_threshold {
                if analysis.score < self.confidence_threshold {
                    // exiting threshold
                    trace!(
                        "exiting threshold at ({}, {}) with score {}",
                        start,
                        end,
                        analysis.score
                    );
                    break 'start;
                } else {
                    // maintaining threshold (also true for entering)
                    found = (start, end, Some(analysis));
                }
            }
        }
    }

    // at this point we have a *rough* bounds for a match.
    // now we can optimize to find the best one
    let matched = found.2?;
    let check = matched.data;
    let view = text.with_view(found.0, found.1);
    let (optimized, optimized_score) = view.optimize_bounds(check);

    trace!(
        "optimized {} {} at ({:?})",
        optimized_score,
        matched.name,
        optimized.lines_view()
    );

    if optimized_score < self.confidence_threshold {
        return None;
    }

    Some(ContainedResult {
        score: optimized_score,
        license: IdentifiedLicense {
            name: matched.name,
            kind: matched.license_type,
            data: matched.data,
        },
        line_range: optimized.lines_view(),
    })
}
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn can_construct() {
        let store = Store::new();
        ScanStrategy::new(&store);
        ScanStrategy::new(&store).confidence_threshold(0.5);
        ScanStrategy::new(&store)
            .shallow_limit(0.99)
            .optimize(true)
            .max_passes(100);
    }

    #[test]
    fn shallow_scan() {
        let store = create_dummy_store();
        let test_data = TextData::new("lorem ipsum\naaaaa bbbbb\nccccc\nhello");

        // the above text should have a result with a confidence minimum of 0.5
        let strategy = ScanStrategy::new(&store)
            .confidence_threshold(0.5)
            .shallow_limit(0.0);
        let result = strategy.scan(&test_data).unwrap();
        assert!(
            result.score > 0.5,
            "score must meet threshold; was {}",
            result.score
        );
        assert_eq!(
            result.license.expect("result has a license").name,
            "license-1"
        );

        // but it won't pass with a threshold of 0.8
        let strategy = ScanStrategy::new(&store)
            .confidence_threshold(0.8)
            .shallow_limit(0.0);
        let result = strategy.scan(&test_data).unwrap();
        assert!(result.license.is_none(), "result license is None");
    }

    #[test]
    fn single_optimize() {
        let store = create_dummy_store();
        // this TextData matches license-2 with an overall score of ~0.46 and optimized
        // score of ~0.57
        let test_data = TextData::new("lorem\nipsum abc def ghi jkl\n1234 5678 1234\n0000\n1010101010\n\n8888 9999\nwhatsit hello\narst neio qwfp colemak is the best keyboard layout");

        // check that we can spot the gibberish license in the
sea of other gibberish let strategy = ScanStrategy::new(&store) .confidence_threshold(0.5) .optimize(true) .shallow_limit(1.0); let result = strategy.scan(&test_data).unwrap(); assert!(result.license.is_none(), "result license is None"); assert_eq!(result.containing.len(), 1); let contained = &result.containing[0]; assert_eq!(contained.license.name, "license-2"); assert!( contained.score > 0.5, "contained score is greater than threshold" ); } #[test] fn find_multiple_licenses_elimination() { let store = create_dummy_store(); // this TextData matches license-2 with an overall score of ~0.46 and optimized // score of ~0.57 let test_data = TextData::new("lorem\nipsum abc def ghi jkl\n1234 5678 1234\n0000\n1010101010\n\n8888 9999\nwhatsit hello\narst neio qwfp colemak is the best keyboard layout\naaaaa\nbbbbb\nccccc"); // check that we can spot the gibberish license in the sea of other gibberish let strategy = ScanStrategy::new(&store) .mode(ScanMode::Elimination) .confidence_threshold(0.5) .optimize(true) .shallow_limit(1.0); let result = strategy.scan(&test_data).unwrap(); assert!(result.license.is_none(), "result license is None"); assert_eq!(2, result.containing.len()); // inspect the array and ensure we got both licenses let mut found1 = 0; let mut found2 = 0; for (_, contained) in result.containing.iter().enumerate() { match contained.license.name { "license-1" => { assert!(contained.score > 0.5, "license-1 score meets threshold"); found1 += 1; } "license-2" => { assert!(contained.score > 0.5, "license-2 score meets threshold"); found2 += 1; } _ => { panic!("somehow got an unknown license name"); } } } assert!( found1 == 1 && found2 == 1, "found both licenses exactly once" ); } #[test] fn find_multiple_licenses_topdown() { env_logger::init(); let store = create_dummy_store(); // this TextData matches license-2 with an overall score of ~0.46 and optimized // score of ~0.57 let test_data = TextData::new("lorem\nipsum abc def ghi jkl\n1234 5678 
1234\n0000\n1010101010\n\n8888 9999\nwhatsit hello\narst neio qwfp colemak is the best keyboard layout\naaaaa\nbbbbb\nccccc"); // check that we can spot the gibberish license in the sea of other gibberish let strategy = ScanStrategy::new(&store) .mode(ScanMode::TopDown) .confidence_threshold(0.5) .step_size(1); let result = strategy.scan(&test_data).unwrap(); assert!(result.license.is_none(), "result license is None"); println!("{:?}", result); assert_eq!(2, result.containing.len()); // inspect the array and ensure we got both licenses let mut found1 = 0; let mut found2 = 0; for (_, contained) in result.containing.iter().enumerate() { match contained.license.name { "license-1" => { assert!(contained.score > 0.5, "license-1 score meets threshold"); found1 += 1; } "license-2" => { assert!(contained.score > 0.5, "license-2 score meets threshold"); found2 += 1; } _ => { panic!("somehow got an unknown license name"); } } } assert!( found1 == 1 && found2 == 1, "found both licenses exactly once" ); } fn create_dummy_store() -> Store { let mut store = Store::new(); store.add_license("license-1".into(), "aaaaa\nbbbbb\nccccc".into()); store.add_license( "license-2".into(), "1234 5678 1234\n0000\n1010101010\n\n8888 9999".into(), ); store } }