pax_global_header00006660000000000000000000000064146636713710014530gustar00rootroot0000000000000052 comment=398a3c25059ee0ec7e2987d6af336d67dd681ba1 libzim-9.2.3/000077500000000000000000000000001466367137100130315ustar00rootroot00000000000000libzim-9.2.3/.codecov.yml000066400000000000000000000003351466367137100152550ustar00rootroot00000000000000codecov: notify: require_ci_to_pass: yes coverage: status: project: default: threshold: 1% patch: default: target: 90% threshold: 0% ignore: - "test" - "examples" libzim-9.2.3/.github/000077500000000000000000000000001466367137100143715ustar00rootroot00000000000000libzim-9.2.3/.github/FUNDING.yml000066400000000000000000000012451466367137100162100ustar00rootroot00000000000000# These are supported funding model platforms github: kiwix # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] patreon: # Replace with a single Patreon username open_collective: # Replace with a single Open Collective username ko_fi: # Replace with a single Ko-fi username tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry liberapay: # Replace with a single Liberapay username issuehunt: # Replace with a single IssueHunt username otechie: # Replace with a single Otechie username custom: # https://kiwix.org/support-us/ libzim-9.2.3/.github/script/000077500000000000000000000000001466367137100156755ustar00rootroot00000000000000libzim-9.2.3/.github/script/build_libzim.cmd000066400000000000000000000005471466367137100210350ustar00rootroot00000000000000call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" set CC=cl.exe set CXX=cl.exe meson.exe setup build . 
--force-fallback-for liblzma -Ddefault_library=static -Dwith_xapian=false -Dzstd:bin_programs=false -Dzstd:bin_tests=false -Dzstd:bin_contrib=false -Dliblzma:default_library=static cd build ninja.exe libzim-9.2.3/.github/workflows/000077500000000000000000000000001466367137100164265ustar00rootroot00000000000000libzim-9.2.3/.github/workflows/ci.yml000066400000000000000000000262501466367137100175510ustar00rootroot00000000000000name: CI on: pull_request: push: branches: - main jobs: macOS: strategy: fail-fast: false matrix: os: # x86 - macos-13 # arm - macos-14 target: - macos-aarch64-dyn - macos-x86_64-dyn - ios-arm64-dyn - ios-x86_64-dyn include: - target: macos-aarch64-dyn arch_name: arm64-apple-macos run_test: true - target: macos-x86_64-dyn arch_name: x86_64-apple-darwin run_test: true - target: ios-arm64-dyn arch_name: aarch64-apple-ios run_test: false - target: ios-x86_64-dyn arch_name: x86-apple-ios-simulator run_test: false exclude: - target: macos-x86_64-dyn os: macos-14 runs-on: ${{ matrix.os }} steps: - name: Harden Runner uses: step-security/harden-runner@a4aa98b93cab29d9b1101a6143fb8bce00e2eac4 # v2.7.1 with: egress-policy: audit - name: Checkout code uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4 - name: Setup Python 3.10 uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: '3.10' - name: Install packages run: | brew update brew install gcovr pkg-config ninja || brew link --overwrite python - name: Install Python modules run: pip3 install meson pytest - name: Install dependencies uses: kiwix/kiwix-build/actions/dl_deps_archive@main with: target_platform: ${{ matrix.target }} - name: Compile shell: bash run: | MESON_OPTION="--default-library=shared" MESON_CROSSFILE="$HOME/BUILD_${{matrix.arch_name}}/meson_cross_file.txt" if [ -e $MESON_CROSSFILE ]; then MESON_OPTION="$MESON_OPTION --cross-file $MESON_CROSSFILE -Dstatic-linkage=true" cat $MESON_CROSSFILE fi export 
PKG_CONFIG_PATH=$HOME/BUILD_${{matrix.arch_name}}/INSTALL/lib/pkgconfig meson . build ${MESON_OPTION} cd build ninja - name: Test if: matrix.run_test shell: bash run: | export LD_LIBRARY_PATH=$HOME/BUILD_${{matrix.arch_name}}/INSTALL/lib:$HOME/BUILD_${{matrix.arch_name}}/INSTALL/lib64 cd build ninja download_test_data meson test --verbose env: SKIP_BIG_MEMORY_TEST: 1 WAIT_TIME_FACTOR_TEST: 10 Windows: strategy: fail-fast: false matrix: os: - windows-2022 runs-on: ${{ matrix.os }} steps: - name: Harden Runner uses: step-security/harden-runner@a4aa98b93cab29d9b1101a6143fb8bce00e2eac4 # v2.7.1 with: egress-policy: audit - name: Checkout code uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4 - name: Setup python 3.10 uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 with: python-version: '3.10' - name: Install packages run: choco install pkgconfiglite ninja - name: Install python modules run: pip3 install meson - name: Setup MSVC compiler uses: bus1/cabuild/action/msdevshell@v1 with: architecture: x64 - name: Install dependencies uses: kiwix/kiwix-build/actions/dl_deps_archive@main with: target_platform: win-x86_64-dyn - name: Compile shell: cmd run: | set PKG_CONFIG_PATH=%cd%\BUILD_win-amd64\INSTALL\lib\pkgconfig dir %PKG_CONFIG_PATH% meson.exe setup . 
build -Dwith_xapian_fuller=false -Dwerror=false cd build ninja.exe - name: Test shell: cmd run: | cd build ninja.exe download_test_data meson.exe test --verbose env: WAIT_TIME_FACTOR_TEST: 10 Linux: strategy: fail-fast: false matrix: target: - linux-x86_64-static - linux-x86_64-dyn - linux-aarch64-musl-dyn - linux-aarch64-dyn - android-arm - android-arm64 - win32-static - win32-dyn - wasm with_xapian: - true - false include: - target: linux-x86_64-static image_variant: focal lib_postfix: '/x86_64-linux-gnu' arch_name: linux-x86_64 run_test: true coverage: true - target: linux-x86_64-dyn image_variant: focal lib_postfix: '/x86_64-linux-gnu' arch_name: linux-x86_64 run_test: true coverage: true - target: linux-aarch64-musl-dyn image_variant: alpine lib_postfix: '/x86_64-linux-musl' arch_name: linux-aarch64-musl run_test: true coverage: false - target: linux-aarch64-dyn image_variant: focal lib_postfix: '/aarch64-linux-gnu' arch_name: aarch64-linux-gnu run_test: false coverage: false - target: android-arm image_variant: focal lib_postfix: '/arm-linux-androideabi' arch_name: arm-linux-androideabi run_test: false coverage: false - target: android-arm64 image_variant: focal lib_postfix: '/aarch64-linux-android' arch_name: aarch64-linux-android run_test: false coverage: false - target: win32-static image_variant: f35 lib_postfix: '64' arch_name: i686-w64-mingw32 run_test: false coverage: false - target: win32-dyn image_variant: f35 lib_postfix: '64' arch_name: i686-w64-mingw32 run_test: false coverage: false - target: wasm image_variant: focal lib_postfix: '/x86_64-linux-gnu' arch_name: wasm64-emscripten run_test: false coverage: false env: HOME: /home/runner runs-on: ubuntu-22.04 container: image: "ghcr.io/kiwix/kiwix-build_ci_${{matrix.image_variant}}:2023-10-30" steps: - name: Harden Runner uses: step-security/harden-runner@a4aa98b93cab29d9b1101a6143fb8bce00e2eac4 # v2.7.1 with: egress-policy: audit - name: Install dependencies if: ${{ !contains(matrix.target, 'musl') 
}} uses: kiwix/kiwix-build/actions/dl_deps_archive@main with: target_platform: ${{ matrix.target }} - name: Retrieve source code uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4.1.4 - name: Compile source code shell: bash env: PKG_CONFIG_PATH: ${{env.HOME}}/BUILD_${{matrix.arch_name}}/INSTALL/lib/pkgconfig:${{env.HOME}}/BUILD_${{matrix.arch_name}}/INSTALL/lib${{matrix.lib_postfix}}/pkgconfig run: | if [[ "${{matrix.target}}" =~ .*-dyn ]] then MESON_OPTION="--default-library=shared" else MESON_OPTION="--default-library=static" fi if [ -e "${{ env.HOME }}/BUILD_${{ matrix.arch_name }}/meson_cross_file.txt" ] then MESON_OPTION="$MESON_OPTION --cross-file ${{ env.HOME }}/BUILD_${{ matrix.arch_name }}/meson_cross_file.txt" else MESON_OPTION="$MESON_OPTION -Db_coverage=true" fi if [[ "${{matrix.target}}" =~ android_.* ]] then MESON_OPTION="$MESON_OPTION -Dstatic-linkage=true -DUSE_BUFFER_HEADER=false" fi if [[ "${{matrix.target}}" == wasm ]] then MESON_OPTION="$MESON_OPTION -Dexamples=false" fi meson setup . build ${MESON_OPTION} -Dwith_xapian=${{matrix.with_xapian}} cd build ninja - name: Run automated tests shell: bash if: matrix.run_test env: LD_LIBRARY_PATH: "${{env.HOME}}/BUILD_${{matrix.arch_name}}/INSTALL/lib:${{env.HOME}}/BUILD_${{matrix.arch_name}}/INSTALL/lib${{matrix.lib_postfix}}" SKIP_BIG_MEMORY_TEST: 1 WAIT_TIME_FACTOR_TEST: 10 run: | cd build ninja download_test_data meson test --verbose if [[ "${{matrix.coverage}}" = "true" ]]; then ninja coverage fi - name: Upload code coverage uses: codecov/codecov-action@5ecb98a3c6b747ed38dc09f787459979aebb39be # v4.3.1 if: matrix.coverage with: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} OSSF-Scorecard: name: OSSF Scorecard runs-on: ubuntu-22.04 permissions: # Needed to upload the results to code-scanning dashboard. security-events: write # Needed to publish results and get a badge (see publish_results below). id-token: write # Uncomment the permissions below if installing in a private repository. 
# contents: read # actions: read steps: - name: Harden Runner uses: step-security/harden-runner@a4aa98b93cab29d9b1101a6143fb8bce00e2eac4 # v2.7.1 with: egress-policy: audit - name: "Checkout code" uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: persist-credentials: false - name: "Run analysis" uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1 with: results_file: results.sarif results_format: sarif # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: # - you want to enable the Branch-Protection check on a *public* repository, or # - you are installing Scorecard on a *private* repository # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action?tab=readme-ov-file#authentication-with-fine-grained-pat-optional. # repo_token: ${{ secrets.SCORECARD_TOKEN }} # Public repositories: # - Publish results to OpenSSF REST API for easy access by consumers # - Allows the repository to include the Scorecard badge. # - See https://github.com/ossf/scorecard-action#publishing-results. publish_results: true # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" uses: actions/upload-artifact@97a0fba1372883ab732affbe8f94b823f91727db # v3.pre.node20 with: name: SARIF file path: results.sarif retention-days: 5 # Upload the results to GitHub's code scanning dashboard (optional). 
# Commenting out will disable upload of results to your repo's Code Scanning dashboard - name: "Upload to code-scanning" uses: github/codeql-action/upload-sarif@1b1aada464948af03b950897e5eb522f92603cc2 # v3.24.9 with: sarif_file: results.sarif libzim-9.2.3/.github/workflows/package.yml000066400000000000000000000101461466367137100205460ustar00rootroot00000000000000name: Packages on: pull_request: push: branches: - main release: types: [ published ] jobs: build-deb: runs-on: ubuntu-22.04 strategy: fail-fast: false matrix: distro: - debian-unstable - debian-trixie - debian-bookworm - debian-bullseye - ubuntu-noble - ubuntu-jammy - ubuntu-focal # Pin your dependencies with https://github.com/mheap/pin-github-action steps: - name: Harden Runner uses: step-security/harden-runner@0d381219ddf674d61a7572ddd19d7941e271515c # pin@v2 with: egress-policy: audit - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # pin@v4 # Determine which PPA we should upload to - name: PPA id: ppa run: | if [[ $REF == refs/tags* ]] then echo "::set-output name=ppa::kiwixteam/release" else echo "::set-output name=ppa::kiwixteam/dev" fi env: REF: ${{ github.ref }} - uses: legoktm/gh-action-auto-dch@2b7d6a33db93a408d4b5e2edf38be7fd578b11d7 # pin@main with: fullname: Kiwix builder email: release+launchpad@kiwix.org distro: ${{ matrix.distro }} - uses: legoktm/gh-action-build-deb@7a6b22239275ae4e425fefc6f1aeb1118160500d # pin@debian-unstable if: matrix.distro == 'debian-unstable' name: Build package for debian-unstable id: build-debian-unstable with: args: --no-sign - uses: legoktm/gh-action-build-deb@b47978ba8498dc8b8153cc3b5f99a5fc1afa5de1 # pin@debian-trixie if: matrix.distro == 'debian-trixie' name: Build package for debian-trixie id: build-debian-trixie with: args: --no-sign - uses: legoktm/gh-action-build-deb@1f4e86a6bb34aaad388167eaf5eb85d553935336 # pin@debian-bookworm if: matrix.distro == 'debian-bookworm' name: Build package for debian-bookworm id: build-debian-bookworm 
with: args: --no-sign - uses: legoktm/gh-action-build-deb@084b4263209252ec80a75d2c78a586192c17f18d # pin@debian-bullseye if: matrix.distro == 'debian-bullseye' name: Build package for debian-bullseye id: build-debian-bullseye with: args: --no-sign - uses: legoktm/gh-action-build-deb@9114a536498b65c40b932209b9833aa942bf108d # pin@ubuntu-noble if: matrix.distro == 'ubuntu-noble' name: Build package for ubuntu-noble id: build-ubuntu-noble with: args: --no-sign - uses: legoktm/gh-action-build-deb@1553bc52b826020691af83a7354a047f2727106c # pin@ubuntu-jammy if: matrix.distro == 'ubuntu-jammy' name: Build package for ubuntu-jammy id: build-ubuntu-jammy with: args: --no-sign - uses: legoktm/gh-action-build-deb@77900afcbdc12874b7177e0e9fca2f4da043cd05 # pin@ubuntu-focal if: matrix.distro == 'ubuntu-focal' name: Build package for ubuntu-focal id: build-ubuntu-focal with: args: --no-sign ppa: ${{ steps.ppa.outputs.ppa }} - uses: actions/upload-artifact@0b2256b8c012f0828dc542b3febcab082c67f72b # pin@v4 with: name: Packages for ${{ matrix.distro }} path: output - uses: legoktm/gh-action-dput@4f46c373c7d114c8885c376be07f9ad5490c4f51 # pin@main name: Upload dev package # Only upload on pushes to main if: github.event_name == 'push' && github.event.ref == 'refs/heads/main' && startswith(matrix.distro, 'ubuntu-') with: gpg_key: ${{ secrets.LAUNCHPAD_GPG }} repository: ppa:kiwixteam/dev packages: output/*_source.changes - uses: legoktm/gh-action-dput@4f46c373c7d114c8885c376be07f9ad5490c4f51 # pin@main name: Upload release package if: github.event_name == 'release' && startswith(matrix.distro, 'ubuntu-') with: gpg_key: ${{ secrets.LAUNCHPAD_GPG }} repository: ppa:kiwixteam/release packages: output/*_source.changes libzim-9.2.3/.gitignore000066400000000000000000000005241466367137100150220ustar00rootroot00000000000000*~ *#* autom4te.cache build compile config.h configure depcomp .deps .dirstamp INSTALL install-sh *.kate-swp *.la .libs libtool *.lo ltmain.sh *.m4 Makefile Makefile.in 
missing *.o stamp-h1 .svn .*.swp *.zim examples/createZimExample src/tools/zimdump src/tools/zimsearch libzim.pc test-driver test/zimlib-test* test/test-suite.log .clangd libzim-9.2.3/.readthedocs.yaml000066400000000000000000000010471466367137100162620ustar00rootroot00000000000000# Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details # Required version: 2 # Set the version of Python and other tools you might need build: os: ubuntu-22.04 tools: python: "3.11" # Build documentation in the docs/ directory with Sphinx sphinx: configuration: docs/conf.py # We recommend specifying your dependencies to enable reproducible builds: # https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html python: install: - requirements: docs/requirements.txt libzim-9.2.3/AUTHORS000066400000000000000000000016631466367137100141070ustar00rootroot00000000000000# This is the list of Libzim's significant contributors. # # This does not necessarily list everyone who has contributed code, # especially since many employees of one corporation may be contributing. # To see the full list of contributors, see the revision history in # source control. C. Scott Ananian https://github.com/cscott Dmitry Atamanov https://github.com/data-man Emmanuel Engelhart https://github.com/kelson42 Kunal Mehta https://github.com/legoktm Maneeshpm https://github.com/maneeshpm Matthieu Gautier https://github.com/mgautierfr MiguelRocha https://github.com/miguelrocha Renaud Gaudin https://github.com/rgaudin Tommi Mäkitalo https://github.com/maekitalo Veloman Yunkan https://github.com/veloman-yunkan libzim-9.2.3/COPYING000066400000000000000000000354341466367137100140750ustar00rootroot00000000000000 GNU GENERAL PUBLIC LICENSE Version 2, June 1991 Copyright (C) 1989, 1991 Free Software Foundation, Inc. 
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This General Public License applies to most of the Free Software Foundation's software and to any other program whose authors commit to using it. (Some other Free Software Foundation software is covered by the GNU Library General Public License instead.) You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs; and that you know you can do these things. To protect your rights, we need to make restrictions that forbid anyone to deny you these rights or to ask you to surrender the rights. These restrictions translate to certain responsibilities for you if you distribute copies of the software, or if you modify it. For example, if you distribute copies of such a program, whether gratis or for a fee, you must give the recipients all the rights that you have. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. We protect your rights with two steps: (1) copyright the software, and (2) offer you this license which gives you legal permission to copy, distribute and/or modify the software. 
Also, for each author's protection and ours, we want to make certain that everyone understands that there is no warranty for this free software. If the software is modified by someone else and passed on, we want its recipients to know that what they have is not the original, so that any problems introduced by others will not reflect on the original authors' reputations. Finally, any free program is threatened constantly by software patents. We wish to avoid the danger that redistributors of a free program will individually obtain patent licenses, in effect making the program proprietary. To prevent this, we have made it clear that any patent must be licensed for everyone's free use or not licensed at all. The precise terms and conditions for copying, distribution and modification follow. GNU GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License applies to any program or other work which contains a notice placed by the copyright holder saying it may be distributed under the terms of this General Public License. The "Program", below, refers to any such program or work, and a "work based on the Program" means either the Program or any derivative work under copyright law: that is to say, a work containing the Program or a portion of it, either verbatim or with modifications and/or translated into another language. (Hereinafter, translation is included without limitation in the term "modification".) Each licensee is addressed as "you". Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running the Program is not restricted, and the output from the Program is covered only if its contents constitute a work based on the Program (independent of having been made by running the Program). Whether that is true depends on what the Program does. 1. 
You may copy and distribute verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and give any other recipients of the Program a copy of this License along with the Program. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Program or any portion of it, thus forming a work based on the Program, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) You must cause the modified files to carry prominent notices stating that you changed the files and the date of any change. b) You must cause any work that you distribute or publish, that in whole or in part contains or is derived from the Program or any part thereof, to be licensed as a whole at no charge to all third parties under the terms of this License. c) If the modified program normally reads commands interactively when run, you must cause it, when started running for such interactive use in the most ordinary way, to print or display an announcement including an appropriate copyright notice and a notice that there is no warranty (or else, saying that you provide a warranty) and that users may redistribute the program under these conditions, and telling the user how to view a copy of this License. (Exception: if the Program itself is interactive but does not normally print such an announcement, your work based on the Program is not required to print an announcement.) These requirements apply to the modified work as a whole. 
If identifiable sections of that work are not derived from the Program, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Program, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Program. In addition, mere aggregation of another work not based on the Program with the Program (or with a work based on the Program) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may copy and distribute the Program (or a work based on it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you also do one of the following: a) Accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, b) Accompany it with a written offer, valid for at least three years, to give any third party, for a charge no more than your cost of physically performing source distribution, a complete machine-readable copy of the corresponding source code, to be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange; or, c) Accompany it with the information you received as to the offer to distribute corresponding source code. 
(This alternative is allowed only for noncommercial distribution and only if you received the program in object code or executable form with such an offer, in accord with Subsection b above.) The source code for a work means the preferred form of the work for making modifications to it. For an executable work, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the executable. However, as a special exception, the source code distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. If distribution of executable or object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place counts as distribution of the source code, even though third parties are not compelled to copy the source along with the object code. 4. You may not copy, modify, sublicense, or distribute the Program except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense or distribute the Program is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 5. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Program or its derivative works. These actions are prohibited by law if you do not accept this License. 
Therefore, by modifying or distributing the Program (or any work based on the Program), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Program or works based on it. 6. Each time you redistribute the Program (or any work based on the Program), the recipient automatically receives a license from the original licensor to copy, distribute or modify the Program subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties to this License. 7. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Program at all. For example, if a patent license would not permit royalty-free redistribution of the Program by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Program. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system, which is implemented by public license practices. 
Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 8. If the distribution and/or use of the Program is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Program under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 9. The Free Software Foundation may publish revised and/or new versions of the General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of this License, you may choose any version ever published by the Free Software Foundation. 10. If you wish to incorporate parts of the Program into other free programs whose distribution conditions are different, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. 
Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 
END OF TERMS AND CONDITIONS libzim-9.2.3/ChangeLog000066400000000000000000000601751466367137100146140ustar00rootroot00000000000000libzim 9.2.3 ============ * Correctly locate part range in which read data in case of ZIM chunks (@mgautierfr #903) * Fix test compilation GCC12 #899 (#mgautierfr #899) * Move Windows CI/CD to native MS Windows via GitHub Actions (@mgautierfr #899 #914) * Update deb package CI/CD for Debian & Ubuntu with latest releases (@kelson42 #905 #906 #922) libzim 9.2.2 ============ * Avoid crash scenario in case of invalid offset in clusters (@mgautierfr #895) * Many improvements around testing (@mgautierfr #889 #897) * Windows CI using GitHub Windows runner (@mgautierfr #894) * Better error message when failin to open (split) ZIM file (@mgautierfr #884) libzim 9.2.1 ============ * Better handling of split ZIM files (@mgautierfr #879) * Fix creation of shared_ptr in test (@mgautierfr #881) libzim 9.2.0 ============ * Allow open Archive with a set of (positionned) file descriptors (@mgautierfr #860) * Introduce new (private) method `getEntryByPathWithNamespace` (@mgautierfr #859) * Internally catch xapian exception and rethrow ZimFileFormatError instead (@mgautierfr #873) * Fix compilation on Haiku (@Begasus #857) * Fix macos mmap (@mgautierfr #867) * Optimize checksum calculation (@aryanA101a #861) * Introduce `Formatter` helper (@ShaopengLin #862) * Rename all `*Url*` symbols to `*Path*` (@mgautierfr #869) * Build script: Allow to disable compilation of test (@kelson42 #854) * [CI] Use kiwix-build github's action to download deps archive (@mgautierfr #850) * [CI] Build libzim on macos-14 (@kelson42 #856) libzim 9.1.0 ============ * New addAlias() method in the creator (@mgautierfr #833) * Bump-up ZIM format minor version to 6.1.2 (@mgautierfr #847) libzim 9.0.0 ============ * getMediaCount() does not fail anymore if M/Counter is missing (@mgautierfr #827) * Reintroduce optimization of Entry::getItem() (@kelson42 @mgautierfr #836) * C++17 
compatible code (@mgautierfr #819) * Add support to recent googletest framework (@kelson42 #830) * Multiple fixes for Apple macOS/iOS compilation & CI (@mgautierfr @kelson42 @rgaudin #832 #839) libzim 8.2.1 ============ * Better indexer CJK content (@mgautierfr #806) * Fix export of symbol on Windows dll (@xiaoyifang @mgautierfr #796 #807) * Remove accents from the search query (@mgautierfr #804) * Improved performance of removing of accents when working on long string (@mgautierfr #797) * Various improvement and fixes of the CI (@mgautierfr @kelson42) libzim 8.2.0 ============ * Deprecate `SearchIterator::getSize()` method (@mgautierfr #774) * Fix handling of search end iterator (@mgautierfr #774) There were cases when we could dereference a end iterator. * Fix suggestions with titles containing punctuations (@veloman-yunkan #765) * Correctly publish our public API in Windows's dll (@xiaoyifang #783) * Fix various warning and compilation error when compiling with last xcode version (@mgautierfr #782) * Fix faulty unit-test checking for async errors (@mgautierfr #776) * Update subproject wrap zstd to version 1.5.4 (and use upstream wrap file.) 
(@mgautierfr #749) * Move main branch from `master` to `main` * Add CI to build on aarch64 (@mgautierfr #784) * Various CI improvements (@kelson42) libzim 8.1.1 ============ * Revert an ABI breaking change introduced in 8.1.0 (Optimization of `Entry::getItem()`) libzim 8.1.0 ============ * Optimization of the first call to `zim::Archive::iterEfficient` (@veloman-yunkan #724) * Add some documentation to `zim::writer::IndexData` (@mgautierfr #727) * Correctly catch and rethrow exception thrown in worker threads at creation (@mgautierfr #496 #748) * Optimization of `Entry::getItem()` (@veloman-yunkan #732) * Fix declaration of `zim::setICUDataDirectory()` (@MohitMaliFtechiz #733) * Add `zim::Archive::getMediaCount()` (@mgautierfr #730) * Make compilation of examples optional (@mgautierfr #738) * Add a CI for wasm (@mgautierfr #746) * Make constructor of SuggestionItem public (@veloman-yunkan #740) libzim 8.0.1 ============ * Fix debian packaging (@mgautierfr) libzim 8.0.0 ============ * [API-BREAK] Remove lzma compression support in writer (@veloman-yunkan #718) * Add new method `zim::Entry::getRedirectEntryIndex()` (@veloman-yunkan #716) * Add new helper function `zim::setICUDataDirectory()` to help android wrapper compilation (@mgautierfr #722) * Fix `std::call_once` usage (alpine bug) (@veloman-yunkan #708) * Better xapian indexation (no transaction, better compact algorithm) (@mgautierfr #719) * Reserve more space (1968B instead of 944B) for mimetype list (@mgautierfr #720) * [CI] Fix android compilation in the CI (@veloman-yunkan @mgautierfr #713) * [CI] Add CI for Alpine (@veloman-yunkan #710) * [CI] Support checkout of tag in the CI (@teeks99 #696) * [CI] Remove movebot (@kelson42 #704) * [CI] Remove Impish and add Kinetic packages (@legoktm #715) * Fix code factor report (@kelson42 #700) * Fix readme (@kelson42 #701 #716) libzim 7.2.2 ============ * Change the way we generate search result snippet.
We now ask xapian to generate "less" relevant snippets (even if in practice, snippets are still good). But it now generates snippets far more quickly. On cold search, no cache and low IO, search can go from 90s to 3s. (@mgautier #697) * [CI] Update base images (@mgautier #695) libzim 7.2.1 ============ * Make suggestions diacritics insensitive (@veloman-yunkan #691) * [Writer] Raise an exception when the user adds an invalid entry (duplicate path) instead of printing a message (which can be too easily missed) and be buggy (@mgautierfr #690) * [Writer] Do not call `hasIndexData` and `getTitle` in the main thread when we add an entry (@mgautier #684) * [Writer] Properly clean and stop the writer even if the user hasn't called `finishZimCreation` (The created zim file is still invalid) (@veloman-yunkan #666) * Add a default argument value for mimetype of `creator::addMetadata` (@kelvinhammond #678) * Use a more informative message in exception when we cannot open a file (@veloman-yunkan #667 #668) * Use a generic dirent lookup to search by title (@veloman-yunkan #651) * Various improvements: - CI, Packaging : Stop creating packages for Ubuntu Hirsute (@legoktm #664) - Update Readme (@TheDuchy #660) - Fix cross-compilation host machine detection (@kelson42 #665) - Fix macos/ios compilation (@mgautierfr #672) - Update documentation @mgautierfr #677, @veloman-yunkan #682 libzim 7.2.0 ============ * Add methods to get/print (dependencies) versions (@kelson42, #452) * Fix Emscripten compilation (@kelson42, @mossroy, #643) libzim 7.1.0 ============ * Fix dirent test on 32 bits architectures (@mgautierfr #632) * Fix compilation on Alpine - with musl (@amirouche #649) * Don't crash if ZIM without illustration nor X/W namespace (@mgautierfr #641) * Switch default suggestion operator to AND (@maneeshpm #644) * Add a new method Archive::getMetadataItem (@mgautierfr #639) * Better indexing criteria (@mgautierfr #642) * Avoid duplicated archives in the searcher (@veloman-yunkan #648) * Fix random
entry (@veloman-yunkan #650) * Various improvements. - CI @mgautierfr #640, @kelson42 #638, @legoktm #654 - Doc @rgaudin #646 libzim 7.0.0 ============ Version 7.0.0 is a major release. The API has been completely rewritten. Most notable change is that namespaces are now hidden. The new API is described in documentation, which includes a Transition Guide from v6. ZIM files created with it uses new ZIM minor version (6.1 - see Header section of spec.) Both backward and forward compatibility is kept. Improvements ------------ * Rewrite creator and reader API This removes the namespace from the API. Article are automatically put in the right namespace ('A') and the retrivial of content is made using specific API. (@mgautier #454) * Better handling of the conditional compilation without xapian. Before that, the search API was present (but returning empty result) if libzim was compiled without xapian. Now the API is not present anymore. User code must check if libzim is compiled with xapian or not by checking if LIBZIM_WITH_XAPIAN is defined or not. (@mgautierfr #465) * Add a new specific listing in zim files to list entries considered as "front article". At creation, wrapper MUST pass the hint `FRONT_ARTICLE` to correctly mark the entry. Search by title uses this list if present. (@mgautierfr #487) * Store the wellknown entries in the `W` namespace (`W/mainPage`) (@mgautierfr #497) * Rewrite Search API. Fix potential memory link and allow correct reusing of create search. (@mgautierfr #530) * New suggestion search API. The api mimics the Search API but specialized for suggestion (@maneeshpm #574) * Add `zim::Archive` constructors to open an archive using a existing file descriptor. This API is not available on Windows. 
(@veloman-yunkan #449) * Make zstd the default compression algorithm (@veloman-yunkan #480) * The method `zim::Archive::checkIntegrity` now checks if the mimetypes indicated in the dirents are correct (@veloman-yunkan #505) * Writer doesn't add a `.zim` extension to the given path. (@maneeshpm #503) * Implement random entry picking. We are choosing an entry from the "front article" list if present. (@mgautierfr #476) * Creator now creates the `M/Counter` metadata. (@mgautierfr) * Better Illustration handling. Favicon is replaced by Illustration. Illustration can now have different size and scale (even if the API does not use this feature) (@mgautierfr #540) * Search iterator now has a method `getZimId` to know the Id of the zim corresponding to the result (useful for multizim search) (@maneeshpm #557) Bug fixes --------- * The method `zim::Archive::checkIntegrity` now checks if the dirents are correctly sorted. (@veloman-yunkan #448) * Handle large MIME-type list. Some zim files may have a pretty large mimetype list. (@veloman-yunkan #460) * Fix handling of zim file containing item of size 0. (@mgautierfr #483) * Better parsing of the entry paths to detect the namespace (@maneeshpm #479) * Fix zim file creation on Windows (@mgautierfr #508) * Better algorithm tuning for suggestion search (@maneeshpm #492) * The default indexer now indexes html content only. (@mgautierfr #511) * Better suggestion search : Don't use stopwords, use OP_PHRASE (@maneeshpm #501) * Remove duplicates in the suggestion search (@maneeshpm #515) * Remove the termlist from the xapian database, lower memory usage (@maneeshpm #528) * Add an anchor in the suggestion search to search term at the beginning of the title (@maneeshpm #526) * Make the suggestion search work with special characters (`&`, `+`) (@veloman-yunkan #534) * Fix creator issue not detecting that cluster must be extended if it contains only 32-bit-sized content. (@veloman-yunkan #552) * Correctly generate suggestion snippet.
(@maneeshpm #545) * Better cluster size configuration (@mgautierfr #555) * Make search iterator `getTitle` return the real title of the entry and not the one stored in the xapian database (caseless) (@maneeshpm #586) * Correcly close a zim creator to avoid a crash when the creator is destructed without being started (@mgautierfr #613) * Reduce the creator memory usage by reducing the memory size of the dirent (@mgautier #616, #628) * Write the cluster using a bigger chunk size for performance (@mgautierfr #506) * Change the default cluster size to 2MiB (@mgautierfr #585) * The default mimetype for metadata now include the utf8 chardet (@rgaudin #626) * Improve the estimation of the number of search/suggestion results by forcing Xapian to evaluate at least 10 results (@mgautier #625) Other ----- * Update xapian stopwords list. (@data-man #447) * Remove direct pthread dependency (use c++11 thread library). (@mgautierfr #443) We still need pthread library on linux and freebsd as C++11 is using it internally. * [CI] Make the libzim CI compile libzim natively on Windows (@mgautierfr #453). * [CI] Build libzim package for Ubuntu Hirsute and Impish (@legoktm #459, #580) * Always create zim file using the major version 6. (@mgautierfr #512) * Move the test data files out of the git repository. Now test files are stored in `zim-testing-suite` repository and must be downloaded. (@mgautierfr #538, #535) * Add search iterator unit test (@maneeshpm #547) * Correctly fix search iterator method case to use camelCase everywhere (@maneeshpm #563) * Add a cast to string opertor on Uuid (@maneeshpm #582) * Make unittest print the path of the missing zim file when something goes wrong (@kelson42 #601) * Delete temporary data (index) after we called `finishZimCreation` instead of waiting for creator destruction. 
(@mgautierfr #603) * Add basic user documentation (@mgautierfr #611) Known bugs ---------- Suggestion system using in current libkiwix doesn't work with new zim files created with this release (and future ones). New libkiwix version will be fixed and will work with new and old zim files. libzim 6.3.2 ============ This is a hotfix of 6.3.0 : * libzim now create zimfile with zstd compression 19 instead of 22. So new libzim do not need to allocate 128Mb per cluster at decompression time. * At reading time, on 32 bits architectures, zstd cluster are not keep in cache. This avoid use to also keep the decompression stream which reserve 128Mb of memory address. libzim 6.3.1 ============ The release process of 6.3.1 was buggy. So, no 6.3.1. libzim 6.3.0 ============ * Rewrite internal reader structure to use stream decompression. This allow libzim to not decompresse the whole cluster to get an article content. This is big performance improvement, it speedups random access by 2, with a very small cost when doing "full" incremental reading (zim-check/zim-dump). (@veloman-yunkan) * Better dirent lookup. Dirent lookup is the process of locating article data starting from the url or title. This improves reading of zim file up to 10% (@veloman-yunkan) * Add basic, first version of `validate` function to check internal structure of a zim file. (@veloman-yunkan, @MiguelRocha) * Fix compilation of libzim without xapian (@mgautierfr) * Remove zlib dependency (and support of very old files created using zlib compression) (@mgautierfr) * New unit tests and various small fixes. libzim 6.2.2 ============ * Check blob index before access it in the cluster. * Refactoring of the cluster reading. libzim 6.2.1 (release process broken) ===================================== * Update readme and add link to repology.org packages list. * Fix compilation on windows. libzim 6.2.0 ============ * Fix compilation of libzim on freebsd. 
* Rewrite unit tests to remove python based test and use gtest all the time. * Make libzstd mandatory. * Support for meson 0.45. * Fix multipart support on macos. * Add a documentation system. * Better cache system implementation (huge speed up). * Various (and numerous) small refactoring. libzim 6.1.8 ============ * Increase default timeout for test to 120 seconds/test * Compression algorithm to use can be passed to `zim::writer::Creator` * Add automatic debian packaging of libzim. * Fix using of tmpdir (and now use env var TMPDIR) during tests. libzim 6.1.7 ============ * Do not assume urlPtrPos is just after the mimetype list. * Fix compilation of compression test. * Do not exit but throw an exception if an ASSERT is not fulfill. libzim 6.1.6 ============ * Better (faster) implementation of the ordering of article by cluster. * Fix compression algorithm. libzim 6.1.5 ============ * [Writer] Remove unused declaration of classes. Those classes were not implemented nor used at all. libzim 6.1.4 ============ * [Writer] Fix excessive memory usage. Data of the cluster were clean at the end of the process, not once we don't need it. libzim 6.1.3 ============ * [Writer] Use a `.tmp` suffix and rename to `.zim` at the end of the write proces. * Add unit tests * Do not include uncessary `windows.h` headers in public zim's headers. libzim 6.1.2 ============ * [CI] Fix codecov configuration * [Writer] Fix threads synchronization at end of writing process. libzim 6.1.1 ============ * Fix bug around the find function libzim 6.1.0 ============ * Compile now on OpenBSD * [Test] Use the main function provided by gtest. * [CI] Move the CI compilation to github actions. * Add stopwords for 54 new languages. * [Writer] Improve the way we are writing cluster at zim creation time. - Clusters are directly written in the zim file instead of using temporary files. - mimetypes are limited to 944 bytes. 
* Add a new type of iterator to iterate over articles in a performant way reducing decompression of clusters. This is now the new default iterator. * Add support for zim files compressed with zstd compression algorithm. This is not possible to use zstd to create zim file for now. libzim 6.0.2 ============ * Fix search suggestion parsing. libzim 6.0.1 ============ * Fix crash when trying to open an empty file. * Ensure that pytest tests are run on the CI. libzim 6.0.0 ============ * [Writer] Index the articles in differents threads. This is a huge speed improvement as the main thread in not blocked by indexing. * Index the title only if `shouldIndex` return true. libzim 5.1.0 ============ * Improve indexation of the title. * Better pertinence of suggestions (only for new zim files) * Improvement of the speed of Leveinstein distance for suggestions (for old zims) libzim 5.0.2 ============ * Improve README. * Remove gtest as embeded subproject. * Better lzma compression. * Better performance of the leveinstein algorithm (better suggestions performance) libzim 5.0.1 ============ * Update README. * [Writer] Add debug information (print progress of the clusters writing). * [Writer] Correctly print the url to the user. * [CI] Add code coverage. libzim 5.0.0 ============ * Fix thread slipping for win32 crosscompilation. * Fix a potential invalid access when reading dirent. * Fix memory leak in the decompression algorithm. * [Writer] Fix a memory leak (cluster cleanning) * [Writer] Write article data in a temporary cluster file instead of a temporary file per article. * [Writer] Better algorithm to store the dirent while creating the zim file. Better memory usage. * [Writer] [API Change] Url/Ns are now handle using the same struct Url. * [Writer] [API Change] No more aid and redirectAid. A redirectArticle have to implement redirectUrl. * [Writer] Use a memory pool to avoid multiple small memory allocations. * [Writer] [API Change] Rename `ZimCreator` to `Creator`. 
* [API Change] File's `search` and `suggestions` now return a unique_ptr instead of a raw pointer. libzim 4.0.7 ============ * Build libzim without rpath. libzim 4.0.6 ============ * Support zim file created with cluster not written sequentially. * Remove a meson warning. libzim 4.0.5 ============ * Store the xapian database in the right url. * Do not fail when reading very small zim file (<256b). * Do not print message on normal behavior. * [BUILDSYSTEM] Be able to build a dynamic lib (libzim.so) but using static dependencies. * [CI] Use last version of meson. * [CI] Use the new deps archive xz libzim 4.0.4 ============ * Fix opening of multi-part zim. * Fix convertion of path to wpath on Windows. libzim 4.0.3 ============ * Implement low level file manipilation using different backends libzim 4.0.2 ============ * [Windows] Fix opening of zim file bigger than 4GiB libzim 4.0.1 ============ * [Writer] Fix wrong redirectyon log message * Make libzim compile natively on windows using MSVC * Better message when failing to read a zim file. * Make libzim on windows correctly open unicode path. * Add compilation option to use less memory (but more I/O). Usefull on low memory devices (android) * Small fixes libzim 4.0.0 ============ * [Writer] Remove a lot of memory copy. * [Writer] Add xapian indexing directly in libzim. * [Writer] Better API. * [Writer] Use multi-threading to write clusters. * [Writer] Ensure mimetype of articles article is not null. * Extend test timeout for cluster's test. * Less memory copy for cluster's test. * Allow skipping test using a lot memory using env variable `SKIP_BIG_MEMORY_TEST=1` * Explicitly use the icu namespace to allow using of packaged icu lib. * Use a temporary file name as long as the ZIM writting process is not finished (#163) * [Travis] Do no compile using gcc-5 (but the default trusty's one 4.8) libzim 3.3.0 ============ * Fix handling of big cluster (>4GiB) on 32 bits architecture. 
This is mainly done by : * Do not mmap the whole cluster by default. * MMap only the memory asociated to an article. * If an article is > 4GiB, the blob associated to it is invalid (data==size==0). * Other information are still valid (directAccessInformation, ...) * Fix writing of extended cluster in writer. * Compile libzim on macos. * Build libzim setting RPATH. * Search result urls are now what is stored in the zim file. They should not start with a `/`. This is a revert of the change made in last release. (See kiwix/kiwix-lib#123) * Spelling corrections in README. libzim 3.2.0 ============ * Support geo query if the xapian database has indexed localisation. * Handle articles bigger than 4Go in the zim file (#110). * Use AND operator between search term. * Fix compilation with recent clang (#95). * Add method to get article's data localisation in the zim file. * Be able to get only a part of article (#77). * Do not crash if we cannot open the xapian Database for some reasons. (kiwix/kiwix-tools#153) * Do not assumen there is always a checksum in the zim file. (kiwix/kiwix-tools#150) * Try to do some sanity checks when opening a zim file. * Use pytest to do some tests (when cython is available). * Use levenshtein distance to sort and have better suggestion results. * Search result urls are now always absolute (starts with a '/'). (kiwix/kiwix-lib#110) * Open the file readonly when checking the zim file (and so be able to check read only file). * Accept absolute url starting with '/' when searching for article. * Fix various bugs libzim 3.1.0 ============ * Lzma is not a optional dependency anymore. * Better handle (report and not crash) invalid zim file. * Embed source of gtest (used only if gtest is not available on the system) * Move zimDump tools out of libzim repository to zim-tools * ZimCreator tools doesn't not read command line to set options. libzim 3.0.0 ============ This is a major change of the libzim. Expect a lot new improvement and API changes. 
* Add a suggestion mode to the search * Fix licensing issues * Fix wrong stemming of the query when searching * Deactivate searching (and so crash) in the embedded database if the zim is splitted * Rewrite the low level memory management of libzim when reading a zim file: * We use a buffer base entity to handle memory and reading file instead of reading file using stream. * MMap the memory when posible to avoid memory copy. * Use const when posible (API break) * Move to googletest instead of cxxtools for unit-tests. * Fix endiannes bug on arm. * Do not install private headers. Those headers declare private structure and should not be visible (API break) * Compile libzim with `-Werror` and `-Wall` options. * Make libzim thread safe for reading article. The search part is not thread safe, and all search operation must be protected by a lock. * Add method to get only a part of a article. * Move some tools to zim-tools repository. libzim 2.0.0 ============ * Move to meson build system `libzim` now use `meson` as build system instead of `autotools` * Move to C++11 standard. * Fulltext search in zim file. We have integrated the xapian fulltext search in libzim. So now, libzim provide an API to search in a zim containing embeded fulltext index. This means that : *libzim need xapian as (optional) dependencies (if you want compile with xapian support). * The old and unused search API has been removed. * Remove bzip2 support. * Remove Symbian support. * Few API hanges * Make some header files private (not installed); * A `Blob` can now be cast to a `string` directly; * Change a lot of `File` methods to const methods. libzim-9.2.3/README.md000066400000000000000000000210361466367137100143120ustar00rootroot00000000000000Libzim ====== The Libzim is the reference implementation for the [ZIM file format](https://wiki.openzim.org/wiki/ZIM_file_format). 
It's a [software library](https://en.wikipedia.org/wiki/Library_(computing)) to read and write ZIM files on many systems and architectures. More information about the ZIM format and the openZIM project at https://openzim.org/. [![Release](https://img.shields.io/github/v/tag/openzim/libzim?label=release&sort=semver)](https://download.openzim.org/release/libzim/) [![Repositories](https://img.shields.io/repology/repositories/libzim?label=repositories)](https://github.com/openzim/libzim/wiki/Repology) [![macOS Homebrew](https://badgen.net/homebrew/v/libzim)](https://formulae.brew.sh/formula/libzim) [![License](https://img.shields.io/badge/License-GPL%20v2-blue.svg)](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html) [![Build](https://github.com/openzim/libzim/workflows/CI/badge.svg?query=branch%3Amain)](https://github.com/openzim/libzim/actions?query=branch%3Amain) [![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/openzim/libzim/badge)](https://securityscorecards.dev/viewer/?uri=github.com/openzim/libzim) [![Doc](https://readthedocs.org/projects/libzim/badge/?style=flat)](https://libzim.readthedocs.io/en/latest/?badge=latest) [![Codecov](https://codecov.io/gh/openzim/libzim/branch/main/graph/badge.svg)](https://codecov.io/gh/openzim/libzim) [![CodeFactor](https://www.codefactor.io/repository/github/openzim/libzim/badge)](https://www.codefactor.io/repository/github/openzim/libzim) Disclaimer ---------- This document assumes you have a little knowledge about software compilation. If you experience difficulties with the dependencies or with the Libzim compilation itself, we recommend to have a look to [kiwix-build](https://github.com/kiwix/kiwix-build). Usage ----- Beside the source code, compiled versions of the libzim are made [available for various platforms](https://download.openzim.org/release/libzim/). 
Please note that on Microsoft Windows with the Microsoft compiler, you need to be careful to not compile in debug mode (because our released binaries are not). If you want to compile with the debug flag `/MDd`, then please use our [nightlies](https://download.openzim.org/nightly/). Preamble -------- Although the Libzim can be compiled/cross-compiled on/for many systems, the following documentation explains how to do it on POSIX ones. It is primarily intended for GNU/Linux systems and has been tested on recent releases of Ubuntu and Fedora. Dependencies ------------ The Libzim relies on many third party software libraries. They are prerequisites to the Libzim compilation. The following libraries need to be available: * [LZMA](https://tukaani.org/lzma/) (package `liblzma-dev` on Ubuntu) * [ICU](http://site.icu-project.org/) (package `libicu-dev` on Ubuntu) * [Zstd](https://facebook.github.io/zstd/) (package `libzstd-dev` on Ubuntu) * [Xapian](https://xapian.org/) - optional (package `libxapian-dev` on Ubuntu) To test the code: * [Google Test](https://github.com/google/googletest) (package `googletest` on Ubuntu) * [ZIM Testing Suite](https://github.com/openzim/zim-testing-suite) - Reference test data set To build the documentation you need the packages: * [Doxygen](https://www.doxygen.nl) * Python packages for [Sphinx](https://www.sphinx-doc.org), [Sphinx rtd theme](https://github.com/readthedocs/sphinx_rtd_theme), [Breathe](https://breathe.readthedocs.io) and [Exhale](https://exhale.readthedocs.io) (packages `Sphinx`, `sphinx_rtd_theme`, `Breathe` and `Exhale` while using pip) These dependencies may or may not be packaged by your operating system. They may also be packaged but only in an older version. The compilation script will tell you if one of them is missing or too old. In the worst case, you will have to download and compile a more recent version by hand.
If you want to install these dependencies locally, then ensure that Meson (through `pkg-config`) will properly find them. Environment ------------- The Libzim builds using [Meson](https://mesonbuild.com/) version 0.43 or higher. Meson itself relies on Ninja, Pkg-config and a few other compilation tools. Install them first: * Meson * Ninja * Pkg-config These tools should be packaged if you use a cutting edge operating system. If not, have a look at the [Troubleshooting](#Troubleshooting) section. Compilation ----------- Once all dependencies are installed, you can compile Libzim with: ```bash meson . build ninja -C build ``` By default, it will compile dynamically linked libraries. All binary files will be created in the `build` directory created automatically by Meson. If you want statically linked libraries, you can add the `--default-library=static` option to the Meson command. If you want to build the documentation, you need to pass the `-Ddoc=true` option and run the `doc` target: ```bash meson . build -Ddoc=true ninja -C build doc ``` Depending on your system, the `ninja` command may be called `ninja-build`. By default, Libzim tries to compile with Xapian (and will generate an error if Xapian is not found). You can build without Xapian by passing the option `-Dwith_xapian=false` : ```bash meson . build -Dwith_xapian=false ninja -C build doc ``` If Libzim is compiled without Xapian, all search APIs are removed. You can test if an installed version of Libzim is compiled with or without xapian by testing the define `LIBZIM_WITH_XAPIAN`. Testing ------- ZIM files needed by unit-tests are not included in this repository. By default, Meson will use an internal directory in your build directory, but you can specify another directory with option `test_data_dir`: ```bash meson . build -Dtest_data_dir= ``` Whether you specify a directory or not, you need an extra step to download the data.
Choose one of: * Get the data from the repository [openzim/zim-testing-suite](https://github.com/openzim/zim-testing-suite) and put it yourself in the directory. * Use the script [download_test_data.py](scripts/download_test_data.py) which will download and extract the data for you. * Ask `ninja` to do it for you with `ninja download_test_data` once the project is configured. The simple workflow is: ```bash meson . build # Configure the project (using default directory for test data) cd build ninja # Build ninja download_test_data # Download the test data meson test # Test ``` It is possible to deactivate all tests using test data zim files by passing `none` to the `test_data_dir` option: ```bash meson . build -Dtest_data_dir=none cd build ninja meson test # Run all tests except those needing test zim files. ``` If the automated tests fail or time out, you need to be aware that some tests need up to 16GB of memory. You can skip those specific tests with: ```bash SKIP_BIG_MEMORY_TEST=1 meson test ``` Some tests are checking error detection in a multithreaded environment and they need to sleep to let threads work (and detect errors). How long to wait depends on your computer. If you have `error_in_creator` test failing, you probably need to extend the waiting time. This can be done by setting the env variable `WAIT_TIME_FACTOR_TEST` to a float factor. The waiting time will be multiplied by this factor. ``` WAIT_TIME_FACTOR_TEST=2 meson test ``` Installation ------------ If you want to install the Libzim and the headers you have just compiled on your system, here we go: ```bash ninja -C build install ``` You might need to run the command as root (or using `sudo`), depending on where you want to install the libraries. After the installation has succeeded, you may need to run ldconfig (as root). Uninstallation ------------ If you want to uninstall the Libzim: ```bash ninja -C build uninstall ``` Like for the installation, you might need to run the command as root (or using `sudo`).
Troubleshooting --------------- If you need to install Meson "manually": ```bash virtualenv -p python3 ./ # Create virtualenv source bin/activate # Activate the virtualenv pip3 install meson # Install Meson hash -r # Refresh bash paths ``` If you need to install Ninja "manually": ```bash git clone git://github.com/ninja-build/ninja.git cd ninja git checkout release ./configure.py --bootstrap mkdir ../bin cp ninja ../bin cd .. ``` If the compilation still fails, you might need to get a more recent version of a dependency than the one packaged by your Linux distribution. Try then with a source tarball distributed by the problematic upstream project or even directly from the source code repository. License ------- [GPLv2](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html) or later, see [COPYING](COPYING) for more details. libzim-9.2.3/debian/000077500000000000000000000000001466367137100142535ustar00rootroot00000000000000libzim-9.2.3/debian/changelog000066400000000000000000000004341466367137100161260ustar00rootroot00000000000000libzim (8.0.1) stable; urgency=medium * Update to libzim version 8.0.1 -- Matthieu Gautier Web, 07 Sep 2022 14:38:00 -0200 libzim (0.0.0) unstable; urgency=medium * Initial release. 
-- Kunal Mehta Tue, 02 Jun 2020 01:49:48 -0700 libzim-9.2.3/debian/control000066400000000000000000000040201466367137100156520ustar00rootroot00000000000000Source: libzim Section: libs Priority: optional Build-Depends: debhelper-compat (= 13), liblzma-dev, libicu-dev, libxapian-dev, libzstd-dev, uuid-dev, libgtest-dev, meson, ninja-build, pkgconf Maintainer: Kiwix team Homepage: https://www.openzim.org/wiki/Libzim Standards-Version: 4.6.2 Rules-Requires-Root: no Package: libzim9 Architecture: any Multi-Arch: same Depends: ${misc:Depends}, ${shlibs:Depends} Pre-Depends: ${misc:Pre-Depends} Conflicts: libzim0, libzim0v5, libzim2, libzim4, libzim5, libzim6, libzim7, libzim8 Replaces: libzim0, libzim0v5, libzim2, libzim4, libzim5, libzim6, libzim7, libzim8 Description: library implementation of ZIM specifications ZIM (Zeno IMproved) is an open file format for storing the contents of wiki for offline usage. This file format is primarily focused on providing the contents of Wikipedia and Wikimedia projects for offline use. . libzim is the standard implementation of ZIM specification, which implements the read and write method for ZIM files. . ZIM is a file format created with focus on extracting and encoding data from MediaWiki for offline use. . Features of libzim are: * Native, coded in C++ * Extremely fast * Minimal footprint * Minimal dependencies * Portable on most OS (Windows, Linux, iOS, MacOS, Android, ...) Package: libzim-dev Section: libdevel Architecture: any Depends: ${misc:Depends}, libzim9 (= ${binary:Version}), liblzma-dev, libxapian-dev, libicu-dev, libzstd-dev Description: library implementation of ZIM specifications (development) ZIM (Zeno IMproved) is an open file format for storing the contents of wiki for offline usage. This file format is primarily focused on providing the contents of Wikipedia and Wikimedia projects for offline use. . libzim is the standard implementation of ZIM specification, which implements the read and write method for ZIM files. 
. ZIM is a file format created with focus on extracting and encoding data from MediaWiki for offline use. . This package contains development files. libzim-9.2.3/debian/copyright000066400000000000000000000000441466367137100162040ustar00rootroot00000000000000See COPYING in the repository root. libzim-9.2.3/debian/libzim-dev.install000066400000000000000000000000671466367137100177100ustar00rootroot00000000000000usr/include/* usr/lib/*/libzim.so usr/lib/*/pkgconfig/*libzim-9.2.3/debian/libzim9.install000066400000000000000000000000201466367137100172120ustar00rootroot00000000000000usr/lib/*/*.so.*libzim-9.2.3/debian/rules000077500000000000000000000005561466367137100153410ustar00rootroot00000000000000#!/usr/bin/make -f export DEB_BUILD_MAINT_OPTIONS = hardening=+all # Skip some extremely memory-intensive tests export SKIP_BIG_MEMORY_TEST=1 %: dh $@ --buildsystem=meson # Skip tests that require zim-testing-data for now override_dh_auto_configure: dh_auto_configure -- -Dtest_data_dir=none # Increase test timeout override_dh_auto_test: dh_auto_test -- -t 3 libzim-9.2.3/debian/source/000077500000000000000000000000001466367137100155535ustar00rootroot00000000000000libzim-9.2.3/debian/source/format000066400000000000000000000000151466367137100167620ustar00rootroot000000000000003.0 (native) libzim-9.2.3/docs/000077500000000000000000000000001466367137100137615ustar00rootroot00000000000000libzim-9.2.3/docs/.gitignore000066400000000000000000000000101466367137100157400ustar00rootroot00000000000000api xml libzim-9.2.3/docs/6to7.rst000066400000000000000000000304631466367137100153200ustar00rootroot00000000000000 Libzim 7 transition guide ========================= Libzim7 change a lot of things in the API and in the way we use namespaces (reflected in the API changes). This part is a document helping to do the transition from libzim6 to libzim7. Namespace handling ------------------ In libzim6 namespaces were exposed to the user. It was to the user to handle them correctly. 
Libzim6 was not doing any assumption about the namespaces. However, the usage (mainly from libkiwix) was to store metadata in ``M`` namespace, articles in ``A`` and image/video in ``I``. On the opposite side, libzim7 hides the concept of namespace and handle it for the user. While namespaces are still present and used in the zim format, they have vanished from the libzim api. For information (but it is not important to use libzim), we now store all "user content" in ``C`` namespace. Metadata are stored in ``M`` namespace and we use few other (``X``, ``W``) for some internal content. "User content" are accessed using "classic" method to get content. Metadata, illustration and such are accessed using specific method. An article stored in ``A`` namespace before ("A/index.html") is now accessed simply using "index.html". (It is stored in "C/index.html" in new format, but you must not specify the namespace in the new api). Compatibility ------------- libzim6 is agnostic about the namespaces. They are exposed to the user, whatever if we are reading a new or old zim file. It is up to the user to correctly handle namespaces (mainly, content are now in ``C`` instead of ``A``/``I``). libzim7 tries to be smart about the transition. It will look in the right namespace, depending of the zim file. Accessing "index.html" should work whatever if we use old or new namespace scheme. Accessing article/entry ----------------------- Getting one entry ................. In libzim6 accessing an ``Article`` was done using a ``File`` instance. You then had to check for the `Article` validity before using it. .. code-block:: c++ auto f = zim::File("foo.zim"); auto a = f.getArticleByUrl("A/index.html"); if (!a.good()) { std::cerr << "No article "A/index.html" << std::endl; } In libzim7 you access a |Entry| using a |Archive| instance. If there the entry is not found, a exception is raised. .. 
code-block:: c++ auto a = zim::Archive("foo.zim"); try { auto e = a.getEntryByPath("index.html"); } catch (zim::EntryNotFound& e) { std::cerr << "No entry "index.html" << std::endl; } Redirection ........... Article in libzim6 may be a redirection to another article or a article containing data. You had to check the kind of the article before using the right set of method. Using a method on a wrong kind was undefined behavior. .. code-block:: c++ auto article = [...]; if (article.isRedirect()) { auto target = article.getRedirectArticle(); } else { auto blob = article.getData(); } In libzim7, |Entry| is a kind of intermediate structure, either redirecting to another entry or a item. A |Item| is the structure containing the data. .. code-block:: c++ auto entry = [...]; if (entry.isRedirect()) { auto target = entry.getRedirectEntry(); } else { auto item = entry.getItem(); auto blob = item.getData(); } As a common usage is to get the item associated to the entry while resolving the redirection chain, it is possible to do this easily : .. code-block:: c++ auto entry = [...]; // Resolve any redirection chain and return the final item. auto item = entry.getItem(true); auto blob = item.getData() Iteration ......... To iterate on article with libzim6 you had to use the ``begin*`` method to get a iterator. You may iterate until ``end()`` was reached. .. code-block:: c++ auto file = [...]; for(auto it = file.beginByUrl(); it!=file.end(); it++) { auto article = *it; [...] } If you wanted to iterate on article starting by a url prefix it was a bit more complex : .. code-block:: c++ auto file = [...]; auto it = file.find("A/ind"); while(!it.is_end() && it->getUrl().startWith("A/ind")) { auto article = *it; [...] it++; } In libzim7 you get |EntryRange| on which you can easily iterate on: .. code-block:: c++ auto archive = [...]; for(auto entry : archive.iterByPath()) { [...] } .. code-block:: c++ auto archive = [...]; for(auto entry : archive.findByPath("ind")) { [...] 
} Searching --------- In libzim6 searching was made the only class ``Search`` .. code-block:: c++ auto f = zim::File("foo.zim"); auto search = zim::Search(&f); search.set_query("bar"); search.set_range(10, 30); for (auto it =search.begin(); it!=search.end(); it++) { std::cout << "Found result " << it.get_url() << std::endl; } In libzim7 you search starting from a |Searcher|. .. code-block:: c++ // Create a searcher, something to search on an archive zim::Searcher searcher(archive); // We need a query to specify what to search for zim::Query query; query.setQuery("bar"); // Create a search for the specified query zim::Search search = searcher.search(query); // Now we can get some result from the search. // 20 results starting from offset 10 (from 10 to 30) zim::SearchResultSet results = search.getResults(10, 20); // SearchResultSet is iterable for(auto entry: results) { std::cout << entry.getPath() << std::endl; } While it may seems a bit more complex (and it is), it has the main advantage to allow reusing of the different instance : - |Searcher| is what we are searching on, we can do several search on it without recreating a internal xapian database. - |Query| is what we are searching for. - |Search| is a specific search. - |SearchResultSet| is a set of result for a |Search|, it allow getting particular results without having to search several times. Suggestion ---------- In libzim6 suggestion was made using the same class ``Search`` but by setting the suggestion mode before iterating on the results. .. code-block:: c++ auto f = zim::File("foo.zim"); auto search = zim::Search(&f); search.set_query("bar"); search.set_range(10, 30); search.set_suggestion_mode(true); // <<< for (auto it =search.begin(); it!=search.end(); it++) { std::cout << "Found result " << it.get_url() << std::endl; } If the zim file had no suggestion database, the suggestion search was made on full text database (with variable results). 
In libzim7 you do suggestion using |SuggestionSearcher| API : .. code-block:: c++ // Create a searcher, something to search on an archive zim::SuggestionSearcher searcher(archive); // Create a search for the specified query zim::SuggestionSearch search = searcher.search("bar"); // Now we can get some result from the search. // 20 results starting from offset 10 (from 10 to 30) zim::SuggestionResultSet results = search.getResults(10, 20); // SearchResultSet is iterable for(auto entry: results) { std::cout << entry.getPath() << std::endl; } Creating a zim file ------------------- Creating a zim file with libzim6 was pretty complex. One had to inherit the ``zim::writer::Creator`` to provide the main url. Then it had to inherit from ``zim::writer::Article`` to be able to add different kind of article to the zim file. .. code-block:: c++ class MyCreator: public zim::writer::Creator { Url getMainUrl() const { return Url('A', "index.html"); } }; class RedirectArticle : public zim::writer::Article { public: RedirectArticle(const std::string& title, const std::string& url, const std::string& target) : title(title), url(url), target(target) {} bool isRedirect() const { return true; } zim::writer::Url getUrl() const { return url; } std::string getTitle() const { return title; } zim::writer::Url getRedirectUrl() const { return target; } private: std::string title; std::string url; std::string target; }; class ContentArticle: public zim::writer::Article { ContentArticle(const std::string& title, const std::string& url, const std::string& mimetype, const std::string& content) : title(title), url(url), mimetype(mimetype), content(content) {} bool isRedirect() const { return false; } zim::writer::Url getUrl() const { return url; } std::string getTitle() const { return title; } std::string getMimeType() const { return mimetype; } Blob getData() const { return Blob(content.data(), content.size()); } private: std::string title; std::string url; std::string mimetype; std::string 
content; }; int main() { MyCreator creator(); creator.startZimCreation("out_file.zim"); std::shared_ptr article = std::make_shared("A article", "A/article", "text/html", "A content"); creator.addArticle(article); std::shared_ptr redirect = std::make_shared("A redirect", "A/redirect", "A/article"); creator.addArticle(redirect); creator.finishZimCreation(); } On libzim7, you don't have to inherit the |Creator|. Redirect and metadata are added using |addRedirection| and |addMetadata|. You still may have to inherit |WriterItem| but default implementation are provided (|StringItem|, |FileItem|). .. code-block:: c++ int main() { zim::writer::Creator creator; creator.startZimCreation(); creator.addRedirection("A/redirect", "A redirect", "A/article"); std::shared_ptr item = std::make_shared("article", "text/html", "A article", {}, "A content"); creator.addItem(item); creator.finishZimCreation(); } Metadata and Illustration ......................... Metadata are adding using |addMetadata|. You don't have to create a specific item in ``M`` namespace. The creator now create the ``M/Counter`` metadata for you. You don't have (and must not) add a ``M/Counter`` yourself. Favicon has been deprecated in favor of Illustration. In libzim6, you had to add a file in ``I`` namespace and add a ``-/favicon`` redirection to the file. In libzim7, you have to use the |addIllustration| method. Hints ..... Hints are a new concept in libzim7. This is a generic way to pass information to the creator about how to handle item/redirection. An almost mandatory hint to pass is the hint ``FRONT_ARTICLE`` (|HintKeys|). ``FRONT_ARTICLE`` mark entry (item or redirection) as main article for the reader (typically a html page in opposition to a resource file as css, js, ...). Random and suggestion feature will search only in entries marked as ``FRONT_ARTICLE``. If no entry are marked as ``FRONT_ARTICLE``, all entries will be used. .. Declare some replacement helpers .. 
|Archive| replace:: :class:`zim::Archive` .. |EntryRange| replace:: :class:`zim::Archive::EntryRange` .. |Entry| replace:: :class:`zim::Entry` .. |Item| replace:: :class:`zim::Item` .. |EntryNotFound| replace:: :class:`zim::EntryNotFound` .. |Searcher| replace:: :class:`zim::Searcher` .. |Search| replace:: :class:`zim::Search` .. |Query| replace:: :class:`zim::Query` .. |SearchResultSet| replace:: :class:`zim::SearchResultSet` .. |SuggestionSearcher| replace:: :class:`zim::SuggestionSearcher` .. |getEntryByPath| replace:: :func:`getEntryByPath` .. |getEntryByTitle| replace:: :func:`getEntryByTitle` .. |findByPath| replace:: :func:`findByPath` .. |findByTitle| replace:: :func:`findByTitle` .. |Creator| replace:: :class:`zim::writer::Creator` .. |WriterItem| replace:: :class:`zim::writer::Item` .. |StringItem| replace:: :class:`zim::writer::StringItem` .. |FileItem| replace:: :class:`zim::writer::FileItem` .. |addMetadata| replace:: :func:`addMetadata` .. |addRedirection| replace:: :func:`addRedirection` .. |addIllustration| replace:: :func:`addIllustration` .. |HintKeys| replace:: :enum:`zim::writer::HintKeys` libzim-9.2.3/docs/conf.py000066400000000000000000000042451466367137100152650ustar00rootroot00000000000000# Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
# import os # import sys # sys.path.insert(0, os.path.abspath('.')) # -- Project information ----------------------------------------------------- project = 'libzim' copyright = '2020, libzim-team' author = 'libzim-team' # -- General configuration --------------------------------------------------- on_rtd = os.environ.get('READTHEDOCS', None) == 'True' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ 'breathe', 'exhale' ] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] if not on_rtd: html_theme = 'sphinx_rtd_theme' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] breathe_projects = { "libzim": "./xml" } breathe_default_project = 'libzim' exhale_args = { "containmentFolder": "./api", "rootFileName": "ref_api.rst", "rootFileTitle": "Reference API", "doxygenStripFromPath": "..", "treeViewIsBootstrap": True, "createTreeView" : True, "exhaleExecutesDoxygen": True, "exhaleDoxygenStdin": "INPUT = ../include" } primary_domain = 'cpp' highlight_language = 'cpp' libzim-9.2.3/docs/index.rst000066400000000000000000000005651466367137100156300ustar00rootroot00000000000000.. libzim documentation master file, created by sphinx-quickstart on Fri Jul 24 15:40:50 2020. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. Welcome to libzim's documentation! ================================== .. 
toctree:: :maxdepth: 2 :caption: Contents: usage 6to7 api/ref_api libzim-9.2.3/docs/meson.build000066400000000000000000000003131466367137100161200ustar00rootroot00000000000000 sphinx = find_program('sphinx-build', native:true) sphinx_target = run_target('doc', command: [sphinx, '-bhtml', meson.current_source_dir(), meson.current_build_dir()]) libzim-9.2.3/docs/requirements.txt000066400000000000000000000000261466367137100172430ustar00rootroot00000000000000breathe exhale sphinx libzim-9.2.3/docs/usage.rst000066400000000000000000000141301466367137100156160ustar00rootroot00000000000000Libzim programming ================== Introduction ------------ libzim is written in C++. To use the library, you need the include files of libzim have to link against libzim. Errors are handled with exceptions. When something goes wrong, libzim throws an error, which is always derived from std::exception. All classes are defined in the namespace zim. Copying is allowed and tried to make as cheap as possible. The reading part of the libzim is most of the time thread safe. Searching and creating part are not. You have to serialize access to the class yourself. The main class, which accesses a archive is |Archive|. It has actually a reference to an implementation, so that copies of the class just references the same file. You open a file by passing the file name to the constructor as a std::string. Iterating over entries is made by creating a |EntryRange|. .. code-block:: c++ #include #include #include int main(int argc, char* argv[]) { try { zim::Archive a("wikipedia.zim"); for (auto entry: a.iterByPath()) { std::cout << "path: " << entry.getPath() << " title: " << entry.getTitle() << std::endl; } } catch (const std::exception& e) { std::cerr << e.what() << std::endl; } } In subsequent examples, only code needed to use the library will be explained. The main-function with the error catcher should always be in place. 
Getting entries --------------- Entries are addressed either by path or title. |Archive| has methods |getEntryByPath| and |getEntryByTitle|. Both take 1 parameters : a string, which specifies the path or the title of the entry to get. They return a |Entry|. If the entry cannot be found, they throw the exception |EntryNotFound|. Entry are entry point in a archive for "things". It can be a redirection to another entry or a |Item| .. code-block:: c++ auto entry = archive.getEntryByPath("foo"); if (entry.isRedirect()) { std::cout << "This is a redirection to " << entry.getRedirectEntry().getPath() << std::endl(); } else { std::cout << "This is a item with content : " << entry.getItem().getData() << std::endl(); } As it is pretty common to resolve potential entry redirection and get the final item, you can do it directly using `getItem` : .. code-block:: c++ auto entry = archive.getEntryByPath("foo"); auto item = entry.getItem(true); if (entry.isRedirect()) { std::cout << "Entry " << entry.getPath() << " is a entry pointing to the item " << item.getPath() << std::endl; } else { std::cout << entry.getPath() << " should be equal to " << item.getPath() << std::endl; } std::cout << "The item data is " << item.getData() << std::endl; Finding entries --------------- |getEntryByPath|/|getEntryByTitle| allow to get a exact entry. But you may want to find entries using a more loosely method. |findByPath| and |findByTitle| allow you to find entries starting by the given path/title prefix. |findByPath|/|findByTitle| return a |EntryRange| you can iterate on : .. code-block:: c++ for (auto entry: archive.findEntryByPath("fo")) { std::cout << "Entry " << entry.getPath() << " should starts with fo." << std::endl; } Searching for entries --------------------- Find entries by path/title is nice but you may want to search for entries base on their content. If the zim archive contains a full text index, you can search on it. 
The class |Searcher| allow to search on one or several |Archive|. It allows to create a |Search| which represent a particular search for a |Query|. From a |Search|, you can get a |SearchResultSet| on which you can iterate. .. code-block:: c++ // Create a searcher, something to search on an archive zim::Searcher searcher(archive); // We need a query to specify what to search for zim::Query query; query.setQuery("bar"); // Create a search for the specified query zim::Search search = searcher.search(query); // Now we can get some result from the search. // 20 results starting from offset 10 (from 10 to 30) zim::SearchResultSet results = search.getResults(10, 20); // SearchResultSet is iterable for(auto entry: results) { std::cout << entry.getPath() << std::endl; } Searching for suggestions ------------------------- While |findByTitle| may be a good start to search for suggestion, you may want to search for suggestion for term in the middle of the suggestion. The suggestion API allow you to search for suggestion, using suggestion database included in recent zim files. The suggestion API is pretty close from the search API: .. code-block:: c++ // Create a searcher, something to search on an archive zim::SuggestionSearcher searcher(archive); // Create a search for the specified query zim::SuggestionSearch search = searcher.search("bar"); // Now we can get some result from the search. // 20 results starting from offset 10 (from 10 to 30) zim::SuggestionResultSet results = search.getResults(10, 20); // SearchResultSet is iterable for(auto entry: results) { std::cout << entry.getPath() << std::endl; } If the zim file doesn't contain a suggestion database, the suggestion will fallback to |findByTitle| for you. .. Declare some replacement helpers .. |Archive| replace:: :class:`zim::Archive` .. |EntryRange| replace:: :class:`zim::Archive::EntryRange` .. |Entry| replace:: :class:`zim::Entry` .. |Item| replace:: :class:`zim::Item` .. 
|EntryNotFound| replace:: :class:`zim::EntryNotFound` .. |Searcher| replace:: :class:`zim::Searcher` .. |Search| replace:: :class:`zim::Search` .. |Query| replace:: :class:`zim::Query` .. |SearchResultSet| replace:: :class:`zim::SearchResultSet` .. |getEntryByPath| replace:: :func:`getEntryByPath` .. |getEntryByTitle| replace:: :func:`getEntryByTitle` .. |findByPath| replace:: :func:`findByPath` .. |findByTitle| replace:: :func:`findByTitle` libzim-9.2.3/examples/000077500000000000000000000000001466367137100146475ustar00rootroot00000000000000libzim-9.2.3/examples/createZimExample.cpp000066400000000000000000000046151466367137100206200ustar00rootroot00000000000000/* * Copyright (C) 2012 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include #include #include #include #include class TestItem : public zim::writer::Item { std::string _id; std::string _data; public: TestItem() { } explicit TestItem(const std::string& id); virtual ~TestItem() = default; virtual std::string getPath() const; virtual std::string getTitle() const; virtual std::string getMimeType() const; virtual std::unique_ptr getContentProvider() const; }; TestItem::TestItem(const std::string& id) : _id(id) { std::ostringstream data; data << "this is item " << id << std::endl; _data = data.str(); } std::string TestItem::getPath() const { return std::string("A/") + _id; } std::string TestItem::getTitle() const { return _id; } std::string TestItem::getMimeType() const { return "text/plain"; } std::unique_ptr TestItem::getContentProvider() const { return std::unique_ptr(new zim::writer::StringProvider(_data)); } int main(int argc, char* argv[]) { unsigned max = 16; try { zim::writer::Creator c; c.configVerbose(false).configCompression(zim::Compression::Zstd); c.startZimCreation("foo.zim"); for (unsigned n = 0; n < max; ++n) { std::ostringstream id; id << (n + 1); auto article = std::make_shared(id.str()); c.addItem(article); } c.setMainPath("A/0"); c.finishZimCreation(); } catch (const std::exception& e) { std::cerr << e.what() << std::endl; } } libzim-9.2.3/examples/meson.build000066400000000000000000000003431466367137100170110ustar00rootroot00000000000000 executable('createZimExample', 'createZimExample.cpp', link_with: libzim, include_directories: include_directory, dependencies: [thread_dep, xapian_dep, icu_dep, lzma_dep, zstd_dep, win_deps]) 
libzim-9.2.3/include/000077500000000000000000000000001466367137100144545ustar00rootroot00000000000000libzim-9.2.3/include/meson.build000066400000000000000000000000741466367137100166170ustar00rootroot00000000000000subdir('zim') include_directory = include_directories('.') libzim-9.2.3/include/zim/000077500000000000000000000000001466367137100152535ustar00rootroot00000000000000libzim-9.2.3/include/zim/archive.h000066400000000000000000000634231466367137100170550ustar00rootroot00000000000000/* * Copyright (C) 2020-2021 Matthieu Gautier * Copyright (C) 2021 Maneesh P M * Copyright (C) 2020 Veloman Yunkan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_ARCHIVE_H #define ZIM_ARCHIVE_H #include "zim.h" #include "entry.h" #include "uuid.h" #include #include #include #include #include namespace zim { class FileImpl; enum class EntryOrder { pathOrder, titleOrder, efficientOrder }; /** * The Archive class to access content in a zim file. * * The `Archive` is the main class to access content in a zim file. * `Archive` are lightweight object and can be copied easily. * * An `Archive` is read-only, and internal states (as caches) are protected * from race-condition. Therefore, all methods of `Archive` are threadsafe. 
* * Zim archives exist with two different namespace schemes: An old one and the new one. * The method `hasNewNamespaceScheme` permit to know which namespace is used by the archive. * * When using old namespace scheme: * - User entries may be stored in different namespaces (historically `A`, `I`, `J` or `-`). * So path of the entries contains the namespace as a "top level directory": `A/foo.html`, `I/image.png`, ... * - All API taking or returning a path expect/will return a path with the namespace. * * When using new namespace scheme: * - User entries are always stored without namespace. * (For information, they are stored in the same namespace `C`. Still consider there is no namespace as all API masks it) * As there is no namespace, paths don't contain it: `foo.hmtl`, `image.png`, ... * - All API taking or returning a path expect/will return a path without namespace. * * This difference may seem complex to handle, but not so much. * As all paths returned by API is consistent with paths expected, you simply have to use the path as it is. * Forget about the namespace and if a path has it, simply consider it as a subdirectory. * The only place it could be problematic is when you already have a path stored somewhere (bookmark, ...) * using a scheme and use it on an archive with another scheme. For this case, the method `getEntryByPath` * has a compatibility layer trying to transform a path to the new scheme as a fallback if the entry is not found. * * All methods of archive may throw an `ZimFileFormatError` if the file is invalid. */ class LIBZIM_API Archive { public: template class EntryRange; template class iterator; /** Archive constructor. * * Construct an archive from a filename. * The file is open readonly. * * The filename is the "logical" path. * So if you want to open a split zim file (foo.zimaa, foo.zimab, ...) * you must pass the `foo.zim` path. 
* * @param fname The filename to the file to open (utf8 encoded) */ explicit Archive(const std::string& fname); #ifndef _WIN32 /** Archive constructor. * * Construct an archive from a file descriptor. * Fd is used only at Archive creation. * Ownership of the fd is not taken and it must be closed by caller. * * Note: This function is not available under Windows. * * @param fd The descriptor of a seekable file representing a ZIM archive */ explicit Archive(int fd); /** Archive constructor. * * Construct an archive from a descriptor of a file with an embedded ZIM * archive inside. * Fd is used only at Archive creation. * Ownership of the fd is not taken and it must be closed by caller. * * Note: This function is not available under Windows. * * @param fd The descriptor of a seekable file with a continuous segment * representing a complete ZIM archive. * @param offset The offset of the ZIM archive relative to the beginning * of the file (rather than the current position associated with fd). * @param size The size of the ZIM archive. */ Archive(int fd, offset_type offset, size_type size); /** Archive constructor. * * Construct an archive from a descriptor of a file with an embedded ZIM * archive inside. * Fd is used only at Archive creation. * Ownership of the fd is not taken and it must be closed by caller. * * Note: This function is not available under Windows. * * @param fd A FdInput (tuple) containing the fd (int), offset (offset_type) and size (size_type) * referencing a continuous segment representing a complete ZIM archive. */ explicit Archive(FdInput fd); /** Archive constructor. * * Construct an archive from several file descriptors. * Each part may be embedded in a file. * Fds are used only at Archive creation. * Ownership of the fds is not taken and they must be closed by caller. * Fds (int) can be the same between FdInput if the parts belong to the same file. * * Note: This function is not available under Windows. 
* * @param fds A vector of FdInput (tuple) containing the fd (int), offset (offset_type) and size (size_type) * referencing a series of segments representing a complete ZIM archive. */ explicit Archive(const std::vector& fds); #endif /** Return the filename of the zim file. * * Return the filename as passed to the constructor * (So foo.zim). * * @return The logical filename of the archive. */ const std::string& getFilename() const; /** Return the logical archive size. * * Return the size of the full archive, not the size of the file on the fs. * If the zim is split, return the sum of the size of the parts. * * @return The logical size of the archive. */ size_type getFilesize() const; /** Return the number of entries in the archive. * * Return the total number of entries in the archive, including * internal entries created by libzim itself, metadata, indexes, ... * * @return the number of all entries in the archive. */ entry_index_type getAllEntryCount() const; /** Return the number of user entries in the archive. * * If the notion of "user entries" doesn't exist in the zim archive, * returns `getAllEntryCount()`. * * @return the number of user entries in the archive. */ entry_index_type getEntryCount() const; /** Return the number of articles in the archive. * * The definition of "article" depends of the zim archive. * On recent archives, this correspond to all entries marked as "FRONT_ARTICLE" * at creaton time. * On old archives, this corresponds to all "text/html*" entries. * * @return the number of articles in the archive. */ entry_index_type getArticleCount() const; /** Return the number of media in the archive. * * This definition of "media" is based on the mimetype. * * @return the number of media in the archive. */ entry_index_type getMediaCount() const; /** The uuid of the archive. * * @return the uuid of the archive. */ Uuid getUuid() const; /** Get a specific metadata content. * * Get the content of a metadata stored in the archive. 
* * @param name The name of the metadata. * @return The content of the metadata. * @exception EntryNotFound If the metadata is not in the arcthive. */ std::string getMetadata(const std::string& name) const; /** Get a specific metadata item. * * Get the item associated to a metadata stored in the archive. * * @param name The name of the metadata. * @return The item associated to the metadata. * @exception EntryNotFound If the metadata in not in the archive. */ Item getMetadataItem(const std::string& name) const; /** Get the list of metadata stored in the archive. * * @return The list of metadata in the archive. */ std::vector getMetadataKeys() const; /** Get the illustration item of the archive. * * Illustration is a icon for the archive that can be used in catalog and so to illustrate the archive. * * @param size The size (width and height) of the illustration to get. Default to 48 (48x48px icon) * @return The illustration item. * @exception EntryNotFound If no illustration item can be found. */ Item getIllustrationItem(unsigned int size=48) const; /** Return a list of available sizes (width) for the illustations in the archive. * * Illustration is an icon for the archive that can be used in catalog and elsewehere to illustrate the archive. * An Archive may contains several illustrations with different size. * This method allows to know which illustration are in the archive (by size: width) * * @return A set of size. */ std::set getIllustrationSizes() const; /** Get an entry using its "path" index. * * Use the index of the entry to get the idx'th entry * (entry being sorted by path). * * @param idx The index of the entry. * @return The Entry. * @exception std::out_of_range If idx is greater than the number of entry. */ Entry getEntryByPath(entry_index_type idx) const; /** Get an entry using a path. * * Search an entry in the zim, using its path. * On archive with new namespace scheme, path must not contain the namespace. 
* On archive without new namespace scheme, path must contain the namespace. * A compatibility layer exists to accept "old" path on new archive (and the opposite) * to help using saved path (bookmark) on new archive. * On new archive, we first search the path in `C` namespace, then try to remove the potential namespace in path * and search again in `C` namespace with path "without namespace". * On old archive, we first assume path contains a namespace and if not (or no entry found) search in * namespaces `A`, `I`, `J` and `-`. * * @param path The entry's path. * @return The Entry. * @exception EntryNotFound If no entry has the asked path. */ Entry getEntryByPath(const std::string& path) const; /** Get an entry using its "title" index. * * Use the index of the entry to get the idx'th entry * (entry being sorted by title). * * @param idx The index of the entry. * @return The Entry. * @exception std::out_of_range If idx is greater than the number of entry. */ Entry getEntryByTitle(entry_index_type idx) const; /** Get an entry using a title. * * Get an entry using its title. * * @param title The entry's title. * @return The Entry. * @exception EntryNotFound If no entry has the asked title. */ Entry getEntryByTitle(const std::string& title) const; /** Get an entry using its "cluster" index. * * Use the index of the entry to get the idx'th entry * The actual order of the entry is not really specified. * It is infered from the internal way the entry are stored. * * This method is probably not relevent and is provided for completeness. * You should probably use a iterator using the `efficientOrder`. * * @param idx The index of the entry. * @return The Entry. * @exception std::out_of_range If idx is greater than the number of entry. */ Entry getEntryByClusterOrder(entry_index_type idx) const; /** Get the main entry of the archive. * * @return The Main entry. * @exception EntryNotFound If no main entry has been specified in the archive. 
*/ Entry getMainEntry() const; /** Get a random entry. * * The entry is picked randomly from the front artice list. * * @return A random entry. * @exception EntryNotFound If no valid random entry can be found. */ Entry getRandomEntry() const; /** Check in an entry has path in the archive. * * The path follows the same requirement than `getEntryByPath`. * * @param path The entry's path. * @return True if the path in the archive, false else. */ bool hasEntryByPath(const std::string& path) const { try{ getEntryByPath(path); return true; } catch(...) { return false; } } /** Check in an entry has title in the archive. * * @param title The entry's title. * @return True if the title in the archive, false else. */ bool hasEntryByTitle(const std::string& title) const { try{ getEntryByTitle(title); return true; } catch(...) { return false; } } /** Check if archive has a main entry * * @return True if the archive has a main entry. */ bool hasMainEntry() const; /** Check if archive has a favicon entry * * @param size The size (width and height) of the illustration to check. Default to 48 (48x48px icon) * @return True if the archive has a corresponding illustration entry. * (Always True if the archive has no illustration, but a favicon) */ bool hasIllustration(unsigned int size=48) const; /** Check if the archive has a fulltext index. * * @return True if the archive has a fulltext index */ bool hasFulltextIndex() const; /** Check if the archive has a title index. * * @return True if the archive has a title index */ bool hasTitleIndex() const; /** Get a "iterable" by path order. * * This method allow to iterate on all user entries using a path order. * If the notion of "user entries" doesn't exists (for old zim archive), * this iterate on all entries in the zim file. * * ``` * for(auto& entry:archive.iterByPath()) { * ... * } * ``` * * @return A range on all the entries, in path order. */ EntryRange iterByPath() const; /** Get a "iterable" by title order. 
* * This method allow to iterate on all articles using a title order. * The definition of "article" depends of the zim archive. * On recent archives, this correspond to all entries marked as "FRONT_ARTICLE" * at creaton time. * On old archives, this correspond to all entries in 'A' namespace. * Few archives may have been created without namespace but also without specific * article listing. In this case, this iterate on all user entries. * * ``` * for(auto& entry:archive.iterByTitle()) { * ... * } * ``` * * @return A range on all the entries, in title order. */ EntryRange iterByTitle() const; /** Get a "iterable" by a efficient order. * * This method allow to iterate on all user entries using a effictient order. * If the notion of "user entries" doesn't exists (for old zim archive), * this iterate on all entries in the zim file. * * ``` * for(auto& entry:archive.iterEfficient()) { * ... * } * ``` * * @return A range on all the entries, in efficitent order. */ EntryRange iterEfficient() const; /** Find a range of entries starting with path. * * When using new namespace scheme, path must not contain the namespace (`foo.html`). * When using old namespace scheme, path must contain the namespace (`A/foo.html`). * Contrary to `getEntryByPath`, there is no compatibility layer, path must follow the archive scheme. * * @param path The path prefix to search for. * @return A range starting from the first entry starting with path * and ending past the last entry. * If no entry starts with `path`, begin == end. */ EntryRange findByPath(std::string path) const; /** Find a range of entry starting with title. * * When using old namespace scheme, entry title is search in `A` namespace. * * @param title The title prefix to search for. * @return A range starting from the first entry starting with title * and ending past the last entry. * If no entry starts with `title`, begin == end. */ EntryRange findByTitle(std::string title) const; /** hasChecksum. 
* * The checksum is not the checksum of the file. * It is an internal checksum stored in the zim file. * * @return True if the archive has a checksum. */ bool hasChecksum() const; /** getChecksum. * * @return the checksum stored in the archive. * If the archive has no checksum return an empty string. */ std::string getChecksum() const; /** Check that the zim file is valid (in regard to its checksum). * * If the zim file has no checksum return false. * * @return True if the file is valid. */ bool check() const; /** Check the integrity of the zim file. * * Run different type of checks to verify the zim file is valid * (in regard to the zim format). * This may be time consuming. * * @return True if the file is valid. */ bool checkIntegrity(IntegrityCheck checkType); /** Check if the file is split in the filesystem. * * @return True if the archive is split in different file (foo.zimaa, foo.zimbb). */ bool isMultiPart() const; /** Get if the zim archive uses the new namespace scheme. * * Recent zim file use the new namespace scheme. * * On user perspective, it means that : * - On old namespace scheme : * . All entries are accessible, either using `getEntryByPath` with a specific namespace * or simply iterating over the entries (with `iter*` methods). * . Entry's path has namespace included ("A/foo.html") * - On new namespace scheme : * . Only the "user" entries are accessible with `getEntryByPath` and `iter*` methods. * To access metadatas, use `getMetadata` method. * . Entry's path do not contains namespace ("foo.html") */ bool hasNewNamespaceScheme() const; /** Get a shared ptr on the FileImpl * * @internal * @return The shared_ptr */ std::shared_ptr getImpl() const { return m_impl; } #ifdef ZIM_PRIVATE cluster_index_type getClusterCount() const; offset_type getClusterOffset(cluster_index_type idx) const; entry_index_type getMainEntryIndex() const; /** Get an entry using a path and a namespace. 
* * @param ns The namespace to search in * @param path The entry's path (without namespace) * @return The entry * @exception EntryNotFound If no entry has been found. */ Entry getEntryByPathWithNamespace(char ns, const std::string& path) const; #endif private: std::shared_ptr m_impl; }; template LIBZIM_API entry_index_type _toPathOrder(const FileImpl& file, entry_index_type idx); template<> LIBZIM_API entry_index_type _toPathOrder(const FileImpl& file, entry_index_type idx); template<> LIBZIM_API entry_index_type _toPathOrder(const FileImpl& file, entry_index_type idx); template<> LIBZIM_API entry_index_type _toPathOrder(const FileImpl& file, entry_index_type idx); /** * A range of entries in an `Archive`. * * `EntryRange` represents a range of entries in a specific order. * * An `EntryRange` can't be modified is consequently threadsafe. */ template class LIBZIM_API Archive::EntryRange { public: explicit EntryRange(const std::shared_ptr file, entry_index_type begin, entry_index_type end) : m_file(file), m_begin(begin), m_end(end) {} iterator begin() const { return iterator(m_file, entry_index_type(m_begin)); } iterator end() const { return iterator(m_file, entry_index_type(m_end)); } int size() const { return m_end - m_begin; } EntryRange offset(int start, int maxResults) const { auto begin = m_begin + start; if (begin > m_end) { begin = m_end; } auto end = m_end; if (begin + maxResults < end) { end = begin + maxResults; } return EntryRange(m_file, begin, end); } private: std::shared_ptr m_file; entry_index_type m_begin; entry_index_type m_end; }; /** * An iterator on an `Archive`. * * `Archive::iterator` stores an internal state which is not protected * from race-condition. It is not threadsafe. * * An `EntryRange` can't be modified and is consequently threadsafe. * * Be aware that the referenced/pointed Entry is generated and stored * in the iterator itself. * Once the iterator is destructed or incremented/decremented, you must NOT * use the Entry. 
*/
// NOTE(review): the template parameter/argument lists on this declaration (and on
// the smart-pointer members and `_toPathOrder` call below) look like they were
// stripped when this text was extracted; tokens are kept as found - restore from upstream.
template
class LIBZIM_API Archive::iterator
{
  public:
    /* Archive::iterator is conceptually a bidirectional iterator.
     * But std *LegacyBidirectionalIterator* is also a *LegacyForwardIterator* and
     * it would impose on us that:
     * > Given a and b, dereferenceable iterators of type It:
     * > If a and b compare equal (a == b is contextually convertible to true)
     * > then either they are both non-dereferenceable or *a and *b are references bound to the same object.
     * and
     * > the LegacyForwardIterator requirements requires dereference to return a reference.
     * Which cannot be as we create the entry on demand.
     *
     * So we stick with declaring ourselves as an `input_iterator`.
     */
    using iterator_category = std::input_iterator_tag;
    using value_type = Entry;
    using pointer = Entry*;
    using reference = Entry&;

    /** Build an iterator positioned at index `idx` of `file`; no Entry is created yet. */
    explicit iterator(const std::shared_ptr file, entry_index_type idx)
      : m_file(file),
        m_idx(idx),
        m_entry(nullptr)
    {}

    /** Copy constructor: copies the position and clones the cached Entry, if any. */
    iterator(const iterator& other)
      : m_file(other.m_file),
        m_idx(other.m_idx),
        m_entry(other.m_entry?new Entry(*other.m_entry):nullptr)
    {}

    /** Two iterators are equal when they share the same file and index
     * (the cached Entry is not part of the comparison).
     */
    bool operator== (const iterator& it) const
    { return m_file == it.m_file && m_idx == it.m_idx; }

    bool operator!= (const iterator& it) const
    { return !operator==(it); }

    iterator& operator=(iterator&& it) = default;

    /** Copy assignment: copies the position only. The cached Entry is dropped
     * (unlike the copy constructor, which clones it) and is rebuilt lazily on
     * the next dereference.
     *
     * The parameter is a const reference so that const iterators can be
     * assigned from; the previous `iterator&` parameter rejected them.
     */
    iterator& operator=(const iterator& it)
    {
      m_entry.reset();
      m_idx = it.m_idx;
      m_file = it.m_file;
      return *this;
    }

    /** Advance to the next index and invalidate the cached Entry. */
    iterator& operator++()
    {
      ++m_idx;
      m_entry.reset();
      return *this;
    }

    iterator operator++(int)
    {
      auto it = *this;
      operator++();
      return it;
    }

    /** Step back to the previous index and invalidate the cached Entry. */
    iterator& operator--()
    {
      --m_idx;
      m_entry.reset();
      return *this;
    }

    iterator operator--(int)
    {
      auto it = *this;
      operator--();
      return it;
    }

    /** Dereference: the Entry is created on first use and cached in the iterator,
     * so the returned reference is only valid until the iterator moves or dies.
     */
    const Entry& operator*() const
    {
      if (!m_entry) {
        // m_idx is expressed in this iterator's order; Entry is built from the
        // index returned by _toPathOrder.
        m_entry.reset(new Entry(m_file, _toPathOrder(*m_file, m_idx)));
      }
      return *m_entry;
    }

    const Entry* operator->() const
    {
      operator*();  // make sure the cached Entry exists
      return m_entry.get();
    }

  private:
    std::shared_ptr m_file;
    entry_index_type m_idx;
    mutable std::unique_ptr m_entry;  // lazily built by operator*, hence mutable
};

/**
 * The set of the integrity checks to be
performed by `zim::validate()`.
 */
typedef std::bitset IntegrityCheckList;

/** Check the integrity of the zim file.
 *
 * Run the specified checks to verify the zim file is valid
 * (with regard to the zim format). Some checks can be quite slow.
 *
 * @param zimPath The path of the ZIM archive to be checked.
 * @param checksToRun The set of checks to perform.
 * @return False if any check fails, true otherwise.
 */
bool LIBZIM_API validate(const std::string& zimPath, IntegrityCheckList checksToRun);

}

#endif // ZIM_ARCHIVE_H
libzim-9.2.3/include/zim/blob.h000066400000000000000000000046151466367137100163500ustar00rootroot00000000000000
/*
 * Copyright (C) 2018 Matthieu Gautier
 * Copyright (C) 2009 Tommi Maekitalo
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
 * NON-INFRINGEMENT. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#ifndef ZIM_BLOB_H
#define ZIM_BLOB_H

#include "zim.h"

#include
#include
#include
#include

namespace zim
{
  /**
   * A blob is a pointer to data, potentially stored in an `Archive`.
   *
   * All `Blob`'s methods are threadsafe.
   */
  class LIBZIM_API Blob
  {
    public: // types
      using DataPtr = std::shared_ptr;

    public: // functions
      /**
       * Construct an empty `Blob`.
       */
      Blob();

      /**
       * Construct a `Blob` pointing to `data`.
       *
       * The created blob only points to the data and doesn't own it.
       * The user must make sure the data is not freed while the blob is in use.
       */
      Blob(const char* data, size_type size);

      /**
       * Construct a `Blob` pointing to `data`.
       *
       * The created blob shares the ownership on data.
       */
      Blob(const DataPtr& buffer, size_type size);

      /** Copy the `size()` bytes starting at `data()` into a `std::string`. */
      operator std::string() const { return std::string(_data.get(), _size); }
      /** Pointer to the first byte of the blob. */
      const char* data() const { return _data.get(); }
      /** Pointer one past the last byte of the blob (`data() + size()`). */
      const char* end() const { return _data.get() + _size; }
      /** Size of the blob, in bytes. */
      size_type size() const { return _size; }

    private:
      DataPtr _data;
      size_type _size;
  };

  /** Write the raw bytes of `blob` to `out`; nothing is written when `data()` is null. */
  inline std::ostream& operator<< (std::ostream& out, const Blob& blob)
  {
    if (blob.data())
      out.write(blob.data(), blob.size());
    return out;
  }

  /** Two blobs are equal when they have the same size and the same bytes
   * (the pointers themselves are not compared).
   */
  inline bool operator== (const Blob& b1, const Blob& b2)
  {
    return b1.size() == b2.size()
        && std::equal(b1.data(), b1.data() + b1.size(), b2.data());
  }
}

#endif // ZIM_BLOB_H
libzim-9.2.3/include/zim/entry.h000066400000000000000000000057271466367137100166000ustar00rootroot00000000000000
/*
 * Copyright (C) 2020 Matthieu Gautier
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
 * NON-INFRINGEMENT. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#ifndef ZIM_ENTRY_H
#define ZIM_ENTRY_H

#include "zim.h"

#include
#include

namespace zim
{
  class Item;
  class Dirent;
  class FileImpl;

  /**
   * An entry in an `Archive`.
   *
   * All `Entry`'s methods are threadsafe.
*/
  class LIBZIM_API Entry
  {
    public:
      explicit Entry(std::shared_ptr file_, entry_index_type idx_);

      bool isRedirect() const;
      std::string getTitle() const;
      std::string getPath() const;

      /** Get the item associated to the entry.
       *
       * An item is associated only if the entry is not a redirect.
       * For convenience, if follow is true, return the item associated to the targeted entry.
       *
       * @param follow True if the redirection is resolved before getting the item. (false by default)
       * @return The Item associated to the entry.
       * @exception InvalidType if the entry is a redirection and follow is false.
       */
      Item getItem(bool follow=false) const;

      /** Get the item associated to the target entry.
       *
       * If there is a chain of redirection, the whole chain is resolved
       * and the item associated to the last entry is returned.
       *
       * @return the Item associated with the targeted entry.
       * @exception InvalidType if the entry is not a redirection.
       */
      Item getRedirect() const;

      /** Get the Entry targeted by the entry.
       *
       * @return The entry directly targeted by this redirect entry.
       * @exception InvalidEntry if the entry is not a redirection.
       *            NOTE(review): `getRedirect()` documents `InvalidType` for the
       *            same condition, and error.h describes `InvalidEntry` as a
       *            Creator-side exception - confirm which one is really thrown.
       */
      Entry getRedirectEntry() const;

      /** Get the index of the Entry targeted by the entry.
       *
       * @return The index of the entry directly targeted by this redirect
       *         entry.
       * @exception InvalidEntry if the entry is not a redirection.
       *            NOTE(review): same doubt as for `getRedirectEntry()` -
       *            confirm `InvalidEntry` vs `InvalidType`.
       */
      entry_index_type getRedirectEntryIndex() const;

      /** The index this entry was constructed with. */
      entry_index_type getIndex() const { return m_idx; }

    protected: // so that Item can be implemented as a wrapper over Entry
      std::shared_ptr m_file;
      entry_index_type m_idx;
      std::shared_ptr m_dirent;
  };

}

#endif // ZIM_ENTRY_H
libzim-9.2.3/include/zim/error.h000066400000000000000000000110721466367137100165560ustar00rootroot00000000000000
/*
 * Copyright (C) 2020 Matthieu Gautier
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
 * NON-INFRINGEMENT. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#ifndef ZIM_ERROR_H
#define ZIM_ERROR_H

#include "zim.h"
#include "tools.h"

#include
#include

namespace zim
{
  /* Runtime error carrying a message.
   * NOTE(review): presumably raised when a ZIM file is malformed - confirm against the throw sites.
   */
  class LIBZIM_API ZimFileFormatError : public std::runtime_error
  {
    public:
      explicit ZimFileFormatError(const std::string& msg)
        : std::runtime_error(msg)
      { }
  };

  /* Logic error documented by `Entry::getItem()` / `Entry::getRedirect()` for
   * a redirect/non-redirect mismatch.
   */
  class LIBZIM_API InvalidType: public std::logic_error
  {
    public:
      explicit InvalidType(const std::string& msg)
        : std::logic_error(msg)
      {}
  };

  /* Runtime error documented by the `Archive` lookup methods when the
   * requested entry does not exist.
   */
  class LIBZIM_API EntryNotFound : public std::runtime_error
  {
    public:
      explicit EntryNotFound(const std::string& msg)
        : std::runtime_error(msg)
      {}
  };

  /* Exception thrown by the Creator in case of error.
   *
   * Most exceptions actually thrown are inheriting this exception.
*/
  class LIBZIM_API CreatorError : public std::runtime_error
  {
    public:
      explicit CreatorError(const std::string& message)
        : std::runtime_error(message)
      {}
  };

  /* Exception thrown when an entry cannot be added to the Creator. */
  class LIBZIM_API InvalidEntry : public CreatorError
  {
    public:
      explicit InvalidEntry(const std::string& message)
        : CreatorError(message)
      {}
  };

  /* Exception thrown if an incoherence in the user implementation has been detected.
   *
   * Users need to implement interfaces such as:
   * - ContentProvider
   * - IndexData
   * - Item
   *
   * If an incoherence has been detected in those implementations, an
   * `IncoherentImplementationError` will be thrown.
   */
  class LIBZIM_API IncoherentImplementationError : public CreatorError
  {
    public:
      explicit IncoherentImplementationError(const std::string& message)
        : CreatorError(message)
      {}
  };

  /* Exception thrown in the main thread when another exception has been
   * thrown in another worker thread.
   *
   * Creator uses different worker threads to do background work.
   * If an exception is thrown in one of these threads, it is caught and
   * "rethrown" in the main thread as soon as possible with an `AsyncError`.
   *
   * AsyncError contains the original exception. You can rethrow the original
   * exception using `rethrow`:
   *
   * ```
   * try {
   *   creator->addStuff(...);
   * } catch (const zim::AsyncError& e) {
   *   // An exception has been thrown in a worker thread
   *   try {
   *     e.rethrow();
   *   } catch (const std::exception& original_exception) {
   *     // original_exception is the exception thrown in the worker thread
   *     ...
*   }
   * }
   * ```
   */
  class LIBZIM_API AsyncError : public CreatorError
  {
    public:
      /* Wrap `exception`; the error message is derived from it by `buildErrorMessage`. */
      explicit AsyncError(const std::exception_ptr exception)
        : CreatorError(buildErrorMessage(exception)),
          m_exception(exception)
      {}

      /* Rethrow the wrapped (original) exception. Never returns. */
      [[noreturn]] void rethrow() const
      {
        std::rethrow_exception(m_exception);
      }

    private: // data
      std::exception_ptr m_exception;

    private: // function
      /* Build the message of the AsyncError from the wrapped exception.
       *
       * The exception is rethrown locally only to inspect its dynamic type:
       * - for a `std::exception`, the message contains `typeid(e).name()`
       *   followed by `e.what()`;
       * - for anything else, a fixed "unknown" message is returned.
       */
      static std::string buildErrorMessage(const std::exception_ptr exception)
      {
        try {
          std::rethrow_exception(exception);
        } catch (const std::exception& e) {
          return Formatter() << "Asynchronous error: "
                             << typeid(e).name() << std::endl
                             << e.what();
        } catch (...) {
          return "Unknown asynchronous exception";
        }
      }
  };

  /* Exception thrown when the creator is in error state.
   *
   * If the creator is in error state (mostly because an AsyncError has already
   * been thrown), any call to any method on it will throw a `CreatorStateError`.
   */
  class LIBZIM_API CreatorStateError : public CreatorError
  {
    public:
      explicit CreatorStateError()
        : CreatorError("Creator is in error state.")
      {}
  };

}

#endif // ZIM_ERROR_H
libzim-9.2.3/include/zim/item.h000066400000000000000000000064141466367137100163670ustar00rootroot00000000000000
/*
 * Copyright (C) 2021 Veloman Yunkan
 * Copyright (C) 2020 Matthieu Gautier
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
 * NON-INFRINGEMENT. See the GNU General Public License for more details.
*
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#ifndef ZIM_ITEM_H
#define ZIM_ITEM_H

#include "zim.h"
#include "blob.h"
#include "entry.h"

#include

namespace zim
{
  /**
   * An `Item` in an `Archive`
   *
   * There is no public constructor - the only way to obtain an `Item`
   * is via `Entry::getItem()` or `Entry::getRedirect()`.
   *
   * All `Item`'s methods are threadsafe.
   */
  class LIBZIM_API Item : private Entry
  {
    public: // types
      typedef std::pair DirectAccessInfo;

    public: // functions
      /** Title of the item (forwarded to `Entry::getTitle()`). */
      std::string getTitle() const { return Entry::getTitle(); }
      /** Path of the item (forwarded to `Entry::getPath()`). */
      std::string getPath() const { return Entry::getPath(); }
      std::string getMimetype() const;

      /** Get the data associated to the item
       *
       * Get the data of the item, starting at offset.
       *
       * @param offset The number of bytes to skip at the beginning of the data.
       * @return A blob corresponding to the data.
       */
      Blob getData(offset_type offset=0) const;

      /** Get the data associated to the item
       *
       * Get the `size` bytes of data of the item, starting at offset.
       *
       * @param offset The number of bytes to skip at the beginning of the data.
       * @param size The number of bytes to read.
       * @return A blob corresponding to the data.
       */
      Blob getData(offset_type offset, size_type size) const;

      /** The size of the item.
       *
       * @return The size (in bytes) of the item.
       */
      size_type getSize() const;

      /** Direct access information.
       *
       * Some items are stored raw in the zim file.
       * If possible, this function gives information about which file
       * and at which offset to read to get the data.
       *
       * It can be useful as an optimisation when interacting with other systems
       * by reopening the file and reading the content, bypassing libzim.
       *
       * @return A pair of filename/offset specifying where to read the content.
       *         If it is not possible to have direct access for this item,
       *         return a pair of `{"", 0}`
       */
      DirectAccessInfo getDirectAccessInformation() const;

      /** Index of the item (forwarded to `Entry::getIndex()`). */
      entry_index_type getIndex() const { return Entry::getIndex(); }

#ifdef ZIM_PRIVATE
      cluster_index_type getClusterIndex() const;

      blob_index_type getBlobIndex() const;
#endif

    private: // functions
      // Private: only `Entry` (declared friend below) can build an `Item`.
      explicit Item(const Entry& entry);

      friend class Entry;
  };

}

#endif // ZIM_ITEM_H
libzim-9.2.3/include/zim/meson.build000066400000000000000000000011021466367137100174070ustar00rootroot00000000000000
zim_config = configure_file(output : 'zim_config.h',
                            configuration : public_conf)

install_headers(
    'archive.h',
    'blob.h',
    'error.h',
    'item.h',
    'entry.h',
    'uuid.h',
    'zim.h',
    'suggestion.h',
    'suggestion_iterator.h',
    'tools.h',
    'version.h',
    zim_config,
    subdir:'zim'
)

if xapian_dep.found()
    install_headers(
        'search.h',
        'search_iterator.h',
        subdir:'zim'
    )
endif

install_headers(
    'writer/item.h',
    'writer/creator.h',
    'writer/contentProvider.h',
    subdir:'zim/writer'
)
libzim-9.2.3/include/zim/search.h000066400000000000000000000142271466367137100166770ustar00rootroot00000000000000
/*
 * Copyright (C) 2017-2021 Matthieu Gautier
 * Copyright (C) 2021 Maneesh P M
 * Copyright (C) 2007 Tommi Maekitalo
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
 * NON-INFRINGEMENT. See the GNU General Public License for more details.
*
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#ifndef ZIM_SEARCH_H
#define ZIM_SEARCH_H

#include "search_iterator.h"
#include "archive.h"

#include
#include
#include

namespace Xapian {
  class Enquire;
  class MSet;
};

namespace zim
{

class Archive;
class InternalDataBase;
class Query;
class Search;
class SearchResultSet;

/**
 * A Searcher is an object doing fulltext search on a set of Archives.
 *
 * A Searcher is mainly used to create new `Search` objects.
 * Internally, this is mainly a wrapper around a Xapian database.
 *
 * You should consider that all search operations are NOT threadsafe.
 * It is up to you to protect your calls to avoid race conditions.
 * However, Searcher (and subsequent classes) do not maintain a global/shared state.
 * You can create several Searchers and use them in different threads.
 */
class LIBZIM_API Searcher
{
  public:
    /** Searcher constructor.
     *
     * Construct a searcher on top of several archives (multi search).
     *
     * @param archives A list(vector) of archives to search on.
     */
    explicit Searcher(const std::vector& archives);

    /** Searcher constructor.
     *
     * Construct a searcher on top of one archive.
     *
     * @param archive An archive to search on.
     */
    explicit Searcher(const Archive& archive);
    Searcher(const Searcher& other);
    Searcher& operator=(const Searcher& other);
    Searcher(Searcher&& other);
    Searcher& operator=(Searcher&& other);
    ~Searcher();

    /** Add an archive to the searcher.
     *
     * Adding an archive to a searcher does not invalidate already created searches.
     */
    Searcher& addArchive(const Archive& archive);

    /** Create a search for a specific query.
     *
     * The search is made on all archives added to the Searcher.
     *
     * @param query The Query to search.
     *
     * @throws std::runtime_error if the searcher does not have a valid
     *         FT database.
     */
    Search search(const Query& query);

    /** Set the verbosity of search operations.
     *
     * @param verbose The verbose mode to set
     */
    void setVerbose(bool verbose);

  private: // methods
    void initDatabase();

  private: // data
    std::shared_ptr mp_internalDb;
    std::vector m_archives;
    bool m_verbose;
};

/**
 * A Query represents a query.
 *
 * It describes what has to be searched and how.
 * A Query is "database" independent.
 */
class LIBZIM_API Query
{
  public:
    /** Query constructor.
     *
     * Create a query from a textual query string (an empty query by default).
     */
    Query(const std::string& query = "");

    /** Set the textual query of the Query.
     *
     * @param query The string to search for.
     */
    Query& setQuery(const std::string& query);

    /** Set the geographical query of the Query.
     *
     * Some articles may be geo positioned.
     * You can search for articles within a certain distance of a point.
     *
     * @param latitude The latitude of the point.
     * @param longitude The longitude of the point.
     * @param distance The maximal distance from the point.
     */
    Query& setGeorange(float latitude, float longitude, float distance);

    // Public state of the query; every member has an in-class default.
    std::string m_query { "" };

    bool m_geoquery { false };
    float m_latitude { 0 };
    float m_longitude { 0 };
    float m_distance { 0 } ;
};

/**
 * A Search represents a particular search, based on a `Searcher`.
 *
 * This is somehow the combination of a `Searcher` (what to search on)
 * and a `Query` (what to search for).
 */
class LIBZIM_API Search
{
  public:
    Search(Search&& s);
    Search& operator=(Search&& s);
    ~Search();

    /** Get a set of results for this search.
     *
     * @param start The beginning of the range to get
     *              (offset of the first result).
     * @param maxResults The maximum number of results to return
     *                   (offset of last result from the start of range).
     */
    const SearchResultSet getResults(int start, int maxResults) const;

    /** Get the number of estimated results for this search.
     *
     * As the name suggests, it is an estimation of the number of results.
     */
    int getEstimatedMatches() const;

  private: // methods
    // Private: a `Search` is created by `Searcher` (declared friend below).
    Search(std::shared_ptr p_internalDb, const Query& query);
    Xapian::Enquire& getEnquire() const;

  private: // data
    std::shared_ptr mp_internalDb;
    mutable std::unique_ptr mp_enquire;
    Query m_query;

    friend class Searcher;
};

/**
 * The `SearchResultSet` represents a range of results corresponding to a `Search`.
 *
 * It mainly allows to get an iterator.
 */
class LIBZIM_API SearchResultSet
{
  public:
    typedef SearchIterator iterator;

    /** The begin iterator on the result range. */
    iterator begin() const;

    /** The end iterator on the result range. */
    iterator end() const;

    /** The size of the SearchResultSet (end()-begin()) */
    int size() const;

  private:
    // Private: a `SearchResultSet` is created by `Search` (declared friend below).
    SearchResultSet(std::shared_ptr p_internalDb, Xapian::MSet&& mset);
    SearchResultSet(std::shared_ptr p_internalDb);

  private: // data
    std::shared_ptr mp_internalDb;
    std::shared_ptr mp_mset;

    friend class Search;
};

} //namespace zim

#endif // ZIM_SEARCH_H
libzim-9.2.3/include/zim/search_iterator.h000066400000000000000000000067071466367137100206140ustar00rootroot00000000000000
/*
 * Copyright (C) 2021 Maneesh P M
 * Copyright (C) 2020 Matthieu Gautier
 * Copyright (C) 2006 Tommi Maekitalo
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
 * NON-INFRINGEMENT. See the GNU General Public License for more details.
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_SEARCH_ITERATOR_H #define ZIM_SEARCH_ITERATOR_H #include #include #include "entry.h" #include "archive.h" #include "uuid.h" namespace zim { class SearchResultSet; /** * A interator on search result (an Entry) * * Be aware that the referenced/pointed Entry is generated and stored * in the iterator itself. * Once the iterator is destructed or incremented/decremented, you must NOT * use the Entry. */ class LIBZIM_API SearchIterator { friend class zim::SearchResultSet; public: /* SuggestionIterator is conceptually a bidirectional iterator. * But std *LegayBidirectionalIterator* is also a *LegacyForwardIterator* and * it would impose us that : * > Given a and b, dereferenceable iterators of type It: * > If a and b compare equal (a == b is contextually convertible to true) * > then either they are both non-dereferenceable or *a and *b are references bound to the same object. * and * > the LegacyForwardIterator requirements requires dereference to return a reference. * Which cannot be as we create the entry on demand. * * So we are stick with declaring ourselves at `input_iterator`. 
*/ using iterator_category = std::input_iterator_tag; using value_type = Entry; using pointer = Entry*; using reference = Entry&; SearchIterator(); SearchIterator(const SearchIterator& it); SearchIterator& operator=(const SearchIterator& it); SearchIterator(SearchIterator&& it); SearchIterator& operator=(SearchIterator&& it); ~SearchIterator(); bool operator== (const SearchIterator& it) const; bool operator!= (const SearchIterator& it) const; SearchIterator& operator++(); SearchIterator operator++(int); SearchIterator& operator--(); SearchIterator operator--(int); std::string getPath() const; std::string getTitle() const; int getScore() const; std::string getSnippet() const; int getWordCount() const; DEPRECATED int getSize() const; int getFileIndex() const; Uuid getZimId() const; reference operator*() const; pointer operator->() const; #ifdef ZIM_PRIVATE std::string getDbData() const; #endif private: struct InternalData; std::unique_ptr internal; SearchIterator(InternalData* internal_data); bool isEnd() const; }; } // namespace zim #endif // ZIM_SEARCH_ITERATOR_H libzim-9.2.3/include/zim/suggestion.h000066400000000000000000000131041466367137100176120ustar00rootroot00000000000000/* * Copyright (C) 2021 Maneesh P M * Copyright (C) 2017-2021 Matthieu Gautier * Copyright (C) 2007 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_SUGGESTION_H #define ZIM_SUGGESTION_H #include "suggestion_iterator.h" #include "archive.h" #if defined(LIBZIM_WITH_XAPIAN) namespace Xapian { class Enquire; class MSet; }; #endif namespace zim { class SuggestionSearcher; class SuggestionSearch; class SuggestionIterator; class SuggestionDataBase; /** * A SuggestionSearcher is a object suggesting over titles of an Archive * * A SuggestionSearcher is mainly used to create new `SuggestionSearch` * Internaly, this is a wrapper around a SuggestionDataBase with may or may not * include a Xapian index. * * You should consider that all search operations are NOT threadsafe. * It is up to you to protect your calls to avoid race competition. * However, SuggestionSearcher (and subsequent classes) do not maintain a global/ * share state You can create several Searchers and use them in different threads. */ class LIBZIM_API SuggestionSearcher { public: /** SuggestionSearcher constructor. * * Construct a SuggestionSearcher on top of an archive. * * @param archive An archive to suggest on. */ explicit SuggestionSearcher(const Archive& archive); SuggestionSearcher(const SuggestionSearcher& other); SuggestionSearcher& operator=(const SuggestionSearcher& other); SuggestionSearcher(SuggestionSearcher&& other); SuggestionSearcher& operator=(SuggestionSearcher&& other); ~SuggestionSearcher(); /** Create a SuggestionSearch for a specific query. * * The search is made on the archive under the SuggestionSearcher. * * @param query The SuggestionQuery to search. */ SuggestionSearch suggest(const std::string& query); /** Set the verbosity of search operations. 
* * @param verbose The verbose mode to set */ void setVerbose(bool verbose); private: // methods void initDatabase(); private: // data std::shared_ptr mp_internalDb; Archive m_archive; bool m_verbose; }; /** * A SuggestionSearch represent a particular suggestion search, based on a `SuggestionSearcher`. */ class LIBZIM_API SuggestionSearch { public: SuggestionSearch(SuggestionSearch&& s); SuggestionSearch& operator=(SuggestionSearch&& s); ~SuggestionSearch(); /** Get a set of results for this search. * * @param start The begining of the range to get * (offset of the first result). * @param maxResults The maximum number of results to return * (offset of last result from the start of range). */ const SuggestionResultSet getResults(int start, int maxResults) const; /** Get the number of estimated results for this suggestion search. * * As the name suggest, it is a estimation of the number of results. */ int getEstimatedMatches() const; private: // methods SuggestionSearch(std::shared_ptr p_internalDb, const std::string& query); private: // data std::shared_ptr mp_internalDb; std::string m_query; friend class SuggestionSearcher; #ifdef ZIM_PRIVATE public: // Close Xapian db to force range based search const void forceRangeSuggestion(); #endif // Xapian based methods and data #if defined(LIBZIM_WITH_XAPIAN) private: // Xapian based methods Xapian::Enquire& getEnquire() const; private: // Xapian based data mutable std::unique_ptr mp_enquire; #endif // LIBZIM_WITH_XAPIAN }; /** * The `SuggestionResultSet` represent a range of results corresponding to a `SuggestionSearch`. * * It mainly allows to get a iterator either based on an MSetIterator or a RangeIterator. */ class LIBZIM_API SuggestionResultSet { public: typedef SuggestionIterator iterator; typedef Archive::EntryRange EntryRange; /** The begin iterator on the result range. */ iterator begin() const; /** The end iterator on the result range. 
*/ iterator end() const; /** The size of the SearchResult (end()-begin()) */ int size() const; private: // data std::shared_ptr mp_internalDb; std::shared_ptr mp_entryRange; private: SuggestionResultSet(EntryRange entryRange); friend class SuggestionSearch; // Xapian based methods and data #if defined(LIBZIM_WITH_XAPIAN) private: // Xapian based methods SuggestionResultSet(std::shared_ptr p_internalDb, Xapian::MSet&& mset); private: // Xapian based data std::shared_ptr mp_mset; #endif // LIBZIM_WITH_XAPIAN }; } // namespace zim #endif // ZIM_SUGGESTION_H libzim-9.2.3/include/zim/suggestion_iterator.h000066400000000000000000000107161466367137100215310ustar00rootroot00000000000000/* * Copyright (C) 2021 Maneesh P M * Copyright (C) 2020 Matthieu Gautier * Copyright (C) 2006 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_SUGGESTION_ITERATOR_H #define ZIM_SUGGESTION_ITERATOR_H #include "archive.h" #include namespace zim { class SuggestionResultSet; class SuggestionItem; class SearchIterator; /** * A interator on suggestion. * * Be aware that the referenced/pointed SuggestionItem is generated and stored * in the iterator itself. * Once the iterator is destructed or incremented/decremented, you must NOT * use the SuggestionItem. 
*/ class LIBZIM_API SuggestionIterator { typedef Archive::iterator RangeIterator; friend class SuggestionResultSet; public: /* SuggestionIterator is conceptually a bidirectional iterator. * But std *LegayBidirectionalIterator* is also a *LegacyForwardIterator* and * it would impose us that : * > Given a and b, dereferenceable iterators of type It: * > If a and b compare equal (a == b is contextually convertible to true) * > then either they are both non-dereferenceable or *a and *b are references bound to the same object. * and * > the LegacyForwardIterator requirements requires dereference to return a reference. * Which cannot be as we create the entry on demand. * * So we are stick with declaring ourselves at `input_iterator`. */ using iterator_category = std::input_iterator_tag; using value_type = SuggestionItem; using pointer = SuggestionItem*; using reference = SuggestionItem&; SuggestionIterator() = delete; SuggestionIterator(const SuggestionIterator& it); SuggestionIterator& operator=(const SuggestionIterator& it); SuggestionIterator(SuggestionIterator&& it); SuggestionIterator& operator=(SuggestionIterator&& it); ~SuggestionIterator(); bool operator== (const SuggestionIterator& it) const; bool operator!= (const SuggestionIterator& it) const; SuggestionIterator& operator++(); SuggestionIterator operator++(int); SuggestionIterator& operator--(); SuggestionIterator operator--(int); Entry getEntry() const; const SuggestionItem& operator*(); const SuggestionItem* operator->(); private: // data struct SuggestionInternalData; std::unique_ptr mp_rangeIterator; std::unique_ptr m_suggestionItem; private: // methods SuggestionIterator(RangeIterator rangeIterator); // Xapian based methods and data #if defined(LIBZIM_WITH_XAPIAN) #ifdef ZIM_PRIVATE public: std::string getDbData() const; #endif private: // xapian based data std::unique_ptr mp_internal; private: // xapian based methods std::string getIndexPath() const; std::string getIndexTitle() const; std::string 
getIndexSnippet() const; SuggestionIterator(SuggestionInternalData* internal_data); #endif // LIBZIM_WITH_XAPIAN }; class LIBZIM_API SuggestionItem { public: // methods SuggestionItem(std::string title, std::string path, std::string snippet = "") : title(title), path(path), snippet(snippet) {} std::string getTitle() const { return title; } std::string getPath() const { return path; } std::string getSnippet() const { return snippet; } bool hasSnippet() const { return !snippet.empty(); } private: // data std::string title; std::string path; std::string snippet; }; } // namespace zim #endif // ZIM_SUGGESTION_ITERATOR_H libzim-9.2.3/include/zim/tools.h000066400000000000000000000045771466367137100166010ustar00rootroot00000000000000/* * Copyright (C) 2022 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_TOOLS_H #define ZIM_TOOLS_H #include "zim.h" #include namespace zim { #if defined(LIBZIM_WITH_XAPIAN) /** Helper function to set the icu data directory. * * On Android, we compile ICU without data integrated * in the library. So android application needs to set * the data directory where ICU can find its data. 
*/ LIBZIM_API void setICUDataDirectory(const std::string& path); #endif /** * @brief Stringstream Class to use itself as the stream object * returned by << operator. (std::stringstream returns an std::ostream). * Allows a one-line stringstream to str conversion, e.g. use_str(Formatter() * << "foo" << variable); * */ class Formatter { public: Formatter() {} ~Formatter() {} template Formatter &operator<<(const Type &value) { stream_ << value; return *this; } /* Operator for function templates like std::endl */ Formatter &operator<<(std::ostream& (* __pf)(std::ostream&)) { stream_ << __pf; return *this; } /* Operator for working with other ostream like std::cerr */ friend std::ostream &operator<<(std::ostream &os, const Formatter &obj) { os << obj.stream_.str(); return os; } operator std::string() const { return stream_.str(); } private: /* Disable copy and assignment constructors */ Formatter(const Formatter &) = delete; Formatter &operator=(Formatter &) = delete; /* Simple composition with std::stringstream */ std::stringstream stream_; }; } #endif // ZIM_TOOLS_H libzim-9.2.3/include/zim/uuid.h000066400000000000000000000032271466367137100163760ustar00rootroot00000000000000/* * Copyright (C) 2021 Mannesh P M * Copyright (C) 2018 Matthieu Gautier * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_UUID_H #define ZIM_UUID_H #include "zim.h" #include #include #include #include namespace zim { struct LIBZIM_API Uuid { Uuid() { std::memset(data, 0, 16); } Uuid(const char uuid[16]) { std::copy(uuid, uuid+16, data); } static Uuid generate(std::string value = ""); bool operator== (const Uuid& other) const { return std::equal(data, data+16, other.data); } bool operator!= (const Uuid& other) const { return !(*this == other); } unsigned size() const { return 16; } explicit operator std::string() const; char data[16]; }; LIBZIM_API std::ostream& operator<< (std::ostream& out, const Uuid& uuid); } #endif // ZIM_UUID_H libzim-9.2.3/include/zim/version.h000066400000000000000000000021711466367137100171120ustar00rootroot00000000000000/* * Copyright (C) 2021 Emmanuel Engelhart * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_VERSION_H #define ZIM_VERSION_H #include "zim.h" #include #include namespace zim { typedef std::vector> LibVersions; LIBZIM_API LibVersions getVersions(); LIBZIM_API void printVersions(std::ostream& out = std::cout); } #endif // ZIM_VERSION_H libzim-9.2.3/include/zim/writer/000077500000000000000000000000001466367137100165675ustar00rootroot00000000000000libzim-9.2.3/include/zim/writer/contentProvider.h000066400000000000000000000115111466367137100221240ustar00rootroot00000000000000/* * Copyright (C) 2020 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_WRITER_CONTENTPROVIDER_H #define ZIM_WRITER_CONTENTPROVIDER_H #include #include #include #include namespace zim { #ifdef _WIN32 #define DEFAULTFD zim::windows::FD namespace windows { #else #define DEFAULTFD zim::unix::FD namespace unix { #endif class FD; } namespace writer { /** * `ContentProvider` is an abstract class in charge of providing the content to * add in the archive to the creator. 
*/ class LIBZIM_API ContentProvider { public: virtual ~ContentProvider() = default; /** * The size of the content to add into the archive. * * @return the total size of the content. */ virtual zim::size_type getSize() const = 0; /** * Return a blob to add to the archive. * * The returned blob doesn't have to represent the whole content. * The feed method can return the whole content chunk by chunk or in * one step. * When the whole content has been returned, feed must return an empty blob * (size == 0). * * This method will be called several times (at least twice) for * each content to add. * * It is up to the implementation to manage correctly the data pointed by * the returned blob. * It may (re)use the same buffer between calls (rewriting its content), * create a new buffer each time or make the blob point to a new region of * a big buffer. * It is up to the implementation to free any allocated memory. * * The data pointed by the blob must stay valid until the next call to feed. * A call to feed ensure that the data returned by a previous call will not * be used anymore. */ virtual Blob feed() = 0; }; /** * StringProvider provide the content stored in a string. */ class LIBZIM_API StringProvider : public ContentProvider { public: /** * Create a provider using a string as content. * The string content is copied and the reference don't have to be "keep" alive. * * @param content the content to serve. */ explicit StringProvider(const std::string& content) : content(content), feeded(false) {} zim::size_type getSize() const { return content.size(); } Blob feed(); protected: std::string content; bool feeded; }; /** * SharedStringProvider provide the content stored in a shared string. * * It is mostly the same thing that `StringProvider` but use a shared_ptr * to avoid copy. */ class LIBZIM_API SharedStringProvider : public ContentProvider { public: /** * Create a provider using a string as content. * The string content is not copied. 
* * @param content the content to serve. */ explicit SharedStringProvider(std::shared_ptr content) : content(content), feeded(false) {} zim::size_type getSize() const { return content->size(); } Blob feed(); protected: std::shared_ptr content; bool feeded; }; /** * FileProvider provide the content stored in file. */ class LIBZIM_API FileProvider : public ContentProvider { public: /** * Create a provider using file as content. * * @param filepath the path to the file to serve. */ explicit FileProvider(const std::string& filepath); ~FileProvider(); zim::size_type getSize() const { return size; } Blob feed(); protected: std::string filepath; zim::size_type size; private: std::unique_ptr buffer; std::unique_ptr fd; zim::offset_type offset; }; } } #undef DEFAULTFD #endif // ZIM_WRITER_CONTENTPROVIDER_H libzim-9.2.3/include/zim/writer/creator.h000066400000000000000000000237031466367137100204040ustar00rootroot00000000000000/* * Copyright (C) 2017-2021 Matthieu Gautier * Copyright (C) 2020 Veloman Yunkan * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_WRITER_CREATOR_H #define ZIM_WRITER_CREATOR_H #include #include #include namespace zim { class Fileheader; namespace writer { class CreatorData; /** * The `Creator` is responsible to create a zim file. * * Once the `Creator` is instantiated, it can be configured with the * `config*` methods. * Then the creation process must be started with `startZimCreation`. * Elements of the zim file can be added using the `add*` methods. * The final steps is to call `finishZimCreation`. * * During the creation of the zim file (and before the call to `finishZimCreation`), * some values must be set using the `set*` methods. * * All `add*` methods and `finishZimCreation` can throw a exception. * (most of the time zim::CreatorError child but not limited to) * It is up to the user to catch this exception and handle the error. * The current (documented) conditions when a exception is thrown are: * - When a entry cannot be added (mainly because a entry with the same path has already been added) * A `zim::InvalidEntry` will be thrown. The creator will still be in a valid state and the creation can continue. * - An exception has been thrown in a worker thread. * This exception will be catch and rethrown through a `zim::AsyncError`. * The creator will be set in a invalid state and creation cannot continue. * - The creator is in error state. * A `zim::CreatorStateError` will be thrown. * - Any exception thrown by user implementation itself. * Note that this exception may be thrown in a worker thread and so being "catch" by a AsyncError. * - Any other exception thrown for unknown reason. * By default, creator status is not changed by thrown exception and creation should stop. */ class LIBZIM_API Creator { public: /** * Creator constructor. 
* * @param verbose If the creator print verbose information. * @param comptype The compression algorithm to use. */ Creator(); virtual ~Creator(); /** * Configure the verbosity of the creator * * @param verbose if the creator print verbose information. * @return a reference to itself. */ Creator& configVerbose(bool verbose); /** * Configure the compression algorithm to use. * * @param comptype the compression algorithm to use. * @return a reference to itself. */ Creator& configCompression(Compression compression); /** * Set the size of the created clusters. * * The creator will try to create cluster with (uncompressed) size * as close as possible to targetSize without exceeding that limit. * If not possible, the only such case being an item larger than targetSize, * a separated cluster will be allocated for that oversized item. * * Be carefull with this value. * Bigger value means more content put together, so a better compression ratio. * But it means also that more decompression has to be made when reading a blob. * If you don't know which value to put, don't use this method and let libzim * use the default value. * * @param targetSize The target size of a cluster (in byte). * @return a reference to itself. */ Creator& configClusterSize(zim::size_type targetSize); /** * Configure the fulltext indexing feature. * * @param indexing True if we must fulltext index the content. * @param language Language to use for the indexation. * @return a reference to itself. */ Creator& configIndexing(bool indexing, const std::string& language); /** * Set the number of thread to use for the internal worker. * * @param nbWorkers The number of workers to use. * @return a reference to itself. */ Creator& configNbWorkers(unsigned nbWorkers); /** * Start the zim creation. * * The creator must have been configured before calling this method. * * @param filepath the path of the zim file to create. */ void startZimCreation(const std::string& filepath); /** * Add a item to the archive. 
* * @param item The item to add. */ void addItem(std::shared_ptr item); /** * Add a metadata to the archive. * * @param name the name of the metadata * @param content the content of the metadata * @param mimetype the mimetype of the metadata. * Only used to detect if the metadata must be compressed or not. */ void addMetadata(const std::string& name, const std::string& content, const std::string& mimetype = "text/plain;charset=utf-8"); /** * Add a metadata to the archive using a contentProvider instead of plain string. * * @param name the name of the metadata. * @param provider the provider of the content of the metadata. * @param mimetype the mimetype of the metadata. * Only used to detect if the metadata must be compressed. */ void addMetadata(const std::string& name, std::unique_ptr provider, const std::string& mimetype = "text/plain;charset=utf-8"); /** * Add illustration to the archive. * * @param size the size (width and height) of the illustration. * @param content the content of the illustration (must be a png content) */ void addIllustration(unsigned int size, const std::string& content); /** * Add illustration to the archive. * * @param size the size (width and height) of the illustration. * @param provider the provider of the content of the illustration (must be a png content) */ void addIllustration(unsigned int size, std::unique_ptr provider); /** * Add a redirection to the archive. * * Hints (especially FRONT_ARTICLE) can be used to put the redirection * in the front articles list. * By default, redirections are not front article. * * @param path the path of the redirection. * @param title the title of the redirection. * @param targetpath the path of the target of the redirection. * @param hints hints associated to the redirection. */ void addRedirection( const std::string& path, const std::string& title, const std::string& targetpath, const Hints& hints = Hints()); /** * Add a alias of a existing entry. 
* * The existing entry pointed by `targetPath` is cloned and updated with * `path` and `title`. * * The alias entry will shared the same type (redirection or item) * and namespace than `targetPath`. * * If the `targetPath` is a item, the new entry will be item pointing * to the same data than `targetPath` item. (Not a redirection to `targetPath`). * However, the alias entry is not counted in the media type counter * and it is not fulltext indexed (only title indexed). * * Hints can be given to influence creator handling (front article, ...) * as it is done for redirection. * * @param path the path of the alias * @param title the title of the alias * @param targetPath the path of the aliased entry. * @param hints hints associated to the alias. */ void addAlias( const std::string& path, const std::string& title, const std::string& targetPath, const Hints& hints = Hints() ); /** * Finalize the zim creation. */ void finishZimCreation(); /** * Set the path of the main page. * * @param mainPath The path of the main page. */ void setMainPath(const std::string& mainPath) { m_mainPath = mainPath; } /** * Set the uuid of the the archive. * * @param uuid The uuid of the archive. 
*/ void setUuid(const zim::Uuid& uuid) { m_uuid = uuid; } private: std::unique_ptr data; // configuration bool m_verbose = false; Compression m_compression = Compression::Zstd; bool m_withIndex = false; size_t m_clusterSize; std::string m_indexingLanguage; unsigned m_nbWorkers = 4; // zim data std::string m_mainPath; Uuid m_uuid = Uuid::generate(); void fillHeader(Fileheader* header) const; void writeLastParts() const; void checkError(); }; } } #endif // ZIM_WRITER_CREATOR_H libzim-9.2.3/include/zim/writer/item.h000066400000000000000000000260511466367137100177020ustar00rootroot00000000000000/* * Copyright (C) 2020-2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_WRITER_ITEM_H #define ZIM_WRITER_ITEM_H #include #include #include #include #include #include namespace zim { namespace writer { enum HintKeys { COMPRESS, FRONT_ARTICLE, }; using Hints = std::map; class ContentProvider; /** * IndexData represent data of an Item to be indexed in the archive. * * This is a abstract class the user need to implement. * (But default `Item::getIndexData` returns a default implementation * for IndexData which works for html content.) 
*/ class LIBZIM_API IndexData { public: using GeoPosition = std::tuple; virtual ~IndexData() = default; /** * If the IndexData actually has data to index. * * It can be used to create `IndexData` for all your content * but discard some indexation based on some criteria. * * @return true if the item associated to this IndexData must be indexed. */ virtual bool hasIndexData() const = 0; /** * The title to use when indexing the item. * * May be different than `Item::getTitle()`, even if most of the time * it will be the same. * * @return the title to use. */ virtual std::string getTitle() const = 0; /** * The content to use when indexing the item. * * This is probably the most important method of `IndexData`. * Most item's contents are not applicable for a direct indexation. * We don't want to index html tags or menu/footer of an article. * This method allow you to return a currated plain text to indexe. * * @return the content to use. */ virtual std::string getContent() const = 0; /** * The keywords to use when indexing the item. * * Return a set of keywords, separated by space for the content. * Keywords are indexed using a higher score than text in `getContent` * * @return a string containing keywords separated by space. */ virtual std::string getKeywords() const = 0; /** * The number of words in the content. * * This value is not directly used to index the content but it * is stored in the xapian database, which may be used later to query * articles. * * @return the number of words in the item. */ virtual uint32_t getWordCount() const = 0; /** * The Geographical position of the subject covered by the item. * (When applicable) * * @return a 3 tuple (true, latitude, longitude) if the item is * about a geo positioned thing. * a 3 tuple (false, _, _) if having a GeoPosition is not * relevant. */ virtual GeoPosition getGeoPosition() const = 0; }; /** * Item represent data to be added to the archive. * * This is a abstract class the user need to implement. 
* libzim provides `BasicItem`, `StringItem` and `FileItem` * to simplify (or avoid) this reimplementation. */ class LIBZIM_API Item { public: /** * The path of the item. * * The path must be absolute. * Path must be unique. * * @return the path of the item. */ virtual std::string getPath() const = 0; /** * The title of the item. * * Item's title is indexed and is used for the suggestion system. * Title don't have to be unique. * * @return the title of the item. */ virtual std::string getTitle() const = 0; /** * The mimetype of the item. * * Mimetype is store within the content. * It is also used to detect if the content must be compressed or not. * * @return the mimetype of the item. */ virtual std::string getMimeType() const = 0; /** * The content provider of the item. * * The content provider is responsible to provide the content to the creator. * The returned content provider must stay valid even after creator release * its reference to the item. * * This method will be called once by libzim, in the main thread * (but will be used in a different thread). * The default IndexData will also call this method once (more) * in the main thread (and use it in another thread). * * @return the contentProvider of the item. */ virtual std::unique_ptr getContentProvider() const = 0; /** * The index data of the item. * * The index data is the data to index. (May be different from the content * to store). * The returned index data must stay valid even after creator release * its reference to the item. * This method will be called once by libzim if it is compiled with xapian * (and is configured to index data). * * The returned IndexData will be used as source to index the item. * If you don't want the item to be indexed, you can return a nullptr here * or return a valid IndexData pointer which will return false to `hasIndexData`. * * If you don't implement this method, a default implementation will be used. 
* The default implementation first checks for the mimetype and if the mimetype * contains `text/html` it will use a contentProvider to get the content to index. * The contentProvider will be created in the main thread but the data reading and * parsing will occur in a different thread. * * All methods of `IndexData` will be called in a different (same) thread. * * @return the indexData of the item. * May return a nullptr if there is no indexData. */ virtual std::shared_ptr getIndexData() const; /** * Hints to help the creator takes decision about the item. * * For now two hints are supported: * - COMPRESS: Can be used to force the creator to put the item content * in a compressed cluster (if true) or not (if false). * If the hint is not provided, the decision is taken based on the * mimetype (textual or binary content ?) * - FRONT_ARTICLE: Can (Should) be used to specify if the item is * a front article or not. * If the hint is not provided, the decision is taken based on the * mimetype (html or not ?) * * @return A list of hints. */ virtual Hints getHints() const; /** * Returns the getHints() amended with default values based on mimetypes. */ Hints getAmendedHints() const; virtual ~Item() = default; }; /** * A BasicItem is a partial implementation of a Item. * * `BasicItem` provides a basic implementation for everything about an `Item` * but the actual content of the item. */ class LIBZIM_API BasicItem : public Item { public: /** * Create a BasicItem with the given path, mimetype and title. * * @param path the path of the item. * @param mimetype the mimetype of the item. * @param title the title of the item. 
*/ BasicItem(const std::string& path, const std::string& mimetype, const std::string& title, Hints hints) : path(path), mimetype(mimetype), title(title), hints(hints) {} std::string getPath() const { return path; } std::string getTitle() const { return title; } std::string getMimeType() const { return mimetype; } Hints getHints() const { return hints; } protected: std::string path; std::string mimetype; std::string title; Hints hints; }; /** * A `StringItem` is a full implemented item where the content is stored in a string. */ class LIBZIM_API StringItem : public BasicItem, public std::enable_shared_from_this { public: /** * Create a StringItem with the given path, mimetype, title and content. * * The parameters are the ones of the private constructor. * * @param path the path of the item. * @param mimetype the mimetype of the item. * @param title the title of the item. * @param content the content of the item. */ template static std::shared_ptr create(Ts&&... params) { return std::shared_ptr(new StringItem(std::forward(params)...)); } std::unique_ptr getContentProvider() const; protected: std::string content; private: StringItem(const std::string& path, const std::string& mimetype, const std::string& title, Hints hints, const std::string& content) : BasicItem(path, mimetype, title, hints), content(content) {} }; /** * A `FileItem` is a full implemented item where the content is file. */ class LIBZIM_API FileItem : public BasicItem { public: /** * Create a FileItem with the given path, mimetype, title and filenpath. * * @param path the path of the item. * @param mimetype the mimetype of the item. * @param title the title of the item. * @param filepath the path of the file in the filesystem. 
*/ FileItem(const std::string& path, const std::string& mimetype, const std::string& title, Hints hints, const std::string& filepath) : BasicItem(path, mimetype, title, hints), filepath(filepath) {} std::unique_ptr getContentProvider() const; protected: std::string filepath; }; } } #endif // ZIM_WRITER_ITEM_H libzim-9.2.3/include/zim/zim.h000066400000000000000000000067351466367137100162360ustar00rootroot00000000000000/* * Copyright (C) 2020-2021 Veloman Yunkan * Copyright (C) 2018-2020 Matthieu Gautier * Copyright (C) 2006 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_ZIM_H #define ZIM_ZIM_H #include #ifdef __GNUC__ #define DEPRECATED __attribute__((deprecated)) #elif defined(_MSC_VER) #define DEPRECATED __declspec(deprecated) #else #praga message("WARNING: You need to implement DEPRECATED for this compiler") #define DEPRECATED #endif #include #if defined(_MSC_VER) && defined(LIBZIM_EXPORT_DLL) #define LIBZIM_API __declspec(dllexport) #else #define LIBZIM_API #endif namespace zim { // An index of an entry (in a zim file) typedef uint32_t entry_index_type; // An index of an cluster (in a zim file) typedef uint32_t cluster_index_type; // An index of a blog (in a cluster) typedef uint32_t blob_index_type; // The size of something (entry, zim, cluster, blob, ...) typedef uint64_t size_type; // An offset. typedef uint64_t offset_type; struct FdInput { // An open file descriptor int fd; // The (absolute) offset of the data "pointed" by FdInput in fd. offset_type offset; // The size (length) of the data "pointed" by FdInput size_type size; FdInput(int fd, offset_type offset, size_type size): fd(fd), offset(offset), size(size) {} }; enum class Compression { None = 1, // intermediate values correspond to compression // methods that are no longer supported Zstd = 5 }; static const char MimeHtmlTemplate[] = "text/x-zim-htmltemplate"; /** * Various types of integrity checks performed by `zim::validate()`. */ enum class IntegrityCheck { /** * Validates the checksum of the ZIM file. */ CHECKSUM, /** * Checks that offsets in PathPtrList are valid. */ DIRENT_PTRS, /** * Checks that dirents are properly sorted. */ DIRENT_ORDER, /** * Checks that entries in the title index are valid and properly sorted. */ TITLE_INDEX, /** * Checks that offsets in ClusterPtrList are valid. 
*/ CLUSTER_PTRS, /** * Checks that offsets inside each clusters are valid. */ CLUSTERS_OFFSETS, /** * Checks that mime-type values in dirents are valid. */ DIRENT_MIMETYPES, //////////////////////////////////////////////////////////////////////////// // End of integrity check types. // COUNT must be the last one and denotes the count of all checks //////////////////////////////////////////////////////////////////////////// /** * `COUNT` is not a valid integrity check type. It exists to tell the * number of all supported integrity checks. */ COUNT }; } #endif // ZIM_ZIM_H libzim-9.2.3/meson.build000066400000000000000000000071011466367137100151720ustar00rootroot00000000000000project('libzim', ['c', 'cpp'], version : '9.2.3', license : 'GPL2', default_options : ['c_std=c11', 'cpp_std=c++17', 'werror=true']) if build_machine.system() != 'windows' add_project_arguments('-D_LARGEFILE64_SOURCE=1', '-D_FILE_OFFSET_BITS=64', language: 'cpp') endif cpp = meson.get_compiler('cpp') sizeof_off_t = cpp.sizeof('off_t') sizeof_size_t = cpp.sizeof('size_t') private_conf = configuration_data() public_conf = configuration_data() private_conf.set('VERSION', '"@0@"'.format(meson.project_version())) public_conf.set('LIBZIM_VERSION', '"@0@"'.format(meson.project_version())) private_conf.set('DIRENT_CACHE_SIZE', get_option('DIRENT_CACHE_SIZE')) private_conf.set('DIRENT_LOOKUP_CACHE_SIZE', get_option('DIRENT_LOOKUP_CACHE_SIZE')) private_conf.set('CLUSTER_CACHE_SIZE', get_option('CLUSTER_CACHE_SIZE')) private_conf.set('LZMA_MEMORY_SIZE', get_option('LZMA_MEMORY_SIZE')) private_conf.set10('MMAP_SUPPORT_64', sizeof_off_t==8) private_conf.set10('ENV64BIT', sizeof_size_t==8) private_conf.set10('ENV32BIT', sizeof_size_t==4) if host_machine.system() == 'windows' private_conf.set('ENABLE_USE_MMAP', false) add_project_arguments('-DNOMINMAX', language: 'cpp') else private_conf.set('ENABLE_USE_MMAP', get_option('USE_MMAP')) endif private_conf.set('ENABLE_USE_BUFFER_HEADER', 
get_option('USE_BUFFER_HEADER')) private_conf.set('ENABLE_XAPIAN_FULLER', get_option('with_xapian_fuller')) static_linkage = get_option('static-linkage') static_linkage = static_linkage or get_option('default_library')=='static' lzma_dep = dependency('liblzma', static:static_linkage) if static_linkage add_project_arguments('-DLZMA_API_STATIC', language: 'cpp') endif if get_option('default_library') == 'shared' public_conf.set('LIBZIM_EXPORT_DLL', true) add_project_arguments('-DLIBZIM_EXPORT_PRIVATE_DLL', language: 'cpp') endif zstd_dep = dependency('libzstd', static:static_linkage, default_options:['werror=false']) if host_machine.system() == 'freebsd' execinfo_dep = cpp.find_library('execinfo') endif if get_option('with_xapian') xapian_dep = dependency('xapian-core', static:static_linkage) else xapian_dep = dependency('', required:false) endif private_conf.set('ENABLE_XAPIAN', xapian_dep.found()) public_conf.set('LIBZIM_WITH_XAPIAN', xapian_dep.found()) if build_machine.system() == 'windows' win_deps = declare_dependency( compile_args: ['-DSORTPP_PASS'], link_args: ['-lRpcrt4', '-lWs2_32', '-lwinmm', '-lshlwapi'] ) else win_deps = declare_dependency() endif compiler = meson.get_compiler('cpp') if (compiler.get_id() == 'gcc' and build_machine.system() == 'linux') or host_machine.system() == 'freebsd' # C++ std::thread is implemented using pthread on linux by gcc thread_dep = dependency('threads') else thread_dep = dependency('', required:false) endif if xapian_dep.found() icu_dep = dependency('icu-i18n', static:static_linkage) else icu_dep = dependency('icu-i18n', required:false, static:static_linkage) endif gtest_dep = dependency('gtest', version: '>=1.10.0', main:true, fallback:['gtest', 'gtest_main_dep'], required:false) inc = include_directories('include') subdir('include') subdir('scripts') subdir('static') subdir('src') if get_option('examples') subdir('examples') endif if get_option('tests') subdir('test') endif if get_option('doc') subdir('docs') endif 
pkg_mod = import('pkgconfig') pkg_mod.generate(libraries : libzim, version : meson.project_version(), name : 'libzim', filebase : 'libzim', description : 'A Library to read/write ZIM files.') libzim-9.2.3/meson_options.txt000066400000000000000000000040031466367137100164630ustar00rootroot00000000000000option('CLUSTER_CACHE_SIZE', type : 'string', value : '16', description : 'set cluster cache size to number (default:16)') option('DIRENT_CACHE_SIZE', type : 'string', value : '512', description : 'set dirent cache size to number (default:512)') option('DIRENT_LOOKUP_CACHE_SIZE', type : 'string', value : '1024', description : 'set dirent lookup cache size to number (default:1024)') option('LZMA_MEMORY_SIZE', type : 'string', value : '128', description : 'set lzma uncompress memory in MB (default:128)') option('USE_MMAP', type: 'boolean', value: true, description: 'Use mmap to avoid copy from file. (default:true, always false on windows)') option('USE_BUFFER_HEADER', type: 'boolean', value: true, description: '''Copy (or use mmap) header index buffers. (default:true) Header index are used to access articles, having them in memory can improve access speed but on low memory devices it may use to many memory. If false, we directly read the index in the file at each article access.''') option('static-linkage', type : 'boolean', value : false, description : 'Link statically with the dependencies.') option('doc', type : 'boolean', value : false, description : 'Build the documentations.') option('examples', type : 'boolean', value : true, description : 'Build the examples.') option('tests', type : 'boolean', value : true, description : 'Build the tests.') option('with_xapian', type : 'boolean', value: true, description: 'Build libzim with xapian support') option('with_xapian_fuller', type: 'boolean', value: true, description: 'Create xapian archive using "FULLER" compaction.\nThis is a workaround for a compilation issue on Windows. 
This will be removed soon') option('test_data_dir', type : 'string', value: '', description: 'Where the test data are. If not set, meson will use a internal directory in the build dir. If you want to download the data in the specified directory you can use `meson download_test_data`. As a special value, you can pass `none` to deactivate test using external test data.') libzim-9.2.3/scripts/000077500000000000000000000000001466367137100145205ustar00rootroot00000000000000libzim-9.2.3/scripts/download_test_data.py000077500000000000000000000044421466367137100207400ustar00rootroot00000000000000#!/usr/bin/env python3 ''' Copyright 2021 Matthieu Gautier This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
''' import argparse from pathlib import Path from urllib import request from urllib.error import * import tarfile import sys TEST_DATA_VERSION = "0.6.0" ARCHIVE_URL_TEMPL = "https://github.com/openzim/zim-testing-suite/releases/download/{version}/zim-testing-suite-{version}.tar.gz" if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--version', '-v', help="The version to download.", default=TEST_DATA_VERSION) parser.add_argument('--remove-top-dir', help="Remove the top directory when extracting", action='store_true') parser.add_argument('outdir', help='The directory where to install the test data.') args = parser.parse_args() test_data_url = ARCHIVE_URL_TEMPL.format(version=args.version) try: with request.urlopen(test_data_url) as f: with tarfile.open(fileobj=f, mode="r|*") as archive: while True: member = archive.next() if member is None: break if args.remove_top_dir: member.name = '/'.join(member.name.split('/')[1:]) archive.extract(member, path=args.outdir) except HTTPError as e: print("Error downloading archive at url : {}".format(test_data_url)) print(e) sys.exit(1) except OSError as e: print("Error writing the test data on the file system.") print(e) sys.exit(1) libzim-9.2.3/scripts/libzim-compile-resources000077500000000000000000000142431466367137100213760ustar00rootroot00000000000000#!/usr/bin/env python3 ''' Copyright 2016 Matthieu Gautier This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. ''' import argparse import os.path import re def full_identifier(filename): parts = os.path.normpath(filename).split(os.sep) parts = [to_identifier(part) for part in parts] print(filename, parts) return parts def to_identifier(name): ident = re.sub(r'[^0-9a-zA-Z]', '_', name) if ident[0].isnumeric(): return "_"+ident return ident resource_impl_template = """ static const unsigned char {data_identifier}[] = {{ {resource_content} }}; namespace RESOURCE {{ {namespaces_open} const std::string {identifier} = init_resource("{env_identifier}", {data_identifier}, {resource_len}); {namespaces_close} }} """ resource_getter_template = """ if (name == "{common_name}") return RESOURCE::{identifier}; """ resource_decl_template = """{namespaces_open} extern const std::string {identifier}; {namespaces_close}""" class Resource: def __init__(self, base_dirs, filename): filename = filename.strip() self.filename = filename self.identifier = full_identifier(filename) found = False for base_dir in base_dirs: try: with open(os.path.join(base_dir, filename), 'rb') as f: self.data = f.read() found = True break except FileNotFoundError: continue if not found: raise Exception("Impossible to found {}".format(filename)) def dump_impl(self): nb_row = len(self.data)//16 + (1 if len(self.data) % 16 else 0) sliced = (self.data[i*16:(i+1)*16] for i in range(nb_row)) return resource_impl_template.format( data_identifier="_".join([""]+self.identifier), resource_content=",\n ".join(", ".join("{:#04x}".format(i) for i in r) for r in sliced), resource_len=len(self.data), namespaces_open=" ".join("namespace {} {{".format(id) for id in self.identifier[:-1]), namespaces_close=" ".join(["}"]*(len(self.identifier)-1)), identifier=self.identifier[-1], env_identifier="RES_"+"_".join(self.identifier)+"_PATH" ) 
def dump_getter(self): return resource_getter_template.format( common_name=self.filename, identifier="::".join(self.identifier) ) def dump_decl(self): return resource_decl_template.format( namespaces_open=" ".join("namespace {} {{".format(id) for id in self.identifier[:-1]), namespaces_close=" ".join(["}"]*(len(self.identifier)-1)), identifier=self.identifier[-1] ) master_c_template = """//This file is automaically generated. Do not modify it. #include #include #include "{include_file}" static std::string init_resource(const char* name, const unsigned char* content, int len) {{ char * resPath = getenv(name); if (NULL == resPath) return std::string(reinterpret_cast(content), len); std::ifstream ifs(resPath); if (!ifs.good()) return std::string(reinterpret_cast(content), len); return std::string( (std::istreambuf_iterator(ifs)), (std::istreambuf_iterator() )); }} const std::string& getResource_{basename}(const std::string& name) {{ {RESOURCES_GETTER} throw ResourceNotFound("Resource not found."); }} {RESOURCES} """ def gen_c_file(resources, basename): return master_c_template.format( RESOURCES="\n\n".join(r.dump_impl() for r in resources), RESOURCES_GETTER="\n\n".join(r.dump_getter() for r in resources), include_file=basename, basename=to_identifier(basename) ) master_h_template = """//This file is automaically generated. Do not modify it. 
#ifndef KIWIX_{BASENAME} #define KIWIX_{BASENAME} #include #include namespace RESOURCE {{ {RESOURCES} }}; class ResourceNotFound : public std::runtime_error {{ public: ResourceNotFound(const std::string& what_arg): std::runtime_error(what_arg) {{ }}; }}; const std::string& getResource_{basename}(const std::string& name); #define getResource(a) (getResource_{basename}(a)) #endif // KIWIX_{BASENAME} """ def gen_h_file(resources, basename): return master_h_template.format( RESOURCES="\n ".join(r.dump_decl() for r in resources), BASENAME=basename.upper(), basename=basename, ) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--cxxfile', help='The Cpp file name to generate') parser.add_argument('--hfile', help='The h file name to generate') parser.add_argument('--source_dir', help="Additional directory where to look for resources.", action='append') parser.add_argument('resource_file', help='The list of resources to compile.') args = parser.parse_args() base_dir = os.path.dirname(os.path.realpath(args.resource_file)) source_dir = args.source_dir or [] with open(args.resource_file, 'r') as f: resources = [Resource([base_dir]+source_dir, filename) for filename in f.readlines()] h_identifier = to_identifier(os.path.basename(args.hfile)) with open(args.hfile, 'w') as f: f.write(gen_h_file(resources, h_identifier)) with open(args.cxxfile, 'w') as f: f.write(gen_c_file(resources, os.path.basename(args.hfile))) libzim-9.2.3/scripts/meson.build000066400000000000000000000001661466367137100166650ustar00rootroot00000000000000 res_compiler = find_program('libzim-compile-resources') test_data_downloader = find_program('download_test_data.py') libzim-9.2.3/src/000077500000000000000000000000001466367137100136205ustar00rootroot00000000000000libzim-9.2.3/src/_dirent.h000066400000000000000000000077331466367137100154270ustar00rootroot00000000000000/* * Copyright (C) 2018-2021 Matthieu Gautier * Copyright (C) 2020 Veloman Yankan * Copyright (C) 2006 Tommi 
Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_DIRENT_H #define ZIM_DIRENT_H #include #include #include #include "zim_types.h" #include "config.h" namespace zim { class Buffer; class InvalidSize : public std::exception {}; class LIBZIM_PRIVATE_API Dirent { protected: uint16_t mimeType; uint32_t version; cluster_index_t clusterNumber; // only used when redirect is false blob_index_t blobNumber; // only used when redirect is false entry_index_t redirectIndex; // only used when redirect is true char ns; std::string title; std::string path; std::string parameter; public: // these constants are put into mimeType field static const uint16_t redirectMimeType = 0xffff; static const uint16_t linktargetMimeType = 0xfffe; static const uint16_t deletedMimeType = 0xfffd; Dirent() : mimeType(0), version(0), clusterNumber(0), blobNumber(0), redirectIndex(0), ns('\0') {} bool isRedirect() const { return mimeType == redirectMimeType; } bool isLinktarget() const { return mimeType == linktargetMimeType; } bool isDeleted() const { return mimeType == deletedMimeType; } bool isArticle() const { return !isRedirect() && !isLinktarget() && !isDeleted(); } uint16_t getMimeType() const { return mimeType; } uint32_t getVersion() const { return version; } void setVersion(uint32_t v) { 
version = v; } cluster_index_t getClusterNumber() const { return isRedirect() ? cluster_index_t(0) : clusterNumber; } blob_index_t getBlobNumber() const { return isRedirect() ? blob_index_t(0) : blobNumber; } entry_index_t getRedirectIndex() const { return isRedirect() ? redirectIndex : entry_index_t(0); } char getNamespace() const { return ns; } const std::string &getTitle() const { return title.empty() ? path : title; } const std::string &getPath() const { return path; } std::string getLongPath() const; const std::string& getParameter() const { return parameter; } size_t getDirentSize() const { size_t ret = (isRedirect() ? 12 : 16) + path.size() + parameter.size() + 2; if (title != path) ret += title.size(); return ret; } void setTitle(const std::string& title_) { title = title_; } void setPath(char ns_, const std::string &path_) { ns = ns_; path = path_; } void setParameter(const std::string& parameter_) { parameter = parameter_; } void setRedirect(entry_index_t idx) { redirectIndex = idx; mimeType = redirectMimeType; } void setItem(uint16_t mimeType_, cluster_index_t clusterNumber_, blob_index_t blobNumber_) { mimeType = mimeType_; clusterNumber = clusterNumber_; blobNumber = blobNumber_; } }; } #endif // ZIM_DIRENT_H libzim-9.2.3/src/archive.cpp000066400000000000000000000421431466367137100157510ustar00rootroot00000000000000/* * Copyright (C) 2021 Maneesh P M * Copyright (C) 2020-2021 Veloman Yunkan * Copyright (C) 2020-2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. 
See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #define ZIM_PRIVATE #include #include #include #include #include #include "fileimpl.h" #include "tools.h" #include "log.h" log_define("zim.archive") namespace zim { Archive::Archive(const std::string& fname) : m_impl(new FileImpl(fname)) { } #ifndef _WIN32 Archive::Archive(int fd) : m_impl(new FileImpl(fd)) { } Archive::Archive(FdInput fd) : m_impl(new FileImpl(fd)) { } Archive::Archive(int fd, offset_type offset, size_type size) : Archive(FdInput(fd, offset, size)) {} Archive::Archive(const std::vector& fds) : m_impl(new FileImpl(fds)) { } #endif const std::string& Archive::getFilename() const { return m_impl->getFilename(); } size_type Archive::getFilesize() const { return m_impl->getFilesize().v; } entry_index_type Archive::getAllEntryCount() const { return m_impl->getCountArticles().v; } entry_index_type Archive::getEntryCount() const { return m_impl->getUserEntryCount().v; } entry_index_type Archive::getArticleCount() const { if (m_impl->hasFrontArticlesIndex()) { return m_impl->getFrontEntryCount().v; } else { try { return countMimeType( getMetadata("Counter"), [](const std::string& mimetype) { return mimetype.find("text/html") == 0; } ); } catch(const EntryNotFound& e) { const char articleNs = m_impl->hasNewNamespaceScheme() ? 
'C' : 'A'; return m_impl->getNamespaceEntryCount(articleNs).v; } } } entry_index_type Archive::getMediaCount() const { try { return countMimeType( getMetadata("Counter"), [](const std::string& mimetype) { return mimetype.find("image/") == 0 || mimetype.find("video/") == 0 || mimetype.find("audio/") == 0; } ); } catch(const EntryNotFound& e) { return (m_impl->getNamespaceEntryCount('I').v + m_impl->getNamespaceEntryCount('J').v); } } Uuid Archive::getUuid() const { return m_impl->getFileheader().getUuid(); } Item Archive::getMetadataItem(const std::string& name) const { auto r = m_impl->findx('M', name); if (!r.first) { throw EntryNotFound("Cannot find metadata"); } auto entry = Entry(m_impl, entry_index_type(r.second)); return entry.getItem(true); } std::string Archive::getMetadata(const std::string& name) const { auto item = getMetadataItem(name); return item.getData(); } std::vector Archive::getMetadataKeys() const { std::vector ret; auto start = m_impl->getNamespaceBeginOffset('M'); auto end = m_impl->getNamespaceEndOffset('M'); for (auto idx=start; idx!=end; idx++) { auto dirent = m_impl->getDirent(idx); ret.push_back(dirent->getPath()); } return ret; } zim::FileImpl::FindxResult findFavicon(FileImpl& impl) { for(auto ns:{'-', 'I'}) { for (auto& path:{"favicon", "favicon.png"}) { auto r = impl.findx(ns, path); if (r.first) { return r; } } } throw EntryNotFound("No favicon found."); } Item Archive::getIllustrationItem(unsigned int size) const { auto r = m_impl->findx('M', Formatter() << "Illustration_" << size << "x" << size << "@" << 1); if (r.first) { return getEntryByPath(entry_index_type(r.second)).getItem(); } // We haven't found the exact entry. Let's "search" for a illustration and // use the first one we found. #if 0 // We have decided to not implement fallback in case of wrong resolution for now. // We keep this code for reference. 
r = m_impl->findx('M', "Illustration"); auto entry = getEntryByPath(entry_index_type(r.second)); if (entry.getPath().find("Illustration") == 0) { return entry.getItem(); } #endif // For 48x48 illustration, return favicon for older zims. if (size == 48) { auto r = findFavicon(*m_impl); return getEntryByPath(entry_index_type(r.second)).getItem(true); } throw EntryNotFound("Cannot find illustration item."); } std::set Archive::getIllustrationSizes() const { std::set ret; for(auto r = m_impl->findx('M', "Illustration_").second; /*No exit test*/; r++ ) { try { auto path = getEntryByPath(entry_index_type(r)).getPath(); if (path.find("Illustration_") != 0) { break; } try { ret.insert(parseIllustrationPathToSize(path)); } catch (...) {} } catch (const std::out_of_range& e) { break; } } if (ret.find(48) == ret.end()) { try { // raise a exception if we cannot find the (old format) favicon. findFavicon(*m_impl); ret.insert(48); } catch(EntryNotFound&) {} } return ret; } bool Archive::hasIllustration(unsigned int size) const { try { getIllustrationItem(size); return true; } catch (EntryNotFound& e) { return false; } } Entry Archive::getEntryByPath(entry_index_type idx) const { if (idx >= entry_index_type(m_impl->getCountArticles())) throw std::out_of_range("entry index out of range"); return Entry(m_impl, idx); } Entry Archive::getEntryByPath(const std::string& path) const { if (m_impl->hasNewNamespaceScheme()) { // Get path in user content. auto r = m_impl->findx('C', path); if (r.first) { return Entry(m_impl, entry_index_type(r.second)); } try { // Path may come from a already stored from a old zim archive (bookmark), // and so contains a namespace. // We have to adapt the path to use the C namespace. r = m_impl->findx('C', std::get<1>(parseLongPath(path))); if (r.first) { return Entry(m_impl, entry_index_type(r.second)); } } catch (std::runtime_error&) {} } else { // Path should contains the namespace. 
auto r = m_impl->findx(path); if (r.first) { return Entry(m_impl, entry_index_type(r.second)); } // If not (bookmark) from a recent zim archive. for (auto ns:{'A', 'I', 'J', '-'}) { r = m_impl->findx(ns, path); if (r.first) { return Entry(m_impl, entry_index_type(r.second)); } } } throw EntryNotFound("Cannot find entry"); } Entry Archive::getEntryByPathWithNamespace(char ns, const std::string& path) const { auto r = m_impl->findx(ns, path); if (r.first) { return Entry(m_impl, entry_index_type(r.second)); } throw EntryNotFound("Cannot find entry"); } Entry Archive::getEntryByTitle(entry_index_type idx) const { return Entry(m_impl, entry_index_type(m_impl->getIndexByTitle(title_index_t(idx)))); } Entry Archive::getEntryByTitle(const std::string& title) const { for (auto ns:{'C', 'A', 'I', 'J', '-'}) { log_trace("File::getArticleByTitle('" << ns << "', \"" << title << ')'); auto r = m_impl->findxByTitle(ns, title); if (r.first) return getEntryByTitle(entry_index_type(r.second)); } throw EntryNotFound("Cannot find entry"); } Entry Archive::getEntryByClusterOrder(entry_index_type idx) const { return Entry(m_impl, entry_index_type(m_impl->getIndexByClusterOrder(entry_index_t(idx)))); } Entry Archive::getMainEntry() const { auto r = m_impl->findx('W', "mainPage"); if (r.first) { return getEntryByPath(entry_index_type(r.second)); } auto& header = m_impl->getFileheader(); if (!header.hasMainPage()) { throw EntryNotFound("No main page"); } return getEntryByPath(header.getMainPage()); } bool Archive::hasMainEntry() const { return m_impl->getFileheader().hasMainPage(); } Entry Archive::getRandomEntry() const { if ( !m_impl->hasNewNamespaceScheme() ) { const auto startOfNamespaceA = m_impl->getNamespaceBeginOffset('A'); const auto endOfNamespaceA = m_impl->getNamespaceEndOffset('A'); const auto n = (endOfNamespaceA - startOfNamespaceA).v; if ( n == 0 ) { throw EntryNotFound("Cannot find valid random entry (empty namespace 'A'"); } return getEntryByPath(startOfNamespaceA.v + 
randomNumber(n-1)); } else { auto frontEntryCount = m_impl->getFrontEntryCount().v; if (frontEntryCount == 0) { throw EntryNotFound("Cannot find valid random entry (no front entry at all)"); } return getEntryByTitle(randomNumber(frontEntryCount-1)); } } bool Archive::hasFulltextIndex() const { auto r = m_impl->findx('X', "fulltext/xapian"); if (!r.first) { r = m_impl->findx('Z', "/fulltextIndex/xapian"); } if (!r.first) { return false; } auto entry = Entry(m_impl, entry_index_type(r.second)); auto item = entry.getItem(true); auto accessInfo = item.getDirectAccessInformation(); return accessInfo.second; } bool Archive::hasTitleIndex() const { auto r = m_impl->findx('X', "title/xapian"); if (!r.first) { return false; } auto entry = Entry(m_impl, entry_index_type(r.second)); auto item = entry.getItem(true); auto accessInfo = item.getDirectAccessInformation(); return accessInfo.second; } Archive::EntryRange Archive::iterByPath() const { return EntryRange(m_impl, m_impl->getStartUserEntry().v, m_impl->getEndUserEntry().v); } Archive::EntryRange Archive::iterByTitle() const { if (m_impl->hasFrontArticlesIndex()) { // We have a front articles index. We can "simply" loop over all front entries. return EntryRange( m_impl, 0, m_impl->getFrontEntryCount().v ); } else if (!m_impl->hasNewNamespaceScheme()) { // We are a old zim archive with namespace, we have to iterate on 'A' namespace. return EntryRange( m_impl, m_impl->getNamespaceBeginOffset('A').v, m_impl->getNamespaceEndOffset('A').v ); } else { // We are a zim archive without namespace but without specific articles listing. // We don't the choice here, iterate on all user entries. return EntryRange( m_impl, m_impl->getStartUserEntry().v, m_impl->getEndUserEntry().v ); } } Archive::EntryRange Archive::iterEfficient() const { return EntryRange(m_impl, 0, getEntryCount()); } Archive::EntryRange Archive::findByPath(std::string path) const { // "url order" means that the entries are stored by long url ("NS/url)". 
// // If we really want to search by url whatever is the namespace, we would have to // search in all "content" (A, I, J, -) namespaces and then merge the results. // // It would be pretty complex as we would need to have iterate hover several ranges // in the same time. Let's enforce that path is the full path and search in whatever // namespace is in it. // We have to return two iterator for a range of entry where `path` is a prefix. // - The begin iterator is a iterator to the first entry with `path` as a prefix (or (range) end if none) // - The end iterator is the iterator pass the last entry with `path` as a prefix (or (global) end) // // The findx return a iterator for the exact match or the one just after. // So, for the begin iterator, we can simply use the index returned by findx // For the end iterator we have to do the same but with a prefix "just after" the queried `path` // So the end index will always be just after the prefix range. If there is no prefix range, both // begin and end will be just after where it would be. // // Suposing a list of title : // 0. aaaaaa // 1. aaaaab // 2. aabbaa // 3. aabbbb // 4. bbaaaa // 5. bbbb // 6. bbbbaa // 7. bbbbbb // 8. 
// If we search for prefix aabb, we must return 2/4 // A findx with aabb will return 2 // A findx with aabc will return 4 // // If we search for prefix bbbb, we must return 5/8 // A findx with bbbb will return 5 (with exact match) // A findx with bbbc will return 8 // // If we search for prefix cccc, we must return 8/8 // A findx with cccc will return 8 // A findx with bbbc will return 8 // // If we search for prefix a, we must return 0/4 // A findx with a will return 0 // A find with b will return 4 entry_index_t begin_idx, end_idx; if (path.empty() || path == "/") { begin_idx = m_impl->getStartUserEntry(); end_idx = m_impl->getEndUserEntry(); } else if (m_impl->hasNewNamespaceScheme()) { begin_idx = m_impl->findx('C', path).second; path.back()++; end_idx = m_impl->findx('C', path).second; } else { char ns; try { std::tie(ns, path) = parseLongPath(path); } catch (...) { return Archive::EntryRange(m_impl, 0, 0); } begin_idx = m_impl->findx(ns, path).second; if (path.empty()) { ns++; } else { path.back()++; } end_idx = m_impl->findx(ns, path).second; } return Archive::EntryRange(m_impl, begin_idx.v, end_idx.v); } Archive::EntryRange Archive::findByTitle(std::string title) const { // "title order" means that the entries are stored by "NS/title" part. // It is nice when we want to search for title in a specific namespace, but // now we want to hide the namespace. It would be better if the "title order" // would be real title order, whatever is the namespace. // // If we really want to search by title what ever is the namespace, we would have to // search in all "content" namespace and then merge the results. // // The find by title is only used for the article (`A` namespace). So let's search // only in it. // See `Archive::findByPath` for the rational. auto ns = m_impl->hasNewNamespaceScheme() ? 
'C' : 'A'; auto begin_idx = m_impl->findxByTitle(ns, title).second; title.back()++; auto end_idx = m_impl->findxByTitle(ns, title).second; return Archive::EntryRange(m_impl, begin_idx.v, end_idx.v); } bool Archive::hasChecksum() const { return m_impl->getFileheader().hasChecksum(); } std::string Archive::getChecksum() const { return m_impl->getChecksum(); } bool Archive::check() const { return m_impl->verify(); } bool Archive::isMultiPart() const { return m_impl->is_multiPart(); } bool Archive::hasNewNamespaceScheme() const { return m_impl->hasNewNamespaceScheme(); } cluster_index_type Archive::getClusterCount() const { return cluster_index_type(m_impl->getCountClusters()); } offset_type Archive::getClusterOffset(cluster_index_type idx) const { return offset_type(m_impl->getClusterOffset(cluster_index_t(idx))); } entry_index_type Archive::getMainEntryIndex() const { return m_impl->getFileheader().getMainPage(); } template<> entry_index_type _toPathOrder(const FileImpl& impl, entry_index_type idx) { return idx; } template<> entry_index_type _toPathOrder(const FileImpl& impl, entry_index_type idx) { return impl.getIndexByTitle(title_index_t(idx)).v; } template<> entry_index_type _toPathOrder(const FileImpl& impl, entry_index_type idx) { return impl.getIndexByClusterOrder(entry_index_t(idx)).v; } bool Archive::checkIntegrity(IntegrityCheck checkType) { return m_impl->checkIntegrity(checkType); } bool validate(const std::string& zimPath, IntegrityCheckList checksToRun) { try { Archive a(zimPath); for ( size_t i = 0; i < checksToRun.size(); ++i ) { if ( checksToRun.test(i) && !a.checkIntegrity(IntegrityCheck(i)) ) return false; } } catch(ZimFileFormatError &exception) { std::cerr << exception.what() << std::endl; return false; } return true; } } // namespace zim libzim-9.2.3/src/blob.cpp000066400000000000000000000031711466367137100152440ustar00rootroot00000000000000/* * Copyright (C) 2020 Veloman Yunkan * Copyright (C) 2017-2020 Matthieu Gautier * * This program is free 
software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "zim/blob.h" #include "debug.h" #include "buffer.h" namespace zim { namespace { struct NoDelete { template void operator()(T*) {} }; // This shared_ptr is used as a source object for the std::shared_ptr // aliasing constructor (with the purpose of avoiding the control block // allocation) for the case when the referred data must not be deleted. static Blob::DataPtr nonOwnedDataPtr((char*)nullptr, NoDelete()); } // unnamed namespace Blob::Blob() : _data(nonOwnedDataPtr), _size(0) {} Blob::Blob(const char* data, size_type size) : _data(nonOwnedDataPtr, data), _size(size) { ASSERT(size, <, SIZE_MAX); ASSERT(data, <, (void*)(SIZE_MAX-size)); } Blob::Blob(const DataPtr& buffer, size_type size) : _data(buffer), _size(size) {} } //zim libzim-9.2.3/src/buffer.cpp000066400000000000000000000044451466367137100156040ustar00rootroot00000000000000/* * Copyright (C) 2020 Veloman Yunkan * Copyright (C) 2017-2020 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "buffer.h" #include "debug.h" #include #include #ifndef _WIN32 # include # include #endif namespace zim { namespace { struct NoDelete { template void operator()(T*) {} }; // This shared_ptr is used as a source object for the std::shared_ptr // aliasing constructor (with the purpose of avoiding the control block // allocation) for the case when the referred data must not be deleted. static Buffer::DataPtr nonOwnedDataPtr((char*)nullptr, NoDelete()); } // unnamed namespace const Buffer Buffer::sub_buffer(offset_t offset, zsize_t size) const { ASSERT(offset.v, <=, m_size.v); ASSERT(offset.v+size.v, <=, m_size.v); auto sub_data = DataPtr(m_data, data(offset)); return Buffer(sub_data, size); } const Buffer Buffer::makeBuffer(const DataPtr& data, zsize_t size) { return Buffer(data, size); } const Buffer Buffer::makeBuffer(const char* data, zsize_t size) { return Buffer(DataPtr(nonOwnedDataPtr, data), size); } Buffer Buffer::makeBuffer(zsize_t size) { if (0 == size.v) { return Buffer(DataPtr(nonOwnedDataPtr, nullptr), size); } return Buffer(DataPtr(new char[size.v], std::default_delete()), size); } Buffer::Buffer(const DataPtr& data, zsize_t size) : m_size(size), m_data(data) { ASSERT(m_size.v, <, SIZE_MAX); } const char* Buffer::data(offset_t offset) const { ASSERT(offset.v, <=, m_size.v); return m_data.get() + offset.v; } } //zim libzim-9.2.3/src/buffer.h000066400000000000000000000034171466367137100152470ustar00rootroot00000000000000/* * Copyright (C) 2020 Veloman Yunkan * 
Copyright (C) 2017-2020 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_BUFFER_H_ #define ZIM_BUFFER_H_ #include #include "config.h" #include "zim_types.h" #include namespace zim { class LIBZIM_PRIVATE_API Buffer { public: // types typedef std::shared_ptr DataPtr; public: // functions static const Buffer makeBuffer(const char* data, zsize_t size); static const Buffer makeBuffer(const DataPtr& data, zsize_t size); static Buffer makeBuffer(zsize_t size); const char* data(offset_t offset=offset_t(0)) const; char at(offset_t offset) const { return *(data(offset)); } zsize_t size() const { return m_size; } const Buffer sub_buffer(offset_t offset, zsize_t size) const; operator Blob() const { return Blob(m_data, m_size.v); } private: // functions Buffer(const DataPtr& data, zsize_t size); private: // data zsize_t m_size; DataPtr m_data; }; } // zim namespace #endif //ZIM_BUFFER_H_ libzim-9.2.3/src/buffer_reader.cpp000066400000000000000000000033651466367137100171260ustar00rootroot00000000000000/* * Copyright (C) 2017-2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your 
option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include #include "buffer_reader.h" #include "buffer.h" #include #include namespace zim { const Buffer BufferReader::get_buffer(offset_t offset, zsize_t size) const { return source.sub_buffer(offset, size); } std::unique_ptr BufferReader::sub_reader(offset_t offset, zsize_t size) const { auto sub_buff = get_buffer(offset, size); std::unique_ptr sub_read(new BufferReader(sub_buff)); return sub_read; } zsize_t BufferReader::size() const { return source.size(); } offset_t BufferReader::offset() const { return offset_t((offset_type)(static_cast(source.data(offset_t(0))))); } void BufferReader::readImpl(char* dest, offset_t offset, zsize_t size) const { memcpy(dest, source.data(offset), size.v); } char BufferReader::readImpl(offset_t offset) const { char dest; dest = *source.data(offset); return dest; } } // zim libzim-9.2.3/src/buffer_reader.h000066400000000000000000000030361466367137100165660ustar00rootroot00000000000000/* * Copyright (C) 2017 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. 
See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_BUFFER_READER_H_ #define ZIM_BUFFER_READER_H_ #include "reader.h" namespace zim { class LIBZIM_PRIVATE_API BufferReader : public Reader { public: BufferReader(const Buffer& source) : source(source) {} virtual ~BufferReader() {}; zsize_t size() const override; offset_t offset() const override; const Buffer get_buffer(offset_t offset, zsize_t size) const override; std::unique_ptr sub_reader(offset_t offset, zsize_t size) const override; private: // functions void readImpl(char* dest, offset_t offset, zsize_t size) const override; char readImpl(offset_t offset) const override; private: // data const Buffer source; }; }; #endif // ZIM_BUFFER_READER_H_ libzim-9.2.3/src/bufferstreamer.h000066400000000000000000000037501466367137100170120ustar00rootroot00000000000000/* * Copyright (C) 2020 Veloman Yunkan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_BUFFERSTREAMER_H #define ZIM_BUFFERSTREAMER_H #include "debug.h" #include "endian_tools.h" #include namespace zim { class BufferStreamer { public: // functions BufferStreamer(const Buffer& buffer, zsize_t size) : m_buffer(buffer), m_current(buffer.data()), m_size(size) {} explicit BufferStreamer(const Buffer& buffer) : BufferStreamer(buffer, buffer.size()) {} // Reads a value of the said type from the stream // // For best portability this function should be used with types of known // bit-width (int32_t, uint16_t, etc) rather than builtin types with // unknown bit-width (int, unsigned, etc). template T read() { const size_t N(sizeof(T)); char buf[N]; memcpy(buf, m_current, N); skip(zsize_t(N)); return fromLittleEndian(buf); // XXX: This handles only integral types } const char* current() const { return m_current; } zsize_t left() const { return m_size; } void skip(zsize_t nbBytes) { m_current += nbBytes.v; m_size -= nbBytes; } private: // data const Buffer m_buffer; const char* m_current; zsize_t m_size; }; } // namespace zim #endif // ZIM_BUFDATASTREAM_H libzim-9.2.3/src/cluster.cpp000066400000000000000000000132151466367137100160070ustar00rootroot00000000000000/* * Copyright (C) 2016-2021 Matthieu Gautier * Copyright (C) 2020 Veloman Yunkan * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "cluster.h" #include #include #include "buffer_reader.h" #include "bufferstreamer.h" #include "decoderstreamreader.h" #include "rawstreamreader.h" #include #include #include "log.h" #include "config.h" log_define("zim.cluster") #define log_debug1(e) namespace zim { namespace { std::unique_ptr getClusterReader(const Reader& zimReader, offset_t offset, Cluster::Compression* comp, bool* extended) { uint8_t clusterInfo = zimReader.read(offset); // Very old zim files used 0 as a "default" compression, which means no compression. 
uint8_t compInfo = clusterInfo & 0x0F; if(compInfo == 0) { *comp = Cluster::Compression::None; } else if (compInfo == int(Cluster::Compression::Zip)) { throw std::runtime_error("zlib not enabled in this library"); } else if (compInfo == int(Cluster::Compression::Bzip2)) { throw std::runtime_error("bzip2 not enabled in this library"); } else { *comp = static_cast(compInfo); } *extended = clusterInfo & 0x10; auto subReader = std::shared_ptr(zimReader.sub_reader(offset+offset_t(1))); switch ( *comp ) { case Cluster::Compression::None: return std::unique_ptr(new RawStreamReader(subReader)); case Cluster::Compression::Lzma: return std::unique_ptr(new DecoderStreamReader(subReader)); case Cluster::Compression::Zstd: return std::unique_ptr(new DecoderStreamReader(subReader)); default: throw ZimFileFormatError("Invalid compression flag"); } } } // unnamed namespace std::shared_ptr Cluster::read(const Reader& zimReader, offset_t clusterOffset) { Compression comp; bool extended; auto reader = getClusterReader(zimReader, clusterOffset, &comp, &extended); return std::make_shared(std::move(reader), comp, extended); } Cluster::Cluster(std::unique_ptr reader_, Compression comp, bool isExtended) : compression(comp), isExtended(isExtended), m_reader(std::move(reader_)) { if (isExtended) { read_header(); } else { read_header(); } } Cluster::~Cluster() = default; /* This return the number of char read */ template void Cluster::read_header() { // read first offset, which specifies, how many offsets we need to read OFFSET_TYPE offset = m_reader->read(); size_t n_offset = offset / sizeof(OFFSET_TYPE); // read offsets m_blobOffsets.clear(); m_blobOffsets.reserve(n_offset); m_blobOffsets.push_back(offset_t(offset)); // Get the whole offsets data to avoid to many (system) call. 
auto bufferSize = zsize_t(offset-sizeof(OFFSET_TYPE)); auto buffer = m_reader->sub_reader(bufferSize)->get_buffer(offset_t(0), bufferSize); auto seqReader = BufferStreamer(buffer, bufferSize); while (--n_offset) { OFFSET_TYPE new_offset = seqReader.read(); if (new_offset < offset) { throw zim::ZimFileFormatError("Error parsing cluster. Offsets are not ordered."); } m_blobOffsets.push_back(offset_t(new_offset)); offset = new_offset; } } zsize_t Cluster::getBlobSize(blob_index_t n) const { if (blob_index_type(n)+1 >= m_blobOffsets.size()) { throw ZimFileFormatError("blob index out of range"); } return zsize_t(m_blobOffsets[blob_index_type(n)+1].v - m_blobOffsets[blob_index_type(n)].v); } const Reader& Cluster::getReader(blob_index_t n) const { std::lock_guard lock(m_readerAccessMutex); for(blob_index_type current(m_blobReaders.size()); current<=n.v; ++current) { auto blobSize = getBlobSize(blob_index_t(current)); if (blobSize.v > SIZE_MAX) { m_blobReaders.push_back(std::unique_ptr(new BufferReader(Buffer::makeBuffer(zsize_t(0))))); } else { m_blobReaders.push_back(m_reader->sub_reader(blobSize)); } } return *m_blobReaders[blob_index_type(n)]; } Blob Cluster::getBlob(blob_index_t n) const { if (n < count()) { const auto blobSize = getBlobSize(n); if (blobSize.v > SIZE_MAX) { return Blob(); } return getReader(n).get_buffer(offset_t(0), blobSize); } else { return Blob(); } } Blob Cluster::getBlob(blob_index_t n, offset_t offset, zsize_t size) const { if (n < count()) { const auto blobSize = getBlobSize(n); if ( offset.v > blobSize.v ) { return Blob(); } size = std::min(size, zsize_t(blobSize.v-offset.v)); if (size.v > SIZE_MAX) { return Blob(); } return getReader(n).get_buffer(offset, size); } else { return Blob(); } } } libzim-9.2.3/src/cluster.h000066400000000000000000000063411466367137100154560ustar00rootroot00000000000000/* * Copyright (C) 2016-2021 Matthieu Gautier * Copyright (C) 2020 Veloman Yunkan * Copyright (C) 2020 Miguel Rocha * Copyright (C) 2009 Tommi 
Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_CLUSTER_H #define ZIM_CLUSTER_H #include #include "buffer.h" #include "zim_types.h" #include #include #include namespace zim { class Blob; class Reader; class IStreamReader; class LIBZIM_PRIVATE_API Cluster : public std::enable_shared_from_this { typedef std::vector BlobOffsets; typedef std::vector> BlobReaders; public: // zim::Compression lists only compression methods supported by the // writer. But on the reader side we need to deal with some historical // compression types. Here we maintain the full list of compression // types. enum class Compression { None = 1, Zip, // Support is discontinued Bzip2, // Support is discontinued Lzma, // Supported only by the reader Zstd }; public: const Compression compression; const bool isExtended; private: std::unique_ptr m_reader; // offsets of the blob boundaries relative to the start of the cluster data // (*after* the first byte (clusterInfo)) // For a cluster with N blobs, this collection contains N+1 entries. // The start of the first blob and the end of the last blob are included. 
BlobOffsets m_blobOffsets; mutable std::mutex m_readerAccessMutex; mutable BlobReaders m_blobReaders; template void read_header(); const Reader& getReader(blob_index_t n) const; public: Cluster(std::unique_ptr reader, Compression comp, bool isExtended); ~Cluster(); Compression getCompression() const { return compression; } bool isCompressed() const { return compression != Compression::None; } blob_index_t count() const { return blob_index_t(m_blobOffsets.size() - 1); } zsize_t getBlobSize(blob_index_t n) const; offset_t getBlobOffset(blob_index_t n) const { return offset_t(1) + m_blobOffsets[blob_index_type(n)]; } Blob getBlob(blob_index_t n) const; Blob getBlob(blob_index_t n, offset_t offset, zsize_t size) const; static std::shared_ptr read(const Reader& zimReader, offset_t clusterOffset); }; } #endif // ZIM_CLUSTER_H libzim-9.2.3/src/compression.cpp000066400000000000000000000114621466367137100166710ustar00rootroot00000000000000/* * Copyright (C) 2020-2021 Matthieu Gautier * Copyright (C) 2020 Emmanuel Engelhart * Copyright (C) 2020 Veloman Yunkan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the impliedD * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "compression.h" #include "envvalue.h" #include #include const std::string LZMA_INFO::name = "lzma"; void LZMA_INFO::init_stream_decoder(stream_t* stream, char* raw_data) { *stream = LZMA_STREAM_INIT; unsigned memsize = zim::envMemSize("ZIM_LZMA_MEMORY_SIZE", LZMA_MEMORY_SIZE * 1024 * 1024); auto errcode = lzma_stream_decoder(stream, memsize, 0); if (errcode != LZMA_OK) { throw std::runtime_error("Impossible to allocated needed memory to uncompress lzma stream"); } } CompStatus LZMA_INFO::stream_run_decode(stream_t* stream, CompStep step) { return stream_run(stream, step); } CompStatus LZMA_INFO::stream_run(stream_t* stream, CompStep step) { auto errcode = lzma_code(stream, step==CompStep::STEP?LZMA_RUN:LZMA_FINISH); switch(errcode) { case LZMA_BUF_ERROR: return CompStatus::BUF_ERROR; case LZMA_STREAM_END: return CompStatus::STREAM_END; case LZMA_OK: return CompStatus::OK; default: { throw std::runtime_error(zim::Formatter() << "Unexpected lzma status : " << errcode); } } } void LZMA_INFO::stream_end_decode(stream_t* stream) { lzma_end(stream); } const std::string ZSTD_INFO::name = "zstd"; ZSTD_INFO::stream_t::stream_t() : next_in(nullptr), avail_in(0), next_out(nullptr), avail_out(0), total_out(0), encoder_stream(nullptr), decoder_stream(nullptr) {} ZSTD_INFO::stream_t::~stream_t() { if ( encoder_stream ) ::ZSTD_freeCStream(encoder_stream); if ( decoder_stream ) ::ZSTD_freeDStream(decoder_stream); } void ZSTD_INFO::init_stream_decoder(stream_t* stream, char* raw_data) { stream->decoder_stream = ::ZSTD_createDStream(); auto ret = ::ZSTD_initDStream(stream->decoder_stream); if (::ZSTD_isError(ret)) { throw std::runtime_error("Failed to initialize Zstd decompression"); } } void ZSTD_INFO::init_stream_encoder(stream_t* stream, char* raw_data) { 
stream->encoder_stream = ::ZSTD_createCStream(); auto ret = ::ZSTD_initCStream(stream->encoder_stream, 19); if (::ZSTD_isError(ret)) { throw std::runtime_error("Failed to initialize Zstd compression"); } } CompStatus ZSTD_INFO::stream_run_encode(stream_t* stream, CompStep step) { ::ZSTD_inBuffer inBuf; inBuf.src = stream->next_in; inBuf.size = stream->avail_in; inBuf.pos = 0; ::ZSTD_outBuffer outBuf; outBuf.dst = stream->next_out; outBuf.size = stream->avail_out; outBuf.pos = 0; auto ret = step == CompStep::STEP ? ::ZSTD_compressStream(stream->encoder_stream, &outBuf, &inBuf) : ::ZSTD_endStream(stream->encoder_stream, &outBuf); stream->next_in += inBuf.pos; stream->avail_in -= inBuf.pos; stream->next_out += outBuf.pos; stream->avail_out -= outBuf.pos; stream->total_out += outBuf.pos; if (::ZSTD_isError(ret)) { throw std::runtime_error(::ZSTD_getErrorName(ret)); } if ( step == CompStep::STEP ) { if ( stream->avail_in != 0) { ASSERT(stream->avail_out, ==, 0u); return CompStatus::BUF_ERROR; } } else if ( ret > 0 ) { return CompStatus::BUF_ERROR; } return CompStatus::OK; } CompStatus ZSTD_INFO::stream_run_decode(stream_t* stream, CompStep /*step*/) { ::ZSTD_inBuffer inBuf; inBuf.src = stream->next_in; inBuf.size = stream->avail_in; inBuf.pos = 0; ::ZSTD_outBuffer outBuf; outBuf.dst = stream->next_out; outBuf.size = stream->avail_out; outBuf.pos = 0; auto ret = ::ZSTD_decompressStream(stream->decoder_stream, &outBuf, &inBuf); stream->next_in += inBuf.pos; stream->avail_in -= inBuf.pos; stream->next_out += outBuf.pos; stream->avail_out -= outBuf.pos; stream->total_out += outBuf.pos; if (::ZSTD_isError(ret)) throw std::runtime_error(::ZSTD_getErrorName(ret)); if (ret == 0) return CompStatus::STREAM_END; return CompStatus::BUF_ERROR; } void ZSTD_INFO::stream_end_decode(stream_t* stream) { } void ZSTD_INFO::stream_end_encode(stream_t* stream) { } libzim-9.2.3/src/compression.h000066400000000000000000000226531466367137100163420ustar00rootroot00000000000000/* * Copyright (C) 
2020-2021 Matthieu Gautier * Copyright (C) 2020 Emmanuel Engelhart * Copyright (C) 2020 Veloman Yunkan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef _LIBZIM_COMPRESSION_ #define _LIBZIM_COMPRESSION_ #include "reader.h" #include #include "config.h" #include #include #include "zim_types.h" #include "constants.h" #include #include #include //#define DEB(X) std::cerr << __func__ << " " << X << std::endl ; #define DEB(X) enum class CompStep { STEP, FINISH }; enum class CompStatus { OK, STREAM_END, BUF_ERROR, }; enum class RunnerStatus { OK, NEED_MORE, ERROR }; struct LZMA_INFO { typedef lzma_stream stream_t; static const std::string name; static void init_stream_decoder(stream_t* stream, char* raw_data); static CompStatus stream_run_decode(stream_t* stream, CompStep step); static CompStatus stream_run(stream_t* stream, CompStep step); static void stream_end_decode(stream_t* stream); }; struct LIBZIM_PRIVATE_API ZSTD_INFO { struct LIBZIM_PRIVATE_API stream_t { const unsigned char* next_in; size_t avail_in; unsigned char* next_out; size_t avail_out; size_t total_out; ::ZSTD_CStream* encoder_stream; ::ZSTD_DStream* decoder_stream; stream_t(); ~stream_t(); private: stream_t(const stream_t& t) = delete; void operator=(const stream_t& t) = delete; }; static const std::string name; 
static void init_stream_decoder(stream_t* stream, char* raw_data); static void init_stream_encoder(stream_t* stream, char* raw_data); static CompStatus stream_run_encode(stream_t* stream, CompStep step); static CompStatus stream_run_decode(stream_t* stream, CompStep step); static void stream_end_encode(stream_t* stream); static void stream_end_decode(stream_t* stream); }; namespace zim { template class Uncompressor { public: Uncompressor(size_t initial_size) : ret_data(new char[initial_size]), data_size(initial_size) {} ~Uncompressor() = default; void init(char* data) { INFO::init_stream_decoder(&stream, data); stream.next_out = (uint8_t*)ret_data.get(); stream.avail_out = data_size; } RunnerStatus feed(char* data, size_t size, CompStep step = CompStep::STEP) { stream.next_in = (unsigned char*)data; stream.avail_in = size; while (true) { auto errcode = INFO::stream_run_decode(&stream, step); DEB((int)errcode) switch (errcode) { case CompStatus::BUF_ERROR: if (stream.avail_in == 0 && stream.avail_out != 0) { // End of input stream. // compressor hasn't recognize the end of the input stream but there is // no more input. return RunnerStatus::NEED_MORE; } else { // Not enought output size. // Allocate more memory and continue the loop. DEB("need memory " << data_size << " " << stream.avail_out << " " << stream.total_out) data_size *= 2; std::unique_ptr new_ret_data(new char[data_size]); memcpy(new_ret_data.get(), ret_data.get(), stream.total_out); stream.next_out = (unsigned char*)(new_ret_data.get() + stream.total_out); stream.avail_out = data_size - stream.total_out; DEB(data_size << " " << stream.avail_out << " " << stream.avail_in) ret_data = std::move(new_ret_data); } break; case CompStatus::OK: // On first call where lzma cannot progress (no output size). // Lzma return OK. If we return NEED_MORE, then we will try to compress // with new input data, but we should not as current one is not processed. 
// We must do a second step to have te BUF_ERROR and handle thing correctly. // If we have no more input, then we must ask for more. if (stream.avail_in == 0) { return RunnerStatus::NEED_MORE; } break; case CompStatus::STREAM_END: // End of compressed stream. Everything is ok. return RunnerStatus::OK; default: // unreachable return RunnerStatus::ERROR; } }; // unreachable return RunnerStatus::NEED_MORE; } std::unique_ptr get_data(zim::zsize_t* size) { feed(nullptr, 0, CompStep::FINISH); size->v = stream.total_out; INFO::stream_end_decode(&stream); return std::move(ret_data); } private: std::unique_ptr ret_data; size_type data_size; typename INFO::stream_t stream; }; #define CHUNCK_SIZE ((zim::size_type)(1024)) /** * Uncompress data of the reader at startOffset. * * @param reader The reader where the data is. * @param startOffset The offset where the data is in the reader. * @param[out] dest_size The size of the uncompressed data. * @return A pointer to the uncompressed data. This must be deleted (delete[]) */ template std::unique_ptr uncompress(const zim::Reader* reader, zim::offset_t startOffset, zim::zsize_t* dest_size) { // Use a compressor to compress the data. // As we don't know the result size, neither the compressed size, // we have to do chunk by chunk until decompressor is happy. // Let's assume it will be something like the default clusterSize used at creation Uncompressor runner(DEFAULT_CLUSTER_SIZE); // The input is a buffer of CHUNCK_SIZE char max. It may be less if the last chunk // is at the end of the reader and the reader size is not a multiple of CHUNCK_SIZE. 
std::vector raw_data(CHUNCK_SIZE); DEB("Init") runner.init(raw_data.data()); zim::size_type availableSize = reader->size().v - startOffset.v; auto ret = RunnerStatus::NEED_MORE; while(ret != RunnerStatus::OK) { if (ret == RunnerStatus::NEED_MORE and availableSize) { zim::size_type inputSize = std::min(availableSize, CHUNCK_SIZE); reader->read(raw_data.data(), startOffset, zim::zsize_t(inputSize)); startOffset.v += inputSize; availableSize -= inputSize; DEB("Step " << startOffset.v) ret = runner.feed(raw_data.data(), inputSize); DEB("Ret " << (int)ret) } if (ret == RunnerStatus::ERROR) { throw zim::ZimFileFormatError(std::string("Invalid ") + INFO::name + std::string(" stream for cluster.")); } } DEB("Finish") return runner.get_data(dest_size); } template class Compressor { public: Compressor(size_t initial_size=1024*1024) : ret_data(new char[initial_size]), ret_size(initial_size) {} ~Compressor() = default; void init(char* data) { INFO::init_stream_encoder(&stream, data); stream.next_out = (uint8_t*)ret_data.get(); stream.avail_out = ret_size; } RunnerStatus feed(const char* data, size_t size, CompStep step=CompStep::STEP) { stream.next_in = (unsigned char*)data; stream.avail_in = size; while (true) { auto errcode = INFO::stream_run_encode(&stream, step); switch (errcode) { case CompStatus::OK: if (stream.avail_out == 0) { // lzma return a OK return status the first time it runs out of output memory. // The BUF_ERROR is returned only the second time we call a lzma_code. 
continue; } else { return RunnerStatus::NEED_MORE; } case CompStatus::STREAM_END: return RunnerStatus::NEED_MORE; case CompStatus::BUF_ERROR: if (stream.avail_out == 0) { //Not enought output size ret_size *= 2; std::unique_ptr new_ret_data(new char[ret_size]); memcpy(new_ret_data.get(), ret_data.get(), stream.total_out); stream.next_out = (unsigned char*)(new_ret_data.get() + stream.total_out); stream.avail_out = ret_size - stream.total_out; ret_data = std::move(new_ret_data); continue; } else { return RunnerStatus::ERROR; } break; default: // unreachable return RunnerStatus::ERROR; }; }; // urreachable return RunnerStatus::NEED_MORE; } std::unique_ptr get_data(zim::zsize_t* size) { feed(nullptr, 0, CompStep::FINISH); INFO::stream_end_encode(&stream); size->v = stream.total_out; return std::move(ret_data); } private: std::unique_ptr ret_data; size_t ret_size; typename INFO::stream_t stream; }; } // namespace zim #endif // _LIBZIM_COMPRESSION_ libzim-9.2.3/src/concurrent_cache.h000066400000000000000000000054031466367137100173000ustar00rootroot00000000000000/* * Copyright (C) 2021 Matthieu Gautier * Copyright (C) 2020 Veloman Yunkan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_CONCURRENT_CACHE_H #define ZIM_CONCURRENT_CACHE_H #include "lrucache.h" #include #include namespace zim { /** ConcurrentCache implements a concurrent thread-safe cache Compared to zim::lru_cache, each access operation is slightly more expensive. However, different slots of the cache can be safely accessed concurrently with minimal blocking. Concurrent access to the same element is also safe, and, in case of a cache miss, will block until that element becomes available. */ template class ConcurrentCache { private: // types typedef std::shared_future ValuePlaceholder; typedef lru_cache Impl; public: // types explicit ConcurrentCache(size_t maxEntries) : impl_(maxEntries) {} // Gets the entry corresponding to the given key. If the entry is not in the // cache, it is obtained by calling f() (without any arguments) and the // result is put into the cache. // // The cache as a whole is locked only for the duration of accessing // the respective slot. If, in the case of the a cache miss, the generation // of the missing element takes a long time, only attempts to access that // element will block - the rest of the cache remains open to concurrent // access. 
template Value getOrPut(const Key& key, F f) { std::promise valuePromise; std::unique_lock l(lock_); const auto x = impl_.getOrPut(key, valuePromise.get_future().share()); l.unlock(); if ( x.miss() ) { try { valuePromise.set_value(f()); } catch (std::exception& e) { drop(key); throw; } } return x.value().get(); } bool drop(const Key& key) { std::unique_lock l(lock_); return impl_.drop(key); } private: // data Impl impl_; std::mutex lock_; }; } // namespace zim #endif // ZIM_CONCURRENT_CACHE_H libzim-9.2.3/src/config.h.in000066400000000000000000000011151466367137100156410ustar00rootroot00000000000000#ifndef ZIM_CONFIG_H #define ZIM_CONFIG_H #if defined(_MSC_VER) && defined(LIBZIM_EXPORT_PRIVATE_DLL) #define LIBZIM_PRIVATE_API __declspec(dllexport) #else #define LIBZIM_PRIVATE_API #endif #mesondefine VERSION #mesondefine DIRENT_CACHE_SIZE #mesondefine DIRENT_LOOKUP_CACHE_SIZE #mesondefine CLUSTER_CACHE_SIZE #mesondefine LZMA_MEMORY_SIZE #mesondefine ENABLE_XAPIAN #mesondefine ENABLE_XAPIAN_FULLER #mesondefine ENABLE_USE_MMAP #mesondefine ENABLE_USE_BUFFER_HEADER #mesondefine MMAP_SUPPORT_64 #mesondefine ENV64BIT #mesondefine ENV32BIT #endif // ZIM_CONFIG_H libzim-9.2.3/src/constants.h000066400000000000000000000016021466367137100160040ustar00rootroot00000000000000/* * Copyright (C) 2021 Maneesh P M * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #define ANCHOR_TERM "0posanchor " #define DEFAULT_CLUSTER_SIZE 2*1024*1024 libzim-9.2.3/src/debug.h000066400000000000000000000042371466367137100150650ustar00rootroot00000000000000/* * Copyright (C) 2017-2020 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef DEBUG_H_ #define DEBUG_H_ #include #include #include #include #if defined (NDEBUG) # define ASSERT(left, operator, right) (void(0)) #else #if !defined(_WIN32) && !defined(__APPLE__) && !defined(__ANDROID__) && !defined(__EMSCRIPTEN__) && defined(__GNU_LIBRARY__) #include #endif template void _on_assert_fail(const char* vara, const char* op, const char* varb, T a, U b, const char* file, int line) { zim::Formatter fmt; std::cerr << (fmt << "\nAssertion failed at " << file << ":" << line << "\n " << vara << "[" << a << "] " << op << " " << varb << "[" << b << "]") << std::endl; #if !defined(_WIN32) && !defined(__APPLE__) && !defined(__ANDROID__) && !defined(__EMSCRIPTEN__) && defined(__GNU_LIBRARY__) void *callstack[64]; size_t size; size = backtrace(callstack, 64); char** strings = 
backtrace_symbols(callstack, size); for (size_t i=0; i * Copyright (C) 2020 Veloman Yunkan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_DECODERSTREAMREADER_H #define ZIM_DECODERSTREAMREADER_H #include "compression.h" #include "istreamreader.h" namespace zim { template class DecoderStreamReader : public IStreamReader { private: // constants enum { CHUNK_SIZE = 1024 }; public: // functions DecoderStreamReader(std::shared_ptr inputReader) : m_encodedDataReader(inputReader), m_currentInputOffset(0), m_inputBytesLeft(inputReader->size()), m_encodedDataChunk(Buffer::makeBuffer(zsize_t(CHUNK_SIZE))) { Decoder::init_stream_decoder(&m_decoderState, nullptr); readNextChunk(); } ~DecoderStreamReader() { Decoder::stream_end_decode(&m_decoderState); } private: // functions void readNextChunk() { const auto n = std::min(zsize_t(CHUNK_SIZE), m_inputBytesLeft); m_encodedDataChunk = m_encodedDataReader->get_buffer(m_currentInputOffset, n); m_currentInputOffset += n; m_inputBytesLeft -= n; // XXX: ugly C-style cast (casting away constness) on the next line m_decoderState.next_in = (unsigned char*)m_encodedDataChunk.data(); m_decoderState.avail_in = m_encodedDataChunk.size().v; } CompStatus decodeMoreBytes() { CompStep step = CompStep::STEP; if ( m_decoderState.avail_in == 0 ) { if ( 
m_inputBytesLeft.v == 0 ) step = CompStep::FINISH; else readNextChunk(); } return Decoder::stream_run_decode(&m_decoderState, step); } void readImpl(char* buf, zsize_t nbytes) override { m_decoderState.next_out = (unsigned char*)buf; m_decoderState.avail_out = nbytes.v; while ( m_decoderState.avail_out != 0 ) { // We don't car of the return code of decodeMoreBytes. // We feed (or stop feeding) the decoder based on what // we need to decode and the `avail_in`. // If there is a error somehow, a exception will be thrown. decodeMoreBytes(); } } private: // types typedef typename Decoder::stream_t DecoderState; private: // data std::shared_ptr m_encodedDataReader; offset_t m_currentInputOffset; zsize_t m_inputBytesLeft; // count of bytes left in the input stream DecoderState m_decoderState; Buffer m_encodedDataChunk; }; } // namespace zim #endif // ZIM_DECODERSTREAMREADER_H libzim-9.2.3/src/dirent.cpp000066400000000000000000000114511466367137100156130ustar00rootroot00000000000000/* * Copyright (C) 2017-2020 Matthieu Gautier * Copyright (C) 2006 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "_dirent.h" #include "direntreader.h" #include #include #include "buffer.h" #include "bufferstreamer.h" #include "log.h" #include log_define("zim.dirent") namespace zim { ////////////////////////////////////////////////////////////////////// // Dirent // const uint16_t Dirent::redirectMimeType; const uint16_t Dirent::linktargetMimeType; const uint16_t Dirent::deletedMimeType; bool DirentReader::initDirent(Dirent& dirent, const Buffer& direntData) const { BufferStreamer reader(direntData); uint16_t mimeType = reader.read(); bool redirect = (mimeType == Dirent::redirectMimeType); bool linktarget = (mimeType == Dirent::linktargetMimeType); bool deleted = (mimeType == Dirent::deletedMimeType); uint8_t extraLen = reader.read(); char ns = reader.read(); uint32_t version = reader.read(); dirent.setVersion(version); if (redirect) { entry_index_type redirectIndex(reader.read()); log_debug("redirectIndex=" << redirectIndex); dirent.setRedirect(entry_index_t(redirectIndex)); } else if (linktarget || deleted) { log_debug("linktarget or deleted entry"); dirent.setItem(mimeType, cluster_index_t(0), blob_index_t(0)); } else { log_debug("read article entry"); uint32_t clusterNumber = reader.read(); uint32_t blobNumber = reader.read(); log_debug("mimeType=" << mimeType << " clusterNumber=" << clusterNumber << " blobNumber=" << blobNumber); dirent.setItem(mimeType, cluster_index_t(clusterNumber), blob_index_t(blobNumber)); } std::string path; std::string title; std::string parameter; log_debug("read path, title and parameters"); size_type path_size = strnlen( reader.current(), reader.left().v - extraLen ); if (path_size >= reader.left().v) { return false; } path = std::string(reader.current(), path_size); reader.skip(zsize_t(path_size + 1)); size_type title_size = 
strnlen( reader.current(), reader.left().v - extraLen ); if (title_size >= reader.left().v) { return false; } title = std::string(reader.current(), title_size); reader.skip(zsize_t(title_size+1)); if (extraLen > reader.left().v) { return false; } parameter = std::string(reader.current(), extraLen); dirent.setPath(ns, path); dirent.setTitle(title); dirent.setParameter(parameter); return true; } std::shared_ptr DirentReader::readDirent(offset_t offset) { const auto totalSize = mp_zimReader->size(); if (offset.v >= totalSize.v) { throw ZimFileFormatError("Invalid dirent pointer"); } // We don't know the size of the dirent because it depends of the size of // the title, path and extra parameters. // This is a pity but we have no choice. // We cannot take a buffer of the size of the file, it would be really // inefficient. Let's do try, catch and retry while chosing a smart value // for the buffer size. Most dirent will be "Article" entry (header's size // == 16) without extra parameters. Let's hope that path + title size will // be < 256 and if not try again with a bigger size. 
size_t bufferSize(std::min(size_type(256), mp_zimReader->size().v-offset.v)); auto dirent = std::make_shared(); std::lock_guard lock(m_bufferMutex); for ( ; ; bufferSize += 256 ) { m_buffer.reserve(bufferSize); mp_zimReader->read(m_buffer.data(), offset, zsize_t(bufferSize)); if ( initDirent(*dirent, Buffer::makeBuffer(m_buffer.data(), zsize_t(bufferSize))) ) return dirent; } } std::string Dirent::getLongPath() const { log_trace("Dirent::getLongPath()"); log_debug("namespace=" << getNamespace() << " title=" << getTitle()); return std::string(1, getNamespace()) + '/' + getPath(); } } libzim-9.2.3/src/dirent_accessor.cpp000066400000000000000000000056621466367137100175040ustar00rootroot00000000000000/* * Copyright (C) 2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "dirent_accessor.h" #include "direntreader.h" #include "_dirent.h" #include "envvalue.h" #include #include using namespace zim; DirectDirentAccessor::DirectDirentAccessor( std::shared_ptr direntReader, std::unique_ptr pathPtrReader, entry_index_t direntCount) : mp_direntReader(direntReader), mp_pathPtrReader(std::move(pathPtrReader)), m_direntCount(direntCount), m_direntCache(envValue("ZIM_DIRENTCACHE", DIRENT_CACHE_SIZE)), m_bufferDirentZone(256) {} std::shared_ptr DirectDirentAccessor::getDirent(entry_index_t idx) const { { std::lock_guard l(m_direntCacheLock); auto v = m_direntCache.get(idx.v); if (v.hit()) { return v.value(); } } auto direntOffset = getOffset(idx); auto dirent = readDirent(direntOffset); std::lock_guard l(m_direntCacheLock); m_direntCache.put(idx.v, dirent); return dirent; } offset_t DirectDirentAccessor::getOffset(entry_index_t idx) const { if (idx >= m_direntCount) { throw std::out_of_range("entry index out of range"); } offset_t offset(mp_pathPtrReader->read_uint(offset_t(sizeof(offset_type)*idx.v))); return offset; } std::shared_ptr DirectDirentAccessor::readDirent(offset_t offset) const { return mp_direntReader->readDirent(offset); } IndirectDirentAccessor::IndirectDirentAccessor(std::shared_ptr direntAccessor, std::unique_ptr indexReader, title_index_t direntCount) : mp_direntAccessor(direntAccessor), mp_indexReader(std::move(indexReader)), m_direntCount(direntCount) {} entry_index_t IndirectDirentAccessor::getDirectIndex(title_index_t idx) const { if (idx >= m_direntCount) { throw std::out_of_range("entry index out of range"); } entry_index_t index(mp_indexReader->read_uint(offset_t(sizeof(entry_index_t)*idx.v))); return index; } std::shared_ptr IndirectDirentAccessor::getDirent(title_index_t idx) const { auto 
directIndex = getDirectIndex(idx); return mp_direntAccessor->getDirent(directIndex); } libzim-9.2.3/src/dirent_accessor.h000066400000000000000000000054541466367137100171500ustar00rootroot00000000000000/* * Copyright (C) 2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_DIRENT_ACCESSOR_H #define ZIM_DIRENT_ACCESSOR_H #include "zim_types.h" #include "lrucache.h" #include "config.h" #include #include #include namespace zim { class Dirent; class Reader; class DirentReader; /** * DirectDirentAccessor is used to access a dirent from its index. * It doesn't provide any "advanced" features like lookup or find. * * This is the base class to locate a dirent (offset) and read it. 
* */ class LIBZIM_PRIVATE_API DirectDirentAccessor { public: // functions DirectDirentAccessor(std::shared_ptr direntReader, std::unique_ptr pathPtrReader, entry_index_t direntCount); offset_t getOffset(entry_index_t idx) const; std::shared_ptr getDirent(entry_index_t idx) const; entry_index_t getDirentCount() const { return m_direntCount; } private: // functions std::shared_ptr readDirent(offset_t) const; private: // data std::shared_ptr mp_direntReader; std::unique_ptr mp_pathPtrReader; entry_index_t m_direntCount; mutable lru_cache> m_direntCache; mutable std::mutex m_direntCacheLock; mutable std::vector m_bufferDirentZone; mutable std::mutex m_bufferDirentLock; }; class IndirectDirentAccessor { public: IndirectDirentAccessor(std::shared_ptr, std::unique_ptr indexReader, title_index_t direntCount); entry_index_t getDirectIndex(title_index_t idx) const; std::shared_ptr getDirent(title_index_t idx) const; title_index_t getDirentCount() const { return m_direntCount; } private: // data std::shared_ptr mp_direntAccessor; std::unique_ptr mp_indexReader; title_index_t m_direntCount; }; } // namespace zim #endif // ZIM_DIRENT_ACCESSOR_H libzim-9.2.3/src/dirent_lookup.h000066400000000000000000000163741466367137100166620ustar00rootroot00000000000000/* * Copyright (C) 2020 Veloman Yunkan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_DIRENT_LOOKUP_H #define ZIM_DIRENT_LOOKUP_H #include "zim_types.h" #include "debug.h" #include "narrowdown.h" #include #include #include #include namespace zim { template class DirentLookup { public: // types typedef typename TConfig::DirentAccessorType DirentAccessor; typedef typename TConfig::index_t index_t; typedef std::pair Result; public: // functions explicit DirentLookup(const DirentAccessor* _direntAccessor); index_t getNamespaceRangeBegin(char ns) const; index_t getNamespaceRangeEnd(char ns) const; Result find(char ns, const std::string& key) const; protected: // functions int compareWithDirentAt(char ns, const std::string& key, entry_index_type i) const; Result findInRange(entry_index_type l, entry_index_type u, char ns, const std::string& key) const; Result binarySearchInRange(entry_index_type l, entry_index_type u, char ns, const std::string& key) const; protected: // types typedef std::map NamespaceBoundaryCache; protected: // data const DirentAccessor& direntAccessor; const entry_index_type direntCount; mutable NamespaceBoundaryCache namespaceBoundaryCache; mutable std::mutex cacheAccessMutex; }; template int DirentLookup::compareWithDirentAt(char ns, const std::string& key, entry_index_type i) const { const auto dirent = direntAccessor.getDirent(index_t(i)); return ns < dirent->getNamespace() ? -1 : ns > dirent->getNamespace() ? 
1 : key.compare(TConfig::getDirentKey(*dirent)); } template class FastDirentLookup : public DirentLookup { typedef DirentLookup BaseType; typedef typename BaseType::DirentAccessor DirentAccessor; typedef typename BaseType::index_t index_t; public: // functions FastDirentLookup(const DirentAccessor* _direntAccessor, entry_index_type cacheEntryCount); typename BaseType::Result find(char ns, const std::string& key) const; private: // functions std::string getDirentKey(entry_index_type i) const; private: // data using BaseType::direntAccessor; using BaseType::direntCount; NarrowDown lookupGrid; }; template std::string FastDirentLookup::getDirentKey(entry_index_type i) const { const auto d = direntAccessor.getDirent(index_t(i)); return d->getNamespace() + TConfig::getDirentKey(*d); } template DirentLookup::DirentLookup(const DirentAccessor* _direntAccessor) : direntAccessor(*_direntAccessor) , direntCount(direntAccessor.getDirentCount()) { } template FastDirentLookup::FastDirentLookup(const DirentAccessor* _direntAccessor, entry_index_type cacheEntryCount) : BaseType(_direntAccessor) { if ( direntCount ) { const entry_index_type step = std::max(1u, direntCount/cacheEntryCount); for ( entry_index_type i = 0; i < direntCount-1; i += step ) { lookupGrid.add(getDirentKey(i), i, getDirentKey(i+1)); } lookupGrid.close(getDirentKey(direntCount - 1), direntCount - 1); } } template entry_index_t getNamespaceBeginOffset(TDirentAccessor& direntAccessor, char ch) { ASSERT(ch, >=, 32); ASSERT(ch, <=, 127); if (direntAccessor.getDirentCount().v == 0) { return entry_index_t(0); } entry_index_type lower = 0; entry_index_type upper = entry_index_type(direntAccessor.getDirentCount()); auto d = direntAccessor.getDirent(entry_index_t(0)); while (upper - lower > 1) { entry_index_type m = lower + (upper - lower) / 2; auto d = direntAccessor.getDirent(entry_index_t(m)); if (d->getNamespace() >= ch) upper = m; else lower = m; } entry_index_t ret = entry_index_t(d->getNamespace() < ch ? 
upper : lower); return ret; } template entry_index_t getNamespaceEndOffset(TDirentAccessor& direntAccessor, char ch) { ASSERT(ch, >=, 32); ASSERT(ch, <, 127); return getNamespaceBeginOffset(direntAccessor, ch+1); } template typename DirentLookup::index_t DirentLookup::getNamespaceRangeBegin(char ch) const { ASSERT(ch, >=, 32); ASSERT(ch, <=, 127); { std::lock_guard lock(cacheAccessMutex); const auto it = namespaceBoundaryCache.find(ch); if (it != namespaceBoundaryCache.end()) return it->second; } auto ret = getNamespaceBeginOffset(direntAccessor, ch); std::lock_guard lock(cacheAccessMutex); namespaceBoundaryCache[ch] = ret; return ret; } template typename DirentLookup::index_t DirentLookup::getNamespaceRangeEnd(char ns) const { return getNamespaceRangeBegin(ns+1); } template typename DirentLookup::Result FastDirentLookup::find(char ns, const std::string& key) const { const auto r = lookupGrid.getRange(ns + key); return BaseType::findInRange(r.begin, r.end, ns, key); } template typename DirentLookup::Result DirentLookup::find(char ns, const std::string& key) const { return findInRange(0, direntCount, ns, key); } template typename DirentLookup::Result DirentLookup::findInRange(entry_index_type l, entry_index_type u, char ns, const std::string& key) const { if ( l == u ) return { false, index_t(l) }; const auto c = compareWithDirentAt(ns, key, l); if ( c < 0 ) return { false, index_t(l) }; else if ( c == 0 ) return { true, index_t(l) }; if ( compareWithDirentAt(ns, key, u-1) > 0 ) return { false, index_t(u) }; return binarySearchInRange(l, u-1, ns, key); } template typename DirentLookup::Result DirentLookup::binarySearchInRange(entry_index_type l, entry_index_type u, char ns, const std::string& key) const { assert(l <= u && u < direntCount); assert(compareWithDirentAt(ns, key, l) > 0); assert(compareWithDirentAt(ns, key, u) <= 0); // Invariant maintained by the binary search: // (entry at l) < (query entry ns/key) <= (entry at u) while (true) { // compute p as the 
**upward rounded** average of l and u const entry_index_type p = l + (u - l + 1) / 2; const int c = compareWithDirentAt(ns, key, p); if (c <= 0) { // (entry at l) < ns/key <= (entry at p) <= (entry at u) if ( u == p ) { return { c == 0, index_t(u) }; } u = p; } else { // (entry at l) < (entry at p) < ns/key <= (entry at u) l = p; } } } } // namespace zim #endif // ZIM_DIRENT_LOOKUP_H libzim-9.2.3/src/direntreader.h000066400000000000000000000031471466367137100164460ustar00rootroot00000000000000/* * Copyright (C) 2020 Veloman Yunkan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_DIRENTREADER_H #define ZIM_DIRENTREADER_H #include "_dirent.h" #include "reader.h" #include #include #include namespace zim { // Unlke FileReader and MemoryReader (which read data from a file and memory, // respectively), DirentReader is a helper class that reads Dirents (rather // than from a Dirent). 
class LIBZIM_PRIVATE_API DirentReader { public: // functions explicit DirentReader(std::shared_ptr zimReader) : mp_zimReader(zimReader) {} std::shared_ptr readDirent(offset_t offset); private: // functions bool initDirent(Dirent& dirent, const Buffer& direntData) const; private: // data std::shared_ptr mp_zimReader; std::vector m_buffer; std::mutex m_bufferMutex; }; } // namespace zim #endif // ZIM_DIRENTREADER_H libzim-9.2.3/src/endian_tools.h000066400000000000000000000046371466367137100164610ustar00rootroot00000000000000/* * Copyright (C) 2018 Matthieu Gautier * Copyright (C) 2006 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ENDIAN_H #define ENDIAN_H #include #include namespace zim { template struct ToLittleEndianImpl; template struct ToLittleEndianImpl{ static void write(const T& d, char* dst) { uint16_t v = static_cast(d); dst[0] = static_cast(v); dst[1] = static_cast(v>>8); } }; template struct ToLittleEndianImpl{ static void write(const T& d, char* dst) { uint32_t v = static_cast(d); dst[0] = static_cast(v); dst[1] = static_cast(v>>8); dst[2] = static_cast(v>>16); dst[3] = static_cast(v>>24); } }; template struct ToLittleEndianImpl{ static void write(const T& d, char* dst) { uint64_t v = static_cast(d); dst[0] = static_cast(v); dst[1] = static_cast(v>>8); dst[2] = static_cast(v>>16); dst[3] = static_cast(v>>24); dst[4] = static_cast(v>>32); dst[5] = static_cast(v>>40); dst[6] = static_cast(v>>48); dst[7] = static_cast(v>>56); } }; //////////////////////////////////////////////////////////////////////// template inline void toLittleEndian(T d, char* dst) { ToLittleEndianImpl::write(d, dst); } template inline T fromLittleEndian(const char* ptr) { T ret = 0; for(size_t i=0; i(static_cast(ptr[i])) << (i*8)); } return ret; } } #endif // ENDIAN_H libzim-9.2.3/src/entry.cpp000066400000000000000000000044371466367137100154750ustar00rootroot00000000000000/* * Copyright (C) 2021 Renaud Gaudin * Copyright (C) 2020 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include #include #include #include "fileimpl.h" #include "log.h" log_define("zim.entry") using namespace zim; Entry::Entry(std::shared_ptr file, entry_index_type idx) : m_file(file), m_idx(idx), m_dirent(file->getDirent(entry_index_t(idx))) {} std::string Entry::getTitle() const { return m_dirent->getTitle(); } std::string Entry::getPath() const { if (m_file->hasNewNamespaceScheme()) { return m_dirent->getPath(); } else { return m_dirent->getLongPath(); } } bool Entry::isRedirect() const { return m_dirent->isRedirect(); } Item Entry::getItem(bool follow) const { if (isRedirect()) { if (!follow) throw InvalidType(Formatter() << "Entry " << getPath() << " is a redirect entry."); return getRedirect(); } return Item(*this); } Item Entry::getRedirect() const { auto nextEntry = getRedirectEntry(); auto watchdog = 50U; while (nextEntry.isRedirect() && --watchdog) { nextEntry = nextEntry.getRedirectEntry(); } return nextEntry.getItem(false); } entry_index_type Entry::getRedirectEntryIndex() const { if (!isRedirect()) throw InvalidType(Formatter() << "Entry " << getPath() << " is not a redirect entry."); return m_dirent->getRedirectIndex().v; } Entry Entry::getRedirectEntry() const { return Entry(m_file, getRedirectEntryIndex()); } libzim-9.2.3/src/envvalue.cpp000066400000000000000000000027141466367137100161550ustar00rootroot00000000000000/* * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the 
GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include namespace zim { unsigned envValue(const char* env, unsigned def) { const char* v = ::getenv(env); if (v) { std::istringstream s(v); s >> def; } return def; } unsigned envMemSize(const char* env, unsigned def) { const char* v = ::getenv(env); if (v) { char unit = '\0'; std::istringstream s(v); s >> def >> unit; switch (unit) { case 'k': case 'K': def *= 1024; break; case 'm': case 'M': def *= 1024 * 1024; break; case 'g': case 'G': def *= 1024 * 1024 * 1024; break; } } return def; } } libzim-9.2.3/src/envvalue.h000066400000000000000000000017511466367137100156220ustar00rootroot00000000000000/* * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_ENVVALUE_H #define ZIM_ENVVALUE_H namespace zim { unsigned envValue(const char* env, unsigned def); unsigned envMemSize(const char* env, unsigned def); } #endif // ZIM_ENVVALUE_H libzim-9.2.3/src/file_compound.cpp000066400000000000000000000073541466367137100171600ustar00rootroot00000000000000/* * Copyright (C) 2020-2021 Veloman Yunkan * Copyright (C) 2017-2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "file_compound.h" #include #include #include #include #include #ifdef _WIN32 # include #else # include #endif namespace zim { void FileCompound::addPart(FilePart* fpart) { const Range newRange(offset_t(_fsize.v), offset_t((_fsize+fpart->size()).v)); emplace(newRange, fpart); _fsize += fpart->size(); } std::shared_ptr FileCompound::openSinglePieceOrSplitZimFile(const std::string& original_filename) { std::shared_ptr fileCompound; bool multi_parts_asked = false; auto filename = original_filename; if (filename.size() > 6 && filename.substr(filename.size()-6) == ".zimaa") { filename.resize(filename.size()-2); multi_parts_asked = true; } else { try { fileCompound = std::make_shared(filename); } catch(...) { } } if ( !fileCompound ) { fileCompound = std::make_shared(filename, FileCompound::MultiPartToken::Multi); } if (fileCompound->empty()) { // We haven't found any part throw std::runtime_error(Formatter() << "Error opening " << (multi_parts_asked ? "as a split " : "") << "ZIM file: " << original_filename); } return fileCompound; } FileCompound::FileCompound(const std::string& filename): _filename(filename), _fsize(0) { addPart(new FilePart(filename)); } FileCompound::FileCompound(const std::string& base_filename, MultiPartToken _token): _filename(base_filename), _fsize(0) { try { for (char ch0 = 'a'; ch0 <= 'z'; ++ch0) { const std::string fname0 = base_filename + ch0; for (char ch1 = 'a'; ch1 <= 'z'; ++ch1) { addPart(new FilePart(fname0 + ch1)); } } } catch (std::runtime_error& e) { // This catch acts as a break for the double loop. 
} } #ifndef _WIN32 FileCompound::FileCompound(int fd): _filename(), _fsize(0) { addPart(new FilePart(fd)); } FileCompound::FileCompound(FdInput fd): _filename(), _fsize(0) { addPart(new FilePart(fd)); } FileCompound::FileCompound(const std::vector& fds): _filename(), _fsize(0) { for (auto& fd: fds) { addPart(new FilePart(fd)); } } #endif FileCompound::~FileCompound() { for(auto it=begin(); it!=end(); it++) { auto filepart = it->second; delete filepart; } } time_t FileCompound::getMTime() const { if (mtime || empty()) return mtime; const char* fname = begin()->second->filename().c_str(); #if defined(HAVE_STAT64) && ! defined(__APPLE__) struct stat64 st; int ret = ::stat64(fname, &st); #else struct stat st; int ret = ::stat(fname, &st); #endif if (ret != 0) throw std::runtime_error(Formatter() << "stat failed with errno " << errno << " : " << strerror(errno)); mtime = st.st_mtime; return mtime; } } // zim libzim-9.2.3/src/file_compound.h000066400000000000000000000070441466367137100166210ustar00rootroot00000000000000/* * Copyright (C) 2020-2021 Veloman Yunkan * Copyright (C) 2017-2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_FILE_COMPOUND_H_ #define ZIM_FILE_COMPOUND_H_ #include "file_part.h" #include "zim_types.h" #include "debug.h" #include "config.h" #include #include #include namespace zim { struct Range { Range(const offset_t min, const offset_t max) : min(min), max(max) { // ASSERT(min, <, max); } const offset_t min; const offset_t max; }; struct less_range { bool operator()(const Range& lhs, const Range& rhs) const { return lhs.min < rhs.min && lhs.max <= rhs.min; } }; class LIBZIM_PRIVATE_API FileCompound : private std::map { typedef std::map ImplType; public: // types typedef const_iterator PartIterator; typedef std::pair PartRange; enum class MultiPartToken { Multi }; public: // functions static std::shared_ptr openSinglePieceOrSplitZimFile(const std::string& filename); explicit FileCompound(const std::string& filename); explicit FileCompound(const std::string& filename, MultiPartToken token); #ifndef _WIN32 explicit FileCompound(int fd); explicit FileCompound(FdInput fd); explicit FileCompound(const std::vector& fds); #endif ~FileCompound(); using ImplType::begin; using ImplType::end; const std::string& filename() const { return _filename; } zsize_t fsize() const { return _fsize; }; time_t getMTime() const; bool fail() const { return empty(); }; bool is_multiPart() const { return size() > 1; }; PartIterator locate(offset_t offset) const { const PartIterator partIt = lower_bound(Range(offset, offset)); ASSERT(partIt != end(), ==, true); return partIt; } PartRange locate(offset_t offset, zsize_t size) const { const Range queryRange(offset, offset+size); // equal_range expects comparator to satisfy the `Compare` requirement. // (ie `comp(a, b) == !comp(b, a)`) which is not the case for `less_range` // If not satisfy, this is UB. 
// Even if UB, stdlib's equal_range behaves "correctly". // But libc++ (used in Apple, Android, ..) is not. // In all case, we are triggering a UB and it is to us to not call equal_range. // So let's use lower_bound and upper_bound which doesn't need such requirement. // See https://stackoverflow.com/questions/67042750/should-setequal-range-return-pair-setlower-bound-setupper-bound return {lower_bound(queryRange), upper_bound(queryRange)}; } private: // functions void addPart(FilePart* fpart); private: // data std::string _filename; zsize_t _fsize; mutable time_t mtime; }; }; #endif //ZIM_FILE_COMPOUND_H_ libzim-9.2.3/src/file_part.h000066400000000000000000000052401466367137100157370ustar00rootroot00000000000000/* * Copyright (C) 2020-2021 Veloman Yunkan * Copyright (C) 2017-2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_FILE_PART_H_ #define ZIM_FILE_PART_H_ #include #include #include #include "zim_types.h" #include "fs.h" namespace zim { /** A part of file. * * `FilePart` references a part(section) of a physical file. 
* Most of the time, `FilePart` will reference the whole file (m_offset==0 and m_size==m_fhandle->getSize()) * but in some situation, it can reference only a part of the file: * We have this case on android where the zim file is split in different part and stored in a "resource" (zip) archive * using no-compression. */ class FilePart { typedef DEFAULTFS FS; public: using FDSharedPtr = std::shared_ptr; public: explicit FilePart(const std::string& filename) : m_filename(filename), m_fhandle(std::make_shared(FS::openFile(filename))), m_offset(0), m_size(m_fhandle->getSize()) {} #ifndef _WIN32 explicit FilePart(int fd) : FilePart(getFilePathFromFD(fd)) {} explicit FilePart(FdInput fdInput): m_filename(getFilePathFromFD(fdInput.fd)), m_fhandle(std::make_shared(FS::openFile(m_filename))), m_offset(fdInput.offset), m_size(fdInput.size) {} #endif ~FilePart() = default; const std::string& filename() const { return m_filename; }; const FS::FD& fhandle() const { return *m_fhandle; }; const FDSharedPtr& shareable_fhandle() const { return m_fhandle; }; zsize_t size() const { return m_size; }; offset_t offset() const { return m_offset; } bool fail() const { return !m_size; }; bool good() const { return bool(m_size); }; private: const std::string m_filename; FDSharedPtr m_fhandle; offset_t m_offset; zsize_t m_size; // The total size of the (starting at m_offset) of the part }; }; #endif //ZIM_FILE_PART_H_ libzim-9.2.3/src/file_reader.cpp000066400000000000000000000235571466367137100166010ustar00rootroot00000000000000/* * Copyright (C) 2017-2021 Matthieu Gautier * Copyright (C) 2020 Veloman Yunkan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include #include #include "file_reader.h" #include "file_compound.h" #include "buffer.h" #include #include #include #include #include #include #include #ifndef _WIN32 # include # include #endif #if defined(_MSC_VER) # include # include typedef SSIZE_T ssize_t; #endif namespace { [[noreturn]] void throwSystemError(const std::string& errorText) { #ifdef _WIN32 // Windows doesn't use errno. throw std::system_error(std::error_code(), errorText); #else std::error_code ec(errno, std::generic_category()); throw std::system_error(ec, errorText); #endif } } namespace zim { //////////////////////////////////////////////////////////////////////////////// // MultiPartFileReader //////////////////////////////////////////////////////////////////////////////// MultiPartFileReader::MultiPartFileReader(std::shared_ptr source) : MultiPartFileReader(source, offset_t(0), source->fsize()) {} MultiPartFileReader::MultiPartFileReader(std::shared_ptr source, offset_t offset, zsize_t size) : BaseFileReader(offset, size), source(source) { ASSERT(offset.v, <=, source->fsize().v); ASSERT(offset.v+size.v, <=, source->fsize().v); } char MultiPartFileReader::readImpl(offset_t offset) const { offset += _offset; auto part_pair = source->locate(offset); auto& fhandle = part_pair->second->fhandle(); offset_t logical_local_offset = offset - part_pair->first.min; ASSERT(logical_local_offset, <=, part_pair->first.max); offset_t physical_local_offset = logical_local_offset + part_pair->second->offset(); char 
ret; try { fhandle.readAt(&ret, zsize_t(1), physical_local_offset); } catch (std::runtime_error& e) { //Error while reading. Formatter fmt; fmt << "Cannot read a char.\n"; fmt << " - File part is " << part_pair->second->filename() << "\n"; fmt << " - File part size is " << part_pair->second->size().v << "\n"; fmt << " - File part range is " << part_pair->first.min << "-" << part_pair->first.max << "\n"; fmt << " - Reading offset at " << offset.v << "\n"; fmt << " - logical local offset is " << logical_local_offset.v << "\n"; fmt << " - physical local offset is " << physical_local_offset.v << "\n"; fmt << " - error is " << e.what() << "\n"; throwSystemError(fmt); }; return ret; } void MultiPartFileReader::readImpl(char* dest, offset_t offset, zsize_t size) const { offset += _offset; auto found_range = source->locate(offset, size); for(auto current = found_range.first; current!=found_range.second; current++){ auto part = current->second; Range partRange = current->first; offset_t logical_local_offset = offset - partRange.min; ASSERT(size.v, >, 0U); zsize_t size_to_get = zsize_t(std::min(size.v, part->size().v-logical_local_offset.v)); offset_t physical_local_offset = logical_local_offset + part->offset(); try { part->fhandle().readAt(dest, size_to_get, physical_local_offset); } catch (std::runtime_error& e) { Formatter fmt; fmt << "Cannot read chars.\n"; fmt << " - File part is " << part->filename() << "\n"; fmt << " - File part size is " << part->size().v << "\n"; fmt << " - File part range is " << partRange.min << "-" << partRange.max << "\n"; fmt << " - size_to_get is " << size_to_get.v << "\n"; fmt << " - total size is " << size.v << "\n"; fmt << " - Reading offset at " << offset.v << "\n"; fmt << " - logical local offset is " << logical_local_offset.v << "\n"; fmt << " - physical local offset is " << physical_local_offset.v << "\n"; fmt << " - error is " << e.what() << "\n"; throwSystemError(fmt); }; ASSERT(size_to_get, <=, size); dest += size_to_get.v; size -= 
size_to_get; offset += size_to_get; } ASSERT(size.v, ==, 0U); } #ifdef ENABLE_USE_MMAP namespace { class MMapException : std::exception {}; char* mmapReadOnly(int fd, offset_type offset, size_type size) { #if defined(__APPLE__) || defined(__OpenBSD__) || defined(__HAIKU__) const auto MAP_FLAGS = MAP_PRIVATE; #elif defined(__FreeBSD__) const auto MAP_FLAGS = MAP_PRIVATE|MAP_PREFAULT_READ; #else const auto MAP_FLAGS = MAP_PRIVATE|MAP_POPULATE; #endif const auto p = (char*)mmap(NULL, size, PROT_READ, MAP_FLAGS, fd, offset); if (p == MAP_FAILED) { // mmap may fails for a lot of reason. // Most of them (io error, too big size...) may not recoverable but some of // them may be relative to mmap only and a "simple" read from the file would work. // Let's throw a MMapException to fallback to read (and potentially fail again there). throw MMapException(); } return p; } Buffer::DataPtr makeMmappedBuffer(int fd, offset_t offset, zsize_t size) { const offset_type pageAlignedOffset(offset.v & ~(sysconf(_SC_PAGE_SIZE) - 1)); const size_t alignmentAdjustment = offset.v - pageAlignedOffset; size += alignmentAdjustment; #if !MMAP_SUPPORT_64 if(pageAlignedOffset >= INT32_MAX) { throw MMapException(); } #endif char* const mmappedAddress = mmapReadOnly(fd, pageAlignedOffset, size.v); const auto munmapDeleter = [mmappedAddress, size](char* ) { munmap(mmappedAddress, size.v); }; return Buffer::DataPtr(mmappedAddress+alignmentAdjustment, munmapDeleter); } } // unnamed namespace #endif // ENABLE_USE_MMAP const Buffer BaseFileReader::get_buffer(offset_t offset, zsize_t size) const { ASSERT(size, <=, _size); #ifdef ENABLE_USE_MMAP try { return get_mmap_buffer(offset, size); } catch(MMapException& e) #endif { // We cannot do the mmap, for several possible reasons: // - Mmap offset is too big (>4GB on 32 bits) // - The range is several part // - We are on Windows. 
// - Mmap itself has failed // We will have to do some memory copies (or fail trying to) :/ // [TODO] Use Windows equivalent for mmap. auto ret_buffer = Buffer::makeBuffer(size); read(const_cast(ret_buffer.data()), offset, size); return ret_buffer; } } const Buffer MultiPartFileReader::get_mmap_buffer(offset_t offset, zsize_t size) const { #ifdef ENABLE_USE_MMAP auto found_range = source->locate(_offset + offset, size); auto first_part_containing_it = found_range.first; if (++first_part_containing_it != found_range.second) { throw MMapException(); } // The range is in only one part auto range = found_range.first->first; auto part = found_range.first->second; auto logical_local_offset = offset + _offset - range.min; ASSERT(size, <=, part->size()); int fd = part->fhandle().getNativeHandle(); auto physical_local_offset = logical_local_offset + part->offset(); return Buffer::makeBuffer(makeMmappedBuffer(fd, physical_local_offset, size), size); #else return Buffer::makeBuffer(size); // unreachable #endif } bool Reader::can_read(offset_t offset, zsize_t size) const { return (offset.v <= this->size().v && (offset.v+size.v) <= this->size().v); } std::unique_ptr MultiPartFileReader::sub_reader(offset_t offset, zsize_t size) const { ASSERT(offset.v+size.v, <=, _size.v); // TODO: can use a FileReader here if the new range fully belongs to a single part return std::unique_ptr(new MultiPartFileReader(source, _offset+offset, size)); } //////////////////////////////////////////////////////////////////////////////// // FileReader //////////////////////////////////////////////////////////////////////////////// FileReader::FileReader(FileHandle fh, offset_t offset, zsize_t size) : BaseFileReader(offset, size) , _fhandle(fh) { } char FileReader::readImpl(offset_t offset) const { offset += _offset; char ret; try { _fhandle->readAt(&ret, zsize_t(1), offset); } catch (std::runtime_error& e) { //Error while reading. 
Formatter fmt; fmt << "Cannot read a char.\n"; fmt << " - Reading offset at " << offset.v << "\n"; fmt << " - error is " << e.what() << "\n"; throwSystemError(fmt); }; return ret; } void FileReader::readImpl(char* dest, offset_t offset, zsize_t size) const { offset += _offset; try { _fhandle->readAt(dest, size, offset); } catch (std::runtime_error& e) { Formatter fmt; fmt << "Cannot read chars.\n"; fmt << " - Reading offset at " << offset.v << "\n"; fmt << " - size is " << size.v << "\n"; fmt << " - error is " << e.what() << "\n"; throwSystemError(fmt); }; } const Buffer FileReader::get_mmap_buffer(offset_t offset, zsize_t size) const { #ifdef ENABLE_USE_MMAP auto local_offset = offset + _offset; int fd = _fhandle->getNativeHandle(); return Buffer::makeBuffer(makeMmappedBuffer(fd, local_offset, size), size); #else return Buffer::makeBuffer(size); // unreachable #endif } std::unique_ptr FileReader::sub_reader(offset_t offset, zsize_t size) const { ASSERT(offset.v+size.v, <=, _size.v); return std::unique_ptr(new FileReader(_fhandle, _offset + offset, size)); } } // zim libzim-9.2.3/src/file_reader.h000066400000000000000000000061101466367137100162300ustar00rootroot00000000000000/* * Copyright (C) 2017-2021 Matthieu Gautier * Copyright (C) 2020 Veloman Yunkan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_FILE_READER_H_ #define ZIM_FILE_READER_H_ #include "reader.h" #include "fs.h" namespace zim { class FileCompound; class LIBZIM_PRIVATE_API BaseFileReader : public Reader { public: // functions BaseFileReader(offset_t offset, zsize_t size) : _offset(offset), _size(size) {} ~BaseFileReader() = default; zsize_t size() const override { return _size; }; offset_t offset() const override { return _offset; }; virtual const Buffer get_mmap_buffer(offset_t offset, zsize_t size) const = 0; const Buffer get_buffer(offset_t offset, zsize_t size) const override; protected: // data offset_t _offset; zsize_t _size; }; class LIBZIM_PRIVATE_API FileReader : public BaseFileReader { public: // types typedef std::shared_ptr FileHandle; public: // functions FileReader(FileHandle fh, offset_t offset, zsize_t size); ~FileReader() = default; const Buffer get_mmap_buffer(offset_t offset, zsize_t size) const override; std::unique_ptr sub_reader(offset_t offset, zsize_t size) const override; private: // functions char readImpl(offset_t offset) const override; void readImpl(char *dest, offset_t offset, zsize_t size) const override; private: // data // The file handle is stored via a shared pointer so that it can be shared // by a sub_reader (otherwise the file handle would be invalidated by // FD destructor when the sub-reader is destroyed). 
FileHandle _fhandle; }; class LIBZIM_PRIVATE_API MultiPartFileReader : public BaseFileReader { public: explicit MultiPartFileReader(std::shared_ptr source); ~MultiPartFileReader() {}; const Buffer get_mmap_buffer(offset_t offset, zsize_t size) const override; std::unique_ptr sub_reader(offset_t offset, zsize_t size) const override; private: // functions char readImpl(offset_t offset) const override; void readImpl(char *dest, offset_t offset, zsize_t size) const override; private: // data MultiPartFileReader(std::shared_ptr source, offset_t offset, zsize_t size); std::shared_ptr source; }; }; #endif // ZIM_FILE_READER_H_ libzim-9.2.3/src/fileheader.cpp000066400000000000000000000123741466367137100164230ustar00rootroot00000000000000/* * Copyright (C) 2017-2020 Mattieu Gautier * Copyright (C) 2008 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "fileheader.h" #include #include #include #include "log.h" #include "endian_tools.h" #include "reader.h" #include "bufferstreamer.h" #include "buffer.h" #ifdef _WIN32 # include "io.h" #else # include "unistd.h" # define _write(fd, addr, size) ::write((fd), (addr), (size)) #endif log_define("zim.file.header") namespace zim { const uint32_t Fileheader::zimMagic = 0x044d495a; // ="ZIM^d" const uint16_t Fileheader::zimOldMajorVersion = 5; const uint16_t Fileheader::zimMajorVersion = 6; const uint16_t Fileheader::zimMinorVersion = 2; const offset_type Fileheader::size = 80; // This is also mimeListPos (so an offset) Fileheader::Fileheader() : majorVersion(zimMajorVersion), minorVersion(zimMinorVersion), articleCount(0), titleIdxPos(0), pathPtrPos(0), clusterCount(0), clusterPtrPos(0), mainPage(std::numeric_limits::max()), layoutPage(std::numeric_limits::max()), checksumPos(std::numeric_limits::max()) {} void Fileheader::write(int out_fd) const { char header[Fileheader::size]; toLittleEndian(Fileheader::zimMagic, header); toLittleEndian(getMajorVersion(), header + 4); toLittleEndian(getMinorVersion(), header + 6); std::copy(getUuid().data, getUuid().data + sizeof(Uuid), header + 8); toLittleEndian(getArticleCount(), header + 24); toLittleEndian(getClusterCount(), header + 28); toLittleEndian(getPathPtrPos(), header + 32); toLittleEndian(getTitleIdxPos(), header + 40); toLittleEndian(getClusterPtrPos(), header + 48); toLittleEndian(getMimeListPos(), header + 56); toLittleEndian(getMainPage(), header + 64); toLittleEndian(getLayoutPage(), header + 68); toLittleEndian(getChecksumPos(), header + 72); auto ret = _write(out_fd, header, Fileheader::size); if (ret != Fileheader::size) { std::cerr << "Error Writing" << std::endl; std::cerr << "Ret is " << ret << 
std::endl; perror("Error writing"); throw std::runtime_error("Error writing"); } } void Fileheader::read(const Reader& reader) { auto buffer = reader.get_buffer(offset_t(0), zsize_t(Fileheader::size)); auto seqReader = BufferStreamer(buffer); uint32_t magicNumber = seqReader.read(); if (magicNumber != Fileheader::zimMagic) { log_error("invalid magic number " << magicNumber << " found - " << Fileheader::zimMagic << " expected"); throw ZimFileFormatError("Invalid magic number"); } uint16_t major_version = seqReader.read(); if (major_version != zimOldMajorVersion && major_version != zimMajorVersion) { log_error("invalid zimfile major version " << major_version << " found - " << Fileheader::zimMajorVersion << " expected"); throw ZimFileFormatError("Invalid version"); } setMajorVersion(major_version); setMinorVersion(seqReader.read()); Uuid uuid; std::copy(seqReader.current(), seqReader.current()+16, uuid.data); seqReader.skip(zsize_t(16)); setUuid(uuid); setArticleCount(seqReader.read()); setClusterCount(seqReader.read()); setPathPtrPos(seqReader.read()); setTitleIdxPos(seqReader.read()); setClusterPtrPos(seqReader.read()); setMimeListPos(seqReader.read()); setMainPage(seqReader.read()); setLayoutPage(seqReader.read()); setChecksumPos(seqReader.read()); sanity_check(); } void Fileheader::sanity_check() const { if (!!articleCount != !!clusterCount) { throw ZimFileFormatError("No article <=> No cluster"); } if (mimeListPos != size && mimeListPos != 72) { throw ZimFileFormatError("mimelistPos must be 80."); } if (pathPtrPos < mimeListPos) { throw ZimFileFormatError("pathPtrPos must be > mimelistPos."); } if (titleIdxPos < mimeListPos) { throw ZimFileFormatError("titleIdxPos must be > mimelistPos."); } if (clusterPtrPos < mimeListPos) { throw ZimFileFormatError("clusterPtrPos must be > mimelistPos."); } if (clusterCount > articleCount) { throw ZimFileFormatError("Cluster count cannot be higher than article count."); } if (checksumPos != 0 && checksumPos < mimeListPos) { 
// In-memory representation of the fixed-size header of a ZIM file.
//
// An instance is filled either field by field through the setters or in one
// go by read(); write() serialises it to a file descriptor.
class LIBZIM_PRIVATE_API Fileheader
{
  public:
    // Magic number expected at the very start of the header.
    static const uint32_t zimMagic;
    // The two major versions accepted by read().
    static const uint16_t zimOldMajorVersion;
    static const uint16_t zimMajorVersion;
    static const uint16_t zimMinorVersion;
    // Number of bytes read() fetches from the start of the file.
    static const size_type size;

  private:
    uint16_t majorVersion;
    uint16_t minorVersion;
    Uuid uuid;
    entry_index_type articleCount;
    offset_type titleIdxPos;
    offset_type pathPtrPos;
    offset_type mimeListPos;
    cluster_index_type clusterCount;
    offset_type clusterPtrPos;
    entry_index_type mainPage;
    entry_index_type layoutPage;
    offset_type checksumPos;

  public:
    Fileheader();

    // Serialise the header to the file descriptor `out_fd`.
    void write(int out_fd) const;
    // Decode the header from the beginning of `reader`, then run
    // sanity_check() on the result.
    void read(const Reader& reader);

    // Run some sanity checks on the header fields; throws a
    // ZimFileFormatError if something is wrong.
    void sanity_check() const;

    uint16_t getMajorVersion() const { return majorVersion; }
    void setMajorVersion(uint16_t v) { majorVersion = v; }

    uint16_t getMinorVersion() const { return minorVersion; }
    void setMinorVersion(uint16_t v) { minorVersion = v; }

    const Uuid& getUuid() const { return uuid; }
    void setUuid(const Uuid& uuid_) { uuid = uuid_; }

    entry_index_type getArticleCount() const { return articleCount; }
    void setArticleCount(entry_index_type s) { articleCount = s; }

    offset_type getTitleIdxPos() const { return titleIdxPos; }
    void setTitleIdxPos(offset_type p) { titleIdxPos = p; }

    offset_type getPathPtrPos() const { return pathPtrPos; }
    void setPathPtrPos(offset_type p) { pathPtrPos = p; }

    offset_type getMimeListPos() const { return mimeListPos; }
    void setMimeListPos(offset_type p) { mimeListPos = p; }

    cluster_index_type getClusterCount() const { return clusterCount; }
    void setClusterCount(cluster_index_type s) { clusterCount = s; }

    offset_type getClusterPtrPos() const { return clusterPtrPos; }
    void setClusterPtrPos(offset_type p) { clusterPtrPos = p; }

    // The maximum value of the index type is the "no main page" sentinel.
    bool hasMainPage() const { return mainPage != std::numeric_limits::max(); }
    entry_index_type getMainPage() const { return mainPage; }
    void setMainPage(entry_index_type s){ mainPage = s; }

    // Same sentinel convention as for the main page.
    bool hasLayoutPage() const { return layoutPage != std::numeric_limits::max(); }
    entry_index_type getLayoutPage() const { return layoutPage; }
    void setLayoutPage(entry_index_type s) { layoutPage = s; }

    // A checksum position is only considered present when the MIME-type
    // list starts at offset 80 or later.
    bool hasChecksum() const { return getMimeListPos() >= 80; }
    // Returns 0 when hasChecksum() is false, whatever value was stored.
    offset_type getChecksumPos() const { return hasChecksum() ? checksumPos : 0; }
    void setChecksumPos(offset_type p) { checksumPos = p; }
};
// Build the reader giving access to the content of `zimFile`.
//
//  - If the compound reports failure, no reader is created and nullptr is
//    returned (callers must check zimFile->fail() before using the result).
//  - If the compound is made of several parts, the reader is constructed
//    from the compound itself.
//  - Otherwise the reader is constructed from the file handle, offset and
//    size of the first (and only) part.
std::shared_ptr makeFileReader(std::shared_ptr zimFile)
{
  if (zimFile->fail()) {
    return nullptr;
  } else if ( zimFile->is_multiPart() ) {
    return std::make_shared(zimFile);
  } else {
    const auto& firstAndOnlyPart = zimFile->begin()->second;
    return std::make_shared(firstAndOnlyPart->shareable_fhandle(), firstAndOnlyPart->offset(), firstAndOnlyPart->size());
  }
}
result; } private: // functions std::vector getGroupBoundaries() const { std::vector groupIdCounts(maxGroupId_ - minGroupId_ + 1, 0); for ( const auto groupId : groupIds_ ) { ++groupIdCounts[groupId - minGroupId_]; } std::vector groupBoundaries(1, 0); std::partial_sum(groupIdCounts.begin(), groupIdCounts.end(), std::back_inserter(groupBoundaries) ); return groupBoundaries; } private: // types typedef std::vector GroupIds; private: // data const ObjectId firstObjectId_; GroupIds groupIds_; GroupId minGroupId_; GroupId maxGroupId_; }; } //unnamed namespace ////////////////////////////////////////////////////////////////////// // FileImpl // FileImpl::FileImpl(const std::string& fname) : FileImpl(FileCompound::openSinglePieceOrSplitZimFile(fname)) {} #ifndef _WIN32 FileImpl::FileImpl(int fd) : FileImpl(std::make_shared(fd)) {} FileImpl::FileImpl(FdInput fd) : FileImpl(std::make_shared(fd)) {} FileImpl::FileImpl(const std::vector& fds) : FileImpl(std::make_shared(fds)) {} #endif FileImpl::FileImpl(std::shared_ptr _zimFile) : zimFile(_zimFile), zimReader(makeFileReader(zimFile)), direntReader(new DirentReader(zimReader)), clusterCache(envValue("ZIM_CLUSTERCACHE", CLUSTER_CACHE_SIZE)), m_newNamespaceScheme(false), m_hasFrontArticlesIndex(true), m_startUserEntry(0), m_endUserEntry(0) { log_trace("read file \"" << zimFile->filename() << '"'); if (zimFile->fail()) throw ZimFileFormatError(std::string("can't open zim-file \"") + zimFile->filename() + '"'); // read header if (size_type(zimReader->size()) < Fileheader::size) { throw ZimFileFormatError("zim-file is too small to contain a header"); } try { header.read(*zimReader); } catch (ZimFileFormatError& e) { throw e; } catch (...) { throw ZimFileFormatError("error reading zim-file header."); } // This can happen for several reasons: // - Zim file is corrupted (corrupted header) // - Zim file is too small (ongoing download, truncated file...) 
// - Zim file is embedded at beginning of another file (and we try to open the file as a zim file) // If open through a FdInput, size should be set in FdInput. if (header.hasChecksum() && (header.getChecksumPos() + 16) != size_type(zimReader->size())) { throw ZimFileFormatError("Zim file(s) is of bad size or corrupted."); } auto pathPtrReader = sectionSubReader(*zimReader, "Dirent pointer table", offset_t(header.getPathPtrPos()), zsize_t(sizeof(offset_type)*header.getArticleCount())); mp_pathDirentAccessor.reset( new DirectDirentAccessor(direntReader, std::move(pathPtrReader), entry_index_t(header.getArticleCount()))); clusterOffsetReader = sectionSubReader(*zimReader, "Cluster pointer table", offset_t(header.getClusterPtrPos()), zsize_t(sizeof(offset_type)*header.getClusterCount())); quickCheckForCorruptFile(); mp_titleDirentAccessor = getTitleAccessor("listing/titleOrdered/v1"); if (!mp_titleDirentAccessor) { offset_t titleOffset(header.getTitleIdxPos()); zsize_t titleSize(sizeof(entry_index_type)*header.getArticleCount()); mp_titleDirentAccessor = getTitleAccessor(titleOffset, titleSize, "Title index table"); const_cast(m_hasFrontArticlesIndex) = false; } m_byTitleDirentLookup.reset(new ByTitleDirentLookup(mp_titleDirentAccessor.get())); readMimeTypes(); } std::unique_ptr FileImpl::getTitleAccessor(const std::string& path) { auto result = direntLookup().find('X', path); if (!result.first) { return nullptr; } auto dirent = mp_pathDirentAccessor->getDirent(result.second); auto cluster = getCluster(dirent->getClusterNumber()); if (cluster->isCompressed()) { // This is a ZimFileFormatError. 
// Let's be tolerent and skip the entry return nullptr; } auto titleOffset = getClusterOffset(dirent->getClusterNumber()) + cluster->getBlobOffset(dirent->getBlobNumber()); auto titleSize = cluster->getBlobSize(dirent->getBlobNumber()); return getTitleAccessor(titleOffset, titleSize, "Title index table" + path); } std::unique_ptr FileImpl::getTitleAccessor(const offset_t offset, const zsize_t size, const std::string& name) { auto titleIndexReader = sectionSubReader(*zimReader, name, offset, size); return std::unique_ptr( new IndirectDirentAccessor(mp_pathDirentAccessor, std::move(titleIndexReader), title_index_t(size.v/sizeof(entry_index_type)))); } FileImpl::DirentLookup& FileImpl::direntLookup() const { // Not using std::call_once because it is buggy. // 1. It doesn't play well with musl libc - an exception thrown by the // callable results in SIGABRT even if there is a handler for it higher // in the call stack. // 2. With `glibc` an exceptional execution of `std::call_once` doesn't // unlock the mutex associated with the `std::once_flag` object. 
if ( !m_direntLookup ) { std::lock_guard lock(m_direntLookupCreationMutex); if ( !m_direntLookup ) { const auto cacheSize = envValue("ZIM_DIRENTLOOKUPCACHE", DIRENT_LOOKUP_CACHE_SIZE); m_direntLookup.reset(new DirentLookup(mp_pathDirentAccessor.get(), cacheSize)); } } return *m_direntLookup; } void FileImpl::quickCheckForCorruptFile() { if (!getCountClusters()) log_warn("no clusters found"); else { offset_t lastOffset = getClusterOffset(cluster_index_t(cluster_index_type(getCountClusters()) - 1)); log_debug("last offset=" << lastOffset.v << " file size=" << getFilesize().v); if (lastOffset.v > getFilesize().v) { log_fatal("last offset (" << lastOffset << ") larger than file size (" << getFilesize() << ')'); throw ZimFileFormatError("last cluster offset larger than file size; file corrupt"); } } } offset_type FileImpl::getMimeListEndUpperLimit() const { offset_type result(header.getPathPtrPos()); result = std::min(result, header.getTitleIdxPos()); result = std::min(result, header.getClusterPtrPos()); if ( getCountArticles().v != 0 ) { // assuming that dirents are placed in the zim file in the same // order as the corresponding entries in the dirent pointer table result = std::min(result, mp_pathDirentAccessor->getOffset(entry_index_t(0)).v); // assuming that clusters are placed in the zim file in the same // order as the corresponding entries in the cluster pointer table result = std::min(result, readOffset(*clusterOffsetReader, 0).v); } return result; } void FileImpl::readMimeTypes() { // read mime types // libzim write zims files two ways : // - The old way by putting the pathPtrPos just after the mimetype. // - The new way by putting the pathPtrPos at the end of the zim files. // In this case, the cluster data are always at 1024 bytes offset and we // know that mimetype list is before this. // 1024 seems to be a good maximum size for the mimetype list, even for the // "old" way. 
const auto endMimeList = getMimeListEndUpperLimit(); if ( endMimeList <= header.getMimeListPos() ) { throw(ZimFileFormatError("Bad ZIM archive")); } const zsize_t size(endMimeList - header.getMimeListPos()); if ( endMimeList > 1024 ) { log_warn("The MIME-type list is abnormally large (" << size.v << " bytes)"); } auto buffer = zimReader->get_buffer(offset_t(header.getMimeListPos()), size); const char* const bufferEnd = buffer.data() + size.v; const char* p = buffer.data(); while (*p != '\0') { const char* zp = std::find(p, bufferEnd, '\0'); if (zp == bufferEnd) { throw(ZimFileFormatError("Error getting mimelists.")); } std::string mimeType(p, zp); mimeTypes.push_back(mimeType); p = zp+1; } const_cast(m_newNamespaceScheme) = header.getMinorVersion() >= 1; if (m_newNamespaceScheme) { const_cast(m_startUserEntry) = getNamespaceBeginOffset('C'); const_cast(m_endUserEntry) = getNamespaceEndOffset('C'); } else { const_cast(m_endUserEntry) = getCountArticles(); } } FileImpl::FindxResult FileImpl::findx(char ns, const std::string& path) { return direntLookup().find(ns, path); } FileImpl::FindxResult FileImpl::findx(const std::string& longPath) { char ns; std::string path; try { std::tie(ns, path) = parseLongPath(longPath); return findx(ns, path); } catch (...) 
// Fill m_articleListByCluster with the indexes of the user entries
// (range [getStartUserEntry(), getEndUserEntry())) re-arranged in cluster
// order.
//
// Each entry is labelled with a group id which is fed to a Grouping:
// entries whose MIME-type code is one of the redirect / linktarget / deleted
// codes are all put in group 0, every other entry is put in the group of its
// cluster number. Grouping returns the entry indexes sorted by group while
// preserving the original order inside each group.
//
// Only the raw bytes of each dirent are inspected here (no Dirent object is
// built).
void FileImpl::prepareArticleListByCluster() const
{
  const auto endIdx = getEndUserEntry().v;
  const auto startIdx = getStartUserEntry().v;
  Grouping g(startIdx, endIdx);
  for(auto i = startIdx; i < endIdx; i++)
  {
    // This is the offset of the dirent in the zimFile
    auto indexOffset = mp_pathDirentAccessor->getOffset(entry_index_t(i));
    // Get the mimeType of the dirent (offset 0) to know the type of the dirent
    uint16_t mimeType = zimReader->read_uint(indexOffset);
    if (mimeType==Dirent::redirectMimeType || mimeType==Dirent::linktargetMimeType || mimeType == Dirent::deletedMimeType) {
      g.add(0);
    } else {
      // If it is a classic article, get the clusterNumber (at offset 8)
      auto clusterNumber = zimReader->read_uint(indexOffset+offset_t(8));
      g.add(clusterNumber);
    }
  }
  m_articleListByCluster = g.getGroupedObjectIds();
}
// Return the cluster number `idx`, reading it from the archive only if it is
// not already in the cluster cache.
//
// Throws ZimFileFormatError if `idx` is not smaller than the cluster count.
std::shared_ptr FileImpl::getCluster(cluster_index_t idx)
{
  if (idx >= getCountClusters())
    throw ZimFileFormatError("cluster index out of range");

  // The callable is what the cache runs to obtain a cluster it does not hold.
  auto cluster = clusterCache.getOrPut(idx.v, [=](){ return readCluster(idx); });
#if ENV32BIT
  // There was a bug in the way we created zim files using ZSTD compression.
  // We were using a compression level that was too high, and so a window of
  // 128Mb. So at decompression, zstd reserves a 128Mb buffer.
  // While this memory is not really used (thanks to lazy allocation by the
  // OS), we are still consuming address space. On 32 bits this starts to be
  // a rare resource when we reserve 128Mb at once.
  // So we drop the cluster from the cache to avoid future memory allocation
  // errors.
  if (cluster->getCompression() == Cluster::Compression::Zstd) {
    // ZSTD compression started to be used with version 5.0 of the zim format.
    // Shortly after, we switched to 5.1 and integrated the fix in zstd
    // creation. 5.0 is not a perfect way to detect a faulty zim file (it
    // will generate false positives) but it should be enough.
    if (header.getMajorVersion() == 5 && header.getMinorVersion() == 0) {
      clusterCache.drop(idx.v);
    }
  }
#endif
  return cluster;
}
{ log_warn("error reading checksum"); return std::string(); } } bool FileImpl::verify() { if (!header.hasChecksum()) return false; struct zim_MD5_CTX md5ctx; zim_MD5Init(&md5ctx); unsigned char ch[CHUNK_SIZE]; offset_type checksumPos = header.getChecksumPos(); offset_type toRead = checksumPos; for(auto part = zimFile->begin(); part != zimFile->end(); part++) { std::ifstream stream(part->second->filename(), std::ios_base::in|std::ios_base::binary); while(toRead>=CHUNK_SIZE && stream.read(reinterpret_cast(ch),CHUNK_SIZE).good()) { zim_MD5Update(&md5ctx, ch, CHUNK_SIZE); toRead-=CHUNK_SIZE; } // Previous read was good, so we have exited the previous `while` because // `toRead(ch),toRead); } // It updates the checksum with the remaining amount of data when we // reach the end of the file or part zim_MD5Update(&md5ctx, ch, stream.gcount()); toRead-=stream.gcount(); if (stream.bad()) { perror("error while reading file"); return false; } if (!toRead) { break; } } if (toRead) { return false; } unsigned char chksumCalc[16]; auto chksumFile = zimReader->get_buffer(offset_t(header.getChecksumPos()), zsize_t(16)); zim_MD5Final(chksumCalc, &md5ctx); if (std::memcmp(chksumFile.data(), chksumCalc, 16) != 0) { return false; } return true; } time_t FileImpl::getMTime() const { return zimFile->getMTime(); } zim::zsize_t FileImpl::getFilesize() const { return zimReader->size(); } bool FileImpl::is_multiPart() const { return zimFile->is_multiPart(); } bool FileImpl::checkIntegrity(IntegrityCheck checkType) { switch(checkType) { case IntegrityCheck::CHECKSUM: return FileImpl::checkChecksum(); case IntegrityCheck::DIRENT_PTRS: return FileImpl::checkDirentPtrs(); case IntegrityCheck::DIRENT_ORDER: return FileImpl::checkDirentOrder(); case IntegrityCheck::TITLE_INDEX: return FileImpl::checkTitleIndex(); case IntegrityCheck::CLUSTER_PTRS: return FileImpl::checkClusterPtrs(); case IntegrityCheck::CLUSTERS_OFFSETS: return FileImpl::checkClusters(); case IntegrityCheck::DIRENT_MIMETYPES: 
return FileImpl::checkDirentMimeTypes(); case IntegrityCheck::COUNT: ASSERT("shouldn't have reached here", ==, ""); } return false; } bool FileImpl::checkChecksum() { if ( ! verify() ) { std::cerr << "Checksum doesn't match" << std::endl; return false; } return true; } bool FileImpl::checkDirentPtrs() { const entry_index_type articleCount = getCountArticles().v; const offset_t validDirentRangeStart(80); // XXX: really??? const offset_t validDirentRangeEnd = header.hasChecksum() ? offset_t(header.getChecksumPos()) : offset_t(zimReader->size().v); const zsize_t direntMinSize(11); for ( entry_index_type i = 0; i < articleCount; ++i ) { const auto offset = mp_pathDirentAccessor->getOffset(entry_index_t(i)); if ( offset < validDirentRangeStart || offset + direntMinSize > validDirentRangeEnd ) { std::cerr << "Invalid dirent pointer" << std::endl; return false; } } return true; } bool FileImpl::checkDirentOrder() { const entry_index_type articleCount = getCountArticles().v; std::shared_ptr prevDirent; for ( entry_index_type i = 0; i < articleCount; ++i ) { const std::shared_ptr dirent = mp_pathDirentAccessor->getDirent(entry_index_t(i)); if ( prevDirent && !(prevDirent->getLongPath() < dirent->getLongPath()) ) { std::cerr << "Dirent table is not properly sorted:\n" << " #" << i-1 << ": " << prevDirent->getLongPath() << "\n" << " #" << i << ": " << dirent->getLongPath() << std::endl; return false; } prevDirent = dirent; } return true; } bool FileImpl::checkClusters() { const cluster_index_type clusterCount = getCountClusters().v; for ( cluster_index_type i = 0; i < clusterCount; ++i ) { // Force a read of each clusters (which will throw ZimFileFormatError in case of error) try { readCluster(cluster_index_t(i)); } catch (ZimFileFormatError& e) { std::cerr << e.what() << std::endl; return false; } } return true; } bool FileImpl::checkClusterPtrs() { const cluster_index_type clusterCount = getCountClusters().v; const offset_t validClusterRangeStart(80); // XXX: really??? 
const offset_t validClusterRangeEnd = header.hasChecksum() ? offset_t(header.getChecksumPos()) : offset_t(zimReader->size().v); const zsize_t clusterMinSize(1); // XXX for ( cluster_index_type i = 0; i < clusterCount; ++i ) { const auto offset = readOffset(*clusterOffsetReader, i); if ( offset < validClusterRangeStart || offset + clusterMinSize > validClusterRangeEnd ) { std::cerr << "Invalid cluster pointer" << std::endl; return false; } } return true; } namespace { std::string pseudoTitle(const Dirent& d) { return std::string(1, d.getNamespace()) + '/' + d.getTitle(); } bool checkTitleListing(const IndirectDirentAccessor& accessor, entry_index_type totalCount) { const entry_index_type direntCount = accessor.getDirentCount().v; std::shared_ptr prevDirent; for ( entry_index_type i = 0; i < direntCount; ++i ) { if (accessor.getDirectIndex(title_index_t(i)).v >= totalCount) { std::cerr << "Invalid title index entry." << std::endl; return false; } const std::shared_ptr dirent = accessor.getDirent(title_index_t(i)); if ( prevDirent && !(pseudoTitle(*prevDirent) <= pseudoTitle(*dirent)) ) { std::cerr << "Title index is not properly sorted." 
<< std::endl; return false; } prevDirent = dirent; } return true; } } // unnamed namespace bool FileImpl::checkTitleIndex() { const entry_index_type articleCount = getCountArticles().v; offset_t titleOffset(header.getTitleIdxPos()); zsize_t titleSize(sizeof(entry_index_type)*header.getArticleCount()); auto titleDirentAccessor = getTitleAccessor(titleOffset, titleSize, "Full Title index table"); auto ret = checkTitleListing(*titleDirentAccessor, articleCount); titleDirentAccessor = getTitleAccessor("listing/titleOrdered/v1"); if (titleDirentAccessor) { ret &= checkTitleListing(*titleDirentAccessor, articleCount); } return ret; } bool FileImpl::checkDirentMimeTypes() { const entry_index_type articleCount = getCountArticles().v; for ( entry_index_type i = 0; i < articleCount; ++i ) { const auto dirent = mp_pathDirentAccessor->getDirent(entry_index_t(i)); if ( dirent->isArticle() && dirent->getMimeType() >= mimeTypes.size() ) { std::cerr << "Entry " << dirent->getLongPath() << " has invalid MIME-type value " << dirent->getMimeType() << "." << std::endl; return false; } } return true; } } libzim-9.2.3/src/fileimpl.h000066400000000000000000000146611466367137100156020ustar00rootroot00000000000000/* * Copyright (C) 2017-2021 Matthieu Gautier * Copyright (C) 2020-2021 Veloman Yunkan * Copyright (C) 2006 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
// Implementation object behind an opened ZIM archive: owns the readers, the
// decoded header, the dirent accessors/lookups and the cluster cache.
class FileImpl
{
    // The underlying file(s) and the readers built on top of them.
    std::shared_ptr zimFile;
    std::shared_ptr zimReader;
    std::shared_ptr direntReader;
    Fileheader header;

    // Reader over the cluster pointer table.
    std::unique_ptr clusterOffsetReader;

    // Dirent access in path order and in title order.
    std::shared_ptr mp_pathDirentAccessor;
    std::unique_ptr mp_titleDirentAccessor;

    typedef std::shared_ptr ClusterHandle;
    ConcurrentCache clusterCache;

    // Declared const, but their final values are assigned through const_cast
    // while the archive is being opened (constructor and readMimeTypes()).
    const bool m_newNamespaceScheme;
    const bool m_hasFrontArticlesIndex;
    const entry_index_t m_startUserEntry;
    const entry_index_t m_endUserEntry;

    typedef std::vector MimeTypes;
    MimeTypes mimeTypes;

    // Built on first use by getIndexByClusterOrder(), under the mutex.
    mutable std::vector m_articleListByCluster;
    mutable std::mutex m_articleListByClusterMutex;

    // Lookup by path: the key of a dirent is its path.
    struct DirentLookupConfig
    {
      typedef DirectDirentAccessor DirentAccessorType;
      typedef entry_index_t index_t;
      static const std::string& getDirentKey(const Dirent& d) {
        return d.getPath();
      }
    };

    using DirentLookup = zim::FastDirentLookup;
    // Created on first use by direntLookup(), under the mutex.
    mutable std::unique_ptr m_direntLookup;
    mutable std::mutex m_direntLookupCreationMutex;

    // Lookup by title: the key of a dirent is its title.
    struct ByTitleDirentLookupConfig
    {
      typedef IndirectDirentAccessor DirentAccessorType;
      typedef title_index_t index_t;
      static const std::string& getDirentKey(const Dirent& d) {
        return d.getTitle();
      }
    };

    using ByTitleDirentLookup = zim::DirentLookup;
    std::unique_ptr m_byTitleDirentLookup;

  public:
    using FindxResult = std::pair;
    using FindxTitleResult = std::pair;

    // Open the archive stored in the file `fname`, which may be a single
    // piece or split into parts.
    explicit FileImpl(const std::string& fname);
#ifndef _WIN32
    // Open the archive from already opened file descriptor(s).
    explicit FileImpl(int fd);
    explicit FileImpl(FdInput fd);
    explicit FileImpl(const std::vector& fds);
#endif

    time_t getMTime() const;

    const std::string& getFilename() const { return zimFile->filename(); }
    const Fileheader& getFileheader() const { return header; }
    zsize_t getFilesize() const;
    bool hasNewNamespaceScheme() const { return m_newNamespaceScheme; }
    bool hasFrontArticlesIndex() const { return m_hasFrontArticlesIndex; }

    FileCompound::PartRange getFileParts(offset_t offset, zsize_t size);
    std::shared_ptr getDirent(entry_index_t idx);
    std::shared_ptr getDirentByTitle(title_index_t idx);
    entry_index_t getIndexByTitle(title_index_t idx) const;
    // Throws std::out_of_range if `idx` is past the end of the list of
    // entries sorted by cluster.
    entry_index_t getIndexByClusterOrder(entry_index_t idx) const;
    entry_index_t getCountArticles() const { return entry_index_t(header.getArticleCount()); }

    // The first member of the result tells whether an exact match was found.
    FindxResult findx(char ns, const std::string &path);
    // Same, for a long path (namespace and path in one string); a long path
    // which cannot be parsed yields { false, 0 }.
    FindxResult findx(const std::string &path);
    FindxTitleResult findxByTitle(char ns, const std::string& title);

    // Throws ZimFileFormatError if `idx` is out of range.
    std::shared_ptr getCluster(cluster_index_t idx);
    cluster_index_t getCountClusters() const { return cluster_index_t(header.getClusterCount()); }
    offset_t getClusterOffset(cluster_index_t idx) const;
    // Returns offset 0 when the cluster is compressed.
    offset_t getBlobOffset(cluster_index_t clusterIdx, blob_index_t blobIdx);

    entry_index_t getNamespaceBeginOffset(char ch) const;
    entry_index_t getNamespaceEndOffset(char ch) const;
    entry_index_t getNamespaceEntryCount(char ch) const {
      return getNamespaceEndOffset(ch) - getNamespaceBeginOffset(ch);
    }

    entry_index_t getStartUserEntry() const { return m_startUserEntry; }
    entry_index_t getEndUserEntry() const { return m_endUserEntry; }
    // The number of entries added by the creator (so excluding index, ...).
    // With the new namespace scheme, this is the number of entries in the
    // C namespace.
    entry_index_t getUserEntryCount() const { return m_endUserEntry - m_startUserEntry; }
    // The number of entries that can be considered as front articles
    // (no resource).
    entry_index_t getFrontEntryCount() const;

    // Throws ZimFileFormatError if `idx` is not a known MIME-type code.
    const std::string& getMimeType(uint16_t idx) const;

    // Hexadecimal form of the stored checksum; empty string if the archive
    // has no checksum or if it cannot be read.
    std::string getChecksum();
    // Recompute the checksum and compare it with the stored one.
    bool verify();
    bool is_multiPart() const;

    bool checkIntegrity(IntegrityCheck checkType);
  private:
    explicit FileImpl(std::shared_ptr zimFile);
    FileImpl(std::shared_ptr zimFile, offset_t offset, zsize_t size);

    // Returns nullptr if no entry 'X'/`path` exists or if its content is
    // stored in a compressed cluster.
    std::unique_ptr getTitleAccessor(const std::string& path);
    std::unique_ptr getTitleAccessor(const offset_t offset, const zsize_t size, const std::string& name);

    void prepareArticleListByCluster() const;
    DirentLookup& direntLookup() const;
    ClusterHandle readCluster(cluster_index_t idx);
    offset_type getMimeListEndUpperLimit() const;
    void readMimeTypes();
    void quickCheckForCorruptFile();

    // One function per IntegrityCheck value, dispatched by checkIntegrity().
    bool checkChecksum();
    bool checkDirentPtrs();
    bool checkDirentOrder();
    bool checkTitleIndex();
    bool checkClusterPtrs();
    bool checkClusters();
    bool checkDirentMimeTypes();
};
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_FS_H_ #define ZIM_FS_H_ #ifdef _WIN32 # include "fs_windows.h" #else # include "fs_unix.h" #endif namespace zim { #ifdef _WIN32 using DEFAULTFS = windows::FS; #else using DEFAULTFS = unix::FS; #endif }; #endif //ZIM_FS_H_ libzim-9.2.3/src/fs_unix.cpp000066400000000000000000000066141466367137100160060ustar00rootroot00000000000000/* * Copyright (C) 2018 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "fs_unix.h" #include #include #include #include #include #include #include #include #include namespace zim { namespace unix { zsize_t FD::readAt(char* dest, zsize_t size, offset_t offset) const { #if defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__) || defined(__HAIKU__) # define PREAD pread #else # define PREAD pread64 #endif ssize_t full_size_read = 0; auto size_to_read = size.v; auto current_offset = offset.v; errno = 0; while (size_to_read > 0) { auto size_read = PREAD(m_fd, dest, size_to_read, current_offset); if (size_read == 0) { throw std::runtime_error("Cannot read past the end of the file"); } if (size_read == -1) { throw std::runtime_error("Cannot read file"); } size_to_read -= size_read; current_offset += size_read; full_size_read += size_read; } return zsize_t(full_size_read); #undef PREAD } zsize_t FD::getSize() const { struct stat sb; fstat(m_fd, &sb); return zsize_t(sb.st_size); } bool FD::seek(offset_t offset) { return static_cast(offset.v) == lseek(m_fd, offset.v, SEEK_SET); } bool FD::close() { if (m_fd != -1) { return ::close(m_fd); } return -1; } FD FS::openFile(path_t filepath) { int fd = open(filepath.c_str(), O_RDONLY); if (fd == -1) { const std::string errorStr = strerror(errno); throw std::runtime_error("Error opening file: " + filepath + ": " + errorStr); } return FD(fd); } bool FS::makeDirectory(path_t path) { return !mkdir(path.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); } void FS::rename(path_t old_path, path_t new_path) { ::rename(old_path.c_str(), new_path.c_str()); } std::string FS::join(path_t base, path_t name) { return base + "/" + name; } bool FS::remove(path_t path) { DIR* dir; /* It's a directory, remove all its entries first */ if ((dir = opendir(path.c_str())) != NULL) { struct dirent* 
ent; while ((ent = readdir(dir)) != NULL) { std::string childName = ent->d_name; if (childName != "." && childName != "..") { auto childPath = join(path, childName); remove(childPath); } } closedir(dir); return removeDir(path); } /* It's a file */ else { return removeFile(path); } } bool FS::removeDir(path_t path) { return rmdir(path.c_str()); } bool FS::removeFile(path_t path) { return ::remove(path.c_str()); } }; // unix namespace std::string getFilePathFromFD(int fd) { return Formatter() << "/dev/fd/" << fd; } }; // zim namespace libzim-9.2.3/src/fs_unix.h000066400000000000000000000042511466367137100154460ustar00rootroot00000000000000/* * Copyright (C) 2018 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_FS_UNIX_H_ #define ZIM_FS_UNIX_H_ #include "zim_types.h" #include #include #include #include #include namespace zim { namespace unix { using path_t = const std::string&; class FD { public: using fd_t = int; private: fd_t m_fd = -1; public: FD() = default; FD(fd_t fd): m_fd(fd) {}; FD(const FD& o) = delete; FD(FD&& o) : m_fd(o.m_fd) { o.m_fd = -1; } FD& operator=(FD&& o) { m_fd = o.m_fd; o.m_fd = -1; return *this; } ~FD() { close(); } zsize_t readAt(char* dest, zsize_t size, offset_t offset) const; zsize_t getSize() const; fd_t getNativeHandle() const { return m_fd; } fd_t release() { int ret = m_fd; m_fd = -1; return ret; } bool seek(offset_t offset); bool close(); }; struct FS { using FD = zim::unix::FD; static std::string join(path_t base, path_t name); static FD openFile(path_t filepath); static bool makeDirectory(path_t path); static void rename(path_t old_path, path_t new_path); static bool remove(path_t path); static bool removeDir(path_t path); static bool removeFile(path_t path); }; }; // unix namespace std::string getFilePathFromFD(int fd); }; // zim namespace #endif //ZIM_FS_UNIX_H_ libzim-9.2.3/src/fs_windows.cpp000066400000000000000000000120171466367137100165070ustar00rootroot00000000000000/* * Copyright (C) 2018 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. 
See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "fs_windows.h" #include #include #include #include #include #include #include #include namespace zim { namespace windows { struct ImplFD { HANDLE m_handle = INVALID_HANDLE_VALUE; CRITICAL_SECTION m_criticalSection; ImplFD() { InitializeCriticalSection(&m_criticalSection); } ImplFD(HANDLE handle) : m_handle(handle) { InitializeCriticalSection(&m_criticalSection); } ~ImplFD() { DeleteCriticalSection(&m_criticalSection); } }; FD::FD() : mp_impl(new ImplFD()) {} FD::FD(fd_t handle) : mp_impl(new ImplFD(handle)) {} FD::FD(FD&& o) = default; FD& FD::operator=(FD&& o) = default; FD::~FD() { if (mp_impl) close(); } zsize_t FD::readAt(char* dest, zsize_t size, offset_t offset) const { if (!mp_impl) throw std::runtime_error("FD is not open"); EnterCriticalSection(&mp_impl->m_criticalSection); LARGE_INTEGER off; off.QuadPart = offset.v; std::string errorMsg; auto size_to_read = size.v; if (!SetFilePointerEx(mp_impl->m_handle, off, NULL, FILE_BEGIN)) { errorMsg = "Seek fail"; goto err; } DWORD size_read; while (size_to_read > 0) { // Read by batch < 4GiB // Lets use a batch of 1GiB auto batch_to_read = std::min(size_to_read, (size_type)1024*1024*1024); if (!ReadFile(mp_impl->m_handle, dest, batch_to_read, &size_read, NULL)) { errorMsg = "Read fail"; goto err; } if (size_read == 0) { errorMsg = "Cannot read past the end of the file"; goto err; } size_to_read -= size_read; dest += size_read; } LeaveCriticalSection(&mp_impl->m_criticalSection); return size; err: LeaveCriticalSection(&mp_impl->m_criticalSection); throw std::runtime_error(errorMsg); } bool FD::seek(offset_t offset) { if(!mp_impl) return false; LARGE_INTEGER off; off.QuadPart = offset.v; return SetFilePointerEx(mp_impl->m_handle, off, NULL, 
FILE_BEGIN); } zsize_t FD::getSize() const { if(!mp_impl) return zsize_t(0); LARGE_INTEGER size; if (!GetFileSizeEx(mp_impl->m_handle, &size)) { size.QuadPart = 0; } return zsize_t(size.QuadPart); } int FD::release() { if(!mp_impl) return -1; int ret = _open_osfhandle(reinterpret_cast(mp_impl->m_handle), 0); mp_impl->m_handle = INVALID_HANDLE_VALUE; return ret; } bool FD::close() { if (!mp_impl || mp_impl->m_handle == INVALID_HANDLE_VALUE) { return false; } return CloseHandle(mp_impl->m_handle); } std::unique_ptr FS::toWideChar(path_t path) { auto size = MultiByteToWideChar(CP_UTF8, 0, path.c_str(), -1, nullptr, 0); auto wdata = std::unique_ptr(new wchar_t[size]); auto ret = MultiByteToWideChar(CP_UTF8, 0, path.c_str(), -1, wdata.get(), size); if (0 == ret) throw std::runtime_error(Formatter() << "Cannot convert path to wchar : " << GetLastError()); return wdata; } FD FS::openFile(path_t filepath) { auto wpath = toWideChar(filepath); FD::fd_t handle; handle = CreateFileW(wpath.get(), GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY|FILE_FLAG_RANDOM_ACCESS, NULL); if (handle == INVALID_HANDLE_VALUE) throw std::runtime_error(Formatter() << "Cannot open file : " << GetLastError()); return FD(handle); } bool FS::makeDirectory(path_t path) { auto wpath = toWideChar(path); auto ret = CreateDirectoryW(wpath.get(), NULL); return ret; } void FS::rename(path_t old_path, path_t new_path) { auto ret = MoveFileExW(toWideChar(old_path).get(), toWideChar(new_path).get(), MOVEFILE_REPLACE_EXISTING|MOVEFILE_WRITE_THROUGH); if (!ret) throw std::runtime_error(Formatter() << "Cannot move file " << old_path << " to " << new_path); } std::string FS::join(path_t base, path_t name) { return base + "\\" + name; } bool FS::removeDir(path_t path) { return RemoveDirectoryW(toWideChar(path).get()); } bool FS::removeFile(path_t path) { return DeleteFileW(toWideChar(path).get()); } }; // windows namespace }; // zim namespace 
libzim-9.2.3/src/fs_windows.h000066400000000000000000000037671466367137100161700ustar00rootroot00000000000000/*
 * Copyright (C) 2018 Matthieu Gautier
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
 * NON-INFRINGEMENT.  See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#ifndef ZIM_FS_WINDOWS_H_
#define ZIM_FS_WINDOWS_H_

#include "zim_types.h"
#include "config.h"

// NOTE(review): the header name below is missing in this copy of the file
// (text between '<' and '>' appears to have been stripped, which also affects
// the std::unique_ptr template arguments further down).
#include

// Declared here so that this header does not need the Windows SDK headers.
typedef void* HANDLE;

namespace zim
{

namespace windows {

using path_t = const std::string&;

// Defined in fs_windows.cpp; keeps the native details out of this header.
struct ImplFD;

// Move-only wrapper around a Windows file handle.
class LIBZIM_PRIVATE_API FD {
  public:
    typedef HANDLE fd_t;
  private:
    std::unique_ptr mp_impl;

  public:
    FD();
    FD(fd_t handle);
    FD(const FD& o) = delete;
    FD(FD&& o);
    FD& operator=(FD&& o);
    FD& operator=(const FD& o) = delete;
    ~FD();
    // Reads `size` bytes at `offset` into `dest`.
    zsize_t readAt(char* dest, zsize_t size, offset_t offset) const;
    zsize_t getSize() const;
    // Unlike the POSIX backend (which returns its fd_t), this returns an int.
    int     release();
    bool    seek(offset_t offset);
    bool    close();
};

// Static filesystem helpers for Windows (see zim::DEFAULTFS).
struct LIBZIM_PRIVATE_API FS {
    using FD = zim::windows::FD;
    static std::string join(path_t base, path_t name);
    // Converts a path to a wide-character string for the W-suffixed Win32 APIs.
    static std::unique_ptr toWideChar(path_t path);
    static FD   openFile(path_t filepath);
    static bool makeDirectory(path_t path);
    static void rename(path_t old_path, path_t new_path);
    static bool remove(path_t path);
    static bool removeDir(path_t path);
    static bool removeFile(path_t path);
};

}; // windows namespace

}; // zim namespace

#endif //ZIM_FS_WINDOWS_H_
libzim-9.2.3/src/istreamreader.cpp000066400000000000000000000025221466367137100171540ustar00rootroot00000000000000/*
 * Copyright (C) 2020 Matthieu Gautier
 * Copyright (C) 2020 Veloman Yunkan
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
 * NON-INFRINGEMENT.  See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#include "istreamreader.h"
#include "buffer_reader.h"

namespace zim
{

////////////////////////////////////////////////////////////////////////////////
// IStreamReader
////////////////////////////////////////////////////////////////////////////////

// Default implementation: copies the next `size` bytes of the stream into a
// freshly allocated buffer (via readImpl()) and returns a BufferReader over
// that buffer.
// NOTE(review): the template arguments of std::unique_ptr and the target type
// of const_cast are missing in this copy of the file (text between '<' and
// '>' appears to have been stripped).
std::unique_ptr
IStreamReader::sub_reader(zsize_t size)
{
  auto buffer = Buffer::makeBuffer(size);
  // The buffer is filled in place, hence the cast on its data pointer.
  readImpl(const_cast(buffer.data()), size);
  return std::unique_ptr(new BufferReader(buffer));
}

} // namespace zim
libzim-9.2.3/src/istreamreader.h000066400000000000000000000053751466367137100166320ustar00rootroot00000000000000/*
 * Copyright (C) 2020 Matthieu Gautier
 * Copyright (C) 2020 Veloman Yunkan
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
*
 * This program is distributed in the hope that it will be useful, but
 * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
 * NON-INFRINGEMENT.  See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#ifndef ZIM_IDATASTREAM_H
#define ZIM_IDATASTREAM_H

#include "endian_tools.h"
#include "reader.h"

// NOTE(review): the header name below is missing in this copy of the file
// (text between '<' and '>' appears to have been stripped, which also affects
// the template declarations and arguments further down).
#include

namespace zim
{

// IStreamReader is a simple interface for sequential iteration over a stream
// of values of built-in/primitive types and/or opaque binary objects (blobs).
// An example usage:
//
//   void foo(IStreamReader& s)
//   {
//     const uint32_t n = s.read<uint32_t>();
//     for(uint32_t i=0; i < n; ++i)
//     {
//        const uint16_t blobSize = s.read<uint16_t>();
//        auto blob = s.sub_reader(zsize_t(blobSize));
//        bar(*blob, blobSize);
//     }
//   }
//
class LIBZIM_PRIVATE_API IStreamReader
{
public: // functions
  virtual ~IStreamReader() = default;

  // Reads a value of the said type from the stream
  //
  // For best portability this function should be used with types of known
  // bit-width (int32_t, uint16_t, etc) rather than builtin types with
  // unknown bit-width (int, unsigned, etc).
  template T read();

  // Reads a blob of the specified size from the stream and returns a reader
  // over it. Virtual: the base implementation lives in istreamreader.cpp.
  virtual std::unique_ptr sub_reader(zsize_t size);

private: // virtual methods
  // Reads exactly 'nbytes' bytes into the provided buffer 'buf'
  // (which must be at least that big). Throws an exception if
  // more bytes are requested than can be retrieved.
  virtual void readImpl(char* buf, zsize_t nbytes) = 0;
};

////////////////////////////////////////////////////////////////////////////////
// Implementation of IStreamReader
////////////////////////////////////////////////////////////////////////////////

// XXX: Assuming that opaque binary data retrieved via 'readImpl()'
// XXX: is encoded in little-endian form.
// Reads sizeof(T) bytes through readImpl() and decodes them with
// fromLittleEndian().
template
inline T
IStreamReader::read()
{
  constexpr size_type N(sizeof(T));
  char buf[N];
  readImpl(buf, zsize_t(N));
  return fromLittleEndian(buf); // XXX: This handles only integral types
}

} // namespace zim

#endif // ZIM_IDATASTREAM_H
libzim-9.2.3/src/item.cpp000066400000000000000000000054541466367137100152720ustar00rootroot00000000000000/*
 * Copyright (C) 2021 Veloman Yunkan
 * Copyright (C) 2020 Matthieu Gautier
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
 * NON-INFRINGEMENT.  See the GNU General Public License for more details.
*
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#define ZIM_PRIVATE
// NOTE(review): two header names are missing in this copy of the file (text
// between '<' and '>' appears to have been stripped, which also affects the
// std::pair return type of getDirectAccessInformation() below).
#include
#include "cluster.h"
#include "fileimpl.h"
#include "log.h"

#include

log_define("zim.item")

using namespace zim;

// Builds an Item from an Entry. The entry must not be a redirect; this is
// only checked by an assert (i.e. not in builds where asserts are disabled).
Item::Item(const Entry& entry)
  : Entry(entry)
{
  assert(!entry.isRedirect());
}

// Resolves the dirent's mime-type index through the archive's mime-type list.
std::string Item::getMimetype() const
{
  return m_file->getMimeType(m_dirent->getMimeType());
}

// Returns the item content from `offset` up to the end of the blob.
// NOTE(review): `getSize()-offset` is an unsigned subtraction and wraps when
// offset > getSize(); whether the wrapped size is rejected downstream
// (Cluster::getBlob) is not visible here -- confirm.
Blob Item::getData(offset_type offset) const
{
  auto size = getSize()-offset;
  return getData(offset, size);
}

// Returns `size` bytes of the item content starting at `offset`, read from
// the blob of the cluster referenced by the dirent.
Blob Item::getData(offset_type offset, size_type size) const
{
  auto cluster = m_file->getCluster(m_dirent->getClusterNumber());
  return cluster->getBlob(m_dirent->getBlobNumber(),
                          offset_t(offset),
                          zsize_t(size));
}

// Size of the blob holding the item content.
size_type Item::getSize() const
{
  auto cluster = m_file->getCluster(m_dirent->getClusterNumber());
  return size_type(cluster->getBlobSize(m_dirent->getBlobNumber()));
}

// Returns (file name, offset within that file) allowing the content to be
// read directly from disk, or ("", 0) when that is not possible: the cluster
// is compressed, or the content spans more than one file part.
std::pair Item::getDirectAccessInformation() const
{
  auto cluster = m_file->getCluster(m_dirent->getClusterNumber());
  if (cluster->isCompressed()) {
    return std::make_pair("", 0);
  }

  auto full_offset = m_file->getBlobOffset(m_dirent->getClusterNumber(),
                                         m_dirent->getBlobNumber());

  auto part_its = m_file->getFileParts(full_offset, zsize_t(getSize()));
  auto first_part = part_its.first;
  // Advancing past the first part must reach the end of the range, otherwise
  // the content covers several parts.
  if (++part_its.first != part_its.second) {
   // The content is split on two parts. We cannot have direct access
    return std::make_pair("", 0);
  }
  auto range = first_part->first;
  auto part = first_part->second;
  // Offset relative to the start of the part, then shifted by the part's own
  // offset to obtain the position inside the file named by part->filename().
  const offset_type logical_local_offset(full_offset - range.min);
  const auto physical_local_offset = logical_local_offset + part->offset().v;
  return std::make_pair(part->filename(), physical_local_offset);
}

// Raw index of the cluster referenced by the dirent.
cluster_index_type Item::getClusterIndex() const
{
  return m_dirent->getClusterNumber().v;
}

// Raw index of the blob (within its cluster) referenced by the dirent.
blob_index_type Item::getBlobIndex() const
{
  return m_dirent->getBlobNumber().v;
}
libzim-9.2.3/src/log.h000066400000000000000000000020361466367137100145530ustar00rootroot00000000000000/*
 * Copyright (C) 2009 Tommi Maekitalo
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
 * NON-INFRINGEMENT.  See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#include "config.h"

// Logging facade: when WITH_CXXTOOLS is defined the macros come from the
// header included below; otherwise every log macro expands to nothing.
// NOTE(review): the header name in the first branch is missing in this copy
// of the file, and the file has no include guard.
#ifdef WITH_CXXTOOLS

#include

#else

#define log_define(e)
#define log_fatal(e)
#define log_error(e)
#define log_warn(e)
#define log_info(e)
#define log_debug(e)
#define log_trace(e)
#define log_init()

#endif
libzim-9.2.3/src/lrucache.h000066400000000000000000000121501466367137100155560ustar00rootroot00000000000000/*
 * Copyright (c) 2021, Matthieu Gautier
 * Copyright (c) 2020, Veloman Yunkan
 * Copyright (c) 2014, lamerman
 * All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * * Neither the name of lamerman nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * File: lrucache.hpp * Author: Alexander Ponomarev * * Created on June 20, 2013, 5:09 PM */ #ifndef _LRUCACHE_HPP_INCLUDED_ #define _LRUCACHE_HPP_INCLUDED_ #include #include #include #include #include namespace zim { template class lru_cache { public: // types typedef typename std::pair key_value_pair_t; typedef typename std::list::iterator list_iterator_t; enum AccessStatus { HIT, // key was found in the cache PUT, // key was not in the cache but was created by the getOrPut() access MISS // key was not in the cache; get() access failed }; class AccessResult { const AccessStatus status_; const value_t val_; public: AccessResult(const value_t& val, AccessStatus status) : status_(status), val_(val) {} AccessResult() : status_(MISS), val_() {} bool hit() const { return status_ == HIT; } bool miss() const { return !hit(); } const value_t& value() const { if ( status_ == MISS ) throw std::range_error("There is no such key in cache"); return val_; } operator const value_t& () const { return value(); } }; public: // functions explicit lru_cache(size_t max_size) : _max_size(max_size) { } // If 'key' is present in the cache, returns the associated value, // otherwise puts the given value into the cache (and returns it with // a status of a cache miss). 
AccessResult getOrPut(const key_t& key, const value_t& value) { auto it = _cache_items_map.find(key); if (it != _cache_items_map.end()) { _cache_items_list.splice(_cache_items_list.begin(), _cache_items_list, it->second); return AccessResult(it->second->second, HIT); } else { putMissing(key, value); return AccessResult(value, PUT); } } void put(const key_t& key, const value_t& value) { auto it = _cache_items_map.find(key); if (it != _cache_items_map.end()) { _cache_items_list.splice(_cache_items_list.begin(), _cache_items_list, it->second); it->second->second = value; } else { putMissing(key, value); } } AccessResult get(const key_t& key) { auto it = _cache_items_map.find(key); if (it == _cache_items_map.end()) { return AccessResult(); } else { _cache_items_list.splice(_cache_items_list.begin(), _cache_items_list, it->second); return AccessResult(it->second->second, HIT); } } bool drop(const key_t& key) { try { auto list_it = _cache_items_map.at(key); _cache_items_list.erase(list_it); _cache_items_map.erase(key); return true; } catch (std::out_of_range& e) { return false; } } bool exists(const key_t& key) const { return _cache_items_map.find(key) != _cache_items_map.end(); } size_t size() const { return _cache_items_map.size(); } private: // functions void putMissing(const key_t& key, const value_t& value) { assert(_cache_items_map.find(key) == _cache_items_map.end()); _cache_items_list.push_front(key_value_pair_t(key, value)); _cache_items_map[key] = _cache_items_list.begin(); if (_cache_items_map.size() > _max_size) { _cache_items_map.erase(_cache_items_list.back().first); _cache_items_list.pop_back(); } } private: // data std::list _cache_items_list; std::map _cache_items_map; size_t _max_size; }; } // namespace zim #endif /* _LRUCACHE_HPP_INCLUDED_ */ libzim-9.2.3/src/md5.c000066400000000000000000000241311466367137100144520ustar00rootroot00000000000000/* MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm */ /* Copyright (C) 1991-2, RSA Data 
Security, Inc. Created 1991. All
rights reserved.

License to copy and use this software is granted provided that it
is identified as the "RSA Data Security, Inc. MD5 Message-Digest
Algorithm" in all material mentioning or referencing this software
or this function.

License is also granted to make and use derivative works provided
that such works are identified as "derived from the RSA Data
Security, Inc. MD5 Message-Digest Algorithm" in all material
mentioning or referencing the derived work.

RSA Data Security, Inc. makes no representations concerning either
the merchantability of this software or the suitability of this
software for any particular purpose. It is provided "as is"
without express or implied warranty of any kind.

These notices must be retained in any copies of any part of this
documentation and/or software.
 */

#include "md5.h"
/* NOTE(review): the header name below is missing in this copy of the file
   (text between '<' and '>' appears to have been stripped); memcpy/memset
   are used further down. */
#include

#define MD5_CTX struct zim_MD5_CTX

/* Constants for MD5Transform routine.
   Sxy is the left-rotation amount for step y of round x.
 */
#define S11 7
#define S12 12
#define S13 17
#define S14 22
#define S21 5
#define S22 9
#define S23 14
#define S24 20
#define S31 4
#define S32 11
#define S33 16
#define S34 23
#define S41 6
#define S42 10
#define S43 15
#define S44 21

static void MD5Transform PROTO_LIST ((UINT4 [4], const unsigned char [64]));
static void Encode PROTO_LIST
  ((unsigned char *, UINT4 *, unsigned int));
static void Decode PROTO_LIST
  ((UINT4 *, const unsigned char *, unsigned int));
/*
static void MD5_memcpy PROTO_LIST ((POINTER, POINTER, unsigned int));
static void MD5_memset PROTO_LIST ((POINTER, int, unsigned int));
*/
/* The hand-written copy/fill loops (kept under "#if 0" at the end of the
   file) are replaced by the C library functions. */
#define MD5_memcpy memcpy
#define MD5_memset memset

/* Padding block: a single 0x80 byte followed by zeros. */
static unsigned char PADDING[64] = {
  0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/* F, G, H and I are basic MD5 functions.
 */
#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
#define H(x, y, z) ((x) ^ (y) ^ (z))
#define I(x, y, z) ((y) ^ ((x) | (~z)))

/* ROTATE_LEFT rotates x left n bits.
 */
#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))

/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
Rotation is separate from addition to prevent recomputation.
 */
#define FF(a, b, c, d, x, s, ac) { \
 (a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \
 (a) = ROTATE_LEFT ((a), (s)); \
 (a) += (b); \
  }
#define GG(a, b, c, d, x, s, ac) { \
 (a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \
 (a) = ROTATE_LEFT ((a), (s)); \
 (a) += (b); \
  }
#define HH(a, b, c, d, x, s, ac) { \
 (a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \
 (a) = ROTATE_LEFT ((a), (s)); \
 (a) += (b); \
  }
#define II(a, b, c, d, x, s, ac) { \
 (a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \
 (a) = ROTATE_LEFT ((a), (s)); \
 (a) += (b); \
  }

/* MD5 initialization. Begins an MD5 operation, writing a new context.
   Resets the bit count to zero and loads the four state words.
 */
void zim_MD5Init (MD5_CTX* context)
{
  context->count[0] = context->count[1] = 0;
  /* Load magic initialization constants.
*/
  context->state[0] = 0x67452301;
  context->state[1] = 0xefcdab89;
  context->state[2] = 0x98badcfe;
  context->state[3] = 0x10325476;
}

/* MD5 block update operation. Continues an MD5 message-digest
  operation, processing another message block, and updating the
  context.
  Input is consumed in 64-byte blocks; bytes that do not fill a block are
  kept in context->buffer for the next call.
 */
void zim_MD5Update (
  MD5_CTX *context,
  const unsigned char *input,                      /* input block */
  unsigned int inputLen)                     /* length of input block */
{
  unsigned int i, index, partLen;

  /* Compute number of bytes mod 64 */
  index = (unsigned int)((context->count[0] >> 3) & 0x3F);

  /* Update number of bits */
  /* count[0] holds the low word of the bit count; a wrap-around of the
     addition is carried into count[1]. */
  if ((context->count[0] += ((UINT4)inputLen << 3))
   < ((UINT4)inputLen << 3))
 context->count[1]++;
  /* Bits of inputLen shifted out of the low word by the "<< 3" above. */
  context->count[1] += ((UINT4)inputLen >> 29);

  /* Number of bytes needed to complete the buffered block. */
  partLen = 64 - index;

  /* Transform as many times as possible.
*/
  if (inputLen >= partLen) {
 /* Complete and process the buffered block first... */
 MD5_memcpy
   ((POINTER)&context->buffer[index], (POINTER)input, partLen);
 MD5Transform (context->state, context->buffer);

 /* ...then process whole blocks directly from the input. */
 for (i = partLen; i + 63 < inputLen; i += 64)
   MD5Transform (context->state, &input[i]);

 index = 0;
  }
  else
 i = 0;

  /* Buffer remaining input */
  MD5_memcpy
 ((POINTER)&context->buffer[index], (POINTER)&input[i],
  inputLen-i);
}

/* MD5 finalization. Ends an MD5 message-digest operation, writing the
  message digest and zeroizing the context.
 */
void zim_MD5Final (
  unsigned char digest[16],                         /* message digest */
  MD5_CTX *context)                                       /* context */
{
  unsigned char bits[8];
  unsigned int index, padLen;

  /* Save number of bits */
  Encode (bits, context->count, 8);

  /* Pad out to 56 mod 64.
*/
  index = (unsigned int)((context->count[0] >> 3) & 0x3f);
  padLen = (index < 56) ? (56 - index) : (120 - index);
  zim_MD5Update (context, PADDING, padLen);

  /* Append length (before padding) */
  zim_MD5Update (context, bits, 8);

  /* Store state in digest */
  Encode (digest, context->state, 16);

  /* Zeroize sensitive information.
*/
  MD5_memset ((POINTER)context, 0, sizeof (*context));
}

/* MD5 basic transformation. Transforms state based on block.
   The 64-byte block is decoded into sixteen 32-bit words, mixed into
   working copies of the state by four rounds of sixteen steps, and the
   result is added back to the state.
 */
static void MD5Transform (
  UINT4 state[4],
  const unsigned char block[64])
{
  UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16];

  Decode (x, block, 64);

  /* Round 1 */
  FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
  FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
  FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
  FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
  FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
  FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
  FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
  FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
  FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
  FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
  FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
  FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
  FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
  FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
  FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
  FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */

 /* Round 2 */
  GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
  GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
  GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
  GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
  GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
  GG (d, a, b, c, x[10], S22,  0x2441453); /* 22 */
  GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
  GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
  GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
  GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
  GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
  GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
  GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
  GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
  GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
  GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */

  /* Round 3 */
  HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
  HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
  HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
  HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
  HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
  HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
  HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
  HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
  HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
  HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
  HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
  HH (b, c, d, a, x[ 6], S34,  0x4881d05); /* 44 */
  HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
  HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
  HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
  HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */

  /* Round 4 */
  II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
  II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
  II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
  II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
  II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
  II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
  II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
  II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
  II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
  II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
  II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
  II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
  II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
  II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
  II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
  II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */

  state[0] += a;
  state[1] += b;
  state[2] += c;
  state[3] += d;

  /* Zeroize sensitive information.
*/
  MD5_memset ((POINTER)x, 0, sizeof (x));
}

/* Encodes input (UINT4) into output (unsigned char). Assumes len is
  a multiple of 4.
  Each word is written least significant byte first; len is in bytes.
 */
static void Encode (
  unsigned char *output,
  UINT4 *input,
  unsigned int len)
{
  unsigned int i, j;

  for (i = 0, j = 0; j < len; i++, j += 4) {
 output[j] = (unsigned char)(input[i] & 0xff);
 output[j+1] = (unsigned char)((input[i] >> 8) & 0xff);
 output[j+2] = (unsigned char)((input[i] >> 16) & 0xff);
 output[j+3] = (unsigned char)((input[i] >> 24) & 0xff);
  }
}

/* Decodes input (unsigned char) into output (UINT4). Assumes len is
  a multiple of 4.
  Each word is read least significant byte first; len is in bytes.
 */
static void Decode (
  UINT4 *output,
  const unsigned char *input,
  unsigned int len)
{
  unsigned int i, j;

  for (i = 0, j = 0; j < len; i++, j += 4)
 output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) |
   (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24);
}

/* Disabled: superseded by the MD5_memcpy/MD5_memset macros defined above. */
#if 0
/* Note: Replace "for loop" with standard memcpy if possible.
 */

static void MD5_memcpy (
  POINTER output,
  POINTER input,
  unsigned int len)
{
  unsigned int i;

  for (i = 0; i < len; i++)
    output[i] = input[i];
}

/* Note: Replace "for loop" with standard memset if possible.
 */
static void MD5_memset (
  POINTER output,
  int value,
  unsigned int len)
{
  unsigned int i;

  for (i = 0; i < len; i++)
 ((char *)output)[i] = (char)value;
}
#endif
libzim-9.2.3/src/md5.h000066400000000000000000000070441466367137100144630ustar00rootroot00000000000000/*
 * Copyright (C) 2003 Tommi Maekitalo
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * As a special exception, you may use this file as part of a free
 * software library without restriction. Specifically, if other files
 * instantiate templates or use macros or inline functions from this
 * file, or you compile this file and link it with other files to
 * produce an executable, this file does not by itself cause the
 * resulting executable to be covered by the GNU General Public
 * License.
This exception does not however invalidate any other * reasons why the executable file might be covered by the GNU Library * General Public License. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ /* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All rights reserved. License to copy and use this software is granted provided that it is identified as the "RSA Data Security, Inc. MD5 Message-Digest Algorithm" in all material mentioning or referencing this software or this function. License is also granted to make and use derivative works provided that such works are identified as "derived from the RSA Data Security, Inc. MD5 Message-Digest Algorithm" in all material mentioning or referencing the derived work. RSA Data Security, Inc. makes no representations concerning either the merchantability of this software or the suitability of this software for any particular purpose. It is provided "as is" without express or implied warranty of any kind. These notices must be retained in any copies of any part of this documentation and/or software. */ /* RSAREF types and constants */ /* PROTOTYPES should be set to one if and only if the compiler supports function argument prototyping. The following makes PROTOTYPES default to 0 if it has not already been defined with C compiler flags. 
*/ #ifndef ZIM_MD5_H #define ZIM_MD5_H #ifndef PROTOTYPES #define PROTOTYPES 1 #endif /* POINTER defines a generic pointer type */ typedef unsigned char *POINTER; /* UINT2 defines a two byte word */ typedef unsigned short int UINT2; /* UINT4 defines a four byte word */ typedef unsigned int UINT4; /* PROTO_LIST is defined depending on how PROTOTYPES is defined above. If using PROTOTYPES, then PROTO_LIST returns the list, otherwise it returns an empty list. */ #if PROTOTYPES #define PROTO_LIST(list) list #else #define PROTO_LIST(list) () #endif /* MD5 context. */ struct zim_MD5_CTX { UINT4 state[4]; /* state (ABCD) */ UINT4 count[2]; /* number of bits, modulo 2^64 (lsb first) */ unsigned char buffer[64]; /* input buffer */ }; #ifdef __cplusplus extern "C" { #endif void zim_MD5Init PROTO_LIST ((struct zim_MD5_CTX *)); void zim_MD5Update PROTO_LIST ((struct zim_MD5_CTX *, const unsigned char *, unsigned int)); void zim_MD5Final PROTO_LIST ((unsigned char [16], struct zim_MD5_CTX *)); #ifdef __cplusplus } #endif #endif /* ZIM_MD5_H */ libzim-9.2.3/src/meson.build000066400000000000000000000035141466367137100157650ustar00rootroot00000000000000 configure_file(output : 'config.h', configuration : private_conf, input : 'config.h.in') src_directory = include_directories('.') common_sources = [ # 'config.h', 'archive.cpp', 'cluster.cpp', 'buffer_reader.cpp', 'dirent.cpp', 'dirent_accessor.cpp', 'entry.cpp', 'envvalue.cpp', 'fileheader.cpp', 'fileimpl.cpp', 'file_compound.cpp', 'file_reader.cpp', 'item.cpp', 'blob.cpp', 'buffer.cpp', 'md5.c', 'uuid.cpp', 'tools.cpp', 'compression.cpp', 'istreamreader.cpp', 'writer/contentProvider.cpp', 'writer/creator.cpp', 'writer/item.cpp', 'writer/cluster.cpp', 'writer/dirent.cpp', 'writer/workers.cpp', 'writer/clusterWorker.cpp', 'writer/titleListingHandler.cpp', 'writer/counterHandler.cpp', 'suggestion.cpp', 'suggestion_iterator.cpp', 'version.cpp' ] if host_machine.system() == 'windows' common_sources += 'fs_windows.cpp' else 
common_sources += 'fs_unix.cpp' endif xapian_sources = [ 'search.cpp', 'search_iterator.cpp', 'xapian/htmlparse.cc', 'xapian/myhtmlparse.cc', 'writer/xapianIndexer.cpp', 'writer/xapianWorker.cpp', 'writer/xapianHandler.cpp' ] sources = common_sources deps = [thread_dep, lzma_dep, zstd_dep, win_deps] if target_machine.system() == 'freebsd' deps += [execinfo_dep] endif if xapian_dep.found() sources += xapian_sources sources += lib_resources deps += [xapian_dep, icu_dep] endif libzim = library('zim', sources, include_directories : inc, dependencies : deps, version: meson.project_version(), install : true) libzim_dep = declare_dependency(link_with: libzim, include_directories: include_directory) libzim-9.2.3/src/narrowdown.h000066400000000000000000000202671466367137100162000ustar00rootroot00000000000000/* * Copyright (C) 2020-2021 Matthieu Gautier * Copyright (C) 2020 Veloman Yunkan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_NARROWDOWN_H #define ZIM_NARROWDOWN_H #include "debug.h" #include #include #include #include namespace zim { // Given a sorted sequence of items with a string key, NarrowDown helps to // narrow down the range in which the query key should belong. 
// // The target usage of this class is as a partial in-memory index for a sorted // list residing in external storage with high access cost to inidividual items. // // Illustration: // // In RAM: // key: A I Q Y g o w z // item #: | | | | | | | | // ----------- | | | | | | | | // On disk: V V V V V V V V // key: ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz // data: ajo097124ljp-oasd)(&(*)llkjasdf@$^nFDSs00ujlasdfjkll // // In such an external list looking up an item by key can be performed via a // binary search where on each iteration the item key must be accessed. There // are two performance problems with that: // 1. The API may not allow accessing only the key of the given item, reading // the entire item instead (this is the case with dirents). // 2. Access to items (or only their keys) in external storage is expensive. // // NarrowDown speeds up the look-up operation in such an external list by // allowing to split it into two steps: // 1. Perform the binary search on the index, yielding a narrower range // 2. Perform the binary search on the external list starting from that // narrower range. // // The denser the in-memory index the more the performance improvement. // Therefore the implementation focus of NarrowDown is on small memory // footprint. If the item keys are long strings with a lot of "garbage" at the // end the following trick helps. Suppose that we have the following pair of // adjacent keys in our full (external) list: // // Item # | Key // --------------------------------- // ... | ... // 1234 | "We Are The Champions" // 1235 | "We Will Rock You" // ... | ... // // If we were to include the item #1234 in our index the naive approach would // be to store its key as is. However, let's imagine that the list also // contains an item with key "We W". Then it would have to reside between "We // Are The Champions" and "We Will Rock You". 
So we can pretend that such an // item exists and store in our index the fictitious entry {"We W", 1234.5}. // When we arrive at that entry during the range narrow-down step we must round // the item index downward if it is going to be used as the lower bound of // the range, and round it upward if it is going to be used as the upper bound // of the range. class NarrowDown { typedef entry_index_type index_type; public: // types struct Range { const index_type begin, end; }; public: // functions NarrowDown() : pred(&keyContentArea) {} // Add another entry to the search index. The key of the next item is used // to derive and store a shorter pseudo-key as explained in the long comment // above the class. void add(const std::string& key, index_type i, const std::string& nextKey) { // It would be better to have `key >= nextKey`, but pretty old zim file were not enforce to // have unique path, just that entries were sorted by path, but two entries could have the same path. // It is somehow a bug and have been fixed then, but we still have to be tolerent here and accept that // two concecutive keys can be equal. 
if (key > nextKey) { Formatter fmt; fmt << "Dirent table is not properly sorted:\n"; fmt << " #" << i << ": " << key[0] << "/" << key.substr(1) << "\n"; fmt << " #" << i+1 << ": " << nextKey[0] << "/" << nextKey.substr(1); throw ZimFileFormatError(fmt); } if ( entries.empty() ) { addEntry(key, i); } else { const std::string pseudoKey = shortestStringInBetween(key, nextKey); if (pred(pseudoKey, entries.back())) { Formatter fmt; fmt << "Dirent table is not properly sorted:\n"; fmt << "PseudoKey " << pseudoKey << " should be after (or equal) previously generated " << pred.getKeyContent(entries.back()) << "\n"; throw ZimFileFormatError(fmt); } ASSERT(entries.back().lindex, <, i); addEntry(pseudoKey, i); } } void close(const std::string& key, index_type i) { ASSERT(entries.empty() || pred(entries.back(), key), ==, true); ASSERT(entries.empty() || entries.back().lindex < i, ==, true); addEntry(key, i); } Range getRange(const std::string& key) const { auto it = std::upper_bound(entries.begin(), entries.end(), key, pred); if ( it == entries.begin() ) return {0, 0}; const index_type prevEntryLindex = (it-1)->lindex; if ( it == entries.end() ) return {prevEntryLindex, prevEntryLindex+1}; return {prevEntryLindex, it->lindex+1}; } static std::string shortestStringInBetween(const std::string& a, const std::string& b) { ASSERT(a, <=, b); // msvc version of `std::mismatch(begin1, end1, begin2)` // need `begin2 + (end1-begin1)` to be valid. // So we cannot simply pass `a.end()` as `end1`. 
const auto minlen = std::min(a.size(), b.size()); const auto m = std::mismatch(a.begin(), a.begin()+minlen, b.begin()); return std::string(b.begin(), std::min(b.end(), m.second+1)); } private: // functions void addEntry(const std::string& s, index_type i) { entries.push_back({uint32_t(keyContentArea.size()), i}); keyContentArea.insert(keyContentArea.end(), s.begin(), s.end()); keyContentArea.push_back('\0'); } private: // types typedef std::vector KeyContentArea; struct Entry { // This is mostly a truncated version of a key from the input sequence. // The exceptions are // - the first item // - the last item // - keys that differ from their preceding key only in the last character // // std::string pseudoKey; // std::string has too much memory overhead. uint32_t pseudoKeyOffset; // Instead we densely pack the key contents // into keyContentArea and store in the entry // the offset into that container. // This represents the index of the item in the input sequence right // after which pseudoKey might be inserted without breaking the sequence // order. In other words, the condition // // sequence[lindex] <= pseudoKey <= sequence[lindex+1] // // must be true. 
index_type lindex; }; struct LookupPred { const KeyContentArea& keyContentArea; explicit LookupPred(const KeyContentArea* kca) : keyContentArea(*kca) {} const char* getKeyContent(const Entry& entry) const { return &keyContentArea[entry.pseudoKeyOffset]; } bool operator()(const Entry& entry, const std::string& key) const { return key.compare(getKeyContent(entry)) >= 0; } bool operator()(const std::string& key, const Entry& entry) const { return key.compare(getKeyContent(entry)) < 0; } }; typedef std::vector EntryCollection; private: // data // Used to store the (shortened) keys as densely packed C-style strings KeyContentArea keyContentArea; LookupPred pred; EntryCollection entries; }; } // namespace zim #endif // ZIM_NARROWDOWN_H libzim-9.2.3/src/rawstreamreader.h000066400000000000000000000031661466367137100171670ustar00rootroot00000000000000/* * Copyright (C) 2020 Matthieu Gautier * Copyright (C) 2020 Veloman Yunkan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_RAWSTREAMREADER_H #define ZIM_RAWSTREAMREADER_H #include "istreamreader.h" #include "reader.h" namespace zim { class RawStreamReader : public IStreamReader { public: // functions explicit RawStreamReader(std::shared_ptr reader) : m_reader(reader), m_readerPos(0) {} void readImpl(char* buf, zsize_t nbytes) override { m_reader->read(buf, m_readerPos, zsize_t(nbytes)); m_readerPos += nbytes; } std::unique_ptr sub_reader(zsize_t nbytes) override { auto reader = m_reader->sub_reader(m_readerPos, nbytes); m_readerPos += nbytes; return reader; } private: // data std::shared_ptr m_reader; offset_t m_readerPos; }; } // namespace zim #endif // ZIM_READERDATASTREAMWRAPPER_H libzim-9.2.3/src/reader.h000066400000000000000000000055751466367137100152470ustar00rootroot00000000000000/* * Copyright (C) 2017-2020 Matthieu Gautier * Copyright (C) 2020 Veloman Yunkan * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_READER_H_ #define ZIM_READER_H_ #include #include #include "zim_types.h" #include "endian_tools.h" #include "debug.h" #include "buffer.h" namespace zim { class LIBZIM_PRIVATE_API Reader { public: Reader() {}; virtual zsize_t size() const = 0; virtual ~Reader() {}; void read(char* dest, offset_t offset, zsize_t size) const { if (can_read(offset, size)) { if (size) { // Do the actuall read only if we have a size to read readImpl(dest, offset, size); } return; } throw std::runtime_error("Cannot read after the end of the reader"); } template T read_uint(offset_t offset) const { ASSERT(offset.v, <, size().v); ASSERT(offset.v+sizeof(T), <=, size().v); char tmp_buf[sizeof(T)]; read(tmp_buf, offset, zsize_t(sizeof(T))); return fromLittleEndian(tmp_buf); } char read(offset_t offset) const { if (can_read(offset, zsize_t(1))) { return readImpl(offset); } throw std::runtime_error("Cannot read after the end of the reader"); } virtual const Buffer get_buffer(offset_t offset, zsize_t size) const = 0; const Buffer get_buffer(offset_t offset) const { return get_buffer(offset, zsize_t(size().v-offset.v)); } virtual std::unique_ptr sub_reader(offset_t offset, zsize_t size) const = 0; std::unique_ptr sub_reader(offset_t offset) const { return sub_reader(offset, zsize_t(size().v-offset.v)); } virtual offset_t offset() const = 0; bool can_read(offset_t offset, zsize_t size) const; private: // Implementation of the read method. // Check of the validity of the offset/size has already been done. virtual void readImpl(char* dest, offset_t offset, zsize_t size) const = 0; // Implementation of the read method. // Check of the validity of the offset has already been done. 
virtual char readImpl(offset_t offset) const = 0; }; }; #endif // ZIM_READER_H_ libzim-9.2.3/src/search.cpp000066400000000000000000000254571466367137100156060ustar00rootroot00000000000000/* * Copyright (C) 2017-2021 Matthieu Gautier * Copyright (C) 2021 Maneesh P M * Copyright (C) 2021 Veloman Yunkan * Copyright (C) 2020 Emmanuel Engelhart * Copyright (C) 2018 Kunal Mehta * Copyright (C) 2007 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include #include #include #include "fileimpl.h" #include "search_internal.h" #include "tools.h" #include #include #include #if !defined(_WIN32) # include #else # include #endif #include "xapian.h" #include #include "constants.h" #define MAX_MATCHES_TO_SORT 10000 namespace zim { InternalDataBase::InternalDataBase(const std::vector& archives, bool verbose) : m_verbose(verbose) { bool first = true; m_queryParser.set_database(m_database); m_queryParser.set_default_op(Xapian::Query::op::OP_AND); for(auto& archive: archives) { auto impl = archive.getImpl(); FileImpl::FindxResult r; r = impl->findx('X', "fulltext/xapian"); if (!r.first) { r = impl->findx('Z', "/fulltextIndex/xapian"); } if (!r.first) { continue; } auto xapianEntry = Entry(impl, entry_index_type(r.second)); auto accessInfo = 
xapianEntry.getItem().getDirectAccessInformation(); if (accessInfo.second == 0) { continue; } Xapian::Database database; if (!getDbFromAccessInfo(accessInfo, database)) { continue; } try { if ( first ) { m_valuesmap = read_valuesmap(database.get_metadata("valuesmap")); auto language = database.get_metadata("language"); if (language.empty() ) { // Database created before 2017/03 has no language metadata. // However, term were stemmed anyway and we need to stem our // search query the same the database was created. // So we need a language, let's use the one of the zim. // If zimfile has no language metadata, we can't do lot more here :/ try { language = archive.getMetadata("Language"); } catch(...) {} } if (!language.empty()) { icu::Locale languageLocale(language.c_str()); /* Configuring language base steemming */ try { m_stemmer = Xapian::Stem(languageLocale.getLanguage()); m_queryParser.set_stemmer(m_stemmer); m_queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_ALL); } catch (...) { std::cout << "No stemming for language '" << languageLocale.getLanguage() << "'" << std::endl; } } auto stopwords = database.get_metadata("stopwords"); if ( !stopwords.empty() ){ std::string stopWord; std::istringstream file(stopwords); Xapian::SimpleStopper* stopper = new Xapian::SimpleStopper(); while (std::getline(file, stopWord, '\n')) { stopper->add(stopWord); } stopper->release(); m_queryParser.set_stopper(stopper); } } else { std::map valuesmap = read_valuesmap(database.get_metadata("valuesmap")); if (m_valuesmap != valuesmap ) { // [TODO] Ignore the database, raise a error ? } } m_xapianDatabases.push_back(database); m_database.add_database(database); m_archives.push_back(archive); first = false; } catch( Xapian::DatabaseError& e ) { // [TODO] Ignore the database or raise a error ? // As we already ignore the database if `getDbFromAccessInfo` "detects" a DatabaseError, // we also ignore here. 
} } } bool InternalDataBase::hasDatabase() const { return !m_xapianDatabases.empty(); } bool InternalDataBase::hasValuesmap() const { return !m_valuesmap.empty(); } bool InternalDataBase::hasValue(const std::string& valueName) const { return (m_valuesmap.find(valueName) != m_valuesmap.end()); } int InternalDataBase::valueSlot(const std::string& valueName) const { return m_valuesmap.at(valueName); } Xapian::Query InternalDataBase::parseQuery(const Query& query) { Xapian::Query xquery; const auto unaccentedQuery = removeAccents(query.m_query); xquery = m_queryParser.parse_query(unaccentedQuery, Xapian::QueryParser::FLAG_CJK_NGRAM); if (query.m_geoquery && hasValue("geo.position")) { Xapian::GreatCircleMetric metric; Xapian::LatLongCoord centre(query.m_latitude, query.m_longitude); Xapian::LatLongDistancePostingSource ps(valueSlot("geo.position"), centre, metric, query.m_distance); Xapian::Query geoQuery(&ps); if (unaccentedQuery.empty()) { xquery = geoQuery; } else { xquery = Xapian::Query(Xapian::Query::OP_FILTER, xquery, geoQuery); } } return xquery; } Searcher::Searcher(const std::vector& archives) : mp_internalDb(nullptr), m_verbose(false) { for ( const auto& a : archives ) { addArchive(a); } } Searcher::Searcher(const Archive& archive) : mp_internalDb(nullptr), m_verbose(false) { addArchive(archive); } Searcher::Searcher(const Searcher& other) = default; Searcher& Searcher::operator=(const Searcher& other) = default; Searcher::Searcher(Searcher&& other) = default; Searcher& Searcher::operator=(Searcher&& other) = default; Searcher::~Searcher() = default; namespace { bool archivesAreEquivalent(const Archive& a1, const Archive& a2) { return a1.getUuid() == a2.getUuid(); } bool contains(const std::vector& archives, const Archive& newArchive) { for ( const auto& a : archives ) { if ( archivesAreEquivalent(a, newArchive) ) { return true; } } return false; } } // unnamed namespace Searcher& Searcher::addArchive(const Archive& archive) { if ( !contains(m_archives, 
archive) ) { m_archives.push_back(archive); mp_internalDb.reset(); } return *this; } Search Searcher::search(const Query& query) { if (!mp_internalDb) { initDatabase(); } if (!mp_internalDb->hasDatabase()) { throw(std::runtime_error("Cannot create Search without FT Xapian index")); } return Search(mp_internalDb, query); } void Searcher::setVerbose(bool verbose) { m_verbose = verbose; } void Searcher::initDatabase() { mp_internalDb = std::make_shared(m_archives, m_verbose); } Search::Search(std::shared_ptr p_internalDb, const Query& query) : mp_internalDb(p_internalDb), mp_enquire(nullptr), m_query(query) { } Search::Search(Search&& s) = default; Search& Search::operator=(Search&& s) = default; Search::~Search() = default; Query::Query(const std::string& query) : m_query(query) {} Query& Query::setQuery(const std::string& query) { m_query = query; return *this; } Query& Query::setGeorange(float latitude, float longitude, float distance) { m_latitude = latitude; m_longitude = longitude; m_distance = distance; m_geoquery = true; return *this; } int Search::getEstimatedMatches() const { try { auto enquire = getEnquire(); // Force xapian to check at least 10 documents even if we ask for an empty mset. // Else, the get_matches_estimated may be wrong and return 0 even if we have results. 
auto mset = enquire.get_mset(0, 0, 10); return mset.get_matches_estimated(); } catch(Xapian::QueryParserError& e) { return 0; } catch(Xapian::DatabaseError& e) { throw zim::ZimFileFormatError(e.get_description()); } } const SearchResultSet Search::getResults(int start, int maxResults) const { try { auto enquire = getEnquire(); auto mset = enquire.get_mset(start, maxResults); return SearchResultSet(mp_internalDb, std::move(mset)); } catch(Xapian::QueryParserError& e) { return SearchResultSet(mp_internalDb); } catch(Xapian::DatabaseError& e) { throw zim::ZimFileFormatError(e.get_description()); } } Xapian::Enquire& Search::getEnquire() const { if ( mp_enquire ) { return *mp_enquire; } auto enquire = std::unique_ptr(new Xapian::Enquire(mp_internalDb->m_database)); auto query = mp_internalDb->parseQuery(m_query); if (mp_internalDb->m_verbose) { std::cout << "Parsed query '" << m_query.m_query << "' to " << query.get_description() << std::endl; } enquire->set_query(query); mp_enquire = std::move(enquire); return *mp_enquire; } SearchResultSet::SearchResultSet(std::shared_ptr p_internalDb, Xapian::MSet&& mset) : mp_internalDb(p_internalDb), mp_mset(std::make_shared(mset)) {} SearchResultSet::SearchResultSet(std::shared_ptr p_internalDb) : mp_internalDb(p_internalDb), mp_mset(nullptr) {} int SearchResultSet::size() const { if (! mp_mset) { return 0; } try { return mp_mset->size(); } catch(Xapian::DatabaseError& e) { throw zim::ZimFileFormatError(e.get_description()); } } SearchResultSet::iterator SearchResultSet::begin() const { if ( ! mp_mset ) { return nullptr; } try { return new SearchIterator::InternalData(mp_internalDb, mp_mset, mp_mset->begin()); } catch(Xapian::DatabaseError& e) { throw zim::ZimFileFormatError(e.get_description()); } } SearchResultSet::iterator SearchResultSet::end() const { if ( ! 
mp_mset ) { return nullptr; } try { return new SearchIterator::InternalData(mp_internalDb, mp_mset, mp_mset->end()); } catch(Xapian::DatabaseError& e) { throw zim::ZimFileFormatError(e.get_description()); } } } //namespace zim libzim-9.2.3/src/search_internal.h000066400000000000000000000120311466367137100171270ustar00rootroot00000000000000/* * Copyright (C) 2021 Manneesh P M * Copyright (C) 2017-2021 Matthieu Gautier * Copyright (C) 2006 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_SEARCH_INTERNAL_H #define ZIM_SEARCH_INTERNAL_H #include #include #include namespace zim { /** * A class to encapsulate a xapian database and all the information we can gather from it. */ class InternalDataBase { public: // methods InternalDataBase(const std::vector& archives, bool verbose); bool hasDatabase() const; bool hasValuesmap() const; bool hasValue(const std::string& valueName) const; int valueSlot(const std::string& valueName) const; Xapian::Query parseQuery(const Query& query); public: // data // The (main) database we will search on (wrapping other xapian databases). Xapian::Database m_database; // The real databases. std::vector m_xapianDatabases; // The archives we are searching on. std::vector m_archives; // The valuesmap associated with the database. 
std::map m_valuesmap; // If the database is open for suggestion. // True even if the dabase has no newSuggestionformat. bool m_suggestionMode; // The query parser corresponding to the database. Xapian::QueryParser m_queryParser; // The stemmer used to parse queries Xapian::Stem m_stemmer; // Verbosity of operations. bool m_verbose; }; struct SearchIterator::InternalData { std::shared_ptr mp_internalDb; std::shared_ptr mp_mset; Xapian::MSetIterator _iterator; Xapian::Document _document; bool document_fetched; std::unique_ptr _entry; InternalData(const InternalData& other) : mp_internalDb(other.mp_internalDb), mp_mset(other.mp_mset), _iterator(other._iterator), _document(other._document), document_fetched(other.document_fetched), _entry(other._entry ? new Entry(*other._entry) : nullptr ) { } InternalData& operator=(const InternalData& other) { if (this != &other) { mp_internalDb = other.mp_internalDb; mp_mset = other.mp_mset; _iterator = other._iterator; _document = other._document; document_fetched = other.document_fetched; _entry.reset(other._entry ? 
new Entry(*other._entry) : nullptr); } return *this; } InternalData(std::shared_ptr p_internalDb, std::shared_ptr p_mset, Xapian::MSetIterator iterator) : mp_internalDb(p_internalDb), mp_mset(p_mset), _iterator(iterator), document_fetched(false) {}; Xapian::Document get_document() { try { if ( !document_fetched ) { _document = iterator().get_document(); document_fetched = true; } return _document; } catch ( Xapian::DatabaseError& e) { throw zim::ZimFileFormatError(e.get_description()); } } int get_databasenumber() { try { Xapian::docid docid = *iterator(); return (docid - 1) % mp_internalDb->m_archives.size(); } catch ( Xapian::DatabaseError& e) { throw zim::ZimFileFormatError(e.get_description()); } } Entry& get_entry() { try { if ( !_entry ) { int databasenumber = get_databasenumber(); auto archive = mp_internalDb->m_archives.at(databasenumber); _entry.reset(new Entry(archive.getEntryByPath(get_document().get_data()))); } return *_entry.get(); } catch ( Xapian::DatabaseError& e) { throw zim::ZimFileFormatError(e.get_description()); } } bool operator==(const InternalData& other) const { return (mp_internalDb == other.mp_internalDb && mp_mset == other.mp_mset && _iterator == other._iterator); } bool is_end() const { return _iterator == mp_mset->end(); } const Xapian::MSetIterator& iterator() const { if (is_end()) { throw std::runtime_error("Cannot get entry for end iterator"); } return _iterator; } }; }; //namespace zim #endif //ZIM_SEARCH_INTERNAL_H libzim-9.2.3/src/search_iterator.cpp000066400000000000000000000164541466367137100175140ustar00rootroot00000000000000/* * Copyright (C) 2021 Maneesh P M * Copyright (C) 2017-2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. 
*
 * This program is distributed in the hope that it will be useful, but
 * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
 * NON-INFRINGEMENT. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#include
#define ZIM_PRIVATE

#include "xapian/myhtmlparse.h"
#include
#include
#include
#include
#include "search_internal.h"

namespace zim {

SearchIterator::~SearchIterator() = default;
SearchIterator::SearchIterator(SearchIterator&& it) = default;
SearchIterator& SearchIterator::operator=(SearchIterator&& it) = default;

// Default constructor: an iterator without internal state.
SearchIterator::SearchIterator() : SearchIterator(nullptr)
{};

// Takes ownership of `internal_data` (may be null).
SearchIterator::SearchIterator(InternalData* internal_data)
  : internal(internal_data)
{}

// Copy constructor: deep-copies the internal state when there is one.
SearchIterator::SearchIterator(const SearchIterator& it)
  : internal(nullptr)
{
    if (it.internal) internal = std::unique_ptr(new InternalData(*it.internal));
}

// Copy assignment. Three cases: the source has no state (drop ours), we have
// no state (allocate a copy), or both have state (assign in place).
SearchIterator & SearchIterator::operator=(const SearchIterator& it)
{
    if ( ! it.internal ) internal.reset();
    else if ( ! internal ) internal = std::unique_ptr(new InternalData(*it.internal));
    else *internal = *it.internal;

    return *this;
}

// Two iterators without state are equal; one with and one without are not;
// otherwise the internal states are compared.
bool SearchIterator::operator==(const SearchIterator& it) const
{
    if ( ! internal && ! it.internal) {
        return true;
    }
    if ( ! internal || ! it.internal) {
        return false;
    }
    return (*internal == *it.internal);
}

bool SearchIterator::operator!=(const SearchIterator& it) const
{
    return ! (*this == it);
}

// Pre-increment: moves to the next result and invalidates the cached
// document and entry. No-op on an iterator without state.
SearchIterator& SearchIterator::operator++()
{
    if ( ! internal ) {
        return *this;
    }
    ++(internal->_iterator);
    internal->document_fetched = false;
    internal->_entry.reset();
    return *this;
}

// Post-increment: returns a copy taken before advancing.
SearchIterator SearchIterator::operator++(int)
{
    SearchIterator it = *this;
    operator++();
    return it;
}

// Pre-decrement: moves to the previous result and invalidates the cached
// document and entry. No-op on an iterator without state.
SearchIterator& SearchIterator::operator--()
{
    if ( ! internal ) {
        return *this;
    }
    --(internal->_iterator);
    internal->document_fetched = false;
    internal->_entry.reset();
    return *this;
}

// Post-decrement: returns a copy taken before stepping back.
SearchIterator SearchIterator::operator--(int)
{
    SearchIterator it = *this;
    operator--();
    return it;
}

// Returns the path of the current result ("" without state).
// Xapian::DatabaseError is rethrown as zim::ZimFileFormatError.
// NOTE(review): substr(2) throws std::out_of_range if the stored data is
// shorter than two characters -- confirm the index guarantees the
// "<namespace>/" prefix whenever the data type is `fullPath`.
std::string SearchIterator::getPath() const
{
    if ( ! internal ) {
        return "";
    }

    try {
        std::string path = internal->get_document().get_data();
        bool hasNewNamespaceScheme = internal->mp_internalDb->m_archives.at(getFileIndex()).hasNewNamespaceScheme();

        // A database without "data" metadata is treated as `fullPath`.
        std::string dbDataType = internal->mp_internalDb->m_database.get_metadata("data");
        if (dbDataType.empty()) {
            dbDataType = "fullPath";
        }

        // If the archive has new namespace scheme and the type of its indexed data
        // is `fullPath` we return only the `path` without namespace
        if (hasNewNamespaceScheme && dbDataType == "fullPath") {
            path = path.substr(2);
        }
        return path;
    } catch (Xapian::DatabaseError& e) {
        throw zim::ZimFileFormatError(e.get_description());
    }
}

// Returns the raw data stored in the Xapian document ("" without state).
std::string SearchIterator::getDbData() const
{
    if ( ! internal ) {
        return "";
    }

    return internal->get_document().get_data();
}

// Returns the title of the entry of the current result ("" without state).
std::string SearchIterator::getTitle() const
{
    if ( ! internal ) {
        return "";
    }
    return internal->get_entry().getTitle();
}

// Returns Xapian's percent score of the current result (0 without state).
int SearchIterator::getScore() const
{
    if ( ! internal ) {
        return 0;
    }
    return internal->iterator().get_percent();
}

// Returns a snippet for the current result ("" without state).
// A snippet stored in the index is preferred; otherwise one is generated
// from the item's content. Any failure while generating yields "".
// Xapian::DatabaseError raised outside the generation step is rethrown as
// zim::ZimFileFormatError.
std::string SearchIterator::getSnippet() const
{
    if ( ! internal ) {
        return "";
    }

    try {
        // Generate full text snippet
        if ( ! internal->mp_internalDb->hasValuesmap() )
        {
            /* This is the old legacy version. Guess and try */
            std::string stored_snippet = internal->get_document().get_value(1);
            if ( ! stored_snippet.empty() )
                return stored_snippet;
            /* Let's continue here, and see if we can generate one */
        }
        else if ( internal->mp_internalDb->hasValue("snippet") )
        {
            return internal->get_document().get_value(internal->mp_internalDb->valueSlot("snippet"));
        }

        Entry& entry = internal->get_entry();
        /* No reader, no snippet */
        try {
            /* Get the content of the item to generate a snippet.
               We parse it and use the html dump to avoid remove html tags in the
               content and be able to nicely cut the text at random place. */
            zim::MyHtmlParser htmlParser;
            std::string content = entry.getItem().getData();
            // Parsing errors are ignored on purpose: whatever was dumped so
            // far is still used for the snippet.
            try {
                htmlParser.parse_html(content, "UTF-8", true);
            } catch (...) {}
            return internal->mp_mset->snippet(htmlParser.dump,
                                              /*length=*/500,
                                              /*stemmer=*/internal->mp_internalDb->m_stemmer,
                                              /*flags=*/0);
        } catch (...) {
            return "";
        }
    } catch (Xapian::DatabaseError& e) {
        throw zim::ZimFileFormatError(e.get_description());
    }
}

// Always returns -1.
int SearchIterator::getSize() const
{
    return -1;
}

// Returns the word count stored in the index for the current result, or -1
// when there is no state or no stored value.
int SearchIterator::getWordCount() const
{
    if ( ! internal ) {
        return -1;
    }

    try {
        if ( ! internal->mp_internalDb->hasValuesmap() )
        {
            /* This is the old legacy version. Guess and try */
            return internal->get_document().get_value(3).empty() == true ? -1 : atoi(internal->get_document().get_value(3).c_str());
        }
        else if ( internal->mp_internalDb->hasValue("wordcount") )
        {
            return atoi(internal->get_document().get_value(internal->mp_internalDb->valueSlot("wordcount")).c_str());
        }
        return -1;
    } catch (Xapian::DatabaseError& e) {
        throw zim::ZimFileFormatError(e.get_description());
    }
}

// Returns the index of the archive the current result comes from
// (0 without state).
int SearchIterator::getFileIndex() const
{
    if ( internal ) {
        return internal->get_databasenumber();
    }
    return 0;
}

// Returns the uuid of the archive the current result comes from.
// Throws std::runtime_error without state.
Uuid SearchIterator::getZimId() const
{
    if (! internal ) {
        throw std::runtime_error("Cannot get zimId from uninitialized iterator");
    }
    try {
        return internal->mp_internalDb->m_archives.at(getFileIndex()).getUuid();
    } catch (Xapian::DatabaseError& e) {
        throw zim::ZimFileFormatError(e.get_description());
    }
}

// Dereference: returns the entry of the current result.
// Throws std::runtime_error without state.
SearchIterator::reference SearchIterator::operator*() const
{
    if (! internal ) {
        throw std::runtime_error("Cannot get a entry for a uninitialized iterator");
    }
    return internal->get_entry();
}

SearchIterator::pointer SearchIterator::operator->() const
{
    return &**this;
}

} // namespace zim
libzim-9.2.3/src/suggestion.cpp000066400000000000000000000265201466367137100165200ustar00rootroot00000000000000/*
 * Copyright (C) 2021 Maneesh P M
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
 * NON-INFRINGEMENT. See the GNU General Public License for more details.
*
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#include
#define ZIM_PRIVATE

#include
#include
#include "suggestion_internal.h"
#include "fileimpl.h"
#include "tools.h"
#include "constants.h"

#if defined(ENABLE_XAPIAN)
#include
#endif // ENABLE_XAPIAN

namespace zim
{

// Builds the suggestion database of `archive`. When Xapian is enabled the
// title index is opened; Xapian::DatabaseError is rethrown as
// zim::ZimFileFormatError.
SuggestionDataBase::SuggestionDataBase(const Archive& archive, bool verbose)
  : m_archive(archive),
    m_verbose(verbose)
{
  // Initialize Xapian DB if it is enabled
#if defined(ENABLE_XAPIAN)
  try {
    initXapianDb();
  } catch ( Xapian::DatabaseError& e) {
    throw zim::ZimFileFormatError(e.get_description());
  }
#endif // ENABLE_XAPIAN
}

#if defined(ENABLE_XAPIAN)
// Opens the title index stored at 'X' / "title/xapian" in the archive and
// configures the query parser (default operator, stemmer) for it.
// Returns silently, leaving m_database untouched, when the entry is missing,
// has no direct access offset, or the database cannot be opened.
// NOTE(review): set_database() is called with m_database before m_database
// is assigned at the end of this function -- confirm whether the query
// parser is expected to see the opened database.
void SuggestionDataBase::initXapianDb() {
  m_queryParser.set_database(m_database);
  m_queryParser.set_default_op(Xapian::Query::op::OP_AND);

  auto impl = m_archive.getImpl();
  FileImpl::FindxResult r;

  r = impl->findx('X', "title/xapian");
  if (!r.first) {
    return;
  }

  auto xapianEntry = Entry(impl, entry_index_type(r.second));
  auto accessInfo = xapianEntry.getItem().getDirectAccessInformation();
  if (accessInfo.second == 0) {
    return;
  }

  Xapian::Database database;
  if (!getDbFromAccessInfo(accessInfo, database)) {
    return;
  }

  m_valuesmap = read_valuesmap(database.get_metadata("valuesmap"));
  auto language = database.get_metadata("language");
  if (language.empty() ) {
    // Database created before 2017/03 has no language metadata.
    // However, term were stemmed anyway and we need to stem our
    // search query the same the database was created.
    // So we need a language, let's use the one of the zim.
    // If zimfile has no language metadata, we can't do lot more here :/
    try {
      language = m_archive.getMetadata("Language");
    } catch(...) {}
  }
  if (!language.empty()) {
    icu::Locale languageLocale(language.c_str());
    /* Configuring language base stemming */
    try {
      m_stemmer = Xapian::Stem(languageLocale.getLanguage());
      m_queryParser.set_stemmer(m_stemmer);
    } catch (...) {
      // No stemmer for this language: queries are parsed without stemming.
      std::cout << "No stemming for language '" << languageLocale.getLanguage() << "'" << std::endl;
    }
  }

  m_database = database;
}

// True when a Xapian database has been attached to m_database.
bool SuggestionDataBase::hasDatabase() const
{
  return !m_database.internal.empty();
}

// True when the database provided a non-empty valuesmap.
bool SuggestionDataBase::hasValuesmap() const
{
  return !m_valuesmap.empty();
}

// True when `valueName` is a key of the valuesmap.
bool SuggestionDataBase::hasValue(const std::string& valueName) const
{
  return (m_valuesmap.find(valueName) != m_valuesmap.end());
}

// Returns the slot associated with `valueName`; std::map::at throws
// std::out_of_range for an unknown name.
int SuggestionDataBase::valueSlot(const std::string& valueName) const
{
  return m_valuesmap.at(valueName);
}

/*
 * subquery_phrase: selects documents that have the terms in the order of the query
 * within a specified window.
 * subquery_anchored: selects documents that have the terms in the order of the
 * query within a specified window and starts from the beginning of the document.
 * subquery_and: selects documents that have all the terms in the query.
 *
 * subquery_phrase and subquery_anchored by themselves are quite exclusive. To
 * include more "similar" docs, we combine them with subquery_and using OP_OR
 * operator. If a particular document has a weight of A in subquery_and and B
 * in subquery_phrase and C in subquery_anchored, the net weight of that document
 * becomes A+B+C (normalised out of 100). So the documents closer to the query
 * gets a higher relevance.
*/
// Parses `query` into a Xapian query (see the comment above for how the
// sub-queries are combined). Serialised with m_mutex because the stemming
// strategy of the shared query parser is changed while parsing.
Xapian::Query SuggestionDataBase::parseQuery(const std::string& query)
{
  std::lock_guard locker(m_mutex);
  Xapian::Query xquery;

  const auto flags = Xapian::QueryParser::FLAG_DEFAULT
                   | Xapian::QueryParser::FLAG_PARTIAL
                   | Xapian::QueryParser::FLAG_CJK_NGRAM;

  // Reset stemming strategy for normal parsing
  m_queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
  xquery = m_queryParser.parse_query(query, flags);

  if ( !query.empty() && xquery.get_num_subqueries() == 0 ) {
    // a non-empty query string produced an empty xapian query which means
    // that the query string is made solely of punctuation.
    xquery = Xapian::Query(Xapian::Query::OP_WILDCARD, query);
  } else if (!query.empty()) {
    // Reconfigure stemming strategy for phrase search
    m_queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_NONE);

    Xapian::Query subquery_phrase = m_queryParser.parse_query(query, Xapian::QueryParser::FLAG_CJK_NGRAM);
    // Force the OP_PHRASE window to be equal to the number of terms.
    subquery_phrase = Xapian::Query(Xapian::Query::OP_PHRASE, subquery_phrase.get_terms_begin(), subquery_phrase.get_terms_end(), subquery_phrase.get_length());

    // Same phrase query, with ANCHOR_TERM prepended to the query string.
    auto qs = ANCHOR_TERM + query;
    Xapian::Query subquery_anchored = m_queryParser.parse_query(qs, Xapian::QueryParser::FLAG_CJK_NGRAM);
    subquery_anchored = Xapian::Query(Xapian::Query::OP_PHRASE, subquery_anchored.get_terms_begin(), subquery_anchored.get_terms_end(), subquery_anchored.get_length());

    xquery = Xapian::Query(Xapian::Query::OP_OR, xquery, subquery_phrase);
    xquery = Xapian::Query(Xapian::Query::OP_OR, xquery, subquery_anchored);
  }

  return xquery;
}

#endif // ENABLE_XAPIAN

// Creates a searcher for `archive`; the database is only built on the first
// call to suggest().
SuggestionSearcher::SuggestionSearcher(const Archive& archive)
  : mp_internalDb(nullptr),
    m_archive(archive),
    m_verbose(false)
{}

SuggestionSearcher::SuggestionSearcher(const SuggestionSearcher& other) = default;
SuggestionSearcher& SuggestionSearcher::operator=(const SuggestionSearcher& other) = default;
SuggestionSearcher::SuggestionSearcher(SuggestionSearcher&& other) = default;
SuggestionSearcher& SuggestionSearcher::operator=(SuggestionSearcher&& other) = default;
SuggestionSearcher::~SuggestionSearcher() = default;

// Returns a search object for `query`, building the database on first use.
SuggestionSearch SuggestionSearcher::suggest(const std::string& query)
{
  if (!mp_internalDb) {
    initDatabase();
  }
  return SuggestionSearch(mp_internalDb, query);
}

// Sets the verbosity used when the database is built by initDatabase().
void SuggestionSearcher::setVerbose(bool verbose)
{
  m_verbose = verbose;
}

void SuggestionSearcher::initDatabase()
{
  mp_internalDb = std::make_shared(m_archive, m_verbose);
}

// Stores the database and the query; the Xapian enquire object is created
// lazily by getEnquire().
SuggestionSearch::SuggestionSearch(std::shared_ptr p_internalDb, const std::string& query)
  : mp_internalDb(p_internalDb),
    m_query(query)
#if defined(ENABLE_XAPIAN)
  , mp_enquire(nullptr)
#endif // ENABLE_XAPIAN
{}

SuggestionSearch::SuggestionSearch(SuggestionSearch&& s) = default;
SuggestionSearch& SuggestionSearch::operator=(SuggestionSearch&& s) = default;
SuggestionSearch::~SuggestionSearch() = default;

// Returns the estimated number of matches. Uses the Xapian index when a
// database is available; on any exception, or without a database, falls
// back to the size of the archive's title range for the query.
int SuggestionSearch::getEstimatedMatches() const
{
#if defined(ENABLE_XAPIAN)
  if (mp_internalDb->hasDatabase()) {
    try {
      auto enquire = getEnquire();
      // Force xapian to check at least 10 documents even if we ask for an empty mset.
      // Else, the get_matches_estimated may be wrong and return 0 even if we have results.
      auto mset = enquire.get_mset(0, 0, 10);
      return mset.get_matches_estimated();
    } catch(...) {
      std::cerr << "Query Parsing failed, Switching to search without index." << std::endl;
    }
  }
#endif // ENABLE_XAPIAN

  return mp_internalDb->m_archive.findByTitle(m_query).size();
}

// Returns up to `maxResults` results starting at `start`. Uses the Xapian
// index when a database is available; on any exception, or without a
// database, falls back to the archive's title range for the query.
const SuggestionResultSet SuggestionSearch::getResults(int start, int maxResults) const {
#if defined(ENABLE_XAPIAN)
  if (mp_internalDb->hasDatabase())
  {
    try {
      auto enquire = getEnquire();
      auto mset = enquire.get_mset(start, maxResults);
      return SuggestionResultSet(mp_internalDb, std::move(mset));
    } catch(...) {
      std::cerr << "Query Parsing failed, Switching to search without index." << std::endl;
    }
  }
#endif // ENABLE_XAPIAN

  auto entryRange = mp_internalDb->m_archive.findByTitle(m_query);
  entryRange.offset(start, maxResults);
  return SuggestionResultSet(entryRange);
}

// Closes the Xapian database held by the shared SuggestionDataBase.
// NOTE(review): presumably later Xapian calls then throw and the catch(...)
// blocks above switch to the title range -- confirm.
const void SuggestionSearch::forceRangeSuggestion() {
#if defined(ENABLE_XAPIAN)
  mp_internalDb->m_database.close();
#endif // ENABLE_XAPIAN
}

#if defined(ENABLE_XAPIAN)
// Returns the enquire object for this search, creating and configuring it
// on first use (accent-stripped query, weighting scheme, sort and collapse
// keys) and caching it in mp_enquire.
Xapian::Enquire& SuggestionSearch::getEnquire() const
{
  if ( mp_enquire ) {
    return *mp_enquire;
  }

  auto enquire = std::unique_ptr(new Xapian::Enquire(mp_internalDb->m_database));

  const auto unaccentedQuery = removeAccents(m_query);
  auto query = mp_internalDb->parseQuery(unaccentedQuery);
  if (mp_internalDb->m_verbose) {
    std::cout << "Parsed query '" << unaccentedQuery << "' to " << query.get_description() << std::endl;
  }
  enquire->set_query(query);

  /*
   * In suggestion mode, we are searching over a separate title index. Default BM25 is not
   * adapted for this case. WDF factor(k1) controls the effect of within document frequency.
   * k1 = 0.001 reduces the effect of word repetition in document. In BM25, smaller documents
   * get larger weights, so normalising the length of documents is necessary using b = 1.
   * The document set is first sorted by their relevance score then by value so that suggestion
   * results are closer to search string.
   * refer https://xapian.org/docs/apidoc/html/classXapian_1_1BM25Weight.html
   */
  enquire->set_weighting_scheme(Xapian::BM25Weight(0.001,0,1,1,0.5));

  if (mp_internalDb->hasValue("title")) {
    enquire->set_sort_by_relevance_then_value(mp_internalDb->valueSlot("title"), false);
  }

  if (mp_internalDb->hasValue("targetPath")) {
    enquire->set_collapse_key(mp_internalDb->valueSlot("targetPath"));
  }

  mp_enquire = std::move(enquire);
  return *mp_enquire;
}

// Result set backed by a Xapian match set.
// NOTE(review): `mset` is a named rvalue reference, so it is passed to
// make_shared as an lvalue and copied rather than moved.
SuggestionResultSet::SuggestionResultSet(std::shared_ptr p_internalDb, Xapian::MSet&& mset) :
  mp_internalDb(p_internalDb),
  mp_entryRange(nullptr),
  mp_mset(std::make_shared(mset))
{}
#endif // ENABLE_XAPIAN

// Result set backed by a title range of the archive.
SuggestionResultSet::SuggestionResultSet(EntryRange entryRange) :
  mp_internalDb(nullptr),
  mp_entryRange(std::unique_ptr(new EntryRange(entryRange)))
#if defined(ENABLE_XAPIAN)
  , mp_mset(nullptr)
#endif // ENABLE_XAPIAN
{}

// Number of results: size of the match set when there is no entry range,
// size of the entry range otherwise.
int SuggestionResultSet::size() const
{
#if defined(ENABLE_XAPIAN)
  if (! mp_entryRange) {
    return mp_mset->size();
  }
#endif // ENABLE_XAPIAN
  return mp_entryRange->size();
}

// Iterator on the first result, built on whichever backend is in use.
SuggestionResultSet::iterator SuggestionResultSet::begin() const
{
#if defined(ENABLE_XAPIAN)
  if ( ! mp_entryRange ) {
    return new iterator::SuggestionInternalData(mp_internalDb, mp_mset, mp_mset->begin());
  }
#endif // ENABLE_XAPIAN
  return iterator(mp_entryRange->begin());
}

// Past-the-end iterator, built on whichever backend is in use.
SuggestionResultSet::iterator SuggestionResultSet::end() const
{
#if defined(ENABLE_XAPIAN)
  if ( ! mp_entryRange ) {
    return new iterator::SuggestionInternalData(mp_internalDb, mp_mset, mp_mset->end());
  }
#endif // ENABLE_XAPIAN
  return iterator(mp_entryRange->end());
}

} // namespace zim
libzim-9.2.3/src/suggestion_internal.h000066400000000000000000000105311466367137100200540ustar00rootroot00000000000000/*
 * Copyright (C) 2021 Matthieu Gautier
 * Copyright (C) 2021 Maneesh P M
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
 * NON-INFRINGEMENT. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#ifndef ZIM_SUGGESTION_INTERNAL_H
#define ZIM_SUGGESTION_INTERNAL_H

#include "zim/suggestion.h"
#include "zim/archive.h"

#include
#include

#if defined(LIBZIM_WITH_XAPIAN)
#include
#endif

namespace zim
{

/**
 * A class to encapsulate a xapian title index and its archive and all the
 * information we can gather from it.
 */
class SuggestionDataBase {
  public: // methods
    SuggestionDataBase(const Archive& archive, bool verbose);

  public: // data
    // The archive to get suggestions from.
    Archive m_archive;

    // Verbosity of operations.
    bool m_verbose;

  private: // data
    // Taken by parseQuery() while it uses the shared query parser.
    std::mutex m_mutex;

#if defined(LIBZIM_WITH_XAPIAN)

  public: // xapian based methods
    bool hasDatabase() const;
    bool hasValuesmap() const;
    bool hasValue(const std::string& valueName) const;
    int valueSlot(const std::string& valueName) const;

    Xapian::Query parseQuery(const std::string& query);

  public: // xapian based data
    // The Xapian database we will search on.
    Xapian::Database m_database;

    // The valuesmap associated with the database.
    std::map m_valuesmap;

    // The query parser corresponding to the database.
    Xapian::QueryParser m_queryParser;

    // The stemmer used to parse queries
    Xapian::Stem m_stemmer;

  private:
    void initXapianDb();
#endif // LIBZIM_WITH_XAPIAN
};

#if defined(LIBZIM_WITH_XAPIAN)
/**
 * Xapian-backed state of a SuggestionIterator: the shared database and
 * match set, the current position, and lazily filled caches for the Xapian
 * document and the zim Entry of that position.
 */
struct SuggestionIterator::SuggestionInternalData {
    std::shared_ptr mp_internalDb;
    std::shared_ptr mp_mset;
    Xapian::MSetIterator iterator;
    // Cache for get_document(); only meaningful while document_fetched is true.
    Xapian::Document _document;
    bool document_fetched;
    // Cache for get_entry(); null until the entry has been resolved.
    std::unique_ptr _entry;

    // Copy constructor: shares the database and match set, deep-copies the
    // cached entry (if any).
    SuggestionInternalData(const SuggestionInternalData& other) :
      mp_internalDb(other.mp_internalDb),
      mp_mset(other.mp_mset),
      iterator(other.iterator),
      _document(other._document),
      document_fetched(other.document_fetched),
      _entry(other._entry ? new Entry(*other._entry) : nullptr )
    {
    }

    // Copy assignment, guarded against self-assignment.
    SuggestionInternalData& operator=(const SuggestionInternalData& other)
    {
      if (this != &other) {
        mp_internalDb = other.mp_internalDb;
        mp_mset = other.mp_mset;
        iterator = other.iterator;
        _document = other._document;
        document_fetched = other.document_fetched;
        _entry.reset(other._entry ? new Entry(*other._entry) : nullptr);
      }
      return *this;
    }

    // Builds the state for a given position of a match set; both caches
    // start empty.
    SuggestionInternalData(std::shared_ptr p_internalDb, std::shared_ptr p_mset, Xapian::MSetIterator iterator) :
        mp_internalDb(p_internalDb),
        mp_mset(p_mset),
        iterator(iterator),
        document_fetched(false)
    {};

    // Returns the Xapian document at the current position, fetching it on
    // first use. Throws std::runtime_error when positioned at the end.
    Xapian::Document get_document() {
        if ( !document_fetched ) {
            if (iterator == mp_mset->end()) {
                throw std::runtime_error("Cannot get entry for end iterator");
            }
            _document = iterator.get_document();
            document_fetched = true;
        }
        return _document;
    }

    // Returns the zim Entry of the current result, resolving it on first use
    // by looking up the document data as a path in the archive.
    Entry& get_entry() {
        if (!_entry) {
            _entry.reset(new Entry(mp_internalDb->m_archive.getEntryByPath(get_document().get_data())));
        }
        return *_entry.get();
    }

    // Equal when both share the same database and match set objects and
    // point at the same position. The caches are not compared.
    bool operator==(const SuggestionInternalData& other) const {
        return (mp_internalDb == other.mp_internalDb
            &&  mp_mset == other.mp_mset
            &&  iterator == other.iterator);
    }
};
#endif // LIBZIM_WITH_XAPIAN

}

#endif // ZIM_SUGGESTION_INTERNAL_H
libzim-9.2.3/src/suggestion_iterator.cpp000066400000000000000000000155401466367137100204310ustar00rootroot00000000000000/*
 * Copyright (C) 2021 Maneesh P M
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
 * NON-INFRINGEMENT. See the GNU General Public License for more details.
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #define ZIM_PRIVATE #include "zim/suggestion_iterator.h" #include "suggestion_internal.h" #include namespace zim { SuggestionIterator::~SuggestionIterator() = default; SuggestionIterator::SuggestionIterator(SuggestionIterator&& it) = default; SuggestionIterator& SuggestionIterator::operator=(SuggestionIterator&& it) = default; SuggestionIterator::SuggestionIterator(RangeIterator rangeIterator) : mp_rangeIterator(std::unique_ptr(new RangeIterator(rangeIterator))) #if defined(LIBZIM_WITH_XAPIAN) , mp_internal(nullptr) #endif // LIBZIM_WITH_XAPIAN {} #if defined(LIBZIM_WITH_XAPIAN) SuggestionIterator::SuggestionIterator(SuggestionInternalData* internal) : mp_rangeIterator(nullptr), mp_internal(internal) {} #endif // LIBZIM_WITH_XAPIAN SuggestionIterator::SuggestionIterator(const SuggestionIterator& it) : mp_rangeIterator(nullptr) { #if defined(LIBZIM_WITH_XAPIAN) mp_internal.reset(nullptr); if (it.mp_internal) { mp_internal = std::unique_ptr(new SuggestionInternalData(*it.mp_internal)); } #endif // LIBZIM_WITH_XAPIAN if (it.mp_rangeIterator) { mp_rangeIterator = std::unique_ptr(new RangeIterator(*it.mp_rangeIterator)); } } SuggestionIterator& SuggestionIterator::operator=(const SuggestionIterator& it) { mp_rangeIterator.reset(); if (it.mp_rangeIterator) { mp_rangeIterator.reset(new RangeIterator(*it.mp_rangeIterator)); } #if defined(LIBZIM_WITH_XAPIAN) mp_internal.reset(); if (it.mp_internal) { mp_internal.reset(new SuggestionInternalData(*it.mp_internal)); } #endif // LIBZIM_WITH_XAPIAN m_suggestionItem.reset(); return *this; } bool SuggestionIterator::operator==(const SuggestionIterator& it) const { if (mp_rangeIterator && it.mp_rangeIterator) { return (*mp_rangeIterator == *it.mp_rangeIterator); } #if defined(LIBZIM_WITH_XAPIAN) if (mp_internal && 
it.mp_internal) { return (*mp_internal == *it.mp_internal); } #endif // LIBZIM_WITH_XAPIAN return false; } bool SuggestionIterator::operator!=(const SuggestionIterator& it) const { return ! (*this == it); } SuggestionIterator& SuggestionIterator::operator++() { #if defined(LIBZIM_WITH_XAPIAN) if (mp_internal) { ++(mp_internal->iterator); mp_internal->_entry.reset(); mp_internal->document_fetched = false; } #endif // LIBZIM_WITH_XAPIAN if (mp_rangeIterator) { ++(*mp_rangeIterator); } m_suggestionItem.reset(); return *this; } SuggestionIterator SuggestionIterator::operator++(int) { SuggestionIterator it = *this; operator++(); return it; } SuggestionIterator& SuggestionIterator::operator--() { #if defined(LIBZIM_WITH_XAPIAN) if (mp_internal) { --(mp_internal->iterator); mp_internal->_entry.reset(); mp_internal->document_fetched = false; } #endif // LIBZIM_WITH_XAPIAN if (mp_rangeIterator) { --(*mp_rangeIterator); } m_suggestionItem.reset(); return *this; } SuggestionIterator SuggestionIterator::operator--(int) { SuggestionIterator it = *this; operator--(); return it; } Entry SuggestionIterator::getEntry() const { #if defined(LIBZIM_WITH_XAPIAN) if (mp_internal) { try { return mp_internal->get_entry(); } catch ( Xapian::DatabaseError& e) { throw ZimFileFormatError(e.get_description()); } } #endif // LIBZIM_WITH_XAPIAN if (mp_rangeIterator) { return **mp_rangeIterator; } throw std::runtime_error("Cannot dereference iterator"); } #if defined(LIBZIM_WITH_XAPIAN) std::string SuggestionIterator::getDbData() const { if (! mp_internal) { return ""; } try { return mp_internal->get_document().get_data(); } catch ( Xapian::DatabaseError& e) { throw ZimFileFormatError(e.get_description()); } } std::string SuggestionIterator::getIndexPath() const { if (! 
mp_internal) { return ""; } try { std::string path = mp_internal->get_document().get_data(); bool hasNewNamespaceScheme = mp_internal->mp_internalDb->m_archive.hasNewNamespaceScheme(); std::string dbDataType = mp_internal->mp_internalDb->m_database.get_metadata("data"); if (dbDataType.empty()) { dbDataType = "fullPath"; } // If the archive has new namespace scheme and the type of its indexed data // is `fullPath` we return only the `path` without namespace if (hasNewNamespaceScheme && dbDataType == "fullPath") { path = path.substr(2); } return path; } catch ( Xapian::DatabaseError& e) { throw ZimFileFormatError(e.get_description()); } } std::string SuggestionIterator::getIndexTitle() const { if ( ! mp_internal) { return ""; } try { return mp_internal->get_entry().getTitle(); } catch (...) { return ""; } } std::string SuggestionIterator::getIndexSnippet() const { if (! mp_internal) { return ""; } try { return mp_internal->mp_mset->snippet(getIndexTitle(), 500, mp_internal->mp_internalDb->m_stemmer); } catch(...) 
{ return ""; } } #endif // LIBZIM_WITH_XAPIAN const SuggestionItem& SuggestionIterator::operator*() { if (m_suggestionItem) { return *m_suggestionItem; } #if defined(LIBZIM_WITH_XAPIAN) if (mp_internal) { m_suggestionItem.reset(new SuggestionItem(getIndexTitle(), getIndexPath(), getIndexSnippet())); } else #endif // LIBZIM_WITH_XAPIAN if (mp_rangeIterator) { m_suggestionItem.reset(new SuggestionItem((*mp_rangeIterator)->getTitle(), (*mp_rangeIterator)->getPath())); } if (!m_suggestionItem){ throw std::runtime_error("Cannot dereference iterator"); } return *m_suggestionItem.get(); } const SuggestionItem* SuggestionIterator::operator->() { operator*(); return m_suggestionItem.get(); } } // namespace zim libzim-9.2.3/src/tools.cpp000066400000000000000000000214021466367137100154630ustar00rootroot00000000000000/* * Copyright (C) 2016-2021 Matthieu Gautier * Copyright (C) 2021 Maneeshs P M * Copyright (C) 2013-2016 Emmanuel Engelhart * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. 
*/ #include "tools.h" #include "zim/tools.h" #include "fs.h" #include #include #include #include #include #include #include #include #include #ifdef _WIN32 # include # include # include # include # define SEPARATOR "\\" #else # include # define SEPARATOR "/" #endif #ifdef __MINGW32__ # include #else # include # include #endif bool zim::isCompressibleMimetype(const std::string& mimetype) { return mimetype.find("text") == 0 || mimetype.find("+xml") != std::string::npos || mimetype.find("+json") != std::string::npos || mimetype == "application/javascript" || mimetype == "application/json"; } uint32_t zim::countWords(const std::string& text) { unsigned int numWords = 0; unsigned int length = text.size(); unsigned int i = 0; // Find first word while ( i < length && std::isspace(static_cast(text[i])) ) i++; while ( i < length ) { // Find end of word while ( i < length && !std::isspace(static_cast(text[i])) ) i++; numWords++; // Find start of next word while ( i < length && std::isspace(static_cast(text[i])) ) i++; } return numWords; } void zim::microsleep(int microseconds) { #ifdef __MINGW32__ struct timespec wait = {0, 0}; wait.tv_sec = microseconds / 1000000; wait.tv_nsec = (microseconds - wait.tv_sec*10000) * 1000; nanosleep(&wait, nullptr); #else std::this_thread::sleep_for(std::chrono::microseconds(microseconds)); #endif } std::tuple zim::parseLongPath(const std::string& longPath) { /* Index of the namespace char; discard '/' from absolute paths */ const unsigned int i = (longPath[0] == '/') ? 
1 : 0; if (i + 1 > longPath.size() || longPath[i] == '/' || (i + 1 < longPath.size() && longPath[i+1] != '/')) throw std::runtime_error("Cannot parse path"); auto ns = longPath[i]; auto shortPath = longPath.substr(std::min(i+2, (unsigned int)longPath.size())); return std::make_tuple(ns, shortPath); } unsigned int zim::parseIllustrationPathToSize(const std::string& s) { int nw(0), nh(0), nEnd(0); long int w(-1), h(-1); if ( sscanf(s.c_str(), "Illustration_%n%ldx%n%ld@1%n)", &nw, &w, &nh, &h, &nEnd) == 2 && (size_t)nEnd == s.size() && !isspace(s[nw]) && !isspace(s[nh]) && w == h && w >= 0) { return (unsigned int)w; } throw std::runtime_error(""); } uint32_t zim::randomNumber(uint32_t max) { static std::default_random_engine random( std::chrono::system_clock::now().time_since_epoch().count()); static std::mutex mutex; std::lock_guard l(mutex); return (uint32_t)(((double)random() / random.max()) * max); } /* Split string in a token array */ std::vector zim::split(const std::string & str, const std::string & delims) { std::string::size_type lastPos = str.find_first_not_of(delims, 0); std::string::size_type pos = str.find_first_of(delims, lastPos); std::vector tokens; while (std::string::npos != pos || std::string::npos != lastPos) { tokens.push_back(str.substr(lastPos, pos - lastPos)); lastPos = str.find_first_not_of(delims, pos); pos = str.find_first_of(delims, lastPos); } return tokens; } std::map zim::read_valuesmap(const std::string &s) { std::map result; std::vector elems = split(s, ";"); for(std::vector::iterator elem = elems.begin(); elem != elems.end(); elem++) { std::vector tmp_elems = split(*elem, ":"); result.insert( std::pair(tmp_elems[0], atoi(tmp_elems[1].c_str())) ); } return result; } namespace { // The counter metadata format is a list of item separated by a `;` : // item0;item1;item2 // Each item is a "tuple" mimetype=number. 
// However, the mimetype may contains parameters: // text/html;raw=true;foo=bar // So the final format may be complex to parse: // key0=value0;key1;foo=bar=value1;key2=value2 typedef zim::MimeCounterType::value_type MimetypeAndCounter; std::string readFullMimetypeAndCounterString(std::istream& in) { std::string mtcStr, params; getline(in, mtcStr, ';'); if ( mtcStr.find('=') == std::string::npos ) { do { if ( !getline(in, params, ';' ) ) return std::string(); mtcStr += ";" + params; } while ( std::count(params.begin(), params.end(), '=') != 2 ); } return mtcStr; } MimetypeAndCounter parseASingleMimetypeCounter(const std::string& s) { const std::string::size_type k = s.find_last_of("="); if ( k != std::string::npos ) { const std::string mimeType = s.substr(0, k); std::istringstream counterSS(s.substr(k+1)); unsigned int counter; if (counterSS >> counter && counterSS.eof()) return std::make_pair(mimeType, counter); } return MimetypeAndCounter{"", 0}; } } // unnamed namespace zim::MimeCounterType zim::parseMimetypeCounter(const std::string& counterData) { zim::MimeCounterType counters; std::istringstream ss(counterData); while (ss) { const std::string mtcStr = readFullMimetypeAndCounterString(ss); const MimetypeAndCounter mtc = parseASingleMimetypeCounter(mtcStr); if ( !mtc.first.empty() ) counters.insert(mtc); } return counters; } // Xapian based tools #if defined(ENABLE_XAPIAN) #include "xapian.h" #include #include #include #define BATCH_SIZE (4*1024) std::string zim::removeAccents(const std::string& text) { ucnv_setDefaultName("UTF-8"); static UErrorCode status = U_ZERO_ERROR; static std::unique_ptr removeAccentsTrans(icu::Transliterator::createInstance( "Lower; NFD; [:M:] remove; NFC", UTRANS_FORWARD, status)); icu::UnicodeString ustring(text.c_str()); std::string unaccentedText; auto nb_chars = ustring.length(); if (nb_chars <= BATCH_SIZE) { // Remove accents in one step. 
removeAccentsTrans->transliterate(ustring); ustring.toUTF8String(unaccentedText); } else { auto current_pos = 0; icu::UnicodeString current_ustring; while (current_pos < nb_chars) { // Remove accents by batch of BATCH_SIZE "chars" to avoid working on // a too long string and spending to much time memcpy things. auto end = ustring.getChar32Limit(current_pos+BATCH_SIZE); auto current_size = end - current_pos; current_ustring.remove(); ustring.extract(current_pos, current_size, current_ustring); removeAccentsTrans->transliterate(current_ustring); current_ustring.toUTF8String(unaccentedText); current_pos += current_size; } } return unaccentedText; } bool zim::getDbFromAccessInfo(zim::Item::DirectAccessInfo accessInfo, Xapian::Database& database) { zim::DEFAULTFS::FD databasefd; try { databasefd = zim::DEFAULTFS::openFile(accessInfo.first); } catch (...) { std::cerr << "Impossible to open " << accessInfo.first << std::endl; std::cerr << strerror(errno) << std::endl; return false; } if (!databasefd.seek(zim::offset_t(accessInfo.second))) { std::cerr << "Something went wrong seeking databasedb " << accessInfo.first << std::endl; std::cerr << "dbOffest = " << accessInfo.second << std::endl; return false; } try { database = Xapian::Database(databasefd.release()); } catch( Xapian::DatabaseError& e) { std::cerr << "Something went wrong opening xapian database for zimfile " << accessInfo.first << std::endl; std::cerr << "dbOffest = " << accessInfo.second << std::endl; std::cerr << "error = " << e.get_msg() << std::endl; return false; } return true; } void zim::setICUDataDirectory(const std::string& path) { u_setDataDirectory(path.c_str()); } #endif libzim-9.2.3/src/tools.h000066400000000000000000000052131466367137100151320ustar00rootroot00000000000000/* * Copyright (C) 2016-2020 Matthieu Gautier * Copyright (C) 2021 Maneesh P M * Copyright (C) 2013-2016 Emmanuel Engelhart * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU 
General Public License as published by * the Free Software Foundation; either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. */ #ifndef OPENZIM_LIBZIM_TOOLS_H #define OPENZIM_LIBZIM_TOOLS_H #include #include #include #include #include "config.h" #include #if defined(ENABLE_XAPIAN) namespace Xapian { class Database; } #endif // ENABLE_XAPIAN namespace zim { bool isCompressibleMimetype(const std::string& mimetype); uint32_t LIBZIM_PRIVATE_API countWords(const std::string& text); void LIBZIM_PRIVATE_API microsleep(int microseconds); std::tuple LIBZIM_PRIVATE_API parseLongPath(const std::string& longPath); // Parse a illustration path ("Illustration_x@1") to a size. 
unsigned int LIBZIM_PRIVATE_API parseIllustrationPathToSize(const std::string& s); /** Return a random number from range [0, max] * * This function is threadsafe **/ uint32_t LIBZIM_PRIVATE_API randomNumber(uint32_t max); std::vector split(const std::string & str, const std::string & delims=" *-"); std::map read_valuesmap(const std::string& s); using MimeCounterType = std::map; MimeCounterType LIBZIM_PRIVATE_API parseMimetypeCounter(const std::string& counterData); template entry_index_type countMimeType(const std::string& counterData, Filter filter) { entry_index_type count = 0; for (auto& pair: parseMimetypeCounter(counterData)) { if (filter(pair.first)) { count += pair.second; } } return count; } // Xapian based tools #if defined(ENABLE_XAPIAN) std::string LIBZIM_PRIVATE_API removeAccents(const std::string& text); bool getDbFromAccessInfo(Item::DirectAccessInfo accessInfo, Xapian::Database& database); #endif } #endif // OPENZIM_LIBZIM_TOOLS_H libzim-9.2.3/src/uuid.cpp000066400000000000000000000056521466367137100153020ustar00rootroot00000000000000/* * Copyright (C) 2021 Maneesh P M * Copyright (C) 2018-2020 Matthieu Gautier * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include #include #include #include // necessary to have the new types #include "log.h" #include "md5.h" #ifdef _WIN32 # include # include int gettimeofday(struct timeval* tp, void* tzp) { DWORD t; t = timeGetTime(); tp->tv_sec = t / 1000; tp->tv_usec = t % 1000; return 0; } #define getpid GetCurrentProcessId #else # include #endif log_define("zim.uuid") namespace zim { namespace { char hex[] = "0123456789abcdef"; inline char hi(char v) { return hex[(v >> 4) & 0xf]; } inline char lo(char v) { return hex[v & 0xf]; } } Uuid Uuid::generate(std::string value) { Uuid ret; struct zim_MD5_CTX md5ctx; zim_MD5Init(&md5ctx); if ( value.empty() ) { struct timeval tv; gettimeofday(&tv, 0); clock_t c = clock(); zim_MD5Update(&md5ctx, reinterpret_cast(&c), sizeof(clock_t)); zim_MD5Update(&md5ctx, reinterpret_cast(&tv), sizeof(struct timeval)); } else { zim_MD5Update(&md5ctx, reinterpret_cast(value.data()), value.size()); } zim_MD5Final(reinterpret_cast(&ret.data[0]), &md5ctx); log_debug("generated uuid: " << ret.data); return ret; } Uuid::operator std::string() const { std::ostringstream out; zim::operator<<(out, *this); return out.str(); } std::ostream& operator<< (std::ostream& out, const Uuid& uuid) { for (unsigned n = 0; n < 4; ++n) out << hi(uuid.data[n]) << lo(uuid.data[n]); out << '-'; for (unsigned n = 4; n < 6; ++n) out << hi(uuid.data[n]) << lo(uuid.data[n]); out << '-'; for (unsigned n = 6; n < 8; ++n) out << hi(uuid.data[n]) << lo(uuid.data[n]); out << '-'; for (unsigned n = 8; n < 10; ++n) out << hi(uuid.data[n]) << lo(uuid.data[n]); out << '-'; for (unsigned n = 10; n < 16; ++n) out << hi(uuid.data[n]) << lo(uuid.data[n]); return out; } } libzim-9.2.3/src/version.cpp000066400000000000000000000037041466367137100160150ustar00rootroot00000000000000/* 
* Copyright (C) 2021 Emmanuel Engelhart * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include #include #include #include #include #include #if defined(ENABLE_XAPIAN) #include #include #endif namespace zim { LibVersions getVersions() { LibVersions versions = { { "libzim", LIBZIM_VERSION }, { "libzstd", ZSTD_VERSION_STRING }, { "liblzma", LZMA_VERSION_STRING } }; #if defined(ENABLE_XAPIAN) // Libxapian is not a mandatory dependence versions.push_back({ "libxapian", XAPIAN_VERSION }); // U_ICU_VERSION does not include the patch level if 0 versions.push_back({"libicu", Formatter() << U_ICU_VERSION_MAJOR_NUM << "." << U_ICU_VERSION_MINOR_NUM << "." << U_ICU_VERSION_PATCHLEVEL_NUM}); #endif return versions; } void printVersions(std::ostream& out) { LibVersions versions = getVersions(); for (const auto& iter : versions) { out << (iter != versions.front() ? 
"+ " : "") << iter.first << " " << iter.second << std::endl; } } } //namespace zim libzim-9.2.3/src/writer/000077500000000000000000000000001466367137100151345ustar00rootroot00000000000000libzim-9.2.3/src/writer/_dirent.h000066400000000000000000000201401466367137100167260ustar00rootroot00000000000000/* * Copyright (C) 2018-2021 Matthieu Gautier * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_WRITER_DIRENT_H #define ZIM_WRITER_DIRENT_H #include "cluster.h" #include "tinyString.h" #include "debug.h" namespace zim { namespace writer { class Dirent; // Be sure that enum value are sorted by "alphabetical" order enum class NS: uint8_t { C = 0, M = 1, W = 2, X = 3 }; char NsAsChar(NS ns); class DirentInfo { public: // structures struct Direct { Direct() : cluster(nullptr), blobNumber(0) {}; Cluster* cluster; blob_index_t blobNumber; } PACKED; struct Redirect { Redirect(NS ns, const std::string& target) : targetPath(target), ns(ns) {}; Redirect(Redirect&& r) = default; Redirect(const Redirect& r) = default; ~Redirect() {}; TinyString targetPath; NS ns; } PACKED; struct Resolved { Resolved(const Dirent* target) : targetDirent(target) {}; const Dirent* targetDirent; } PACKED; public: // functions ~DirentInfo() { switch(tag) { case DIRECT: 
direct.~Direct(); break; case REDIRECT: redirect.~Redirect(); break; case RESOLVED: resolved.~Resolved(); break; } }; DirentInfo(Direct&& d): direct(std::move(d)), tag(DirentInfo::DIRECT) {} DirentInfo(Redirect&& r): redirect(std::move(r)), tag(DirentInfo::REDIRECT) {} DirentInfo(Resolved&& r): resolved(std::move(r)), tag(DirentInfo::RESOLVED) {} DirentInfo(const DirentInfo& other): tag(other.tag) { switch (tag) { case DIRECT: new(&direct) Direct(other.direct); break; case REDIRECT: new(&redirect) Redirect(other.redirect); break; case RESOLVED: new(&resolved) Resolved(other.resolved); break; } } DirentInfo::Direct& getDirect() { ASSERT(tag, ==, DIRECT); return direct; } DirentInfo::Redirect& getRedirect() { ASSERT(tag, ==, REDIRECT); return redirect; } DirentInfo::Resolved& getResolved() { ASSERT(tag, ==, RESOLVED); return resolved; } const DirentInfo::Direct& getDirect() const { ASSERT(tag, ==, DIRECT); return direct; } const DirentInfo::Redirect& getRedirect() const { ASSERT(tag, ==, REDIRECT); return redirect; } const DirentInfo::Resolved& getResolved() const { ASSERT(tag, ==, RESOLVED); return resolved; } private: // members union { Direct direct; Redirect redirect; Resolved resolved; } PACKED; public: // members enum : char {DIRECT, REDIRECT, RESOLVED} tag; } PACKED; class LIBZIM_PRIVATE_API Dirent { static const uint16_t redirectMimeType = 0xffff; static const uint32_t version = 0; PathTitleTinyString pathTitle; uint16_t mimeType; entry_index_t idx = entry_index_t(0); DirentInfo info; offset_t offset; uint8_t _ns : 2; bool removed : 1; bool frontArticle : 1; public: // Creator for a "classic" dirent Dirent(NS ns, const std::string& path, const std::string& title, uint16_t mimetype); // Creator for a "redirection" dirent Dirent(NS ns, const std::string& path, const std::string& title, NS targetNs, const std::string& targetPath); // Creator for a "alias" dirent. Reuse the namespace of the targeted Dirent. 
Dirent(const std::string& path, const std::string& title, const Dirent& target); // Creator for "temporary" dirent, used to search for dirent in container. // We use them in path ordered container so we only need to set the namespace and the path. // Other value are irrelevant. Dirent(NS ns, const std::string& path) : Dirent(ns, path, "", 0) { } NS getNamespace() const { return static_cast(_ns); } std::string getTitle() const { return pathTitle.getTitle(false); } std::string getRealTitle() const { return pathTitle.getTitle(true); } std::string getPath() const { return pathTitle.getPath(); } uint32_t getVersion() const { return version; } NS getRedirectNs() const; std::string getRedirectPath() const; void setRedirect(const Dirent* target) { ASSERT(info.tag, ==, DirentInfo::REDIRECT); info.~DirentInfo(); new(&info) DirentInfo(DirentInfo::Resolved(target)); } entry_index_t getRedirectIndex() const { return info.getResolved().targetDirent->getIdx(); } void setIdx(entry_index_t idx_) { idx = idx_; } entry_index_t getIdx() const { return idx; } void setCluster(zim::writer::Cluster* _cluster) { auto& direct = info.getDirect(); direct.cluster = _cluster; direct.blobNumber = _cluster->count(); } zim::writer::Cluster* getCluster() { return info.getDirect().cluster; } cluster_index_t getClusterNumber() const { auto& direct = info.getDirect(); return direct.cluster ? direct.cluster->getClusterIndex() : cluster_index_t(0); } blob_index_t getBlobNumber() const { return info.getDirect().blobNumber; } bool isRedirect() const { return mimeType == redirectMimeType; } bool isItem() const { return !isRedirect(); } uint16_t getMimeType() const { return mimeType; } void setMimeType(uint16_t m) { ASSERT(info.tag, ==, DirentInfo::DIRECT); mimeType = m; } size_t getDirentSize() const { return (isRedirect() ? 
12 : 16) + pathTitle.size() + 1; } offset_t getOffset() const { return offset; } void setOffset(offset_t o) { offset = o; } bool isRemoved() const { return removed; } void markRemoved() { removed = true; } bool isFrontArticle() const { return frontArticle; } void setFrontArticle() { frontArticle = true; } void write(int out_fd) const; friend bool comparePath(const Dirent* d1, const Dirent* d2); friend inline bool compareTitle(const Dirent* d1, const Dirent* d2); } PACKED; inline bool comparePath(const Dirent* d1, const Dirent* d2) { return d1->getNamespace() < d2->getNamespace() || (d1->getNamespace() == d2->getNamespace() && d1->getPath() < d2->getPath()); } inline bool compareTitle(const Dirent* d1, const Dirent* d2) { return d1->getNamespace() < d2->getNamespace() || (d1->getNamespace() == d2->getNamespace() && d1->getTitle() < d2->getTitle()); } } } #endif // ZIM_WRITER_DIRENT_H libzim-9.2.3/src/writer/cluster.cpp000066400000000000000000000142531466367137100173260ustar00rootroot00000000000000/* * Copyright (C) 2017-2021 Matthieu Gautier * Copyright (C) 2021 Veloman Yunkan * Copyright (C) 2020 Emmanuel Engelhart * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "cluster.h" #include "../log.h" #include "../endian_tools.h" #include "../debug.h" #include "../compression.h" #include #include #include #include #include #ifdef _WIN32 # include #else # include # define _write(fd, addr, size) ::write((fd), (addr), (size)) #endif const zim::size_type MAX_WRITE_SIZE(4UL*1024*1024*1024-1); namespace zim { namespace writer { Cluster::Cluster(Compression compression) : compression(compression), isExtended(false), _size(0) { blobOffsets.push_back(offset_t(0)); } Cluster::~Cluster() { if (compressed_data.data()) { delete[] compressed_data.data(); } } void Cluster::clear_data() { clear_raw_data(); clear_compressed_data(); } void Cluster::clear_raw_data() { Offsets().swap(blobOffsets); ClusterProviders().swap(m_providers); } void Cluster::clear_compressed_data() { if (compressed_data.data()) { delete[] compressed_data.data(); compressed_data = Blob(); } } void Cluster::close() { if (getCompression() != Compression::None) { // We must compress the content in a buffer. 
compress(); clear_raw_data(); } closed = true; } bool Cluster::isClosed() const{ return closed; } zsize_t Cluster::size() const { if (isClosed()) { throw std::runtime_error("oups"); } if (isExtended) { return zsize_t(blobOffsets.size() * sizeof(uint64_t)) + _size; } else { return zsize_t(blobOffsets.size() * sizeof(uint32_t)) + _size; } } template void Cluster::write_offsets(writer_t writer) const { size_type delta = blobOffsets.size() * sizeof(OFFSET_TYPE); char out_buf[sizeof(OFFSET_TYPE)]; for (auto offset : blobOffsets) { offset.v += delta; toLittleEndian(static_cast(offset.v), out_buf); writer(Blob(out_buf, sizeof(OFFSET_TYPE))); } } void Cluster::write_content(writer_t writer) const { if (isExtended) { write_offsets(writer); } else { write_offsets(writer); } write_data(writer); } void Cluster::compress() { auto comp = getCompression(); switch(comp) { case Compression::Zstd: { _compress(); break; } default: throw std::runtime_error("We cannot compress an uncompressed cluster"); }; } template void Cluster::_compress() { Compressor runner; bool first = true; auto writer = [&](const Blob& data) -> void { if (first) { runner.init((char*)data.data()); first = false; } runner.feed(data.data(), data.size()); }; write_content(writer); zsize_t size; auto comp = runner.get_data(&size); compressed_data = Blob(comp.release(), size.v); } void Cluster::write(int out_fd) const { // write clusterInfo char clusterInfo = 0; if (isExtended) { clusterInfo = 0x10; } clusterInfo += static_cast(getCompression()); if (_write(out_fd, &clusterInfo, 1) == -1) { throw std::runtime_error("Error writing"); } // Open a comprestion stream if needed switch(getCompression()) { case Compression::None: { auto writer = [=](const Blob& data) -> void { // Ideally we would simply have to do : // ::write(tmp_fd, data.c_str(), data.size()); // However, the data can be pretty big (> 4Gb), especially with test, // And ::write fails to write data > 4Gb. So we have to chunck the write. 
size_type to_write = data.size(); const char* src = data.data(); while (to_write) { size_type chunk_size = std::min(MAX_WRITE_SIZE, to_write); auto ret = _write(out_fd, src, chunk_size); src += ret; to_write -= ret; } }; write_content(writer); break; } case Compression::Zstd: { log_debug("compress data"); if (_write(out_fd, compressed_data.data(), compressed_data.size()) == -1) { throw std::runtime_error("Error writing"); } break; } default: Formatter fmt_msg; fmt_msg << "invalid compression flag " << static_cast(getCompression()); log_error(fmt_msg); throw std::runtime_error(fmt_msg); } } void Cluster::addContent(std::unique_ptr provider) { auto size = provider->getSize(); _size += size; blobOffsets.push_back(offset_t(_size.v)); m_count++; isExtended |= (_size.v>UINT32_MAX); if (size == 0) return; m_providers.push_back(std::move(provider)); } void Cluster::addContent(const std::string& data) { auto contentProvider = std::unique_ptr(new StringProvider(data)); addContent(std::move(contentProvider)); } void Cluster::write_data(writer_t writer) const { for (auto& provider: m_providers) { ASSERT(provider->getSize(), !=, 0U); zim::size_type size = 0; while(true) { auto blob = provider->feed(); if(blob.size() == 0) { break; } size += blob.size(); writer(blob); } if (size != provider->getSize()) throw IncoherentImplementationError( Formatter() << "Declared provider's size (" << provider->getSize() << ")" << " is not equal to total size returned by feed() calls (" << size << ")."); } } } // writer } // zim libzim-9.2.3/src/writer/cluster.h000066400000000000000000000062761466367137100170010ustar00rootroot00000000000000/* * Copyright (C) 2017-2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_WRITER_CLUSTER_H_ #define ZIM_WRITER_CLUSTER_H_ #include #include #include #include #include #include #include #include "../zim_types.h" #include "../debug.h" #include "config.h" namespace zim { namespace writer { using writer_t = std::function; class ContentProvider; class LIBZIM_PRIVATE_API Cluster { typedef std::vector Offsets; typedef std::vector> ClusterProviders; public: Cluster(Compression compression); virtual ~Cluster(); void setCompression(Compression c) { compression = c; } Compression getCompression() const { return compression; } void addContent(std::unique_ptr provider); void addContent(const std::string& data); blob_index_t count() const { return blob_index_t(m_count); } zsize_t size() const; offset_t getOffset() const { return offset; } void setOffset(offset_t o) { offset = o; } bool is_extended() const { return isExtended; } void clear_data(); void close(); bool isClosed() const; void setClusterIndex(cluster_index_t idx) { index = idx; } cluster_index_t getClusterIndex() const { return index; } zsize_t getBlobSize(blob_index_t n) const { return zsize_t(blobOffsets[blob_index_type(n)+1].v - blobOffsets[blob_index_type(n)].v); } offset_t getBlobOffset(blob_index_t n) const { return blobOffsets[n.v]; } offset_t getDataOffset() const { ASSERT(bool(closed), ==, true); return offset_t(1) + offset_t((count().v + 1) * (isExtended?sizeof(uint64_t):sizeof(uint32_t))); } void write(int out_fd) const; protected: Compression compression; cluster_index_t index; bool 
isExtended; Offsets blobOffsets; offset_t offset; zsize_t _size; ClusterProviders m_providers; mutable Blob compressed_data; std::string tmp_filename; std::atomic closed { false }; blob_index_type m_count { 0 }; private: void write_content(writer_t writer) const; template void write_offsets(writer_t writer) const; void write_data(writer_t writer) const; void compress(); template void _compress(); void clear_raw_data(); void clear_compressed_data(); }; }; }; #endif //ZIM_WRITER_CLUSTER_H_ libzim-9.2.3/src/writer/clusterWorker.cpp000066400000000000000000000017521466367137100205200ustar00rootroot00000000000000/* * Copyright (C) 2020 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "clusterWorker.h" #include "cluster.h" namespace zim { namespace writer { void ClusterTask::run(CreatorData* data) { cluster->close(); }; } } libzim-9.2.3/src/writer/clusterWorker.h000066400000000000000000000025061466367137100201630ustar00rootroot00000000000000/* * Copyright (C) 2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * any later version. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. */ #ifndef OPENZIM_LIBZIM_CLUSTER_WORKER_H #define OPENZIM_LIBZIM_CLUSTER_WORKER_H #include #include "workers.h" namespace zim { namespace writer { class Cluster; class ClusterTask : public TrackableTask { public: ClusterTask(const ClusterTask&) = delete; ClusterTask& operator=(const ClusterTask&) = delete; explicit ClusterTask(Cluster* cluster) : cluster(cluster) {}; virtual ~ClusterTask() = default; virtual void run(CreatorData* data); private: Cluster* cluster; }; } } #endif // OPENZIM_LIBZIM_QUEUE_H libzim-9.2.3/src/writer/contentProvider.cpp000066400000000000000000000040171466367137100210270ustar00rootroot00000000000000/* * Copyright (C) 2020 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include "../fs.h" const zim::size_type BUFFER_SIZE(1024*1024); namespace zim { namespace writer { Blob StringProvider::feed() { if (feeded) { return Blob(nullptr, 0); } feeded = true; return Blob(content.data(), content.size()); } Blob SharedStringProvider::feed() { if (feeded) { return Blob(nullptr, 0); } feeded = true; return Blob(content->data(), content->size()); } FileProvider::FileProvider(const std::string& filepath) : filepath(filepath), buffer(new char[BUFFER_SIZE]), fd(new DEFAULTFS::FD(DEFAULTFS::openFile(filepath))), offset(0) { size = fd->getSize().v; } FileProvider::~FileProvider() = default; Blob FileProvider::feed() { auto sizeToRead = std::min(BUFFER_SIZE, size-offset); if (!sizeToRead) { return Blob(nullptr, 0); } if(fd->readAt(buffer.get(), zim::zsize_t(sizeToRead), zim::offset_t(offset)) == zim::zsize_t(-1)) { throw std::runtime_error("Error reading file " + filepath); } offset += sizeToRead; return Blob(buffer.get(), sizeToRead); } } } libzim-9.2.3/src/writer/counterHandler.cpp000066400000000000000000000037411466367137100206220ustar00rootroot00000000000000/* * Copyright (C) 2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. */ #include "counterHandler.h" #include "creatordata.h" #include #include #include using namespace zim::writer; CounterHandler::CounterHandler(CreatorData* data) : mp_creatorData(data) {} CounterHandler::~CounterHandler() = default; void CounterHandler::start() { } void CounterHandler::stop() { } DirentHandler::Dirents CounterHandler::createDirents() const { Dirents ret; ret.push_back(mp_creatorData->createDirent(NS::M, "Counter", "text/plain", "")); return ret; } DirentHandler::ContentProviders CounterHandler::getContentProviders() const { ContentProviders ret; Formatter fmt; bool first = true; for(auto pair: m_mimetypeCounter) { if (! first) { fmt << ";"; } fmt << pair.first << "=" << pair.second; first = false; } ret.push_back(std::unique_ptr(new StringProvider(fmt))); return ret; } void CounterHandler::handle(Dirent* dirent, const Hints& hints) { } void CounterHandler::handle(Dirent* dirent, std::shared_ptr item) { if (dirent->getNamespace() != NS::C) { return; } auto mimetype = item->getMimeType(); if (mimetype.empty()) { return; } m_mimetypeCounter[mimetype] += 1; } libzim-9.2.3/src/writer/counterHandler.h000066400000000000000000000031001466367137100202540ustar00rootroot00000000000000/* * Copyright (C) 2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. */ #ifndef OPENZIM_LIBZIM_COUNTER_HANDLER_H #define OPENZIM_LIBZIM_COUNTER_HANDLER_H #include "handler.h" #include namespace zim { namespace writer { class CounterHandler : public DirentHandler { public: typedef std::map Counter; explicit CounterHandler(CreatorData* data); virtual ~CounterHandler(); void start() override; void stop() override; bool isCompressible() override { return true; } ContentProviders getContentProviders() const override; void handle(Dirent* dirent, std::shared_ptr item) override; void handle(Dirent* dirent, const Hints& hints) override; private: Dirents createDirents() const override; CreatorData* mp_creatorData; Counter m_mimetypeCounter; }; } } #endif // OPENZIM_LIBZIM_COUNTER_HANDLER_H libzim-9.2.3/src/writer/creator.cpp000066400000000000000000000551571466367137100173140ustar00rootroot00000000000000/* * Copyright (C) 2019-2021 Matthieu Gautier * Copyright (C) 2021 Maneesh P M * Copyright (C) 2021 Veloman Yunkan * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include "config.h" #include "creatordata.h" #include "cluster.h" #include "debug.h" #include "workers.h" #include "clusterWorker.h" #include #include #include #include #include "../endian_tools.h" #include #include #include "../md5.h" #include "../constants.h" #include "counterHandler.h" #if defined(ENABLE_XAPIAN) # include "xapianHandler.h" #endif #ifdef _WIN32 # include # include #else # include # define _write(fd, addr, size) if(::write((fd), (addr), (size)) != (ssize_t)(size)) \ {throw std::runtime_error("Error writing");} #endif #include #include #include #include #include #include #include #include "log.h" #include "../fs.h" #include "../tools.h" log_define("zim.writer.creator") #define INFO(e) \ do { \ log_info(e); \ std::cout << e << std::endl; \ } while(false) #define TINFO(e) \ if (m_verbose) { \ double seconds = difftime(time(NULL), data->start_time); \ std::cout << "T:" << (int)(seconds) \ << "; " << e << std::endl; \ } #define TPROGRESS() \ if (m_verbose ) { \ double seconds = difftime(time(NULL),data->start_time); \ std::cout << "T:" << (int)seconds \ << "; A:" << data->dirents.size() \ << "; RA:" << data->nbRedirectItems \ << "; CA:" << data->nbCompItems \ << "; UA:" << data->nbUnCompItems \ << "; C:" << data->nbClusters \ << "; CC:" << data->nbCompClusters \ << "; UC:" << data->nbUnCompClusters \ << "; WC:" << data->taskList.size() \ << std::endl; \ } #define CLUSTER_BASE_OFFSET 2048 namespace zim { namespace writer { Creator::Creator() : m_clusterSize(DEFAULT_CLUSTER_SIZE) {} Creator::~Creator() = default; Creator& Creator::configVerbose(bool verbose) { m_verbose = verbose; return *this; } Creator& Creator::configCompression(Compression compression) { m_compression = compression; return *this; } Creator& 
Creator::configClusterSize(zim::size_type targetSize) { m_clusterSize = targetSize; return *this; } Creator& Creator::configIndexing(bool indexing, const std::string& language) { m_withIndex = indexing; m_indexingLanguage = language; return *this; } Creator& Creator::configNbWorkers(unsigned nbWorkers) { m_nbWorkers = nbWorkers; return *this; } void Creator::startZimCreation(const std::string& filepath) { data = std::unique_ptr( new CreatorData(filepath, m_verbose, m_withIndex, m_indexingLanguage, m_compression, m_clusterSize) ); for(unsigned i=0; idata.get()); data->workerThreads.push_back(std::move(thread)); } data->writerThread = std::thread(clusterWriter, this->data.get()); } void Creator::addItem(std::shared_ptr item) { checkError(); bool compressContent = item->getAmendedHints()[COMPRESS]; auto dirent = data->createItemDirent(item.get()); data->addItemData(dirent, item->getContentProvider(), compressContent); data->handle(dirent, item); if (data->dirents.size()%1000 == 0) { TPROGRESS(); } } void Creator::addMetadata(const std::string& name, const std::string& content, const std::string& mimetype) { checkError(); auto provider = std::unique_ptr(new StringProvider(content)); addMetadata(name, std::move(provider), mimetype); } void Creator::addMetadata(const std::string& name, std::unique_ptr provider, const std::string& mimetype) { checkError(); auto compressContent = isCompressibleMimetype(mimetype); auto dirent = data->createDirent(NS::M, name, mimetype, ""); data->addItemData(dirent, std::move(provider), compressContent); data->handle(dirent); } void Creator::addIllustration(unsigned int size, const std::string& content) { checkError(); auto provider = std::unique_ptr(new StringProvider(content)); addIllustration(size, std::move(provider)); } void Creator::addIllustration(unsigned int size, std::unique_ptr provider) { checkError(); addMetadata(Formatter() << "Illustration_" << size << "x" << size << "@1", std::move(provider), "image/png"); } void 
Creator::addRedirection(const std::string& path, const std::string& title, const std::string& targetPath, const Hints& hints) { checkError(); auto dirent = data->createRedirectDirent(NS::C, path, title, NS::C, targetPath); if (data->dirents.size()%1000 == 0){ TPROGRESS(); } data->handle(dirent, hints); } void Creator::addAlias(const std::string& path, const std::string& title, const std::string& targetPath, const Hints& hints) { checkError(); Dirent tmpDirent(NS::C, targetPath); auto existing_dirent_it = data->dirents.find(&tmpDirent); if (existing_dirent_it == data->dirents.end()) { Formatter fmt; fmt << "Impossible to alias C/" << targetPath << " as C/" << path << std::endl; fmt << "C/" << targetPath << " doesn't exist." << std::endl; throw InvalidEntry(fmt); } auto dirent = data->createAliasDirent(path, title, **existing_dirent_it); data->handle(dirent, hints); } void Creator::finishZimCreation() { checkError(); // Create a redirection for the mainPage. // We need to keep the created dirent to set the fileheader. // Dirent doesn't have to be deleted. if (!m_mainPath.empty()) { data->mainPageDirent = data->createRedirectDirent(NS::W, "mainPage", "", NS::C, m_mainPath); data->handle(data->mainPageDirent); } TPROGRESS(); // mp_titleListingHandler is a special case, it have to handle all dirents (including itself) for(auto& handler:data->m_direntHandlers) { // This silently create all the needed dirents. 
for(auto dirent:handler->getDirents()) { data->mp_titleListingHandler->handle(dirent, Hints()); } } // Now we have all the dirents (but not the data), we must correctly set/fix the dirents // before we ask data to the handlers TINFO("ResolveRedirectIndexes"); data->resolveRedirectIndexes(); TINFO("Set entry indexes"); data->setEntryIndexes(); TINFO("Resolve mimetype"); data->resolveMimeTypes(); // We can now stop the direntHandlers, and get their content bool titleListDirentSeen = false; for(auto& handler:data->m_direntHandlers) { handler->stop(); const auto& dirents = handler->getDirents(); if (dirents.empty()) { continue; } auto providers = handler->getContentProviders(); ASSERT(dirents.size(), ==, providers.size()); auto provider_it = providers.begin(); for(auto& dirent:dirents) { // As we use a "handler level" isCompressible, all content of the same handler // must have the same compression. data->addItemData(dirent, std::move(*provider_it), handler->isCompressible()); if (handler == data->mp_titleListingHandler && !titleListDirentSeen) { // We have to get the offset of the titleList in the cluster before // we close the cluster. Once the cluster is close, the offset information is dropped. // This works only if titleListingHandler create the full (V0) titlelist in its first dirent. 
data->m_titleListBlobOffset = data->uncompCluster->getBlobOffset(dirent->getBlobNumber()); titleListDirentSeen = true; } provider_it++; } } // All the data has been added, we can now close all clusters if (data->compCluster->count()) data->closeCluster(true); if (data->uncompCluster->count()) data->closeCluster(false); TINFO("Waiting for workers"); // wait all cluster compression has been done ClusterTask::waitNoMoreTask(data.get()); data->quitAllThreads(); checkError(); // Delete all handler (they will clean there own data) data->m_direntHandlers.clear(); TINFO(data->dirents.size() << " title index created"); TINFO(data->clustersList.size() << " clusters created"); TINFO("write zimfile :"); writeLastParts(); ::close(data->out_fd); data->out_fd = -1; TINFO("rename tmpfile to final one."); DEFAULTFS::rename(data->tmpFileName, data->zimName); data->tmpFileName.clear(); TINFO("finish"); } void Creator::fillHeader(Fileheader* header) const { header->setMainPage( data->mainPageDirent ? entry_index_type(data->mainPageDirent->getIdx()) : std::numeric_limits::max()); header->setLayoutPage(std::numeric_limits::max()); header->setUuid( m_uuid ); header->setArticleCount( data->dirents.size() ); header->setMimeListPos( Fileheader::size ); // We assume here that titleListingHandler create the V0 listing in its first dirent. 
auto cluster = data->mp_titleListingHandler->getDirents()[0]->getCluster(); header->setTitleIdxPos( offset_type(cluster->getOffset() + cluster->getDataOffset() + data->m_titleListBlobOffset)); header->setClusterCount( data->clustersList.size() ); } void Creator::writeLastParts() const { Fileheader header; fillHeader(&header); int out_fd = data->out_fd; lseek(out_fd, header.getMimeListPos(), SEEK_SET); TINFO(" write mimetype list"); for(auto& mimeType: data->mimeTypesList) { _write(out_fd, mimeType.c_str(), mimeType.size()+1); } _write(out_fd, "", 1); ASSERT(lseek(out_fd, 0, SEEK_CUR), <, CLUSTER_BASE_OFFSET); TINFO(" write directory entries"); lseek(out_fd, 0, SEEK_END); for (Dirent* dirent: data->dirents) { dirent->setOffset(offset_t(lseek(out_fd, 0, SEEK_CUR))); dirent->write(out_fd); } TINFO(" write path prt list"); header.setPathPtrPos(lseek(out_fd, 0, SEEK_CUR)); for (auto& dirent: data->dirents) { char tmp_buff[sizeof(offset_type)]; toLittleEndian(dirent->getOffset(), tmp_buff); _write(out_fd, tmp_buff, sizeof(offset_type)); } TINFO(" write cluster offset list"); header.setClusterPtrPos(lseek(out_fd, 0, SEEK_CUR)); for (auto cluster : data->clustersList) { char tmp_buff[sizeof(offset_type)]; toLittleEndian(cluster->getOffset(), tmp_buff); _write(out_fd, tmp_buff, sizeof(offset_type)); } header.setChecksumPos(lseek(out_fd, 0, SEEK_CUR)); TINFO(" write header"); lseek(out_fd, 0, SEEK_SET); header.write(out_fd); TINFO(" write checksum"); struct zim_MD5_CTX md5ctx; unsigned char batch_read[1024+1]; lseek(out_fd, 0, SEEK_SET); zim_MD5Init(&md5ctx); while (true) { auto r = read(out_fd, batch_read, 1024); if (r == -1) { throw std::runtime_error(std::strerror(errno)); } if (r == 0) break; batch_read[r] = 0; zim_MD5Update(&md5ctx, batch_read, r); } unsigned char digest[16]; zim_MD5Final(digest, &md5ctx); _write(out_fd, reinterpret_cast(digest), 16); } void Creator::checkError() { if (data->m_errored) { throw CreatorStateError(); } std::lock_guard 
l(data->m_exceptionLock); if (data->m_exceptionSlot) { std::cerr << "ERROR Detected" << std::endl; data->m_errored = true; throw AsyncError(data->m_exceptionSlot); } } CreatorData::CreatorData(const std::string& fname, bool verbose, bool withIndex, std::string language, Compression c, size_t clusterSize) : mainPageDirent(nullptr), m_errored(false), compression(c), zimName(fname), tmpFileName(fname + ".tmp"), clusterSize(clusterSize), withIndex(withIndex), indexingLanguage(language), verbose(verbose), nbRedirectItems(0), nbCompItems(0), nbUnCompItems(0), nbClusters(0), nbCompClusters(0), nbUnCompClusters(0), start_time(time(NULL)) { #ifdef _WIN32 int flag = _O_RDWR | _O_CREAT | _O_TRUNC | _O_BINARY; int mode = _S_IREAD | _S_IWRITE; #else int flag = O_RDWR | O_CREAT | O_TRUNC; mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; #endif out_fd = open(tmpFileName.c_str(), flag, mode); if (out_fd == -1){ throw std::runtime_error(std::strerror(errno)); } if(lseek(out_fd, CLUSTER_BASE_OFFSET, SEEK_SET) != CLUSTER_BASE_OFFSET) { close(out_fd); throw std::runtime_error(std::strerror(errno)); } // We keep both a "compressed cluster" and an "uncompressed cluster" // because we don't know which one will fill up first. We also need // to track the dirents currently in each, so we can fix up the // cluster index if the other one ends up written first. 
compCluster = new Cluster(compression); uncompCluster = new Cluster(Compression::None); #if defined(ENABLE_XAPIAN) auto xapianIndexer = std::make_shared(this, withIndex); m_direntHandlers.push_back(xapianIndexer); #endif mp_titleListingHandler = std::make_shared(this); m_direntHandlers.push_back(mp_titleListingHandler); m_direntHandlers.push_back(std::make_shared(this)); for(auto& handler:m_direntHandlers) { handler->start(); } } CreatorData::~CreatorData() { quitAllThreads(); if (compCluster) delete compCluster; if (uncompCluster) delete uncompCluster; for(auto& cluster: clustersList) { delete cluster; } if ( out_fd != - 1 ) { ::close(out_fd); } if ( ! tmpFileName.empty() ) { DEFAULTFS::removeFile(tmpFileName); } } void CreatorData::addError(const std::exception_ptr exception) { std::lock_guard l(m_exceptionLock); if (!m_exceptionSlot) { m_exceptionSlot = exception; } } bool CreatorData::isErrored() const { if (m_errored) { return true; } std::lock_guard l(m_exceptionLock); if (m_exceptionSlot) { return true; } return false; } void CreatorData::quitAllThreads() { // Quit all workerThreads for (auto i=0U; i< workerThreads.size(); i++) { taskList.pushToQueue(nullptr); } for(auto& thread: workerThreads) { thread.join(); } workerThreads.clear(); // Wait for writerThread to finish. 
if (writerThread.joinable()) { clusterToWrite.pushToQueue(nullptr); writerThread.join(); } } void CreatorData::addDirent(Dirent* dirent) { auto ret = dirents.insert(dirent); if (!ret.second) { Dirent* existing = *ret.first; if (existing->isRedirect() && !dirent->isRedirect()) { unresolvedRedirectDirents.erase(existing); dirents.erase(ret.first); existing->markRemoved(); dirents.insert(dirent); } else { Formatter fmt; fmt << "Impossible to add " << NsAsChar(dirent->getNamespace()) << "/" << dirent->getPath() << std::endl; fmt << " dirent's title to add is : " << dirent->getTitle() << std::endl; fmt << " existing dirent's title is : " << existing->getTitle() << std::endl; throw InvalidEntry(fmt); } }; if (dirent->isRedirect()) { unresolvedRedirectDirents.insert(dirent); nbRedirectItems++; } } void CreatorData::addItemData(Dirent* dirent, std::unique_ptr provider, bool compressContent) { // Add blob data to compressed or uncompressed cluster. auto itemSize = provider->getSize(); if (itemSize > 0) { isEmpty = false; } auto cluster = compressContent ? compCluster : uncompCluster; // If cluster will be too large, write it to dis, and open a new // one for the content. 
if ( cluster->count() && cluster->size().v+itemSize >= clusterSize ) { log_info("cluster with " << cluster->count() << " items, " << cluster->size() << " bytes; current title \"" << dirent->getTitle() << '\"'); cluster = closeCluster(compressContent); } dirent->setCluster(cluster); cluster->addContent(std::move(provider)); if (compressContent) { nbCompItems++; } else { nbUnCompItems++; } } Dirent* CreatorData::createDirent(NS ns, const std::string& path, const std::string& mimetype, const std::string& title) { auto dirent = pool.getClassicDirent(ns, path, title, getMimeTypeIdx(mimetype)); addDirent(dirent); return dirent; } Dirent* CreatorData::createItemDirent(const Item* item) { auto path = item->getPath(); auto mimetype = item->getMimeType(); if (mimetype.empty()) { std::cerr << "Warning, " << item->getPath() << " have empty mimetype." << std::endl; mimetype = "application/octet-stream"; } return createDirent(NS::C, item->getPath(), mimetype, item->getTitle()); } Dirent* CreatorData::createRedirectDirent(NS ns, const std::string& path, const std::string& title, NS targetNs, const std::string& targetPath) { auto dirent = pool.getRedirectDirent(ns, path, title, targetNs, targetPath); addDirent(dirent); return dirent; } Dirent* CreatorData::createAliasDirent(const std::string& path, const std::string& title, const Dirent& target) { auto dirent = pool.getAliasDirent(path, title, target); addDirent(dirent); return dirent; } Cluster* CreatorData::closeCluster(bool compressed) { Cluster *cluster; nbClusters++; if (compressed ) { cluster = compCluster; nbCompClusters++; } else { cluster = uncompCluster; nbUnCompClusters++; } cluster->setClusterIndex(cluster_index_t(clustersList.size())); clustersList.push_back(cluster); taskList.pushToQueue(std::make_shared(cluster)); clusterToWrite.pushToQueue(cluster); if (compressed) { cluster = compCluster = new Cluster(compression); } else { cluster = uncompCluster = new Cluster(Compression::None); } return cluster; } void 
CreatorData::setEntryIndexes() { // set index INFO("set index"); entry_index_t idx(0); for (auto& dirent: dirents) { dirent->setIdx(idx); idx += 1; } } void CreatorData::resolveRedirectIndexes() { // translate redirect aid to index INFO("Resolve redirect"); for (auto dirent: unresolvedRedirectDirents) { Dirent tmpDirent(dirent->getRedirectNs(), dirent->getRedirectPath()); auto target_pos = dirents.find(&tmpDirent); if(target_pos == dirents.end()) { INFO("Invalid redirection " << NsAsChar(dirent->getNamespace()) << '/' << dirent->getPath() << " redirecting to (missing) " << NsAsChar(dirent->getRedirectNs()) << '/' << dirent->getRedirectPath()); dirents.erase(dirent); dirent->markRemoved(); if (dirent == mainPageDirent) { mainPageDirent = nullptr; } } else { dirent->setRedirect(*target_pos); } } } void CreatorData::resolveMimeTypes() { std::vector oldMImeList; std::vector mapping; for (auto& rmimeType: rmimeTypesMap) { oldMImeList.push_back(rmimeType.second); mimeTypesList.push_back(rmimeType.second); } mapping.resize(oldMImeList.size()); std::sort(mimeTypesList.begin(), mimeTypesList.end()); for (unsigned i=0; i(j); } } for (auto& dirent: dirents) { if (dirent->isItem()) dirent->setMimeType(mapping[dirent->getMimeType()]); } } uint16_t CreatorData::getMimeTypeIdx(const std::string& mimeType) { auto it = mimeTypesMap.find(mimeType); if (it == mimeTypesMap.end()) { if (nextMimeIdx >= std::numeric_limits::max()) throw CreatorError("too many distinct mime types"); mimeTypesMap[mimeType] = nextMimeIdx; rmimeTypesMap[nextMimeIdx] = mimeType; return nextMimeIdx++; } return it->second; } const std::string& CreatorData::getMimeType(uint16_t mimeTypeIdx) const { auto it = rmimeTypesMap.find(mimeTypeIdx); if (it == rmimeTypesMap.end()) throw CreatorError("mime type index not found"); return it->second; } } } libzim-9.2.3/src/writer/creatordata.h000066400000000000000000000124771466367137100176110ustar00rootroot00000000000000/* * Copyright (C) 2018-2021 Matthieu Gautier * 
Copyright (C) 2021 Manessh P M * Copyright (C) 2020 Veloman Yunkan * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_WRITER_CREATOR_DATA_H #define ZIM_WRITER_CREATOR_DATA_H #include #include "queue.h" #include "_dirent.h" #include "handler.h" #include #include #include #include #include #include "config.h" #include "../fileheader.h" #include "direntPool.h" #include "titleListingHandler.h" namespace zim { namespace writer { struct UrlCompare { bool operator() (const Dirent* d1, const Dirent* d2) const { return comparePath(d1, d2); } }; class Cluster; class Task; class CreatorData { public: typedef std::set UrlSortedDirents; typedef std::map MimeTypesMap; typedef std::map RMimeTypesMap; typedef std::vector MimeTypesList; typedef std::vector ClusterList; typedef Queue ClusterQueue; typedef Queue> TaskQueue; typedef std::vector ThreadList; CreatorData(const std::string& fname, bool verbose, bool withIndex, std::string language, Compression compression, size_t clusterSize); virtual ~CreatorData(); void addDirent(Dirent* dirent); void addItemData(Dirent* dirent, std::unique_ptr provider, bool compressContent); Dirent* createDirent(NS ns, const std::string& path, const std::string& mimetype, const std::string& title); Dirent* 
createItemDirent(const Item* item); Dirent* createRedirectDirent(NS ns, const std::string& path, const std::string& title, NS targetNs, const std::string& targetPath); Dirent* createAliasDirent(const std::string& path, const std::string& title, const Dirent& target); Cluster* closeCluster(bool compressed); void setEntryIndexes(); void resolveRedirectIndexes(); void resolveMimeTypes(); uint16_t getMimeTypeIdx(const std::string& mimeType); const std::string& getMimeType(uint16_t mimeTypeIdx) const; void addError(const std::exception_ptr error); bool isErrored() const; void quitAllThreads(); DirentPool pool; UrlSortedDirents dirents; UrlSortedDirents unresolvedRedirectDirents; Dirent* mainPageDirent; MimeTypesMap mimeTypesMap; RMimeTypesMap rmimeTypesMap; MimeTypesList mimeTypesList; uint16_t nextMimeIdx = 0; ClusterList clustersList; ClusterQueue clusterToWrite; TaskQueue taskList; ThreadList workerThreads; std::thread writerThread; mutable std::mutex m_exceptionLock; std::exception_ptr m_exceptionSlot; std::atomic m_errored; const Compression compression; std::string zimName; std::string tmpFileName; bool isEmpty = true; size_t clusterSize; Cluster *compCluster = nullptr; Cluster *uncompCluster = nullptr; int out_fd; bool withIndex; std::string indexingLanguage; std::shared_ptr mp_titleListingHandler; offset_t m_titleListBlobOffset; // The offset the title list blob, // related to the beginning of the start of cluster's data. 
std::vector> m_direntHandlers; void handle(Dirent* dirent, const Hints& hints = Hints()) { for(auto& handler: m_direntHandlers) { handler->handle(dirent, hints); } } void handle(Dirent* dirent, std::shared_ptr item) { for(auto& handler: m_direntHandlers) { handler->handle(dirent, item); } } // Some stats bool verbose; entry_index_type nbItems; entry_index_type nbRedirectItems; entry_index_type nbCompItems; entry_index_type nbUnCompItems; cluster_index_type nbClusters; cluster_index_type nbCompClusters; cluster_index_type nbUnCompClusters; time_t start_time; cluster_index_t clusterCount() const { return cluster_index_t(clustersList.size()); } entry_index_t itemCount() const { return entry_index_t(dirents.size()); } }; } } #endif // ZIM_WRITER_CREATOR_DATA_H libzim-9.2.3/src/writer/defaultIndexData.h000066400000000000000000000076651466367137100205310ustar00rootroot00000000000000/* * Copyright (C) 2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_WRITER_DEFAULTINDEXDATA_H #define ZIM_WRITER_DEFAULTINDEXDATA_H #include #include #include "config.h" #if defined(ENABLE_XAPIAN) #include "xapian/myhtmlparse.h" #endif #include "../tools.h" #include #include #include namespace zim { namespace writer { class DefaultIndexData : public IndexData { public: DefaultIndexData(std::unique_ptr contentProvider, const std::string& title) : m_initialized(false), mp_contentProvider(std::move(contentProvider)), #if defined(ENABLE_XAPIAN) m_title(zim::removeAccents(title)), #else m_title(""), #endif m_hasIndexData(false), m_content(""), m_keywords(""), m_wordCount(0), m_geoPosition(std::make_tuple(false, 0, 0)) {} void initialize() const { if (m_initialized) { return; } std::lock_guard lock(m_initLock); // We have to do a double check to be sure that two call on a un-initialized object // will not be initiialized twice. if (m_initialized) { return; } #if defined(ENABLE_XAPIAN) Formatter fmt; while (true) { auto blob = mp_contentProvider->feed(); if(blob.size() == 0) { break; } fmt << blob; } MyHtmlParser htmlParser; try { htmlParser.parse_html(fmt, "UTF-8", true); } catch(...) 
{} m_hasIndexData = !htmlParser.dump.empty() && htmlParser.indexing_allowed && (htmlParser.dump.find("NOINDEX") == std::string::npos); m_content = zim::removeAccents(htmlParser.dump); m_keywords = zim::removeAccents(htmlParser.keywords); m_wordCount = zim::countWords(htmlParser.dump); if(htmlParser.has_geoPosition) { m_geoPosition = std::make_tuple(true, htmlParser.latitude, htmlParser.longitude); } #endif m_initialized = true; } bool hasIndexData() const { initialize(); return m_hasIndexData; } std::string getTitle() const { return m_title; } std::string getContent() const { initialize(); return m_content; } std::string getKeywords() const { initialize(); return m_keywords; } uint32_t getWordCount() const { initialize(); return m_wordCount; } GeoPosition getGeoPosition() const { initialize(); return m_geoPosition; } private: mutable std::atomic m_initialized; mutable std::mutex m_initLock; std::unique_ptr mp_contentProvider; std::string m_title; mutable bool m_hasIndexData; mutable std::string m_content; mutable std::string m_keywords; mutable uint32_t m_wordCount; mutable GeoPosition m_geoPosition; }; } } #endif // ZIM_WRITER_DEFAULTINDEXDATA_H libzim-9.2.3/src/writer/dirent.cpp000066400000000000000000000065421466367137100171340ustar00rootroot00000000000000/* * Copyright (C) 2020 Matthieu Gautier * Copyright (C) 2006 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "_dirent.h" #include #include "buffer.h" #include "endian_tools.h" #include "log.h" #include #include #ifdef _WIN32 # include #else # include # define _write(fd, addr, size) if(::write((fd), (addr), (size)) != (ssize_t)(size)) \ {throw std::runtime_error("Error writing");} #endif log_define("zim.dirent") namespace zim { namespace writer { char NsAsChar(NS ns) { switch(ns) { case NS::C: return 'C'; case NS::M: return 'M'; case NS::W: return 'W'; case NS::X: return 'X'; } throw std::runtime_error("Invalid namespace value."); } // Creator for a "classic" dirent Dirent::Dirent(NS ns, const std::string& path, const std::string& title, uint16_t mimetype) : pathTitle(path, title), mimeType(mimetype), idx(0), info(DirentInfo::Direct()), offset(0), _ns(static_cast(ns)), removed(false), frontArticle(false) {} // Creator for a "redirection" dirent Dirent::Dirent(NS ns, const std::string& path, const std::string& title, NS targetNs, const std::string& targetPath) : pathTitle(path, title), mimeType(redirectMimeType), idx(0), info(DirentInfo::Redirect(targetNs, targetPath)), offset(0), _ns(static_cast(ns)), removed(false), frontArticle(false) {} Dirent::Dirent(const std::string& path, const std::string& title, const Dirent& target) : pathTitle(path, title), mimeType(target.mimeType), idx(0), info(target.info), offset(0), _ns(target._ns), removed(false), frontArticle(false) {} NS Dirent::getRedirectNs() const { return info.getRedirect().ns; } std::string Dirent::getRedirectPath() const { return info.getRedirect().targetPath; } void Dirent::write(int out_fd) const { const static char zero = 0; union { char d[16]; long a; } header; zim::toLittleEndian(getMimeType(), header.d); header.d[2] = 0; // parameter size header.d[3] = NsAsChar(getNamespace()); 
log_debug("title=" << dirent.getTitle() << " title.size()=" << dirent.getTitle().size()); zim::toLittleEndian(getVersion(), header.d + 4); if (isRedirect()) { zim::toLittleEndian(getRedirectIndex().v, header.d + 8); _write(out_fd, header.d, 12); } else { zim::toLittleEndian(zim::cluster_index_type(getClusterNumber()), header.d + 8); zim::toLittleEndian(zim::blob_index_type(getBlobNumber()), header.d + 12); _write(out_fd, header.d, 16); } _write(out_fd, pathTitle.data(), pathTitle.size()); _write(out_fd, &zero, 1); } } } libzim-9.2.3/src/writer/direntPool.h000066400000000000000000000062321466367137100174270ustar00rootroot00000000000000/* * Copyright (C) 2019-2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_WRITER_DIRENTPOOL_H #define ZIM_WRITER_DIRENTPOOL_H #include "debug.h" #include "_dirent.h" namespace zim { namespace writer { class DirentPool { private: std::vector pools; uint16_t direntIndex; void allocate_new_pool() { pools.push_back(reinterpret_cast(new char[sizeof(Dirent)*0xFFFF])); direntIndex = 0; } static void destroyPoolBlock(Dirent* pool, uint16_t count=0xFFFF) { for (auto i = 0U; i < count; i++) { try { pool[i].~Dirent(); } catch (...){ /*discard*/ } } delete [] (reinterpret_cast(pool)); } /* Return a *NOT constructed* pointer to a dirent */ Dirent* getDirentSlot() { if (direntIndex == 0xFFFF) { allocate_new_pool(); } auto dirent = pools.back() + direntIndex++; return dirent; } public: DirentPool() : direntIndex(0xFFFF) {} DirentPool(const DirentPool&) = delete; DirentPool& operator=(const DirentPool&) = delete; ~DirentPool() { auto nbPools = pools.size(); if (nbPools == 0) { return; } // Delete all but last pools (add call the destructors of the dirents) for (auto i = 0U; i * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. 
*/
#ifndef OPENZIM_LIBZIM_WRITER_HANDLER_H
#define OPENZIM_LIBZIM_WRITER_HANDLER_H

#include
#include
#include
#include

namespace zim {
namespace writer {

class CreatorData;
class ContentProvider;
class Dirent;

/**
 * DirentHandler is used to add "extra" handling on dirents/items.
 *
 * The main purpose of a handler is to "see" all dirents corresponding to user
 * entries and to generate its own dirents/items.
 *
 * Classical use cases are:
 * - Generating an index of the items (xapianIndex)
 * - Generating a listing of the items (all items or "main" entries only)
 * - Counting mimetypes
 * - ...
 *
 * The workflow is the following:
 * - Start the handler with `start()`.
 * - Pass the dirents to handle using `handle()`.
 *   If a handler has to handle its own dirents, it has to do so itself
 *   beforehand (in start/stop, ...).
 *   Handlers will NOT be passed the dirents of other handlers.
 *   (An exception is made for titleListingHandle.)
 * - Get the dirents associated with the handler using `createDirents()`.
 *   The handler must create dirents if the entry/entries associated with it
 *   must be created.
 *   It may create several dirents if several entries must be created.
 *   It may return an empty vector (no dirent) if no entry must be created
 *   (empty listing, ...).
 * - All dirents are correctly set (redirects resolved, index and mimetype set, ...).
 * - Stop the handler with `stop()`.
 * - Get the content of the handler using `getContentProviders`.
 *   The handler MUST return the same number of content providers as the
 *   number of dirents it has returned.
 *
 * While it may seem that a DirentHandler is dynamically (de)activated by the
 * user, it is not. This is purely an internal structure to simplify the
 * internal architecture of the writer.
 */
class DirentHandler {
  public:
    explicit DirentHandler(CreatorData* data);
    virtual ~DirentHandler() = default;
    using ContentProviders = std::vector>;
    using Dirents = std::vector;

    virtual void start() = 0;
    virtual void stop() = 0;
    virtual bool isCompressible() = 0;

    /* Return the handler's own dirents.
     * `createDirents()` is called on the first call only; its result is
     * cached in `m_dirents` and returned by reference on every later call. */
    const Dirents& getDirents() {
      if (!m_direntsCreated) {
        m_dirents = createDirents();
        m_direntsCreated = true;
      }
      return m_dirents;
    }
    virtual ContentProviders getContentProviders() const = 0;

    /*
     * Handle a dirent/item.
     *
     * item may be nullptr (dirent is a redirect or in special case)
     */
    virtual void handle(Dirent* dirent, std::shared_ptr item) = 0;
    virtual void handle(Dirent* dirent, const Hints& hints) = 0;

  protected:
    /* Build the dirents returned (and cached) by `getDirents()`. */
    virtual Dirents createDirents() const = 0;
    DirentHandler() = default;

  private:
    Dirents m_dirents;                /* cache filled by getDirents() */
    bool m_direntsCreated {false};    /* true once m_dirents has been filled */
};

}
}

#endif // OPENZIM_LIBZIM_WRITER_HANDLER_H libzim-9.2.3/src/writer/item.cpp000066400000000000000000000043731466367137100166050ustar00rootroot00000000000000/* * Copyright (C) 2020-2021 Matthieu Gautier * Copyright (C) 2009 Tommi Maekitalo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include #include #include "defaultIndexData.h" namespace zim { namespace writer { std::shared_ptr Item::getIndexData() const { if (getMimeType().find("text/html")!=0) { return nullptr; } auto provider = getContentProvider(); return std::make_shared(std::move(provider), getTitle()); } Hints Item::getHints() const { return Hints(); } Hints Item::getAmendedHints() const { auto hints = getHints(); // If not FRONT_ARTICLE hints is given, determine it from the mimetype. if (hints.find(FRONT_ARTICLE) == hints.end()) { hints[FRONT_ARTICLE] = (getMimeType().find("text/html") == 0); } // If not COMPRESS hints is given, determine it from the mimetype. if (hints.find(COMPRESS) == hints.end()) { hints[COMPRESS] = isCompressibleMimetype(getMimeType()); } return hints; } std::unique_ptr StringItem::getContentProvider() const { auto shared_string = std::shared_ptr(shared_from_this(), &content); return std::unique_ptr(new SharedStringProvider(shared_string)); } std::unique_ptr FileItem::getContentProvider() const { return std::unique_ptr(new FileProvider(filepath)); } } } libzim-9.2.3/src/writer/queue.h000066400000000000000000000050071466367137100164330ustar00rootroot00000000000000/* * Copyright (C) 2016-2020 Matthieu Gautier * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. */ #ifndef OPENZIM_LIBZIM_QUEUE_H #define OPENZIM_LIBZIM_QUEUE_H #define MAX_QUEUE_SIZE 10 #include #include #include "../tools.h" template class Queue { public: Queue() = default; virtual ~Queue() = default; virtual bool isEmpty(); virtual size_t size(); virtual void pushToQueue(const T& element); virtual bool getHead(T &element); virtual bool popFromQueue(T &element); protected: std::queue m_realQueue; std::mutex m_queueMutex; private: // Make this queue non copyable Queue(const Queue&); Queue& operator=(const Queue&); }; template bool Queue::isEmpty() { std::lock_guard l(m_queueMutex); return m_realQueue.empty(); } template size_t Queue::size() { std::lock_guard l(m_queueMutex); return m_realQueue.size(); } template void Queue::pushToQueue(const T &element) { unsigned int wait = 0; unsigned int queueSize = 0; do { zim::microsleep(wait); queueSize = size(); wait += 10; } while (queueSize > MAX_QUEUE_SIZE); std::lock_guard l(m_queueMutex); m_realQueue.push(element); } template bool Queue::getHead(T &element) { std::lock_guard l(m_queueMutex); if (m_realQueue.empty()) { return false; } element = m_realQueue.front(); return true; } template bool Queue::popFromQueue(T &element) { std::lock_guard l(m_queueMutex); if (m_realQueue.empty()) { return false; } element = m_realQueue.front(); m_realQueue.pop(); return true; } #endif // OPENZIM_LIBZIM_QUEUE_H libzim-9.2.3/src/writer/tinyString.h000066400000000000000000000075711466367137100174710ustar00rootroot00000000000000/* * Copyright (C) 2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. 
* * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #ifndef ZIM_WRITER_TINYSTRING_H #define ZIM_WRITER_TINYSTRING_H #include "../zim_types.h" #include namespace zim { namespace writer { class TinyString { public: // functions TinyString() : m_data(nullptr), m_size(0) {} TinyString(const std::string& s) : m_data(new char[(uint16_t)s.size()]), m_size(s.size()) { if (s.size() >= 0xFFFF) { throw std::runtime_error("String len is too big"); } std::memcpy(m_data, s.data(), m_size); } TinyString(TinyString&& t): m_data(t.m_data), m_size(t.m_size) { t.m_data = nullptr; t.m_size = 0; }; TinyString(const TinyString& t) : m_data(new char[(uint16_t)t.m_size]), m_size(t.m_size) { std::memcpy(m_data, t.m_data, m_size); } ~TinyString() { if (m_data) { delete[] m_data; m_data = nullptr; } } operator std::string() const { return std::string(m_data, m_size); } bool empty() const { return m_size == 0; } size_t size() const { return m_size; } const char* const data() const { return m_data; } bool operator==(const TinyString& other) const { return (m_size == other.m_size) && (std::memcmp(m_data, other.m_data, m_size) == 0); } bool operator<(const TinyString& other) const { auto min_size = std::min(m_size, other.m_size); auto ret = std::memcmp(m_data, other.m_data, min_size); if (ret == 0) { return m_size < other.m_size; } else { return ret < 0; } } protected: // members char* m_data; uint16_t m_size; } PACKED; class PathTitleTinyString : public TinyString { public: PathTitleTinyString() : TinyString() {} PathTitleTinyString(const std::string& path, 
const std::string& title) : TinyString(PathTitleTinyString::concat(path, title)) {} static std::string concat(const std::string& path, const std::string& title) { std::string result(path.data(), path.size()+1); if ( title != path ) { result += title; } return result; } std::string getPath() const { if (m_size == 0) { return std::string(); } return std::string(m_data); } std::string getTitle(bool storedOnly) const { if (m_size == 0) { return std::string(); } auto title_start = std::strlen(m_data) + 1; if (title_start == m_size) { if (storedOnly) { return std::string(); // return empty title } else { return std::string(m_data); // return the path as a title } } else { return std::string(m_data+title_start, m_size-title_start); } } } PACKED; } } #endif // ZIM_WRITER_TINYSTRING_H libzim-9.2.3/src/writer/titleListingHandler.cpp000066400000000000000000000075641466367137100216250ustar00rootroot00000000000000/* * Copyright (C) 2020-2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. 
*/ #include "titleListingHandler.h" #include "creatordata.h" #include "../endian_tools.h" #include #include using namespace zim::writer; namespace { class ListingProvider : public ContentProvider { public: ListingProvider(const TitleListingHandler::Dirents* dirents, bool frontOnly) : mp_dirents(dirents), m_it(dirents->begin()), m_frontOnly(frontOnly) {} zim::size_type getSize() const override { if (m_frontOnly) { auto nbFrontArticles = std::count_if(mp_dirents->begin(), mp_dirents->end(), [](Dirent* d) { return d->isFrontArticle();}); return nbFrontArticles * sizeof(zim::entry_index_type); } else { return mp_dirents->size() * sizeof(zim::entry_index_type); } } zim::Blob feed() override { if (m_frontOnly) { while (m_it != mp_dirents->end() && !(*m_it)->isFrontArticle()) { m_it++; } } if (m_it == mp_dirents->end()) { return zim::Blob(nullptr, 0); } zim::toLittleEndian((*m_it)->getIdx().v, buffer); m_it++; return zim::Blob(buffer, sizeof(zim::entry_index_type)); } private: const TitleListingHandler::Dirents* mp_dirents; char buffer[sizeof(zim::entry_index_type)]; TitleListingHandler::Dirents::const_iterator m_it; bool m_frontOnly; }; } // end of anonymous namespace TitleListingHandler::TitleListingHandler(CreatorData* data) : mp_creatorData(data), m_hasFrontArticles(false) {} TitleListingHandler::~TitleListingHandler() = default; void TitleListingHandler::start() { } void TitleListingHandler::stop() { m_handledDirents.erase( std::remove_if(m_handledDirents.begin(), m_handledDirents.end(), [](const Dirent* d) { return d->isRemoved(); }), m_handledDirents.end()); std::sort(m_handledDirents.begin(), m_handledDirents.end(), TitleCompare()); } DirentHandler::Dirents TitleListingHandler::createDirents() const { Dirents ret; ret.push_back(mp_creatorData->createDirent(NS::X, "listing/titleOrdered/v0", "application/octet-stream+zimlisting", "")); if (m_hasFrontArticles) { ret.push_back(mp_creatorData->createDirent(NS::X, "listing/titleOrdered/v1", 
"application/octet-stream+zimlisting", "")); } return ret; } DirentHandler::ContentProviders TitleListingHandler::getContentProviders() const { ContentProviders ret; ret.push_back(std::unique_ptr(new ListingProvider(&m_handledDirents, false))); if (m_hasFrontArticles) { ret.push_back(std::unique_ptr(new ListingProvider(&m_handledDirents, true))); } return ret; } void TitleListingHandler::handle(Dirent* dirent, std::shared_ptr item) { handle(dirent, item->getAmendedHints()); } void TitleListingHandler::handle(Dirent* dirent, const Hints& hints) { m_handledDirents.push_back(dirent); // By definition, dirent not in `C` namespace are not FRONT_ARTICLE if (dirent->getNamespace() != NS::C) { return; } try { if(bool(hints.at(FRONT_ARTICLE))) { m_hasFrontArticles = true; dirent->setFrontArticle(); } } catch(std::out_of_range&) {} } libzim-9.2.3/src/writer/titleListingHandler.h000066400000000000000000000035741466367137100212670ustar00rootroot00000000000000/* * Copyright (C) 2020-2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. 
*/ #ifndef OPENZIM_LIBZIM_LISTING_HANDLER_H #define OPENZIM_LIBZIM_LISTING_HANDLER_H #include "handler.h" #include "_dirent.h" #include namespace zim { namespace writer { struct TitleCompare { bool operator() (const Dirent* d1, const Dirent* d2) const { return compareTitle(d1, d2); } }; // This handler is in charge of handling titles. // It will create the "classic" old V0 title listing (for ALL entries) but also // the V1 title listing (for front article only). class TitleListingHandler : public DirentHandler { public: explicit TitleListingHandler(CreatorData* data); virtual ~TitleListingHandler(); void start() override; void stop() override; bool isCompressible() override { return false; } ContentProviders getContentProviders() const override; void handle(Dirent* dirent, std::shared_ptr item) override; void handle(Dirent* dirent, const Hints& hints) override; protected: Dirents createDirents() const override; CreatorData* mp_creatorData; Dirents m_handledDirents; bool m_hasFrontArticles; }; } } #endif // OPENZIM_LIBZIM_LISTING_HANDLER_H libzim-9.2.3/src/writer/workers.cpp000066400000000000000000000050001466367137100173270ustar00rootroot00000000000000/* * Copyright (C) 2019-2020 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * */ #include "workers.h" #include "cluster.h" #include "creatordata.h" #include "../tools.h" #ifdef _WIN32 #include #else #include #endif namespace zim { namespace writer { void* taskRunner(void* arg) { auto creatorData = static_cast(arg); unsigned int wait = 0; try { while(!creatorData->isErrored()) { std::shared_ptr task; microsleep(wait); wait += 100; if (creatorData->taskList.popFromQueue(task)) { if (!task) { return nullptr; } task->run(creatorData); wait = 0; } } } catch (...) { creatorData->addError(std::current_exception()); } return nullptr; } void* clusterWriter(void* arg) { auto creatorData = static_cast(arg); Cluster* cluster; unsigned int wait = 0; try { while(!creatorData->isErrored()) { microsleep(wait); wait += 100; if(creatorData->clusterToWrite.getHead(cluster)) { if (cluster == nullptr) { // All cluster writen, we can quit return nullptr; } if (not cluster->isClosed()) { continue; } creatorData->clusterToWrite.popFromQueue(cluster); cluster->setOffset(offset_t(lseek(creatorData->out_fd, 0, SEEK_CUR))); cluster->write(creatorData->out_fd); cluster->clear_data(); wait = 0; } } } catch(...) { creatorData->addError(std::current_exception()); } return nullptr; } } } libzim-9.2.3/src/writer/workers.h000066400000000000000000000037051466367137100170060ustar00rootroot00000000000000/* * Copyright (C) 2019-2020 Matthieu Gautier * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. */ #ifndef OPENZIM_LIBZIM_WORKERS_H #define OPENZIM_LIBZIM_WORKERS_H #include "tools.h" #include "creatordata.h" namespace zim { namespace writer { class Task { public: Task() = default; virtual ~Task() = default; virtual void run(CreatorData* data) = 0; }; template class TrackableTask: public Task { public: TrackableTask(const TrackableTask&) = delete; TrackableTask& operator=(const TrackableTask&) = delete; TrackableTask() { ++waitingTaskCount; } virtual ~TrackableTask() { --waitingTaskCount;} static void waitNoMoreTask(const CreatorData* data) { // Wait for all tasks has been done // If we are in error state, threads have been stopped and waitingTaskCount // will never reach 0, so no need to wait. unsigned int wait = 0; do { microsleep(wait); wait += 10; } while(waitingTaskCount.load() > 0 && !data->isErrored()); } private: static std::atomic waitingTaskCount; }; template std::atomic zim::writer::TrackableTask::waitingTaskCount(0); void* taskRunner(void* data); void* clusterWriter(void* data); } } #endif // OPENZIM_LIBZIM_WORKERS_H libzim-9.2.3/src/writer/xapianHandler.cpp000066400000000000000000000076421466367137100204270ustar00rootroot00000000000000/* * Copyright (C) 2021 Maneesh P M * Copyright (C) 2020-2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. */ #include "xapianHandler.h" #include "xapianIndexer.h" #include "xapianWorker.h" #include "creatordata.h" #include using namespace zim::writer; XapianHandler::XapianHandler(CreatorData* data, bool withFulltextIndex) : mp_fulltextIndexer(withFulltextIndex ? new XapianIndexer(data->zimName+"_fulltext.idx", data->indexingLanguage, IndexingMode::FULL, true) : nullptr), mp_titleIndexer(new XapianIndexer(data->zimName+"_title.idx", data->indexingLanguage, IndexingMode::TITLE, true)), mp_creatorData(data) {} XapianHandler::~XapianHandler() = default; void XapianHandler::waitNoMoreTask() const { IndexTask::waitNoMoreTask(mp_creatorData); } void XapianHandler::start() { if (mp_fulltextIndexer) { mp_fulltextIndexer->indexingPrelude(); } mp_titleIndexer->indexingPrelude(); } void XapianHandler::stop() { // We need to wait that all indexation tasks have been done before closing the // xapian database. if (mp_fulltextIndexer) { waitNoMoreTask(); mp_fulltextIndexer->indexingPostlude(); } mp_titleIndexer->indexingPostlude(); } DirentHandler::Dirents XapianHandler::createDirents() const { // Wait for all task to be done before checking if we are empty. 
Dirents ret; if (mp_fulltextIndexer) { waitNoMoreTask(); if (!mp_fulltextIndexer->is_empty()) { ret.push_back(mp_creatorData->createDirent(NS::X, "fulltext/xapian", "application/octet-stream+xapian", "")); } } if (!mp_titleIndexer->is_empty()) { ret.push_back(mp_creatorData->createDirent(NS::X, "title/xapian", "application/octet-stream+xapian", "")); } return ret; } DirentHandler::ContentProviders XapianHandler::getContentProviders() const { ContentProviders ret; if (mp_fulltextIndexer && !mp_fulltextIndexer->is_empty()) { ret.push_back(std::unique_ptr(new FileProvider(mp_fulltextIndexer->getIndexPath()))); } if (!mp_titleIndexer->is_empty()) { ret.push_back(std::unique_ptr(new FileProvider(mp_titleIndexer->getIndexPath()))); } return ret; } void XapianHandler::indexTitle(Dirent* dirent) { auto title = dirent->getRealTitle(); if (title.empty()) { return; } auto path = dirent->getPath(); if (dirent->isRedirect()) { auto redirectPath = dirent->getRedirectPath(); mp_titleIndexer->indexTitle(path, title, redirectPath); } else { mp_titleIndexer->indexTitle(path, title); } } void XapianHandler::handle(Dirent* dirent, const Hints& hints) { if (dirent->getNamespace() != NS::C) { return; } try { if (bool(hints.at(FRONT_ARTICLE))) { indexTitle(dirent); } } catch(std::out_of_range&) {} } void XapianHandler::handle(Dirent* dirent, std::shared_ptr item) { if (dirent->getNamespace() != NS::C) { return; } // Title index. 
handle(dirent, item->getAmendedHints()); // FullText index if (mp_fulltextIndexer) { auto indexData = item->getIndexData(); if (!indexData) { return; } auto path = dirent->getPath(); mp_creatorData->taskList.pushToQueue(std::make_shared(indexData, path, mp_fulltextIndexer.get())); } } libzim-9.2.3/src/writer/xapianHandler.h000066400000000000000000000033241466367137100200650ustar00rootroot00000000000000/* * Copyright (C) 2020-2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. 
*/ #ifndef OPENZIM_LIBZIM_XAPIAN_HANDLER_H #define OPENZIM_LIBZIM_XAPIAN_HANDLER_H #include "handler.h" namespace zim { namespace writer { class XapianIndexer; class XapianHandler : public DirentHandler { public: XapianHandler(CreatorData* data, bool withFullTextIndex); virtual ~XapianHandler(); void start() override; void stop() override; bool isCompressible() override { return false; } ContentProviders getContentProviders() const override; void handle(Dirent* dirent, std::shared_ptr item) override; void handle(Dirent* dirent, const Hints& hints) override; protected: Dirents createDirents() const override; private: // methods void indexTitle(Dirent* dirent); void waitNoMoreTask() const; private: // data std::unique_ptr mp_fulltextIndexer; std::unique_ptr mp_titleIndexer; CreatorData* mp_creatorData; }; } } #endif // OPENZIM_LIBZIM_XAPIAN_WORKER_H libzim-9.2.3/src/writer/xapianIndexer.cpp000066400000000000000000000137411466367137100204450ustar00rootroot00000000000000/* * Copyright (C) 2021 Maneesh P M * Copyright (C) 2018-2021 Matthieu Gautier * Copyright (C) 2011 Emmanuel Engelhart * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. 
*/ #include "xapianIndexer.h" #include "libzim-resources.h" #include "fs.h" #include "tools.h" #include "../constants.h" #include #include #include #include using namespace zim::writer; /* Constructor */ XapianIndexer::XapianIndexer(const std::string& indexPath, const std::string& language, IndexingMode indexingMode, const bool verbose) : indexPath(indexPath), language(language), indexingMode(indexingMode) { /* Build ICU Local object to retrieve ISO-639 language code (from ISO-639-3) */ icu::Locale languageLocale(language.c_str()); stemmer_language = languageLocale.getLanguage(); /* Read the stopwords */ std::string stopWord; try { this->stopwords = getResource("stopwords/" + language); } catch(ResourceNotFound& e) {} std::istringstream file(this->stopwords); while (std::getline(file, stopWord, '\n')) { this->stopper.add(stopWord); } } XapianIndexer::~XapianIndexer() { if (!indexPath.empty()) { try { #ifndef _WIN32 //[TODO] Implement remove for windows zim::DEFAULTFS::remove(indexPath + ".tmp"); zim::DEFAULTFS::remove(indexPath); #endif } catch (...) { /* Do not raise */ } } } /* * `valuesmap` is a metadata associated with the Xapian database. We are using it * to attach slot numbers of each document in the index to the value they are storing. * These values and slot numbers are used in collapsing, filtering etc. * * Title index: * Slot 0: Title of the article. Used in collapsing articles with same name. * Slot 1: path/redirectPath of the article. Used in collapsing duplicates(redirects). * * Fulltext Index: * Slot 0: Title of the article. Used in collapsing articles with same name. * Slot 1: Word count of the article. * Slot 2: Geo position of the article. Used for geo-filtering. * * `kind` metadata indicate whether the database is a title or a fulltext index. * * `data` metadata indicate the type of data stored in the index. A value of "fullPath" * means the data stores the complete path with a namespace. 
*/ void XapianIndexer::indexingPrelude() { writableDatabase = Xapian::WritableDatabase(indexPath + ".tmp", Xapian::DB_CREATE_OR_OVERWRITE | Xapian::DB_NO_TERMLIST); switch (indexingMode) { case IndexingMode::TITLE: writableDatabase.set_metadata("valuesmap", "title:0;targetPath:1"); writableDatabase.set_metadata("kind", "title"); writableDatabase.set_metadata("data", "fullPath"); break; case IndexingMode::FULL: writableDatabase.set_metadata("valuesmap", "title:0;wordcount:1;geo.position:2"); writableDatabase.set_metadata("kind", "fulltext"); writableDatabase.set_metadata("data", "fullPath"); break; } writableDatabase.set_metadata("language", language); writableDatabase.set_metadata("stopwords", stopwords); } namespace { size_t getTermCount(const Xapian::Document& d) { return std::distance(d.termlist_begin(), d.termlist_end()); } } // unnamed namespace /* * For title index, index the full path with namespace as data of the document. * The targetPath in valuesmap will store the path without namespace. * TODO: * Currently for title index we are storing path twice (redirectPath/path in * valuesmap and path in index data). In the future, we want to keep only one of * these(index data if possible) to reduce index size while supporting the * collapse on path feature. */ void XapianIndexer::indexTitle(const std::string& path, const std::string& title, const std::string& targetPath) { assert(indexingMode == IndexingMode::TITLE); Xapian::Stem stemmer; Xapian::TermGenerator indexer; indexer.set_flags(Xapian::TermGenerator::FLAG_CJK_NGRAM); try { stemmer = Xapian::Stem(stemmer_language); indexer.set_stemmer(stemmer); indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME); } catch (...) 
{} Xapian::Document currentDocument; currentDocument.clear_values(); std::string fullPath = "C/" + path; currentDocument.set_data(fullPath); indexer.set_document(currentDocument); std::string unaccentedTitle = zim::removeAccents(title); currentDocument.add_value(0, title); if (targetPath.empty()) { currentDocument.add_value(1, path); } else { currentDocument.add_value(1, targetPath); } if (!unaccentedTitle.empty()) { std::string anchoredTitle = ANCHOR_TERM + unaccentedTitle; indexer.index_text(anchoredTitle, 1); if ( getTermCount(currentDocument) == 1 ) { // only ANCHOR_TERM was added, hence unaccentedTitle is made solely of // non-word characters. Then add entire title as a single term. currentDocument.remove_term(*currentDocument.termlist_begin()); currentDocument.add_term(unaccentedTitle); } } /* add to the database */ writableDatabase.add_document(currentDocument); empty = false; } void XapianIndexer::indexingPostlude() { this->writableDatabase.commit(); #if defined ENABLE_XAPIAN_FULLER auto flags = Xapian::DBCOMPACT_SINGLE_FILE|Xapian::Compactor::FULLER; #else auto flags = Xapian::DBCOMPACT_SINGLE_FILE; #endif this->writableDatabase.compact(indexPath, flags); this->writableDatabase.close(); } libzim-9.2.3/src/writer/xapianIndexer.h000066400000000000000000000036061466367137100201110ustar00rootroot00000000000000/* * Copyright (C) 2021 Maneesh P M * Copyright (C) 2018-2021 Matthieu Gautier * Copyright (C) 2011 Emmanuel Engelhart * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 3 of the License, or * any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. */

#ifndef LIBZIM_WRITER_XAPIANINDEXER_H
#define LIBZIM_WRITER_XAPIANINDEXER_H

#include
#include
#include
#include

namespace zim {
namespace writer {

class IndexTask;

/* Kind of index a XapianIndexer builds. */
enum class IndexingMode {
  TITLE,
  FULL
};

/*
 * Holds a Xapian writable database together with the language, stopword and
 * path settings used while filling it.
 */
class XapianIndexer {
  public:
    XapianIndexer(const std::string& indexPath, const std::string& language, IndexingMode mode, bool verbose);
    virtual ~XapianIndexer();
    /* Path given at construction (returned by value). */
    std::string getIndexPath() { return indexPath; }
    void indexingPrelude();
    void indexingPostlude();
    /* Current value of the `empty` flag (initially true). */
    bool is_empty() { return empty; }

    void indexTitle(const std::string& path, const std::string& title, const std::string& targetPath = "");

  protected:
    Xapian::WritableDatabase writableDatabase;
    bool empty {true};
    std::string stemmer_language;
    Xapian::SimpleStopper stopper;
    std::string indexPath;
    std::string language;
    std::string stopwords;
    IndexingMode indexingMode;

  /* IndexTask is granted access to the protected members above. */
  friend class zim::writer::IndexTask;
};

}
}

#endif // LIBZIM_WRITER_XAPIANINDEXER_H libzim-9.2.3/src/writer/xapianWorker.cpp000066400000000000000000000062221466367137100203140ustar00rootroot00000000000000/* * Copyright (C) 2021 Maneesh P M * Copyright (C) 2020-2021 Matthieu Gautier * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and * NON-INFRINGEMENT. See the GNU General Public License for more details. 
/*
 * Weight applied to title terms when indexing a document.
 *
 * The factor starts at 1 and grows by one for every full 500 units of
 * content length, so the title keeps its relative importance in long
 * documents.
 */
inline unsigned int getTitleBoostFactor(const unsigned int contentLength)
{
  const unsigned int fullChunks = contentLength / 500;
  return fullChunks + 1;
}
/*
 * Work item that indexes one entry into the Xapian database owned by a
 * XapianIndexer. The indexing itself happens in run(), which is defined
 * in xapianWorker.cpp.
 *
 * NOTE(review): in this copy of the file `TrackableTask` and
 * `std::shared_ptr` appear without template arguments (they look to have
 * been lost during extraction). Restore them from the upstream sources
 * before compiling - do not guess.
 */
class IndexTask : public TrackableTask {
  public:
    // A task is neither copy-constructible nor copy-assignable.
    IndexTask(const IndexTask&) = delete;
    IndexTask& operator=(const IndexTask&) = delete;

    // indexData : data to index for the entry (shared with the caller).
    // path      : path of the entry; copied into the task.
    // indexer   : indexer whose database receives the document. Stored as
    //             a raw pointer and never deleted by this class (the
    //             destructor is defaulted).
    IndexTask(std::shared_ptr indexData, const std::string& path, XapianIndexer* indexer) :
      mp_indexData(indexData),
      m_path(path),
      mp_indexer(indexer)
    {}
    virtual ~IndexTask() = default;

    // Executes the indexing job.
    virtual void run(CreatorData* data);

  private:
    std::shared_ptr mp_indexData;
    std::string m_path;
    XapianIndexer* mp_indexer;
};
// Lowercase `str` in place, one byte at a time, using the current C locale.
//
// Each char is converted to unsigned char before being handed to tolower():
// passing a negative value (any byte >= 0x80 where char is signed) is
// undefined behaviour.
inline void
lowercase_string(std::string &str)
{
    for (char &ch : str) {
	ch = static_cast<char>(tolower(static_cast<unsigned char>(ch)));
    }
}
} inline static bool p_whitespacegt(char c) { return isspace(static_cast(c)) || c == '>'; } inline static bool p_whitespaceeqgt(char c) { return isspace(static_cast(c)) || c == '=' || c == '>'; } bool zim::HtmlParser::get_parameter(const string & param, string & value) { map::const_iterator i = parameters.find(param); if (i == parameters.end()) return false; value = i->second; return true; } zim::HtmlParser::HtmlParser() { static const struct ent { const char *n; unsigned int v; } ents[] = { #include "namedentities.h" { NULL, 0 } }; std::lock_guard l(sInitLock); if (named_ents.empty()) { const struct ent *i = ents; while (i->n) { named_ents[string(i->n)] = i->v; ++i; } } } void zim::HtmlParser::decode_entities(string &s) { // We need a const_iterator version of s.end() - otherwise the // find() and find_if() templates don't work... string::const_iterator amp = s.begin(), s_end = s.end(); while ((amp = find(amp, s_end, '&')) != s_end) { unsigned int val = 0; string::const_iterator end, p = amp + 1; if (p != s_end && *p == '#') { p++; if (p != s_end && (*p == 'x' || *p == 'X')) { // hex p++; end = find_if(p, s_end, p_notxdigit); sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val); } else { // number end = find_if(p, s_end, p_notdigit); val = atoi(s.substr(p - s.begin(), end - p).c_str()); } } else { end = find_if(p, s_end, p_notalnum); string code = s.substr(p - s.begin(), end - p); map::const_iterator i; i = named_ents.find(code); if (i != named_ents.end()) val = i->second; } if (end < s_end && *end == ';') end++; if (val) { string::size_type amp_pos = amp - s.begin(); if (val < 0x80) { s.replace(amp_pos, end - amp, 1u, char(val)); } else { // Convert unicode value val to UTF-8. char seq[4]; unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq); s.replace(amp_pos, end - amp, seq, len); } s_end = s.end(); // We've modified the string, so the iterators are no longer // valid... 
amp = s.begin() + amp_pos + 1; } else { amp = end; } } } void zim::HtmlParser::parse_html(const string &body) { in_script = false; parameters.clear(); string::const_iterator start = body.begin(); while (true) { // Skip through until we find an HTML tag, a comment, or the end of // document. Ignore isolated occurrences of `<' which don't start // a tag or comment. string::const_iterator p = start; while (true) { p = find(p, body.end(), '<'); if (p == body.end()) break; unsigned char ch = *(p + 1); // Tag, closing tag, or comment (or SGML declaration). if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break; if (ch == '?') { // PHP code or XML declaration. // XML declaration is only valid at the start of the first line. // FIXME: need to deal with BOMs... if (p != body.begin() || body.size() < 20) break; // XML declaration looks something like this: // if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break; if (strchr(" \t\r\n", p[5]) == NULL) break; string::const_iterator decl_end = find(p + 6, body.end(), '?'); if (decl_end == body.end()) break; // Default charset for XML is UTF-8. charset = "UTF-8"; string decl(p + 6, decl_end); size_t enc = decl.find("encoding"); if (enc == string::npos) break; enc = decl.find_first_not_of(" \t\r\n", enc + 8); if (enc == string::npos || enc == decl.size()) break; if (decl[enc] != '=') break; enc = decl.find_first_not_of(" \t\r\n", enc + 1); if (enc == string::npos || enc == decl.size()) break; if (decl[enc] != '"' && decl[enc] != '\'') break; char quote = decl[enc++]; size_t enc_end = decl.find(quote, enc); if (enc != string::npos) charset = decl.substr(enc, enc_end - enc); break; } p++; } // Process text up to start of tag. 
if (p > start) { string text = body.substr(start - body.begin(), p - start); // convert_to_utf8(text, charset); decode_entities(text); process_text(text); } if (p == body.end()) break; start = p + 1; if (start == body.end()) break; if (*start == '!') { if (++start == body.end()) break; if (++start == body.end()) break; // comment or SGML declaration if (*(start - 1) == '-' && *start == '-') { ++start; string::const_iterator close = find(start, body.end(), '>'); // An unterminated comment swallows rest of document // (like Netscape, but unlike MSIE IIRC) if (close == body.end()) break; p = close; // look for --> while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-')) p = find(p + 1, body.end(), '>'); if (p != body.end()) { // Check for htdig's "ignore this bit" comments. if (p - start == 15 && string(start, p - 2) == "htdig_noindex") { string::size_type i; i = body.find("", p + 1 - body.begin()); if (i == string::npos) break; start = body.begin() + i + 21; continue; } // If we found --> skip to there. start = p; } else { // Otherwise skip to the first > we found (as Netscape does). 
start = close; } } else { // just an SGML declaration, perhaps giving the DTD - ignore it start = find(start - 1, body.end(), '>'); if (start == body.end()) break; } ++start; } else if (*start == '?') { if (++start == body.end()) break; // PHP - swallow until ?> or EOF start = find(start + 1, body.end(), '>'); // look for ?> while (start != body.end() && *(start - 1) != '?') start = find(start + 1, body.end(), '>'); // unterminated PHP swallows rest of document (rather arbitrarily // but it avoids polluting the database when things go wrong) if (start != body.end()) ++start; } else { // opening or closing tag int closing = 0; if (*start == '/') { closing = 1; start = find_if(start + 1, body.end(), p_notwhitespace); } p = start; start = find_if(start, body.end(), p_nottag); string tag = body.substr(p - body.begin(), start - p); // convert tagname to lowercase lowercase_string(tag); if (closing) { closing_tag(tag); if (in_script && tag == "script") in_script = false; /* ignore any bogus parameters on closing tags */ p = find(start, body.end(), '>'); if (p == body.end()) break; start = p + 1; } else { // FIXME: parse parameters lazily. 
while (start < body.end() && *start != '>') { string name, value; p = find_if(start, body.end(), p_whitespaceeqgt); name.assign(body, start - body.begin(), p - start); p = find_if(p, body.end(), p_notwhitespace); start = p; if (start != body.end() && *start == '=') { start = find_if(start + 1, body.end(), p_notwhitespace); p = body.end(); int quote = *start; if (quote == '"' || quote == '\'') { start++; p = find(start, body.end(), quote); } if (p == body.end()) { // unquoted or no closing quote p = find_if(start, body.end(), p_whitespacegt); } value.assign(body, start - body.begin(), p - start); start = find_if(p, body.end(), p_notwhitespace); if (!name.empty()) { // convert parameter name to lowercase lowercase_string(name); // in case of multiple entries, use the first // (as Netscape does) parameters.insert(make_pair(name, value)); } } } #if 0 cout << "<" << tag; map::const_iterator x; for (x = parameters.begin(); x != parameters.end(); x++) { cout << " " << x->first << "=\"" << x->second << "\""; } cout << ">\n"; #endif opening_tag(tag); parameters.clear(); // In