pax_global_header00006660000000000000000000000064151716725670014533gustar00rootroot0000000000000052 comment=6348fcc9c25f7fe8a878b75d84ba8630e5787e82 scrapinghub-web-poet-ba87b95/000077500000000000000000000000001517167256700162035ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/.codecov.yml000066400000000000000000000001631517167256700204260ustar00rootroot00000000000000comment: layout: "header, diff, tree" coverage: status: project: false ignore: - "web_poet/example.py" scrapinghub-web-poet-ba87b95/.git-blame-ignore-revs000066400000000000000000000006141517167256700223040ustar00rootroot00000000000000# Contains commits to be ignored due to linting # https://github.com/scrapinghub/web-poet/pull/34 660b1192da0e765a62b51bfe0cac1aec202e6310 be4bb7db17772f9128357246fdb88177f61c3ca8 2c8b9218bf90e9bad96b5d5eb59212dcb92dab43 279a90ac3e14522a8e2e7f8dabc2378f630a5c6f fff64aa4def52ed792691d1a8f8bd5e44f36e2bc # https://github.com/scrapinghub/web-poet/pull/68 2b88a79003d2c05ff8fa725e293599aee75de919 scrapinghub-web-poet-ba87b95/.github/000077500000000000000000000000001517167256700175435ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/.github/workflows/000077500000000000000000000000001517167256700216005ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/.github/workflows/publish.yml000066400000000000000000000015541517167256700237760ustar00rootroot00000000000000# This workflows will upload a Python Package using Twine when a release is created # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries name: publish on: push: tags: - "[0-9]+.[0-9]+.[0-9]+" jobs: deploy: runs-on: ubuntu-latest environment: name: pypi url: https://pypi.org/p/web-poet permissions: id-token: write steps: - uses: actions/checkout@v5 - name: Set up Python uses: actions/setup-python@v6 with: python-version: "3.14" - name: Install dependencies run: | python -m pip install --upgrade pip 
pip install setuptools build - name: Build package run: python -m build - name: Publish package uses: pypa/gh-action-pypi-publish@release/v1 scrapinghub-web-poet-ba87b95/.github/workflows/tests-ubuntu.yml000066400000000000000000000036151517167256700250120ustar00rootroot00000000000000# This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions name: tox-Ubuntu on: push: branches: [ master ] pull_request: jobs: test: runs-on: ubuntu-latest strategy: fail-fast: false matrix: python-version: ['3.10', '3.11', '3.12', '3.13', '3.14'] env: - TOXENV: py include: - python-version: '3.10' env: TOXENV: min - python-version: '3.10' env: TOXENV: min-framework - python-version: '3.14' env: TOXENV: framework steps: - uses: actions/checkout@v5 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install tox - name: tox env: ${{ matrix.env }} run: | tox - name: coverage uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} check: runs-on: ubuntu-latest strategy: fail-fast: false matrix: # Keep in sync with pyproject.toml’s tool.sphinx-scrapy.python-version. 
python-version: ['3.13'] tox-job: ["mypy", "docs", "pre-commit", "types", "twinecheck"] steps: - uses: actions/checkout@v5 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install tox - name: tox run: | tox -e ${{ matrix.tox-job }} scrapinghub-web-poet-ba87b95/.github/workflows/tests-windows.yml000066400000000000000000000023671517167256700251650ustar00rootroot00000000000000# This workflow will install Python dependencies, run tests and lint with a variety of Python versions # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions name: tox-Windows on: push: branches: [ master ] pull_request: jobs: test: runs-on: windows-latest strategy: fail-fast: false matrix: python-version: ['3.10', '3.11', '3.12', '3.13', '3.14'] env: - TOXENV: py include: - python-version: '3.10' env: TOXENV: min - python-version: '3.10' env: TOXENV: min-framework - python-version: '3.14' env: TOXENV: framework steps: - uses: actions/checkout@v5 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip python -m pip install tox - name: tox env: ${{ matrix.env }} run: | tox - name: coverage uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} scrapinghub-web-poet-ba87b95/.gitignore000066400000000000000000000001461517167256700201740ustar00rootroot00000000000000.coverage /coverage.xml .mypy_cache/ .tox/ dist/ htmlcov/ docs/_build *.egg-info/ __pycache__/ .idea/ scrapinghub-web-poet-ba87b95/.pre-commit-config.yaml000066400000000000000000000012131517167256700224610ustar00rootroot00000000000000exclude: ^docs/tutorial-project/ repos: - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.14.11 hooks: 
- id: ruff-check args: [ --fix ] - id: ruff-format - repo: https://github.com/adamchainz/blacken-docs rev: 1.20.0 hooks: - id: blacken-docs additional_dependencies: - black==25.12.0 - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 hooks: - id: end-of-file-fixer - id: trailing-whitespace - repo: https://github.com/sphinx-contrib/sphinx-lint rev: v1.0.2 hooks: - id: sphinx-lint - repo: https://github.com/scrapy/sphinx-scrapy rev: 0.8.1 hooks: - id: sphinx-scrapy scrapinghub-web-poet-ba87b95/.readthedocs.yml000066400000000000000000000003251517167256700212710ustar00rootroot00000000000000version: 2 build: os: ubuntu-24.04 tools: python: "3.13" commands: - pip install tox - tox -e docs - mkdir -p $READTHEDOCS_OUTPUT/html - cp -a docs/_build/all/. $READTHEDOCS_OUTPUT/html/ scrapinghub-web-poet-ba87b95/.vscode/000077500000000000000000000000001517167256700175445ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/.vscode/settings.json000066400000000000000000000004251517167256700223000ustar00rootroot00000000000000{ "chat.tools.terminal.autoApprove": { "/^(uvx (-p \\d+\\.\\d+ )?)?tox -e (pre-commit|mypy|types|docs|twinecheck|min|min-framework|py\\d3|framework)( --( (-q|tests/\\S+))*)?$/": { "approve": true, "matchCommandLine": true }, } } scrapinghub-web-poet-ba87b95/CHANGELOG.rst000066400000000000000000000430611517167256700202300ustar00rootroot00000000000000========= Changelog ========= 0.24.0 (2026-04-21) ------------------- Backward-incompatible changes: * The tutorial-only ``web_poet.example`` module has been removed. * :class:`~web_poet.testing.Fixture` got some backward-incompatible changes: * The ``type_name`` and ``short_name`` properties have been removed. 
* | The following methods got a new ``page_cls`` parameter: | :meth:`~web_poet.testing.Fixture.assert_field_correct` | :meth:`~web_poet.testing.Fixture.assert_full_item_correct` | :meth:`~web_poet.testing.Fixture.assert_no_extra_fields` | :meth:`~web_poet.testing.Fixture.assert_no_toitem_exceptions` | :meth:`~web_poet.testing.Fixture.assert_toitem_exception` | :meth:`~web_poet.testing.Fixture.get_output` | :meth:`~web_poet.testing.Fixture.get_page` Improvements: * Added a :ref:`built-in framework ` for simple use cases. * :class:`~web_poet.testing.Fixture` instances are no longer tied to a single page object class. While pytest discovery still requires the parent folder name to be named after a page object class, a new ``--page-object`` option of ``python -m web_poet.testing`` allows specifying a different page object class. * :class:`~web_poet.exceptions.Retry` now accepts an optional ``max_retries`` parameter. * Added :class:`~web_poet.page_inputs.stats.DictStatCollector`. 0.23.3 (2026-04-07) ------------------- * Added :ref:`ai` to the docs. * Made the documentation more LLM-friendly, with markdown versions of every page and :file:`llms.txt` and :file:`llms-full.txt` files. 0.23.2 (2026-03-10) ------------------- * JSON files in :ref:`test fixtures ` are now saved using UTF-8 instead of the system encoding. 0.23.1 (2026-01-27) ------------------- * :func:`@field ` no longer strips docstrings from decorated methods. 0.23.0 (2026-01-22) ------------------- * Dropped Python 3.9 support. * Added :func:`~web_poet.annotation_encode` (see :ref:`input-annotations`) and :func:`~web_poet.annotation_decode`. * Implemented type hint improvements. 0.22.0 (2025-12-15) ------------------- * :ref:`Tests ` now put expected and actual values into :ref:`pytest user properties `. 0.21.0 (2025-11-24) ------------------- * Added :class:`~web_poet.pages.BrowserPage` page object class to work with :class:`~web_poet.page_inputs.browser.BrowserResponse`. 
* Added :attr:`BrowserResponse.text ` attribute. 0.20.0 (2025-10-28) ------------------- * Added support for Python 3.14. * Added support for :class:`~.BrowserResponse`, :class:`~.AnyResponse` and :class:`~.BrowserHtml` dependencies to the :ref:`testing framework `. * Explicitly re-export public names. 0.19.2 (2025-08-22) ------------------- * Fixed runtime resolving of type annotations for some types. 0.19.1 (2025-08-13) ------------------- * Improved type annotations. 0.19.0 (2025-06-06) ------------------- * Removed some deprecated code: * The ``web_poet.overrides`` module is removed. * The ``ItemWebPage``, ``OverrideRule`` and ``PageObjectRegistry`` classes are removed. * The ``from_override_rules()`` class method and the ``get_overrides()`` and ``search_overrides()`` methods of :class:`~web_poet.rules.RulesRegistry` are removed. * The ``overrides`` parameter of :meth:`~web_poet.rules.RulesRegistry.handle_urls` is removed. * The ``RequestUrl`` and ``ResponseUrl`` classes can no longer be imported from ``web_poet.page_inputs.http``. * :ref:`Tests ` now support items with :class:`~web_poet.page_inputs.url.RequestUrl` and :class:`~web_poet.page_inputs.url.ResponseUrl` objects. * Improved the :ref:`pytest plugin `: * Pytest ≥ 7.0.0 is now required. * Tests within a test case can now be run individually. * Tests are now compatible with `vscode-python`_. .. _vscode-python: https://github.com/microsoft/vscode-python * Fixed an error of :func:`~web_poet.pages.is_injectable` with :class:`~types.GenericAlias` on Python ≤ 3.10. 0.18.0 (2025-01-30) ------------------- * Removed support for Python 3.8, added support for Python 3.13. * The minimum required version of :doc:`url-matcher ` changed from ``0.2.0`` to ``0.4.0``. * ``type(None)`` is no longer considered injectable. * Added :meth:`RulesRegistry.top_rules_for_item() `. 
0.17.1 (2024-10-11) ------------------- * :attr:`web_poet.mixins.SelectableMixin.selector` is now created with the ``base_url`` value set to ``self.url`` if this attribute exists. * Added a mention of the :doc:`form2request library ` to the :class:`~.HttpRequest` documentation. * CI improvements. 0.17.0 (2024-03-04) ------------------- * Now requires ``andi >= 0.5.0``. * Package requirements that were unversioned now have minimum versions specified. * Added support for Python 3.12. * Added support for ``typing.Annotated`` dependencies to the serialization and testing code. * Documentation improvements. * CI improvements. 0.16.0 (2024-01-23) ------------------- * Added new :class:`~.AnyResponse` which holds either :class:`~.BrowserResponse`, or :class:`~.HttpResponse`. * Documentation improvements. 0.15.1 (2023-11-21) ------------------- * ``HttpRequestHeaders`` now has a ``from_bytes_dict`` class method, like ``HttpResponseHeaders``. 0.15.0 (2023-09-11) ------------------- * A new dependency, :class:`~.Stats`, has been added. It allows storing key-value data pairs for different purposes. See :ref:`stats`. 0.14.0 (2023-08-03) ------------------- * Dropped Python 3.7 support. * Now requires ``packaging >= 20.0``. * Fixed detection of the :class:`~.Returns` base class. * Improved docs. * Updated type hints. * Updated CI tools. 0.13.1 (2023-05-30) ------------------- * Fixed an issue with :class:`~.HttpClient` which happens when a response with a non-standard status code is received. 0.13.0 (2023-05-30) ------------------- * A new dependency :class:`~.BrowserResponse` has been added. It contains a browser-rendered page URL, status code and HTML. * The :ref:`rules` documentation section has been rewritten. 0.12.0 (2023-05-05) ------------------- * The :ref:`testing framework ` now allows defining a :ref:`custom item adapter `. 
* We have made a backward-incompatible change on test fixture serialization: the ``type_name`` field of exceptions has been renamed to ``import_path``. * Fixed built-in Python types, e.g. ``int``, not working as :ref:`field processors `. 0.11.0 (2023-04-24) ------------------- * JMESPath_ support is now available: you can use :meth:`.WebPage.jmespath` and :meth:`.HttpResponse.jmespath` to run queries on JSON responses. * The testing framework now supports page objects that raise exceptions from the ``to_item`` method. .. _JMESPath: https://jmespath.org/ 0.10.0 (2023-04-19) ------------------- * New class :class:`~.Extractor` can be used for easier extraction of nested fields (see :ref:`default-processors-nested`). * Exceptions raised while getting a response for an additional request are now saved in :ref:`test fixtures `. * Multiple documentation improvements and fixes. * Add a ``twine check`` CI check. 0.9.0 (2023-03-30) ------------------ * Standardized :ref:`input validation `. * :ref:`Field processors ` can now also be defined through a nested ``Processors`` class, so that field redefinitions in subclasses can inherit them. See :ref:`default-processors`. * :ref:`Field processors ` can now opt in to receive the page object whose field is being read. * :class:`web_poet.fields.FieldsMixin` now keeps fields from all base classes when using multiple inheritance. * Fixed the documentation build. 0.8.1 (2023-03-03) ------------------ * Fix the error when calling :meth:`.to_item() `, :func:`item_from_fields_sync() `, or :func:`item_from_fields() ` on page objects defined as slotted attrs classes, while setting ``skip_nonitem_fields=True``. 0.8.0 (2023-02-23) ------------------ This release contains many improvements to the web-poet testing framework, as well as some other improvements and bug fixes. Backward-incompatible changes: * :func:`~.cached_method` no longer caches exceptions for ``async def`` methods. 
This makes the behavior the same for sync and async methods, and also makes it consistent with Python's stdlib caching (i.e. :func:`functools.lru_cache`, :func:`functools.cached_property`). * The testing framework now uses the ``HttpResponse-info.json`` file name instead of ``HttpResponse-other.json`` to store information about HttpResponse instances. To make tests generated with older web-poet work, rename these files on disk. Testing framework improvements: * Improved test reporting: better diffs and error messages. * By default, the pytest plugin now generates a test per item attribute (see :ref:`web-poet-testing-pytest`). There is also an option (``--web-poet-test-per-item``) to run a test per item instead. * Page objects with the :class:`~.HttpClient` dependency are now supported (see :ref:`web-poet-testing-additional-requests`). * Page objects with the :class:`~.PageParams` dependency are now supported. * Added a new ``python -m web_poet.testing rerun`` command (see :ref:`web-poet-testing-tdd`). * Fixed support for nested (indirect) dependencies in page objects. Previously they were not handled properly by the testing framework. * Non-ASCII output is now stored without escaping in the test fixtures, for better readability. Other changes: * Testing and CI fixes. * Fixed a packaging issue: ``tests`` and ``tests_extra`` packages were installed, not just ``web_poet``. 0.7.2 (2023-02-01) ------------------ * Restore the minimum version of ``itemadapter`` from 0.7.1 to 0.7.0, and prevent a similar issue from happening again in the future. 0.7.1 (2023-02-01) ------------------ * Updated the :ref:`tutorial ` to cover recent features and focus on best practices. Also, a new module was added, :mod:`web_poet.example`, that allows using page objects while following the tutorial. * :ref:`web-poet-testing` now covers :ref:`Git LFS ` and :ref:`scrapy-poet `, and recommends ``python -m pytest`` instead of ``pytest``. 
* Improved the warning message when duplicate ``ApplyRule`` objects are found. * ``HttpResponse-other.json`` content is now indented for better readability. * Improved test coverage for :ref:`fields `. 0.7.0 (2023-01-18) ------------------ * Add :ref:`a framework for creating tests and running them with pytest `. * Support implementing fields in mixin classes. * Introduce new methods for :class:`web_poet.rules.RulesRegistry`: * :meth:`web_poet.rules.RulesRegistry.add_rule` * :meth:`web_poet.rules.RulesRegistry.overrides_for` * :meth:`web_poet.rules.RulesRegistry.page_cls_for_item` * Improved the performance of :meth:`web_poet.rules.RulesRegistry.search` where passing a single parameter of either ``instead_of`` or ``to_return`` results in *O(1)* look-up time instead of *O(N)*. Additionally, having either ``instead_of`` or ``to_return`` present in multi-parameter search calls would filter the initial candidate results resulting in a faster search. * Support :ref:`page object dependency serialization `. * Add new dependencies used in testing and serialization code: ``andi``, ``python-dateutil``, and ``time-machine``. Also ``backports.zoneinfo`` on non-Windows platforms when the Python version is older than 3.9. 0.6.0 (2022-11-08) ------------------ In this release, the ``@handle_urls`` decorator gets an overhaul; it's not required anymore to pass another Page Object class to ``@handle_urls("...", overrides=...)``. Also, the ``@web_poet.field`` decorator gets support for output processing functions, via the ``out`` argument. Full list of changes: * **Backwards incompatible** ``PageObjectRegistry`` is no longer supporting dict-like access. * Official support for Python 3.11. * New ``@web_poet.field(out=[...])`` argument which allows to set output processing functions for web-poet fields. * The ``web_poet.overrides`` module is deprecated and replaced with ``web_poet.rules``. 
* The ``@handle_urls`` decorator is now creating ``ApplyRule`` instances instead of ``OverrideRule`` instances; ``OverrideRule`` is deprecated. ``ApplyRule`` is similar to ``OverrideRule``, but has the following differences: * ``ApplyRule`` accepts a ``to_return`` parameter, which should be the data container (item) class that the Page Object returns. * Passing a string to ``for_patterns`` would auto-convert it into ``url_matcher.Patterns``. * All arguments are now keyword-only except for ``for_patterns``. * New signature and behavior of ``handle_urls``: * The ``overrides`` parameter is made optional and renamed to ``instead_of``. * If defined, the item class declared in a subclass of ``web_poet.ItemPage`` is used as the ``to_return`` parameter of ``ApplyRule``. * Multiple ``handle_urls`` annotations are allowed. * ``PageObjectRegistry`` is replaced with ``RulesRegistry``; its API is changed: * **backwards incompatible** dict-like API is removed; * **backwards incompatible** *O(1)* lookups using ``.search(use=PagObject)`` has become *O(N)*; * ``search_overrides`` method is renamed to ``search``; * ``get_overrides`` method is renamed to ``get_rules``; * ``from_override_rules`` method is deprecated; use ``RulesRegistry(rules=...)`` instead. * Typing improvements. * Documentation, test, and warning message improvements. Deprecations: * The ``web_poet.overrides`` module is deprecated. Use ``web_poet.rules`` instead. * The ``overrides`` parameter from ``@handle_urls`` is now deprecated. Use the ``instead_of`` parameter instead. * The ``OverrideRule`` class is now deprecated. Use ``ApplyRule`` instead. * ``PageObjectRegistry`` is now deprecated. Use ``RulesRegistry`` instead. * The ``from_override_rules`` method of ``PageObjectRegistry`` is now deprecated. Use ``RulesRegistry(rules=...)`` instead. * The ``PageObjectRegistry.get_overrides`` method is deprecated. Use ``PageObjectRegistry.get_rules`` instead. 
* The ``PageObjectRegistry.search_overrides`` method is deprecated. Use ``PageObjectRegistry.search`` instead. 0.5.1 (2022-09-23) ------------------ * The BOM encoding from the response body is now read before the response headers when deriving the response encoding. * Minor typing improvements. 0.5.0 (2022-09-21) ------------------ Web-poet now includes a mini-framework for organizing extraction code as Page Object properties:: import attrs from web_poet import field, ItemPage @attrs.define class MyItem: foo: str bar: list[str] class MyPage(ItemPage[MyItem]): @field def foo(self): return "..." @field def bar(self): return ["...", "..."] **Backwards incompatible changes**: * ``web_poet.ItemPage`` is no longer an abstract base class which requires ``to_item`` method to be implemented. Instead, it provides a default ``async def to_item`` method implementation which uses fields marked as ``web_poet.field`` to create an item. This change shouldn't affect the user code in a backwards incompatible way, but it might affect typing. Deprecations: * ``web_poet.ItemWebPage`` is deprecated. Use ``web_poet.WebPage`` instead. Other changes: * web-poet is declared as PEP 561 package which provides typing information; mypy is going to use it by default. * Documentation, test, typing and CI improvements. 0.4.0 (2022-07-26) ------------------ * New ``HttpResponse.urljoin`` method, which take page's base url in account. * New ``HttpRequest.urljoin`` method. * standardized ``web_poet.exceptions.Retry`` exception, which allows to initiate a retry from the Page Object, e.g. based on page content. * Documentation improvements. 0.3.0 (2022-06-14) ------------------ * Backwards Incompatible Change: * ``web_poet.requests.request_backend_var`` is renamed to ``web_poet.requests.request_downloader_var``. * Documentation and CI improvements. 0.2.0 (2022-06-10) ------------------ * Backward Incompatible Change: * ``ResponseData`` is replaced with ``HttpResponse``. 
``HttpResponse`` exposes methods useful for web scraping (such as xpath and css selectors, json loading), and handles web page encoding detection. There are also new types like ``HttpResponseBody`` and ``HttpResponseHeaders``. * Added support for performing additional requests using ``web_poet.HttpClient``. * Introduced ``web_poet.BrowserHtml`` dependency * Introduced ``web_poet.PageParams`` to pass arbitrary information inside a Page Object. * Added ``web_poet.handle_urls`` decorator, which allows to declare which websites should be handled by the page objects. Lower-level ``PageObjectRegistry`` class is also available. * removed support for Python 3.6 * added support for Python 3.10 0.1.1 (2021-06-02) ------------------ * ``base_url`` and ``urljoin`` shortcuts 0.1.0 (2020-07-18) ------------------ * Documentation * WebPage, ItemPage, ItemWebPage, Injectable and ResponseData are available as top-level imports (e.g. ``web_poet.ItemPage``) 0.0.1 (2020-04-27) ------------------ Initial release. scrapinghub-web-poet-ba87b95/CONTRIBUTING.rst000066400000000000000000000013101517167256700206370ustar00rootroot00000000000000Contributing ============ ``web-poet`` is an open-source project and we'd love to accept your contributions. There are just a few small guidelines that you need to follow. How to report issues -------------------- Please, create a new issue on this repository if you have: * a bug * a new feature * a change proposal Make sure to include all necessary details that could help other contributors to understand or to reproduce your use case. How to submit a patch --------------------- * Fork the project * Create a new branch with your patch * Open a pull request to master * Ask for code reviews from other contributors Ideally, tests should keep passing and coverage should not decrease after a commit. 
scrapinghub-web-poet-ba87b95/LICENSE000066400000000000000000000027451517167256700172200ustar00rootroot00000000000000Copyright (c) Zyte Group Ltd All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of Zyte nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. scrapinghub-web-poet-ba87b95/MAINTAINING.rst000066400000000000000000000023701517167256700204750ustar00rootroot00000000000000Maintaining =========== How to release a new version ---------------------------- Make sure to install bump2version_, a maintained fork of bumpversion_: .. 
code-block:: shell pip install --upgrade bump2version Then follow this checklist: * Update changelog in a separate commit * Execute the ``bumpversion`` command * Push latest changes including tags The changelog is kept under the ``CHANGELOG.rst`` file. It should be updated in a separate commit to master. After changelog is merged to master, you can check which changes are needed to update the version executing: .. code-block:: shell bumpversion minor --dry-run --verbose When you're ready, you can remove the flags and execute: .. code-block:: shell bumpversion minor In this example, we're bumping our minor version, but you may use any of the following: * patch (for bug fixes) * minor (new features that keep compatibility) * major (introduces breaking changes) Finally, you can push the changes to the remote repository. Make sure to include git tags. .. code-block:: shell git push origin `git describe --tags` The PyPI release will be handled by the ``publish`` GitHub Actions workflow, which runs when a version tag is pushed. .. _bump2version: https://github.com/c4urself/bump2version .. _bumpversion: https://github.com/peritus/bumpversion scrapinghub-web-poet-ba87b95/README.rst000066400000000000000000000033431517167256700176750ustar00rootroot00000000000000======== web-poet ======== .. image:: https://img.shields.io/pypi/v/web-poet.svg :target: https://pypi.python.org/pypi/web-poet :alt: PyPI Version .. image:: https://img.shields.io/pypi/pyversions/web-poet.svg :target: https://pypi.python.org/pypi/web-poet :alt: Supported Python Versions .. image:: https://github.com/scrapinghub/web-poet/actions/workflows/tests-ubuntu.yml/badge.svg :target: https://github.com/scrapinghub/web-poet/actions/workflows/tests-ubuntu.yml :alt: Tox Ubuntu .. image:: https://github.com/scrapinghub/web-poet/actions/workflows/tests-windows.yml/badge.svg :target: https://github.com/scrapinghub/web-poet/actions/workflows/tests-windows.yml :alt: Tox Windows .. 
image:: https://codecov.io/github/scrapinghub/web-poet/coverage.svg?branch=master :target: https://codecov.io/gh/scrapinghub/web-poet :alt: Coverage report .. image:: https://readthedocs.org/projects/web-poet/badge/?version=stable :target: https://web-poet.readthedocs.io/en/stable/?badge=stable :alt: Documentation Status .. intro starts ``web-poet`` is a Python 3.10+ implementation of the `page object pattern`_ for web scraping. It enables writing portable, reusable web parsing code. .. _page object pattern: https://martinfowler.com/bliki/PageObject.html .. intro ends See the documentation_. .. _documentation: https://web-poet.readthedocs.io Developing ========== Set up your local Python environment via: 1. `pip install -r requirements-dev.txt` 2. `pre-commit install` Now every time you perform a `git commit`, the hooks configured in `.pre-commit-config.yaml` will run against the staged files, including: * `ruff` (linting and formatting) * `blacken-docs` * `sphinx-lint` You can also directly invoke `pre-commit run --all-files` or `tox -e pre-commit` to run them without performing a commit. scrapinghub-web-poet-ba87b95/docs/000077500000000000000000000000001517167256700171335ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/docs/api-reference.rst000066400000000000000000000054341517167256700224000ustar00rootroot00000000000000.. _api-reference: ============= API reference ============= .. _input: Page Inputs =========== .. module:: web_poet.page_inputs .. automodule:: web_poet.page_inputs.browser :members: :undoc-members: :inherited-members: str :show-inheritance: .. automodule:: web_poet.page_inputs.client :members: :undoc-members: .. automodule:: web_poet.page_inputs.http :members: :undoc-members: :inherited-members: str,bytes,MultiDict :show-inheritance: .. automodule:: web_poet.page_inputs.response :members: :undoc-members: :inherited-members: str :show-inheritance: .. automodule:: web_poet.page_inputs.page_params :members: :undoc-members: :show-inheritance: .. 
automodule:: web_poet.page_inputs.stats :members: :show-inheritance: Pages ===== .. automodule:: web_poet.pages .. autoclass:: Injectable :show-inheritance: :members: :no-special-members: .. autofunction:: is_injectable .. autoclass:: ItemPage :show-inheritance: :members: :no-special-members: .. autoclass:: WebPage :show-inheritance: :members: :undoc-members: :inherited-members: :no-special-members: .. autoclass:: BrowserPage :show-inheritance: :members: :undoc-members: :inherited-members: :no-special-members: .. autoclass:: Returns :show-inheritance: :members: .. autoclass:: Extractor :show-inheritance: :members: :no-special-members: .. autoclass:: SelectorExtractor :show-inheritance: :members: :no-special-members: Mixins ====== .. automodule:: web_poet.mixins .. autoclass:: web_poet.mixins.ResponseShortcutsMixin :members: :inherited-members: :no-special-members: .. autoclass:: web_poet.mixins.SelectableMixin :members: :inherited-members: :no-special-members: Requests ======== .. automodule:: web_poet.requests :members: :undoc-members: Exceptions ========== .. automodule:: web_poet.exceptions.core :members: .. automodule:: web_poet.exceptions.http :show-inheritance: :members: .. _api-rules: Apply Rules =========== See :ref:`rules` for more context about its use cases and some examples. .. currentmodule:: web_poet .. data:: default_registry Default :class:`~rules.RulesRegistry`. .. function:: handle_urls :meth:`~rules.RulesRegistry.handle_urls` of the :data:`default_registry`. .. automodule:: web_poet.rules :members: Fields ====== .. automodule:: web_poet.fields :members: Annotation support ================== .. autofunction:: web_poet.annotation_encode .. autofunction:: web_poet.annotation_decode .. autoclass:: web_poet.AnnotatedInstance Utils ===== .. automodule:: web_poet.utils :members: Built-in framework ================== .. 
automodule:: web_poet.framework :members: scrapinghub-web-poet-ba87b95/docs/changelog.rst000066400000000000000000000000361517167256700216130ustar00rootroot00000000000000.. include:: ../CHANGELOG.rst scrapinghub-web-poet-ba87b95/docs/conf.py000066400000000000000000000134011517167256700204310ustar00rootroot00000000000000# Configuration file for the Sphinx documentation builder. # # This file does only contain a selection of the most common options. For a # full list see the documentation: # http://www.sphinx-doc.org/en/master/config import sys import warnings from pathlib import Path # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. sys.path.insert(0, str(Path(__file__).parent.parent)) # -- Project information ----------------------------------------------------- project = "web-poet" copyright = "2020-2022, Zyte Group Ltd" author = "Zyte Group Ltd" # The short X.Y version version = "" # The full version, including alpha/beta/rc tags release = "0.24.0" # -- General configuration --------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. # # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ "sphinx_scrapy", ] # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] source_suffix = {".rst": "restructuredtext"} # The master toctree document. master_doc = "index" # The language for content autogenerated by Sphinx. 
Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The name of the Pygments (syntax highlighting) style to use. pygments_style = None # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = "sphinx_rtd_theme" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. # # html_theme_options = {} # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". # html_static_path = ['_static'] # Custom sidebar templates, must be a dictionary that maps document names # to template names. # # The default sidebars (for documents that don't match any pattern) are # defined by theme itself. Builtin themes are using these templates by # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', # 'searchbox.html']``. # # html_sidebars = {} # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. htmlhelp_basename = "web-poetdoc" # -- Options for LaTeX output ------------------------------------------------ latex_elements = { # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', # The font size ('10pt', '11pt' or '12pt'). 
# # 'pointsize': '10pt', # Additional stuff for the LaTeX preamble. # # 'preamble': '', # Latex figure (float) alignment # # 'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ (master_doc, "web-poet.tex", "web-poet Documentation", "Zyte Group Ltd", "manual"), ] # -- Options for manual page output ------------------------------------------ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [(master_doc, "web-poet", "web-poet Documentation", [author], 1)] # -- Options for Texinfo output ---------------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ ( master_doc, "web-poet", "web-poet Documentation", author, "web-poet", "One line description of project.", "Miscellaneous", ), ] # -- Options for Epub output ------------------------------------------------- # Bibliographic Dublin Core info. epub_title = project # The unique identifier of the text. This can be a ISBN number # or the project homepage. # # epub_identifier = '' # A unique identification for the text. # # epub_uid = '' # A list of files that should not be packed into the epub file. epub_exclude_files = ["search.html"] # -- Extension configuration ------------------------------------------------- autodoc_member_order = "bysource" # -- Filter warnings --------------------------------------------------------- warnings.filterwarnings( "ignore", message=( "You should only be importing web_poet.example to follow the web-poet " "tutorial, never as part of production code." 
), category=UserWarning, ) # -- sphinx-scrapy ----------------------------------------------------------- scrapy_intersphinx_enable = [ "attrs", "form2request", "multidict", "parsel", "scrapy-poet", "url-matcher", ] scrapinghub-web-poet-ba87b95/docs/contributing.rst000066400000000000000000000015541517167256700224010ustar00rootroot00000000000000============ Contributing ============ web-poet is an open-source project. Your contribution is very welcome! Issue Tracker ============= If you have a bug report, a new feature proposal or simply would like to make a question, please check our issue tracker on Github: https://github.com/scrapinghub/web-poet/issues Source code =========== Our source code is hosted on Github: https://github.com/scrapinghub/web-poet Before opening a pull request, it might be worth checking current and previous issues. Some code changes might also require some discussion before being accepted so it might be worth opening a new issue before implementing huge or breaking changes. Testing ======= We use tox_ to run tests with different Python versions:: tox The command above also runs type checks; we use mypy. .. toctree:: :hidden: .. _tox: https://tox.readthedocs.io scrapinghub-web-poet-ba87b95/docs/framework.rst000066400000000000000000000113531517167256700216650ustar00rootroot00000000000000.. _framework: ================== Built-in framework ================== :mod:`web_poet.framework` is a built-in :ref:`web-poet framework ` for simple use cases. It is designed to be easy to use for quick proof-of-concepts, simple scripts, and for generating test fixtures. It can also serve as a reference implementation for framework authors. Limitations =========== The main limitation of the built-in framework is that it is not a complete scraping framework like :doc:`Scrapy `, which can support web-poet thanks to :doc:`scrapy-poet `. 
As a web-poet framework, the built-in framework also lacks support for :ref:`custom input classes `, :exc:`~web_poet.exceptions.Retry` and :exc:`~web_poet.exceptions.UseFallback`. Also, :ref:`browser inputs ` only support plain GET requests. Requests with a non-GET method, headers or a body raise :exc:`~web_poet.exceptions.http.HttpRequestError`. Installation ============ To use :mod:`web_poet.framework`, install the ``framework`` extra: .. code-block:: bash pip install web-poet[framework] For :ref:`browser support <framework-browser>`, you also need to `install at least 1 browser with Playwright `__. For example, to install the main browsers: .. code-block:: bash playwright install Basic use ========= .. code-block:: python from dataclasses import dataclass from web_poet import WebPage, field from web_poet.framework import Framework from web_poet.utils import ensure_awaitable @dataclass class Book: title: str class BookPage(WebPage[Book]): @field def title(self) -> str: return self.response.css("h1::text").get() framework = Framework() item = await framework.get_item("https://books.example.com/book/1", BookPage) # Or, if you prefer, get a page object instance first. page = await framework.get_page("https://books.example.com/book/1", BookPage) item = await ensure_awaitable(page.to_item()) Choosing a page object class automatically ========================================== If you decorate your page object classes with :func:`~web_poet.handle_urls` and make sure they are imported, e.g. with :func:`~web_poet.consume_modules`, you can pass :meth:`~web_poet.framework.Framework.get_item` an item class, and let it determine which page object class to use: .. 
code-block:: python from dataclasses import dataclass from web_poet import WebPage, field, handle_urls from web_poet.framework import Framework @dataclass class Book: title: str @handle_urls("books.example.com") class BookPage(WebPage[Book]): @field def title(self) -> str: return self.response.css("h1::text").get() framework = Framework() item = await framework.get_item("https://books.example.com/book/1", Book) .. _framework-browser: Browser ======= The built-in framework can use `Playwright `_ to resolve browser dependencies like :class:`~web_poet.page_inputs.browser.BrowserHtml` or :class:`~web_poet.page_inputs.browser.BrowserResponse`. Chromium is used by default. You can override that by passing ``default_playwright_engine`` to :class:`~web_poet.framework.Framework`. Page objects can also annotate their Playwright engine dependencies with :func:`~web_poet.framework.playwright_engine` to specify which engine they require. For example: .. code-block:: python from typing import Annotated from web_poet import WebPage, Item from web_poet.page_inputs.browser import BrowserResponse from web_poet.framework import playwright_engine class MyPageObject(WebPage[Item]): response: Annotated[BrowserResponse, playwright_engine("firefox")] Stats ===== The built-in framework supports :class:`~web_poet.page_inputs.stats.Stats`. By default, :class:`~web_poet.framework.Framework` creates a :class:`~web_poet.page_inputs.stats.DictStatCollector` object, exposes it to any page object that requests :class:`~web_poet.page_inputs.stats.Stats`, and exposes that object as the :data:`stats ` attribute of the framework: .. code-block:: python from web_poet.framework import Framework framework = Framework() item1 = await framework.get_item("http://example.com/book/1", BookPage) item2 = await framework.get_item("http://example.com/book/2", BookPage) all_stats = framework.stats :class:`~web_poet.framework.Framework` also supports passing a custom stats collector: .. 
code-block:: python from web_poet.page_inputs.stats import StatCollector class MyStatCollector(StatCollector): ... framework = Framework(stats=MyStatCollector()) scrapinghub-web-poet-ba87b95/docs/frameworks/000077500000000000000000000000001517167256700213135ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/docs/frameworks/additional-requests.rst000066400000000000000000000203461517167256700260330ustar00rootroot00000000000000.. _framework-additional-requests: ============================== Supporting additional requests ============================== To support :ref:`additional requests `, your framework must provide the request download implementation of :class:`~.HttpClient`. .. _advanced-downloader-impl: Providing the Downloader ------------------------ On its own, :class:`~.HttpClient` doesn't do anything. It doesn't know how to execute the request on its own. Thus, for frameworks or projects wanting to use additional requests in Page Objects, they need to set the implementation on how to execute an :class:`~.HttpRequest`. For more info on this, kindly read the API Specifications for :class:`~.HttpClient`. In any case, frameworks that wish to support **web-poet** could provide the HTTP downloader implementation in two ways: .. _setup-contextvars: 1. Context Variable ******************* :mod:`contextvars` is natively supported in :mod:`asyncio` in order to set and access context-aware values. This means that the framework using **web-poet** can assign the request downloader implementation using the :mod:`contextvars` instance named ``web_poet.request_downloader_var``. This can be set using: .. code-block:: python import attrs import web_poet from web_poet import validates_input async def request_implementation( req: web_poet.HttpRequest, ) -> web_poet.HttpResponse: ... def create_http_client(): return web_poet.HttpClient() @attrs.define class SomePage(web_poet.WebPage): http: web_poet.HttpClient @validates_input async def to_item(self): ... 
# Once this is set, the ``request_implementation`` becomes available to # all instances of HttpClient, unless HttpClient is created with # the ``request_downloader`` argument (see the #2 Dependency Injection # example below). web_poet.request_downloader_var.set(request_implementation) # Assume that it's constructed with the necessary arguments taken somewhere. response = web_poet.HttpResponse(...) page = SomePage(response=response, http=create_http_client()) item = await page.to_item() When the ``web_poet.request_downloader_var`` contextvar is set, :class:`~.HttpClient` instances use it by default. .. warning:: If no value for ``web_poet.request_downloader_var`` is set, then :class:`~.RequestDownloaderVarError` is raised. However, no exception is raised if **option 2** below is used. 2. Dependency Injection *********************** The framework using **web-poet** may be using libraries that don't have full support for :mod:`contextvars` `(e.g. Twisted)`. With that, an alternative approach would be to supply the request downloader implementation when creating an :class:`~.HttpClient` instance: .. code-block:: python import attrs import web_poet from web_poet import validates_input async def request_implementation( req: web_poet.HttpRequest, ) -> web_poet.HttpResponse: ... def create_http_client(): return web_poet.HttpClient(request_downloader=request_implementation) @attrs.define class SomePage(web_poet.WebPage): http: web_poet.HttpClient @validates_input async def to_item(self): ... # Assume that it's constructed with the necessary arguments taken somewhere. response = web_poet.HttpResponse(...) page = SomePage(response=response, http=create_http_client()) item = await page.to_item() From the code sample above, we can see that every time an :class:`~.HttpClient` instance is created for Page Objects needing it, the framework must create :class:`~.HttpClient` with a framework-specific **request downloader implementation**, using the ``request_downloader`` argument. 
Downloader Behavior ------------------- The request downloader MUST accept an instance of :class:`~.HttpRequest` as the input and return an instance of :class:`~.HttpResponse`. This is important in order to handle and represent generic HTTP operations. The only time that it won't be returning :class:`~.HttpResponse` would be when it's raising exceptions (see :ref:`framework-exception-handling`). The request downloader MUST resolve Location-based **redirections** when the HTTP method is not ``HEAD``. In other words, for non-``HEAD`` requests the returned :class:`~.HttpResponse` must be the final response, after all redirects. For ``HEAD`` requests redirects MUST NOT be resolved. Lastly, the request downloader function MUST support the ``async/await`` syntax. .. _framework-exception-handling: Exception Handling ------------------ Page Object developers could use the exception classes built inside **web-poet** to handle various ways additional requests MAY fail. In this section, we'll see the rationale and ways the framework MUST be able to do that. Rationale ********* Frameworks that handle **web-poet** MUST be able to ensure that Page Objects having additional requests using :class:`~.HttpClient` are able to work with any type of HTTP downloader implementation. For example, in Python, the common HTTP libraries have different types of base exceptions when something has occurred: * `aiohttp.ClientError `_ * `requests.RequestException `_ * `urllib.error.HTTPError `_ Imagine if Page Objects are **expected** to work in `different` backend implementations like the ones above, then it would cause the code to look like: .. 
code-block:: python import urllib.error import aiohttp import attrs import requests import web_poet from web_poet import validates_input @attrs.define class SomePage(web_poet.WebPage): http: web_poet.HttpClient @validates_input async def to_item(self): try: response = await self.http.get("...") except (aiohttp.ClientError, requests.RequestException, urllib.error.HTTPError): # handle the error here ... Such code could turn messy in no time especially when the number of HTTP backends that Page Objects have to support is steadily increasing. Not to mention the plethora of exception types that HTTP libraries have. This means that Page Objects aren't truly portable in different types of frameworks or environments. Rather, they're limited to working in the specific framework that supports them. In order for Page Objects to work in different Downloader Implementations, the framework that implements the HTTP Downloader backend MUST raise exceptions from the :mod:`web_poet.exceptions.http` module in lieu of the backend specific ones `(e.g. aiohttp, requests, urllib, etc.)`. This makes the code simpler: .. code-block:: python import attrs import web_poet from web_poet import validates_input @attrs.define class SomePage(web_poet.WebPage): http: web_poet.HttpClient @validates_input async def to_item(self): try: response = await self.http.get("...") except web_poet.exceptions.HttpError: # handle the error here ... Expected behavior for Exceptions ******************************** All exceptions that the HTTP Downloader Implementation (see :ref:`advanced-downloader-impl` doc section) explicitly raises when implementing it for **web-poet** MUST be :class:`web_poet.exceptions.http.HttpError` *(or a subclass of it)*. For frameworks that implement and use **web-poet**, exceptions that occurred when handling the additional requests like `connection errors`, `TLS errors`, etc., MUST be replaced by :class:`web_poet.exceptions.http.HttpRequestError` by raising it explicitly. 
For responses that are not really errors like in the ``100-3xx`` status code range, exception MUST NOT be raised at all. For responses with status codes in the ``400-5xx`` range, **web-poet** raises the :class:`web_poet.exceptions.http.HttpResponseError` exception. From this distinction, the framework MUST NOT raise :class:`web_poet.exceptions.http.HttpResponseError` on its own at all, since the :class:`~.HttpClient` already handles that. scrapinghub-web-poet-ba87b95/docs/frameworks/index.rst000066400000000000000000000043661517167256700231650ustar00rootroot00000000000000.. _spec: ======================= Framework specification ======================= Learn how to build a :ref:`web-poet framework `. Design principles ================= :ref:`Page objects ` should be flexible enough to be used with: * synchronous or asynchronous code, callback-based and ``async def / await`` based, * single-node and distributed systems, * different underlying HTTP implementations - or without HTTP support at all, etc. Minimum requirements ==================== A web-poet framework must support building a :ref:`page object ` given a page object class. It must be able to build :ref:`input objects ` for a page object based on type hints on the page object class, i.e. dependency injection, and additional input data required by those input objects, such as a target URL or a dictionary of :ref:`page parameters `. You can implement dependency injection with the andi_ library, which handles signature inspection, :data:`~typing.Optional` and :data:`~typing.Union` annotations, as well as indirect dependencies. For practical examples, see the source code of :mod:`web_poet.framework` and of :doc:`scrapy-poet `. .. _andi: https://github.com/scrapinghub/andi Additional features =================== To provide a better experience to your users, consider extending your web-poet framework further to: - Support as many input classes from the :mod:`web_poet.page_inputs` module as possible. 
- Support returning a :ref:`page object ` given a target URL and a desired :ref:`output item class `, determining the right :ref:`page object class ` to use based on :ref:`rules `. - Allow users to request an :ref:`output item ` directly, instead of requesting a page object just to call its ``to_item`` method. If you do, consider supporting both synchronous and asynchronous definitions of the ``to_item`` method, e.g. using :func:`~.ensure_awaitable`. - Support :ref:`additional requests `. - Support :ref:`retries `. - Let users set their own :ref:`rules `, e.g. to :ref:`solve conflicts `. scrapinghub-web-poet-ba87b95/docs/frameworks/retries.rst000066400000000000000000000021501517167256700235200ustar00rootroot00000000000000.. _framework-retries: ================== Supporting Retries ================== Web-poet frameworks must catch :exc:`~web_poet.exceptions.core.Retry` exceptions raised from the :meth:`~web_poet.pages.ItemPage.to_item` method of a page object. When :exc:`~web_poet.exceptions.core.Retry` is caught: #. The original request whose response was fed into the page object must be retried. #. A new page object must be created, of the same type as the original page object, and with the same input, except for the response data, which must be the new response. The :meth:`~web_poet.pages.ItemPage.to_item` method of the new page object may raise :exc:`~web_poet.exceptions.core.Retry` again. Web-poet frameworks must allow multiple retries of page objects, repeating the :exc:`~web_poet.exceptions.core.Retry`-capturing logic. However, web-poet frameworks are also encouraged to limit the amount of retries per page object. When retries are exceeded for a given page object, the page object output is ignored. At the moment, web-poet does not enforce any specific maximum number of retries on web-poet frameworks. scrapinghub-web-poet-ba87b95/docs/frameworks/rules.rst000066400000000000000000000015301517167256700231760ustar00rootroot00000000000000.. 
_framework-rules: ================ Supporting rules ================ Ideally, a framework should support returning the right :ref:`page object ` or :ref:`output item ` given a target URL and a desired :ref:`output item class ` when :ref:`rules ` are used. To provide basic support for rules in your framework, use the :class:`~.RulesRegistry` object at ``web_poet.default_registry`` to choose a page object based on rules: .. code-block:: python from web_poet import default_registry page_cls = default_registry.page_cls_for_item("https://example.com", MyItem) You should also let your users know what is the best approach to :ref:`load rules ` when using your framework. For example, let them know the best location for their calls to the :func:`~.web_poet.rules.consume_modules` function. scrapinghub-web-poet-ba87b95/docs/frameworks/stats.rst000066400000000000000000000006041517167256700232030ustar00rootroot00000000000000.. _framework-stats: ================ Supporting stats ================ To support :ref:`stats `, your framework must provide the :class:`~web_poet.page_inputs.stats.StatCollector` implementation of :class:`~web_poet.page_inputs.stats.Stats`. It is up to you to decide how to store the stats, and how your users can access them at run time (outside page objects) or afterwards. scrapinghub-web-poet-ba87b95/docs/index.rst000066400000000000000000000021661517167256700210010ustar00rootroot00000000000000======== web-poet ======== .. include:: ../README.rst :start-after: intro starts :end-before: intro ends .. warning:: web-poet is in early stages of development; backward-incompatible changes are possible. .. toctree:: :caption: Getting started :maxdepth: 1 intro/overview intro/install intro/tutorial intro/from-ground-up intro/ai .. 
toctree:: :caption: Writing page objects :maxdepth: 1 page-objects/index page-objects/inputs page-objects/items page-objects/rules page-objects/fields page-objects/additional-requests page-objects/input-validation page-objects/page-params page-objects/stats page-objects/testing page-objects/frameworks .. toctree:: :caption: Using page objects :maxdepth: 1 framework .. toctree:: :caption: Writing frameworks :maxdepth: 1 frameworks/index Rules Additional requests Retries Frameworks .. toctree:: :caption: Reference :maxdepth: 1 api-reference contributing changelog license scrapinghub-web-poet-ba87b95/docs/intro/000077500000000000000000000000001517167256700202665ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/docs/intro/ai.rst000066400000000000000000000030231517167256700214070ustar00rootroot00000000000000.. _ai: =========================== AI-assisted code generation =========================== When using LLMs to write Python code for web scraping, these are the most reasonable approaches to consider: Plain Python functions or classes **Pros:** Simple, dependency-free, and easy for LLMs to produce. **Cons:** You must define your own conventions and testing practices; integration across teams and tools can be ad-hoc. **Use when** you need a quick extractor or the logic is small and unlikely to be reused. :doc:`Scrapy spiders ` **Pros:** Built-in crawling, request scheduling, retries and many utilities. **Cons:** Large surface area for AI generation. Spiders mix crawling, error handling and extraction, which makes testing extraction in isolation difficult. **Avoid** generating full spiders with an LLM; prefer generating extraction logic separately. :ref:`web-poet page objects ` **Pros:** Small, standard contract for extraction with field-level decomposition, first-class testing support, and framework integration. **Cons:** Requires adopting web-poet idioms and a small framework cost, which can be unnecessary for trivial scripts. 
**Use when** you want maintainability, testability, and a predictable contract that can be used by tools and teams. .. note:: :doc:`scrapy-poet ` provides a great way to use web-poet page objects within Scrapy spiders, giving you the benefits of both approaches. scrapinghub-web-poet-ba87b95/docs/intro/from-ground-up.rst000066400000000000000000000210231517167256700236770ustar00rootroot00000000000000.. _from-ground-up: ================== From the ground up ================== Learn why and how web-poet came to be as you transform a simple, rigid starting web scraping code snippet into maintainable, reusable web-poet code. Writing reusable parsing code ============================= Imagine you are writing code to scrape a book web page from `books.toscrape.com `_, and you implement a ``scrape`` function like this: .. code-block:: python import requests from parsel import Selector def scrape(url: str) -> dict: response = requests.get(url) selector = Selector(response.text) return { "url": response.url, "title": selector.css("h1").get(), } item = scrape( "http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html" ) This ``scrape`` function is simple, but it has a big issue: it only supports downloading the specified URL using the requests_ library. What if you want to use aiohttp_, for concurrency support? What if you want to run ``scrape`` with a local snapshot of a URL response, to write an automated test for ``scrape`` that does not rely on a network connection? .. _aiohttp: https://github.com/aio-libs/aiohttp .. _requests: https://requests.readthedocs.io/en/latest/ The first step towards addressing this issue is to split your ``scrape`` function into 2 separate functions, ``download`` and ``parse``: .. 
code-block:: python import requests from parsel import Selector def parse(response: requests.Response) -> dict: selector = Selector(response.text) return { "url": response.url, "title": selector.css("h1").get(), } def download(url: str) -> requests.Response: return requests.get(url) url = "http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html" response = download(url) item = parse(response) Now that ``download`` and ``parse`` are separate functions, you can replace ``download`` with an alternative implementation that uses aiohttp_, or that reads from local files. There is still an issue, though: ``parse`` expects an instance of `requests.Response`_. Any alternative implementation of ``download`` would need to create a response object of the same type, forcing a dependency on requests_ even if downloads are handled with a different library. .. _requests.Response: https://requests.readthedocs.io/en/latest/api/#requests.Response So you need to change the input of the ``parse`` function into something that will not tie you to a specific download library. One option is to create your own, download-independent ``Response`` class, to store the response data that any download function should be able to provide: .. code-block:: python import requests from dataclasses import dataclass from parsel import Selector @dataclass class Response: url: str text: str def parse(response: Response) -> dict: selector = Selector(response.text) return { "url": response.url, "title": selector.css("h1").get(), } def download(url: str) -> Response: response = requests.get(url) return Response(url=response.url, text=response.text) url = "http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html" response = download(url) item = parse(response) The ``parse`` function is no longer tied to any specific download library, and alternative versions of the ``download`` function can be implemented with other libraries. 
Parsing with web-poet ===================== web-poet asks you to organize your code in a very similar way. Let’s convert the ``parse`` function into a :ref:`web-poet page object class `: .. code-block:: python import requests from web_poet import Injectable, HttpResponse class BookPage(Injectable): def __init__(self, response: HttpResponse): self.response = response def to_item(self) -> dict: return { "url": self.response.url, "title": self.response.css("h1").get(), } def download(url: str) -> HttpResponse: response = requests.get(url) return HttpResponse( url=response.url, body=response.content, headers=response.headers, ) url = "http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html" response = download(url) book_page = BookPage(response=response) item = book_page.to_item() Differences from the previous example: - web-poet provides a standard :class:`~.HttpResponse` class, with helper methods like :meth:`~.HttpResponse.css`. Note how headers are passed when creating an :class:`~.HttpResponse` instance. This is needed to properly decode the body (which is ``bytes``) as text using web browser rules. It involves checking the ``Content-Type`` header, HTML meta tags, BOM markers in the body, etc. - Instead of the ``parse`` function we've got a ``BookPage`` class, which inherits from the :class:`~.Injectable` base class, receives response data in its ``__init__`` method, and returns the extracted item in the ``to_item()`` method. ``to_item`` is a standard method name used by ``web-poet``. Receiving a ``response`` argument in ``__init__`` is very common for page objects, so ``web-poet`` provides a shortcut for it: inherit from :class:`~.WebPage`, which provides this ``__init__`` method implementation. You can then refactor your ``BookPage`` class as follows: .. 
code-block:: python from web_poet import WebPage class BookPage(WebPage): def to_item(self) -> dict: return { "url": self.response.url, "title": self.response.css("h1").get(), } :class:`~.WebPage` even provides shortcuts for some response attributes and methods: .. code-block:: python from web_poet import WebPage class BookPage(WebPage): def to_item(self) -> dict: return { "url": self.url, "title": self.css("h1").get(), } At this point you may be wondering why web-poet requires you to write a class with a ``to_item`` method rather than a function. The answer is flexibility. For example, the use of a class instead of a function makes :ref:`fields <fields>` possible, which make parsing code easier to read: .. code-block:: python from web_poet import WebPage, field class BookPage(WebPage): @field def url(self): return self.response.url @field def title(self): return self.css("h1").get() Using fields also makes it unnecessary to define ``to_item()`` manually, and allows reading individual fields when you don't need the complete ``to_item()`` output. .. note:: The ``BookPage.to_item()`` method is ``async`` in the example above. See :ref:`fields` for more information. Using classes also makes it easy, for example, to implement dependency injection, which is how web-poet builds :ref:`inputs `. Downloading with web-poet ========================= What about the implementation of the ``download`` function? How would you implement that in web-poet? Well, ideally, you wouldn’t. To parse data from a web page using web-poet, you would only need to write the parsing part, e.g. the ``BookPage`` :ref:`page object class ` above. Then, you let a :ref:`web-poet framework ` handle the download part for you. You pass that framework the URL of a web page to parse, and either a page object class (the ``BookPage`` class here) or an :ref:`item class `, and that's it: .. 
code-block:: python item = some_framework.get(url, BookPage) web-poet does *not* provide any framework, beyond :ref:`an example one featured in the tutorial ` and not intended for production. The role of web-poet is to define a specification on how to write parsing logic so that it can be reused with different frameworks. :ref:`Page object classes ` should be flexible enough to be used with very different frameworks, including: - synchronous or asynchronous frameworks - asynchronous frameworks based on callbacks or based on coroutines_ (``async def / await`` syntax) .. _coroutines: https://docs.python.org/3/library/asyncio-task.html - single-node and distributed systems - different underlying HTTP implementations, or even implementations with no HTTP support at all scrapinghub-web-poet-ba87b95/docs/intro/install.rst000066400000000000000000000005751517167256700224750ustar00rootroot00000000000000.. _install: ============ Installation ============ To be able to write :ref:`page objects ` and :ref:`test them `, install web-poet `from PyPI`_: .. _from PyPI: https://pypi.org/project/web-poet/ .. code-block:: bash pip install web-poet To use page objects in production, however, you will need a :ref:`web-poet framework `. scrapinghub-web-poet-ba87b95/docs/intro/overview.rst000066400000000000000000000025021517167256700226650ustar00rootroot00000000000000.. _overview: ======== Overview ======== A good web scraping framework helps to keep your code maintainable by, among other things, enabling and encouraging `separation of concerns`_. .. _separation of concerns: https://en.wikipedia.org/wiki/Separation_of_concerns For example, Scrapy_ lets you implement different aspects of web scraping, like ban avoidance or data delivery, into separate components. .. _Scrapy: https://scrapy.org/ However, there are 2 core aspects of web scraping that can be hard to decouple: *crawling*, i.e. visiting URLs, and *parsing*, i.e. extracting data. 
web-poet lets you :ref:`write data extraction code ` that: - Makes your web scraping code easier to maintain, since your data extraction and crawling code are no longer intertwined and can be maintained separately. - Can be reused with different versions of your crawling code, i.e. with different crawling strategies. - Can be executed independently of your crawling code, enabling easier debugging and easier automated testing. - Can be used with any Python web scraping framework or library that implements the :ref:`web-poet specification `, either directly or through a third-party plugin. See :ref:`frameworks`. To learn more about why and how web-poet came to be, see :ref:`from-ground-up`. scrapinghub-web-poet-ba87b95/docs/intro/tutorial.rst000066400000000000000000000412161517167256700226670ustar00rootroot00000000000000.. _tutorial: ======== Tutorial ======== In this tutorial you will learn to use web-poet as you write web scraping code for book detail pages from `books.toscrape.com`_. .. _books.toscrape.com: http://books.toscrape.com/ To follow this tutorial you must first be familiar with Python_ and :ref:`install web-poet ` with the built-in framework: .. code-block:: bash pip install 'web-poet[framework]' .. _Python: https://docs.python.org/ Create a project directory ========================== web-poet does not limit how you structure your web-poet web scraping code, beyond the limitations of Python itself. However, in this tutorial you will use a specific project directory structure designed with web-poet best practices in mind. Consider using a similar project directory structure in all your web-poet projects. First create your project directory: ``tutorial-project/``. Within the ``tutorial-project`` directory, create: - A ``run.py`` file, a file specific to this tutorial where you will put code to test the execution of your web scraping code. - A ``tutorial`` directory, where you will place your web scraping code. 
Within the ``tutorial-project/tutorial`` directory, create: - An ``__init__.py`` file, so that the ``tutorial`` directory becomes an importable Python module. - An ``items.py`` file, where you will define item classes to store extracted data. - A ``pages`` directory, where you will define your page object classes. Within the ``tutorial-project/tutorial/pages`` directory, create: - An ``__init__.py`` file. - A ``books_toscrape_com.py`` file, for page object class code targeting `books.toscrape.com`_. Your project directory should look as follows: .. code-block:: text tutorial-project ├── run.py └── tutorial ├── __init__.py ├── items.py └── pages ├── __init__.py └── books_toscrape_com.py Create an item class ==================== While it is possible to store the extracted data in a Python dictionary, it is a good practice to create an item class that: - Defines the specific attributes that you aim to extract, triggering an exception if you extract unintended attributes or fail to extract expected attributes. - Allows defining default values for some attributes. web-poet uses itemadapter_ for item class support, which means that any kind of item class can be used. In this tutorial, you will use attrs_ to define your item class. .. _attrs: https://www.attrs.org/en/stable/ .. _itemadapter: https://github.com/scrapy/itemadapter Copy the following code into ``tutorial-project/tutorial/items.py``: .. literalinclude:: /tutorial-project/tutorial/items.py :language: python :lines: 1-6 This code defines a ``Book`` item class, with a single required ``title`` string attribute to store the book title. ``Book`` is a minimal class designed specifically for this tutorial. In real web-poet projects, you will usually define item classes with many more attributes. .. tip:: For an example of real item classes, see the `zyte-common-items`_ library. .. 
_zyte-common-items: https://zyte-common-items.readthedocs.io/en/latest/ Also mind that, while in this tutorial you use ``Book`` only for data from 1 website, `books.toscrape.com`_, item classes are usually meant to be usable for many different websites that provide data with a similar data schema. Create a page object class ========================== To write web parsing code with web-poet, you write :ref:`page object classes `, Python classes that define how to extract data from a given type of input, usually some type of webpage from a specific website. In this tutorial you will write a page object class for webpages of `books.toscrape.com`_ that show details about a book, such as these: - http://books.toscrape.com/catalogue/the-exiled_247/index.html - http://books.toscrape.com/catalogue/when-we-collided_955/index.html - http://books.toscrape.com/catalogue/set-me-free_988/index.html Copy the following code into ``tutorial-project/tutorial/pages/books_toscrape_com.py``: .. literalinclude:: /tutorial-project/tutorial/pages/books_toscrape_com.py :language: python :lines: 1-11 In the code above: - You define a page object class named ``BookPage`` by subclassing :class:`~web_poet.pages.WebPage`. It is possible to create a page object class subclassing instead the simpler :class:`~web_poet.pages.ItemPage` class. However, :class:`~web_poet.pages.WebPage`: - Indicates that your page object class requires an HTTP response as input, which gets stored in the :attr:`~web_poet.pages.WebPage.response` attribute of your page object class as an :class:`~web_poet.page_inputs.http.HttpResponse` object. - Provides attributes like :attr:`~web_poet.pages.WebPage.html` and :attr:`~web_poet.pages.WebPage.url`, and methods like :meth:`~web_poet.pages.WebPage.css`, :meth:`~web_poet.pages.WebPage.urljoin`, and :meth:`~web_poet.pages.WebPage.xpath`, that make it easier to write parsing code. - ``BookPage`` declares ``Book`` as its return type. 
:class:`~web_poet.pages.WebPage`, like its parent class :class:`~web_poet.pages.ItemPage`, is a :ref:`generic class ` that accepts a type parameter. Unlike most generic classes, however, the specified type parameter is used for more than type hinting: it determines the item class that is used to store the data that fields return. - ``BookPage`` is decorated with :meth:`~web_poet.handle_urls`, which indicates for which domain ``BookPage`` is intended to work. It is possible to specify more specific URL patterns, instead of only the target URL domain. However, the URL domain and the output type (``Book``) are usually all the data needed to determine which page object class to use, which is the goal of the :meth:`~web_poet.handle_urls` decorator. - ``BookPage`` defines a field named ``title``. :ref:`Fields ` are methods of page object classes, preferably async methods, decorated with :meth:`~web_poet.fields.field`. Fields define the logic to extract a specific piece of information from the input of your page object class. ``BookPage.title`` extracts the title of a book from a book details webpage. Specifically, it extracts the text from the first ``h1`` element on the input HTTP response. Here, ``title`` is not an arbitrary name. It was chosen specifically to match ``Book.title``, so that during parsing the value that ``BookPage.title`` returns gets mapped to ``Book.title``. .. _tutorial-create-page-object: Use your page object class ========================== Now that you have a page object class defined, it is time to use it. Then copy the following code into ``tutorial-project/run.py``: .. literalinclude:: /tutorial-project/run.py :language: python Execute that code: .. code-block:: bash python tutorial-project/run.py And the ``print(item)`` statement should output the following: .. code-block:: python Book(title="The Exiled") In this tutorial you use :class:`web_poet.framework.Framework`. 
:mod:`web_poet.framework` is a built-in :ref:`web-poet framework ` for simple use cases, including this tutorial. :class:`~web_poet.framework.Framework` serves to illustrate the power of web-poet: once you have defined your page object class, a web-poet framework only needs 2 inputs from you: - the URL from which you want to extract data, and - the desired output, either a :ref:`page object class ` or, in this case, an :ref:`item class `. If you pass an item class to :meth:`~web_poet.framework.Framework.get_item`, call :func:`~web_poet.rules.consume_modules` once beforehand so your page object classes are registered and the framework can select the correct one. Pass to :func:`~web_poet.rules.consume_modules` the import paths of the modules that define your page object classes. When those modules are loaded, the :meth:`~web_poet.handle_urls` decorators register the classes in :data:`web_poet.default_registry`, which :meth:`~web_poet.framework.Framework.get_item` consults to match a page object class to the given URL and item class. Your web-poet framework can take care of everything else: #. It matches the input URL and item class to ``BookPage``, based on the URL pattern that you defined with the :meth:`~web_poet.handle_urls` decorator, and the return type that you declared in the page object class (``Book``). #. It inspects the inputs declared by ``BookPage``, and builds an instance of ``BookPage`` with the required inputs. ``BookPage`` is a :class:`~web_poet.pages.WebPage` subclass, and :class:`~web_poet.pages.WebPage` declares an attribute named ``response`` of type :class:`~web_poet.page_inputs.http.HttpResponse`. Your web-poet framework sees this, and creates an :class:`~web_poet.page_inputs.http.HttpResponse` object from the input URL as a result, by downloading the URL response, and assigns that object to the ``response`` attribute of a new ``BookPage`` object. #. 
It builds the output item, ``Book(title='The Exiled')``, using the :meth:`~web_poet.pages.ItemPage.to_item` method of ``BookPage``, inherited from :class:`~web_poet.pages.ItemPage`, which in turn uses all fields of ``BookPage`` to create an instance of ``Book``, which you declared as the return type of ``BookPage``. Extend and override your code ============================= To continue this tutorial, you will need extended versions of ``Book`` and ``BookPage``, with additional fields. However, rather than editing the existing ``Book`` and ``BookPage`` classes, you will see how you can instead create new classes that inherit them. Append the following code to ``tutorial-project/tutorial/items.py``: .. literalinclude:: /tutorial-project/tutorial/items.py :language: python :lines: 9-15 The code above defines a new item class, ``CategorizedBook``, that inherits the ``title`` attribute from ``Book`` and defines 2 more attributes: ``category`` and ``category_rank``. Append the following code to ``tutorial-project/tutorial/pages/books_toscrape_com.py``: .. literalinclude:: /tutorial-project/tutorial/pages/books_toscrape_com.py :language: python :lines: 16, 19-23, 25, 29-32 In the code above: - You define a new page object class: ``CategorizedBookPage``. - ``CategorizedBookPage`` subclasses ``BookPage``, inheriting its ``title`` field, and defining a new one: ``category``. ``CategorizedBookPage`` does *not* define a ``category_rank`` field yet, you will add it later on. For now, the default value defined in ``CategorizedBook`` for ``category_rank`` will be ``None``. - ``CategorizedBookPage`` indicates that it returns a ``CategorizedBook`` object. :class:`~web_poet.pages.WebPage` is a :ref:`generic class `, which is why we could use ``WebPage[Book]`` in the definition of ``BookPage`` to indicate ``Book`` as the output type of ``BookPage``. However, ``BookPage`` is not a generic class, so something like ``BookPage[CategorizedBook]`` would not work. 
So instead you use :class:`~web_poet.pages.Returns`, a special, generic class that you can inherit to re-define the output type of your page object subclasses. After you update your ``tutorial-project/run.py`` script to request a ``CategorizedBook`` item: .. literalinclude:: /tutorial-project/run_categorized.py :language: python :emphasize-lines: 6, 14 And you execute it again: .. code-block:: bash python tutorial-project/run.py You can see in the new output that your new classes have been used: .. code-block:: python CategorizedBook(title="The Exiled", category="Mystery", category_rank=None) Use additional requests ======================= To extract data about an item, sometimes the HTTP response to a single URL is not enough. Sometimes, you need additional HTTP responses to get all the data that you want. That is the case with the ``category_rank`` attribute. The ``category_rank`` attribute indicates the position in which a book appears in the list of books of the category of that book. For example, `The Exiled`_ is 24th in the Mystery_ category, so the value of ``category_rank`` should be ``24`` for that book. .. _The Exiled: http://books.toscrape.com/catalogue/the-exiled_247/index.html .. _Mystery: https://books.toscrape.com/catalogue/category/books/mystery_3/ However, there is no indication of this value in the book details page. To get this value, you need to visit the URL of the category of the book whose data you are extracting, find the entry of that book within the grid of books of the category, and record in which position you found it. And categories with more than 20 books are split into multiple pages, so you may need more than 1 additional request for some books. Extend ``CategorizedBookPage`` in ``tutorial-project/tutorial/pages/books_toscrape_com.py`` as follows: .. 
literalinclude:: /tutorial-project/tutorial/pages/books_toscrape_com.py :language: python :lines: 14-15, 17, 19-26, 28-35, 39-52 :emphasize-lines: 1-3, 9, 11-12, 18-33 In the code above: - You declare a new input in ``CategorizedBookPage``, ``http``, of type :class:`~web_poet.page_inputs.client.HttpClient`. You also add the ``@attrs.define`` decorator to ``CategorizedBookPage``, as it is required when adding new required attributes to subclasses of attrs_ classes. - You define the ``category_rank`` field so that it uses the ``http`` input object to send additional requests to find the position of the current book within its category. Specifically: #. You extract the category URL from the book details page. #. You visit that category URL, and you iterate over the listed books until you find one with the same URL as the current book. If you find a match, you return the position at which you found the book. #. If there is no match, and there is a next page, you repeat the previous step with the URL of that next page as the category URL. #. If at some point there are no more “next” pages and you have not yet found the book, you return ``None``. When you execute ``tutorial-project/run.py`` now, ``category_rank`` has the expected value: .. code-block:: python CategorizedBook(title="The Exiled", category="Mystery", category_rank=24) Use parameters ============== You may notice that the execution takes longer now. That is because ``CategorizedBookPage`` now requires 2 or more requests, to find the value of the ``category_rank`` attribute. If you use ``CategorizedBookPage`` as part of a web scraping project that targets a single book URL, it cannot be helped. If you want to extract the ``category_rank`` attribute, you need those additional requests. Your only option to avoid additional requests is to stop extracting the ``category_rank`` attribute. 
However, if your web scraping project is targeting all book URLs from one or more categories by visiting those category URLs, extracting book URLs from them, and then using ``CategorizedBookPage`` with those book URLs as input, there is something you can change to save many requests: keep track of the positions where you find books as you visit their categories, and pass that position to ``CategorizedBookPage`` as additional input. Extend ``CategorizedBookPage`` in ``tutorial-project/tutorial/pages/books_toscrape_com.py`` as follows: .. literalinclude:: /tutorial-project/tutorial/pages/books_toscrape_com.py :language: python :lines: 14-15, 18-52 :emphasize-lines: 3, 12, 21-23 In the code above, you declare a new input in ``CategorizedBookPage``, ``page_params``, of type :class:`~web_poet.page_inputs.page_params.PageParams`. It is a dictionary of parameters that you may receive from the code using your page object class. In the ``category_rank`` field, you check if you have received a parameter also called ``category_rank``, and if so, you return that value instead of using additional requests to find the value. You can now update your ``tutorial-project/run.py`` script to pass that parameter to :meth:`~web_poet.framework.Framework.get_item`: .. literalinclude:: /tutorial-project/run_params.py :language: python :emphasize-lines: 15 When you execute ``tutorial-project/run.py`` now, execution should take less time, but the result should be the same as before: .. code-block:: python CategorizedBook(title="The Exiled", category="Mystery", category_rank=24) Only that now the value of ``category_rank`` comes from ``tutorial-project/run.py``, and not from additional requests sent by ``CategorizedBookPage``. scrapinghub-web-poet-ba87b95/docs/license.rst000066400000000000000000000000771517167256700213130ustar00rootroot00000000000000.. _license: ======= License ======= .. 
include:: ../LICENSE scrapinghub-web-poet-ba87b95/docs/page-objects/000077500000000000000000000000001517167256700214765ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/docs/page-objects/additional-requests.rst000066400000000000000000000147271517167256700262240ustar00rootroot00000000000000.. _additional-requests: =================== Additional requests =================== Some websites require page interactions to load some information, such as clicking a button, scrolling down or hovering on some element. These interactions usually trigger background requests that are then loaded using JavaScript. To extract such data, reproduce those requests using :class:`~.HttpClient`. Include :class:`~.HttpClient` among the :ref:`inputs ` of your :ref:`page object `, and use an asynchronous :ref:`field ` or method to call one of its methods. For example, simulating a click on a button that loads product images could look like: .. code-block:: python import attrs from web_poet import HttpClient, HttpError, field from zyte_common_items import Image, ProductPage @attrs.define class MyProductPage(ProductPage): http: HttpClient @field def productId(self): return self.css("::attr(product-id)").get() @field async def images(self): url = f"https://api.example.com/v2/images?id={self.productId}" try: response = await self.http.get(url) except HttpError: return [] else: urls = response.css(".product-images img::attr(src)").getall() return [Image(url=url) for url in urls] .. warning:: :class:`~.HttpClient` should only be used to handle the type of scenarios mentioned above. Using :class:`~.HttpClient` for crawling logic would defeat :ref:`the purpose of web-poet `. Making a request ================ :class:`~.HttpClient` provides multiple asynchronous request methods, such as: .. 
code-block:: python http = HttpClient() response = await http.get(url) response = await http.post(url, body=b"...") response = await http.request(url, method="...") response = await http.execute(HttpRequest(url, method="...")) Request methods also accept custom headers and body, for example: .. code-block:: python http.post( url, headers={"Content-Type": "application/json;charset=UTF-8"}, body=json.dumps({"foo": "bar"}).encode("utf-8"), ) Request methods may either raise an :class:`~.HttpError` or return an :class:`~.HttpResponse`. See :ref:`httpresponse`. .. note:: :class:`~.HttpClient` methods are expected to follow any redirection except when the request method is ``HEAD``. This means that the :class:`~.HttpResponse` that you get is already the end of any redirection trail. Concurrent requests =================== To send multiple requests concurrently, use :meth:`HttpClient.batch_execute <.HttpClient.batch_execute>`, which accepts any number of :class:`~.HttpRequest` instances as input, and returns :class:`~.HttpResponse` instances (and :class:`~.HttpError` instances when using ``return_exceptions=True``) in the input order. For example: .. code-block:: python import attrs from web_poet import HttpClient, HttpError, HttpRequest, field from zyte_common_items import Image, ProductPage, ProductVariant @attrs.define class MyProductPage(ProductPage): http: HttpClient max_variants = 10 @field def productId(self): return self.css("::attr(product-id)").get() @field async def variants(self): requests = [ HttpRequest(f"https://example.com/api/variant/{self.productId}/{index}") for index in range(self.max_variants) ] responses = await self.http.batch_execute(*requests, return_exceptions=True) return [ ProductVariant(color=response.css("::attr(color)").get()) for response in responses if not isinstance(response, HttpError) ] You can alternatively use :mod:`asyncio` together with :class:`~.HttpClient` to handle multiple requests.
For example, you can use :func:`asyncio.as_completed` to process the first response from a group of requests as early as possible. Error handling ============== :class:`~.HttpClient` methods may raise an exception of type :class:`~.HttpError` or a subclass. If the response HTTP status code (:attr:`response.status <.HttpResponse.status>`) is 400 or higher, :class:`~.HttpResponseError` is raised. In case of connection errors, TLS errors and similar, :class:`~.HttpRequestError` is raised. :class:`~.HttpError` provides access to the offending :attr:`~.HttpError.request`, and :class:`~.HttpResponseError` also provides access to the offending :attr:`~.HttpResponseError.response`. .. _retries-additional-requests: Retrying additional requests ============================ :ref:`Input validation ` allows retrying all inputs from a page object. To retry only additional requests, you must handle retries on your own. Your code is responsible for retrying additional requests until good response data is received, or until some maximum number of retries is exceeded. It is up to you to decide what the maximum number of retries should be for a given additional request, based on your experience with the target website. It is also up to you to decide how to implement retries of additional requests. One option would be tenacity_. For example, to try an additional request 3 times before giving up: .. _tenacity: https://tenacity.readthedocs.io/en/latest/index.html .. 
code-block:: python import attrs from tenacity import retry, stop_after_attempt from web_poet import HttpClient, HttpError, field from zyte_common_items import Image, ProductPage @attrs.define class MyProductPage(ProductPage): http: HttpClient @field def productId(self): return self.css("::attr(product-id)").get() @retry(stop=stop_after_attempt(3)) async def get_images(self): return await self.http.get(f"https://api.example.com/v2/images?id={self.productId}") @field async def images(self): try: response = await self.get_images() except HttpError: return [] else: urls = response.css(".product-images img::attr(src)").getall() return [Image(url=url) for url in urls] If the reason your additional request fails is outdated or missing data from page object input, do not try to reproduce the request for that input as an additional request. :ref:`Request fresh input instead `. scrapinghub-web-poet-ba87b95/docs/page-objects/code-examples/000077500000000000000000000000001517167256700242245ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/docs/page-objects/code-examples/attrs.py000066400000000000000000000003501517167256700257310ustar00rootroot00000000000000from attrs import define from web_poet import HttpResponse, ItemPage, field @define class FooPage(ItemPage[MyItem]): response: HttpResponse @field def foo(self) -> str: return self.response.css(".foo").get() scrapinghub-web-poet-ba87b95/docs/page-objects/code-examples/browserpage.py000066400000000000000000000002311517167256700271120ustar00rootroot00000000000000from web_poet import BrowserPage, field class FooPage(BrowserPage[MyItem]): @field def foo(self) -> str: return self.css(".foo").get() scrapinghub-web-poet-ba87b95/docs/page-objects/code-examples/itempage.py000066400000000000000000000003741517167256700263750ustar00rootroot00000000000000from web_poet import HttpResponse, ItemPage, field class FooPage(ItemPage[MyItem]): def __init__(self, response: HttpResponse): self.response = response @field def foo(self) -> str: return
self.response.css(".foo").get() scrapinghub-web-poet-ba87b95/docs/page-objects/code-examples/raw-create.py000066400000000000000000000002071517167256700266270ustar00rootroot00000000000000foo_page = FooPage( response=HttpResponse( "https://example.com", b"\nFoo", ), ) scrapinghub-web-poet-ba87b95/docs/page-objects/code-examples/register.py000066400000000000000000000002721517167256700264230ustar00rootroot00000000000000from web_poet import WebPage, field, handle_urls @handle_urls("example.com") class FooPage(WebPage[MyItem]): @field def foo(self) -> str: return self.css(".foo").get() scrapinghub-web-poet-ba87b95/docs/page-objects/code-examples/webpage.py000066400000000000000000000002211517167256700262030ustar00rootroot00000000000000from web_poet import WebPage, field class FooPage(WebPage[MyItem]): @field def foo(self) -> str: return self.css(".foo").get() scrapinghub-web-poet-ba87b95/docs/page-objects/fields.rst000066400000000000000000000532061517167256700235040ustar00rootroot00000000000000.. _fields: ====== Fields ====== A field is a read-only property in a :ref:`page object class ` decorated with :meth:`@field ` instead of :class:`@property `. Each field is named after a key of the :ref:`item ` that the page object class returns. A field uses the :ref:`inputs ` of its page object class to return the right value for the matching item key. For example: .. code-block:: python from typing import Optional import attrs from web_poet import ItemPage, HttpResponse, field @attrs.define class MyPage(ItemPage): response: HttpResponse @field def foo(self) -> Optional[str]: return self.response.css(".foo").get() .. _fields-sync-async: Synchronous and asynchronous fields =================================== Fields can be either synchronous (``def``) or asynchronous (``async def``). Asynchronous fields make sense, for example, when sending :ref:`additional requests `: .. 
code-block:: python from typing import Optional import attrs from web_poet import ItemPage, HttpClient, HttpResponse, field @attrs.define class MyPage(ItemPage): response: HttpResponse http: HttpClient @field def name(self) -> Optional[str]: return self.response.css(".name").get() @field async def price(self) -> Optional[str]: resp = await self.http.get("...") return resp.json().get("price") Unlike the values of synchronous fields, the values of asynchronous fields need to be awaited: .. code-block:: python page = MyPage(...) name = page.name price = await page.price Mixing synchronous and asynchronous fields can be messy: - You need to know whether a field is synchronous or asynchronous to write the right code to read its value. - If a field changes from synchronous to asynchronous or vice versa, calls that read the field need to be updated. Changing from synchronous to asynchronous might be sometimes necessary due to website changes (e.g. needing :ref:`additional requests `). To address these issues, use :func:`~.ensure_awaitable` to read both synchronous and asynchronous fields with the same code: .. code-block:: python from web_poet.utils import ensure_awaitable page = MyPage(...) name = await ensure_awaitable(page.name) price = await ensure_awaitable(page.price) .. note:: Using asynchronous fields only also works, but prevents accessing other fields from :ref:`field processors `. .. _inheritance: Inheritance =========== To create a page object class that is very similar to another, subclassing the former page object class is often a good approach to maximize code reuse. In a subclass of a :ref:`page object class ` you can :ref:`reimplement fields `, :ref:`add fields `, :ref:`remove fields `, or :ref:`rename fields `. .. _reimplement-field: Reimplementing a field ---------------------- Reimplementing a field when subclassing a :ref:`page object class ` should be straightforward: .. 
code-block:: python import attrs from web_poet import field, ensure_awaitable from my_library import BasePage @attrs.define class CustomPage(BasePage): @field async def foo(self) -> str: base_foo = await ensure_awaitable(super().foo) return f"{base_foo} (modified)" .. _add-field: Adding a field -------------- To add a new field to a :ref:`page object class ` when subclassing: #. Define a new :ref:`item class ` that includes the new field, for example a subclass of the item class returned by the original page object class. #. In your new page object class, subclass both the original page object class and :class:`~.Returns`, the latter including the new item class between brackets. #. Implement the extraction code for the new :ref:`field ` in the new page object class. For example: .. code-block:: python import attrs from web_poet import field, Returns from my_library import BasePage, BaseItem @attrs.define class CustomItem(BaseItem): new_field: str @attrs.define class CustomPage(BasePage, Returns[CustomItem]): @field def new_field(self) -> str: ... .. _remove-field: Removing a field ---------------- To remove a field from a :ref:`page object class ` when subclassing: #. Define a new :ref:`item class ` that defines all fields but the one being removed. #. In your new page object class, subclass the original page object class, :class:`~.Returns` with the new item class between brackets, and set ``skip_nonitem_fields=True``. When building an item, page object class fields without a matching item class field will now be ignored, rather than raising an exception. Your new page object class will still define the field, but the resulting item will not. For example: .. 
code-block:: python import attrs from web_poet import Returns from my_library import BasePage @attrs.define class CustomItem: kept_field: str @attrs.define class CustomPage(BasePage, Returns[CustomItem], skip_nonitem_fields=True): pass Alternatively, you can consider :ref:`using a page object as input <composition-input>` for removing fields. It is more verbose than subclassing, because you need to define every field in your page object class, but it can catch some mismatches between page object class fields and item class fields that would otherwise be hidden by ``skip_nonitem_fields``. .. _rename-field: Renaming a field ---------------- To rename a field from a :ref:`page object class ` when subclassing: #. Define a new :ref:`item class ` that defines all fields, including the renamed field. #. In your new page object class, subclass the original page object class, :class:`~.Returns` with the new item class between brackets, and set ``skip_nonitem_fields=True``. When building an item, page object class fields without a matching item class field will now be ignored, rather than raising an exception. #. Define a field for the new field name that returns the value from the old field name. Your new page object class will still define the old field name, but the resulting item will not. For example: .. code-block:: python import attrs from web_poet import Returns, field from web_poet.utils import ensure_awaitable from my_library import BasePage @attrs.define class CustomItem: new_field: str @attrs.define class CustomPage(BasePage, Returns[CustomItem], skip_nonitem_fields=True): @field async def new_field(self) -> str: return await ensure_awaitable(self.old_field) Alternatively, you can consider :ref:`using a page object as input <composition-input>` for renaming fields. It is more verbose than subclassing, because you need to define every field in your page object class, but it can catch some mismatches between page object class fields and item class fields that would otherwise be hidden by ``skip_nonitem_fields``. ..
_composition: Composition =========== There are 2 forms of composition that you can use when writing a page object: :ref:`using a page object as input <composition-input>`, and :ref:`using a field mixin <field-mixins>`. .. _composition-input: Using a page object as input ---------------------------- You can reuse a page object class from another page object class using composition instead of :ref:`inheritance <inheritance>` by using the original page object class as a dependency in a brand new page object class returning a brand new item class. This is a good approach when you want to reuse code but the page object classes are very different, or when you want to remove or rename fields without relying on ``skip_nonitem_fields``. For example: .. code-block:: python import attrs from web_poet import ItemPage, field, ensure_awaitable from my_library import BasePage @attrs.define class CustomItem: name: str @attrs.define class CustomPage(ItemPage[CustomItem]): base: BasePage @field async def name(self) -> str: name = await ensure_awaitable(self.base.name) brand = await ensure_awaitable(self.base.brand) return f"{brand}: {name}" Instead of a page object, it is possible to declare the :ref:`item ` it returns as a dependency in your new page object class. For example: .. code-block:: python import attrs from web_poet import ItemPage, field from my_library import BaseItem @attrs.define class CustomItem: name: str @attrs.define class CustomPage(ItemPage[CustomItem]): base: BaseItem @field def name(self) -> str: return f"{self.base.brand}: {self.base.name}" This gives you the flexibility to use :ref:`rules ` to set the page object class to use when building the item. Also, item fields can be read from synchronous methods even if the source page object fields were :ref:`asynchronous <fields-sync-async>`. On the other hand, all fields of the source page object class will always be called to build the entire item, which may be a waste of resources if you only need to access some of the item fields. ..
_field-mixins: Field mixins ------------ You can subclass :class:`web_poet.fields.FieldsMixin` to create a mixin_ to reuse field definitions across multiple, otherwise-unrelated classes. For example: .. _mixin: https://en.wikipedia.org/wiki/Mixin .. code-block:: python import attrs from web_poet import ItemPage, field from web_poet.fields import FieldsMixin from my_library import BaseItem1, BaseItem2 @attrs.define class CustomItem: name: str class NameMixin(FieldsMixin): @field def name(self) -> str: return f"{self.base.brand}: {self.base.name}" @attrs.define class CustomPage1(NameMixin, ItemPage[CustomItem]): base: BaseItem1 @attrs.define class CustomPage2(NameMixin, ItemPage[CustomItem]): base: BaseItem2 .. _field-processors: Field processors ================ It's often needed to clean or process field values using reusable functions. :meth:`@field ` takes an optional ``out`` argument with a list of such functions. They will be applied to the field value before returning it: .. code-block:: python from web_poet import ItemPage, HttpResponse, field def clean_tabs(s: str) -> str: return s.replace("\t", " ") def add_brand(s: str, page: ItemPage) -> str: return f"{page.brand} - {s}" class MyPage(ItemPage): response: HttpResponse @field(out=[clean_tabs, str.strip, add_brand]) def name(self) -> str: return self.response.css(".name ::text").get() or "" @field(cached=True) def brand(self) -> str: return self.response.css(".brand ::text").get() or "" .. _processor-page: Accessing other fields from field processors -------------------------------------------- If a processor takes an argument named ``page``, that argument will contain the page object instance. This allows processing a field differently based on the values of other fields. Be careful of circular references. Accessing a field runs its processors; if two fields reference each other, :class:`RecursionError` will be raised. 
You should enable :ref:`caching ` for fields accessed in processors, to avoid unnecessary recomputation. Processors can be applied to asynchronous fields, but processor functions must be synchronous. As a result, only values of synchronous fields can be accessed from processors through the ``page`` argument. .. _default-processors: Default processors ------------------ In addition to the ``out`` argument of :meth:`@field `, you can define processors at the page object class level by defining a nested class named ``Processors``: .. code-block:: python import attrs from web_poet import ItemPage, HttpResponse, field def clean_tabs(s: str) -> str: return s.replace("\t", " ") @attrs.define class MyPage(ItemPage): response: HttpResponse class Processors: name = [clean_tabs, str.strip] @field def name(self) -> str: return self.response.css(".name ::text").get() or "" If ``Processors`` contains an attribute with the same name as a field, the value of that attribute is used as a list of default processors for the field, to be used if the ``out`` argument of :meth:`@field ` is not defined. You can also reuse and extend the processors defined in a base class by explicitly accessing or subclassing the ``Processors`` class: .. 
code-block:: python import attrs from web_poet import ItemPage, HttpResponse, field def clean_tabs(s: str) -> str: return s.replace("\t", " ") @attrs.define class MyPage(ItemPage): response: HttpResponse class Processors: name = [str.strip] @field def name(self) -> str: return self.response.css(".name ::text").get() or "" class MyPage2(MyPage): class Processors(MyPage.Processors): # name uses the processors in MyPage.Processors.name # description now also uses them and also clean_tabs description = MyPage.Processors.name + [clean_tabs] @field def description(self) -> str: return self.response.css(".description ::text").get() or "" # brand uses the same processors as name @field(out=MyPage.Processors.name) def brand(self) -> str: return self.response.css(".brand ::text").get() or "" .. _default-processors-nested: Processors for nested fields ---------------------------- Some item fields contain nested items (e.g. a product can contain a list of variants) and it's useful to have processors for fields of these nested items. You can use the same logic for them as for normal fields if you define an extractor class that produces these nested items. Such classes should inherit from :class:`~.Extractor`. In the simplest cases you need to pass a selector to them: .. code-block:: python from typing import Any, Dict, List import attrs from parsel import Selector from web_poet import Extractor, ItemPage, HttpResponse, field @attrs.define class MyPage(ItemPage): response: HttpResponse @field async def variants(self) -> List[Dict[str, Any]]: variants = [] for color_sel in self.response.css(".color"): variant = await VariantExtractor(color_sel).to_item() variants.append(variant) return variants @attrs.define class VariantExtractor(Extractor): sel: Selector @field(out=[str.strip]) def color(self) -> str: return self.sel.css(".name::text").get() or "" In such cases you can also use :class:`~.SelectorExtractor` as a shortcut that provides ``css()`` and ``xpath()``: .. 
code-block:: python class VariantExtractor(SelectorExtractor): @field(out=[str.strip]) def color(self) -> str: return self.css(".name::text").get() or "" You can also pass other data in addition to, or instead of, selectors, such as dictionaries with some data: .. code-block:: python @attrs.define class VariantExtractor(Extractor): variant_data: dict @field(out=[str.strip]) def color(self) -> str: return self.variant_data.get("color") or "" .. _field-caching: Field caching ============= When writing extraction code for Page Objects, it's common that several attributes reuse some computation. For example, you might need to do an additional request to get an API response, and then fill several attributes from this response: .. code-block:: python from typing import Dict, Optional from web_poet import ItemPage, HttpResponse, HttpClient, validates_input class MyPage(ItemPage): response: HttpResponse http: HttpClient @validates_input async def to_item(self) -> Dict[str, Optional[str]]: api_url = self.response.css("...").get() api_response = (await self.http.get(api_url)).json() return { "name": self.response.css(".name ::text").get(), "price": api_response.get("price"), "sku": api_response.get("sku"), } When converting such Page Objects to use fields, be careful not to make an API call (or some other heavy computation) multiple times. You can do it by extracting the heavy operation to a method, and caching the results: ..
code-block:: python from typing import Dict from web_poet import ItemPage, HttpResponse, HttpClient, field, cached_method class MyPage(ItemPage): response: HttpResponse http: HttpClient @cached_method async def api_response(self) -> Dict[str, str]: api_url = self.response.css("...").get() return (await self.http.get(api_url)).json() @field def name(self) -> str: return self.response.css(".name ::text").get() or "" @field async def price(self) -> str: api_response = await self.api_response() return api_response.get("price") or "" @field async def sku(self) -> str: api_response = await self.api_response() return api_response.get("sku") or "" As you can see, ``web-poet`` provides the :func:`~.cached_method` decorator, which allows memoizing function results. It supports both sync and async methods, i.e. you can use it on regular methods (``def foo(self)``), as well as on async methods (``async def foo(self)``). The refactored example, with per-attribute fields, is more verbose than the original one, where a single ``to_item`` method is used. However, it provides some advantages — if only a subset of attributes is needed, then it's possible to use the Page Object without doing unnecessary work. For example, if the user only needs the ``name`` field in the example above, no additional requests (API calls) will be made. Sometimes you might want to cache a ``@field``, i.e. a property which computes an attribute of the final item. In such cases, use the ``@field(cached=True)`` decorator instead of ``@field``. ``cached_method`` vs ``lru_cache`` vs ``cached_property`` --------------------------------------------------------- If you're an experienced Python developer, you might wonder why the :func:`~.cached_method` decorator is needed if Python already provides :func:`functools.lru_cache`. For example, one can write this: .. code-block:: python from functools import lru_cache from web_poet import ItemPage class MyPage(ItemPage): ... @lru_cache def heavy_method(self): ... Don't do it!
There are two issues with :func:`functools.lru_cache`, which make it unsuitable here: 1. It doesn't work properly on methods, because ``self`` is used as a part of the cache key. It means a reference to an instance is kept in the cache, and so created page objects are never deallocated, causing a memory leak. 2. :func:`functools.lru_cache` doesn't work on ``async def`` methods, so you can't cache e.g. results of API calls using :func:`functools.lru_cache`. :func:`~.cached_method` solves both of these issues. You may also use :func:`functools.cached_property`, or an external package like async_property_ with async versions of ``@property`` and ``@cached_property`` decorators; unlike :func:`functools.lru_cache`, they all work fine for this use case. .. _async_property: https://github.com/ryananguiano/async_property Exception caching ----------------- Note that exceptions are not cached - neither by :func:`~.cached_method`, nor by ``@field(cached=True)``, nor by :func:`functools.lru_cache`, nor by :func:`functools.cached_property`. Usually it's not an issue, because an exception is usually propagated, and so there are no duplicate calls anyway. But, just in case, keep this in mind. Field metadata ============== ``web-poet`` allows storing arbitrary information for each field using the ``meta`` keyword argument: .. code-block:: python from web_poet import ItemPage, field class MyPage(ItemPage): @field(meta={"expensive": True}) async def my_field(self): ... To retrieve this information, use :func:`web_poet.fields.get_fields_dict`; it returns a dictionary, where keys are field names, and values are :class:`web_poet.fields.FieldInfo` instances. ..
code-block:: python from web_poet.fields import get_fields_dict fields_dict = get_fields_dict(MyPage) field_names = fields_dict.keys() my_field_meta = fields_dict["my_field"].meta print(field_names) # dict_keys(['my_field']) print(my_field_meta) # {'expensive': True} Input validation ================ :ref:`Input validation `, if used, happens before field evaluation, and it may override the values of fields, preventing field evaluation from ever happening. For example: .. code-block:: python class Page(ItemPage[Item]): def validate_input(self) -> Item: return Item(foo="bar") @field def foo(self): raise RuntimeError("This exception is never raised") assert Page().foo == "bar" Field evaluation may still happen for a field if the field is used in the implementation of the ``validate_input`` method. Note, however, that only synchronous fields can be used from the ``validate_input`` method. scrapinghub-web-poet-ba87b95/docs/page-objects/frameworks.rst000066400000000000000000000013311517167256700244060ustar00rootroot00000000000000.. _frameworks: ========== Frameworks ========== :ref:`Page objects ` are not meant to be used in isolation with web-poet. They are meant to be used with a web-poet framework. A web-poet framework is a Python web scraping framework, library, or plugin that implements the :ref:`web-poet specification `. At the moment, the only production-ready web-poet framework that exists is scrapy-poet_, which brings web-poet support to Scrapy_. .. _Scrapy: https://scrapy.org/ .. _scrapy-poet: https://scrapy-poet.readthedocs.io/en/stable/ There is also a :ref:`built-in framework ` for simple use cases. As web-poet matures and sees wider adoption, we hope to see more frameworks add support for it. scrapinghub-web-poet-ba87b95/docs/page-objects/index.rst000066400000000000000000000102221517167256700233340ustar00rootroot00000000000000.. 
_page-objects: ============ Page objects ============ A page object is a code wrapper for a webpage, or for a part of a webpage, that implements the logic to parse the raw webpage data into structured data. To use web-poet, :ref:`define page object classes ` for your target websites, and :ref:`get the output item ` using a :ref:`web-poet framework `. .. _page-object-classes: Defining a page object class ============================ A page object class is a Python class that: - Subclasses :class:`~web_poet.pages.ItemPage`. - Declares :ref:`typed input parameters ` in its ``__init__`` method. - Uses :ref:`fields `. Alternatively, you can implement a ``to_item`` method, which can be synchronous or asynchronous, and returns the webpage content as an :ref:`item `. For example: .. literalinclude:: code-examples/itempage.py .. note:: ``MyItem`` in the code examples of this page is a placeholder for an :ref:`item class `. Minimizing boilerplate ---------------------- There are a few ways for you to minimize boilerplate when defining a page object class. For example, you can use attrs_ to remove the need for a custom ``__init__`` method: .. _attrs: https://www.attrs.org/en/stable/index.html .. literalinclude:: code-examples/attrs.py If your page object class needs :class:`~web_poet.page_inputs.http.HttpResponse` as input, there is also :class:`~web_poet.pages.WebPage`, an :class:`~web_poet.pages.ItemPage` subclass that declares an :class:`~web_poet.page_inputs.http.HttpResponse` input and provides helper methods to use it: .. literalinclude:: code-examples/webpage.py Similarly, if your page object class needs :class:`~web_poet.page_inputs.browser.BrowserResponse` as input, use :class:`~web_poet.pages.BrowserPage`, which works the same way as :class:`~web_poet.pages.WebPage` but for browser-rendered pages: .. literalinclude:: code-examples/browserpage.py .. 
_output-item: Getting the output item ======================= You should :ref:`include your page object classes into a page object registry `, e.g. decorate them with :func:`~.handle_urls`: .. literalinclude:: code-examples/register.py Then, provided your page object class code is imported (see :func:`~web_poet.rules.consume_modules`), your :ref:`framework ` can build the output item after you provide the target URL and the desired :ref:`output item class `, as :ref:`shown in the tutorial `. Your framework chooses the right page object class based on your input parameters, downloads the required data, builds a page object, and calls the ``to_item`` method of that page object. Note that, while the examples above use :class:`dict` as an output item for simplicity, using less generic :ref:`item classes ` is recommended. That way, you can use different page object classes, with different output items, for the same website. Getting a page object --------------------- Alternatively, frameworks can return a page object instead of an item, and you can call ``to_item`` yourself. However, there are drawbacks to this approach: - ``to_item`` can be synchronous or asynchronous, so you need to use :func:`~web_poet.utils.ensure_awaitable`: .. code-block:: python from web_poet.utils import ensure_awaitable item = await ensure_awaitable(foo_page.to_item()) - ``to_item`` may raise certain exceptions, like :exc:`~web_poet.exceptions.core.Retry` or :exc:`~web_poet.exceptions.core.UseFallback`, which, depending on your :ref:`framework `, may not be handled automatically when getting a page object instead of an item. Building a page object manually ------------------------------- It is possible to create a page object from a page object class passing its inputs as parameters. For example, to manually create an instance of the ``FooPage`` page object class defined above: .. 
literalinclude:: code-examples/raw-create.py However, your code will break if the page object class changes its :ref:`inputs `. Building page objects using :ref:`frameworks ` prevents that. scrapinghub-web-poet-ba87b95/docs/page-objects/input-validation.rst000066400000000000000000000104151517167256700255200ustar00rootroot00000000000000.. _input-validation: ================ Input validation ================ Sometimes the data that your page object receives as input may be invalid. You can define a ``validate_input`` method in a page object class to check its input data and determine how to handle invalid input. ``validate_input`` is called on the first execution of ``ItemPage.to_item()`` or the first access to a :ref:`field `. In both cases validation happens early; in the case of fields, it happens before field evaluation. ``validate_input`` is a synchronous method that expects no parameters, and its outcome may be any of the following: - Return ``None``, indicating that the input is valid. .. _retries-input: - Raise :exc:`~web_poet.exceptions.Retry`, indicating that the input looks like the result of a temporary issue, and that trying to fetch similar input again may result in valid input. See also :ref:`retries-additional-requests`. - Raise :exc:`~web_poet.exceptions.UseFallback`, indicating that the page object does not support the input, and that an alternative parsing implementation should be tried instead. For example, imagine you have a page object for website commerce.example, and that commerce.example is built with a popular e-commerce web framework. You could have a generic page object for products of websites using that framework, ``FrameworkProductPage``, and a more specific page object for commerce.example, ``EcommerceExampleProductPage``. 
If ``EcommerceExampleProductPage`` cannot parse a product page, but it looks like it might be a valid product page, you would raise :exc:`~web_poet.exceptions.UseFallback` to try to parse the same product page with ``FrameworkProductPage``, in case it works. .. note:: web-poet does not dictate how to define or use an alternative parsing implementation as fallback. It is up to web-poet frameworks to choose how they implement fallback handling. - Return an item to override the output of the ``to_item`` method and of fields. For input not matching the expected type of data, returning an item that indicates so is recommended. For example, if your page object parses an e-commerce product, and the input data corresponds to a list of products rather than a single product, you could return a product item that somehow indicates that it is not a valid product item, such as ``Product(is_valid=False)``. For example: .. code-block:: python def validate_input(self): if self.css(".product-id::text") is not None: return if self.css(".http-503-error"): raise Retry() if self.css(".product"): raise UseFallback() if self.css(".product-list"): return Product(is_valid=False) You may use fields in your implementation of the ``validate_input`` method, but only synchronous fields are supported. For example: .. code-block:: python class Page(WebPage[Item]): def validate_input(self): if not self.name: raise UseFallback() @field(cached=True) def name(self): return self.css(".product-name ::text") .. tip:: :ref:`Cache fields ` used in the ``validate_input`` method, so that when they are used from ``to_item`` they are not evaluated again. If you implement a custom ``to_item`` method, as long as you are inheriting from :class:`~web_poet.pages.ItemPage`, you can enable input validation decorating your custom ``to_item`` method with :func:`~web_poet.util.validates_input`: .. 
code-block:: python from web_poet import validates_input class Page(ItemPage[Item]): @validates_input async def to_item(self): ... :exc:`~web_poet.exceptions.Retry` and :exc:`~web_poet.exceptions.UseFallback` may also be raised from the ``to_item`` method. This could come in handy, for example, if after you execute some asynchronous code, such as an :ref:`additional request `, you find out that you need to retry the original request or use a fallback. Input Validation Exceptions =========================== .. autoexception:: web_poet.exceptions.PageObjectAction .. autoexception:: web_poet.exceptions.Retry .. autoexception:: web_poet.exceptions.UseFallback scrapinghub-web-poet-ba87b95/docs/page-objects/inputs.rst000066400000000000000000000151311517167256700235530ustar00rootroot00000000000000.. _inputs: ====== Inputs ====== :ref:`Page object classes `, in their ``__init__`` method, must define input parameters with type hints pointing to input classes. Those input classes may be: - :ref:`Built-in web-poet input classes `. - :ref:`Custom input classes `. - Other :ref:`page object classes `. - :ref:`Item classes `, when using a :ref:`framework ` that can provide item classes. - Any other class that subclasses :class:`~web_poet.pages.Injectable` or is registered or decorated with :meth:`Injectable.register `. Based on the target URL and parameter type hints, :ref:`frameworks ` automatically build the required objects at run time, and pass them to the ``__init__`` method of the corresponding page object class. For example, if a page object class has an ``__init__`` parameter of type :class:`~web_poet.page_inputs.http.HttpResponse`, and the target URL is https://example.com, your framework would send an HTTP request to https://example.com, download the response, build an :class:`~web_poet.page_inputs.http.HttpResponse` object with the response data, and pass it to the ``__init__`` method of the page object class being used. .. 
_built-in-inputs: Built-in input classes ====================== .. warning:: Not all :ref:`frameworks ` support all web-poet built-in input classes. The :mod:`web_poet.page_inputs` module defines multiple classes that you can define as inputs for a page object class, including: - :class:`~web_poet.page_inputs.http.HttpResponse`, a complete HTTP response, including URL, headers, and body. This is the most common input for a page object class. See :ref:`httpresponse`. - :class:`~web_poet.page_inputs.client.HttpClient`, to send :ref:`additional requests `. - :class:`~web_poet.page_inputs.http.RequestUrl`, the target URL before following redirects. Useful, for example, to skip the target URL download, and instead use :class:`~web_poet.page_inputs.client.HttpClient` to send a custom request based on parts of the target URL. - :class:`~web_poet.page_inputs.page_params.PageParams`, to receive data from the crawling code. - :class:`~web_poet.page_inputs.stats.Stats`, to write key-value data pairs during parsing that you can inspect later, e.g. for debugging purposes. - :class:`~web_poet.page_inputs.browser.BrowserResponse`, which includes URL, status code and :class:`~web_poet.page_inputs.browser.BrowserHtml` of a rendered web page. .. tip:: You can use :class:`~web_poet.pages.BrowserPage` instead of :class:`~web_poet.pages.ItemPage` to have :class:`~web_poet.page_inputs.browser.BrowserResponse` as input and get convenient shortcuts for working with it. - :class:`~web_poet.page_inputs.response.AnyResponse`, which either holds :class:`~web_poet.page_inputs.browser.BrowserResponse` or :class:`~web_poet.page_inputs.http.HttpResponse` as the ``.response`` instance, depending on which one is available or is more appropriate. .. _Document Object Model: https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model .. _httpresponse: Working with HttpResponse ========================= :class:`~.HttpResponse` has many attributes and methods. .. 
tip:: You can use :class:`~web_poet.pages.WebPage` instead of :class:`~web_poet.pages.ItemPage` to have :class:`~web_poet.page_inputs.http.HttpResponse` as input and get convenient shortcuts for working with it. To get the entire response body, you can use :attr:`~.HttpResponse.body` for the raw :class:`bytes`, :attr:`~.HttpResponse.text` for the :class:`str` (decoded with the detected :attr:`~.HttpResponse.encoding`), or :meth:`json() <.HttpResponse.json>` to load a JSON response as a Python data structure: >>> response.body b'{"foo": "bar"}' >>> response.text '{"foo": "bar"}' >>> response.json() {'foo': 'bar'} There are also methods to select content from responses: :meth:`jmespath() <.HttpResponse.jmespath>` for JSON and :meth:`css() <.HttpResponse.css>` and :meth:`xpath() <.HttpResponse.xpath>` for HTML and XML: >>> response.jmespath("foo") [] >>> response.css("h1::text") [] >>> response.xpath("//h1/text()") [] .. _browserresponse: Working with BrowserResponse ============================= :class:`~.BrowserResponse` is similar to :class:`~.HttpResponse`, but for browser-rendered pages. In addition to the :attr:`~.BrowserResponse.text` attribute, it has an :attr:`~.BrowserResponse.html` attribute containing the rendered HTML (as a :class:`str`) after JavaScript execution. Like :class:`~.HttpResponse`, it provides :meth:`css() <.BrowserResponse.css>` and :meth:`xpath() <.BrowserResponse.xpath>` methods to select content from the rendered page: >>> response.html '...

Title

...' >>> response.css("h1::text") [] >>> response.xpath("//h1/text()") [] .. _custom-inputs: Custom input classes ==================== You may define your own input classes if you are using a :ref:`framework ` that supports it. However, note that custom input classes may make your :ref:`page object classes ` less portable across frameworks. .. _input-annotations: Input annotations ================= A type hint that points to an input class can be annotated with :obj:`~typing.Annotated`. For example: .. code-block:: python from typing import Annotated from web_poet.page_inputs.http import HttpResponse from web_poet.pages import WebPage class MyPage(WebPage): def __init__(self, response: Annotated[HttpResponse, "my-metadata"]): ... web-poet requires annotations to be JSON-serializable, for :ref:`fixture support `. Because :obj:`~typing.Annotated` requires annotations to be hashable, web-poet provides :func:`~web_poet.annotation_encode` to support :class:`list` and :class:`dict` structures in annotations. For example: .. code-block:: python from typing import Annotated from web_poet import annotation_encode from web_poet.page_inputs.http import HttpResponse from web_poet.pages import WebPage class MyPage(WebPage): def __init__( self, response: Annotated[HttpResponse, annotation_encode({"foo": ["bar"]})] ): ... scrapinghub-web-poet-ba87b95/docs/page-objects/items.rst000066400000000000000000000057121517167256700233560ustar00rootroot00000000000000.. _item-classes: .. _items: ===== Items ===== The ``to_item`` method of a :ref:`page object class ` must return an item. An item is a data container object supported by the itemadapter_ library, such as a :class:`dict`, an attrs_ class, or a :func:`~dataclasses.dataclass` class. For example: .. code-block:: python @attrs.define class MyItem: foo: int bar: str .. _attrs: https://www.attrs.org/en/stable/ .. 
_itemadapter: https://github.com/scrapy/itemadapter Because itemadapter_ allows implementing support for arbitrary classes, any kind of Python object can potentially work as an item. Defining the item class of a page object class ============================================== When inheriting from :class:`~.ItemPage`, indicate the item class to return between brackets: .. code-block:: python @attrs.define class MyPage(ItemPage[MyItem]): ... :meth:`~.ItemPage.to_item` builds an instance of the specified item class based on the page object class :ref:`fields `. .. code-block:: python page = MyPage(...) item = await page.to_item() assert isinstance(item, MyItem) You can also define :class:`~.ItemPage` subclasses that are not meant to be used directly, only subclassed, and not annotate :class:`~.ItemPage` in them. You can then annotate those classes when subclassing them: .. code-block:: python @attrs.define class MyBasePage(ItemPage): ... @attrs.define class MyPage(MyBasePage[MyItem]): ... To change the item class of a subclass that has already defined its item class, use :class:`~.Returns`: .. code-block:: python @attrs.define class MyOtherPage(MyPage, Returns[MyOtherItem]): ... Best practices for item classes =============================== To keep your code maintainable, we recommend that you: - Instead of :class:`dict`, use proper item classes based on :mod:`dataclasses` or :doc:`attrs `, to make it easier to detect issues like field name typos or missing required fields. - Reuse item classes. For example, if you want to extract product details data from 2 e-commerce websites, try to use the same item class for both of them. Or at least try to define a base item class with shared fields, and only keep website-specific fields in website-specific items. - Keep item classes as logic-free as possible. For example, any parsing and field cleanup logic is better handled through :ref:`page object classes `, e.g. using :ref:`field processors `.
Having code that makes item field values different from their counterpart page object field values can subvert the expectations of users of your code, which might need to access page object fields directly, for example for field subset selection. If you are looking for ready-made item classes, check out `zyte-common-items`_. .. _zyte-common-items: https://zyte-common-items.readthedocs.io/en/latest/index.html scrapinghub-web-poet-ba87b95/docs/page-objects/page-params.rst000066400000000000000000000101401517167256700244210ustar00rootroot00000000000000.. _page-params: ================= Using page params ================= In some cases, :ref:`page object classes ` might require or allow parameters from the calling code, e.g. to change their behavior or make optimizations. To support parameters, add :class:`~.PageParams` to your :ref:`inputs `: .. code-block:: python import attrs from web_poet import PageParams, WebPage @attrs.define class MyPage(WebPage): page_params: PageParams In your page object class, you can read parameters from a :class:`~.PageParams` object as you would from a :class:`dict`: .. code-block:: python foo = self.page_params["foo"] bar = self.page_params.get("bar", "default") The way the calling code sets those parameters depends on your :ref:`web-poet framework `. Example: Controlling item values ================================ .. 
code-block:: python import attrs import web_poet from web_poet import validates_input @attrs.define class ProductPage(web_poet.WebPage): page_params: web_poet.PageParams default_tax_rate = 0.10 @validates_input def to_item(self): item = { "url": self.url, "name": self.css("#main h3.name ::text").get(), "price": self.css("#main .price ::text").get(), } self.calculate_price_with_tax(item) return item def calculate_price_with_tax(self, item): tax_rate = self.page_params.get("tax_rate", self.default_tax_rate) item["price_with_tax"] = item["price"] * (1 + tax_rate) From the example above, we were able to provide optional information regarding the **tax rate** of the product. This could be useful when trying to support the different tax rates for each state or territory. However, since we're treating the **tax_rate** as optional information, notice that we also have the ``default_tax_rate`` as a backup value just in case it's not available. Example: Controlling page object behavior ========================================= Let's try an example wherein :class:`~.PageParams` is able to control how :ref:`additional requests ` are being used. Specifically, we are going to use :class:`~.PageParams` to control the number of pages visited. .. code-block:: python from typing import List import attrs import web_poet from web_poet import validates_input @attrs.define class ProductPage(web_poet.WebPage): http: web_poet.HttpClient page_params: web_poet.PageParams default_max_pages = 5 @validates_input async def to_item(self): return {"product_urls": await self.get_product_urls()} async def get_product_urls(self) -> List[str]: # Simulates scrolling to the bottom of the page to load the next # set of items in an "Infinite Scrolling" category list page.
max_pages = self.page_params.get("max_pages", self.default_max_pages) requests = [ self.create_next_page_request(page_num) for page_num in range(2, max_pages + 1) ] responses = await http.batch_execute(*requests) return [ url for response in responses for product_urls in self.parse_product_urls(response) for url in product_urls ] @staticmethod def create_next_page_request(page_num): next_page_url = f"https://example.com/category/products?page={page_num}" return web_poet.Request(url=next_page_url) @staticmethod def parse_product_urls(response: web_poet.HttpResponse): return response.css("#main .products a.link ::attr(href)").getall() From the example above, we can see how :class:`~.PageParams` is able to arbitrarily limit the pagination behavior by passing an optional **max_pages** info. Take note that a ``default_max_pages`` value is also present in the page object class in case the :class:`~.PageParams` instance did not provide it. scrapinghub-web-poet-ba87b95/docs/page-objects/rules.rst000066400000000000000000000205441517167256700233670ustar00rootroot00000000000000.. _rules: ===== Rules ===== Rules are :class:`~.ApplyRule` objects that tell web-poet which :ref:`page object class ` to use based on user input, i.e. the target URL and the requested output class (a :ref:`page object class ` or an :ref:`item class `). Rules are necessary if you want to request an item instance, because rules tell web-poet which page object class to use to generate your item instance. Rules can also be useful as documentation or to get information about page object classes programmatically. :ref:`Rule precedence ` can also be useful. For example, to implement generic page object classes that you can override for specific websites. Defining rules ============== The :func:`~.handle_urls` decorator is the simplest way to define a rule for a page object. For example: .. _handle_url_example: .. 
code-block:: python from web_poet import ItemPage, handle_urls from my_items import MyItem @handle_urls("example.com") class MyPage(ItemPage[MyItem]): ... The code above tells web-poet to use the ``MyPage`` :ref:`page object class ` when given a URL pointing to the ``example.com`` domain name and being asked for ``MyPage`` or ``MyItem`` as output class. Alternatively, you can manually create and register :class:`~.ApplyRule` objects: .. code-block:: python from url_matcher import Patterns from web_poet import ApplyRule, ItemPage, default_registry from my_items import MyItem class MyPage(ItemPage[MyItem]): ... rule = ApplyRule( for_patterns=Patterns(include=["example.com"]), use=MyPage, to_return=MyItem, ) default_registry.add_rule(rule) URL patterns ------------ Every rule defines a :class:`url_matcher.Patterns` object that determines if any given URL is a match for the rule. :class:`~url_matcher.Patterns` objects offer a simple but powerful syntax for URL matching. For example: ======================= =============================================================== Pattern Behavior ======================= =============================================================== (empty string) Matches any URL example.com Matches any URL on the example.com domain and subdomains example.com/products/ Matches example.com URLs under the /products/ path example.com?productId=* Matches example.com URLs with productId=… in their query string ======================= =============================================================== For details and more examples, see the :ref:`url-matcher documentation `. When using the :func:`~handle_urls` decorator, its ``include``, ``exclude``, and ``priority`` parameters are used to create a :class:`~url_matcher.Patterns` object. When creating an :class:`~.ApplyRule` object manually, you must create a :class:`~url_matcher.Patterns` object yourself and pass it to the ``for_patterns`` parameter of :class:`~.ApplyRule`. .. 
_rule-precedence: Rule precedence --------------- Often you define rules so that a given user input, i.e. a combination of a target URL and an output class, can only match 1 rule. However, there are scenarios where it can be useful to define 2 or more rules that can all match a given user input. For example, you might want to define a “generic” page object class with some default implementation of field extraction, e.g. based on semantic markup or machine learning, and be able to override it based on the input URL, e.g. for specific websites or URL patterns, with a more specific page object class. For a given user input, when 2 or more rules are a match, web-poet breaks the tie as follows: - One rule can indicate that its :ref:`page object class ` **overrides** another page object class. This is specified by :attr:`ApplyRule.instead_of <~.ApplyRule.instead_of>`. When using the :func:`~handle_urls` decorator, the value comes from the ``instead_of`` parameter of the decorator. For example, the following page object class would override ``MyPage`` from :ref:`above `: .. code-block:: python @handle_urls("example.com", instead_of=MyPage) class OverridingPage(ItemPage[MyItem]): ... That is: - If the requested output class is ``MyPage``, an instance of ``OverridingPage`` is returned instead. - If the requested output class is ``MyItem``, an instance of ``OverridingPage`` is created, and used to build an instance of ``MyItem``, which is returned. - One rule can declare a higher **priority** than another rule, taking precedence. Rule priority is determined by the value of :attr:`ApplyRule.for_patterns.priority `. When using the :func:`~handle_urls` decorator, the value comes from the ``priority`` parameter of the decorator. Rule priority is 500 by default. For example, given the following page object class: .. code-block:: python @handle_urls("example.com", priority=510) class PriorityPage(ItemPage[MyItem]): ... 
The following would happen: - If the requested output class is ``MyItem``, an instance of ``PriorityPage`` is created, and used to build an instance of ``MyItem``, which is returned. - If the requested output class is ``MyPage``, an instance of ``MyPage`` is returned, since ``PriorityPage`` is not defined as an override for ``MyPage``. ``instead_of`` trumps ``priority``: If a rule overrides another rule using ``instead_of``, it does not matter if the overridden rule had a higher priority. When multiple rules override the same page object class, though, ``priority`` can break the tie. If none of those tie breakers are in place, the first rule added to the registry takes precedence. However, relying on registration order is discouraged, and you will get a warning if you register 2 or more rules with the same URL patterns, same output item class, same priority, and no ``instead_of`` value. See also :ref:`rule-conflicts`. Rule registries =============== Rules should be stored in a :class:`~.RulesRegistry` object. web-poet defines a default, global :class:`~.RulesRegistry` object at ``web_poet.default_registry``. Rules defined with the :func:`~.handle_urls` decorator are added to this registry. .. _load-rules: Loading rules ------------- For a :ref:`framework ` to apply your rules, you need to make sure that your code that adds those rules to ``web_poet.default_registry`` is executed. When using the :func:`~web_poet.handle_urls` decorator, that usually means that you need to make sure that Python imports the files where the decorator is used. You can use the :func:`~.web_poet.rules.consume_modules` function in some entry point of your code for that: .. code-block:: python from web_poet import consume_modules consume_modules("my_package.pages", "external_package.pages") The ideal location for this function depends on your framework. Check the documentation of your framework for more information. ..
_rule-conflicts: Rule conflicts ============== A rule conflict occurs when multiple rules have the same ``instead_of`` and ``priority`` values and can match the same URL. When it affects rules defined in your code base, solve the conflict adjusting those ``instead_of`` and ``priority`` values as needed. When it affects rules from a external package, you have the following options to solve the conflict: - **Subclass** one of the conflicting page object classes in your code base, using a similar rule except for a tie-breaking change to its ``instead_of`` or ``priority`` value. For example, if ``package1.A`` and ``package2.B`` are page object classes with conflicting rules, with a default priority (500), and you want ``package1.A`` to take precedence, declare a new page object class as follows: .. code-block:: python from package1 import A from web_poet import handle_urls @handle_urls(..., priority=510) class NewA(A): pass - If your :ref:`framework ` allows defining a **custom list of rules**, you could use :class:`web_poet.default_registry <~.RulesRegistry>` methods like :meth:`~.RulesRegistry.get_rules` or :meth:`~.RulesRegistry.search` to build such a list, including only rules that have no conflicts. scrapinghub-web-poet-ba87b95/docs/page-objects/stats.rst000066400000000000000000000014731517167256700233730ustar00rootroot00000000000000.. _stats: ===== Stats ===== During parsing, storing some data about the parsing itself can be useful for debugging, monitoring, and reporting. The :class:`~.Stats` page input allows storing such data. For example, you can use stats to track which parsing code is actually used, so that you can remove code once it is no longer necessary due to upstream changes: .. 
code-block:: python from attrs import define from web_poet import field, Stats, WebPage @attrs.define class MyPage(WebPage): stats: Stats @field def title(self): if title := self.css("h1::text").get(): self.stats.inc("MyPage/field-src/title/h1") elif title := self.css("h2::text").get(): self.stats.inc("MyPage/field-src/title/h2") return title scrapinghub-web-poet-ba87b95/docs/page-objects/testing.rst000066400000000000000000000404751517167256700237170ustar00rootroot00000000000000.. _web-poet-testing: ====================== Tests for page objects ====================== Page Objects that inherit from :class:`~.ItemPage` can be tested by saving the dependencies needed to create one and the result of :meth:`~web_poet.pages.ItemPage.to_item`, recreating the Page Object from the dependencies, running its :meth:`~web_poet.pages.ItemPage.to_item` and comparing the result to the saved one. ``web-poet`` provides the following tools for this: * dependency serialization into a Python object and into a set of files; * recreating Page Objects from the serialized dependencies; * a high-level function to save a test fixture; * a plugin for ``pytest 7.0.0`` and higher that discovers fixtures and runs tests for them. .. _dep-serialization: Serialization ============= :func:`web_poet.serialization.serialize` can be used to serialize an iterable of Page Object dependencies to a Python object. :func:`web_poet.serialization.deserialize` can be used to recreate a Page Object from this serialized data. An instance of :class:`web_poet.serialization.SerializedDataFileStorage` can be used to write the serialized data to a set of files in a given directory and to read it back. .. note:: We only support serializing dependencies, not Page Object instances, because the only universal way to recreate a Page Object is from its dependencies, not from some saved internal state. Each dependency is serialized to one or several ``bytes`` objects, each of which is saved as a single file. 
:func:`web_poet.serialization.serialize_leaf` and :func:`web_poet.serialization.deserialize_leaf` are used to convert between a dependency and this set of ``bytes`` objects. They are implemented using :func:`functools.singledispatch` and while the types provided by ``web-poet`` are supported out of the box, user-defined types need a pair of implementation functions that need to be registered using :func:`web_poet.serialization.register_serialization`. .. _fixtures: Fixtures ======== The provided ``pytest`` plugin expects fixtures in a certain layout. A set of fixtures for a single Page Object should be contained in a directory named as that Page Object fully qualified class name. Each fixture is a directory inside it, that contains data for Page Object inputs and output:: fixtures └── my_project.pages.MyItemPage ├── test-1 │ ├── inputs │ ├── HttpClient.exists │ │ ├── HttpResponse-body.html │ │ ├── HttpResponse-info.json │ │ └── ResponseUrl.txt │ ├── meta.json │ └── output.json └─── test-2 ├── inputs │ ├── HttpClient.exists │ ├── HttpClient-0-HttpRequest.info.json │ ├── HttpClient-0-HttpResponse.body.html │ ├── HttpClient-0-HttpResponse.info.json │ ├── HttpClient-1-HttpRequest.body.txt │ ├── HttpClient-1-HttpRequest.info.json │ ├── HttpClient-1-HttpResponse.body.html │ ├── HttpClient-1-HttpResponse.info.json │ ├── HttpResponse-body.html │ ├── HttpResponse-info.json │ └── ResponseUrl.txt ├── meta.json └── output.json .. _fixture-save: :func:`web_poet.testing.Fixture.save` can be used to create a fixture inside a Page Object directory from an iterable of dependencies, an output item and an optional metadata dictionary. It can optionally take a name for the fixture directory. By default it uses incrementing names "test-1", "test-2" etc. .. note:: ``output.json`` contains a result of ``page_object.to_item()`` converted to a dict using the itemadapter_ library and saved as JSON. 
After generating a fixture you can edit ``output.json`` to modify expected field values and add new fields, which is useful when creating tests for code that isn't written yet or before modifying its behavior. .. _web-poet-testing-scrapy-poet: scrapy-poet integration ======================= Projects that use the `scrapy-poet`_ library can use the :ref:`Scrapy command ` provided by it to generate fixtures in a convenient way. It's available starting with scrapy-poet 0.8.0. .. _scrapy-poet: https://github.com/scrapinghub/scrapy-poet .. _web-poet-testing-pytest: Running tests ============= The provided ``pytest`` plugin is automatically registered when ``web-poet`` is installed, and running ``python -m pytest`` in a directory containing fixtures will discover them and run tests for them. By default, the plugin generates: * a test which checks that ``to_item()`` doesn't raise an exception (i.e. it can be executed), * a test per each output attribute of the item, * an additional test to check that there are no extra attributes in the output. For example, if your item has 5 attributes, and you created 2 fixtures, pytest will run (5+1+1)*2 = 14 tests. This allows to report failures for individual fields separately. If ``to_item`` raises an error, there is no point in running other tests, so they're skipped in this case. If you prefer less granular test failure reporting, you can use pytest with the ``--web-poet-test-per-item`` option:: python -m pytest --web-poet-test-per-item In this case there is going to be a single test per fixture: if the result is not fully correct, the test fails. So, following the previous example, it'd be 2 tests instead of 14. .. _web-poet-testing-tdd: Test-Driven Development ======================= You can follow TDD (Test-Driven Development) approach to develop your page objects. To do so, 1. Generate a fixture (see :ref:`web-poet-testing-scrapy-poet`). 2. Populate ``output.json`` with the correct expected output. 3. 
Run the tests (see :ref:`web-poet-testing-pytest`) and update the code until all tests pass. It's convenient to use web-poet :ref:`fields`, and implement extraction field-by-field, because you'll be getting an additional test passing after each field is implemented. This approach allows a fast feedback loop: there is no need to download page multiple times, and you have a clear progress indication for your work (number of failing tests remaining). Also, in the end you get a regression test, which can be helpful later. Sometimes it may be awkward to set the correct value in JSON before starting the development, especially if a value is large or has a complex structure. For example, this could be the case for e-commerce product description field, which can be hard to copy-paste from the website, and which may have various whitespace normalization rules which you need to apply. In this case, it may be more convenient to implement the extraction first, and only then populate the ``output.json`` file with the correct value. You can use ``python -m web_poet.testing rerun `` command in this case, to re-run the page object using the inputs saved in a fixture. This command prints output of the page object, as JSON; you can then copy-paste relevant parts to the ``output.json`` file. It's also possible to make the command print only some of the fields. For example, you might run the following command after implementing extraction for "description" and "descriptionHtml" fields in ``my_project.pages.MyItemPage``:: python -m web_poet.testing rerun \ fixtures/my_project.pages.MyItemPage/test-1 \ --fields description,descriptionHtml It may output something like this:: { "description": "..description of the product..", "descriptionHtml": "

...

" } If these values look good, you can update ``fixtures/my_project.pages.MyItemPage/test-1/output.json`` file with these values. .. _web-poet-testing-frozen_time: Handling time fields ==================== Sometimes output of a page object might depend on the current time. For example, the item may contain the scraping datetime, or a current timestamp may be used to build some URLs. When a test runs at a different time it will break. To avoid this :ref:`the metadata dictionary ` can contain a ``frozen_time`` field set to the time value used when generating the test. This will instruct the test runner to use the same time value so that field comparisons are still correct. The value can be any string understood by `dateutil`_. If it doesn't include timezone information, the local time of the machine will be assumed. If it includes timezone information, on non-Windows systems the test process will be executed in that timezone, so that output fields that contain local time are correct. On Windows systems (where changing the process timezone is not possible) the time value will be converted to the local time of the machine, and such fields will containt wrong data if these timezones don't match. Consider an example item:: import datetime from web_poet import WebPage, validates_input class DateItemPage(WebPage): @validates_input async def to_item(self) -> dict: # e.g. 2001-01-01 11:00:00 +00 now = datetime.datetime.now(datetime.timezone.utc) return { # '2001-01-01T11:00:00Z' "time_utc": now.strftime("%Y-%M-%dT%H:%M:%SZ"), # if the current timezone is CET, then '2001-01-01T12:00:00+01:00' "time_local": now.astimezone().strftime("%Y-%M-%dT%H:%M:%S%z"), } We will assume that the fixture was generated in CET (UTC+1). * If the fixture doesn't have the ``frozen_time`` metadata field, the item will simply contain the current time and the test will always fail. * If ``frozen_time`` doesn't contain the timezone data (e.g. 
it is ``2001-01-01T11:00:00``), the item will depend on the machine timezone: in CET it will contain the expected values, in timezones with a different offset ``time_local`` will be different. * If ``frozen_time`` contains the timezone data and the system is not Windows, the ``time_local`` field will contain the date in that timezone, so if the timezone in ``frozen_time`` is not UTC+1, the test will fail. * If the system is Windows, the ``frozen_time`` value will be converted to the machine timezone, so the item will depend on that timezone, just like when ``frozen_time`` doesn't contain the timezone data, and ``time_local`` will similarly be only correct if the machine timezone has the same offset as CET. This means that most combinations of setups will work if ``frozen_time`` contains the timezone data, except for running tests on Windows, in which case the machine timezone should match the timezone in ``frozen_time``. Also, if items do not depend on the machine timezone (e.g. if all datetime-derived data they contain is in UTC), the tests for them should work everywhere. There is an additional limitation which we plan to fix in future versions. The time is set to the ``frozen_time`` value when the test generation (if using the ``scrapy-poet`` command) or the test run starts, but it ticks during the generation/run itself, so if it takes more than 1 second (which is quite possible even in simple cases) the time fields will have values several seconds later than ``frozen_time``. For now we recommend working around this problem by manually editing the ``output.json`` file to put the value equal to ``frozen_time`` in these fields, as running the test shouldn't take more than 1 second. .. _dateutil: https://github.com/dateutil/dateutil ..
_git-lfs: Storing fixtures in Git ======================= Fixtures can take a lot of disk space, as they usually include page responses and may include other large files, so we recommend using `Git LFS`_ when storing them in Git repos to reduce the repo space and get other performance benefits. Even if your fixtures are currently small, it may be useful to do this from the beginning, as migrating files to LFS is not easy and requires rewriting the repo history. To use Git LFS you need a Git hosting provider that supports it, and major providers and software (e.g. GitHub, Bitbucket, GitLab) support it. There are also `implementations`_ for standalone Git servers. Assuming you store the fixtures in the directory named "fixtures" in the repo root, the workflow should be as follows. Enable normal diffs for LFS files in this repo:: git config diff.lfs.textconv cat Enable LFS for the fixtures directory before committing anything in it:: git lfs track "fixtures/**" Commit the ``.gitattributes`` file (which stores the tracking information):: git add .gitattributes git commit After generating the fixtures just commit them as usual:: git add fixtures/test-1 git commit After this all usual commands including ``push``, ``pull`` or ``checkout`` should work as expected on these files. Please also check the official Git LFS documentation for more information. .. _Git LFS: https://git-lfs.com/ .. _implementations: https://github.com/git-lfs/git-lfs/wiki/Implementations .. _web-poet-testing-additional-requests: Additional requests support =========================== If the page object uses the :class:`~.HttpClient` dependency to make :ref:`additional requests `, the generated fixtures will contain these requests and their responses (or exceptions raised when the response is not received). When the test runs, :class:`~.HttpClient` will return the saved responses without doing actual requests.
Currently requests are compared by their URL, method, headers and body, so if a page object makes requests that differ between runs, the test won't be able to find a saved response and will fail. Test coverage ============= The coverage for page object code is reported correctly if tools such as `coverage`_ are used when running web-poet tests. .. _coverage: https://coverage.readthedocs.io/ .. _web-poet-testing-adapters: Item adapters ============= The testing framework uses the itemadapter_ library to convert items to dicts when storing them in fixtures and when comparing the expected and the actual output. As adapters may influence the resulting dicts, it's important to use the same adapter when generating and running the tests. It may also be useful to use different adapters in tests and in production. For example, you may want to omit empty fields in production, but be able to distinguish between empty and absent fields in tests. For this you can set the ``adapter`` field in :ref:`the metadata dictionary ` to the class that inherits from :class:`itemadapter.ItemAdapter` and has the adapter(s) you want to use in tests in its ``ADAPTER_CLASSES`` attribute (see `the relevant itemadapter docs`_ for more information). An example:: from collections import deque from itemadapter import ItemAdapter from itemadapter.adapter import DictAdapter class MyAdapter(DictAdapter): # any needed customization ... class MyItemAdapter(ItemAdapter): ADAPTER_CLASSES = deque([MyAdapter]) You can then put the ``MyItemAdapter`` class object into ``adapter`` and it will be used by the testing framework. If ``adapter`` is not set, :class:`~web_poet.testing.itemadapter.WebPoetTestItemAdapter` will be used. It works like :class:`itemadapter.ItemAdapter` but doesn't change behavior when :attr:`itemadapter.ItemAdapter.ADAPTER_CLASSES` is modified. .. _itemadapter: https://github.com/scrapy/itemadapter .. 
_the relevant itemadapter docs: https://github.com/scrapy/itemadapter/#multiple-adapter-classes .. _web-poet-testing-user-props: pytest user properties ====================== After a test run the following `pytest user properties`_ are available: * on per-field tests the ``expected_value`` and ``actual_value`` properties contain JSON-encoded expected and actual field values * on expected exception tests the ``expected_exception`` and ``actual_exception`` properties contain JSON-encoded dicts for expected and actual exceptions, with the ``import_path`` field containing the import path of the exception class and the ``msg`` field containing the first argument of the exception instance. The main use case for this is generating a `JUnitXML report`_ and getting the values from the ``/testsuites/testsuite/testcase/properties/property`` nodes. .. _pytest user properties: https://docs.pytest.org/en/stable/reference/reference.html#pytest.Item.user_properties .. _JUnitXML report: https://docs.pytest.org/en/stable/how-to/output.html#creating-junitxml-format-files scrapinghub-web-poet-ba87b95/docs/requirements.in000066400000000000000000000001271517167256700222060ustar00rootroot00000000000000sphinx-rtd-theme sphinx-scrapy @ git+https://github.com/scrapy/sphinx-scrapy.git@0.8.1 scrapinghub-web-poet-ba87b95/docs/requirements.txt000066400000000000000000000036651517167256700224310ustar00rootroot00000000000000# This file was autogenerated by uv via the following command: # uv pip compile requirements.in -o requirements.txt alabaster==1.0.0 # via sphinx babel==2.18.0 # via sphinx certifi==2026.2.25 # via requests charset-normalizer==3.4.6 # via requests docutils==0.22.4 # via # sphinx # sphinx-markdown-builder # sphinx-rtd-theme idna==3.11 # via requests imagesize==2.0.0 # via sphinx jinja2==3.1.6 # via sphinx markupsafe==3.0.3 # via jinja2 packaging==26.0 # via # sphinx # sphinx-scrapy pygments==2.19.2 # via sphinx requests==2.32.5 # via sphinx roman-numerals==4.1.0 # via sphinx 
snowballstemmer==3.0.1 # via sphinx sphinx==9.1.0 # via # sphinx-copybutton # sphinx-last-updated-by-git # sphinx-llms-txt # sphinx-markdown-builder # sphinx-rtd-theme # sphinx-scrapy # sphinxcontrib-jquery sphinx-copybutton==0.5.2 # via sphinx-scrapy sphinx-last-updated-by-git==0.3.8 # via sphinx-sitemap sphinx-llms-txt @ git+https://github.com/zytedata/sphinx-llms-txt.git@5e8866cb0cc249aa2017ad9050b3b83a7ca16f69 # via sphinx-scrapy sphinx-markdown-builder @ git+https://github.com/zytedata/sphinx-markdown-builder.git@ac9f8babfe622e4300099ab44b96d9d9228e742e # via sphinx-scrapy sphinx-rtd-theme==3.1.0 # via -r requirements.in sphinx-scrapy @ git+https://github.com/scrapy/sphinx-scrapy.git@13a74e7223a2faa2ab7e6679e6779baf76ff49e0 # via -r requirements.in sphinx-sitemap==2.9.0 # via sphinx-scrapy sphinxcontrib-applehelp==2.0.0 # via sphinx sphinxcontrib-devhelp==2.0.0 # via sphinx sphinxcontrib-htmlhelp==2.1.0 # via sphinx sphinxcontrib-jquery==4.1 # via sphinx-rtd-theme sphinxcontrib-jsmath==1.0.1 # via sphinx sphinxcontrib-qthelp==2.0.0 # via sphinx sphinxcontrib-serializinghtml==2.0.0 # via sphinx tabulate==0.10.0 # via sphinx-markdown-builder urllib3==2.6.3 # via requests scrapinghub-web-poet-ba87b95/docs/tutorial-project/000077500000000000000000000000001517167256700224425ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/docs/tutorial-project/run.py000066400000000000000000000004731517167256700236240ustar00rootroot00000000000000import asyncio from web_poet import consume_modules from web_poet.framework import Framework from tutorial.items import Book consume_modules("tutorial.pages") framework = Framework() item = asyncio.run( framework.get_item("http://books.toscrape.com/catalogue/the-exiled_247/index.html", Book) ) print(item) scrapinghub-web-poet-ba87b95/docs/tutorial-project/run_categorized.py000066400000000000000000000005501517167256700262000ustar00rootroot00000000000000import asyncio from web_poet import consume_modules from web_poet.framework import 
Framework from tutorial.items import CategorizedBook consume_modules("tutorial.pages") framework = Framework() item = asyncio.run( framework.get_item( "http://books.toscrape.com/catalogue/the-exiled_247/index.html", CategorizedBook, ) ) print(item) scrapinghub-web-poet-ba87b95/docs/tutorial-project/run_params.py000066400000000000000000000006231517167256700251640ustar00rootroot00000000000000import asyncio from web_poet import consume_modules from web_poet.framework import Framework from tutorial.items import CategorizedBook consume_modules("tutorial.pages") framework = Framework() item = asyncio.run( framework.get_item( "http://books.toscrape.com/catalogue/the-exiled_247/index.html", CategorizedBook, page_params={"category_rank": 24}, ) ) print(item) scrapinghub-web-poet-ba87b95/docs/tutorial-project/tutorial/000077500000000000000000000000001517167256700243055ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/docs/tutorial-project/tutorial/__init__.py000066400000000000000000000000001517167256700264040ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/docs/tutorial-project/tutorial/items.py000066400000000000000000000002751517167256700260040ustar00rootroot00000000000000from attrs import define @define class Book: title: str from typing import Optional @define class CategorizedBook(Book): category: str category_rank: Optional[int] = None scrapinghub-web-poet-ba87b95/docs/tutorial-project/tutorial/pages/000077500000000000000000000000001517167256700254045ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/docs/tutorial-project/tutorial/pages/__init__.py000066400000000000000000000000001517167256700275030ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/docs/tutorial-project/tutorial/pages/books_toscrape_com.py000066400000000000000000000031151517167256700316310ustar00rootroot00000000000000from web_poet import field, handle_urls, WebPage from ..items import Book @handle_urls("books.toscrape.com") class BookPage(WebPage[Book]): @field async def 
title(self): return self.css("h1::text").get() from attrs import define from web_poet import Returns from web_poet import HttpClient, Returns from web_poet import HttpClient, PageParams, Returns from ..items import CategorizedBook @handle_urls("books.toscrape.com") @define class CategorizedBookPage(BookPage, Returns[CategorizedBook]): http: HttpClient page_params: PageParams _books_per_page = 20 @field async def category(self): return self.css(".breadcrumb a::text").getall()[-1] @field async def category_rank(self): category_rank = self.page_params.get("category_rank") if category_rank is not None: return category_rank response, book_url, page = self.response, self.url, 0 category_page_url = self.css(".breadcrumb a::attr(href)").getall()[-1] while category_page_url: category_page_url = response.urljoin(category_page_url) response = await self.http.get(category_page_url) urls = response.css("h3 a::attr(href)").getall() for position, url in enumerate(urls, start=1): url = str(response.urljoin(url)) if url == book_url: return page * self._books_per_page + position category_page_url = response.css(".next a::attr(href)").get() if not category_page_url: return None page += 1 scrapinghub-web-poet-ba87b95/pyproject.toml000066400000000000000000000132711517167256700211230ustar00rootroot00000000000000[build-system] requires = ["hatchling>=1.27.0"] build-backend = "hatchling.build" [project] name = "web-poet" version = "0.24.0" description = "Zyte's Page Object pattern for web scraping" readme = "README.rst" license = "BSD-3-Clause" license-files = ["LICENSE"] authors = [ { name = "Zyte Group Ltd", email = "opensource@zyte.com" }, ] requires-python = ">=3.10" classifiers = [ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", "License :: OSI Approved :: BSD License", "Natural Language :: English", "Operating System :: OS Independent", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", 
"Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", "Programming Language :: Python :: 3.14", ] dependencies = [ "attrs >= 21.3.0", "parsel >= 1.5.0", "url-matcher >= 0.4.0", "multidict >= 0.5.0", "w3lib >= 1.22.0", "async-lru >= 1.0.3", "itemadapter >= 0.8.0", "andi >= 0.5.0", "python-dateutil >= 2.7.0", "time-machine >= 2.7.1", "packaging >= 20.0", ] [project.optional-dependencies] framework = [ "niquests >= 3.14.0", "playwright >= 1.20.0", ] [project.entry-points."pytest11"] "web-poet" = "web_poet.testing.pytest" [project.urls] Homepage = "https://github.com/scrapinghub/web-poet" Documentation = "https://web-poet.readthedocs.io/en/stable/" [tool.hatch.build.targets.sdist] include = [ "/web_poet", "/tests", "/docs", "/CHANGELOG.rst", "/tox.ini", "/README.rst", ] [tool.bumpversion] commit = true tag = true tag_name = "{new_version}" [[tool.bumpversion.files]] filename = 'CHANGELOG.rst' search = "\\(unreleased\\)$" replace = "({now:%Y-%m-%d})" regex = true [[tool.bumpversion.files]] filename = "docs/conf.py" [tool.coverage.report] exclude_also = [ "if TYPE_CHECKING:", ] [tool.mypy] ignore_missing_imports = false implicit_reexport = false [[tool.mypy.overrides]] module = "tests.*" allow_untyped_defs = true allow_incomplete_defs = true [[tool.mypy.overrides]] module = "tests.po_lib_to_return.*" # Ignore mypy errors since the Page Objects contain arbitrary reference values # used for assertions which have varying types. This upsets mypy. 
ignore_errors = true [tool.pytest.ini_options] asyncio_mode = "strict" asyncio_default_fixture_loop_scope = "function" [tool.ruff] target-version = "py310" # until we migrate away from setup.py [tool.ruff.lint] extend-select = [ # flake8-bugbear "B", # flake8-comprehensions "C4", # pydocstyle "D", # flake8-future-annotations "FA", # flynt "FLY", # refurb "FURB", # isort "I", # flake8-implicit-str-concat "ISC", # flake8-logging "LOG", # Perflint "PERF", # pygrep-hooks "PGH", # flake8-pie "PIE", # pylint "PL", # flake8-pytest-style "PT", # flake8-use-pathlib "PTH", # flake8-pyi "PYI", # flake8-quotes "Q", # flake8-return "RET", # flake8-raise "RSE", # Ruff-specific rules "RUF", # flake8-bandit "S", # flake8-simplify "SIM", # flake8-slots "SLOT", # flake8-debugger "T10", # flake8-type-checking "TC", # pyupgrade "UP", # pycodestyle warnings "W", # flake8-2020 "YTT", ] ignore = [ # Missing docstring in public module "D100", # Missing docstring in public class "D101", # Missing docstring in public function "D103", # Missing docstring in public package "D104", # Missing docstring in magic method "D105", # Missing docstring in __init__ "D107", # One-line docstring should fit on one line with quotes "D200", # No blank lines allowed after function docstring "D202", # 1 blank line required between summary line and description "D205", # Multi-line docstring closing quotes should be on a separate line "D209", # First line should end with a period "D400", # First line should be in imperative mood; try rephrasing "D401", # First line should not be the function's "signature" "D402", # Too many return statements "PLR0911", # Too many branches "PLR0912", # Too many arguments in function definition "PLR0913", # Too many statements "PLR0915", # Magic value used in comparison "PLR2004", # Mutable class attributes should be annotated with `typing.ClassVar` "RUF012", # Use of `assert` detected "S101", # Move application import {} into a type-checking block # (The fix can break 
singledispatch() and/or andi.) "TC001", # Add `from __future__ import annotations` to simplify # (The fix can break singledispatch() and/or andi.) "FA100", ] [tool.ruff.lint.per-file-ignores] # F401: Ignore "imported but unused" errors in __init__ files, as those # imports are there to expose submodule functions so they can be imported # directly from that module # F403: Ignore * imports in these files # D102: Missing docstring in public method # S: flake8-bandit # B018: Found useless expression. "web_poet/__init__.py" = ["F401", "F403"] "web_poet/exceptions/__init__.py" = ["F401", "F403"] "web_poet/page_inputs/__init__.py" = ["F401", "F403"] "web_poet/serialization/__init__.py" = ["F401", "F403"] "web_poet/testing/__init__.py" = ["F401", "F403"] "web_poet/testing/pytest.py" = ["D102"] "tests/**" = ["B018", "D102", "S"] # Documentation examples may miss docstrings and include undefined, example # names. "docs/**" = ["D102", "F821"] [tool.ruff.lint.pydocstyle] convention = "pep257" [tool.sphinx-scrapy] # Keep in sync with .github/workflows/tests-ubuntu.yml. 
python-version = "3.13" scrapinghub-web-poet-ba87b95/requirements-dev.txt000066400000000000000000000000221517167256700222350ustar00rootroot00000000000000pre-commit pytest scrapinghub-web-poet-ba87b95/tests/000077500000000000000000000000001517167256700173455ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/tests/__init__.py000066400000000000000000000000001517167256700214440ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/tests/conftest.py000066400000000000000000000021051517167256700215420ustar00rootroot00000000000000from __future__ import annotations from pathlib import Path import pytest from web_poet.page_inputs import HttpResponse, HttpResponseBody from web_poet.rules import RulesRegistry pytest_plugins = ["pytester"] def read_fixture(path: str) -> str: return (Path(__file__).parent / path).read_text(encoding="utf-8") @pytest.fixture def book_list_html(): return read_fixture("fixtures/book_list.html") @pytest.fixture def some_json_response(): body = """ { "description": "paragraph", "website": { "url": "http://www.scrapy.org", "name": "homepage" }, "logo": "/images/logo.png" } """ return HttpResponse( url="http://books.toscrape.com/result.json", body=body.encode("utf-8"), encoding="utf-8", ) @pytest.fixture def book_list_html_response(book_list_html): body = HttpResponseBody(bytes(book_list_html, "utf-8")) return HttpResponse( url="http://books.toscrape.com/index.html", body=body, encoding="utf-8" ) @pytest.fixture def registry(): return RulesRegistry() scrapinghub-web-poet-ba87b95/tests/fixtures/000077500000000000000000000000001517167256700212165ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/tests/fixtures/book_list.html000066400000000000000000001165631517167256700241050ustar00rootroot00000000000000 All products | Books to Scrape - Sandbox
Books to Scrape We love being scraped!
1000 results - showing 1 to 20.
  1. Olio

    Olio

    £23.88

    In stock

scrapinghub-web-poet-ba87b95/tests/po_lib/000077500000000000000000000000001517167256700206115ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/tests/po_lib/__init__.py000066400000000000000000000024341517167256700227250ustar00rootroot00000000000000""" This package is just for overrides testing purposes. """ from __future__ import annotations from typing import Any from url_matcher import Patterns from web_poet import ItemPage, handle_urls # NOTE: this module contains a PO with @handle_rules from .. import po_lib_sub # noqa: F401 class POBase(ItemPage): expected_instead_of: type[ItemPage] | list[type[ItemPage]] expected_patterns: Patterns | list[Patterns] expected_to_return: Any = None expected_meta: dict[str, Any] | list[dict[str, Any]] class POTopLevelOverriden1(ItemPage): ... class POTopLevelOverriden2(ItemPage): ... @handle_urls("example.com", instead_of=POTopLevelOverriden1) @handle_urls( "example.com", instead_of=POTopLevelOverriden1, exclude="/*.jpg|", priority=300 ) class POTopLevel1(POBase): expected_instead_of = [POTopLevelOverriden1, POTopLevelOverriden1] expected_patterns = [ Patterns(["example.com"], ["/*.jpg|"], priority=300), Patterns(["example.com"]), ] expected_to_return = [None, None] expected_meta = [{}, {}] @handle_urls("example.com", instead_of=POTopLevelOverriden2) class POTopLevel2(POBase): expected_instead_of = POTopLevelOverriden2 expected_patterns = Patterns(["example.com"]) expected_to_return = None expected_meta = {} scrapinghub-web-poet-ba87b95/tests/po_lib/a_module.py000066400000000000000000000006401517167256700227500ustar00rootroot00000000000000from url_matcher import Patterns from tests.po_lib import POBase from web_poet import ItemPage, handle_urls class POModuleOverriden(ItemPage): ... 
@handle_urls("example.com", instead_of=POModuleOverriden, extra_arg="foo") class POModule(POBase): expected_instead_of = POModuleOverriden expected_patterns = Patterns(["example.com"]) expected_to_return = None expected_meta = {"extra_arg": "foo"} scrapinghub-web-poet-ba87b95/tests/po_lib/an_empty_module.py000066400000000000000000000000001517167256700243320ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/tests/po_lib/an_empty_package/000077500000000000000000000000001517167256700241005ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/tests/po_lib/an_empty_package/__init__.py000066400000000000000000000000001517167256700261770ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/tests/po_lib/nested_package/000077500000000000000000000000001517167256700235465ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/tests/po_lib/nested_package/__init__.py000066400000000000000000000007421517167256700256620ustar00rootroot00000000000000from url_matcher import Patterns from tests.po_lib import POBase from web_poet import ItemPage, handle_urls class PONestedPkgOverriden(ItemPage): ... @handle_urls( include=["example.com", "example.org"], exclude=["/*.jpg|"], instead_of=PONestedPkgOverriden, ) class PONestedPkg(POBase): expected_instead_of = PONestedPkgOverriden expected_patterns = Patterns(["example.com", "example.org"], ["/*.jpg|"]) expected_to_return = None expected_meta = {} scrapinghub-web-poet-ba87b95/tests/po_lib/nested_package/a_nested_module.py000066400000000000000000000010141517167256700272430ustar00rootroot00000000000000from url_matcher import Patterns from tests.po_lib import POBase from web_poet import ItemPage, handle_urls class PONestedModuleOverriden(ItemPage): ... 
@handle_urls( include=["example.com", "example.org"], exclude=["/*.jpg|"], instead_of=PONestedModuleOverriden, ) class PONestedModule(POBase): expected_instead_of = PONestedModuleOverriden expected_patterns = Patterns( include=["example.com", "example.org"], exclude=["/*.jpg|"] ) expected_to_return = None expected_meta = {} scrapinghub-web-poet-ba87b95/tests/po_lib_sub/000077500000000000000000000000001517167256700214625ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/tests/po_lib_sub/__init__.py000066400000000000000000000012041517167256700235700ustar00rootroot00000000000000"""This package is being used by tests/po_lib to validate some behaviors on external depedencies. """ from __future__ import annotations from typing import Any from url_matcher import Patterns from web_poet import ItemPage, handle_urls class POBase(ItemPage): expected_instead_of: type[ItemPage] expected_patterns: Patterns expected_meta: dict[str, Any] class POLibSubOverriden(ItemPage): ... @handle_urls("sub.example", instead_of=POLibSubOverriden) class POLibSub(POBase): expected_instead_of = POLibSubOverriden expected_patterns = Patterns(["sub.example"]) expected_to_return = None expected_meta = {} scrapinghub-web-poet-ba87b95/tests/po_lib_to_return/000077500000000000000000000000001517167256700227125ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/tests/po_lib_to_return/__init__.py000066400000000000000000000116351517167256700250310ustar00rootroot00000000000000import attrs from url_matcher import Patterns from web_poet import Injectable, ItemPage, Returns, field, handle_urls @attrs.define class Product: name: str price: float @attrs.define class ProductSeparate: name: str price: float @attrs.define class ProductSimilar: name: str price: float @attrs.define class ProductMoreFields(Product): brand: str @attrs.define class ProductFewerFields: name: str @handle_urls("example.com") class SomePage(ItemPage): """A PO which is only marked by the URL pattern.""" expected_instead_of = None 
expected_patterns = Patterns(["example.com"]) expected_to_return = None expected_meta = {} @field def name(self) -> str: return "some name" @handle_urls("example.com") class ProductPage(ItemPage[Product]): """A base PO to populate the Product item's fields.""" expected_instead_of = None expected_patterns = Patterns(["example.com"]) expected_to_return = Product expected_meta = {} @field def name(self) -> str: return "name" @field def price(self) -> float: return 12.99 @handle_urls("example.com", instead_of=ProductPage) class ImprovedProductPage(ProductPage): """A custom PO inheriting from a base PO which alters some field values.""" expected_instead_of = ProductPage expected_patterns = Patterns(["example.com"]) expected_to_return = Product expected_meta = {} @field def name(self) -> str: return "improved name" @handle_urls("example.com", instead_of=ProductPage) class SeparateProductPage(ItemPage[ProductSeparate]): """Same case as with ``ImprovedProductPage`` but it doesn't inherit from ``ProductPage``. """ expected_instead_of = ProductPage expected_patterns = Patterns(["example.com"]) expected_to_return = ProductSeparate expected_meta = {} @field def name(self) -> str: return "separate name" @handle_urls("example.com", instead_of=ProductPage) class SimilarProductPage(ProductPage, Returns[ProductSimilar]): """A custom PO inheriting from a base PO returning the same fields but in a different item class. """ expected_instead_of = ProductPage expected_patterns = Patterns(["example.com"]) expected_to_return = ProductSimilar expected_meta = {} @handle_urls("example.com", instead_of=ProductPage) class MoreProductPage(ProductPage, Returns[ProductMoreFields]): """A custom PO inheriting from a base PO returning more items using a different item class. 
""" expected_instead_of = ProductPage expected_patterns = Patterns(["example.com"]) expected_to_return = ProductMoreFields expected_meta = {} @field def brand(self) -> str: return "brand" @handle_urls("example.com", instead_of=ProductPage) class LessProductPage( ProductPage, Returns[ProductFewerFields], skip_nonitem_fields=True ): """A custom PO inheriting from a base PO returning less items using a different item class. """ expected_instead_of = ProductPage expected_patterns = Patterns(["example.com"]) expected_to_return = ProductFewerFields expected_meta = {} @field def brand(self) -> str: return "brand" @handle_urls("example.com", instead_of=ProductPage, to_return=ProductSimilar) class CustomProductPage(ProductPage, Returns[Product]): """A custom PO inheriting from a base PO returning the same fields but in a different item class. This PO is the same with ``SimilarProductPage`` but passes a ``to_return`` in the ``@handle_urls`` decorator. This tests the case that the type passed via the ``to_return`` parameter from ``@handle_urls`` takes priority. """ expected_instead_of = ProductPage expected_patterns = Patterns(["example.com"]) expected_to_return = ProductSimilar expected_meta = {} @handle_urls("example.com", instead_of=ProductPage, to_return=ProductSimilar) class CustomProductPageNoReturns(ProductPage): """Same case as with ``CustomProductPage`` but doesn't inherit from ``Returns[Product]``. """ expected_instead_of = ProductPage expected_patterns = Patterns(["example.com"]) expected_to_return = ProductSimilar expected_meta = {} @handle_urls("example.com", to_return=Product) class CustomProductPageDataTypeOnly(Injectable): """A PO that doesn't inherit from ``ItemPage`` and ``WebPage`` which means it doesn't inherit from the ``Returns`` class. This tests the case that the ``to_return`` parameter in ``@handle_urls`` should properly use it in the rules. 
""" expected_instead_of = None expected_patterns = Patterns(["example.com"]) expected_to_return = Product expected_meta = {} @property def name(self) -> str: return "name" @property def price(self) -> float: return 12.99 async def to_item(self) -> Product: return Product(name=self.name, price=self.price) scrapinghub-web-poet-ba87b95/tests/test_annotations.py000066400000000000000000000006061517167256700233150ustar00rootroot00000000000000from typing import get_type_hints import pytest import web_poet EXPORTED = [getattr(web_poet, t) for t in dir(web_poet) if not t.startswith("_")] EXPORTED_TYPES = [t for t in EXPORTED if isinstance(t, type)] @pytest.mark.parametrize("t", EXPORTED_TYPES) def test_get_type_hints(t: type) -> None: """Test that get_type_hints() works for all exported types.""" get_type_hints(t) scrapinghub-web-poet-ba87b95/tests/test_exceptions.py000066400000000000000000000022061517167256700231370ustar00rootroot00000000000000import pytest from web_poet.exceptions import HttpError, HttpRequestError, HttpResponseError from web_poet.page_inputs import HttpRequest, HttpResponse URL = "https://example.com" def test_http_error_init() -> None: exc = HttpError() assert exc.request is None assert exc.args request = HttpRequest(URL) exc = HttpError(request=request) assert exc.request == request def test_http_request_error_init() -> None: exc = HttpRequestError() assert exc.request is None assert exc.args request = HttpRequest(URL) exc = HttpRequestError(request=request) assert exc.request == request response = HttpResponse(URL, b"") with pytest.raises(TypeError): HttpRequestError(request=request, response=response) # type: ignore[call-arg] def test_http_response_error_init() -> None: exc = HttpResponseError() assert exc.request is None assert exc.response is None assert exc.args request = HttpRequest(URL) response = HttpResponse(URL, b"") exc = HttpResponseError(request=request, response=response) assert exc.request == request assert exc.response == response 
scrapinghub-web-poet-ba87b95/tests/test_extras.py000066400000000000000000000002441517167256700222640ustar00rootroot00000000000000import pytest def test_framework(): with pytest.raises(ImportError, match="web-poet\\[framework\\]"): import web_poet.framework # noqa: F401,PLC0415 scrapinghub-web-poet-ba87b95/tests/test_fields.py000066400000000000000000000451011517167256700222250ustar00rootroot00000000000000from __future__ import annotations import asyncio import random from typing import TYPE_CHECKING import attrs import pytest from tests.po_lib_to_return import ( CustomProductPage, CustomProductPageDataTypeOnly, CustomProductPageNoReturns, ImprovedProductPage, LessProductPage, MoreProductPage, Product, ProductFewerFields, ProductMoreFields, ProductPage, ProductSimilar, SimilarProductPage, ) from web_poet import ( HttpResponse, ItemPage, WebPage, field, item_from_fields, item_from_fields_sync, ) from web_poet.fields import FieldsMixin, get_fields_dict if TYPE_CHECKING: from collections.abc import Callable @attrs.define class Item: name: str price: str @attrs.define class Page(ItemPage[Item]): response: HttpResponse @field def name(self): return self.response.css("title ::text").get() @field async def price(self): await asyncio.sleep(0.01) return "$123" @attrs.define class InvalidPage(ItemPage[Item]): response: HttpResponse @field def name(self): return self.response.css("title ::text").get() @field def unknown_attribute(self): return "foo" EXAMPLE_RESPONSE = HttpResponse( "http://example.com", body=b"Hello!", ) @pytest.mark.asyncio async def test_fields() -> None: page = Page(response=EXAMPLE_RESPONSE) assert page.name == "Hello!" assert await page.price == "$123" item = await page.to_item() assert isinstance(item, Item) assert item.name == "Hello!" 
assert item.price == "$123" @pytest.mark.asyncio async def test_fields_invalid_page() -> None: page = InvalidPage(response=EXAMPLE_RESPONSE) with pytest.raises( TypeError, match="unexpected keyword argument 'unknown_attribute'" ): await page.to_item() def test_item_from_fields_sync() -> None: @attrs.define class Page(ItemPage): @field def name(self): return "name" def to_item(self): return item_from_fields_sync(self, dict) page = Page() assert page.to_item() == {"name": "name"} def test_field_non_callable() -> None: with pytest.raises(TypeError): @attrs.define class Page(ItemPage): # https://github.com/python/mypy/issues/1362#issuecomment-438246775 @field # type: ignore[prop-decorator] @property def name(self): return "name" def to_item(self): return item_from_fields_sync(self, dict) def test_field_classmethod() -> None: with pytest.raises(TypeError): @attrs.define class Page(ItemPage): @field @classmethod def name(cls): return "name" def to_item(self): return item_from_fields_sync(self, dict) @pytest.mark.asyncio async def test_field_order() -> None: class DictItemPage(Page): async def to_item(self): return await item_from_fields(self) page = DictItemPage(response=EXAMPLE_RESPONSE) item = await page.to_item() assert item == {"name": "Hello!", "price": "$123"} assert list(item.keys()) == ["name", "price"] def test_field_decorator_no_arguments() -> None: class Page(ItemPage): @field() def name(self): return "Name" def to_item(self): return item_from_fields_sync(self) page = Page() assert page.to_item() == {"name": "Name"} def test_field_preserves_docstrings() -> None: class Page(ItemPage): @field def foo(self): """Foo docs""" return "foo" @field() def bar(self): """Bar docs""" return "bar" assert Page.foo.__doc__ == "Foo docs" assert Page.bar.__doc__ == "Bar docs" page = Page() assert page.foo == "foo" assert page.bar == "bar" def test_field_cache_sync() -> None: class Page(ItemPage): _n_called_1 = 0 _n_called_2 = 0 def __init__(self, name): self.name = name 
@field(cached=True) def n_called_1(self): self._n_called_1 += 1 return self._n_called_1, self.name @field(cached=False) def n_called_2(self): self._n_called_2 += 1 return self._n_called_2, self.name pages = [Page("first"), Page("second")] for page in pages: assert page.n_called_1 == (1, page.name) assert page.n_called_1 == (1, page.name) assert page.n_called_2 == (1, page.name) assert page.n_called_2 == (2, page.name) @pytest.mark.asyncio async def test_field_cache_async() -> None: class Page(ItemPage): _n_called_1 = 0 _n_called_2 = 0 def __init__(self, name): self.name = name @field(cached=True) async def n_called_1(self): self._n_called_1 += 1 return self._n_called_1, self.name @field(cached=False) async def n_called_2(self): self._n_called_2 += 1 return self._n_called_2, self.name pages = [Page("first"), Page("second")] for page in pages: assert await page.n_called_1 == (1, page.name) assert await page.n_called_1 == (1, page.name) assert await page.n_called_2 == (1, page.name) assert await page.n_called_2 == (2, page.name) @pytest.mark.asyncio async def test_field_cache_async_locked() -> None: class Page(ItemPage): _n_called = 0 @field(cached=True) async def n_called(self): await asyncio.sleep(random.randint(0, 10) / 100.0) self._n_called += 1 return self._n_called page = Page() results = await asyncio.gather( page.n_called, page.n_called, page.n_called, page.n_called, page.n_called, ) assert results == [1, 1, 1, 1, 1] @pytest.mark.asyncio async def test_skip_nonitem_fields_async() -> None: class ExtendedPage(Page): @field def new_attribute(self): return "foo" page = ExtendedPage(response=EXAMPLE_RESPONSE) with pytest.raises(TypeError, match="unexpected keyword argument 'new_attribute'"): await page.to_item() class ExtendedPage2(ExtendedPage): async def to_item(self) -> Item: return await item_from_fields(self, Item, skip_nonitem_fields=True) page = ExtendedPage2(response=EXAMPLE_RESPONSE) item = await page.to_item() assert item == Item(name="Hello!", 
price="$123") def test_skip_nonitem_fields() -> None: @attrs.define class SyncPage(ItemPage): response: HttpResponse @field def name(self): return self.response.css("title ::text").get() @field def price(self): return "$123" def to_item(self) -> Item: # type: ignore[override] return item_from_fields_sync(self, Item) class ExtendedPage(SyncPage): @field def new_attribute(self): return "foo" page = ExtendedPage(response=EXAMPLE_RESPONSE) with pytest.raises(TypeError, match="unexpected keyword argument 'new_attribute'"): page.to_item() class ExtendedPage2(ExtendedPage): def to_item(self) -> Item: # type: ignore[override] return item_from_fields_sync(self, Item, skip_nonitem_fields=True) page = ExtendedPage2(response=EXAMPLE_RESPONSE) item = page.to_item() assert item == Item(name="Hello!", price="$123") def test_field_meta() -> None: class MyPage(ItemPage): @field(meta={"good": True}) def field1(self): return "foo" @field def field2(self): return "foo" def to_item(self): return item_from_fields_sync(self) page = MyPage() for fields in [get_fields_dict(MyPage), get_fields_dict(page)]: assert list(fields.keys()) == ["field1", "field2"] assert fields["field1"].name == "field1" assert fields["field1"].meta == {"good": True} assert fields["field2"].name == "field2" assert fields["field2"].meta is None def test_field_subclassing() -> None: class Page(ItemPage): @field def field1(self): return 1 @field def field3(self): return 1 assert list(get_fields_dict(Page)) == ["field1", "field3"] assert get_fields_dict(Page)["field3"].meta is None class Page2(Page): @field def field2(self): return 1 @field(meta={"foo": "bar"}) def field3(self): return 1 assert get_fields_dict(Page2)["field3"].meta == {"foo": "bar"} assert list(get_fields_dict(Page2)) == ["field1", "field3", "field2"] assert get_fields_dict(Page)["field3"].meta is None assert list(get_fields_dict(Page)) == ["field1", "field3"] class Page3(Page2): @field def field3(self): return 2 assert 
get_fields_dict(Page3)["field3"].meta is None assert list(get_fields_dict(Page3)) == ["field1", "field3", "field2"] assert get_fields_dict(Page)["field3"].meta is None assert list(get_fields_dict(Page)) == ["field1", "field3"] assert get_fields_dict(Page2)["field3"].meta == {"foo": "bar"} assert list(get_fields_dict(Page2)) == ["field1", "field3", "field2"] def test_field_subclassing_super() -> None: class Page(ItemPage): @field def field1(self): return 1 class Page2(Page): @field def field1(self): return super().field1 + 1 page = Page() assert page.field1 == 1 page2 = Page2() assert page2.field1 == 2 def test_field_subclassing_from_to_item() -> None: # to_item() should be the same since it was not overridden from the # subclass. class PageToItem(ItemPage): def to_item(self): return {"field1": 1, "field2": 2, "field3": 3, "field4": 4} class Page1(PageToItem): @field def field1(self): return 0 page_1 = Page1() assert page_1.field1 == 0 assert page_1.to_item() == {"field1": 1, "field2": 2, "field3": 3, "field4": 4} # to_item() only reflects the field that was decorated. class Page2(PageToItem): @field def field2(self): return 0 def to_item(self): return item_from_fields_sync(self) page_2 = Page2() assert page_2.field2 == 0 assert page_2.to_item() == {"field2": 0} # to_item() raises an error if there are some required fields from the item_cls # that doesn't have a corresponding field value. 
@attrs.define class SomeItem: field1: int field2: int field3: int field4: int class Page3(PageToItem): @field def field3(self): return 0 def to_item(self): return item_from_fields_sync(self, item_cls=SomeItem) page_3 = Page3() assert page_3.field3 == 0 with pytest.raises(TypeError): page_3.to_item() def test_field_with_other_decorators() -> None: def clean_str(method): def wrapper(*args, **kwargs): return method(*args, **kwargs).strip() return wrapper class MyPage(ItemPage): @field @clean_str def field_foo(self): return " foo \n" @field(meta={"good": True}) @clean_str def field_foo_meta(self): return " foo \n" @field(cached=True) @clean_str def field_foo_cached(self): return " foo \n" page = MyPage() assert page.field_foo == "foo" assert page.field_foo_meta == "foo" assert page.field_foo_cached == "foo" @pytest.mark.asyncio async def test_field_with_handle_urls() -> None: page = ProductPage() assert page.name == "name" assert page.price == 12.99 assert await page.to_item() == Product(name="name", price=12.99) page = ImprovedProductPage() assert page.name == "improved name" assert page.price == 12.99 assert await page.to_item() == Product(name="improved name", price=12.99) page = SimilarProductPage() assert page.name == "name" assert page.price == 12.99 assert await page.to_item() == ProductSimilar(name="name", price=12.99) page = MoreProductPage() assert page.name == "name" assert page.price == 12.99 assert page.brand == "brand" assert await page.to_item() == ProductMoreFields( name="name", price=12.99, brand="brand" ) page = LessProductPage() assert page.name == "name" assert await page.to_item() == ProductFewerFields(name="name") for page in [ # type: ignore[assignment] CustomProductPage(), CustomProductPageNoReturns(), CustomProductPageDataTypeOnly(), ]: assert page.name == "name" assert page.price == 12.99 assert await page.to_item() == Product(name="name", price=12.99) def test_field_processors_sync() -> None: def proc1(s): return s + "x" @attrs.define class 
Page(ItemPage): @field(out=[str.strip, proc1]) def name(self): return " name\t " page = Page() assert page.name == "namex" @pytest.mark.asyncio async def test_field_processors_async() -> None: def proc1(s): return s + "x" @attrs.define class Page(ItemPage): @field(out=[str.strip, proc1]) async def name(self): return " name\t " page = Page() assert await page.name == "namex" def test_field_processors_inheritance() -> None: def proc1(s): return s + "x" class BasePage(ItemPage): @field(out=[str.strip, proc1]) def name(self): return " name\t " class Page(BasePage): @field(out=[str.strip]) def name(self): return " name\t " base_page = BasePage() assert base_page.name == "namex" page = Page() assert page.name == "name" def test_field_processors_page() -> None: def proc1(s, page): return page.prefix + s + "x" class Page(ItemPage): @field(out=[str.strip, proc1]) def name(self): return " name\t " @field def prefix(self): return "prefix: " page = Page() assert page.name == "prefix: namex" def test_field_processors_multiple_pages() -> None: def proc(value, page): return page.body + value class Page(WebPage): @field def body(self): return self.response.text @field(out=[proc]) def processed(self): return "suffix" page1 = Page(response=HttpResponse("https://example.com", b"page1")) page2 = Page(response=HttpResponse("https://example.com", b"page2")) assert page1.body == "page1" assert page1.processed == "page1suffix" assert page2.body == "page2" assert page2.processed == "page2suffix" def test_field_processors_circular() -> None: def proc1(s, page): return s + page.b def proc2(s, page): return s + page.a class Page(ItemPage): @field(out=[proc1]) def a(self): return "a" @field(out=[proc2]) def b(self): return "b" page = Page() with pytest.raises(RecursionError): page.a with pytest.raises(RecursionError): page.b def test_field_processors_default() -> None: @attrs.define class BasePage(ItemPage): class Processors: name = [str.strip] @field def name(self): return " name\t " class 
Page(BasePage): pass base_page = BasePage() assert base_page.name == "name" page = Page() assert page.name == "name" def test_field_processors_override() -> None: def proc1(s): return s + "x" class BasePage(ItemPage): class Processors: f1: list[Callable] = [str.strip] f2 = [str.strip] f3 = [str.strip] f4: list[Callable] = [str.strip] f5: list[Callable] = [str.strip] @field def f1(self): return " f1\t " @field(out=[]) def f2(self): return " f2\t " @field def f3(self): return " f3\t " @field def f4(self): return " f4\t " @field def f5(self): return " f5\t " class Page(BasePage): class Processors(BasePage.Processors): f1 = [proc1] f4 = [*BasePage.Processors.f4, proc1] @field(out=[*BasePage.Processors.f5, proc1]) def f5(self): return " f5\t " base_page = BasePage() assert base_page.f1 == "f1" assert base_page.f2 == " f2\t " assert base_page.f3 == "f3" assert base_page.f4 == "f4" assert base_page.f5 == "f5" page = Page() assert page.f1 == " f1\t x" assert page.f2 == " f2\t " assert page.f3 == "f3" assert page.f4 == "f4x" assert page.f5 == "f5x" def test_field_processors_super() -> None: class BasePage(ItemPage): class Processors: name = [str.strip] desc = [str.strip] @field def name(self): return "name " @field def desc(self): return "desc " class Page(BasePage): class Processors(BasePage.Processors): name: list[Callable] = [] @field def name(self): base_name = super().name return base_name + "2 " class Page2(Page): class Processors(Page.Processors): name: list[Callable] = [] desc: list[Callable] = [] @field def desc(self): base_desc = super().desc return base_desc + "2 " base_page = BasePage() assert base_page.name == "name" page = Page() assert page.name == "name 2 " page2 = Page2() assert page2.desc == "desc 2 " def test_field_processors_builtin() -> None: @attrs.define class Page(ItemPage): @field(out=[int]) def value(self): return "1" page = Page() assert page.value == 1 def test_field_mixin() -> None: class A(ItemPage): @field def a(self): return None class 
Mixin(FieldsMixin): @field def mixin(self): return None class B(Mixin, A): @field def b(self): return None class C(Mixin, A): @field def c(self): return None assert set(get_fields_dict(A)) == {"a"} assert set(get_fields_dict(B)) == {"a", "b", "mixin"} assert set(get_fields_dict(C)) == {"a", "c", "mixin"} scrapinghub-web-poet-ba87b95/tests/test_framework.py000066400000000000000000000723431517167256700227640ustar00rootroot00000000000000import logging import pytest pytest.importorskip("niquests") pytest.importorskip("playwright") from typing import Annotated import niquests import niquests.structures import pytest from attrs import define from web_poet import Injectable, ItemPage, field from web_poet.exceptions import HttpRequestError, HttpResponseError from web_poet.framework import Framework, _providers, playwright_engine from web_poet.framework._api import _normalize_request from web_poet.page_inputs import Stats from web_poet.page_inputs.browser import BrowserHtml, BrowserResponse from web_poet.page_inputs.client import HttpClient from web_poet.page_inputs.http import ( HttpRequest, HttpRequestBody, HttpRequestHeaders, HttpResponse, HttpResponseBody, HttpResponseHeaders, ) from web_poet.page_inputs.page_params import PageParams from web_poet.page_inputs.response import AnyResponse from web_poet.page_inputs.stats import DictStatCollector from web_poet.page_inputs.url import RequestUrl, ResponseUrl @define class SampleItem: foo: str SAMPLE_ITEM = SampleItem(foo="bar") class SampleItemPageStub: def to_item(self): return SAMPLE_ITEM def patch_aget( monkeypatch, *, response_url="https://b.example", status=200, content=b"", headers=None, ): class DummyResponse: def __init__(self): self.url = response_url self.status_code = status self.content = content self.headers = headers or {} state = {"calls": 0} async def fake_aget(_url, timeout=300): state["calls"] += 1 return DummyResponse() monkeypatch.setattr(niquests, "aget", fake_aget) return state def 
patch_async_playwright( monkeypatch, *, response_url="https://c.example", html="engine:{engine}", status=200, ): state = {"calls": 0, "launches": {}} class DummyGotoResponse: def __init__(self, status_code): self.status = status_code class DummyPage: def __init__(self, engine_name: str): self.url = "about:blank" self._engine = engine_name async def goto(self, _url): state["calls"] += 1 self.url = response_url if status is None: return None return DummyGotoResponse(status) async def content(self): try: return html.format(engine=self._engine) except Exception: return html class DummyBrowser: def __init__(self, engine_name: str): self._engine = engine_name async def new_page(self): return DummyPage(self._engine) async def close(self): return None class DummyEngine: def __init__(self, name: str): self.name = name async def launch(self): state["launches"].setdefault(self.name, 0) state["launches"][self.name] += 1 return DummyBrowser(self.name) class DummyPlaywright: chromium = DummyEngine("chromium") firefox = DummyEngine("firefox") webkit = DummyEngine("webkit") class DummyPlaywrightContext: async def __aenter__(self): return DummyPlaywright() async def __aexit__(self, exc_type, exc, tb): return False def fake_async_playwright(): return DummyPlaywrightContext() monkeypatch.setattr(_providers, "async_playwright", fake_async_playwright) return state @pytest.mark.asyncio async def test_async_to_item(): class Page(ItemPage[SampleItem]): @field async def foo(self): return "bar" framework = Framework() item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM @pytest.mark.asyncio async def test_sync_to_item(): class Page(ItemPage[SampleItem]): def to_item(self): return SAMPLE_ITEM framework = Framework() item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM @pytest.mark.asyncio async def test_get_item_page_cls(): class Page(ItemPage[SampleItem]): def to_item(self): return SAMPLE_ITEM framework = Framework() item = 
await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM @pytest.mark.asyncio async def test_get_item_item_cls(registry): @registry.handle_urls("https://a.example") class Page(ItemPage[SampleItem]): def to_item(self): return SAMPLE_ITEM framework = Framework(registry=registry) item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM @pytest.mark.asyncio async def test_get_item_no_page(registry): framework = Framework(registry=registry) with pytest.raises( ValueError, match=r"No page object class found for URL: https://a.example" ): await framework.get_item("https://a.example", SampleItem) @pytest.mark.asyncio async def test_http_client(monkeypatch): http_state = patch_aget(monkeypatch) browser_state = patch_async_playwright(monkeypatch) @define class Page(ItemPage[SampleItem]): http_client: HttpClient async def to_item(self): response = await self.http_client.get("https://a.example") assert response.status == 200 return SAMPLE_ITEM framework = Framework() item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM assert http_state["calls"] == 1 assert browser_state["calls"] == 0 @pytest.mark.asyncio async def test_http_client_allow_status(monkeypatch): http_state = patch_aget(monkeypatch, status=404) browser_state = patch_async_playwright(monkeypatch) @define class Page(ItemPage[SampleItem]): http_client: HttpClient async def to_item(self): # Default behavior: 404 raises HttpResponseError with pytest.raises(HttpResponseError): await self.http_client.get("https://a.example") # Allow 404 via allow_status resp = await self.http_client.get("https://a.example", allow_status=404) assert resp.status == 404 return SAMPLE_ITEM framework = Framework() item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM assert http_state["calls"] == 2 assert browser_state["calls"] == 0 @pytest.mark.asyncio async def test_page_params(monkeypatch): http_state = patch_aget(monkeypatch) 
browser_state = patch_async_playwright(monkeypatch) @define class Page(ItemPage[SampleItem]): page_params: PageParams async def to_item(self): return SampleItem(foo=self.page_params["foo"]) framework = Framework() item = await framework.get_item( "https://a.example", Page, page_params={"foo": "bar"} ) assert item == SAMPLE_ITEM assert http_state["calls"] == 0 assert browser_state["calls"] == 0 @pytest.mark.asyncio async def test_response_url(monkeypatch): http_state = patch_aget(monkeypatch) browser_state = patch_async_playwright(monkeypatch) @define class Page(ItemPage[SampleItem]): url: ResponseUrl async def to_item(self): assert str(self.url) == "https://b.example" return SAMPLE_ITEM framework = Framework() item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM assert http_state["calls"] == 1 assert browser_state["calls"] == 0 @pytest.mark.asyncio async def test_browser_response(monkeypatch): http_state = patch_aget(monkeypatch) browser_state = patch_async_playwright( monkeypatch, response_url="https://c.example", html="hello", status=201, ) @define class Page(ItemPage[SampleItem]): response: BrowserResponse async def to_item(self): assert isinstance(self.response, BrowserResponse) assert str(self.response.url) == "https://c.example" assert self.response.status == 201 assert self.response.text == "hello" return SAMPLE_ITEM framework = Framework() item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM assert browser_state["calls"] == 1 assert http_state["calls"] == 0 @pytest.mark.asyncio async def test_stats_no_collector_passed(): @define class Page(ItemPage[dict]): stats: Stats async def to_item(self): self.stats.set("a", "1") self.stats.inc("b") # return the underlying collector dict for assertion return self.stats._stats._stats framework = Framework() item = await framework.get_item("https://a.example", Page) assert item == {"a": "1", "b": 1} @pytest.mark.asyncio async def 
test_stats_with_collector_passed(): collector = DictStatCollector() @define class Page(ItemPage[SampleItem]): stats: Stats async def to_item(self): self.stats.set("latest", "ok") self.stats.inc("hits") return SAMPLE_ITEM framework = Framework(stats=collector) item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM assert collector.data == {"latest": "ok", "hits": 1} @pytest.mark.asyncio async def test_any_response_prefers_http(monkeypatch): http_state = patch_aget(monkeypatch, response_url="https://b.example") browser_state = patch_async_playwright(monkeypatch) @define class Page(ItemPage[SampleItem]): response: AnyResponse async def to_item(self): assert isinstance(self.response, AnyResponse) assert isinstance(self.response.response, HttpResponse) assert str(self.response.url) == "https://b.example" return SAMPLE_ITEM framework = Framework() item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM assert http_state["calls"] == 1 assert browser_state["calls"] == 0 @pytest.mark.asyncio async def test_any_response_uses_browser_when_browser_needed(monkeypatch): http_state = patch_aget(monkeypatch) browser_state = patch_async_playwright( monkeypatch, response_url="https://c.example", html="hello", status=201, ) @define class Page(ItemPage[SampleItem]): browser_response: BrowserResponse response: AnyResponse async def to_item(self): assert isinstance(self.response, AnyResponse) assert isinstance(self.response.response, BrowserResponse) assert str(self.response.url) == "https://c.example" return SAMPLE_ITEM framework = Framework() item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM assert browser_state["calls"] == 1 assert http_state["calls"] == 0 @pytest.mark.asyncio async def test_request_specific_browser_annotation(monkeypatch): http_state = patch_aget(monkeypatch) browser_state = patch_async_playwright(monkeypatch) @define class Page(ItemPage[SampleItem]): response: 
Annotated[BrowserResponse, playwright_engine("firefox")] async def to_item(self): return SAMPLE_ITEM framework = Framework() item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM assert http_state["calls"] == 0 assert browser_state["launches"] == {"firefox": 1} @pytest.mark.asyncio async def test_default_browser_param_override(monkeypatch): http_state = patch_aget(monkeypatch) browser_state = patch_async_playwright(monkeypatch) @define class Page(ItemPage[SampleItem]): response: BrowserResponse async def to_item(self): return SAMPLE_ITEM framework = Framework(default_playwright_engine="webkit") item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM assert http_state["calls"] == 0 assert browser_state["launches"] == {"webkit": 1} @pytest.mark.asyncio async def test_multiple_browser_responses_and_unannotated_choice(monkeypatch): http_state = patch_aget(monkeypatch) browser_state = patch_async_playwright(monkeypatch) @define class Page(ItemPage[SampleItem]): resp_a: Annotated[BrowserResponse, playwright_engine("firefox")] resp_b: Annotated[BrowserResponse, playwright_engine("chromium")] resp_c: BrowserResponse async def to_item(self): # When annotated deps include the default browser, unannotated deps # should use the default. 
assert isinstance(self.resp_a, BrowserResponse) assert "engine:firefox" in self.resp_a.text assert isinstance(self.resp_b, BrowserResponse) assert "engine:chromium" in self.resp_b.text assert isinstance(self.resp_c, BrowserResponse) assert "engine:chromium" in self.resp_c.text return SAMPLE_ITEM framework = Framework() item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM assert browser_state["launches"] == {"chromium": 1, "firefox": 1} assert http_state["calls"] == 0 # repeat with default override to firefox (unannotated should pick default) browser_state2 = patch_async_playwright(monkeypatch) @define class Page2(ItemPage[SampleItem]): resp_a: Annotated[BrowserResponse, playwright_engine("firefox")] resp_b: Annotated[BrowserResponse, playwright_engine("chromium")] resp_c: BrowserResponse async def to_item(self): # with default override to firefox, unannotated resp_c should use firefox assert isinstance(self.resp_a, BrowserResponse) assert "engine:firefox" in self.resp_a.text assert isinstance(self.resp_b, BrowserResponse) assert "engine:chromium" in self.resp_b.text assert isinstance(self.resp_c, BrowserResponse) assert "engine:firefox" in self.resp_c.text return SAMPLE_ITEM framework = Framework(default_playwright_engine="firefox") item = await framework.get_item("https://b.example", Page2) assert item == SAMPLE_ITEM assert browser_state2["launches"] == {"chromium": 1, "firefox": 1} assert http_state["calls"] == 0 # scenario: resp_a uses firefox and resp_b uses webkit => unannotated picks # alphabetical (firefox) browser_state3 = patch_async_playwright(monkeypatch) @define class Page3(ItemPage[SampleItem]): resp_a: Annotated[BrowserResponse, playwright_engine("firefox")] resp_b: Annotated[BrowserResponse, playwright_engine("webkit")] resp_c: BrowserResponse async def to_item(self): # resp_a (firefox), resp_b (webkit), resp_c should pick alphabetical (firefox) assert isinstance(self.resp_a, BrowserResponse) assert "engine:firefox" 
in self.resp_a.text assert isinstance(self.resp_b, BrowserResponse) assert "engine:webkit" in self.resp_b.text assert isinstance(self.resp_c, BrowserResponse) assert "engine:firefox" in self.resp_c.text return SAMPLE_ITEM framework = Framework() item = await framework.get_item("https://a.example/page3", Page3) assert item == SAMPLE_ITEM assert browser_state3["launches"] == {"firefox": 1, "webkit": 1} @pytest.mark.asyncio async def test_browser_html_annotation(monkeypatch): http_state = patch_aget(monkeypatch) browser_state = patch_async_playwright(monkeypatch) @define class Page(ItemPage[SampleItem]): html: Annotated[BrowserHtml, playwright_engine("firefox")] async def to_item(self): assert isinstance(self.html, BrowserHtml) return SAMPLE_ITEM framework = Framework() item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM assert browser_state["launches"] == {"firefox": 1} assert http_state["calls"] == 0 @pytest.mark.asyncio async def test_unsupported_browser_raises(monkeypatch): patch_aget(monkeypatch) patch_async_playwright(monkeypatch) @define class Page(ItemPage[SampleItem]): response: Annotated[BrowserResponse, playwright_engine("foo")] async def to_item(self): return SAMPLE_ITEM framework = Framework() with pytest.raises(ValueError, match=r"Playwright does not provide engine"): await framework.get_item("https://a.example", Page) @pytest.mark.asyncio async def test_browser_html_dependency(monkeypatch): http_state = patch_aget(monkeypatch) browser_state = patch_async_playwright( monkeypatch, response_url="https://c.example", html="hello", status=200, ) @define class Page(ItemPage[SampleItem]): html: BrowserHtml async def to_item(self): assert isinstance(self.html, BrowserHtml) assert str(self.html) == "hello" assert self.html.xpath("//body/text()").get("").strip() == "hello" return SAMPLE_ITEM framework = Framework() item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM assert browser_state["calls"] 
== 1 assert http_state["calls"] == 0 @pytest.mark.asyncio async def test_response_url_with_browser_response(monkeypatch): http_state = patch_aget(monkeypatch) browser_state = patch_async_playwright( monkeypatch, response_url="https://c.example" ) @define class Page(ItemPage[SampleItem]): response_url: ResponseUrl browser_response: BrowserResponse async def to_item(self): assert str(self.browser_response.url) == "https://c.example" assert str(self.response_url) == "https://c.example" return SAMPLE_ITEM item = await Framework().get_item("https://a.example", Page) assert item == SAMPLE_ITEM assert browser_state["calls"] == 1 assert http_state["calls"] == 0 @pytest.mark.asyncio async def test_http_request_body(): @define class Page(ItemPage[SampleItem]): body: HttpRequestBody async def to_item(self): assert isinstance(self.body, HttpRequestBody) assert bytes(self.body) == b"foo" return SAMPLE_ITEM request = HttpRequest(url="https://a.example", body=b"foo") framework = Framework() item = await framework.get_item(request, Page) assert item == SAMPLE_ITEM @pytest.mark.asyncio async def test_browser_non_get(): request = HttpRequest(url="https://a.example", method="POST") @define class Page(ItemPage[SampleItem]): response: BrowserResponse async def to_item(self): return SAMPLE_ITEM framework = Framework() with pytest.raises(HttpRequestError, match=r"plain GET"): await framework.get_item(request, Page) @pytest.mark.asyncio async def test_browser_request_headers(monkeypatch, caplog): # Patch Playwright and capture warnings patch_async_playwright(monkeypatch) caplog.set_level(logging.WARNING) request = HttpRequest(url="https://a.example", headers={"X-Foo": "bar"}) @define class Page(ItemPage[SampleItem]): response: BrowserResponse async def to_item(self): return SAMPLE_ITEM framework = Framework() item = await framework.get_item(request, Page) assert item == SAMPLE_ITEM # Warning should mention the header name and that headers are ignored assert "X-Foo" in caplog.text assert 
"ignoring headers" in caplog.text.lower() @pytest.mark.asyncio async def test_browser_request_body(): request = HttpRequest(url="https://a.example", body=b"foo") @define class Page(ItemPage[SampleItem]): response: BrowserResponse async def to_item(self): return SAMPLE_ITEM framework = Framework() with pytest.raises( HttpRequestError, match=r"does not support requests with a body" ): await framework.get_item(request, Page) @pytest.mark.asyncio async def test_http_request_headers(): @define class Page(ItemPage[SampleItem]): headers: HttpRequestHeaders async def to_item(self): assert isinstance(self.headers, HttpRequestHeaders) assert self.headers.get("x-foo") == "bar" return SAMPLE_ITEM request = HttpRequest(url="https://a.example", headers={"X-Foo": "bar"}) framework = Framework() item = await framework.get_item(request, Page) assert item == SAMPLE_ITEM @pytest.mark.asyncio async def test_http_request(): request = HttpRequest(url="https://a.example") @define class Page(ItemPage[SampleItem]): request: HttpRequest async def to_item(self): assert self.request is request return SAMPLE_ITEM framework = Framework() item = await framework.get_item(request, Page) assert item == SAMPLE_ITEM @pytest.mark.asyncio async def test_http_response_body(monkeypatch): state = patch_aget(monkeypatch, content=b"hello") @define class Page(ItemPage[SampleItem]): body: HttpResponseBody async def to_item(self): assert isinstance(self.body, HttpResponseBody) assert bytes(self.body) == b"hello" return SAMPLE_ITEM framework = Framework() item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM assert state["calls"] == 1 @pytest.mark.asyncio async def test_http_response_headers(monkeypatch): state = patch_aget(monkeypatch, headers={"X-Foo": "bar"}) @define class Page(ItemPage[SampleItem]): headers: HttpResponseHeaders async def to_item(self): assert isinstance(self.headers, HttpResponseHeaders) assert self.headers.get("x-foo") == "bar" return SAMPLE_ITEM framework = 
Framework() item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM assert state["calls"] == 1 @pytest.mark.asyncio async def test_request_url(monkeypatch): http_state = patch_aget(monkeypatch) browser_state = patch_async_playwright(monkeypatch) @define class Page(ItemPage[SampleItem]): request_url: RequestUrl async def to_item(self): assert str(self.request_url) == "https://a.example" return SAMPLE_ITEM framework = Framework() item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM assert http_state["calls"] == 0 assert browser_state["calls"] == 0 @pytest.mark.asyncio async def test_both_urls(monkeypatch): http_state = patch_aget(monkeypatch) browser_state = patch_async_playwright(monkeypatch) @define class Page(ItemPage[SampleItem]): request_url: RequestUrl response_url: ResponseUrl async def to_item(self): assert str(self.request_url) == "https://a.example" assert str(self.response_url) == "https://b.example" return SAMPLE_ITEM framework = Framework() item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM assert http_state["calls"] == 1 assert browser_state["calls"] == 0 @pytest.mark.asyncio async def test_http_and_browser_responses(monkeypatch): http_state = patch_aget(monkeypatch, response_url="https://b.example") browser_state = patch_async_playwright( monkeypatch, response_url="https://c.example" ) @define class Page(ItemPage[SampleItem]): http_response: HttpResponse browser_response: BrowserResponse response_url: ResponseUrl async def to_item(self): assert str(self.http_response.url) == "https://b.example" assert str(self.browser_response.url) == "https://c.example" assert str(self.response_url) == "https://c.example" return SAMPLE_ITEM framework = Framework() item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM assert http_state["calls"] == 1 assert browser_state["calls"] == 1 @pytest.mark.asyncio async def 
test_multiple_http_response_dependencies(monkeypatch): http_state = patch_aget(monkeypatch) browser_state = patch_async_playwright(monkeypatch) @define class Page2(Injectable): url: ResponseUrl @define class Page(ItemPage[SampleItem]): url: ResponseUrl response: HttpResponse page2: Page2 async def to_item(self): assert str(self.url) == "https://b.example" assert str(self.page2.url) == "https://b.example" assert str(self.response.url) == "https://b.example" return SAMPLE_ITEM framework = Framework() item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM assert http_state["calls"] == 1 assert browser_state["calls"] == 0 @pytest.mark.asyncio async def test_multiple_browser_response_dependencies(monkeypatch): http_state = patch_aget(monkeypatch) browser_state = patch_async_playwright( monkeypatch, response_url="https://c.example" ) @define class Page2(Injectable): url: ResponseUrl @define class Page(ItemPage[SampleItem]): url: ResponseUrl response: BrowserResponse page2: Page2 async def to_item(self): assert str(self.url) == "https://c.example" assert str(self.page2.url) == "https://c.example" assert str(self.response.url) == "https://c.example" return SAMPLE_ITEM framework = Framework() item = await framework.get_item("https://a.example", Page) assert item == SAMPLE_ITEM assert browser_state["calls"] == 1 assert http_state["calls"] == 0 @pytest.mark.parametrize( "input_value", [ HttpRequest(url="https://a.example"), RequestUrl("https://a.example"), ResponseUrl("https://a.example"), "https://a.example", ], ) def test_normalize_request(input_value): result = _normalize_request(input_value) assert isinstance(result, HttpRequest) assert str(result.url) == "https://a.example" assert isinstance(result.url, RequestUrl) if isinstance(input_value, HttpRequest): assert result is input_value def test_get_http_response_from_nirequests_response(): niquests_response = niquests.Response() niquests_response.url = "https://a.example" 
niquests_response.status_code = 200 niquests_response._content = b"foo" niquests_response.headers = niquests.structures.CaseInsensitiveDict( [ ("User-Agent", "mozilla"), # Niquests response headers never contain multiple headers with the # same name, their values are merged with commas: # https://niquests.readthedocs.io/en/latest/user/quickstart.html#response-headers ("X-Multi", "a, b"), ] ) request = HttpRequest(url="https://a.example") http_response = _providers._get_http_response_from_nirequests_response( request, niquests_response ) assert isinstance(http_response, HttpResponse) assert str(http_response.url) == "https://a.example" assert http_response.status == 200 assert isinstance(http_response.body, HttpResponseBody) assert bytes(http_response.body) == b"foo" assert isinstance(http_response.headers, HttpResponseHeaders) assert http_response.headers.get("user-agent") == "mozilla" assert http_response.headers.get("x-multi") == "a, b" @pytest.mark.asyncio async def test_nirequests_exceptions_are_wrapped(monkeypatch): async def fake_aget(_url, timeout=300): raise RuntimeError("niquests boom") monkeypatch.setattr(niquests, "aget", fake_aget) @define class Page(ItemPage[SampleItem]): response: HttpResponse async def to_item(self): return SAMPLE_ITEM with pytest.raises(HttpRequestError) as exc: await Framework().get_item("https://a.example", Page) assert isinstance(exc.value, HttpRequestError) assert isinstance(exc.value.request, HttpRequest) assert str(exc.value.request.url) == "https://a.example" assert "niquests boom" in str(exc.value) @pytest.mark.asyncio async def test_playwright_exceptions_are_wrapped(monkeypatch): class BadEngine: async def launch(self): raise RuntimeError("playwright boom") class DummyPlaywright: chromium = BadEngine() firefox = BadEngine() webkit = BadEngine() class DummyContext: async def __aenter__(self): return DummyPlaywright() async def __aexit__(self, exc_type, exc, tb): return False def fake_async_playwright(): return DummyContext() 
monkeypatch.setattr(_providers, "async_playwright", fake_async_playwright) @define class Page(ItemPage[SampleItem]): response: BrowserResponse async def to_item(self): return SAMPLE_ITEM with pytest.raises(HttpRequestError) as exc: await Framework().get_item("https://a.example", Page) assert isinstance(exc.value.request, HttpRequest) assert str(exc.value.request.url) == "https://a.example" assert "playwright boom" in str(exc.value) scrapinghub-web-poet-ba87b95/tests/test_input_validation.py000066400000000000000000000137671517167256700243450ustar00rootroot00000000000000"""Test page object input validation scenarios.""" from __future__ import annotations import attrs import pytest from web_poet import ItemPage, Returns, field, validates_input from web_poet.exceptions import Retry, UseFallback @attrs.define class Item: a: str is_valid: bool = True EXPECTED_ITEM = Item(a="a", is_valid=True) class BasePage(ItemPage[Item]): @field def a(self): return "a" # Valid input class BaseValidInputPage(BasePage): def validate_input(self): pass def test_valid_input_sync_to_item(): class Page(BaseValidInputPage): def to_item(self): return Item(a=self.a) assert Page().to_item() == EXPECTED_ITEM @pytest.mark.asyncio async def test_valid_input_async_to_item(): assert await BaseValidInputPage().to_item() == EXPECTED_ITEM def test_valid_input_sync_field(): assert BaseValidInputPage().a == "a" @pytest.mark.asyncio async def test_valid_input_async_field(): class Page(BaseValidInputPage): @field async def a(self): return "a" assert await Page().a == "a" # Retry class BaseRetryPage(BasePage): def validate_input(self): raise Retry def test_retry_sync_to_item(): class Page(BaseRetryPage): def to_item(self): return Item(a=self.a) page = Page() with pytest.raises(Retry): page.to_item() @pytest.mark.asyncio async def test_retry_async_to_item(): page = BaseRetryPage() with pytest.raises(Retry): await page.to_item() def test_retry_sync_field(): page = BaseRetryPage() with pytest.raises(Retry): 
page.a @pytest.mark.asyncio async def test_retry_async_field(): class Page(BaseRetryPage): @field async def a(self): return "a" page = Page() with pytest.raises(Retry): await page.a # Use fallback class BaseUseFallbackPage(BasePage): def validate_input(self): if self.a is None: raise UseFallback @field def a(self): return None def test_use_fallback_sync_to_item(): class Page(BaseUseFallbackPage): def to_item(self): return Item(a=self.a) page = Page() with pytest.raises(UseFallback): page.to_item() @pytest.mark.asyncio async def test_use_fallback_async_to_item(): page = BaseUseFallbackPage() with pytest.raises(UseFallback): await page.to_item() def test_use_fallback_sync_field(): page = BaseUseFallbackPage() with pytest.raises(UseFallback): page.a @pytest.mark.asyncio async def test_use_fallback_async_field(): class Page(BaseUseFallbackPage): def validate_input(self): # Cannot use async self.a raise UseFallback @field async def a(self): return "a" page = Page() with pytest.raises(UseFallback): await page.a # Invalid input INVALID_ITEM = Item(a="invalid", is_valid=False) class BaseInvalidInputPage(ItemPage[Item]): def validate_input(self): return INVALID_ITEM @field def a(self): raise RuntimeError("This exception should never be raised") def test_invalid_input_sync_to_item(): class Page(BaseInvalidInputPage): @validates_input def to_item(self): return Item(a=self.a) assert Page().to_item() == INVALID_ITEM @pytest.mark.asyncio async def test_invalid_input_async_to_item(): assert await BaseInvalidInputPage().to_item() == INVALID_ITEM def test_invalid_input_sync_field(): assert BaseInvalidInputPage().a == "invalid" @pytest.mark.asyncio async def test_invalid_input_async_field(): class Page(BaseInvalidInputPage): @field async def a(self): raise RuntimeError("This exception should never be raised") assert await Page().a == "invalid" # Unvalidated input def test_unvalidated_input_sync_to_item(): class Page(BasePage): def to_item(self): return Item(a=self.a) assert 
Page().to_item() == EXPECTED_ITEM @pytest.mark.asyncio async def test_unvalidated_input_async_to_item(): assert await BasePage().to_item() == EXPECTED_ITEM def test_unvalidated_input_sync_field(): assert BasePage().a == "a" @pytest.mark.asyncio async def test_unvalidated_input_async_field(): class Page(BasePage): @field async def a(self): return "a" assert await Page().a == "a" # Caching class BaseCachingPage(BasePage): _raise = False def validate_input(self): if self._raise: raise UseFallback self._raise = True def test_invalid_input_sync_to_item_caching(): class Page(BaseCachingPage): def to_item(self): return Item(a=self.a) page = Page() page.to_item() page.to_item() @pytest.mark.asyncio async def test_invalid_input_async_to_item_caching(): page = BaseCachingPage() await page.to_item() await page.to_item() def test_invalid_input_sync_field_caching(): page = BaseCachingPage() page.a page.a @pytest.mark.asyncio async def test_invalid_input_async_field_caching(): class Page(BaseCachingPage): @field async def a(self): return "a" page = Page() await page.a await page.a @pytest.mark.asyncio async def test_invalid_input_cross_api_caching(): @attrs.define class _Item(Item): b: str | None = None class Page(BaseCachingPage, Returns[_Item]): @field async def b(self): return "b" page = Page() page.a await page.b await page.to_item() # Recursion @pytest.mark.asyncio async def test_recursion(): """Make sure that using fields within the validate_input method does not result in a recursive call to the validate_input method.""" class Page(BasePage): _raise = False def validate_input(self): if self._raise: raise UseFallback self._raise = True assert self.a == "a" page = Page() assert page.a == "a" scrapinghub-web-poet-ba87b95/tests/test_mixins.py000066400000000000000000000045661517167256700223000ustar00rootroot00000000000000import parsel import pytest from packaging import version from web_poet.mixins import ResponseShortcutsMixin from web_poet.page_inputs import HttpResponse 
# The jmespath tests below are gated on the installed parsel version: per the
# skip reasons, parsel < 1.8 does not support jmespath. A parsel build without
# ``__version__`` is treated as "0.0", i.e. as lacking jmespath.
PARSEL_VERSION = version.parse(getattr(parsel, "__version__", "0.0"))
PARSEL_18_PLUS = PARSEL_VERSION >= version.parse("1.8.0")  # noqa: SIM300


class MyPage(ResponseShortcutsMixin):
    """Minimal page object used to exercise ``ResponseShortcutsMixin``.

    It only stores the response it is given; every shortcut tested below
    (``url``, ``html``, ``xpath``, ``css``, ``jmespath``, ``base_url``,
    ``urljoin``) comes from the mixin.
    """

    def __init__(self, response: HttpResponse):
        self.response = response


@pytest.fixture
def my_page(book_list_html_response):
    """Return a ``MyPage`` built from the ``book_list_html_response`` fixture.

    NOTE(review): that fixture is defined outside this module (presumably in
    conftest.py) -- confirm.
    """
    return MyPage(book_list_html_response)


@pytest.fixture
def my_json_page(some_json_response):
    """Return a ``MyPage`` built from the ``some_json_response`` fixture.

    NOTE(review): that fixture is defined outside this module (presumably in
    conftest.py) -- confirm.
    """
    return MyPage(some_json_response)


def test_url(my_page) -> None:
    """``url`` exposes the URL of the wrapped response."""
    assert my_page.url == "http://books.toscrape.com/index.html"


def test_html(my_page, book_list_html) -> None:
    """``html`` returns the response text unchanged."""
    assert my_page.html == book_list_html


def test_xpath(my_page) -> None:
    """``xpath`` runs an XPath query against the response."""
    title = my_page.xpath(".//title/text()").get().strip()
    assert title == "All products | Books to Scrape - Sandbox"


@pytest.mark.skipif(not PARSEL_18_PLUS, reason="parsel < 1.8 doesn't support jmespath")
def test_jmespath(my_json_page) -> None:
    """``jmespath`` works on both the page object and its response."""
    for obj in [my_json_page, my_json_page.response]:
        name = obj.jmespath("website.name").get()
        assert name == "homepage"


@pytest.mark.skipif(PARSEL_18_PLUS, reason="parsel >= 1.8 supports jmespath")
def test_jmespath_not_available(my_json_page) -> None:
    """With an older parsel, using ``jmespath`` raises ``AttributeError``."""
    for obj in [my_json_page, my_json_page.response]:
        with pytest.raises(AttributeError):
            obj.jmespath("website.name").get()


def test_css(my_page) -> None:
    """``css`` runs a CSS query against the response."""
    title = my_page.css("title::text").get().strip()
    assert title == "All products | Books to Scrape - Sandbox"


def test_baseurl(my_page) -> None:
    """For the book-list fixture, ``base_url`` equals the response URL."""
    assert my_page.base_url == "http://books.toscrape.com/index.html"


def test_urljoin(my_page) -> None:
    """``urljoin`` resolves a relative URL against the base URL."""
    assert my_page.urljoin("foo") == "http://books.toscrape.com/foo"


def test_custom_baseurl() -> None:
    """A ``<base href>`` in the document overrides the response URL as base.

    The body must actually contain the ``<base>`` element: the response URL
    is ``http://www.example.com/path``, so the ``base_url`` and ``urljoin``
    expectations below (on ``http://example.com/foo/``) can only hold if the
    base URL is taken from the markup.
    """
    body = b"""
    <html>
    <head>
        <base href="http://example.com/foo/">
    </head>
    <body></body>
    </html>
    """
    response = HttpResponse(
        url="http://www.example.com/path",
        body=body,
    )
    page = MyPage(response=response)

    # ``url`` still reports where the response came from ...
    assert page.url == "http://www.example.com/path"
    # ... while relative links are resolved against the declared base.
    assert page.base_url == "http://example.com/foo/"
    assert page.urljoin("bar") == "http://example.com/foo/bar"
    # Absolute URLs are returned as-is.
    assert page.urljoin("http://example.com/1") == "http://example.com/1"
scrapinghub-web-poet-ba87b95/tests/test_page_inputs.py000066400000000000000000000517721517167256700233100ustar00rootroot00000000000000import codecs import json import aiohttp.web_response import parsel import pytest import requests from web_poet import BrowserResponse, RequestUrl, ResponseUrl from web_poet.page_inputs import ( AnyResponse, BrowserHtml, HttpRequest, HttpRequestBody, HttpRequestHeaders, HttpResponse, HttpResponseBody, HttpResponseHeaders, ) from web_poet.page_inputs.http import request_fingerprint @pytest.mark.parametrize("body_cls", [HttpRequestBody, HttpResponseBody]) def test_http_body_hashable(body_cls) -> None: http_body = body_cls(b"content") assert http_body in {http_body} assert http_body in {b"content"} assert http_body not in {b"foo"} @pytest.mark.parametrize("body_cls", [HttpRequestBody, HttpResponseBody]) def test_http_body_bytes_api(body_cls) -> None: http_body = body_cls(b"content") assert http_body == b"content" assert b"ent" in http_body @pytest.mark.parametrize("body_cls", [HttpRequestBody, HttpResponseBody]) def test_http_body_str_api(body_cls) -> None: with pytest.raises(TypeError): body_cls("string content") def test_http_response_body_declared_encoding() -> None: http_body = HttpResponseBody(b"content") assert http_body.declared_encoding() is None http_body = HttpResponseBody( b""" """ ) assert http_body.declared_encoding() == "utf-8" def test_http_response_body_json() -> None: http_body = HttpResponseBody(b"content") with pytest.raises(json.JSONDecodeError): http_body.json() http_body = HttpResponseBody(b'{"foo": 123}') assert http_body.json() == {"foo": 123} http_body = HttpResponseBody('{"ключ": "значение"}'.encode()) assert http_body.json() == {"ключ": "значение"} @pytest.mark.parametrize( ("cls", "body_cls"), [ (HttpRequest, HttpRequestBody), (HttpResponse, HttpResponseBody), ], ) def test_http_defaults(cls, body_cls) -> None: http_body = body_cls(b"content") obj = cls("url", body=http_body) assert str(obj.url) == "url" 
assert obj.body == b"content" assert not obj.headers assert obj.headers.get("user-agent") is None if cls == HttpResponse: assert obj.status is None else: with pytest.raises(AttributeError): obj.status @pytest.mark.parametrize( ("cls", "headers_cls"), [ (HttpRequest, HttpRequestHeaders), (HttpResponse, HttpResponseHeaders), ], ) def test_http_with_headers_alt_constructor(cls, headers_cls) -> None: headers = headers_cls.from_name_value_pairs( [{"name": "User-Agent", "value": "test agent"}] ) obj = cls("url", body=b"", headers=headers) assert len(obj.headers) == 1 assert obj.headers.get("user-agent") == "test agent" @pytest.mark.parametrize( ("cls", "body_cls"), [ (HttpRequest, HttpRequestBody), (HttpResponse, HttpResponseBody), ], ) def test_http_response_bytes_body(cls, body_cls) -> None: obj = cls("http://example.com", body=b"content") assert isinstance(obj.body, body_cls) assert obj.body == body_cls(b"content") @pytest.mark.parametrize("cls", [HttpRequest, HttpResponse]) def test_http_body_validation_str(cls) -> None: with pytest.raises(TypeError): cls("http://example.com", body="content") @pytest.mark.parametrize("cls", [HttpRequest, HttpResponse]) def test_http_body_validation_None(cls) -> None: with pytest.raises(TypeError): cls("http://example.com", body=None) @pytest.mark.xfail(reason="not implemented") @pytest.mark.parametrize("cls", [HttpRequest, HttpResponse]) def test_http_body_validation_other(cls) -> None: with pytest.raises(TypeError): cls("http://example.com", body=123) @pytest.mark.parametrize("cls", [HttpRequest, HttpResponse]) def test_http_request_headers_init_invalid(cls) -> None: with pytest.raises(TypeError): cls("http://example.com", body=b"", headers=123) @pytest.mark.parametrize("headers_cls", [HttpRequestHeaders, HttpResponseHeaders]) def test_http_response_headers(headers_cls) -> None: headers = headers_cls({"user-agent": "mozilla"}) assert headers["user-agent"] == "mozilla" assert headers["User-Agent"] == "mozilla" with 
pytest.raises(KeyError): headers["user agent"] @pytest.mark.parametrize( ("cls", "headers_cls"), [ (HttpRequest, HttpRequestHeaders), (HttpResponse, HttpResponseHeaders), ], ) def test_http_headers_init_dict(cls, headers_cls) -> None: obj = cls("http://example.com", body=b"", headers={"user-agent": "chrome"}) assert isinstance(obj.headers, headers_cls) assert obj.headers["user-agent"] == "chrome" assert obj.headers["User-Agent"] == "chrome" def test_http_request_init_minimal() -> None: req = HttpRequest("url") assert str(req.url) == "url" assert isinstance(req.url, RequestUrl) assert req.method == "GET" assert isinstance(req.method, str) assert not req.headers assert isinstance(req.headers, HttpRequestHeaders) assert not req.body assert isinstance(req.body, HttpRequestBody) def test_http_request_init_full() -> None: req_1 = HttpRequest( "url", method="POST", headers={"User-Agent": "test agent"}, body=b"body" ) assert req_1.method == "POST" assert isinstance(req_1.method, str) assert req_1.headers == {"User-Agent": "test agent"} assert req_1.headers.get("user-agent") == "test agent" assert isinstance(req_1.headers, HttpRequestHeaders) assert req_1.body == b"body" assert isinstance(req_1.body, HttpRequestBody) http_headers = HttpRequestHeaders({"User-Agent": "test agent"}) http_body = HttpRequestBody(b"body") req_2 = HttpRequest("url", method="POST", headers=http_headers, body=http_body) assert str(req_1.url) == str(req_2.url) assert req_1.method == req_2.method assert req_1.headers == req_2.headers assert req_1.body == req_2.body def test_http_request_init_with_response_url() -> None: resp = HttpResponse("url", b"") assert isinstance(resp.url, ResponseUrl) req = HttpRequest(resp.url) assert isinstance(req.url, RequestUrl) assert str(req.url) == str(resp.url) @pytest.mark.parametrize( "cls", [ HttpRequestHeaders, HttpResponseHeaders, ], ) def test_http_headers_from_bytes_dict(cls) -> None: raw_headers = { b"Content-Length": [b"316"], b"Content-Encoding": [b"gzip", 
b"br"], b"server": b"sffe", "X-string": "string", "X-missing": None, "X-tuple": (b"x", "y"), } headers = cls.from_bytes_dict(raw_headers) assert headers.get("content-length") == "316" assert headers.get("content-encoding") == "gzip" assert headers.getall("Content-Encoding") == ["gzip", "br"] assert headers.get("server") == "sffe" assert headers.get("x-string") == "string" assert headers.get("x-missing") is None assert headers.get("x-tuple") == "x" assert headers.getall("x-tuple") == ["x", "y"] @pytest.mark.parametrize( "cls", [ HttpRequestHeaders, HttpResponseHeaders, ], ) def test_http_response_headers_from_bytes_dict_err(cls) -> None: with pytest.raises( ValueError, match=r"Expecting str or bytes\. Received " ): cls.from_bytes_dict({b"Content-Length": [316]}) with pytest.raises( ValueError, match=r"Expecting str or bytes\. Received " ): cls.from_bytes_dict({b"Content-Length": 316}) def test_http_response_headers_init_requests() -> None: requests_response = requests.Response() requests_response.headers["User-Agent"] = "mozilla" response = HttpResponse( "http://example.com", body=b"", headers=requests_response.headers ) assert isinstance(response.headers, HttpResponseHeaders) assert response.headers["user-agent"] == "mozilla" assert response.headers["User-Agent"] == "mozilla" def test_http_response_headers_init_aiohttp() -> None: aiohttp_response = aiohttp.web_response.Response() aiohttp_response.headers["User-Agent"] = "mozilla" response = HttpResponse( "http://example.com", body=b"", headers=aiohttp_response.headers ) assert isinstance(response.headers, HttpResponseHeaders) assert response.headers["user-agent"] == "mozilla" assert response.headers["User-Agent"] == "mozilla" def test_http_response_selectors(book_list_html_response) -> None: title = "All products | Books to Scrape - Sandbox" assert title == book_list_html_response.css("title ::text").get("").strip() assert title == book_list_html_response.xpath("//title/text()").get("").strip() def 
test_http_response_json() -> None: url = "http://example.com" response = HttpResponse(url, body=b"non json") with pytest.raises(json.JSONDecodeError): response.json() response = HttpResponse(url, body=b'{"key": "value"}') assert response.json() == {"key": "value"} response = HttpResponse(url, body='{"ключ": "значение"}'.encode()) assert response.json() == {"ключ": "значение"} def test_http_response_text() -> None: """This tests a character which raises a UnicodeDecodeError when decoded in 'ascii'. The backup series of encodings for decoding should be able to handle it. """ text = "œ is a Weird Character" body = HttpResponseBody(b"\x9c is a Weird Character") response = HttpResponse("http://example.com", body=body) assert response.text == text @pytest.mark.parametrize( ("headers", "encoding"), [ ({"Content-type": "text/html; charset=utf-8"}, "utf-8"), ({"Content-type": "text/html; charset=UTF8"}, "utf-8"), ({}, None), ({"Content-type": "text/html; charset=iso-8859-1"}, "cp1252"), ({"Content-type": "text/html; charset=None"}, None), ({"Content-type": "text/html; charset=gb2312"}, "gb18030"), ({"Content-type": "text/html; charset=gbk"}, "gb18030"), ({"Content-type": "text/html; charset=UNKNOWN"}, None), ], ) def test_http_headers_declared_encoding(headers, encoding) -> None: headers = HttpResponseHeaders(headers) assert headers.declared_encoding() == encoding response = HttpResponse("http://example.com", body=b"", headers=headers) assert response.encoding == encoding or HttpResponse._DEFAULT_ENCODING def test_http_response_utf16() -> None: """Test utf-16 because UnicodeDammit is known to have problems with""" r = HttpResponse( "http://www.example.com", body=b"\xff\xfeh\x00i\x00", encoding="utf-16" ) assert r.text == "hi" assert r.encoding == "utf-16" def test_explicit_encoding() -> None: response = HttpResponse("http://www.example.com", "£".encode(), encoding="utf-8") assert response.encoding == "utf-8" assert response.text == "£" def test_explicit_encoding_invalid() 
-> None: response = HttpResponse( "http://www.example.com", body="£".encode(), encoding="latin1" ) assert response.encoding == "latin1" assert response.text == "£".encode().decode("latin1") def test_utf8_body_detection() -> None: response = HttpResponse( "http://www.example.com", b"\xc2\xa3", headers={"Content-type": "text/html; charset=None"}, ) assert response.encoding == "utf-8" response = HttpResponse( "http://www.example.com", body=b"\xc2", headers={"Content-type": "text/html; charset=None"}, ) assert response.encoding != "utf-8" def test_gb2312() -> None: response = HttpResponse( "http://www.example.com", body=b"\xa8D", headers={"Content-type": "text/html; charset=gb2312"}, ) assert response.text == "\u2015" def test_bom_encoding() -> None: response = HttpResponse( "http://www.example.com", body=codecs.BOM_UTF8 + "🎉".encode(), headers={"Content-type": "text/html; charset=cp1251"}, ) assert response.encoding == "utf-8" assert response.text == "🎉" def test_bom_encoding_mismatch() -> None: text = "Привет" body = codecs.BOM_UTF16_LE + text.encode("utf-8") response = HttpResponse( url="http://example.com", headers={"Content-Type": "text/html; charset=cp1251"}, body=body, status=200, ) # The resulting text is different since BOM was the one that was followed. assert response.encoding == "utf-16-le" assert response.text != text assert response.text == "鿐胑룐닐뗐苑" def test_invalid_utf8_encoded_body_with_valid_utf8_BOM() -> None: response = HttpResponse( "http://www.example.com", headers={"Content-type": "text/html; charset=utf-8"}, body=b"\xef\xbb\xbfWORD\xe3\xab", ) assert response.encoding == "utf-8" assert response.text == "WORD\ufffd" def test_bom_is_removed_from_body() -> None: # Inferring encoding from body also cache decoded body as sideeffect, # this test tries to ensure that calling response.encoding and # response.text in indistint order doesn't affect final # values for encoding and decoded body. 
url = "http://example.com" body = b"\xef\xbb\xbfWORD" headers = {"Content-type": "text/html; charset=utf-8"} # Test response without content-type and BOM encoding response = HttpResponse(url, body=body) assert response.encoding == "utf-8" assert response.text == "WORD" response = HttpResponse(url, body=body) assert response.text == "WORD" assert response.encoding == "utf-8" # Body caching sideeffect isn't triggered when encoding is declared in # content-type header but BOM still need to be removed from decoded # body response = HttpResponse(url, headers=headers, body=body) assert response.encoding == "utf-8" assert response.text == "WORD" response = HttpResponse(url, headers=headers, body=body) assert response.text == "WORD" assert response.encoding == "utf-8" def test_replace_wrong_encoding() -> None: """Test invalid chars are replaced properly""" r = HttpResponse( "http://www.example.com", encoding="utf-8", body=b"PREFIX\xe3\xabSUFFIX" ) # XXX: Policy for replacing invalid chars may suffer minor variations # but it should always contain the unicode replacement char ('\ufffd') assert "\ufffd" in r.text, repr(r.text) assert "PREFIX" in r.text, repr(r.text) assert "SUFFIX" in r.text, repr(r.text) # Do not destroy html tags due to encoding bugs r = HttpResponse( "http://example.com", encoding="utf-8", body=b"\xf0value" ) assert "value" in r.text, repr(r.text) def test_html_encoding() -> None: body = b"""Some page Price: \xa3100' """ r1 = HttpResponse("http://www.example.com", body=body) assert r1.encoding == "cp1252" assert r1.text == body.decode("cp1252") body = b""" Price: \xa3100 """ r2 = HttpResponse("http://www.example.com", body=body) assert r2.encoding == "cp1252" assert r2.text == body.decode("cp1252") def test_html_headers_encoding_precedence() -> None: # for conflicting declarations headers must take precedence body = b"""Some page Price: \xa3100' """ response = HttpResponse( "http://www.example.com", body=body, headers={"Content-type": "text/html; 
charset=iso-8859-1"}, ) assert response.encoding == "cp1252" assert response.text == body.decode("cp1252") def test_html5_meta_charset() -> None: body = b"""Some pagebla bla""" response = HttpResponse("http://www.example.com", body=body) assert response.encoding == "gb18030" assert response.text == body.decode("gb18030") def test_browser_html() -> None: src = "

Hello,

world!

" html = BrowserHtml(src) assert html == src assert html != "foo" assert html.xpath("//p/text()").getall() == ["Hello, ", "world!"] assert html.css("p::text").getall() == ["Hello, ", "world!"] assert isinstance(html.selector, parsel.Selector) def test_browser_response() -> None: url = "http://example.com" html = "

Hello,

world!

" response = BrowserResponse(url=url, html=html, status=200) assert response.xpath("//p/text()").getall() == ["Hello, ", "world!"] assert response.css("p::text").getall() == ["Hello, ", "world!"] assert isinstance(response.selector, parsel.Selector) assert isinstance(response.html, BrowserHtml) assert str(response.urljoin("products")) == "http://example.com/products" assert response.selector.root.base_url == url @pytest.mark.parametrize( "cls", [ HttpRequest, HttpResponse, ], ) def test_urljoin_absolute(cls) -> None: obj = cls("https://example.com", body=b"") new_url = obj.urljoin("https://toscrape.com/foo") assert isinstance(new_url, RequestUrl) assert str(new_url) == "https://toscrape.com/foo" @pytest.mark.parametrize( "cls", [ HttpRequest, HttpResponse, ], ) def test_urljoin_relative(cls) -> None: obj = cls("https://example.com", body=b"") new_url = obj.urljoin("foo") assert isinstance(new_url, RequestUrl) assert str(new_url) == "https://example.com/foo" def test_urljoin_relative_html_base() -> None: body = b""" """ obj = HttpResponse("https://example.com", body=body) new_url = obj.urljoin("foo") assert isinstance(new_url, RequestUrl) assert str(new_url) == "https://toscrape.com/foo" @pytest.mark.parametrize( "cls", [ RequestUrl, ResponseUrl, ], ) def test_urljoin_input_classes(cls) -> None: obj = HttpResponse("https://example.com", body=b"") new_url = obj.urljoin(cls("foo")) assert isinstance(new_url, RequestUrl) assert str(new_url) == "https://example.com/foo" def test_request_fingerprint() -> None: req1 = HttpRequest(url="http://toscrape.com/1") req2 = HttpRequest(url="http://toscrape.com/1") assert request_fingerprint(req1) == request_fingerprint(req2) req3 = HttpRequest(url="http://toscrape.com/2") assert request_fingerprint(req1) != request_fingerprint(req3) req4 = HttpRequest(url="http://toscrape.com/1", method="POST") assert request_fingerprint(req1) != request_fingerprint(req4) req5 = HttpRequest(url="http://toscrape.com/1", body=b"") assert 
request_fingerprint(req1) == request_fingerprint(req5) req6 = HttpRequest(url="http://toscrape.com/1", body=b"foo") assert request_fingerprint(req1) != request_fingerprint(req6) req7 = HttpRequest(url="http://toscrape.com/1", headers={}) assert request_fingerprint(req1) == request_fingerprint(req7) req8 = HttpRequest(url="http://toscrape.com/1", headers={"a": "b"}) assert request_fingerprint(req1) != request_fingerprint(req8) req9 = HttpRequest(url="http://toscrape.com/1", headers={"A": "b"}) assert request_fingerprint(req8) == request_fingerprint(req9) req10 = HttpRequest(url="http://toscrape.com/1", headers=[("a", "b"), ("a", "c")]) assert request_fingerprint(req1) != request_fingerprint(req10) assert request_fingerprint(req8) != request_fingerprint(req10) def test_http_or_browser_response() -> None: url = "http://example.com" html = "

Hello,

world!

" browser_response = BrowserResponse(url=url, html=html) response_1 = AnyResponse(response=browser_response) assert isinstance(response_1.response, BrowserResponse) assert response_1.response == browser_response http_response = HttpResponse(url=url, body=html.encode()) response_2 = AnyResponse(response=http_response) assert isinstance(response_2.response, HttpResponse) assert response_2.response == http_response for response in [response_1, response_2]: assert isinstance(response.url, ResponseUrl) assert str(response.url) == url assert response.text == html assert response.xpath("//p/text()").getall() == ["Hello, ", "world!"] assert response.css("p::text").getall() == ["Hello, ", "world!"] assert isinstance(response.selector, parsel.Selector) assert str(response.urljoin("products")) == "http://example.com/products" assert response.status is None assert response.selector.root.base_url == url response = AnyResponse(response=BrowserResponse(url=url, html=html, status=200)) assert response.status == 200 response = AnyResponse( response=HttpResponse(url=url, body=html.encode(), status=200) ) assert response.status == 200 scrapinghub-web-poet-ba87b95/tests/test_pages.py000066400000000000000000000215501517167256700220600ustar00rootroot00000000000000from __future__ import annotations from typing import Generic, Optional, TypeVar import attrs import pytest from web_poet import BrowserResponse, HttpResponse, PageParams, field from web_poet.pages import ( BrowserPage, Injectable, ItemPage, ItemT, Returns, SelectorExtractor, WebPage, is_injectable, ) @attrs.define class Item: name: str def test_page_object() -> None: class MyItemPage(Injectable): def to_item(self) -> dict: return { "foo": "bar", } page_object = MyItemPage() assert page_object.to_item() == { "foo": "bar", } def test_web_page_object(book_list_html_response) -> None: class MyWebPage(WebPage): def to_item(self) -> dict: # type: ignore[override] return { "url": self.url, "title": 
self.css("title::text").get("").strip(), } page_object = MyWebPage(book_list_html_response) assert page_object.to_item() == { "url": "http://books.toscrape.com/index.html", "title": "All products | Books to Scrape - Sandbox", } def test_is_injectable() -> None: class MyClass: pass class MyItemPage(ItemPage): def to_item(self) -> dict: # type: ignore[override] return { "foo": "bar", } from collections.abc import Set as CollectionsSet # noqa: PYI025,PLC0415 from typing import Set as TypingSet # noqa: UP035,PLC0415 assert is_injectable(None) is False assert is_injectable(type(None)) is False assert is_injectable(set) is False assert is_injectable(set[str]) is False assert is_injectable(TypingSet[str]) is False # noqa: UP006 assert is_injectable(CollectionsSet[str]) is False assert is_injectable(Optional[str]) is False # noqa: UP045 assert is_injectable(str | None) is False assert is_injectable(MyClass) is False assert is_injectable(MyClass()) is False assert is_injectable(MyItemPage) is True assert is_injectable(MyItemPage()) is False assert is_injectable(ItemPage) is True @pytest.mark.asyncio async def test_item_page_typed() -> None: class MyPage(ItemPage[Item]): @field def name(self): return "name" page = MyPage() assert page.item_cls is Item item = await page.to_item() assert isinstance(item, Item) assert item == Item(name="name") @pytest.mark.asyncio async def test_web_page_fields() -> None: class MyPage(WebPage[Item]): @field def name(self): return "name" page = MyPage(HttpResponse(url="http://example.com", body=b"")) assert page.item_cls is Item item = await page.to_item() assert isinstance(item, Item) assert item == Item(name="name") def test_web_page_with_browser_response() -> None: """Test that BrowserPage works with BrowserResponse. BrowserResponse has .html instead of .text, so we use BrowserPage which is designed for BrowserResponse. 
""" html = """ Test Page Link 1 Link 2 """ browser_response = BrowserResponse(url="https://example.org", html=html, status=200) class MyPage(BrowserPage[dict]): def to_item(self) -> dict: # type: ignore[override] return { "url": self.url, "title": self.css("title::text").get("").strip(), "links": self.css("a::attr(href)").getall(), } page = MyPage(response=browser_response) # These should work without AttributeError assert page.url == "https://example.org" assert page.css("title::text").get() == "Test Page" assert page.xpath("//a/@href").getall() == ["/page1", "/page2"] # Test to_item works correctly item = page.to_item() assert item["url"] == "https://example.org" assert item["title"] == "Test Page" assert item["links"] == ["/page1", "/page2"] @pytest.mark.asyncio async def test_item_page_typed_subclass() -> None: class BasePage(ItemPage[ItemT]): @field def name(self): return "name" class Subclass(BasePage[Item]): pass page: BasePage = BasePage() assert page.item_cls is dict assert (await page.to_item()) == {"name": "name"} page2: Subclass = Subclass() assert page2.item_cls is Item assert (await page2.to_item()) == Item(name="name") @pytest.mark.asyncio async def test_item_page_fields_typo() -> None: class MyPage(ItemPage[Item]): @field def nane(self): return "name" page = MyPage() assert page.item_cls is Item with pytest.raises(TypeError, match="unexpected keyword argument 'nane'"): await page.to_item() @pytest.mark.asyncio async def test_item_page_required_field_missing() -> None: @attrs.define class MyItem: name: str price: float | None class MyPage(ItemPage[MyItem]): @field def price(self): return 100 page = MyPage() assert page.item_cls is MyItem with pytest.raises( TypeError, match="missing 1 required positional argument: 'name'" ): await page.to_item() @pytest.mark.asyncio async def test_item_page_change_item_type_extra_fields() -> None: class BasePage(ItemPage[Item]): @field def name(self): return "hello" @attrs.define class MyItem(Item): price: float 
class Subclass(BasePage, Returns[MyItem]): @field def price(self): return 123 page = Subclass() assert page.item_cls is MyItem item = await page.to_item() assert isinstance(item, MyItem) assert item == MyItem(name="hello", price=123) @pytest.mark.asyncio async def test_item_page_change_item_type_remove_fields() -> None: @attrs.define class MyItem: name: str price: float class BasePage(ItemPage[MyItem]): @field def name(self): return "hello" @field def price(self): return 123 # Item only contains "name", but not "price" class Subclass(BasePage, Returns[Item], skip_nonitem_fields=True): pass # Same as above but a slotted attrs class with dependency. # See: https://github.com/scrapinghub/web-poet/issues/141 @attrs.define class SubclassWithDep(BasePage, Returns[Item], skip_nonitem_fields=True): params: PageParams # Check if flicking skip_nonitem_fields to False in the subclass works @attrs.define class SubclassSkipFalse(SubclassWithDep, Returns[Item], skip_nonitem_fields=False): pass for page in [Subclass(), SubclassWithDep(params=PageParams())]: assert page.item_cls is Item item = await page.to_item() assert isinstance(item, Item) assert item == Item(name="hello") page = SubclassSkipFalse(params=PageParams()) assert page.item_cls is Item with pytest.raises(TypeError, match="unexpected keyword argument 'price'"): await page.to_item() # Item only contains "name", but not "price", but "price" should be passed class SubclassStrict(BasePage, Returns[Item]): pass page2 = SubclassStrict() assert page2.item_cls is Item with pytest.raises(TypeError, match="unexpected keyword argument 'price'"): await page2.to_item() def test_returns_inheritance() -> None: @attrs.define class MyItem: name: str class BasePage(ItemPage[MyItem]): @field def name(self): return "hello" MetadataT = TypeVar("MetadataT") class HasMetadata(Generic[MetadataT]): pass class DummyMetadata: pass class Page(BasePage, HasMetadata[DummyMetadata]): pass page = Page() assert page.item_cls is MyItem 
@pytest.mark.asyncio async def test_extractor(book_list_html_response) -> None: @attrs.define class BookItem: name: str price: str @attrs.define class ListItem: books: list[BookItem] @attrs.define class MyPage(ItemPage[ListItem]): response: HttpResponse @field async def books(self): books = [] for book in self.response.css("article"): item = await BookExtractor(book).to_item() books.append(item) return books class BookExtractor(SelectorExtractor[BookItem]): @field(out=[str.lower]) def name(self): return self.css("img.thumbnail::attr(alt)").get() @field def price(self): return self.xpath(".//p[@class='price_color']/text()").get() page = MyPage(book_list_html_response) item = await page.to_item() assert len(item.books) == 20 assert item.books[0].name == "a light in the attic" assert item.books[0].price == "£51.77" scrapinghub-web-poet-ba87b95/tests/test_requests.py000066400000000000000000000240011517167256700226260ustar00rootroot00000000000000from __future__ import annotations from typing import TYPE_CHECKING from unittest import mock import pytest from web_poet.exceptions import HttpResponseError, RequestDownloaderVarError from web_poet.page_inputs import ( HttpClient, HttpRequest, HttpRequestBody, HttpRequestHeaders, HttpResponse, ) from web_poet.requests import request_downloader_var if TYPE_CHECKING: from collections.abc import Callable @pytest.fixture def async_mock(): """Workaround since python 3.7 doesn't ship with asyncmock.""" async def async_test(req): return HttpResponse(str(req.url), body=b"") mock.MagicMock.__await__ = lambda x: async_test(x).__await__() return async_test @pytest.mark.asyncio async def test_perform_request_from_httpclient(async_mock) -> None: url = "http://example.com" client = HttpClient() with pytest.raises(RequestDownloaderVarError): await client.get(url) request_downloader_var.set(async_mock) response = await client.get(url) # The async downloader implementation should return the HttpResponse assert str(response.url) == str(url) 
# Last statement of test_perform_request_from_httpclient, whose body begins on
# the preceding line.
assert isinstance(response, HttpResponse)


@pytest.mark.asyncio
async def test_http_client_single_requests(async_mock) -> None:
    """``request``/``get``/``post`` build ``HttpRequest`` with the expected kwargs."""
    client = HttpClient(async_mock)

    # HttpRequest is patched where the client looks it up, so every
    # construction is recorded on the mock.
    with mock.patch("web_poet.page_inputs.client.HttpRequest") as mock_request:
        await client.request("url")
        await client.get("url-get", headers={"X-Headers": "123"})
        await client.post("url-post", headers={"X-Headers": "123"}, body=b"body value")
        assert mock_request.call_args_list == [
            mock.call(
                url="url",
                method="GET",
                headers=HttpRequestHeaders(),
                body=HttpRequestBody(),
            ),
            mock.call(
                url="url-get",
                method="GET",
                headers=HttpRequestHeaders({"X-Headers": "123"}),
                body=HttpRequestBody(),
            ),
            mock.call(
                url="url-post",
                method="POST",
                headers=HttpRequestHeaders({"X-Headers": "123"}),
                body=HttpRequestBody(b"body value"),
            ),
        ]


@pytest.fixture
def client_with_status() -> Callable:
    """Return a factory of downloader stubs that answer with a fixed status.

    ``factory(status_code)`` gives an async callable that takes the request
    and returns an empty-bodied ``HttpResponse`` carrying ``status_code``.
    """

    def _param_wrapper(status_code: int):
        async def stub_request_downloader(*args, **kwargs):
            async def stub(req):
                return HttpResponse(req.url, body=b"", status=status_code)

            return await stub(*args, **kwargs)

        return stub_request_downloader

    return _param_wrapper


@pytest.mark.asyncio
@pytest.mark.parametrize("method_name", ["request", "get", "post", "execute"])
async def test_http_client_allow_status(
    async_mock, client_with_status, method_name
) -> None:
    """``allow_status`` suppresses ``HttpResponseError`` only for listed codes.

    Every request method is exercised against a downloader that always
    answers 500.
    """
    client = HttpClient(async_mock)

    # Simulate 500 Internal Server Error responses
    client._request_downloader = client_with_status(500)

    method = getattr(client, method_name)

    url_or_request: str | HttpRequest = "url"
    if method_name == "execute":
        # NOTE: We're ignoring the type below due to the following mypy bugs:
        #   - https://github.com/python/mypy/issues/10187
        #   - https://github.com/python/mypy/issues/5313
        #   - https://github.com/python-attrs/attrs/issues/889
        # Currently, the said bugs causes mypy to raise the following error:
        #   'Incompatible types in assignment (expression has type "ResponseUrl",
        #   variable has type "Optional[str]")'
        url_or_request = HttpRequest(url_or_request)  # type: ignore[arg-type]

    # Should handle single and multiple values
    await method(url_or_request, allow_status=500)
    response = await method(url_or_request, allow_status=[500, 503])
    assert isinstance(response, HttpResponse)
    assert response.status == 500

    # As well as strings
    await method(url_or_request, allow_status="500")
    await method(url_or_request, allow_status=["500", "503"])

    with pytest.raises(HttpResponseError) as excinfo:
        await method(url_or_request)
    assert isinstance(excinfo.value.request, HttpRequest)
    assert isinstance(excinfo.value.response, HttpResponse)
    assert str(excinfo.value).startswith("500 INTERNAL_SERVER_ERROR response for")

    # ``as excinfo`` is required here: without it the assertions below would
    # re-inspect the exception captured by the previous block.
    with pytest.raises(HttpResponseError) as excinfo:
        await method(url_or_request, allow_status=406)
    assert isinstance(excinfo.value.request, HttpRequest)
    assert isinstance(excinfo.value.response, HttpResponse)
    assert str(excinfo.value).startswith("500 INTERNAL_SERVER_ERROR response for")

    # As long as "*" is present, then no errors would be raised
    await method(url_or_request, allow_status="*")
    await method(url_or_request, allow_status=[500, "*"])

    # Globbing isn't supported
    with pytest.raises(HttpResponseError) as excinfo:
        await method(url_or_request, allow_status="5*")
    assert isinstance(excinfo.value.request, HttpRequest)
    assert isinstance(excinfo.value.response, HttpResponse)
    assert str(excinfo.value).startswith("500 INTERNAL_SERVER_ERROR response for")


@pytest.mark.asyncio
async def test_http_client_keyword_enforcing(async_mock) -> None:
    """Only keyword args are allowed after the url param."""
    client = HttpClient(async_mock)

    with pytest.raises(TypeError):
        await client.request("url", "PATCH")  # type: ignore[misc]

    with pytest.raises(TypeError):
        await client.get("url", {"Content-Encoding": "utf-8"})  # type: ignore[misc]

    with pytest.raises(TypeError):
        await client.post("url", {"X-Header": "value"}, b"body")  # type: ignore[misc]


@pytest.mark.asyncio
async def test_http_client_execute(async_mock) -> None:
    client = HttpClient(async_mock)

    request = HttpRequest("url-1")
response = await client.execute(request) assert isinstance(response, HttpResponse) assert str(response.url) == "url-1" @pytest.mark.asyncio async def test_http_client_batch_execute(async_mock) -> None: client = HttpClient(async_mock) requests = [ HttpRequest("url-1"), HttpRequest("url-get", method="GET"), HttpRequest("url-post", method="POST"), ] responses = await client.batch_execute(*requests) assert all(isinstance(response, HttpResponse) for response in responses) @pytest.fixture def client_that_errs(async_mock) -> HttpClient: client = HttpClient(async_mock) # Simulate errors inside the request coroutines async def stub_request_downloader(*args, **kwargs): async def err(): raise ValueError("test exception") return await err() client._request_downloader = stub_request_downloader return client @pytest.mark.asyncio async def test_http_client_batch_execute_with_exception(client_that_errs) -> None: requests = [ HttpRequest("url-1"), HttpRequest("url-get", method="GET"), HttpRequest("url-post", method="POST"), ] responses = await client_that_errs.batch_execute(*requests, return_exceptions=True) assert len(responses) == 3 assert isinstance(responses[0], Exception) assert isinstance(responses[1], Exception) assert isinstance(responses[2], Exception) @pytest.mark.asyncio async def test_http_client_batch_execute_with_exception_raised( client_that_errs, ) -> None: requests = [ HttpRequest("url-1"), ] with pytest.raises(ValueError, match="test exception"): await client_that_errs.batch_execute(*requests) @pytest.mark.asyncio async def test_http_client_batch_execute_allow_status( async_mock, client_with_status ) -> None: client = HttpClient(async_mock) # Simulate 400 Bad Request client._request_downloader = client_with_status(400) requests = [HttpRequest("url-1"), HttpRequest("url-2"), HttpRequest("url-3")] await client.batch_execute(*requests, allow_status=400) await client.batch_execute(*requests, allow_status=[400, 403]) await client.batch_execute(*requests, 
allow_status="400") responses = await client.batch_execute(*requests, allow_status=["400", "403"]) for r in responses: assert isinstance(r, HttpResponse) assert r.status == 400 with pytest.raises(HttpResponseError) as excinfo: await client.batch_execute(*requests) assert isinstance(excinfo.value.request, HttpRequest) assert isinstance(excinfo.value.response, HttpResponse) assert str(excinfo.value).startswith("400 BAD_REQUEST response for") with pytest.raises(HttpResponseError) as excinfo: await client.batch_execute(*requests, allow_status=406) assert isinstance(excinfo.value.request, HttpRequest) assert isinstance(excinfo.value.response, HttpResponse) assert str(excinfo.value).startswith("400 BAD_REQUEST response for") await client.batch_execute(*requests, return_exceptions=True, allow_status=400) await client.batch_execute( *requests, return_exceptions=True, allow_status=[400, 403] ) await client.batch_execute(*requests, return_exceptions=True, allow_status="400") await client.batch_execute( *requests, return_exceptions=True, allow_status=["400", "403"] ) responses = await client.batch_execute(*requests, return_exceptions=True) for r in responses: assert isinstance(r, HttpResponseError) assert isinstance(r.request, HttpRequest) assert isinstance(r.response, HttpResponse) assert all(str(r).startswith("400 BAD_REQUEST response for") for r in responses) responses = await client.batch_execute( *requests, return_exceptions=True, allow_status=408 ) for r in responses: assert isinstance(r, HttpResponseError) assert isinstance(r.request, HttpRequest) assert isinstance(r.response, HttpResponse) assert all(str(r).startswith("400 BAD_REQUEST response for") for r in responses) # These have no assertions since they're used to see if mypy raises an # error against them. 
for r in responses: if isinstance(r, HttpResponseError): r.request r.response else: r.url r.body r.status r.headers scrapinghub-web-poet-ba87b95/tests/test_rules.py000066400000000000000000000331661517167256700221210ustar00rootroot00000000000000import warnings from typing import Any import attrs import pytest from url_matcher import Patterns from tests.po_lib import ( POTopLevel1, POTopLevel2, POTopLevelOverriden1, POTopLevelOverriden2, ) from tests.po_lib.a_module import POModule, POModuleOverriden from tests.po_lib.nested_package import PONestedPkg, PONestedPkgOverriden from tests.po_lib.nested_package.a_nested_module import ( PONestedModule, PONestedModuleOverriden, ) from tests.po_lib_sub import POLibSub from tests.po_lib_to_return import ( CustomProductPage, CustomProductPageDataTypeOnly, CustomProductPageNoReturns, ImprovedProductPage, LessProductPage, MoreProductPage, Product, ProductFewerFields, ProductMoreFields, ProductPage, ProductSeparate, ProductSimilar, SeparateProductPage, SimilarProductPage, SomePage, ) from web_poet import ( ApplyRule, RulesRegistry, consume_modules, default_registry, ) from web_poet.page_inputs.url import RequestUrl, ResponseUrl POS = { CustomProductPage, CustomProductPageNoReturns, CustomProductPageDataTypeOnly, ImprovedProductPage, LessProductPage, MoreProductPage, POTopLevel1, POTopLevel2, POModule, PONestedPkg, PONestedModule, ProductPage, SeparateProductPage, SimilarProductPage, SomePage, } def test_apply_rule_uniqueness() -> None: """The same instance of an ApplyRule with the same attribute values should have the same hash identity. 
""" patterns = Patterns(include=["example.com"], exclude=["example.com/blog"]) patterns_b = Patterns(include=["example.com/b"]) rule1 = ApplyRule( for_patterns=patterns, use=POTopLevel1, instead_of=POTopLevelOverriden1, meta={"key_1": 1}, ) rule2 = ApplyRule( for_patterns=patterns, use=POTopLevel1, instead_of=POTopLevelOverriden1, meta={"key_2": 2}, ) # The ``meta`` parameter is ignored in the hash. assert hash(rule1) == hash(rule2) params: list[dict[str, Any]] = [ { "for_patterns": patterns, "use": POTopLevel1, "instead_of": POTopLevelOverriden1, "to_return": Product, }, { "for_patterns": patterns_b, "use": POTopLevel2, "instead_of": POTopLevelOverriden2, "to_return": ProductSimilar, }, ] for change in params[0]: # Changing any one of the params should result in a hash mismatch rule1 = ApplyRule(**params[0]) # type: ignore[arg-type] kwargs = params[0].copy() kwargs.update({change: params[1][change]}) rule2 = ApplyRule(**kwargs) # type: ignore[arg-type] assert hash(rule1) != hash(rule2) def test_apply_rule_immutability() -> None: patterns = Patterns(include=["example.com"], exclude=["example.com/blog"]) rule = ApplyRule( for_patterns=patterns, use=POTopLevel1, instead_of=POTopLevelOverriden1, ) with pytest.raises(attrs.exceptions.FrozenInstanceError): rule.for_patterns = Patterns(include=["example.com/"]) # type: ignore[misc] with pytest.raises(attrs.exceptions.FrozenInstanceError): rule.use = POTopLevel2 # type: ignore[misc] with pytest.raises(attrs.exceptions.FrozenInstanceError): rule.instead_of = POTopLevelOverriden2 # type: ignore[misc] def test_apply_rule_converter_on_pattern() -> None: # passing strings should auto-converter into Patterns rule = ApplyRule("example.com", use=POTopLevel1, instead_of=POTopLevelOverriden2) assert rule.for_patterns == Patterns( include=["example.com"], exclude=[], priority=500 ) # Passing Patterns should still work rule = ApplyRule( for_patterns=Patterns(["example.com"]), use=POTopLevel1, instead_of=POTopLevelOverriden2, ) assert 
rule.for_patterns == Patterns( include=["example.com"], exclude=[], priority=500 ) def test_apply_rule_kwargs_only() -> None: params = { "use": POTopLevel1, "instead_of": POTopLevelOverriden2, "to_return": Product, "meta": {"key_2": 2}, } remove = set() for param_name in params: remove.add(param_name) with pytest.raises(TypeError): ApplyRule( "example.com", *[params[r] for r in remove], **{k: v for k, v in params.items() if k not in remove}, # type: ignore[arg-type] ) def test_list_page_objects_all() -> None: rules = default_registry.get_rules() page_objects = {po.use for po in rules} # Note that the 'tests_extra.po_lib_sub_not_imported.POLibSubNotImported' # Page Object is not included here since it was never imported anywhere in # our test package. It would only be included if we run any of the following # below. (Note that they should run before `get_rules` is called.) # - from tests_extra import po_lib_sub_not_imported # - import tests_extra.po_lib_sub_not_imported # - web_poet.consume_modules("tests_extra") # Merely having `import tests_extra` won't work since the subpackages and # modules needs to be traversed and imported as well. assert all("po_lib_sub_not_imported" not in po.__module__ for po in page_objects) # Ensure that ALL Override Rules are returned as long as the given # registry's @handle_urls decorator was used. assert page_objects == POS.union({POLibSub}) for rule in rules: # We're ignoring the types below since mypy expects ``Type[ItemPage]`` # which doesn't contain the ``expected_*`` fields in our tests. # Special case since this PO has 2 ``@handle_urls`` decorators. # See ``test_multiple_handle_urls()`` test case below. 
if rule.use == POTopLevel1: continue assert rule.instead_of == rule.use.expected_instead_of, rule.use # type: ignore[attr-defined] assert rule.for_patterns == rule.use.expected_patterns, rule.use # type: ignore[attr-defined] assert rule.to_return == rule.use.expected_to_return, rule.use # type: ignore[attr-defined] assert rule.meta == rule.use.expected_meta, rule.use # type: ignore[attr-defined] def test_multiple_handle_urls_annotations() -> None: """Using multiple ``@handle_urls`` annotations on a single Page Object should work. """ rules = default_registry.search(use=POTopLevel1) assert len(rules) == 2 for i, rule in enumerate(rules): assert rule.instead_of == rule.use.expected_instead_of[i], rule.use # type: ignore[attr-defined] assert rule.for_patterns == rule.use.expected_patterns[i], rule.use # type: ignore[attr-defined] assert rule.to_return == rule.use.expected_to_return[i], rule.use # type: ignore[attr-defined] assert rule.meta == rule.use.expected_meta[i], rule.use # type: ignore[attr-defined] def test_consume_module_not_existing() -> None: with pytest.raises(ImportError): consume_modules("this_does_not_exist") def test_list_page_objects_all_consume() -> None: """A test similar to the one above but calls ``consume_modules()`` to properly load the ``@handle_urls`` decorators from other modules/packages. 
""" consume_modules("tests_extra") rules = default_registry.get_rules() page_objects = {po.use for po in rules} assert any("po_lib_sub_not_imported" in po.__module__ for po in page_objects) def test_registry_search() -> None: # param: use rules = default_registry.search(use=POTopLevel2) assert len(rules) == 1 assert rules[0].use == POTopLevel2 # param: instead_of rules = default_registry.search(instead_of=POTopLevelOverriden2) assert len(rules) == 1 assert rules[0].instead_of == POTopLevelOverriden2 rules = default_registry.search(instead_of=None) for rule in rules: assert rule.instead_of is None # param: to_return rules = default_registry.search(to_return=Product) assert rules == [ ApplyRule("example.com", use=ProductPage, to_return=Product), ApplyRule( "example.com", use=ImprovedProductPage, instead_of=ProductPage, to_return=Product, ), ApplyRule( "example.com", # mypy complains here since it's expecting a container class when # declared, i.e, ``ItemPage[SomeItem]`` use=CustomProductPageDataTypeOnly, # type: ignore[arg-type] to_return=Product, ), ] rules = default_registry.search(to_return=None) for rule in rules: assert rule.to_return is None # params: to_return and use rules = default_registry.search(to_return=Product, use=ImprovedProductPage) assert len(rules) == 1 assert rules[0].to_return == Product assert rules[0].use == ImprovedProductPage # params: to_return and instead_of rules = default_registry.search(to_return=Product, instead_of=None) assert len(rules) == 2 assert rules[0].to_return == Product assert rules[0].instead_of is None assert rules[1].to_return == Product assert rules[1].instead_of is None rules = default_registry.search(to_return=None, instead_of=ProductPage) for rule in rules: assert rule.to_return is None assert rule.instead_of is None rules = default_registry.search(to_return=None, instead_of=None) assert len(rules) == 1 assert rules[0].to_return is None assert rules[0].instead_of is None # Such rules doesn't exist rules = 
default_registry.search(use=POModuleOverriden) assert len(rules) == 0 def test_init_rules() -> None: rules = ( ApplyRule( for_patterns=Patterns(include=["example.com"]), use=POTopLevel1, instead_of=POTopLevelOverriden2, ), ) registry = RulesRegistry(rules=rules) # Any type of iterable input should convert it to a list. assert registry.get_rules() == list(rules) assert default_registry.get_rules() != rules def test_add_rule() -> None: registry = RulesRegistry() # Basic case of adding a rule rule_1 = ApplyRule( for_patterns=Patterns(include=["example.com"]), use=POTopLevel1, instead_of=POTopLevelOverriden1, to_return=Product, ) registry.add_rule(rule_1) assert registry.get_rules() == [rule_1] # Adding a second rule should not emit a warning as long as both the URL # pattern and `.to_return` value is not the same. rule_2 = ApplyRule( for_patterns=Patterns(include=["example.com"]), use=POTopLevel1, instead_of=POTopLevelOverriden2, to_return=ProductSimilar, ) with warnings.catch_warnings(record=True) as warnings_emitted: registry.add_rule(rule_2) assert not warnings_emitted assert registry.get_rules() == [rule_1, rule_2] # Warnings should be raised for this case since it's the same URL pattern # and `.to_return` value from one of the past rules. 
rule_3 = ApplyRule( for_patterns=Patterns(include=["example.com"]), use=POTopLevel1, instead_of=POTopLevelOverriden2, to_return=Product, ) with pytest.warns(UserWarning, match="conflicting rules"): registry.add_rule(rule_3) assert registry.get_rules() == [rule_1, rule_2, rule_3] def test_overrides_for() -> None: for cls in [str, RequestUrl, ResponseUrl]: assert default_registry.overrides_for(cls("https://example.com")) == { POTopLevelOverriden1: POTopLevel1, POTopLevelOverriden2: POTopLevel2, POModuleOverriden: POModule, PONestedPkgOverriden: PONestedPkg, PONestedModuleOverriden: PONestedModule, ProductPage: CustomProductPageNoReturns, } assert default_registry.overrides_for(cls("https://example.org")) == { PONestedModuleOverriden: PONestedModule, PONestedPkgOverriden: PONestedPkg, } def test_page_cls_for_item() -> None: # This is not associated with any rule. class FakeItem: pass method = default_registry.page_cls_for_item for cls in [str, RequestUrl, ResponseUrl]: url = cls("https://example.com") assert method(url, ProductSimilar) == CustomProductPageNoReturns assert method(url, Product) == CustomProductPageDataTypeOnly assert method(url, ProductSeparate) == SeparateProductPage assert method(url, ProductFewerFields) == LessProductPage assert method(url, ProductMoreFields) == MoreProductPage # Type is ignored since item_cls shouldn't be None assert method(url, None) is None # type: ignore[arg-type] # When there's no rule specifying to return this FakeItem assert method(url, FakeItem) is None # When the URL itself doesn't have any ``to_return`` in any of its rules assert method(cls("https://example.org"), FakeItem) is None def test_top_rules_for_item() -> None: registry = RulesRegistry() assert list(registry.top_rules_for_item("https://example.com", Product)) == [] @registry.handle_urls("https://a.example", priority=1000) class A1(ProductPage): pass @registry.handle_urls("https://a.example", priority=900) class A2(ProductPage): pass assert { rule.use for rule in 
registry.top_rules_for_item("https://a.example", Product) } == {A1} @registry.handle_urls("https://b.example") class B1(ProductPage): pass @registry.handle_urls("https://b.example") class B2(ProductPage): pass assert { rule.use for rule in registry.top_rules_for_item("https://b.example", Product) } == {B1, B2} scrapinghub-web-poet-ba87b95/tests/test_serialization.py000066400000000000000000000273021517167256700236370ustar00rootroot00000000000000import json from typing import Annotated import attrs import pytest from web_poet import ( AnyResponse, BrowserHtml, BrowserResponse, HttpClient, HttpResponse, HttpResponseBody, Injectable, ItemPage, PageParams, RequestUrl, ResponseUrl, Stats, WebPage, ) from web_poet.annotated import AnnotatedInstance, annotation_decode, annotation_encode from web_poet.page_inputs.url import _Url from web_poet.serialization import ( SerializedDataFileStorage, SerializedLeafData, deserialize, deserialize_leaf, register_serialization, serialize, serialize_leaf, ) def _assert_pages_equal(p1, p2) -> None: assert type(p1) == type(p2) # noqa: E721 assert type(p1.response) == type(p2.response) # noqa: E721 assert type(p1.response.body) == type(p2.response.body) # noqa: E721 assert type(p1.response.headers) == type(p2.response.headers) # noqa: E721 assert p1.response.body == p2.response.body assert p1.response.status == p2.response.status assert p1.response.headers == p2.response.headers assert p1.response._encoding == p2.response._encoding _assert_urls_equal(p1.response.url, p2.response.url) def _assert_urls_equal(u1: _Url, u2: _Url) -> None: assert type(u1) == type(u2) # noqa: E721 assert str(u1) == str(u2) def test_serialization_leaf() -> None: leaf = HttpResponseBody(b"foo") serialized_data = serialize_leaf(leaf) assert isinstance(serialized_data["html"], bytes) assert HttpResponseBody(serialized_data["html"]) == leaf deserialized_data = deserialize_leaf(HttpResponseBody, serialized_data) assert leaf == deserialized_data def 
test_serialization_browser_html() -> None: html_str = "Hello" browser_html = BrowserHtml(html_str) serialized_data = serialize_leaf(browser_html) assert serialized_data == {"body.html": html_str.encode("utf8")} deserialized = deserialize_leaf(BrowserHtml, serialized_data) assert isinstance(deserialized, BrowserHtml) assert deserialized == browser_html def test_serialization_browser_response() -> None: html = BrowserHtml("Hello") url = ResponseUrl("http://example.com") response = BrowserResponse(url=url, html=html, status=200) serialized_data = serialize_leaf(response) assert serialized_data["body.html"] == html.encode("utf8") info = json.loads(serialized_data["info.json"]) assert info == {"url": str(url), "status": 200, "type": "BrowserResponse"} deserialized = deserialize_leaf(BrowserResponse, serialized_data) assert isinstance(deserialized, BrowserResponse) assert deserialized.html == response.html assert deserialized.status == response.status _assert_urls_equal(deserialized.url, response.url) def test_serialization_anyresponse_http_response(book_list_html_response) -> None: any_response = AnyResponse(response=book_list_html_response) serialized_data = serialize_leaf(any_response) info = json.loads(serialized_data["info.json"]) assert "_encoding" in info assert info["_encoding"] == book_list_html_response._encoding assert serialized_data["body.html"] == bytes(book_list_html_response.body) deserialized = deserialize_leaf(AnyResponse, serialized_data) assert isinstance(deserialized, AnyResponse) assert isinstance(deserialized.response, HttpResponse) assert deserialized.response.body == book_list_html_response.body assert deserialized.response.status == book_list_html_response.status assert deserialized.response.headers == book_list_html_response.headers assert deserialized.response._encoding == book_list_html_response._encoding _assert_urls_equal(deserialized.response.url, book_list_html_response.url) def test_serialization_anyresponse_browser_response() -> None: 
html = BrowserHtml("Hello") url = ResponseUrl("http://example.com") browser_response = BrowserResponse(url=url, html=html, status=200) any_response = AnyResponse(response=browser_response) serialized_data = serialize_leaf(any_response) assert serialized_data["body.html"] == html.encode("utf8") info = json.loads(serialized_data["info.json"]) assert info == {"url": str(url), "status": 200, "type": "BrowserResponse"} assert "_encoding" not in info deserialized = deserialize_leaf(AnyResponse, serialized_data) assert isinstance(deserialized, AnyResponse) assert isinstance(deserialized.response, BrowserResponse) assert deserialized.response.html == browser_response.html assert deserialized.response.status == browser_response.status _assert_urls_equal(deserialized.response.url, browser_response.url) def test_serialization_leaf_unsupported() -> None: class A: pass with pytest.raises( NotImplementedError, match=r"Serialization .+ is not implemented" ): serialize_leaf(A()) with pytest.raises( NotImplementedError, match=r"Deserialization .+ is not implemented" ): deserialize_leaf(A, {}) def test_serialization(book_list_html_response) -> None: @attrs.define class ResponseData(Injectable): response: HttpResponse @attrs.define class MyWebPage(ItemPage): response: HttpResponse url: ResponseUrl request_url: RequestUrl params: PageParams data: ResponseData stats: Stats url_str = "http://books.toscrape.com/index.html" url = ResponseUrl(url_str) request_url = RequestUrl(url_str) page_params = PageParams(foo="bar") stats = Stats() serialized_deps = serialize( [book_list_html_response, url, request_url, page_params, stats] ) info_json = f"""{{ "_encoding": "utf-8", "headers": [], "status": null, "type": "HttpResponse", "url": "{url_str}" }}""".encode() assert serialized_deps == { "HttpResponse": { "body.html": bytes(book_list_html_response.body), "info.json": info_json, }, "ResponseUrl": { "txt": url_str.encode(), }, "RequestUrl": { "txt": url_str.encode(), }, "PageParams": { "json": 
b'{\n "foo": "bar"\n}', }, "Stats": {}, } po = MyWebPage( book_list_html_response, url, request_url, page_params, ResponseData(book_list_html_response), Stats(), ) deserialized_po = deserialize(MyWebPage, serialized_deps) _assert_pages_equal(po, deserialized_po) assert deserialized_po.data is not None def test_serialization_injectable(book_list_html_response) -> None: with pytest.raises(ValueError, match=r"Injectable type .+ passed"): serialize([WebPage(book_list_html_response)]) def test_serialization_httpresponse_encoding(book_list_html) -> None: body = HttpResponseBody(bytes(book_list_html, "utf-8")) resp_enc = HttpResponse( url=ResponseUrl("http://books.toscrape.com/index.html"), body=body, encoding="utf-8", ) assert resp_enc._encoding == "utf-8" deserialized_resp_enc = deserialize_leaf(HttpResponse, serialize_leaf(resp_enc)) assert deserialized_resp_enc._encoding == "utf-8" resp_noenc = HttpResponse( url=ResponseUrl("http://books.toscrape.com/index.html"), body=body ) assert resp_noenc._encoding is None deserialized_resp_noenc = deserialize_leaf(HttpResponse, serialize_leaf(resp_noenc)) assert deserialized_resp_noenc._encoding is None def test_custom_functions() -> None: class C: value: int def __init__(self, value: int): self.value = value def _serialize(o: C) -> SerializedLeafData: return {"bin": o.value.to_bytes((o.value.bit_length() + 7) // 8, "little")} def _deserialize(t: type[C], data: SerializedLeafData) -> C: return t(int.from_bytes(data["bin"], "little")) register_serialization(_serialize, _deserialize) obj = C(22222222222) deserialized_obj = deserialize_leaf(C, serialize_leaf(obj)) assert obj.value == deserialized_obj.value def test_write_data(book_list_html_response, tmp_path) -> None: @attrs.define class MyWebPage(ItemPage): response: HttpResponse url: ResponseUrl url = ResponseUrl("http://example.com") directory = tmp_path / "ser" directory.mkdir() storage = SerializedDataFileStorage(directory) serialized_deps = 
serialize([book_list_html_response, url]) storage.write(serialized_deps) assert (directory / "HttpResponse-body.html").exists() assert (directory / "HttpResponse-body.html").read_bytes() == bytes( book_list_html_response.body ) assert (directory / "HttpResponse-info.json").exists() assert (directory / "ResponseUrl.txt").exists() assert (directory / "ResponseUrl.txt").read_text( encoding="utf-8" ) == "http://example.com" read_serialized_deps = storage.read() po = MyWebPage(book_list_html_response, url) deserialized_po = deserialize(MyWebPage, read_serialized_deps) assert type(deserialized_po) == MyWebPage # noqa: E721 _assert_pages_equal(po, deserialized_po) def test_extra_files(book_list_html_response, tmp_path) -> None: directory = tmp_path / "ser" directory.mkdir() storage = SerializedDataFileStorage(directory) serialized_deps = serialize([book_list_html_response]) storage.write(serialized_deps) (directory / "foo.dir").mkdir() (directory / "bar.txt").touch() read_serialized_deps = storage.read() assert "HttpResponse" in read_serialized_deps def test_httpclient_empty(tmp_path) -> None: directory = tmp_path / "ser" directory.mkdir() client = HttpClient() storage = SerializedDataFileStorage(directory) serialized_deps = serialize([client]) storage.write(serialized_deps) assert (directory / "HttpClient.exists").exists() read_serialized_deps = storage.read() assert "HttpClient" in read_serialized_deps def test_annotated(book_list_html_response) -> None: @attrs.define class MyWebPage(ItemPage): response: Annotated[HttpResponse, "foo", 42] url: ResponseUrl url_str = "http://books.toscrape.com/index.html" url = ResponseUrl(url_str) serialized_deps = serialize( [AnnotatedInstance(book_list_html_response, ("foo", 42)), url] ) po = MyWebPage( book_list_html_response, url, ) deserialized_po = deserialize(MyWebPage, serialized_deps) _assert_pages_equal(po, deserialized_po) def test_annotated_duplicate(book_list_html_response) -> None: url_str = 
"http://books.toscrape.com/index.html" url = ResponseUrl(url_str) with pytest.raises( ValueError, match="Several instances of AnnotatedInstance for HttpResponse were" ): serialize( [ AnnotatedInstance(book_list_html_response, ("foo", 42)), AnnotatedInstance(book_list_html_response, ("bar",)), url, ] ) @pytest.mark.parametrize( "raw_annotation", [ {"foo": "bar"}, ["foo", "bar"], ], ) def test_annotation_codec(raw_annotation): encoded_annotation = annotation_encode(raw_annotation) assert annotation_decode(encoded_annotation) == raw_annotation obj = ResponseUrl("http://example.com") metadata = (encoded_annotation,) serialized = serialize([AnnotatedInstance(obj, metadata)]) expected_key = "AnnotatedInstance ResponseUrl" assert expected_key in serialized deserialized_instance = deserialize_leaf( AnnotatedInstance, serialized[expected_key] ) assert isinstance(deserialized_instance, AnnotatedInstance) assert str(deserialized_instance.result) == str(obj) assert metadata == deserialized_instance.metadata scrapinghub-web-poet-ba87b95/tests/test_stats.py000066400000000000000000000012541517167256700221160ustar00rootroot00000000000000from web_poet.page_inputs import Stats from web_poet.page_inputs.stats import DictStatCollector, DummyStatCollector def test_stats_writes_to_dummy_collector(): stats = Stats() stats.set("a", "1") stats.set("b", 2) stats.inc("b") stats.inc("b", 5) stats.inc("c") assert isinstance(stats._stats, DummyStatCollector) assert stats._stats._stats == {"a": "1", "b": 8, "c": 1} def test_dict_stat_collector_data_returns_dict(): collector = DictStatCollector() collector.set("latest", "ok") collector.inc("hits") collector.inc("hits", 2) assert isinstance(collector.data, dict) assert collector.data == {"latest": "ok", "hits": 3} scrapinghub-web-poet-ba87b95/tests/test_testing.py000066400000000000000000000167571517167256700224530ustar00rootroot00000000000000from __future__ import annotations import json from collections import deque from typing import 
TYPE_CHECKING, Any import pytest from itemadapter import ItemAdapter from itemadapter.adapter import DictAdapter from web_poet import HttpResponse, WebPage from web_poet.testing import Fixture from web_poet.testing.__main__ import main as cli_main from web_poet.testing.fixture import INPUT_DIR_NAME, META_FILE_NAME, OUTPUT_FILE_NAME from web_poet.utils import get_fq_class_name if TYPE_CHECKING: from pathlib import Path def test_save_fixture(book_list_html_response, tmp_path) -> None: base_dir = tmp_path / "fixtures" / "some.po" item = {"foo": "bar"} meta = {"foo": "bar", "frozen_time": "2022-01-01"} def _assert_fixture_files( directory: Path, expected_meta: dict | None = None ) -> None: input_dir = directory / INPUT_DIR_NAME assert (input_dir / "HttpResponse-body.html").exists() assert (input_dir / "HttpResponse-body.html").read_bytes() == bytes( book_list_html_response.body ) assert (input_dir / "HttpResponse-info.json").exists() assert (directory / OUTPUT_FILE_NAME).exists() assert json.loads((directory / OUTPUT_FILE_NAME).read_bytes()) == item if expected_meta: assert ( json.loads((directory / META_FILE_NAME).read_bytes()) == expected_meta ) else: assert not (directory / META_FILE_NAME).exists() Fixture.save(base_dir, inputs=[book_list_html_response], item=item) _assert_fixture_files(base_dir / "test-1") Fixture.save( base_dir, inputs=[book_list_html_response], item=item, fixture_name="custom" ) _assert_fixture_files(base_dir / "custom") Fixture.save(base_dir, inputs=[book_list_html_response], item=item, meta=meta) _assert_fixture_files(base_dir / "test-2", expected_meta=meta) def test_save_fixture_unicode_item(book_list_html_response, tmp_path) -> None: base_dir = tmp_path / "fixtures" / "some.po" item = {"foo": "✓bar£"} Fixture.save(base_dir, inputs=[book_list_html_response], item=item) fixture = Fixture(base_dir / "test-1") assert json.loads(fixture.output_path.read_bytes()) == item assert fixture.get_expected_output() == item def 
test_save_fixture_unicode_exception(book_list_html_response, tmp_path) -> None: base_dir = tmp_path / "fixtures" / "some.po" exc = ValueError("✓bar£") Fixture.save(base_dir, inputs=[book_list_html_response], exception=exc) fixture = Fixture(base_dir / "test-1") exc_data = json.loads(fixture.exception_path.read_bytes()) assert exc_data == {"import_path": "builtins.ValueError", "msg": "✓bar£"} expected_exc = fixture.get_expected_exception() assert type(expected_exc) is ValueError assert expected_exc.args == ("✓bar£",) def test_save_fixture_unicode_meta(book_list_html_response, tmp_path) -> None: base_dir = tmp_path / "fixtures" / "some.po" item = {"foo": "bar"} meta = {"foo": "✓bar£", "frozen_time": "2022-01-01"} Fixture.save(base_dir, inputs=[book_list_html_response], item=item, meta=meta) fixture = Fixture(base_dir / "test-1") meta_data = json.loads(fixture.meta_path.read_bytes()) assert meta_data == meta assert fixture.get_meta() == meta class MyItemPage(WebPage): async def to_item(self) -> dict: return {"foo": "bar"} class CapitalizingDictAdapter(DictAdapter): def __getitem__(self, field_name: str) -> Any: item = super().__getitem__(field_name) if isinstance(item, str): return item.capitalize() return item class CustomItemAdapter(ItemAdapter): ADAPTER_CLASSES = deque([CapitalizingDictAdapter]) def test_fixture_adapter(book_list_html_response, tmp_path) -> None: item = {"foo": "bar"} meta = {"adapter": CustomItemAdapter} base_dir = tmp_path / "fixtures" / get_fq_class_name(MyItemPage) fixture = Fixture.save( base_dir, inputs=[book_list_html_response], item=item, meta=meta ) saved_output = json.loads(fixture.output_path.read_bytes()) assert saved_output["foo"] == "Bar" loaded_fixture = Fixture(base_dir / "test-1") page_output = loaded_fixture.get_output(MyItemPage) assert page_output["foo"] == "Bar" actual_output = loaded_fixture.get_expected_output() assert actual_output["foo"] == "Bar" @pytest.mark.parametrize("dir_name", [get_fq_class_name(MyItemPage), 
"unrelated"]) def test_fixture_asserts( book_list_html_response: HttpResponse, tmp_path: Path, dir_name: str ) -> None: item = {"foo": "bar"} base_dir = tmp_path / "fixtures" / dir_name Fixture.save(base_dir, inputs=[book_list_html_response], item=item) loaded_fixture = Fixture(base_dir / "test-1") loaded_fixture.assert_full_item_correct(MyItemPage) loaded_fixture.assert_field_correct("foo", MyItemPage) loaded_fixture.assert_no_extra_fields(MyItemPage) loaded_fixture.assert_no_toitem_exceptions(MyItemPage) with pytest.raises(KeyError): loaded_fixture.assert_field_correct("bar", MyItemPage) class MyItemPage3(WebPage): async def to_item(self) -> dict: return {"foo": "bar", "egg": "spam", "hello": "world"} def test_cli_rerun( tmp_path: Path, book_list_html_response: HttpResponse, capsys: pytest.CaptureFixture[str], ) -> None: base_dir = tmp_path / "fixtures" / get_fq_class_name(MyItemPage3) fixture = Fixture.save( base_dir, inputs=[book_list_html_response], item={"foo": "bar2", "egg": "spam", "hello": "world"}, ) cli_main(["rerun", str(fixture.path)]) captured = capsys.readouterr() assert not captured.err assert json.loads(captured.out) == {"foo": "bar", "egg": "spam", "hello": "world"} def test_cli_rerun_fields( tmp_path: Path, book_list_html_response: HttpResponse, capsys: pytest.CaptureFixture[str], ) -> None: base_dir = tmp_path / "fixtures" / get_fq_class_name(MyItemPage3) fixture = Fixture.save( base_dir, inputs=[book_list_html_response], item={"foo": "bar2", "egg": "spam", "hello": "world"}, ) cli_main(["rerun", str(fixture.path), "--fields=foo,egg"]) captured = capsys.readouterr() assert not captured.err assert json.loads(captured.out) == {"foo": "bar", "egg": "spam"} def test_cli_rerun_fields_unknown_names( tmp_path: Path, book_list_html_response: HttpResponse, capsys: pytest.CaptureFixture[str], ) -> None: base_dir = tmp_path / "fixtures" / get_fq_class_name(MyItemPage3) fixture = Fixture.save( base_dir, inputs=[book_list_html_response], item={"foo": "bar2", 
"egg": "spam", "hello": "world"}, ) cli_main(["rerun", str(fixture.path), "--fields=foo,egg2"]) captured = capsys.readouterr() assert ( "Unknown field names: ['egg2']. Allowed names are: ['egg', 'foo', 'hello']" in captured.err ) assert json.loads(captured.out) == {"foo": "bar"} def test_cli_rerun_standalone_fixture( tmp_path: Path, book_list_html_response: HttpResponse, capsys: pytest.CaptureFixture[str], ) -> None: base_dir = tmp_path / "fixtures" / "unrelated" fixture = Fixture.save( base_dir, inputs=[book_list_html_response], item={"foo": "bar2", "egg": "spam", "hello": "world"}, ) cli_main(["rerun", str(fixture.path), "-p", get_fq_class_name(MyItemPage3)]) captured = capsys.readouterr() assert not captured.err assert json.loads(captured.out) == {"foo": "bar", "egg": "spam", "hello": "world"} scrapinghub-web-poet-ba87b95/tests/test_testing_pytest.py000066400000000000000000000464331517167256700240550ustar00rootroot00000000000000from __future__ import annotations import datetime import json from typing import TYPE_CHECKING, Annotated from zoneinfo import ZoneInfo import attrs import dateutil.tz import pytest import time_machine from itemadapter import ItemAdapter from parsel import Selector from zyte_common_items import Item, Metadata, Product from web_poet import HttpClient, HttpRequest, HttpResponse, WebPage, field from web_poet.annotated import AnnotatedInstance from web_poet.exceptions import HttpRequestError, HttpResponseError, Retry, UseFallback from web_poet.page_inputs.client import _SavedResponseData from web_poet.page_inputs.url import RequestUrl from web_poet.testing import Fixture from web_poet.testing.fixture import INPUT_DIR_NAME, META_FILE_NAME, OUTPUT_FILE_NAME from web_poet.utils import get_fq_class_name if TYPE_CHECKING: from pathlib import Path N_TESTS = len(attrs.fields(Product)) + 2 class MyItemPage(WebPage): async def to_item(self) -> dict: return {"foo": "bar"} def _save_fixture( pytester, page_cls, page_inputs, *, expected_output=None, 
expected_exception=None ): base_dir = pytester.path / "fixtures" / get_fq_class_name(page_cls) return Fixture.save( base_dir, inputs=page_inputs, item=expected_output, exception=expected_exception ) def test_pytest_plugin_pass(pytester, book_list_html_response) -> None: _save_fixture( pytester, page_cls=MyItemPage, page_inputs=[book_list_html_response], expected_output={"foo": "bar"}, ) result = pytester.runpytest() result.assert_outcomes(passed=3) def test_pytest_plugin_bad_field_value(pytester, book_list_html_response) -> None: _save_fixture( pytester, page_cls=MyItemPage, page_inputs=[book_list_html_response], expected_output={"foo": "not bar"}, ) result = pytester.runpytest() result.assert_outcomes(failed=1, passed=2) result.stdout.fnmatch_lines("item.foo is not correct*") class MyItemPage2(WebPage): async def to_item(self) -> dict: return {"foo": None} def test_pytest_plugin_bad_field_value_None(pytester, book_list_html_response) -> None: _save_fixture( pytester, page_cls=MyItemPage2, page_inputs=[book_list_html_response], expected_output={"foo": "bar"}, ) result = pytester.runpytest() result.assert_outcomes(failed=1, passed=2) result.stdout.fnmatch_lines("item.foo is not correct*") result.stdout.fnmatch_lines("Expected: 'bar', got: None*") def test_pytest_plugin_missing_field(pytester, book_list_html_response) -> None: _save_fixture( pytester, page_cls=MyItemPage, page_inputs=[book_list_html_response], expected_output={"foo": "bar", "foo2": "bar2"}, ) result = pytester.runpytest() result.assert_outcomes(failed=1, passed=3) result.stdout.fnmatch_lines("item.foo2 is missing*") def test_pytest_plugin_extra_field(pytester, book_list_html_response) -> None: _save_fixture( pytester, page_cls=MyItemPage, page_inputs=[book_list_html_response], expected_output={"foo2": "bar2"}, ) result = pytester.runpytest() result.assert_outcomes(failed=2, passed=1) result.stdout.fnmatch_lines("item.foo2 is missing*") result.stdout.fnmatch_lines("*unexpected fields*") 
result.stdout.fnmatch_lines("*foo = 'bar'*") class FieldExceptionPage(WebPage): @field def foo(self): return "foo" @field def bar(self): raise Exception def test_pytest_plugin_field_exception(pytester, book_list_html_response) -> None: _save_fixture( pytester, page_cls=FieldExceptionPage, page_inputs=[book_list_html_response], expected_output={"foo": "foo", "bar": "bar"}, ) result = pytester.runpytest() result.assert_outcomes(failed=1, skipped=3) result.stdout.fnmatch_lines("*FAILED*TO_ITEM_DOESNT_RAISE*") def test_pytest_plugin_compare_item(pytester, book_list_html_response) -> None: _save_fixture( pytester, page_cls=MyItemPage, page_inputs=[book_list_html_response], expected_output={"foo": "bar"}, ) result = pytester.runpytest("--web-poet-test-per-item") result.assert_outcomes(passed=1) def test_pytest_plugin_compare_item_unformatted_output( pytester, book_list_html_response ) -> None: _save_fixture( pytester, page_cls=MyItemPage, page_inputs=[book_list_html_response], expected_output={"foo": "bar"}, ) base_dir = pytester.path / "fixtures" / get_fq_class_name(MyItemPage) fixture = Fixture(base_dir / "test-1") fixture.output_path.write_text('{"foo":"bar"}') result = pytester.runpytest("--web-poet-test-per-item") result.assert_outcomes(passed=1) def test_pytest_plugin_compare_item_fail(pytester, book_list_html_response) -> None: _save_fixture( pytester, page_cls=MyItemPage, page_inputs=[book_list_html_response], expected_output={"foo": "not bar"}, ) result = pytester.runpytest("--web-poet-test-per-item", "-vv") result.assert_outcomes(passed=0, failed=1) result.stdout.fnmatch_lines( "*The output doesn't match.\n" '\'{\\n "foo": "bar"\\n}\' == \'{\\n "foo": "not bar"\\n}\'\n' "*" " {\n" '- "foo": "not bar"\n' "? 
----\n" '+ "foo": "bar"\n' " }*" ) @attrs.define(kw_only=True) class MetadataLocalTime(Metadata): dateDownloadedLocal: str | None = None @attrs.define(kw_only=True) class ProductLocalTime(Product): # in newer zyte-common-items this should inherit from ProductMetadata metadata: MetadataLocalTime | None # type: ignore[assignment] def _get_product_item(date: datetime.datetime) -> ProductLocalTime: if date.tzinfo is None: # convert to the aware object so that date_local_str always includes the offset date = date.astimezone() date_str = date.astimezone(dateutil.tz.UTC).strftime("%Y-%M-%dT%H:%M:%SZ") date_local_str = date.strftime("%Y-%M-%dT%H:%M:%S%z") return ProductLocalTime( url="http://example.com", name="foo", metadata=MetadataLocalTime( dateDownloaded=date_str, dateDownloadedLocal=date_local_str, # type: ignore[call-arg] ), ) class DateItemPage(WebPage): async def to_item(self) -> Item: date = datetime.datetime.now().astimezone() return _get_product_item(date) def _assert_frozen_item( frozen_time: datetime.datetime, pytester: pytest.Pytester, response: HttpResponse, *, outcomes: dict[str, int] | None = None, ) -> None: # this makes an item with datetime fields corresponding to frozen_time item = ItemAdapter(_get_product_item(frozen_time)).asdict() # this marks the fixture to be run under frozen_time meta = {"frozen_time": frozen_time.strftime("%Y-%m-%d %H:%M:%S %z")} base_dir = pytester.path / "fixtures" / get_fq_class_name(DateItemPage) Fixture.save(base_dir, inputs=[response], item=item, meta=meta) # this runs the test, faking the time and the timezone from frozen_time, # the result should contain frozen_time in the datetime fields result = pytester.runpytest() if outcomes is None: outcomes = {"passed": N_TESTS} result.assert_outcomes(**outcomes) @pytest.mark.xfail(not time_machine.HAVE_TZSET, reason="Works on Windows only in UTC") def test_pytest_frozen_time_utc(pytester, book_list_html_response) -> None: frozen_time = datetime.datetime(2022, 3, 4, 20, 21, 22, 
tzinfo=dateutil.tz.UTC) _assert_frozen_item(frozen_time, pytester, book_list_html_response) def test_pytest_frozen_time_naive(pytester, book_list_html_response) -> None: frozen_time = datetime.datetime(2022, 3, 4, 20, 21, 22) _assert_frozen_item(frozen_time, pytester, book_list_html_response) @pytest.mark.skipif(not time_machine.HAVE_TZSET, reason="Not supported on Windows") @pytest.mark.parametrize("offset", [-5, 0, 8]) def test_pytest_frozen_time_tz(pytester, book_list_html_response, offset) -> None: tzinfo = ZoneInfo(f"Etc/GMT{-offset:+d}") frozen_time = datetime.datetime(2022, 3, 4, 20, 21, 22, tzinfo=tzinfo) _assert_frozen_item(frozen_time, pytester, book_list_html_response) @pytest.mark.skipif(time_machine.HAVE_TZSET, reason="Tests Windows-specific code") def test_pytest_frozen_time_tz_windows_fail(pytester, book_list_html_response) -> None: frozen_time = datetime.datetime( 2022, 3, 4, 20, 21, 22, tzinfo=dateutil.tz.tzoffset(None, -7.5 * 3600) ) _assert_frozen_item( frozen_time, pytester, book_list_html_response, outcomes={"failed": 1, "passed": N_TESTS - 1}, ) @pytest.mark.skipif(time_machine.HAVE_TZSET, reason="Tests Windows-specific code") def test_pytest_frozen_time_tz_windows_pass(pytester, book_list_html_response) -> None: frozen_time = datetime.datetime( 2022, 3, 4, 20, 21, 22, tzinfo=dateutil.tz.tzlocal() ) _assert_frozen_item(frozen_time, pytester, book_list_html_response) @attrs.define class ClientPage(WebPage): client: HttpClient async def to_item(self) -> dict: resp1 = await self.client.get("http://books.toscrape.com/1.html") resp2 = await self.client.post("http://books.toscrape.com/2.html", body=b"post") return {"foo": "bar", "additional": [resp1.body.decode(), resp2.body.decode()]} def test_httpclient(pytester, book_list_html_response) -> None: url1 = "http://books.toscrape.com/1.html" request1 = HttpRequest(url1) response1 = HttpResponse(url=url1, body=b"body1", encoding="utf-8") url2 = "http://books.toscrape.com/2.html" request2 = 
HttpRequest(url2, method="POST", body=b"post") response2 = HttpResponse(url=url2, body=b"body2", encoding="utf-8") responses = [ _SavedResponseData(request1, response1), _SavedResponseData(request2, response2), ] client = HttpClient(responses=responses) base_dir = pytester.path / "fixtures" / get_fq_class_name(ClientPage) item = { "foo": "bar", "additional": ["body1", "body2"], } Fixture.save(base_dir, inputs=[book_list_html_response, client], item=item) input_dir = base_dir / "test-1" / INPUT_DIR_NAME assert (input_dir / "HttpResponse-body.html").read_bytes() == bytes( book_list_html_response.body ) assert (input_dir / "HttpClient-0-HttpRequest.info.json").exists() assert (input_dir / "HttpClient-0-HttpResponse.info.json").exists() assert (input_dir / "HttpClient-0-HttpResponse.body.html").read_bytes() == b"body1" assert (input_dir / "HttpClient-1-HttpResponse.body.html").read_bytes() == b"body2" result = pytester.runpytest() result.assert_outcomes(passed=4) def test_httpclient_no_response(pytester, book_list_html_response) -> None: url = "http://books.toscrape.com/1.html" request = HttpRequest(url) response = HttpResponse(url=url, body=b"body1", encoding="utf-8") responses = [ _SavedResponseData(request, response), ] client = HttpClient(responses=responses) item = { "foo": "bar", "additional": ["body1", "body2"], } _save_fixture( pytester, page_cls=ClientPage, page_inputs=[book_list_html_response, client], expected_output=item, ) result = pytester.runpytest() result.assert_outcomes(failed=1, skipped=3) @attrs.define class ClientResponseErrorPage(WebPage): client: HttpClient async def to_item(self) -> dict: msg = "" try: await self.client.get("http://books.toscrape.com/1.html") except HttpResponseError as ex: msg = ex.args[0] return {"foo": "bar", "exception": msg} def test_httpclient_response_exception(pytester, book_list_html_response) -> None: url = "http://books.toscrape.com/1.html" request = HttpRequest(url) response = HttpResponse(url=url, body=b"body1", 
status=404, encoding="utf-8") responses = [ _SavedResponseData(request, response), ] client = HttpClient(responses=responses) item = { "foo": "bar", "exception": "404 NOT_FOUND response for http://books.toscrape.com/1.html", } _save_fixture( pytester, page_cls=ClientResponseErrorPage, page_inputs=[book_list_html_response, client], expected_output=item, ) result = pytester.runpytest() result.assert_outcomes(passed=4) @attrs.define class ClientRequestErrorPage(WebPage): client: HttpClient async def to_item(self) -> dict: msg = "" try: await self.client.get("http://books.toscrape.com/1.html") except HttpRequestError as ex: msg = ex.args[0] return {"foo": "bar", "exception": msg} def test_httpclient_request_exception(pytester, book_list_html_response) -> None: url = "http://books.toscrape.com/1.html" request = HttpRequest(url) exception = HttpRequestError("Bad Request", request) responses = [ _SavedResponseData(request, None, exception), ] client = HttpClient(responses=responses) item = { "foo": "bar", "exception": "Bad Request", } fixture = _save_fixture( pytester, page_cls=ClientRequestErrorPage, page_inputs=[book_list_html_response, client], expected_output=item, ) assert (fixture.input_path / "HttpClient-0-exception.json").exists() result = pytester.runpytest() result.assert_outcomes(passed=4) class RetryItemPage(WebPage): async def to_item(self): raise Retry def test_page_object_exception_pass(pytester, book_list_html_response) -> None: fixture = _save_fixture( pytester, page_cls=RetryItemPage, page_inputs=[book_list_html_response], expected_exception=Retry(), ) assert fixture.exception_path.exists() result = pytester.runpytest() result.assert_outcomes(passed=1) def test_page_object_exception_wrong(pytester, book_list_html_response) -> None: fixture = _save_fixture( pytester, page_cls=RetryItemPage, page_inputs=[book_list_html_response], expected_exception=UseFallback(), ) assert fixture.exception_path.exists() result = pytester.runpytest() 
result.assert_outcomes(failed=1) def test_page_object_exception_none(pytester, book_list_html_response) -> None: fixture = _save_fixture( pytester, page_cls=WebPage, page_inputs=[book_list_html_response], expected_exception=Retry(), ) assert fixture.exception_path.exists() result = pytester.runpytest() result.assert_outcomes(failed=1) @attrs.define(kw_only=True) class MyAnnotatedItemPage(MyItemPage): response: Annotated[HttpResponse, "foo", 42] async def to_item(self) -> dict: return {"foo": "bar"} def test_annotated(pytester, book_list_html_response) -> None: _save_fixture( pytester, page_cls=MyAnnotatedItemPage, page_inputs=[AnnotatedInstance(book_list_html_response, ("foo", 42))], expected_output={"foo": "bar"}, ) result = pytester.runpytest() result.assert_outcomes(passed=3) def test_request_url_output_serialization(book_list_html_response, tmp_path) -> None: base_dir = tmp_path / "fixtures" / "some.po" item = {"foo": RequestUrl("https://books.toscrape.com/")} item_json = {"foo": "https://books.toscrape.com/"} def _assert_fixture_files( directory: Path, expected_meta: dict | None = None ) -> None: input_dir = directory / INPUT_DIR_NAME assert (input_dir / "HttpResponse-body.html").exists() assert (input_dir / "HttpResponse-body.html").read_bytes() == bytes( book_list_html_response.body ) assert (input_dir / "HttpResponse-info.json").exists() assert (directory / OUTPUT_FILE_NAME).exists() assert json.loads((directory / OUTPUT_FILE_NAME).read_bytes()) == item_json if expected_meta: assert ( json.loads((directory / META_FILE_NAME).read_bytes()) == expected_meta ) else: assert not (directory / META_FILE_NAME).exists() Fixture.save(base_dir, inputs=[book_list_html_response], item=item) _assert_fixture_files(base_dir / "test-1") def test_unserializable(book_list_html_response, tmp_path) -> None: class Foo: pass base_dir = tmp_path / "fixtures" / "some.po" item = {"foo": Foo()} with pytest.raises(TypeError): Fixture.save(base_dir, inputs=[book_list_html_response], 
item=item) @pytest.mark.parametrize("expected", ["bar", "not bar"]) def test_junitxml( pytester: pytest.Pytester, book_list_html_response: HttpResponse, expected: str ) -> None: _save_fixture( pytester, page_cls=MyItemPage, page_inputs=[book_list_html_response], expected_output={"foo": expected}, ) report_name = "junit.xml" result = pytester.runpytest( f"--junitxml={report_name}", "-o", "junit_family=legacy" ) if expected == "bar": result.assert_outcomes(passed=3) else: result.assert_outcomes(failed=1, passed=2) report_path = pytester.path / report_name assert report_path.exists() sel = Selector(report_path.read_text(encoding="utf-8")) testcases = sel.xpath( "//testcase[@classname='fixtures.tests.test_testing_pytest.MyItemPage.test-1.output.json' and @name='foo']" ) assert len(testcases) == 1 testcase = testcases[0] expected_node = testcase.xpath( "properties/property[@name='web_poet_expected_value']" )[0] assert expected_node.xpath("@value").get() == f'"{expected}"' actual_node = testcase.xpath("properties/property[@name='web_poet_actual_value']")[ 0 ] assert actual_node.xpath("@value").get() == '"bar"' @pytest.mark.parametrize("expected", [Retry, UseFallback]) def test_junitxml_expected_exception( pytester: pytest.Pytester, book_list_html_response: HttpResponse, expected: type[Exception], ) -> None: _save_fixture( pytester, page_cls=RetryItemPage, page_inputs=[book_list_html_response], expected_exception=expected(), ) report_name = "junit.xml" result = pytester.runpytest( f"--junitxml={report_name}", "-o", "junit_family=legacy" ) if expected is Retry: result.assert_outcomes(passed=1) else: result.assert_outcomes(failed=1) report_path = pytester.path / report_name assert report_path.exists() sel = Selector(report_path.read_text(encoding="utf-8")) testcases = sel.xpath( "//testcase[@classname='fixtures.tests.test_testing_pytest.RetryItemPage.test-1.exception.json' and @name='TO_ITEM_RAISES']" ) assert len(testcases) == 1 testcase = testcases[0] expected_node = 
testcase.xpath( "properties/property[@name='web_poet_expected_exception']" )[0] expected_json = expected_node.xpath("@value").get() assert expected_json expected_data = json.loads(expected_json) assert expected_data == {"import_path": get_fq_class_name(expected), "msg": None} actual_node = testcase.xpath( "properties/property[@name='web_poet_actual_exception']" )[0] actual_json = actual_node.xpath("@value").get() assert actual_json actual_data = json.loads(actual_json) assert actual_data == {"import_path": "web_poet.exceptions.core.Retry", "msg": None} scrapinghub-web-poet-ba87b95/tests/test_url.py000066400000000000000000000023121517167256700215560ustar00rootroot00000000000000import pytest from web_poet import RequestUrl, ResponseUrl from web_poet.page_inputs.url import _Url def test_url_base_class() -> None: url_str = "http://example.com" url = _Url(url_str) assert str(url) == url_str assert repr(url) == "_Url('http://example.com')" def test_url_init_validation() -> None: with pytest.raises(TypeError): _Url(123) # type: ignore[arg-type] def test_url_subclasses() -> None: url_str = "http://example.com" class MyUrl(_Url): pass class MyUrl2(_Url): pass url = MyUrl(url_str) assert str(url) == url_str assert url._url == url_str assert repr(url) == "MyUrl('http://example.com')" url2 = MyUrl2(url) assert str(url2) == str(url) @pytest.mark.parametrize("url_cls", [_Url, RequestUrl, ResponseUrl]) def test_str_equality(url_cls) -> None: url_str = "http://example.com#foo" url = url_cls(url_str) assert url != url_str assert str(url) == url_str def test_url_classes_eq() -> None: url_str = "http://example.com#foo" request_url = RequestUrl(url_str) response_url = ResponseUrl(url_str) assert request_url != response_url assert str(request_url) == str(response_url) scrapinghub-web-poet-ba87b95/tests/test_utils.py000066400000000000000000000320611517167256700221200ustar00rootroot00000000000000from __future__ import annotations import asyncio import inspect import random import 
warnings from typing import Any, Generic, TypeVar from unittest import mock import pytest from web_poet.utils import ( _create_deprecated_class, cached_method, ensure_awaitable, get_generic_param, ) class SomeBaseClass: pass class NewName(SomeBaseClass): pass def _mywarnings(w): return [x for x in w if x.category is DeprecationWarning] def test_no_warning_on_definition() -> None: with warnings.catch_warnings(record=True) as w: _create_deprecated_class("Deprecated", NewName) w = _mywarnings(w) assert w == [] def test_subclassing_warning_message() -> None: # https://github.com/python/mypy/issues/2477#issuecomment-262734005 # Annotating it with Any helps prevent mypy issues for dynamic classes Deprecated: Any = _create_deprecated_class("Deprecated", NewName) with warnings.catch_warnings(record=True) as w: class UserClass(Deprecated): pass w = _mywarnings(w) assert len(w) == 1 expected = ( f"{__name__}.{UserClass.__qualname__} inherits from deprecated class " f"{__name__}.Deprecated, please inherit from {__name__}.NewName. 
" f"(warning only on first subclass, there may be others)" ) assert str(w[0].message) == expected assert w[0].lineno == inspect.getsourcelines(UserClass)[1] def test_custom_class_paths() -> None: Deprecated: Any = _create_deprecated_class( "Deprecated", NewName, new_class_path="foo.NewClass", old_class_path="bar.OldClass", ) with warnings.catch_warnings(record=True) as w: class UserClass(Deprecated): pass _ = Deprecated() w = _mywarnings(w) assert len(w) == 2 assert "foo.NewClass" in str(w[0].message) assert "bar.OldClass" in str(w[0].message) assert "foo.NewClass" in str(w[1].message) assert "bar.OldClass" in str(w[1].message) def test_subclassing_warns_only_on_direct_childs() -> None: Deprecated: Any = _create_deprecated_class("Deprecated", NewName, warn_once=False) with warnings.catch_warnings(record=True) as w: class UserClass(Deprecated): pass class NoWarnOnMe(UserClass): pass w = _mywarnings(w) assert len(w) == 1 assert "UserClass" in str(w[0].message) def test_subclassing_warns_once_by_default() -> None: Deprecated: Any = _create_deprecated_class("Deprecated", NewName) with warnings.catch_warnings(record=True) as w: class UserClass(Deprecated): pass class FooClass(Deprecated): pass class BarClass(Deprecated): pass w = _mywarnings(w) assert len(w) == 1 assert "UserClass" in str(w[0].message) def test_warning_on_instance() -> None: Deprecated: Any = _create_deprecated_class("Deprecated", NewName) # ignore subclassing warnings with warnings.catch_warnings(): warnings.simplefilter("ignore", DeprecationWarning) class UserClass(Deprecated): pass with warnings.catch_warnings(record=True) as w: _, lineno = Deprecated(), inspect.getlineno(inspect.currentframe()) # type: ignore[arg-type] _ = UserClass() # subclass instances don't warn w = _mywarnings(w) assert len(w) == 1 expected = ( f"{__name__}.Deprecated is deprecated, instantiate {__name__}.NewName instead." 
) assert str(w[0].message) == expected assert w[0].lineno == lineno def test_warning_auto_message() -> None: with warnings.catch_warnings(record=True) as w: Deprecated: Any = _create_deprecated_class("Deprecated", NewName) class UserClass2(Deprecated): pass msg = str(w[0].message) assert f"{__name__}.NewName" in msg assert f"{__name__}.Deprecated" in msg def test_issubclass() -> None: with warnings.catch_warnings(): warnings.simplefilter("ignore", DeprecationWarning) DeprecatedName: Any = _create_deprecated_class("DeprecatedName", NewName) class UpdatedUserClass1(NewName): pass class UpdatedUserClass1a(NewName): pass class OutdatedUserClass1(DeprecatedName): pass class OutdatedUserClass1a(DeprecatedName): pass class UnrelatedClass: pass class OldStyleClass: pass assert issubclass(UpdatedUserClass1, NewName) assert issubclass(UpdatedUserClass1a, NewName) assert issubclass(UpdatedUserClass1, DeprecatedName) assert issubclass(UpdatedUserClass1a, DeprecatedName) assert issubclass(OutdatedUserClass1, DeprecatedName) assert not issubclass(UnrelatedClass, DeprecatedName) assert not issubclass(OldStyleClass, DeprecatedName) assert not issubclass(OldStyleClass, DeprecatedName) assert not issubclass(OutdatedUserClass1, OutdatedUserClass1a) assert not issubclass(OutdatedUserClass1a, OutdatedUserClass1) with pytest.raises(TypeError): issubclass(object(), DeprecatedName) # type: ignore[arg-type] def test_isinstance() -> None: with warnings.catch_warnings(): warnings.simplefilter("ignore", DeprecationWarning) DeprecatedName: Any = _create_deprecated_class("DeprecatedName", NewName) class UpdatedUserClass2(NewName): pass class UpdatedUserClass2a(NewName): pass class OutdatedUserClass2(DeprecatedName): pass class OutdatedUserClass2a(DeprecatedName): pass class UnrelatedClass: pass class OldStyleClass: pass assert isinstance(UpdatedUserClass2(), NewName) assert isinstance(UpdatedUserClass2a(), NewName) assert isinstance(UpdatedUserClass2(), DeprecatedName) assert 
isinstance(UpdatedUserClass2a(), DeprecatedName) assert isinstance(OutdatedUserClass2(), DeprecatedName) assert isinstance(OutdatedUserClass2a(), DeprecatedName) assert not isinstance(OutdatedUserClass2a(), OutdatedUserClass2) assert not isinstance(OutdatedUserClass2(), OutdatedUserClass2a) assert not isinstance(UnrelatedClass(), DeprecatedName) assert not isinstance(OldStyleClass(), DeprecatedName) def test_clsdict() -> None: with warnings.catch_warnings(): warnings.simplefilter("ignore", DeprecationWarning) Deprecated: Any = _create_deprecated_class( "Deprecated", NewName, {"foo": "bar"} ) assert Deprecated.foo == "bar" def test_deprecate_a_class_with_custom_metaclass() -> None: Meta1 = type("Meta1", (type,), {}) New = Meta1("New", (), {}) _create_deprecated_class("Deprecated", New) def test_deprecate_subclass_of_deprecated_class() -> None: with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") Deprecated: Any = _create_deprecated_class("Deprecated", NewName) AlsoDeprecated: Any = _create_deprecated_class( "AlsoDeprecated", Deprecated, new_class_path="foo.Bar" ) w = _mywarnings(w) assert len(w) == 0, str(map(str, w)) with warnings.catch_warnings(record=True) as w: AlsoDeprecated() class UserClass(AlsoDeprecated): pass w = _mywarnings(w) assert len(w) == 2 assert "AlsoDeprecated" in str(w[0].message) assert "foo.Bar" in str(w[0].message) assert "AlsoDeprecated" in str(w[1].message) assert "foo.Bar" in str(w[1].message) def test_inspect_stack() -> None: with ( mock.patch("inspect.stack", side_effect=IndexError), warnings.catch_warnings(record=True) as w, ): DeprecatedName: Any = _create_deprecated_class("DeprecatedName", NewName) class SubClass(DeprecatedName): pass assert "Error detecting parent module" in str(w[0].message) @pytest.mark.asyncio async def test_ensure_awaitable_sync() -> None: assert await ensure_awaitable(5) == 5 def foo(): return 42 assert await ensure_awaitable(foo()) == 42 @pytest.mark.asyncio async def 
test_ensure_awaitable_async() -> None: async def foo(): return 42 assert await ensure_awaitable(foo()) == 42 async def bar(): await asyncio.sleep(0.01) return 42 assert await ensure_awaitable(bar()) == 42 def test_cached_method_basic() -> None: class Foo: n_called = 0 def __init__(self, name): self.name = name @cached_method def meth(self): self.n_called += 1 return self.n_called, self.name foo = Foo("first") assert foo.meth() == (1, "first") assert foo.meth() == (1, "first") bar = Foo("second") assert bar.meth() == (1, "second") assert bar.meth() == (1, "second") @pytest.mark.asyncio async def test_cached_method_async() -> None: class Foo: n_called = 0 def __init__(self, name): self.name = name @cached_method async def meth(self): self.n_called += 1 return self.n_called, self.name foo = Foo("first") assert await foo.meth() == (1, "first") assert await foo.meth() == (1, "first") bar = Foo("second") assert await bar.meth() == (1, "second") assert await bar.meth() == (1, "second") def test_cached_method_argument() -> None: class Foo: n_called = 0 def __init__(self, name): self.name = name @cached_method def meth(self, x): self.n_called += 1 return self.n_called, self.name, x foo = Foo("first") assert foo.meth(5) == (1, "first", 5) assert foo.meth(5) == (1, "first", 5) assert foo.meth(6) == (2, "first", 6) assert foo.meth(6) == (2, "first", 6) @pytest.mark.asyncio async def test_cached_method_argument_async() -> None: class Foo: n_called = 0 def __init__(self, name): self.name = name @cached_method async def meth(self, x): self.n_called += 1 return self.n_called, self.name, x foo = Foo("first") assert await foo.meth(5) == (1, "first", 5) assert await foo.meth(5) == (1, "first", 5) assert await foo.meth(6) == (2, "first", 6) assert await foo.meth(6) == (2, "first", 6) def test_cached_method_unhashable() -> None: class Foo(list): n_called = 0 @cached_method def meth(self): self.n_called += 1 return self.n_called foo = Foo() assert foo.meth() == 1 assert foo.meth() == 1 
@pytest.mark.asyncio async def test_cached_method_unhashable_async() -> None: class Foo(list): n_called = 0 @cached_method async def meth(self): self.n_called += 1 return self.n_called foo = Foo() assert await foo.meth() == 1 assert await foo.meth() == 1 def test_cached_method_exception() -> None: class Error(Exception): pass class Foo(list): n_called = 0 @cached_method def meth(self): self.n_called += 1 raise Error foo = Foo() for idx in range(2): with pytest.raises(Error): foo.meth() assert foo.n_called == idx + 1 @pytest.mark.asyncio async def test_cached_method_exception_async() -> None: class Error(Exception): pass class Foo(list): n_called = 0 @cached_method async def meth(self): self.n_called += 1 raise Error foo = Foo() for idx in range(2): with pytest.raises(Error): await foo.meth() assert foo.n_called == idx + 1 @pytest.mark.asyncio async def test_cached_method_async_race() -> None: class Foo: _n_called = 0 @cached_method async def n_called(self): await asyncio.sleep(random.randint(0, 10) / 100.0) self._n_called += 1 return self._n_called foo = Foo() results = await asyncio.gather( foo.n_called(), foo.n_called(), foo.n_called(), foo.n_called(), foo.n_called(), ) assert results == [1, 1, 1, 1, 1] ItemT = TypeVar("ItemT") class Item: pass class Item2: pass class MyGeneric(Generic[ItemT]): pass class MyGeneric2(Generic[ItemT]): pass class Base(MyGeneric[ItemT]): pass class BaseSpecialized(MyGeneric[Item]): pass class BaseAny(MyGeneric): pass class Derived(Base): pass class Specialized(BaseSpecialized): pass class SpecializedAdditionalClass(BaseSpecialized, Item2): pass class SpecializedTwice(BaseSpecialized, Base[Item2]): pass class SpecializedTwoGenerics(MyGeneric2[Item2], BaseSpecialized): pass @pytest.mark.parametrize( ("cls", "param"), [ (MyGeneric, None), (Base, None), (BaseAny, None), (Derived, None), (BaseSpecialized, Item), (Specialized, Item), (SpecializedAdditionalClass, Item), (SpecializedTwice, Item2), (SpecializedTwoGenerics, Item), ], ) def 
test_get_generic_param(cls, param) -> None: assert get_generic_param(cls, expected=MyGeneric) == param scrapinghub-web-poet-ba87b95/tests_extra/000077500000000000000000000000001517167256700205505ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/tests_extra/__init__.py000066400000000000000000000002571517167256700226650ustar00rootroot00000000000000""" This test package was created separately to see the behavior of retrieving the Override rules declared on a registry where @handle_urls is defined on another package. """ scrapinghub-web-poet-ba87b95/tests_extra/po_lib_sub_not_imported/000077500000000000000000000000001517167256700254505ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/tests_extra/po_lib_sub_not_imported/__init__.py000066400000000000000000000014701517167256700275630ustar00rootroot00000000000000""" This package quite is similar to tests/po_lib_sub in terms of code contents. What we're ultimately trying to test here is to see if the `default_registry` captures the rules annotated in this module if it was not imported. """ from __future__ import annotations from typing import Any from url_matcher import Patterns from web_poet import ItemPage, handle_urls class POBase: expected_instead_of: type[ItemPage] expected_patterns: Patterns expected_meta: dict[str, Any] class POLibSubOverridenNotImported: ... 
@handle_urls("sub_not_imported.example", instead_of=POLibSubOverridenNotImported) class POLibSubNotImported(POBase): expected_instead_of = POLibSubOverridenNotImported expected_patterns = Patterns(["sub_not_imported.example"]) expected_to_return = None expected_meta = {} scrapinghub-web-poet-ba87b95/tests_typing/000077500000000000000000000000001517167256700207375ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/tests_typing/test_fields.mypy-testing000066400000000000000000000060521517167256700256420ustar00rootroot00000000000000import pytest import attrs from web_poet import ( ItemPage, field, item_from_fields, item_from_fields_sync, ) class Page(ItemPage): @field def name(self): return "hello" def process_price(value: float) -> float: return max([0, value]) class TypedPage(ItemPage): @field def description(self) -> str: return "hello" @field(out=[str.strip]) def name(self) -> str: return "hello" @field(out=[process_price, str]) def price(self) -> float: return 123.0 @field() def currency(self) -> str: return "$" @field async def adescription(self) -> str: return "hello" @field(out=[str.strip]) async def aname(self) -> str: return "hello" @field(out=[process_price, str]) async def aprice(self) -> float: return 123.0 @field() async def acurrency(self) -> str: return "$" @attrs.define class Item: name: str @pytest.mark.mypy_testing @pytest.mark.xfail def test_field_type_no_params() -> None: page = TypedPage() reveal_type(page.description) # R: builtins.str @pytest.mark.mypy_testing @pytest.mark.xfail def test_field_type() -> None: page = TypedPage() reveal_type(page.currency) # R: builtins.str @pytest.mark.mypy_testing @pytest.mark.xfail def test_field_type_out() -> None: page = TypedPage() reveal_type(page.name) # R: builtins.str @pytest.mark.mypy_testing @pytest.mark.xfail def test_field_type_changed_type() -> None: page = TypedPage() reveal_type(page.price) # R: builtins.str @pytest.mark.mypy_testing @pytest.mark.xfail async def 
test_field_type_no_params_async() -> None: page = TypedPage() reveal_type(await page.adescription) # R: builtins.str @pytest.mark.mypy_testing @pytest.mark.xfail async def test_field_type_async() -> None: page = TypedPage() reveal_type(await page.acurrency) # R: builtins.str @pytest.mark.mypy_testing @pytest.mark.xfail async def test_field_type_out_async() -> None: page = TypedPage() reveal_type(await page.name) # R: builtins.str @pytest.mark.mypy_testing @pytest.mark.xfail async def test_field_type_changed_type_async() -> None: page = TypedPage() reveal_type(await page.price) # R: builtins.str @pytest.mark.mypy_testing async def test_item_from_fields() -> None: page = Page() item1 = await item_from_fields(page, item_cls=dict) reveal_type(item1) # R: builtins.dict[Any, Any] item2 = await item_from_fields(page, item_cls=Item) reveal_type(item2) # R: __main__.Item @pytest.mark.mypy_testing def test_item_from_fields_sync() -> None: page = Page() item1 = item_from_fields_sync(page, item_cls=dict) reveal_type(item1) # R: builtins.dict[Any, Any] item2 = item_from_fields_sync(page, item_cls=Item) reveal_type(item2) # R: __main__.Item @pytest.mark.mypy_testing @pytest.mark.xfail async def test_item_from_fields_default_item_cls() -> None: page = Page() item1 = await item_from_fields(page) reveal_type(item1) # R: builtins.dict[Any, Any] scrapinghub-web-poet-ba87b95/tests_typing/test_item_page.mypy-testing000066400000000000000000000034401517167256700263240ustar00rootroot00000000000000import attr import pytest import attrs from web_poet import ItemPage, field, Returns from web_poet.pages import ItemT @attrs.define class Item: name: str @pytest.mark.mypy_testing @pytest.mark.xfail async def test_item_page() -> None: class MyPage(ItemPage): @field def name(self): return "hello" page = MyPage() item = await page.to_item() reveal_type(item) # R: dict @pytest.mark.mypy_testing async def test_item_page_parametrized() -> None: class MyPage(ItemPage[Item]): @field def name(self): 
return "hello" page = MyPage() item = await page.to_item() reveal_type(item) # R: __main__.Item @pytest.mark.mypy_testing async def test_item_page_parametrized_subclass() -> None: class BasePage(ItemPage[Item]): @field def name(self): return "hello" class Subclass(BasePage): pass page = Subclass() item = await page.to_item() reveal_type(item) # R: __main__.Item @pytest.mark.mypy_testing async def test_item_page_subclass_parametrized() -> None: class BasePage(ItemPage[ItemT]): @field def name(self): return "hello" class Subclass(BasePage[Item]): pass page = Subclass() item = await page.to_item() reveal_type(item) # R: __main__.Item @pytest.mark.mypy_testing @pytest.mark.xfail async def test_item_page_change_type() -> None: class BasePage(ItemPage[Item]): @field def name(self): return "hello" @attr.define class MyItem(Item): price: float class Subclass(BasePage, Returns[MyItem]): @field def price(self): return 123 page = Subclass() item = await page.to_item() reveal_type(item) # R: MyItem scrapinghub-web-poet-ba87b95/tests_typing/test_utils.mypy-testing000066400000000000000000000013541517167256700255340ustar00rootroot00000000000000import pytest from web_poet.utils import memoizemethod_noargs, cached_method @pytest.mark.mypy_testing def test_memoizemethod_noargs(): class Foo: @memoizemethod_noargs def meth(self) -> str: return '' foo = Foo() reveal_type(foo.meth()) # R: builtins.str @pytest.mark.mypy_testing def test_cached_method_sync(): class Foo: @cached_method def meth(self) -> str: return '' foo = Foo() reveal_type(foo.meth()) # R: builtins.str @pytest.mark.mypy_testing async def test_cached_method_async(): class Foo: @cached_method async def meth(self) -> str: return '' foo = Foo() reveal_type(await foo.meth()) # R: builtins.str scrapinghub-web-poet-ba87b95/tox.ini000066400000000000000000000045461517167256700175270ustar00rootroot00000000000000[tox] requires = sphinx-scrapy @ git+https://github.com/scrapy/sphinx-scrapy.git@0.8.1 envlist = 
pre-commit,mypy,types,docs,twinecheck,min,min-framework,py310,py311,py312,py313,py314,framework [testenv] package = editable setenv = PYTHONPATH = {toxinidir} deps = pytest pytest-asyncio pytest-cov requests aiohttp zyte-common-items # -p no:web-poet disables the pytest plugin provided by web-poet. Otherwise, # anything imported by that plugin will be missing from coverage data. commands = pytest \ --cov=web_poet \ --cov-report=term-missing \ --cov-report=xml \ --doctest-modules \ --ignore=web_poet/framework \ -p no:web-poet \ {posargs:web_poet tests} [testenv:mypy] extras = framework deps = mypy==1.16.1 aiohttp==3.12.13 andi==0.8.0 pytest==8.4.1 types-requests==2.32.4.20250611 types-python-dateutil==2.9.0.20250516 url-matcher==0.6.0 zyte-common-items==0.29.0 commands = mypy web_poet tests [testenv:types] extras = framework deps = {[testenv]deps} {[testenv:mypy]deps} pytest-mypy-testing==0.1.3 commands = py.test {posargs: tests_typing} [testenv:pre-commit] deps = -rrequirements-dev.txt commands = pre-commit run --all-files --show-diff-on-failure [testenv:min] basepython = python3.10 deps = {[testenv]deps} pytest==7.0.0 pytest-asyncio==0.17.2 requests==2.27.0 zyte-common-items==0.1.0 aiohttp==3.7.0 attrs==21.3.0 parsel==1.5.0 url-matcher==0.4.0 tldextract==3.0.0 multidict==5.0.0 w3lib==1.22.0 async-lru==1.0.3 itemadapter==0.8.0 andi==0.5.0 python-dateutil==2.7.0 time-machine==2.7.1 packaging==20.0 # pin older cssselect for old parsel cssselect==1.2.0 [testenv:twinecheck] basepython = python3 deps = twine==6.1.0 build==1.2.2.post1 commands = python -m build --sdist twine check dist/* [testenv:framework] deps = pytest pytest-asyncio pytest-cov extras = framework commands = pytest \ --cov=web_poet.framework \ --cov-report=term-missing \ --cov-report=xml \ -p no:web-poet \ {posargs:tests/test_framework.py} [testenv:min-framework] basepython = python3.10 deps = {[testenv:framework]deps} niquests==3.14.0 playwright==1.20.0 commands = {[testenv:framework]commands} 
scrapinghub-web-poet-ba87b95/web_poet/000077500000000000000000000000001517167256700200075ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/web_poet/__init__.py000066400000000000000000000030401517167256700221150ustar00rootroot00000000000000from .annotated import AnnotatedInstance, annotation_decode, annotation_encode from .fields import field, item_from_fields, item_from_fields_sync from .page_inputs import ( AnyResponse, BrowserHtml, BrowserResponse, HttpClient, HttpRequest, HttpRequestBody, HttpRequestHeaders, HttpResponse, HttpResponseBody, HttpResponseHeaders, PageParams, RequestUrl, ResponseUrl, Stats, ) from .pages import ( BrowserPage, Extractor, Injectable, ItemPage, Returns, SelectorExtractor, WebPage, validates_input, ) from .requests import request_downloader_var from .rules import ( ApplyRule, RulesRegistry, consume_modules, ) from .utils import cached_method __all__ = [ "AnnotatedInstance", "AnyResponse", "ApplyRule", "BrowserHtml", "BrowserPage", "BrowserResponse", "Extractor", "HttpClient", "HttpRequest", "HttpRequestBody", "HttpRequestHeaders", "HttpResponse", "HttpResponseBody", "HttpResponseHeaders", "Injectable", "ItemPage", "PageParams", "RequestUrl", "ResponseUrl", "Returns", "RulesRegistry", "SelectorExtractor", "Stats", "WebPage", "annotation_decode", "annotation_encode", "cached_method", "consume_modules", "default_registry", "field", "handle_urls", "item_from_fields", "item_from_fields_sync", "request_downloader_var", "validates_input", ] default_registry = RulesRegistry() handle_urls = default_registry.handle_urls scrapinghub-web-poet-ba87b95/web_poet/_base.py000066400000000000000000000057551517167256700214460ustar00rootroot00000000000000"""Contains some internal definitions that is internal to **web-poet**. In general, users shouldn't import and use the contents of this module. 
""" from __future__ import annotations from typing import TYPE_CHECKING, AnyStr, TypeAlias, overload from multidict import CIMultiDict if TYPE_CHECKING: # typing.Self requires Python 3.11 from typing_extensions import Self _AnyStrDict: TypeAlias = dict[AnyStr, AnyStr | list[AnyStr] | tuple[AnyStr, ...]] class _HttpHeaders(CIMultiDict): """A base container for holding the HTTP headers. For more info on its other features, read the API spec of :class:`multidict.CIMultiDict`. """ @classmethod def from_name_value_pairs(cls, arg: list[dict]) -> Self: """An alternative constructor for instantiation using a ``List[Dict]`` where the 'key' is the header name while the 'value' is the header value. >>> pairs = [ ... {"name": "Content-Encoding", "value": "gzip"}, ... {"name": "content-length", "value": "648"} ... ] >>> headers = _HttpHeaders.from_name_value_pairs(pairs) >>> headers <_HttpHeaders('Content-Encoding': 'gzip', 'content-length': '648')> """ return cls([(pair["name"], pair["value"]) for pair in arg]) @classmethod def from_bytes_dict(cls, arg: _AnyStrDict, encoding: str = "utf-8") -> Self: """An alternative constructor for instantiation where the header-value pairs could be in raw bytes form. This supports multiple header values in the form of ``List[bytes]`` and ``Tuple[bytes]]`` alongside a plain ``bytes`` value. A value in ``str`` also works and wouldn't break the decoding process at all. By default, it converts the ``bytes`` value using "utf-8". However, this can easily be overridden using the ``encoding`` parameter. >>> raw_values = { ... b"Content-Encoding": [b"gzip", b"br"], ... b"Content-Type": [b"text/html"], ... b"content-length": b"648", ... } >>> headers = _HttpHeaders.from_bytes_dict(raw_values) >>> headers <_HttpHeaders('Content-Encoding': 'gzip', 'Content-Encoding': 'br', 'Content-Type': 'text/html', 'content-length': '648')> """ @overload def _norm(data: str | bytes) -> str: ... @overload def _norm(data: None) -> None: ... 
def _norm(data: str | bytes | None) -> str | None: if isinstance(data, str) or data is None: return data if isinstance(data, bytes): return data.decode(encoding) raise ValueError(f"Expecting str or bytes. Received {type(data)}") converted = [] for header, value in arg.items(): if isinstance(value, (list, tuple)): converted.extend([(_norm(header), _norm(v)) for v in value]) else: converted.append((_norm(header), _norm(value))) return cls(converted) scrapinghub-web-poet-ba87b95/web_poet/annotated.py000066400000000000000000000034051517167256700223400ustar00rootroot00000000000000from __future__ import annotations from dataclasses import dataclass from typing import Annotated, Any def annotation_encode(obj: Any) -> Any: """Encodes *obj* for :obj:`~typing.Annotated`. Annotated params must be hashable. This function converts dicts and lists into hashable alternatives (tuples and frozensets). For example: .. code-block:: python foo = Annotated(Bar, annotation_encode({"a": [1, 2, 3]})) *obj* must not contain tuples or frozensets, or unhashable data besides dicts and lists. """ if isinstance(obj, (tuple, list)): return tuple(annotation_encode(e) for e in obj) if isinstance(obj, dict): return frozenset( (annotation_encode(k), annotation_encode(v)) for k, v in obj.items() ) return obj def annotation_decode(obj: Any) -> Any: """Converts a result of :func:`annotation_encode` back to original form.""" if isinstance(obj, tuple): return [annotation_decode(o) for o in obj] if isinstance(obj, frozenset): return {annotation_decode(k): annotation_decode(v) for k, v in obj} return obj @dataclass class AnnotatedInstance: """Wrapper for instances of annotated dependencies. It is used when both the dependency value and the dependency annotation are needed. :param result: The wrapped dependency instance. :type result: Any :param metadata: The copy of the annotation. :type metadata: Tuple[Any, ...] """ result: Any metadata: tuple[Any, ...] 
def get_annotated_cls(self) -> Annotated[Any, ...]: """Returns a re-created :class:`typing.Annotated` type.""" return Annotated[(type(self.result), *self.metadata)] __all__: list[str] = [] # Prefer imports from web_poet.__init__.py scrapinghub-web-poet-ba87b95/web_poet/exceptions/000077500000000000000000000000001517167256700221705ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/web_poet/exceptions/__init__.py000066400000000000000000000000501517167256700242740ustar00rootroot00000000000000from .core import * from .http import * scrapinghub-web-poet-ba87b95/web_poet/exceptions/core.py000066400000000000000000000041331517167256700234730ustar00rootroot00000000000000""" Core Exceptions ~~~~~~~~~~~~~~~ These exceptions are tied to how **web-poet** operates. """ from __future__ import annotations from typing import TYPE_CHECKING if TYPE_CHECKING: from web_poet import HttpRequest __all__ = [ "NoSavedHttpResponse", "PageObjectAction", "RequestDownloaderVarError", "Retry", "UseFallback", ] class RequestDownloaderVarError(Exception): """The ``web_poet.request_downloader_var`` had its contents accessed but there wasn't any value set during the time requests are executed. See the documentation section about :ref:`setting up the contextvars ` to learn more about this. """ class PageObjectAction(ValueError): """Base class for exceptions that can be raised from a page object to indicate something to be done about that page object.""" class Retry(PageObjectAction): """The page object found that the input data is partial or empty, and a request retry may provide better input. *message* is the reason for the retry. *max_retries* is the desired maximum retries. If not specified, the framework defaults are used instead. 
""" def __init__(self, message: str | None = None, max_retries: int | None = None): self.max_retries = max_retries super().__init__(message) class UseFallback(PageObjectAction): """The page object cannot extract data from the input, but the input seems valid, so an alternative data extraction implementation for the same item type may succeed.""" class NoSavedHttpResponse(AssertionError): """Indicates that there is no saved response for this request. Can only be raised when a :class:`~.HttpClient` instance is used to get saved responses. :param request: The :class:`~.HttpRequest` instance that was used. :type request: HttpRequest """ def __init__(self, msg: str | None = None, request: HttpRequest | None = None): self.request = request if msg is None: msg = f"There is no saved response available for this HTTP Request: {self.request}" super().__init__(msg) scrapinghub-web-poet-ba87b95/web_poet/exceptions/http.py000066400000000000000000000046571517167256700235350ustar00rootroot00000000000000""" HTTP Exceptions ~~~~~~~~~~~~~~~ These are exceptions pertaining to common issues faced when executing HTTP operations. """ from __future__ import annotations from web_poet.page_inputs.http import HttpRequest, HttpResponse class HttpError(IOError): """Indicates that an exception has occurred when handling an HTTP operation. This is used as a **base class** for more specific errors and could be vague since it could denote problems either in the HTTP Request or Response. For more specific errors, it would be better to use :class:`.HttpRequestError` and :class:`.HttpResponseError`. :param request: Request that triggered the exception. :type request: HttpRequest """ def __init__(self, msg: str | None = None, request: HttpRequest | None = None): #: Request that triggered the exception. 
self.request: HttpRequest | None = request if msg is None: msg = f"An Error ocurred when executing this HTTP Request: {self.request}" super().__init__(msg) class HttpRequestError(HttpError): """Indicates that an exception has occurred when the **HTTP Request** was being handled. :param request: The :class:`~.HttpRequest` instance that was used. :type request: HttpRequest """ class HttpResponseError(HttpError): """Indicates that an exception has occurred when the **HTTP Response** was received. For responses that are in the status code ``100-3xx range``, this exception shouldn't be raised at all. However, for responses in the ``400-5xx``, this will be raised by **web-poet**. .. note:: Frameworks implementing **web-poet** should **NOT** raise this exception. This exception is raised by web-poet itself, based on ``allow_status`` parameter found in the methods of :class:`~.HttpClient`. :param request: Request that got the response that triggered the exception. :type request: HttpRequest :param response: Response that triggered the exception. :type response: HttpResponse """ def __init__( self, msg: str | None = None, response: HttpResponse | None = None, request: HttpRequest | None = None, ): #: Response that triggered the exception. self.response: HttpResponse | None = response if msg is None: msg = f"Unexpected HTTP Response received: {self.response}" super().__init__(msg, request=request) scrapinghub-web-poet-ba87b95/web_poet/fields.py000066400000000000000000000223071517167256700216330ustar00rootroot00000000000000""" ``web_poet.fields`` is a module with helpers for putting extraction logic into separate Page Object methods / properties. 
""" from __future__ import annotations import inspect from contextlib import suppress from functools import update_wrapper, wraps from typing import TYPE_CHECKING, Any, TypeVar, cast import attrs from itemadapter import ItemAdapter from web_poet.utils import cached_method, callable_has_parameter, ensure_awaitable if TYPE_CHECKING: from collections.abc import Callable _FIELDS_INFO_ATTRIBUTE_READ = "_web_poet_fields_info" _FIELDS_INFO_ATTRIBUTE_WRITE = "_web_poet_fields_info_temp" _FIELD_METHODS_ATTRIBUTE = "_web_poet_field_methods" @attrs.define class FieldInfo: """Information about a field""" #: name of the field name: str #: field metadata meta: dict | None = None #: field processors out: list[Callable] | None = None class FieldsMixin: """A mixin which is required for a class to support fields""" def __init_subclass__(cls, **kwargs: Any) -> None: super().__init_subclass__(**kwargs) # To support fields, we must ensure that fields dict is not shared # between subclasses, i.e. a decorator in a subclass doesn't affect # the base class. This is done by making decorator write to a # temporary location, and then merging it all on subclass creation. this_class_fields: dict[str, FieldInfo] = getattr( cls, _FIELDS_INFO_ATTRIBUTE_WRITE, {} ) base_fields: dict[str, FieldInfo] = {} for base_class in cls.__bases__: fields = getattr(base_class, _FIELDS_INFO_ATTRIBUTE_READ, {}) base_fields.update(fields) if base_fields or this_class_fields: fields = {**base_fields, **this_class_fields} setattr(cls, _FIELDS_INFO_ATTRIBUTE_READ, fields) with suppress(AttributeError): delattr(cls, _FIELDS_INFO_ATTRIBUTE_WRITE) setattr(cls, _FIELD_METHODS_ATTRIBUTE, {}) def field( method=None, *, cached: bool = False, meta: dict | None = None, out: list[Callable] | None = None, ): """ Page Object method decorated with ``@field`` decorator becomes a property, which is then used by :class:`~.ItemPage`'s to_item() method to populate a corresponding item attribute. 
By default, the value is computed on each property access. Use ``@field(cached=True)`` to cache the property value. The ``meta`` parameter allows to store arbitrary information for the field, e.g. ``@field(meta={"expensive": True})``. This information can be later retrieved for all fields using the :func:`get_fields_dict` function. The ``out`` parameter is an optional list of field processors, which are functions applied to the value of the field before returning it. """ class _field: def __init__(self, method): if not callable(method): raise TypeError( f"@field decorator must be used on methods, {method!r} is decorated instead" ) self.original_method = method self.name: str | None = None update_wrapper(self, method) def __set_name__(self, owner, name: str) -> None: self.name = name if not hasattr(owner, _FIELDS_INFO_ATTRIBUTE_WRITE): setattr(owner, _FIELDS_INFO_ATTRIBUTE_WRITE, {}) field_info = FieldInfo(name=name, meta=meta, out=out) getattr(owner, _FIELDS_INFO_ATTRIBUTE_WRITE)[name] = field_info def __get__(self, instance, owner=None): # When accessed on the class (instance is None) return the # descriptor itself (which has been wrapped with the original # function attributes) so that __doc__ and other metadata are # preserved. if instance is None: return self # We use the original method and the out arg from the field and # the Processors class from the instance class, so caching needs to # take into account the instance class and the field object. So we # use the field object id() as a key when caching the method in # the instance class. 
cache_key = id(self) method = self._get_processed_method(owner, cache_key) if method is None: if out is not None: processor_functions = out elif hasattr(owner, "Processors"): assert self.name is not None processor_functions = getattr(owner.Processors, self.name, []) else: processor_functions = [] processors: list[tuple[Callable, bool]] = [] for processor_function in processor_functions: takes_page = callable_has_parameter(processor_function, "page") processors.append((processor_function, takes_page)) method = self._processed(self.original_method, processors) if cached: method = cached_method(method) self._set_processed_method(owner, cache_key, method) return method(instance) @staticmethod def _get_processed_method(page_cls, key: int): return getattr(page_cls, _FIELD_METHODS_ATTRIBUTE).get(key) @staticmethod def _set_processed_method(page_cls, key: int, method) -> None: getattr(page_cls, _FIELD_METHODS_ATTRIBUTE)[key] = method @staticmethod def _process(value: Any, page, processors: list[tuple[Callable, bool]]) -> Any: for processor, takes_page in processors: value = processor(value, page=page) if takes_page else processor(value) return value @staticmethod def _processed(method, processors: list[tuple[Callable, bool]]): """Returns a wrapper for method that calls processors on its result""" if inspect.iscoroutinefunction(method): async def processed(page): if hasattr(page, "_validate_input"): validation_item = page._validate_input() if validation_item is not None: return getattr(validation_item, method.__name__) return _field._process(await method(page), page, processors) else: def processed(page): if hasattr(page, "_validate_input"): validation_item = page._validate_input() if validation_item is not None: return getattr(validation_item, method.__name__) return _field._process(method(page), page, processors) return wraps(method)(processed) if method is not None: # @field syntax res = _field(method) update_wrapper(cast("Callable", res), method) return res # 
@field(...) syntax return _field def get_fields_dict(cls_or_instance) -> dict[str, FieldInfo]: """Return a dictionary with information about the fields defined for the class: keys are field names, and values are :class:`web_poet.fields.FieldInfo` instances. """ return getattr(cls_or_instance, _FIELDS_INFO_ATTRIBUTE_READ, {}) T = TypeVar("T") # FIXME: type is ignored as a workaround for https://github.com/python/mypy/issues/3737 # inference works properly if a non-default item_cls is passed; for dict # it's not working (return type is Any) async def item_from_fields( obj, item_cls: type[T] = dict, # type: ignore[assignment] *, skip_nonitem_fields: bool = False, ) -> T: """Return an item of ``item_cls`` type, with its attributes populated from the ``obj`` methods decorated with :class:`field` decorator. If ``skip_nonitem_fields`` is True, ``@fields`` whose names are not among ``item_cls`` field names are not passed to ``item_cls.__init__``. When ``skip_nonitem_fields`` is False (default), all ``@fields`` are passed to ``item_cls.__init__``, possibly causing exceptions if ``item_cls.__init__`` doesn't support them. 
""" item_dict = item_from_fields_sync(obj, item_cls=dict, skip_nonitem_fields=False) field_names = list(item_dict.keys()) if skip_nonitem_fields: field_names = _without_unsupported_field_names(item_cls, field_names) return item_cls( **{name: await ensure_awaitable(item_dict[name]) for name in field_names} ) def item_from_fields_sync( obj, item_cls: type[T] = dict, # type: ignore[assignment] *, skip_nonitem_fields: bool = False, ) -> T: """Synchronous version of :func:`item_from_fields`.""" field_names = list(get_fields_dict(obj)) if skip_nonitem_fields: field_names = _without_unsupported_field_names(item_cls, field_names) return item_cls(**{name: getattr(obj, name) for name in field_names}) def _without_unsupported_field_names( item_cls: type, field_names: list[str] ) -> list[str]: item_field_names = ItemAdapter.get_field_names_from_class(item_cls) if item_field_names is None: # item_cls doesn't define field names upfront return field_names[:] return list(set(field_names) & set(item_field_names)) scrapinghub-web-poet-ba87b95/web_poet/framework/000077500000000000000000000000001517167256700220045ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/web_poet/framework/__init__.py000066400000000000000000000007371517167256700241240ustar00rootroot00000000000000"""Built-in :ref:`web-poet framework ` for simple use cases.""" try: import niquests # noqa: F401 from playwright.async_api import async_playwright # noqa: F401 except ImportError as exception: message = ( "Could not import web_poet.framework dependencies. Install web-poet[framework]." 
) raise ImportError(message) from exception from ._api import Framework, playwright_engine __all__ = [ "Framework", "playwright_engine", ] scrapinghub-web-poet-ba87b95/web_poet/framework/_api.py000066400000000000000000000176031517167256700232750ustar00rootroot00000000000000from __future__ import annotations from typing import Annotated, Any, TypeAlias, get_args, get_origin import andi from andi.typeutils import strip_annotated from playwright.async_api import async_playwright from web_poet import default_registry from web_poet.annotated import annotation_encode from web_poet.page_inputs import ( BrowserHtml, BrowserResponse, HttpRequest, ) from web_poet.page_inputs.stats import DictStatCollector, StatCollector from web_poet.page_inputs.url import RequestUrl, ResponseUrl from web_poet.pages import ItemPage, is_injectable from web_poet.rules import RulesRegistry from web_poet.utils import ensure_awaitable from ._providers import DEFAULT_PLAYWRIGHT_ENGINE, PROVIDERS, ResponseFetcher ANNOTATION_PREFIX = "playwright_engine." def playwright_engine(name: str) -> str: """Helper to create a hashable metadata value for Annotated Playwright engine names. Example usage: .. code-block:: python Annotated[BrowserResponse, playwright_engine("firefox")] """ return annotation_encode(f"{ANNOTATION_PREFIX}{name}") RequestLike: TypeAlias = HttpRequest | RequestUrl | ResponseUrl | str def _normalize_request(request: RequestLike) -> HttpRequest: if not isinstance(request, HttpRequest): request = HttpRequest(url=request) return request class Framework: """Manager of the :ref:`built-in framework `. *registry* is the :class:`~web_poet.rules.RulesRegistry` from where page objects resolve their dependencies. If ``None``, :data:`~web_poet.default_registry` is used. *default_playwright_engine* is the Playwright browser engine to use when browser inputs do not specify one. Examples: ``"chromium"``, ``"firefox"``, ``"webkit"``. 
*stats* is a :class:`~web_poet.page_inputs.stats.StatCollector` instance to collect stats written by the page object through the :class:`~web_poet.page_inputs.stats.Stats` dependency. If not specified, a :class:`~web_poet.page_inputs.stats.DictStatCollector` is used. You can access the collector through the :attr:`stats` attribute, e.g. to read its data. """ def __init__( self, *, registry: RulesRegistry | None = None, default_playwright_engine: str | None = None, stats: StatCollector | None = None, ) -> None: self._registry = registry or default_registry self._default_playwright_engine = default_playwright_engine self.stats: StatCollector = stats or DictStatCollector() async def get_page( self, request: RequestLike, page_cls: type[ItemPage], *, page_params: dict[Any, Any] | None = None, ) -> ItemPage: """Return a page object built from *request* and *page_cls*. *page_params* is a dict that the page object may access through the :class:`~web_poet.page_inputs.PageParams` dependency. """ request = _normalize_request(request) plan = andi.plan( page_cls, is_injectable=is_injectable, externally_provided=set(PROVIDERS), ) instances: dict[Any, Any] = {} required_deps: set[type] = set() for fn_or_cls, _ in plan: base = strip_annotated(fn_or_cls) assert isinstance(base, type) required_deps.add(base) response_fetcher = ResponseFetcher( required_deps=required_deps, default_playwright_engine=self._default_playwright_engine, ) # first pass: collect explicit Playwright engine names from Annotated # browser deps explicit_engines: set[str] = set() for fn_or_cls, _ in plan: base = strip_annotated(fn_or_cls) if ( base in {BrowserResponse, BrowserHtml} and get_origin(fn_or_cls) is Annotated ): meta = get_args(fn_or_cls)[1:] if meta and isinstance(meta[0], str): m = meta[0] if m.startswith(ANNOTATION_PREFIX): explicit_engines.add(m.split(".", 1)[1]) if not explicit_engines: chosen_engine_for_unannotated = ( self._default_playwright_engine or DEFAULT_PLAYWRIGHT_ENGINE ) elif ( 
self._default_playwright_engine and self._default_playwright_engine in explicit_engines ): chosen_engine_for_unannotated = self._default_playwright_engine else: chosen_engine_for_unannotated = min(explicit_engines) # validate requested browsers are available in playwright before doing work needed_browsers = set(explicit_engines) # include chosen browser for unannotated deps if there are any browser deps if required_deps & {BrowserResponse, BrowserHtml}: needed_browsers.add(chosen_engine_for_unannotated) if needed_browsers: async with async_playwright() as playwright: for b in needed_browsers: if getattr(playwright, b, None) is None: raise ValueError(f"Playwright does not provide engine '{b}'") # second pass: instantiate dependencies, forwarding browser kwarg when needed for fn_or_cls, kwargs_spec in plan: kwargs = kwargs_spec.kwargs(instances) base = strip_annotated(fn_or_cls) assert isinstance(base, type) playwright_engine_kw: str | None = None if ( base in {BrowserResponse, BrowserHtml} and get_origin(fn_or_cls) is Annotated ): meta = get_args(fn_or_cls)[1:] if meta and isinstance(meta[0], str): m = meta[0] if m.startswith(ANNOTATION_PREFIX): playwright_engine_kw = m.split(".", 1)[1] elif base in {BrowserResponse, BrowserHtml}: playwright_engine_kw = chosen_engine_for_unannotated provider = PROVIDERS.get(base) if provider is not None: call_kwargs = { "request": request, "page_params": page_params, "page_cls": page_cls, "registry": self._registry, "response_fetcher": response_fetcher, "stats": self.stats, **kwargs, } if playwright_engine_kw is not None: call_kwargs["playwright_engine"] = playwright_engine_kw value = await ensure_awaitable(provider(**call_kwargs)) else: value = await ensure_awaitable(base(**kwargs)) instances[fn_or_cls] = value return instances[page_cls] async def get_item( self, request: RequestLike, item_or_page_cls: type, *, page_params: dict[Any, Any] | None = None, ) -> Any: """Return an item built from *request*. 
*item_or_page_cls* is either an item class or a page object class. If it is an item class, the page class to use is determined by the :class:`~web_poet.rules.RulesRegistry` passed to :class:`~web_poet.framework.Framework`. *page_params* is a dict that the page object may access through the :class:`~web_poet.page_inputs.PageParams` dependency """ request = _normalize_request(request) if issubclass(item_or_page_cls, ItemPage): page_cls: type | None = item_or_page_cls else: page_cls = self._registry.page_cls_for_item(request.url, item_or_page_cls) if page_cls is None: raise ValueError(f"No page object class found for URL: {request.url}") assert page_cls is not None page = await self.get_page(request, page_cls, page_params=page_params) return await ensure_awaitable(page.to_item()) scrapinghub-web-poet-ba87b95/web_poet/framework/_providers.py000066400000000000000000000172031517167256700245350ustar00rootroot00000000000000from __future__ import annotations import logging from typing import TYPE_CHECKING, Any, get_type_hints import niquests from playwright.async_api import async_playwright from web_poet.exceptions import HttpRequestError from web_poet.page_inputs import ( AnyResponse, BrowserHtml, BrowserResponse, HttpClient, HttpRequest, HttpRequestBody, HttpRequestHeaders, HttpResponse, HttpResponseBody, HttpResponseHeaders, PageParams, RequestUrl, ResponseUrl, Stats, ) from web_poet.page_inputs.stats import StatCollector if TYPE_CHECKING: from collections.abc import Callable DEFAULT_PLAYWRIGHT_ENGINE = "chromium" PROVIDERS: dict[type, Callable[..., Any]] = {} logger = logging.getLogger(__name__) def _get_http_response_from_nirequests_response( request: HttpRequest, response: niquests.Response ) -> HttpResponse: return HttpResponse( response.url or request.url, status=response.status_code, body=response.content or b"", headers=response.headers, ) async def _get_http_response_from_http_request(request: HttpRequest) -> HttpResponse: try: response = await 
niquests.aget(str(request.url), timeout=300) except Exception as exc: raise HttpRequestError(str(exc), request=request) from exc return _get_http_response_from_nirequests_response(request, response) async def _get_browser_response_from_http_request( request: HttpRequest, playwright_engine: str | None = None ) -> BrowserResponse: if request.method.upper() != "GET": raise HttpRequestError( "Browser provider only supports plain GET requests with a URL (no " "headers or body)", request=request, ) if request.headers and len(request.headers): header_names = ", ".join(map(str, list(request.headers.keys()))) logger.warning( "Browser provider does not support requests with headers; ignoring headers: %s", header_names, ) if request.body and len(request.body): raise HttpRequestError( "Browser provider does not support requests with a body; only " "plain GET with a URL is supported", request=request, ) engine_name = playwright_engine or DEFAULT_PLAYWRIGHT_ENGINE try: async with async_playwright() as playwright: engine = getattr(playwright, engine_name) browser_obj = await engine.launch() try: page = await browser_obj.new_page() goto_response = await page.goto(str(request.url)) html = await page.content() return BrowserResponse( url=page.url or str(request.url), html=html, status=None if goto_response is None else goto_response.status, ) finally: await browser_obj.close() except Exception as exc: raise HttpRequestError(str(exc), request=request) from exc class ResponseFetcher: def __init__( self, required_deps: set[type] | None = None, default_playwright_engine: str | None = None, ) -> None: self.http_response: HttpResponse | None = None self._browser_responses: dict[str, BrowserResponse] = {} self.default_playwright_engine = default_playwright_engine required_deps = required_deps or set() self._needs_http_response = bool( required_deps & {HttpResponse, HttpResponseBody, HttpResponseHeaders} ) self._needs_browser_response = bool( required_deps & {BrowserResponse, BrowserHtml} ) 
async def get_http_response(self, request: HttpRequest) -> HttpResponse: if self.http_response is None: self.http_response = await _get_http_response_from_http_request(request) return self.http_response async def get_browser_response( self, request: HttpRequest, playwright_engine: str | None = None ) -> BrowserResponse: engine_name = ( playwright_engine or self.default_playwright_engine or DEFAULT_PLAYWRIGHT_ENGINE ) if engine_name not in self._browser_responses: self._browser_responses[ engine_name ] = await _get_browser_response_from_http_request( request, playwright_engine=engine_name ) return self._browser_responses[engine_name] async def get_any_response( self, request: HttpRequest, playwright_engine: str | None = None ) -> AnyResponse: if self._needs_browser_response: browser_response = await self.get_browser_response( request, playwright_engine=playwright_engine ) return AnyResponse(response=browser_response) http_response = await self.get_http_response(request) return AnyResponse(response=http_response) def _provider(func: Callable[..., Any]): dep = get_type_hints(func).get("return") assert isinstance(dep, type) PROVIDERS[dep] = func return func @_provider async def _get_http_response( request: HttpRequest, response_fetcher: ResponseFetcher, **_kwargs ) -> HttpResponse: return await response_fetcher.get_http_response(request) @_provider async def _get_browser_response( request: HttpRequest, response_fetcher: ResponseFetcher, **_kwargs ) -> BrowserResponse: return await response_fetcher.get_browser_response( request, playwright_engine=_kwargs.get("playwright_engine") ) @_provider async def _get_browser_html( request: HttpRequest, response_fetcher: ResponseFetcher, **_kwargs ) -> BrowserHtml: response = await response_fetcher.get_browser_response( request, playwright_engine=_kwargs.get("playwright_engine") ) return response.html @_provider def _get_request_body(request: HttpRequest, **_kwargs) -> HttpRequestBody: return HttpRequestBody(request.body) 
@_provider def _get_request_headers(request: HttpRequest, **_kwargs) -> HttpRequestHeaders: return request.headers @_provider async def _get_response_body( request: HttpRequest, response_fetcher: ResponseFetcher, **_kwargs ) -> HttpResponseBody: response = await response_fetcher.get_http_response(request) return response.body @_provider async def _get_response_headers( request: HttpRequest, response_fetcher: ResponseFetcher, **_kwargs ) -> HttpResponseHeaders: response = await response_fetcher.get_http_response(request) return response.headers @_provider async def _get_response_url( request: HttpRequest, response_fetcher: ResponseFetcher, **_kwargs ) -> ResponseUrl: response = await response_fetcher.get_any_response( request, playwright_engine=_kwargs.get("playwright_engine") ) return response.url @_provider async def _get_any_response( request: HttpRequest, response_fetcher: ResponseFetcher, **_kwargs ) -> AnyResponse: return await response_fetcher.get_any_response( request, playwright_engine=_kwargs.get("playwright_engine") ) @_provider def _get_request_url(request: HttpRequest, **_kwargs) -> RequestUrl: return request.url @_provider def _get_page_params( page_params: dict[Any, Any] | None = None, **_kwargs ) -> PageParams: return PageParams(page_params or {}) @_provider def _get_request(request: HttpRequest, **_kwargs) -> HttpRequest: return request @_provider def _get_stats(stats: StatCollector | None = None, **_kwargs) -> Stats: return Stats(stat_collector=stats) @_provider def _get_http_client(**_kwargs) -> HttpClient: return HttpClient(request_downloader=_get_http_response_from_http_request) scrapinghub-web-poet-ba87b95/web_poet/mixins.py000066400000000000000000000074211517167256700216740ustar00rootroot00000000000000import abc from typing import Generic, Protocol, TypeVar from urllib.parse import urljoin import parsel from w3lib.html import get_base_url from web_poet.page_inputs.url import RequestUrl, ResponseUrl class _ResponseLike(Protocol): """Protocol 
for response objects.""" url: ResponseUrl | str text: str ResponseT = TypeVar("ResponseT", bound=_ResponseLike) class SelectorShortcutsMixin: def xpath(self, query, **kwargs) -> parsel.SelectorList: """A shortcut to ``.selector.xpath()``.""" return self.selector.xpath(query, **kwargs) # type: ignore[attr-defined] def css(self, query) -> parsel.SelectorList: """A shortcut to ``.selector.css()``.""" return self.selector.css(query) # type: ignore[attr-defined] def jmespath(self, query: str, **kwargs) -> parsel.SelectorList: """A shortcut to ``.selector.jmespath()``.""" if not hasattr(self.selector, "jmespath"): # type: ignore[attr-defined] raise AttributeError( "Please install parsel >= 1.8.1 to get jmespath support" ) return self.selector.jmespath(query, **kwargs) # type: ignore[attr-defined] class SelectableMixin(abc.ABC, SelectorShortcutsMixin): """ Inherit from this mixin, implement ``._selector_input`` method, get ``.selector`` property and ``.xpath`` / ``.css`` / ``.jmespath`` methods. 
""" __cached_selector = None @abc.abstractmethod def _selector_input(self) -> str: raise NotImplementedError # pragma: nocover @property def selector(self) -> parsel.Selector: """Cached instance of :external:class:`parsel.selector.Selector`.""" # caching is implemented in a manual way to avoid issues with # non-hashable classes, where memoizemethod_noargs doesn't work if self.__cached_selector is not None: return self.__cached_selector base_url = str(self.url) if hasattr(self, "url") else None sel = parsel.Selector(text=self._selector_input(), base_url=base_url) self.__cached_selector = sel return sel class UrlShortcutsMixin: _cached_base_url = None def _url_shortcuts_input(self) -> str: return self._selector_input() # type: ignore[attr-defined] @property def _base_url(self) -> str: if self._cached_base_url is None: text = self._url_shortcuts_input()[:4096] self._cached_base_url = get_base_url(text, str(self.url)) # type: ignore[attr-defined] return self._cached_base_url def urljoin(self, url: str | RequestUrl | ResponseUrl) -> RequestUrl: """Return *url* as an absolute URL. If *url* is relative, it is made absolute relative to the base URL of *self*.""" return RequestUrl(urljoin(self._base_url, str(url))) class ResponseShortcutsMixin(Generic[ResponseT], SelectableMixin, UrlShortcutsMixin): # noqa: PYI059 """Common shortcut methods for working with HTML responses. This mixin could be used with Page Object base classes. It requires "response" attribute to be present. 
""" response: ResponseT _cached_base_url = None @property def url(self) -> str: """Shortcut to HTML Response's URL, as a string.""" return str(self.response.url) @property def html(self) -> str: """Shortcut to HTML Response's content.""" return self.response.text def _selector_input(self) -> str: return self.html @property def base_url(self) -> str: """Return the base url of the given response""" return self._base_url def urljoin(self, url: str) -> str: # type: ignore[override] """Convert url to absolute, taking in account url and baseurl of the response""" return str(super().urljoin(url)) scrapinghub-web-poet-ba87b95/web_poet/page_inputs/000077500000000000000000000000001517167256700223255ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/web_poet/page_inputs/__init__.py000066400000000000000000000012331517167256700244350ustar00rootroot00000000000000from .browser import BrowserHtml, BrowserResponse from .client import HttpClient from .http import ( HttpRequest, HttpRequestBody, HttpRequestHeaders, HttpResponse, HttpResponseBody, HttpResponseHeaders, ) from .page_params import PageParams from .response import AnyResponse from .stats import Stats from .url import RequestUrl, ResponseUrl __all__ = [ "AnyResponse", "BrowserHtml", "BrowserResponse", "HttpClient", "HttpRequest", "HttpRequestBody", "HttpRequestHeaders", "HttpResponse", "HttpResponseBody", "HttpResponseHeaders", "PageParams", "RequestUrl", "ResponseUrl", "Stats", ] scrapinghub-web-poet-ba87b95/web_poet/page_inputs/browser.py000066400000000000000000000025321517167256700243640ustar00rootroot00000000000000import attrs from web_poet.mixins import SelectableMixin, UrlShortcutsMixin from .url import ResponseUrl class BrowserHtml(SelectableMixin, str): # noqa: SLOT000 """HTML returned by a web browser, i.e. snapshot of the DOM tree in HTML format. 
""" def _selector_input(self) -> str: return self @attrs.define(auto_attribs=False, slots=False, eq=False) class BrowserResponse(SelectableMixin, UrlShortcutsMixin): """Browser response: url, HTML and status code. ``url`` should be browser's window.location, not a URL of the request, if possible. ``html`` contains the HTML returned by the browser, i.e. a snapshot of DOM tree in HTML format. The following are optional since it would depend on the source of the ``BrowserResponse`` if these are available or not: ``status`` should represent the int status code of the HTTP response. """ url: ResponseUrl = attrs.field(converter=ResponseUrl) html: BrowserHtml = attrs.field(converter=BrowserHtml) status: int | None = attrs.field(default=None, kw_only=True) @property def text(self) -> str: """HTML returned by the browser, identical to ``self.html``. Provided for compatibility with :class:`~.HttpResponse`. """ return str(self.html) def _selector_input(self) -> str: return self.html scrapinghub-web-poet-ba87b95/web_poet/page_inputs/client.py000066400000000000000000000241531517167256700241620ustar00rootroot00000000000000from __future__ import annotations import asyncio import logging from dataclasses import dataclass from http import HTTPStatus from typing import TYPE_CHECKING, TypeAlias, cast from web_poet.exceptions import HttpError, HttpResponseError from web_poet.exceptions.core import NoSavedHttpResponse from web_poet.page_inputs.http import ( HttpRequest, HttpRequestBody, HttpRequestHeaders, HttpResponse, request_fingerprint, ) from web_poet.requests import RequestDownloaderT, _perform_request from web_poet.utils import as_list if TYPE_CHECKING: from collections.abc import Iterable from web_poet.page_inputs.url import _Url logger = logging.getLogger(__name__) _StrMapping: TypeAlias = dict[str, str] _Headers: TypeAlias = _StrMapping | HttpRequestHeaders _Body: TypeAlias = bytes | HttpRequestBody _StatusList: TypeAlias = str | int | list[str | int] @dataclass class 
_SavedResponseData: """Class for storing a request and its result.""" request: HttpRequest response: HttpResponse | None exception: HttpError | None = None def fingerprint(self) -> str: """Return the request fingeprint.""" return request_fingerprint(self.request) class HttpClient: """Async HTTP client to be used in Page Objects. See :ref:`additional-requests` for the usage information. HttpClient doesn't make HTTP requests on itself. It uses either the request function assigned to the ``web_poet.request_downloader_var`` :mod:`contextvar `, or a function passed via ``request_downloader`` argument of the :meth:`~.HttpClient.__init__` method. Either way, this function should be an ``async def`` function which receives an :class:`~.HttpRequest` instance, and either returns a :class:`~.HttpResponse` instance, or raises a subclass of :class:`~.HttpError`. You can read more in the :ref:`advanced-downloader-impl` documentation. """ def __init__( self, request_downloader: RequestDownloaderT | None = None, *, save_responses: bool = False, return_only_saved_responses: bool = False, responses: Iterable[_SavedResponseData] | None = None, ): self._request_downloader = request_downloader or _perform_request self.save_responses = save_responses self.return_only_saved_responses = return_only_saved_responses self._saved_responses: dict[str, _SavedResponseData] = { data.fingerprint(): data for data in responses or [] } @staticmethod def _handle_status( response: HttpResponse, request: HttpRequest, *, allow_status: _StatusList | None = None, ) -> None: allow_status_normalized = list(map(str, as_list(allow_status))) allow_all_status = any( True for s in allow_status_normalized if s.strip() == "*" ) if ( allow_all_status or response.status is None # allows serialized responses from tests or response.status < 400 or str(response.status) in allow_status_normalized ): return status_name = _http_status_name(response.status) msg = f"{response.status} {status_name} response for 
{response.url}" raise HttpResponseError(msg, request=request, response=response) async def request( self, url: str | _Url, *, method: str = "GET", headers: _Headers | None = None, body: _Body | None = None, allow_status: _StatusList | None = None, ) -> HttpResponse: """This is a shortcut for creating an :class:`~.HttpRequest` instance and executing that request. :class:`~.HttpRequestError` is raised for *connection errors*, *connection and read timeouts*, etc. An :class:`~.HttpResponse` instance is returned for successful responses in the ``100-3xx`` status code range. Otherwise, an exception of type :class:`~.HttpResponseError` is raised. Rasing :class:`~.HttpResponseError` can be suppressed for certain status codes using the ``allow_status`` param - it is a list of status code values for which :class:`~.HttpResponse` should be returned instead of raising :class:`~.HttpResponseError`. There is a special "*" ``allow_status`` value which allows any status code. There is no need to include ``100-3xx`` status codes in ``allow_status``, because :class:`~.HttpResponseError` is not raised for them. """ headers = headers or {} body = body or b"" req = HttpRequest(url=url, method=method, headers=headers, body=body) return await self.execute(req, allow_status=allow_status) async def get( self, url: str | _Url, *, headers: _Headers | None = None, allow_status: _StatusList | None = None, ) -> HttpResponse: """Similar to :meth:`~.HttpClient.request` but peforming a ``GET`` request. """ return await self.request( url=url, method="GET", headers=headers, allow_status=allow_status, ) async def post( self, url: str | _Url, *, headers: _Headers | None = None, body: _Body | None = None, allow_status: _StatusList | None = None, ) -> HttpResponse: """Similar to :meth:`~.HttpClient.request` but performing a ``POST`` request. 
""" return await self.request( url=url, method="POST", headers=headers, body=body, allow_status=allow_status, ) async def execute( self, request: HttpRequest, *, allow_status: _StatusList | None = None ) -> HttpResponse: """Execute the specified :class:`~.HttpRequest` instance using the request implementation configured in the :class:`~.HttpClient` instance. :class:`~.HttpRequestError` is raised for *connection errors*, *connection and read timeouts*, etc. :class:`~.HttpResponse` instance is returned for successful responses in the ``100-3xx`` status code range. Otherwise, an exception of type :class:`~.HttpResponseError` is raised. Rasing :class:`~.HttpResponseError` can be suppressed for certain status codes using the ``allow_status`` param - it is a list of status code values for which :class:`~.HttpResponse` should be returned instead of raising :class:`~.HttpResponseError`. There is a special "*" ``allow_status`` value which allows any status code. There is no need to include ``100-3xx`` status codes in ``allow_status``, because :class:`~.HttpResponseError` is not raised for them. 
""" if self.return_only_saved_responses: for fp, saved_data in self._saved_responses.items(): if request_fingerprint(request) == fp: if saved_data.exception: raise saved_data.exception assert saved_data.response self._handle_status( saved_data.response, saved_data.request, allow_status=allow_status, ) return saved_data.response raise NoSavedHttpResponse(request=request) try: response = await self._request_downloader(request) except HttpError as ex: if self.save_responses: self._saved_responses[request_fingerprint(request)] = ( _SavedResponseData(request, None, ex) ) raise if self.save_responses: self._saved_responses[request_fingerprint(request)] = _SavedResponseData( request, response ) self._handle_status(response, request, allow_status=allow_status) return response async def batch_execute( self, *requests: HttpRequest, return_exceptions: bool = False, allow_status: _StatusList | None = None, ) -> list[HttpResponse | HttpResponseError]: """Similar to :meth:`~.HttpClient.execute` but accepts a collection of :class:`~.HttpRequest` instances that would be batch executed. The order of the :class:`~.HttpResponses` would correspond to the order of :class:`~.HttpRequest` passed. If any of the :class:`~.HttpRequest` raises an exception upon execution, the exception is raised. To prevent this, the actual exception can be returned alongside any successful :class:`~.HttpResponse`. This enables salvaging any usable responses despite any possible failures. This can be done by setting ``True`` to the ``return_exceptions`` parameter. Like :meth:`~.HttpClient.execute`, :class:`~.HttpResponseError` will be raised for responses with status codes in the ``400-5xx`` range. The ``allow_status`` parameter could be used the same way here to prevent these exceptions from being raised. You can omit ``allow_status="*"`` if you're passing ``return_exceptions=True``. However, it would be returning :class:`~.HttpResponseError` instead of :class:`~.HttpResponse`. 
Lastly, a :class:`~.HttpRequestError` may be raised on cases like *connection errors*, *connection and read timeouts*, etc. """ coroutines = [self.execute(r, allow_status=allow_status) for r in requests] responses = await asyncio.gather( *coroutines, return_exceptions=return_exceptions ) return cast("list[HttpResponse | HttpResponseError]", responses) def get_saved_responses(self) -> Iterable[_SavedResponseData]: """Return saved requests and responses.""" return self._saved_responses.values() def _http_status_name(status: int) -> str: """ >>> _http_status_name(200) 'OK' >>> _http_status_name(404) 'NOT_FOUND' >>> _http_status_name(999) 'UNKNOWN' """ try: return HTTPStatus(status).name except ValueError: return "UNKNOWN" scrapinghub-web-poet-ba87b95/web_poet/page_inputs/http.py000066400000000000000000000207271517167256700236660ustar00rootroot00000000000000import json from hashlib import sha1 from typing import Any from urllib.parse import urljoin import attrs from w3lib.encoding import ( html_body_declared_encoding, html_to_unicode, http_content_type_encoding, read_bom, resolve_encoding, ) from w3lib.url import canonicalize_url from web_poet._base import _HttpHeaders from web_poet.mixins import SelectableMixin, UrlShortcutsMixin from web_poet.utils import memoizemethod_noargs from .url import RequestUrl as _RequestUrl from .url import ResponseUrl as _ResponseUrl class HttpRequestBody(bytes): """A container for holding the raw HTTP request body in bytes format.""" class HttpResponseBody(bytes): """A container for holding the raw HTTP response body in bytes format.""" def bom_encoding(self) -> str | None: """Returns the encoding from the byte order mark if present.""" return read_bom(self)[0] def declared_encoding(self) -> str | None: """Return the encoding specified in meta tags in the html body, or ``None`` if no suitable encoding was found""" return html_body_declared_encoding(self) def json(self) -> Any: """ Deserialize a JSON document to a Python object. 
""" return json.loads(self) class HttpRequestHeaders(_HttpHeaders): """A container for holding the HTTP request headers. It's able to accept instantiation via an Iterable of Tuples: >>> pairs = [("Content-Encoding", "gzip"), ("content-length", "648")] >>> HttpRequestHeaders(pairs) It's also accepts a mapping of key-value pairs as well: >>> pairs = {"Content-Encoding": "gzip", "content-length": "648"} >>> headers = HttpRequestHeaders(pairs) >>> headers Note that this also supports case insensitive header-key lookups: >>> headers.get("content-encoding") 'gzip' >>> headers.get("Content-Length") '648' These are just a few of the functionalities it inherits from :class:`multidict.CIMultiDict`. For more info on its other features, read the API spec of :class:`multidict.CIMultiDict`. """ class HttpResponseHeaders(_HttpHeaders): """A container for holding the HTTP response headers. It's able to accept instantiation via an Iterable of Tuples: >>> pairs = [("Content-Encoding", "gzip"), ("content-length", "648")] >>> HttpResponseHeaders(pairs) It's also accepts a mapping of key-value pairs as well: >>> pairs = {"Content-Encoding": "gzip", "content-length": "648"} >>> headers = HttpResponseHeaders(pairs) >>> headers Note that this also supports case insensitive header-key lookups: >>> headers.get("content-encoding") 'gzip' >>> headers.get("Content-Length") '648' These are just a few of the functionalities it inherits from :class:`multidict.CIMultiDict`. For more info on its other features, read the API spec of :class:`multidict.CIMultiDict`. """ def declared_encoding(self) -> str | None: """Return encoding detected from the Content-Type header, or None if encoding is not found""" content_type = self.get("Content-Type", "") return http_content_type_encoding(content_type) @attrs.define(auto_attribs=False, slots=False, eq=False) class HttpRequest: """Represents a generic HTTP request used by other functionalities in **web-poet** like :class:`~.HttpClient`. .. 
tip:: To build a request to submit an HTML form, use the :doc:`form2request library `, which provides integration with web-poet. """ url: _RequestUrl = attrs.field(converter=_RequestUrl) method: str = attrs.field(default="GET", kw_only=True) headers: HttpRequestHeaders = attrs.field( factory=HttpRequestHeaders, converter=HttpRequestHeaders, kw_only=True ) body: HttpRequestBody = attrs.field( factory=HttpRequestBody, converter=HttpRequestBody, kw_only=True ) def urljoin(self, url: str | _RequestUrl | _ResponseUrl) -> _RequestUrl: """Return *url* as an absolute URL. If *url* is relative, it is made absolute relative to :attr:`url`.""" return _RequestUrl(urljoin(str(self.url), str(url))) @attrs.define(auto_attribs=False, slots=False, eq=False) class HttpResponse(SelectableMixin, UrlShortcutsMixin): """A container for the contents of a response, downloaded directly using an HTTP client. ``url`` should be a URL of the response (after all redirects), not a URL of the request, if possible. ``body`` contains the raw HTTP response body. The following are optional since it would depend on the source of the ``HttpResponse`` if these are available or not. For example, the responses could simply come off from a local HTML file which doesn't contain ``headers`` and ``status``. ``status`` should represent the int status code of the HTTP response. ``headers`` should contain the HTTP response headers. ``encoding`` encoding of the response. If None (default), encoding is auto-detected from headers and body content. 
""" url: _ResponseUrl = attrs.field(converter=_ResponseUrl) body: HttpResponseBody = attrs.field(converter=HttpResponseBody) status: int | None = attrs.field(default=None, kw_only=True) headers: HttpResponseHeaders = attrs.field( factory=HttpResponseHeaders, converter=HttpResponseHeaders, kw_only=True ) _encoding: str | None = attrs.field(default=None, kw_only=True) _DEFAULT_ENCODING = "ascii" _cached_text: str | None = None @property def text(self) -> str: """ Content of the HTTP body, converted to unicode using the detected encoding of the response, according to the web browser rules (respecting Content-Type header, etc.) """ # Access self.encoding before self._cached_text, because # there is a chance self._cached_text would be already populated # while detecting the encoding encoding = self.encoding if self._cached_text is None: fake_content_type_header = f"charset={encoding}" encoding, text = html_to_unicode(fake_content_type_header, self.body) self._cached_text = text return self._cached_text def _selector_input(self) -> str: return self.text @property def encoding(self) -> str | None: """Encoding of the response""" return ( self._encoding or self._body_bom_encoding() or self._headers_declared_encoding() or self._body_declared_encoding() or self._body_inferred_encoding() ) @memoizemethod_noargs def json(self) -> Any: """Deserialize a JSON document to a Python object.""" return self.body.json() @memoizemethod_noargs def _body_bom_encoding(self) -> str | None: return self.body.bom_encoding() @memoizemethod_noargs def _headers_declared_encoding(self) -> str | None: return self.headers.declared_encoding() @memoizemethod_noargs def _body_declared_encoding(self) -> str | None: return self.body.declared_encoding() @memoizemethod_noargs def _body_inferred_encoding(self) -> str | None: content_type = self.headers.get("Content-Type", "") body_encoding, text = html_to_unicode( content_type, self.body, auto_detect_fun=self._auto_detect_fun, 
default_encoding=self._DEFAULT_ENCODING, ) self._cached_text = text return body_encoding def _auto_detect_fun(self, body: bytes) -> str | None: for enc in (self._DEFAULT_ENCODING, "utf-8", "cp1252"): try: body.decode(enc) except UnicodeError: continue return resolve_encoding(enc) return None def request_fingerprint(req: HttpRequest) -> str: """Return the fingerprint of the request.""" fp = sha1() # noqa: S324 fp.update(req.method.encode() + b"\n") fp.update(canonicalize_url(str(req.url)).encode() + b"\n") for name, value in sorted(req.headers.items()): fp.update(f"{name.title()}:{value}\n".encode()) fp.update(b"\n") fp.update(req.body) return fp.hexdigest() scrapinghub-web-poet-ba87b95/web_poet/page_inputs/page_params.py000066400000000000000000000004671517167256700251650ustar00rootroot00000000000000from __future__ import annotations from typing import TypeVar _KT = TypeVar("_KT") _VT = TypeVar("_VT") class PageParams(dict[_KT, _VT]): """Container class that could contain any arbitrary data to be passed into a Page Object. Note that this is simply a subclass of Python's ``dict``. 
""" scrapinghub-web-poet-ba87b95/web_poet/page_inputs/response.py000066400000000000000000000016211517167256700245350ustar00rootroot00000000000000import attrs from web_poet.mixins import SelectableMixin, UrlShortcutsMixin from web_poet.page_inputs.browser import BrowserResponse from web_poet.page_inputs.http import HttpResponse from web_poet.page_inputs.url import ResponseUrl @attrs.define class AnyResponse(SelectableMixin, UrlShortcutsMixin): """A container that holds either :class:`~.BrowserResponse` or :class:`~.HttpResponse`.""" response: BrowserResponse | HttpResponse @property def url(self) -> ResponseUrl: """URL of the response.""" return self.response.url @property def text(self) -> str: """Text or HTML contents of the response.""" return self.response.text @property def status(self) -> int | None: """The int status code of the HTTP response, if available.""" return self.response.status def _selector_input(self) -> str: return self.text scrapinghub-web-poet-ba87b95/web_poet/page_inputs/stats.py000066400000000000000000000047541517167256700240470ustar00rootroot00000000000000from __future__ import annotations from abc import ABC, abstractmethod from typing import Any, TypeAlias StatNum: TypeAlias = int | float class StatCollector(ABC): """Base class for web-poet to implement the storing of data written through :class:`~web_poet.page_inputs.stats.Stats`.""" @abstractmethod def set(self, key: str, value: Any) -> None: """Set the value of stat *key* to *value*.""" @abstractmethod def inc(self, key: str, value: StatNum = 1) -> None: """Increment the value of stat *key* by *value*, or set it to *value* if *key* has no value.""" class DummyStatCollector(StatCollector): """:class:`~web_poet.page_inputs.stats.StatCollector` implementation that does not persist stats. 
It is used when running automatic tests, where stat storage is not necessary.""" def __init__(self) -> None: self._stats: dict[str, Any] = {} def set(self, key: str, value: Any) -> None: # noqa: D102 self._stats[key] = value def inc(self, key: str, value: StatNum = 1) -> None: # noqa: D102 if key in self._stats: assert isinstance(self._stats[key], (int, float)) self._stats[key] += value else: self._stats[key] = value class DictStatCollector(DummyStatCollector): """Simple :class:`~web_poet.page_inputs.stats.StatCollector` implementation that stores stats in a :class:`dict` accessible through the :attr:`data` property.""" @property def data(self) -> dict[str, Any]: """Dictionary data.""" return self._stats class Stats: """Page input class to write key-value data pairs during parsing that you can inspect later. See :ref:`stats`. Stats can be set to a fixed value or, if numeric, incremented. Stats are write-only. Storage and read access of stats depends on the web-poet framework that you are using. Check the documentation of your web-poet framework to find out if it supports stats, and if so, how to read stored stats. 
""" def __init__(self, stat_collector: StatCollector | None = None): self._stats = stat_collector or DummyStatCollector() def set(self, key: str, value: Any) -> None: """Set the value of stat *key* to *value*.""" self._stats.set(key, value) def inc(self, key: str, value: StatNum = 1) -> None: """Increment the value of stat *key* by *value*, or set it to *value* if *key* has no value.""" self._stats.inc(key, value) scrapinghub-web-poet-ba87b95/web_poet/page_inputs/url.py000066400000000000000000000011451517167256700235020ustar00rootroot00000000000000from __future__ import annotations class _Url: """Base URL class.""" def __init__(self, url: str | _Url): if not isinstance(url, (str, _Url)): raise TypeError( f"`url` must be a str or an instance of _Url, " f"got {url.__class__} instance instead" ) self._url = str(url) def __str__(self) -> str: return self._url def __repr__(self) -> str: return f"{self.__class__.__name__}({self._url!r})" class ResponseUrl(_Url): """URL of the response""" class RequestUrl(_Url): """URL of the request""" scrapinghub-web-poet-ba87b95/web_poet/pages.py000066400000000000000000000121721517167256700214630ustar00rootroot00000000000000import abc import inspect from contextlib import suppress from functools import wraps from types import GenericAlias from typing import Any, Generic, TypeVar, overload import attr import parsel from web_poet.fields import FieldsMixin, item_from_fields from web_poet.mixins import ResponseShortcutsMixin, SelectorShortcutsMixin from web_poet.page_inputs import BrowserResponse, HttpResponse from web_poet.utils import ( CallableT, cached_method, get_generic_param, ) class Injectable(abc.ABC, FieldsMixin): """Base Page Object class, which all Page Objects should inherit from (probably through Injectable subclasses). 
Frameworks which are using ``web-poet`` Page Objects should use :func:`is_injectable` function to detect if an object is an Injectable, and if an object is injectable, allow building it automatically through dependency injection, using https://github.com/scrapinghub/andi library. Instead of inheriting you can also use ``Injectable.register(MyWebPage)``. ``Injectable.register`` can also be used as a decorator. """ def is_injectable(cls: Any) -> bool: """Return True if ``cls`` is a class which inherits from :class:`~.Injectable`.""" return ( isinstance(cls, type) and not isinstance(cls, GenericAlias) and issubclass(cls, Injectable) ) ItemT = TypeVar("ItemT") class Returns(Generic[ItemT]): """Inherit from this generic mixin to change the item class used by :class:`~.ItemPage`""" @property def item_cls(self) -> type: """Item class""" return get_item_cls(self.__class__, default=dict) @overload def get_item_cls(cls: type, default: type) -> type: ... @overload def get_item_cls(cls: type, default: None) -> type | None: ... 
def get_item_cls(cls: type, default: type | None = None) -> type | None: param = get_generic_param(cls, Returns) return param or default _NOT_SET = object() def validates_input(to_item: CallableT) -> CallableT: """Decorator to apply input validation to custom to_item method implementations in :class:`~web_poet.pages.ItemPage` subclasses.""" if inspect.iscoroutinefunction(to_item): @wraps(to_item) async def _to_item(self, *args, **kwargs): validation_item = self._validate_input() if validation_item is not None: return validation_item return await to_item(self, *args, **kwargs) else: @wraps(to_item) def _to_item(self, *args, **kwargs): validation_item = self._validate_input() if validation_item is not None: return validation_item return to_item(self, *args, **kwargs) return _to_item # type: ignore[return-value] class Extractor(Returns[ItemT], FieldsMixin): """Base class for field support.""" _skip_nonitem_fields = _NOT_SET def _get_skip_nonitem_fields(self) -> bool: value = self._skip_nonitem_fields return False if value is _NOT_SET else bool(value) def __init_subclass__( cls, skip_nonitem_fields: Any = _NOT_SET, **kwargs: Any ) -> None: super().__init_subclass__(**kwargs) if skip_nonitem_fields is _NOT_SET: # This is a workaround for attrs issue. # See: https://github.com/scrapinghub/web-poet/issues/141 return cls._skip_nonitem_fields = skip_nonitem_fields async def to_item(self) -> ItemT: """Extract an item""" return await item_from_fields( self, item_cls=self.item_cls, skip_nonitem_fields=self._get_skip_nonitem_fields(), ) class ItemPage(Extractor[ItemT], Injectable): """Base class for page objects.""" @cached_method def _validate_input(self) -> Any: """Run self.validate_input if defined.""" if not hasattr(self, "validate_input"): return None with suppress(AttributeError): if self.__validating_input: # We are in a recursive call, i.e. _validate_input is being # called from _validate_input itself (likely through a @field # method). 
return None self.__validating_input: bool = True validation_item = self.validate_input() self.__validating_input = False return validation_item @validates_input async def to_item(self) -> ItemT: """Extract an item from a web page""" return await super().to_item() @attr.s(auto_attribs=True) class WebPage(ItemPage[ItemT], ResponseShortcutsMixin): """Base Page Object which requires :class:`~.HttpResponse` and provides XPath / CSS shortcuts. """ response: HttpResponse @attr.s(auto_attribs=True) class BrowserPage(ItemPage[ItemT], ResponseShortcutsMixin): """Base Page Object which requires :class:`~.BrowserResponse` and provides XPath / CSS shortcuts. """ response: BrowserResponse @attr.s(auto_attribs=True) class SelectorExtractor(Extractor[ItemT], SelectorShortcutsMixin): """Extractor that takes a :class:`parsel.Selector` and provides shortcuts for its methods.""" selector: parsel.Selector scrapinghub-web-poet-ba87b95/web_poet/py.typed000066400000000000000000000000001517167256700214740ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/web_poet/requests.py000066400000000000000000000025341517167256700222400ustar00rootroot00000000000000from __future__ import annotations import logging from collections.abc import Awaitable, Callable from contextvars import ContextVar from typing import TypeAlias from web_poet.exceptions import RequestDownloaderVarError from web_poet.page_inputs.http import HttpRequest, HttpResponse logger = logging.getLogger(__name__) #: Frameworks that wants to support additional requests in ``web-poet`` should #: set the appropriate implementation of ``request_downloader_var`` #: for requesting data. 
RequestDownloaderT: TypeAlias = Callable[[HttpRequest], Awaitable[HttpResponse]] request_downloader_var: ContextVar[RequestDownloaderT] = ContextVar( "request_downloader" ) async def _perform_request(request: HttpRequest) -> HttpResponse: """Given a :class:`~.Request`, execute it using the **request implementation** that was set in the ``web_poet.request_downloader_var`` :mod:`contextvars` instance. """ logger.info(f"Requesting page: {request}") try: request_downloader = request_downloader_var.get() except LookupError as ex: raise RequestDownloaderVarError( "Additional requests are used inside the Page Object but the " "current framework has not set any HttpRequest Backend via " "'web_poet.request_downloader_var'" ) from ex return await request_downloader(request) scrapinghub-web-poet-ba87b95/web_poet/rules.py000066400000000000000000000413111517167256700215130ustar00rootroot00000000000000import importlib import importlib.util import pkgutil import warnings from collections import defaultdict, deque from collections.abc import Generator, Iterable, Mapping from operator import attrgetter from typing import Any, TypeAlias import attrs from url_matcher import Patterns, URLMatcher from web_poet.page_inputs.url import _Url from web_poet.pages import ItemPage, get_item_cls from web_poet.utils import as_list, str_to_pattern Strings: TypeAlias = str | Iterable[str] @attrs.define(frozen=True) class ApplyRule: """A rule that primarily applies Page Object and Item overrides for a given URL pattern. This is instantiated when using the :func:`web_poet.handle_urls` decorator. It's also being returned as a ``List[ApplyRule]`` when calling the ``web_poet.default_registry``'s :meth:`~.RulesRegistry.get_rules` method. You can access any of its attributes: * ``for_patterns`` - contains the list of URL patterns associated with this rule. You can read the API documentation of the `url-matcher `_ package for more information about the patterns. 
* ``use`` - The Page Object that will be **used** in cases where the URL pattern represented by the ``for_patterns`` attribute is matched. * ``instead_of`` - *(optional)* The Page Object that will be **replaced** with the Page Object specified via the ``use`` parameter. * ``to_return`` - *(optional)* The item class that the Page Object specified in ``use`` is capable of returning. * ``meta`` - *(optional)* Any other information you may want to store. This doesn't do anything for now but may be useful for future API updates. The main functionality of this class lies in the ``instead_of`` and ``to_return`` parameters. Should both of these be omitted, then :class:`~.ApplyRule` simply tags which URL patterns the given Page Object defined in ``use`` is expected to be used on. When ``to_return`` is not None (e.g. ``to_return=MyItem``), the Page Object in ``use`` is declared as capable of returning a certain item class (i.e. ``MyItem``). When ``instead_of`` is not None (e.g. ``instead_of=ReplacedPageObject``), the rule adds an expectation that the ``ReplacedPageObject`` wouldn't be used for the URLs matching ``for_patterns``, since the Page Object in ``use`` will replace it. If there are multiple rules which match a certain URL, the rule to apply is picked based on the priorities set in ``for_patterns``. More information regarding its usage in :ref:`rules`. .. tip:: The :class:`~.ApplyRule` is also hashable. This makes it easy to store unique rules and identify any duplicates. 
""" for_patterns: Patterns = attrs.field(converter=str_to_pattern) use: type[ItemPage] = attrs.field(kw_only=True) instead_of: type[ItemPage] | None = attrs.field(default=None, kw_only=True) to_return: type[Any] | None = attrs.field(default=None, kw_only=True) meta: dict[str, Any] = attrs.field(factory=dict, kw_only=True) def __hash__(self): return hash((self.for_patterns, self.use, self.instead_of, self.to_return)) class RulesRegistry: """ RulesRegistry provides features for storing, retrieving, and searching for the :class:`~.ApplyRule` instances. ``web-poet`` provides a default Registry named ``default_registry`` for convenience. It can be accessed this way: .. code-block:: python from web_poet import handle_urls, default_registry, WebPage from my_items import Product @handle_urls("example.com") class ExampleComProductPage(WebPage[Product]): ... rules = default_registry.get_rules() The ``@handle_urls`` decorator exposed as ``web_poet.handle_urls`` is a shortcut for ``default_registry.handle_urls``. .. note:: It is encouraged to use the ``web_poet.default_registry`` instead of creating your own :class:`~.RulesRegistry` instance. Using multiple registries would be unwieldy in most cases. However, it might be applicable in certain scenarios like storing custom rules to separate it from the ``default_registry``. """ def __init__(self, *, rules: Iterable[ApplyRule] | None = None): self._rules: dict[int, ApplyRule] = {} self._overrides_matchers: defaultdict[type[ItemPage] | None, URLMatcher] = ( defaultdict(URLMatcher) ) self._item_matchers: defaultdict[type | None, URLMatcher] = defaultdict( URLMatcher ) # Ensures that URLMatcher is deterministic in returning a rule when # matching. 
As of url_macher==0.2.0, `url_matcher.URLMatcher._sort_domain` # has this sorting criteria: # * Priority (descending) # * Sorted list of includes for this domain (descending) # * Rule identifier (descending) # This means that if the priority and domain are the same, the last tie # breaker would be the "Rule identifier", this means we can base it on # the order of rule addition to the registry, i.e. a counter. self._rule_counter = 0 if rules is not None: for rule in rules: self.add_rule(rule) def add_rule(self, rule: ApplyRule) -> None: """Registers an :class:`web_poet.rules.ApplyRule` instance.""" matched = self._item_matchers.get(rule.to_return) if matched: # A common case when a page object subclasses another one with the # same URL pattern. pattern_dupes = { pattern for pattern in matched.patterns.values() if pattern == rule.for_patterns } if pattern_dupes: rules_to_warn = [ r for p in pattern_dupes for r in self.search(for_patterns=p, to_return=rule.to_return) ] + [rule] warnings.warn( f"The registry contains {len(rules_to_warn)} conflicting " f"rules with to_return={rule.to_return} " f"and the same URL pattern:\n\n" f"{self._format_list(pattern_dupes)} " f"\n\n" f"The first rule added to the registry is used when the URL patterns are the same and " f"the priorities are equal; other rules are ignored. " f"This is error-prone. 
Consider setting the priority explicitly " f"for these rules:\n\n" f"{self._format_list(rules_to_warn)}", stacklevel=3, # optimized for the common case of @handle_urls ) self._rule_counter += 1 rule_id = self._rule_counter self._overrides_matchers[rule.instead_of].add_or_update( rule_id, rule.for_patterns ) self._item_matchers[rule.to_return].add_or_update(rule_id, rule.for_patterns) self._rules[rule_id] = rule @classmethod def _format_list(cls, objects: Iterable[object]) -> str: return "\n".join(repr(rule) for rule in objects) def handle_urls( self, include: Strings, *, instead_of: type[ItemPage] | None = None, to_return: type | None = None, exclude: Strings | None = None, priority: int = 500, **kwargs, ): """ Class decorator that indicates that the decorated Page Object should work for the given URL patterns. The URL patterns are matched using the ``include`` and ``exclude`` parameters while ``priority`` breaks any ties. See the documentation of the `url-matcher `_ package for more information about them. This decorator is able to derive the item class returned by the Page Object. This is important since it marks what type of item the Page Object is capable of returning for the given URL patterns. For certain advanced cases, you can pass a ``to_return`` parameter which replaces any derived values (though this isn't generally recommended). Passing another Page Object into the ``instead_of`` parameter indicates that the decorated Page Object will be used instead of that for the given set of URL patterns. See :ref:`rule-precedence`. Any extra parameters are stored as meta information that can be later used. :param include: The URLs that should be handled by the decorated Page Object. :param instead_of: The Page Object that should be `replaced`. :param to_return: The item class holding the data returned by the Page Object. This could be omitted as it could be derived from the ``Returns[ItemClass]`` or ``ItemPage[ItemClass]`` declaration of the Page Object. 
See :ref:`item-classes` section. :param exclude: The URLs for which the Page Object should **not** be applied. :param priority: The resolution priority in case of `conflicting` rules. A conflict happens when the ``include``, ``override``, and ``exclude`` parameters are the same. If so, the `highest priority` will be chosen. """ def wrapper(cls): rule = ApplyRule( for_patterns=Patterns( include=as_list(include), exclude=as_list(exclude), priority=priority, ), use=cls, instead_of=instead_of, to_return=to_return or get_item_cls(cls), meta=kwargs, ) self.add_rule(rule) return cls return wrapper def get_rules(self) -> list[ApplyRule]: """Return all the :class:`~.ApplyRule` that were declared using the ``@handle_urls`` decorator. .. note:: Remember to consider calling :func:`~.web_poet.rules.consume_modules` beforehand to recursively import all submodules which contains the ``@handle_urls`` decorators from external Page Objects. """ return list(self._rules.values()) def search(self, **kwargs: Any) -> list[ApplyRule]: """Return any :class:`ApplyRule` from the registry that matches with all the provided attributes. Sample usage: .. code-block:: python rules = registry.search(use=ProductPO, instead_of=GenericPO) print(len(rules)) # 1 print(rules[0].use) # ProductPO print(rules[0].instead_of) # GenericPO """ # Use a dict instead of set() to preserve the order. rule_ids = {} if "to_return" in kwargs: matcher = self._item_matchers.get(kwargs["to_return"]) if matcher: rule_ids.update(matcher.patterns) if "instead_of" in kwargs: matcher = self._overrides_matchers.get(kwargs["instead_of"]) if matcher: if rule_ids: # If both params are used then narrow down the rules. 
rule_ids = { k: v for k, v in matcher.patterns.items() if k in rule_ids } else: rule_ids.update(matcher.patterns) rules = [self._rules[id_] for id_ in rule_ids] if rules and kwargs.keys() <= {"to_return", "instead_of"}: return rules # Search other parameters as well getter = attrgetter(*kwargs.keys()) def finder(rule: ApplyRule): attribs = getter(rule) if not isinstance(attribs, tuple): attribs = (attribs,) return attribs == tuple(kwargs.values()) return [rule for rule in rules or self.get_rules() if finder(rule)] def _match_url_for_page_object( self, url: _Url | str, matcher: URLMatcher | None = None ) -> type[ItemPage] | None: """Returns the page object to use based on the URL and URLMatcher.""" if not url or matcher is None: return None rule_id = matcher.match(str(url)) if rule_id is not None: return self._rules[rule_id].use return None def overrides_for(self, url: _Url | str) -> Mapping[type[ItemPage], type[ItemPage]]: """Finds all of the page objects associated with the given URL and returns a Mapping where the 'key' represents the page object that is **overridden** by the page object in 'value'.""" result: dict[type[ItemPage], type[ItemPage]] = {} for replaced_page, matcher in self._overrides_matchers.items(): if replaced_page is None: continue page = self._match_url_for_page_object(url, matcher) if page: result[replaced_page] = page return result def page_cls_for_item(self, url: _Url | str, item_cls: type) -> type | None: """Return the page object class associated with the given URL that's able to produce the given ``item_cls``.""" if item_cls is None: return None matcher = self._item_matchers.get(item_cls) return self._match_url_for_page_object(url, matcher) def top_rules_for_item( self, url: _Url | str, item_cls: type ) -> Generator[ApplyRule]: """Iterates the top rules that apply for *url* and *item_cls*. If multiple rules score the same, multiple rules are iterated. 
This may be useful, for example, if you want to apply some custom logic to choose between rules that otherwise have the same score. For example: .. code-block:: python from web_poet import default_registry def browser_page_cls_for_item(url, item_cls): fallback = None for rule in default_registry.top_rules_for_item(url, item_cls): if rule.meta.get("browser", False): return rule.use if not fallback: fallback = rule.use if not fallback: raise ValueError(f"No rule found for URL {url!r} and item class {item_cls}") return fallback """ if not url or not item_cls: return matcher = self._item_matchers.get(item_cls) if not matcher: return max_priority = None for rule_id in matcher.match_all(str(url)): rule = self._rules[rule_id] if max_priority is None: max_priority = rule.for_patterns.priority elif rule.for_patterns.priority < max_priority: break yield rule def _walk_module(module: str) -> Iterable: """Return all modules from a module recursively. Note that this will import all the modules and submodules. It returns the provided module as well. """ def onerror(err): raise err # pragma: no cover spec = importlib.util.find_spec(module) if not spec: raise ImportError(f"Module {module} not found") mod = importlib.import_module(spec.name) yield mod if spec.submodule_search_locations: for info in pkgutil.walk_packages( spec.submodule_search_locations, f"{spec.name}.", onerror ): mod = importlib.import_module(info.name) yield mod def consume_modules(*modules: str) -> None: """This recursively imports all packages/modules so that the ``@handle_urls`` decorators are properly discovered and imported. Let's take a look at an example: .. 
code-block:: python # FILE: my_page_obj_project/load_rules.py from web_poet import default_registry, consume_modules consume_modules("other_external_pkg.po", "another_pkg.lib") rules = default_registry.get_rules() For this case, the :class:`~.ApplyRule` are coming from: - ``my_page_obj_project`` `(since it's the same module as the file above)` - ``other_external_pkg.po`` - ``another_pkg.lib`` - any other modules that was imported in the same process inside the packages/modules above. If the ``default_registry`` had other ``@handle_urls`` decorators outside of the packages/modules listed above, then the corresponding :class:`~.ApplyRule` won't be returned. Unless, they were recursively imported in some way similar to :func:`~.web_poet.rules.consume_modules`. """ for module in modules: gen = _walk_module(module) # Inspired by itertools recipe: https://docs.python.org/3/library/itertools.html # Using a deque() results in a tiny bit performance improvement that list(). deque(gen, maxlen=0) scrapinghub-web-poet-ba87b95/web_poet/serialization/000077500000000000000000000000001517167256700226645ustar00rootroot00000000000000scrapinghub-web-poet-ba87b95/web_poet/serialization/__init__.py000066400000000000000000000011301517167256700247700ustar00rootroot00000000000000from . 
import functions # needed to run register functions from .api import ( DeserializeFunction, SerializedData, SerializedDataFileStorage, SerializedLeafData, SerializeFunction, deserialize, deserialize_leaf, load_class, register_serialization, serialize, serialize_leaf, ) __all__ = [ "DeserializeFunction", "SerializeFunction", "SerializedData", "SerializedDataFileStorage", "SerializedLeafData", "deserialize", "deserialize_leaf", "load_class", "register_serialization", "serialize", "serialize_leaf", ] scrapinghub-web-poet-ba87b95/web_poet/serialization/api.py000066400000000000000000000200551517167256700240110ustar00rootroot00000000000000from __future__ import annotations from collections.abc import Callable from functools import singledispatch from importlib import import_module from pathlib import Path from typing import TYPE_CHECKING, Any, TypeAlias, TypeVar, cast import andi from andi.typeutils import strip_annotated import web_poet from web_poet import Injectable from web_poet.annotated import AnnotatedInstance from web_poet.pages import is_injectable from web_poet.utils import get_fq_class_name if TYPE_CHECKING: import os from collections.abc import Iterable # represents a leaf dependency of any type serialized as a set of files SerializedLeafData: TypeAlias = dict[str, bytes] # represents a set of leaf dependencies of different types SerializedData: TypeAlias = dict[str, SerializedLeafData] T = TypeVar("T") InjectableT = TypeVar("InjectableT", bound=Injectable) SerializeFunction: TypeAlias = Callable[[T], SerializedLeafData] DeserializeFunction: TypeAlias = Callable[[type[T], SerializedLeafData], T] class SerializedDataFileStorage: def __init__(self, directory: str | os.PathLike[str]) -> None: super().__init__() self.directory: Path = Path(directory) @staticmethod def _split_file_name(file_name: str) -> tuple[str, str]: """Extract the type name and the type-specific suffix from a file name. 
>>> SerializedDataFileStorage._split_file_name("TypeName.ext") ('TypeName', 'ext') >>> SerializedDataFileStorage._split_file_name("Qualified.TypeName.ext") ('Qualified.TypeName', 'ext') >>> SerializedDataFileStorage._split_file_name("TypeName-component.ext") ('TypeName', 'component.ext') >>> SerializedDataFileStorage._split_file_name("Qualified.TypeName-component.ext") ('Qualified.TypeName', 'component.ext') >>> SerializedDataFileStorage._split_file_name("Qualified.TypeName-component-with-dashes.ext") ('Qualified.TypeName', 'component-with-dashes.ext') """ if "-" in file_name: type_name, suffix = file_name.split("-", 1) else: type_name, suffix = file_name.rsplit(".", 1) return type_name, suffix @staticmethod def _make_file_name(type_name: str, suffix: str) -> str: """Combine the type name and the type-specific suffix into a file name. >>> SerializedDataFileStorage._make_file_name('TypeName', 'ext') 'TypeName.ext' >>> SerializedDataFileStorage._make_file_name('Qualified.TypeName', 'ext') 'Qualified.TypeName.ext' >>> SerializedDataFileStorage._make_file_name('TypeName', 'component.ext') 'TypeName-component.ext' >>> SerializedDataFileStorage._make_file_name('Qualified.TypeName', 'component.ext') 'Qualified.TypeName-component.ext' >>> SerializedDataFileStorage._make_file_name('Qualified.TypeName', 'component-with-dashes.ext') 'Qualified.TypeName-component-with-dashes.ext' """ if "." not in suffix: # TypeName.ext return type_name + "." 
+ suffix # TypeName-component.ext return type_name + "-" + suffix def read(self) -> SerializedData: # noqa: D102 result: SerializedData = {} for entry in self.directory.iterdir(): if not entry.is_file(): continue type_name, suffix = self._split_file_name(entry.name) if type_name not in result: result[type_name] = {} result[type_name][suffix] = entry.read_bytes() return result def write(self, data: SerializedData) -> None: # noqa: D102 for type_name, leaf in data.items(): for suffix, contents in leaf.items(): full_name = self._make_file_name(type_name, suffix) file_name = Path(self.directory, full_name) file_name.write_bytes(contents) def serialize_leaf(o: Any) -> SerializedLeafData: raise NotImplementedError(f"Serialization for {type(o)} is not implemented") def _deserialize_leaf_base(cls: type[Any], data: SerializedLeafData) -> Any: raise NotImplementedError(f"Deserialization for {cls} is not implemented") serialize_leaf.f_deserialize = _deserialize_leaf_base # type: ignore[attr-defined] serialize_leaf = singledispatch(serialize_leaf) def register_serialization( f_serialize: SerializeFunction[T], f_deserialize: DeserializeFunction[T] ) -> None: serialize_leaf.register(f_serialize) # type: ignore[attr-defined] f_serialize.f_deserialize = f_deserialize # type: ignore[attr-defined] def deserialize_leaf(cls: type[T], data: SerializedLeafData) -> T: f_ser: SerializeFunction[T] = serialize_leaf.dispatch(cls) # type: ignore[attr-defined] return cast("T", f_ser.f_deserialize(cls, data)) # type: ignore[attr-defined] def _get_name_for_class(cls: type) -> str: """Return the type name that will be used for serialization. For classes available in the web_poet module it's the type name, for others it's the fully qualified type name. 
>>> _get_name_for_class(Injectable) 'Injectable' >>> from decimal import Decimal >>> _get_name_for_class(Decimal) 'decimal.Decimal' """ if getattr(web_poet, cls.__name__, None) == cls: return cls.__name__ return get_fq_class_name(cls) def serialize(deps: Iterable[Any]) -> SerializedData: result: SerializedData = {} for dep in deps: cls = dep.__class__ if is_injectable(cls): raise ValueError(f"Injectable type {cls} passed to serialize()") if cls is AnnotatedInstance: key = f"AnnotatedInstance {_get_name_for_class(dep.result.__class__)}" else: key = _get_name_for_class(cls) if key in result: cls_name = cls.__name__ if cls is AnnotatedInstance: cls_name = f"AnnotatedInstance for {dep.result.__class__.__name__}" raise ValueError( f"Several instances of {cls_name} were passed to serialize()." ) result[key] = serialize_leaf(dep) return result def load_class(type_name: str) -> type: """Return the type by its name. Requires the fully qualified name unless the type is available in the web_poet module. >>> load_class("decimal.Decimal") >>> load_class("web_poet.pages.WebPage") >>> load_class("WebPage") >>> load_class("decimal.foo") Traceback (most recent call last): ... ValueError: Unknown type decimal.foo >>> load_class("foo.bar") Traceback (most recent call last): ... ValueError: Unable to import module foo """ if "." 
in type_name: module, name = type_name.rsplit(".", 1) else: module = "web_poet" name = type_name try: mod = import_module(module) except ModuleNotFoundError as ex: raise ValueError(f"Unable to import module {module}") from ex result = getattr(mod, name, None) if not result: raise ValueError(f"Unknown type {type_name}") return cast("type", result) def deserialize(cls: type[InjectableT], data: SerializedData) -> InjectableT: deps: dict[Callable, Any] = {} for dep_type_name, dep_data in data.items(): if dep_type_name.startswith("AnnotatedInstance "): annotated_result = deserialize_leaf(AnnotatedInstance, dep_data) dep_type = annotated_result.get_annotated_cls() deserialized_dep = annotated_result.result else: dep_type = load_class(dep_type_name) deserialized_dep = deserialize_leaf(dep_type, dep_data) deps[dep_type] = deserialized_dep externally_provided = {strip_annotated(cls) for cls in deps} plan = andi.plan( cls, is_injectable=is_injectable, externally_provided=externally_provided ) for fn_or_cls, kwargs_spec in plan[:-1]: if strip_annotated(fn_or_cls) in externally_provided: continue deps[strip_annotated(fn_or_cls)] = fn_or_cls(**kwargs_spec.kwargs(deps)) return cls(**plan.final_kwargs(deps)) scrapinghub-web-poet-ba87b95/web_poet/serialization/functions.py000066400000000000000000000214221517167256700252470ustar00rootroot00000000000000from __future__ import annotations import json from typing import Any, cast from .. 
import ( AnyResponse, BrowserHtml, BrowserResponse, HttpClient, HttpRequest, HttpRequestBody, HttpResponse, HttpResponseBody, PageParams, Stats, ) from ..annotated import AnnotatedInstance, annotation_decode, annotation_encode from ..exceptions import HttpError from ..page_inputs.client import _SavedResponseData from ..page_inputs.url import _Url from .api import ( SerializedLeafData, _get_name_for_class, deserialize_leaf, load_class, register_serialization, serialize_leaf, ) from .utils import _exception_from_dict, _exception_to_dict, _format_json def _serialize_HttpRequest(o: HttpRequest) -> SerializedLeafData: info = { "url": str(o.url), "method": o.method, "headers": list(o.headers.items()), } result: SerializedLeafData = { "info.json": _format_json(info).encode(), } if o.body: result["body.txt"] = bytes(o.body) return result def _deserialize_HttpRequest( cls: type[HttpRequest], data: SerializedLeafData ) -> HttpRequest: body = HttpRequestBody(data.get("body.txt", b"")) info = json.loads(data["info.json"]) return cls( body=body, url=info["url"], method=info["method"], headers=info["headers"], ) register_serialization(_serialize_HttpRequest, _deserialize_HttpRequest) def _serialize_HttpResponse(o: HttpResponse) -> SerializedLeafData: info = { "url": str(o.url), "status": o.status, "headers": list(o.headers.items()), "_encoding": o._encoding, "type": "HttpResponse", } return { "body.html": bytes(o.body), "info.json": _format_json(info).encode(), } def _deserialize_HttpResponse( cls: type[HttpResponse], data: SerializedLeafData ) -> HttpResponse: body = HttpResponseBody(data["body.html"]) info = json.loads(data["info.json"]) return cls( body=body, url=info["url"], status=info["status"], headers=info["headers"], encoding=info["_encoding"], ) register_serialization(_serialize_HttpResponse, _deserialize_HttpResponse) def _serialize_HttpResponseBody(o: HttpResponseBody) -> SerializedLeafData: return {"html": bytes(o)} def _deserialize_HttpResponseBody( cls: 
type[HttpResponseBody], data: SerializedLeafData ) -> HttpResponseBody: return cls(data["html"]) register_serialization(_serialize_HttpResponseBody, _deserialize_HttpResponseBody) def _serialize__Url(o: _Url) -> SerializedLeafData: return {"txt": str(o).encode()} def _deserialize__Url(cls: type[_Url], data: SerializedLeafData) -> _Url: return cls(data["txt"].decode()) register_serialization(_serialize__Url, _deserialize__Url) def _serialize_HttpClient(o: HttpClient) -> SerializedLeafData: serialized_data: SerializedLeafData = { "exists": b"", } for i, data in enumerate(o.get_saved_responses()): serialized_request = serialize_leaf(data.request) for k, v in serialized_request.items(): serialized_data[f"{i}-HttpRequest.{k}"] = v if data.response: serialized_response = serialize_leaf(data.response) for k, v in serialized_response.items(): serialized_data[f"{i}-HttpResponse.{k}"] = v if data.exception: # the request attribute is currently not saved exc_data = _exception_to_dict(data.exception) serialized_data[f"{i}-exception.json"] = _format_json(exc_data).encode() return serialized_data def _deserialize_HttpClient( cls: type[HttpClient], data: SerializedLeafData ) -> HttpClient: responses: list[_SavedResponseData] = [] serialized_requests: dict[str, SerializedLeafData] = {} serialized_responses: dict[str, SerializedLeafData] = {} serialized_exceptions: dict[str, SerializedLeafData] = {} for k, v in data.items(): if k == "exists": continue # k is number-("HttpRequest"|"HttpResponse").("body"|"info").ext # or number-"exception.json" key, type_suffix = k.split("-", 1) type_name, suffix = type_suffix.split(".", 1) if type_name == "HttpRequest": serialized_requests.setdefault(key, {})[suffix] = v elif type_name == "HttpResponse": serialized_responses.setdefault(key, {})[suffix] = v elif type_name == "exception": serialized_exceptions.setdefault(key, {})[suffix] = v for key, serialized_request in serialized_requests.items(): serialized_response = 
serialized_responses.get(key) serialized_exception = serialized_exceptions.get(key) if not serialized_response and not serialized_exception: continue request = deserialize_leaf(HttpRequest, serialized_request) if serialized_response: response = deserialize_leaf(HttpResponse, serialized_response) else: response = None exception: HttpError | None if serialized_exception: exc_data = json.loads(serialized_exception["json"]) exception = cast("HttpError", _exception_from_dict(exc_data)) else: exception = None responses.append(_SavedResponseData(request, response, exception)) return cls(return_only_saved_responses=True, responses=responses) register_serialization(_serialize_HttpClient, _deserialize_HttpClient) def _serialize_PageParams(o: PageParams) -> SerializedLeafData: return {"json": _format_json(o).encode()} def _deserialize_PageParams( cls: type[PageParams], data: SerializedLeafData ) -> PageParams: return cls(json.loads(data["json"])) register_serialization(_serialize_PageParams, _deserialize_PageParams) def _serialize_Stats(o: Stats) -> SerializedLeafData: return {} def _deserialize_Stats(cls: type[Stats], data: SerializedLeafData) -> Stats: return cls() register_serialization(_serialize_Stats, _deserialize_Stats) def _serialize_AnnotatedInstance(o: AnnotatedInstance) -> SerializedLeafData: serialized_data: SerializedLeafData = { "metadata.json": _format_json(annotation_decode(o.metadata)).encode(), "result_type.txt": _get_name_for_class(type(o.result)).encode(), } serialized_result = serialize_leaf(o.result) for k, v in serialized_result.items(): serialized_data["result-" + k] = v return serialized_data def _deserialize_AnnotatedInstance( cls: type[AnnotatedInstance], data: SerializedLeafData ) -> AnnotatedInstance: metadata = annotation_encode(json.loads(data["metadata.json"])) result_type = load_class(data["result_type.txt"].decode()) serialized_result = {} for k, v in data.items(): if not k.startswith("result-"): continue serialized_result[k.split("-", 1)[1]] 
= v result: Any = deserialize_leaf(result_type, serialized_result) return cls(result=result, metadata=metadata) register_serialization(_serialize_AnnotatedInstance, _deserialize_AnnotatedInstance) def _serialize_BrowserHtml(o: BrowserHtml) -> SerializedLeafData: return {"body.html": o.encode("utf8")} def _deserialize_BrowserHtml( cls: type[BrowserHtml], data: SerializedLeafData ) -> BrowserHtml: body: bytes = data.get("body.html") or b"" return cls(body.decode("utf8")) register_serialization(_serialize_BrowserHtml, _deserialize_BrowserHtml) def _serialize_BrowserResponse(o: BrowserResponse) -> SerializedLeafData: info = { "url": str(o.url), "status": o.status, "type": "BrowserResponse", } return { "body.html": o.html.encode("utf8"), "info.json": _format_json(info).encode(), } def _deserialize_BrowserResponse( cls: type[BrowserResponse], data: SerializedLeafData ) -> BrowserResponse: html = BrowserHtml(data["body.html"].decode("utf8")) info = json.loads(data["info.json"]) return cls( url=info["url"], html=html, status=info["status"], ) register_serialization(_serialize_BrowserResponse, _deserialize_BrowserResponse) def _serialize_AnyResponse(o: AnyResponse) -> SerializedLeafData: if isinstance(o.response, HttpResponse): return _serialize_HttpResponse(o.response) return _serialize_BrowserResponse(o.response) def _deserialize_AnyResponse( cls: type[AnyResponse], data: SerializedLeafData ) -> AnyResponse: response: BrowserResponse | HttpResponse info = json.loads(data["info.json"]) if info.get("type") == "BrowserResponse": response = _deserialize_BrowserResponse(BrowserResponse, data) return cls(response=response) response = _deserialize_HttpResponse(HttpResponse, data) return cls(response=response) register_serialization(_serialize_AnyResponse, _deserialize_AnyResponse) scrapinghub-web-poet-ba87b95/web_poet/serialization/utils.py000066400000000000000000000021511517167256700243750ustar00rootroot00000000000000from __future__ import annotations import json from typing 
def _exception_to_dict(ex: Exception) -> dict[str, Any]:
    """Turn an exception into a plain dict.

    The dict keeps the name of the exception class under ``import_path`` and
    the first positional argument under ``msg`` (``None`` when the exception
    has no arguments). Nothing else from the exception is saved.
    """
    import_path = _get_name_for_class(type(ex))
    first_arg = ex.args[0] if ex.args else None
    return {"import_path": import_path, "msg": first_arg}
def main(argv: Sequence[str] | None = None) -> None:
    """Parse the command line and run the selected subcommand.

    ``argv`` is handed to ``ArgumentParser.parse_args``. The only subcommand
    is ``rerun``; when no subcommand is selected the top-level help is
    printed instead.
    """
    root_parser = argparse.ArgumentParser(
        prog="python -m web_poet.testing",
        description="web-poet testing utilities",
    )
    commands = root_parser.add_subparsers()

    rerun_description = (
        "Run the page object used in a fixture, print its output "
        "as JSON. This is most useful when the page object is changed, "
        "and you want to update the test case."
    )
    rerun_parser = commands.add_parser("rerun", description=rerun_description)
    rerun_parser.add_argument("fixture_path", type=str, help="Path to a fixture")
    rerun_parser.add_argument(
        "--page-object", "-p", type=str, help="Page object type name"
    )
    rerun_parser.add_argument(
        "--fields", "-f", type=str, help="Field names, comma-separated"
    )
    # The chosen subcommand is dispatched through the ``func`` default.
    rerun_parser.set_defaults(func=rerun)

    parsed = root_parser.parse_args(argv)
    if not hasattr(parsed, "func"):
        root_parser.print_help()
        return
    parsed.func(parsed)
def get_page(self, page_cls: type) -> ItemPage:
    """Return the page object created from the saved input.

    The serialized inputs are read from the fixture's inputs directory and
    deserialized into an instance of ``page_cls``.

    Raises :class:`TypeError` if ``page_cls`` is not a subclass of
    ``ItemPage``.
    """
    if not issubclass(page_cls, ItemPage):
        # Name the offending class itself: type(page_cls) is its metaclass,
        # so using it would report "type" for nearly every class.
        raise TypeError(f"{page_cls.__name__} is not a descendant of ItemPage")
    storage = SerializedDataFileStorage(self.input_path)
    return deserialize(page_cls, storage.read())
@staticmethod
def _parse_frozen_time(meta_value: str) -> datetime.datetime:
    """Parse and possibly fix the frozen_time metadata string.

    The returned datetime is always timezone-aware:

    * a value without timezone data is interpreted as local time;
    * when ``time_machine.HAVE_TZSET`` is false, a value with timezone data
      is converted to local time and a warning is logged;
    * a UTC value gets the ``UTC`` zone;
    * any other value gets the ``Etc/GMT±N`` zone matching its UTC offset.

    ``Etc/GMT±N`` zones only exist for whole hours, so an offset such as
    ``+05:30`` is truncated to whole hours; a warning is logged in that case.
    """
    parsed_value = dateutil.parser.parse(meta_value)
    if parsed_value.tzinfo is None:
        # if it's left as None, time_machine will set it to timezone.utc,
        # but we want to interpret the value as local time
        return parsed_value.astimezone()
    if not time_machine.HAVE_TZSET:
        logger.warning(
            f"frozen_time {meta_value} includes timezone data which"
            f" is not supported on Windows, converting to local"
        )
        return parsed_value.astimezone()
    if parsed_value.tzinfo == dateutil.tz.UTC:
        return parsed_value.replace(tzinfo=ZoneInfo("UTC"))
    offset = parsed_value.tzinfo.utcoffset(None)
    assert offset is not None  # typing
    if offset.seconds % 3600 or offset.microseconds:
        # int() below drops the fractional part; say so instead of silently
        # freezing time at a slightly different moment than requested.
        logger.warning(
            f"frozen_time {meta_value} has a UTC offset which is not a whole"
            f" number of hours, truncating the offset to whole hours"
        )
    offset_hours = int(offset.days * 24 + offset.seconds / 3600)
    # Etc/GMT zone names use the inverted sign: Etc/GMT-5 is UTC+05:00.
    tzinfo = ZoneInfo(f"Etc/GMT{-offset_hours:+d}")
    return parsed_value.replace(tzinfo=tzinfo)
json.loads(_format_json(self.get_expected_output()[name])) self._append_user_prop(user_props, "expected_value", expected_field) actual_item = self.get_output(page_cls) if name not in actual_item: raise FieldMissing(name) actual_field = json.loads(_format_json(actual_item[name])) self._append_user_prop(user_props, "actual_value", actual_field) if actual_field != expected_field: raise FieldValueIncorrect(actual_field, expected_field) def assert_no_extra_fields(self, page_cls: type) -> None: """Assert that there are no extra fields in the output.""" output = self.get_output(page_cls) expected_output = self.get_expected_output() extra_field_keys = output.keys() - expected_output.keys() extra_fields = {key: output[key] for key in extra_field_keys} if extra_fields: raise FieldsUnexpected(extra_fields) def to_item_raised(self) -> bool: """Return True if to_item raised an error. Note that if to_item hasn't been called yet, this method returns False. """ return self._output_error is not None def assert_no_toitem_exceptions(self, page_cls: type) -> None: """Assert that to_item() can be run (doesn't raise an error).""" self.get_output(page_cls) def assert_toitem_exception( self, page_cls: type, *, user_props: list[tuple[str, object]] | None = None, ) -> None: """Assert that to_item() raises an exception of the expected type.""" expected_exception = self.get_expected_exception() self._append_user_prop( user_props, "expected_exception", _exception_to_dict(expected_exception) ) try: self.get_output(page_cls) except Exception as ex: self._append_user_prop( user_props, "actual_exception", _exception_to_dict(ex) ) if type(ex) is not type(expected_exception): raise WrongExceptionRaised from ex else: raise ExceptionNotRaised @staticmethod def _append_user_prop( user_props: list[tuple[str, object]] | None, name: str, value: object ) -> None: """A replacement for the ``record_property`` fixture.""" if user_props is not None: user_props.append((f"web_poet_{name}", json.dumps(value))) 
@classmethod
def save(
    cls,
    base_directory: str | os.PathLike[str],
    *,
    inputs: Iterable[Any],
    item: Any = None,
    exception: Exception | None = None,
    meta: dict | None = None,
    fixture_name: str | None = None,
) -> Self:
    """Save and return a fixture.

    The fixture is created in ``base_directory / fixture_name``. When
    ``fixture_name`` is not given, the first unused ``test-N`` name in
    ``base_directory`` is picked.

    ``inputs`` are serialized into the inputs subdirectory. ``meta``, if
    not empty, is written to the metadata file, with the ``adapter`` class
    replaced by its fully qualified name. ``item``, if not None, is written
    to the output file and ``exception``, if given, to the exception file.

    The ``meta`` dict passed by the caller is not modified.
    """
    if not fixture_name:
        fixture_name = _get_available_filename("test-{}", base_directory)
    fixture_dir = Path(base_directory, fixture_name)
    fixture = cls(fixture_dir)
    fixture.input_path.mkdir(parents=True)
    serialized_inputs = serialize(inputs)
    storage = SerializedDataFileStorage(fixture.input_path)
    storage.write(serialized_inputs)
    if meta:
        if meta.get("adapter"):
            # Build a new dict rather than overwriting the caller's entry:
            # the caller may still need the adapter class after saving.
            meta = {**meta, "adapter": get_fq_class_name(meta["adapter"])}
        fixture.meta_path.write_text(_format_json(meta), encoding="utf-8")
    if item is not None:
        # Written after the metadata, as item_to_json() picks the adapter
        # class from the metadata.
        with fixture.output_path.open("w", encoding="utf-8") as f:
            f.write(fixture.item_to_json(item))
    if exception:
        exc_data = _exception_to_dict(exception)
        fixture.exception_path.write_text(_format_json(exc_data), encoding="utf-8")
    return fixture
class TestCase(pytest.File):
    """Collector for the ``output.json`` or ``exception.json`` file of a
    testcase directory, bound to one page class."""

    def __init__(self, *args, type_name: str, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # The collected file lives directly in the fixture directory.
        self.fixture = Fixture(self.path.parent)
        self.page_cls: type = load_class(type_name)

    def collect(self) -> Iterable[pytest.Item | pytest.Collector]:
        """Return the test items for this fixture.

        A fixture with an exception file yields a single "to_item raises"
        item. Otherwise it yields either a single whole-item test (with the
        ``WEB_POET_TEST_PER_ITEM`` option) or two overall tests followed by
        one test per expected output field.
        """
        shared = {
            "parent": self,
            "fixture": self.fixture,
            "page_cls": self.page_cls,
        }

        if self.fixture.exception_path.exists():
            return [
                WebPoetExpectedException.from_parent(name="TO_ITEM_RAISES", **shared)
            ]

        if self.config.getoption("WEB_POET_TEST_PER_ITEM", default=False):
            return [WebPoetItem.from_parent(name="item", **shared)]

        items: list[pytest.Item] = [
            WebPoetNoToItemException.from_parent(
                name="TO_ITEM_DOESNT_RAISE", **shared
            ),
            WebPoetNoExtraFieldsItem.from_parent(name="NO_EXTRA_FIELDS", **shared),
        ]
        items.extend(
            WebPoetFieldItem.from_parent(name=field, field_name=field, **shared)
            for field in self.fixture.get_expected_output_fields()
        )
        return items
class WebPoetExpectedException(_WebPoetItem):
    """Test item checking that to_item() raises the saved exception type."""

    def runtest(self) -> None:
        self.fixture.assert_toitem_exception(
            page_cls=self.page_cls, user_props=self.user_properties
        )

    def reportinfo(self):
        path = self.path
        expected_cls = self.fixture.get_expected_exception().__class__
        description = f"{self.testname}: to_item raises {expected_cls.__name__}"
        return path, 0, description

    def repr_failure(self, excinfo, style=None):
        """Explain a missing or mismatching exception; defer to the default
        representation for any other failure."""
        expected = self.fixture.get_expected_exception()
        failure = excinfo.value

        if isinstance(failure, ExceptionNotRaised):
            expected_name = get_fq_class_name(type(expected))
            return (
                "to_item() didn't raise an exception."
                f" {expected_name} was expected."
            )

        if isinstance(failure, WrongExceptionRaised):
            # The exception actually raised by to_item() is the cause of
            # the WrongExceptionRaised error; show its own traceback.
            got = failure.__cause__
            inner_excinfo = pytest.ExceptionInfo.from_exc_info(
                (type(got), got, got.__traceback__)
            )
            expected_name = get_fq_class_name(type(expected))
            got_name = get_fq_class_name(type(got))
            summary = (
                "to_item() raised a wrong exception. Expected"
                f" {expected_name}, got"
                f" {got_name}.\n\n"
            )
            return summary + str(super().repr_failure(inner_excinfo, style))

        return super().repr_failure(excinfo, style)
def comparison_error_message(
    config: pytest.Config, op: str, expected: Any, got: Any, prefix: str = ""
) -> str:
    """Build a multi-line message describing a failed comparison.

    The message starts with ``prefix`` (when not empty), followed by the
    explanation produced by ``assertrepr_compare`` for ``got <op> expected``.
    When no explanation is produced, a plain "Expected ..., got ..." line is
    used instead.
    """
    explanation = assertrepr_compare(config=config, op=op, left=got, right=expected)
    details = explanation or [f"Expected: {expected!r}, got: {got!r}"]
    heading = [prefix] if prefix else []
    return "\n".join([*heading, *details])
def get_fq_class_name(cls: type) -> str:
    """Return the fully qualified name for a type.

    The name is the module of the type and its qualified name, joined
    with a dot.

    >>> from web_poet import Injectable
    >>> get_fq_class_name(Injectable)
    'web_poet.pages.Injectable'
    >>> from decimal import Decimal
    >>> get_fq_class_name(Decimal)
    'decimal.Decimal'
    """
    module_name = cls.__module__
    qualified_name = cls.__qualname__
    return f"{module_name}.{qualified_name}"
""" class DeprecatedClass(new_class.__class__): # type: ignore[misc, name-defined] deprecated_class = None warned_on_subclass = False def __new__( metacls, name: str, bases: tuple[type, ...], clsdict_: dict[str, Any] ) -> type: cls = super().__new__(metacls, name, bases, clsdict_) if metacls.deprecated_class is None: metacls.deprecated_class = cls return cls def __init__(cls, name: str, bases: tuple[type, ...], clsdict_: dict[str, Any]): meta = cls.__class__ old = meta.deprecated_class if old in bases and not (warn_once and meta.warned_on_subclass): meta.warned_on_subclass = True msg = subclass_warn_message.format( cls=_clspath(cls), old=_clspath(old, old_class_path), new=_clspath(new_class, new_class_path), ) if warn_once: msg += " (warning only on first subclass, there may be others)" warn(msg, DeprecationWarning, stacklevel=2) super().__init__(name, bases, clsdict_) # see https://www.python.org/dev/peps/pep-3119/#overloading-isinstance-and-issubclass # and https://docs.python.org/reference/datamodel.html#customizing-instance-and-subclass-checks # for implementation details def __instancecheck__(cls, inst: Any) -> bool: return any(cls.__subclasscheck__(c) for c in (type(inst), inst.__class__)) def __subclasscheck__(cls, sub: type) -> bool: if cls is not DeprecatedClass.deprecated_class: # we should do the magic only if second `issubclass` argument # is the deprecated class itself - subclasses of the # deprecated class should not use custom `__subclasscheck__` # method. 
def memoizemethod_noargs(method: CallableT) -> CallableT:
    """Decorator caching the result of a method (without arguments) per
    instance, using a weak reference to the instance.

    It is faster than :func:`cached_method`, and doesn't add new attributes
    to the instance, but it doesn't work if objects are unhashable.
    """
    # Weak keys: an entry does not keep its instance alive.
    results: weakref.WeakKeyDictionary = weakref.WeakKeyDictionary()

    @wraps(method)
    def new_method(self, *args, **kwargs):
        if self in results:
            return results[self]
        # Arguments are passed through on the first call, but they are not
        # part of the cache key.
        value = method(self, *args, **kwargs)
        results[self] = value
        return value

    return new_method  # type: ignore[return-value]
def _cached_method_sync(method, cached_method_name: str):
    """Wrap a regular method so that its results are cached per instance.

    The cache is an ``lru_cache``-wrapped bound method, created on the first
    call and stored on the instance under ``cached_method_name``.
    """

    @wraps(method)
    def inner(self, *args, **kwargs):
        try:
            cached_meth = getattr(self, cached_method_name)
        except AttributeError:
            # First call for this instance: bind the method, wrap it in an
            # unbounded lru_cache and remember the wrapper on the instance.
            cached_meth = lru_cache(maxsize=None)(MethodType(method, self))
            setattr(self, cached_method_name, cached_meth)
        return cached_meth(*args, **kwargs)

    return inner
_alru_cache: Callable = alru_cache _async_lru_version = packaging.version.parse(async_lru_version) if _async_lru_version.major < 2: _alru_cache = partial(alru_cache, cache_exceptions=False) def as_list(value: Any) -> list[Any]: """Normalizes the value input as a list. >>> as_list(None) [] >>> as_list("foo") ['foo'] >>> as_list(123) [123] >>> as_list(["foo", "bar", 123]) ['foo', 'bar', 123] >>> as_list(("foo", "bar", 123)) ['foo', 'bar', 123] >>> as_list(range(5)) [0, 1, 2, 3, 4] >>> def gen(): ... yield 1 ... yield 2 >>> as_list(gen()) [1, 2] """ if value is None: return [] if isinstance(value, str): return [value] if not isinstance(value, Iterable): return [value] return list(value) async def ensure_awaitable(obj): """Return the value of obj, awaiting it if needed""" if inspect.isawaitable(obj): return await obj return obj def str_to_pattern(url_pattern: str | Patterns) -> Patterns: if isinstance(url_pattern, Patterns): return url_pattern return Patterns([url_pattern]) def get_generic_param(cls: type, expected: type | tuple[type, ...]) -> type | None: """Search the base classes recursively breadth-first for a generic class and return its param. Returns the param of the first found class that is a subclass of ``expected``. """ visited = set() queue = deque([cls]) while queue: node = queue.popleft() visited.add(node) for base in getattr(node, "__orig_bases__", []): origin = getattr(base, "__origin__", None) if origin and issubclass(origin, expected): result = get_args(base)[0] if not isinstance(result, TypeVar): return result queue.append(base) return None