pax_global_header 0000666 0000000 0000000 00000000064 15205334607 0014517 g ustar 00root root 0000000 0000000 52 comment=c516547eb3bb76d69b3377080ca71f52394b83ff
rushter-selectolax-b2a09be/ 0000775 0000000 0000000 00000000000 15205334607 0016042 5 ustar 00root root 0000000 0000000 rushter-selectolax-b2a09be/.dockerignore 0000664 0000000 0000000 00000000225 15205334607 0020515 0 ustar 00root root 0000000 0000000 .git
.gitignore
.gitmodules
.idea
.mypy_cache
.pytest_cache
.ruff_cache
.venv
build
dist
__pycache__
*.pyc
*.pyo
*.pyd
*.so
cython_debug
docs/_build
rushter-selectolax-b2a09be/.github/ 0000775 0000000 0000000 00000000000 15205334607 0017402 5 ustar 00root root 0000000 0000000 rushter-selectolax-b2a09be/.github/workflows/ 0000775 0000000 0000000 00000000000 15205334607 0021437 5 ustar 00root root 0000000 0000000 rushter-selectolax-b2a09be/.github/workflows/make_release.yml 0000664 0000000 0000000 00000003610 15205334607 0024577 0 ustar 00root root 0000000 0000000 name: Build and upload to PyPI
# Run only when a GitHub release is published.
on:
  release:
    types:
      - published

jobs:
  # One wheel-building job per OS/architecture pair listed in the matrix.
  build_wheels:
    name: Build wheels for ${{ matrix.os }}
    runs-on: ${{ matrix.runs-on }}
    strategy:
      matrix:
        include:
          - os: linux-intel
            runs-on: ubuntu-latest
          - os: linux-arm
            runs-on: ubuntu-24.04-arm
          - os: windows-intel
            runs-on: windows-latest
          - os: windows-arm
            runs-on: windows-11-arm
          - os: macos-intel
            runs-on: macos-15-intel
          - os: macos-arm
            runs-on: macos-latest
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: true
      - name: Build wheels
        uses: pypa/cibuildwheel@v3.2.0
        env:
          # No matrix entry above sets `platform`, so this falls back to 'auto'.
          CIBW_PLATFORM: ${{ matrix.platform || 'auto' }}
      - uses: actions/upload-artifact@v4
        with:
          name: cibw-wheels-${{ matrix.os }}-${{ matrix.platform}}-${{ strategy.job-index }}
          path: ./wheelhouse/*.whl

  # Build the source distribution once, on Linux.
  build_sdist:
    name: Build source distribution
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: true
      - name: Build sdist
        run: |
          pip install -U Cython packaging setuptools wheel
          python setup.py build_ext --inplace --cython
          python setup.py sdist
      - uses: actions/upload-artifact@v4
        with:
          path: dist/*.tar.gz
          retention-days: 1
          name: cibw-sdist

  # Collect every cibw-* artifact into dist/ and publish it.
  upload_pypi:
    needs:
      - build_wheels
      - build_sdist
    runs-on: ubuntu-latest
    environment: release
    permissions:
      # NOTE(review): presumably for PyPI trusted publishing, since no
      # token or password is passed to the publish step -- confirm.
      id-token: write
    if: github.event_name == 'release' && github.event.action == 'published'
    steps:
      - uses: actions/download-artifact@v4
        with:
          pattern: cibw-*
          path: dist
          merge-multiple: true
      - uses: pypa/gh-action-pypi-publish@release/v1
rushter-selectolax-b2a09be/.github/workflows/pythonpackage.yml 0000664 0000000 0000000 00000003435 15205334607 0025024 0 ustar 00root root 0000000 0000000 name: Python package
# Run for pull requests targeting master and for pushes to master.
on:
  pull_request:
    branches:
      - master
  push:
    branches:
      - master

jobs:
  # Build the extension and run the test suites on every
  # Python version / runner combination.
  test:
    strategy:
      max-parallel: 6
      matrix:
        python-version:
          - "3.11"
          - "3.12"
          - "3.13"
        platform:
          - ubuntu-24.04
          - macos-latest
    runs-on: ${{ matrix.platform }}
    timeout-minutes: 6
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: true
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip setuptools
          pip install -r requirements_dev.txt
          USE_LEXBOR=1 python setup.py build_ext --inplace --cython
      - name: Test with pytest
        run: |
          USE_LEXBOR=1 pytest tests
      - name: Test typesafety
        run: |
          pytest typesafety

  # Static checks (Ruff, Mypy, cython-lint) on a single Python/runner pair.
  lint:
    strategy:
      max-parallel: 6
      matrix:
        python-version:
          - "3.13"
        platform:
          - ubuntu-24.04
    runs-on: ${{ matrix.platform }}
    timeout-minutes: 6
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: true
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip setuptools
          pip install -r requirements_dev.txt
          python3 -m pip install types-pyinstaller
          USE_LEXBOR=1 python setup.py build_ext --inplace --cython
      - name: Lint using Ruff
        run: ruff check selectolax tests
      - name: Lint Mypy
        run: mypy selectolax tests
      - name: Lint Cython
        run: cython-lint selectolax/
rushter-selectolax-b2a09be/.gitignore 0000664 0000000 0000000 00000001533 15205334607 0020034 0 ustar 00root root 0000000 0000000 # Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
.idea
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# pyenv python configuration file
.python-version
selectolax/parser.c
selectolax/lexbor.c
# virtual env
.venv/
venv/
tmp/
rushter-selectolax-b2a09be/.gitmodules 0000664 0000000 0000000 00000000234 15205334607 0020216 0 ustar 00root root 0000000 0000000 [submodule "modest"]
path = modest
url = https://github.com/lexborisov/modest
[submodule "lexbor"]
path = lexbor
url = https://github.com/lexbor/lexbor
rushter-selectolax-b2a09be/.readthedocs.yaml 0000664 0000000 0000000 00000002263 15205334607 0021274 0 ustar 00root root 0000000 0000000 # Read the Docs configuration file for Sphinx projects
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# Required
version: 2

# Build environment: OS image, tool versions and extra build steps.
build:
  os: ubuntu-lts-latest
  tools:
    python: latest
    # Other tool versions can be pinned here as well:
    # nodejs: "20"
    # rust: "1.70"
    # golang: "1.20"
  jobs:
    # Commands run before the Sphinx build: fetch the git submodules,
    # install the development requirements, then install the package
    # in development mode.
    pre_build:
      - git submodule sync
      - git submodule update --init --recursive
      - pip install -r requirements_dev.txt
      - python setup.py develop

# Build the documentation in the "docs/" directory with Sphinx.
sphinx:
  configuration: docs/conf.py
  # A different builder can be selected, e.g. dirhtml for simpler URLs:
  # builder: "dirhtml"
  # Fail on all warnings to avoid broken references:
  # fail_on_warning: true

# Optionally build the docs in additional formats such as PDF and ePub:
# formats:
#   - pdf
#   - epub

# Python requirements needed to build the documentation.
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
  install:
    - requirements: requirements_dev.txt
rushter-selectolax-b2a09be/CHANGES.md 0000664 0000000 0000000 00000021103 15205334607 0017431 0 ustar 00root root 0000000 0000000 # selectolax Changelog
# Version 0.4.10
- Do not destroy nodes when stripping tags
- Allow building selectolax using older lexbor versions (#218)
- Update lexbor. Fixes crashes when parsing HTML (#217).
# Version 0.4.9
- Add an ability to specify tags and namespace for fragmented parser
- Add a new serialization mode when pretty printing: `html5test`
- Allow empty HTML fragments
- Fix attrs access for non-element nodes
- Improve buffer cleanups
- Fix duplicate text when doing `text(deep=True)` on a text node
- Improve text concatenation performance
- Improve attribute handling
# Version 0.4.8
- Add `html_pretty` and `inner_html_pretty` methods
- Enable free-threading
- Improve `merge_text_nodes`
- Update lexbor
# Version 0.4.7
- Fix `.text()` and `iter()` for HTML fragments when there are multiple nodes at the root level. Resolves #209.
- Update lexbor. Resolves #212.
- Breaking changes: Empty tags are now serialized to `
` instead of `
`
([Commit 4530fed](https://github.com/lexbor/lexbor/commit/4530fed3f3a2b1c3729f7742be4f56131cb8e086)).
- Improve `unwrap_tags` and `merge_text_nodes`.
# Version 0.4.6
- Fix HTML parsing in fragment parser for `LexborHTMLParser`
- Fix memory leak in fragment parser
- Improve `skip_empty` parameter for text methods
- Add `comment_content` method
- Minor performance optimizations
- Add `create_tag` method to `LexborHTMLParser`
- Fix advanced selector (`.select()`) when attributes are empty.
# Version 0.4.5
- Broken release. Not published to PyPi.
# Version 0.4.4
- Add `is_fragment` parameter to `LexborHTMLParser` @pygarap
- Add the ability to skip empty text nodes for lexbor backend to `.text`, `.iter`, `.traverse` @pygarap
- Add new properties to lexbor backend: `is_element_node`, `is_text_node`, `is_comment_node`, `is_document_node`. @pygarap
- Update `lexbor` library
# Version 0.4.3
- Update `lexbor` library
- Fix missing description on PyPi.
# Version 0.4.2
- Broken release. Not published to PyPi.
# Version 0.4.1
- Fix parsing of CSS selectors that contain Unicode characters.
# Version 0.4.0
- Fix incorrect default value in docstrings for strict argument
- Fix incorrect exception handling for `any_css_matches`
- Fix docstring for `css_first` method
- Fix memory leak in `merge_text_nodes` for lexbor backend
- Update lexbor backend
- Add `.inner_html` property. Allows getting and setting the inner HTML of a node.
- Update various docstrings.
- Optimize performance for `css_first` in lexbor backend
- Fix segfaults when accessing attributes. Resolves #135.
- Add new `.clone` method to lexbor backend. Resolve #117.
- Improve unicode handling for malformed text. Resolves #138.
- Fix segfaults when doing double `.decompose`. Resolves #179.
- Fix segfaults when doing double `.unwrap`. Resolves #169.
- Fix typo for tag names. Clarify available tag names.
## Version 0.3.34
Released
- Lexbor backend now supports `:lexbor-contains("abc" i)` CSS pseudo-class to match text nodes.
## Version 0.3.33
Released
- Add `merge_text_nodes` to lexbor backend. Fixes #170. @amirshukayev
- Performance improvements in Cython code. @Vizonex
## Version 0.3.32
Released
- Update lexbor. New version of lexbor fixes bugs with CSS selectors.
## Version 0.3.31
Released
- Improve type hints, add docstrings to type hints
- Prevent decomposing of the root node
- Unpin Cython version and make it Optional
- Allow empty attribute values. Fixes #165.
## Version 0.3.30
Released
- Update lexbor
- Expose `SelectolaxError` exception in lexbor.pyi
## Version 0.3.29
Released
- Feat: Add unwrap empty tags functionality. Fixes #159.
## Version 0.3.28
Released
- Fix: Update lexbor and improve HTML serialization speed. Fixes #153.
- Fix: typo in type annotations. Fixes #147.
- Fix: Fix incorrect type annotations for `LexborHTMLParser.__init__`. Fixes #144.
## Version 0.3.27
Released
- Fix: Header detected as head
## Version 0.3.26
Released
- Improve type hints
## Version 0.3.25
Released
- Feat: Add `parse_fragment()` and `create_tag()`
- Add missing typing for `Node.insert_child()`
- Add `Node.parser` to access the `HTMLParser` to which the node belongs
## Version 0.3.24
Released
- Add `Node.insert_child` method to lexbor and modest backends
## Version 0.3.23
Released
- Add Python 3.13 wheels
- Update lexbor
## Version 0.3.21
Released
- ***Breaking change***: `lexbor` backend now includes the root node when querying CSS selectors. Same as `Modest` backend.
- Fix `css_matches` and `any_css_matches` methods for `Modest` backend on some compilers
## Version 0.3.20
Released
- Fixup for 0.3.19 release
- Fix tag order for `lexbor` backend
## Version 0.3.19
Released
- Increase maximum HTML size to 2.4GB
## Version 0.3.18
Released
- Fix memory leak when using CSS selectors, `lexbor` backend
## Version 0.3.17
Released
- Update lexbor
- Add Python 3.12 wheels
## Version 0.3.16
Released
- Make HTML nodes hashable
- Pin Cython version
## Version 0.3.15
Released
- Improve typing. Thanks to @nesb1
## Version 0.3.14
Released
- Fix memory leak for `lexbor` backend
## Version 0.3.13
Released
- Update `lexbor`
## Version 0.3.12
Released
- Update `lexbor`
- Add Python 3.11 wheels
## Version 0.3.11
Released
- Fix out-of-bounds bug for `merge_text_nodes` method.
## Version 0.3.10
Released
This release does not contain any changes.
Due to a typo in the version number ([#70](https://github.com/rushter/selectolax/issues/70)), we need to make a new release.
## Version 0.3.9
Released
- Remove trailing separator when using `text(deep=True, separator='x')`.
- Add a new `merge_text_nodes` method for Modest backend.
## Version 0.3.8
Released
- Fix incorrect text handling when using `text(deep=True)` on a text node.
## Version 0.3.7
Released
- Fix return type of HTMLParser.tags
## Version 0.3.6
Released
- Improve text handling
- Add binary builds for Python 3.10 and ARM on MacOS and Linux
## Version 0.3.5
Released
- Add type annotations
## Version 0.3.4
Released
- Fix `HTMLParser.html`
## Version 0.3.3
Released
- Use `document` for the `HTMLParser.html`, `LexborHTMLParser.html` root properties
## Version 0.3.2
Released
- Fix `selector` method for lexbor
- Improve text extraction for lexbor
## Version 0.3.1
Released
- Fix `setup.py` for Windows
## Version 0.3.0
Released
- Added `lexbor` backend
- Fix cloning for `Modest` backend
## Version 0.2.14
Released
- Added advanced Selector (the `select` method)
- Improved speed of `strip_tags`
- Added `clone` method for the `HtmlParser` object
- Exposed `detect_encoding`, `decode_errors`, `use_meta_tags`, `raw_html` attributes for `HtmlParser`
- Added `sget` method to the `attrs` property
## Version 0.2.13
Released
- Don't throw exception when encoding text as UTF-8 bytes fails ([#40](https://github.com/rushter/selectolax/issues/40)).
- Fix Node.attrs.items() causes ([#39](https://github.com/rushter/selectolax/issues/39)).
## Version 0.2.12
Released
- Build wheels for Apple Silicon
## Version 0.2.11
Released
- Fix strip argument is ignored for the root node ([#35](https://github.com/rushter/selectolax/issues/35)).
- Fix CSS parser hangs on a bad CSS selector ([#36](https://github.com/rushter/selectolax/issues/36)).
## Version 0.2.10
Released
- Fix root node property ([#32](https://github.com/rushter/selectolax/issues/32)). The `root` property now points to the html tag.
## Version 0.2.9
Released
- Fix README for PyPI
## Version 0.2.8
Released
- Add wheels for Python 3.9
## Version 0.2.7
Released
- Add `raw_value` attribute for `Node` objects ([#22](https://github.com/rushter/selectolax/issues/22))
- Improve node modification operations
## Version 0.2.6
Released
- Fix dependency on the source `Node` when inserting to or modifying destination `Node`
## Version 0.2.5
Released
- Allow to pass Node instances to `replace_with`, `insert_before` and `insert_after` methods
- Added `insert_before` and `insert_after` methods
## Version 0.2.4
Released
- Set maximum input size to 80MB
- Update modest
## Version 0.2.3
Released
- Rebuild PyPi wheels to support Python 3.8 and manylinux2010
## Version 0.2.2
Released
- Fix node comparison
## Version 0.2.1
Released
- Add optional `include_text` parameter for the `iter` and `traverse` methods
## Version 0.2.0
Released
- Fix `iter()` does not yield text nodes
- Switch from TravisCI to Github Actions
- Build and ship wheels for Windows, MacOS and Linux using Azure Pipelines
- Add `unwrap` and `unwrap_tags` method ([#7](https://github.com/rushter/selectolax/issues/7))
- Add `replace_with` method ([#13](https://github.com/rushter/selectolax/issues/13))
- Add `attrs` property
- Add `traverse` method
rushter-selectolax-b2a09be/Dockerfile 0000664 0000000 0000000 00000000667 15205334607 0020045 0 ustar 00root root 0000000 0000000 FROM python:3.12-slim
# Install the C toolchain (compiler, libc headers, make) in a single layer.
# --no-install-recommends keeps optional packages out of the image; every
# package that is needed is named explicitly. The apt lists are removed in the
# same layer so they never reach the final image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    libc-dev \
    make \
    && rm -rf /var/lib/apt/lists/*
# Build inside /app.
WORKDIR /app
# Copy the requirements file on its own, ahead of the full source tree.
COPY requirements_dev.txt .
# Install the build tooling first, then the development requirements.
RUN pip install --no-cache-dir Cython setuptools wheel && \
pip install --no-cache-dir -r requirements_dev.txt
# Bring in the rest of the build context and install the package from it.
COPY . .
RUN python setup.py install
# Stage test.py and the top-level *.html files in a separate directory.
RUN mkdir /test_run && \
cp test.py *.html /test_run/
# The container starts in /test_run and runs test.py by default.
WORKDIR /test_run
CMD ["python", "test.py"]
rushter-selectolax-b2a09be/LICENSE 0000664 0000000 0000000 00000002065 15205334607 0017052 0 ustar 00root root 0000000 0000000
MIT License
Copyright (c) 2018-2026, Artem Golubin
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
rushter-selectolax-b2a09be/MANIFEST.in 0000664 0000000 0000000 00000001155 15205334607 0017602 0 ustar 00root root 0000000 0000000 include CONTRIBUTING.rst
include HISTORY.rst
include LICENSE
include README.md
include CHANGES.md
include selectolax/*
include selectolax/lexbor/*
include selectolax/modest/*
include selectolax/lexbor/*.so
exclude selectolax/*.so
recursive-include modest/source *.c *.h
recursive-include modest/include *.h
include modest/*
include modest/include/*
include modest/source/*
recursive-include lexbor/source *.c *.h
include lexbor/*
include lexbor/source/*
recursive-include tests *
recursive-exclude * __pycache__
recursive-exclude * *.py[co]
recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
rushter-selectolax-b2a09be/Makefile 0000664 0000000 0000000 00000004677 15205334607 0017520 0 ustar 00root root 0000000 0000000 .PHONY: clean clean-test clean-pyc clean-build docs help
.DEFAULT_GOAL := help
# Python snippet, exported to the environment, that opens the file named by
# argv[1] in the default web browser. Only ImportError triggers the fallback
# import, so unrelated failures (or Ctrl-C) are no longer silently swallowed.
define BROWSER_PYSCRIPT
import os, webbrowser, sys
try:
	from urllib import pathname2url
except ImportError:
	from urllib.request import pathname2url

webbrowser.open("file://" + pathname2url(os.path.abspath(sys.argv[1])))
endef
export BROWSER_PYSCRIPT
# Python snippet, exported to the environment, that reads Makefile text on
# stdin and prints one aligned "target  description" row for every line of the
# form "target: ... ## description".
define PRINT_HELP_PYSCRIPT
import re, sys

for row in sys.stdin:
	found = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', row)
	if found:
		name, text = found.groups()
		print("%-20s %s" % (name, text))
endef
export PRINT_HELP_PYSCRIPT
BROWSER := python -c "$$BROWSER_PYSCRIPT"
help:
@python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST)
clean: clean-build clean-pyc clean-test ## remove all build, test, coverage and Python artifacts
clean-build: ## remove build artifacts
rm -fr build/
rm -fr dist/
rm -rf .eggs/
find . -maxdepth 1 -name '*.egg-info' -exec rm -fr {} +
find . -maxdepth 1 -name '*.egg' -exec rm -f {} +
clean-pyc: ## remove Python file artifacts
find . -name '*.pyc' -exec rm -f {} +
find . -name '*.pyo' -exec rm -f {} +
find . -name '*~' -exec rm -f {} +
find . -name '__pycache__' -exec rm -fr {} +
clean-test: ## remove test and coverage artifacts
rm -f .coverage
rm -fr htmlcov/
lint: ## check style with ruff
ruff format selectolax tests
ruff check --fix selectolax tests
cython-lint selectolax/
mypy selectolax tests
.PHONY: test
test: ## run tests quickly with the default Python
pytest tests -s -v
coverage: ## check code coverage quickly with the default Python
coverage run --source selectolax -m pytest
coverage report -m
coverage html
$(BROWSER) htmlcov/index.html
docs: ## generate Sphinx HTML documentation, including API docs
rm -f docs/selectolax.rst
rm -f docs/modules.rst
sphinx-apidoc -o docs/ selectolax
$(MAKE) -C docs clean
$(MAKE) -C docs html
$(BROWSER) docs/_build/html/index.html
servedocs: docs ## compile the docs watching for changes
watchmedo shell-command -p '*.rst' -c '$(MAKE) -C docs html' -R -D .
# Build the sdist and wheel into dist/, then upload them with twine.
# The `setup.py upload` command this target used to rely on has been removed
# from setuptools, so the old recipe fails before uploading anything.
release: clean ## package and upload a release
	python setup.py sdist bdist_wheel
	twine upload dist/*
dist: clean ## builds source and wheel package
python setup.py sdist
python setup.py bdist_wheel
ls -l dist
install: clean ## install the package to the active Python's site-packages
python setup.py install
dev:
python setup.py build_ext --inplace --cython --lexbor
dev-static: clean-build
python setup.py build_ext --inplace --cython --static --disable-modest
rushter-selectolax-b2a09be/README.md 0000664 0000000 0000000 00000013532 15205334607 0017325 0 ustar 00root root 0000000 0000000 
---
A fast HTML5 parser with CSS selectors, written in Cython,
using [Modest](https://github.com/lexborisov/Modest/) and [Lexbor](https://github.com/lexbor/lexbor) engines.
---
[](https://pypi.org/project/selectolax)
[](https://pepy.tech/projects/selectolax)
[](https://github.com/rushter/selectolax/actions/workflows/pythonpackage.yml?query=branch%3Amaster+event%3Apush)
[](https://pypi.org/project/selectolax)
[](https://github.com/rushter/selectolax/blob/master/LICENSE)
---
## Installation
From PyPI using pip:
```bash
pip install selectolax
```
If installation fails due to compilation errors, you may need to install [Cython](https://github.com/cython/cython):
```bash
pip install selectolax[cython]
```
This usually happens when you try to install an outdated version of selectolax on a newer version of Python.
Development version from GitHub:
```bash
git clone --recursive https://github.com/rushter/selectolax
cd selectolax
pip install -r requirements_dev.txt
python setup.py install
```
How to compile selectolax while developing:
```bash
make clean
make dev
```
## Basic examples
Here are some basic examples to get you started with selectolax:
Parsing HTML and extracting text:
```python
In [1]: from selectolax.lexbor import LexborHTMLParser
...:
...: html = """
...:
Hi there
...:
Lorem Ipsum is simply dummy text of the printing and typesetting industry.
...:
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
...: """
...: tree = LexborHTMLParser(html)
In [2]: tree.css_first('h1#title').text()
Out[2]: 'Hi there'
In [3]: tree.css_first('h1#title').attributes
Out[3]: {'id': 'title', 'data-updated': '20201101'}
In [4]: [node.text() for node in tree.css('.post')]
Out[4]:
['Lorem Ipsum is simply dummy text of the printing and typesetting industry. ',
'Lorem ipsum dolor sit amet, consectetur adipiscing elit.']
```
### Using advanced CSS selectors
```python
In [1]: html = "
"
...: selector = "div > :nth-child(2n+1):not(:has(a))"
In [2]: for node in LexborHTMLParser(html).css(selector):
...: print(node.attributes, node.text(), node.tag)
...: print(node.parent.tag)
...: print(node.html)
...:
{'id': 'p1'} p
div
{'id': 'p5'} text p
div
text
```
#### Using `lexbor-contains` CSS pseudo-class to match text
```python
from selectolax.lexbor import LexborHTMLParser
html = "
hello
lexbor is AwesOme
"
parser = LexborHTMLParser(html)
# Case-insensitive search
results = parser.css('p:lexbor-contains("awesome" i)')
# Case-sensitive search
results = parser.css('p:lexbor-contains("AwesOme")')
assert len(results) == 1
assert results[0].text() == "lexbor is AwesOme"
```
* [More examples](https://selectolax.readthedocs.io/en/latest/examples.html)
### Available backends
Selectolax supports two backends: `Modest` and `Lexbor`. By default, all examples use the `Lexbor` backend.
Most of the features between backends are almost identical, but there are some differences.
As of 2024, the preferred backend is `Lexbor`. The `Modest` backend is still available for compatibility reasons,
but the underlying C library it relies on is no longer maintained.
To use `lexbor`, just import the parser and use it in a similar way to `HTMLParser`.
```python
In [1]: from selectolax.lexbor import LexborHTMLParser
In [2]: html = """
...: Hi there
...:
2021-08-15
...: """
In [3]: parser = LexborHTMLParser(html)
In [4]: parser.root.css_first("#updated").text()
Out[4]: '2021-08-15'
```
## Simple Benchmark
* Extract title, links, scripts and a meta tag from main pages of top 754 domains. See `examples/benchmark.py` for more information.
| Package | Time |
|-------------------------------|-----------|
| Beautiful Soup (html.parser) | 61.02 sec.|
| lxml / Beautiful Soup (lxml) | 9.09 sec. |
| html5_parser | 16.10 sec.|
| selectolax (Modest) | 2.94 sec. |
| selectolax (Lexbor) | 2.39 sec. |
## Links
* [selectolax API reference and examples](https://selectolax.readthedocs.io/en/latest/index.html)
* [Video introduction to web scraping using selectolax](https://youtu.be/HpRsfpPuUzE)
* [How to Scrape 7k Products with Python using selectolax and httpx](https://www.youtube.com/watch?v=XpGvq755J2U)
* [Modest introduction](https://lexborisov.github.io/Modest/)
* [Modest benchmark](https://lexborisov.github.io/benchmark-html-parsers/)
* [Python benchmark](https://rushter.com/blog/python-fast-html-parser/)
* [Another Python benchmark](https://www.peterbe.com/plog/selectolax-or-pyquery)
* [Universal interface to lxml and selectolax](https://github.com/lorien/domselect)
## License
* Modest engine — [LGPL2.1](https://github.com/lexborisov/Modest/blob/master/LICENSE)
* lexbor engine — [Apache-2.0 license](https://github.com/lexbor/lexbor?tab=Apache-2.0-1-ov-file#readme)
* selectolax — [MIT](https://github.com/rushter/selectolax/blob/master/LICENSE)
## Contributors
Thanks to all the contributors of selectolax!
rushter-selectolax-b2a09be/docs/ 0000775 0000000 0000000 00000000000 15205334607 0016772 5 ustar 00root root 0000000 0000000 rushter-selectolax-b2a09be/docs/.gitignore 0000664 0000000 0000000 00000000057 15205334607 0020764 0 ustar 00root root 0000000 0000000 /selectolax.rst
/selectolax.*.rst
/modules.rst
rushter-selectolax-b2a09be/docs/Makefile 0000664 0000000 0000000 00000015172 15205334607 0020440 0 ustar 00root root 0000000 0000000 # Makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = _build
# User-friendly check for sphinx-build
ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
endif
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# Every target in this file is a command, not a file. Declare them all phony
# (including latexpdfja, texinfo, info, xml and pseudoxml, which were missing)
# so a same-named file or directory can never make a target look up to date.
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp \
        epub latex latexpdf latexpdfja text man texinfo info gettext changes \
        linkcheck doctest xml pseudoxml
# Print the list of available documentation targets.
# The usage line names the <target> placeholder explicitly, and the target
# names are padded to a common width so the descriptions line up.
help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo "  html       to make standalone HTML files"
	@echo "  dirhtml    to make HTML files named index.html in directories"
	@echo "  singlehtml to make a single large HTML file"
	@echo "  pickle     to make pickle files"
	@echo "  json       to make JSON files"
	@echo "  htmlhelp   to make HTML files and a HTML help project"
	@echo "  qthelp     to make HTML files and a qthelp project"
	@echo "  devhelp    to make HTML files and a Devhelp project"
	@echo "  epub       to make an epub"
	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
	@echo "  text       to make text files"
	@echo "  man        to make manual pages"
	@echo "  texinfo    to make Texinfo files"
	@echo "  info       to make Texinfo files and run them through makeinfo"
	@echo "  gettext    to make PO message catalogs"
	@echo "  changes    to make an overview of all changed/added/deprecated items"
	@echo "  xml        to make Docutils-native XML files"
	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
	@echo "  linkcheck  to check all external links for integrity"
	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
clean:
rm -rf $(BUILDDIR)/*
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
singlehtml:
$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
@echo
@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
pickle:
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
@echo
@echo "Build finished; now you can process the pickle files."
json:
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
@echo
@echo "Build finished; now you can process the JSON files."
htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in $(BUILDDIR)/htmlhelp."
qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/selectolax.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/selectolax.qhc"
devhelp:
$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
@echo
@echo "Build finished."
@echo "To view the help file:"
@echo "# mkdir -p $$HOME/.local/share/devhelp/selectolax"
@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/selectolax"
@echo "# devhelp"
epub:
$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
@echo
@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
@echo "Run \`make' in that directory to run these through (pdf)latex" \
"(use \`make latexpdf' here to do that automatically)."
latexpdf:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through pdflatex..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
latexpdfja:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo "Running LaTeX files through platex and dvipdfmx..."
$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
text:
$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
@echo
@echo "Build finished. The text files are in $(BUILDDIR)/text."
man:
$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
@echo
@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
texinfo:
$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
@echo
@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
@echo "Run \`make' in that directory to run these through makeinfo" \
"(use \`make info' here to do that automatically)."
# Build the Texinfo sources, then run them through makeinfo.
# The sub-make is invoked via $(MAKE), as latexpdf and latexpdfja already do,
# so command-line flags and the jobserver are passed down to it.
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
gettext:
$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
@echo
@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
changes:
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
@echo
@echo "The overview file is in $(BUILDDIR)/changes."
linkcheck:
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
@echo
@echo "Link check complete; look for any errors in the above output " \
"or in $(BUILDDIR)/linkcheck/output.txt."
doctest:
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
@echo "Testing of doctests in the sources finished, look at the " \
"results in $(BUILDDIR)/doctest/output.txt."
xml:
$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
@echo
@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
# Build the documentation with the Sphinx "pseudoxml" builder; output goes to
# $(BUILDDIR)/pseudoxml.
pseudoxml:
$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
@echo
@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
rushter-selectolax-b2a09be/docs/conf.py 0000775 0000000 0000000 00000021231 15205334607 0020273 0 ustar 00root root 0000000 0000000 #!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# selectolax documentation build configuration file, created by
# sphinx-quickstart on Tue Jul 9 22:26:36 2013.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import os
import platform
import sys
# If extensions (or modules to document with autodoc) are in another
# directory, add these directories to sys.path here. If the directory is
# relative to the documentation root, use os.path.abspath to make it
# absolute, like shown here.
# sys.path.insert(0, os.path.abspath('.'))
# sys.path.insert(0, os.path.abspath('../../'))
# Get the project root dir, which is taken to be the parent dir of the
# current working directory (os.getcwd()), not of this file's location.
cwd = os.getcwd()
project_root = os.path.dirname(cwd)
# Insert the project root dir as the first element in the PYTHONPATH.
# This lets us ensure that the source package is imported, and that its
# version is used.
# NOTE(review): the insert only happens when platform.system() reports
# "Darwin"; on other platforms selectolax is presumably expected to be
# importable already (e.g. installed) -- confirm this is intentional.
if platform.system() == "Darwin":
sys.path.insert(0, project_root)
# Imported after the sys.path adjustment above; its __version__ is used for
# the `version` and `release` settings below.
import selectolax
# -- General configuration ---------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
# needs_sphinx = '1.0'
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.viewcode",
"numpydoc",
"sphinxext.opengraph",
"sphinx_copybutton",
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# The suffix of source filenames.
source_suffix = ".rst"
# The encoding of source files.
# source_encoding = 'utf-8-sig'
# The master toctree document.
master_doc = "index"
# General information about the project.
project = "selectolax"
copyright = "2018-2026, Artem Golubin"
# The version info for the project you're documenting, acts as replacement
# for |version| and |release|, also used in various other places throughout
# the built documents.
#
# The short X.Y version.
version = selectolax.__version__
# The full version, including alpha/beta/rc tags.
release = selectolax.__version__
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
# language = None
# There are two options for replacing |today|: either, you set today to
# some non-false value, then it is used:
# today = ''
# Else, today_fmt is used as the format for a strftime call.
# today_fmt = '%B %d, %Y'
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
exclude_patterns = ["_build"]
# The reST default role (used for this markup: `text`) to use for all
# documents.
# default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
# add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
# add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
# show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = "sphinx"
# A list of ignored prefixes for module index sorting.
# modindex_common_prefix = []
# If true, keep warnings as "system message" paragraphs in the built
# documents.
# keep_warnings = False
# -- Options for HTML output -------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
# html_theme = 'default'
html_theme = "furo"
# Theme options are theme-specific and customize the look and feel of a
# theme further. For a list of options available for each theme, see the
# documentation.
html_theme_options = {
"source_repository": "https://github.com/rushter/selectolax",
"source_branch": "master",
"source_directory": "docs/",
}
# Add any paths that contain custom themes here, relative to this directory.
# html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
# html_title = None
# A shorter title for the navigation bar. Default is the same as
# html_title.
# html_short_title = None
# The name of an image file (relative to this directory) to place at the
# top of the sidebar.
html_logo = "logo.png"
# The name of an image file (within the static path) to use as favicon
# of the docs. This file should be a Windows icon file (.ico) being
# 16x16 or 32x32 pixels large.
# html_favicon = None
# Add any paths that contain custom static files (such as style sheets)
# here, relative to this directory. They are copied after the builtin
# static files, so a file named "default.css" will overwrite the builtin
# "default.css".
# html_static_path = ['_static']
# If not '', a 'Last updated on:' timestamp is inserted at every page
# bottom, using the given strftime format.
# html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
# html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
# html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names
# to template names.
# html_additional_pages = {}
# If false, no module index is generated.
# html_domain_indices = True
# If false, no index is generated.
# html_use_index = True
# If true, the index is split into individual pages for each letter.
# html_split_index = False
# If true, links to the reST sources are added to the pages.
# html_show_sourcelink = True
# If true, "Created using Sphinx" is shown in the HTML footer.
# Default is True.
# html_show_sphinx = True
# If true, "(C) Copyright ..." is shown in the HTML footer.
# Default is True.
# html_show_copyright = True
# If true, an OpenSearch description file will be output, and all pages
# will contain a <link> tag referring to it. The value of this option
# must be the base URL from which the finished HTML is served.
# html_use_opensearch = ''
# This is the file name suffix for HTML files (e.g. ".xhtml").
# html_file_suffix = None
# Output file base name for HTML help builder.
htmlhelp_basename = "selectolaxdoc"
# -- Options for LaTeX output ------------------------------------------
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
# 'preamble': '',
}
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass
# [howto/manual]).
latex_documents = [
("index", "selectolax.tex", "selectolax Documentation", "Artem Golubin", "manual"),
]
# The name of an image file (relative to this directory) to place at
# the top of the title page.
# latex_logo = None
# For "manual" documents, if this is true, then toplevel headings
# are parts, not chapters.
# latex_use_parts = False
# If true, show page references after internal links.
# latex_show_pagerefs = False
# If true, show URL addresses after external links.
# latex_show_urls = False
# Documents to append as an appendix to all manuals.
# latex_appendices = []
# If false, no module index is generated.
# latex_domain_indices = True
# -- Options for manual page output ------------------------------------
# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [("index", "selectolax", "selectolax Documentation", ["Artem Golubin"], 1)]
# If true, show URL addresses after external links.
# man_show_urls = False
# -- Options for Texinfo output ----------------------------------------
# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
# The description replaces the leftover template placeholder
# ("One line description of project.") so the generated Texinfo dir entry
# describes the project.
texinfo_documents = [
    (
        "index",  # source start file
        "selectolax",  # target name
        "selectolax Documentation",  # title
        "Artem Golubin",  # author
        "selectolax",  # dir menu entry
        "Fast HTML parser with CSS selectors.",  # description
        "Miscellaneous",  # category
    ),
]
# Documents to append as an appendix to all manuals.
# texinfo_appendices = []
# If false, no module index is generated.
# texinfo_domain_indices = True
# How to display URL addresses: 'footnote', 'no', or 'inline'.
# texinfo_show_urls = 'footnote'
# If true, do not generate a @detailmenu in the "Top" node's menu.
# texinfo_no_detailmenu = False
numpydoc_show_class_members = False
rushter-selectolax-b2a09be/docs/examples.rst 0000664 0000000 0000000 00000053634 15205334607 0021355 0 ustar 00root root 0000000 0000000 Examples
========
This page contains simple examples of how to use Selectolax for HTML parsing and manipulation.
.. note::
All examples use the Lexbor backend (``from selectolax.lexbor import LexborHTMLParser``)
which provides better performance and features compared to the older Modest backend.
Basic HTML Parsing
------------------
There are 3 ways to create or parse objects in Selectolax:
1. Parse HTML as a full document using ``LexborHTMLParser()``
2. Parse HTML as a fragment using ``LexborHTMLParser(..., is_fragment=True)``
3. Create a single node using ``LexborHTMLParser(...).create_node()``
- ``LexborHTMLParser()`` - Returns the HTML tree as parsed by Lexbor, unmodified. The HTML is assumed to be a full document. ``<html>``, ``<head>``, and ``<body>`` tags are added if missing.
- ``LexborHTMLParser(..., is_fragment=True)`` - Intended for HTML fragments/partials.
Behaves the same way as ``DocumentFragment`` in browsers.
Drops ``<html>``, ``<head>``, and ``<body>`` tags if present in the input HTML.
Use it to parse snippets of HTML that are not complete documents.
.. code-block:: python
from selectolax.lexbor import LexborHTMLParser
html = """
Welcome to selectolax tutorial
Excepteur sint occaecat cupidatat non proident
Lorem ipsum
Lorem ipsum dolor sit amet, ea quo modus meliore platonem.
"""
fragment = """
Hello there!
"""
# Parse HTML as a full document
parser = LexborHTMLParser(html)
# Parse HTML as a fragment
frag_parser = LexborHTMLParser(fragment, is_fragment=True)
# Create a new node for `parser`.
node = parser.create_node("div")
CSS Selectors
-------------
Select All Elements with CSS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Find all paragraph elements with class 'p3' and examine their properties.
.. code-block:: python
from selectolax.lexbor import LexborHTMLParser
html = """
Excepteur sint occaecat cupidatat non proident
Lorem ipsum
Lorem ipsum dolor sit amet, ea quo modus meliore platonem.
attributes: {'class': 'p3', 'style': 'display:none;'}
node text: Excepteur sint occaecat cupidatat non proident
tag: p
parent tag: div
last child inside current node: Excepteur sint occaecat cupidatat non proident
---------------------
---------------------
Node:
Lorem ipsum
attributes: {'class': 'p3', 'vid': ''}
node text: Lorem ipsum
tag: p
parent tag: div
last child inside current node: Lorem ipsum
---------------------
Select First Match
~~~~~~~~~~~~~~~~~~
Get the first matching element using CSS selectors.
.. code-block:: python
parser = LexborHTMLParser(html)
# Get first h1 element
print("H1: %s" % parser.css_first('h1').text())
**Output:**
.. code-block:: text
H1: Welcome to selectolax tutorial
Default Return Values
~~~~~~~~~~~~~~~~~~~~~
Handle cases where no elements match your selector by providing a default value.
.. code-block:: python
# Return default value if no matches found
print("Title: %s" % parser.css_first('title', default='not-found'))
**Output:**
.. code-block:: text
Title: not-found
Strict Mode
~~~~~~~~~~~
Ensure exactly one match exists, otherwise raise an error.
.. code-block:: python
# This will raise an error if multiple matches are found
try:
result = parser.css_first("p.p3", default='not-found', strict=True)
except Exception as e:
print(f"Error: {e}")
**Output:**
.. code-block:: text
Error: Expected 1 match, but found 2 matches
CSS Chaining
~~~~~~~~~~~~
Chain multiple CSS selectors to progressively filter results.
.. code-block:: python
html = """
"""
parser = LexborHTMLParser(html)
# Chain selectors: start with div, then span, then .red
red_spans = parser.select('div').css("span").css(".red").matches
print([node.html for node in red_spans])
**Output:**
.. code-block:: text
['', '']
HTML manipulation
-----------------
Getting HTML data back
~~~~~~~~~~~~~~~~~~~~~~
You can get HTML data back using the ``.html`` or ``.inner_html`` properties.
They can be called on any node.
.. code-block:: python
from selectolax.lexbor import LexborHTMLParser
html = """
Changing HTML
~~~~~~~~~~~~~~
You can also change HTML by setting the ``.inner_html`` property.
.. code-block:: python
from selectolax.lexbor import LexborHTMLParser
html = """
DOM Navigation
--------------
Parent Elements
~~~~~~~~~~~~~~~
Get parent element in the DOM tree.
.. code-block:: python
# Print parent of p#stext
print(parser.css_first('p#stext').parent.html)
**Output:**
.. code-block:: text
Lorem ipsum dolor sit amet, ea quo modus meliore platonem.
Nested Selectors
~~~~~~~~~~~~~~~~
Chain CSS selectors to find nested elements.
.. code-block:: python
# Chain CSS selectors
result = parser.css_first('div#text').css_first('p:nth-child(2)').html
print(result)
**Output:**
.. code-block:: text
Lorem ipsum
Iterating Over Child Nodes
~~~~~~~~~~~~~~~~~~~~~~~~~~~
Walk all child nodes of an element.
.. code-block:: python
for node in parser.css("div#text"):
for cnode in node.iter():
print(cnode.tag, cnode.html)
**Output:**
.. code-block:: text
p
Excepteur sint occaecat cupidatat non proident
p
Lorem ipsum
DOM Modification
----------------
Tag Removal
~~~~~~~~~~~
Completely remove elements from the DOM tree.
.. code-block:: python
parser = LexborHTMLParser(html)
# Remove all p tags
for node in parser.tags('p'):
node.decompose()
print(parser.body.html)
**Output:**
.. code-block:: text
Welcome to selectolax tutorial
Tag Unwrapping
~~~~~~~~~~~~~~
Remove tags but preserve their content.
.. code-block:: python
parser = LexborHTMLParser(html)
# Remove p and i tags but keep their content
parser.unwrap_tags(['p', 'i'])
print(parser.body.html)
**Output:**
.. code-block:: text
Welcome to selectolax tutorial
Excepteur sint occaecat cupidatat non proident
Lorem ipsum
Lorem ipsum dolor sit amet, ea quo modus meliore platonem.
Attribute Manipulation
~~~~~~~~~~~~~~~~~~~~~~
Add, modify, and remove element attributes.
.. code-block:: python
parser = LexborHTMLParser(html)
node = parser.css_first('div#text')
# Set attributes
node.attrs['data'] = 'secret data'
node.attrs['id'] = 'new_id'
print(node.attributes)
# Remove attributes
del node.attrs['id']
print(node.attributes)
print(node.html)
**Output:**
.. code-block:: text
{'id': 'new_id', 'data': 'secret data'}
{'data': 'secret data'}
Excepteur sint occaecat cupidatat non proident
Lorem ipsum
Inserting Nodes
~~~~~~~~~~~~~~~
Insert new content into the DOM at specific positions.
.. code-block:: python
html = """
"""
parser = LexborHTMLParser(html)
# Insert text before an element
red_node = parser.css_first('.red')
red_node.insert_before("Hello")
# Insert HTML nodes
subtree = LexborHTMLParser("<div>Hi</div>")
green_node = parser.css_first('.green')
green_node.insert_before(subtree)
# Insert before, after, or as child
car_div = parser.create_node("div")
car_div.inner_html = "Car"
green_node.insert_before(car_div)
green_node.insert_after(car_div)
green_node.insert_child(car_div)
print(parser.body.html)
Tree Traversal
--------------
Walk every node in the DOM tree and extract text content.
.. code-block:: python
parser = LexborHTMLParser(html)
# Traverse the entire tree
for node in parser.root.traverse(include_text=True):
if node.tag == '-text':
text = node.text(deep=True).strip()
if text:
print(text)
else:
print(node.tag)
**Output:**
.. code-block:: text
html
head
body
div
p
Excepteur
i
sint
occaecat cupidatat non proident
p
Lorem ipsum
div
p
Lorem ipsum dolor sit amet, ea quo modus meliore platonem.
Common Patterns
---------------
Extract Text Content
~~~~~~~~~~~~~~~~~~~~
Extract text content from HTML elements with various formatting options.
.. code-block:: python
parser = LexborHTMLParser('<p>Hello <b>world</b>!</p>')
# Get text content with different options
node = parser.css_first('p')
# Get all text content
print(node.text()) # "Hello world!"
# Get text with custom separator
print(node.text(separator=' | ')) # "Hello | world | !"
# Get text without stripping whitespace
print(node.text(strip=False))
**Output:**
.. code-block:: text
Hello world!
Hello | world | !
Hello world!
Clean HTML
~~~~~~~~~~
Remove potentially dangerous or unwanted HTML elements.
.. code-block:: python
dirty_html = '''
Good content
More content
'''
parser = LexborHTMLParser(dirty_html)
# Remove unwanted tags
for tag in parser.css('script, style'):
tag.decompose()
print(parser.body.html)
**Output:**
.. code-block:: text
Good content
More content
Extract Links and Images
~~~~~~~~~~~~~~~~~~~~~~~~
Extract all links and images from HTML content.
.. code-block:: python
html = '''
'''
parser = LexborHTMLParser(html)
# Extract all links
for link in parser.css('a[href]'):
print(f"Link: {link.text()} -> {link.attrs['href']}")
# Extract all images
for img in parser.css('img[src]'):
print(f"Image: {img.attrs.get('alt', 'No alt')} -> {img.attrs['src']}")
**Output:**
.. code-block:: text
Link: Link 1 -> https://example.com
Link: Link 2 -> /page2
Image: Image 1 -> image1.jpg
Image: Image 2 -> image2.png
Advanced selectors
------------------
Text Content Filtering
~~~~~~~~~~~~~~~~~~~~~~
Use advanced selectors to filter elements based on their text content.
.. code-block:: python
html = """
"""
parser = LexborHTMLParser(html)
# Filter script tags containing specific text
scripts_with_super = parser.select('script').text_contains("super").matches
print([node.text() for node in scripts_with_super])
**Output:**
.. code-block:: text
['\n var super_variable = 100;\n']
CSS Attribute and Pseudo-class Selectors
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. code-block:: python
html = """
First Post
Content of first post
John2023-01-01
Second Post
Content of second post
Jane2023-01-02
"""
parser = LexborHTMLParser(html)
# Class selectors
published_posts = parser.css('article.post.published')
print(f"Published posts: {len(published_posts)}")
# Descendant selectors
authors = parser.css('article .meta .author')
for author in authors:
print(f"Author: {author.text()}")
# Pseudo-class selectors
first_article = parser.css('article:first-child')
if first_article:
print(f"First article title: {first_article[0].css_first('h2').text()}")
# Attribute value selectors
specific_post = parser.css_first('article[data-id="1"]')
if specific_post:
print(f"Post ID 1 title: {specific_post.css_first('h2').text()}")
**Output:**
.. code-block:: text
Published posts: 1
Author: John
Author: Jane
First article title: First Post
Post ID 1 title: First Post
Text Content Pseudo-class Selectors
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Use lexbor-specific pseudo-classes for case-sensitive and case-insensitive text matching.
.. code-block:: python
html = '