pax_global_header 0000666 0000000 0000000 00000000064 15202532665 0014520 g ustar 00root root 0000000 0000000 52 comment=e0aee790be2b2510e3c9842d5e782545cc0fb6f1
cogent3-scinexus-e0aee79/ 0000775 0000000 0000000 00000000000 15202532665 0015422 5 ustar 00root root 0000000 0000000 cogent3-scinexus-e0aee79/.github/ 0000775 0000000 0000000 00000000000 15202532665 0016762 5 ustar 00root root 0000000 0000000 cogent3-scinexus-e0aee79/.github/workflows/ 0000775 0000000 0000000 00000000000 15202532665 0021017 5 ustar 00root root 0000000 0000000 cogent3-scinexus-e0aee79/.github/workflows/ci.yml 0000664 0000000 0000000 00000004715 15202532665 0022144 0 ustar 00root root 0000000 0000000 name: CI
on:
push:
pull_request:
# NOTE:
# if changing python versions, also update versions in
# - release.yml
# - noxfile.py
jobs:
tests:
# Runs the nox `testcov` session for every OS / Python combination in the
# matrix and uploads one lcov report per combination to Coveralls.
name: "Python ${{ matrix.python-version }} (${{ matrix.os }})"
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
python-version: ["3.11", "3.14"]
steps:
- uses: "actions/checkout@v6"
with:
fetch-depth: 0
# Setup env
- uses: "actions/setup-python@v6"
with:
python-version: "${{ matrix.python-version }}"
- name: Install uv
uses: astral-sh/setup-uv@v7
with:
enable-cache: true
cache-dependency-glob: "pyproject.toml"
# MPI packages are installed on Ubuntu runners only
- name: Install MPI (Ubuntu)
if: startsWith(matrix.os, 'ubuntu')
run: |
sudo apt-get update
sudo apt-get install -y openmpi-bin libopenmpi-dev
# The coverage file names embed the OS and Python version; everything after
# `--` is passed through to the testcov session.
- name: "Run nox for ${{ matrix.python-version }}"
shell: bash
run: |
lname="snx-${{matrix.os}}-${{matrix.python-version}}.lcov"
xname="snx-${{matrix.os}}-${{matrix.python-version}}.xml"
cov="lcov -o$lname xml -o$xname"
uv run --group dev nox --force-python python -s testcov -- $cov
# Parallel upload of this combination's lcov file; the `finish` job in this
# workflow sends `parallel-finished` once the whole matrix is done.
- name: Coveralls Parallel
uses: coverallsapp/github-action@v2
with:
parallel: true
github-token: ${{ secrets.github_token }}
flag-name: run-${{matrix.python-version}}-${{matrix.os}}
file: "snx-${{matrix.os}}-${{matrix.python-version}}.lcov"
type_check:
# Runs the nox `type_check` session on a single-entry matrix
# (ubuntu-latest, Python 3.14).
name: Type Check
runs-on: ${{ matrix.os }}
strategy:
matrix:
python-version: ["3.14"]
os: [ubuntu-latest]
steps:
- uses: actions/checkout@v6
with:
fetch-depth: 0
- uses: "actions/setup-python@v6"
with:
python-version: "${{ matrix.python-version }}"
- name: Install uv
uses: astral-sh/setup-uv@v7
with:
enable-cache: true
cache-dependency-glob: "pyproject.toml"
- name: "Run Type Checking for ${{ matrix.python-version }}"
run: |
uv run --group dev nox --force-python python -s type_check
finish:
name: "Finish Coveralls"
needs: tests
runs-on: ubuntu-latest
steps:
- name: Coveralls Finished
uses: coverallsapp/github-action@v2
with:
github-token: ${{ secrets.github_token }}
parallel-finished: true cogent3-scinexus-e0aee79/.github/workflows/codeql.yml 0000664 0000000 0000000 00000001442 15202532665 0023012 0 ustar 00root root 0000000 0000000 name: "CodeQL"
on:
push:
branches-ignore:
- master
pull_request:
branches-ignore:
- master
schedule:
- cron: '39 20 * * 6'
jobs:
analyze:
# CodeQL scan: init -> autobuild -> analyze, run once per entry in
# matrix.language (currently only python).
name: Analyze
runs-on: ubuntu-latest
# token permissions granted to this job
permissions:
actions: read
contents: read
security-events: write
strategy:
fail-fast: false
matrix:
language: [ 'python' ]
steps:
- name: Checkout repository
uses: actions/checkout@v6
- name: Initialize CodeQL
uses: github/codeql-action/init@v4
with:
languages: ${{ matrix.language }}
- name: Autobuild
uses: github/codeql-action/autobuild@v4
# results are filed under a per-language category
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v4
with:
category: "/language:${{matrix.language}}"
cogent3-scinexus-e0aee79/.github/workflows/docs.yml 0000664 0000000 0000000 00000003404 15202532665 0022473 0 ustar 00root root 0000000 0000000 name: Build Docs
concurrency:
group: docs-build-${{ github.ref }}
cancel-in-progress: true
on:
workflow_dispatch:
release:
types: [published]
push:
branches:
- main
paths:
- 'docs/**'
- '.readthedocs.yaml'
- 'zensical.toml'
- 'rtd_get_docs.py'
permissions:
contents: read
jobs:
build-docs:
  # Builds the documentation site and uploads it as the `scinexus-docs-html`
  # artifact. Only runs in the cogent3/SciNexus repository.
  if: github.repository == 'cogent3/SciNexus'
  runs-on: ubuntu-latest
  steps:
    # Action major versions are kept in step with ci.yml, linters.yml and
    # release.yml (checkout@v6, setup-python@v6, upload-artifact@v7).
    - name: Checkout Repository
      uses: actions/checkout@v6
      with:
        fetch-depth: 0
        submodules: recursive
    - uses: "actions/setup-python@v6"
      with:
        python-version: "3.14"
    - name: Install uv
      uses: astral-sh/setup-uv@v7
      with:
        enable-cache: true
        cache-dependency-glob: "pyproject.toml"
    - name: Build Documentation
      run: |
        # update executable components
        uv run --group dev nox -s cogdocs
        # build actual docs
        uv run --group dev zensical build --clean
      working-directory: ${{ github.workspace }}
    # the built site is read from the `site` directory
    - name: Upload Documentation Artifact
      uses: actions/upload-artifact@v7
      with:
        name: scinexus-docs-html
        path: site
trigger_rtd:
# After `build-docs` succeeds, POSTs the webhook token and the current ref
# name to the Read the Docs webhook URL. Only runs in cogent3/SciNexus.
if: github.repository == 'cogent3/SciNexus'
runs-on: ubuntu-latest
needs: build-docs
steps:
- name: Trigger Read the Docs build
# secrets and the ref name reach the script as environment variables rather
# than being interpolated into the command text
env:
RTDS_WEBHOOK_URL: ${{ secrets.RTDS_WEBHOOK_URL }}
RTDS_WEBHOOK_TOKEN: ${{ secrets.RTDS_WEBHOOK_TOKEN }}
BRANCH: ${{ github.ref_name }}
run: |
curl -sS --fail-with-body -X POST \
--data-urlencode "token=${RTDS_WEBHOOK_TOKEN}" \
--data-urlencode "branches=${BRANCH}" \
"${RTDS_WEBHOOK_URL}"
cogent3-scinexus-e0aee79/.github/workflows/linters.yml 0000664 0000000 0000000 00000001455 15202532665 0023227 0 ustar 00root root 0000000 0000000 name: Lint code using ruff
on:
push:
pull_request:
jobs:
linters:
# Runs the nox `fmt` session and commits any resulting changes.
runs-on: ubuntu-latest
# NOTE(review): this condition skips the job in cogent3/SciNexus itself, so it
# only runs in other repositories (presumably forks) -- confirm this is intended.
if: github.repository != 'cogent3/SciNexus'
steps:
- uses: actions/checkout@v6
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: '3.14'
- name: Install uv
uses: astral-sh/setup-uv@v7
with:
enable-cache: true
cache-dependency-glob: "pyproject.toml"
- name: Format code using ruff
run: uv run --group dev nox -s fmt
# stages everything (add: ".") and commits as the user who triggered the run
- name: Commit changes
uses: EndBug/add-and-commit@v10
with:
author_name: ${{ github.actor }}
author_email: ${{ github.actor }}@users.noreply.github.com
message: "STY: pre-commit linting with ruff"
add: "."
cogent3-scinexus-e0aee79/.github/workflows/release.yml 0000664 0000000 0000000 00000006653 15202532665 0023174 0 ustar 00root root 0000000 0000000 name: Release
on: [workflow_dispatch]
jobs:
test:
# Pre-release test matrix: runs the nox `test`, `test_types` and `test_docs`
# sessions on every OS / Python combination, plus `testmpi` on Ubuntu only
# (the only runners on which MPI is installed by this job).
name: "Test on Python ${{ matrix.python-version }} (${{ matrix.os }})"
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
python-version: ["3.11", "3.12", "3.13", "3.14"]
steps:
- uses: "actions/checkout@v6"
with:
fetch-depth: 0
# Setup env
- uses: "actions/setup-python@v6"
with:
python-version: "${{ matrix.python-version }}"
- name: Install MPI (Ubuntu)
if: startsWith(matrix.os, 'ubuntu')
run: |
sudo apt-get update
sudo apt-get install -y openmpi-bin libopenmpi-dev
- name: Install uv
uses: astral-sh/setup-uv@v7
with:
enable-cache: true
cache-dependency-glob: "pyproject.toml"
# NOTE(review): ci.yml invokes the type-checking session as `type_check`;
# confirm that a `test_types` session really exists in noxfile.py.
- name: "Run nox for ${{ matrix.python-version }}"
shell: bash
run: |
uv run --group dev nox -db uv --force-python python -s test
uv run --group dev nox -db uv --force-python python -s test_types
uv run --group dev nox -db uv --force-python python -s test_docs
- name: "Run MPI tests"
if: startsWith(matrix.os, 'ubuntu')
shell: bash
run: |
uv run --group dev nox -db uv --force-python python -s testmpi
docbuild:
# Checks that the documentation builds: runs the nox `cogdocs` session and
# then `zensical build`. The publish jobs depend on this job.
name: "Test the docs"
runs-on: ubuntu-latest
steps:
- uses: "actions/checkout@v6"
with:
fetch-depth: 0
- uses: "actions/setup-python@v6"
with:
python-version: "3.14"
- name: Install uv
uses: astral-sh/setup-uv@v7
with:
enable-cache: true
cache-dependency-glob: "pyproject.toml"
- name: "test the docs code"
run: |
uv run --group dev nox -s cogdocs
uv run --group dev zensical build
build:
# Builds the sdist and wheel with `uv build` once the `test` job has passed,
# and uploads them as the `snx-wheel-sdist` artifact for the publish jobs.
name: Build wheel and sdist
needs: test
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6
with:
fetch-depth: 0
- uses: actions/setup-python@v6
with:
python-version: '3.14'
- name: Install uv
uses: astral-sh/setup-uv@v7
with:
enable-cache: true
cache-dependency-glob: "pyproject.toml"
- name: Build sdist and wheel
run: uv build
- name: Upload sdist and wheel
uses: actions/upload-artifact@v7
with:
name: snx-wheel-sdist
path: |
./dist/*.whl
./dist/*.tar.gz
release_test:
# Downloads the `snx-wheel-sdist` artifact into ./dist and publishes it to
# Test PyPI. Requires both the `build` and `docbuild` jobs to have passed.
name: Release to Test PyPI
needs: [build, docbuild]
environment: release_test
runs-on: ubuntu-latest
# NOTE(review): id-token: write is requested and no password/token input is
# passed to the publish action -- presumably PyPI trusted publishing; confirm.
permissions:
id-token: write
steps:
- name: Download sdist and wheel
uses: actions/download-artifact@v8
with:
name: snx-wheel-sdist
path: ./dist
- name: Publish package distributions to Test PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
repository-url: https://test.pypi.org/legacy/
release:
# Final publish step: runs only after `release_test` succeeds, downloads the
# same `snx-wheel-sdist` artifact into ./dist and publishes it. No
# repository-url is set here, so the publish action uses its default index.
name: Release to PyPI
needs: release_test
environment: release
runs-on: ubuntu-latest
# NOTE(review): id-token: write with no password/token input -- presumably
# PyPI trusted publishing; confirm.
permissions:
id-token: write
steps:
- name: Download sdist and wheel
uses: actions/download-artifact@v8
with:
name: snx-wheel-sdist
path: ./dist
- name: Publish package distributions to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
cogent3-scinexus-e0aee79/.gitignore 0000664 0000000 0000000 00000000511 15202532665 0017407 0 ustar 00root root 0000000 0000000 *
!.github/**/*.yml
!.readthedocs.yaml
!.hgignore
!LICENSE
!README.md
!changelog.md
!docs/conftest.py
!docs/**/*.md
!docs/**/*.svg
!docs/*.txt
!noxfile.py
!pyproject.toml
!rtd_get_docs.py
!ruff.toml
!scripts/*.py
!src/**/*.py
!tests/data/*.fasta
!tests/data/*.gz
!tests/data/*.tsv
!tests/data/*.log
!tests/**/*.py
!zensical.toml
cogent3-scinexus-e0aee79/.hgignore 0000664 0000000 0000000 00000000627 15202532665 0017232 0 ustar 00root root 0000000 0000000 syntax:glob
.svn
*.pyc
*.pyo
*.so
*.o
*.DS_Store
*.tmproj
*.rej
*.orig
*.wpr
*.pdf
_build/*
build
*htmlcov*
*.idea
*.coverage*
*egg-info*
*.wpu
.cache*
*taskpaper
*.ipynb
*.ipynb_checkpoints*
*.sublime*
*.patch
*.pytest_cache
*.tox
*.nox
*.vscode
*.code-workspace
coverage.xml
__pycache__
junit-*.xml
dist/*
working/*
lcov*.info
.ruff_cache/*
.devcontainer/*
venv*
.venv*
.mypy*
CLAUDE.md
site/*
docs/data/* cogent3-scinexus-e0aee79/.readthedocs.yaml 0000664 0000000 0000000 00000001175 15202532665 0020655 0 ustar 00root root 0000000 0000000 # .readthedocs.yaml
# Read the Docs configuration file for MkDocs projects
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# Required
version: 2
# Set the version of Python and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.14"
commands:
# Install the required dependencies
- pip install requests
# Run the script to download and extract the pre-built docs
- python rtd_get_docs.py
- echo "Documentation downloaded and extracted"
# Disable the default build processes since we're using pre-built docs
sphinx:
configuration: null
python:
install: []
cogent3-scinexus-e0aee79/LICENSE 0000664 0000000 0000000 00000002726 15202532665 0016436 0 ustar 00root root 0000000 0000000 BSD 3-Clause License
Copyright (c) 2026, cogent3
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cogent3-scinexus-e0aee79/README.md 0000664 0000000 0000000 00000013421 15202532665 0016702 0 ustar 00root root 0000000 0000000
[](https://coveralls.io/github/cogent3/scinexus?branch=main) [](https://app.codacy.com/gh/cogent3/scinexus/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) [](https://github.com/astral-sh/ruff) [](https://github.com/cogent3/scinexus/actions/workflows/ci.yml) 
*`scinexus` is a framework for rapid development of data processing applications. It enables interoperability between objects through defined data types, allowing development of scientific domain app ecosystems. Just as `attrs` and `dataclasses` use type hints to simplify data type definition, `scinexus` uses them to simplify writing best-practice scientific algorithms.*
Many scientific problems require repeating calculations across many files or database records. Such tasks suit data-level parallelism, but writing robust, maintainable code for them is often tedious and quickly becomes complex.
As the Unix philosophy articulates, writing algorithms that do one thing well and can be composed together through piping data of known type is a *Very Good Thing*™.
**`scinexus` encourages this design pattern and eliminates the boilerplate.** We leverage the Python type annotation system to govern the compatibility (composability) of different applications. This enables in-process composition of your applications with validation of the consistency of the pipeline and the consistency of the data being run through it.
**`scinexus` is designed for scientific reproducibility.** Scientific computations should record all conditions needed to reproduce an analysis. `scinexus` reduces the effort by intercepting all arguments (including defaults) used in app construction and logging the resulting app state.
## Examples
Developers can choose between inheriting from a base class and using the `scinexus.define_app` decorator to make composable apps. The following examples show simple composition.
Loading files so missing data does not cause a crash
```python
from scinexus import define_app
@define_app(app_type="loader")
def read_json(path: str) -> dict:
import json
with open(path) as f:
return json.load(f)
@define_app
def validate(data: dict, required_field: str) -> dict:
if required_field not in data:
# this becomes a NotCompleted sentinel object
# your run doesn't crash!
raise ValueError(f"missing {required_field!r} field")
return data
app = read_json() + validate(required_field="name")
```
You can apply `app` to a single file path as `app(filepath)`, or operate in parallel (and show a progress bar) on a sequence of file paths as
```python
results = list(app.as_completed(["some_file_path.json", "some_other_file_path.json"], parallel=True, show_progress=True))
```
A contrived numerical example
```python
from scinexus import define_app
@define_app
def normalise(values: list[float]) -> list[float]:
lo, hi = min(values), max(values)
return [(v - lo) / (hi - lo) for v in values]
@define_app
def threshold(values: list[float]) -> list[bool]:
return [v > 0.5 for v in values]
app = normalise() + threshold()
app([1.0, 5.0, 3.0, 9.0])
```
A configurable app
```python
from scinexus import define_app
@define_app(app_type="loader")
def load_csv(path: str) -> list[dict]:
import csv
with open(path) as f:
return list(csv.DictReader(f))
@define_app
class summarise:
def __init__(self, column: str) -> None:
"""column contains the values to produce summary stats for"""
self.column = column
def main(self, rows: list[dict]) -> dict[str, float]:
vals = [float(r[self.column]) for r in rows]
return {"mean": sum(vals) / len(vals), "min": min(vals), "max": max(vals)}
app = load_csv() + summarise(column="price")
```
## Features
- Type checking at composition time
- Durable computing -- failures recorded as `NotCompleted` records, not exceptions
- Data-level parallel execution with pluggable backends (stdlib, loky, MPI, or custom)
- Progress bars (`tqdm` or `rich`)
- Automated logging and citation tracking
- Checkpointing via data stores (directory, SQLite)
## Installation
```bash
pip install scinexus
```
## The `scinexus` origin story
The app framework and utility functions in `scinexus` incubated inside [cogent3](https://github.com/cogent3/cogent3) from March 2019, accumulating over seven years of development, testing, and real-world use in computational genomics before being extracted into a standalone package. The design is mature and has underpinned analyses in published studies.
The extraction into `scinexus` makes the infrastructure available to any scientific Python project, free of the `cogent3` dependency. See the [changelog](changelog.md) for a detailed list of changes from the cogent3 app infrastructure.
We acknowledge here that many members of the `cogent3` community contributed to the code that now lives here, including [@GavinHuttley](https://github.com/GavinHuttley), [@rmcar17](https://github.com/rmcar17), [@Nick-Foto](https://github.com/Nick-Foto), [@KatherineCaley](https://github.com/KatherineCaley), [@fredjaya](https://github.com/fredjaya), and [@khiron](https://github.com/khiron).
cogent3-scinexus-e0aee79/changelog.md 0000664 0000000 0000000 00000005145 15202532665 0017700 0 ustar 00root root 0000000 0000000 # Changelog
Changes from the original cogent3 app infrastructure.
## New Features
- Standalone package extracted from `cogent3.app` — no cogent3 dependency required.
- Generic base classes `AppBase[T, R]`, `ComposableApp[T, R]`, and `WriterApp[T, R]` that apps can inherit from directly as an alternative to the `@define_app` decorator. Type checkers can resolve types through class inheritance without a plugin.
- mypy plugin (`scinexus._mypy_plugin`) for correct type inference of `@define_app` decorated classes. Synthesises the `__call__` return type as `R | NotCompleted`.
- `check_data_type` attribute on apps — a settable property to toggle runtime input type checking on or off. Disabling can speed up execution and simplify debugging.
- `NotCompletedType` enum (`ERROR`, `FAIL`, `BUG`) for categorising failure types, replacing bare strings.
- `set_summary_display()` / `get_summary_display()` — a module-level registry allowing downstream packages (e.g. cogent3) to register custom display functions for data store summary methods (`describe`, `summary_logs`, `summary_not_completed`, `summary_citations`, `validate`).
- `citations` and `bib` properties on apps for tracking software citations via the `citeable` library. Citations propagate through composed pipelines.
- Pluggable parallel backends -- choose between stdlib multiprocessing, loky, or MPI via `set_parallel_backend()`.
- Pluggable progress bars -- use `tqdm` or `rich` via the `Progress` protocol and `set_default_progress()`.
- `set_id_from_source()` / `get_id_from_source()` -- register a custom function for extracting storage identifiers from data.
- `apply_to()` accepts `logger=False` to disable log file creation.
## Enhancements
- App composition (`+`) now makes shallow copies of the right-hand operand. Composed pipelines no longer share mutable state.
- Composition-time type compatibility checking via `check_type_compatibility()` — catches type mismatches when apps are composed with `+`, before any data is processed.
- Data store summary methods (`describe`, `summary_logs`, etc.) return `list[dict]` or `dict` instead of cogent3 `Table` objects. Custom display can be restored via `set_summary_display()`.
- All modules pass mypy strict type checking.
- `StrOrBytes` type alias replaced with `str | bytes` throughout.
- Inline `assert` statements replaced with explicit `ValueError` / `TypeError` raises.
- Type-hint-related imports moved under `TYPE_CHECKING` for lighter runtime import overhead.
## Deprecated
- `ComposableApp.disconnect()` — discontinued, will be removed in version 2026.9. No longer required since composition uses shallow copies.
cogent3-scinexus-e0aee79/docs/ 0000775 0000000 0000000 00000000000 15202532665 0016352 5 ustar 00root root 0000000 0000000 cogent3-scinexus-e0aee79/docs/conftest.py 0000664 0000000 0000000 00000000314 15202532665 0020547 0 ustar 00root root 0000000 0000000 from scinexus.typing import register_type_namespace
# cogent3 is optional here: when it can be imported, its type-resolution
# namespace function is registered with scinexus; when it cannot, the
# ImportError is deliberately ignored and nothing is registered.
try:
from cogent3.app.typing import _get_resolution_namespace
register_type_namespace(_get_resolution_namespace)
except ImportError:
pass
cogent3-scinexus-e0aee79/docs/explanation/ 0000775 0000000 0000000 00000000000 15202532665 0020674 5 ustar 00root root 0000000 0000000 cogent3-scinexus-e0aee79/docs/explanation/app-lifecycle.md 0000664 0000000 0000000 00000002202 15202532665 0023727 0 ustar 00root root 0000000 0000000 # The app lifecycle
!!! abstract ""
The different app types, their base classes, and how `define_app` transforms a user defined class or function into a composable app.
## Types of apps
### Loaders
These are responsible for loading data and are composable. They inherit from `LoaderApp`.
```python
from scinexus import LoaderApp
```
### Writers
These are responsible for writing data and are composable. They inherit from `WriterApp`.
```python
from scinexus import WriterApp
```
### Generic
Generic apps do other operations on data and are composable. They inherit from `ComposableApp`.
```python
from scinexus import ComposableApp
```
### Non-composable
Non-composable apps cannot be combined with other apps into pipelines.
```python
from scinexus import NonComposableApp
```
!!! info
You can create your app by inheriting from one of the above base classes. Or you can use the `define_app` decorator. Using the decorator is the fastest way to turn something you already have into a composable app. Under the hood, the decorator is basically injecting the base classes described above into the inheritance of your own classes.
cogent3-scinexus-e0aee79/docs/explanation/customisation-hooks.md 0000664 0000000 0000000 00000010151 15202532665 0025236 0 ustar 00root root 0000000 0000000 # Customisation hooks
!!! abstract ""
How `scinexus` uses module-level registry functions to let downstream packages customise summary display and identifier extraction without subclassing.
## The pattern
`scinexus` uses module-level registry functions to let downstream packages customise behaviour without subclassing or monkey-patching. Each hook follows the same shape:
- **`set_*(func)`** registers a callable (or `None` to clear)
- **`get_*()`** returns the currently registered callable (or the default)
This keeps `scinexus` free of dependencies on downstream packages while still allowing them to integrate deeply.
## `set_summary_display` — transforming summary output
Data store summary properties (`.describe`, `.summary_logs`, `.summary_not_completed`, `.summary_citations`, `.validate()`) collect their data as plain Python dicts or lists of dicts. By default these are returned as-is.
A downstream package can register a display function that transforms these raw structures into richer objects. The function must accept `(data, *, name)` where `data` is the raw dict or list and `name` is the summary method name (e.g. `"describe"`).
### How `cogent3` uses this
When `cogent3.app` is imported, it registers a function that converts summaries into `cogent3.core.table.Table` objects:
```python { notest }
from scinexus.data_store import set_summary_display
from cogent3.core.table import Table
def _summary_to_table(data, *, name):
if isinstance(data, dict):
title = data.pop("title", name)
rows = [[k, v] for k, v in data.items()]
return Table(
header=["Condition", "Value"],
data=rows,
title=title,
)
if isinstance(data, list):
if not data:
return Table(header=[], data=[], title=name)
header = list(data[0].keys())
rows = [list(row.values()) for row in data]
return Table(header=header, data=rows, title=name)
return data
set_summary_display(_summary_to_table)
```
After this registration, every call to `dstore.describe` or `dstore.summary_not_completed` returns a `Table` with a rich notebook repr, rather than a plain dict.
## `set_id_from_source` — customising unique ID extraction
When `apply_to()` or `as_completed()` processes a data store, each result needs a unique identifier so the writer can store it and skip already-processed inputs on subsequent runs. By default, `scinexus` extracts this ID using `get_unique_id`, which strips format suffixes from file names:
```
"gene_001.fasta.gz" → "gene_001"
```
If your data uses a different naming convention — for example, IDs embedded in the file content or in a metadata field — you can register a custom extractor:
```python { notest }
from scinexus.data_store import set_id_from_source
def my_id_extractor(data):
"""Extract ID from a metadata dict."""
return data.info.source.split("/")[-1].split("_")[0]
set_id_from_source(my_id_extractor)
```
The registered function is consulted by:
- `WriterApp.apply_to()` — to derive output record keys
- `AppBase.as_completed()` — to identify results
- `NotCompleted` — to normalise the `source=` attribute on error records
Pass `None` to restore the default:
```python { notest }
set_id_from_source(None) # back to get_unique_id
```
Per-call overrides via the `id_from_source` keyword on `apply_to()` and `as_completed()` still take precedence over the registered function.
## The default ID pipeline: `get_data_source` → `get_unique_id`
The default extractor, `get_unique_id`, works in two steps:
1. **`get_data_source(data)`** extracts a source string from the input. This is a singledispatch function that handles:
- `str` / `Path` → the file name
- `dict` → looks for `data["info"]["source"]` or `data["source"]`
- `DataMemberABC` → the member's `unique_id`
- Any object with a `.source` attribute → recurses on that attribute
2. **`get_unique_id(name)`** strips format suffixes (e.g. `.fasta`, `.gz`) from the source string returned by `get_data_source`.
Together they turn inputs like `DataMember(unique_id="gene_001.fasta.gz")` into the key `"gene_001"`.
cogent3-scinexus-e0aee79/docs/explanation/data-store-model.md 0000664 0000000 0000000 00000004105 15202532665 0024357 0 ustar 00root root 0000000 0000000 # Data store model
!!! abstract ""
How data stores map inputs to outputs via unique IDs, how checkpointing works to skip already-processed items, the three backends (directory, zip, SQLite) and when to use each, and how citations and logs are stored alongside results.
## Data stores -- collections of data records
If you download [raw.zip](../data/raw.zip) and unzip it, you will see it contains 1,035 files ending with a `.fa` filename suffix. (It also contains a tab delimited file and a log file, which we ignore for now.) The directory `raw` is a "data store" and the `.fa` files are "members" of it. In summary, a data store is a collection of members of the same "type". This means we can apply the same application to every member.
### Types of data store
| Class Name | Supported Operations | Supported Data Types | Identifying Suffix |
|---|---|---|---|
| `DataStoreDirectory` | read, write, append | text | None |
| `ReadOnlyDataStoreZipped` | read | text | `.zip` |
| `DataStoreSqlite` | read, write, append | text or bytes | `.sqlitedb` |
!!! note
The `ReadOnlyDataStoreZipped` is just a compressed `DataStoreDirectory`.
### The structure of data stores
If a directory was not created by `scinexus` as a `DataStoreDirectory` then it has only the structure that existed previously.
If a data store was created by `scinexus`, either as a directory or as a `sqlitedb`, then it contains four types of data: completed records, *not* completed records, log files and md5 files. In a `DataStoreDirectory`, these are organised using the file system. The completed members are valid data records (as distinct from not completed) and are at the top level. The remaining types are in subdirectories.
```
demo_dstore
├── logs
├── md5
├── not_completed
└── ...
```
`logs/` stores `scitrack` log files produced by `scinexus` writer apps. `md5/` stores plain text files, each holding the md5 sum of a corresponding data member; these are used to check the integrity of the data store.
The `DataStoreSqlite` stores the same information, just in SQL tables.
cogent3-scinexus-e0aee79/docs/explanation/flow.md 0000664 0000000 0000000 00000004365 15202532665 0022175 0 ustar 00root root 0000000 0000000 # Execution flow of a composed app
!!! abstract ""
How data flows through a composed pipeline, step by step.
Consider two apps composed into a pipeline:
```python { notest }
from scinexus import define_app
@define_app(app_type="loader")
def read_json(path: str) -> dict:
import json
with open(path) as f:
return json.load(f)
@define_app
def validate(data: dict, required_field: str) -> dict:
if required_field not in data:
raise ValueError(f"missing {required_field!r} field")
return data
app = read_json() + validate(required_field="name")
```
Composing with `+` creates a new app where `validate` is the outermost app and `read_json` is stored as its `.input` attribute. When you call `app(filepath)`, execution begins at the outermost app and works inward.
## The execution flow when you call `app(filepath)`
```mermaid
flowchart TD
entry["Executes scinexus __call__(val)"] --> none{val is None?}
none -- yes --> nc_none[create and return NotCompleted ERROR, recording current app as origin]
none -- no --> nc{val is NotCompleted?}
nc -- yes --> nc_return[returns same NotCompleted]
nc -- no --> has_input{has an input app?}
has_input -- yes --> call_input["call input(val), which enters the top of this chart"]
call_input --> input_nc{result is NotCompleted?}
input_nc -- yes --> nc_input[return same NotCompleted]
input_nc -- no --> type_check
has_input -- no --> type_check{val type is valid for self}
type_check -- fail --> nc_type[create and return NotCompleted ERROR]
type_check -- pass --> main["main(val)"]
main -- exception --> nc_main[NotCompleted ERROR]
main -- success --> result["return result (which may be NotCompleted FAIL)"]
classDef errorNode fill:#fde0c8,stroke:#333
classDef successNode fill:#c8e0fd,stroke:#333
class nc_none,nc_return,nc_input,nc_type,nc_main errorNode
class result successNode
```
This is the same sequence for every composed app, regardless of pipeline length. Each app in the chain runs the same `__call__` checks, so `NotCompleted` propagation and exception handling are consistent throughout. See [Runtime type checking](type-system.md#runtime-type-checking) for details on how type validation works and how to disable it.
cogent3-scinexus-e0aee79/docs/explanation/index.md 0000664 0000000 0000000 00000001471 15202532665 0022330 0 ustar 00root root 0000000 0000000 # Explanation
Background and design rationale behind `scinexus`.
- [Why composable apps?](why-composable-apps.md) -- the problem `scinexus` solves and how it compares to alternatives
- [The app lifecycle](app-lifecycle.md) -- base classes and `define_app`
- [Type system](type-system.md) -- how composition-time type checking works
- [Execution flow](flow.md) -- step-by-step data flow through a composed pipeline
- [NotCompleted design](not-completed-design.md) -- why a sentinel pattern instead of exceptions
- [Source tracking](source-tracking.md) -- how `source_proxy` tracks data provenance through pipelines
- [Customisation hooks](customisation-hooks.md) -- `set_summary_display` and `set_id_from_source` registry functions
- [Data store model](data-store-model.md) -- unique IDs, checkpointing, and backend choices
cogent3-scinexus-e0aee79/docs/explanation/not-completed-design.md 0000664 0000000 0000000 00000007135 15202532665 0025245 0 ustar 00root root 0000000 0000000 # `NotCompleted` design
!!! abstract ""
Why `scinexus` uses a sentinel object instead of exceptions for handling failures in batch processing.
## The problem with exceptions in pipelines
When applying an algorithm to hundreds or thousands of data records, some records will inevitably fail — bad data, missing fields, violated preconditions. If failures raise exceptions, you face an unpleasant choice:
- **Let it crash.** You lose all progress and must restart from scratch.
- **Wrap everything in try/except.** Your pipeline logic becomes cluttered with error-handling boilerplate, and you must decide at every step what to catch and what to re-raise.
Neither approach scales well. You want failures to be recorded and the pipeline to continue processing the remaining records.
## The sentinel pattern
`NotCompleted` is `scinexus`'s answer: a sentinel return value that signals "this record could not be processed" without raising an exception. It carries structured information about the failure:
- **`.type`** — `FALSE` (a condition was not met) or `ERROR` (an unexpected exception occurred)
- **`.origin`** — which app produced the failure
- **`.source`** — which input data failed
- **`.message`** — a human-readable explanation
Because `NotCompleted` is a regular return value, it flows through the same code paths as successful results.
## Why it subclasses `int` and is falsy
`NotCompleted` subclasses `int` with a value of `0`, making it evaluate to `False` in boolean contexts. This means you can check for failure with a simple truthiness test:
```python { notest }
result = my_app(data)
if not result:
print(f"Failed: {result.message}")
```
Subclassing `int` rather than defining `__bool__` alone ensures consistent behaviour with Python's truth-testing protocol across all contexts (including NumPy arrays and other libraries that inspect types).
## Automatic propagation through pipelines
When apps are composed with `+`, the resulting pipeline checks each intermediate result. If any step returns a `NotCompleted`, subsequent steps are skipped and the `NotCompleted` is returned as the final result. This means:
- A single failure does not corrupt downstream steps.
- The failure's `.origin` accurately records where the problem occurred, not where it was finally caught.
- No try/except scaffolding is needed in pipeline code.
```python { linenums="1" notest }
import cogent3 as c3
aln = c3.get_dataset("primate-brca1")
select_seqs = c3.get_app("take_named_seqs", "Mouse", "Human")
min_length = c3.get_app("min_length", 300)
app = select_seqs + min_length
result = app(aln)
print(result)
# NotCompleted(type=FAIL, origin=take_named_seqs, source="brca1", message="named
# seq(s) {'Mouse'} not in ('FlyingLem', 'TreeShrew', 'Galago', 'HowlerMon',
# 'Rhesus', 'Orangutan', 'Gorilla', 'Chimpanzee', 'Human')")
```
## Recording failures in data stores
When a pipeline is run via `apply_to()` on a data store, `NotCompleted` results are automatically written to a separate area (the `not_completed/` subdirectory or SQL table). This gives you a complete audit trail: you can inspect which records failed, which app was responsible, and why — all without interrupting the processing of successful records.
See [Handle failures](../howto/handle-failures.md) for usage examples.
cogent3-scinexus-e0aee79/docs/explanation/source-tracking.md 0000664 0000000 0000000 00000005563 15202532665 0024327 0 ustar 00root root 0000000 0000000 # Source tracking
!!! abstract ""
How `source_proxy` preserves the link between input identity and output when data is transformed through a pipeline.
## The problem
When you call `apply_to()` or `as_completed()` on a data store, each member is fed through the pipeline independently. The pipeline may transform the data into something completely different — a new object with no reference back to the input that produced it. But the writer at the end of the pipeline needs to know *which input* produced *which output* so it can assign the correct unique ID in the output data store.
For example, if a loader reads `"gene_001.fa"` and the pipeline returns a translated protein sequence, the writer needs to store that result under the key `"gene_001"`. Without a mechanism to carry the input identity forward, this link is lost.
## How `source_proxy` solves it
`source_proxy` is a transparent wrapper that carries two extra pieces of state alongside the wrapped object:
- **`.source`** — the original input (or its identifier), preserved across transformations
- **`.uuid`** — a unique identifier for this proxy instance, used for hashing
When `as_completed()` or `apply_to()` processes a data store, each member is wrapped in a `source_proxy` before entering the pipeline. Because `source_proxy` delegates attribute access to the wrapped object via `__getattr__`, downstream apps see the original object and do not need to know about the proxy.
```python { notest }
from scinexus.composable import source_proxy
proxy = source_proxy(some_data)
proxy.source # the original input
proxy.uuid # unique identifier for this proxy
proxy.any_attr # delegates to some_data.any_attr
```
## How `propagate_source` preserves the link
After each pipeline step, the result needs to be re-associated with the original source. `propagate_source` handles this:
1. If the result already has a `.source` attribute (e.g. it is a `DataMember` or another object that natively tracks its origin), the proxy is **unwrapped** — the result stands on its own.
2. Otherwise, the proxy's wrapped object is **updated** to the new result via `set_obj()`, and the proxy (still carrying the original `.source`) is returned.
This means the source identity survives an arbitrary number of pipeline steps, even when intermediate apps return entirely new objects.
## Why this matters for writers
`WriterApp.apply_to()` uses the source to derive unique IDs for output records. This enables **append-only semantics**: on a subsequent run against the same data store, records that already exist in the output are skipped. The unique ID comes from the original input's identity (via `get_data_source()`), which is only available because `source_proxy` carried it through the pipeline.
Without source tracking, the writer would have no way to determine whether a result corresponds to an input that has already been processed.
cogent3-scinexus-e0aee79/docs/explanation/type-system.md 0000664 0000000 0000000 00000005534 15202532665 0023530 0 ustar 00root root 0000000 0000000 # Type system
!!! abstract ""
Why types are checked at composition time rather than call time, how `check_type_compatibility` works, handling of Union types and forward references, the role of `register_type_namespace`, and the relationship to `typeguard` for runtime checking.
## Composability rules
There are rules around app composition, starting with app types. Loaders and writers are special cases. If included, a loader must always be first, e.g.
```python { notest }
app = a_loader + a_generic
```
If included, a writer must always be last, e.g.
```python { notest }
app = a_generic + a_writer
```
Changing the order for either of the above will result in a `TypeError`.
The next constraint on app composition concerns the input and output types of the apps involved. Specifically, apps define the type of input they work on and the type of output they produce. For two apps to be composed, the output (or return) type of the app on the left (e.g. `a_loader`) must overlap with the input type of the app on the right (e.g. `a_generic`). If they do not overlap, a `TypeError` is raised.
## Built-in type protocols and aliases
SciNexus defines two type-level constructs used across the framework:
- `SerialisableType` -- a `Protocol` that any object with a `to_rich_dict()` method satisfies. Database writer apps rely on this to serialise results before storing them in a data store.
- `IdentifierType` -- a type alias (`str | Path | DataMemberABC`) representing the accepted ways to identify a member of a data store. Loader apps accept this as input.
See the [API reference](../reference/utilities.md#type-system) for details.
## Runtime type checking
In addition to checking type compatibility when apps are composed, scinexus validates input data at runtime before each call to `main()`. This uses `typeguard.check_type` to verify that the data matches the app's declared input type. On a mismatch, a `NotCompleted` is returned with a message naming the received and expected types.
!!! important "Why this matters"
Without runtime type checking, passing the wrong data type to an app still fails — but the error occurs inside the user's `main()` and can be confusing. For example, a message like `'NoneType' object has no attribute 'blah'` gives little indication that the real problem is a type mismatch from an upstream app. With runtime checking enabled, scinexus catches this before entering `main()` and reports the mismatch clearly.
### Disabling type checking with `check_data_type`
Runtime type checking is enabled by default. For mature pipelines where type correctness has been established, you can disable it to remove the small overhead of the `typeguard` check:
```python { notest }
app = read_json() + validate(required_field="name")
app.check_data_type = False
```
Setting `check_data_type` on the outermost app propagates the setting to all apps in the pipeline.
cogent3-scinexus-e0aee79/docs/explanation/why-composable-apps.md 0000664 0000000 0000000 00000004330 15202532665 0025110 0 ustar 00root root 0000000 0000000 # Why composable apps?
!!! abstract ""
The design philosophy behind `scinexus` and why composing single-purpose apps leads to more robust, reproducible scientific computation.
## Make your algorithms more robust
As the robustness of POSIX-based operating systems (think Linux, macOS, Unix) can attest, writing algorithms that stitch together multiple single-purpose applications is a *Very Good Thing*™. This is most elegantly expressed as part of the Unix design philosophy.
???+ quote
Write programs that do one thing and do it well. Write programs to work together.
— Doug McIlroy
**`scinexus` encourages this design pattern.** We leverage the Python type annotation system to govern the compatibility (composability) of different applications. This enables in-process composition of your applications with validation of the consistency of the pipeline and the consistency of the data being run through it.
We can expand on this slightly for the problem of scientific computation by considering the critical benchmark of satisfying the conditions for reproducible computation, i.e. the obligation to track all of the properties affecting the execution of your algorithm. Examples of this are the operating system, the language version, the seed used for the random number generator, etc.
**`scinexus` does this for you.** For example, we intercept all arguments (including default values) passed to the construction of apps and record them so that the app state is logged. If you, the developer, also leverage the capabilities of the [`scitrack`](https://pypi.org/project/scitrack/) logging package (which `scinexus` has as a dependency), you can capture extra information such as versions of packages that your application depends on. We provide an [example](../howto/log-and-cite.md#leveraging-scitrack-for-reproducibility) of using `scitrack` for these cases.
## Improve the accessibility of your work for end users
Apps are ready-made functions that users can run on their data without needing technical expertise. They’re easy for non-programmers to use, and can be linked together into pipelines. This lets users process one or thousands of records at once—without writing loops, conditionals, or other structural code.
cogent3-scinexus-e0aee79/docs/howto/ 0000775 0000000 0000000 00000000000 15202532665 0017512 5 ustar 00root root 0000000 0000000 cogent3-scinexus-e0aee79/docs/howto/customise-display-and-ids.md 0000664 0000000 0000000 00000014634 15202532665 0025037 0 ustar 00root root 0000000 0000000 # Customise display and IDs
!!! abstract ""
How to use `set_summary_display` to transform data store summary output into richer objects, and `set_id_from_source` to control how unique identifiers are extracted from data.
## Summary display default
By default, summary properties like `.describe` return built-in Python types such as `dict` and `list`.
```python { linenums="1" notest }
from scinexus import open_data_store
dstore = open_data_store("data/raw.zip", suffix="fa", mode="r")
print(type(dstore.describe), "", dstore.describe, sep="\n")
{'completed': 1035, 'not_completed': 0, 'logs': 0}
```
## Customising summary display
You can register a customised display function for your project. For example, `cogent3` registers a function that converts the summary data into `cogent3` `Table` objects:
```python { linenums="1" notest }
from scinexus.data_store import set_summary_display
from cogent3.core.table import Table
def summary_to_table(data, *, name):
if isinstance(data, dict):
title = data.pop("title", name)
rows = [[k, v] for k, v in data.items()]
return Table(header=["Condition", "Value"], data=rows, title=title)
if isinstance(data, list):
if not data:
return Table(header=[], data=[], title=name)
header = list(data[0].keys())
rows = [list(row.values()) for row in data]
return Table(header=header, data=rows, title=name)
return data
set_summary_display(summary_to_table)
```
This results in the following:
```python { linenums="1" notest }
describe
======================
Condition Value
----------------------
completed 1035
not_completed 0
logs 0
----------------------
```
!!! note
`cogent3` registers this transformation automatically when you `import cogent3.app`, so you get `Table` output without any setup in cogent3 projects.
### Unsetting the display function
Reset the display function and revert to the default `scinexus` behaviour as follows:
```python { notest }
set_summary_display(None)
```
## Default unique ID extraction
Being able to extract unique identifiers for individual data objects is fundamental to the ability of `scinexus` to track the provenance of individual results. Because of its roots in `cogent3`, the `scinexus` default `get_unique_id` function extracts this information from a `.source` attribute. That function strips format suffixes from file names to derive unique keys for data store records.
```python { linenums="1" notest }
from scinexus import get_id_from_source
func = get_id_from_source()
print(func("gene_001.fasta.gz"), func("sample.txt"))
# gene_001 sample
```
## Customising unique ID extraction
Register a custom extractor when your naming convention differs:
```python { linenums="1" notest }
from scinexus.data_store import set_id_from_source, get_id_from_source
def extract_ensembl_id(data):
name = str(data)
if name.startswith("ENSG"):
return name.split(".")[0]
return name
set_id_from_source(extract_ensembl_id)
func = get_id_from_source()
# Now the registered function is used as the default
print(func("ENSG00000157184.fa"), func("gene_001.fasta.gz"))
# ENSG00000157184 gene_001.fasta.gz
```
## Reset to default
```python { linenums="1" notest }
from scinexus.data_store import set_id_from_source
set_id_from_source(None)
```
## Over-riding the default per-call
You can also override per-call without affecting the global default:
```python { notest }
result = app.apply_to(dstore, id_from_source=extract_ensembl_id)
```
cogent3-scinexus-e0aee79/docs/howto/extend-type-checking.md 0000664 0000000 0000000 00000010673 15202532665 0024062 0 ustar 00root root 0000000 0000000 # Extend type checking
!!! abstract ""
How to use `register_type_namespace` to make forward references from downstream packages resolvable at composition time, enabling third-party types in app pipelines.
## The problem
When you compose apps with `+`, `scinexus` checks that the output type of the left app is compatible with the input type of the right app. Type hints are often written as forward references — strings like `"Alignment"` or `"PhyloNode"` — to avoid circular imports. At composition time `scinexus` must resolve these strings to actual classes, but it only knows about its own types by default. If your package defines custom types used in app hints, `scinexus` cannot resolve them without help.
## The solution
`register_type_namespace` lets a downstream package register a **lazy namespace provider** — a zero-argument callable that returns a `dict[str, type]`. When `scinexus` encounters an unresolved forward reference, it queries each registered provider in order until it finds a match.
```python { notest }
from scinexus.typing import register_type_namespace
register_type_namespace(my_provider)
```
The provider is called lazily each time a name needs resolving, so the package can defer heavy imports. Providers are responsible for their own caching. Registration is idempotent: re-registering the same callable is a no-op.
???- example "How `cogent3` does it"
```python { linenums="1" notest }
from scinexus.typing import register_type_namespace
_resolution_ns = None
def _get_resolution_namespace():
global _resolution_ns
if _resolution_ns is not None:
return _resolution_ns
from cogent3.core.alignment import Alignment, SequenceCollection # (1)!
from cogent3.core.tree import PhyloNode
# ... other imports ...
_resolution_ns = {
"Alignment": Alignment,
"SequenceCollection": SequenceCollection,
"PhyloNode": PhyloNode, # (3)!
# ... other types ...
}
return _resolution_ns
register_type_namespace(_get_resolution_namespace) # (2)!
```
1. `cogent3` defines many types (`Alignment`, `PhyloNode`, `Table`, etc.) that are used as forward references in app type hints.
2. In `cogent3/app/typing.py`, a resolution namespace is built lazily and registered with `scinexus`.
3. With this registration, any `scinexus` app that uses `"PhyloNode"` as a type hint will resolve correctly at composition time without the user importing `PhyloNode` explicitly.
## Registering your own package's types
Follow the same pattern: define a lazy provider function that imports and caches your types, then register it at module level.
```python { notest }
from scinexus.typing import register_type_namespace
_ns = None
def _get_my_types():
global _ns
if _ns is not None:
return _ns
from my_package.core import MyDataType, MyResultType
_ns = {
"MyDataType": MyDataType,
"MyResultType": MyResultType,
}
return _ns
register_type_namespace(_get_my_types)
```
Place this in a module that is imported early (e.g. your package's `typing.py` or `__init__.py`). Once registered, apps using `"MyDataType"` as a forward reference will resolve correctly when composed with other apps.
cogent3-scinexus-e0aee79/docs/howto/handle-failures.md 0000664 0000000 0000000 00000015456 15202532665 0023112 0 ustar 00root root 0000000 0000000 # Handle failures
!!! abstract ""
How to create `NotCompleted` values, check their truthiness, inspect their attributes, and control propagation with `skip_not_completed=False`.
## `NotCompleted` FALSE type
A FALSE type is returned when a condition is not met. For example, below we create an app that selects 2 specific sequences from an alignment. Applying this to a data set where a "Mouse" sequence does not exist produces a FALSE type.
```python { linenums="1" notest }
import cogent3 as c3
aln = c3.get_dataset("primate-brca1")
select_seqs = c3.get_app("take_named_seqs", "Mouse", "Human")
result = select_seqs(aln)
assert result == False
print(result)
# NotCompleted(type=FAIL, origin=take_named_seqs, source="brca1", message="named
# seq(s) {'Mouse'} not in ('FlyingLem', 'TreeShrew', 'Galago', 'HowlerMon',
# 'Rhesus', 'Orangutan', 'Gorilla', 'Chimpanzee', 'Human')")
```
## Inspecting `NotCompleted` attributes
The `NotCompleted` instance has attributes identifying what data failed:
```python { notest }
result.source
```
Where the failure occurred:
```python { notest }
result.origin
```
And the reason for the failure:
```python { notest }
result.message
```
The `.type` attribute is the `NotCompletedType` enum value (e.g. `NotCompletedType.FALSE`, `NotCompletedType.ERROR`, or `NotCompletedType.BUG`).
## `NotCompleted` ERROR type
An ERROR type is returned if an unexpected condition occurs, such as an exception raised during execution. Here we illustrate this by trying to open a file with an incorrect path.
???- example "Example"
```python { linenums="1" notest }
import cogent3 as c3
reader = c3.get_app("load_aligned", moltype="dna")
result = reader("primate_brca1.fasta")
print(result)
# NotCompleted(type=ERROR, origin=load_aligned, source="primate_brca1",
# message="Traceback (most recent call last): File
# "/Users/gavin/repos/SciNexus/src/scinexus/composable.py", line 545, in __call__
# result = self.main(val, *args, **kwargs) [...]
```
## Composed functions propagate `NotCompleted` results
If you have a composed function with multiple steps and a failure occurs, the resulting `NotCompleted` is returned without any of the subsequent steps being applied. For example, we make a composed app from both of the above apps:
???- example "Example"
```python { linenums="1" notest }
import cogent3 as c3
reader = c3.get_app("load_aligned", moltype="dna")
select_seqs = c3.get_app("take_named_seqs", "Mouse", "Human")
app = reader + select_seqs
result = app("data/primate_brca1.fasta")
print(result)
# NotCompleted(type=FAIL, origin=take_named_seqs, source="primate_brca1",
# message="named seq(s) {'Mouse'} not in ('FlyingLem', 'TreeShrew', 'Galago',
# 'HowlerMon', 'Rhesus', 'Orangutan', 'Gorilla', 'Chimpanzee', 'Human')")
```
In this case `reader` ran successfully and the failure originated in `select_seqs` (an instance of `take_named_seqs`). The resulting `NotCompleted` was returned as the output of the whole pipeline. If the file path is incorrect instead, the failure occurs at the first step:
???- example "Example"
```python { linenums="1" notest }
import cogent3 as c3
reader = c3.get_app("load_aligned", moltype="dna")
select_seqs = c3.get_app("take_named_seqs", "Mouse", "Human")
app = reader + select_seqs
result = app("primate_brca1.fasta")
print(result)
# NotCompleted(type=ERROR, origin=load_aligned, source="primate_brca1", [...]
```
Here the failure originated in `reader` (bad path), and `select_seqs` was never called.
## Creating `NotCompleted` in your own apps
You can return a `NotCompleted` from your own app to signal that a particular input cannot be processed:
```python { linenums="1" notest }
from scinexus import define_app, NotCompleted, NotCompletedType
@define_app
def require_min_length(val: str, min_length: int = 10) -> str:
if len(val) < min_length:
return NotCompleted(
NotCompletedType.FALSE,
"require_min_length",
val,
message=f"too short: {len(val)} < {min_length}",
)
return val
```
## Receiving `NotCompleted` with `skip_not_completed=False`
By default, apps skip `NotCompleted` inputs — they propagate without calling `main()`. If your app needs to see `NotCompleted` values (e.g. a writer that records failures), set `skip_not_completed=False`:
```python { linenums="1" notest }
from scinexus import define_app, NotCompleted
@define_app(skip_not_completed=False)
def log_failures(val: str) -> str:
if isinstance(val, NotCompleted):
print(f"Failure: {val.message}")
return val
return val
```
cogent3-scinexus-e0aee79/docs/howto/index.md 0000664 0000000 0000000 00000002071 15202532665 0021143 0 ustar 00root root 0000000 0000000 # How-to guides
Task-oriented recipes for common `scinexus` operations, each self-contained.
- [Write a function app](write-a-function-app.md) -- use `@define_app` on a function
- [Write a class app](write-a-class-app.md) -- use `@define_app` on a class with `main()`
- [Handle failures](handle-failures.md) -- create and inspect `NotCompleted` values
- [Use data stores](use-data-stores.md) -- open, read, and write data stores
- [Read and write files](read-and-write-files.md) -- IO with compression, atomic writes, streaming
- [Run in parallel](run-in-parallel.md) -- parallel execution with pluggable backends
- [Track progress](track-progress.md) -- progress bars with `tqdm` or `rich`
- [Log and cite](log-and-cite.md) -- `scitrack` logging and citation tracking
- [Customise display and IDs](customise-display-and-ids.md) -- `set_summary_display` and `set_id_from_source`
- [Extend type checking](extend-type-checking.md) -- register type namespaces for downstream packages
- [Migrate from cogent3](migrate-from-cogent3.md) -- update code from `cogent3.app` to `scinexus`
cogent3-scinexus-e0aee79/docs/howto/log-and-cite.md 0000664 0000000 0000000 00000017740 15202532665 0022310 0 ustar 00root root 0000000 0000000 # Log and cite
!!! abstract ""
How to use `scitrack` logging in apps, control logging in `apply_to`, and access citation records from composed pipelines.
## Leveraging `scitrack` for reproducibility
We reproduce here one of the examples from [scitrack](https://github.com/HuttleyLab/scitrack).
???- example "Using `scitrack` in a `click` app"
```python linenums="1"
import click
from scitrack import CachingLogger
LOGGER = CachingLogger()
@click.command()
@click.option("-i", "--infile", type=click.Path(exists=True))
@click.option("-t", "--test", is_flag=True, help="Run test.")
def main(infile, test):
# capture the local variables, at this point just provided arguments
LOGGER.log_args() # (1)!
LOGGER.log_versions("numpy") # (2)!
LOGGER.input_file(infile) # (3)!
LOGGER.log_file_path = "some_path.log" # (4)!
if __name__ == "__main__":
main()
```
1. :man_raising_hand: A single statement and you have captured all the input arguments and their values, including defaults!
2. This captures the version numbers of the packages our application depends on.
3. This logs the path to `infile` and its md5sum.
4. Until you assign the path where you want the log file written, the logged content is cached in memory.
## Controlling logging in `apply_to`
By default, `apply_to` creates a `CachingLogger` that records the composable function, package versions, output paths, MD5 checksums of every result, and total elapsed time. The log is then written into the output data store. This is the recommended setting for production analyses because it gives you a complete, self-contained record of what ran and what it produced.
```python { notest }
result = process.apply_to(dstore) # logger=True by default
```
You can also pass your own `CachingLogger` instance if you want to configure it beforehand or reuse one across multiple calls.
```python { notest }
from scitrack import CachingLogger
LOGGER = CachingLogger()
LOGGER.log_args()
result = process.apply_to(dstore, logger=LOGGER)
```
### Disabling logging
Set `logger=False` to skip logging entirely.
```python { notest }
result = process.apply_to(dstore, logger=False)
```
This is useful when:
- **Your project is small** and a full provenance log is unnecessary.
- **Logging is handled externally**, for example by a workflow manager or your own `CachingLogger` that wraps several `apply_to` calls.
- **You want to avoid the overhead** of computing an MD5 checksum for every result object, which can be noticeable for large or numerous outputs.
## Make it easy for your work to be cited
Correctly attributing the authors of algorithms and software is a requirement of good scientific practice. `scinexus` makes this easy by letting app authors declare citations that are automatically tracked through composed pipelines.
Use the `cite` parameter of `define_app` (or the base classes) to attach a citation. The `citeable` library provides several classes for this purpose.
???- example "Adding a citation to your app"
```python { linenums="1" notest }
from citeable import Software
from scinexus import define_app
from cogent3.app.typing import AlignedSeqsType
from cogent3 import get_app
my_cite = Software(
author=["Doe, J", "Smith, A"],
title="My Sequence Filter",
year=2025,
url="https://example.com/my-filter",
version="0.1.0",
)
@define_app(cite=my_cite) # (1)!
def strict_filter(val: AlignedSeqsType) -> AlignedSeqsType:
"""Remove sequences shorter than the alignment."""
return val.omit_bad_seqs()
app = strict_filter()
loader = get_app("load_aligned", moltype="dna", format_name="fasta")
pipeline = loader + strict_filter()
print(pipeline.citations) # (2)!
print(f"\n{pipeline.bib}") # (3)!
# (Software( author=['Doe, J', 'Smith, A'], title='My Sequence Filter',
# year=2025, version='0.1.0', url='https://example.com/my-filter', [...]
```
1. Use the `cite` parameter of `define_app` to attach a citation
2. The `.citations` property returns citations as a tuple. When apps are composed into a pipeline, `.citations` collects unique citations from all apps in the chain.
3. The `.bib` property gives the BibTeX string.
## Extracting citations from a data store
When a composed pipeline is run via `apply_to()`, citations are automatically saved in the output data store.
???- example "Citations in data stores"
```python { linenums="1" notest }
from citeable import Software
from scinexus import define_app, open_data_store
from cogent3.app.typing import AlignedSeqsType
from cogent3 import get_app
my_cite = Software(
author=["Doe, J"],
title="My Sequence Filter",
year=2025,
)
@define_app(cite=my_cite)
def strict_filter(val: AlignedSeqsType) -> AlignedSeqsType:
return val.omit_bad_seqs()
in_dstore = open_data_store("data/raw.zip", suffix="fa", limit=5)
out_dstore = open_data_store("cited_results", suffix="fa", mode="w")
loader = get_app("load_aligned", moltype="dna", format_name="fasta")
writer = get_app("write_seqs", data_store=out_dstore, format_name="fasta")
process = loader + strict_filter() + writer
result = process.apply_to(in_dstore)
result.summary_citations # (1)!
result.write_bib("my_analysis.bib") # (2)!
```
1. Because we are using `cogent3`, the property returns a `cogent3` `Table` of all citations stored in the data store.
2. You can export to a BibTeX file.
!!! note
`ReadOnlyDataStoreZipped` supports reading stored citations but not writing them.
cogent3-scinexus-e0aee79/docs/howto/migrate-from-cogent3.md 0000664 0000000 0000000 00000005616 15202532665 0023775 0 ustar 00root root 0000000 0000000 # Migrate from cogent3
!!! abstract ""
A guide for users moving from `cogent3.app` to `scinexus`.
## Update imports
Replace `cogent3.app` imports with their `scinexus` equivalents:
```python { notest }
# before
from cogent3.app.composable import define_app, NotCompleted
# after
from scinexus import define_app, NotCompleted
```
## Summary methods return plain Python objects
Data store summary methods (`describe`, `summary_logs`, `summary_not_completed`, `summary_citations`, `validate`) now return `list[dict]` or `dict` instead of cogent3 `Table` objects.
To restore `Table`-based display, register a converter with `set_summary_display()`:
```python { notest }
from cogent3.core.table import Table
from scinexus.data_store import set_summary_display
def _summary_to_table(data, *, name):
if isinstance(data, dict):
rows = [[k, v] for k, v in data.items()]
return Table(header=["Condition", "Value"], data=rows, title=name)
if isinstance(data, list) and data:
header = list(data[0].keys())
rows = [list(row.values()) for row in data]
return Table(header=header, data=rows, title=name)
return data
set_summary_display(_summary_to_table)
```
## NotCompletedType is now an enum
Failure types are categorised using the `NotCompletedType` enum rather than bare strings:
```python { notest }
from scinexus.composable import NotCompletedType
# before
nc.type == "ERROR"
# after
nc.type == NotCompletedType.ERROR
```
The three values are `ERROR`, `FAIL`, and `BUG`.
## App composition uses shallow copies
Composing apps with `+` now creates a shallow copy of the right-hand operand. Composed pipelines no longer share mutable state, so `ComposableApp.disconnect()` is no longer needed and is deprecated.
## New features
These capabilities are new in `scinexus` and were not available in `cogent3.app`:
- **`check_data_type` property** -- toggle runtime input type checking on or off. See [Runtime type checking](../explanation/type-system.md#runtime-type-checking).
- **Better IDE integration through static typing support** -- `AppBase[T, R]`, `ComposableApp[T, R]`, and `WriterApp[T, R]` can be inherited from directly as an alternative to `@define_app`.
- **Pluggable parallel backends** -- choose between stdlib multiprocessing, loky, or MPI backends. See [Run in parallel](run-in-parallel.md).
- **Pluggable progress bars** -- use `tqdm` or `rich` for progress display. See [Track progress](track-progress.md).
- **Custom identifier extraction** -- register a custom function for extracting storage identifiers from data via `set_id_from_source()`. See [Customise display and IDs](customise-display-and-ids.md).
- **Logging can be disabled** -- pass `logger=False` to `apply_to()` to skip log file creation.
## Full changelog
See the [changelog](https://github.com/cogent3/scinexus/blob/main/changelog.md) for a complete list of changes from the cogent3 app infrastructure.
cogent3-scinexus-e0aee79/docs/howto/read-and-write-files.md 0000664 0000000 0000000 00000010416 15202532665 0023741 0 ustar 00root root 0000000 0000000 # Read and write files
!!! abstract ""
How to use `open_()` for reading and writing files with automatic compression detection (gzip, bzip2, lzma, zip), `atomic_write` for safe file writes that clean up on failure, `iter_splitlines` and `iter_line_blocks` for streaming large files, and `is_url`/`open_url` for working with URLs.
## Writing a compressed file
`open_()` detects the compression format from the file suffix and handles it automatically. Writing a gzip-compressed text file is identical to writing a plain text file — just use a `.gz` suffix.
```python { linenums="1" notest }
from scinexus import open_
with open_("data/sample.txt.gz", "wt") as f:
f.write("Hello, compressed world!\n")
f.write("Line two of the file.\n")
```
## Reading a compressed file
Reading works the same way — `open_()` detects the `.gz` suffix and decompresses transparently.
```python { linenums="1" notest }
from scinexus import open_
with open_("data/sample.txt.gz") as f:
print(f.read())
# Hello, compressed world! Line two of the file.
```
Supported compression formats are gzip (`.gz`), bzip2 (`.bz2`), lzma (`.xz`, `.lzma`), and zip (`.zip`).
## Reading a URL
`open_()` also handles URLs. Use `is_url()` to check whether a path is a URL before opening it.
???- example "Checking and reading a URL"
```python linenums="1"
from scinexus.io_util import is_url
from scinexus import open_
url = "https://github.com/user-attachments/files/26728407/raw.zip"
print(is_url(url)) # (1)!
with open_(url, "rb") as f: # (2)!
header = f.read(20)
print(header)
```
1. `is_url()` returns `True` for `http`, `https`, and `file` scheme URLs.
2. `open_()` detects the URL and delegates to `open_url()`. Only read mode is supported for URLs.
## Efficiently reading large files
Reading an entire large file into memory or iterating line by line with Python's built-in `readline()` can be inefficient. The built-in approach makes a system call for every line, which becomes a bottleneck for files with millions of lines. `scinexus` provides two functions that read data in large chunks and then split into lines, greatly reducing I/O overhead.
### `iter_splitlines`
`iter_splitlines(path, chunk_size=1_000_000)` reads a file in chunks (default 1 MB) and yields individual lines. It correctly handles lines that span chunk boundaries.
```python { notest }
from scinexus.io_util import iter_splitlines
for line in iter_splitlines("large_file.txt"):
process(line)
```
### `iter_line_blocks`
`iter_line_blocks(path, num_lines=1000, chunk_size=5_000_000)` builds on `iter_splitlines` — it accumulates lines into lists of `num_lines` and yields each list. This is useful when downstream processing works on batches (e.g. FASTA records where each record spans a fixed number of lines).
```python { notest }
from scinexus.io_util import iter_line_blocks
for block in iter_line_blocks("large_file.txt", num_lines=1000):
process_batch(block) # block is a list of up to 1000 strings
```
Use `iter_splitlines` when you need one line at a time. Use `iter_line_blocks` when your processing naturally operates on batches of lines.
cogent3-scinexus-e0aee79/docs/howto/run-in-parallel.md 0000664 0000000 0000000 00000016236 15202532665 0023046 0 ustar 00root root 0000000 0000000 # Run in parallel
!!! abstract ""
How to choose your preferred parallelisation backend, how to parallelise any function using `parallel.map`, `parallel.imap`, and `parallel.as_completed` as standalone utilities, and how to enable parallel execution in app pipelines with `parallel=True` and `par_kw`.
## Data level parallelism
`scinexus` supports parallel computation for the common case where the same calculation needs to be applied to many independent data items. A master process splits the work among available CPU cores, each worker processes its share, and results are collected.
!!! warning
Parallelism is not always faster. You should see a performance gain when the computation time per task significantly exceeds the overhead of distributing work. If individual tasks are very fast, the overhead of inter-process communication can dominate.
If individual output files are small, storing results in a single file (e.g. a `.sqlitedb` database) is more efficient than writing many small files.
## Choosing a parallel backend
`scinexus` supports three parallel backends. The default uses only the Python standard library and requires no extra installs.
| Backend | Install | Best for |
|---|---|---|
| `"multiprocess"` | included | scripts, CI, environments where you control dependencies |
| `"loky"` | `pip install "scinexus[loky]"` | Jupyter notebooks, interactive sessions, long-running pools |
| `"mpi"` | `pip install "scinexus[mpi]"` | HPC clusters with multiple nodes |
Set the backend once, typically at the top of your script or notebook:
```python { notest }
import scinexus
scinexus.set_parallel_backend("loky")
```
!!! note
The `"loky"` backend uses [loky](https://loky.readthedocs.io/) which provides reusable process pools and robust pickling via `cloudpickle`. This makes it the recommended choice for Jupyter notebooks, where the stdlib `ProcessPoolExecutor` can fail to serialise closures and lambda functions.
### Getting a specific backend without changing the default
If your code requires a particular backend, pass the ``backend`` argument to ``get_parallel_backend``. This returns an instance of the requested backend without changing the global default, so other packages that depend on the current setting are unaffected:
```python { notest }
from scinexus import get_parallel_backend
backend = get_parallel_backend(backend="loky")
```
## Parallel computation on a single computer
### Using `app.apply_to()`
If you have a composed app **with** a writer, use `apply_to()` with the `parallel` and `par_kw` keyword arguments:
```python { notest }
result = app.apply_to(dstore, parallel=True, par_kw=dict(max_workers=4))
```
### Using `app.as_completed()`
If you have a composed app **without** a writer, use `as_completed()`. This returns a generator, so wrap it with `list()` or iterate over it:
```python { notest }
results = list(app.as_completed(dstore, parallel=True, par_kw=dict(max_workers=4)))
```
### Using `scinexus.parallel` directly
For parallelising any function (not just apps), use the functions in `scinexus.parallel`.
#### `parallel.as_completed` -- results in completion order
Returns results as they finish. The order may differ from the input order. It also tends to balance work better across compute nodes than `imap` or `map`.
```python { notest }
from scinexus import parallel
result = list(parallel.as_completed(is_prime, PRIMES, max_workers=4))
```
The first argument is the function to call, the second is the iterable of inputs. Each input element is passed as a single argument to the function. The data is broken into chunks across workers automatically.
!!! note
If you don't specify `max_workers`, all available CPUs are used.
#### `parallel.imap` -- preserving input order (generator)
Returns results in the same order as the input, yielding one at a time:
```python { notest }
from scinexus import parallel
for result in parallel.imap(process_item, items, max_workers=4):
handle(result)
```
#### `parallel.map` -- preserving input order (list)
Same as `imap` but returns a list:
```python { notest }
from scinexus import parallel
results = parallel.map(process_item, items, max_workers=4)
```
### Complete example
```python { notest }
import math
from scinexus import parallel
def is_prime(n):
if n % 2 == 0:
return False
sqrt_n = int(math.floor(math.sqrt(n)))
for i in range(3, sqrt_n + 1, 2):
if n % i == 0:
return False
return True
PRIMES = [
112272535095293,
112582705942171,
115280095190773,
115797848077099,
117450548693743,
993960000099397,
]
if __name__ == "__main__":
results = parallel.map(is_prime, PRIMES, max_workers=4)
for number, prime in zip(PRIMES, results):
print(f"{number} is prime: {prime}")
```
## Parallel computation on multiple computers (MPI)
On systems with multiple nodes (e.g. an HPC cluster), use MPI via the [mpi4py](https://mpi4py.readthedocs.io/) library. You need to install an MPI implementation (e.g. [OpenMPI](https://www.open-mpi.org/)) and the `mpi4py` Python package:
```bash
pip install mpi4py
```
Alternatively, install `scinexus` [with the `mpi` extra](../install.md#optional-extras).
Set the backend to MPI:
```python { notest }
import scinexus
scinexus.set_parallel_backend("mpi")
```
Or pass `use_mpi=True` to any of the parallel functions:
```python { notest }
from scinexus import parallel
results = parallel.map(is_prime, PRIMES, use_mpi=True, max_workers=PBS_NCPUS)
```
Or with app pipelines:
```python { notest }
result = app.apply_to(dstore, parallel=True, par_kw=dict(use_mpi=True, max_workers=4))
```
To run an MPI script, invoke it via `mpiexec`:
```bash
mpiexec -n $PBS_NCPUS python3 -m mpi4py.futures my_script.py
```
!!! note
You can use MPI for parallel execution on a single computer too. This can be useful for testing your code locally before migrating to a larger system.
### MPI script structure
MPI scripts must guard the main logic behind `if __name__ == "__main__":`:
```python { notest }
import os
from scinexus import parallel
PBS_NCPUS = int(os.environ["PBS_NCPUS"])
def process(data): ...
if __name__ == "__main__":
results = parallel.map(process, my_data, use_mpi=True, max_workers=PBS_NCPUS)
```
## Custom backends
You can integrate any parallel engine by subclassing `Parallel`:
```python { notest }
from scinexus.parallel import Parallel, set_parallel_backend
class DaskBackend(Parallel):
def __init__(self, client):
self._client = client
def imap(self, f, s, max_workers=None, **kwargs):
futures = self._client.map(f, list(s))
yield from self._client.gather(futures)
def as_completed(self, f, s, max_workers=None, **kwargs):
from dask.distributed import as_completed
futures = self._client.map(f, list(s))
for future in as_completed(futures):
yield future.result()
def is_master_process(self):
from dask.distributed import get_worker
try:
get_worker()
return False
except ValueError:
return True
def get_rank(self):
return 0
def get_size(self):
return sum(self._client.nthreads().values())
set_parallel_backend(DaskBackend(client))
```
cogent3-scinexus-e0aee79/docs/howto/track-progress.md 0000664 0000000 0000000 00000014462 15202532665 0023011 0 ustar 00root root 0000000 0000000 # Track progress
!!! abstract ""
How to choose your preferred progress bar backend and customise progress bars.
`scinexus` defaults to using [tqdm](https://pypi.org/project/tqdm/) for progress bars. These behave well across terminal and notebook environments. We also support [rich](https://pypi.org/project/rich/) progress bars. Both backends are used through a single API.
## Choosing the progress bar backend
Use `set_progress_backend` to switch between backends. The default is `tqdm`.
```python { notest }
import scinexus
scinexus.set_progress_backend("rich") # switch to rich
scinexus.set_progress_backend("tqdm") # switch back to tqdm
scinexus.set_progress_backend(None) # reset to default (tqdm)
```
## Getting a progress bar
Use `get_progress` to obtain a `Progress` instance. Passing `show_progress=True` returns the current default backend.
```python { notest }
import scinexus
pbar = scinexus.get_progress(show_progress=True)
for item in pbar(range(100), msg="Processing"):
pass # your work here
```
You can pass keyword arguments to configure the default backend:
```python { notest }
import scinexus
pbar = scinexus.get_progress(show_progress=True, colour="blue", leave=True)
```
You can also pass a `Progress` instance directly:
```python { notest }
from scinexus.progress import RichProgress
pbar = scinexus.get_progress(show_progress=RichProgress())
```
!!! note
If you call `get_progress(show_progress=False)`, it returns `NoProgress`, which silently passes through the iterable.
## Nesting progress bars
=== "Using `tqdm` (default)"
Create nested progress bars using `child()`. Each bar can have its own description via the `msg` keyword. Create the child once before the loop — it automatically resets to zero on each subsequent call.
```python { notest }
import scinexus
pbar = scinexus.get_progress(show_progress=True)
child = pbar.child()
for batch in pbar(range(3), msg="Outer loop"):
for item in child(range(10), msg=f"Inner batch {batch}"):
pass # your work here
```
=== "Using `rich`"
The same nesting pattern works with the `rich` backend:
```python { notest }
import scinexus
scinexus.set_progress_backend("rich")
pbar = scinexus.get_progress(show_progress=True)
child = pbar.child()
for batch in pbar(range(3), msg="Outer loop"):
for item in child(range(10), msg=f"Inner batch {batch}"):
pass # your work here
```
`rich` children share the same `rich.progress.Progress` display instance, so all bars render together in a single live display.
The outer bar tracks the top-level iteration. Each call to `child()` creates a new `Progress` at the next cursor position, so inner bars appear below the outer one. The child bar is reused across iterations — on the second and subsequent calls, the bar resets to zero instead of creating a new one.
#### Push-based sub-contexts
When you need to report fractional progress rather than iterating, use `context()`:
```python { notest }
import scinexus
pbar = scinexus.get_progress(show_progress=True)
child = pbar.child()
for batch in pbar(range(3), msg="Processing"):
with child.context(msg=f"Batch {batch}") as ctx:
for i in range(100):
ctx.update(progress=i / 100, msg=f"Step {i}")
```
The context maps progress values from `[0.0, 1.0]` to the configured `[start, end]` range and is cleaned up automatically when the `with` block exits.
## Cleaning up
Both `Progress` and `ProgressContext` support the context manager protocol. Using a progress bar as a context manager ensures that `close()` is called automatically, which finalises the display and moves the cursor past the bars. Without cleanup, leftover bars can leave the terminal cursor in the wrong position.
=== "Using `tqdm` (default)"
```python
import scinexus
with scinexus.get_progress(show_progress=True) as pbar: # (1)!
child = pbar.child()
for batch in pbar(range(3), msg="Outer"):
for item in child(range(10), msg=f"Batch {batch}"):
pass
```
1. `close()` is called automatically and the cursor position is restored.
=== "Using `rich`"
```python
import scinexus
scinexus.set_progress_backend("rich")
with scinexus.get_progress(show_progress=True) as pbar: # (1)!
child = pbar.child()
for batch in pbar(range(3), msg="Outer"):
for item in child(range(10), msg=f"Batch {batch}"):
pass
```
1. `close()` is called automatically and the cursor position is restored.
???- tip "No context manager? No problem!"
```python
import scinexus
pbar = scinexus.get_progress(show_progress=True)
child = pbar.child()
for batch in pbar(range(3), msg="Outer"):
for item in child(range(10), msg=f"Batch {batch}"):
pass
pbar.close() # (1)!
```
1. Call `close()` explicitly when you are done
!!! note
Calling `close()` on a `Progress` instance also closes all of its children. For standalone `ProgressContext` objects (from `context()`), use the `with` statement as shown in the [push-based sub-contexts](#push-based-sub-contexts) section.
## Customising appearance
### Persisting bars after completion
By default, `tqdm` keeps the outermost bar visible after completion but clears nested bars. `rich` removes all bars. Use `leave` to control this:
```python { notest }
from scinexus.progress import TqdmProgress, RichProgress
# Keep all tqdm bars visible after completion
pbar = TqdmProgress(leave=True)
# Keep all rich bars visible after completion
pbar = RichProgress(leave=True)
```
You can also set `leave` independently on child bars:
```python { notest }
from scinexus.progress import TqdmProgress
pbar = TqdmProgress(leave=True)
child = pbar.child(leave=False) # child bars disappear, outer persists
for batch in pbar(range(3), msg="Outer"):
for item in child(range(10), msg=f"Batch {batch}"):
pass
```
### Setting bar colour
Both backends support a `colour` parameter. For `tqdm`, this sets the bar colour directly. For `rich`, it styles the bar column when the display is auto-created.
```python { notest }
from scinexus.progress import TqdmProgress, RichProgress
pbar = TqdmProgress(colour="green")
pbar = RichProgress(colour="cyan")
```
Colour is inherited by child bars.
cogent3-scinexus-e0aee79/docs/howto/use-data-stores.md 0000664 0000000 0000000 00000021675 15202532665 0023067 0 ustar 00root root 0000000 0000000 # Use data stores
!!! abstract ""
How to use `open_data_store` in read, write, and append modes with directory, zip, and SQLite backends, iterate over members, and inspect `.completed`, `.not_completed`, and `.summary_`.
## How do I use a data store?
A data store is just a "container". To open a data store, use the `open_data_store()` function. To load the data for a member of a data store, you need a loader app that is appropriate for that member's data.
## Supported operations on a data store
All data store classes can be iterated over, indexed, and checked for membership. These operations return a `DataMember` object. In addition to providing access to members, the data store classes have convenience methods for describing their contents and providing summaries of log files that are included and of the `NotCompleted` members (see not completed).
## Opening a data store
Use the `open_data_store()` function, illustrated below. Use the `mode` argument to identify whether to open as read only (`mode="r"`), write (`mode="w"`) or append (`mode="a"`).
### Opening a read only data store
We open the zipped directory described above, defining the filenames ending in ``.fa`` as the data store members. All files within the directory become members of the data store (unless we use the ``limit`` argument).
```python { linenums="1" notest }
from scinexus import open_data_store
dstore = open_data_store("data/raw.zip", suffix="fa", mode="r") # (1)!
print(dstore)
dstore.describe # (2)!
m = dstore[0] # (3)!
for m in dstore[:5]: # (4)!
print(m)
m.read()[:20] # (5)!
# 1035x member
# ReadOnlyDataStoreZipped(source='/Users/gavin/repos/SciNexus/docs/data/raw.zip',
# members=[DataMember(data_store=/Users/gavin/repos/SciNexus/docs/data/raw.zip,
# unique_id=ENSG00000157184.fa),
# DataMember(data_store=/Users/gavin/repos/SciNexus/docs/data/raw.zip,
# unique_id=ENSG00000131791.fa)]...) ENSG00000157184.fa ENSG00000131791.fa
# ENSG00000127054.fa ENSG00000067704.fa ENSG00000182004.fa
```
1. Open a data store.
2. The `.describe` property summarises the contents.
3. You can index like any Python sequence.
4. Or loop over members.
5. And read data from a member.
!!! note
For a `DataStoreSqlite` member, the default data storage format is bytes. So reading the content of an individual record is best done using the `load_db` app.
### Making a writeable data store
The creation of a writeable data store is specified with `mode="w"`, or (to append) `mode="a"`. In the former case, any existing records are overwritten. In the latter case, existing records are ignored.
## `DataStoreSqlite` stores serialised data
When you specify a Sqlitedb data store as your output (by using `open_data_store()`) you write multiple records into a single file making distribution easier.
One important issue to note is the process which creates a Sqlitedb "locks" the file. If that process exits unnaturally (e.g. the run that was producing it was interrupted) then the file may remain in a locked state. If the db is in this state, `scinexus` will not modify it unless you explicitly unlock it.
This is represented in the display as shown below.
```python { linenums="1" notest }
{'completed': 175, 'not_completed': 0, 'logs': 1, 'title': 'Unlocked db store.'}
```
To unlock, you execute the following:
```python { notest }
dstore.unlock(force=True)
```
## Interrogating run logs
If you use the `apply_to()` method, a `scitrack` logfile will be stored in the data store. This includes useful information regarding the run conditions that produced the contents of the data store.
```python { linenums="1" notest }
# [{'time': '2019-07-24 14:42:56', 'name': 'logs/load_unaligned-progressive_align-
# write_db-pid8650.log', 'python_version': '3.7.3', 'who': 'gavin', 'command':
# '/Users/gavin/miniconda3/envs/c3dev/lib/python3.7/site-
# packages/ipykernel_launcher.py -f [...]
```
Log files can be accessed via a special attribute.
```python { linenums="1" notest }
# [DataMember(data_store=/Users/gavin/repos/SciNexus/docs/data/demo-
# locked.sqlitedb, unique_id=logs/load_unaligned-progressive_align-write_db-
# pid8650.log)]
```
Each element in that list is a `DataMember` which you can use to get the data contents. The following
```python { notest }
print(dstore.logs[0].read()[:225])
```
Produces
```python { linenums="1" notest }
# 2019-07-24 14:42:56 Eratosthenes.local:8650 INFO system_details :
# system=Darwin Kernel Version 18.6.0: Thu Apr 25 23:16:27 PDT 2019;
# root:xnu-4903.261.4~2/RELEASE_X86_64 2019-07-24 14:42:56
# Eratosthenes.local:8650 INFO python
```
## Citations – giving credit to package developers
When apps declare citations, those citations are automatically saved alongside your results when you use `apply_to()`.
```python { linenums="1" notest }
import pathlib
import shutil
from citeable import Software
from scinexus import define_app, open_data_store
from cogent3 import get_app
from cogent3.app.typing import AlignedSeqsType
my_cite = Software(
author=["Doe, J"],
title="My Sequence Filter",
year=2025,
)
@define_app(cite=my_cite)
def strict_filter(val: AlignedSeqsType) -> AlignedSeqsType:
return val.omit_bad_seqs()
in_dstore = open_data_store("data/raw.zip", suffix="fa", limit=5)
out_dstore = open_data_store("cited_results", suffix="fa", mode="w")
loader = get_app("load_aligned", moltype="dna", format_name="fasta")
writer = get_app("write_seqs", data_store=out_dstore, format_name="fasta")
process = loader + strict_filter() + writer
result = process.apply_to(in_dstore)
result.write_bib("my_analysis.bib")
print(pathlib.Path("my_analysis.bib").read_text())
# @software{cogent3, author = {Huttley, Gavin and Caley, Katherine and
# Fotovat, Nabi and Ma, Stephen Ka-Wah and Koh, Moses and Morris, Richard and
# McArthur, Robert and McDonald, Daniel and Jaya, Fred and Maxwell, Peter and
# Martini, James and La, Thomas and Lang, Yapeng}, title = {{cogent3}: [...]
```
The `summary_citations` property returns a table of all citations stored in the data store (line 24). Export to BibTeX with `write_bib()` (line 26).
!!! note
`ReadOnlyDataStoreZipped` supports reading stored citations but not writing them.
cogent3-scinexus-e0aee79/docs/howto/write-a-class-app.md 0000664 0000000 0000000 00000007031 15202532665 0023266 0 ustar 00root root 0000000 0000000 # Write a class app
!!! abstract ""
How to inherit from `scinexus` app base classes, or using the `define_app` decorator, specifying input/output type hints.
## Using inheritance from a base class
```python linenums="1"
from collections.abc import Callable
from citeable import Software
from scinexus import ComposableApp
my_cite = Software(
author=["Doe, J", "Smith, A"],
title="My Sequence Filter",
year=2025,
url="https://example.com/my-filter",
version="0.1.0",
)
class my_app( # (1)!
ComposableApp[str, str], # (2)!
cite=my_cite, # (3)!
):
def __init__(self, convert: Callable[[str], str]):
self.convert = convert
def main(self, val: str) -> str: # (4)!
return self.convert(val)
```
1. We suggest naming your apps using the PEP8 naming style for functions (lowercase separated by underscores) because the instances will be used like functions.
2. We type hint the input / output types with the base class.
3. We assign the citation in the class definition.
4. Your class **must** have a `main()` method with type hints specified for its first argument and its return type.
## Using the `define_app` decorator
How to use `@define_app` on a class with a `main()` method, configure it via `__init__` parameters, and control behaviour with the `app_type` parameter.
```python { linenums="1" notest }
from collections.abc import Callable
from citeable import Software
from scinexus import define_app
my_cite = Software(
author=["Doe, J", "Smith, A"],
title="My Sequence Filter",
year=2025,
url="https://example.com/my-filter",
version="0.1.0",
)
@define_app(cite=my_cite) # (1)!
class my_app:
def __init__(self, convert: Callable[[str], str]):
self.convert = convert
def main(self, val: str) -> str: # (2)!
return self.convert(val)
```
1. The `define_app` decorator is used. You can specify the `app_type` here, which we don't in this case, and assign your citation.
2. Your class **must** have a `main()` method with type hints specified for its first argument and its return type.
### Specifying the app type
The `define_app` decorator has a default `app_type` of `"generic"`. This means the app does data transformation and does not load or write data. The supported app types are indicated by the `AppType` enum:
```python { linenums="1" notest }
from scinexus.composable import AppType
print(list(AppType))
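# [<AppType.LOADER: 'loader'>, <AppType.WRITER: 'writer'>, <AppType.GENERIC: 'generic'>, <AppType.NON_COMPOSABLE: 'non_composable'>]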
```
If your app is not intended to be composed sequentially with other apps, set it to non-composable:
```python
from scinexus import define_app, AppType
@define_app(app_type=AppType.NON_COMPOSABLE)
class my_standalone_app:
def main(self, val: str) -> str:
return val.upper()
```
## Handling `NotCompleted` values
By default, apps skip `NotCompleted` inputs — they propagate through the pipeline without calling `main()`. If your app needs access to `NotCompleted` instances (e.g. you are developing a writer that records failures), set `skip_not_completed=False`:
```python
from scinexus import define_app, NotCompleted
@define_app(skip_not_completed=False)
class my_writer:
def main(self, val: str) -> str:
if isinstance(val, NotCompleted):
# handle the failure
...
return val
```
cogent3-scinexus-e0aee79/docs/howto/write-a-function-app.md 0000664 0000000 0000000 00000004267 15202532665 0024016 0 ustar 00root root 0000000 0000000 # Write a function app
!!! abstract ""
How to use `@define_app` on a plain function, specifying input/output type hints and turning function parameters into constructor arguments.
```python { linenums="1" notest }
from collections.abc import Callable
from citeable import Software
from scinexus import define_app, AppType
my_cite = Software(
author=["Doe, J", "Smith, A"],
title="My Sequence Filter",
year=2025,
url="https://example.com/my-filter",
version="0.1.0",
)
@define_app(
app_type=AppType.GENERIC, # (1)!
cite=my_cite, # (2)!
)
def my_app(val: str, convert: Callable[[str], str]) -> str: # (3)!
return convert(val)
app = my_app(str.upper)
print(app("hello world"))
# HELLO WORLD
```
1. We specify the `app_type` explicitly here.
2. We assign the citation via the `cite` argument of the decorator.
3. The function definition has type hints for its first argument and its return type.
!!! note
Your function can only have one required argument. It can have any number of optional arguments.
Pay attention to the order of arguments for the function! Every call to the app provides a new instance of `val`. Whereas `str.upper` is assigned to the variable `convert`. You can think of all of the other arguments as being arguments to a class constructor. Under the hood, `scinexus` caches these and injects them into each call of your function with new values of `val`.
cogent3-scinexus-e0aee79/docs/images/ 0000775 0000000 0000000 00000000000 15202532665 0017617 5 ustar 00root root 0000000 0000000 cogent3-scinexus-e0aee79/docs/images/logo-bw.png 0000664 0000000 0000000 00000002311 15202532665 0021670 0 ustar 00root root 0000000 0000000 PNG
IHDR pHYs l l\. tEXtSoftware www.inkscape.org< VIDATH{ewSDgEai,gyDHl
+Lrha+rsYJ)u!sn+}V]q{x!;}]@8s$.%0ZwT_L |]wxlű9ɏq=lJ2
oRܖIQx
I:oB, E'Ob f5!xU
8;{'9szgO܅7'Yw0$ӫGcMF.o|LIVc_LdX_73p8b43d_<;|Ёc/]ݻqwU=ګ\ql)XWU QU!u`WIzG`44o4"\Sq^,OqnO#;pfUmIBEÊE;J|]DyP'$1YU[AgMU~sq"A|;|'n_oq {?@?S:43{e))f`o
qvk~{&h*ޤ44sj_`͢,Ω8.⑵a㠪bbގO^cFI
}3W4)IYd$p26kYs74杂PU/cS/|:ڤ`۰67$&Σ1$4̯L۪O4üi@gjnpڀ{v5Uth:Gf'dgjCҧMQM4ecFYO;j|Ḫ;7ڭz_ZU%m4AErMdJ0yh9'aknB8F#m3 #o4M߭ {5gcOzO> Ҵ3g^/K31 IENDB` cogent3-scinexus-e0aee79/docs/images/logo-bw.svg 0000664 0000000 0000000 00000115123 15202532665 0021711 0 ustar 00root root 0000000 0000000
cogent3-scinexus-e0aee79/docs/images/logo-text-bw.svg 0000664 0000000 0000000 00000126214 15202532665 0022676 0 ustar 00root root 0000000 0000000
cogent3-scinexus-e0aee79/docs/images/logo-text-wb.svg 0000664 0000000 0000000 00000126301 15202532665 0022673 0 ustar 00root root 0000000 0000000
cogent3-scinexus-e0aee79/docs/images/logo-wb.png 0000664 0000000 0000000 00000002176 15202532665 0021701 0 ustar 00root root 0000000 0000000 PNG
IHDR pHYs l l\. tEXtSoftware www.inkscape.org< IDATHic9n۫V,hKc%hH*"A[b6%Zb-H,IC+mPZԵ(u%L3_4"Bű8uoNX~
b.h`*B;¹^:[ |p'ÁXp3!a