toil-releases-3.24.0/.gitignore
*.pyc
/src/*.egg-info
/build
/dist
/docs/_build
__pycache__
.eggs/
.cache/
.pytest_cache/
.idea/
/test-report.xml
/test-report-*.xml
venv/
v3nv/
tmp/
/src/toil/test/cwl/spec
/cwltool_deps/
/docs/generated_rst/
/docker/Dockerfile
/docker/toil-*.tar.gz
/src/toil/version.py
.coverage*
toil-releases-3.24.0/.gitlab-ci.yml
image: quay.io/vgteam/vg_ci_prebake:latest
# Note that we must run in a privileged container for our internal Docker daemon to come up.
before_script:
- startdocker || true
- docker info
- cat /etc/hosts
- export PYTHONIOENCODING=utf-8
- mkdir -p ~/.kube && cp "$GITLAB_SECRET_FILE_KUBE_CONFIG" ~/.kube/config
- mkdir -p ~/.aws && cp "$GITLAB_SECRET_FILE_AWS_CREDENTIALS" ~/.aws/credentials
after_script:
# We need to clean up any files that Toil may have made via Docker that
# aren't deletable by the Gitlab user. If we don't do this, Gitlab will try
# to clean them up before running the next job on the runner, fail, and fail
# that next job.
- pwd
- sudo rm -rf tmp
- stopdocker || true
stages:
- main_tests
- test
- integration
# Python2.7
py2_batch_systems:
stage: test
script:
- pwd
- apt update && DEBIAN_FRONTEND=noninteractive apt install -y tzdata && apt install -y jq
- virtualenv -p python2.7 venv && . venv/bin/activate && make prepare && make develop extras=[all] && pip install htcondor awscli==1.16.272
- python -m pytest -s -r s src/toil/test/batchSystems/batchSystemTest.py
- python -m pytest -s -r s src/toil/test/mesos/MesosDataStructuresTest.py
py2_cwl:
stage: test
script:
- pwd
- virtualenv -p python2.7 venv && . venv/bin/activate && make prepare && make develop extras=[all] && pip install htcondor
- python -m pytest -s -r s src/toil/test/cwl/cwlTest.py
py2_wdl:
stage: test
script:
- pwd
- virtualenv -p python2.7 venv && . venv/bin/activate && make prepare && make develop extras=[all] && pip install htcondor
- python -m pytest -s -r s src/toil/test/wdl/toilwdlTest.py
py2_jobstore_and_provisioning:
stage: test
script:
- pwd
- virtualenv -p python2.7 venv && . venv/bin/activate && make prepare && make develop extras=[all] && pip install htcondor
- python -m pytest -s -r s src/toil/test/sort/sortTest.py
- python -m pytest -s -r s src/toil/test/provisioners/aws/awsProvisionerTest.py
- python -m pytest -s -r s src/toil/test/provisioners/clusterScalerTest.py
- python -m pytest -s -r s src/toil/test/provisioners/gceProvisionerTest.py
py2_main:
stage: main_tests
script:
- pwd
- virtualenv -p python2.7 venv && . venv/bin/activate && make prepare && make develop extras=[all] && pip install htcondor
- python -m pytest -s -r s src/toil/test/src
- python -m pytest -s -r s src/toil/test/utils
py2_appliance_build:
stage: main_tests
script:
- pwd
- apt update && DEBIAN_FRONTEND=noninteractive apt install -y tzdata && apt install -y jq
- virtualenv -p python2.7 venv && . venv/bin/activate && make prepare && make develop extras=[all] && pip install htcondor awscli==1.16.272
# This reads GITLAB_SECRET_FILE_QUAY_CREDENTIALS
- python setup_gitlab_docker.py
- make push_docker
py2_integration_jobstore:
stage: integration
script:
- pwd
- apt update && DEBIAN_FRONTEND=noninteractive apt install -y tzdata && apt install -y jq
- virtualenv -p python2.7 venv && . venv/bin/activate && make prepare && make develop extras=[all] && pip install htcondor awscli==1.16.272
- export TOIL_TEST_INTEGRATIVE=True; export TOIL_AWS_KEYNAME=id_rsa; export TOIL_AWS_ZONE=us-west-2a
# This reads GITLAB_SECRET_FILE_SSH_KEYS
- python setup_gitlab_ssh.py
- python -m pytest -s -r s src/toil/test/jobStores/jobStoreTest.py
py2_integration_sort:
stage: integration
script:
- pwd
- apt update && DEBIAN_FRONTEND=noninteractive apt install -y tzdata && apt install -y jq
- virtualenv -p python2.7 venv && . venv/bin/activate && make prepare && make develop extras=[all] && pip install htcondor awscli==1.16.272
- export TOIL_TEST_INTEGRATIVE=True; export TOIL_AWS_KEYNAME=id_rsa; export TOIL_AWS_ZONE=us-west-2a
# This reads GITLAB_SECRET_FILE_SSH_KEYS
- python setup_gitlab_ssh.py
- python -m pytest -s -r s src/toil/test/sort/sortTest.py
- python -m pytest -s -r s src/toil/test/provisioners/clusterScalerTest.py
#py2_integration_provisioner:
# stage: integration
# script:
# - pwd
# - apt update && DEBIAN_FRONTEND=noninteractive apt install -y tzdata && apt install -y jq
# - virtualenv -p python2.7 venv && . venv/bin/activate && make prepare && make develop extras=[all] && pip install htcondor awscli==1.16.272
# - export TOIL_TEST_INTEGRATIVE=True; export TOIL_AWS_KEYNAME=id_rsa; export TOIL_AWS_ZONE=us-west-2a
# # This reads GITLAB_SECRET_FILE_SSH_KEYS
# - python setup_gitlab_ssh.py
# - python -m pytest -s -r s src/toil/test/provisioners/aws/awsProvisionerTest.py
# Python3.6
py3_batch_systems:
stage: test
script:
- pwd
- apt update && DEBIAN_FRONTEND=noninteractive apt install -y tzdata && apt install -y jq
- virtualenv -p python3.6 venv && . venv/bin/activate && make prepare && make develop extras=[all] && pip install htcondor awscli==1.16.272
- python -m pytest -s -r s src/toil/test/batchSystems/batchSystemTest.py
- python -m pytest -s -r s src/toil/test/mesos/MesosDataStructuresTest.py
py3_cwl:
stage: test
script:
- pwd
- virtualenv -p python3.6 venv && . venv/bin/activate && make prepare && make develop extras=[all] && pip install htcondor
- python -m pytest -s -r s src/toil/test/cwl/cwlTest.py
py3_wdl:
stage: test
script:
- pwd
- virtualenv -p python3.6 venv && . venv/bin/activate && make prepare && make develop extras=[all] && pip install htcondor
- python -m pytest -s -r s src/toil/test/wdl/toilwdlTest.py
py3_jobstore_and_provisioning:
stage: test
script:
- pwd
- virtualenv -p python3.6 venv && . venv/bin/activate && make prepare && make develop extras=[all] && pip install htcondor
- python -m pytest -s -r s src/toil/test/jobStores/jobStoreTest.py
- python -m pytest -s -r s src/toil/test/sort/sortTest.py
- python -m pytest -s -r s src/toil/test/provisioners/aws/awsProvisionerTest.py
- python -m pytest -s -r s src/toil/test/provisioners/clusterScalerTest.py
- python -m pytest -s -r s src/toil/test/provisioners/gceProvisionerTest.py
py3_main:
stage: main_tests
script:
- pwd
- virtualenv -p python3.6 venv && . venv/bin/activate && make prepare && make develop extras=[all] && pip install htcondor
- python -m pytest -s -r s src/toil/test/src
- python -m pytest -s -r s src/toil/test/utils
py3_appliance_build:
stage: main_tests
script:
- pwd
- apt update && DEBIAN_FRONTEND=noninteractive apt install -y tzdata && apt install -y jq
- virtualenv -p python3.6 venv && . venv/bin/activate && make prepare && make develop extras=[all] && pip install htcondor awscli==1.16.272
# This reads GITLAB_SECRET_FILE_QUAY_CREDENTIALS
- python3.6 setup_gitlab_docker.py
- make push_docker
#py3_integration:
# stage: integration
# script:
# - pwd
# - apt update && DEBIAN_FRONTEND=noninteractive apt install -y tzdata && apt install -y jq
# - virtualenv -p python3.6 venv && . venv/bin/activate && make prepare && make develop extras=[all] && pip install htcondor awscli==1.16.272
# - export TOIL_TEST_INTEGRATIVE=True
# - export TOIL_AWS_KEYNAME=id_rsa
# - export TOIL_AWS_ZONE=us-west-2a
# # This reads GITLAB_SECRET_FILE_SSH_KEYS
# - python setup_gitlab_ssh.py
# - python -m pytest -s -r s src/toil/test/jobStores/jobStoreTest.py
toil-releases-3.24.0/.travis.yml
language: python
python:
- "3.5"
- "3.6"
- "2.7"
install:
- make prepare
- make develop extras=[aws,google] # adding extras to avoid import errors
script:
- TOIL_TEST_QUICK=True make test_offline
env:
# Necessary to get boto to work in Travis's Ubuntu Precise
# environment (see #2498). Consider removing this if/when we
# transition to the Xenial environment.
- BOTO_CONFIG=/dev/null
toil-releases-3.24.0/CODE_OF_CONDUCT.md
# Contributor Covenant Code of Conduct
## Our Pledge
In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
## Our Standards
Examples of behavior that contributes to creating a positive environment include:
* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members
Examples of unacceptable behavior by participants include:
* The use of sexualized language or imagery and unwelcome sexual attention or advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a professional setting
## Our Responsibilities
Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
## Scope
This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at bd2k-genomics@googlegroups.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version]
[homepage]: http://contributor-covenant.org
[version]: http://contributor-covenant.org/version/1/4/
toil-releases-3.24.0/CONTRIBUTING.md
Contribution Guidelines
=======================
Before proposing a pull request, please read our [Contributor's Guide][1].
[1]: https://toil.readthedocs.io/en/master/contributing/contributing.html#contributing "Toil Contributor's Guide"
toil-releases-3.24.0/LICENSE
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2011 UCSC Computational Genomics Lab
Original Contributors: Benedict Paten, Hannes Schmidt, John Vivian,
Christopher Ketchum, Joel Armstrong and co-authors (benedictpaten@gmail.com)
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
toil-releases-3.24.0/Makefile
# Copyright (C) 2015-2018 UCSC Computational Genomics Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include common.mk
define help
Supported targets: prepare, develop, docs, sdist, clean, test, docker and push_docker.
Please note that all build targets require a virtualenv to be active.
The 'prepare' target installs Toil's build requirements into the current virtualenv.
The 'develop' target creates an editable install of Toil and its runtime requirements in the
current virtualenv. The install is called 'editable' because changes to the source code
immediately affect the virtualenv. Set the 'extras' variable to ensure that the 'develop' target
installs support for extras; some tests require extras to be installed. Consult setup.py for the
list of supported extras. To install Toil in develop mode with all extras, run
make develop extras=[all]
The 'sdist' target creates a source distribution of Toil. It is used for some unit tests and for
installing the currently checked out version of Toil into the appliance image.
The 'clean' target cleans up the side effects of 'develop', 'sdist', 'docs', and 'docker'
on this machine. It does not undo externally visible effects like removing packages already
uploaded to PyPI.
The 'docs' target uses Sphinx to create HTML documentation in the docs/_build directory.
Targets are provided to run Toil's tests. Note that these targets do *not* automatically install
Toil's dependencies; it is recommended to 'make develop' before running any of them.
The 'test' target runs Toil's unit tests serially with pytest. It will run some docker tests and
setup. Note: this target does not capture output from the terminal. For any of the test targets,
set the 'tests' variable to run a particular test, e.g.
make test tests=src/toil/test/sort/sortTest.py::SortTest::testSort
The 'integration_test' target runs toil's integration tests. These are more thorough but also
more costly than the regular unit tests. For the AWS integration tests to run, the environment
variable 'TOIL_AWS_KEYNAME' must be set. This user will be charged for expenses accrued during the
test.
The 'docker' target builds the Docker images that make up the Toil appliance. You may set the
TOIL_DOCKER_REGISTRY variable to override the default registry that the 'push_docker' target pushes
the appliance images to, for example:
TOIL_DOCKER_REGISTRY=quay.io/USER make docker
If Docker is not installed, Docker-related targets and tests will be skipped. The
same can be achieved by setting TOIL_DOCKER_REGISTRY to an empty string.
The 'push_docker' target pushes the Toil appliance images to a remote Docker registry. It
requires the TOIL_DOCKER_REGISTRY variable to be set to a value other than the default to avoid
accidentally pushing to the official Docker registry for Toil.
The TOIL_DOCKER_NAME environment variable can be set to customize the appliance image name that
is created by the 'docker' target and pushed by the 'push_docker' target. The Toil team\'s
continuous integration system overrides this variable to avoid conflicts between concurrently
executing builds for the same revision, e.g. toil-pr and toil-it.
endef
export help
help:
@printf "$$help"
# This Makefile uses bash features like printf and <()
SHELL=bash
python=python
pip=pip
tests=src
tests_local=src/toil/test
# do slightly less than travis timeout of 10 min.
pytest_args_local=-vv --timeout=530
extras=
sdist_name:=toil-$(shell $(python) version_template.py distVersion).tar.gz
green=\033[0;32m
normal=\033[0m
red=\033[0;31m
cyan=\033[0;36m
develop: check_venv
$(pip) install -e .$(extras)
clean_develop: check_venv
- $(pip) uninstall -y toil
- rm -rf src/*.egg-info
- rm src/toil/version.py
sdist: dist/$(sdist_name)
dist/$(sdist_name): check_venv
@test -f dist/$(sdist_name) && mv dist/$(sdist_name) dist/$(sdist_name).old || true
$(python) setup.py sdist
@test -f dist/$(sdist_name).old \
&& ( cmp -s <(tar -xOzf dist/$(sdist_name)) <(tar -xOzf dist/$(sdist_name).old) \
&& mv dist/$(sdist_name).old dist/$(sdist_name) \
&& printf "$(cyan)No significant changes to sdist, reinstating backup.$(normal)\n" \
|| rm dist/$(sdist_name).old ) \
|| true
clean_sdist:
- rm -rf dist
- rm src/toil/version.py
# We always claim to be Travis, so that local test runs will not skip Travis tests.
# Gitlab doesn't run tests via the Makefile.
# The auto-deployment test needs the docker appliance
test: check_venv check_build_reqs docker
TRAVIS=true \
$(python) -m pytest --cov=toil $(pytest_args_local) $(tests)
# This target will skip building docker and all docker based tests
# these are our travis tests; rename?
test_offline: check_venv check_build_reqs
@printf "$(cyan)All docker related tests will be skipped.$(normal)\n"
TOIL_SKIP_DOCKER=True \
TRAVIS=true \
$(python) -m pytest $(pytest_args_local) $(tests_local)
ifdef TOIL_DOCKER_REGISTRY
docker_image:=$(TOIL_DOCKER_REGISTRY)/$(TOIL_DOCKER_NAME)
grafana_image:=$(TOIL_DOCKER_REGISTRY)/toil-grafana
prometheus_image:=$(TOIL_DOCKER_REGISTRY)/toil-prometheus
mtail_image:=$(TOIL_DOCKER_REGISTRY)/toil-mtail
define tag_docker
@printf "$(cyan)Removing old tag $2. This may fail but that's expected.$(normal)\n"
-docker rmi $2
docker tag $1 $2
@printf "$(green)Tagged appliance image $1 as $2.$(normal)\n"
endef
docker: docker/Dockerfile
# Pre-pull everything
for i in $$(seq 1 11); do if [[ $$i == "11" ]] ; then exit 1 ; fi ; docker pull ubuntu:16.04 && break || sleep 60; done
for i in $$(seq 1 11); do if [[ $$i == "11" ]] ; then exit 1 ; fi ; docker pull prom/prometheus:v2.0.0 && break || sleep 60; done
for i in $$(seq 1 11); do if [[ $$i == "11" ]] ; then exit 1 ; fi ; docker pull grafana/grafana && break || sleep 60; done
for i in $$(seq 1 11); do if [[ $$i == "11" ]] ; then exit 1 ; fi ; docker pull sscaling/mtail && break || sleep 60; done
@set -ex \
; cd docker \
; docker build --tag=$(docker_image):$(TOIL_DOCKER_TAG) -f Dockerfile .
@set -ex \
; cd dashboard/prometheus \
; docker build --tag=$(prometheus_image):$(TOIL_DOCKER_TAG) -f Dockerfile .
@set -ex \
; cd dashboard/grafana \
; docker build --tag=$(grafana_image):$(TOIL_DOCKER_TAG) -f Dockerfile .
@set -ex \
; cd dashboard/mtail \
; docker build --tag=$(mtail_image):$(TOIL_DOCKER_TAG) -f Dockerfile .
docker/$(sdist_name): dist/$(sdist_name)
cp $< $@
docker/Dockerfile: docker/Dockerfile.py docker/$(sdist_name)
_TOIL_SDIST_NAME=$(sdist_name) $(python) docker/Dockerfile.py > $@
clean_docker:
-rm docker/Dockerfile docker/$(sdist_name)
-docker rmi $(docker_image):$(TOIL_DOCKER_TAG)
push_docker: docker
# The weird if logic is there so we fail only if all the pushes fail
for i in $$(seq 1 6); do if [[ $$i == "6" ]] ; then exit 1 ; fi ; docker push $(docker_image):$(TOIL_DOCKER_TAG) && break || sleep 60; done
for i in $$(seq 1 6); do if [[ $$i == "6" ]] ; then exit 1 ; fi ; docker push $(grafana_image):$(TOIL_DOCKER_TAG) && break || sleep 60; done
for i in $$(seq 1 6); do if [[ $$i == "6" ]] ; then exit 1 ; fi ; docker push $(prometheus_image):$(TOIL_DOCKER_TAG) && break || sleep 60; done
for i in $$(seq 1 6); do if [[ $$i == "6" ]] ; then exit 1 ; fi ; docker push $(mtail_image):$(TOIL_DOCKER_TAG) && break || sleep 60; done
else
docker push_docker clean_docker:
@printf "$(cyan)Skipping '$@' target as TOIL_DOCKER_REGISTRY is empty or Docker is not installed.$(normal)\n"
endif
docs: check_venv check_build_reqs
# Strange, but seemingly benign Sphinx warning floods stderr if not filtered:
cd docs && make html
clean_docs: check_venv
- cd docs && make clean
clean: clean_develop clean_sdist clean_docs
check_build_reqs:
@$(python) -c 'import mock; import pytest' \
|| ( printf "$(red)Build requirements are missing. Run 'make prepare' to install them.$(normal)\n" ; false )
prepare: check_venv
$(pip) install mock==1.0.1 pytest==4.3.1 pytest-cov==2.6.1 stubserver==1.0.1 pytest-timeout==1.3.3 cwltest
check_venv:
@$(python) -c 'import sys, os; sys.exit( int( 0 if "VIRTUAL_ENV" in os.environ else 1 ) )' \
|| ( printf "$(red)A virtualenv must be active.$(normal)\n" ; false )
check_clean_working_copy:
@printf "$(green)Checking if your working copy is clean ...$(normal)\n"
@git diff --exit-code > /dev/null \
|| ( printf "$(red)Your working copy looks dirty.$(normal)\n" ; false )
@git diff --cached --exit-code > /dev/null \
|| ( printf "$(red)Your index looks dirty.$(normal)\n" ; false )
@test -z "$$(git ls-files --other --exclude-standard --directory)" \
|| ( printf "$(red)You have untracked files:$(normal)\n" \
; git ls-files --other --exclude-standard --directory \
; false )
check_cpickle:
# fail if cPickle.dump(s) called without HIGHEST_PROTOCOL
# https://github.com/BD2KGenomics/toil/issues/1503
! find src -iname '*.py' | xargs grep 'cPickle.dump' | grep --invert-match HIGHEST_PROTOCOL
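# The call pattern the check above enforces in Python sources is, illustratively:
#     cPickle.dump(obj, fileHandle, cPickle.HIGHEST_PROTOCOL)
# Without an explicit protocol, cPickle defaults to protocol 0, which is slower
# and produces much larger pickles of the serialized jobs.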
.PHONY: help \
prepare \
check_cpickle \
develop clean_develop \
sdist clean_sdist \
test test_offline \
docs clean_docs \
clean \
check_venv \
check_clean_working_copy \
check_build_reqs \
docker clean_docker push_docker
toil-releases-3.24.0/README.rst
ATTENTION: Toil will drop Python 2.7 support on January 1, 2020, when Python 2.7 itself is scheduled to reach end of life. The last Python 2.7-compatible release of Toil will be made at that time.
Toil is a scalable, efficient, cross-platform (Linux & macOS) pipeline management system,
written entirely in Python, and designed around the principles of functional
programming.
* Check the `website`_ for a description of Toil and its features.
* Full documentation for the latest stable release can be found at
`Read the Docs`_.
* Please subscribe to the low-volume `announce`_ mailing list so we can keep you informed
* Google Groups discussion `forum`_
* See our occasional `blog`_ for tutorials.
.. _website: http://toil.ucsc-cgl.org/
.. _Read the Docs: https://toil.readthedocs.io/en/latest
.. _announce: https://groups.google.com/forum/#!forum/toil-announce
.. _forum: https://groups.google.com/forum/#!forum/toil-community
.. _blog: https://toilpipelines.wordpress.com/
.. image:: https://badges.gitter.im/bd2k-genomics-toil/Lobby.svg
:alt: Join the chat at https://gitter.im/bd2k-genomics-toil/Lobby
:target: https://gitter.im/bd2k-genomics-toil/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge
Note: Toil moved from https://github.com/BD2KGenomics/toil to https://github.com/DataBiosphere/toil on July 5th, 2018.
toil-releases-3.24.0/ROADMAP.md
# Near Term (In Progress, Estimated Completion Date?)
* Libcloud provisioning
- [ ] Google
- [ ] AWS
- [ ] Fix flaky tests
- [ ] Run massive workflows
- [ ] Better feedback (error messages, logging).
# Medium Term (~ 6-month goals, by ~June 2018?)
* Batch systems
- [ ] Google Pipelines
- [ ] AWS Batch
- [ ] Containerize leader (work with Consonance)
- [ ] Change the thread pool model to improve single machine usage.
* Improve the development process.
- [ ] Add a linter
- [ ] Add a code coverage tool.
- [ ] Organize tests.
- [ ] Better access to tests for external developers.
- [ ] TES support
- [ ] WES Support (if Consonance does not work well)
# Longer Term
- [ ] Better track versions of specifications (e.g. CWL, WDL) and dependencies.
- [ ] Add other provisioners: OpenStack
- [ ] Singularity support.
- [ ] Uniform configuration (i.e. not just environment variables).
- [ ] Add management and monitoring UIs.
- [ ] Python 3 support.
- [ ] Add URL wrapping for streaming instead of copying.
# Completed
- [x] Basic WDL support.
- [x] Travis CI for commits.
- [x] Run Toil within Popper (https://cross.ucsc.edu/tag/popper/).
- [x] Grafana for workflow monitoring
- [x] Finish Google jobStore (GCP)
toil-releases-3.24.0/attic/README.md
# Toil
Python-based pipeline management software for clusters that makes running recursive and dynamically scheduled computations straightforward. So far it works with GridEngine, LSF, Parasol, and on multi-core machines.
## Authors
[Benedict Paten](https://github.com/benedictpaten/), [Dent Earl](https://github.com/dentearl/), [Daniel Zerbino](https://github.com/dzserbino/), [Glenn Hickey](https://github.com/glennhickey/), other UCSC people.
## Requirements
* Python 2.5 or later, but less than 3.0
## Installation
1. Install sonLib. See https://github.com/benedictpaten/sonLib
2. Place the directory containing toil in the same directory as sonLib.
The directory containing both sonLib and toil should be on your python path. i.e.
PYTHONPATH=${PYTHONPATH}:FOO where FOO/toil is the path containing the base directory of toil.
3. Build the code:
Type 'make all' in the base directory; this just puts some files (currently all Python-based) into the bin dir. In the future there might be some actual compilation.
4. Test the code:
python allTests.py or 'make test'.
## Running and examining a toil script
The following walks through running a toil script and using the command-line tools **toilStatus**, **toilRun** and **toilStats**, which are used to analyse the status of a run, restart it, and print performance statistics about it, respectively.
Once toil is installed, running a toil script is performed by executing the script from the command-line, e.g. (using the file sorting toy example in **tests/sort/scriptTreeTest_Sort.py**):
[]$ scriptTreeTest_Sort.py --fileToSort foo --toil bar/toil --batchSystem parasol --logLevel INFO --stats
In this case the script uses the parasol batch system and INFO-level logging; foo is the file to sort and bar/toil is the location of a directory (which should not already exist) from which the batch will be managed. Details of the toil options are described below; the stats option is used to gather statistics about the jobs in a run.
The script will return a zero exit value if the toil system is successfully able to run to completion; otherwise it will raise an exception. If the script fails because a job failed, then the log file information of the job will be reported to standard error.
The toil directory (here 'bar/toil') is not automatically deleted regardless of success or failure, and contains a record of the jobs run, which can be queried using the **toilStatus** command, e.g.
[]$toilStatus bar/toil --verbose
```
There are 0 active jobs, 0 parent jobs with children, 0 totally failed jobs and 0 empty jobs (i.e. finished but not cleaned up) currently in toil: toil
There are no failed jobs to report
```
If a job failed, this provides a convenient way to reprint the error. The following are the important options to **toilStatus**:
--toil=TOIL Directory containing the jobtree. The toil location can also be specified as the argument to the script. default=./toil
--verbose Print loads of information, particularly all the log
files of jobs that failed. default=False
--failIfNotComplete Return exit value of 1 if toil jobs not all
completed. default=False
If a job in the script failed or the system went down, you may wish to retry the job after fixing the error. This can be achieved by restarting the script with the **toilRun** command, which will restart an existing toil.
[]$ toilRun --toil bar/toil --logLevel INFO
It will always attempt to restart the jobs from the previous point of failure.
If the script was run with the **--stats** option, then **toilStats** can be run on the pipeline to generate information about the performance of the run, in terms of how many jobs were run, how long they executed for and how much CPU time/wait time was involved, e.g.:
[]$toilStats bar/toil
```
Batch System: singleMachine
Default CPU: 1 Default Memory: 2097152K
Job Time: 0.50 Max CPUs: 9.22337e+18 Max Threads: 4
Total Clock: 0.09 Total Runtime: 7.60
Slave
Count | Time* | Clock | Wait | Memory
n | min med* ave max total | min med ave max total | min med ave max total | min med ave max total
365 | 0.01 0.02 0.02 0.06 6.82 | 0.01 0.01 0.01 0.04 4.71 | 0.00 0.00 0.01 0.03 2.11 | 9781248K 13869056K 13799121K 14639104K 5036679168K
Job
Slave Jobs | min med ave max
| 2 2 2 2
Count | Time* | Clock | Wait | Memory
n | min med* ave max total | min med ave max total | min med ave max total | min med ave max total
367 | 0.00 0.00 0.00 0.03 0.68 | 0.00 0.00 0.00 0.01 0.42 | 0.00 0.00 0.00 0.03 0.26 | 9461760K 13869056K 13787694K 14639104K 5060083712K
Cleanup
Count | Time* | Clock | Wait | Memory
n | min med* ave max total | min med ave max total | min med ave max total | min med ave max total
1 | 0.00 0.00 0.00 0.00 0.00 | 0.00 0.00 0.00 0.00 0.00 | 0.00 0.00 0.00 0.00 0.00 | 14639104K 14639104K 14639104K 14639104K 14639104K
Up
Count | Time* | Clock | Wait | Memory
n | min med* ave max total | min med ave max total | min med ave max total | min med ave max total
124 | 0.00 0.00 0.00 0.01 0.15 | 0.00 0.00 0.00 0.01 0.12 | 0.00 0.00 0.00 0.01 0.03 | 13713408K 14090240K 14044985K 14581760K 1741578240K
Setup
Count | Time* | Clock | Wait | Memory
n | min med* ave max total | min med ave max total | min med ave max total | min med ave max total
1 | 0.00 0.00 0.00 0.00 0.00 | 0.00 0.00 0.00 0.00 0.00 | 0.00 0.00 0.00 0.00 0.00 | 9551872K 9551872K 9551872K 9551872K 9551872K
Down
Count | Time* | Clock | Wait | Memory
n | min med* ave max total | min med ave max total | min med ave max total | min med ave max total
241 | 0.00 0.00 0.00 0.03 0.53 | 0.00 0.00 0.00 0.00 0.30 | 0.00 0.00 0.00 0.03 0.23 | 9461760K 13828096K 13669354K 14155776K 3294314496K
```
The breakdown is given per "slave", which is a unit of serial execution, and per "job", which corresponds to a scriptTree job (see below).
Despite its simplicity, we've found this can be **very** useful for tracking down performance issues, particularly when trying out a pipeline on a new system.
The important arguments to **toilStats** are:
--outputFile=OUTPUTFILE
File in which to write results
--raw output the raw xml data.
--pretty, --human if not raw, prettify the numbers to be human readable.
--categories=CATEGORIES
comma separated list from [time, clock, wait, memory]
--sortCategory=SORTCATEGORY
how to sort Job list. may be from [alpha, time,
clock, wait, memory, count]. default=%(default)s
--sortField=SORTFIELD
how to sort Job list. may be from [min, med, ave,
max, total]. default=%(default)s
--sortReverse, --reverseSort
reverse sort order.
--cache stores a cache to speed up data display.
## toil options
A toil script will have the following command-line options.
Options that control logging.
--logOff Turn off logging. (default is CRITICAL)
--logInfo Turn on logging at INFO level. (default is CRITICAL)
--logDebug Turn on logging at DEBUG level. (default is CRITICAL)
--logLevel=LOGLEVEL
Log at level (may be either OFF/INFO/DEBUG/CRITICAL).
(default is CRITICAL)
--logFile=LOGFILE File to log in
--rotatingLogging Turn on rotating logging, which prevents log files
getting too big.
Options to specify the location of the toil and turn on stats
collation about the performance of jobs.
--toil=TOIL Directory in which to place job management files and
the global accessed temporary file directories(this
needs to be globally accessible by all machines
running jobs). If you pass an existing directory it
will check if it's a valid existing jobtree, then try
and restart the jobs in it. The default=./toil
--stats Records statistics about the job-tree to be used by
toilStats. default=False
Options for specifying the batch system, and arguments to the
batch system/big batch system (see below).
--batchSystem=BATCHSYSTEM
The type of batch system to run the job(s) with,
currently can be
'singleMachine'/'parasol'/'acidTest'/'gridEngine'/'lsf'.
default=singleMachine
--maxThreads=MAXTHREADS
The maximum number of threads (technically processes
at this point) to use when running in single machine
mode. Increasing this will allow more jobs to run
concurrently when running on a single machine.
default=4
--parasolCommand=PARASOLCOMMAND
The command to run the parasol program default=parasol
Options to specify default cpu/memory requirements (if not
specified by the jobs themselves), and to limit the total amount of
memory/cpu requested from the batch system.
--defaultMemory=DEFAULTMEMORY
The default amount of memory to request for a job (in
bytes), by default is 2^31 = 2 gigabytes,
default=2147483648
--defaultCores=DEFAULTCORES
The default the number of cpus to dedicate a job.
default=1
--maxCpus=MAXCPUS The maximum number of cpus to request from the batch
system at any one time. default=9223372036854775807
--maxMemory=MAXMEMORY
The maximum amount of memory to request from the batch
system at any one time. default=9223372036854775807
Options for rescuing/killing/restarting jobs, includes options for jobs that either run too long/fail or get lost (some
batch systems have issues!).
--retryCount=RETRYCOUNT
Number of times to retry a failing job before giving
up and labeling job failed. default=0
--maxJobDuration=MAXJOBDURATION
Maximum runtime of a job (in seconds) before we kill
it (this is a lower bound, and the actual time before
killing the job may be longer).
default=9223372036854775807
--rescueJobsFrequency=RESCUEJOBSFREQUENCY
Period of time to wait (in seconds) between checking
for missing/overlong jobs, that is jobs which get lost
by the batch system. Expert parameter. (default is set
by the batch system)
toil big batch system options; toil can employ a secondary batch system for running large
memory/cpu jobs using the following arguments.
--bigBatchSystem=BIGBATCHSYSTEM
The batch system to run for jobs with larger
memory/cpus requests, currently can be
'singleMachine'/'parasol'/'acidTest'/'gridEngine'.
default=none
--bigMemoryThreshold=BIGMEMORYTHRESHOLD
The memory threshold above which to submit to the big
queue. default=9223372036854775807
--bigCpuThreshold=BIGCPUTHRESHOLD
The cpu threshold above which to submit to the big
queue. default=9223372036854775807
--bigMaxCpus=BIGMAXCPUS
The maximum number of big batch system cpus to allow
at one time on the big queue.
default=9223372036854775807
--bigMaxMemory=BIGMAXMEMORY
The maximum amount of memory to request from the big
batch system at any one time.
default=9223372036854775807
Miscellaneous options.
--jobTime=JOBTIME The approximate time (in seconds) that you'd like a
list of child jobs to be run serially before being
parallelized. This parameter allows one to avoid over
parallelizing tiny jobs, and therefore paying
significant scheduling overhead, by running tiny jobs
in series on a single node/core of the cluster.
default=30
--maxLogFileSize=MAXLOGFILESIZE
The maximum size of a job log file to keep (in bytes),
log files larger than this will be truncated to the
last X bytes. Default is 50 kilobytes, default=50120
--command=COMMAND The command to run (which will generate subsequent
jobs). This is deprecated
## Overview of toil
The following sections are for people creating toil scripts and as general information. The presentation **[docs/toilSlides.pdf](https://github.com/benedictpaten/toil/blob/master/doc/toilSlides.pdf)** is also quite a useful, albeit slightly out of date, guide to using toil.
Most batch systems (such as LSF, Parasol, etc.) do not allow jobs to spawn
other jobs in a simple way.
The basic pattern provided by toil is as follows:
1. You have a job running on your cluster which requires further parallelisation.
2. You create a list of jobs to perform this parallelisation. These are the 'child' jobs of your process; we call them collectively the 'children'.
3. You create a 'follow-on' job, to be performed after all the children have successfully completed. This job is responsible for cleaning up the input files created for the children and doing any further processing. Children should not cleanup files created by parents, in case of a batch system failure which requires the child to be re-run (see 'Atomicity' below).
4. You end your current job successfully.
5. The batch system runs the children. These jobs may in turn have children and follow-on jobs.
6. Upon completion of all the children (and children's children and follow-ons, collectively descendants) the follow-on job is run. The follow-on job may create more children.
## scriptTree
ScriptTree provides a Python interface to toil, and is now the only way to interface with toil (previously you could manipulate XML files, but I've removed that functionality as I improved the underlying system).
Aside from being the interface to toil, scriptTree was designed to remediate some of the pain of writing wrapper scripts for cluster jobs, via the extension of a simple python wrapper class (called a 'Job' to avoid confusion with the more general use of the word 'job') which does much of the work for you. Using scriptTree, you can describe your script as a series of these classes which link together, with all the arguments and options specified in one place. The script then, using the magic of python pickles, generates all the wrappers dynamically and cleans them up when done.
This inherited template pattern has the following advantages:
1. You write (potentially) just one script, not a series of wrappers. It is much easier to understand, maintain, document and explain.
2. You write less boilerplate.
3. You can organise all the input arguments and options in one place.
The best way to learn how to use scriptTree is to look at an example. The following is taken from (an old version of) toil.test.sort.scriptTreeTest_Sort.py
which provides a complete script for performing a parallel merge sort.
Below is the first 'Job' of this script, inherited from the base class 'toil.scriptTree.Job'. Its job is to set up the merge sort.
```python
class Setup(Job):
"""Sets up the sort.
"""
def __init__(self, inputFile, N):
Job.__init__(self, time=1, memory=1000000, cpu=1)
self.inputFile = inputFile
self.N = N
def run(self):
tempOutputFile = getTempFile(rootDir=self.getGlobalTempDir())
self.addChildJob(Down(self.inputFile, 0, os.path.getsize(self.inputFile), self.N, tempOutputFile))
self.setFollowOnJob(Cleanup(tempOutputFile, self.inputFile))
```
The constructor (**__init__()**) assigns some variables to the class. When invoking the constructor of the base class (which should be the first thing the job does), you can optionally pass time (in seconds), memory (in bytes) and cpu parameters. The time parameter is your estimate of how long the job will run - UPDATE: IT IS CURRENTLY UNUSED BY THE SCHEDULER. The memory and cpu parameters allow you to guarantee resources for a job.
The run method is where the variables assigned by the constructor are used and where, in general, the actual work is done.
Aside from doing the specific work of the job (in this case creating a temporary file to hold some intermediate output), the run method is also where children and a follow-on job are created, using **addChildJob()** and **setFollowOnJob()**. A job may have arbitrary numbers of children, but one or zero follow-on jobs.
Jobs are also provided with two temporary file directories called **localTempDir** and **globalTempDir**, which can be accessed with the methods **getLocalTempDir()** and **getGlobalTempDir()**, respectively. The **localTempDir** is the path to a temporary directory that is local to the machine on which the job is being executed and that will exist only for the length of the run method. It is useful for storing interim results that are computed during runtime. All files in this directory are guaranteed to be removed once the run method has finished - even if your job crashes.
A job can either be created as a follow-on, or it can be the very first job, or it can be created as a child of another job. Let a job not created as a follow-on be called a 'founder'. Each founder job may have a follow-on job. If it has a follow-on job, this follow-on job may in turn have a follow-on, etc. Thus each founder job defines a chain of follow-ons. Let a founder job and its maximal sequence of follow-ons be called a 'chain'. Let the last follow-on job in a chain be called the chain's 'closer'. For each chain of jobs a temporary directory, **globalTempDir**, is created immediately prior to calling the founder job's run method; this directory and its contents then persist until the completion of the closer job's run method. Thus the **globalTempDir** is a scratch directory in which temporary results can be stored on disk between jobs in a chain. Furthermore, files created in this directory can be passed to the children of jobs in the chain, allowing results to be transmitted from a job to its children.
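To make the chain and **globalTempDir** behaviour concrete, here is a minimal sketch using only the methods described above (the `Consumer` and `Closer` job classes are illustrative assumptions, defined elsewhere along the same lines):

```python
import os
from toil.scriptTree.job import Job

class Founder(Job):
    """First job of a chain: its globalTempDir persists until the
    chain's closer (the last follow-on) has finished running."""
    def __init__(self):
        Job.__init__(self, time=1, memory=1000000, cpu=1)

    def run(self):
        # An interim result written to the chain's shared scratch space.
        resultPath = os.path.join(self.getGlobalTempDir(), 'interim.txt')
        with open(resultPath, 'w') as f:
            f.write('partial result\n')
        # Children may read resultPath; the follow-on runs only after all
        # children (and their descendants) complete, so it is the right
        # place to clean the file up.
        self.addChildJob(Consumer(resultPath))
        self.setFollowOnJob(Closer(resultPath))
```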
## Making Functions into Jobs
To avoid the need to create a Job class for every job, I've added the ability to wrap functions, hence the code for the setup function described above becomes:
```python
def setup(job, inputFile, N):
"""Sets up the sort.
"""
tempOutputFile = getTempFile(rootDir=job.getGlobalTempDir())
job.addChildJobFn(down, (inputFile, 0, os.path.getsize(inputFile), N, tempOutputFile))
job.setFollowOnFn(cleanup, (tempOutputFile, inputFile))
```
The code to turn this into a job uses the static method **[Job.makeJobFnJob](https://github.com/benedictpaten/toil/blob/development/scriptTree/job.py#L142)**:
```python
Job.makeJobFnJob(setup, (fileToSort, N))
```
Notice that the child and follow-on jobs have also been refactored as functions, hence the methods **[addChildJobFn](https://github.com/benedictpaten/toil/blob/development/scriptTree/job.py#L82)** and **[setFollowOnFn](https://github.com/benedictpaten/toil/blob/development/scriptTree/job.py#L67)**, which take functions as opposed to Job objects.
Note, there are two types of functions you can wrap - **job functions**, whose first argument must be the wrapping job object (the setup function above is an example of a job function), and plain functions that do not have a reference to the wrapping job.
## Creating a scriptTree script:
ScriptTree jobs are serialized (written and retrieved from disk) so that they can be executed in parallel on a cluster of different machines. Thankfully, this is mostly transparent to the user, except for the fact that jobs must be 'pickled' (see python docs), which creates a few constraints upon what can and can not be passed to and stored by a job.
Currently the preferred way to run a pipeline is to create an executable python script.
For example, see **tests/sorts/scriptTreeTest_Sort.py**.
The first line to notice is:
```python
from toil.scriptTree.job import Job, Stack
```
This imports the Job and Stack objects (the stack object is used to run the jobs).
Most of the code defines a series of jobs (see above).
The **main()** method is where the script is setup and run.
The line:
```python
parser = OptionParser()
```
Creates an options parser using the python module optparse.
The line:
```python
Stack.addToilOptions(parser)
```
Adds the toil options to the parser. Most importantly, it adds the command-line option "--toil [path to toil]", which specifies the location in which the toil will be created, and which must be supplied to the script.
The subsequent lines parse the input arguments, notably the line:
```python
options, args = parser.parse_args()
```
reads in the input parameters.
The line:
```python
i = Stack(Setup(options.fileToSort, int(options.N))).startToil(options)
```
Is where the first job is created (the Setup job shown above), where a stack object is created, which is passed the first job as its sole construction argument, and finally where the toil is executed from, using the stack's **startToil(options)** function. The 'options' argument will contain a dictionary of command line arguments which are used by toil. The return value of this function is equal to the number of failed jobs. In this case we choose to throw an exception if there are any remaining.
One final important detail, the lines:
```python
if __name__ == '__main__':
from toil.test.sort.scriptTreeTest_Sort import *
```
reload the objects in the module, such that their module names will be absolute (this is necessary for the serialization that is used). Jobs in other classes that are imported do not need to be reloaded in this way.
## Atomicity
toil and scriptTree are designed to be robust, so that individual jobs can fail and be subsequently restarted. It is assumed jobs can fail at any point. Thus, until toil knows your children have completed okay, you cannot assume that your Job has completed. To ensure that your pipeline can be restarted after a failure, ensure that every job:
1. **Never cleans up / alters its own input files.** Instead, parents and follow on jobs may clean up the files of children or prior jobs.
2. Can be re-run from just its input files any number of times (see the sketch below). A job should only depend on its input, and it should be possible to run the job as many times as desired, essentially until news of its completion is successfully transmitted to the jobtree master process.
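For example, a job body can be made safely re-runnable by never touching its input and by writing its output atomically, via a temporary file renamed into place (a minimal sketch, independent of the toil API; `sortChunk` is a hypothetical helper):

```python
import os
import tempfile

def sortChunk(inputFile, outputFile):
    # Read the input; never modify or delete it (property 1).
    with open(inputFile) as f:
        lines = sorted(f)
    # Write to a temp file next to the destination, then rename it into
    # place. os.rename is atomic on POSIX, so a crash at any point leaves
    # either no output or the complete output, making re-runs safe
    # (property 2).
    fd, tmpPath = tempfile.mkstemp(dir=os.path.dirname(outputFile) or '.')
    with os.fdopen(fd, 'w') as tmp:
        tmp.writelines(lines)
    os.rename(tmpPath, outputFile)
```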
These two properties are the key to job atomicity. Additionally, you'll find it much easier if a job:
3. Only creates temp files in the two provided temporary file directories. This ensures we don't soil the cluster's disks.
4. Logs sensibly, so that error messages can be transmitted back to the master and the pipeline can be successfully debugged.
## Environment
toil replicates the environment in which toil or scriptTree is invoked and provides this environment to all the jobs. This ensures uniformity of the environment variables for every job.
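For example (a minimal sketch; `MY_SETTING` is an illustrative variable name and `CheckEnv` an illustrative job):

```python
import os
from toil.scriptTree.job import Job

class CheckEnv(Job):
    def run(self):
        # A variable exported in the shell that launched the script,
        # e.g. `export MY_SETTING=foo`, is replicated into every job,
        # on whichever cluster node the job happens to run.
        print os.environ.get('MY_SETTING')
```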
## FAQs:
* _How robust is toil to failures of nodes and/or the master?_
Toil checkpoints its state on disk, so that it or the job manager can be wiped out and restarted. There is some gnarly test code to show how this works; it will keep crashing everything at random points, but eventually everything will complete okay. As a user you needn't worry about any of this, but your child jobs must be atomic (as with all batch systems), and must follow the convention regarding input files.
* _How scalable?_
We have tested having 1000 concurrent jobs running on our cluster. This will depend on the underlying batch system being used.
* _Can you support the XYZ batch system?_
See the abstract base class '[AbstractBatchSystem](https://github.com/benedictpaten/toil/blob/master/batchSystems/abstractBatchSystem.py)' in the code to see what functions need to be implemented. It's reasonably straightforward.
* _Is there an API for the toil top level commands?_
Not really - at this point please use scriptTree and the few command line utilities present in the bin directory.
* _Why am I getting the error "ImportError: No module named etree.ElementTree"?_
The version of python in your path is less than 2.5. When toil spawns a new job it will use the python found in your PATH. Make sure that the first python in your PATH points to a python version greater than or equal to 2.5 but less than 3.0
toil-releases-3.24.0/attic/absolute_imports.py
from __future__ import absolute_import
import os
import sys
import ast
import tempfile
import shutil
def enable_absolute_imports(script, script_name='<script>'):
"""
Empty modules
>>> enable_absolute_imports('')
'from __future__ import absolute_import\\n'
Ignore empty lines
>>> enable_absolute_imports('\\n')
'from __future__ import absolute_import\\n'
Append after initial comments, like shebangs
>>> enable_absolute_imports('#foo\\n')
'#foo\\nfrom __future__ import absolute_import\\n'
Insert before regular comments
>>> enable_absolute_imports('#foo\\nimport bar\\n')
'#foo\\nfrom __future__ import absolute_import\\nimport bar\\n'
Insert before non-import statements
>>> enable_absolute_imports('if False:\\n pass\\n')
'from __future__ import absolute_import\\nif False:\\n pass\\n'
Idempotence
>>> enable_absolute_imports('from __future__ import absolute_import\\n') is None
True
Other __future__ imports
>>> enable_absolute_imports('from __future__ import print_function\\n')
'from __future__ import absolute_import\\nfrom __future__ import print_function\\n'
Insert before from ... import statements
>>> enable_absolute_imports('from blah import fasel\\n')
'from __future__ import absolute_import\\nfrom blah import fasel\\n'
Insert before multiple future imports
>>> enable_absolute_imports('from __future__ import print_function\\nfrom __future__ import nested_scopes\\n')
'from __future__ import absolute_import\\nfrom __future__ import print_function\\nfrom __future__ import nested_scopes\\n'
Insert before wrapped multi-name future import
>>> enable_absolute_imports('from __future__ import (print_function,\\n nested_scopes)\\n')
'from __future__ import absolute_import\\nfrom __future__ import (print_function,\\n nested_scopes)\\n'
Usually docstrings show up as attributes of other nodes but unassociated docstrings become
Expr nodes in the AST.
>>> enable_absolute_imports("#foo\\n\\n'''bar'''\\n\\npass")
"#foo\\n\\nfrom __future__ import absolute_import\\n'''bar'''\\n\\npass\\n"
Unassociated multiline docstring
>>> enable_absolute_imports("#foo\\n\\n'''bar\\n'''\\n\\npass")
"#foo\\n\\nfrom __future__ import absolute_import\\n'''bar\\n'''\\n\\npass\\n"
"""
tree = ast.parse(script, filename=script_name)
lines = script.split('\n')
while lines and lines[-1] == "":
lines.pop()
node = None
for child in ast.iter_child_nodes(tree):
if isinstance(child, ast.Import):
node = child
break
elif isinstance(child, ast.ImportFrom):
assert child.level == 0 # don't know what this means
if child.module == '__future__':
if any(alias.name == 'absolute_import' for alias in child.names):
return None
else:
if node is None: node = child
else:
node = child
break
if node is None:
if len(tree.body) == 0:
node = ast.stmt()
node.lineno = len(lines) + 1
else:
node = tree.body[0]
# This crazy heuristic tries to handle top-level docstrings with newlines in them
# for which lineno is the line where the docstring ends
if isinstance(node, ast.Expr) and isinstance(node.value, ast.Str):
node.lineno -= node.value.s.count('\n')
line = 'from __future__ import absolute_import'
lines.insert(node.lineno - 1, line)
lines.append("")
return '\n'.join(lines)
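# The doctests in the docstring above can be exercised with the standard
# doctest runner (an assumed invocation, not wired into the Makefile):
#
#     python -m doctest absolute_imports.py -v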
def main(root_path):
for dir_path, dir_names, file_names in os.walk(root_path):
for file_name in file_names:
if file_name.endswith('.py') and file_name != 'setup.py':
file_path = os.path.join(dir_path, file_name)
with open(file_path) as file:
script = file.read()
new_script = enable_absolute_imports(script, file_name)
if new_script is not None:
temp_handle, temp_file_path = tempfile.mkstemp(prefix=file_name, dir=dir_path)
try:
with os.fdopen(temp_handle, 'w') as temp_file:
temp_file.write(new_script)
except:
os.unlink(temp_file_path)
raise
else:
shutil.copymode(file_path,temp_file_path)
os.rename(temp_file_path, file_path)
if __name__ == '__main__':
main(sys.argv[1])
toil-releases-3.24.0/attic/jobTreeSlides.pdf (binary PDF slide deck; contents not reproducible here)