allmydata-tahoe-1.10.2/0000755000175000017500000000000012556560072012771 5ustar ramramallmydata-tahoe-1.10.2/README.rst0000644000175000017500000000321512556560070014457 0ustar ramram========== Tahoe-LAFS ========== Tahoe-LAFS is a Free and Open decentralized cloud storage system. It distributes your data across multiple servers. Even if some of the servers fail or are taken over by an attacker, the entire file store continues to function correctly, preserving your privacy and security. To get started please see `quickstart.rst`_ in the docs directory. LICENCE ======= Copyright 2006-2015 The Tahoe-LAFS Software Foundation You may use this package under the GNU General Public License, version 2 or, at your option, any later version. You may use this package under the Transitive Grace Period Public Licence, version 1.0, or at your option, any later version. (You may choose to use this package under the terms of either licence, at your option.) See the file `COPYING.GPL`_ for the terms of the GNU General Public License, version 2. See the file `COPYING.TGPPL.rst`_ for the terms of the Transitive Grace Period Public Licence, version 1.0. See `TGPPL.PDF`_ for why the TGPPL exists, graphically illustrated on three slides. .. _quickstart.rst: https://github.com/tahoe-lafs/tahoe-lafs/blob/master/docs/quickstart.rst .. _COPYING.GPL: https://github.com/tahoe-lafs/tahoe-lafs/blob/master/COPYING.GPL .. _COPYING.TGPPL.rst: https://github.com/tahoe-lafs/tahoe-lafs/blob/master/COPYING.TGPPL.rst .. _TGPPL.PDF: https://tahoe-lafs.org/~zooko/tgppl.pdf ---- .. image:: https://travis-ci.org/tahoe-lafs/tahoe-lafs.png?branch=master :target: https://travis-ci.org/tahoe-lafs/tahoe-lafs .. image:: https://coveralls.io/repos/tahoe-lafs/tahoe-lafs/badge.png?branch=master :target: https://coveralls.io/r/tahoe-lafs/tahoe-lafs?branch=master allmydata-tahoe-1.10.2/Makefile0000644000175000017500000002343212556560070014433 0ustar ramram # NOTE: this Makefile requires GNU make default: build PYTHON=python export PYTHON # setup.py will extend sys.path to include our support/lib/... directory # itself. It will also create it in the beginning of the 'develop' command. TAHOE=$(PYTHON) bin/tahoe SOURCES=src/allmydata src/buildtest static misc bin/tahoe-script.template setup.py APPNAME=allmydata-tahoe # This is necessary only if you want to automatically produce a new # _version.py file from the current git history (without doing a build). .PHONY: make-version make-version: $(PYTHON) ./setup.py update_version .built: $(MAKE) build src/allmydata/_version.py: $(MAKE) make-version # It is unnecessary to have this depend on build or src/allmydata/_version.py, # since 'setup.py build' always updates the version. .PHONY: build build: $(PYTHON) setup.py build touch .built # Build OS X pkg packages. .PHONY: build-osx-pkg test-osx-pkg upload-osx-pkg build-osx-pkg: build misc/build_helpers/build-osx-pkg.sh $(APPNAME) test-osx-pkg: $(PYTHON) misc/build_helpers/test-osx-pkg.py upload-osx-pkg: @echo "uploading to ~tahoe-tarballs/OS-X-packages/ via flappserver" @if [ "X${BB_BRANCH}" = "Xmaster" ] || [ "X${BB_BRANCH}" = "X" ]; then \ flappclient --furlfile ~/.tahoe-osx-pkg-upload.furl upload-file tahoe-lafs-*-osx.pkg; \ else \ echo not uploading tahoe-lafs-osx-pkg because this is not trunk but is branch \"${BB_BRANCH}\" ; \ fi # TESTING # you can use 'make test TEST=allmydata.test.test_introducer' to run just # test_introducer. TEST=allmydata.test.test_client.Basic.test_permute works # too. 
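# Example invocations (illustrative; the module paths are the ones named in
# the comment above and may not exist under the same names in every checkout):
#
#   make test TEST=allmydata.test.test_introducer
#   make quicktest TEST=allmydata.test.test_client.Basic.test_permute
#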
TEST=allmydata # It is unnecessary to have this depend on build or src/allmydata/_version.py, # since 'setup.py test' always updates the version and builds before testing. .PHONY: test test: $(PYTHON) setup.py test $(TRIALARGS) -s $(TEST) touch .built .PHONY: check check: test .PHONY: quicktest quicktest: make-version $(TAHOE) debug trial $(TRIALARGS) $(TEST) # "make tmpfstest" may be a faster way of running tests on Linux. It works best when you have # at least 330 MiB of free physical memory (to run the whole test suite). Since it uses sudo # to mount/unmount the tmpfs filesystem, it might prompt for your password. .PHONY: tmpfstest tmpfstest: time make _tmpfstest 'TMPDIR=$(shell mktemp -d --tmpdir=.)' .PHONY: _tmpfstest _tmpfstest: make-version sudo mount -t tmpfs -o size=400m tmpfs '$(TMPDIR)' -$(TAHOE) debug trial --rterrors '--temp-directory=$(TMPDIR)/_trial_temp' $(TRIALARGS) $(TEST) sudo umount '$(TMPDIR)' rmdir '$(TMPDIR)' # code coverage: install the "coverage" package from PyPI, do "make test-coverage" to # do a unit test run with coverage-gathering enabled, then use "make coverage-output" to # generate an HTML report. Also see "make .coverage.el" and misc/coding_tools/coverage.el # for Emacs integration. # This might need to be python-coverage on Debian-based distros. COVERAGE=coverage COVERAGEARGS=--branch --source=src/allmydata # --include appeared in coverage-3.4 COVERAGE_OMIT=--include '$(CURDIR)/src/allmydata/*' --omit '$(CURDIR)/src/allmydata/test/*' .PHONY: test-coverage test-coverage: build rm -f .coverage $(TAHOE) '@$(COVERAGE)' run $(COVERAGEARGS) @tahoe debug trial $(TRIALARGS) $(TEST) .PHONY: coverage-output coverage-output: rm -rf coverage-html coverage html -i -d coverage-html $(COVERAGE_OMIT) cp .coverage coverage-html/coverage.data @echo "now point your browser at coverage-html/index.html" .coverage.el: .coverage $(PYTHON) misc/coding_tools/coverage2el.py .PHONY: code-checks code-checks: build version-and-path check-interfaces check-miscaptures -find-trailing-spaces -check-umids pyflakes .PHONY: version-and-path version-and-path: $(TAHOE) --version-and-path .PHONY: check-interfaces check-interfaces: $(TAHOE) @misc/coding_tools/check-interfaces.py 2>&1 |tee violations.txt @echo .PHONY: check-miscaptures check-miscaptures: $(PYTHON) misc/coding_tools/check-miscaptures.py $(SOURCES) 2>&1 |tee miscaptures.txt @echo .PHONY: pyflakes pyflakes: @$(PYTHON) -OOu `which pyflakes` $(SOURCES) |sort |uniq @echo .PHONY: check-umids check-umids: $(PYTHON) misc/coding_tools/check-umids.py `find $(SOURCES) -name '*.py' -not -name 'old.py'` @echo .PHONY: -check-umids -check-umids: -$(PYTHON) misc/coding_tools/check-umids.py `find $(SOURCES) -name '*.py' -not -name 'old.py'` @echo .PHONY: doc-checks doc-checks: check-rst .PHONY: check-rst check-rst: @for x in `find *.rst docs -name "*.rst"`; do rst2html -v $${x} >/dev/null; done 2>&1 |grep -v 'Duplicate implicit target name:' @echo .PHONY: count-lines count-lines: @echo -n "files: " @find src -name '*.py' |grep -v /build/ |wc -l @echo -n "lines: " @cat `find src -name '*.py' |grep -v /build/` |wc -l @echo -n "TODO: " @grep TODO `find src -name '*.py' |grep -v /build/` | wc -l @echo -n "XXX: " @grep XXX `find src -name '*.py' |grep -v /build/` | wc -l .PHONY: check-memory check-memory: .built rm -rf _test_memory $(TAHOE) @src/allmydata/test/check_memory.py upload $(TAHOE) @src/allmydata/test/check_memory.py upload-self $(TAHOE) @src/allmydata/test/check_memory.py upload-POST $(TAHOE) @src/allmydata/test/check_memory.py download 
$(TAHOE) @src/allmydata/test/check_memory.py download-GET $(TAHOE) @src/allmydata/test/check_memory.py download-GET-slow $(TAHOE) @src/allmydata/test/check_memory.py receive .PHONY: check-memory-once check-memory-once: .built rm -rf _test_memory $(TAHOE) @src/allmydata/test/check_memory.py $(MODE) # The check-speed target uses a pre-established client node to run a canned # set of performance tests against a test network that is also # pre-established (probably on a remote machine). Provide it with the path to # a local directory where this client node has been created (and populated # with the necessary FURLs of the test network). This target will start that # client with the current code and then run the tests. Afterwards it will # stop the client. # # The 'sleep 5' is in there to give the new client a chance to connect to its # storageservers, since check_speed.py has no good way of doing that itself. .PHONY: check-speed check-speed: .built if [ -z '$(TESTCLIENTDIR)' ]; then exit 1; fi @echo "stopping any leftover client code" -$(TAHOE) stop $(TESTCLIENTDIR) $(TAHOE) start $(TESTCLIENTDIR) sleep 5 $(TAHOE) @src/allmydata/test/check_speed.py $(TESTCLIENTDIR) $(TAHOE) stop $(TESTCLIENTDIR) # The check-grid target also uses a pre-established client node, along with a # long-term directory that contains some well-known files. See the docstring # in src/allmydata/test/check_grid.py to see how to set this up. .PHONY: check-grid check-grid: .built if [ -z '$(TESTCLIENTDIR)' ]; then exit 1; fi $(TAHOE) @src/allmydata/test/check_grid.py $(TESTCLIENTDIR) bin/tahoe .PHONY: bench-dirnode bench-dirnode: .built $(TAHOE) @src/allmydata/test/bench_dirnode.py # the provisioning tool runs as a stand-alone webapp server .PHONY: run-provisioning-tool run-provisioning-tool: .built $(TAHOE) @misc/operations_helpers/provisioning/run.py # 'make repl' is a simple-to-type command to get a Python interpreter loop # from which you can type 'import allmydata' .PHONY: repl repl: $(TAHOE) debug repl .PHONY: test-get-ignore test-git-ignore: $(MAKE) $(PYTHON) misc/build_helpers/test-git-ignore.py .PHONY: test-clean test-clean: find . |grep -vEe "allfiles.tmp|src/allmydata/_(version|appname).py" |sort >allfiles.tmp.old $(MAKE) $(MAKE) distclean find . |grep -vEe "allfiles.tmp|src/allmydata/_(version|appname).py" |sort >allfiles.tmp.new diff allfiles.tmp.old allfiles.tmp.new # It would be nice if 'make clean' deleted any automatically-generated # _version.py too, so that 'make clean; make all' could be useable as a # "what the heck is going on, get me back to a clean state', but we need # 'make clean' to work on non-checkout trees without destroying useful information. # Use 'make distclean' instead to delete all generated files. 
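# Example (illustrative): to return to a pristine tree and rebuild from
# scratch, one might run:
#
#   make distclean && make build
#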
.PHONY: clean clean: rm -rf build _trial_temp _test_memory .built rm -f `find src *.egg -name '*.so' -or -name '*.pyc'` rm -rf support dist rm -rf `ls -d *.egg | grep -vEe"setuptools-|setuptools_darcs-|darcsver-"` rm -rf *.pyc rm -rf misc/dependencies/build misc/dependencies/temp rm -rf misc/dependencies/tahoe_deps.egg-info rm -f bin/tahoe bin/tahoe.pyscript rm -f *.pkg .PHONY: distclean distclean: clean rm -rf src/allmydata_tahoe.egg-info rm -f src/allmydata/_version.py rm -f src/allmydata/_appname.py .PHONY: find-trailing-spaces find-trailing-spaces: $(PYTHON) misc/coding_tools/find-trailing-spaces.py -r $(SOURCES) @echo .PHONY: -find-trailing-spaces -find-trailing-spaces: -$(PYTHON) misc/coding_tools/find-trailing-spaces.py -r $(SOURCES) @echo # The test-desert-island target grabs the tahoe-deps tarball, unpacks it, # does a build, then asserts that the build did not try to download anything # as it ran. Invoke this on a new tree, or after a 'clean', to make sure the # support/lib/ directory is gone. .PHONY: fetch-and-unpack-deps fetch-and-unpack-deps: test -f tahoe-deps.tar.gz || wget https://tahoe-lafs.org/source/tahoe-lafs/deps/tahoe-lafs-deps.tar.gz rm -rf tahoe-deps tar xzf tahoe-lafs-deps.tar.gz .PHONY: test-desert-island test-desert-island: $(MAKE) fetch-and-unpack-deps $(MAKE) 2>&1 | tee make.out $(PYTHON) misc/build_helpers/check-build.py make.out no-downloads .PHONY: test-pip-install test-pip-install: $(PYTHON) misc/build_helpers/test-pip-install.py # TARBALL GENERATION .PHONY: tarballs tarballs: $(MAKE) make-version $(PYTHON) setup.py sdist --formats=bztar,gztar,zip $(PYTHON) setup.py sdist --sumo --formats=bztar,gztar,zip .PHONY: upload-tarballs upload-tarballs: @if [ "X${BB_BRANCH}" = "Xmaster" ] || [ "X${BB_BRANCH}" = "X" ]; then for f in dist/$(APPNAME)-*; do flappclient --furlfile ~/.tahoe-tarball-upload.furl upload-file $$f; done ; else echo not uploading tarballs because this is not trunk but is branch \"${BB_BRANCH}\" ; fi allmydata-tahoe-1.10.2/src/0000755000175000017500000000000012556560072013560 5ustar ramramallmydata-tahoe-1.10.2/src/allmydata/0000755000175000017500000000000012556560072015530 5ustar ramramallmydata-tahoe-1.10.2/src/allmydata/introducer/0000755000175000017500000000000012556560072017706 5ustar ramramallmydata-tahoe-1.10.2/src/allmydata/introducer/interfaces.py0000644000175000017500000001331412556560070022403 0ustar ramram from zope.interface import Interface from foolscap.api import StringConstraint, TupleOf, SetOf, DictOf, Any, \ RemoteInterface, Referenceable from old import RIIntroducerSubscriberClient_v1 FURL = StringConstraint(1000) # old introducer protocol (v1): # # Announcements are (FURL, service_name, remoteinterface_name, # nickname, my_version, oldest_supported) # the (FURL, service_name, remoteinterface_name) refer to the service being # announced. The (nickname, my_version, oldest_supported) refer to the # client as a whole. The my_version/oldest_supported strings can be parsed # by an allmydata.util.version.Version instance, and then compared. The # first goal is to make sure that nodes are not confused by speaking to an # incompatible peer. The second goal is to enable the development of # backwards-compatibility code. 
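# A hypothetical v1 announcement matching the six-field layout described
# above; every value below is illustrative only and not taken from a real
# grid (this constant is not used anywhere in this module):
_EXAMPLE_ANNOUNCEMENT_V1 = (
    "pb://abcdefghijklmnopqrstuvwxyz234567@example.net:1234/storage-furl",  # FURL
    "storage",                              # service_name
    "RIStorageServer.tahoe.allmydata.com",  # remoteinterface_name (assumed)
    "demo-node",                            # nickname, UTF-8 encoded
    "allmydata-tahoe/1.10.2",               # my_version
    "1.0",                                  # oldest_supported
)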
Announcement_v1 = TupleOf(FURL, str, str, str, str, str) # v2 protocol over foolscap: Announcements are 3-tuples of (bytes, str, str) # or (bytes, none, none) Announcement_v2 = Any() class RIIntroducerSubscriberClient_v2(RemoteInterface): __remote_name__ = "RIIntroducerSubscriberClient_v2.tahoe.allmydata.com" def announce_v2(announcements=SetOf(Announcement_v2)): """I accept announcements from the publisher.""" return None def set_encoding_parameters(parameters=(int, int, int)): """Advise the client of the recommended k-of-n encoding parameters for this grid. 'parameters' is a tuple of (k, desired, n), where 'n' is the total number of shares that will be created for any given file, while 'k' is the number of shares that must be retrieved to recover that file, and 'desired' is the minimum number of shares that must be placed before the uploader will consider its job a success. n/k is the expansion ratio, while k determines the robustness. Introducers should specify 'n' according to the expected size of the grid (there is no point to producing more shares than there are peers), and k according to the desired reliability-vs-overhead goals. Note that setting k=1 is equivalent to simple replication. """ return None SubscriberInfo = DictOf(str, Any()) class RIIntroducerPublisherAndSubscriberService_v2(RemoteInterface): """To publish a service to the world, connect to me and give me your announcement message. I will deliver a copy to all connected subscribers. To hear about services, connect to me and subscribe to a specific service_name.""" __remote_name__ = "RIIntroducerPublisherAndSubscriberService_v2.tahoe.allmydata.com" def get_version(): return DictOf(str, Any()) def publish(announcement=Announcement_v1): return None def publish_v2(announcement=Announcement_v2, canary=Referenceable): return None def subscribe(subscriber=RIIntroducerSubscriberClient_v1, service_name=str): return None def subscribe_v2(subscriber=RIIntroducerSubscriberClient_v2, service_name=str, subscriber_info=SubscriberInfo): """Give me a subscriber reference, and I will call its announce_v2() method with any announcements that match the desired service name. I will ignore duplicate subscriptions. The subscriber_info dictionary tells me about the subscriber, and is used for diagnostic/status displays.""" return None class IIntroducerClient(Interface): """I provide service introduction facilities for a node. I help nodes publish their services to the rest of the world, and I help them learn about services available on other nodes.""" def publish(service_name, ann, signing_key=None): """Publish the given announcement dictionary (which must be JSON-serializable), plus some additional keys, to the world. Each announcement is characterized by a (service_name, serverid) pair. When the server sees two announcements with the same pair, the later one will replace the earlier one. The serverid is derived from the signing_key, if present, otherwise it is derived from the 'anonymous-storage-FURL' key. If signing_key= is set to an instance of SigningKey, it will be used to sign the announcement.""" def subscribe_to(service_name, callback, *args, **kwargs): """Call this if you will eventually want to use services with the given SERVICE_NAME. This will prompt me to subscribe to announcements of those services. Your callback will be invoked with at least two arguments: a pubkey and an announcement dictionary, followed by any additional callback args/kwargs you gave me. 
The pubkey will be None unless the announcement was signed by the corresponding pubkey, in which case it will be a printable string like 'v0-base32..'. I will run your callback for both new announcements and for announcements that have changed, but you must be prepared to tolerate duplicates. The announcement that I give you comes from some other client. It will be a JSON-serializable dictionary which (by convention) is expected to have at least the following keys: version: 0 nickname: unicode app-versions: {} my-version: str oldest-supported: str service-name: str('storage') anonymous-storage-FURL: str(furl) Note that app-version will be an empty dictionary if either the publishing client or the Introducer are running older code. """ def connected_to_introducer(): """Returns a boolean, True if we are currently connected to the introducer, False if not.""" allmydata-tahoe-1.10.2/src/allmydata/introducer/old.py0000644000175000017500000005206412556560070021043 0ustar ramram import time from base64 import b32decode from zope.interface import implements, Interface from twisted.application import service import allmydata from allmydata.interfaces import InsufficientVersionError from allmydata.util import log, idlib, rrefutil from foolscap.api import StringConstraint, TupleOf, SetOf, DictOf, Any, \ RemoteInterface, Referenceable, eventually, SturdyRef from allmydata.introducer.common import SubscriberDescriptor, \ AnnouncementDescriptor FURL = StringConstraint(1000) # We keep a copy of the old introducer (both client and server) here to # support compatibility tests. The old client is supposed to handle the new # server, and new client is supposed to handle the old server. # Announcements are (FURL, service_name, remoteinterface_name, # nickname, my_version, oldest_supported) # the (FURL, service_name, remoteinterface_name) refer to the service being # announced. The (nickname, my_version, oldest_supported) refer to the # client as a whole. The my_version/oldest_supported strings can be parsed # by an allmydata.util.version.Version instance, and then compared. The # first goal is to make sure that nodes are not confused by speaking to an # incompatible peer. The second goal is to enable the development of # backwards-compatibility code. Announcement = TupleOf(FURL, str, str, str, str, str) class RIIntroducerSubscriberClient_v1(RemoteInterface): __remote_name__ = "RIIntroducerSubscriberClient.tahoe.allmydata.com" def announce(announcements=SetOf(Announcement)): """I accept announcements from the publisher.""" return None def set_encoding_parameters(parameters=(int, int, int)): """Advise the client of the recommended k-of-n encoding parameters for this grid. 'parameters' is a tuple of (k, desired, n), where 'n' is the total number of shares that will be created for any given file, while 'k' is the number of shares that must be retrieved to recover that file, and 'desired' is the minimum number of shares that must be placed before the uploader will consider its job a success. n/k is the expansion ratio, while k determines the robustness. Introducers should specify 'n' according to the expected size of the grid (there is no point to producing more shares than there are peers), and k according to the desired reliability-vs-overhead goals. Note that setting k=1 is equivalent to simple replication. """ return None # When Foolscap can handle multiple interfaces (Foolscap#17), the # full-powered introducer will implement both RIIntroducerPublisher and # RIIntroducerSubscriberService. 
Until then, we define # RIIntroducerPublisherAndSubscriberService as a combination of the two, and # make everybody use that. class RIIntroducerPublisher_v1(RemoteInterface): """To publish a service to the world, connect to me and give me your announcement message. I will deliver a copy to all connected subscribers.""" __remote_name__ = "RIIntroducerPublisher.tahoe.allmydata.com" def publish(announcement=Announcement): # canary? return None class RIIntroducerSubscriberService_v1(RemoteInterface): __remote_name__ = "RIIntroducerSubscriberService.tahoe.allmydata.com" def subscribe(subscriber=RIIntroducerSubscriberClient_v1, service_name=str): """Give me a subscriber reference, and I will call its new_peers() method will any announcements that match the desired service name. I will ignore duplicate subscriptions. """ return None class RIIntroducerPublisherAndSubscriberService_v1(RemoteInterface): __remote_name__ = "RIIntroducerPublisherAndSubscriberService.tahoe.allmydata.com" def get_version(): return DictOf(str, Any()) def publish(announcement=Announcement): return None def subscribe(subscriber=RIIntroducerSubscriberClient_v1, service_name=str): return None class IIntroducerClient(Interface): """I provide service introduction facilities for a node. I help nodes publish their services to the rest of the world, and I help them learn about services available on other nodes.""" def publish(furl, service_name, remoteinterface_name): """Once you call this, I will tell the world that the Referenceable available at FURL is available to provide a service named SERVICE_NAME. The precise definition of the service being provided is identified by the Foolscap 'remote interface name' in the last parameter: this is supposed to be a globally-unique string that identifies the RemoteInterface that is implemented.""" def subscribe_to(service_name, callback, *args, **kwargs): """Call this if you will eventually want to use services with the given SERVICE_NAME. This will prompt me to subscribe to announcements of those services. Your callback will be invoked with at least two arguments: a serverid (binary string), and an announcement dictionary, followed by any additional callback args/kwargs you give me. I will run your callback for both new announcements and for announcements that have changed, but you must be prepared to tolerate duplicates. The announcement dictionary that I give you will have the following keys: version: 0 service-name: str('storage') FURL: str(furl) remoteinterface-name: str(ri_name) nickname: unicode app-versions: {} my-version: str oldest-supported: str Note that app-version will be an empty dictionary until #466 is done and both the introducer and the remote client have been upgraded. For current (native) server types, the serverid will always be equal to the binary form of the FURL's tubid. 
""" def connected_to_introducer(): """Returns a boolean, True if we are currently connected to the introducer, False if not.""" class IntroducerClient_v1(service.Service, Referenceable): implements(RIIntroducerSubscriberClient_v1, IIntroducerClient) def __init__(self, tub, introducer_furl, nickname, my_version, oldest_supported): self._tub = tub self.introducer_furl = introducer_furl assert type(nickname) is unicode self._nickname_utf8 = nickname.encode("utf-8") # we always send UTF-8 self._my_version = my_version self._oldest_supported = oldest_supported self._published_announcements = set() self._publisher = None self._local_subscribers = [] # (servicename,cb,args,kwargs) tuples self._subscribed_service_names = set() self._subscriptions = set() # requests we've actually sent # _current_announcements remembers one announcement per # (servicename,serverid) pair. Anything that arrives with the same # pair will displace the previous one. This stores unpacked # announcement dictionaries, which can be compared for equality to # distinguish re-announcement from updates. It also provides memory # for clients who subscribe after startup. self._current_announcements = {} self.encoding_parameters = None # hooks for unit tests self._debug_counts = { "inbound_message": 0, "inbound_announcement": 0, "wrong_service": 0, "duplicate_announcement": 0, "update": 0, "new_announcement": 0, "outbound_message": 0, } self._debug_outstanding = 0 def _debug_retired(self, res): self._debug_outstanding -= 1 return res def startService(self): service.Service.startService(self) self._introducer_error = None rc = self._tub.connectTo(self.introducer_furl, self._got_introducer) self._introducer_reconnector = rc def connect_failed(failure): self.log("Initial Introducer connection failed: perhaps it's down", level=log.WEIRD, failure=failure, umid="c5MqUQ") d = self._tub.getReference(self.introducer_furl) d.addErrback(connect_failed) def _got_introducer(self, publisher): self.log("connected to introducer, getting versions") default = { "http://allmydata.org/tahoe/protocols/introducer/v1": { }, "application-version": "unknown: no get_version()", } d = rrefutil.add_version_to_remote_reference(publisher, default) d.addCallback(self._got_versioned_introducer) d.addErrback(self._got_error) def _got_error(self, f): # TODO: for the introducer, perhaps this should halt the application self._introducer_error = f # polled by tests def _got_versioned_introducer(self, publisher): self.log("got introducer version: %s" % (publisher.version,)) # we require a V1 introducer needed = "http://allmydata.org/tahoe/protocols/introducer/v1" if needed not in publisher.version: raise InsufficientVersionError(needed, publisher.version) self._publisher = publisher publisher.notifyOnDisconnect(self._disconnected) self._maybe_publish() self._maybe_subscribe() def _disconnected(self): self.log("bummer, we've lost our connection to the introducer") self._publisher = None self._subscriptions.clear() def log(self, *args, **kwargs): if "facility" not in kwargs: kwargs["facility"] = "tahoe.introducer" return log.msg(*args, **kwargs) def publish(self, furl, service_name, remoteinterface_name): assert type(self._nickname_utf8) is str # we always send UTF-8 ann = (furl, service_name, remoteinterface_name, self._nickname_utf8, self._my_version, self._oldest_supported) self._published_announcements.add(ann) self._maybe_publish() def subscribe_to(self, service_name, cb, *args, **kwargs): self._local_subscribers.append( (service_name,cb,args,kwargs) ) 
self._subscribed_service_names.add(service_name) self._maybe_subscribe() for (servicename,nodeid),ann_d in self._current_announcements.items(): if servicename == service_name: eventually(cb, nodeid, ann_d) def _maybe_subscribe(self): if not self._publisher: self.log("want to subscribe, but no introducer yet", level=log.NOISY) return for service_name in self._subscribed_service_names: if service_name not in self._subscriptions: # there is a race here, but the subscription desk ignores # duplicate requests. self._subscriptions.add(service_name) self._debug_outstanding += 1 d = self._publisher.callRemote("subscribe", self, service_name) d.addBoth(self._debug_retired) d.addErrback(rrefutil.trap_deadref) d.addErrback(log.err, format="server errored during subscribe", facility="tahoe.introducer", level=log.WEIRD, umid="2uMScQ") def _maybe_publish(self): if not self._publisher: self.log("want to publish, but no introducer yet", level=log.NOISY) return # this re-publishes everything. The Introducer ignores duplicates for ann in self._published_announcements: self._debug_counts["outbound_message"] += 1 self._debug_outstanding += 1 d = self._publisher.callRemote("publish", ann) d.addBoth(self._debug_retired) d.addErrback(rrefutil.trap_deadref) d.addErrback(log.err, format="server errored during publish %(ann)s", ann=ann, facility="tahoe.introducer", level=log.WEIRD, umid="xs9pVQ") def remote_announce(self, announcements): self.log("received %d announcements" % len(announcements)) self._debug_counts["inbound_message"] += 1 for ann in announcements: try: self._process_announcement(ann) except: log.err(format="unable to process announcement %(ann)s", ann=ann) # Don't let a corrupt announcement prevent us from processing # the remaining ones. Don't return an error to the server, # since they'd just ignore it anyways. 
pass def _process_announcement(self, ann): self._debug_counts["inbound_announcement"] += 1 (furl, service_name, ri_name, nickname_utf8, ver, oldest) = ann if service_name not in self._subscribed_service_names: self.log("announcement for a service we don't care about [%s]" % (service_name,), level=log.UNUSUAL, umid="dIpGNA") self._debug_counts["wrong_service"] += 1 return self.log("announcement for [%s]: %s" % (service_name, ann), umid="BoKEag") assert type(furl) is str assert type(service_name) is str assert type(ri_name) is str assert type(nickname_utf8) is str nickname = nickname_utf8.decode("utf-8") assert type(nickname) is unicode assert type(ver) is str assert type(oldest) is str nodeid = b32decode(SturdyRef(furl).tubID.upper()) nodeid_s = idlib.shortnodeid_b2a(nodeid) ann_d = { "version": 0, "service-name": service_name, "FURL": furl, "nickname": nickname, "app-versions": {}, # need #466 and v2 introducer "my-version": ver, "oldest-supported": oldest, } index = (service_name, nodeid) if self._current_announcements.get(index, None) == ann_d: self.log("reannouncement for [%(service)s]:%(nodeid)s, ignoring", service=service_name, nodeid=nodeid_s, level=log.UNUSUAL, umid="B1MIdA") self._debug_counts["duplicate_announcement"] += 1 return if index in self._current_announcements: self._debug_counts["update"] += 1 else: self._debug_counts["new_announcement"] += 1 self._current_announcements[index] = ann_d # note: we never forget an index, but we might update its value for (service_name2,cb,args,kwargs) in self._local_subscribers: if service_name2 == service_name: eventually(cb, nodeid, ann_d, *args, **kwargs) def remote_set_encoding_parameters(self, parameters): self.encoding_parameters = parameters def connected_to_introducer(self): return bool(self._publisher) class IntroducerService_v1(service.MultiService, Referenceable): implements(RIIntroducerPublisherAndSubscriberService_v1) name = "introducer" VERSION = { "http://allmydata.org/tahoe/protocols/introducer/v1": { }, "application-version": str(allmydata.__full_version__), } def __init__(self, basedir="."): service.MultiService.__init__(self) self.introducer_url = None # 'index' is (service_name, tubid) self._announcements = {} # dict of index -> (announcement, timestamp) self._subscribers = {} # [service_name]->[rref]->timestamp self._debug_counts = {"inbound_message": 0, "inbound_duplicate": 0, "inbound_update": 0, "outbound_message": 0, "outbound_announcements": 0, "inbound_subscribe": 0} self._debug_outstanding = 0 def _debug_retired(self, res): self._debug_outstanding -= 1 return res def log(self, *args, **kwargs): if "facility" not in kwargs: kwargs["facility"] = "tahoe.introducer" return log.msg(*args, **kwargs) def get_announcements(self, include_stub_clients=True): announcements = [] for index, (ann_t, when) in self._announcements.items(): (furl, service_name, ri_name, nickname, ver, oldest) = ann_t if service_name == "stub_client" and not include_stub_clients: continue ann_d = {"nickname": nickname.decode("utf-8", "replace"), "my-version": ver, "service-name": service_name, "anonymous-storage-FURL": furl, } # the V2 introducer uses (service_name, key_s, tubid_s) as an # index, so match that format for AnnouncementDescriptor new_index = (index[0], None, idlib.nodeid_b2a(index[1])) ad = AnnouncementDescriptor(when, new_index, None, ann_d) announcements.append(ad) return announcements def get_subscribers(self): s = [] for service_name, subscribers in self._subscribers.items(): for rref, when in subscribers.items(): tubid = 
rref.getRemoteTubID() or "?" advertised_addresses = rrefutil.hosts_for_rref(rref) remote_address = rrefutil.stringify_remote_address(rref) nickname, version, app_versions = u"?", u"?", {} sd = SubscriberDescriptor(service_name, when, nickname, version, app_versions, advertised_addresses, remote_address, tubid) s.append(sd) return s def remote_get_version(self): return self.VERSION def remote_publish(self, announcement): try: self._publish(announcement) except: log.err(format="Introducer.remote_publish failed on %(ann)s", ann=announcement, level=log.UNUSUAL, umid="620rWA") raise def _publish(self, announcement): self._debug_counts["inbound_message"] += 1 self.log("introducer: announcement published: %s" % (announcement,) ) (furl, service_name, ri_name, nickname_utf8, ver, oldest) = announcement #print "PUB", service_name, nickname_utf8 nodeid = b32decode(SturdyRef(furl).tubID.upper()) index = (service_name, nodeid) if index in self._announcements: (old_announcement, timestamp) = self._announcements[index] if old_announcement == announcement: self.log("but we already knew it, ignoring", level=log.NOISY) self._debug_counts["inbound_duplicate"] += 1 return else: self.log("old announcement being updated", level=log.NOISY) self._debug_counts["inbound_update"] += 1 self._announcements[index] = (announcement, time.time()) for s in self._subscribers.get(service_name, []): self._debug_counts["outbound_message"] += 1 self._debug_counts["outbound_announcements"] += 1 self._debug_outstanding += 1 d = s.callRemote("announce", set([announcement])) d.addBoth(self._debug_retired) d.addErrback(rrefutil.trap_deadref) d.addErrback(log.err, format="subscriber errored on announcement %(ann)s", ann=announcement, facility="tahoe.introducer", level=log.UNUSUAL, umid="jfGMXQ") def remote_subscribe(self, subscriber, service_name): self.log("introducer: subscription[%s] request at %s" % (service_name, subscriber)) self._debug_counts["inbound_subscribe"] += 1 if service_name not in self._subscribers: self._subscribers[service_name] = {} subscribers = self._subscribers[service_name] if subscriber in subscribers: self.log("but they're already subscribed, ignoring", level=log.UNUSUAL) return subscribers[subscriber] = time.time() def _remove(): self.log("introducer: unsubscribing[%s] %s" % (service_name, subscriber)) subscribers.pop(subscriber, None) subscriber.notifyOnDisconnect(_remove) announcements = set( [ ann for (sn2,nodeid),(ann,when) in self._announcements.items() if sn2 == service_name] ) self._debug_counts["outbound_message"] += 1 self._debug_counts["outbound_announcements"] += len(announcements) self._debug_outstanding += 1 d = subscriber.callRemote("announce", announcements) d.addBoth(self._debug_retired) d.addErrback(rrefutil.trap_deadref) d.addErrback(log.err, format="subscriber errored during subscribe %(anns)s", anns=announcements, facility="tahoe.introducer", level=log.UNUSUAL, umid="1XChxA") allmydata-tahoe-1.10.2/src/allmydata/introducer/client.py0000644000175000017500000003753212556560070021546 0ustar ramram import time from zope.interface import implements from twisted.application import service from foolscap.api import Referenceable, eventually, RemoteInterface from allmydata.interfaces import InsufficientVersionError from allmydata.introducer.interfaces import IIntroducerClient, \ RIIntroducerSubscriberClient_v1, RIIntroducerSubscriberClient_v2 from allmydata.introducer.common import sign_to_foolscap, unsign_from_foolscap,\ convert_announcement_v1_to_v2, convert_announcement_v2_to_v1, \ make_index, 
get_tubid_string_from_ann, get_tubid_string from allmydata.util import log from allmydata.util.rrefutil import add_version_to_remote_reference from allmydata.util.keyutil import BadSignatureError class WrapV2ClientInV1Interface(Referenceable): # for_v1 """I wrap a v2 IntroducerClient to make it look like a v1 client, so it can be attached to an old server.""" implements(RIIntroducerSubscriberClient_v1) def __init__(self, original): self.original = original def remote_announce(self, announcements): lp = self.original.log("received %d announcements (v1)" % len(announcements)) anns_v1 = set([convert_announcement_v1_to_v2(ann_v1) for ann_v1 in announcements]) return self.original.got_announcements(anns_v1, lp) def remote_set_encoding_parameters(self, parameters): self.original.remote_set_encoding_parameters(parameters) class RIStubClient(RemoteInterface): # for_v1 """Each client publishes a service announcement for a dummy object called the StubClient. This object doesn't actually offer any services, but the announcement helps the Introducer keep track of which clients are subscribed (so the grid admin can keep track of things like the size of the grid and the client versions in use. This is the (empty) RemoteInterface for the StubClient.""" class StubClient(Referenceable): # for_v1 implements(RIStubClient) V1 = "http://allmydata.org/tahoe/protocols/introducer/v1" V2 = "http://allmydata.org/tahoe/protocols/introducer/v2" class IntroducerClient(service.Service, Referenceable): implements(RIIntroducerSubscriberClient_v2, IIntroducerClient) def __init__(self, tub, introducer_furl, nickname, my_version, oldest_supported, app_versions, sequencer): self._tub = tub self.introducer_furl = introducer_furl assert type(nickname) is unicode self._nickname = nickname self._my_version = my_version self._oldest_supported = oldest_supported self._app_versions = app_versions self._sequencer = sequencer self._my_subscriber_info = { "version": 0, "nickname": self._nickname, "app-versions": self._app_versions, "my-version": self._my_version, "oldest-supported": self._oldest_supported, } self._stub_client = None # for_v1 self._stub_client_furl = None self._outbound_announcements = {} # not signed self._published_announcements = {} # signed self._canary = Referenceable() self._publisher = None self._local_subscribers = [] # (servicename,cb,args,kwargs) tuples self._subscribed_service_names = set() self._subscriptions = set() # requests we've actually sent # _inbound_announcements remembers one announcement per # (servicename,serverid) pair. Anything that arrives with the same # pair will displace the previous one. This stores tuples of # (unpacked announcement dictionary, verifyingkey, rxtime). The ann # dicts can be compared for equality to distinguish re-announcement # from updates. It also provides memory for clients who subscribe # after startup. 
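        # Illustrative shape of one stored entry (hypothetical values),
        # matching what _process_announcement() writes below:
        #
        #   index = ("storage", "v0-abc123...", None)   # from make_index()
        #   self._inbound_announcements[index] = (
        #       {"service-name": "storage", "seqnum": 1, ...},  # unpacked ann dict
        #       "v0-abc123...",                                 # verifying key string
        #       1438900000.0)                                   # rxtime, from time.time()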
self._inbound_announcements = {} self.encoding_parameters = None # hooks for unit tests self._debug_counts = { "inbound_message": 0, "inbound_announcement": 0, "wrong_service": 0, "duplicate_announcement": 0, "update": 0, "new_announcement": 0, "outbound_message": 0, } self._debug_outstanding = 0 def _debug_retired(self, res): self._debug_outstanding -= 1 return res def startService(self): service.Service.startService(self) self._introducer_error = None rc = self._tub.connectTo(self.introducer_furl, self._got_introducer) self._introducer_reconnector = rc def connect_failed(failure): self.log("Initial Introducer connection failed: perhaps it's down", level=log.WEIRD, failure=failure, umid="c5MqUQ") d = self._tub.getReference(self.introducer_furl) d.addErrback(connect_failed) def _got_introducer(self, publisher): self.log("connected to introducer, getting versions") default = { "http://allmydata.org/tahoe/protocols/introducer/v1": { }, "application-version": "unknown: no get_version()", } d = add_version_to_remote_reference(publisher, default) d.addCallback(self._got_versioned_introducer) d.addErrback(self._got_error) def _got_error(self, f): # TODO: for the introducer, perhaps this should halt the application self._introducer_error = f # polled by tests def _got_versioned_introducer(self, publisher): self.log("got introducer version: %s" % (publisher.version,)) # we require an introducer that speaks at least one of (V1, V2) if not (V1 in publisher.version or V2 in publisher.version): raise InsufficientVersionError("V1 or V2", publisher.version) self._publisher = publisher publisher.notifyOnDisconnect(self._disconnected) self._maybe_publish() self._maybe_subscribe() def _disconnected(self): self.log("bummer, we've lost our connection to the introducer") self._publisher = None self._subscriptions.clear() def log(self, *args, **kwargs): if "facility" not in kwargs: kwargs["facility"] = "tahoe.introducer.client" return log.msg(*args, **kwargs) def subscribe_to(self, service_name, cb, *args, **kwargs): self._local_subscribers.append( (service_name,cb,args,kwargs) ) self._subscribed_service_names.add(service_name) self._maybe_subscribe() for index,(ann,key_s,when) in self._inbound_announcements.items(): servicename = index[0] if servicename == service_name: eventually(cb, key_s, ann, *args, **kwargs) def _maybe_subscribe(self): if not self._publisher: self.log("want to subscribe, but no introducer yet", level=log.NOISY) return for service_name in self._subscribed_service_names: if service_name in self._subscriptions: continue self._subscriptions.add(service_name) if V2 in self._publisher.version: self._debug_outstanding += 1 d = self._publisher.callRemote("subscribe_v2", self, service_name, self._my_subscriber_info) d.addBoth(self._debug_retired) else: d = self._subscribe_handle_v1(service_name) # for_v1 d.addErrback(log.err, facility="tahoe.introducer.client", level=log.WEIRD, umid="2uMScQ") def _subscribe_handle_v1(self, service_name): # for_v1 # they don't speak V2: must be a v1 introducer. Fall back to the v1 # 'subscribe' method, using a client adapter. ca = WrapV2ClientInV1Interface(self) self._debug_outstanding += 1 d = self._publisher.callRemote("subscribe", ca, service_name) d.addBoth(self._debug_retired) # We must also publish an empty 'stub_client' object, so the # introducer can count how many clients are connected and see what # versions they're running. 
if not self._stub_client_furl: self._stub_client = sc = StubClient() self._stub_client_furl = self._tub.registerReference(sc) def _publish_stub_client(ignored): furl = self._stub_client_furl self.publish("stub_client", { "anonymous-storage-FURL": furl, "permutation-seed-base32": get_tubid_string(furl), }) d.addCallback(_publish_stub_client) return d def create_announcement_dict(self, service_name, ann): ann_d = { "version": 0, # "seqnum" and "nonce" will be populated with new values in # publish(), each time we make a change "nickname": self._nickname, "app-versions": self._app_versions, "my-version": self._my_version, "oldest-supported": self._oldest_supported, "service-name": service_name, } ann_d.update(ann) return ann_d def publish(self, service_name, ann, signing_key=None): # we increment the seqnum every time we publish something new current_seqnum, current_nonce = self._sequencer() ann_d = self.create_announcement_dict(service_name, ann) self._outbound_announcements[service_name] = ann_d # publish all announcements with the new seqnum and nonce for service_name,ann_d in self._outbound_announcements.items(): ann_d["seqnum"] = current_seqnum ann_d["nonce"] = current_nonce ann_t = sign_to_foolscap(ann_d, signing_key) self._published_announcements[service_name] = ann_t self._maybe_publish() def _maybe_publish(self): if not self._publisher: self.log("want to publish, but no introducer yet", level=log.NOISY) return # this re-publishes everything. The Introducer ignores duplicates for ann_t in self._published_announcements.values(): self._debug_counts["outbound_message"] += 1 if V2 in self._publisher.version: self._debug_outstanding += 1 d = self._publisher.callRemote("publish_v2", ann_t, self._canary) d.addBoth(self._debug_retired) else: d = self._handle_v1_publisher(ann_t) # for_v1 d.addErrback(log.err, ann_t=ann_t, facility="tahoe.introducer.client", level=log.WEIRD, umid="xs9pVQ") def _handle_v1_publisher(self, ann_t): # for_v1 # they don't speak V2, so fall back to the old 'publish' method # (which takes an unsigned tuple of bytestrings) self.log("falling back to publish_v1", level=log.UNUSUAL, umid="9RCT1A") ann_v1 = convert_announcement_v2_to_v1(ann_t) self._debug_outstanding += 1 d = self._publisher.callRemote("publish", ann_v1) d.addBoth(self._debug_retired) return d def remote_announce_v2(self, announcements): lp = self.log("received %d announcements (v2)" % len(announcements)) return self.got_announcements(announcements, lp) def got_announcements(self, announcements, lp=None): # this is the common entry point for both v1 and v2 announcements self._debug_counts["inbound_message"] += 1 for ann_t in announcements: try: # this might raise UnknownKeyError or bad-sig error ann, key_s = unsign_from_foolscap(ann_t) # key is "v0-base32abc123" except BadSignatureError: self.log("bad signature on inbound announcement: %s" % (ann_t,), parent=lp, level=log.WEIRD, umid="ZAU15Q") # process other announcements that arrived with the bad one continue self._process_announcement(ann, key_s) def _process_announcement(self, ann, key_s): self._debug_counts["inbound_announcement"] += 1 service_name = str(ann["service-name"]) if service_name not in self._subscribed_service_names: self.log("announcement for a service we don't care about [%s]" % (service_name,), level=log.UNUSUAL, umid="dIpGNA") self._debug_counts["wrong_service"] += 1 return # for ASCII values, simplejson might give us unicode *or* bytes if "nickname" in ann and isinstance(ann["nickname"], str): ann["nickname"] = unicode(ann["nickname"]) 
nick_s = ann.get("nickname",u"").encode("utf-8") lp2 = self.log(format="announcement for nickname '%(nick)s', service=%(svc)s: %(ann)s", nick=nick_s, svc=service_name, ann=ann, umid="BoKEag") # how do we describe this node in the logs? desc_bits = [] if key_s: desc_bits.append("serverid=" + key_s[:20]) if "anonymous-storage-FURL" in ann: tubid_s = get_tubid_string_from_ann(ann) desc_bits.append("tubid=" + tubid_s[:8]) description = "/".join(desc_bits) # the index is used to track duplicates index = make_index(ann, key_s) # is this announcement a duplicate? if (index in self._inbound_announcements and self._inbound_announcements[index][0] == ann): self.log(format="reannouncement for [%(service)s]:%(description)s, ignoring", service=service_name, description=description, parent=lp2, level=log.UNUSUAL, umid="B1MIdA") self._debug_counts["duplicate_announcement"] += 1 return # does it update an existing one? if index in self._inbound_announcements: old,_,_ = self._inbound_announcements[index] if "seqnum" in old: # must beat previous sequence number to replace if ("seqnum" not in ann or not isinstance(ann["seqnum"], (int,long))): self.log("not replacing old announcement, no valid seqnum: %s" % (ann,), parent=lp2, level=log.NOISY, umid="zFGH3Q") return if ann["seqnum"] <= old["seqnum"]: # note that exact replays are caught earlier, by # comparing the entire signed announcement. self.log("not replacing old announcement, " "new seqnum is too old (%s <= %s) " "(replay attack?): %s" % (ann["seqnum"], old["seqnum"], ann), parent=lp2, level=log.UNUSUAL, umid="JAAAoQ") return # ok, seqnum is newer, allow replacement self._debug_counts["update"] += 1 self.log("replacing old announcement: %s" % (ann,), parent=lp2, level=log.NOISY, umid="wxwgIQ") else: self._debug_counts["new_announcement"] += 1 self.log("new announcement[%s]" % service_name, parent=lp2, level=log.NOISY) self._inbound_announcements[index] = (ann, key_s, time.time()) # note: we never forget an index, but we might update its value for (service_name2,cb,args,kwargs) in self._local_subscribers: if service_name2 == service_name: eventually(cb, key_s, ann, *args, **kwargs) def remote_set_encoding_parameters(self, parameters): self.encoding_parameters = parameters def connected_to_introducer(self): return bool(self._publisher) allmydata-tahoe-1.10.2/src/allmydata/introducer/common.py0000644000175000017500000001366612556560070021562 0ustar ramram import re, simplejson from allmydata.util import keyutil, base32, rrefutil def make_index(ann, key_s): """Return something that can be used as an index (e.g. a tuple of strings), such that two messages that refer to the same 'thing' will have the same index. 
This is a tuple of (service-name, signing-key, None) for signed announcements, or (service-name, None, tubid_s) for unsigned announcements.""" service_name = str(ann["service-name"]) if key_s: return (service_name, key_s, None) else: tubid_s = get_tubid_string_from_ann(ann) return (service_name, None, tubid_s) def get_tubid_string_from_ann(ann): return get_tubid_string(str(ann.get("anonymous-storage-FURL") or ann.get("FURL"))) def get_tubid_string(furl): m = re.match(r'pb://(\w+)@', furl) assert m return m.group(1).lower() def convert_announcement_v1_to_v2(ann_t): (furl, service_name, ri_name, nickname, ver, oldest) = ann_t assert type(furl) is str assert type(service_name) is str # ignore ri_name assert type(nickname) is str assert type(ver) is str assert type(oldest) is str ann = {"version": 0, "nickname": nickname.decode("utf-8", "replace"), "app-versions": {}, "my-version": ver, "oldest-supported": oldest, "service-name": service_name, "anonymous-storage-FURL": furl, "permutation-seed-base32": get_tubid_string(furl), } msg = simplejson.dumps(ann).encode("utf-8") return (msg, None, None) def convert_announcement_v2_to_v1(ann_v2): (msg, sig, pubkey) = ann_v2 ann = simplejson.loads(msg) assert ann["version"] == 0 ann_t = (str(ann["anonymous-storage-FURL"]), str(ann["service-name"]), "remoteinterface-name is unused", ann["nickname"].encode("utf-8"), str(ann["my-version"]), str(ann["oldest-supported"]), ) return ann_t def sign_to_foolscap(ann, sk): # return (bytes, None, None) or (bytes, sig-str, pubkey-str). A future # HTTP-based serialization will use JSON({msg:b64(JSON(msg).utf8), # sig:v0-b64(sig), pubkey:v0-b64(pubkey)}) . msg = simplejson.dumps(ann).encode("utf-8") if sk: sig = "v0-"+base32.b2a(sk.sign(msg)) vk_bytes = sk.get_verifying_key_bytes() ann_t = (msg, sig, "v0-"+base32.b2a(vk_bytes)) else: ann_t = (msg, None, None) return ann_t class UnknownKeyError(Exception): pass def unsign_from_foolscap(ann_t): (msg, sig_vs, claimed_key_vs) = ann_t key_vs = None if sig_vs and claimed_key_vs: if not sig_vs.startswith("v0-"): raise UnknownKeyError("only v0- signatures recognized") if not claimed_key_vs.startswith("v0-"): raise UnknownKeyError("only v0- keys recognized") claimed_key = keyutil.parse_pubkey("pub-"+claimed_key_vs) sig_bytes = base32.a2b(keyutil.remove_prefix(sig_vs, "v0-")) claimed_key.verify(sig_bytes, msg) key_vs = claimed_key_vs ann = simplejson.loads(msg.decode("utf-8")) return (ann, key_vs) class SubscriberDescriptor: """This describes a subscriber, for status display purposes. It contains the following attributes: .service_name: what they subscribed to (string) .when: time when they subscribed (seconds since epoch) .nickname: their self-provided nickname, or "?" (unicode) .version: their self-provided version (string) .app_versions: versions of each library they use (dict str->str) .advertised_addresses: what hosts they listen on (list of strings) .remote_address: the external address from which they connected (string) .tubid: for subscribers connecting with Foolscap, their tubid (string) """ def __init__(self, service_name, when, nickname, version, app_versions, advertised_addresses, remote_address, tubid): self.service_name = service_name self.when = when self.nickname = nickname self.version = version self.app_versions = app_versions self.advertised_addresses = advertised_addresses self.remote_address = remote_address self.tubid = tubid class AnnouncementDescriptor: """This describes an announcement, for status display purposes. 
It contains the following attributes, which will be empty ("" for strings) if the client did not provide them: .when: time the announcement was first received (seconds since epoch) .index: the announcements 'index', a tuple of (string-or-None). The server remembers one announcement per index. .canary: a Referenceable on the announcer, so the server can learn when they disconnect (for the status display) .announcement: raw dictionary of announcement data .service_name: which service they are announcing (string) .version: 'my-version' portion of announcement (string) .nickname: their self-provided nickname, or "" (unicode) .serverid: the server identifier. This is a pubkey (for V2 clients), or a tubid (for V1 clients). .advertised_addresses: which hosts they listen on (list of strings) if the announcement included a key for 'anonymous-storage-FURL', else an empty list. """ def __init__(self, when, index, canary, ann_d): self.when = when self.index = index self.canary = canary self.announcement = ann_d self.service_name = ann_d["service-name"] self.version = ann_d.get("my-version", "") self.nickname = ann_d.get("nickname", u"") (service_name, key_s, tubid_s) = index self.serverid = key_s or tubid_s furl = ann_d.get("anonymous-storage-FURL") if furl: self.advertised_addresses = rrefutil.hosts_for_furl(furl) else: self.advertised_addresses = [] allmydata-tahoe-1.10.2/src/allmydata/introducer/server.py0000644000175000017500000004214412556560070021571 0ustar ramram import time, os.path, textwrap from zope.interface import implements from twisted.application import service from foolscap.api import Referenceable import allmydata from allmydata import node from allmydata.util import log, rrefutil from allmydata.util.fileutil import abspath_expanduser_unicode from allmydata.introducer.interfaces import \ RIIntroducerPublisherAndSubscriberService_v2 from allmydata.introducer.common import convert_announcement_v1_to_v2, \ convert_announcement_v2_to_v1, unsign_from_foolscap, make_index, \ get_tubid_string_from_ann, SubscriberDescriptor, AnnouncementDescriptor class FurlFileConflictError(Exception): pass class IntroducerNode(node.Node): PORTNUMFILE = "introducer.port" NODETYPE = "introducer" GENERATED_FILES = ['introducer.furl'] def __init__(self, basedir=u"."): node.Node.__init__(self, basedir) self.read_config() self.init_introducer() webport = self.get_config("node", "web.port", None) if webport: self.init_web(webport) # strports string def init_introducer(self): introducerservice = IntroducerService(self.basedir) self.add_service(introducerservice) old_public_fn = os.path.join(self.basedir, u"introducer.furl") private_fn = os.path.join(self.basedir, u"private", u"introducer.furl") if os.path.exists(old_public_fn): if os.path.exists(private_fn): msg = """This directory (%s) contains both an old public 'introducer.furl' file, and a new-style 'private/introducer.furl', so I cannot safely remove the old one. Please make sure your desired FURL is in private/introducer.furl, and remove the public file. If this causes your Introducer's FURL to change, you need to inform all grid members so they can update their tahoe.cfg. 
""" raise FurlFileConflictError(textwrap.dedent(msg)) os.rename(old_public_fn, private_fn) d = self.when_tub_ready() def _publish(res): furl = self.tub.registerReference(introducerservice, furlFile=private_fn) self.log(" introducer is at %s" % furl, umid="qF2L9A") self.introducer_url = furl # for tests d.addCallback(_publish) d.addErrback(log.err, facility="tahoe.init", level=log.BAD, umid="UaNs9A") def init_web(self, webport): self.log("init_web(webport=%s)", args=(webport,), umid="2bUygA") from allmydata.webish import IntroducerWebishServer nodeurl_path = os.path.join(self.basedir, u"node.url") config_staticdir = self.get_config("node", "web.static", "public_html").decode('utf-8') staticdir = abspath_expanduser_unicode(config_staticdir, base=self.basedir) ws = IntroducerWebishServer(self, webport, nodeurl_path, staticdir) self.add_service(ws) class WrapV1SubscriberInV2Interface: # for_v1 """I wrap a RemoteReference that points at an old v1 subscriber, enabling it to be treated like a v2 subscriber. """ def __init__(self, original): self.original = original # also used for tests def __eq__(self, them): return self.original == them def __ne__(self, them): return self.original != them def __hash__(self): return hash(self.original) def getRemoteTubID(self): return self.original.getRemoteTubID() def getSturdyRef(self): return self.original.getSturdyRef() def getPeer(self): return self.original.getPeer() def getLocationHints(self): return self.original.getLocationHints() def callRemote(self, methname, *args, **kwargs): m = getattr(self, "wrap_" + methname) return m(*args, **kwargs) def wrap_announce_v2(self, announcements): anns_v1 = [convert_announcement_v2_to_v1(ann) for ann in announcements] return self.original.callRemote("announce", set(anns_v1)) def wrap_set_encoding_parameters(self, parameters): # note: unused return self.original.callRemote("set_encoding_parameters", parameters) def notifyOnDisconnect(self, *args, **kwargs): return self.original.notifyOnDisconnect(*args, **kwargs) class IntroducerService(service.MultiService, Referenceable): implements(RIIntroducerPublisherAndSubscriberService_v2) name = "introducer" # v1 is the original protocol, supported since 1.0 (but only advertised # starting in 1.3). v2 is the new signed protocol, supported after 1.9 VERSION = { "http://allmydata.org/tahoe/protocols/introducer/v1": { }, "http://allmydata.org/tahoe/protocols/introducer/v2": { }, "application-version": str(allmydata.__full_version__), } def __init__(self, basedir="."): service.MultiService.__init__(self) self.introducer_url = None # 'index' is (service_name, key_s, tubid), where key_s or tubid is # None self._announcements = {} # dict of index -> # (ann_t, canary, ann, timestamp) # ann (the announcement dictionary) is cleaned up: nickname is always # unicode, servicename is always ascii, etc, even though # simplejson.loads sometimes returns either # self._subscribers is a dict mapping servicename to subscriptions # 'subscriptions' is a dict mapping rref to a subscription # 'subscription' is a tuple of (subscriber_info, timestamp) # 'subscriber_info' is a dict, provided directly for v2 clients, or # synthesized for v1 clients. The expected keys are: # version, nickname, app-versions, my-version, oldest-supported self._subscribers = {} # self._stub_client_announcements contains the information provided # by v1 clients. We stash this so we can match it up with their # subscriptions. 
self._stub_client_announcements = {} # maps tubid to sinfo # for_v1 self._debug_counts = {"inbound_message": 0, "inbound_duplicate": 0, "inbound_no_seqnum": 0, "inbound_old_replay": 0, "inbound_update": 0, "outbound_message": 0, "outbound_announcements": 0, "inbound_subscribe": 0} self._debug_outstanding = 0 # also covers WrapV1SubscriberInV2Interface def _debug_retired(self, res): self._debug_outstanding -= 1 return res def log(self, *args, **kwargs): if "facility" not in kwargs: kwargs["facility"] = "tahoe.introducer.server" return log.msg(*args, **kwargs) def get_announcements(self, include_stub_clients=True): """Return a list of AnnouncementDescriptor for all announcements""" announcements = [] for (index, (_, canary, ann, when)) in self._announcements.items(): if ann["service-name"] == "stub_client": if not include_stub_clients: continue ad = AnnouncementDescriptor(when, index, canary, ann) announcements.append(ad) return announcements def get_subscribers(self): """Return a list of SubscriberDescriptor objects for all subscribers""" s = [] for service_name, subscriptions in self._subscribers.items(): for rref,(subscriber_info,when) in subscriptions.items(): # note that if the subscriber didn't do Tub.setLocation, # tubid will be None. Also, subscribers do not tell us which # pubkey they use; only publishers do that. tubid = rref.getRemoteTubID() or "?" advertised_addresses = rrefutil.hosts_for_rref(rref) remote_address = rrefutil.stringify_remote_address(rref) # these three assume subscriber_info["version"]==0, but # should tolerate other versions if not subscriber_info: # V1 clients that haven't yet sent their stub_info data subscriber_info = {} nickname = subscriber_info.get("nickname", u"?") version = subscriber_info.get("my-version", u"?") app_versions = subscriber_info.get("app-versions", {}) # 'when' is the time they subscribed sd = SubscriberDescriptor(service_name, when, nickname, version, app_versions, advertised_addresses, remote_address, tubid) s.append(sd) return s def remote_get_version(self): return self.VERSION def remote_publish(self, ann_t): # for_v1 lp = self.log("introducer: old (v1) announcement published: %s" % (ann_t,), umid="6zGOIw") ann_v2 = convert_announcement_v1_to_v2(ann_t) return self.publish(ann_v2, None, lp) def remote_publish_v2(self, ann_t, canary): lp = self.log("introducer: announcement (v2) published", umid="L2QXkQ") return self.publish(ann_t, canary, lp) def publish(self, ann_t, canary, lp): try: self._publish(ann_t, canary, lp) except: log.err(format="Introducer.remote_publish failed on %(ann)s", ann=ann_t, level=log.UNUSUAL, parent=lp, umid="620rWA") raise def _publish(self, ann_t, canary, lp): self._debug_counts["inbound_message"] += 1 self.log("introducer: announcement published: %s" % (ann_t,), umid="wKHgCw") ann, key = unsign_from_foolscap(ann_t) # might raise BadSignatureError index = make_index(ann, key) service_name = str(ann["service-name"]) if service_name == "stub_client": # for_v1 self._attach_stub_client(ann, lp) return old = self._announcements.get(index) if old: (old_ann_t, canary, old_ann, timestamp) = old if old_ann == ann: self.log("but we already knew it, ignoring", level=log.NOISY, umid="myxzLw") self._debug_counts["inbound_duplicate"] += 1 return else: if "seqnum" in old_ann: # must beat previous sequence number to replace if ("seqnum" not in ann or not isinstance(ann["seqnum"], (int,long))): self.log("not replacing old ann, no valid seqnum", level=log.NOISY, umid="ySbaVw") self._debug_counts["inbound_no_seqnum"] += 1 return 
if ann["seqnum"] <= old_ann["seqnum"]: self.log("not replacing old ann, new seqnum is too old" " (%s <= %s) (replay attack?)" % (ann["seqnum"], old_ann["seqnum"]), level=log.UNUSUAL, umid="sX7yqQ") self._debug_counts["inbound_old_replay"] += 1 return # ok, seqnum is newer, allow replacement self.log("old announcement being updated", level=log.NOISY, umid="304r9g") self._debug_counts["inbound_update"] += 1 self._announcements[index] = (ann_t, canary, ann, time.time()) #if canary: # canary.notifyOnDisconnect ... # use a CanaryWatcher? with cw.is_connected()? # actually we just want foolscap to give rref.is_connected(), since # this is only for the status display for s in self._subscribers.get(service_name, []): self._debug_counts["outbound_message"] += 1 self._debug_counts["outbound_announcements"] += 1 self._debug_outstanding += 1 d = s.callRemote("announce_v2", set([ann_t])) d.addBoth(self._debug_retired) d.addErrback(log.err, format="subscriber errored on announcement %(ann)s", ann=ann_t, facility="tahoe.introducer", level=log.UNUSUAL, umid="jfGMXQ") def _attach_stub_client(self, ann, lp): # There might be a v1 subscriber for whom this is a stub_client. # We might have received the subscription before the stub_client # announcement, in which case we now need to fix up the record in # self._subscriptions . # record it for later, in case the stub_client arrived before the # subscription subscriber_info = self._get_subscriber_info_from_ann(ann) ann_tubid = get_tubid_string_from_ann(ann) self._stub_client_announcements[ann_tubid] = subscriber_info lp2 = self.log("stub_client announcement, " "looking for matching subscriber", parent=lp, level=log.NOISY, umid="BTywDg") for sn in self._subscribers: s = self._subscribers[sn] for (subscriber, info) in s.items(): # we correlate these by looking for a subscriber whose tubid # matches this announcement sub_tubid = subscriber.getRemoteTubID() if sub_tubid == ann_tubid: self.log(format="found a match, nodeid=%(nodeid)s", nodeid=sub_tubid, level=log.NOISY, parent=lp2, umid="xsWs1A") # found a match. Does it need info? 
if not info[0]: self.log(format="replacing info", level=log.NOISY, parent=lp2, umid="m5kxwA") # yup s[subscriber] = (subscriber_info, info[1]) # and we don't remember or announce stub_clients beyond what we # need to get the subscriber_info set up def _get_subscriber_info_from_ann(self, ann): # for_v1 sinfo = { "version": ann["version"], "nickname": ann["nickname"], "app-versions": ann["app-versions"], "my-version": ann["my-version"], "oldest-supported": ann["oldest-supported"], } return sinfo def remote_subscribe(self, subscriber, service_name): # for_v1 self.log("introducer: old (v1) subscription[%s] request at %s" % (service_name, subscriber), umid="hJlGUg") return self.add_subscriber(WrapV1SubscriberInV2Interface(subscriber), service_name, None) def remote_subscribe_v2(self, subscriber, service_name, subscriber_info): self.log("introducer: subscription[%s] request at %s" % (service_name, subscriber), umid="U3uzLg") return self.add_subscriber(subscriber, service_name, subscriber_info) def add_subscriber(self, subscriber, service_name, subscriber_info): self._debug_counts["inbound_subscribe"] += 1 if service_name not in self._subscribers: self._subscribers[service_name] = {} subscribers = self._subscribers[service_name] if subscriber in subscribers: self.log("but they're already subscribed, ignoring", level=log.UNUSUAL, umid="Sy9EfA") return if not subscriber_info: # for_v1 # v1 clients don't provide subscriber_info, but they should # publish a 'stub client' record which contains the same # information. If we've already received this, it will be in # self._stub_client_announcements tubid = subscriber.getRemoteTubID() if tubid in self._stub_client_announcements: subscriber_info = self._stub_client_announcements[tubid] subscribers[subscriber] = (subscriber_info, time.time()) def _remove(): self.log("introducer: unsubscribing[%s] %s" % (service_name, subscriber), umid="vYGcJg") subscribers.pop(subscriber, None) subscriber.notifyOnDisconnect(_remove) # now tell them about any announcements they're interested in announcements = set( [ ann_t for idx,(ann_t,canary,ann,when) in self._announcements.items() if idx[0] == service_name] ) if announcements: self._debug_counts["outbound_message"] += 1 self._debug_counts["outbound_announcements"] += len(announcements) self._debug_outstanding += 1 d = subscriber.callRemote("announce_v2", announcements) d.addBoth(self._debug_retired) d.addErrback(log.err, format="subscriber errored during subscribe %(anns)s", anns=announcements, facility="tahoe.introducer", level=log.UNUSUAL, umid="mtZepQ") return d allmydata-tahoe-1.10.2/src/allmydata/introducer/__init__.py0000644000175000017500000000032712556560070022017 0ustar ramram # This is for compatibilty with old .tac files, which reference # allmydata.introducer.IntroducerNode from allmydata.introducer.server import IntroducerNode # hush pyflakes _unused = [IntroducerNode] del _unused allmydata-tahoe-1.10.2/src/allmydata/stats.py0000644000175000017500000002625212556560070017245 0ustar ramram import os import pickle import pprint import time from collections import deque from twisted.internet import reactor from twisted.application import service from twisted.application.internet import TimerService from zope.interface import implements from foolscap.api import eventually, DeadReferenceError, Referenceable, Tub from allmydata.util import log, fileutil from allmydata.util.encodingutil import quote_local_unicode_path from allmydata.interfaces import RIStatsProvider, RIStatsGatherer, IStatsProducer class 
LoadMonitor(service.MultiService): implements(IStatsProducer) loop_interval = 1 num_samples = 60 def __init__(self, provider, warn_if_delay_exceeds=1): service.MultiService.__init__(self) self.provider = provider self.warn_if_delay_exceeds = warn_if_delay_exceeds self.started = False self.last = None self.stats = deque() self.timer = None def startService(self): if not self.started: self.started = True self.timer = reactor.callLater(self.loop_interval, self.loop) service.MultiService.startService(self) def stopService(self): self.started = False if self.timer: self.timer.cancel() self.timer = None return service.MultiService.stopService(self) def loop(self): self.timer = None if not self.started: return now = time.time() if self.last is not None: delay = now - self.last - self.loop_interval if delay > self.warn_if_delay_exceeds: log.msg(format='excessive reactor delay (%ss)', args=(delay,), level=log.UNUSUAL) self.stats.append(delay) while len(self.stats) > self.num_samples: self.stats.popleft() self.last = now self.timer = reactor.callLater(self.loop_interval, self.loop) def get_stats(self): if self.stats: avg = sum(self.stats) / len(self.stats) m_x = max(self.stats) else: avg = m_x = 0 return { 'load_monitor.avg_load': avg, 'load_monitor.max_load': m_x, } class CPUUsageMonitor(service.MultiService): implements(IStatsProducer) HISTORY_LENGTH = 15 POLL_INTERVAL = 60 def __init__(self): service.MultiService.__init__(self) # we don't use time.clock() here, because the constructor is run by # the twistd parent process (as it loads the .tac file), whereas the # rest of the program will be run by the child process, after twistd # forks. Instead, set self.initial_cpu as soon as the reactor starts # up. self.initial_cpu = 0.0 # just in case eventually(self._set_initial_cpu) self.samples = [] # we provide 1min, 5min, and 15min moving averages TimerService(self.POLL_INTERVAL, self.check).setServiceParent(self) def _set_initial_cpu(self): self.initial_cpu = time.clock() def check(self): now_wall = time.time() now_cpu = time.clock() self.samples.append( (now_wall, now_cpu) ) while len(self.samples) > self.HISTORY_LENGTH+1: self.samples.pop(0) def _average_N_minutes(self, size): if len(self.samples) < size+1: return None first = -size-1 elapsed_wall = self.samples[-1][0] - self.samples[first][0] elapsed_cpu = self.samples[-1][1] - self.samples[first][1] fraction = elapsed_cpu / elapsed_wall return fraction def get_stats(self): s = {} avg = self._average_N_minutes(1) if avg is not None: s["cpu_monitor.1min_avg"] = avg avg = self._average_N_minutes(5) if avg is not None: s["cpu_monitor.5min_avg"] = avg avg = self._average_N_minutes(15) if avg is not None: s["cpu_monitor.15min_avg"] = avg now_cpu = time.clock() s["cpu_monitor.total"] = now_cpu - self.initial_cpu return s class StatsProvider(Referenceable, service.MultiService): implements(RIStatsProvider) def __init__(self, node, gatherer_furl): service.MultiService.__init__(self) self.node = node self.gatherer_furl = gatherer_furl # might be None self.counters = {} self.stats_producers = [] # only run the LoadMonitor (which submits a timer every second) if # there is a gatherer who is going to be paying attention. 
Our stats # are visible through HTTP even without a gatherer, so run the rest # of the stats (including the once-per-minute CPUUsageMonitor) if gatherer_furl: self.load_monitor = LoadMonitor(self) self.load_monitor.setServiceParent(self) self.register_producer(self.load_monitor) self.cpu_monitor = CPUUsageMonitor() self.cpu_monitor.setServiceParent(self) self.register_producer(self.cpu_monitor) def startService(self): if self.node and self.gatherer_furl: d = self.node.when_tub_ready() def connect(junk): nickname_utf8 = self.node.nickname.encode("utf-8") self.node.tub.connectTo(self.gatherer_furl, self._connected, nickname_utf8) d.addCallback(connect) service.MultiService.startService(self) def count(self, name, delta=1): val = self.counters.setdefault(name, 0) self.counters[name] = val + delta def register_producer(self, stats_producer): self.stats_producers.append(IStatsProducer(stats_producer)) def get_stats(self): stats = {} for sp in self.stats_producers: stats.update(sp.get_stats()) ret = { 'counters': self.counters, 'stats': stats } log.msg(format='get_stats() -> %(stats)s', stats=ret, level=log.NOISY) return ret def remote_get_stats(self): return self.get_stats() def _connected(self, gatherer, nickname): gatherer.callRemoteOnly('provide', self, nickname or '') class StatsGatherer(Referenceable, service.MultiService): implements(RIStatsGatherer) poll_interval = 60 def __init__(self, basedir): service.MultiService.__init__(self) self.basedir = basedir self.clients = {} self.nicknames = {} self.timer = TimerService(self.poll_interval, self.poll) self.timer.setServiceParent(self) def get_tubid(self, rref): return rref.getRemoteTubID() def remote_provide(self, provider, nickname): tubid = self.get_tubid(provider) if tubid == '': print "WARNING: failed to get tubid for %s (%s)" % (provider, nickname) # don't add to clients to poll (polluting data) don't care about disconnect return self.clients[tubid] = provider self.nicknames[tubid] = nickname def poll(self): for tubid,client in self.clients.items(): nickname = self.nicknames.get(tubid) d = client.callRemote('get_stats') d.addCallbacks(self.got_stats, self.lost_client, callbackArgs=(tubid, nickname), errbackArgs=(tubid,)) d.addErrback(self.log_client_error, tubid) def lost_client(self, f, tubid): # this is called lazily, when a get_stats request fails del self.clients[tubid] del self.nicknames[tubid] f.trap(DeadReferenceError) def log_client_error(self, f, tubid): log.msg("StatsGatherer: error in get_stats(), peerid=%s" % tubid, level=log.UNUSUAL, failure=f) def got_stats(self, stats, tubid, nickname): raise NotImplementedError() class StdOutStatsGatherer(StatsGatherer): verbose = True def remote_provide(self, provider, nickname): tubid = self.get_tubid(provider) if self.verbose: print 'connect "%s" [%s]' % (nickname, tubid) provider.notifyOnDisconnect(self.announce_lost_client, tubid) StatsGatherer.remote_provide(self, provider, nickname) def announce_lost_client(self, tubid): print 'disconnect "%s" [%s]' % (self.nicknames[tubid], tubid) def got_stats(self, stats, tubid, nickname): print '"%s" [%s]:' % (nickname, tubid) pprint.pprint(stats) class PickleStatsGatherer(StdOutStatsGatherer): # inherit from StdOutStatsGatherer for connect/disconnect notifications def __init__(self, basedir=u".", verbose=True): self.verbose = verbose StatsGatherer.__init__(self, basedir) self.picklefile = os.path.join(basedir, "stats.pickle") if os.path.exists(self.picklefile): f = open(self.picklefile, 'rb') try: self.gathered_stats = pickle.load(f) except 
Exception: print ("Error while attempting to load pickle file %s.\n" "You may need to restore this file from a backup, or delete it if no backup is available.\n" % quote_local_unicode_path(self.picklefile)) raise f.close() else: self.gathered_stats = {} def got_stats(self, stats, tubid, nickname): s = self.gathered_stats.setdefault(tubid, {}) s['timestamp'] = time.time() s['nickname'] = nickname s['stats'] = stats self.dump_pickle() def dump_pickle(self): tmp = "%s.tmp" % (self.picklefile,) f = open(tmp, 'wb') pickle.dump(self.gathered_stats, f) f.close() if os.path.exists(self.picklefile): os.unlink(self.picklefile) os.rename(tmp, self.picklefile) class StatsGathererService(service.MultiService): furl_file = "stats_gatherer.furl" def __init__(self, basedir=".", verbose=False): service.MultiService.__init__(self) self.basedir = basedir self.tub = Tub(certFile=os.path.join(self.basedir, "stats_gatherer.pem")) self.tub.setServiceParent(self) self.tub.setOption("logLocalFailures", True) self.tub.setOption("logRemoteFailures", True) self.tub.setOption("expose-remote-exception-types", False) self.stats_gatherer = PickleStatsGatherer(self.basedir, verbose) self.stats_gatherer.setServiceParent(self) portnumfile = os.path.join(self.basedir, "portnum") try: portnum = open(portnumfile, "r").read() except EnvironmentError: portnum = None self.listener = self.tub.listenOn(portnum or "tcp:0") d = self.tub.setLocationAutomatically() if portnum is None: d.addCallback(self.save_portnum) d.addCallback(self.tub_ready) d.addErrback(log.err) def save_portnum(self, junk): portnum = self.listener.getPortnum() portnumfile = os.path.join(self.basedir, 'portnum') fileutil.write(portnumfile, '%d\n' % (portnum,)) def tub_ready(self, ignored): ff = os.path.join(self.basedir, self.furl_file) self.gatherer_furl = self.tub.registerReference(self.stats_gatherer, furlFile=ff) allmydata-tahoe-1.10.2/src/allmydata/util/0000755000175000017500000000000012556560072016505 5ustar ramramallmydata-tahoe-1.10.2/src/allmydata/util/repeatable_random.py0000644000175000017500000000702412556560070022524 0ustar ramram""" If you execute force_repeatability() then the following things are changed in the runtime: 1. random.random() and its sibling functions, and random.Random.seed() in the random module are seeded with a known seed so that they will return the same sequence on each run. 2. os.urandom() is replaced by a fake urandom that returns a pseudorandom sequence. 3. time.time() is replaced by a fake time that returns an incrementing number. (Original time.time is available as time.realtime.) Which seed will be used? If the environment variable REPEATABLE_RANDOMNESS_SEED is set, then it will use that. Else, it will use the current real time. In either case it logs the seed that it used. Caveats: 1. If some code has acquired a random.Random object before force_repeatability() is executed, then that Random object will produce non-reproducible results. For example, the tempfile module in the Python Standard Library does this. 2. Likewise if some code called time.time() before force_repeatability() was called, then it will have gotten a real time stamp. For example, trial does this. (Then it later subtracts that real timestamp from a faketime timestamp to calculate elapsed time, resulting in a large negative elapsed time.) 3. The output from the fake urandom has weird distribution for performance reasons-- every byte after the first 20 bytes resulting from a single call to os.urandom() is zero. In practice this hasn't caused any problems. 
""" import os, random, time if not hasattr(time, "realtime"): time.realtime = time.time if not hasattr(os, "realurandom"): os.realurandom = os.urandom if not hasattr(random, "realseed"): random.realseed = random.seed tdelta = 0 seeded = False def force_repeatability(): now = 1043659734.0 def faketime(): global tdelta tdelta += 1 return now + tdelta time.faketime = faketime time.time = faketime from allmydata.util.idlib import i2b def fakeurandom(n): if n > 20: z = i2b(random.getrandbits(20*8)) elif n == 0: return '' else: z = i2b(random.getrandbits(n*8)) x = z + "0" * (n-len(z)) assert len(x) == n return x os.fakeurandom = fakeurandom os.urandom = fakeurandom global seeded if not seeded: SEED = os.environ.get('REPEATABLE_RANDOMNESS_SEED', None) if SEED is None: # Generate a seed which is integral and fairly short (to ease cut-and-paste, writing it down, etc.). t = time.realtime() subsec = t % 1 t += (subsec * 1000000) t %= 1000000 SEED = long(t) import sys sys.stdout.write("REPEATABLE_RANDOMNESS_SEED: %s\n" % SEED) ; sys.stdout.flush() sys.stdout.write("In order to reproduce this run of the code, set the environment variable \"REPEATABLE_RANDOMNESS_SEED\" to %s before executing.\n" % SEED) ; sys.stdout.flush() random.seed(SEED) def seed_which_refuses(a): sys.stdout.write("I refuse to reseed to %s. Go away!\n" % (a,)) ; sys.stdout.flush() return random.realseed = random.seed random.seed = seed_which_refuses seeded = True import setutil setutil.RandomSet.DETERMINISTIC = True def restore_real_clock(): time.time = time.realtime def restore_real_urandom(): os.urandom = os.realurandom def restore_real_seed(): random.seed = random.realseed def restore_non_repeatability(): restore_real_seed() restore_real_urandom() restore_real_clock() allmydata-tahoe-1.10.2/src/allmydata/util/spans.py0000644000175000017500000004244612556560070020213 0ustar ramram class Spans: """I represent a compressed list of booleans, one per index (an integer). Typically, each index represents an offset into a large string, pointing to a specific byte of a share. In this context, True means that byte has been received, or has been requested. Another way to look at this is maintaining a set of integers, optimized for operations on spans like 'add range to set' and 'is range in set?'. This is a python equivalent of perl's Set::IntSpan module, frequently used to represent .newsrc contents. Rather than storing an actual (large) list or dictionary, I represent my internal state as a sorted list of spans, each with a start and a length. My API is presented in terms of start+length pairs. I provide set arithmetic operators, to efficiently answer questions like 'I want bytes XYZ, I already requested bytes ABC, and I've already received bytes DEF: what bytes should I request now?'. The new downloader will use it to keep track of which bytes we've requested or received already. 
""" def __init__(self, _span_or_start=None, length=None): self._spans = list() if length is not None: self._spans.append( (_span_or_start, length) ) elif _span_or_start: for (start,length) in _span_or_start: self.add(start, length) self._check() def _check(self): assert sorted(self._spans) == self._spans prev_end = None try: for (start,length) in self._spans: if prev_end is not None: assert start > prev_end prev_end = start+length except AssertionError: print "BAD:", self.dump() raise def add(self, start, length): assert start >= 0 assert length > 0 #print " ADD [%d+%d -%d) to %s" % (start, length, start+length, self.dump()) first_overlap = last_overlap = None for i,(s_start,s_length) in enumerate(self._spans): #print " (%d+%d)-> overlap=%s adjacent=%s" % (s_start,s_length, overlap(s_start, s_length, start, length), adjacent(s_start, s_length, start, length)) if (overlap(s_start, s_length, start, length) or adjacent(s_start, s_length, start, length)): last_overlap = i if first_overlap is None: first_overlap = i continue # no overlap if first_overlap is not None: break #print " first_overlap", first_overlap, last_overlap if first_overlap is None: # no overlap, so just insert the span and sort by starting # position. self._spans.insert(0, (start,length)) self._spans.sort() else: # everything from [first_overlap] to [last_overlap] overlapped first_start,first_length = self._spans[first_overlap] last_start,last_length = self._spans[last_overlap] newspan_start = min(start, first_start) newspan_end = max(start+length, last_start+last_length) newspan_length = newspan_end - newspan_start newspan = (newspan_start, newspan_length) self._spans[first_overlap:last_overlap+1] = [newspan] #print " ADD done: %s" % self.dump() self._check() return self def remove(self, start, length): assert start >= 0 assert length > 0 #print " REMOVE [%d+%d -%d) from %s" % (start, length, start+length, self.dump()) first_complete_overlap = last_complete_overlap = None for i,(s_start,s_length) in enumerate(self._spans): s_end = s_start + s_length o = overlap(s_start, s_length, start, length) if o: o_start, o_length = o o_end = o_start+o_length if o_start == s_start and o_end == s_end: # delete this span altogether if first_complete_overlap is None: first_complete_overlap = i last_complete_overlap = i elif o_start == s_start: # we only overlap the left side, so trim the start # 1111 # rrrr # oo # -> 11 new_start = o_end new_end = s_end assert new_start > s_start new_length = new_end - new_start self._spans[i] = (new_start, new_length) elif o_end == s_end: # we only overlap the right side # 1111 # rrrr # oo # -> 11 new_start = s_start new_end = o_start assert new_end < s_end new_length = new_end - new_start self._spans[i] = (new_start, new_length) else: # we overlap the middle, so create a new span. No need to # examine any other spans. 
# 111111 # rr # LL RR left_start = s_start left_end = o_start left_length = left_end - left_start right_start = o_end right_end = s_end right_length = right_end - right_start self._spans[i] = (left_start, left_length) self._spans.append( (right_start, right_length) ) self._spans.sort() break if first_complete_overlap is not None: del self._spans[first_complete_overlap:last_complete_overlap+1] #print " REMOVE done: %s" % self.dump() self._check() return self def dump(self): return "len=%d: %s" % (self.len(), ",".join(["[%d-%d]" % (start,start+l-1) for (start,l) in self._spans]) ) def each(self): for start, length in self._spans: for i in range(start, start+length): yield i def __iter__(self): for s in self._spans: yield s def __nonzero__(self): # this gets us bool() return bool(self.len()) def len(self): # guess what! python doesn't allow __len__ to return a long, only an # int. So we stop using len(spans), use spans.len() instead. return sum([length for start,length in self._spans]) def __add__(self, other): s = self.__class__(self) for (start, length) in other: s.add(start, length) return s def __sub__(self, other): s = self.__class__(self) for (start, length) in other: s.remove(start, length) return s def __iadd__(self, other): for (start, length) in other: self.add(start, length) return self def __isub__(self, other): for (start, length) in other: self.remove(start, length) return self def __and__(self, other): if not self._spans: return self.__class__() bounds = self.__class__(self._spans[0][0], self._spans[-1][0]+self._spans[-1][1]) not_other = bounds - other return self - not_other def __contains__(self, (start,length)): for span_start,span_length in self._spans: o = overlap(start, length, span_start, span_length) if o: o_start,o_length = o if o_start == start and o_length == length: return True return False def overlap(start0, length0, start1, length1): # return start2,length2 of the overlapping region, or None # 00 00 000 0000 00 00 000 00 00 00 00 # 11 11 11 11 111 11 11 1111 111 11 11 left = max(start0, start1) right = min(start0+length0, start1+length1) # if there is overlap, 'left' will be its start, and right-1 will # be the end' if left < right: return (left, right-left) return None def adjacent(start0, length0, start1, length1): if (start0 < start1) and start0+length0 == start1: return True elif (start1 < start0) and start1+length1 == start0: return True return False class DataSpans: """I represent portions of a large string. Equivalently, I can be said to maintain a large array of characters (with gaps of empty elements). I can be used to manage access to a remote share, where some pieces have been retrieved, some have been requested, and others have not been read. 
""" def __init__(self, other=None): self.spans = [] # (start, data) tuples, non-overlapping, merged if other: for (start, data) in other.get_chunks(): self.add(start, data) def __nonzero__(self): # this gets us bool() return bool(self.len()) def len(self): # return number of bytes we're holding return sum([len(data) for (start,data) in self.spans]) def _dump(self): # return iterator of sorted list of offsets, one per byte for (start,data) in self.spans: for i in range(start, start+len(data)): yield i def dump(self): return "len=%d: %s" % (self.len(), ",".join(["[%d-%d]" % (start,start+len(data)-1) for (start,data) in self.spans]) ) def get_chunks(self): return list(self.spans) def get_spans(self): """Return a Spans object with a bit set for each byte I hold""" return Spans([(start, len(data)) for (start,data) in self.spans]) def assert_invariants(self): if not self.spans: return prev_start = self.spans[0][0] prev_end = prev_start + len(self.spans[0][1]) for start, data in self.spans[1:]: if not start > prev_end: # adjacent or overlapping: bad print "ASSERTION FAILED", self.spans raise AssertionError def get(self, start, length): # returns a string of LENGTH, or None #print "get", start, length, self.spans end = start+length for (s_start,s_data) in self.spans: s_end = s_start+len(s_data) #print " ",s_start,s_end if s_start <= start < s_end: # we want some data from this span. Because we maintain # strictly merged and non-overlapping spans, everything we # want must be in this span. offset = start - s_start if offset + length > len(s_data): #print " None, span falls short" return None # span falls short #print " some", s_data[offset:offset+length] return s_data[offset:offset+length] if s_start >= end: # we've gone too far: no further spans will overlap #print " None, gone too far" return None #print " None, ran out of spans" return None def add(self, start, data): # first: walk through existing spans, find overlap, modify-in-place # create list of new spans # add new spans # sort # merge adjacent spans #print "add", start, data, self.spans end = start + len(data) i = 0 while len(data): #print " loop", start, data, i, len(self.spans), self.spans if i >= len(self.spans): #print " append and done" # append a last span self.spans.append( (start, data) ) break (s_start,s_data) = self.spans[i] # five basic cases: # a: OLD b:OLDD c1:OLD c2:OLD d1:OLDD d2:OLD e: OLLDD # NEW NEW NEW NEWW NEW NEW NEW # # we handle A by inserting a new segment (with "N") and looping, # turning it into B or C. We handle B by replacing a prefix and # terminating. We handle C (both c1 and c2) by replacing the # segment (and, for c2, looping, turning it into A). We handle D # by replacing a suffix (and, for d2, looping, turning it into # A). We handle E by replacing the middle and terminating. 
if start < s_start: # case A: insert a new span, then loop with the remainder #print " insert new span" s_len = s_start-start self.spans.insert(i, (start, data[:s_len])) i += 1 start = s_start data = data[s_len:] continue s_len = len(s_data) s_end = s_start+s_len if s_start <= start < s_end: #print " modify this span", s_start, start, s_end # we want to modify some data in this span: a prefix, a # suffix, or the whole thing if s_start == start: if s_end <= end: #print " replace whole segment" # case C: replace this segment self.spans[i] = (s_start, data[:s_len]) i += 1 start += s_len data = data[s_len:] # C2 is where len(data)>0 continue # case B: modify the prefix, retain the suffix #print " modify prefix" self.spans[i] = (s_start, data + s_data[len(data):]) break if start > s_start and end < s_end: # case E: modify the middle #print " modify middle" prefix_len = start - s_start # we retain this much suffix_len = s_end - end # and retain this much newdata = s_data[:prefix_len] + data + s_data[-suffix_len:] self.spans[i] = (s_start, newdata) break # case D: retain the prefix, modify the suffix #print " modify suffix" prefix_len = start - s_start # we retain this much suffix_len = s_len - prefix_len # we replace this much #print " ", s_data, prefix_len, suffix_len, s_len, data self.spans[i] = (s_start, s_data[:prefix_len] + data[:suffix_len]) i += 1 start += suffix_len data = data[suffix_len:] #print " now", start, data # D2 is where len(data)>0 continue # else we're not there yet #print " still looking" i += 1 continue # now merge adjacent spans #print " merging", self.spans newspans = [] for (s_start,s_data) in self.spans: if newspans and adjacent(newspans[-1][0], len(newspans[-1][1]), s_start, len(s_data)): newspans[-1] = (newspans[-1][0], newspans[-1][1] + s_data) else: newspans.append( (s_start, s_data) ) self.spans = newspans self.assert_invariants() #print " done", self.spans def remove(self, start, length): i = 0 end = start + length #print "remove", start, length, self.spans while i < len(self.spans): (s_start,s_data) = self.spans[i] if s_start >= end: # this segment is entirely right of the removed region, and # all further segments are even further right. We're done. break s_len = len(s_data) s_end = s_start + s_len o = overlap(start, length, s_start, s_len) if not o: i += 1 continue o_start, o_len = o o_end = o_start + o_len if o_len == s_len: # remove the whole segment del self.spans[i] continue if o_start == s_start: # remove a prefix, leaving the suffix from o_end to s_end prefix_len = o_end - o_start self.spans[i] = (o_end, s_data[prefix_len:]) i += 1 continue elif o_end == s_end: # remove a suffix, leaving the prefix from s_start to o_start prefix_len = o_start - s_start self.spans[i] = (s_start, s_data[:prefix_len]) i += 1 continue # remove the middle, creating a new segment # left is s_start:o_start, right is o_end:s_end left_len = o_start - s_start left = s_data[:left_len] right_len = s_end - o_end right = s_data[-right_len:] self.spans[i] = (s_start, left) self.spans.insert(i+1, (o_end, right)) break #print " done", self.spans def pop(self, start, length): data = self.get(start, length) if data: self.remove(start, length) return data allmydata-tahoe-1.10.2/src/allmydata/util/dictutil.py0000644000175000017500000005252012556560070020702 0ustar ramram""" Tools to mess with dicts. 
""" import copy, operator from bisect import bisect_left, insort_left from allmydata.util.assertutil import _assert, precondition def move(k, d1, d2, strict=False): """ Move item with key k from d1 to d2. """ if strict and not d1.has_key(k): raise KeyError, k d2[k] = d1[k] del d1[k] def subtract(d1, d2): """ Remove all items from d1 whose key occurs in d2. @returns d1 """ if len(d1) > len(d2): for k in d2.keys(): if d1.has_key(k): del d1[k] else: for k in d1.keys(): if d2.has_key(k): del d1[k] return d1 class DictOfSets(dict): def add(self, key, value): if key in self: self[key].add(value) else: self[key] = set([value]) def update(self, otherdictofsets): for key, values in otherdictofsets.iteritems(): if key in self: self[key].update(values) else: self[key] = set(values) def discard(self, key, value): if not key in self: return self[key].discard(value) if not self[key]: del self[key] class UtilDict: def __init__(self, initialdata={}): self.d = {} self.update(initialdata) def del_if_present(self, key): if self.has_key(key): del self[key] def items_sorted_by_value(self): """ @return a sequence of (key, value,) pairs sorted according to value """ l = [(x[1], x[0],) for x in self.d.iteritems()] l.sort() return [(x[1], x[0],) for x in l] def items_sorted_by_key(self): """ @return a sequence of (key, value,) pairs sorted according to key """ l = self.d.items() l.sort() return l def __repr__(self, *args, **kwargs): return self.d.__repr__(*args, **kwargs) def __str__(self, *args, **kwargs): return self.d.__str__(*args, **kwargs) def __contains__(self, *args, **kwargs): return self.d.__contains__(*args, **kwargs) def __len__(self, *args, **kwargs): return self.d.__len__(*args, **kwargs) def __cmp__(self, other): try: return self.d.__cmp__(other) except TypeError, le: # maybe we should look for a .d member in other. 
I know this is insanely kludgey, but the Right Way To Do It is for dict.__cmp__ to use structural typing ("duck typing") try: return self.d.__cmp__(other.d) except: raise le def __eq__(self, *args, **kwargs): return self.d.__eq__(*args, **kwargs) def __ne__(self, *args, **kwargs): return self.d.__ne__(*args, **kwargs) def __gt__(self, *args, **kwargs): return self.d.__gt__(*args, **kwargs) def __ge__(self, *args, **kwargs): return self.d.__ge__(*args, **kwargs) def __le__(self, *args, **kwargs): return self.d.__le__(*args, **kwargs) def __lt__(self, *args, **kwargs): return self.d.__lt__(*args, **kwargs) def __getitem__(self, *args, **kwargs): return self.d.__getitem__(*args, **kwargs) def __setitem__(self, *args, **kwargs): return self.d.__setitem__(*args, **kwargs) def __delitem__(self, *args, **kwargs): return self.d.__delitem__(*args, **kwargs) def __iter__(self, *args, **kwargs): return self.d.__iter__(*args, **kwargs) def clear(self, *args, **kwargs): return self.d.clear(*args, **kwargs) def copy(self, *args, **kwargs): return self.__class__(self.d.copy(*args, **kwargs)) def fromkeys(self, *args, **kwargs): return self.__class__(self.d.fromkeys(*args, **kwargs)) def get(self, key, default=None): return self.d.get(key, default) def has_key(self, *args, **kwargs): return self.d.has_key(*args, **kwargs) def items(self, *args, **kwargs): return self.d.items(*args, **kwargs) def iteritems(self, *args, **kwargs): return self.d.iteritems(*args, **kwargs) def iterkeys(self, *args, **kwargs): return self.d.iterkeys(*args, **kwargs) def itervalues(self, *args, **kwargs): return self.d.itervalues(*args, **kwargs) def keys(self, *args, **kwargs): return self.d.keys(*args, **kwargs) def pop(self, *args, **kwargs): return self.d.pop(*args, **kwargs) def popitem(self, *args, **kwargs): return self.d.popitem(*args, **kwargs) def setdefault(self, *args, **kwargs): return self.d.setdefault(*args, **kwargs) def update(self, *args, **kwargs): self.d.update(*args, **kwargs) def values(self, *args, **kwargs): return self.d.values(*args, **kwargs) class NumDict: def __init__(self, initialdict={}): self.d = copy.deepcopy(initialdict) def add_num(self, key, val, default=0): """ If the key doesn't appear in self then it is created with value default (before addition). """ self.d[key] = self.d.get(key, default) + val def subtract_num(self, key, val, default=0): self.d[key] = self.d.get(key, default) - val def sum(self): """ @return: the sum of all values """ return reduce(operator.__add__, self.d.values()) def inc(self, key, default=0): """ Increment the value associated with key in dict. If there is no such key, then one will be created with initial value 0 (before inc() -- therefore value 1 after inc). """ self.add_num(key, 1, default) def dec(self, key, default=0): """ Decrement the value associated with key in dict. If there is no such key, then one will be created with initial value 0 (before dec() -- therefore value -1 after dec). 
""" self.subtract_num(key, 1, default) def items_sorted_by_value(self): """ @return a sequence of (key, value,) pairs sorted according to value """ l = [(x[1], x[0],) for x in self.d.iteritems()] l.sort() return [(x[1], x[0],) for x in l] def item_with_largest_value(self): it = self.d.iteritems() (winner, winnerval,) = it.next() try: while True: n, nv = it.next() if nv > winnerval: winner = n winnerval = nv except StopIteration: pass return (winner, winnerval,) def items_sorted_by_key(self): """ @return a sequence of (key, value,) pairs sorted according to key """ l = self.d.items() l.sort() return l def __repr__(self, *args, **kwargs): return self.d.__repr__(*args, **kwargs) def __str__(self, *args, **kwargs): return self.d.__str__(*args, **kwargs) def __contains__(self, *args, **kwargs): return self.d.__contains__(*args, **kwargs) def __len__(self, *args, **kwargs): return self.d.__len__(*args, **kwargs) def __cmp__(self, other): try: return self.d.__cmp__(other) except TypeError, le: # maybe we should look for a .d member in other. I know this is insanely kludgey, but the Right Way To Do It is for dict.__cmp__ to use structural typing ("duck typing") try: return self.d.__cmp__(other.d) except: raise le def __eq__(self, *args, **kwargs): return self.d.__eq__(*args, **kwargs) def __ne__(self, *args, **kwargs): return self.d.__ne__(*args, **kwargs) def __gt__(self, *args, **kwargs): return self.d.__gt__(*args, **kwargs) def __ge__(self, *args, **kwargs): return self.d.__ge__(*args, **kwargs) def __le__(self, *args, **kwargs): return self.d.__le__(*args, **kwargs) def __lt__(self, *args, **kwargs): return self.d.__lt__(*args, **kwargs) def __getitem__(self, *args, **kwargs): return self.d.__getitem__(*args, **kwargs) def __setitem__(self, *args, **kwargs): return self.d.__setitem__(*args, **kwargs) def __delitem__(self, *args, **kwargs): return self.d.__delitem__(*args, **kwargs) def __iter__(self, *args, **kwargs): return self.d.__iter__(*args, **kwargs) def clear(self, *args, **kwargs): return self.d.clear(*args, **kwargs) def copy(self, *args, **kwargs): return self.__class__(self.d.copy(*args, **kwargs)) def fromkeys(self, *args, **kwargs): return self.__class__(self.d.fromkeys(*args, **kwargs)) def get(self, key, default=0): return self.d.get(key, default) def has_key(self, *args, **kwargs): return self.d.has_key(*args, **kwargs) def items(self, *args, **kwargs): return self.d.items(*args, **kwargs) def iteritems(self, *args, **kwargs): return self.d.iteritems(*args, **kwargs) def iterkeys(self, *args, **kwargs): return self.d.iterkeys(*args, **kwargs) def itervalues(self, *args, **kwargs): return self.d.itervalues(*args, **kwargs) def keys(self, *args, **kwargs): return self.d.keys(*args, **kwargs) def pop(self, *args, **kwargs): return self.d.pop(*args, **kwargs) def popitem(self, *args, **kwargs): return self.d.popitem(*args, **kwargs) def setdefault(self, *args, **kwargs): return self.d.setdefault(*args, **kwargs) def update(self, *args, **kwargs): return self.d.update(*args, **kwargs) def values(self, *args, **kwargs): return self.d.values(*args, **kwargs) def del_if_present(d, k): if d.has_key(k): del d[k] class ValueOrderedDict: """ Note: this implementation assumes that the values do not mutate and change their sort order. That is, it stores the values in a sorted list and as items are added and removed from the dict, it makes updates to the list which will keep the list sorted. 
But if a value that is currently sitting in the list changes its sort order, then the internal consistency of this object will be lost. If that happens, and if assertion checking is turned on, then you will get an assertion failure the very next time you try to do anything with this ValueOrderedDict. However, those internal consistency checks are very slow and almost certainly unacceptable to leave turned on in production code. """ class ItemIterator: def __init__(self, c): self.c = c self.i = 0 def __iter__(self): return self def next(self): precondition(self.i <= len(self.c.l), "The iterated ValueOrderedDict doesn't have this many elements. Most likely this is because someone altered the contents of the ValueOrderedDict while the iteration was in progress.", self.i, self.c) precondition((self.i == len(self.c.l)) or self.c.d.has_key(self.c.l[self.i][1]), "The iterated ValueOrderedDict doesn't have this key. Most likely this is because someone altered the contents of the ValueOrderedDict while the iteration was in progress.", self.i, (self.i < len(self.c.l)) and self.c.l[self.i], self.c) if self.i == len(self.c.l): raise StopIteration le = self.c.l[self.i] self.i += 1 return (le[1], le[0],) def iteritems(self): return ValueOrderedDict.ItemIterator(self) def items(self): return zip(map(operator.__getitem__, self.l, [1]*len(self.l)), map(operator.__getitem__, self.l, [0]*len(self.l))) def values(self): return map(operator.__getitem__, self.l, [0]*len(self.l)) def keys(self): return map(operator.__getitem__, self.l, [1]*len(self.l)) class KeyIterator: def __init__(self, c): self.c = c self.i = 0 def __iter__(self): return self def next(self): precondition(self.i <= len(self.c.l), "The iterated ValueOrderedDict doesn't have this many elements. Most likely this is because someone altered the contents of the ValueOrderedDict while the iteration was in progress.", self.i, self.c) precondition((self.i == len(self.c.l)) or self.c.d.has_key(self.c.l[self.i][1]), "The iterated ValueOrderedDict doesn't have this key. Most likely this is because someone altered the contents of the ValueOrderedDict while the iteration was in progress.", self.i, (self.i < len(self.c.l)) and self.c.l[self.i], self.c) if self.i == len(self.c.l): raise StopIteration le = self.c.l[self.i] self.i += 1 return le[1] def iterkeys(self): return ValueOrderedDict.KeyIterator(self) class ValueIterator: def __init__(self, c): self.c = c self.i = 0 def __iter__(self): return self def next(self): precondition(self.i <= len(self.c.l), "The iterated ValueOrderedDict doesn't have this many elements. Most likely this is because someone altered the contents of the ValueOrderedDict while the iteration was in progress.", self.i, self.c) precondition((self.i == len(self.c.l)) or self.c.d.has_key(self.c.l[self.i][1]), "The iterated ValueOrderedDict doesn't have this key. 
Most likely this is because someone altered the contents of the ValueOrderedDict while the iteration was in progress.", self.i, (self.i < len(self.c.l)) and self.c.l[self.i], self.c) if self.i == len(self.c.l): raise StopIteration le = self.c.l[self.i] self.i += 1 return le[0] def itervalues(self): return ValueOrderedDict.ValueIterator(self) def __init__(self, initialdata={}): self.d = {} # k: key, v: val self.l = [] # sorted list of tuples of (val, key,) self.update(initialdata) assert self._assert_invariants() def __len__(self): return len(self.l) def __repr_n__(self, n=None): s = ["{",] try: iter = self.iteritems() x = iter.next() s.append(str(x[0])); s.append(": "); s.append(str(x[1])) i = 1 while (n is None) or (i < n): x = iter.next() s.append(", "); s.append(str(x[0])); s.append(": "); s.append(str(x[1])) except StopIteration: pass s.append("}") return ''.join(s) def __repr__(self): return "<%s %s>" % (self.__class__.__name__, self.__repr_n__(),) def __str__(self): return "<%s %s>" % (self.__class__.__name__, self.__repr_n__(16),) def __eq__(self, other): for (k, v,) in other.iteritems(): if not self.d.has_key(k) or self.d[k] != v: return False return True def __ne__(self, other): return not self.__eq__(other) def _assert_invariants(self): iter = self.l.__iter__() try: oldx = iter.next() while True: x = iter.next() # self.l is required to be sorted _assert(x >= oldx, x, oldx) # every element of self.l is required to appear in self.d _assert(self.d.has_key(x[1]), x) oldx =x except StopIteration: pass for (k, v,) in self.d.iteritems(): i = bisect_left(self.l, (v, k,)) while (self.l[i][0] is not v) or (self.l[i][1] is not k): i += 1 _assert(i < len(self.l), i, len(self.l), k, v, self.l) _assert(self.l[i][0] is v, i, v, l=self.l, d=self.d) _assert(self.l[i][1] is k, i, k, l=self.l, d=self.d) return True def insert(self, key, val=None): assert self._assert_invariants() result = self.__setitem__(key, val) assert self._assert_invariants() return result def setdefault(self, key, default=None): assert self._assert_invariants() if not self.has_key(key): self[key] = default assert self._assert_invariants() return self[key] def __setitem__(self, key, val=None): assert self._assert_invariants() if self.d.has_key(key): oldval = self.d[key] if oldval != val: # re-sort i = bisect_left(self.l, (oldval, key,)) while (self.l[i][0] is not oldval) or (self.l[i][1] is not key): i += 1 self.l.pop(i) insort_left(self.l, (val, key,)) elif oldval is not val: # replace i = bisect_left(self.l, (oldval, key,)) while (self.l[i][0] is not oldval) or (self.l[i][1] is not key): i += 1 self.l[i] = (val, key,) else: insort_left(self.l, (val, key,)) self.d[key] = val assert self._assert_invariants() return val def remove(self, key, default=None, strictkey=True): assert self._assert_invariants() result = self.__delitem__(key, default, strictkey) assert self._assert_invariants() return result def __getitem__(self, key, default=None, strictkey=True): if not self.d.has_key(key): if strictkey: raise KeyError, key else: return default return self.d[key] def __delitem__(self, key, default=None, strictkey=True): """ @param strictkey: True if you want a KeyError in the case that key is not there, False if you want a reference to default in the case that key is not there @param default: the object to return if key is not there; This is ignored if strictkey. 
@return: the object removed or default if there is not item by that key and strictkey is False """ assert self._assert_invariants() if self.d.has_key(key): val = self.d.pop(key) i = bisect_left(self.l, (val, key,)) while (self.l[i][0] is not val) or (self.l[i][1] is not key): i += 1 self.l.pop(i) assert self._assert_invariants() return val elif strictkey: assert self._assert_invariants() raise KeyError, key else: assert self._assert_invariants() return default def clear(self): assert self._assert_invariants() self.d.clear() del self.l[:] assert self._assert_invariants() def update(self, otherdict): """ @return: self """ assert self._assert_invariants() for (k, v,) in otherdict.iteritems(): self.insert(k, v) assert self._assert_invariants() return self def has_key(self, key): assert self._assert_invariants() return self.d.has_key(key) def popitem(self): if not self.l: raise KeyError, 'popitem(): dictionary is empty' le = self.l.pop(0) del self.d[le[1]] return (le[1], le[0],) def pop(self, k, default=None, strictkey=False): if not self.d.has_key(k): if strictkey: raise KeyError, k else: return default v = self.d.pop(k) i = bisect_left(self.l, (v, k,)) while (self.l[i][0] is not v) or (self.l[i][1] is not k): i += 1 self.l.pop(i) return v def pop_from_list(self, i=0): le = self.l.pop(i) del self.d[le[1]] return le[1] class AuxValueDict(dict): """I behave like a regular dict, but each key is associated with two values: the main value, and an auxilliary one. Setting the main value (with the usual d[key]=value) clears the auxvalue. You can set both main and auxvalue at the same time, and can retrieve the values separately. The main use case is a dictionary that represents unpacked child values for a directory node, where a common pattern is to modify one or more children and then pass the dict back to a packing function. The original packed representation can be cached in the auxvalue, and the packing function can use it directly on all unmodified children. On large directories with a complex packing function, this can save considerable time.""" def __init__(self, *args, **kwargs): super(AuxValueDict, self).__init__(*args, **kwargs) self.auxilliary = {} def __setitem__(self, key, value): super(AuxValueDict, self).__setitem__(key, value) self.auxilliary[key] = None # clear the auxvalue def __delitem__(self, key): super(AuxValueDict, self).__delitem__(key) self.auxilliary.pop(key) def get_aux(self, key, default=None): """Retrieve the auxilliary value. There is no way to distinguish between an auxvalue of 'None' and a key that does not have an auxvalue, and get_aux() will not raise KeyError when called with a missing key.""" return self.auxilliary.get(key, default) def set_with_aux(self, key, value, auxilliary): """Set both the main value and the auxilliary value. There is no way to distinguish between an auxvalue of 'None' and a key that does not have an auxvalue.""" super(AuxValueDict, self).__setitem__(key, value) self.auxilliary[key] = auxilliary allmydata-tahoe-1.10.2/src/allmydata/util/assertutil.py0000644000175000017500000000513412556560070021257 0ustar ramram""" Tests useful in assertion checking, prints out nicely formated messages too. 
""" from allmydata.util.humanreadable import hr def _assert(___cond=False, *___args, **___kwargs): if ___cond: return True msgbuf=[] if ___args: msgbuf.append("%s %s" % tuple(map(hr, (___args[0], type(___args[0]),)))) msgbuf.extend([", %s %s" % tuple(map(hr, (arg, type(arg),))) for arg in ___args[1:]]) if ___kwargs: msgbuf.append(", %s: %s %s" % ((___kwargs.items()[0][0],) + tuple(map(hr, (___kwargs.items()[0][1], type(___kwargs.items()[0][1]),))))) else: if ___kwargs: msgbuf.append("%s: %s %s" % ((___kwargs.items()[0][0],) + tuple(map(hr, (___kwargs.items()[0][1], type(___kwargs.items()[0][1]),))))) msgbuf.extend([", %s: %s %s" % tuple(map(hr, (k, v, type(v),))) for k, v in ___kwargs.items()[1:]]) raise AssertionError, "".join(msgbuf) def precondition(___cond=False, *___args, **___kwargs): if ___cond: return True msgbuf=["precondition", ] if ___args or ___kwargs: msgbuf.append(": ") if ___args: msgbuf.append("%s %s" % tuple(map(hr, (___args[0], type(___args[0]),)))) msgbuf.extend([", %s %s" % tuple(map(hr, (arg, type(arg),))) for arg in ___args[1:]]) if ___kwargs: msgbuf.append(", %s: %s %s" % ((___kwargs.items()[0][0],) + tuple(map(hr, (___kwargs.items()[0][1], type(___kwargs.items()[0][1]),))))) else: if ___kwargs: msgbuf.append("%s: %s %s" % ((___kwargs.items()[0][0],) + tuple(map(hr, (___kwargs.items()[0][1], type(___kwargs.items()[0][1]),))))) msgbuf.extend([", %s: %s %s" % tuple(map(hr, (k, v, type(v),))) for k, v in ___kwargs.items()[1:]]) raise AssertionError, "".join(msgbuf) def postcondition(___cond=False, *___args, **___kwargs): if ___cond: return True msgbuf=["postcondition", ] if ___args or ___kwargs: msgbuf.append(": ") if ___args: msgbuf.append("%s %s" % tuple(map(hr, (___args[0], type(___args[0]),)))) msgbuf.extend([", %s %s" % tuple(map(hr, (arg, type(arg),))) for arg in ___args[1:]]) if ___kwargs: msgbuf.append(", %s: %s %s" % ((___kwargs.items()[0][0],) + tuple(map(hr, (___kwargs.items()[0][1], type(___kwargs.items()[0][1]),))))) else: if ___kwargs: msgbuf.append("%s: %s %s" % ((___kwargs.items()[0][0],) + tuple(map(hr, (___kwargs.items()[0][1], type(___kwargs.items()[0][1]),))))) msgbuf.extend([", %s: %s %s" % tuple(map(hr, (k, v, type(v),))) for k, v in ___kwargs.items()[1:]]) raise AssertionError, "".join(msgbuf) allmydata-tahoe-1.10.2/src/allmydata/util/idlib.py0000644000175000017500000000034212556560070020137 0ustar ramram from foolscap import base32 def nodeid_b2a(nodeid): # we display nodeids using the same base32 alphabet that Foolscap uses return base32.encode(nodeid) def shortnodeid_b2a(nodeid): return nodeid_b2a(nodeid)[:8] allmydata-tahoe-1.10.2/src/allmydata/util/netstring.py0000644000175000017500000000307512556560070021077 0ustar ramram def netstring(s): assert isinstance(s, str), s # no unicode here return "%d:%s," % (len(s), s,) def split_netstring(data, numstrings, position=0, required_trailer=None): """like string.split(), but extracts netstrings. Ignore all bytes of data before the 'position' byte. Return a tuple of (list of elements (numstrings in length), new position index). The new position index points to the first byte which was not consumed (the 'required_trailer', if any, counts as consumed). 
If 'required_trailer' is not None, throw ValueError if leftover data does not exactly equal 'required_trailer'.""" assert type(position) in (int, long), (repr(position), type(position)) elements = [] assert numstrings >= 0 while position < len(data): colon = data.index(":", position) length = int(data[position:colon]) string = data[colon+1:colon+1+length] assert len(string) == length, (len(string), length) elements.append(string) position = colon+1+length assert data[position] == ",", position position += 1 if len(elements) == numstrings: break if len(elements) < numstrings: raise ValueError("ran out of netstrings") if required_trailer is not None: if ((len(data) - position) != len(required_trailer)) or (data[position:] != required_trailer): raise ValueError("leftover data in netstrings") return (elements, position + len(required_trailer)) else: return (elements, position) allmydata-tahoe-1.10.2/src/allmydata/util/consumer.py0000644000175000017500000000162612556560070020715 0ustar ramram """This file defines a basic download-to-memory consumer, suitable for use in a filenode's read() method. See download_to_data() for an example of its use. """ from zope.interface import implements from twisted.internet.interfaces import IConsumer class MemoryConsumer: implements(IConsumer) def __init__(self): self.chunks = [] self.done = False def registerProducer(self, p, streaming): self.producer = p if streaming: # call resumeProducing once to start things off p.resumeProducing() else: while not self.done: p.resumeProducing() def write(self, data): self.chunks.append(data) def unregisterProducer(self): self.done = True def download_to_data(n, offset=0, size=None): d = n.read(MemoryConsumer(), offset, size) d.addCallback(lambda mc: "".join(mc.chunks)) return d allmydata-tahoe-1.10.2/src/allmydata/util/fake_inotify.py0000644000175000017500000000664012556560070021532 0ustar ramram # Most of this is copied from Twisted 11.0. The reason for this hack is that # twisted.internet.inotify can't be imported when the platform does not support inotify. 
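# An illustrative example of the helper defined below:
#   humanReadableMask(IN_CLOSE_WRITE | IN_ISDIR)  ->  ['close_write', 'is_dir']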
# from /usr/src/linux/include/linux/inotify.h IN_ACCESS = 0x00000001L # File was accessed IN_MODIFY = 0x00000002L # File was modified IN_ATTRIB = 0x00000004L # Metadata changed IN_CLOSE_WRITE = 0x00000008L # Writeable file was closed IN_CLOSE_NOWRITE = 0x00000010L # Unwriteable file closed IN_OPEN = 0x00000020L # File was opened IN_MOVED_FROM = 0x00000040L # File was moved from X IN_MOVED_TO = 0x00000080L # File was moved to Y IN_CREATE = 0x00000100L # Subfile was created IN_DELETE = 0x00000200L # Subfile was delete IN_DELETE_SELF = 0x00000400L # Self was deleted IN_MOVE_SELF = 0x00000800L # Self was moved IN_UNMOUNT = 0x00002000L # Backing fs was unmounted IN_Q_OVERFLOW = 0x00004000L # Event queued overflowed IN_IGNORED = 0x00008000L # File was ignored IN_ONLYDIR = 0x01000000 # only watch the path if it is a directory IN_DONT_FOLLOW = 0x02000000 # don't follow a sym link IN_MASK_ADD = 0x20000000 # add to the mask of an already existing watch IN_ISDIR = 0x40000000 # event occurred against dir IN_ONESHOT = 0x80000000 # only send event once IN_CLOSE = IN_CLOSE_WRITE | IN_CLOSE_NOWRITE # closes IN_MOVED = IN_MOVED_FROM | IN_MOVED_TO # moves IN_CHANGED = IN_MODIFY | IN_ATTRIB # changes IN_WATCH_MASK = (IN_MODIFY | IN_ATTRIB | IN_CREATE | IN_DELETE | IN_DELETE_SELF | IN_MOVE_SELF | IN_UNMOUNT | IN_MOVED_FROM | IN_MOVED_TO) _FLAG_TO_HUMAN = [ (IN_ACCESS, 'access'), (IN_MODIFY, 'modify'), (IN_ATTRIB, 'attrib'), (IN_CLOSE_WRITE, 'close_write'), (IN_CLOSE_NOWRITE, 'close_nowrite'), (IN_OPEN, 'open'), (IN_MOVED_FROM, 'moved_from'), (IN_MOVED_TO, 'moved_to'), (IN_CREATE, 'create'), (IN_DELETE, 'delete'), (IN_DELETE_SELF, 'delete_self'), (IN_MOVE_SELF, 'move_self'), (IN_UNMOUNT, 'unmount'), (IN_Q_OVERFLOW, 'queue_overflow'), (IN_IGNORED, 'ignored'), (IN_ONLYDIR, 'only_dir'), (IN_DONT_FOLLOW, 'dont_follow'), (IN_MASK_ADD, 'mask_add'), (IN_ISDIR, 'is_dir'), (IN_ONESHOT, 'one_shot') ] def humanReadableMask(mask): """ Auxiliary function that converts an hexadecimal mask into a series of human readable flags. """ s = [] for k, v in _FLAG_TO_HUMAN: if k & mask: s.append(v) return s # This class is not copied from Twisted; it acts as a mock. class INotify(object): def startReading(self): pass def stopReading(self): pass def watch(self, filepath, mask=IN_WATCH_MASK, autoAdd=False, callbacks=None, recursive=False): self.callbacks = callbacks def event(self, filepath, mask): for cb in self.callbacks: cb(None, filepath, mask) __all__ = ["INotify", "humanReadableMask", "IN_WATCH_MASK", "IN_ACCESS", "IN_MODIFY", "IN_ATTRIB", "IN_CLOSE_NOWRITE", "IN_CLOSE_WRITE", "IN_OPEN", "IN_MOVED_FROM", "IN_MOVED_TO", "IN_CREATE", "IN_DELETE", "IN_DELETE_SELF", "IN_MOVE_SELF", "IN_UNMOUNT", "IN_Q_OVERFLOW", "IN_IGNORED", "IN_ONLYDIR", "IN_DONT_FOLLOW", "IN_MASK_ADD", "IN_ISDIR", "IN_ONESHOT", "IN_CLOSE", "IN_MOVED", "IN_CHANGED"] allmydata-tahoe-1.10.2/src/allmydata/util/log.py0000644000175000017500000000422212556560070017636 0ustar ramramfrom allmydata.util import nummedobj from foolscap.logging import log from twisted.python import log as tw_log NOISY = log.NOISY # 10 OPERATIONAL = log.OPERATIONAL # 20 UNUSUAL = log.UNUSUAL # 23 INFREQUENT = log.INFREQUENT # 25 CURIOUS = log.CURIOUS # 28 WEIRD = log.WEIRD # 30 SCARY = log.SCARY # 35 BAD = log.BAD # 40 msg = log.msg # If log.err() happens during a unit test, the unit test should fail. We # accomplish this by sending it to twisted.log too. When a WEIRD/SCARY/BAD # thing happens that is nevertheless handled, use log.msg(failure=f, # level=WEIRD) instead. 
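# ---------------------------------------------------------------------------
# Editor-added illustration, not part of the original file: a minimal sketch
# of the convention described in the comment above, using the msg() and WEIRD
# names defined earlier in this module.  The Deferred argument and the helper
# name are invented for this sketch.
def _example_handled_weirdness(d):
    def _handle(f):
        # A WEIRD-but-handled problem: record it without also copying it to
        # twisted.log, so a running unit test does not fail because of it.
        msg(failure=f, level=WEIRD)
        return None   # swallow the failure; it has been handled
    d.addErrback(_handle)
    # By contrast, d.addErrback(err) -- see err() below -- also sends the
    # failure to twisted.log, which is what makes a unit test fail.
    return d
# ---------------------------------------------------------------------------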
def err(failure=None, _why=None, **kwargs): tw_log.err(failure, _why, **kwargs) if 'level' not in kwargs: kwargs['level'] = log.UNUSUAL return log.err(failure, _why, **kwargs) class LogMixin(object): """ I remember a msg id and a facility and pass them to log.msg() """ def __init__(self, facility=None, grandparentmsgid=None): self._facility = facility self._grandparentmsgid = grandparentmsgid self._parentmsgid = None def log(self, msg, facility=None, parent=None, *args, **kwargs): if facility is None: facility = self._facility if parent is None: pmsgid = self._parentmsgid if pmsgid is None: pmsgid = self._grandparentmsgid msgid = log.msg(msg, facility=facility, parent=pmsgid, *args, **kwargs) if self._parentmsgid is None: self._parentmsgid = msgid return msgid class PrefixingLogMixin(nummedobj.NummedObj, LogMixin): """ I prepend a prefix to each msg, which includes my class and instance number as well as a prefix supplied by my subclass. """ def __init__(self, facility=None, grandparentmsgid=None, prefix=''): nummedobj.NummedObj.__init__(self) LogMixin.__init__(self, facility, grandparentmsgid) if prefix: self._prefix = "%s(%s): " % (self.__repr__(), prefix) else: self._prefix = "%s: " % (self.__repr__(),) def log(self, msg="", facility=None, parent=None, *args, **kwargs): return LogMixin.log(self, self._prefix + msg, facility, parent, *args, **kwargs) allmydata-tahoe-1.10.2/src/allmydata/util/dbutil.py0000644000175000017500000000417312556560070020345 0ustar ramram import os, sys import sqlite3 from sqlite3 import IntegrityError [IntegrityError] class DBError(Exception): pass def get_db(dbfile, stderr=sys.stderr, create_version=(None, None), updaters={}, just_create=False, dbname="db", journal_mode=None, synchronous=None): """Open or create the given db file. The parent directory must exist. create_version=(SCHEMA, VERNUM), and SCHEMA must have a 'version' table. Updaters is a {newver: commands} mapping, where e.g. updaters[2] is used to get from ver=1 to ver=2. Returns a (sqlite3,db) tuple, or raises DBError. """ must_create = not os.path.exists(dbfile) try: db = sqlite3.connect(dbfile) except (EnvironmentError, sqlite3.OperationalError), e: raise DBError("Unable to create/open %s file %s: %s" % (dbname, dbfile, e)) schema, target_version = create_version c = db.cursor() # Enabling foreign keys allows stricter integrity checking. # The default is unspecified according to . c.execute("PRAGMA foreign_keys = ON;") if journal_mode is not None: c.execute("PRAGMA journal_mode = %s;" % (journal_mode,)) if synchronous is not None: c.execute("PRAGMA synchronous = %s;" % (synchronous,)) if must_create: c.executescript(schema) c.execute("INSERT INTO version (version) VALUES (?)", (target_version,)) db.commit() try: c.execute("SELECT version FROM version") version = c.fetchone()[0] except sqlite3.DatabaseError, e: # this indicates that the file is not a compatible database format. # Perhaps it was created with an old version, or it might be junk. 
raise DBError("%s file is unusable: %s" % (dbname, e)) if just_create: # for tests return (sqlite3, db) while version < target_version and version+1 in updaters: c.executescript(updaters[version+1]) db.commit() version = version+1 if version != target_version: raise DBError("Unable to handle %s version %s" % (dbname, version)) return (sqlite3, db) allmydata-tahoe-1.10.2/src/allmydata/util/statistics.py0000644000175000017500000002127012556560070021251 0ustar ramram# Copyright (c) 2009 Shawn Willden # mailto:shawn@willden.org # I hereby license all patches I have contributed or will contribute to the # Allmydata Tahoe-LAFS project, including the file 'statistics.py', under # either the GNU General Public License, version 2 or later, or under the # Transitive Grace Period Public License, version 1 or later. from __future__ import division from allmydata.util.mathutil import round_sigfigs import math import sys def pr_file_loss(p_list, k): """ Probability of single-file loss for shares with reliabilities in p_list. Computes the probability that a single file will become unrecoverable, based on the individual share survival probabilities and and k (number of shares needed for recovery). Example: pr_file_loss([.9] * 5 + [.99] * 5, 3) returns the probability that a file with k=3, N=10 and stored on five servers with reliability .9 and five servers with reliability .99 is lost. See survival_pmf docstring for important statistical assumptions. """ assert 0 < k <= len(p_list) assert valid_probability_list(p_list) # Sum elements 0 through k-1 of the share set PMF to get the # probability that less than k shares survived. return sum(survival_pmf(p_list)[0:k]) def survival_pmf(p_list): """ Return the collective PMF of share survival count for a set of shares with the individual survival probabilities in p_list. Example: survival_pmf([.99] * 10 + [.8] * 6) returns the probability mass function for the number of shares that will survive from an initial set of 16, 10 with p=0.99 and 6 with p=0.8. The ith element of the resulting list is the probability that exactly i shares will survive. This calculation makes the following assumptions: 1. p_list[i] is the probability that any individual share will will survive during the time period in question (whatever that may be). 2. The share failures are "independent", in the statistical sense. Note that if a group of shares are stored on the same machine or even in the same data center, they are NOT independent and this calculation is therefore wrong. """ assert valid_probability_list(p_list) pmf = survival_pmf_via_conv(p_list) assert valid_pmf(pmf) return pmf def survival_pmf_via_bd(p_list): """ Compute share survival PMF using the binomial distribution PMF as much as possible. This is more efficient than the convolution method below, but doesn't work for large numbers of shares because the binomial_coeff calculation blows up. Since the efficiency gains only matter in the case of large numbers of shares, it's pretty much useless except for testing the convolution methond. Note that this function does little to no error checking and is intended for internal use and testing only. """ pmf_list = [ binomial_distribution_pmf(p_list.count(p), p) for p in set(p_list) ] return reduce(convolve, pmf_list) def survival_pmf_via_conv(p_list): """ Compute share survival PMF using iterated convolution of trivial PMFs. Note that this function does little to no error checking and is intended for internal use and testing only. 
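    Editor-added doctest, not in the original docstring; the expected output
    follows directly from convolving [0.5, 0.5] with itself:

    >>> survival_pmf_via_conv([0.5, 0.5])
    [0.25, 0.5, 0.25]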
""" pmf_list = [ [1 - p, p] for p in p_list ]; return reduce(convolve, pmf_list) def print_pmf(pmf, n=4, out=sys.stdout): """ Print a PMF in a readable form, with values rounded to n significant digits. """ for k, p in enumerate(pmf): print >>out, "i=" + str(k) + ":", round_sigfigs(p, n) def pr_backup_file_loss(p_list, backup_p, k): """ Probability of single-file loss in a backup context Same as pr_file_loss, except it factors in the probability of survival of the original source, specified as backup_p. Because that's a precondition to caring about the availability of the backup, it's an independent event. """ assert valid_probability_list(p_list) assert 0 < backup_p <= 1 assert 0 < k <= len(p_list) return pr_file_loss(p_list, k) * (1 - backup_p) def find_k(p_list, target_loss_prob): """ Find the highest k value that achieves the targeted loss probability, given the share reliabilities given in p_list. """ assert valid_probability_list(p_list) assert 0 < target_loss_prob < 1 pmf = survival_pmf(p_list) return find_k_from_pmf(pmf, target_loss_prob) def find_k_from_pmf(pmf, target_loss_prob): """ Find the highest k value that achieves the targeted loss probability, given the share survival PMF given in pmf. """ assert valid_pmf(pmf) assert 0 < target_loss_prob < 1 loss_prob = 0.0 for k, p_k in enumerate(pmf): loss_prob += p_k if loss_prob > target_loss_prob: return k # we shouldn't be able to get here, since sum(pmf)==1.0 k = len(pmf) - 1 return k def repair_count_pmf(survival_pmf, k): """ Return Pr[D=d], where D represents the number of shares that have to be repaired at the end of an interval, starting with a full set and subject to losses described in survival_pmf. """ n = len(survival_pmf) - 1 # Probability of 0 to repair is the probability of all shares # surviving plus the probability of less than k surviving. pmf = [ survival_pmf[n] + sum(survival_pmf[0:k]) ] # Probability of more than 0, up to N-k to repair for i in range(1, n-k+1): pmf.append(survival_pmf[n-i]) # Probability of more than N-k to repair is 0, because that means # there are less than k available and the file is irreparable. for i in range(n-k+1, n+1): pmf.append(0.0) assert(valid_pmf(pmf)) return pmf def bandwidth_cost_function(file_size, shares, k, ul_dl_ratio): return file_size + float(file_size) / k * shares * ul_dl_ratio def mean_repair_cost(cost_function, file_size, survival_pmf, k, ul_dl_ratio): """ Return the expected cost for a repair run on a file with the given survival_pmf and requiring k shares, in which upload cost is 'ul_dl_ratio' times download cost. """ repair_pmf = repair_count_pmf(survival_pmf, k) expected_cost = sum([cost_function(file_size, new_shares, k, ul_dl_ratio) * repair_pmf[new_shares] for new_shares in range(1, len(repair_pmf))]) return expected_cost def eternal_repair_cost(cost_function, file_size, survival_pmf, k, discount_rate=0, ul_dl_ratio=1.0): """ Calculate the eternal repair cost for a file that is aggressively repaired, i.e. the sum of repair costs until the file is dead. """ c = mean_repair_cost(cost_function, file_size, survival_pmf, k, ul_dl_ratio) f = 1 - sum(survival_pmf[0:k]) r = float(discount_rate) return (c * (1-r)) / (1 - (1-r) * f) def valid_pmf(pmf): """ Validate that pmf looks like a proper discrete probability mass function in list form. Returns true if the elements of pmf sum to 1. 
""" return round(sum(pmf),5) == 1.0 def valid_probability_list(p_list): """ Validate that p_list is a list of probibilities """ for p in p_list: if p < 0 or p > 1: return False return True def convolve(list_a, list_b): """ Returns the discrete convolution of two lists. Given two random variables X and Y, the convolution of their probability mass functions Pr(X) and Pr(Y) is equal to the Pr(X+Y). """ n = len(list_a) m = len(list_b) result = [] for i in range(n + m - 1): sum = 0.0 lower = max(0, i - n + 1) upper = min(m - 1, i) for j in range(lower, upper+1): sum += list_a[i-j] * list_b[j] result.append(sum) return result def binomial_distribution_pmf(n, p): """ Returns Pr(K), where K ~ B(n,p), as a list of values. Returns the full probability mass function of a B(n, p) as a list of values, where the kth element is Pr(K=k), or, in the Tahoe context, the probability that exactly k copies of a file share survive, when placed on n independent servers with survival probability p. """ assert p >= 0 and p <= 1, 'p=%s must be in the range [0,1]'%p assert n > 0 result = [] for k in range(n+1): result.append(math.pow(p , k ) * math.pow(1 - p, n - k) * binomial_coeff(n, k)) assert valid_pmf(result) return result; def binomial_coeff(n, k): """ Returns the number of ways that k items can be chosen from a set of n. """ assert n >= k if k > n/2: k = n - k accum = 1.0 for i in range(1, k+1): accum = accum * (n - k + i) // i; return int(accum + 0.5) allmydata-tahoe-1.10.2/src/allmydata/util/pollmixin.py0000644000175000017500000000366212556560070021077 0ustar ramram import time from twisted.internet import task class TimeoutError(Exception): pass class PollComplete(Exception): pass class PollMixin: _poll_should_ignore_these_errors = [] def poll(self, check_f, pollinterval=0.01, timeout=1000): # Return a Deferred, then call check_f periodically until it returns # True, at which point the Deferred will fire.. If check_f raises an # exception, the Deferred will errback. If the check_f does not # indicate success within timeout= seconds, the Deferred will # errback. If timeout=None, no timeout will be enforced, and the loop # will poll forever (or really until Trial times out). cutoff = None if timeout is not None: cutoff = time.time() + timeout lc = task.LoopingCall(self._poll, check_f, cutoff) d = lc.start(pollinterval) def _convert_done(f): f.trap(PollComplete) return None d.addErrback(_convert_done) return d def _poll(self, check_f, cutoff): if cutoff is not None and time.time() > cutoff: raise TimeoutError("PollMixin never saw %s return True" % check_f) if check_f(): raise PollComplete() # since PollMixin is mostly used for unit tests, quit if we see any # logged errors. This should give us a nice fast failure, instead of # waiting for a timeout. 
Tests which use flushLoggedErrors() will # need to warn us by putting the error types they'll be ignoring in # self._poll_should_ignore_these_errors if hasattr(self, "_observer") and hasattr(self._observer, "getErrors"): errs = [] for e in self._observer.getErrors(): if not e.check(*self._poll_should_ignore_these_errors): errs.append(e) if errs: print errs self.fail("Errors snooped, terminating early") allmydata-tahoe-1.10.2/src/allmydata/util/iputil.py0000644000175000017500000002115412556560070020366 0ustar ramram# from the Python Standard Library import os, re, socket, subprocess, errno from sys import platform # from Twisted from twisted.internet import defer, threads, reactor from twisted.internet.protocol import DatagramProtocol from twisted.internet.error import CannotListenError from twisted.python.procutils import which from twisted.python import log try: import resource def increase_rlimits(): # We'd like to raise our soft resource.RLIMIT_NOFILE, since certain # systems (OS-X, probably solaris) start with a relatively low limit # (256), and some unit tests want to open up more sockets than this. # Most linux systems start with both hard and soft limits at 1024, # which is plenty. # unfortunately the values to pass to setrlimit() vary widely from # one system to another. OS-X reports (256, HUGE), but the real hard # limit is 10240, and accepts (-1,-1) to mean raise it to the # maximum. Cygwin reports (256, -1), then ignores a request of # (-1,-1): instead you have to guess at the hard limit (it appears to # be 3200), so using (3200,-1) seems to work. Linux reports a # sensible (1024,1024), then rejects (-1,-1) as trying to raise the # maximum limit, so you could set it to (1024,1024) but you might as # well leave it alone. try: current = resource.getrlimit(resource.RLIMIT_NOFILE) except AttributeError: # we're probably missing RLIMIT_NOFILE return if current[0] >= 1024: # good enough, leave it alone return try: if current[1] > 0 and current[1] < 1000000: # solaris reports (256, 65536) resource.setrlimit(resource.RLIMIT_NOFILE, (current[1], current[1])) else: # this one works on OS-X (bsd), and gives us 10240, but # it doesn't work on linux (on which both the hard and # soft limits are set to 1024 by default). resource.setrlimit(resource.RLIMIT_NOFILE, (-1,-1)) new = resource.getrlimit(resource.RLIMIT_NOFILE) if new[0] == current[0]: # probably cygwin, which ignores -1. Use a real value. resource.setrlimit(resource.RLIMIT_NOFILE, (3200,-1)) except ValueError: log.msg("unable to set RLIMIT_NOFILE: current value %s" % (resource.getrlimit(resource.RLIMIT_NOFILE),)) except: # who knows what. It isn't very important, so log it and continue log.err() except ImportError: def _increase_rlimits(): # TODO: implement this for Windows. Although I suspect the # solution might be "be running under the iocp reactor and # make this function be a no-op". pass # pyflakes complains about two 'def FOO' statements in the same time, # since one might be shadowing the other. This hack appeases pyflakes. increase_rlimits = _increase_rlimits def get_local_addresses_async(target="198.41.0.4"): # A.ROOT-SERVERS.NET """ Return a Deferred that fires with a list of IPv4 addresses (as dotted-quad strings) that are currently configured on this host, sorted in descending order of how likely we think they are to work. 
@param target: we want to learn an IP address they could try using to connect to us; The default value is fine, but it might help if you pass the address of a host that you are actually trying to be reachable to. """ addresses = [] local_ip = get_local_ip_for(target) if local_ip is not None: addresses.append(local_ip) if platform == "cygwin": d = _cygwin_hack_find_addresses() else: d = _find_addresses_via_config() def _collect(res): for addr in res: if addr != "0.0.0.0" and not addr in addresses: addresses.append(addr) return addresses d.addCallback(_collect) return d def get_local_ip_for(target): """Find out what our IP address is for use by a given target. @return: the IP address as a dotted-quad string which could be used by to connect to us. It might work for them, it might not. If there is no suitable address (perhaps we don't currently have an externally-visible interface), this will return None. """ try: target_ipaddr = socket.gethostbyname(target) except socket.gaierror: # DNS isn't running, or somehow we encountered an error # note: if an interface is configured and up, but nothing is # connected to it, gethostbyname("A.ROOT-SERVERS.NET") will take 20 # seconds to raise socket.gaierror . This is synchronous and occurs # for each node being started, so users of # test.common.SystemTestMixin (like test_system) will see something # like 120s of delay, which may be enough to hit the default trial # timeouts. For that reason, get_local_addresses_async() was changed # to default to the numerical ip address for A.ROOT-SERVERS.NET, to # avoid this DNS lookup. This also makes node startup fractionally # faster. return None try: udpprot = DatagramProtocol() port = reactor.listenUDP(0, udpprot) udpprot.transport.connect(target_ipaddr, 7) localip = udpprot.transport.getHost().host d = port.stopListening() d.addErrback(log.err) except (socket.error, CannotListenError): # no route to that host localip = None return localip # Wow, I'm really amazed at home much mileage we've gotten out of calling # the external route.exe program on windows... It appears to work on all # versions so far. # ... thus wrote Greg Smith in time immemorial... # Also, the Win32 APIs for this are really klunky and error-prone. --Daira _win32_re = re.compile(r'^\s*\d+\.\d+\.\d+\.\d+\s.+\s(?P
<address>\d+\.\d+\.\d+\.\d+)\s+(?P<metric>\d+)\s*$', flags=re.M|re.I|re.S) _win32_commands = (('route.exe', ('print',), _win32_re),) # These work in most Unices. _addr_re = re.compile(r'^\s*inet [a-zA-Z]*:?(?P<address>
\d+\.\d+\.\d+\.\d+)[\s/].+$', flags=re.M|re.I|re.S) _unix_commands = (('/bin/ip', ('addr',), _addr_re), ('/sbin/ifconfig', ('-a',), _addr_re), ('/usr/sbin/ifconfig', ('-a',), _addr_re), ('/usr/etc/ifconfig', ('-a',), _addr_re), ('ifconfig', ('-a',), _addr_re), ('/sbin/ifconfig', (), _addr_re), ) def _find_addresses_via_config(): return threads.deferToThread(_synchronously_find_addresses_via_config) def _synchronously_find_addresses_via_config(): # originally by Greg Smith, hacked by Zooko and then Daira # We don't reach here for cygwin. if platform == 'win32': commands = _win32_commands else: commands = _unix_commands for (pathtotool, args, regex) in commands: # If pathtotool is a fully qualified path then we just try that. # If it is merely an executable name then we use Twisted's # "which()" utility and try each executable in turn until one # gives us something that resembles a dotted-quad IPv4 address. if os.path.isabs(pathtotool): exes_to_try = [pathtotool] else: exes_to_try = which(pathtotool) for exe in exes_to_try: try: addresses = _query(exe, args, regex) except Exception: addresses = [] if addresses: return addresses return [] def _query(path, args, regex): if not os.path.isfile(path): return [] env = {'LANG': 'en_US.UTF-8'} TRIES = 5 for trial in xrange(TRIES): try: p = subprocess.Popen([path] + list(args), stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) (output, err) = p.communicate() break except OSError, e: if e.errno == errno.EINTR and trial < TRIES-1: continue raise addresses = [] outputsplit = output.split('\n') for outline in outputsplit: m = regex.match(outline) if m: addr = m.group('address') if addr not in addresses: addresses.append(addr) return addresses def _cygwin_hack_find_addresses(): addresses = [] for h in ["localhost", "127.0.0.1",]: addr = get_local_ip_for(h) if addr is not None and addr not in addresses: addresses.append(addr) return defer.succeed(addresses) allmydata-tahoe-1.10.2/src/allmydata/util/deferredutil.py0000644000175000017500000001413312556560070021535 0ustar ramram import time from foolscap.api import eventually, fireEventually from twisted.internet import defer, reactor from allmydata.util import log from allmydata.util.pollmixin import PollMixin # utility wrapper for DeferredList def _check_deferred_list(results): # if any of the component Deferreds failed, return the first failure such # that an addErrback() would fire. If all were ok, return a list of the # results (without the success/failure booleans) for success,f in results: if not success: return f return [r[1] for r in results] def DeferredListShouldSucceed(dl): d = defer.DeferredList(dl) d.addCallback(_check_deferred_list) return d def _parseDListResult(l): return [x[1] for x in l] def _unwrapFirstError(f): f.trap(defer.FirstError) raise f.value.subFailure def gatherResults(deferredList): """Returns list with result of given Deferreds. This builds on C{DeferredList} but is useful since you don't need to parse the result for success/failure. @type deferredList: C{list} of L{Deferred}s """ d = defer.DeferredList(deferredList, fireOnOneErrback=True, consumeErrors=True) d.addCallbacks(_parseDListResult, _unwrapFirstError) return d def _with_log(op, res): """ The default behaviour on firing an already-fired Deferred is unhelpful for debugging, because the AlreadyCalledError can easily get lost or be raised in a context that results in a different error. So make sure it is logged (for the abstractions defined here). If we are in a test, log.err will cause the test to fail. 
""" try: op(res) except defer.AlreadyCalledError, e: log.err(e, op=repr(op), level=log.WEIRD) def eventually_callback(d): def _callback(res): eventually(_with_log, d.callback, res) return res return _callback def eventually_errback(d): def _errback(res): eventually(_with_log, d.errback, res) return res return _errback def eventual_chain(source, target): source.addCallbacks(eventually_callback(target), eventually_errback(target)) class HookMixin: """ I am a helper mixin that maintains a collection of named hooks, primarily for use in tests. Each hook is set to an unfired Deferred using 'set_hook', and can then be fired exactly once at the appropriate time by '_call_hook'. I assume a '_hooks' attribute that should set by the class constructor to a dict mapping each valid hook name to None. """ def set_hook(self, name, d=None): """ Called by the hook observer (e.g. by a test). If d is not given, an unfired Deferred is created and returned. The hook must not already be set. """ if d is None: d = defer.Deferred() assert self._hooks[name] is None, self._hooks[name] assert isinstance(d, defer.Deferred), d self._hooks[name] = d return d def _call_hook(self, res, name): """ Called to trigger the hook, with argument 'res'. This is a no-op if the hook is unset. Otherwise, the hook will be unset, and then its Deferred will be fired synchronously. The expected usage is "deferred.addBoth(self._call_hook, 'hookname')". This ensures that if 'res' is a failure, the hook will be errbacked, which will typically cause the test to also fail. 'res' is returned so that the current result or failure will be passed through. """ d = self._hooks[name] if d is None: return defer.succeed(None) self._hooks[name] = None _with_log(d.callback, res) return res def async_iterate(process, iterable, *extra_args, **kwargs): """ I iterate over the elements of 'iterable' (which may be deferred), eventually applying 'process' to each one, optionally with 'extra_args' and 'kwargs'. 'process' should return a (possibly deferred) boolean: True to continue the iteration, False to stop. I return a Deferred that fires with True if all elements of the iterable were processed (i.e. 'process' only returned True values); with False if the iteration was stopped by 'process' returning False; or that fails with the first failure of either 'process' or the iterator. """ iterator = iter(iterable) d = defer.succeed(None) def _iterate(ign): d2 = defer.maybeDeferred(iterator.next) def _cb(item): d3 = defer.maybeDeferred(process, item, *extra_args, **kwargs) def _maybe_iterate(res): if res: d4 = fireEventually() d4.addCallback(_iterate) return d4 return False d3.addCallback(_maybe_iterate) return d3 def _eb(f): f.trap(StopIteration) return True d2.addCallbacks(_cb, _eb) return d2 d.addCallback(_iterate) return d def for_items(cb, mapping): """ For each (key, value) pair in a mapping, I add a callback to cb(None, key, value) to a Deferred that fires immediately. I return that Deferred. """ d = defer.succeed(None) for k, v in mapping.items(): d.addCallback(lambda ign, k=k, v=v: cb(None, k, v)) return d class WaitForDelayedCallsMixin(PollMixin): def _delayed_calls_done(self): # We're done when the only remaining DelayedCalls fire after threshold. # (These will be associated with the test timeout, or else they *should* # cause an unclean reactor error because the test should have waited for # them.) 
threshold = time.time() + 10 for delayed in reactor.getDelayedCalls(): if delayed.getTime() < threshold: return False return True def wait_for_delayed_calls(self, res=None): """ Use like this at the end of a test: d.addBoth(self.wait_for_delayed_calls) """ d = self.poll(self._delayed_calls_done) d.addErrback(log.err, "error while waiting for delayed calls") d.addBoth(lambda ign: res) return d allmydata-tahoe-1.10.2/src/allmydata/util/happinessutil.py0000644000175000017500000003070312556560070021750 0ustar ramram""" I contain utilities useful for calculating servers_of_happiness, and for reporting it in messages """ from copy import deepcopy def failure_message(peer_count, k, happy, effective_happy): # If peer_count < needed_shares, this error message makes more # sense than any of the others, so use it. if peer_count < k: msg = ("shares could be placed or found on only %d " "server(s). " "We were asked to place shares on at least %d " "server(s) such that any %d of them have " "enough shares to recover the file." % (peer_count, happy, k)) # Otherwise, if we've placed on at least needed_shares # peers, but there isn't an x-happy subset of those peers # for x >= needed_shares, we use this error message. elif effective_happy < k: msg = ("shares could be placed or found on %d " "server(s), but they are not spread out evenly " "enough to ensure that any %d of these servers " "would have enough shares to recover the file. " "We were asked to place " "shares on at least %d servers such that any " "%d of them have enough shares to recover the " "file." % (peer_count, k, happy, k)) # Otherwise, if there is an x-happy subset of peers where # x >= needed_shares, but x < servers_of_happiness, then # we use this message. else: msg = ("shares could be placed on only %d server(s) " "such that any %d of them have enough shares " "to recover the file, but we were asked to " "place shares on at least %d such servers." % (effective_happy, k, happy)) return msg def shares_by_server(servermap): """ I accept a dict of shareid -> set(peerid) mappings, and return a dict of peerid -> set(shareid) mappings. My argument is a dictionary with sets of peers, indexed by shares, and I transform that into a dictionary of sets of shares, indexed by peerids. """ ret = {} for shareid, peers in servermap.iteritems(): assert isinstance(peers, set) for peerid in peers: ret.setdefault(peerid, set()).add(shareid) return ret def merge_servers(servermap, upload_trackers=None): """ I accept a dict of shareid -> set(serverid) mappings, and optionally a set of ServerTrackers. If no set of ServerTrackers is provided, I return my first argument unmodified. Otherwise, I update a copy of my first argument to include the shareid -> serverid mappings implied in the set of ServerTrackers, returning the resulting dict. """ # Since we mutate servermap, and are called outside of a # context where it is okay to do that, make a copy of servermap and # work with it. servermap = deepcopy(servermap) if not upload_trackers: return servermap assert(isinstance(servermap, dict)) assert(isinstance(upload_trackers, set)) for tracker in upload_trackers: for shnum in tracker.buckets: servermap.setdefault(shnum, set()).add(tracker.get_serverid()) return servermap def servers_of_happiness(sharemap): """ I accept 'sharemap', a dict of shareid -> set(peerid) mappings. I return the 'servers_of_happiness' number that sharemap results in. 
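    A small editor-added example (the server names "A" and "B" are invented):
    with sharemap = {0: set(["A"]), 1: set(["A"]), 2: set(["B"])}, shares 0
    and 1 live only on server A while share 2 lives on server B, so at most
    two distinct servers can each be matched with a distinct share:

    >>> servers_of_happiness({0: set(["A"]), 1: set(["A"]), 2: set(["B"])})
    2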
To calculate the 'servers_of_happiness' number for the sharemap, I construct a bipartite graph with servers in one partition of vertices and shares in the other, and with an edge between a server s and a share t if s is to store t. I then compute the size of a maximum matching in the resulting graph; this is then returned as the 'servers_of_happiness' for my arguments. For example, consider the following layout: server 1: shares 1, 2, 3, 4 server 2: share 6 server 3: share 3 server 4: share 4 server 5: share 2 From this, we can construct the following graph: L = {server 1, server 2, server 3, server 4, server 5} R = {share 1, share 2, share 3, share 4, share 6} V = L U R E = {(server 1, share 1), (server 1, share 2), (server 1, share 3), (server 1, share 4), (server 2, share 6), (server 3, share 3), (server 4, share 4), (server 5, share 2)} G = (V, E) Note that G is bipartite since every edge in e has one endpoint in L and one endpoint in R. A matching in a graph G is a subset M of E such that, for any vertex v in V, v is incident to at most one edge of M. A maximum matching in G is a matching that is no smaller than any other matching. For this graph, a matching of cardinality 5 is: M = {(server 1, share 1), (server 2, share 6), (server 3, share 3), (server 4, share 4), (server 5, share 2)} Since G is bipartite, and since |L| = 5, we cannot have an M' such that |M'| > |M|. Then M is a maximum matching in G. Intuitively, and as long as k <= 5, we can see that the layout above has servers_of_happiness = 5, which matches the results here. """ if sharemap == {}: return 0 sharemap = shares_by_server(sharemap) graph = flow_network_for(sharemap) # This is an implementation of the Ford-Fulkerson method for finding # a maximum flow in a flow network applied to a bipartite graph. # Specifically, it is the Edmonds-Karp algorithm, since it uses a # BFS to find the shortest augmenting path at each iteration, if one # exists. # # The implementation here is an adapation of an algorithm described in # "Introduction to Algorithms", Cormen et al, 2nd ed., pp 658-662. dim = len(graph) flow_function = [[0 for sh in xrange(dim)] for s in xrange(dim)] residual_graph, residual_function = residual_network(graph, flow_function) while augmenting_path_for(residual_graph): path = augmenting_path_for(residual_graph) # Delta is the largest amount that we can increase flow across # all of the edges in path. Because of the way that the residual # function is constructed, f[u][v] for a particular edge (u, v) # is the amount of unused capacity on that edge. Taking the # minimum of a list of those values for each edge in the # augmenting path gives us our delta. delta = min(map(lambda (u, v), rf=residual_function: rf[u][v], path)) for (u, v) in path: flow_function[u][v] += delta flow_function[v][u] -= delta residual_graph, residual_function = residual_network(graph, flow_function) num_servers = len(sharemap) # The value of a flow is the total flow out of the source vertex # (vertex 0, in our graph). We could just as well sum across all of # f[0], but we know that vertex 0 only has edges to the servers in # our graph, so we can stop after summing flow across those. The # value of a flow computed in this way is the size of a maximum # matching on the bipartite graph described above. return sum([flow_function[0][v] for v in xrange(1, num_servers+1)]) def flow_network_for(sharemap): """ I take my argument, a dict of peerid -> set(shareid) mappings, and turn it into a flow network suitable for use with Edmonds-Karp. 
I then return the adjacency list representation of that network. Specifically, I build G = (V, E), where: V = { peerid in sharemap } U { shareid in sharemap } U {s, t} E = {(s, peerid) for each peerid} U {(peerid, shareid) if peerid is to store shareid } U {(shareid, t) for each shareid} s and t will be source and sink nodes when my caller starts treating the graph I return like a flow network. Without s and t, the returned graph is bipartite. """ # Servers don't have integral identifiers, and we can't make any # assumptions about the way shares are indexed -- it's possible that # there are missing shares, for example. So before making a graph, # we re-index so that all of our vertices have integral indices, and # that there aren't any holes. We start indexing at 1, so that we # can add a source node at index 0. sharemap, num_shares = reindex(sharemap, base_index=1) num_servers = len(sharemap) graph = [] # index -> [index], an adjacency list # Add an entry at the top (index 0) that has an edge to every server # in sharemap graph.append(sharemap.keys()) # For each server, add an entry that has an edge to every share that it # contains (or will contain). for k in sharemap: graph.append(sharemap[k]) # For each share, add an entry that has an edge to the sink. sink_num = num_servers + num_shares + 1 for i in xrange(num_shares): graph.append([sink_num]) # Add an empty entry for the sink, which has no outbound edges. graph.append([]) return graph def reindex(sharemap, base_index): """ Given sharemap, I map peerids and shareids to integers that don't conflict with each other, so they're useful as indices in a graph. I return a sharemap that is reindexed appropriately, and also the number of distinct shares in the resulting sharemap as a convenience for my caller. base_index tells me where to start indexing. """ shares = {} # shareid -> vertex index num = base_index ret = {} # peerid -> [shareid], a reindexed sharemap. # Number the servers first for k in sharemap: ret[num] = sharemap[k] num += 1 # Number the shares for k in ret: for shnum in ret[k]: if not shares.has_key(shnum): shares[shnum] = num num += 1 ret[k] = map(lambda x: shares[x], ret[k]) return (ret, len(shares)) def residual_network(graph, f): """ I return the residual network and residual capacity function of the flow network represented by my graph and f arguments. graph is a flow network in adjacency-list form, and f is a flow in graph. """ new_graph = [[] for i in xrange(len(graph))] cf = [[0 for s in xrange(len(graph))] for sh in xrange(len(graph))] for i in xrange(len(graph)): for v in graph[i]: if f[i][v] == 1: # We add an edge (v, i) with cf[v,i] = 1. This means # that we can remove 1 unit of flow from the edge (i, v) new_graph[v].append(i) cf[v][i] = 1 cf[i][v] = -1 else: # We add the edge (i, v), since we're not using it right # now. new_graph[i].append(v) cf[i][v] = 1 cf[v][i] = -1 return (new_graph, cf) def augmenting_path_for(graph): """ I return an augmenting path, if there is one, from the source node to the sink node in the flow network represented by my graph argument. If there is no augmenting path, I return False. I assume that the source node is at index 0 of graph, and the sink node is at the last index. I also assume that graph is a flow network in adjacency list form. 
""" bfs_tree = bfs(graph, 0) if bfs_tree[len(graph) - 1]: n = len(graph) - 1 path = [] # [(u, v)], where u and v are vertices in the graph while n != 0: path.insert(0, (bfs_tree[n], n)) n = bfs_tree[n] return path return False def bfs(graph, s): """ Perform a BFS on graph starting at s, where graph is a graph in adjacency list form, and s is a node in graph. I return the predecessor table that the BFS generates. """ # This is an adaptation of the BFS described in "Introduction to # Algorithms", Cormen et al, 2nd ed., p. 532. # WHITE vertices are those that we haven't seen or explored yet. WHITE = 0 # GRAY vertices are those we have seen, but haven't explored yet GRAY = 1 # BLACK vertices are those we have seen and explored BLACK = 2 color = [WHITE for i in xrange(len(graph))] predecessor = [None for i in xrange(len(graph))] distance = [-1 for i in xrange(len(graph))] queue = [s] # vertices that we haven't explored yet. color[s] = GRAY distance[s] = 0 while queue: n = queue.pop(0) for v in graph[n]: if color[v] == WHITE: color[v] = GRAY distance[v] = distance[n] + 1 predecessor[v] = n queue.append(v) color[n] = BLACK return predecessor allmydata-tahoe-1.10.2/src/allmydata/util/nummedobj.py0000644000175000017500000000373012556560070021040 0ustar ramramfrom allmydata.util import dictutil class NummedObj(object): """ This is useful for nicer debug printouts. Instead of objects of the same class being distinguished from one another by their memory address, they each get a unique number, which can be read as "the first object of this class", "the second object of this class", etc. This is especially useful because separate runs of a program will yield identical debug output, (assuming that the objects get created in the same order in each run). This makes it possible to diff outputs from separate runs to see what changed, without having to ignore a difference on every line due to different memory addresses of objects. """ objnums = dictutil.NumDict() # key: class names, value: highest used object number def __init__(self, klass=None): """ @param klass: in which class are you counted? If default value of `None', then self.__class__ will be used. """ if klass is None: klass = self.__class__ self._classname = klass.__name__ NummedObj.objnums.inc(self._classname) self._objid = NummedObj.objnums[self._classname] def __repr__(self): return "<%s #%d>" % (self._classname, self._objid,) def __lt__(self, other): return (self._objid, self._classname,) < (other._objid, other._classname,) def __le__(self, other): return (self._objid, self._classname,) <= (other._objid, other._classname,) def __eq__(self, other): return (self._objid, self._classname,) == (other._objid, other._classname,) def __ne__(self, other): return (self._objid, self._classname,) != (other._objid, other._classname,) def __gt__(self, other): return (self._objid, self._classname,) > (other._objid, other._classname,) def __ge__(self, other): return (self._objid, self._classname,) >= (other._objid, other._classname,) def __hash__(self): return id(self) allmydata-tahoe-1.10.2/src/allmydata/util/limiter.py0000644000175000017500000000226512556560070020527 0ustar ramram from twisted.internet import defer from foolscap.api import eventually class ConcurrencyLimiter: """I implement a basic concurrency limiter. Add work to it in the form of (callable, args, kwargs) tuples. No more than LIMIT callables will be outstanding at any one time. 
""" def __init__(self, limit=10): self.limit = limit self.pending = [] self.active = 0 def __repr__(self): return "" % (self.active, len(self.pending), self.limit) def add(self, cb, *args, **kwargs): d = defer.Deferred() task = (cb, args, kwargs, d) self.pending.append(task) self.maybe_start_task() return d def maybe_start_task(self): if self.active >= self.limit: return if not self.pending: return (cb, args, kwargs, done_d) = self.pending.pop(0) self.active += 1 d = defer.maybeDeferred(cb, *args, **kwargs) d.addBoth(self._done, done_d) def _done(self, res, done_d): self.active -= 1 eventually(done_d.callback, res) eventually(self.maybe_start_task) allmydata-tahoe-1.10.2/src/allmydata/util/humanreadable.py0000644000175000017500000001032512556560070021646 0ustar ramramimport exceptions, os from repr import Repr class BetterRepr(Repr): def __init__(self): Repr.__init__(self) # Note: These levels can get adjusted dynamically! My goal is to get more info when printing important debug stuff like exceptions and stack traces and less info when logging normal events. --Zooko 2000-10-14 self.maxlevel = 6 self.maxdict = 6 self.maxlist = 6 self.maxtuple = 6 self.maxstring = 300 self.maxother = 300 def repr_function(self, obj, level): if hasattr(obj, 'func_code'): return '<' + obj.func_name + '() at ' + os.path.basename(obj.func_code.co_filename) + ':' + str(obj.func_code.co_firstlineno) + '>' else: return '<' + obj.func_name + '() at (builtin)' def repr_instance_method(self, obj, level): if hasattr(obj, 'func_code'): return '<' + obj.im_class.__name__ + '.' + obj.im_func.__name__ + '() at ' + os.path.basename(obj.im_func.func_code.co_filename) + ':' + str(obj.im_func.func_code.co_firstlineno) + '>' else: return '<' + obj.im_class.__name__ + '.' + obj.im_func.__name__ + '() at (builtin)' def repr_long(self, obj, level): s = `obj` # XXX Hope this isn't too slow... if len(s) > self.maxlong: i = max(0, (self.maxlong-3)/2) j = max(0, self.maxlong-3-i) s = s[:i] + '...' + s[len(s)-j:] if s[-1] == 'L': return s[:-1] return s def repr_instance(self, obj, level): """ If it is an instance of Exception, format it nicely (trying to emulate the format that you see when an exception is actually raised, plus bracketing '<''s). If it is an instance of dict call self.repr_dict() on it. If it is an instance of list call self.repr_list() on it. Else call Repr.repr_instance(). """ if isinstance(obj, exceptions.Exception): # Don't cut down exception strings so much. tms = self.maxstring self.maxstring = max(512, tms * 4) tml = self.maxlist self.maxlist = max(12, tml * 4) try: if hasattr(obj, 'args'): if len(obj.args) == 1: return '<' + obj.__class__.__name__ + ': ' + self.repr1(obj.args[0], level-1) + '>' else: return '<' + obj.__class__.__name__ + ': ' + self.repr1(obj.args, level-1) + '>' else: return '<' + obj.__class__.__name__ + '>' finally: self.maxstring = tms self.maxlist = tml if isinstance(obj, dict): return self.repr_dict(obj, level) if isinstance(obj, list): return self.repr_list(obj, level) return Repr.repr_instance(self, obj, level) def repr_list(self, obj, level): """ copied from standard repr.py and fixed to work on multithreadedly mutating lists. """ if level <= 0: return '[...]' n = len(obj) myl = obj[:min(n, self.maxlist)] s = '' for item in myl: entry = self.repr1(item, level-1) if s: s = s + ', ' s = s + entry if n > self.maxlist: s = s + ', ...' return '[' + s + ']' def repr_dict(self, obj, level): """ copied from standard repr.py and fixed to work on multithreadedly mutating dicts. 
""" if level <= 0: return '{...}' s = '' n = len(obj) items = obj.items()[:min(n, self.maxdict)] items.sort() for key, val in items: entry = self.repr1(key, level-1) + ':' + self.repr1(val, level-1) if s: s = s + ', ' s = s + entry if n > self.maxdict: s = s + ', ...' return '{' + s + '}' # This object can be changed by other code updating this module's "brepr" # variables. This is so that (a) code can use humanreadable with # "from humanreadable import hr; hr(mything)", and (b) code can override # humanreadable to provide application-specific human readable output # (e.g. libbase32's base32id.AbbrevRepr). brepr = BetterRepr() def hr(x): return brepr.repr(x) allmydata-tahoe-1.10.2/src/allmydata/util/verlib.py0000644000175000017500000002715312556560070020350 0ustar ramram""" "Rational" version definition and parsing for DistutilsVersionFight discussion at PyCon 2009. """ import re class IrrationalVersionError(Exception): """This is an irrational version.""" pass class HugeMajorVersionNumError(IrrationalVersionError): """An irrational version because the major version number is huge (often because a year or date was used). See `error_on_huge_major_num` option in `NormalizedVersion` for details. This guard can be disabled by setting that option False. """ pass # A marker used in the second and third parts of the `parts` tuple, for # versions that don't have those segments, to sort properly. An example # of versions in sort order ('highest' last): # 1.0b1 ((1,0), ('b',1), ('f',)) # 1.0.dev345 ((1,0), ('f',), ('dev', 345)) # 1.0 ((1,0), ('f',), ('f',)) # 1.0.post256.dev345 ((1,0), ('f',), ('f', 'post', 256, 'dev', 345)) # 1.0.post345 ((1,0), ('f',), ('f', 'post', 345, 'f')) # ^ ^ ^ # 'b' < 'f' ---------------------/ | | # | | # 'dev' < 'f' < 'post' -------------------/ | # | # 'dev' < 'f' ----------------------------------------------/ # Other letters would do, but 'f' for 'final' is kind of nice. FINAL_MARKER = ('f',) VERSION_RE = re.compile(r''' ^ (?P\d+\.\d+) # minimum 'N.N' (?P(?:\.\d+)*) # any number of extra '.N' segments (?: (?P[abc]|rc) # 'a'=alpha, 'b'=beta, 'c'=release candidate # 'rc'= alias for release candidate (?P\d+(?:\.\d+)*) )? (?P(\.post(?P\d+))?(\.dev(?P\d+))?)? $''', re.VERBOSE) class NormalizedVersion(object): """A rational version. Good: 1.2 # equivalent to "1.2.0" 1.2.0 1.2a1 1.2.3a2 1.2.3b1 1.2.3c1 1.2.3.4 TODO: fill this out Bad: 1 # mininum two numbers 1.2a # release level must have a release serial 1.2.3b """ def __init__(self, s, error_on_huge_major_num=True): """Create a NormalizedVersion instance from a version string. @param s {str} The version string. @param error_on_huge_major_num {bool} Whether to consider an apparent use of a year or full date as the major version number an error. Default True. One of the observed patterns on PyPI before the introduction of `NormalizedVersion` was version numbers like this: 2009.01.03 20040603 2005.01 This guard is here to strongly encourage the package author to use an alternate version, because a release deployed into PyPI and, e.g. downstream Linux package managers, will forever remove the possibility of using a version number like "1.0" (i.e. where the major number is less than that huge major number). 
""" self._parse(s, error_on_huge_major_num) @classmethod def from_parts(cls, version, prerelease=FINAL_MARKER, devpost=FINAL_MARKER): return cls(cls.parts_to_str((version, prerelease, devpost))) def _parse(self, s, error_on_huge_major_num=True): """Parses a string version into parts.""" match = VERSION_RE.search(s) if not match: raise IrrationalVersionError(s) groups = match.groupdict() parts = [] # main version block = self._parse_numdots(groups['version'], s, False, 2) extraversion = groups.get('extraversion') if extraversion not in ('', None): block += self._parse_numdots(extraversion[1:], s) parts.append(tuple(block)) # prerelease prerel = groups.get('prerel') if prerel is not None: block = [prerel] block += self._parse_numdots(groups.get('prerelversion'), s, pad_zeros_length=1) parts.append(tuple(block)) else: parts.append(FINAL_MARKER) # postdev if groups.get('postdev'): post = groups.get('post') dev = groups.get('dev') postdev = [] if post is not None: postdev.extend([FINAL_MARKER[0], 'post', int(post)]) if dev is None: postdev.append(FINAL_MARKER[0]) if dev is not None: postdev.extend(['dev', int(dev)]) parts.append(tuple(postdev)) else: parts.append(FINAL_MARKER) self.parts = tuple(parts) if error_on_huge_major_num and self.parts[0][0] > 1980: raise HugeMajorVersionNumError("huge major version number, %r, " "which might cause future problems: %r" % (self.parts[0][0], s)) def _parse_numdots(self, s, full_ver_str, drop_trailing_zeros=True, pad_zeros_length=0): """Parse 'N.N.N' sequences, return a list of ints. @param s {str} 'N.N.N...' sequence to be parsed @param full_ver_str {str} The full version string from which this comes. Used for error strings. @param drop_trailing_zeros {bool} Whether to drop trailing zeros from the returned list. Default True. @param pad_zeros_length {int} The length to which to pad the returned list with zeros, if necessary. Default 0. """ nums = [] for n in s.split("."): if len(n) > 1 and n[0] == '0': raise IrrationalVersionError("cannot have leading zero in " "version number segment: '%s' in %r" % (n, full_ver_str)) nums.append(int(n)) if drop_trailing_zeros: while nums and nums[-1] == 0: nums.pop() while len(nums) < pad_zeros_length: nums.append(0) return nums def __str__(self): return self.parts_to_str(self.parts) @classmethod def parts_to_str(cls, parts): """Transforms a version expressed in tuple into its string representation.""" # XXX This doesn't check for invalid tuples main, prerel, postdev = parts s = '.'.join(str(v) for v in main) if prerel is not FINAL_MARKER: s += prerel[0] s += '.'.join(str(v) for v in prerel[1:]) if postdev and postdev is not FINAL_MARKER: if postdev[0] == 'f': postdev = postdev[1:] i = 0 while i < len(postdev): if i % 2 == 0: s += '.' 
s += str(postdev[i]) i += 1 return s def __repr__(self): return "%s('%s')" % (self.__class__.__name__, self) def _cannot_compare(self, other): raise TypeError("cannot compare %s and %s" % (type(self).__name__, type(other).__name__)) def __eq__(self, other): if not isinstance(other, NormalizedVersion): self._cannot_compare(other) return self.parts == other.parts def __lt__(self, other): if not isinstance(other, NormalizedVersion): self._cannot_compare(other) return self.parts < other.parts def __ne__(self, other): return not self.__eq__(other) def __gt__(self, other): return not (self.__lt__(other) or self.__eq__(other)) def __le__(self, other): return self.__eq__(other) or self.__lt__(other) def __ge__(self, other): return self.__eq__(other) or self.__gt__(other) def suggest_normalized_version(s): """Suggest a normalized version close to the given version string. If you have a version string that isn't rational (i.e. NormalizedVersion doesn't like it) then you might be able to get an equivalent (or close) rational version from this function. This does a number of simple normalizations to the given string, based on observation of versions currently in use on PyPI. Given a dump of those version during PyCon 2009, 4287 of them: - 2312 (53.93%) match NormalizedVersion without change - with the automatic suggestion - 3474 (81.04%) match when using this suggestion method @param s {str} An irrational version string. @returns A rational version string, or None, if couldn't determine one. """ try: NormalizedVersion(s) return s # already rational except IrrationalVersionError: pass rs = s.lower() # part of this could use maketrans for orig, repl in (('-alpha', 'a'), ('-beta', 'b'), ('alpha', 'a'), ('beta', 'b'), ('rc', 'c'), ('-final', ''), ('-pre', 'c'), ('-release', ''), ('.release', ''), ('-stable', ''), ('+', '.'), ('_', '.'), (' ', ''), ('.final', ''), ('final', '')): rs = rs.replace(orig, repl) # if something ends with dev or pre, we add a 0 rs = re.sub(r"pre$", r"pre0", rs) rs = re.sub(r"dev$", r"dev0", rs) # if we have something like "b-2" or "a.2" at the end of the # version, that is pobably beta, alpha, etc # let's remove the dash or dot rs = re.sub(r"([abc|rc])[\-\.](\d+)$", r"\1\2", rs) # 1.0-dev-r371 -> 1.0.dev371 # 0.1-dev-r79 -> 0.1.dev79 rs = re.sub(r"[\-\.](dev)[\-\.]?r?(\d+)$", r".\1\2", rs) # Clean: 2.0.a.3, 2.0.b1, 0.9.0~c1 rs = re.sub(r"[.~]?([abc])\.?", r"\1", rs) # Clean: v0.3, v1.0 if rs.startswith('v'): rs = rs[1:] # Clean leading '0's on numbers. #TODO: unintended side-effect on, e.g., "2003.05.09" # PyPI stats: 77 (~2%) better rs = re.sub(r"\b0+(\d+)(?!\d)", r"\1", rs) # Clean a/b/c with no version. E.g. "1.0a" -> "1.0a0". Setuptools infers # zero. 
# PyPI stats: 245 (7.56%) better rs = re.sub(r"(\d+[abc])$", r"\g<1>0", rs) # the 'dev-rNNN' tag is a dev tag rs = re.sub(r"\.?(dev-r|dev\.r)\.?(\d+)$", r".dev\2", rs) # clean the - when used as a pre delimiter rs = re.sub(r"-(a|b|c)(\d+)$", r"\1\2", rs) # a terminal "dev" or "devel" can be changed into ".dev0" rs = re.sub(r"[\.\-](dev|devel)$", r".dev0", rs) # a terminal "dev" can be changed into ".dev0" rs = re.sub(r"(?![\.\-])dev$", r".dev0", rs) # a terminal "final" or "stable" can be removed rs = re.sub(r"(final|stable)$", "", rs) # The 'r' and the '-' tags are post release tags # 0.4a1.r10 -> 0.4a1.post10 # 0.9.33-17222 -> 0.9.33.post17222 # 0.9.33-r17222 -> 0.9.33.post17222 rs = re.sub(r"\.?(r|-|-r)\.?(\d+)$", r".post\2", rs) # Clean 'r' instead of 'dev' usage: # 0.9.33+r17222 -> 0.9.33.dev17222 # 1.0dev123 -> 1.0.dev123 # 1.0.git123 -> 1.0.dev123 # 1.0.bzr123 -> 1.0.dev123 # 0.1a0dev.123 -> 0.1a0.dev123 # PyPI stats: ~150 (~4%) better rs = re.sub(r"\.?(dev|git|bzr)\.?(\d+)$", r".dev\2", rs) # Clean '.pre' (normalized from '-pre' above) instead of 'c' usage: # 0.2.pre1 -> 0.2c1 # 0.2-c1 -> 0.2c1 # 1.0preview123 -> 1.0c123 # PyPI stats: ~21 (0.62%) better rs = re.sub(r"\.?(pre|preview|-c)(\d+)$", r"c\g<2>", rs) # Tcl/Tk uses "px" for their post release markers rs = re.sub(r"p(\d+)$", r".post\1", rs) try: NormalizedVersion(rs) return rs # already rational except IrrationalVersionError: pass return None allmydata-tahoe-1.10.2/src/allmydata/util/observer.py0000644000175000017500000001111312556560070020701 0ustar ramram# -*- test-case-name: allmydata.test.test_observer -*- import weakref from twisted.internet import defer from foolscap.api import eventually """The idiom we use is for the observed object to offer a method named 'when_something', which returns a deferred. That deferred will be fired when something happens. The way this is typically implemented is that the observed has an ObserverList whose when_fired method is called in the observed's 'when_something'.""" class OneShotObserverList: """A one-shot event distributor.""" def __init__(self): self._fired = False self._result = None self._watchers = [] self.__repr__ = self._unfired_repr def _unfired_repr(self): return "" % (self._watchers, ) def _fired_repr(self): return " %s>" % (self._result, ) def _get_result(self): return self._result def when_fired(self): if self._fired: return defer.succeed(self._get_result()) d = defer.Deferred() self._watchers.append(d) return d def fire(self, result): assert not self._fired self._fired = True self._result = result self._fire(result) def _fire(self, result): for w in self._watchers: eventually(w.callback, result) del self._watchers self.__repr__ = self._fired_repr def fire_if_not_fired(self, result): if not self._fired: self.fire(result) class LazyOneShotObserverList(OneShotObserverList): """ a variant of OneShotObserverList which does not retain the result it handles, but rather retains a callable() through which is retrieves the data if and when needed. """ def __init__(self): OneShotObserverList.__init__(self) def _get_result(self): return self._result_producer() def fire(self, result_producer): """ @param result_producer: a no-arg callable which returns the data which is to be considered the 'result' for this observer list. 
note that this function may be called multiple times - once upon initial firing, and potentially once more for each subsequent when_fired() deferred created """ assert not self._fired self._fired = True self._result_producer = result_producer if self._watchers: # if not, don't call result_producer self._fire(self._get_result()) class ObserverList: """A simple class to distribute events to a number of subscribers.""" def __init__(self): self._watchers = [] def subscribe(self, observer): self._watchers.append(observer) def unsubscribe(self, observer): self._watchers.remove(observer) def notify(self, *args, **kwargs): for o in self._watchers: eventually(o, *args, **kwargs) class EventStreamObserver: """A simple class to distribute multiple events to a single subscriber. It accepts arbitrary kwargs, but no posargs.""" def __init__(self): self._watcher = None self._undelivered_results = [] self._canceler = None def set_canceler(self, c, methname): """I will call c.METHNAME(self) when somebody cancels me.""" # we use a weakref to avoid creating a cycle between us and the thing # we're observing: they'll be holding a reference to us to compare # against the value we pass to their canceler function. However, # since bound methods are first-class objects (and not kept alive by # the object they're bound to), we can't just stash a weakref to the # bound cancel method. Instead, we must hold a weakref to the actual # object, and obtain its cancel method later. # http://code.activestate.com/recipes/81253-weakmethod/ has an # alternative. self._canceler = (weakref.ref(c), methname) def subscribe(self, observer, **watcher_kwargs): self._watcher = (observer, watcher_kwargs) while self._undelivered_results: self._notify(self._undelivered_results.pop(0)) def notify(self, **result_kwargs): if self._watcher: self._notify(result_kwargs) else: self._undelivered_results.append(result_kwargs) def _notify(self, result_kwargs): o, watcher_kwargs = self._watcher kwargs = dict(result_kwargs) kwargs.update(watcher_kwargs) eventually(o, **kwargs) def cancel(self): wr,methname = self._canceler o = wr() if o: getattr(o,methname)(self) allmydata-tahoe-1.10.2/src/allmydata/util/base62.py0000644000175000017500000001033312556560070020137 0ustar ramram# from the Python Standard Library import string from allmydata.util.mathutil import log_ceil, log_floor chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" BASE62CHAR = '[' + chars + ']' vals = ''.join([chr(i) for i in range(62)]) c2vtranstable = string.maketrans(chars, vals) v2ctranstable = string.maketrans(vals, chars) identitytranstable = string.maketrans(chars, chars) def b2a(os): """ @param os the data to be encoded (a string) @return the contents of os in base-62 encoded form """ cs = b2a_l(os, len(os)*8) assert num_octets_that_encode_to_this_many_chars(len(cs)) == len(os), "%s != %s, numchars: %s" % (num_octets_that_encode_to_this_many_chars(len(cs)), len(os), len(cs)) return cs def b2a_l(os, lengthinbits): """ @param os the data to be encoded (a string) @param lengthinbits the number of bits of data in os to be encoded b2a_l() will generate a base-62 encoded string big enough to encode lengthinbits bits. So for example if os is 3 bytes long and lengthinbits is 17, then b2a_l() will generate a 3-character- long base-62 encoded string (since 3 chars is sufficient to encode more than 2^17 values). If os is 3 bytes long and lengthinbits is 18 (or None), then b2a_l() will generate a 4-character string (since 4 chars are required to hold 2^18 values). 
Note that if os is 3 bytes long and lengthinbits is 17, the least significant 7 bits of os are ignored. Warning: if you generate a base-62 encoded string with b2a_l(), and then someone else tries to decode it by calling a2b() instead of a2b_l(), then they will (potentially) get a different string than the one you encoded! So use b2a_l() only when you are sure that the encoding and decoding sides know exactly which lengthinbits to use. If you do not have a way for the encoder and the decoder to agree upon the lengthinbits, then it is best to use b2a() and a2b(). The only drawback to using b2a() over b2a_l() is that when you have a number of bits to encode that is not a multiple of 8, b2a() can sometimes generate a base-62 encoded string that is one or two characters longer than necessary. @return the contents of os in base-62 encoded form """ os = [ord(o) for o in reversed(os)] # treat os as big-endian -- and we want to process the least-significant o first value = 0 numvalues = 1 # the number of possible values that value could be for o in os: o *= numvalues value += o numvalues *= 256 chars = [] while numvalues > 0: chars.append(value % 62) value //= 62 numvalues //= 62 return string.translate(''.join([chr(c) for c in reversed(chars)]), v2ctranstable) # make it big-endian def num_octets_that_encode_to_this_many_chars(numcs): return log_floor(62**numcs, 256) def num_chars_that_this_many_octets_encode_to(numos): return log_ceil(256**numos, 62) def a2b(cs): """ @param cs the base-62 encoded data (a string) """ return a2b_l(cs, num_octets_that_encode_to_this_many_chars(len(cs))*8) def a2b_l(cs, lengthinbits): """ @param lengthinbits the number of bits of data in encoded into cs a2b_l() will return a result just big enough to hold lengthinbits bits. So for example if cs is 2 characters long (encoding between 5 and 12 bits worth of data) and lengthinbits is 8, then a2b_l() will return a string of length 1 (since 1 byte is sufficient to store 8 bits), but if lengthinbits is 9, then a2b_l() will return a string of length 2. Please see the warning in the docstring of b2a_l() regarding the use of b2a() versus b2a_l(). @return the data encoded in cs """ cs = [ord(c) for c in reversed(string.translate(cs, c2vtranstable))] # treat cs as big-endian -- and we want to process the least-significant c first value = 0 numvalues = 1 # the number of possible values that value could be for c in cs: c *= numvalues value += c numvalues *= 62 numvalues = 2**lengthinbits bytes = [] while numvalues > 1: bytes.append(value % 256) value //= 256 numvalues //= 256 return ''.join([chr(b) for b in reversed(bytes)]) # make it big-endian allmydata-tahoe-1.10.2/src/allmydata/util/sibpath.py0000644000175000017500000000177712556560070020523 0ustar ramramimport os import sys from twisted.python.util import sibpath as tsibpath def sibpath(path, sibling): """ Looks for a named sibling relative to the given path. If such a file exists, its path will be returned, otherwise a second search will be made for the named sibling relative to the path of the executable currently running. This is useful in the case that something built with py2exe, for example, needs to find data files relative to its install. Note hence that care should be taken not to search for private package files whose names might collide with files which might be found installed alongside the python interpreter itself. If no file is found in either place, the sibling relative to the given path is returned, likely leading to a file not found error. 
""" sib = tsibpath(path, sibling) if not os.path.exists(sib): exe_sib = tsibpath(sys.executable, sibling) if os.path.exists(exe_sib): return exe_sib return sib allmydata-tahoe-1.10.2/src/allmydata/util/time_format.py0000644000175000017500000000424112556560070021364 0ustar ramram# ISO-8601: # http://www.cl.cam.ac.uk/~mgk25/iso-time.html import calendar, datetime, re, time def iso_utc_date(now=None, t=time.time): if now is None: now = t() return datetime.datetime.utcfromtimestamp(now).isoformat()[:10] def iso_utc(now=None, sep='_', t=time.time): if now is None: now = t() return datetime.datetime.utcfromtimestamp(now).isoformat(sep) def iso_local(now=None, sep='_', t=time.time): if now is None: now = t() return datetime.datetime.fromtimestamp(now).isoformat(sep) def iso_utc_time_to_seconds(isotime, _conversion_re=re.compile(r"(?P\d{4})-(?P\d{2})-(?P\d{2})[T_ ](?P\d{2}):(?P\d{2}):(?P\d{2})(?P\.\d+)?")): """ The inverse of iso_utc(). Real ISO-8601 is "2003-01-08T06:30:59". We also accept the widely used variants "2003-01-08_06:30:59" and "2003-01-08 06:30:59". """ m = _conversion_re.match(isotime) if not m: raise ValueError, (isotime, "not a complete ISO8601 timestamp") year, month, day = int(m.group('year')), int(m.group('month')), int(m.group('day')) hour, minute, second = int(m.group('hour')), int(m.group('minute')), int(m.group('second')) subsecstr = m.group('subsecond') if subsecstr: subsecfloat = float(subsecstr) else: subsecfloat = 0 return calendar.timegm( (year, month, day, hour, minute, second, 0, 1, 0) ) + subsecfloat def parse_duration(s): orig = s unit = None DAY = 24*60*60 MONTH = 31*DAY YEAR = 365*DAY if s.endswith("s"): s = s[:-1] if s.endswith("day"): unit = DAY s = s[:-len("day")] elif s.endswith("month"): unit = MONTH s = s[:-len("month")] elif s.endswith("mo"): unit = MONTH s = s[:-len("mo")] elif s.endswith("year"): unit = YEAR s = s[:-len("YEAR")] else: raise ValueError("no unit (like day, month, or year) in '%s'" % orig) s = s.strip() return int(s) * unit def parse_date(s): # return seconds-since-epoch for the UTC midnight that starts the given # day return int(iso_utc_time_to_seconds(s + "T00:00:00")) allmydata-tahoe-1.10.2/src/allmydata/util/keyutil.py0000644000175000017500000000252112556560070020543 0ustar ramramimport os from pycryptopp.publickey import ed25519 from allmydata.util.base32 import a2b, b2a BadSignatureError = ed25519.BadSignatureError class BadPrefixError(Exception): pass def remove_prefix(s_bytes, prefix): if not s_bytes.startswith(prefix): raise BadPrefixError("did not see expected '%s' prefix" % (prefix,)) return s_bytes[len(prefix):] # in base32, keys are 52 chars long (both signing and verifying keys) # in base62, keys is 43 chars long # in base64, keys is 43 chars long # # We can't use base64 because we want to reserve punctuation and preserve # cut-and-pasteability. The base62 encoding is shorter than the base32 form, # but the minor usability improvement is not worth the documentation and # specification confusion of using a non-standard encoding. So we stick with # base32. 
def make_keypair(): sk_bytes = os.urandom(32) sk = ed25519.SigningKey(sk_bytes) vk_bytes = sk.get_verifying_key_bytes() return ("priv-v0-"+b2a(sk_bytes), "pub-v0-"+b2a(vk_bytes)) def parse_privkey(privkey_vs): sk_bytes = a2b(remove_prefix(privkey_vs, "priv-v0-")) sk = ed25519.SigningKey(sk_bytes) vk_bytes = sk.get_verifying_key_bytes() return (sk, "pub-v0-"+b2a(vk_bytes)) def parse_pubkey(pubkey_vs): vk_bytes = a2b(remove_prefix(pubkey_vs, "pub-v0-")) return ed25519.VerifyingKey(vk_bytes) allmydata-tahoe-1.10.2/src/allmydata/util/hashutil.py0000644000175000017500000001707712556560070020712 0ustar ramramfrom pycryptopp.hash.sha256 import SHA256 import os from allmydata.util.netstring import netstring try: import hashlib sha1 = hashlib.sha1 except ImportError: # hashlib was added in Python 2.5 import sha sha1 = sha.new # Be very very cautious when modifying this file. Almost any change will # cause a compatibility break, invalidating all outstanding URIs and making # any previously uploaded files become inaccessible. BE CONSERVATIVE AND TEST # AGAINST OLD DATA! # Various crypto values are this size: hash outputs (from SHA-256d), # randomly-generated secrets such as the lease secret, and symmetric encryption # keys. In the near future we will add DSA private keys, and salts of various # kinds. CRYPTO_VAL_SIZE=32 class _SHA256d_Hasher: # use SHA-256d, as defined by Ferguson and Schneier: hash the output # again to prevent length-extension attacks def __init__(self, truncate_to=None): self.h = SHA256() self.truncate_to = truncate_to self._digest = None def update(self, data): assert isinstance(data, str) # no unicode self.h.update(data) def digest(self): if self._digest is None: h1 = self.h.digest() del self.h h2 = SHA256(h1).digest() if self.truncate_to: h2 = h2[:self.truncate_to] self._digest = h2 return self._digest def tagged_hasher(tag, truncate_to=None): hasher = _SHA256d_Hasher(truncate_to) hasher.update(netstring(tag)) return hasher def tagged_hash(tag, val, truncate_to=None): hasher = tagged_hasher(tag, truncate_to) hasher.update(val) return hasher.digest() def tagged_pair_hash(tag, val1, val2, truncate_to=None): s = _SHA256d_Hasher(truncate_to) s.update(netstring(tag)) s.update(netstring(val1)) s.update(netstring(val2)) return s.digest() ## specific hash tags that we use # immutable STORAGE_INDEX_TAG = "allmydata_immutable_key_to_storage_index_v1" BLOCK_TAG = "allmydata_encoded_subshare_v1" UEB_TAG = "allmydata_uri_extension_v1" PLAINTEXT_TAG = "allmydata_plaintext_v1" CIPHERTEXT_TAG = "allmydata_crypttext_v1" CIPHERTEXT_SEGMENT_TAG = "allmydata_crypttext_segment_v1" PLAINTEXT_SEGMENT_TAG = "allmydata_plaintext_segment_v1" CONVERGENT_ENCRYPTION_TAG = "allmydata_immutable_content_to_key_with_added_secret_v1+" CLIENT_RENEWAL_TAG = "allmydata_client_renewal_secret_v1" CLIENT_CANCEL_TAG = "allmydata_client_cancel_secret_v1" FILE_RENEWAL_TAG = "allmydata_file_renewal_secret_v1" FILE_CANCEL_TAG = "allmydata_file_cancel_secret_v1" BUCKET_RENEWAL_TAG = "allmydata_bucket_renewal_secret_v1" BUCKET_CANCEL_TAG = "allmydata_bucket_cancel_secret_v1" # mutable MUTABLE_WRITEKEY_TAG = "allmydata_mutable_privkey_to_writekey_v1" MUTABLE_WRITE_ENABLER_MASTER_TAG = "allmydata_mutable_writekey_to_write_enabler_master_v1" MUTABLE_WRITE_ENABLER_TAG = "allmydata_mutable_write_enabler_master_and_nodeid_to_write_enabler_v1" MUTABLE_PUBKEY_TAG = "allmydata_mutable_pubkey_to_fingerprint_v1" MUTABLE_READKEY_TAG = "allmydata_mutable_writekey_to_readkey_v1" MUTABLE_DATAKEY_TAG = 
"allmydata_mutable_readkey_to_datakey_v1" MUTABLE_STORAGEINDEX_TAG = "allmydata_mutable_readkey_to_storage_index_v1" # dirnodes DIRNODE_CHILD_WRITECAP_TAG = "allmydata_mutable_writekey_and_salt_to_dirnode_child_capkey_v1" DIRNODE_CHILD_SALT_TAG = "allmydata_dirnode_child_rwcap_to_salt_v1" def storage_index_hash(key): # storage index is truncated to 128 bits (16 bytes). We're only hashing a # 16-byte value to get it, so there's no point in using a larger value. We # use this same tagged hash to go from encryption key to storage index for # random-keyed immutable files and convergent-encryption immutabie # files. Mutable files use ssk_storage_index_hash(). return tagged_hash(STORAGE_INDEX_TAG, key, 16) def block_hash(data): return tagged_hash(BLOCK_TAG, data) def block_hasher(): return tagged_hasher(BLOCK_TAG) def uri_extension_hash(data): return tagged_hash(UEB_TAG, data) def uri_extension_hasher(): return tagged_hasher(UEB_TAG) def plaintext_hash(data): return tagged_hash(PLAINTEXT_TAG, data) def plaintext_hasher(): return tagged_hasher(PLAINTEXT_TAG) def crypttext_hash(data): return tagged_hash(CIPHERTEXT_TAG, data) def crypttext_hasher(): return tagged_hasher(CIPHERTEXT_TAG) def crypttext_segment_hash(data): return tagged_hash(CIPHERTEXT_SEGMENT_TAG, data) def crypttext_segment_hasher(): return tagged_hasher(CIPHERTEXT_SEGMENT_TAG) def plaintext_segment_hash(data): return tagged_hash(PLAINTEXT_SEGMENT_TAG, data) def plaintext_segment_hasher(): return tagged_hasher(PLAINTEXT_SEGMENT_TAG) KEYLEN = 16 IVLEN = 16 def convergence_hash(k, n, segsize, data, convergence): h = convergence_hasher(k, n, segsize, convergence) h.update(data) return h.digest() def convergence_hasher(k, n, segsize, convergence): assert isinstance(convergence, str) param_tag = netstring("%d,%d,%d" % (k, n, segsize)) tag = CONVERGENT_ENCRYPTION_TAG + netstring(convergence) + param_tag return tagged_hasher(tag, KEYLEN) def random_key(): return os.urandom(KEYLEN) def my_renewal_secret_hash(my_secret): return tagged_hash(my_secret, CLIENT_RENEWAL_TAG) def my_cancel_secret_hash(my_secret): return tagged_hash(my_secret, CLIENT_CANCEL_TAG) def file_renewal_secret_hash(client_renewal_secret, storage_index): return tagged_pair_hash(FILE_RENEWAL_TAG, client_renewal_secret, storage_index) def file_cancel_secret_hash(client_cancel_secret, storage_index): return tagged_pair_hash(FILE_CANCEL_TAG, client_cancel_secret, storage_index) def bucket_renewal_secret_hash(file_renewal_secret, peerid): assert len(peerid) == 20, "%s: %r" % (len(peerid), peerid) # binary! return tagged_pair_hash(BUCKET_RENEWAL_TAG, file_renewal_secret, peerid) def bucket_cancel_secret_hash(file_cancel_secret, peerid): assert len(peerid) == 20, "%s: %r" % (len(peerid), peerid) # binary! 
return tagged_pair_hash(BUCKET_CANCEL_TAG, file_cancel_secret, peerid) def _xor(a, b): return "".join([chr(ord(c) ^ ord(b)) for c in a]) def hmac(tag, data): ikey = _xor(tag, "\x36") okey = _xor(tag, "\x5c") h1 = SHA256(ikey + data).digest() h2 = SHA256(okey + h1).digest() return h2 def mutable_rwcap_key_hash(iv, writekey): return tagged_pair_hash(DIRNODE_CHILD_WRITECAP_TAG, iv, writekey, KEYLEN) def mutable_rwcap_salt_hash(writekey): return tagged_hash(DIRNODE_CHILD_SALT_TAG, writekey, IVLEN) def ssk_writekey_hash(privkey): return tagged_hash(MUTABLE_WRITEKEY_TAG, privkey, KEYLEN) def ssk_write_enabler_master_hash(writekey): return tagged_hash(MUTABLE_WRITE_ENABLER_MASTER_TAG, writekey) def ssk_write_enabler_hash(writekey, peerid): assert len(peerid) == 20, "%s: %r" % (len(peerid), peerid) # binary! wem = ssk_write_enabler_master_hash(writekey) return tagged_pair_hash(MUTABLE_WRITE_ENABLER_TAG, wem, peerid) def ssk_pubkey_fingerprint_hash(pubkey): return tagged_hash(MUTABLE_PUBKEY_TAG, pubkey) def ssk_readkey_hash(writekey): return tagged_hash(MUTABLE_READKEY_TAG, writekey, KEYLEN) def ssk_readkey_data_hash(IV, readkey): return tagged_pair_hash(MUTABLE_DATAKEY_TAG, IV, readkey, KEYLEN) def ssk_storage_index_hash(readkey): return tagged_hash(MUTABLE_STORAGEINDEX_TAG, readkey, KEYLEN) def timing_safe_compare(a, b): n = os.urandom(32) return bool(tagged_hash(n, a) == tagged_hash(n, b)) BACKUPDB_DIRHASH_TAG = "allmydata_backupdb_dirhash_v1" def backupdb_dirhash(contents): return tagged_hash(BACKUPDB_DIRHASH_TAG, contents) allmydata-tahoe-1.10.2/src/allmydata/util/pkgresutil.py0000644000175000017500000000352412556560070021252 0ustar ramram def install(): """ This installs a hook into setuptools' pkg_resources infrastructure, so that resource files can be found in files relative to the runnin executable, in addition to the usual egg and source lookup mechanisms. This overrides the ZipProvider, since that is the lookup mechanism triggered within pkg_resources when running code out of a py2exe or py2app build's library.zip. """ import os, sys import pkg_resources, zipimport platform_libdirs = { 'darwin': '../Resources/pkg_resources', } exedir = os.path.dirname(sys.executable) libdir = platform_libdirs.get(sys.platform, 'pkg_resources') class Provider(pkg_resources.ZipProvider): def __init__(self, module): self._module_name = module.__name__ pkg_resources.ZipProvider.__init__(self, module) def get_resource_filename(self, manager, resource_name): #print 'get_resource_filename(%s, %s)' % (manager, resource_name) path = [exedir, libdir] + self._module_name.split('.') + [resource_name] localfile = os.path.join(*path) #print ' checking(%s)' % (localfile,) if os.path.exists(localfile): #print 'found locally' return localfile else: try: ret = pkg_resources.ZipProvider.get_resource_filename(self, manager, resource_name) #print 'returning %s' % (ret,) return ret except NotImplementedError: #print 'get_resource_filename(%s,%s): not found' % (self._module_name, resource_name) #import traceback #traceback.print_exc() return '' pkg_resources.register_loader_type(zipimport.zipimporter, Provider) allmydata-tahoe-1.10.2/src/allmydata/util/rrefutil.py0000644000175000017500000000373012556560070020714 0ustar ramram from twisted.internet import address from foolscap.api import Violation, RemoteException, DeadReferenceError, \ SturdyRef def add_version_to_remote_reference(rref, default): """I try to add a .version attribute to the given RemoteReference. I call the remote get_version() method to learn its version. 
I'll add the default value if the remote side doesn't appear to have a get_version() method.""" d = rref.callRemote("get_version") def _got_version(version): rref.version = version return rref def _no_get_version(f): f.trap(Violation, RemoteException) rref.version = default return rref d.addCallbacks(_got_version, _no_get_version) return d def trap_and_discard(f, *errorTypes): f.trap(*errorTypes) pass def trap_deadref(f): return trap_and_discard(f, DeadReferenceError) def hosts_for_rref(rref, ignore_localhost=True): # actually, this only returns hostnames advertised = [] for hint in rref.getLocationHints(): # Foolscap-0.2.5 and earlier used strings in .locationHints, but we # require a newer version that uses tuples of ("ipv4", host, port) assert not isinstance(hint, str), hint if hint[0] == "ipv4": host = hint[1] if ignore_localhost and host == "127.0.0.1": continue advertised.append(host) return advertised def hosts_for_furl(furl, ignore_localhost=True): advertised = [] for hint in SturdyRef(furl).locationHints: assert not isinstance(hint, str), hint if hint[0] == "ipv4": host = hint[1] if ignore_localhost and host == "127.0.0.1": continue advertised.append(host) return advertised def stringify_remote_address(rref): remote = rref.getPeer() if isinstance(remote, address.IPv4Address): return "%s:%d" % (remote.host, remote.port) # loopback is a non-IPv4Address return str(remote) allmydata-tahoe-1.10.2/src/allmydata/util/fileutil.py0000644000175000017500000004540512556560070020702 0ustar ramram""" Futz with files like a pro. """ import sys, exceptions, os, stat, tempfile, time, binascii from twisted.python import log from pycryptopp.cipher.aes import AES def rename(src, dst, tries=4, basedelay=0.1): """ Here is a superkludge to workaround the fact that occasionally on Windows some other process (e.g. an anti-virus scanner, a local search engine, etc.) is looking at your file when you want to delete or move it, and hence you can't. The horrible workaround is to sit and spin, trying to delete it, for a short time and then give up. With the default values of tries and basedelay this can block for less than a second. @param tries: number of tries -- each time after the first we wait twice as long as the previous wait @param basedelay: how long to wait before the second try """ for i in range(tries-1): try: return os.rename(src, dst) except EnvironmentError, le: # XXX Tighten this to check if this is a permission denied error (possibly due to another Windows process having the file open and execute the superkludge only in this case. log.msg("XXX KLUDGE Attempting to move file %s => %s; got %s; sleeping %s seconds" % (src, dst, le, basedelay,)) time.sleep(basedelay) basedelay *= 2 return os.rename(src, dst) # The last try. def remove(f, tries=4, basedelay=0.1): """ Here is a superkludge to workaround the fact that occasionally on Windows some other process (e.g. an anti-virus scanner, a local search engine, etc.) is looking at your file when you want to delete or move it, and hence you can't. The horrible workaround is to sit and spin, trying to delete it, for a short time and then give up. With the default values of tries and basedelay this can block for less than a second. 
@param tries: number of tries -- each time after the first we wait twice as long as the previous wait @param basedelay: how long to wait before the second try """ try: os.chmod(f, stat.S_IWRITE | stat.S_IEXEC | stat.S_IREAD) except: pass for i in range(tries-1): try: return os.remove(f) except EnvironmentError, le: # XXX Tighten this to check if this is a permission denied error (possibly due to another Windows process having the file open and execute the superkludge only in this case. if not os.path.exists(f): return log.msg("XXX KLUDGE Attempting to remove file %s; got %s; sleeping %s seconds" % (f, le, basedelay,)) time.sleep(basedelay) basedelay *= 2 return os.remove(f) # The last try. class ReopenableNamedTemporaryFile: """ This uses tempfile.mkstemp() to generate a secure temp file. It then closes the file, leaving a zero-length file as a placeholder. You can get the filename with ReopenableNamedTemporaryFile.name. When the ReopenableNamedTemporaryFile instance is garbage collected or its shutdown() method is called, it deletes the file. """ def __init__(self, *args, **kwargs): fd, self.name = tempfile.mkstemp(*args, **kwargs) os.close(fd) def __repr__(self): return "<%s instance at %x %s>" % (self.__class__.__name__, id(self), self.name) def __str__(self): return self.__repr__() def __del__(self): self.shutdown() def shutdown(self): remove(self.name) class EncryptedTemporaryFile: # not implemented: next, readline, readlines, xreadlines, writelines def __init__(self): self.file = tempfile.TemporaryFile() self.key = os.urandom(16) # AES-128 def _crypt(self, offset, data): offset_big = offset // 16 offset_small = offset % 16 iv = binascii.unhexlify("%032x" % offset_big) cipher = AES(self.key, iv=iv) cipher.process("\x00"*offset_small) return cipher.process(data) def close(self): self.file.close() def flush(self): self.file.flush() def seek(self, offset, whence=0): # 0 = SEEK_SET self.file.seek(offset, whence) def tell(self): offset = self.file.tell() return offset def read(self, size=-1): """A read must not follow a write, or vice-versa, without an intervening seek.""" index = self.file.tell() ciphertext = self.file.read(size) plaintext = self._crypt(index, ciphertext) return plaintext def write(self, plaintext): """A read must not follow a write, or vice-versa, without an intervening seek. If seeking and then writing causes a 'hole' in the file, the contents of the hole are unspecified.""" index = self.file.tell() ciphertext = self._crypt(index, plaintext) self.file.write(ciphertext) def truncate(self, newsize): """Truncate or extend the file to 'newsize'. If it is extended, the contents after the old end-of-file are unspecified. The file position after this operation is unspecified.""" self.file.truncate(newsize) def make_dirs(dirname, mode=0777): """ An idempotent version of os.makedirs(). If the dir already exists, do nothing and return without raising an exception. If this call creates the dir, return without raising an exception. If there is an error that prevents creation or if the directory gets deleted after make_dirs() creates it and before make_dirs() checks that it exists, raise an exception. """ tx = None try: os.makedirs(dirname, mode) except OSError, x: tx = x if not os.path.isdir(dirname): if tx: raise tx raise exceptions.IOError, "unknown error prevented creation of directory, or deleted the directory immediately after creation: %s" % dirname # careful not to construct an IOError with a 2-tuple, as that has a special meaning... 
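# --- Illustrative sketch, added for exposition; not part of the original
# file. It exercises the idempotence documented above: calling make_dirs()
# on a directory that already exists is a silent no-op, whereas a second
# os.makedirs() call on the same path would raise OSError.
def _example_make_dirs_is_idempotent():
    import os, tempfile
    base = tempfile.mkdtemp()
    target = os.path.join(base, "a", "b", "c")
    make_dirs(target)
    make_dirs(target)            # second call must not raise
    assert os.path.isdir(target)
    rm_dir(base)                 # clean up (rm_dir is defined just below)
# ---------------------------------------------------------------------------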
def rm_dir(dirname): """ A threadsafe and idempotent version of shutil.rmtree(). If the dir is already gone, do nothing and return without raising an exception. If this call removes the dir, return without raising an exception. If there is an error that prevents deletion or if the directory gets created again after rm_dir() deletes it and before rm_dir() checks that it is gone, raise an exception. """ excs = [] try: os.chmod(dirname, stat.S_IWRITE | stat.S_IEXEC | stat.S_IREAD) for f in os.listdir(dirname): fullname = os.path.join(dirname, f) if os.path.isdir(fullname): rm_dir(fullname) else: remove(fullname) os.rmdir(dirname) except Exception, le: # Ignore "No such file or directory" if (not isinstance(le, OSError)) or le.args[0] != 2: excs.append(le) # Okay, now we've recursively removed everything, ignoring any "No # such file or directory" errors, and collecting any other errors. if os.path.exists(dirname): if len(excs) == 1: raise excs[0] if len(excs) == 0: raise OSError, "Failed to remove dir for unknown reason." raise OSError, excs def remove_if_possible(f): try: remove(f) except: pass def du(basedir): size = 0 for root, dirs, files in os.walk(basedir): for f in files: fn = os.path.join(root, f) size += os.path.getsize(fn) return size def move_into_place(source, dest): """Atomically replace a file, or as near to it as the platform allows. The dest file may or may not exist.""" if "win32" in sys.platform.lower(): remove_if_possible(dest) os.rename(source, dest) def write_atomically(target, contents, mode="b"): f = open(target+".tmp", "w"+mode) try: f.write(contents) finally: f.close() move_into_place(target+".tmp", target) def write(path, data, mode="wb"): wf = open(path, mode) try: wf.write(data) finally: wf.close() def read(path): rf = open(path, "rb") try: return rf.read() finally: rf.close() def put_file(path, inf): precondition_abspath(path) # TODO: create temporary file and move into place? outf = open(path, "wb") try: while True: data = inf.read(32768) if not data: break outf.write(data) finally: outf.close() def precondition_abspath(path): if not isinstance(path, unicode): raise AssertionError("an abspath must be a Unicode string") if sys.platform == "win32": # This intentionally doesn't view absolute paths starting with a drive specification, or # paths relative to the current drive, as acceptable. if not path.startswith("\\\\"): raise AssertionError("an abspath should be normalized using abspath_expanduser_unicode") else: # This intentionally doesn't view the path '~' or paths starting with '~/' as acceptable. if not os.path.isabs(path): raise AssertionError("an abspath should be normalized using abspath_expanduser_unicode") # Work around . This code is adapted from # # with some simplifications. _getfullpathname = None try: from nt import _getfullpathname except ImportError: pass def abspath_expanduser_unicode(path, base=None): """ Return the absolute version of a path. If 'base' is given and 'path' is relative, the path will be expanded relative to 'base'. 'path' must be a Unicode string. 'base', if given, must be a Unicode string corresponding to an absolute path as returned by a previous call to abspath_expanduser_unicode. """ if not isinstance(path, unicode): raise AssertionError("paths must be Unicode strings") if base is not None: precondition_abspath(base) path = expanduser(path) if _getfullpathname: # On Windows, os.path.isabs will incorrectly return True # for paths without a drive letter (that are not UNC paths), # e.g. "\\". See . 
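        # (Descriptive note added for exposition, not in the original file:
        # when the Windows-only "from nt import _getfullpathname" above
        # succeeded, it is used here to absolutize the path instead of
        # relying on the broken os.path.isabs behaviour just described; if
        # the call raises OSError we fall through to the generic
        # join-with-base/cwd code below.)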
try: if base is None: path = _getfullpathname(path or u".") else: path = _getfullpathname(os.path.join(base, path)) except OSError: pass if not os.path.isabs(path): if base is None: path = os.path.join(os.getcwdu(), path) else: path = os.path.join(base, path) # We won't hit because # there is always at least one Unicode path component. path = os.path.normpath(path) if sys.platform == "win32": path = to_windows_long_path(path) return path def to_windows_long_path(path): # '/' is normally a perfectly valid path component separator in Windows. # However, when using the "\\?\" syntax it is not recognized, so we # replace it with '\' here. path = path.replace(u"/", u"\\") # Note that other normalizations such as removing '.' and '..' should # be done outside this function. if path.startswith(u"\\\\?\\") or path.startswith(u"\\\\.\\"): return path elif path.startswith(u"\\\\"): return u"\\\\?\\UNC\\" + path[2 :] else: return u"\\\\?\\" + path have_GetDiskFreeSpaceExW = False if sys.platform == "win32": from ctypes import WINFUNCTYPE, windll, POINTER, byref, c_ulonglong, create_unicode_buffer, \ get_last_error from ctypes.wintypes import BOOL, DWORD, LPCWSTR, LPWSTR # GetEnvironmentVariableW = WINFUNCTYPE( DWORD, LPCWSTR, LPWSTR, DWORD, use_last_error=True )(("GetEnvironmentVariableW", windll.kernel32)) try: # PULARGE_INTEGER = POINTER(c_ulonglong) # GetDiskFreeSpaceExW = WINFUNCTYPE( BOOL, LPCWSTR, PULARGE_INTEGER, PULARGE_INTEGER, PULARGE_INTEGER, use_last_error=True )(("GetDiskFreeSpaceExW", windll.kernel32)) have_GetDiskFreeSpaceExW = True except Exception: import traceback traceback.print_exc() def expanduser(path): # os.path.expanduser is hopelessly broken for Unicode paths on Windows (ticket #1674). if sys.platform == "win32": return windows_expanduser(path) else: return os.path.expanduser(path) def windows_expanduser(path): if not path.startswith('~'): return path home_dir = windows_getenv(u'USERPROFILE') if home_dir is None: home_drive = windows_getenv(u'HOMEDRIVE') home_path = windows_getenv(u'HOMEPATH') if home_drive is None or home_path is None: raise OSError("Could not find home directory: neither %USERPROFILE% nor (%HOMEDRIVE% and %HOMEPATH%) are set.") home_dir = os.path.join(home_drive, home_path) if path == '~': return home_dir elif path.startswith('~/') or path.startswith('~\\'): return os.path.join(home_dir, path[2 :]) else: return path # ERROR_ENVVAR_NOT_FOUND = 203 def windows_getenv(name): # Based on , # with improved error handling. Returns None if there is no enivronment variable of the given name. if not isinstance(name, unicode): raise AssertionError("name must be Unicode") n = GetEnvironmentVariableW(name, None, 0) # GetEnvironmentVariableW returns DWORD, so n cannot be negative. if n == 0: err = get_last_error() if err == ERROR_ENVVAR_NOT_FOUND: return None raise OSError("Windows error %d attempting to read size of environment variable %r" % (err, name)) if n == 1: # Avoid an ambiguity between a zero-length string and an error in the return value of the # call to GetEnvironmentVariableW below. 
return u"" buf = create_unicode_buffer(u'\0'*n) retval = GetEnvironmentVariableW(name, buf, n) if retval == 0: err = get_last_error() if err == ERROR_ENVVAR_NOT_FOUND: return None raise OSError("Windows error %d attempting to read environment variable %r" % (err, name)) if retval >= n: raise OSError("Unexpected result %d (expected less than %d) from GetEnvironmentVariableW attempting to read environment variable %r" % (retval, n, name)) return buf.value def get_disk_stats(whichdir, reserved_space=0): """Return disk statistics for the storage disk, in the form of a dict with the following fields. total: total bytes on disk free_for_root: bytes actually free on disk free_for_nonroot: bytes free for "a non-privileged user" [Unix] or the current user [Windows]; might take into account quotas depending on platform used: bytes used on disk avail: bytes available excluding reserved space An AttributeError can occur if the OS has no API to get disk information. An EnvironmentError can occur if the OS call fails. whichdir is a directory on the filesystem in question -- the answer is about the filesystem, not about the directory, so the directory is used only to specify which filesystem. reserved_space is how many bytes to subtract from the answer, so you can pass how many bytes you would like to leave unused on this filesystem as reserved_space. """ if have_GetDiskFreeSpaceExW: # If this is a Windows system and GetDiskFreeSpaceExW is available, use it. # (This might put up an error dialog unless # SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOOPENFILEERRORBOX) has been called, # which we do in allmydata.windows.fixups.initialize().) n_free_for_nonroot = c_ulonglong(0) n_total = c_ulonglong(0) n_free_for_root = c_ulonglong(0) retval = GetDiskFreeSpaceExW(whichdir, byref(n_free_for_nonroot), byref(n_total), byref(n_free_for_root)) if retval == 0: raise OSError("Windows error %d attempting to get disk statistics for %r" % (get_last_error(), whichdir)) free_for_nonroot = n_free_for_nonroot.value total = n_total.value free_for_root = n_free_for_root.value else: # For Unix-like systems. # # # s = os.statvfs(whichdir) # on my mac laptop: # statvfs(2) is a wrapper around statfs(2). # statvfs.f_frsize = statfs.f_bsize : # "minimum unit of allocation" (statvfs) # "fundamental file system block size" (statfs) # statvfs.f_bsize = statfs.f_iosize = stat.st_blocks : preferred IO size # on an encrypted home directory ("FileVault"), it gets f_blocks # wrong, and s.f_blocks*s.f_frsize is twice the size of my disk, # but s.f_bavail*s.f_frsize is correct total = s.f_frsize * s.f_blocks free_for_root = s.f_frsize * s.f_bfree free_for_nonroot = s.f_frsize * s.f_bavail # valid for all platforms: used = total - free_for_root avail = max(free_for_nonroot - reserved_space, 0) return { 'total': total, 'free_for_root': free_for_root, 'free_for_nonroot': free_for_nonroot, 'used': used, 'avail': avail, } def get_available_space(whichdir, reserved_space): """Returns available space for share storage in bytes, or None if no API to get this information is available. whichdir is a directory on the filesystem in question -- the answer is about the filesystem, not about the directory, so the directory is used only to specify which filesystem. reserved_space is how many bytes to subtract from the answer, so you can pass how many bytes you would like to leave unused on this filesystem as reserved_space. 
""" try: return get_disk_stats(whichdir, reserved_space)['avail'] except AttributeError: return None except EnvironmentError: log.msg("OS call to get disk statistics failed") return 0 allmydata-tahoe-1.10.2/src/allmydata/util/base32.py0000644000175000017500000002700412556560070020137 0ustar ramram# from the Python Standard Library import string from allmydata.util.assertutil import precondition z_base_32_alphabet = "ybndrfg8ejkmcpqxot1uwisza345h769" # Zooko's choice, rationale in "DESIGN" doc rfc3548_alphabet = "abcdefghijklmnopqrstuvwxyz234567" # RFC3548 standard used by Gnutella, Content-Addressable Web, THEX, Bitzi, Web-Calculus... chars = rfc3548_alphabet vals = ''.join(map(chr, range(32))) c2vtranstable = string.maketrans(chars, vals) v2ctranstable = string.maketrans(vals, chars) identitytranstable = string.maketrans('', '') def _get_trailing_chars_without_lsbs(N, d): """ @return: a list of chars that can legitimately appear in the last place when the least significant N bits are ignored. """ s = [] if N < 4: s.extend(_get_trailing_chars_without_lsbs(N+1, d=d)) i = 0 while i < len(chars): if not d.has_key(i): d[i] = None s.append(chars[i]) i = i + 2**N return s def get_trailing_chars_without_lsbs(N): precondition((N >= 0) and (N < 5), "N is required to be > 0 and < len(chars).", N=N) if N == 0: return chars d = {} return ''.join(_get_trailing_chars_without_lsbs(N, d=d)) BASE32CHAR = '['+get_trailing_chars_without_lsbs(0)+']' BASE32CHAR_4bits = '['+get_trailing_chars_without_lsbs(1)+']' BASE32CHAR_3bits = '['+get_trailing_chars_without_lsbs(2)+']' BASE32CHAR_2bits = '['+get_trailing_chars_without_lsbs(3)+']' BASE32CHAR_1bits = '['+get_trailing_chars_without_lsbs(4)+']' BASE32STR_1byte = BASE32CHAR+BASE32CHAR_3bits BASE32STR_2bytes = BASE32CHAR+'{3}'+BASE32CHAR_1bits BASE32STR_3bytes = BASE32CHAR+'{4}'+BASE32CHAR_4bits BASE32STR_4bytes = BASE32CHAR+'{6}'+BASE32CHAR_2bits BASE32STR_anybytes = '((?:%s{8})*' % (BASE32CHAR,) + "(?:|%s|%s|%s|%s))" % (BASE32STR_1byte, BASE32STR_2bytes, BASE32STR_3bytes, BASE32STR_4bytes) def b2a(os): """ @param os the data to be encoded (a string) @return the contents of os in base-32 encoded form """ return b2a_l(os, len(os)*8) def b2a_or_none(os): if os is not None: return b2a(os) def b2a_l(os, lengthinbits): """ @param os the data to be encoded (a string) @param lengthinbits the number of bits of data in os to be encoded b2a_l() will generate a base-32 encoded string big enough to encode lengthinbits bits. So for example if os is 2 bytes long and lengthinbits is 15, then b2a_l() will generate a 3-character- long base-32 encoded string (since 3 quintets is sufficient to encode 15 bits). If os is 2 bytes long and lengthinbits is 16 (or None), then b2a_l() will generate a 4-character string. Note that b2a_l() does not mask off unused least-significant bits, so for example if os is 2 bytes long and lengthinbits is 15, then you must ensure that the unused least-significant bit of os is a zero bit or you will get the wrong result. This precondition is tested by assertions if assertions are enabled. Warning: if you generate a base-32 encoded string with b2a_l(), and then someone else tries to decode it by calling a2b() instead of a2b_l(), then they will (probably) get a different string than the one you encoded! So only use b2a_l() when you are sure that the encoding and decoding sides know exactly which lengthinbits to use. If you do not have a way for the encoder and the decoder to agree upon the lengthinbits, then it is best to use b2a() and a2b(). 
The only drawback to using b2a() over b2a_l() is that when you have a number of bits to encode that is not a multiple of 8, b2a() can sometimes generate a base-32 encoded string that is one or two characters longer than necessary. @return the contents of os in base-32 encoded form """ precondition(isinstance(lengthinbits, (int, long,)), "lengthinbits is required to be an integer.", lengthinbits=lengthinbits) precondition((lengthinbits+7)/8 == len(os), "lengthinbits is required to specify a number of bits storable in exactly len(os) octets.", lengthinbits=lengthinbits, lenos=len(os)) os = map(ord, os) numquintets = (lengthinbits+4)/5 numoctetsofdata = (lengthinbits+7)/8 # print "numoctetsofdata: %s, len(os): %s, lengthinbits: %s, numquintets: %s" % (numoctetsofdata, len(os), lengthinbits, numquintets,) # strip trailing octets that won't be used del os[numoctetsofdata:] # zero out any unused bits in the final octet if lengthinbits % 8 != 0: os[-1] = os[-1] >> (8-(lengthinbits % 8)) os[-1] = os[-1] << (8-(lengthinbits % 8)) # append zero octets for padding if needed numoctetsneeded = (numquintets*5+7)/8 + 1 os.extend([0]*(numoctetsneeded-len(os))) quintets = [] cutoff = 256 num = os[0] i = 0 while len(quintets) < numquintets: i = i + 1 assert len(os) > i, "len(os): %s, i: %s, len(quintets): %s, numquintets: %s, lengthinbits: %s, numoctetsofdata: %s, numoctetsneeded: %s, os: %s" % (len(os), i, len(quintets), numquintets, lengthinbits, numoctetsofdata, numoctetsneeded, os,) num = num * 256 num = num + os[i] if cutoff == 1: cutoff = 256 continue cutoff = cutoff * 8 quintet = num / cutoff quintets.append(quintet) num = num - (quintet * cutoff) cutoff = cutoff / 32 quintet = num / cutoff quintets.append(quintet) num = num - (quintet * cutoff) if len(quintets) > numquintets: assert len(quintets) == (numquintets+1), "len(quintets): %s, numquintets: %s, quintets: %s" % (len(quintets), numquintets, quintets,) quintets = quintets[:numquintets] res = string.translate(string.join(map(chr, quintets), ''), v2ctranstable) assert could_be_base32_encoded_l(res, lengthinbits), "lengthinbits: %s, res: %s" % (lengthinbits, res,) return res # b2a() uses the minimal number of quintets sufficient to encode the binary # input. It just so happens that the relation is like this (everything is # modulo 40 bits). # num_qs = NUM_OS_TO_NUM_QS[num_os] NUM_OS_TO_NUM_QS=(0, 2, 4, 5, 7,) # num_os = NUM_QS_TO_NUM_OS[num_qs], but if not NUM_QS_LEGIT[num_qs] then # there is *no* number of octets which would have resulted in this number of # quintets, so either the encoded string has been mangled (truncated) or else # you were supposed to decode it with a2b_l() (which means you were supposed # to know the actual length of the encoded data). NUM_QS_TO_NUM_OS=(0, 1, 1, 2, 2, 3, 3, 4) NUM_QS_LEGIT=(1, 0, 1, 0, 1, 1, 0, 1,) NUM_QS_TO_NUM_BITS=tuple(map(lambda x: x*8, NUM_QS_TO_NUM_OS)) # A fast way to determine whether a given string *could* be base-32 encoded data, assuming that the # original data had 8K bits for a positive integer K. # The boolean value of s8[len(s)%8][ord(s[-1])], where s is the possibly base-32 encoded string # tells whether the final character is reasonable. 
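# --- Illustrative sketch, added for exposition; not part of the original
# file. It demonstrates the "everything is modulo 40 bits" relationship
# encoded in NUM_OS_TO_NUM_QS above: the length of a b2a() result, taken
# mod 8, depends only on the input length mod 5.
def _example_encoded_length_is_periodic_mod_40_bits():
    for num_os in range(1, 12):
        encoded = b2a("\x00" * num_os)
        assert len(encoded) % 8 == NUM_OS_TO_NUM_QS[num_os % 5]
# ---------------------------------------------------------------------------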
def add_check_array(cs, sfmap): checka=[0] * 256 for c in cs: checka[ord(c)] = 1 sfmap.append(tuple(checka)) def init_s8(): s8 = [] add_check_array(chars, s8) for lenmod8 in (1, 2, 3, 4, 5, 6, 7,): if NUM_QS_LEGIT[lenmod8]: add_check_array(get_trailing_chars_without_lsbs(4-(NUM_QS_TO_NUM_BITS[lenmod8]%5)), s8) else: add_check_array('', s8) return tuple(s8) s8 = init_s8() # A somewhat fast way to determine whether a given string *could* be base-32 encoded data, given a # lengthinbits. # The boolean value of s5[lengthinbits%5][ord(s[-1])], where s is the possibly base-32 encoded # string tells whether the final character is reasonable. def init_s5(): s5 = [] add_check_array(get_trailing_chars_without_lsbs(0), s5) for lenmod5 in [1,2,3,4]: add_check_array(get_trailing_chars_without_lsbs(5-lenmod5), s5) return tuple(s5) s5 = init_s5() def could_be_base32_encoded(s, s8=s8, tr=string.translate, identitytranstable=identitytranstable, chars=chars): precondition(isinstance(s, str), s) if s == '': return True return s8[len(s)%8][ord(s[-1])] and not tr(s, identitytranstable, chars) def could_be_base32_encoded_l(s, lengthinbits, s5=s5, tr=string.translate, identitytranstable=identitytranstable, chars=chars): precondition(isinstance(s, str), s) if s == '': return True assert lengthinbits%5 < len(s5), lengthinbits assert ord(s[-1]) < s5[lengthinbits%5] return (((lengthinbits+4)/5) == len(s)) and s5[lengthinbits%5][ord(s[-1])] and not string.translate(s, identitytranstable, chars) def num_octets_that_encode_to_this_many_quintets(numqs): # Here is a computation that conveniently expresses this: return (numqs*5+3)/8 def a2b(cs): """ @param cs the base-32 encoded data (a string) """ precondition(could_be_base32_encoded(cs), "cs is required to be possibly base32 encoded data.", cs=cs) precondition(isinstance(cs, str), cs) return a2b_l(cs, num_octets_that_encode_to_this_many_quintets(len(cs))*8) def a2b_l(cs, lengthinbits): """ @param lengthinbits the number of bits of data in encoded into cs a2b_l() will return a result big enough to hold lengthinbits bits. So for example if cs is 4 characters long (encoding at least 15 and up to 20 bits) and lengthinbits is 16, then a2b_l() will return a string of length 2 (since 2 bytes is sufficient to store 16 bits). If cs is 4 characters long and lengthinbits is 20, then a2b_l() will return a string of length 3 (since 3 bytes is sufficient to store 20 bits). Note that b2a_l() does not mask off unused least- significant bits, so for example if cs is 4 characters long and lengthinbits is 17, then you must ensure that all three of the unused least-significant bits of cs are zero bits or you will get the wrong result. This precondition is tested by assertions if assertions are enabled. (Generally you just require the encoder to ensure this consistency property between the least significant zero bits and value of lengthinbits, and reject strings that have a length-in-bits which isn't a multiple of 8 and yet don't have trailing zero bits, as improperly encoded.) Please see the warning in the docstring of b2a_l() regarding the use of b2a() versus b2a_l(). 
@return the data encoded in cs """ precondition(could_be_base32_encoded_l(cs, lengthinbits), "cs is required to be possibly base32 encoded data.", cs=cs, lengthinbits=lengthinbits) precondition(isinstance(cs, str), cs) if cs == '': return '' qs = map(ord, string.translate(cs, c2vtranstable)) numoctets = (lengthinbits+7)/8 numquintetsofdata = (lengthinbits+4)/5 # strip trailing quintets that won't be used del qs[numquintetsofdata:] # zero out any unused bits in the final quintet if lengthinbits % 5 != 0: qs[-1] = qs[-1] >> (5-(lengthinbits % 5)) qs[-1] = qs[-1] << (5-(lengthinbits % 5)) # append zero quintets for padding if needed numquintetsneeded = (numoctets*8+4)/5 qs.extend([0]*(numquintetsneeded-len(qs))) octets = [] pos = 2048 num = qs[0] * pos i = 1 while len(octets) < numoctets: while pos > 256: pos = pos / 32 num = num + (qs[i] * pos) i = i + 1 octet = num / 256 octets.append(octet) num = num - (octet * 256) num = num * 256 pos = pos * 256 assert len(octets) == numoctets, "len(octets): %s, numoctets: %s, octets: %s" % (len(octets), numoctets, octets,) res = ''.join(map(chr, octets)) precondition(b2a_l(res, lengthinbits) == cs, "cs is required to be the canonical base-32 encoding of some data.", b2a(res), res=res, cs=cs) return res allmydata-tahoe-1.10.2/src/allmydata/util/encodingutil.py0000644000175000017500000002336512556560070021552 0ustar ramram""" Functions used to convert inputs from whatever encoding used in the system to unicode and back. """ import sys, os, re, locale from types import NoneType from allmydata.util.assertutil import precondition from twisted.python import usage from allmydata.util import log from allmydata.util.fileutil import abspath_expanduser_unicode def canonical_encoding(encoding): if encoding is None: log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD) encoding = 'utf-8' encoding = encoding.lower() if encoding == "cp65001": encoding = 'utf-8' elif encoding == "us-ascii" or encoding == "646" or encoding == "ansi_x3.4-1968": encoding = 'ascii' return encoding def check_encoding(encoding): # sometimes Python returns an encoding name that it doesn't support for conversion # fail early if this happens try: u"test".encode(encoding) except (LookupError, AttributeError): raise AssertionError("The character encoding '%s' is not supported for conversion." % (encoding,)) filesystem_encoding = None io_encoding = None is_unicode_platform = False def _reload(): global filesystem_encoding, io_encoding, is_unicode_platform filesystem_encoding = canonical_encoding(sys.getfilesystemencoding()) check_encoding(filesystem_encoding) if sys.platform == 'win32': # On Windows we install UTF-8 stream wrappers for sys.stdout and # sys.stderr, and reencode the arguments as UTF-8 (see scripts/runner.py). io_encoding = 'utf-8' else: ioenc = None if hasattr(sys.stdout, 'encoding'): ioenc = sys.stdout.encoding if ioenc is None: try: ioenc = locale.getpreferredencoding() except Exception: pass # work around io_encoding = canonical_encoding(ioenc) check_encoding(io_encoding) is_unicode_platform = sys.platform in ["win32", "darwin"] _reload() def get_filesystem_encoding(): """ Returns expected encoding for local filenames. """ return filesystem_encoding def get_io_encoding(): """ Returns expected encoding for writing to stdout or stderr, and for arguments in sys.argv. """ return io_encoding def argv_to_unicode(s): """ Decode given argv element to unicode. If this fails, raise a UsageError. 
""" precondition(isinstance(s, str), s) try: return unicode(s, io_encoding) except UnicodeDecodeError: raise usage.UsageError("Argument %s cannot be decoded as %s." % (quote_output(s), io_encoding)) def argv_to_abspath(s): """ Convenience function to decode an argv element to an absolute path, with ~ expanded. If this fails, raise a UsageError. """ return abspath_expanduser_unicode(argv_to_unicode(s)) def unicode_to_argv(s, mangle=False): """ Encode the given Unicode argument as a bytestring. If the argument is to be passed to a different process, then the 'mangle' argument should be true; on Windows, this uses a mangled encoding that will be reversed by code in runner.py. """ precondition(isinstance(s, unicode), s) if mangle and sys.platform == "win32": # This must be the same as 'mangle' in bin/tahoe-script.template. return str(re.sub(ur'[^\x20-\x7F]', lambda m: u'\x7F%x;' % (ord(m.group(0)),), s)) else: return s.encode(io_encoding) def unicode_to_url(s): """ Encode an unicode object used in an URL. """ # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded. # FIXME return to_str(s) #precondition(isinstance(s, unicode), s) #return s.encode('utf-8') def to_str(s): if s is None or isinstance(s, str): return s return s.encode('utf-8') def from_utf8_or_none(s): precondition(isinstance(s, (NoneType, str)), s) if s is None: return s return s.decode('utf-8') PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$', re.DOTALL) PRINTABLE_8BIT = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL) def is_printable_ascii(s): return PRINTABLE_ASCII.search(s) is not None def unicode_to_output(s): """ Encode an unicode object for representation on stdout or stderr. """ precondition(isinstance(s, unicode), s) try: out = s.encode(io_encoding) except (UnicodeEncodeError, UnicodeDecodeError): raise UnicodeEncodeError(io_encoding, s, 0, 0, "A string could not be encoded as %s for output to the terminal:\n%r" % (io_encoding, repr(s))) if PRINTABLE_8BIT.search(out) is None: raise UnicodeEncodeError(io_encoding, s, 0, 0, "A string encoded as %s for output to the terminal contained unsafe bytes:\n%r" % (io_encoding, repr(s))) return out def _unicode_escape(m, quote_newlines): u = m.group(0) if u == u'"' or u == u'$' or u == u'`' or u == u'\\': return u'\\' + u elif u == u'\n' and not quote_newlines: return u if len(u) == 2: codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000 else: codepoint = ord(u) if codepoint > 0xFFFF: return u'\\U%08x' % (codepoint,) elif codepoint > 0xFF: return u'\\u%04x' % (codepoint,) else: return u'\\x%02x' % (codepoint,) def _str_escape(m, quote_newlines): c = m.group(0) if c == '"' or c == '$' or c == '`' or c == '\\': return '\\' + c elif c == '\n' and not quote_newlines: return c else: return '\\x%02x' % (ord(c),) MUST_DOUBLE_QUOTE_NL = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL) MUST_DOUBLE_QUOTE = re.compile(ur'[^\n\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL) # if we must double-quote, then we have to escape ", $ and `, but need not escape ' ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|' # valid surrogate pairs ur'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL) ESCAPABLE_8BIT = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL) def quote_output(s, quotemarks=True, quote_newlines=None, encoding=None): """ Encode either a Unicode string or a UTF-8-encoded bytestring for representation on stdout or stderr, 
tolerating errors. If 'quotemarks' is True, the string is always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or control bytes in the output. (Newlines are counted as control bytes iff quote_newlines is True.) Quoting may use either single or double quotes. Within single quotes, all characters stand for themselves, and ' will not appear. Within double quotes, Python-compatible backslash escaping is used. If not explicitly given, quote_newlines is True when quotemarks is True. """ precondition(isinstance(s, (str, unicode)), s) if quote_newlines is None: quote_newlines = quotemarks if isinstance(s, str): try: s = s.decode('utf-8') except UnicodeDecodeError: return 'b"%s"' % (ESCAPABLE_8BIT.sub(lambda m: _str_escape(m, quote_newlines), s),) must_double_quote = quote_newlines and MUST_DOUBLE_QUOTE_NL or MUST_DOUBLE_QUOTE if must_double_quote.search(s) is None: try: out = s.encode(encoding or io_encoding) if quotemarks or out.startswith('"'): return "'%s'" % (out,) else: return out except (UnicodeDecodeError, UnicodeEncodeError): pass escaped = ESCAPABLE_UNICODE.sub(lambda m: _unicode_escape(m, quote_newlines), s) return '"%s"' % (escaped.encode(encoding or io_encoding, 'backslashreplace'),) def quote_path(path, quotemarks=True): return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks, quote_newlines=True) def quote_local_unicode_path(path, quotemarks=True): precondition(isinstance(path, unicode), path) if sys.platform == "win32" and path.startswith(u"\\\\?\\"): path = path[4 :] if path.startswith(u"UNC\\"): path = u"\\\\" + path[4 :] return quote_output(path, quotemarks=quotemarks, quote_newlines=True) def unicode_platform(): """ Does the current platform handle Unicode filenames natively? """ return is_unicode_platform class FilenameEncodingError(Exception): """ Filename cannot be encoded using the current encoding of your filesystem (%s). Please configure your locale correctly or rename this file. """ pass def listdir_unicode_fallback(path): """ This function emulates a fallback Unicode API similar to one available under Windows or MacOS X. If badly encoded filenames are encountered, an exception is raised. """ precondition(isinstance(path, unicode), path) try: byte_path = path.encode(filesystem_encoding) except (UnicodeEncodeError, UnicodeDecodeError): raise FilenameEncodingError(path) try: return [unicode(fn, filesystem_encoding) for fn in os.listdir(byte_path)] except UnicodeDecodeError: raise FilenameEncodingError(fn) def listdir_unicode(path): """ Wrapper around listdir() which provides safe access to the convenient Unicode API even under platforms that don't provide one natively. """ precondition(isinstance(path, unicode), path) # On Windows and MacOS X, the Unicode API is used # On other platforms (ie. 
Unix systems), the byte-level API is used if is_unicode_platform: return os.listdir(path) else: return listdir_unicode_fallback(path) allmydata-tahoe-1.10.2/src/allmydata/util/abbreviate.py0000644000175000017500000000454612556560070021172 0ustar ramram import re HOUR = 3600 DAY = 24*3600 WEEK = 7*DAY MONTH = 30*DAY YEAR = 365*DAY def abbreviate_time(s): def _plural(count, unit): count = int(count) if count == 1: return "%d %s" % (count, unit) return "%d %ss" % (count, unit) if s is None: return "unknown" if s < 120: return _plural(s, "second") if s < 3*HOUR: return _plural(s/60, "minute") if s < 2*DAY: return _plural(s/HOUR, "hour") if s < 2*MONTH: return _plural(s/DAY, "day") if s < 4*YEAR: return _plural(s/MONTH, "month") return _plural(s/YEAR, "year") def abbreviate_space(s, SI=True): if s is None: return "unknown" if SI: U = 1000.0 isuffix = "B" else: U = 1024.0 isuffix = "iB" def r(count, suffix): return "%.2f %s%s" % (count, suffix, isuffix) if s < 1024: # 1000-1023 get emitted as bytes, even in SI mode return "%d B" % s if s < U*U: return r(s/U, "k") if s < U*U*U: return r(s/(U*U), "M") if s < U*U*U*U: return r(s/(U*U*U), "G") if s < U*U*U*U*U: return r(s/(U*U*U*U), "T") if s < U*U*U*U*U*U: return r(s/(U*U*U*U*U), "P") return r(s/(U*U*U*U*U*U), "E") def abbreviate_space_both(s): return "(%s, %s)" % (abbreviate_space(s, True), abbreviate_space(s, False)) def parse_abbreviated_size(s): if s is None or s == "": return None m = re.match(r"^(\d+)([KMGTPE]?[I]?[B]?)$", s.upper()) if not m: raise ValueError("unparseable value %s" % s) number, suffix = m.groups() if suffix.endswith("B"): suffix = suffix[:-1] multiplier = {"": 1, "I": 1, "K": 1000, "M": 1000 * 1000, "G": 1000 * 1000 * 1000, "T": 1000 * 1000 * 1000 * 1000, "P": 1000 * 1000 * 1000 * 1000 * 1000, "E": 1000 * 1000 * 1000 * 1000 * 1000 * 1000, "KI": 1024, "MI": 1024 * 1024, "GI": 1024 * 1024 * 1024, "TI": 1024 * 1024 * 1024 * 1024, "PI": 1024 * 1024 * 1024 * 1024 * 1024, "EI": 1024 * 1024 * 1024 * 1024 * 1024 * 1024, }[suffix] return int(number) * multiplier allmydata-tahoe-1.10.2/src/allmydata/util/namespace.py0000644000175000017500000000004312556560070021006 0ustar ramram class Namespace(object): pass allmydata-tahoe-1.10.2/src/allmydata/util/pipeline.py0000644000175000017500000001103212556560070020657 0ustar ramram from twisted.internet import defer from twisted.python.failure import Failure from twisted.python import log from allmydata.util.assertutil import precondition class PipelineError(Exception): """One of the pipelined messages returned an error. The received Failure object is stored in my .error attribute.""" def __init__(self, error): self.error = error def __repr__(self): return "" % (self.error,) def __str__(self): return "" % (self.error,) class SingleFileError(Exception): """You are not permitted to add a job to a full pipeline.""" class ExpandableDeferredList(defer.Deferred): # like DeferredList(fireOnOneErrback=True) with a built-in # gatherResults(), but you can add new Deferreds until you close it. This # gives you a chance to add don't-complain-about-unhandled-error errbacks # immediately after attachment, regardless of whether you actually end up # wanting the list or not. 
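    # Illustrative usage sketch, added for exposition; not part of the
    # original source. Assuming d1 and d2 are ordinary Twisted Deferreds:
    #
    #     edl = ExpandableDeferredList()
    #     edl.addDeferred(d1)
    #     edl.addDeferred(d2)
    #     edl.close()                           # no more addDeferred() calls
    #     edl.addCallback(lambda results: ...)  # fires with [r1, r2], in order
    #
    # The list does not fire -- even if d1 and d2 have already fired -- until
    # close() is called, and a failure in any added Deferred causes the list
    # to errback with that failure instead.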
def __init__(self): defer.Deferred.__init__(self) self.resultsReceived = 0 self.resultList = [] self.failure = None self.closed = False def addDeferred(self, d): precondition(not self.closed, "don't call addDeferred() on a closed ExpandableDeferredList") index = len(self.resultList) self.resultList.append(None) d.addCallbacks(self._cbDeferred, self._ebDeferred, callbackArgs=(index,)) return d def close(self): self.closed = True self.checkForFinished() def checkForFinished(self): if not self.closed: return if self.called: return if self.failure: self.errback(self.failure) elif self.resultsReceived == len(self.resultList): self.callback(self.resultList) def _cbDeferred(self, res, index): self.resultList[index] = res self.resultsReceived += 1 self.checkForFinished() return res def _ebDeferred(self, f): self.failure = f self.checkForFinished() return f class Pipeline: """I manage a size-limited pipeline of Deferred operations, usually callRemote() messages.""" def __init__(self, capacity): self.capacity = capacity # how full we can be self.gauge = 0 # how full we are self.failure = None self.waiting = [] # callers of add() who are blocked self.unflushed = ExpandableDeferredList() def add(self, _size, _func, *args, **kwargs): # We promise that all the Deferreds we return will fire in the order # they were returned. To make it easier to keep this promise, we # prohibit multiple outstanding calls to add() . if self.waiting: raise SingleFileError if self.failure: return defer.fail(self.failure) self.gauge += _size fd = defer.maybeDeferred(_func, *args, **kwargs) fd.addBoth(self._call_finished, _size) self.unflushed.addDeferred(fd) fd.addErrback(self._eat_pipeline_errors) fd.addErrback(log.err, "_eat_pipeline_errors didn't eat it") if self.gauge < self.capacity: return defer.succeed(None) d = defer.Deferred() self.waiting.append(d) return d def flush(self): if self.failure: return defer.fail(self.failure) d, self.unflushed = self.unflushed, ExpandableDeferredList() d.close() d.addErrback(self._flushed_error) return d def _flushed_error(self, f): precondition(self.failure) # should have been set by _call_finished return self.failure def _call_finished(self, res, size): self.gauge -= size if isinstance(res, Failure): res = Failure(PipelineError(res)) if not self.failure: self.failure = res if self.failure: while self.waiting: d = self.waiting.pop(0) d.errback(self.failure) else: while self.waiting and (self.gauge < self.capacity): d = self.waiting.pop(0) d.callback(None) # the d.callback() might trigger a new call to add(), which # will raise our gauge and might cause the pipeline to be # filled. So the while() loop gets a chance to tell the # caller to stop. return res def _eat_pipeline_errors(self, f): f.trap(PipelineError) return None allmydata-tahoe-1.10.2/src/allmydata/util/mathutil.py0000644000175000017500000000245112556560070020706 0ustar ramram""" A few commonly needed functions. """ import math def div_ceil(n, d): """ The smallest integer k such that k*d >= n. """ return (n/d) + (n%d != 0) def next_multiple(n, k): """ The smallest multiple of k which is >= n. """ return div_ceil(n, k) * k def pad_size(n, k): """ The smallest number that has to be added to n so that n is a multiple of k. 
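    For example: pad_size(400, 3) == 2, since 402 is the smallest multiple
    of 3 that is >= 400.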
""" if n%k: return k - n%k else: return 0 def is_power_of_k(n, k): return k**int(math.log(n, k) + 0.5) == n def next_power_of_k(n, k): if n == 0: x = 0 else: x = int(math.log(n, k) + 0.5) if k**x < n: return k**(x+1) else: return k**x def ave(l): return sum(l) / len(l) def log_ceil(n, b): """ The smallest integer k such that b^k >= n. log_ceil(n, 2) is the number of bits needed to store any of n values, e.g. the number of bits needed to store any of 128 possible values is 7. """ p = 1 k = 0 while p < n: p *= b k += 1 return k def log_floor(n, b): """ The largest integer k such that b^k <= n. """ p = 1 k = 0 while p <= n: p *= b k += 1 return k - 1 def round_sigfigs(f, n): fmt = "%." + str(n-1) + "e" return float(fmt % f) allmydata-tahoe-1.10.2/src/allmydata/util/cachedir.py0000644000175000017500000000237012556560070020621 0ustar ramram import os.path, stat, weakref, time from twisted.application import service, internet from allmydata.util import fileutil HOUR = 60*60 class CacheDirectoryManager(service.MultiService): def __init__(self, basedir, pollinterval=1*HOUR, old=1*HOUR): service.MultiService.__init__(self) self.basedir = basedir fileutil.make_dirs(basedir) self.old = old self.files = weakref.WeakValueDictionary() t = internet.TimerService(pollinterval, self.check) t.setServiceParent(self) def get_file(self, key): assert isinstance(key, str) # used as filename absfn = os.path.join(self.basedir, key) if os.path.exists(absfn): os.utime(absfn, None) cf = CacheFile(absfn) self.files[key] = cf return cf def check(self): now = time.time() for fn in os.listdir(self.basedir): if fn in self.files: continue absfn = os.path.join(self.basedir, fn) mtime = os.stat(absfn)[stat.ST_MTIME] if now - mtime > self.old: os.remove(absfn) class CacheFile: def __init__(self, absfn): self.filename = absfn def get_filename(self): return self.filename allmydata-tahoe-1.10.2/src/allmydata/util/__init__.py0000644000175000017500000000000012556560070020602 0ustar ramramallmydata-tahoe-1.10.2/src/allmydata/_version.py0000644000175000017500000000036612556560072017733 0ustar ramram # This _version.py is generated from git metadata by the tahoe setup.py. __pkgname__ = 'allmydata-tahoe' real_version = '1.10.2' full_version = 'befa4babea7f92609654207c164b6d07f3baf92b' branch = 'master' verstr = '1.10.2' __version__ = verstr allmydata-tahoe-1.10.2/src/allmydata/immutable/0000755000175000017500000000000012556560072017507 5ustar ramramallmydata-tahoe-1.10.2/src/allmydata/immutable/downloader/0000755000175000017500000000000012556560072021645 5ustar ramramallmydata-tahoe-1.10.2/src/allmydata/immutable/downloader/finder.py0000644000175000017500000002201212556560070023461 0ustar ramram import time now = time.time from foolscap.api import eventually from allmydata.util import base32, log from twisted.internet import reactor from share import Share, CommonShare def incidentally(res, f, *args, **kwargs): """Add me to a Deferred chain like this: d.addBoth(incidentally, func, arg) and I'll behave as if you'd added the following function: def _(res): func(arg) return res This is useful if you want to execute an expression when the Deferred fires, but don't care about its value. 
""" f(*args, **kwargs) return res class RequestToken: def __init__(self, server): self.server = server class ShareFinder: OVERDUE_TIMEOUT = 10.0 def __init__(self, storage_broker, verifycap, node, download_status, logparent=None, max_outstanding_requests=10): self.running = True # stopped by Share.stop, from Terminator self.verifycap = verifycap self._started = False self._storage_broker = storage_broker self.share_consumer = self.node = node self.max_outstanding_requests = max_outstanding_requests self._hungry = False self._commonshares = {} # shnum to CommonShare instance self.pending_requests = set() self.overdue_requests = set() # subset of pending_requests self.overdue_timers = {} self._storage_index = verifycap.storage_index self._si_prefix = base32.b2a_l(self._storage_index[:8], 60) self._node_logparent = logparent self._download_status = download_status self._lp = log.msg(format="ShareFinder[si=%(si)s] starting", si=self._si_prefix, level=log.NOISY, parent=logparent, umid="2xjj2A") def update_num_segments(self): (numsegs, authoritative) = self.node.get_num_segments() assert authoritative for cs in self._commonshares.values(): cs.set_authoritative_num_segments(numsegs) def start_finding_servers(self): # don't get servers until somebody uses us: creating the # ImmutableFileNode should not cause work to happen yet. Test case is # test_dirnode, which creates us with storage_broker=None if not self._started: si = self.verifycap.storage_index servers = self._storage_broker.get_servers_for_psi(si) self._servers = iter(servers) self._started = True def log(self, *args, **kwargs): if "parent" not in kwargs: kwargs["parent"] = self._lp return log.msg(*args, **kwargs) def stop(self): self.running = False while self.overdue_timers: req,t = self.overdue_timers.popitem() t.cancel() # called by our parent CiphertextDownloader def hungry(self): self.log(format="ShareFinder[si=%(si)s] hungry", si=self._si_prefix, level=log.NOISY, umid="NywYaQ") self.start_finding_servers() self._hungry = True eventually(self.loop) # internal methods def loop(self): pending_s = ",".join([rt.server.get_name() for rt in self.pending_requests]) # sort? self.log(format="ShareFinder loop: running=%(running)s" " hungry=%(hungry)s, pending=%(pending)s", running=self.running, hungry=self._hungry, pending=pending_s, level=log.NOISY, umid="kRtS4Q") if not self.running: return if not self._hungry: return non_overdue = self.pending_requests - self.overdue_requests if len(non_overdue) >= self.max_outstanding_requests: # cannot send more requests, must wait for some to retire return server = None try: if self._servers: server = self._servers.next() except StopIteration: self._servers = None if server: self.send_request(server) # we loop again to get parallel queries. The check above will # prevent us from looping forever. eventually(self.loop) return if self.pending_requests: # no server, but there are still requests in flight: maybe one of # them will make progress return self.log(format="ShareFinder.loop: no_more_shares, ever", level=log.UNUSUAL, umid="XjQlzg") # we've run out of servers (so we can't send any more requests), and # we have nothing in flight. No further progress can be made. They # are destined to remain hungry. 
eventually(self.share_consumer.no_more_shares) def send_request(self, server): req = RequestToken(server) self.pending_requests.add(req) lp = self.log(format="sending DYHB to [%(name)s]", name=server.get_name(), level=log.NOISY, umid="Io7pyg") time_sent = now() d_ev = self._download_status.add_dyhb_request(server, time_sent) # TODO: get the timer from a Server object, it knows best self.overdue_timers[req] = reactor.callLater(self.OVERDUE_TIMEOUT, self.overdue, req) d = server.get_rref().callRemote("get_buckets", self._storage_index) d.addBoth(incidentally, self._request_retired, req) d.addCallbacks(self._got_response, self._got_error, callbackArgs=(server, req, d_ev, time_sent, lp), errbackArgs=(server, req, d_ev, lp)) d.addErrback(log.err, format="error in send_request", level=log.WEIRD, parent=lp, umid="rpdV0w") d.addCallback(incidentally, eventually, self.loop) def _request_retired(self, req): self.pending_requests.discard(req) self.overdue_requests.discard(req) if req in self.overdue_timers: self.overdue_timers[req].cancel() del self.overdue_timers[req] def overdue(self, req): del self.overdue_timers[req] assert req in self.pending_requests # paranoia, should never be false self.overdue_requests.add(req) eventually(self.loop) def _got_response(self, buckets, server, req, d_ev, time_sent, lp): shnums = sorted([shnum for shnum in buckets]) time_received = now() d_ev.finished(shnums, time_received) dyhb_rtt = time_received - time_sent if not buckets: self.log(format="no shares from [%(name)s]", name=server.get_name(), level=log.NOISY, parent=lp, umid="U7d4JA") return shnums_s = ",".join([str(shnum) for shnum in shnums]) self.log(format="got shnums [%(shnums)s] from [%(name)s]", shnums=shnums_s, name=server.get_name(), level=log.NOISY, parent=lp, umid="0fcEZw") shares = [] for shnum, bucket in buckets.iteritems(): s = self._create_share(shnum, bucket, server, dyhb_rtt) shares.append(s) self._deliver_shares(shares) def _create_share(self, shnum, bucket, server, dyhb_rtt): if shnum in self._commonshares: cs = self._commonshares[shnum] else: numsegs, authoritative = self.node.get_num_segments() cs = CommonShare(numsegs, self._si_prefix, shnum, self._node_logparent) if authoritative: cs.set_authoritative_num_segments(numsegs) # Share._get_satisfaction is responsible for updating # CommonShare.set_numsegs after we know the UEB. Alternatives: # 1: d = self.node.get_num_segments() # d.addCallback(cs.got_numsegs) # the problem is that the OneShotObserverList I was using # inserts an eventual-send between _get_satisfaction's # _satisfy_UEB and _satisfy_block_hash_tree, and the # CommonShare didn't get the num_segs message before # being asked to set block hash values. To resolve this # would require an immediate ObserverList instead of # an eventual-send -based one # 2: break _get_satisfaction into Deferred-attached pieces. # Yuck. 
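            # cache the CommonShare keyed by shnum: if several servers hold
            # copies of the same share number, they all validate against a
            # single shared block-hash-tree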
self._commonshares[shnum] = cs s = Share(bucket, server, self.verifycap, cs, self.node, self._download_status, shnum, dyhb_rtt, self._node_logparent) return s def _deliver_shares(self, shares): # they will call hungry() again if they want more self._hungry = False shares_s = ",".join([str(sh) for sh in shares]) self.log(format="delivering shares: %s" % shares_s, level=log.NOISY, umid="2n1qQw") eventually(self.share_consumer.got_shares, shares) def _got_error(self, f, server, req, d_ev, lp): d_ev.error(now()) self.log(format="got error from [%(name)s]", name=server.get_name(), failure=f, level=log.UNUSUAL, parent=lp, umid="zUKdCw") allmydata-tahoe-1.10.2/src/allmydata/immutable/downloader/status.py0000644000175000017500000002200412556560070023536 0ustar ramram import itertools from zope.interface import implements from allmydata.interfaces import IDownloadStatus class ReadEvent: def __init__(self, ev, ds): self._ev = ev self._ds = ds def update(self, bytes, decrypttime, pausetime): self._ev["bytes_returned"] += bytes self._ev["decrypt_time"] += decrypttime self._ev["paused_time"] += pausetime def finished(self, finishtime): self._ev["finish_time"] = finishtime self._ds.update_last_timestamp(finishtime) class SegmentEvent: def __init__(self, ev, ds): self._ev = ev self._ds = ds def activate(self, when): if self._ev["active_time"] is None: self._ev["active_time"] = when def deliver(self, when, start, length, decodetime): assert self._ev["active_time"] is not None self._ev["finish_time"] = when self._ev["success"] = True self._ev["decode_time"] = decodetime self._ev["segment_start"] = start self._ev["segment_length"] = length self._ds.update_last_timestamp(when) def error(self, when): self._ev["finish_time"] = when self._ev["success"] = False self._ds.update_last_timestamp(when) class DYHBEvent: def __init__(self, ev, ds): self._ev = ev self._ds = ds def error(self, when): self._ev["finish_time"] = when self._ev["success"] = False self._ds.update_last_timestamp(when) def finished(self, shnums, when): self._ev["finish_time"] = when self._ev["success"] = True self._ev["response_shnums"] = shnums self._ds.update_last_timestamp(when) class BlockRequestEvent: def __init__(self, ev, ds): self._ev = ev self._ds = ds def finished(self, received, when): self._ev["finish_time"] = when self._ev["success"] = True self._ev["response_length"] = received self._ds.update_last_timestamp(when) def error(self, when): self._ev["finish_time"] = when self._ev["success"] = False self._ds.update_last_timestamp(when) class DownloadStatus: # There is one DownloadStatus for each CiphertextFileNode. The status # object will keep track of all activity for that node. implements(IDownloadStatus) statusid_counter = itertools.count(0) def __init__(self, storage_index, size): self.storage_index = storage_index self.size = size self.counter = self.statusid_counter.next() self.helper = False self.first_timestamp = None self.last_timestamp = None # all four of these _events lists are sorted by start_time, because # they are strictly append-only (some elements are later mutated in # place, but none are removed or inserted in the middle). # self.read_events tracks read() requests. 
It is a list of dicts, # each with the following keys: # start,length (of data requested) # start_time # finish_time (None until finished) # bytes_returned (starts at 0, grows as segments are delivered) # decrypt_time (time spent in decrypt, None for ciphertext-only reads) # paused_time (time spent paused by client via pauseProducing) self.read_events = [] # self.segment_events tracks segment requests and their resolution. # It is a list of dicts: # segment_number # start_time # active_time (None until work has begun) # decode_time (time spent in decode, None until delievered) # finish_time (None until resolved) # success (None until resolved, then boolean) # segment_start (file offset of first byte, None until delivered) # segment_length (None until delivered) self.segment_events = [] # self.dyhb_requests tracks "do you have a share" requests and # responses. It is a list of dicts: # server (instance of IServer) # start_time # success (None until resolved, then boolean) # response_shnums (tuple, None until successful) # finish_time (None until resolved) self.dyhb_requests = [] # self.block_requests tracks share-data requests and responses. It is # a list of dicts: # server (instance of IServer) # shnum, # start,length, (of data requested) # start_time # finish_time (None until resolved) # success (None until resolved, then bool) # response_length (None until success) self.block_requests = [] self.known_shares = [] # (server, shnum) self.problems = [] self.misc_events = [] def add_misc_event(self, what, start, finish=None): self.misc_events.append( {"what": what, "start_time": start, "finish_time": finish, } ) def add_read_event(self, start, length, when): if self.first_timestamp is None: self.first_timestamp = when r = { "start": start, "length": length, "start_time": when, "finish_time": None, "bytes_returned": 0, "decrypt_time": 0, "paused_time": 0, } self.read_events.append(r) return ReadEvent(r, self) def add_segment_request(self, segnum, when): if self.first_timestamp is None: self.first_timestamp = when r = { "segment_number": segnum, "start_time": when, "active_time": None, "finish_time": None, "success": None, "decode_time": None, "segment_start": None, "segment_length": None, } self.segment_events.append(r) return SegmentEvent(r, self) def add_dyhb_request(self, server, when): r = { "server": server, "start_time": when, "success": None, "response_shnums": None, "finish_time": None, } self.dyhb_requests.append(r) return DYHBEvent(r, self) def add_block_request(self, server, shnum, start, length, when): r = { "server": server, "shnum": shnum, "start": start, "length": length, "start_time": when, "finish_time": None, "success": None, "response_length": None, } self.block_requests.append(r) return BlockRequestEvent(r, self) def update_last_timestamp(self, when): if self.last_timestamp is None or when > self.last_timestamp: self.last_timestamp = when def add_known_share(self, server, shnum): # XXX use me self.known_shares.append( (server, shnum) ) def add_problem(self, p): self.problems.append(p) # IDownloadStatus methods def get_counter(self): return self.counter def get_storage_index(self): return self.storage_index def get_size(self): return self.size def get_status(self): # mention all outstanding segment requests outstanding = set() errorful = set() outstanding = set([s_ev["segment_number"] for s_ev in self.segment_events if s_ev["finish_time"] is None]) errorful = set([s_ev["segment_number"] for s_ev in self.segment_events if s_ev["success"] is False]) def join(segnums): if 
len(segnums) == 1: return "segment %s" % list(segnums)[0] else: return "segments %s" % (",".join([str(i) for i in sorted(segnums)])) error_s = "" if errorful: error_s = "; errors on %s" % join(errorful) if outstanding: s = "fetching %s" % join(outstanding) else: s = "idle" return s + error_s def get_progress(self): # measure all read events that aren't completely done, return the # total percentage complete for them if not self.read_events: return 0.0 total_outstanding, total_received = 0, 0 for r_ev in self.read_events: if r_ev["finish_time"] is None: total_outstanding += r_ev["length"] total_received += r_ev["bytes_returned"] # else ignore completed requests if not total_outstanding: return 1.0 return 1.0 * total_received / total_outstanding def using_helper(self): return False def get_active(self): # a download is considered active if it has at least one outstanding # read() call for r_ev in self.read_events: if r_ev["finish_time"] is None: return True return False def get_started(self): return self.first_timestamp def get_results(self): return None # TODO allmydata-tahoe-1.10.2/src/allmydata/immutable/downloader/node.py0000644000175000017500000005701212556560070023147 0ustar ramram import time now = time.time from zope.interface import Interface from twisted.python.failure import Failure from twisted.internet import defer from foolscap.api import eventually from allmydata import uri from allmydata.codec import CRSDecoder from allmydata.util import base32, log, hashutil, mathutil, observer from allmydata.interfaces import DEFAULT_MAX_SEGMENT_SIZE from allmydata.hashtree import IncompleteHashTree, BadHashError, \ NotEnoughHashesError # local imports from finder import ShareFinder from fetcher import SegmentFetcher from segmentation import Segmentation from common import BadCiphertextHashError class IDownloadStatusHandlingConsumer(Interface): def set_download_status_read_event(read_ev): """Record the DownloadStatus 'read event', to be updated with the time it takes to decrypt each chunk of data.""" class Cancel: def __init__(self, f): self._f = f self.active = True def cancel(self): if self.active: self.active = False self._f(self) class DownloadNode: """Internal class which manages downloads and holds state. External callers use CiphertextFileNode instead.""" # Share._node points to me def __init__(self, verifycap, storage_broker, secret_holder, terminator, history, download_status): assert isinstance(verifycap, uri.CHKFileVerifierURI) self._verifycap = verifycap self._storage_broker = storage_broker self._si_prefix = base32.b2a_l(verifycap.storage_index[:8], 60) self.running = True if terminator: terminator.register(self) # calls self.stop() at stopService() # the rules are: # 1: Only send network requests if you're active (self.running is True) # 2: Use TimerService, not reactor.callLater # 3: You can do eventual-sends any time. # These rules should mean that once # stopService()+flushEventualQueue() fires, everything will be done. self._secret_holder = secret_holder self._history = history self._download_status = download_status self.share_hash_tree = IncompleteHashTree(self._verifycap.total_shares) # we guess the segment size, so Segmentation can pull non-initial # segments in a single roundtrip. 
This populates # .guessed_segment_size, .guessed_num_segments, and # .ciphertext_hash_tree (with a dummy, to let us guess which hashes # we'll need) self._build_guessed_tables(DEFAULT_MAX_SEGMENT_SIZE) # filled in when we parse a valid UEB self.have_UEB = False self.segment_size = None self.tail_segment_size = None self.tail_segment_padded = None self.num_segments = None self.block_size = None self.tail_block_size = None # things to track callers that want data # _segment_requests can have duplicates self._segment_requests = [] # (segnum, d, cancel_handle, seg_ev, lp) self._active_segment = None # a SegmentFetcher, with .segnum self._segsize_observers = observer.OneShotObserverList() # we create one top-level logparent for this _Node, and another one # for each read() call. Segmentation and get_segment() messages are # associated with the read() call, everything else is tied to the # _Node's log entry. lp = log.msg(format="Immutable.DownloadNode(%(si)s) created:" " size=%(size)d," " guessed_segsize=%(guessed_segsize)d," " guessed_numsegs=%(guessed_numsegs)d", si=self._si_prefix, size=verifycap.size, guessed_segsize=self.guessed_segment_size, guessed_numsegs=self.guessed_num_segments, level=log.OPERATIONAL, umid="uJ0zAQ") self._lp = lp self._sharefinder = ShareFinder(storage_broker, verifycap, self, self._download_status, lp) self._shares = set() def _build_guessed_tables(self, max_segment_size): size = min(self._verifycap.size, max_segment_size) s = mathutil.next_multiple(size, self._verifycap.needed_shares) self.guessed_segment_size = s r = self._calculate_sizes(self.guessed_segment_size) self.guessed_num_segments = r["num_segments"] # as with CommonShare, our ciphertext_hash_tree is a stub until we # get the real num_segments self.ciphertext_hash_tree = IncompleteHashTree(self.guessed_num_segments) self.ciphertext_hash_tree_leaves = self.guessed_num_segments def __repr__(self): return "ImmutableDownloadNode(%s)" % (self._si_prefix,) def stop(self): # called by the Terminator at shutdown, mostly for tests if self._active_segment: self._active_segment.stop() self._active_segment = None self._sharefinder.stop() # things called by outside callers, via CiphertextFileNode. get_segment() # may also be called by Segmentation. def read(self, consumer, offset, size): """I am the main entry point, from which FileNode.read() can get data. I feed the consumer with the desired range of ciphertext. I return a Deferred that fires (with the consumer) when the read is finished. Note that there is no notion of a 'file pointer': each call to read() uses an independent offset= value. 
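        Illustrative calls (my_consumer is hypothetical): read(my_consumer,
        0, None) streams the entire ciphertext, since size=None means 'to
        the end of the file'; read(my_consumer, 1000, 5000) delivers
        ciphertext[1000:6000], clipped at EOF.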
""" # for concurrent operations: each gets its own Segmentation manager if size is None: size = self._verifycap.size # ignore overruns: clip size so offset+size does not go past EOF, and # so size is not negative (which indicates that offset >= EOF) size = max(0, min(size, self._verifycap.size-offset)) read_ev = self._download_status.add_read_event(offset, size, now()) if IDownloadStatusHandlingConsumer.providedBy(consumer): consumer.set_download_status_read_event(read_ev) consumer.set_download_status(self._download_status) lp = log.msg(format="imm Node(%(si)s).read(%(offset)d, %(size)d)", si=base32.b2a(self._verifycap.storage_index)[:8], offset=offset, size=size, level=log.OPERATIONAL, parent=self._lp, umid="l3j3Ww") if self._history: sp = self._history.stats_provider sp.count("downloader.files_downloaded", 1) # really read() calls sp.count("downloader.bytes_downloaded", size) if size == 0: read_ev.finished(now()) # no data, so no producer, so no register/unregisterProducer return defer.succeed(consumer) # for concurrent operations, each read() gets its own Segmentation # manager s = Segmentation(self, offset, size, consumer, read_ev, lp) # this raises an interesting question: what segments to fetch? if # offset=0, always fetch the first segment, and then allow # Segmentation to be responsible for pulling the subsequent ones if # the first wasn't large enough. If offset>0, we're going to need an # extra roundtrip to get the UEB (and therefore the segment size) # before we can figure out which segment to get. TODO: allow the # offset-table-guessing code (which starts by guessing the segsize) # to assist the offset>0 process. d = s.start() def _done(res): read_ev.finished(now()) return res d.addBoth(_done) return d def get_segment(self, segnum, logparent=None): """Begin downloading a segment. I return a tuple (d, c): 'd' is a Deferred that fires with (offset,data) when the desired segment is available, and c is an object on which c.cancel() can be called to disavow interest in the segment (after which 'd' will never fire). You probably need to know the segment size before calling this, unless you want the first few bytes of the file. If you ask for a segment number which turns out to be too large, the Deferred will errback with BadSegmentNumberError. The Deferred fires with the offset of the first byte of the data segment, so that you can call get_segment() before knowing the segment size, and still know which data you received. The Deferred can also errback with other fatal problems, such as NotEnoughSharesError, NoSharesError, or BadCiphertextHashError. """ lp = log.msg(format="imm Node(%(si)s).get_segment(%(segnum)d)", si=base32.b2a(self._verifycap.storage_index)[:8], segnum=segnum, level=log.OPERATIONAL, parent=logparent, umid="UKFjDQ") seg_ev = self._download_status.add_segment_request(segnum, now()) d = defer.Deferred() c = Cancel(self._cancel_request) self._segment_requests.append( (segnum, d, c, seg_ev, lp) ) self._start_new_segment() return (d, c) def get_segsize(self): """Return a Deferred that fires when we know the real segment size.""" if self.segment_size: return defer.succeed(self.segment_size) # TODO: this downloads (and discards) the first segment of the file. # We could make this more efficient by writing # fetcher.SegmentSizeFetcher, with the job of finding a single valid # share and extracting the UEB. We'd add Share.get_UEB() to request # just the UEB. 
(d,c) = self.get_segment(0) # this ensures that an error during get_segment() will errback the # caller, so Repair won't wait forever on completely missing files d.addCallback(lambda ign: self._segsize_observers.when_fired()) return d # things called by the Segmentation object used to transform # arbitrary-sized read() calls into quantized segment fetches def _start_new_segment(self): if self._active_segment is None and self._segment_requests: (segnum, d, c, seg_ev, lp) = self._segment_requests[0] k = self._verifycap.needed_shares log.msg(format="%(node)s._start_new_segment: segnum=%(segnum)d", node=repr(self), segnum=segnum, level=log.NOISY, parent=lp, umid="wAlnHQ") self._active_segment = fetcher = SegmentFetcher(self, segnum, k, lp) seg_ev.activate(now()) active_shares = [s for s in self._shares if s.is_alive()] fetcher.add_shares(active_shares) # this triggers the loop # called by our child ShareFinder def got_shares(self, shares): self._shares.update(shares) if self._active_segment: self._active_segment.add_shares(shares) def no_more_shares(self): self._no_more_shares = True if self._active_segment: self._active_segment.no_more_shares() # things called by our Share instances def validate_and_store_UEB(self, UEB_s): log.msg("validate_and_store_UEB", level=log.OPERATIONAL, parent=self._lp, umid="7sTrPw") h = hashutil.uri_extension_hash(UEB_s) if h != self._verifycap.uri_extension_hash: raise BadHashError self._parse_and_store_UEB(UEB_s) # sets self._stuff # TODO: a malformed (but authentic) UEB could throw an assertion in # _parse_and_store_UEB, and we should abandon the download. self.have_UEB = True # inform the ShareFinder about our correct number of segments. This # will update the block-hash-trees in all existing CommonShare # instances, and will populate new ones with the correct value. self._sharefinder.update_num_segments() def _parse_and_store_UEB(self, UEB_s): # Note: the UEB contains needed_shares and total_shares. These are # redundant and inferior (the filecap contains the authoritative # values). However, because it is possible to encode the same file in # multiple ways, and the encoders might choose (poorly) to use the # same key for both (therefore getting the same SI), we might # encounter shares for both types. The UEB hashes will be different, # however, and we'll disregard the "other" encoding's shares as # corrupted. # therefore, we ignore d['total_shares'] and d['needed_shares']. d = uri.unpack_extension(UEB_s) log.msg(format="UEB=%(ueb)s, vcap=%(vcap)s", ueb=repr(uri.unpack_extension_readable(UEB_s)), vcap=self._verifycap.to_string(), level=log.NOISY, parent=self._lp, umid="cVqZnA") k, N = self._verifycap.needed_shares, self._verifycap.total_shares self.segment_size = d['segment_size'] self._segsize_observers.fire(self.segment_size) r = self._calculate_sizes(self.segment_size) self.tail_segment_size = r["tail_segment_size"] self.tail_segment_padded = r["tail_segment_padded"] self.num_segments = r["num_segments"] self.block_size = r["block_size"] self.tail_block_size = r["tail_block_size"] log.msg("actual sizes: %s" % (r,), level=log.NOISY, parent=self._lp, umid="PY6P5Q") if (self.segment_size == self.guessed_segment_size and self.num_segments == self.guessed_num_segments): log.msg("my guess was right!", level=log.NOISY, parent=self._lp, umid="x340Ow") else: log.msg("my guess was wrong! 
Extra round trips for me.", level=log.NOISY, parent=self._lp, umid="tb7RJw") # zfec.Decode() instantiation is fast, but still, let's use the same # codec instance for all but the last segment. 3-of-10 takes 15us on # my laptop, 25-of-100 is 900us, 3-of-255 is 97us, 25-of-255 is # 2.5ms, worst-case 254-of-255 is 9.3ms self._codec = CRSDecoder() self._codec.set_params(self.segment_size, k, N) # Ciphertext hash tree root is mandatory, so that there is at most # one ciphertext that matches this read-cap or verify-cap. The # integrity check on the shares is not sufficient to prevent the # original encoder from creating some shares of file A and other # shares of file B. self.ciphertext_hash_tree was a guess before: # this is where we create it for real. self.ciphertext_hash_tree = IncompleteHashTree(self.num_segments) self.ciphertext_hash_tree_leaves = self.num_segments self.ciphertext_hash_tree.set_hashes({0: d['crypttext_root_hash']}) self.share_hash_tree.set_hashes({0: d['share_root_hash']}) # Our job is a fast download, not verification, so we ignore any # redundant fields. The Verifier uses a different code path which # does not ignore them. def _calculate_sizes(self, segment_size): # segments of ciphertext size = self._verifycap.size k = self._verifycap.needed_shares # this assert matches the one in encode.py:127 inside # Encoded._got_all_encoding_parameters, where the UEB is constructed assert segment_size % k == 0 # the last segment is usually short. We don't store a whole segsize, # but we do pad the segment up to a multiple of k, because the # encoder requires that. tail_segment_size = size % segment_size if tail_segment_size == 0: tail_segment_size = segment_size padded = mathutil.next_multiple(tail_segment_size, k) tail_segment_padded = padded num_segments = mathutil.div_ceil(size, segment_size) # each segment is turned into N blocks. All but the last are of size # block_size, and the last is of size tail_block_size block_size = segment_size / k tail_block_size = tail_segment_padded / k return { "tail_segment_size": tail_segment_size, "tail_segment_padded": tail_segment_padded, "num_segments": num_segments, "block_size": block_size, "tail_block_size": tail_block_size, } def process_share_hashes(self, share_hashes): for hashnum in share_hashes: if hashnum >= len(self.share_hash_tree): # "BadHashError" is normally for e.g. a corrupt block. We # sort of abuse it here to mean a badly numbered hash (which # indicates corruption in the number bytes, rather than in # the data bytes). 
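            # Worked example for _calculate_sizes() above (illustrative
            # numbers, not taken from the original source): with size=1000,
            # segment_size=300, k=3:
            #   num_segments        = div_ceil(1000, 300)   = 4
            #   tail_segment_size   = 1000 % 300            = 100
            #   tail_segment_padded = next_multiple(100, 3) = 102
            #   block_size          = 300 / 3               = 100
            #   tail_block_size     = 102 / 3               = 34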
raise BadHashError("hashnum %d doesn't fit in hashtree(%d)" % (hashnum, len(self.share_hash_tree))) self.share_hash_tree.set_hashes(share_hashes) def get_desired_ciphertext_hashes(self, segnum): if segnum < self.ciphertext_hash_tree_leaves: return self.ciphertext_hash_tree.needed_hashes(segnum, include_leaf=True) return [] def get_needed_ciphertext_hashes(self, segnum): cht = self.ciphertext_hash_tree return cht.needed_hashes(segnum, include_leaf=True) def process_ciphertext_hashes(self, hashes): assert self.num_segments is not None # this may raise BadHashError or NotEnoughHashesError self.ciphertext_hash_tree.set_hashes(hashes) # called by our child SegmentFetcher def want_more_shares(self): self._sharefinder.hungry() def fetch_failed(self, sf, f): assert sf is self._active_segment # deliver error upwards for (d,c,seg_ev) in self._extract_requests(sf.segnum): seg_ev.error(now()) eventually(self._deliver, d, c, f) self._active_segment = None self._start_new_segment() def process_blocks(self, segnum, blocks): start = now() d = defer.maybeDeferred(self._decode_blocks, segnum, blocks) d.addCallback(self._check_ciphertext_hash, segnum) def _deliver(result): log.msg(format="delivering segment(%(segnum)d)", segnum=segnum, level=log.OPERATIONAL, parent=self._lp, umid="j60Ojg") when = now() if isinstance(result, Failure): # this catches failures in decode or ciphertext hash for (d,c,seg_ev) in self._extract_requests(segnum): seg_ev.error(when) eventually(self._deliver, d, c, result) else: (offset, segment, decodetime) = result for (d,c,seg_ev) in self._extract_requests(segnum): # when we have two requests for the same segment, the # second one will not be "activated" before the data is # delivered, so to allow the status-reporting code to see # consistent behavior, we activate them all now. The # SegmentEvent will ignore duplicate activate() calls. # Note that this will result in an inaccurate "receive # speed" for the second request. 
seg_ev.activate(when) seg_ev.deliver(when, offset, len(segment), decodetime) eventually(self._deliver, d, c, result) self._download_status.add_misc_event("process_block", start, now()) self._active_segment = None self._start_new_segment() d.addBoth(_deliver) d.addErrback(log.err, "unhandled error during process_blocks", level=log.WEIRD, parent=self._lp, umid="MkEsCg") def _decode_blocks(self, segnum, blocks): start = now() tail = (segnum == self.num_segments-1) codec = self._codec block_size = self.block_size decoded_size = self.segment_size if tail: # account for the padding in the last segment codec = CRSDecoder() k, N = self._verifycap.needed_shares, self._verifycap.total_shares codec.set_params(self.tail_segment_padded, k, N) block_size = self.tail_block_size decoded_size = self.tail_segment_padded shares = [] shareids = [] for (shareid, share) in blocks.iteritems(): assert len(share) == block_size shareids.append(shareid) shares.append(share) del blocks d = codec.decode(shares, shareids) # segment del shares def _process(buffers): decodetime = now() - start segment = "".join(buffers) assert len(segment) == decoded_size del buffers if tail: segment = segment[:self.tail_segment_size] self._download_status.add_misc_event("decode", start, now()) return (segment, decodetime) d.addCallback(_process) return d def _check_ciphertext_hash(self, (segment, decodetime), segnum): start = now() assert self._active_segment.segnum == segnum assert self.segment_size is not None offset = segnum * self.segment_size h = hashutil.crypttext_segment_hash(segment) try: self.ciphertext_hash_tree.set_hashes(leaves={segnum: h}) self._download_status.add_misc_event("CThash", start, now()) return (offset, segment, decodetime) except (BadHashError, NotEnoughHashesError): format = ("hash failure in ciphertext_hash_tree:" " segnum=%(segnum)d, SI=%(si)s") log.msg(format=format, segnum=segnum, si=self._si_prefix, failure=Failure(), level=log.WEIRD, parent=self._lp, umid="MTwNnw") # this is especially weird, because we made it past the share # hash tree. It implies that we're using the wrong encoding, or # that the uploader deliberately constructed a bad UEB. msg = format % {"segnum": segnum, "si": self._si_prefix} raise BadCiphertextHashError(msg) def _deliver(self, d, c, result): # this method exists to handle cancel() that occurs between # _got_segment and _deliver if c.active: c.active = False # it is now too late to cancel d.callback(result) # might actually be an errback def _extract_requests(self, segnum): """Remove matching requests and return their (d,c) tuples so that the caller can retire them.""" retire = [(d,c,seg_ev) for (segnum0,d,c,seg_ev,lp) in self._segment_requests if segnum0 == segnum] self._segment_requests = [t for t in self._segment_requests if t[0] != segnum] return retire def _cancel_request(self, cancel): self._segment_requests = [t for t in self._segment_requests if t[2] != cancel] segnums = [segnum for (segnum,d,c,seg_ev,lp) in self._segment_requests] # self._active_segment might be None in rare circumstances, so make # sure we tolerate it if self._active_segment and self._active_segment.segnum not in segnums: self._active_segment.stop() self._active_segment = None self._start_new_segment() # called by ShareFinder to choose hashtree sizes in CommonShares, and by # SegmentFetcher to tell if it is still fetching a valid segnum. 
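    # Illustrative caller pattern (variable names hypothetical; 'node' stands
    # for a DownloadNode):
    #   numsegs, authoritative = node.get_num_segments()
    #   if authoritative:
    #       ...numsegs is exact (the UEB has been parsed)...
    #   else:
    #       ...numsegs is only a guess...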
def get_num_segments(self): # returns (best_num_segments, authoritative) if self.num_segments is None: return (self.guessed_num_segments, False) return (self.num_segments, True) allmydata-tahoe-1.10.2/src/allmydata/immutable/downloader/segmentation.py0000644000175000017500000001536312556560070024722 0ustar ramram import time now = time.time from zope.interface import implements from twisted.internet import defer from twisted.internet.interfaces import IPushProducer from foolscap.api import eventually from allmydata.util import log from allmydata.util.spans import overlap from allmydata.interfaces import DownloadStopped from common import BadSegmentNumberError, WrongSegmentError class Segmentation: """I am responsible for a single offset+size read of the file. I handle segmentation: I figure out which segments are necessary, request them (from my CiphertextDownloader) in order, and trim the segments down to match the offset+size span. I use the Producer/Consumer interface to only request one segment at a time. """ implements(IPushProducer) def __init__(self, node, offset, size, consumer, read_ev, logparent=None): self._node = node self._hungry = True self._active_segnum = None self._cancel_segment_request = None # these are updated as we deliver data. At any given time, we still # want to download file[offset:offset+size] self._offset = offset self._size = size assert offset+size <= node._verifycap.size self._consumer = consumer self._read_ev = read_ev self._start_pause = None self._lp = logparent def start(self): self._alive = True self._deferred = defer.Deferred() self._deferred.addBoth(self._done) self._consumer.registerProducer(self, True) self._maybe_fetch_next() return self._deferred def _done(self, res): self._consumer.unregisterProducer() return res def _maybe_fetch_next(self): if not self._alive or not self._hungry: return if self._active_segnum is not None: return self._fetch_next() def _fetch_next(self): if self._size == 0: # done! self._alive = False self._hungry = False self._deferred.callback(self._consumer) return n = self._node have_actual_segment_size = n.segment_size is not None guess_s = "" if not have_actual_segment_size: guess_s = "probably " segment_size = n.segment_size or n.guessed_segment_size if self._offset == 0: # great! 
we want segment0 for sure wanted_segnum = 0 else: # this might be a guess wanted_segnum = self._offset // segment_size log.msg(format="_fetch_next(offset=%(offset)d) %(guess)swants segnum=%(segnum)d", offset=self._offset, guess=guess_s, segnum=wanted_segnum, level=log.NOISY, parent=self._lp, umid="5WfN0w") self._active_segnum = wanted_segnum d,c = n.get_segment(wanted_segnum, self._lp) self._cancel_segment_request = c d.addBoth(self._request_retired) d.addCallback(self._got_segment, wanted_segnum) if not have_actual_segment_size: # we can retry once d.addErrback(self._retry_bad_segment) d.addErrback(self._error) def _request_retired(self, res): self._active_segnum = None self._cancel_segment_request = None return res def _got_segment(self, (segment_start,segment,decodetime), wanted_segnum): self._cancel_segment_request = None # we got file[segment_start:segment_start+len(segment)] # we want file[self._offset:self._offset+self._size] log.msg(format="Segmentation got data:" " want [%(wantstart)d-%(wantend)d)," " given [%(segstart)d-%(segend)d), for segnum=%(segnum)d", wantstart=self._offset, wantend=self._offset+self._size, segstart=segment_start, segend=segment_start+len(segment), segnum=wanted_segnum, level=log.OPERATIONAL, parent=self._lp, umid="32dHcg") o = overlap(segment_start, len(segment), self._offset, self._size) # the overlap is file[o[0]:o[0]+o[1]] if not o or o[0] != self._offset: # we didn't get the first byte, so we can't use this segment log.msg("Segmentation handed wrong data:" " want [%d-%d), given [%d-%d), for segnum=%d," " for si=%s" % (self._offset, self._offset+self._size, segment_start, segment_start+len(segment), wanted_segnum, self._node._si_prefix), level=log.UNUSUAL, parent=self._lp, umid="STlIiA") # we may retry if the segnum we asked was based on a guess raise WrongSegmentError("I was given the wrong data.") offset_in_segment = self._offset - segment_start desired_data = segment[offset_in_segment:offset_in_segment+o[1]] self._offset += len(desired_data) self._size -= len(desired_data) self._consumer.write(desired_data) # the consumer might call our .pauseProducing() inside that write() # call, setting self._hungry=False self._read_ev.update(len(desired_data), 0, 0) # note: filenode.DecryptingConsumer is responsible for calling # _read_ev.update with how much decrypt_time was consumed self._maybe_fetch_next() def _retry_bad_segment(self, f): f.trap(WrongSegmentError, BadSegmentNumberError) # we guessed the segnum wrong: either one that doesn't overlap with # the start of our desired region, or one that's beyond the end of # the world. Now that we have the right information, we're allowed to # retry once. 
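        # by this point the node knows the real segment size (that is what
        # the assert below checks), so the recomputed wanted_segnum in
        # _fetch_next() will be exact rather than a guess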
assert self._node.segment_size is not None return self._maybe_fetch_next() def _error(self, f): log.msg("Error in Segmentation", failure=f, level=log.WEIRD, parent=self._lp, umid="EYlXBg") self._alive = False self._hungry = False self._deferred.errback(f) def stopProducing(self): log.msg("asked to stopProducing", level=log.NOISY, parent=self._lp, umid="XIyL9w") self._hungry = False self._alive = False # cancel any outstanding segment request if self._cancel_segment_request: self._cancel_segment_request.cancel() self._cancel_segment_request = None e = DownloadStopped("our Consumer called stopProducing()") self._deferred.errback(e) def pauseProducing(self): self._hungry = False self._start_pause = now() def resumeProducing(self): self._hungry = True eventually(self._maybe_fetch_next) if self._start_pause is not None: paused = now() - self._start_pause self._read_ev.update(0, 0, paused) self._start_pause = None allmydata-tahoe-1.10.2/src/allmydata/immutable/downloader/common.py0000644000175000017500000000045112556560070023505 0ustar ramram (AVAILABLE, PENDING, OVERDUE, COMPLETE, CORRUPT, DEAD, BADSEGNUM) = \ ("AVAILABLE", "PENDING", "OVERDUE", "COMPLETE", "CORRUPT", "DEAD", "BADSEGNUM") class BadSegmentNumberError(Exception): pass class WrongSegmentError(Exception): pass class BadCiphertextHashError(Exception): pass allmydata-tahoe-1.10.2/src/allmydata/immutable/downloader/share.py0000644000175000017500000012535112556560070023326 0ustar ramram import struct import time now = time.time from twisted.python.failure import Failure from foolscap.api import eventually from allmydata.util import base32, log, hashutil, mathutil from allmydata.util.spans import Spans, DataSpans from allmydata.interfaces import HASH_SIZE from allmydata.hashtree import IncompleteHashTree, BadHashError, \ NotEnoughHashesError from allmydata.immutable.layout import make_write_bucket_proxy from allmydata.util.observer import EventStreamObserver from common import COMPLETE, CORRUPT, DEAD, BADSEGNUM class LayoutInvalid(Exception): pass class DataUnavailable(Exception): pass class Share: """I represent a single instance of a single share (e.g. I reference the shnum2 for share SI=abcde on server xy12t, not the one on server ab45q). I am associated with a CommonShare that remembers data that is held in common among e.g. SI=abcde/shnum2 across all servers. I am also associated with a CiphertextFileNode for e.g. SI=abcde (all shares, all servers). """ # this is a specific implementation of IShare for tahoe's native storage # servers. A different backend would use a different class. def __init__(self, rref, server, verifycap, commonshare, node, download_status, shnum, dyhb_rtt, logparent): self._rref = rref self._server = server self._node = node # holds share_hash_tree and UEB self.actual_segment_size = node.segment_size # might still be None # XXX change node.guessed_segment_size to # node.best_guess_segment_size(), which should give us the real ones # if known, else its guess. 
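        # (once the UEB has been fetched and validated, _get_satisfaction()
        # replaces this guess with the node's authoritative segment size)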
self._guess_offsets(verifycap, node.guessed_segment_size) self.actual_offsets = None self._UEB_length = None self._commonshare = commonshare # holds block_hash_tree self._download_status = download_status self._storage_index = verifycap.storage_index self._si_prefix = base32.b2a(verifycap.storage_index)[:8] self._shnum = shnum self._dyhb_rtt = dyhb_rtt # self._alive becomes False upon fatal corruption or server error self._alive = True self._loop_scheduled = False self._lp = log.msg(format="%(share)s created", share=repr(self), level=log.NOISY, parent=logparent, umid="P7hv2w") self._pending = Spans() # request sent but no response received yet self._received = DataSpans() # ACK response received, with data self._unavailable = Spans() # NAK response received, no data # any given byte of the share can be in one of four states: # in: _wanted, _requested, _received # FALSE FALSE FALSE : don't care about it at all # TRUE FALSE FALSE : want it, haven't yet asked for it # TRUE TRUE FALSE : request is in-flight # or didn't get it # FALSE TRUE TRUE : got it, haven't used it yet # FALSE TRUE FALSE : got it and used it # FALSE FALSE FALSE : block consumed, ready to ask again # # when we request data and get a NAK, we leave it in _requested # to remind ourself to not ask for it again. We don't explicitly # remove it from anything (maybe this should change). # # We retain the hashtrees in the Node, so we leave those spans in # _requested (and never ask for them again, as long as the Node is # alive). But we don't retain data blocks (too big), so when we # consume a data block, we remove it from _requested, so a later # download can re-fetch it. self._requested_blocks = [] # (segnum, set(observer2..)) v = server.get_version() ver = v["http://allmydata.org/tahoe/protocols/storage/v1"] self._overrun_ok = ver["tolerates-immutable-read-overrun"] # If _overrun_ok and we guess the offsets correctly, we can get # everything in one RTT. If _overrun_ok and we guess wrong, we might # need two RTT (but we could get lucky and do it in one). If overrun # is *not* ok (tahoe-1.3.0 or earlier), we need four RTT: 1=version, # 2=offset table, 3=UEB_length and everything else (hashes, block), # 4=UEB. self.had_corruption = False # for unit tests def __repr__(self): return "Share(sh%d-on-%s)" % (self._shnum, self._server.get_name()) def is_alive(self): # XXX: reconsider. If the share sees a single error, should it remain # dead for all time? Or should the next segment try again? This DEAD # state is stored elsewhere too (SegmentFetcher per-share states?) # and needs to be consistent. We clear _alive in self._fail(), which # is called upon a network error, or layout failure, or hash failure # in the UEB or a hash tree. We do not _fail() for a hash failure in # a block, but of course we still tell our callers about # state=CORRUPT so they'll find a different share. return self._alive def _guess_offsets(self, verifycap, guessed_segment_size): self.guessed_segment_size = guessed_segment_size size = verifycap.size k = verifycap.needed_shares N = verifycap.total_shares r = self._node._calculate_sizes(guessed_segment_size) # num_segments, block_size/tail_block_size # guessed_segment_size/tail_segment_size/tail_segment_padded share_size = mathutil.div_ceil(size, k) # share_size is the amount of block data that will be put into each # share, summed over all segments. It does not include hashes, the # UEB, or other overhead. 
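        # (illustrative: with size=1000 and k=3, share_size =
        #  div_ceil(1000, 3) = 334 bytes of block data per share, before
        #  hashes and other overhead)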
# use the upload-side code to get this as accurate as possible ht = IncompleteHashTree(N) num_share_hashes = len(ht.needed_hashes(0, include_leaf=True)) wbp = make_write_bucket_proxy(None, None, share_size, r["block_size"], r["num_segments"], num_share_hashes, 0) self._fieldsize = wbp.fieldsize self._fieldstruct = wbp.fieldstruct self.guessed_offsets = wbp._offsets # called by our client, the SegmentFetcher def get_block(self, segnum): """Add a block number to the list of requests. This will eventually result in a fetch of the data necessary to validate the block, then the block itself. The fetch order is generally first-come-first-served, but requests may be answered out-of-order if data becomes available sooner. I return an EventStreamObserver, which has two uses. The first is to call o.subscribe(), which gives me a place to send state changes and eventually the data block. The second is o.cancel(), which removes the request (if it is still active). I will distribute the following events through my EventStreamObserver: - state=OVERDUE: ?? I believe I should have had an answer by now. You may want to ask another share instead. - state=BADSEGNUM: the segnum you asked for is too large. I must fetch a valid UEB before I can determine this, so the notification is asynchronous - state=COMPLETE, block=data: here is a valid block - state=CORRUPT: this share contains corrupted data - state=DEAD, f=Failure: the server reported an error, this share is unusable """ log.msg("%s.get_block(%d)" % (repr(self), segnum), level=log.NOISY, parent=self._lp, umid="RTo9MQ") assert segnum >= 0 o = EventStreamObserver() o.set_canceler(self, "_cancel_block_request") for i,(segnum0,observers) in enumerate(self._requested_blocks): if segnum0 == segnum: observers.add(o) break else: self._requested_blocks.append( (segnum, set([o])) ) self.schedule_loop() return o def _cancel_block_request(self, o): new_requests = [] for e in self._requested_blocks: (segnum0, observers) = e observers.discard(o) if observers: new_requests.append(e) self._requested_blocks = new_requests # internal methods def _active_segnum_and_observers(self): if self._requested_blocks: # we only retrieve information for one segment at a time, to # minimize alacrity (first come, first served) return self._requested_blocks[0] return None, [] def schedule_loop(self): if self._loop_scheduled: return self._loop_scheduled = True eventually(self.loop) def loop(self): self._loop_scheduled = False if not self._alive: return try: # if any exceptions occur here, kill the download log.msg("%s.loop, reqs=[%s], pending=%s, received=%s," " unavailable=%s" % (repr(self), ",".join([str(req[0]) for req in self._requested_blocks]), self._pending.dump(), self._received.dump(), self._unavailable.dump() ), level=log.NOISY, parent=self._lp, umid="BaL1zw") self._do_loop() # all exception cases call self._fail(), which clears self._alive except (BadHashError, NotEnoughHashesError, LayoutInvalid), e: # Abandon this share. We do this if we see corruption in the # offset table, the UEB, or a hash tree. We don't abandon the # whole share if we see corruption in a data block (we abandon # just the one block, and still try to get data from other blocks # on the same server). In theory, we could get good data from a # share with a corrupt UEB (by first getting the UEB from some # other share), or corrupt hash trees, but the logic to decide # when this is safe is non-trivial. So for now, give up at the # first sign of corruption. 
# # _satisfy_*() code which detects corruption should first call # self._signal_corruption(), and then raise the exception. log.msg(format="corruption detected in %(share)s", share=repr(self), level=log.UNUSUAL, parent=self._lp, umid="gWspVw") self._fail(Failure(e), log.UNUSUAL) except DataUnavailable, e: # Abandon this share. log.msg(format="need data that will never be available" " from %s: pending=%s, received=%s, unavailable=%s" % (repr(self), self._pending.dump(), self._received.dump(), self._unavailable.dump() ), level=log.UNUSUAL, parent=self._lp, umid="F7yJnQ") self._fail(Failure(e), log.UNUSUAL) except BaseException: self._fail(Failure()) raise log.msg("%s.loop done, reqs=[%s], pending=%s, received=%s," " unavailable=%s" % (repr(self), ",".join([str(req[0]) for req in self._requested_blocks]), self._pending.dump(), self._received.dump(), self._unavailable.dump() ), level=log.NOISY, parent=self._lp, umid="9lRaRA") def _do_loop(self): # we are (eventually) called after all state transitions: # new segments added to self._requested_blocks # new data received from servers (responses to our read() calls) # impatience timer fires (server appears slow) # First, consume all of the information that we currently have, for # all the segments people currently want. start = now() while self._get_satisfaction(): pass self._download_status.add_misc_event("satisfy", start, now()) # When we get no satisfaction (from the data we've received so far), # we determine what data we desire (to satisfy more requests). The # number of segments is finite, so I can't get no satisfaction # forever. start = now() wanted, needed = self._desire() self._download_status.add_misc_event("desire", start, now()) # Finally, send out requests for whatever we need (desire minus # have). You can't always get what you want, but if you try # sometimes, you just might find, you get what you need. self._send_requests(wanted + needed) # and sometimes you can't even get what you need start = now() disappointment = needed & self._unavailable if disappointment.len(): self.had_corruption = True raise DataUnavailable("need %s but will never get it" % disappointment.dump()) self._download_status.add_misc_event("checkdis", start, now()) def _get_satisfaction(self): # return True if we retired a data block, and should therefore be # called again. Return False if we don't retire a data block (even if # we do retire some other data, like hash chains). if self.actual_offsets is None: if not self._satisfy_offsets(): # can't even look at anything without the offset table return False if not self._node.have_UEB: if not self._satisfy_UEB(): # can't check any hashes without the UEB return False # the call to _satisfy_UEB() will immediately set the # authoritative num_segments in all our CommonShares. If we # guessed wrong, we might stil be working on a bogus segnum # (beyond the real range). We catch this and signal BADSEGNUM # before invoking any further code that touches hashtrees. self.actual_segment_size = self._node.segment_size # might be updated assert self.actual_segment_size is not None # knowing the UEB means knowing num_segments assert self._node.num_segments is not None segnum, observers = self._active_segnum_and_observers() # if segnum is None, we don't really need to do anything (we have no # outstanding readers right now), but we'll fill in the bits that # aren't tied to any particular segment. 
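        # (overall order in this method: offset table -> UEB -> share hash
        #  chain -> block hash tree -> ciphertext hash tree -> and only then
        #  the data block itself)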
if segnum is not None and segnum >= self._node.num_segments: for o in observers: o.notify(state=BADSEGNUM) self._requested_blocks.pop(0) return True if self._node.share_hash_tree.needed_hashes(self._shnum): if not self._satisfy_share_hash_tree(): # can't check block_hash_tree without a root return False if self._commonshare.need_block_hash_root(): block_hash_root = self._node.share_hash_tree.get_leaf(self._shnum) self._commonshare.set_block_hash_root(block_hash_root) if segnum is None: return False # we don't want any particular segment right now # block_hash_tree needed_hashes = self._commonshare.get_needed_block_hashes(segnum) if needed_hashes: if not self._satisfy_block_hash_tree(needed_hashes): # can't check block without block_hash_tree return False # ciphertext_hash_tree needed_hashes = self._node.get_needed_ciphertext_hashes(segnum) if needed_hashes: if not self._satisfy_ciphertext_hash_tree(needed_hashes): # can't check decoded blocks without ciphertext_hash_tree return False # data blocks return self._satisfy_data_block(segnum, observers) def _satisfy_offsets(self): version_s = self._received.get(0, 4) if version_s is None: return False (version,) = struct.unpack(">L", version_s) if version == 1: table_start = 0x0c self._fieldsize = 0x4 self._fieldstruct = "L" elif version == 2: table_start = 0x14 self._fieldsize = 0x8 self._fieldstruct = "Q" else: self.had_corruption = True raise LayoutInvalid("unknown version %d (I understand 1 and 2)" % version) offset_table_size = 6 * self._fieldsize table_s = self._received.pop(table_start, offset_table_size) if table_s is None: return False fields = struct.unpack(">"+6*self._fieldstruct, table_s) offsets = {} for i,field in enumerate(['data', 'plaintext_hash_tree', # UNUSED 'crypttext_hash_tree', 'block_hashes', 'share_hashes', 'uri_extension', ] ): offsets[field] = fields[i] self.actual_offsets = offsets log.msg("actual offsets: data=%d, plaintext_hash_tree=%d, crypttext_hash_tree=%d, block_hashes=%d, share_hashes=%d, uri_extension=%d" % tuple(fields), level=log.NOISY, parent=self._lp, umid="jedQcw") self._received.remove(0, 4) # don't need this anymore # validate the offsets a bit share_hashes_size = offsets["uri_extension"] - offsets["share_hashes"] if share_hashes_size < 0 or share_hashes_size % (2+HASH_SIZE) != 0: # the share hash chain is stored as (hashnum,hash) pairs self.had_corruption = True raise LayoutInvalid("share hashes malformed -- should be a" " multiple of %d bytes -- not %d" % (2+HASH_SIZE, share_hashes_size)) block_hashes_size = offsets["share_hashes"] - offsets["block_hashes"] if block_hashes_size < 0 or block_hashes_size % (HASH_SIZE) != 0: # the block hash tree is stored as a list of hashes self.had_corruption = True raise LayoutInvalid("block hashes malformed -- should be a" " multiple of %d bytes -- not %d" % (HASH_SIZE, block_hashes_size)) # we only look at 'crypttext_hash_tree' if the UEB says we're # actually using it. Same with 'plaintext_hash_tree'. This gives us # some wiggle room: a place to stash data for later extensions. 
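# A standalone sketch of the offset-table parse performed by
# _satisfy_offsets() above, operating on a plain byte string instead of the
# incrementally filled DataSpans buffer. The field names, table positions
# (0x0c for v1, 0x14 for v2) and field widths are taken from the code above;
# the function name and the use of ValueError are illustrative only.

import struct

def parse_offset_table(header):
    (version,) = struct.unpack(">L", header[:4])
    if version == 1:
        table_start, fieldstruct = 0x0c, "L"   # six 4-byte offsets
    elif version == 2:
        table_start, fieldstruct = 0x14, "Q"   # six 8-byte offsets
    else:
        raise ValueError("unknown share version %d" % version)
    fieldsize = struct.calcsize(">" + fieldstruct)
    table = header[table_start:table_start + 6 * fieldsize]
    fields = struct.unpack(">" + 6 * fieldstruct, table)
    names = ["data", "plaintext_hash_tree", "crypttext_hash_tree",
             "block_hashes", "share_hashes", "uri_extension"]
    return dict(zip(names, fields))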
return True def _satisfy_UEB(self): o = self.actual_offsets fsize = self._fieldsize UEB_length_s = self._received.get(o["uri_extension"], fsize) if not UEB_length_s: return False (UEB_length,) = struct.unpack(">"+self._fieldstruct, UEB_length_s) UEB_s = self._received.pop(o["uri_extension"]+fsize, UEB_length) if not UEB_s: return False self._received.remove(o["uri_extension"], fsize) try: self._node.validate_and_store_UEB(UEB_s) return True except (LayoutInvalid, BadHashError), e: # TODO: if this UEB was bad, we'll keep trying to validate it # over and over again. Only log.err on the first one, or better # yet skip all but the first f = Failure(e) self._signal_corruption(f, o["uri_extension"], fsize+UEB_length) self.had_corruption = True raise def _satisfy_share_hash_tree(self): # the share hash chain is stored as (hashnum,hash) tuples, so you # can't fetch just the pieces you need, because you don't know # exactly where they are. So fetch everything, and parse the results # later. o = self.actual_offsets hashlen = o["uri_extension"] - o["share_hashes"] assert hashlen % (2+HASH_SIZE) == 0 hashdata = self._received.get(o["share_hashes"], hashlen) if not hashdata: return False share_hashes = {} for i in range(0, hashlen, 2+HASH_SIZE): (hashnum,) = struct.unpack(">H", hashdata[i:i+2]) hashvalue = hashdata[i+2:i+2+HASH_SIZE] share_hashes[hashnum] = hashvalue # TODO: if they give us an empty set of hashes, # process_share_hashes() won't fail. We must ensure that this # situation doesn't allow unverified shares through. Manual testing # shows that set_block_hash_root() throws an assert because an # internal node is None instead of an actual hash, but we want # something better. It's probably best to add a method to # IncompleteHashTree which takes a leaf number and raises an # exception unless that leaf is present and fully validated. 
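# The share hash chain is a flat run of (2-byte hash number, hash) records,
# which is why _satisfy_share_hash_tree() above fetches the whole region and
# splits it afterwards. A self-contained version of that split; the 32-byte
# hash length used here is an assumption standing in for HASH_SIZE.

import struct

SKETCH_HASH_SIZE = 32  # assumed value of HASH_SIZE

def parse_share_hash_chain(hashdata):
    record = 2 + SKETCH_HASH_SIZE
    if len(hashdata) % record != 0:
        raise ValueError("share hash chain is not a whole number of records")
    share_hashes = {}
    for i in range(0, len(hashdata), record):
        (hashnum,) = struct.unpack(">H", hashdata[i:i+2])
        share_hashes[hashnum] = hashdata[i+2:i+record]
    return share_hashes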
try: self._node.process_share_hashes(share_hashes) # adds to self._node.share_hash_tree except (BadHashError, NotEnoughHashesError), e: f = Failure(e) self._signal_corruption(f, o["share_hashes"], hashlen) self.had_corruption = True raise self._received.remove(o["share_hashes"], hashlen) return True def _signal_corruption(self, f, start, offset): # there was corruption somewhere in the given range reason = "corruption in share[%d-%d): %s" % (start, start+offset, str(f.value)) self._rref.callRemoteOnly("advise_corrupt_share", reason) def _satisfy_block_hash_tree(self, needed_hashes): o_bh = self.actual_offsets["block_hashes"] block_hashes = {} for hashnum in needed_hashes: hashdata = self._received.get(o_bh+hashnum*HASH_SIZE, HASH_SIZE) if hashdata: block_hashes[hashnum] = hashdata else: return False # missing some hashes # note that we don't submit any hashes to the block_hash_tree until # we've gotten them all, because the hash tree will throw an # exception if we only give it a partial set (which it therefore # cannot validate) try: self._commonshare.process_block_hashes(block_hashes) except (BadHashError, NotEnoughHashesError), e: f = Failure(e) hashnums = ",".join([str(n) for n in sorted(block_hashes.keys())]) log.msg(format="hash failure in block_hashes=(%(hashnums)s)," " from %(share)s", hashnums=hashnums, shnum=self._shnum, share=repr(self), failure=f, level=log.WEIRD, parent=self._lp, umid="yNyFdA") hsize = max(0, max(needed_hashes)) * HASH_SIZE self._signal_corruption(f, o_bh, hsize) self.had_corruption = True raise for hashnum in needed_hashes: self._received.remove(o_bh+hashnum*HASH_SIZE, HASH_SIZE) return True def _satisfy_ciphertext_hash_tree(self, needed_hashes): start = self.actual_offsets["crypttext_hash_tree"] hashes = {} for hashnum in needed_hashes: hashdata = self._received.get(start+hashnum*HASH_SIZE, HASH_SIZE) if hashdata: hashes[hashnum] = hashdata else: return False # missing some hashes # we don't submit any hashes to the ciphertext_hash_tree until we've # gotten them all try: self._node.process_ciphertext_hashes(hashes) except (BadHashError, NotEnoughHashesError), e: f = Failure(e) hashnums = ",".join([str(n) for n in sorted(hashes.keys())]) log.msg(format="hash failure in ciphertext_hashes=(%(hashnums)s)," " from %(share)s", hashnums=hashnums, share=repr(self), failure=f, level=log.WEIRD, parent=self._lp, umid="iZI0TA") hsize = max(0, max(needed_hashes))*HASH_SIZE self._signal_corruption(f, start, hsize) self.had_corruption = True raise for hashnum in needed_hashes: self._received.remove(start+hashnum*HASH_SIZE, HASH_SIZE) return True def _satisfy_data_block(self, segnum, observers): tail = (segnum == self._node.num_segments-1) datastart = self.actual_offsets["data"] blockstart = datastart + segnum * self._node.block_size blocklen = self._node.block_size if tail: blocklen = self._node.tail_block_size block = self._received.pop(blockstart, blocklen) if not block: log.msg("no data for block %s (want [%d:+%d])" % (repr(self), blockstart, blocklen), level=log.NOISY, parent=self._lp, umid="aK0RFw") return False log.msg(format="%(share)s._satisfy_data_block [%(start)d:+%(length)d]", share=repr(self), start=blockstart, length=blocklen, level=log.NOISY, parent=self._lp, umid="uTDNZg") # this block is being retired, either as COMPLETE or CORRUPT, since # no further data reads will help assert self._requested_blocks[0][0] == segnum try: self._commonshare.check_block(segnum, block) # hurrah, we have a valid block. Deliver it. 
for o in observers: # goes to SegmentFetcher._block_request_activity o.notify(state=COMPLETE, block=block) # now clear our received data, to dodge the #1170 spans.py # complexity bug self._received = DataSpans() except (BadHashError, NotEnoughHashesError), e: # rats, we have a corrupt block. Notify our clients that they # need to look elsewhere, and advise the server. Unlike # corruption in other parts of the share, this doesn't cause us # to abandon the whole share. f = Failure(e) log.msg(format="hash failure in block %(segnum)d, from %(share)s", segnum=segnum, share=repr(self), failure=f, level=log.WEIRD, parent=self._lp, umid="mZjkqA") for o in observers: o.notify(state=CORRUPT) self._signal_corruption(f, blockstart, blocklen) self.had_corruption = True # in either case, we've retired this block self._requested_blocks.pop(0) # popping the request keeps us from turning around and wanting the # block again right away return True # got satisfaction def _desire(self): segnum, observers = self._active_segnum_and_observers() # maybe None # 'want_it' is for data we merely want: we know that we don't really # need it. This includes speculative reads, like the first 1KB of the # share (for the offset table) and the first 2KB of the UEB. # # 'need_it' is for data that, if we have the real offset table, we'll # need. If we are only guessing at the offset table, it's merely # wanted. (The share is abandoned if we can't get data that we really # need). # # 'gotta_gotta_have_it' is for data that we absolutely need, # independent of whether we're still guessing about the offset table: # the version number and the offset table itself. # # Mr. Popeil, I'm in trouble, need your assistance on the double. Aww.. desire = Spans(), Spans(), Spans() (want_it, need_it, gotta_gotta_have_it) = desire self.actual_segment_size = self._node.segment_size # might be updated o = self.actual_offsets or self.guessed_offsets segsize = self.actual_segment_size or self.guessed_segment_size r = self._node._calculate_sizes(segsize) if not self.actual_offsets: # all _desire functions add bits to the three desire[] spans self._desire_offsets(desire) # we can use guessed offsets as long as this server tolerates # overrun. Otherwise, we must wait for the offsets to arrive before # we try to read anything else. if self.actual_offsets or self._overrun_ok: if not self._node.have_UEB: self._desire_UEB(desire, o) self._desire_share_hashes(desire, o) if segnum is not None: # They might be asking for a segment number that is beyond # what we guess the file contains, but _desire_block_hashes # and _desire_data will tolerate that. self._desire_block_hashes(desire, o, segnum) self._desire_data(desire, o, r, segnum, segsize) log.msg("end _desire: want_it=%s need_it=%s gotta=%s" % (want_it.dump(), need_it.dump(), gotta_gotta_have_it.dump()), level=log.NOISY, parent=self._lp, umid="IG7CgA") if self.actual_offsets: return (want_it, need_it+gotta_gotta_have_it) else: return (want_it+need_it, gotta_gotta_have_it) def _desire_offsets(self, desire): (want_it, need_it, gotta_gotta_have_it) = desire if self._overrun_ok: # easy! this includes version number, sizes, and offsets want_it.add(0, 1024) return # v1 has an offset table that lives [0x0,0x24). v2 lives [0x0,0x44). # To be conservative, only request the data that we know lives there, # even if that means more roundtrips. 
gotta_gotta_have_it.add(0, 4) # version number, always safe version_s = self._received.get(0, 4) if not version_s: return (version,) = struct.unpack(">L", version_s) # The code in _satisfy_offsets will have checked this version # already. There is no code path to get this far with version>2. assert 1 <= version <= 2, "can't get here, version=%d" % version if version == 1: table_start = 0x0c fieldsize = 0x4 elif version == 2: table_start = 0x14 fieldsize = 0x8 offset_table_size = 6 * fieldsize gotta_gotta_have_it.add(table_start, offset_table_size) def _desire_UEB(self, desire, o): (want_it, need_it, gotta_gotta_have_it) = desire # UEB data is stored as (length,data). if self._overrun_ok: # We can pre-fetch 2kb, which should probably cover it. If it # turns out to be larger, we'll come back here later with a known # length and fetch the rest. want_it.add(o["uri_extension"], 2048) # now, while that is probably enough to fetch the whole UEB, it # might not be, so we need to do the next few steps as well. In # most cases, the following steps will not actually add anything # to need_it need_it.add(o["uri_extension"], self._fieldsize) # only use a length if we're sure it's correct, otherwise we'll # probably fetch a huge number if not self.actual_offsets: return UEB_length_s = self._received.get(o["uri_extension"], self._fieldsize) if UEB_length_s: (UEB_length,) = struct.unpack(">"+self._fieldstruct, UEB_length_s) # we know the length, so make sure we grab everything need_it.add(o["uri_extension"]+self._fieldsize, UEB_length) def _desire_share_hashes(self, desire, o): (want_it, need_it, gotta_gotta_have_it) = desire if self._node.share_hash_tree.needed_hashes(self._shnum): hashlen = o["uri_extension"] - o["share_hashes"] need_it.add(o["share_hashes"], hashlen) def _desire_block_hashes(self, desire, o, segnum): (want_it, need_it, gotta_gotta_have_it) = desire # block hash chain for hashnum in self._commonshare.get_desired_block_hashes(segnum): need_it.add(o["block_hashes"]+hashnum*HASH_SIZE, HASH_SIZE) # ciphertext hash chain for hashnum in self._node.get_desired_ciphertext_hashes(segnum): need_it.add(o["crypttext_hash_tree"]+hashnum*HASH_SIZE, HASH_SIZE) def _desire_data(self, desire, o, r, segnum, segsize): if segnum > r["num_segments"]: # they're asking for a segment that's beyond what we think is the # end of the file. We won't get here if we've already learned the # real UEB: _get_satisfaction() will notice the out-of-bounds and # terminate the loop. So we must still be guessing, which means # that they might be correct in asking for such a large segnum. # But if they're right, then our segsize/segnum guess is # certainly wrong, which means we don't know what data blocks to # ask for yet. So don't bother adding anything. When the UEB # comes back and we learn the correct segsize/segnums, we'll # either reject the request or have enough information to proceed # normally. This costs one roundtrip. 
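# _desire_data() and _satisfy_data_block() both locate a block the same way:
# blocks are packed back-to-back in the share's data area, and only the final
# segment uses the (shorter) tail block size. A minimal helper showing that
# arithmetic; the function name is illustrative.

def block_extent(datastart, segnum, num_segments, block_size, tail_block_size):
    """Return (start, length) of one share block within the share."""
    start = datastart + segnum * block_size
    length = tail_block_size if segnum == num_segments - 1 else block_size
    return (start, length)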
log.msg("_desire_data: segnum(%d) looks wrong (numsegs=%d)" % (segnum, r["num_segments"]), level=log.UNUSUAL, parent=self._lp, umid="tuYRQQ") return (want_it, need_it, gotta_gotta_have_it) = desire tail = (segnum == r["num_segments"]-1) datastart = o["data"] blockstart = datastart + segnum * r["block_size"] blocklen = r["block_size"] if tail: blocklen = r["tail_block_size"] need_it.add(blockstart, blocklen) def _send_requests(self, desired): ask = desired - self._pending - self._received.get_spans() log.msg("%s._send_requests, desired=%s, pending=%s, ask=%s" % (repr(self), desired.dump(), self._pending.dump(), ask.dump()), level=log.NOISY, parent=self._lp, umid="E94CVA") # XXX At one time, this code distinguished between data blocks and # hashes, and made sure to send (small) requests for hashes before # sending (big) requests for blocks. The idea was to make sure that # all hashes arrive before the blocks, so the blocks can be consumed # and released in a single turn. I removed this for simplicity. # Reconsider the removal: maybe bring it back. ds = self._download_status for (start, length) in ask: # TODO: quantize to reasonably-large blocks self._pending.add(start, length) lp = log.msg(format="%(share)s._send_request" " [%(start)d:+%(length)d]", share=repr(self), start=start, length=length, level=log.NOISY, parent=self._lp, umid="sgVAyA") block_ev = ds.add_block_request(self._server, self._shnum, start, length, now()) d = self._send_request(start, length) d.addCallback(self._got_data, start, length, block_ev, lp) d.addErrback(self._got_error, start, length, block_ev, lp) d.addCallback(self._trigger_loop) d.addErrback(lambda f: log.err(format="unhandled error during send_request", failure=f, parent=self._lp, level=log.WEIRD, umid="qZu0wg")) def _send_request(self, start, length): return self._rref.callRemote("read", start, length) def _got_data(self, data, start, length, block_ev, lp): block_ev.finished(len(data), now()) if not self._alive: return log.msg(format="%(share)s._got_data [%(start)d:+%(length)d] -> %(datalen)d", share=repr(self), start=start, length=length, datalen=len(data), level=log.NOISY, parent=lp, umid="5Qn6VQ") self._pending.remove(start, length) self._received.add(start, data) # if we ask for [a:c], and we get back [a:b] (b= numsegs: # oops, we were asking for a segment number beyond the end of the # file. This is an error. self.stop() e = BadSegmentNumberError("segnum=%d, numsegs=%d" % (self.segnum, self._node.num_segments)) f = Failure(e) self._node.fetch_failed(self, f) return #print "LOOP", self._blocks.keys(), "active:", self._active_share_map, "overdue:", self._overdue_share_map, "unused:", self._shares # Should we sent out more requests? while len(set(self._blocks.keys()) | set(self._active_share_map.keys()) ) < k: # we don't have data or active requests for enough shares. Are # there any unused shares we can start using? (sent_something, want_more_diversity) = self._find_and_use_share() if sent_something: # great. loop back around in case we need to send more. continue if want_more_diversity: # we could have sent something if we'd been allowed to pull # more shares per server. Increase the limit and try again. self._max_shares_per_server += 1 log.msg("SegmentFetcher(%s) increasing diversity limit to %d" % (self._node._si_prefix, self._max_shares_per_server), level=log.NOISY, umid="xY2pBA") # Also ask for more shares, in the hopes of achieving better # diversity for the next segment. 
self._ask_for_more_shares() continue # we need more shares than the ones in self._shares to make # progress self._ask_for_more_shares() if self._no_more_shares: # But there are no more shares to be had. If we're going to # succeed, it will be with the shares we've already seen. # Will they be enough? if len(set(self._blocks.keys()) | set(self._active_share_map.keys()) | set(self._overdue_share_map.keys()) ) < k: # nope. bail. self._no_shares_error() # this calls self.stop() return # our outstanding or overdue requests may yet work. # more shares may be coming. Wait until then. return # are we done? if len(set(self._blocks.keys())) >= k: # yay! self.stop() self._node.process_blocks(self.segnum, self._blocks) return def _no_shares_error(self): if not (self._shares or self._active_share_map or self._overdue_share_map or self._blocks): format = ("no shares (need %(k)d)." " Last failure: %(last_failure)s") args = { "k": self._k, "last_failure": self._last_failure } error = NoSharesError else: format = ("ran out of shares: complete=%(complete)s" " pending=%(pending)s overdue=%(overdue)s" " unused=%(unused)s need %(k)d." " Last failure: %(last_failure)s") def join(shnums): return ",".join(["sh%d" % shnum for shnum in sorted(shnums)]) pending_s = ",".join([str(sh) for sh in self._active_share_map.values()]) overdue = set() for shares in self._overdue_share_map.values(): overdue |= shares overdue_s = ",".join([str(sh) for sh in overdue]) args = {"complete": join(self._blocks.keys()), "pending": pending_s, "overdue": overdue_s, # 'unused' should be zero "unused": ",".join([str(sh) for sh in self._shares]), "k": self._k, "last_failure": self._last_failure, } error = NotEnoughSharesError log.msg(format=format, level=log.UNUSUAL, parent=self._lp, umid="1DsnTg", **args) e = error(format % args) f = Failure(e) self.stop() self._node.fetch_failed(self, f) def _find_and_use_share(self): sent_something = False want_more_diversity = False for sh in self._shares: # find one good share to fetch shnum = sh._shnum ; server = sh._server # XXX if shnum in self._blocks: continue # don't request data we already have if shnum in self._active_share_map: # note: OVERDUE shares are removed from _active_share_map # and added to _overdue_share_map instead. continue # don't send redundant requests sfs = self._shares_from_server if len(sfs.get(server,set())) >= self._max_shares_per_server: # don't pull too much from a single server want_more_diversity = True continue # ok, we can use this share self._shares.remove(sh) self._active_share_map[shnum] = sh self._shares_from_server.add(server, sh) self._start_share(sh, shnum) sent_something = True break return (sent_something, want_more_diversity) def _start_share(self, share, shnum): self._share_observers[share] = o = share.get_block(self.segnum) o.subscribe(self._block_request_activity, share=share, shnum=shnum) def _ask_for_more_shares(self): if not self._no_more_shares: self._node.want_more_shares() # that will trigger the ShareFinder to keep looking, and call our # add_shares() or no_more_shares() later. def _cancel_all_requests(self): for o in self._share_observers.values(): o.cancel() self._share_observers = {} def _block_request_activity(self, share, shnum, state, block=None, f=None): # called by Shares, in response to our s.send_request() calls. 
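# _find_and_use_share() above starts at most one new request per call: it
# skips shares it already holds or has in flight, and it refuses to lean on
# any single server for more than _max_shares_per_server shares (the caller
# raises that cap when nothing else would make progress). A simplified,
# standalone version of that selection rule; the argument shapes here are
# hypothetical stand-ins for the fetcher's internal maps.

def pick_share(candidates, have, active, per_server_count, max_per_server):
    """candidates: iterable of (shnum, server); returns (choice or None, want_more_diversity)."""
    want_more_diversity = False
    for shnum, server in candidates:
        if shnum in have or shnum in active:
            continue                    # already satisfied, or request in flight
        if per_server_count.get(server, 0) >= max_per_server:
            want_more_diversity = True  # usable, but blocked only by the cap
            continue
        return (shnum, server), want_more_diversity
    return None, want_more_diversity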
if not self._running: return log.msg("SegmentFetcher(%s)._block_request_activity: %s -> %s" % (self._node._si_prefix, repr(share), state), level=log.NOISY, parent=self._lp, umid="vilNWA") # COMPLETE, CORRUPT, DEAD, BADSEGNUM are terminal. Remove the share # from all our tracking lists. if state in (COMPLETE, CORRUPT, DEAD, BADSEGNUM): self._share_observers.pop(share, None) server = share._server # XXX self._shares_from_server.discard(server, share) if self._active_share_map.get(shnum) is share: del self._active_share_map[shnum] self._overdue_share_map.discard(shnum, share) if state is COMPLETE: # 'block' is fully validated and complete self._blocks[shnum] = block if state is OVERDUE: # no longer active, but still might complete del self._active_share_map[shnum] self._overdue_share_map.add(shnum, share) # OVERDUE is not terminal: it will eventually transition to # COMPLETE, CORRUPT, or DEAD. if state is DEAD: self._last_failure = f if state is BADSEGNUM: # our main loop will ask the DownloadNode each time for the # number of segments, so we'll deal with this in the top of # _do_loop pass eventually(self.loop) allmydata-tahoe-1.10.2/src/allmydata/immutable/upload.py0000644000175000017500000021040212556560070021342 0ustar ramramimport os, time, weakref, itertools from zope.interface import implements from twisted.python import failure from twisted.internet import defer from twisted.application import service from foolscap.api import Referenceable, Copyable, RemoteCopy, fireEventually from allmydata.util.hashutil import file_renewal_secret_hash, \ file_cancel_secret_hash, bucket_renewal_secret_hash, \ bucket_cancel_secret_hash, plaintext_hasher, \ storage_index_hash, plaintext_segment_hasher, convergence_hasher from allmydata import hashtree, uri from allmydata.storage.server import si_b2a from allmydata.immutable import encode from allmydata.util import base32, dictutil, idlib, log, mathutil from allmydata.util.happinessutil import servers_of_happiness, \ shares_by_server, merge_servers, \ failure_message from allmydata.util.assertutil import precondition, _assert from allmydata.util.rrefutil import add_version_to_remote_reference from allmydata.interfaces import IUploadable, IUploader, IUploadResults, \ IEncryptedUploadable, RIEncryptedUploadable, IUploadStatus, \ NoServersError, InsufficientVersionError, UploadUnhappinessError, \ DEFAULT_MAX_SEGMENT_SIZE from allmydata.immutable import layout from pycryptopp.cipher.aes import AES from cStringIO import StringIO # this wants to live in storage, not here class TooFullError(Exception): pass # HelperUploadResults are what we get from the Helper, and to retain # backwards compatibility with old Helpers we can't change the format. We # convert them into a local UploadResults upon receipt. class HelperUploadResults(Copyable, RemoteCopy): # note: don't change this string, it needs to match the value used on the # helper, and it does *not* need to match the fully-qualified # package/module/class name typeToCopy = "allmydata.upload.UploadResults.tahoe.allmydata.com" copytype = typeToCopy # also, think twice about changing the shape of any existing attribute, # because instances of this class are sent from the helper to its client, # so changing this may break compatibility. Consider adding new fields # instead of modifying existing ones. 
def __init__(self): self.timings = {} # dict of name to number of seconds self.sharemap = dictutil.DictOfSets() # {shnum: set(serverid)} self.servermap = dictutil.DictOfSets() # {serverid: set(shnum)} self.file_size = None self.ciphertext_fetched = None # how much the helper fetched self.uri = None self.preexisting_shares = None # count of shares already present self.pushed_shares = None # count of shares we pushed class UploadResults: implements(IUploadResults) def __init__(self, file_size, ciphertext_fetched, # how much the helper fetched preexisting_shares, # count of shares already present pushed_shares, # count of shares we pushed sharemap, # {shnum: set(server)} servermap, # {server: set(shnum)} timings, # dict of name to number of seconds uri_extension_data, uri_extension_hash, verifycapstr): self._file_size = file_size self._ciphertext_fetched = ciphertext_fetched self._preexisting_shares = preexisting_shares self._pushed_shares = pushed_shares self._sharemap = sharemap self._servermap = servermap self._timings = timings self._uri_extension_data = uri_extension_data self._uri_extension_hash = uri_extension_hash self._verifycapstr = verifycapstr def set_uri(self, uri): self._uri = uri def get_file_size(self): return self._file_size def get_uri(self): return self._uri def get_ciphertext_fetched(self): return self._ciphertext_fetched def get_preexisting_shares(self): return self._preexisting_shares def get_pushed_shares(self): return self._pushed_shares def get_sharemap(self): return self._sharemap def get_servermap(self): return self._servermap def get_timings(self): return self._timings def get_uri_extension_data(self): return self._uri_extension_data def get_verifycapstr(self): return self._verifycapstr # our current uri_extension is 846 bytes for small files, a few bytes # more for larger ones (since the filesize is encoded in decimal in a # few places). Ask for a little bit more just in case we need it. If # the extension changes size, we can change EXTENSION_SIZE to # allocate a more accurate amount of space. EXTENSION_SIZE = 1000 # TODO: actual extensions are closer to 419 bytes, so we can probably lower # this. 
def pretty_print_shnum_to_servers(s): return ', '.join([ "sh%s: %s" % (k, '+'.join([idlib.shortnodeid_b2a(x) for x in v])) for k, v in s.iteritems() ]) class ServerTracker: def __init__(self, server, sharesize, blocksize, num_segments, num_share_hashes, storage_index, bucket_renewal_secret, bucket_cancel_secret): self._server = server self.buckets = {} # k: shareid, v: IRemoteBucketWriter self.sharesize = sharesize wbp = layout.make_write_bucket_proxy(None, None, sharesize, blocksize, num_segments, num_share_hashes, EXTENSION_SIZE) self.wbp_class = wbp.__class__ # to create more of them self.allocated_size = wbp.get_allocated_size() self.blocksize = blocksize self.num_segments = num_segments self.num_share_hashes = num_share_hashes self.storage_index = storage_index self.renew_secret = bucket_renewal_secret self.cancel_secret = bucket_cancel_secret def __repr__(self): return ("" % (self._server.get_name(), si_b2a(self.storage_index)[:5])) def get_server(self): return self._server def get_serverid(self): return self._server.get_serverid() def get_name(self): return self._server.get_name() def query(self, sharenums): rref = self._server.get_rref() d = rref.callRemote("allocate_buckets", self.storage_index, self.renew_secret, self.cancel_secret, sharenums, self.allocated_size, canary=Referenceable()) d.addCallback(self._got_reply) return d def ask_about_existing_shares(self): rref = self._server.get_rref() return rref.callRemote("get_buckets", self.storage_index) def _got_reply(self, (alreadygot, buckets)): #log.msg("%s._got_reply(%s)" % (self, (alreadygot, buckets))) b = {} for sharenum, rref in buckets.iteritems(): bp = self.wbp_class(rref, self._server, self.sharesize, self.blocksize, self.num_segments, self.num_share_hashes, EXTENSION_SIZE) b[sharenum] = bp self.buckets.update(b) return (alreadygot, set(b.keys())) def abort(self): """ I abort the remote bucket writers for all shares. This is a good idea to conserve space on the storage server. """ self.abort_some_buckets(self.buckets.keys()) def abort_some_buckets(self, sharenums): """ I abort the remote bucket writers for the share numbers in sharenums. """ for sharenum in sharenums: if sharenum in self.buckets: self.buckets[sharenum].abort() del self.buckets[sharenum] def str_shareloc(shnum, bucketwriter): return "%s: %s" % (shnum, bucketwriter.get_servername(),) class Tahoe2ServerSelector(log.PrefixingLogMixin): def __init__(self, upload_id, logparent=None, upload_status=None): self.upload_id = upload_id self.query_count, self.good_query_count, self.bad_query_count = 0,0,0 # Servers that are working normally, but full. self.full_count = 0 self.error_count = 0 self.num_servers_contacted = 0 self.last_failure_msg = None self._status = IUploadStatus(upload_status) log.PrefixingLogMixin.__init__(self, 'tahoe.immutable.upload', logparent, prefix=upload_id) self.log("starting", level=log.OPERATIONAL) def __repr__(self): return "" % self.upload_id def get_shareholders(self, storage_broker, secret_holder, storage_index, share_size, block_size, num_segments, total_shares, needed_shares, servers_of_happiness): """ @return: (upload_trackers, already_serverids), where upload_trackers is a set of ServerTracker instances that have agreed to hold some shares for us (the shareids are stashed inside the ServerTracker), and already_serverids is a dict mapping shnum to a set of serverids for servers which claim to already have the share. 
""" if self._status: self._status.set_status("Contacting Servers..") self.total_shares = total_shares self.servers_of_happiness = servers_of_happiness self.needed_shares = needed_shares self.homeless_shares = set(range(total_shares)) self.use_trackers = set() # ServerTrackers that have shares assigned # to them self.preexisting_shares = {} # shareid => set(serverids) holding shareid # These servers have shares -- any shares -- for our SI. We keep # track of these to write an error message with them later. self.serverids_with_shares = set() # this needed_hashes computation should mirror # Encoder.send_all_share_hash_trees. We use an IncompleteHashTree # (instead of a HashTree) because we don't require actual hashing # just to count the levels. ht = hashtree.IncompleteHashTree(total_shares) num_share_hashes = len(ht.needed_hashes(0, include_leaf=True)) # figure out how much space to ask for wbp = layout.make_write_bucket_proxy(None, None, share_size, 0, num_segments, num_share_hashes, EXTENSION_SIZE) allocated_size = wbp.get_allocated_size() all_servers = storage_broker.get_servers_for_psi(storage_index) if not all_servers: raise NoServersError("client gave us zero servers") # filter the list of servers according to which ones can accomodate # this request. This excludes older servers (which used a 4-byte size # field) from getting large shares (for files larger than about # 12GiB). See #439 for details. def _get_maxsize(server): v0 = server.get_rref().version v1 = v0["http://allmydata.org/tahoe/protocols/storage/v1"] return v1["maximum-immutable-share-size"] writeable_servers = [server for server in all_servers if _get_maxsize(server) >= allocated_size] readonly_servers = set(all_servers[:2*total_shares]) - set(writeable_servers) # decide upon the renewal/cancel secrets, to include them in the # allocate_buckets query. client_renewal_secret = secret_holder.get_renewal_secret() client_cancel_secret = secret_holder.get_cancel_secret() file_renewal_secret = file_renewal_secret_hash(client_renewal_secret, storage_index) file_cancel_secret = file_cancel_secret_hash(client_cancel_secret, storage_index) def _make_trackers(servers): trackers = [] for s in servers: seed = s.get_lease_seed() renew = bucket_renewal_secret_hash(file_renewal_secret, seed) cancel = bucket_cancel_secret_hash(file_cancel_secret, seed) st = ServerTracker(s, share_size, block_size, num_segments, num_share_hashes, storage_index, renew, cancel) trackers.append(st) return trackers # We assign each servers/trackers into one three lists. They all # start in the "first pass" list. During the first pass, as we ask # each one to hold a share, we move their tracker to the "second # pass" list, until the first-pass list is empty. Then during the # second pass, as we ask each to hold more shares, we move their # tracker to the "next pass" list, until the second-pass list is # empty. Then we move everybody from the next-pass list back to the # second-pass list and repeat the "second" pass (really the third, # fourth, etc pass), until all shares are assigned, or we've run out # of potential servers. self.first_pass_trackers = _make_trackers(writeable_servers) self.second_pass_trackers = [] # servers worth asking again self.next_pass_trackers = [] # servers that we have asked again self._started_second_pass = False # We don't try to allocate shares to these servers, since they've # said that they're incapable of storing shares of the size that we'd # want to store. 
We ask them about existing shares for this storage # index, which we want to know about for accurate # servers_of_happiness accounting, then we forget about them. readonly_trackers = _make_trackers(readonly_servers) # We now ask servers that can't hold any new shares about existing # shares that they might have for our SI. Once this is done, we # start placing the shares that we haven't already accounted # for. ds = [] if self._status and readonly_trackers: self._status.set_status("Contacting readonly servers to find " "any existing shares") for tracker in readonly_trackers: assert isinstance(tracker, ServerTracker) d = tracker.ask_about_existing_shares() d.addBoth(self._handle_existing_response, tracker) ds.append(d) self.num_servers_contacted += 1 self.query_count += 1 self.log("asking server %s for any existing shares" % (tracker.get_name(),), level=log.NOISY) dl = defer.DeferredList(ds) dl.addCallback(lambda ign: self._loop()) return dl def _handle_existing_response(self, res, tracker): """ I handle responses to the queries sent by Tahoe2ServerSelector._existing_shares. """ serverid = tracker.get_serverid() if isinstance(res, failure.Failure): self.log("%s got error during existing shares check: %s" % (tracker.get_name(), res), level=log.UNUSUAL) self.error_count += 1 self.bad_query_count += 1 else: buckets = res if buckets: self.serverids_with_shares.add(serverid) self.log("response to get_buckets() from server %s: alreadygot=%s" % (tracker.get_name(), tuple(sorted(buckets))), level=log.NOISY) for bucket in buckets: self.preexisting_shares.setdefault(bucket, set()).add(serverid) self.homeless_shares.discard(bucket) self.full_count += 1 self.bad_query_count += 1 def _get_progress_message(self): if not self.homeless_shares: msg = "placed all %d shares, " % (self.total_shares) else: msg = ("placed %d shares out of %d total (%d homeless), " % (self.total_shares - len(self.homeless_shares), self.total_shares, len(self.homeless_shares))) return (msg + "want to place shares on at least %d servers such that " "any %d of them have enough shares to recover the file, " "sent %d queries to %d servers, " "%d queries placed some shares, %d placed none " "(of which %d placed none due to the server being" " full and %d placed none due to an error)" % (self.servers_of_happiness, self.needed_shares, self.query_count, self.num_servers_contacted, self.good_query_count, self.bad_query_count, self.full_count, self.error_count)) def _loop(self): if not self.homeless_shares: merged = merge_servers(self.preexisting_shares, self.use_trackers) effective_happiness = servers_of_happiness(merged) if self.servers_of_happiness <= effective_happiness: msg = ("server selection successful for %s: %s: pretty_print_merged: %s, " "self.use_trackers: %s, self.preexisting_shares: %s") \ % (self, self._get_progress_message(), pretty_print_shnum_to_servers(merged), [', '.join([str_shareloc(k,v) for k,v in st.buckets.iteritems()]) for st in self.use_trackers], pretty_print_shnum_to_servers(self.preexisting_shares)) self.log(msg, level=log.OPERATIONAL) return (self.use_trackers, self.preexisting_shares) else: # We're not okay right now, but maybe we can fix it by # redistributing some shares. 
In cases where one or two # servers has, before the upload, all or most of the # shares for a given SI, this can work by allowing _loop # a chance to spread those out over the other servers, delta = self.servers_of_happiness - effective_happiness shares = shares_by_server(self.preexisting_shares) # Each server in shares maps to a set of shares stored on it. # Since we want to keep at least one share on each server # that has one (otherwise we'd only be making # the situation worse by removing distinct servers), # each server has len(its shares) - 1 to spread around. shares_to_spread = sum([len(list(sharelist)) - 1 for (server, sharelist) in shares.items()]) if delta <= len(self.first_pass_trackers) and \ shares_to_spread >= delta: items = shares.items() while len(self.homeless_shares) < delta: # Loop through the allocated shares, removing # one from each server that has more than one # and putting it back into self.homeless_shares # until we've done this delta times. server, sharelist = items.pop() if len(sharelist) > 1: share = sharelist.pop() self.homeless_shares.add(share) self.preexisting_shares[share].remove(server) if not self.preexisting_shares[share]: del self.preexisting_shares[share] items.append((server, sharelist)) for writer in self.use_trackers: writer.abort_some_buckets(self.homeless_shares) return self._loop() else: # Redistribution won't help us; fail. server_count = len(self.serverids_with_shares) failmsg = failure_message(server_count, self.needed_shares, self.servers_of_happiness, effective_happiness) servmsgtempl = "server selection unsuccessful for %r: %s (%s), merged=%s" servmsg = servmsgtempl % ( self, failmsg, self._get_progress_message(), pretty_print_shnum_to_servers(merged) ) self.log(servmsg, level=log.INFREQUENT) return self._failed("%s (%s)" % (failmsg, self._get_progress_message())) if self.first_pass_trackers: tracker = self.first_pass_trackers.pop(0) # TODO: don't pre-convert all serverids to ServerTrackers assert isinstance(tracker, ServerTracker) shares_to_ask = set(sorted(self.homeless_shares)[:1]) self.homeless_shares -= shares_to_ask self.query_count += 1 self.num_servers_contacted += 1 if self._status: self._status.set_status("Contacting Servers [%s] (first query)," " %d shares left.." % (tracker.get_name(), len(self.homeless_shares))) d = tracker.query(shares_to_ask) d.addBoth(self._got_response, tracker, shares_to_ask, self.second_pass_trackers) return d elif self.second_pass_trackers: # ask a server that we've already asked. if not self._started_second_pass: self.log("starting second pass", level=log.NOISY) self._started_second_pass = True num_shares = mathutil.div_ceil(len(self.homeless_shares), len(self.second_pass_trackers)) tracker = self.second_pass_trackers.pop(0) shares_to_ask = set(sorted(self.homeless_shares)[:num_shares]) self.homeless_shares -= shares_to_ask self.query_count += 1 if self._status: self._status.set_status("Contacting Servers [%s] (second query)," " %d shares left.." % (tracker.get_name(), len(self.homeless_shares))) d = tracker.query(shares_to_ask) d.addBoth(self._got_response, tracker, shares_to_ask, self.next_pass_trackers) return d elif self.next_pass_trackers: # we've finished the second-or-later pass. Move all the remaining # servers back into self.second_pass_trackers for the next pass. self.second_pass_trackers.extend(self.next_pass_trackers) self.next_pass_trackers[:] = [] return self._loop() else: # no more servers. If we haven't placed enough shares, we fail. 
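# The "effective happiness" checked in _loop() comes from
# servers_of_happiness(), which scores a placement by how many distinct
# servers can each be credited with a distinct share -- i.e. the size of a
# maximum matching between servers and shares. The function below is an
# independent sketch of that idea (a simple augmenting-path matching), not
# the implementation in allmydata.util.happinessutil.

def happiness_sketch(sharemap):
    """sharemap: {shnum: set(serverids)} -> size of a maximum share<->server matching."""
    match = {}  # serverid -> shnum currently credited to it

    def try_assign(shnum, seen):
        for server in sharemap.get(shnum, set()):
            if server in seen:
                continue
            seen.add(server)
            if server not in match or try_assign(match[server], seen):
                match[server] = shnum
                return True
        return False

    return sum(1 for shnum in sharemap if try_assign(shnum, set()))

# Two shares that live only on the same server count once:
#   happiness_sketch({0: set(["A"]), 1: set(["A"]), 2: set(["B"])}) -> 2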
merged = merge_servers(self.preexisting_shares, self.use_trackers) effective_happiness = servers_of_happiness(merged) if effective_happiness < self.servers_of_happiness: msg = failure_message(len(self.serverids_with_shares), self.needed_shares, self.servers_of_happiness, effective_happiness) msg = ("server selection failed for %s: %s (%s)" % (self, msg, self._get_progress_message())) if self.last_failure_msg: msg += " (%s)" % (self.last_failure_msg,) self.log(msg, level=log.UNUSUAL) return self._failed(msg) else: # we placed enough to be happy, so we're done if self._status: self._status.set_status("Placed all shares") msg = ("server selection successful (no more servers) for %s: %s: %s" % (self, self._get_progress_message(), pretty_print_shnum_to_servers(merged))) self.log(msg, level=log.OPERATIONAL) return (self.use_trackers, self.preexisting_shares) def _got_response(self, res, tracker, shares_to_ask, put_tracker_here): if isinstance(res, failure.Failure): # This is unusual, and probably indicates a bug or a network # problem. self.log("%s got error during server selection: %s" % (tracker, res), level=log.UNUSUAL) self.error_count += 1 self.bad_query_count += 1 self.homeless_shares |= shares_to_ask if (self.first_pass_trackers or self.second_pass_trackers or self.next_pass_trackers): # there is still hope, so just loop pass else: # No more servers, so this upload might fail (it depends upon # whether we've hit servers_of_happiness or not). Log the last # failure we got: if a coding error causes all servers to fail # in the same way, this allows the common failure to be seen # by the uploader and should help with debugging msg = ("last failure (from %s) was: %s" % (tracker, res)) self.last_failure_msg = msg else: (alreadygot, allocated) = res self.log("response to allocate_buckets() from server %s: alreadygot=%s, allocated=%s" % (tracker.get_name(), tuple(sorted(alreadygot)), tuple(sorted(allocated))), level=log.NOISY) progress = False for s in alreadygot: self.preexisting_shares.setdefault(s, set()).add(tracker.get_serverid()) if s in self.homeless_shares: self.homeless_shares.remove(s) progress = True elif s in shares_to_ask: progress = True # the ServerTracker will remember which shares were allocated on # that peer. We just have to remember to use them. if allocated: self.use_trackers.add(tracker) progress = True if allocated or alreadygot: self.serverids_with_shares.add(tracker.get_serverid()) not_yet_present = set(shares_to_ask) - set(alreadygot) still_homeless = not_yet_present - set(allocated) if progress: # They accepted at least one of the shares that we asked # them to accept, or they had a share that we didn't ask # them to accept but that we hadn't placed yet, so this # was a productive query self.good_query_count += 1 else: self.bad_query_count += 1 self.full_count += 1 if still_homeless: # In networks with lots of space, this is very unusual and # probably indicates an error. In networks with servers that # are full, it is merely unusual. In networks that are very # full, it is common, and many uploads will fail. In most # cases, this is obviously not fatal, and we'll just use some # other servers. # some shares are still homeless, keep trying to find them a # home. The ones that were rejected get first priority. self.homeless_shares |= still_homeless # Since they were unable to accept all of our requests, so it # is safe to assume that asking them again won't help. else: # if they *were* able to accept everything, they might be # willing to accept even more. 
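# _got_response() above splits each allocate_buckets() answer into shares the
# server already had, shares it newly accepted, and shares that remain
# homeless, and uses that to decide whether the query counted as progress.
# A condensed, approximate sketch of that classification (the real accounting
# above also credits already-present shares that happened to be homeless):

def classify_allocation_response(shares_to_ask, alreadygot, allocated):
    """Return (still_homeless, progress) for one allocate_buckets() reply."""
    not_yet_present = set(shares_to_ask) - set(alreadygot)
    still_homeless = not_yet_present - set(allocated)
    progress = bool(allocated) or bool(set(alreadygot) & set(shares_to_ask))
    return still_homeless, progress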
put_tracker_here.append(tracker) # now loop return self._loop() def _failed(self, msg): """ I am called when server selection fails. I first abort all of the remote buckets that I allocated during my unsuccessful attempt to place shares for this file. I then raise an UploadUnhappinessError with my msg argument. """ for tracker in self.use_trackers: assert isinstance(tracker, ServerTracker) tracker.abort() raise UploadUnhappinessError(msg) class EncryptAnUploadable: """This is a wrapper that takes an IUploadable and provides IEncryptedUploadable.""" implements(IEncryptedUploadable) CHUNKSIZE = 50*1024 def __init__(self, original, log_parent=None): precondition(original.default_params_set, "set_default_encoding_parameters not called on %r before wrapping with EncryptAnUploadable" % (original,)) self.original = IUploadable(original) self._log_number = log_parent self._encryptor = None self._plaintext_hasher = plaintext_hasher() self._plaintext_segment_hasher = None self._plaintext_segment_hashes = [] self._encoding_parameters = None self._file_size = None self._ciphertext_bytes_read = 0 self._status = None def set_upload_status(self, upload_status): self._status = IUploadStatus(upload_status) self.original.set_upload_status(upload_status) def log(self, *args, **kwargs): if "facility" not in kwargs: kwargs["facility"] = "upload.encryption" if "parent" not in kwargs: kwargs["parent"] = self._log_number return log.msg(*args, **kwargs) def get_size(self): if self._file_size is not None: return defer.succeed(self._file_size) d = self.original.get_size() def _got_size(size): self._file_size = size if self._status: self._status.set_size(size) return size d.addCallback(_got_size) return d def get_all_encoding_parameters(self): if self._encoding_parameters is not None: return defer.succeed(self._encoding_parameters) d = self.original.get_all_encoding_parameters() def _got(encoding_parameters): (k, happy, n, segsize) = encoding_parameters self._segment_size = segsize # used by segment hashers self._encoding_parameters = encoding_parameters self.log("my encoding parameters: %s" % (encoding_parameters,), level=log.NOISY) return encoding_parameters d.addCallback(_got) return d def _get_encryptor(self): if self._encryptor: return defer.succeed(self._encryptor) d = self.original.get_encryption_key() def _got(key): e = AES(key) self._encryptor = e storage_index = storage_index_hash(key) assert isinstance(storage_index, str) # There's no point to having the SI be longer than the key, so we # specify that it is truncated to the same 128 bits as the AES key. 
assert len(storage_index) == 16 # SHA-256 truncated to 128b self._storage_index = storage_index if self._status: self._status.set_storage_index(storage_index) return e d.addCallback(_got) return d def get_storage_index(self): d = self._get_encryptor() d.addCallback(lambda res: self._storage_index) return d def _get_segment_hasher(self): p = self._plaintext_segment_hasher if p: left = self._segment_size - self._plaintext_segment_hashed_bytes return p, left p = plaintext_segment_hasher() self._plaintext_segment_hasher = p self._plaintext_segment_hashed_bytes = 0 return p, self._segment_size def _update_segment_hash(self, chunk): offset = 0 while offset < len(chunk): p, segment_left = self._get_segment_hasher() chunk_left = len(chunk) - offset this_segment = min(chunk_left, segment_left) p.update(chunk[offset:offset+this_segment]) self._plaintext_segment_hashed_bytes += this_segment if self._plaintext_segment_hashed_bytes == self._segment_size: # we've filled this segment self._plaintext_segment_hashes.append(p.digest()) self._plaintext_segment_hasher = None self.log("closed hash [%d]: %dB" % (len(self._plaintext_segment_hashes)-1, self._plaintext_segment_hashed_bytes), level=log.NOISY) self.log(format="plaintext leaf hash [%(segnum)d] is %(hash)s", segnum=len(self._plaintext_segment_hashes)-1, hash=base32.b2a(p.digest()), level=log.NOISY) offset += this_segment def read_encrypted(self, length, hash_only): # make sure our parameters have been set up first d = self.get_all_encoding_parameters() # and size d.addCallback(lambda ignored: self.get_size()) d.addCallback(lambda ignored: self._get_encryptor()) # then fetch and encrypt the plaintext. The unusual structure here # (passing a Deferred *into* a function) is needed to avoid # overflowing the stack: Deferreds don't optimize out tail recursion. # We also pass in a list, to which _read_encrypted will append # ciphertext. ciphertext = [] d2 = defer.Deferred() d.addCallback(lambda ignored: self._read_encrypted(length, ciphertext, hash_only, d2)) d.addCallback(lambda ignored: d2) return d def _read_encrypted(self, remaining, ciphertext, hash_only, fire_when_done): if not remaining: fire_when_done.callback(ciphertext) return None # tolerate large length= values without consuming a lot of RAM by # reading just a chunk (say 50kB) at a time. This only really matters # when hash_only==True (i.e. resuming an interrupted upload), since # that's the case where we will be skipping over a lot of data. size = min(remaining, self.CHUNKSIZE) remaining = remaining - size # read a chunk of plaintext.. d = defer.maybeDeferred(self.original.read, size) # N.B.: if read() is synchronous, then since everything else is # actually synchronous too, we'd blow the stack unless we stall for a # tick. Once you accept a Deferred from IUploadable.read(), you must # be prepared to have it fire immediately too. d.addCallback(fireEventually) def _good(plaintext): # and encrypt it.. # o/' over the fields we go, hashing all the way, sHA! sHA! sHA! o/' ct = self._hash_and_encrypt_plaintext(plaintext, hash_only) ciphertext.extend(ct) self._read_encrypted(remaining, ciphertext, hash_only, fire_when_done) def _err(why): fire_when_done.errback(why) d.addCallback(_good) d.addErrback(_err) return None def _hash_and_encrypt_plaintext(self, data, hash_only): assert isinstance(data, (tuple, list)), type(data) data = list(data) cryptdata = [] # we use data.pop(0) instead of 'for chunk in data' to save # memory: each chunk is destroyed as soon as we're done with it. 
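# The _hash_and_encrypt_plaintext() loop streams the plaintext through the
# hashers and the AES encryptor one chunk at a time, popping each chunk so it
# can be freed immediately. A condensed, self-contained version of that loop:
# it uses a plain hashlib.sha256 where the real code uses Tahoe's tagged
# hashers, and it omits the hash_only and per-segment-hash bookkeeping.

import hashlib
from pycryptopp.cipher.aes import AES

def hash_and_encrypt_sketch(chunks, key):
    """chunks: list of plaintext strings (consumed); returns (digest, ciphertext chunks)."""
    encryptor = AES(key)       # AES-CTR, same construction as _get_encryptor()
    hasher = hashlib.sha256()  # stand-in for the tagged plaintext hasher
    cryptdata = []
    while chunks:
        chunk = chunks.pop(0)  # pop so each chunk can be garbage-collected
        hasher.update(chunk)
        cryptdata.append(encryptor.process(chunk))
        del chunk
    return hasher.digest(), cryptdata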
bytes_processed = 0 while data: chunk = data.pop(0) self.log(" read_encrypted handling %dB-sized chunk" % len(chunk), level=log.NOISY) bytes_processed += len(chunk) self._plaintext_hasher.update(chunk) self._update_segment_hash(chunk) # TODO: we have to encrypt the data (even if hash_only==True) # because pycryptopp's AES-CTR implementation doesn't offer a # way to change the counter value. Once pycryptopp acquires # this ability, change this to simply update the counter # before each call to (hash_only==False) _encryptor.process() ciphertext = self._encryptor.process(chunk) if hash_only: self.log(" skipping encryption", level=log.NOISY) else: cryptdata.append(ciphertext) del ciphertext del chunk self._ciphertext_bytes_read += bytes_processed if self._status: progress = float(self._ciphertext_bytes_read) / self._file_size self._status.set_progress(1, progress) return cryptdata def get_plaintext_hashtree_leaves(self, first, last, num_segments): # this is currently unused, but will live again when we fix #453 if len(self._plaintext_segment_hashes) < num_segments: # close out the last one assert len(self._plaintext_segment_hashes) == num_segments-1 p, segment_left = self._get_segment_hasher() self._plaintext_segment_hashes.append(p.digest()) del self._plaintext_segment_hasher self.log("closing plaintext leaf hasher, hashed %d bytes" % self._plaintext_segment_hashed_bytes, level=log.NOISY) self.log(format="plaintext leaf hash [%(segnum)d] is %(hash)s", segnum=len(self._plaintext_segment_hashes)-1, hash=base32.b2a(p.digest()), level=log.NOISY) assert len(self._plaintext_segment_hashes) == num_segments return defer.succeed(tuple(self._plaintext_segment_hashes[first:last])) def get_plaintext_hash(self): h = self._plaintext_hasher.digest() return defer.succeed(h) def close(self): return self.original.close() class UploadStatus: implements(IUploadStatus) statusid_counter = itertools.count(0) def __init__(self): self.storage_index = None self.size = None self.helper = False self.status = "Not started" self.progress = [0.0, 0.0, 0.0] self.active = True self.results = None self.counter = self.statusid_counter.next() self.started = time.time() def get_started(self): return self.started def get_storage_index(self): return self.storage_index def get_size(self): return self.size def using_helper(self): return self.helper def get_status(self): return self.status def get_progress(self): return tuple(self.progress) def get_active(self): return self.active def get_results(self): return self.results def get_counter(self): return self.counter def set_storage_index(self, si): self.storage_index = si def set_size(self, size): self.size = size def set_helper(self, helper): self.helper = helper def set_status(self, status): self.status = status def set_progress(self, which, value): # [0]: chk, [1]: ciphertext, [2]: encode+push self.progress[which] = value def set_active(self, value): self.active = value def set_results(self, value): self.results = value class CHKUploader: server_selector_class = Tahoe2ServerSelector def __init__(self, storage_broker, secret_holder): # server_selector needs storage_broker and secret_holder self._storage_broker = storage_broker self._secret_holder = secret_holder self._log_number = self.log("CHKUploader starting", parent=None) self._encoder = None self._storage_index = None self._upload_status = UploadStatus() self._upload_status.set_helper(False) self._upload_status.set_active(True) # locate_all_shareholders() will create the following attribute: # self._server_trackers = {} # k: 
shnum, v: instance of ServerTracker def log(self, *args, **kwargs): if "parent" not in kwargs: kwargs["parent"] = self._log_number if "facility" not in kwargs: kwargs["facility"] = "tahoe.upload" return log.msg(*args, **kwargs) def start(self, encrypted_uploadable): """Start uploading the file. Returns a Deferred that will fire with the UploadResults instance. """ self._started = time.time() eu = IEncryptedUploadable(encrypted_uploadable) self.log("starting upload of %s" % eu) eu.set_upload_status(self._upload_status) d = self.start_encrypted(eu) def _done(uploadresults): self._upload_status.set_active(False) return uploadresults d.addBoth(_done) return d def abort(self): """Call this if the upload must be abandoned before it completes. This will tell the shareholders to delete their partial shares. I return a Deferred that fires when these messages have been acked.""" if not self._encoder: # how did you call abort() before calling start() ? return defer.succeed(None) return self._encoder.abort() def start_encrypted(self, encrypted): """ Returns a Deferred that will fire with the UploadResults instance. """ eu = IEncryptedUploadable(encrypted) started = time.time() self._encoder = e = encode.Encoder(self._log_number, self._upload_status) d = e.set_encrypted_uploadable(eu) d.addCallback(self.locate_all_shareholders, started) d.addCallback(self.set_shareholders, e) d.addCallback(lambda res: e.start()) d.addCallback(self._encrypted_done) return d def locate_all_shareholders(self, encoder, started): server_selection_started = now = time.time() self._storage_index_elapsed = now - started storage_broker = self._storage_broker secret_holder = self._secret_holder storage_index = encoder.get_param("storage_index") self._storage_index = storage_index upload_id = si_b2a(storage_index)[:5] self.log("using storage index %s" % upload_id) server_selector = self.server_selector_class(upload_id, self._log_number, self._upload_status) share_size = encoder.get_param("share_size") block_size = encoder.get_param("block_size") num_segments = encoder.get_param("num_segments") k,desired,n = encoder.get_param("share_counts") self._server_selection_started = time.time() d = server_selector.get_shareholders(storage_broker, secret_holder, storage_index, share_size, block_size, num_segments, n, k, desired) def _done(res): self._server_selection_elapsed = time.time() - server_selection_started return res d.addCallback(_done) return d def set_shareholders(self, (upload_trackers, already_serverids), encoder): """ @param upload_trackers: a sequence of ServerTracker objects that have agreed to hold some shares for us (the shareids are stashed inside the ServerTracker) @paran already_serverids: a dict mapping sharenum to a set of serverids for servers that claim to already have this share """ msgtempl = "set_shareholders; upload_trackers is %s, already_serverids is %s" values = ([', '.join([str_shareloc(k,v) for k,v in st.buckets.iteritems()]) for st in upload_trackers], already_serverids) self.log(msgtempl % values, level=log.OPERATIONAL) # record already-present shares in self._results self._count_preexisting_shares = len(already_serverids) self._server_trackers = {} # k: shnum, v: instance of ServerTracker for tracker in upload_trackers: assert isinstance(tracker, ServerTracker) buckets = {} servermap = already_serverids.copy() for tracker in upload_trackers: buckets.update(tracker.buckets) for shnum in tracker.buckets: self._server_trackers[shnum] = tracker servermap.setdefault(shnum, set()).add(tracker.get_serverid()) 
assert len(buckets) == sum([len(tracker.buckets) for tracker in upload_trackers]), \ "%s (%s) != %s (%s)" % ( len(buckets), buckets, sum([len(tracker.buckets) for tracker in upload_trackers]), [(t.buckets, t.get_serverid()) for t in upload_trackers] ) encoder.set_shareholders(buckets, servermap) def _encrypted_done(self, verifycap): """Returns a Deferred that will fire with the UploadResults instance.""" e = self._encoder sharemap = dictutil.DictOfSets() servermap = dictutil.DictOfSets() for shnum in e.get_shares_placed(): server = self._server_trackers[shnum].get_server() sharemap.add(shnum, server) servermap.add(server, shnum) now = time.time() timings = {} timings["total"] = now - self._started timings["storage_index"] = self._storage_index_elapsed timings["peer_selection"] = self._server_selection_elapsed timings.update(e.get_times()) ur = UploadResults(file_size=e.file_size, ciphertext_fetched=0, preexisting_shares=self._count_preexisting_shares, pushed_shares=len(e.get_shares_placed()), sharemap=sharemap, servermap=servermap, timings=timings, uri_extension_data=e.get_uri_extension_data(), uri_extension_hash=e.get_uri_extension_hash(), verifycapstr=verifycap.to_string()) self._upload_status.set_results(ur) return ur def get_upload_status(self): return self._upload_status def read_this_many_bytes(uploadable, size, prepend_data=[]): if size == 0: return defer.succeed([]) d = uploadable.read(size) def _got(data): assert isinstance(data, list) bytes = sum([len(piece) for piece in data]) assert bytes > 0 assert bytes <= size remaining = size - bytes if remaining: return read_this_many_bytes(uploadable, remaining, prepend_data + data) return prepend_data + data d.addCallback(_got) return d class LiteralUploader: def __init__(self): self._status = s = UploadStatus() s.set_storage_index(None) s.set_helper(False) s.set_progress(0, 1.0) s.set_active(False) def start(self, uploadable): uploadable = IUploadable(uploadable) d = uploadable.get_size() def _got_size(size): self._size = size self._status.set_size(size) return read_this_many_bytes(uploadable, size) d.addCallback(_got_size) d.addCallback(lambda data: uri.LiteralFileURI("".join(data))) d.addCallback(lambda u: u.to_string()) d.addCallback(self._build_results) return d def _build_results(self, uri): ur = UploadResults(file_size=self._size, ciphertext_fetched=0, preexisting_shares=0, pushed_shares=0, sharemap={}, servermap={}, timings={}, uri_extension_data=None, uri_extension_hash=None, verifycapstr=None) ur.set_uri(uri) self._status.set_status("Finished") self._status.set_progress(1, 1.0) self._status.set_progress(2, 1.0) self._status.set_results(ur) return ur def close(self): pass def get_upload_status(self): return self._status class RemoteEncryptedUploadable(Referenceable): implements(RIEncryptedUploadable) def __init__(self, encrypted_uploadable, upload_status): self._eu = IEncryptedUploadable(encrypted_uploadable) self._offset = 0 self._bytes_sent = 0 self._status = IUploadStatus(upload_status) # we are responsible for updating the status string while we run, and # for setting the ciphertext-fetch progress. 
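# --- Illustrative sketch (not part of the original source) ---
# read_this_many_bytes() above keeps calling uploadable.read() until exactly
# `size` bytes have been gathered, because read() may return fewer bytes than
# requested.  The toy uploadable below is hypothetical, only for illustration.
class _ToyUploadable:
    def __init__(self, data, chunk=3):
        self._data, self._chunk = data, chunk
    def read(self, length):
        piece = self._data[:min(length, self._chunk)]
        self._data = self._data[len(piece):]
        return defer.succeed([piece])          # a list of pieces, like IUploadable

d = read_this_many_bytes(_ToyUploadable("hello world"), 11)
d.addCallback(lambda pieces: "".join(pieces))  # fires with "hello world"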
        self._size = None

    def get_size(self):
        if self._size is not None:
            return defer.succeed(self._size)
        d = self._eu.get_size()
        def _got_size(size):
            self._size = size
            return size
        d.addCallback(_got_size)
        return d

    def remote_get_size(self):
        return self.get_size()
    def remote_get_all_encoding_parameters(self):
        return self._eu.get_all_encoding_parameters()

    def _read_encrypted(self, length, hash_only):
        d = self._eu.read_encrypted(length, hash_only)
        def _read(strings):
            if hash_only:
                self._offset += length
            else:
                size = sum([len(data) for data in strings])
                self._offset += size
            return strings
        d.addCallback(_read)
        return d

    def remote_read_encrypted(self, offset, length):
        # we don't support seek backwards, but we allow skipping forwards
        precondition(offset >= 0, offset)
        precondition(length >= 0, length)
        lp = log.msg("remote_read_encrypted(%d-%d)" % (offset, offset+length),
                     level=log.NOISY)
        precondition(offset >= self._offset, offset, self._offset)
        if offset > self._offset:
            # read the data from disk anyways, to build up the hash tree
            skip = offset - self._offset
            log.msg("remote_read_encrypted skipping ahead from %d to %d, skip=%d" %
                    (self._offset, offset, skip),
                    level=log.UNUSUAL, parent=lp)
            d = self._read_encrypted(skip, hash_only=True)
        else:
            d = defer.succeed(None)

        def _at_correct_offset(res):
            assert offset == self._offset, "%d != %d" % (offset, self._offset)
            return self._read_encrypted(length, hash_only=False)
        d.addCallback(_at_correct_offset)

        def _read(strings):
            size = sum([len(data) for data in strings])
            self._bytes_sent += size
            return strings
        d.addCallback(_read)
        return d

    def remote_close(self):
        return self._eu.close()


class AssistedUploader:

    def __init__(self, helper, storage_broker):
        self._helper = helper
        self._storage_broker = storage_broker
        self._log_number = log.msg("AssistedUploader starting")
        self._storage_index = None
        self._upload_status = s = UploadStatus()
        s.set_helper(True)
        s.set_active(True)

    def log(self, *args, **kwargs):
        if "parent" not in kwargs:
            kwargs["parent"] = self._log_number
        return log.msg(*args, **kwargs)

    def start(self, encrypted_uploadable, storage_index):
        """Start uploading the file.

        Returns a Deferred that will fire with the UploadResults instance.
""" precondition(isinstance(storage_index, str), storage_index) self._started = time.time() eu = IEncryptedUploadable(encrypted_uploadable) eu.set_upload_status(self._upload_status) self._encuploadable = eu self._storage_index = storage_index d = eu.get_size() d.addCallback(self._got_size) d.addCallback(lambda res: eu.get_all_encoding_parameters()) d.addCallback(self._got_all_encoding_parameters) d.addCallback(self._contact_helper) d.addCallback(self._build_verifycap) def _done(res): self._upload_status.set_active(False) return res d.addBoth(_done) return d def _got_size(self, size): self._size = size self._upload_status.set_size(size) def _got_all_encoding_parameters(self, params): k, happy, n, segment_size = params # stash these for URI generation later self._needed_shares = k self._total_shares = n self._segment_size = segment_size def _contact_helper(self, res): now = self._time_contacting_helper_start = time.time() self._storage_index_elapsed = now - self._started self.log(format="contacting helper for SI %(si)s..", si=si_b2a(self._storage_index), level=log.NOISY) self._upload_status.set_status("Contacting Helper") d = self._helper.callRemote("upload_chk", self._storage_index) d.addCallback(self._contacted_helper) return d def _contacted_helper(self, (helper_upload_results, upload_helper)): now = time.time() elapsed = now - self._time_contacting_helper_start self._elapsed_time_contacting_helper = elapsed if upload_helper: self.log("helper says we need to upload", level=log.NOISY) self._upload_status.set_status("Uploading Ciphertext") # we need to upload the file reu = RemoteEncryptedUploadable(self._encuploadable, self._upload_status) # let it pre-compute the size for progress purposes d = reu.get_size() d.addCallback(lambda ignored: upload_helper.callRemote("upload", reu)) # this Deferred will fire with the upload results return d self.log("helper says file is already uploaded", level=log.OPERATIONAL) self._upload_status.set_progress(1, 1.0) return helper_upload_results def _convert_old_upload_results(self, upload_results): # pre-1.3.0 helpers return upload results which contain a mapping # from shnum to a single human-readable string, containing things # like "Found on [x],[y],[z]" (for healthy files that were already in # the grid), "Found on [x]" (for files that needed upload but which # discovered pre-existing shares), and "Placed on [x]" (for newly # uploaded shares). The 1.3.0 helper returns a mapping from shnum to # set of binary serverid strings. # the old results are too hard to deal with (they don't even contain # as much information as the new results, since the nodeids are # abbreviated), so if we detect old results, just clobber them. 
sharemap = upload_results.sharemap if str in [type(v) for v in sharemap.values()]: upload_results.sharemap = None def _build_verifycap(self, helper_upload_results): self.log("upload finished, building readcap", level=log.OPERATIONAL) self._convert_old_upload_results(helper_upload_results) self._upload_status.set_status("Building Readcap") hur = helper_upload_results assert hur.uri_extension_data["needed_shares"] == self._needed_shares assert hur.uri_extension_data["total_shares"] == self._total_shares assert hur.uri_extension_data["segment_size"] == self._segment_size assert hur.uri_extension_data["size"] == self._size # hur.verifycap doesn't exist if already found v = uri.CHKFileVerifierURI(self._storage_index, uri_extension_hash=hur.uri_extension_hash, needed_shares=self._needed_shares, total_shares=self._total_shares, size=self._size) timings = {} timings["storage_index"] = self._storage_index_elapsed timings["contacting_helper"] = self._elapsed_time_contacting_helper for key,val in hur.timings.items(): if key == "total": key = "helper_total" timings[key] = val now = time.time() timings["total"] = now - self._started gss = self._storage_broker.get_stub_server sharemap = {} servermap = {} for shnum, serverids in hur.sharemap.items(): sharemap[shnum] = set([gss(serverid) for serverid in serverids]) # if the file was already in the grid, hur.servermap is an empty dict for serverid, shnums in hur.servermap.items(): servermap[gss(serverid)] = set(shnums) ur = UploadResults(file_size=self._size, # not if already found ciphertext_fetched=hur.ciphertext_fetched, preexisting_shares=hur.preexisting_shares, pushed_shares=hur.pushed_shares, sharemap=sharemap, servermap=servermap, timings=timings, uri_extension_data=hur.uri_extension_data, uri_extension_hash=hur.uri_extension_hash, verifycapstr=v.to_string()) self._upload_status.set_status("Finished") self._upload_status.set_results(ur) return ur def get_upload_status(self): return self._upload_status class BaseUploadable: # this is overridden by max_segment_size default_max_segment_size = DEFAULT_MAX_SEGMENT_SIZE default_params_set = False max_segment_size = None encoding_param_k = None encoding_param_happy = None encoding_param_n = None _all_encoding_parameters = None _status = None def set_upload_status(self, upload_status): self._status = IUploadStatus(upload_status) def set_default_encoding_parameters(self, default_params): assert isinstance(default_params, dict) for k,v in default_params.items(): precondition(isinstance(k, str), k, v) precondition(isinstance(v, int), k, v) if "k" in default_params: self.default_encoding_param_k = default_params["k"] if "happy" in default_params: self.default_encoding_param_happy = default_params["happy"] if "n" in default_params: self.default_encoding_param_n = default_params["n"] if "max_segment_size" in default_params: self.default_max_segment_size = default_params["max_segment_size"] self.default_params_set = True def get_all_encoding_parameters(self): _assert(self.default_params_set, "set_default_encoding_parameters not called on %r" % (self,)) if self._all_encoding_parameters: return defer.succeed(self._all_encoding_parameters) max_segsize = self.max_segment_size or self.default_max_segment_size k = self.encoding_param_k or self.default_encoding_param_k happy = self.encoding_param_happy or self.default_encoding_param_happy n = self.encoding_param_n or self.default_encoding_param_n d = self.get_size() def _got_size(file_size): # for small files, shrink the segment size to avoid wasting space segsize = 
min(max_segsize, file_size) # this must be a multiple of 'required_shares'==k segsize = mathutil.next_multiple(segsize, k) encoding_parameters = (k, happy, n, segsize) self._all_encoding_parameters = encoding_parameters return encoding_parameters d.addCallback(_got_size) return d class FileHandle(BaseUploadable): implements(IUploadable) def __init__(self, filehandle, convergence): """ Upload the data from the filehandle. If convergence is None then a random encryption key will be used, else the plaintext will be hashed, then the hash will be hashed together with the string in the "convergence" argument to form the encryption key. """ assert convergence is None or isinstance(convergence, str), (convergence, type(convergence)) self._filehandle = filehandle self._key = None self.convergence = convergence self._size = None def _get_encryption_key_convergent(self): if self._key is not None: return defer.succeed(self._key) d = self.get_size() # that sets self._size as a side-effect d.addCallback(lambda size: self.get_all_encoding_parameters()) def _got(params): k, happy, n, segsize = params f = self._filehandle enckey_hasher = convergence_hasher(k, n, segsize, self.convergence) f.seek(0) BLOCKSIZE = 64*1024 bytes_read = 0 while True: data = f.read(BLOCKSIZE) if not data: break enckey_hasher.update(data) # TODO: setting progress in a non-yielding loop is kind of # pointless, but I'm anticipating (perhaps prematurely) the # day when we use a slowjob or twisted's CooperatorService to # make this yield time to other jobs. bytes_read += len(data) if self._status: self._status.set_progress(0, float(bytes_read)/self._size) f.seek(0) self._key = enckey_hasher.digest() if self._status: self._status.set_progress(0, 1.0) assert len(self._key) == 16 return self._key d.addCallback(_got) return d def _get_encryption_key_random(self): if self._key is None: self._key = os.urandom(16) return defer.succeed(self._key) def get_encryption_key(self): if self.convergence is not None: return self._get_encryption_key_convergent() else: return self._get_encryption_key_random() def get_size(self): if self._size is not None: return defer.succeed(self._size) self._filehandle.seek(0, os.SEEK_END) size = self._filehandle.tell() self._size = size self._filehandle.seek(0) return defer.succeed(size) def read(self, length): return defer.succeed([self._filehandle.read(length)]) def close(self): # the originator of the filehandle reserves the right to close it pass class FileName(FileHandle): def __init__(self, filename, convergence): """ Upload the data from the filename. If convergence is None then a random encryption key will be used, else the plaintext will be hashed, then the hash will be hashed together with the string in the "convergence" argument to form the encryption key. """ assert convergence is None or isinstance(convergence, str), (convergence, type(convergence)) FileHandle.__init__(self, open(filename, "rb"), convergence=convergence) def close(self): FileHandle.close(self) self._filehandle.close() class Data(FileHandle): def __init__(self, data, convergence): """ Upload the data from the data argument. If convergence is None then a random encryption key will be used, else the plaintext will be hashed, then the hash will be hashed together with the string in the "convergence" argument to form the encryption key. 
""" assert convergence is None or isinstance(convergence, str), (convergence, type(convergence)) FileHandle.__init__(self, StringIO(data), convergence=convergence) class Uploader(service.MultiService, log.PrefixingLogMixin): """I am a service that allows file uploading. I am a service-child of the Client. """ implements(IUploader) name = "uploader" URI_LIT_SIZE_THRESHOLD = 55 def __init__(self, helper_furl=None, stats_provider=None, history=None): self._helper_furl = helper_furl self.stats_provider = stats_provider self._history = history self._helper = None self._all_uploads = weakref.WeakKeyDictionary() # for debugging log.PrefixingLogMixin.__init__(self, facility="tahoe.immutable.upload") service.MultiService.__init__(self) def startService(self): service.MultiService.startService(self) if self._helper_furl: self.parent.tub.connectTo(self._helper_furl, self._got_helper) def _got_helper(self, helper): self.log("got helper connection, getting versions") default = { "http://allmydata.org/tahoe/protocols/helper/v1" : { }, "application-version": "unknown: no get_version()", } d = add_version_to_remote_reference(helper, default) d.addCallback(self._got_versioned_helper) def _got_versioned_helper(self, helper): needed = "http://allmydata.org/tahoe/protocols/helper/v1" if needed not in helper.version: raise InsufficientVersionError(needed, helper.version) self._helper = helper helper.notifyOnDisconnect(self._lost_helper) def _lost_helper(self): self._helper = None def get_helper_info(self): # return a tuple of (helper_furl_or_None, connected_bool) return (self._helper_furl, bool(self._helper)) def upload(self, uploadable): """ Returns a Deferred that will fire with the UploadResults instance. """ assert self.parent assert self.running uploadable = IUploadable(uploadable) d = uploadable.get_size() def _got_size(size): default_params = self.parent.get_encoding_parameters() precondition(isinstance(default_params, dict), default_params) precondition("max_segment_size" in default_params, default_params) uploadable.set_default_encoding_parameters(default_params) if self.stats_provider: self.stats_provider.count('uploader.files_uploaded', 1) self.stats_provider.count('uploader.bytes_uploaded', size) if size <= self.URI_LIT_SIZE_THRESHOLD: uploader = LiteralUploader() return uploader.start(uploadable) else: eu = EncryptAnUploadable(uploadable, self._parentmsgid) d2 = defer.succeed(None) storage_broker = self.parent.get_storage_broker() if self._helper: uploader = AssistedUploader(self._helper, storage_broker) d2.addCallback(lambda x: eu.get_storage_index()) d2.addCallback(lambda si: uploader.start(eu, si)) else: storage_broker = self.parent.get_storage_broker() secret_holder = self.parent._secret_holder uploader = CHKUploader(storage_broker, secret_holder) d2.addCallback(lambda x: uploader.start(eu)) self._all_uploads[uploader] = None if self._history: self._history.add_upload(uploader.get_upload_status()) def turn_verifycap_into_read_cap(uploadresults): # Generate the uri from the verifycap plus the key. 
d3 = uploadable.get_encryption_key() def put_readcap_into_results(key): v = uri.from_string(uploadresults.get_verifycapstr()) r = uri.CHKFileURI(key, v.uri_extension_hash, v.needed_shares, v.total_shares, v.size) uploadresults.set_uri(r.to_string()) return uploadresults d3.addCallback(put_readcap_into_results) return d3 d2.addCallback(turn_verifycap_into_read_cap) return d2 d.addCallback(_got_size) def _done(res): uploadable.close() return res d.addBoth(_done) return d allmydata-tahoe-1.10.2/src/allmydata/immutable/checker.py0000644000175000017500000011333012556560070021464 0ustar ramramfrom zope.interface import implements from twisted.internet import defer from foolscap.api import DeadReferenceError, RemoteException from allmydata import hashtree, codec, uri from allmydata.interfaces import IValidatedThingProxy, IVerifierURI from allmydata.hashtree import IncompleteHashTree from allmydata.check_results import CheckResults from allmydata.uri import CHKFileVerifierURI from allmydata.util.assertutil import precondition from allmydata.util import base32, deferredutil, dictutil, log, mathutil from allmydata.util.hashutil import file_renewal_secret_hash, \ file_cancel_secret_hash, bucket_renewal_secret_hash, \ bucket_cancel_secret_hash, uri_extension_hash, CRYPTO_VAL_SIZE, \ block_hash from allmydata.util.happinessutil import servers_of_happiness from allmydata.immutable import layout class IntegrityCheckReject(Exception): pass class BadURIExtension(IntegrityCheckReject): pass class BadURIExtensionHashValue(IntegrityCheckReject): pass class BadOrMissingHash(IntegrityCheckReject): pass class UnsupportedErasureCodec(BadURIExtension): pass class ValidatedExtendedURIProxy: implements(IValidatedThingProxy) """ I am a front-end for a remote UEB (using a local ReadBucketProxy), responsible for retrieving and validating the elements from the UEB.""" def __init__(self, readbucketproxy, verifycap, fetch_failures=None): # fetch_failures is for debugging -- see test_encode.py self._fetch_failures = fetch_failures self._readbucketproxy = readbucketproxy precondition(IVerifierURI.providedBy(verifycap), verifycap) self._verifycap = verifycap # required self.segment_size = None self.crypttext_root_hash = None self.share_root_hash = None # computed self.block_size = None self.share_size = None self.num_segments = None self.tail_data_size = None self.tail_segment_size = None # optional self.crypttext_hash = None def __str__(self): return "<%s %s>" % (self.__class__.__name__, self._verifycap.to_string()) def _check_integrity(self, data): h = uri_extension_hash(data) if h != self._verifycap.uri_extension_hash: msg = ("The copy of uri_extension we received from %s was bad: wanted %s, got %s" % (self._readbucketproxy, base32.b2a(self._verifycap.uri_extension_hash), base32.b2a(h))) if self._fetch_failures is not None: self._fetch_failures["uri_extension"] += 1 raise BadURIExtensionHashValue(msg) else: return data def _parse_and_validate(self, data): self.share_size = mathutil.div_ceil(self._verifycap.size, self._verifycap.needed_shares) d = uri.unpack_extension(data) # There are several kinds of things that can be found in a UEB. # First, things that we really need to learn from the UEB in order to # do this download. Next: things which are optional but not redundant # -- if they are present in the UEB they will get used. Next, things # that are optional and redundant. 
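# --- Illustrative sketch (not part of the original source) ---
# The UEB integrity check above boils down to: hash the fetched uri-extension
# bytes and compare against the hash carried in the verify-cap.  A standalone
# sketch reusing uri_extension_hash() and BadURIExtensionHashValue from this
# module:
def _sketch_check_ueb(ueb_bytes, expected_hash):
    if uri_extension_hash(ueb_bytes) != expected_hash:
        raise BadURIExtensionHashValue("UEB hash mismatch")
    return ueb_bytes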
These things are required to be # consistent: they don't have to be in the UEB, but if they are in # the UEB then they will be checked for consistency with the # already-known facts, and if they are inconsistent then an exception # will be raised. These things aren't actually used -- they are just # tested for consistency and ignored. Finally: things which are # deprecated -- they ought not be in the UEB at all, and if they are # present then a warning will be logged but they are otherwise # ignored. # First, things that we really need to learn from the UEB: # segment_size, crypttext_root_hash, and share_root_hash. self.segment_size = d['segment_size'] self.block_size = mathutil.div_ceil(self.segment_size, self._verifycap.needed_shares) self.num_segments = mathutil.div_ceil(self._verifycap.size, self.segment_size) self.tail_data_size = self._verifycap.size % self.segment_size if not self.tail_data_size: self.tail_data_size = self.segment_size # padding for erasure code self.tail_segment_size = mathutil.next_multiple(self.tail_data_size, self._verifycap.needed_shares) # Ciphertext hash tree root is mandatory, so that there is at most # one ciphertext that matches this read-cap or verify-cap. The # integrity check on the shares is not sufficient to prevent the # original encoder from creating some shares of file A and other # shares of file B. self.crypttext_root_hash = d['crypttext_root_hash'] self.share_root_hash = d['share_root_hash'] # Next: things that are optional and not redundant: crypttext_hash if d.has_key('crypttext_hash'): self.crypttext_hash = d['crypttext_hash'] if len(self.crypttext_hash) != CRYPTO_VAL_SIZE: raise BadURIExtension('crypttext_hash is required to be hashutil.CRYPTO_VAL_SIZE bytes, not %s bytes' % (len(self.crypttext_hash),)) # Next: things that are optional, redundant, and required to be # consistent: codec_name, codec_params, tail_codec_params, # num_segments, size, needed_shares, total_shares if d.has_key('codec_name'): if d['codec_name'] != "crs": raise UnsupportedErasureCodec(d['codec_name']) if d.has_key('codec_params'): ucpss, ucpns, ucpts = codec.parse_params(d['codec_params']) if ucpss != self.segment_size: raise BadURIExtension("inconsistent erasure code params: " "ucpss: %s != self.segment_size: %s" % (ucpss, self.segment_size)) if ucpns != self._verifycap.needed_shares: raise BadURIExtension("inconsistent erasure code params: ucpns: %s != " "self._verifycap.needed_shares: %s" % (ucpns, self._verifycap.needed_shares)) if ucpts != self._verifycap.total_shares: raise BadURIExtension("inconsistent erasure code params: ucpts: %s != " "self._verifycap.total_shares: %s" % (ucpts, self._verifycap.total_shares)) if d.has_key('tail_codec_params'): utcpss, utcpns, utcpts = codec.parse_params(d['tail_codec_params']) if utcpss != self.tail_segment_size: raise BadURIExtension("inconsistent erasure code params: utcpss: %s != " "self.tail_segment_size: %s, self._verifycap.size: %s, " "self.segment_size: %s, self._verifycap.needed_shares: %s" % (utcpss, self.tail_segment_size, self._verifycap.size, self.segment_size, self._verifycap.needed_shares)) if utcpns != self._verifycap.needed_shares: raise BadURIExtension("inconsistent erasure code params: utcpns: %s != " "self._verifycap.needed_shares: %s" % (utcpns, self._verifycap.needed_shares)) if utcpts != self._verifycap.total_shares: raise BadURIExtension("inconsistent erasure code params: utcpts: %s != " "self._verifycap.total_shares: %s" % (utcpts, self._verifycap.total_shares)) if d.has_key('num_segments'): if 
d['num_segments'] != self.num_segments: raise BadURIExtension("inconsistent num_segments: size: %s, " "segment_size: %s, computed_num_segments: %s, " "ueb_num_segments: %s" % (self._verifycap.size, self.segment_size, self.num_segments, d['num_segments'])) if d.has_key('size'): if d['size'] != self._verifycap.size: raise BadURIExtension("inconsistent size: URI size: %s, UEB size: %s" % (self._verifycap.size, d['size'])) if d.has_key('needed_shares'): if d['needed_shares'] != self._verifycap.needed_shares: raise BadURIExtension("inconsistent needed shares: URI needed shares: %s, UEB " "needed shares: %s" % (self._verifycap.total_shares, d['needed_shares'])) if d.has_key('total_shares'): if d['total_shares'] != self._verifycap.total_shares: raise BadURIExtension("inconsistent total shares: URI total shares: %s, UEB " "total shares: %s" % (self._verifycap.total_shares, d['total_shares'])) # Finally, things that are deprecated and ignored: plaintext_hash, # plaintext_root_hash if d.get('plaintext_hash'): log.msg("Found plaintext_hash in UEB. This field is deprecated for security reasons " "and is no longer used. Ignoring. %s" % (self,)) if d.get('plaintext_root_hash'): log.msg("Found plaintext_root_hash in UEB. This field is deprecated for security " "reasons and is no longer used. Ignoring. %s" % (self,)) return self def start(self): """Fetch the UEB from bucket, compare its hash to the hash from verifycap, then parse it. Returns a deferred which is called back with self once the fetch is successful, or is erred back if it fails.""" d = self._readbucketproxy.get_uri_extension() d.addCallback(self._check_integrity) d.addCallback(self._parse_and_validate) return d class ValidatedReadBucketProxy(log.PrefixingLogMixin): """I am a front-end for a remote storage bucket, responsible for retrieving and validating data from that bucket. My get_block() method is used by BlockDownloaders. """ def __init__(self, sharenum, bucket, share_hash_tree, num_blocks, block_size, share_size): """ share_hash_tree is required to have already been initialized with the root hash (the number-0 hash), using the share_root_hash from the UEB""" precondition(share_hash_tree[0] is not None, share_hash_tree) prefix = "%d-%s-%s" % (sharenum, bucket, base32.b2a_l(share_hash_tree[0][:8], 60)) log.PrefixingLogMixin.__init__(self, facility="tahoe.immutable.download", prefix=prefix) self.sharenum = sharenum self.bucket = bucket self.share_hash_tree = share_hash_tree self.num_blocks = num_blocks self.block_size = block_size self.share_size = share_size self.block_hash_tree = hashtree.IncompleteHashTree(self.num_blocks) def get_all_sharehashes(self): """Retrieve and validate all the share-hash-tree nodes that are included in this share, regardless of whether we need them to validate the share or not. Each share contains a minimal Merkle tree chain, but there is lots of overlap, so usually we'll be using hashes from other shares and not reading every single hash from this share. The Verifier uses this function to read and validate every single hash from this share. Call this (and wait for the Deferred it returns to fire) before calling get_block() for the first time: this lets us check that the share share contains enough hashes to validate its own data, and avoids downloading any share hash twice. 
I return a Deferred which errbacks upon failure, probably with BadOrMissingHash.""" d = self.bucket.get_share_hashes() def _got_share_hashes(sh): sharehashes = dict(sh) try: self.share_hash_tree.set_hashes(sharehashes) except IndexError, le: raise BadOrMissingHash(le) except (hashtree.BadHashError, hashtree.NotEnoughHashesError), le: raise BadOrMissingHash(le) d.addCallback(_got_share_hashes) return d def get_all_blockhashes(self): """Retrieve and validate all the block-hash-tree nodes that are included in this share. Each share contains a full Merkle tree, but we usually only fetch the minimal subset necessary for any particular block. This function fetches everything at once. The Verifier uses this function to validate the block hash tree. Call this (and wait for the Deferred it returns to fire) after calling get_all_sharehashes() and before calling get_block() for the first time: this lets us check that the share contains all block hashes and avoids downloading them multiple times. I return a Deferred which errbacks upon failure, probably with BadOrMissingHash. """ # get_block_hashes(anything) currently always returns everything needed = list(range(len(self.block_hash_tree))) d = self.bucket.get_block_hashes(needed) def _got_block_hashes(blockhashes): if len(blockhashes) < len(self.block_hash_tree): raise BadOrMissingHash() bh = dict(enumerate(blockhashes)) try: self.block_hash_tree.set_hashes(bh) except IndexError, le: raise BadOrMissingHash(le) except (hashtree.BadHashError, hashtree.NotEnoughHashesError), le: raise BadOrMissingHash(le) d.addCallback(_got_block_hashes) return d def get_all_crypttext_hashes(self, crypttext_hash_tree): """Retrieve and validate all the crypttext-hash-tree nodes that are in this share. Normally we don't look at these at all: the download process fetches them incrementally as needed to validate each segment of ciphertext. But this is a convenient place to give the Verifier a function to validate all of these at once. Call this with a new hashtree object for each share, initialized with the crypttext hash tree root. I return a Deferred which errbacks upon failure, probably with BadOrMissingHash. """ # get_crypttext_hashes() always returns everything d = self.bucket.get_crypttext_hashes() def _got_crypttext_hashes(hashes): if len(hashes) < len(crypttext_hash_tree): raise BadOrMissingHash() ct_hashes = dict(enumerate(hashes)) try: crypttext_hash_tree.set_hashes(ct_hashes) except IndexError, le: raise BadOrMissingHash(le) except (hashtree.BadHashError, hashtree.NotEnoughHashesError), le: raise BadOrMissingHash(le) d.addCallback(_got_crypttext_hashes) return d def get_block(self, blocknum): # the first time we use this bucket, we need to fetch enough elements # of the share hash tree to validate it from our share hash up to the # hashroot. if self.share_hash_tree.needed_hashes(self.sharenum): d1 = self.bucket.get_share_hashes() else: d1 = defer.succeed([]) # We might need to grab some elements of our block hash tree, to # validate the requested block up to the share hash. blockhashesneeded = self.block_hash_tree.needed_hashes(blocknum, include_leaf=True) # We don't need the root of the block hash tree, as that comes in the # share tree. 
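# --- Illustrative sketch (not part of the original source) ---
# How a single block is validated against the block hash tree, using only the
# IncompleteHashTree calls visible in this module.  `blockhashes` is the dict
# of interior-node hashes fetched from the bucket; set_hashes() raises
# BadHashError/NotEnoughHashesError if anything is inconsistent.
def _sketch_validate_block(block_hash_tree, blocknum, blockhashes, blockdata):
    if block_hash_tree.needed_hashes(blocknum):
        block_hash_tree.set_hashes(blockhashes)            # interior nodes
    leaf = block_hash(blockdata)                           # hash the block itself
    block_hash_tree.set_hashes(leaves={blocknum: leaf})    # verifies up to the root
    return blockdata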
blockhashesneeded.discard(0) d2 = self.bucket.get_block_hashes(blockhashesneeded) if blocknum < self.num_blocks-1: thisblocksize = self.block_size else: thisblocksize = self.share_size % self.block_size if thisblocksize == 0: thisblocksize = self.block_size d3 = self.bucket.get_block_data(blocknum, self.block_size, thisblocksize) dl = deferredutil.gatherResults([d1, d2, d3]) dl.addCallback(self._got_data, blocknum) return dl def _got_data(self, results, blocknum): precondition(blocknum < self.num_blocks, self, blocknum, self.num_blocks) sharehashes, blockhashes, blockdata = results try: sharehashes = dict(sharehashes) except ValueError, le: le.args = tuple(le.args + (sharehashes,)) raise blockhashes = dict(enumerate(blockhashes)) candidate_share_hash = None # in case we log it in the except block below blockhash = None # in case we log it in the except block below try: if self.share_hash_tree.needed_hashes(self.sharenum): # This will raise exception if the values being passed do not # match the root node of self.share_hash_tree. try: self.share_hash_tree.set_hashes(sharehashes) except IndexError, le: # Weird -- sharehashes contained index numbers outside of # the range that fit into this hash tree. raise BadOrMissingHash(le) # To validate a block we need the root of the block hash tree, # which is also one of the leafs of the share hash tree, and is # called "the share hash". if not self.block_hash_tree[0]: # empty -- no root node yet # Get the share hash from the share hash tree. share_hash = self.share_hash_tree.get_leaf(self.sharenum) if not share_hash: # No root node in block_hash_tree and also the share hash # wasn't sent by the server. raise hashtree.NotEnoughHashesError self.block_hash_tree.set_hashes({0: share_hash}) if self.block_hash_tree.needed_hashes(blocknum): self.block_hash_tree.set_hashes(blockhashes) blockhash = block_hash(blockdata) self.block_hash_tree.set_hashes(leaves={blocknum: blockhash}) #self.log("checking block_hash(shareid=%d, blocknum=%d) len=%d " # "%r .. %r: %s" % # (self.sharenum, blocknum, len(blockdata), # blockdata[:50], blockdata[-50:], base32.b2a(blockhash))) except (hashtree.BadHashError, hashtree.NotEnoughHashesError), le: # log.WEIRD: indicates undetected disk/network error, or more # likely a programming error self.log("hash failure in block=%d, shnum=%d on %s" % (blocknum, self.sharenum, self.bucket)) if self.block_hash_tree.needed_hashes(blocknum): self.log(""" failure occurred when checking the block_hash_tree. This suggests that either the block data was bad, or that the block hashes we received along with it were bad.""") else: self.log(""" the failure probably occurred when checking the share_hash_tree, which suggests that the share hashes we received from the remote peer were bad.""") self.log(" have candidate_share_hash: %s" % bool(candidate_share_hash)) self.log(" block length: %d" % len(blockdata)) self.log(" block hash: %s" % base32.b2a_or_none(blockhash)) if len(blockdata) < 100: self.log(" block data: %r" % (blockdata,)) else: self.log(" block data start/end: %r .. 
%r" % (blockdata[:50], blockdata[-50:])) self.log(" share hash tree:\n" + self.share_hash_tree.dump()) self.log(" block hash tree:\n" + self.block_hash_tree.dump()) lines = [] for i,h in sorted(sharehashes.items()): lines.append("%3d: %s" % (i, base32.b2a_or_none(h))) self.log(" sharehashes:\n" + "\n".join(lines) + "\n") lines = [] for i,h in blockhashes.items(): lines.append("%3d: %s" % (i, base32.b2a_or_none(h))) log.msg(" blockhashes:\n" + "\n".join(lines) + "\n") raise BadOrMissingHash(le) # If we made it here, the block is good. If the hash trees didn't # like what they saw, they would have raised a BadHashError, causing # our caller to see a Failure and thus ignore this block (as well as # dropping this bucket). return blockdata class Checker(log.PrefixingLogMixin): """I query all servers to see if M uniquely-numbered shares are available. If the verify flag was passed to my constructor, then for each share I download every data block and all metadata from each server and perform a cryptographic integrity check on all of it. If not, I just ask each server 'Which shares do you have?' and believe its answer. In either case, I wait until I have gotten responses from all servers. This fact -- that I wait -- means that an ill-behaved server which fails to answer my questions will make me wait indefinitely. If it is ill-behaved in a way that triggers the underlying foolscap timeouts, then I will wait only as long as those foolscap timeouts, but if it is ill-behaved in a way which placates the foolscap timeouts but still doesn't answer my question then I will wait indefinitely. Before I send any new request to a server, I always ask the 'monitor' object that was passed into my constructor whether this task has been cancelled (by invoking its raise_if_cancelled() method). """ def __init__(self, verifycap, servers, verify, add_lease, secret_holder, monitor): assert precondition(isinstance(verifycap, CHKFileVerifierURI), verifycap, type(verifycap)) prefix = "%s" % base32.b2a_l(verifycap.get_storage_index()[:8], 60) log.PrefixingLogMixin.__init__(self, facility="tahoe.immutable.checker", prefix=prefix) self._verifycap = verifycap self._monitor = monitor self._servers = servers self._verify = verify # bool: verify what the servers claim, or not? self._add_lease = add_lease frs = file_renewal_secret_hash(secret_holder.get_renewal_secret(), self._verifycap.get_storage_index()) self.file_renewal_secret = frs fcs = file_cancel_secret_hash(secret_holder.get_cancel_secret(), self._verifycap.get_storage_index()) self.file_cancel_secret = fcs def _get_renewal_secret(self, seed): return bucket_renewal_secret_hash(self.file_renewal_secret, seed) def _get_cancel_secret(self, seed): return bucket_cancel_secret_hash(self.file_cancel_secret, seed) def _get_buckets(self, s, storageindex): """Return a deferred that eventually fires with ({sharenum: bucket}, serverid, success). 
In case the server is disconnected or returns a Failure then it fires with ({}, serverid, False) (A server disconnecting or returning a Failure when we ask it for buckets is the same, for our purposes, as a server that says it has none, except that we want to track and report whether or not each server responded.)""" rref = s.get_rref() lease_seed = s.get_lease_seed() if self._add_lease: renew_secret = self._get_renewal_secret(lease_seed) cancel_secret = self._get_cancel_secret(lease_seed) d2 = rref.callRemote("add_lease", storageindex, renew_secret, cancel_secret) d2.addErrback(self._add_lease_failed, s.get_name(), storageindex) d = rref.callRemote("get_buckets", storageindex) def _wrap_results(res): return (res, True) def _trap_errs(f): level = log.WEIRD if f.check(DeadReferenceError): level = log.UNUSUAL self.log("failure from server on 'get_buckets' the REMOTE failure was:", facility="tahoe.immutable.checker", failure=f, level=level, umid="AX7wZQ") return ({}, False) d.addCallbacks(_wrap_results, _trap_errs) return d def _add_lease_failed(self, f, server_name, storage_index): # Older versions of Tahoe didn't handle the add-lease message very # well: <=1.1.0 throws a NameError because it doesn't implement # remote_add_lease(), 1.2.0/1.3.0 throw IndexError on unknown buckets # (which is most of them, since we send add-lease to everybody, # before we know whether or not they have any shares for us), and # 1.2.0 throws KeyError even on known buckets due to an internal bug # in the latency-measuring code. # we want to ignore the known-harmless errors and log the others. In # particular we want to log any local errors caused by coding # problems. if f.check(DeadReferenceError): return if f.check(RemoteException): if f.value.failure.check(KeyError, IndexError, NameError): # this may ignore a bit too much, but that only hurts us # during debugging return self.log(format="error in add_lease from [%(name)s]: %(f_value)s", name=server_name, f_value=str(f.value), failure=f, level=log.WEIRD, umid="atbAxw") return # local errors are cause for alarm log.err(f, format="local error in add_lease to [%(name)s]: %(f_value)s", name=server_name, f_value=str(f.value), level=log.WEIRD, umid="hEGuQg") def _download_and_verify(self, server, sharenum, bucket): """Start an attempt to download and verify every block in this bucket and return a deferred that will eventually fire once the attempt completes. If you download and verify every block then fire with (True, sharenum, None), else if the share data couldn't be parsed because it was of an unknown version number fire with (False, sharenum, 'incompatible'), else if any of the blocks were invalid, fire with (False, sharenum, 'corrupt'), else if the server disconnected (False, sharenum, 'disconnect'), else if the server returned a Failure during the process fire with (False, sharenum, 'failure'). 
If there is an internal error such as an uncaught exception in this code, then the deferred will errback, but if there is a remote error such as the server failing or the returned data being incorrect then it will not errback -- it will fire normally with the indicated results.""" vcap = self._verifycap b = layout.ReadBucketProxy(bucket, server, vcap.get_storage_index()) veup = ValidatedExtendedURIProxy(b, vcap) d = veup.start() def _got_ueb(vup): share_hash_tree = IncompleteHashTree(vcap.total_shares) share_hash_tree.set_hashes({0: vup.share_root_hash}) vrbp = ValidatedReadBucketProxy(sharenum, b, share_hash_tree, vup.num_segments, vup.block_size, vup.share_size) # note: normal download doesn't use get_all_sharehashes(), # because it gets more data than necessary. We've discussed the # security properties of having verification and download look # identical (so the server couldn't, say, provide good responses # for one and not the other), but I think that full verification # is more important than defending against inconsistent server # behavior. Besides, they can't pass the verifier without storing # all the data, so there's not so much to be gained by behaving # inconsistently. d = vrbp.get_all_sharehashes() # we fill share_hash_tree before fetching any blocks, so the # block fetches won't send redundant share-hash-tree requests, to # speed things up. Then we fetch+validate all the blockhashes. d.addCallback(lambda ign: vrbp.get_all_blockhashes()) cht = IncompleteHashTree(vup.num_segments) cht.set_hashes({0: vup.crypttext_root_hash}) d.addCallback(lambda ign: vrbp.get_all_crypttext_hashes(cht)) d.addCallback(lambda ign: vrbp) return d d.addCallback(_got_ueb) def _discard_result(r): assert isinstance(r, str), r # to free up the RAM return None def _get_blocks(vrbp): def _get_block(ign, blocknum): db = vrbp.get_block(blocknum) db.addCallback(_discard_result) return db dbs = defer.succeed(None) for blocknum in range(veup.num_segments): dbs.addCallback(_get_block, blocknum) # The Deferred we return will fire after every block of this # share has been downloaded and verified successfully, or else it # will errback as soon as the first error is observed. return dbs d.addCallback(_get_blocks) # if none of those errbacked, the blocks (and the hashes above them) # are good def _all_good(ign): return (True, sharenum, None) d.addCallback(_all_good) # but if anything fails, we'll land here def _errb(f): # We didn't succeed at fetching and verifying all the blocks of # this share. Handle each reason for failure differently. if f.check(DeadReferenceError): return (False, sharenum, 'disconnect') elif f.check(RemoteException): return (False, sharenum, 'failure') elif f.check(layout.ShareVersionIncompatible): return (False, sharenum, 'incompatible') elif f.check(layout.LayoutInvalid, layout.RidiculouslyLargeURIExtensionBlock, BadOrMissingHash, BadURIExtensionHashValue): return (False, sharenum, 'corrupt') # if it wasn't one of those reasons, re-raise the error return f d.addErrback(_errb) return d def _verify_server_shares(self, s): """ Return a deferred which eventually fires with a tuple of (set(sharenum), server, set(corruptsharenum), set(incompatiblesharenum), success) showing all the shares verified to be served by this server, and all the corrupt shares served by the server, and all the incompatible shares served by the server. In case the server is disconnected or returns a Failure then it fires with the last element False. 
A server disconnecting or returning a failure when we ask it for shares is the same, for our purposes, as a server that says it has none or offers invalid ones, except that we want to track and report the server's behavior. Similarly, the presence of corrupt shares is mainly of use for diagnostics -- you can typically treat it as just like being no share at all by just observing its absence from the verified shares dict and ignoring its presence in the corrupt shares dict. The 'success' argument means whether the server responded to *any* queries during this process, so if it responded to some queries and then disconnected and ceased responding, or returned a failure, it is still marked with the True flag for 'success'. """ d = self._get_buckets(s, self._verifycap.get_storage_index()) def _got_buckets(result): bucketdict, success = result shareverds = [] for (sharenum, bucket) in bucketdict.items(): d = self._download_and_verify(s, sharenum, bucket) shareverds.append(d) dl = deferredutil.gatherResults(shareverds) def collect(results): verified = set() corrupt = set() incompatible = set() for succ, sharenum, whynot in results: if succ: verified.add(sharenum) else: if whynot == 'corrupt': corrupt.add(sharenum) elif whynot == 'incompatible': incompatible.add(sharenum) return (verified, s, corrupt, incompatible, success) dl.addCallback(collect) return dl def _err(f): f.trap(RemoteException, DeadReferenceError) return (set(), s, set(), set(), False) d.addCallbacks(_got_buckets, _err) return d def _check_server_shares(self, s): """Return a deferred which eventually fires with a tuple of (set(sharenum), server, set(), set(), responded) showing all the shares claimed to be served by this server. In case the server is disconnected then it fires with (set(), server, set(), set(), False) (a server disconnecting when we ask it for buckets is the same, for our purposes, as a server that says it has none, except that we want to track and report whether or not each server responded.)""" def _curry_empty_corrupted(res): buckets, responded = res return (set(buckets), s, set(), set(), responded) d = self._get_buckets(s, self._verifycap.get_storage_index()) d.addCallback(_curry_empty_corrupted) return d def _format_results(self, results): SI = self._verifycap.get_storage_index() verifiedshares = dictutil.DictOfSets() # {sharenum: set(server)} servers = {} # {server: set(sharenums)} corruptshare_locators = [] # (server, storageindex, sharenum) incompatibleshare_locators = [] # (server, storageindex, sharenum) servers_responding = set() # server for verified, server, corrupt, incompatible, responded in results: servers.setdefault(server, set()).update(verified) for sharenum in verified: verifiedshares.setdefault(sharenum, set()).add(server) for sharenum in corrupt: corruptshare_locators.append((server, SI, sharenum)) for sharenum in incompatible: incompatibleshare_locators.append((server, SI, sharenum)) if responded: servers_responding.add(server) good_share_hosts = len([s for s in servers.keys() if servers[s]]) assert len(verifiedshares) <= self._verifycap.total_shares, (verifiedshares.keys(), self._verifycap.total_shares) if len(verifiedshares) == self._verifycap.total_shares: healthy = True summary = "Healthy" else: healthy = False summary = ("Not Healthy: %d shares (enc %d-of-%d)" % (len(verifiedshares), self._verifycap.needed_shares, self._verifycap.total_shares)) if len(verifiedshares) >= self._verifycap.needed_shares: recoverable = 1 unrecoverable = 0 else: recoverable = 0 unrecoverable = 1 
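# --- Illustrative sketch (not part of the original source) ---
# The health summary above reduces to two comparisons against the encoding
# parameters from the verify-cap: a file is "healthy" only when every one of
# the N distinct shares was verified, and "recoverable" when at least k were.
def _sketch_classify(num_distinct_shares, needed, total):
    healthy = (num_distinct_shares == total)
    recoverable = (num_distinct_shares >= needed)
    return healthy, recoverable
# e.g. 7 distinct shares of a 3-of-10 file: _sketch_classify(7, 3, 10)
# returns (False, True) -- recoverable but not healthy.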
count_happiness = servers_of_happiness(verifiedshares) cr = CheckResults(self._verifycap, SI, healthy=healthy, recoverable=bool(recoverable), count_happiness=count_happiness, count_shares_needed=self._verifycap.needed_shares, count_shares_expected=self._verifycap.total_shares, count_shares_good=len(verifiedshares), count_good_share_hosts=good_share_hosts, count_recoverable_versions=recoverable, count_unrecoverable_versions=unrecoverable, servers_responding=list(servers_responding), sharemap=verifiedshares, count_wrong_shares=0, # no such thing, for immutable list_corrupt_shares=corruptshare_locators, count_corrupt_shares=len(corruptshare_locators), list_incompatible_shares=incompatibleshare_locators, count_incompatible_shares=len(incompatibleshare_locators), summary=summary, report=[], share_problems=[], servermap=None) return cr def start(self): ds = [] if self._verify: for s in self._servers: ds.append(self._verify_server_shares(s)) else: for s in self._servers: ds.append(self._check_server_shares(s)) return deferredutil.gatherResults(ds).addCallback(self._format_results) allmydata-tahoe-1.10.2/src/allmydata/immutable/literal.py0000644000175000017500000000633312556560070021520 0ustar ramramfrom cStringIO import StringIO from zope.interface import implements from twisted.internet import defer from twisted.internet.interfaces import IPushProducer from twisted.protocols import basic from allmydata.interfaces import IImmutableFileNode, ICheckable from allmydata.uri import LiteralFileURI class _ImmutableFileNodeBase(object): implements(IImmutableFileNode, ICheckable) def get_write_uri(self): return None def get_readonly_uri(self): return self.get_uri() def is_mutable(self): return False def is_readonly(self): return True def is_unknown(self): return False def is_allowed_in_immutable_directory(self): return True def raise_error(self): pass def __hash__(self): return self.u.__hash__() def __eq__(self, other): if isinstance(other, _ImmutableFileNodeBase): return self.u.__eq__(other.u) else: return False def __ne__(self, other): if isinstance(other, _ImmutableFileNodeBase): return self.u.__eq__(other.u) else: return True class LiteralProducer: implements(IPushProducer) def pauseProducing(self): pass def resumeProducing(self): pass def stopProducing(self): pass class LiteralFileNode(_ImmutableFileNodeBase): def __init__(self, filecap): assert isinstance(filecap, LiteralFileURI) self.u = filecap def get_size(self): return len(self.u.data) def get_current_size(self): return defer.succeed(self.get_size()) def get_cap(self): return self.u def get_readcap(self): return self.u def get_verify_cap(self): return None def get_repair_cap(self): return None def get_uri(self): return self.u.to_string() def get_storage_index(self): return None def check(self, monitor, verify=False, add_lease=False): return defer.succeed(None) def check_and_repair(self, monitor, verify=False, add_lease=False): return defer.succeed(None) def read(self, consumer, offset=0, size=None): if size is None: data = self.u.data[offset:] else: data = self.u.data[offset:offset+size] # We use twisted.protocols.basic.FileSender, which only does # non-streaming, i.e. PullProducer, where the receiver/consumer must # ask explicitly for each chunk of data. 
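# --- Illustrative sketch (not part of the original source) ---
# LiteralFileNode.read() above simply slices the bytes embedded in the LIT
# URI -- data[offset:] when size is None, else data[offset:offset+size] --
# and streams that slice to the consumer via FileSender.
data = "some literal data"        # stands in for self.u.data
offset, size = 5, 7
piece = data[offset:] if size is None else data[offset:offset+size]
# piece == "literal"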
There are only two places in # the Twisted codebase that can't handle streaming=False, both of # which are in the upload path for an FTP/SFTP server # (protocols.ftp.FileConsumer and # vfs.adapters.ftp._FileToConsumerAdapter), neither of which is # likely to be used as the target for a Tahoe download. d = basic.FileSender().beginFileTransfer(StringIO(data), consumer) d.addCallback(lambda lastSent: consumer) return d # IReadable, IFileNode, IFilesystemNode def get_best_readable_version(self): return defer.succeed(self) def download_best_version(self): return defer.succeed(self.u.data) download_to_data = download_best_version get_size_of_best_version = get_current_size allmydata-tahoe-1.10.2/src/allmydata/immutable/filenode.py0000644000175000017500000003221012556560070021642 0ustar ramram import binascii import time now = time.time from zope.interface import implements from twisted.internet import defer from allmydata import uri from twisted.internet.interfaces import IConsumer from allmydata.interfaces import IImmutableFileNode, IUploadResults from allmydata.util import consumer from allmydata.check_results import CheckResults, CheckAndRepairResults from allmydata.util.dictutil import DictOfSets from allmydata.util.happinessutil import servers_of_happiness from pycryptopp.cipher.aes import AES # local imports from allmydata.immutable.checker import Checker from allmydata.immutable.repairer import Repairer from allmydata.immutable.downloader.node import DownloadNode, \ IDownloadStatusHandlingConsumer from allmydata.immutable.downloader.status import DownloadStatus class CiphertextFileNode: def __init__(self, verifycap, storage_broker, secret_holder, terminator, history): assert isinstance(verifycap, uri.CHKFileVerifierURI) self._verifycap = verifycap self._storage_broker = storage_broker self._secret_holder = secret_holder self._terminator = terminator self._history = history self._download_status = None self._node = None # created lazily, on read() def _maybe_create_download_node(self): if not self._download_status: ds = DownloadStatus(self._verifycap.storage_index, self._verifycap.size) if self._history: self._history.add_download(ds) self._download_status = ds if self._node is None: self._node = DownloadNode(self._verifycap, self._storage_broker, self._secret_holder, self._terminator, self._history, self._download_status) def read(self, consumer, offset=0, size=None): """I am the main entry point, from which FileNode.read() can get data. I feed the consumer with the desired range of ciphertext. I return a Deferred that fires (with the consumer) when the read is finished.""" self._maybe_create_download_node() return self._node.read(consumer, offset, size) def get_segment(self, segnum): """Begin downloading a segment. I return a tuple (d, c): 'd' is a Deferred that fires with (offset,data) when the desired segment is available, and c is an object on which c.cancel() can be called to disavow interest in the segment (after which 'd' will never fire). You probably need to know the segment size before calling this, unless you want the first few bytes of the file. If you ask for a segment number which turns out to be too large, the Deferred will errback with BadSegmentNumberError. The Deferred fires with the offset of the first byte of the data segment, so that you can call get_segment() before knowing the segment size, and still know which data you received. 
""" self._maybe_create_download_node() return self._node.get_segment(segnum) def get_segment_size(self): # return a Deferred that fires with the file's real segment size self._maybe_create_download_node() return self._node.get_segsize() def get_storage_index(self): return self._verifycap.storage_index def get_verify_cap(self): return self._verifycap def get_size(self): return self._verifycap.size def raise_error(self): pass def is_mutable(self): return False def check_and_repair(self, monitor, verify=False, add_lease=False): c = Checker(verifycap=self._verifycap, servers=self._storage_broker.get_connected_servers(), verify=verify, add_lease=add_lease, secret_holder=self._secret_holder, monitor=monitor) d = c.start() d.addCallback(self._maybe_repair, monitor) return d def _maybe_repair(self, cr, monitor): crr = CheckAndRepairResults(self._verifycap.storage_index) crr.pre_repair_results = cr if cr.is_healthy(): crr.post_repair_results = cr return defer.succeed(crr) crr.repair_attempted = True crr.repair_successful = False # until proven successful def _repair_error(f): # as with mutable repair, I'm not sure if I want to pass # through a failure or not. TODO crr.repair_successful = False crr.repair_failure = f return f r = Repairer(self, storage_broker=self._storage_broker, secret_holder=self._secret_holder, monitor=monitor) d = r.start() d.addCallbacks(self._gather_repair_results, _repair_error, callbackArgs=(cr, crr,)) return d def _gather_repair_results(self, ur, cr, crr): assert IUploadResults.providedBy(ur), ur # clone the cr (check results) to form the basis of the # prr (post-repair results) verifycap = self._verifycap servers_responding = set(cr.get_servers_responding()) sm = DictOfSets() assert isinstance(cr.get_sharemap(), DictOfSets) for shnum, servers in cr.get_sharemap().items(): for server in servers: sm.add(shnum, server) for shnum, servers in ur.get_sharemap().items(): for server in servers: sm.add(shnum, server) servers_responding.add(server) servers_responding = sorted(servers_responding) good_hosts = len(reduce(set.union, sm.values(), set())) is_healthy = bool(len(sm) >= verifycap.total_shares) is_recoverable = bool(len(sm) >= verifycap.needed_shares) count_happiness = servers_of_happiness(sm) prr = CheckResults(cr.get_uri(), cr.get_storage_index(), healthy=is_healthy, recoverable=is_recoverable, count_happiness=count_happiness, count_shares_needed=verifycap.needed_shares, count_shares_expected=verifycap.total_shares, count_shares_good=len(sm), count_good_share_hosts=good_hosts, count_recoverable_versions=int(is_recoverable), count_unrecoverable_versions=int(not is_recoverable), servers_responding=list(servers_responding), sharemap=sm, count_wrong_shares=0, # no such thing as wrong, for immutable list_corrupt_shares=cr.get_corrupt_shares(), count_corrupt_shares=len(cr.get_corrupt_shares()), list_incompatible_shares=cr.get_incompatible_shares(), count_incompatible_shares=len(cr.get_incompatible_shares()), summary="", report=[], share_problems=[], servermap=None) crr.repair_successful = is_healthy crr.post_repair_results = prr return crr def check(self, monitor, verify=False, add_lease=False): verifycap = self._verifycap sb = self._storage_broker servers = sb.get_connected_servers() sh = self._secret_holder v = Checker(verifycap=verifycap, servers=servers, verify=verify, add_lease=add_lease, secret_holder=sh, monitor=monitor) return v.start() class DecryptingConsumer: """I sit between a CiphertextDownloader (which acts as a Producer) and the real Consumer, decrypting 
everything that passes by. The real Consumer sees the real Producer, but the Producer sees us instead of the real consumer.""" implements(IConsumer, IDownloadStatusHandlingConsumer) def __init__(self, consumer, readkey, offset): self._consumer = consumer self._read_ev = None self._download_status = None # TODO: pycryptopp CTR-mode needs random-access operations: I want # either a=AES(readkey, offset) or better yet both of: # a=AES(readkey, offset=0) # a.process(ciphertext, offset=xyz) # For now, we fake it with the existing iv= argument. offset_big = offset // 16 offset_small = offset % 16 iv = binascii.unhexlify("%032x" % offset_big) self._decryptor = AES(readkey, iv=iv) self._decryptor.process("\x00"*offset_small) def set_download_status_read_event(self, read_ev): self._read_ev = read_ev def set_download_status(self, ds): self._download_status = ds def registerProducer(self, producer, streaming): # this passes through, so the real consumer can flow-control the real # producer. Therefore we don't need to provide any IPushProducer # methods. We implement all the IConsumer methods as pass-throughs, # and only intercept write() to perform decryption. self._consumer.registerProducer(producer, streaming) def unregisterProducer(self): self._consumer.unregisterProducer() def write(self, ciphertext): started = now() plaintext = self._decryptor.process(ciphertext) if self._read_ev: elapsed = now() - started self._read_ev.update(0, elapsed, 0) if self._download_status: self._download_status.add_misc_event("AES", started, now()) self._consumer.write(plaintext) class ImmutableFileNode: implements(IImmutableFileNode) # I wrap a CiphertextFileNode with a decryption key def __init__(self, filecap, storage_broker, secret_holder, terminator, history): assert isinstance(filecap, uri.CHKFileURI) verifycap = filecap.get_verify_cap() self._cnode = CiphertextFileNode(verifycap, storage_broker, secret_holder, terminator, history) assert isinstance(filecap, uri.CHKFileURI) self.u = filecap self._readkey = filecap.key # TODO: I'm not sure about this.. what's the use case for node==node? 
If # we keep it here, we should also put this on CiphertextFileNode def __hash__(self): return self.u.__hash__() def __eq__(self, other): if isinstance(other, ImmutableFileNode): return self.u.__eq__(other.u) else: return False def __ne__(self, other): if isinstance(other, ImmutableFileNode): return self.u.__eq__(other.u) else: return True def read(self, consumer, offset=0, size=None): decryptor = DecryptingConsumer(consumer, self._readkey, offset) d = self._cnode.read(decryptor, offset, size) d.addCallback(lambda dc: consumer) return d def raise_error(self): pass def get_write_uri(self): return None def get_readonly_uri(self): return self.get_uri() def get_uri(self): return self.u.to_string() def get_cap(self): return self.u def get_readcap(self): return self.u.get_readonly() def get_verify_cap(self): return self.u.get_verify_cap() def get_repair_cap(self): # CHK files can be repaired with just the verifycap return self.u.get_verify_cap() def get_storage_index(self): return self.u.get_storage_index() def get_size(self): return self.u.get_size() def get_current_size(self): return defer.succeed(self.get_size()) def is_mutable(self): return False def is_readonly(self): return True def is_unknown(self): return False def is_allowed_in_immutable_directory(self): return True def check_and_repair(self, monitor, verify=False, add_lease=False): return self._cnode.check_and_repair(monitor, verify, add_lease) def check(self, monitor, verify=False, add_lease=False): return self._cnode.check(monitor, verify, add_lease) def get_best_readable_version(self): """ Return an IReadable of the best version of this file. Since immutable files can have only one version, we just return the current filenode. """ return defer.succeed(self) def download_best_version(self): """ Download the best version of this file, returning its contents as a bytestring. Since there is only one version of an immutable file, we download and return the contents of this file. """ d = consumer.download_to_data(self) return d # for an immutable file, download_to_data (specified in IReadable) # is the same as download_best_version (specified in IFileNode). For # mutable files, the difference is more meaningful, since they can # have multiple versions. download_to_data = download_best_version # get_size() (IReadable), get_current_size() (IFilesystemNode), and # get_size_of_best_version(IFileNode) are all the same for immutable # files. get_size_of_best_version = get_current_size allmydata-tahoe-1.10.2/src/allmydata/immutable/layout.py0000644000175000017500000005046212556560070021403 0ustar ramramimport struct from zope.interface import implements from twisted.internet import defer from allmydata.interfaces import IStorageBucketWriter, IStorageBucketReader, \ FileTooLargeError, HASH_SIZE from allmydata.util import mathutil, observer, pipeline from allmydata.util.assertutil import precondition from allmydata.storage.server import si_b2a class LayoutInvalid(Exception): """ There is something wrong with these bytes so they can't be interpreted as the kind of immutable file that I know how to download.""" pass class RidiculouslyLargeURIExtensionBlock(LayoutInvalid): """ When downloading a file, the length of the URI Extension Block was given as >= 2**32. 
This means the share data must have been corrupted, or else the original uploader of the file wrote a ridiculous value into the URI Extension Block length.""" pass class ShareVersionIncompatible(LayoutInvalid): """ When downloading a share, its format was not one of the formats we know how to parse.""" pass """ Share data is written in a file. At the start of the file, there is a series of four-byte big-endian offset values, which indicate where each section starts. Each offset is measured from the beginning of the share data. 0x00: version number (=00 00 00 01) 0x04: block size # See Footnote 1 below. 0x08: share data size # See Footnote 1 below. 0x0c: offset of data (=00 00 00 24) 0x10: offset of plaintext_hash_tree UNUSED 0x14: offset of crypttext_hash_tree 0x18: offset of block_hashes 0x1c: offset of share_hashes 0x20: offset of uri_extension_length + uri_extension 0x24: start of data ? : start of plaintext_hash_tree UNUSED ? : start of crypttext_hash_tree ? : start of block_hashes ? : start of share_hashes each share_hash is written as a two-byte (big-endian) hashnum followed by the 32-byte SHA-256 hash. We store only the hashes necessary to validate the share hash root ? : start of uri_extension_length (four-byte big-endian value) ? : start of uri_extension """ """ v2 shares: these use 8-byte offsets to remove two of the three ~12GiB size limitations described in #346. 0x00: version number (=00 00 00 02) 0x04: block size # See Footnote 1 below. 0x0c: share data size # See Footnote 1 below. 0x14: offset of data (=00 00 00 00 00 00 00 44) 0x1c: offset of plaintext_hash_tree UNUSED 0x24: offset of crypttext_hash_tree 0x2c: offset of block_hashes 0x34: offset of share_hashes 0x3c: offset of uri_extension_length + uri_extension 0x44: start of data : rest of share is the same as v1, above ... ... ? : start of uri_extension_length (eight-byte big-endian value) ? : start of uri_extension """ # Footnote 1: as of Tahoe v1.3.0 these fields are not used when reading, but # they are still provided when writing so that older versions of Tahoe can # read them. 
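# Illustrative sketch: the hypothetical helper below is not used anywhere in
# this module; it only shows how the v1 header described above can be
# unpacked. The header is nine big-endian 4-byte fields (">LLLLLLLLL",
# 0x24 bytes total), matching WriteBucketProxy._create_offsets below.
def _example_unpack_v1_header(header):
    assert len(header) >= 0x24
    (version, block_size, data_size,
     off_data, off_plaintext_hash_tree, off_crypttext_hash_tree,
     off_block_hashes, off_share_hashes, off_uri_extension) = \
        struct.unpack(">LLLLLLLLL", header[:0x24])
    assert version == 1, version
    return {'data': off_data,
            'plaintext_hash_tree': off_plaintext_hash_tree, # UNUSED
            'crypttext_hash_tree': off_crypttext_hash_tree,
            'block_hashes': off_block_hashes,
            'share_hashes': off_share_hashes,
            'uri_extension': off_uri_extension}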
FORCE_V2 = False # set briefly by unit tests to make small-sized V2 shares def make_write_bucket_proxy(rref, server, data_size, block_size, num_segments, num_share_hashes, uri_extension_size_max): # Use layout v1 for small files, so they'll be readable by older versions # (= 2**32 or data_size >= 2**32: raise FileTooLargeError("This file is too large to be uploaded (data_size).") offsets = self._offsets = {} x = 0x24 offsets['data'] = x x += data_size offsets['plaintext_hash_tree'] = x # UNUSED x += self._segment_hash_size offsets['crypttext_hash_tree'] = x x += self._segment_hash_size offsets['block_hashes'] = x x += self._segment_hash_size offsets['share_hashes'] = x x += self._share_hashtree_size offsets['uri_extension'] = x if x >= 2**32: raise FileTooLargeError("This file is too large to be uploaded (offsets).") offset_data = struct.pack(">LLLLLLLLL", 1, # version number block_size, data_size, offsets['data'], offsets['plaintext_hash_tree'], # UNUSED offsets['crypttext_hash_tree'], offsets['block_hashes'], offsets['share_hashes'], offsets['uri_extension'], ) assert len(offset_data) == 0x24 self._offset_data = offset_data def __repr__(self): return "" % self._server.get_name() def put_header(self): return self._write(0, self._offset_data) def put_block(self, segmentnum, data): offset = self._offsets['data'] + segmentnum * self._block_size assert offset + len(data) <= self._offsets['uri_extension'] assert isinstance(data, str) if segmentnum < self._num_segments-1: precondition(len(data) == self._block_size, len(data), self._block_size) else: precondition(len(data) == (self._data_size - (self._block_size * (self._num_segments - 1))), len(data), self._block_size) return self._write(offset, data) def put_crypttext_hashes(self, hashes): offset = self._offsets['crypttext_hash_tree'] assert isinstance(hashes, list) data = "".join(hashes) precondition(len(data) == self._segment_hash_size, len(data), self._segment_hash_size) precondition(offset + len(data) <= self._offsets['block_hashes'], offset, len(data), offset+len(data), self._offsets['block_hashes']) return self._write(offset, data) def put_block_hashes(self, blockhashes): offset = self._offsets['block_hashes'] assert isinstance(blockhashes, list) data = "".join(blockhashes) precondition(len(data) == self._segment_hash_size, len(data), self._segment_hash_size) precondition(offset + len(data) <= self._offsets['share_hashes'], offset, len(data), offset+len(data), self._offsets['share_hashes']) return self._write(offset, data) def put_share_hashes(self, sharehashes): # sharehashes is a list of (index, hash) tuples, so they get stored # as 2+32=34 bytes each offset = self._offsets['share_hashes'] assert isinstance(sharehashes, list) data = "".join([struct.pack(">H", hashnum) + hashvalue for hashnum,hashvalue in sharehashes]) precondition(len(data) == self._share_hashtree_size, len(data), self._share_hashtree_size) precondition(offset + len(data) <= self._offsets['uri_extension'], offset, len(data), offset+len(data), self._offsets['uri_extension']) return self._write(offset, data) def put_uri_extension(self, data): offset = self._offsets['uri_extension'] assert isinstance(data, str) precondition(len(data) <= self._uri_extension_size_max, len(data), self._uri_extension_size_max) length = struct.pack(self.fieldstruct, len(data)) return self._write(offset, length+data) def _write(self, offset, data): # use a Pipeline to pipeline several writes together. 
TODO: another # speedup would be to coalesce small writes into a single call: this # would reduce the foolscap CPU overhead per share, but wouldn't # reduce the number of round trips, so it might not be worth the # effort. return self._pipeline.add(len(data), self._rref.callRemote, "write", offset, data) def close(self): d = self._pipeline.add(0, self._rref.callRemote, "close") d.addCallback(lambda ign: self._pipeline.flush()) return d def abort(self): return self._rref.callRemoteOnly("abort") def get_servername(self): return self._server.get_name() def get_peerid(self): return self._server.get_serverid() class WriteBucketProxy_v2(WriteBucketProxy): fieldsize = 8 fieldstruct = ">Q" def _create_offsets(self, block_size, data_size): if block_size >= 2**64 or data_size >= 2**64: raise FileTooLargeError("This file is too large to be uploaded (data_size).") offsets = self._offsets = {} x = 0x44 offsets['data'] = x x += data_size offsets['plaintext_hash_tree'] = x # UNUSED x += self._segment_hash_size offsets['crypttext_hash_tree'] = x x += self._segment_hash_size offsets['block_hashes'] = x x += self._segment_hash_size offsets['share_hashes'] = x x += self._share_hashtree_size offsets['uri_extension'] = x if x >= 2**64: raise FileTooLargeError("This file is too large to be uploaded (offsets).") offset_data = struct.pack(">LQQQQQQQQ", 2, # version number block_size, data_size, offsets['data'], offsets['plaintext_hash_tree'], # UNUSED offsets['crypttext_hash_tree'], offsets['block_hashes'], offsets['share_hashes'], offsets['uri_extension'], ) assert len(offset_data) == 0x44, len(offset_data) self._offset_data = offset_data class ReadBucketProxy: implements(IStorageBucketReader) MAX_UEB_SIZE = 2000 # actual size is closer to 419, but varies by a few bytes def __init__(self, rref, server, storage_index): self._rref = rref self._server = server self._storage_index = storage_index self._started = False # sent request to server self._ready = observer.OneShotObserverList() # got response from server def get_peerid(self): return self._server.get_serverid() def __repr__(self): return "" % \ (id(self), self._server.get_name(), si_b2a(self._storage_index)) def _start_if_needed(self): """ Returns a deferred that will be fired when I'm ready to return data, or errbacks if the starting (header reading and parsing) process fails.""" if not self._started: self._start() return self._ready.when_fired() def _start(self): self._started = True # TODO: for small shares, read the whole bucket in _start() d = self._fetch_header() d.addCallback(self._parse_offsets) # XXX The following two callbacks implement a slightly faster/nicer # way to get the ueb and sharehashtree, but it requires that the # storage server be >= v1.3.0. 
# d.addCallback(self._fetch_sharehashtree_and_ueb) # d.addCallback(self._parse_sharehashtree_and_ueb) def _fail_waiters(f): self._ready.fire(f) def _notify_waiters(result): self._ready.fire(result) d.addCallbacks(_notify_waiters, _fail_waiters) return d def _fetch_header(self): return self._read(0, 0x44) def _parse_offsets(self, data): precondition(len(data) >= 0x4) self._offsets = {} (version,) = struct.unpack(">L", data[0:4]) if version != 1 and version != 2: raise ShareVersionIncompatible(version) if version == 1: precondition(len(data) >= 0x24) x = 0x0c fieldsize = 0x4 fieldstruct = ">L" else: precondition(len(data) >= 0x44) x = 0x14 fieldsize = 0x8 fieldstruct = ">Q" self._version = version self._fieldsize = fieldsize self._fieldstruct = fieldstruct for field in ( 'data', 'plaintext_hash_tree', # UNUSED 'crypttext_hash_tree', 'block_hashes', 'share_hashes', 'uri_extension', ): offset = struct.unpack(fieldstruct, data[x:x+fieldsize])[0] x += fieldsize self._offsets[field] = offset return self._offsets def _fetch_sharehashtree_and_ueb(self, offsets): sharehashtree_size = offsets['uri_extension'] - offsets['share_hashes'] return self._read(offsets['share_hashes'], self.MAX_UEB_SIZE+sharehashtree_size) def _parse_sharehashtree_and_ueb(self, data): sharehashtree_size = self._offsets['uri_extension'] - self._offsets['share_hashes'] if len(data) < sharehashtree_size: raise LayoutInvalid("share hash tree truncated -- should have at least %d bytes -- not %d" % (sharehashtree_size, len(data))) if sharehashtree_size % (2+HASH_SIZE) != 0: raise LayoutInvalid("share hash tree malformed -- should have an even multiple of %d bytes -- not %d" % (2+HASH_SIZE, sharehashtree_size)) self._share_hashes = [] for i in range(0, sharehashtree_size, 2+HASH_SIZE): hashnum = struct.unpack(">H", data[i:i+2])[0] hashvalue = data[i+2:i+2+HASH_SIZE] self._share_hashes.append( (hashnum, hashvalue) ) i = self._offsets['uri_extension']-self._offsets['share_hashes'] if len(data) < i+self._fieldsize: raise LayoutInvalid("not enough bytes to encode URI length -- should be at least %d bytes long, not %d " % (i+self._fieldsize, len(data),)) length = struct.unpack(self._fieldstruct, data[i:i+self._fieldsize])[0] self._ueb_data = data[i+self._fieldsize:i+self._fieldsize+length] def _get_block_data(self, unused, blocknum, blocksize, thisblocksize): offset = self._offsets['data'] + blocknum * blocksize return self._read(offset, thisblocksize) def get_block_data(self, blocknum, blocksize, thisblocksize): d = self._start_if_needed() d.addCallback(self._get_block_data, blocknum, blocksize, thisblocksize) return d def _str2l(self, s): """ split string (pulled from storage) into a list of blockids """ return [ s[i:i+HASH_SIZE] for i in range(0, len(s), HASH_SIZE) ] def _get_crypttext_hashes(self, unused=None): offset = self._offsets['crypttext_hash_tree'] size = self._offsets['block_hashes'] - offset d = self._read(offset, size) d.addCallback(self._str2l) return d def get_crypttext_hashes(self): d = self._start_if_needed() d.addCallback(self._get_crypttext_hashes) return d def _get_block_hashes(self, unused=None, at_least_these=()): # TODO: fetch only at_least_these instead of all of them. 
offset = self._offsets['block_hashes'] size = self._offsets['share_hashes'] - offset d = self._read(offset, size) d.addCallback(self._str2l) return d def get_block_hashes(self, at_least_these=()): if at_least_these: d = self._start_if_needed() d.addCallback(self._get_block_hashes, at_least_these) return d else: return defer.succeed([]) def _get_share_hashes(self, unused=None): if hasattr(self, '_share_hashes'): return self._share_hashes else: return self._get_share_hashes_the_old_way() return self._share_hashes def get_share_hashes(self): d = self._start_if_needed() d.addCallback(self._get_share_hashes) return d def _get_share_hashes_the_old_way(self): """ Tahoe storage servers < v1.3.0 would return an error if you tried to read past the end of the share, so we need to use the offset and read just that much.""" offset = self._offsets['share_hashes'] size = self._offsets['uri_extension'] - offset if size % (2+HASH_SIZE) != 0: raise LayoutInvalid("share hash tree corrupted -- should occupy a multiple of %d bytes, not %d bytes" % ((2+HASH_SIZE), size)) d = self._read(offset, size) def _unpack_share_hashes(data): if len(data) != size: raise LayoutInvalid("share hash tree corrupted -- got a short read of the share data -- should have gotten %d, not %d bytes" % (size, len(data))) hashes = [] for i in range(0, size, 2+HASH_SIZE): hashnum = struct.unpack(">H", data[i:i+2])[0] hashvalue = data[i+2:i+2+HASH_SIZE] hashes.append( (hashnum, hashvalue) ) return hashes d.addCallback(_unpack_share_hashes) return d def _get_uri_extension_the_old_way(self, unused=None): """ Tahoe storage servers < v1.3.0 would return an error if you tried to read past the end of the share, so we need to fetch the UEB size and then read just that much.""" offset = self._offsets['uri_extension'] d = self._read(offset, self._fieldsize) def _got_length(data): if len(data) != self._fieldsize: raise LayoutInvalid("not enough bytes to encode URI length -- should be %d bytes long, not %d " % (self._fieldsize, len(data),)) length = struct.unpack(self._fieldstruct, data)[0] if length >= 2**31: # URI extension blocks are around 419 bytes long, so this # must be corrupted. Anyway, the foolscap interface schema # for "read" will not allow >= 2**31 bytes length. raise RidiculouslyLargeURIExtensionBlock(length) return self._read(offset+self._fieldsize, length) d.addCallback(_got_length) return d def _get_uri_extension(self, unused=None): if hasattr(self, '_ueb_data'): return self._ueb_data else: return self._get_uri_extension_the_old_way() def get_uri_extension(self): d = self._start_if_needed() d.addCallback(self._get_uri_extension) return d def _read(self, offset, length): return self._rref.callRemote("read", offset, length) allmydata-tahoe-1.10.2/src/allmydata/immutable/encode.py0000644000175000017500000007267212556560070021332 0ustar ramram# -*- test-case-name: allmydata.test.test_encode -*- import time from zope.interface import implements from twisted.internet import defer from foolscap.api import fireEventually from allmydata import uri from allmydata.storage.server import si_b2a from allmydata.hashtree import HashTree from allmydata.util import mathutil, hashutil, base32, log, happinessutil from allmydata.util.assertutil import _assert, precondition from allmydata.codec import CRSEncoder from allmydata.interfaces import IEncoder, IStorageBucketWriter, \ IEncryptedUploadable, IUploadStatus, UploadUnhappinessError """ The goal of the encoder is to turn the original file into a series of 'shares'. 
Each share is going to a 'shareholder' (nominally each shareholder is a different host, but for small grids there may be overlap). The number of shares is chosen to hit our reliability goals (more shares on more machines means more reliability), and is limited by overhead (proportional to numshares or log(numshares)) and the encoding technology in use (zfec permits only 256 shares total). It is also constrained by the amount of data we want to send to each host. For estimating purposes, think of 10 shares out of which we need 3 to reconstruct the file. The encoder starts by cutting the original file into segments. All segments except the last are of equal size. The segment size is chosen to constrain the memory footprint (which will probably vary between 1x and 4x segment size) and to constrain the overhead (which will be proportional to log(number of segments)). Each segment (A,B,C) is read into memory, encrypted, and encoded into blocks. The 'share' (say, share #1) that makes it out to a host is a collection of these blocks (block A1, B1, C1), plus some hash-tree information necessary to validate the data upon retrieval. Only one segment is handled at a time: all blocks for segment A are delivered before any work is begun on segment B. As blocks are created, we retain the hash of each one. The list of block hashes for a single share (say, hash(A1), hash(B1), hash(C1)) is used to form the base of a Merkle hash tree for that share, called the block hash tree. This hash tree has one terminal leaf per block. The complete block hash tree is sent to the shareholder after all the data has been sent. At retrieval time, the decoder will ask for specific pieces of this tree before asking for blocks, whichever it needs to validate those blocks. (Note: we don't really need to generate this whole block hash tree ourselves. It would be sufficient to have the shareholder generate it and just tell us the root. This gives us an extra level of validation on the transfer, though, and it is relatively cheap to compute.) Each of these block hash trees has a root hash. The collection of these root hashes for all shares are collected into the 'share hash tree', which has one terminal leaf per share. After sending the blocks and the complete block hash tree to each shareholder, we send them the portion of the share hash tree that is necessary to validate their share. The root of the share hash tree is put into the URI. 
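As a rough, illustrative example (the numbers are not normative): with
3-of-10 encoding, a 9 MiB file cut into 1 MiB segments yields 9 segments;
each segment is encoded into 10 blocks of about 1/3 MiB, so each share
collects 9 such blocks (about 3 MiB) plus its block hash tree and a few
share hash tree nodes, and the grid as a whole stores roughly 10/3 = 3.3x
the original file size.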
""" class UploadAborted(Exception): pass KiB=1024 MiB=1024*KiB GiB=1024*MiB TiB=1024*GiB PiB=1024*TiB class Encoder(object): implements(IEncoder) def __init__(self, log_parent=None, upload_status=None): object.__init__(self) self.uri_extension_data = {} self._codec = None self._status = None if upload_status: self._status = IUploadStatus(upload_status) precondition(log_parent is None or isinstance(log_parent, int), log_parent) self._log_number = log.msg("creating Encoder %s" % self, facility="tahoe.encoder", parent=log_parent) self._aborted = False def __repr__(self): if hasattr(self, "_storage_index"): return "" % si_b2a(self._storage_index)[:5] return "" def log(self, *args, **kwargs): if "parent" not in kwargs: kwargs["parent"] = self._log_number if "facility" not in kwargs: kwargs["facility"] = "tahoe.encoder" return log.msg(*args, **kwargs) def set_encrypted_uploadable(self, uploadable): eu = self._uploadable = IEncryptedUploadable(uploadable) d = eu.get_size() def _got_size(size): self.log(format="file size: %(size)d", size=size) self.file_size = size d.addCallback(_got_size) d.addCallback(lambda res: eu.get_all_encoding_parameters()) d.addCallback(self._got_all_encoding_parameters) d.addCallback(lambda res: eu.get_storage_index()) def _done(storage_index): self._storage_index = storage_index return self d.addCallback(_done) return d def _got_all_encoding_parameters(self, params): assert not self._codec k, happy, n, segsize = params self.required_shares = k self.servers_of_happiness = happy self.num_shares = n self.segment_size = segsize self.log("got encoding parameters: %d/%d/%d %d" % (k,happy,n, segsize)) self.log("now setting up codec") assert self.segment_size % self.required_shares == 0 self.num_segments = mathutil.div_ceil(self.file_size, self.segment_size) self._codec = CRSEncoder() self._codec.set_params(self.segment_size, self.required_shares, self.num_shares) data = self.uri_extension_data data['codec_name'] = self._codec.get_encoder_type() data['codec_params'] = self._codec.get_serialized_params() data['size'] = self.file_size data['segment_size'] = self.segment_size self.share_size = mathutil.div_ceil(self.file_size, self.required_shares) data['num_segments'] = self.num_segments data['needed_shares'] = self.required_shares data['total_shares'] = self.num_shares # the "tail" is the last segment. This segment may or may not be # shorter than all other segments. We use the "tail codec" to handle # it. If the tail is short, we use a different codec instance. In # addition, the tail codec must be fed data which has been padded out # to the right size. 
tail_size = self.file_size % self.segment_size if not tail_size: tail_size = self.segment_size # the tail codec is responsible for encoding tail_size bytes padded_tail_size = mathutil.next_multiple(tail_size, self.required_shares) self._tail_codec = CRSEncoder() self._tail_codec.set_params(padded_tail_size, self.required_shares, self.num_shares) data['tail_codec_params'] = self._tail_codec.get_serialized_params() def _get_share_size(self): share_size = mathutil.div_ceil(self.file_size, self.required_shares) overhead = self._compute_overhead() return share_size + overhead def _compute_overhead(self): return 0 def get_param(self, name): assert self._codec if name == "storage_index": return self._storage_index elif name == "share_counts": return (self.required_shares, self.servers_of_happiness, self.num_shares) elif name == "num_segments": return self.num_segments elif name == "segment_size": return self.segment_size elif name == "block_size": return self._codec.get_block_size() elif name == "share_size": return self._get_share_size() elif name == "serialized_params": return self._codec.get_serialized_params() else: raise KeyError("unknown parameter name '%s'" % name) def set_shareholders(self, landlords, servermap): assert isinstance(landlords, dict) for k in landlords: assert IStorageBucketWriter.providedBy(landlords[k]) self.landlords = landlords.copy() assert isinstance(servermap, dict) for v in servermap.itervalues(): assert isinstance(v, set) self.servermap = servermap.copy() def start(self): """ Returns a Deferred that will fire with the verify cap (an instance of uri.CHKFileVerifierURI).""" self.log("%s starting" % (self,)) #paddedsize = self._size + mathutil.pad_size(self._size, self.needed_shares) assert self._codec self._crypttext_hasher = hashutil.crypttext_hasher() self._crypttext_hashes = [] self.segment_num = 0 self.block_hashes = [[] for x in range(self.num_shares)] # block_hashes[i] is a list that will be accumulated and then send # to landlord[i]. This list contains a hash of each segment_share # that we sent to that landlord. 
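        # For example (illustrative numbers): with num_shares=10 and
        # num_segments=4, block_hashes ends up as ten lists of four hashes
        # each; each list later seeds that share's block hash tree in
        # send_one_block_hash_tree().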
self.share_root_hashes = [None] * self.num_shares self._times = { "cumulative_encoding": 0.0, "cumulative_sending": 0.0, "hashes_and_close": 0.0, "total_encode_and_push": 0.0, } self._start_total_timestamp = time.time() d = fireEventually() d.addCallback(lambda res: self.start_all_shareholders()) for i in range(self.num_segments-1): # note to self: this form doesn't work, because lambda only # captures the slot, not the value #d.addCallback(lambda res: self.do_segment(i)) # use this form instead: d.addCallback(lambda res, i=i: self._encode_segment(i)) d.addCallback(self._send_segment, i) d.addCallback(self._turn_barrier) last_segnum = self.num_segments - 1 d.addCallback(lambda res: self._encode_tail_segment(last_segnum)) d.addCallback(self._send_segment, last_segnum) d.addCallback(self._turn_barrier) d.addCallback(lambda res: self.finish_hashing()) d.addCallback(lambda res: self.send_crypttext_hash_tree_to_all_shareholders()) d.addCallback(lambda res: self.send_all_block_hash_trees()) d.addCallback(lambda res: self.send_all_share_hash_trees()) d.addCallback(lambda res: self.send_uri_extension_to_all_shareholders()) d.addCallback(lambda res: self.close_all_shareholders()) d.addCallbacks(self.done, self.err) return d def set_status(self, status): if self._status: self._status.set_status(status) def set_encode_and_push_progress(self, sent_segments=None, extra=0.0): if self._status: # we treat the final hash+close as an extra segment if sent_segments is None: sent_segments = self.num_segments progress = float(sent_segments + extra) / (self.num_segments + 1) self._status.set_progress(2, progress) def abort(self): self.log("aborting upload", level=log.UNUSUAL) assert self._codec, "don't call abort before start" self._aborted = True # the next segment read (in _gather_data inside _encode_segment) will # raise UploadAborted(), which will bypass the rest of the upload # chain. If we've sent the final segment's shares, it's too late to # abort. TODO: allow abort any time up to close_all_shareholders. def _turn_barrier(self, res): # putting this method in a Deferred chain imposes a guaranteed # reactor turn between the pre- and post- portions of that chain. # This can be useful to limit memory consumption: since Deferreds do # not do tail recursion, code which uses defer.succeed(result) for # consistency will cause objects to live for longer than you might # normally expect. return fireEventually(res) def start_all_shareholders(self): self.log("starting shareholders", level=log.NOISY) self.set_status("Starting shareholders") dl = [] for shareid in list(self.landlords): d = self.landlords[shareid].put_header() d.addErrback(self._remove_shareholder, shareid, "start") dl.append(d) return self._gather_responses(dl) def _encode_segment(self, segnum): codec = self._codec start = time.time() # the ICodecEncoder API wants to receive a total of self.segment_size # bytes on each encode() call, broken up into a number of # identically-sized pieces. Due to the way the codec algorithm works, # these pieces need to be the same size as the share which the codec # will generate. Therefore we must feed it with input_piece_size that # equals the output share size. input_piece_size = codec.get_block_size() # as a result, the number of input pieces per encode() call will be # equal to the number of required shares with which the codec was # constructed. 
You can think of the codec as chopping up a # 'segment_size' of data into 'required_shares' shares (not doing any # fancy math at all, just doing a split), then creating some number # of additional shares which can be substituted if the primary ones # are unavailable # we read data from the source one segment at a time, and then chop # it into 'input_piece_size' pieces before handing it to the codec crypttext_segment_hasher = hashutil.crypttext_segment_hasher() # memory footprint: we only hold a tiny piece of the plaintext at any # given time. We build up a segment's worth of cryptttext, then hand # it to the encoder. Assuming 3-of-10 encoding (3.3x expansion) and # 1MiB max_segment_size, we get a peak memory footprint of 4.3*1MiB = # 4.3MiB. Lowering max_segment_size to, say, 100KiB would drop the # footprint to 430KiB at the expense of more hash-tree overhead. d = self._gather_data(self.required_shares, input_piece_size, crypttext_segment_hasher) def _done_gathering(chunks): for c in chunks: assert len(c) == input_piece_size self._crypttext_hashes.append(crypttext_segment_hasher.digest()) # during this call, we hit 5*segsize memory return codec.encode(chunks) d.addCallback(_done_gathering) def _done(res): elapsed = time.time() - start self._times["cumulative_encoding"] += elapsed return res d.addCallback(_done) return d def _encode_tail_segment(self, segnum): start = time.time() codec = self._tail_codec input_piece_size = codec.get_block_size() crypttext_segment_hasher = hashutil.crypttext_segment_hasher() d = self._gather_data(self.required_shares, input_piece_size, crypttext_segment_hasher, allow_short=True) def _done_gathering(chunks): for c in chunks: # a short trailing chunk will have been padded by # _gather_data assert len(c) == input_piece_size self._crypttext_hashes.append(crypttext_segment_hasher.digest()) return codec.encode(chunks) d.addCallback(_done_gathering) def _done(res): elapsed = time.time() - start self._times["cumulative_encoding"] += elapsed return res d.addCallback(_done) return d def _gather_data(self, num_chunks, input_chunk_size, crypttext_segment_hasher, allow_short=False): """Return a Deferred that will fire when the required number of chunks have been read (and hashed and encrypted). The Deferred fires with a list of chunks, each of size input_chunk_size.""" # I originally built this to allow read_encrypted() to behave badly: # to let it return more or less data than you asked for. It would # stash the leftovers until later, and then recurse until it got # enough. I don't think that was actually useful. # # who defines read_encrypted? # offloaded.LocalCiphertextReader: real disk file: exact # upload.EncryptAnUploadable: Uploadable, but a wrapper that makes # it exact. The return value is a list of 50KiB chunks, to reduce # the memory footprint of the encryption process. # repairer.Repairer: immutable.filenode.CiphertextFileNode: exact # # This has been redefined to require read_encrypted() to behave like # a local file: return exactly the amount requested unless it hits # EOF. 
# -warner if self._aborted: raise UploadAborted() read_size = num_chunks * input_chunk_size d = self._uploadable.read_encrypted(read_size, hash_only=False) def _got(data): assert isinstance(data, (list,tuple)) if self._aborted: raise UploadAborted() data = "".join(data) precondition(len(data) <= read_size, len(data), read_size) if not allow_short: precondition(len(data) == read_size, len(data), read_size) crypttext_segment_hasher.update(data) self._crypttext_hasher.update(data) if allow_short and len(data) < read_size: # padding data += "\x00" * (read_size - len(data)) encrypted_pieces = [data[i:i+input_chunk_size] for i in range(0, len(data), input_chunk_size)] return encrypted_pieces d.addCallback(_got) return d def _send_segment(self, (shares, shareids), segnum): # To generate the URI, we must generate the roothash, so we must # generate all shares, even if we aren't actually giving them to # anybody. This means that the set of shares we create will be equal # to or larger than the set of landlords. If we have any landlord who # *doesn't* have a share, that's an error. _assert(set(self.landlords.keys()).issubset(set(shareids)), shareids=shareids, landlords=self.landlords) start = time.time() dl = [] self.set_status("Sending segment %d of %d" % (segnum+1, self.num_segments)) self.set_encode_and_push_progress(segnum) lognum = self.log("send_segment(%d)" % segnum, level=log.NOISY) for i in range(len(shares)): block = shares[i] shareid = shareids[i] d = self.send_block(shareid, segnum, block, lognum) dl.append(d) block_hash = hashutil.block_hash(block) #from allmydata.util import base32 #log.msg("creating block (shareid=%d, blocknum=%d) " # "len=%d %r .. %r: %s" % # (shareid, segnum, len(block), # block[:50], block[-50:], base32.b2a(block_hash))) self.block_hashes[shareid].append(block_hash) dl = self._gather_responses(dl) def _logit(res): self.log("%s uploaded %s / %s bytes (%d%%) of your file." 
% (self, self.segment_size*(segnum+1), self.segment_size*self.num_segments, 100 * (segnum+1) / self.num_segments, ), level=log.OPERATIONAL) elapsed = time.time() - start self._times["cumulative_sending"] += elapsed return res dl.addCallback(_logit) return dl def send_block(self, shareid, segment_num, block, lognum): if shareid not in self.landlords: return defer.succeed(None) sh = self.landlords[shareid] lognum2 = self.log("put_block to %s" % self.landlords[shareid], parent=lognum, level=log.NOISY) d = sh.put_block(segment_num, block) def _done(res): self.log("put_block done", parent=lognum2, level=log.NOISY) return res d.addCallback(_done) d.addErrback(self._remove_shareholder, shareid, "segnum=%d" % segment_num) return d def _remove_shareholder(self, why, shareid, where): ln = self.log(format="error while sending %(method)s to shareholder=%(shnum)d", method=where, shnum=shareid, level=log.UNUSUAL, failure=why) if shareid in self.landlords: self.landlords[shareid].abort() peerid = self.landlords[shareid].get_peerid() assert peerid del self.landlords[shareid] self.servermap[shareid].remove(peerid) if not self.servermap[shareid]: del self.servermap[shareid] else: # even more UNUSUAL self.log("they weren't in our list of landlords", parent=ln, level=log.WEIRD, umid="TQGFRw") happiness = happinessutil.servers_of_happiness(self.servermap) if happiness < self.servers_of_happiness: peerids = set(happinessutil.shares_by_server(self.servermap).keys()) msg = happinessutil.failure_message(len(peerids), self.required_shares, self.servers_of_happiness, happiness) msg = "%s: %s" % (msg, why) raise UploadUnhappinessError(msg) self.log("but we can still continue with %s shares, we'll be happy " "with at least %s" % (happiness, self.servers_of_happiness), parent=ln) def _gather_responses(self, dl): d = defer.DeferredList(dl, fireOnOneErrback=True) def _eatUploadUnhappinessError(f): # all exceptions that occur while talking to a peer are handled # in _remove_shareholder. That might raise UploadUnhappinessError, # which will cause the DeferredList to errback but which should # otherwise be consumed. Allow non-UploadUnhappinessError exceptions # to pass through as an unhandled errback. We use this in lieu of # consumeErrors=True to allow coding errors to be logged. 
f.trap(UploadUnhappinessError) return None for d0 in dl: d0.addErrback(_eatUploadUnhappinessError) return d def finish_hashing(self): self._start_hashing_and_close_timestamp = time.time() self.set_status("Finishing hashes") self.set_encode_and_push_progress(extra=0.0) crypttext_hash = self._crypttext_hasher.digest() self.uri_extension_data["crypttext_hash"] = crypttext_hash self._uploadable.close() def send_crypttext_hash_tree_to_all_shareholders(self): self.log("sending crypttext hash tree", level=log.NOISY) self.set_status("Sending Crypttext Hash Tree") self.set_encode_and_push_progress(extra=0.3) t = HashTree(self._crypttext_hashes) all_hashes = list(t) self.uri_extension_data["crypttext_root_hash"] = t[0] dl = [] for shareid in list(self.landlords): dl.append(self.send_crypttext_hash_tree(shareid, all_hashes)) return self._gather_responses(dl) def send_crypttext_hash_tree(self, shareid, all_hashes): if shareid not in self.landlords: return defer.succeed(None) sh = self.landlords[shareid] d = sh.put_crypttext_hashes(all_hashes) d.addErrback(self._remove_shareholder, shareid, "put_crypttext_hashes") return d def send_all_block_hash_trees(self): self.log("sending block hash trees", level=log.NOISY) self.set_status("Sending Subshare Hash Trees") self.set_encode_and_push_progress(extra=0.4) dl = [] for shareid,hashes in enumerate(self.block_hashes): # hashes is a list of the hashes of all blocks that were sent # to shareholder[shareid]. dl.append(self.send_one_block_hash_tree(shareid, hashes)) return self._gather_responses(dl) def send_one_block_hash_tree(self, shareid, block_hashes): t = HashTree(block_hashes) all_hashes = list(t) # all_hashes[0] is the root hash, == hash(ah[1]+ah[2]) # all_hashes[1] is the left child, == hash(ah[3]+ah[4]) # all_hashes[n] == hash(all_hashes[2*n+1] + all_hashes[2*n+2]) self.share_root_hashes[shareid] = t[0] if shareid not in self.landlords: return defer.succeed(None) sh = self.landlords[shareid] d = sh.put_block_hashes(all_hashes) d.addErrback(self._remove_shareholder, shareid, "put_block_hashes") return d def send_all_share_hash_trees(self): # Each bucket gets a set of share hash tree nodes that are needed to validate their # share. This includes the share hash itself, but does not include the top-level hash # root (which is stored securely in the URI instead). self.log("sending all share hash trees", level=log.NOISY) self.set_status("Sending Share Hash Trees") self.set_encode_and_push_progress(extra=0.6) dl = [] for h in self.share_root_hashes: assert h # create the share hash tree t = HashTree(self.share_root_hashes) # the root of this hash tree goes into our URI self.uri_extension_data['share_root_hash'] = t[0] # now send just the necessary pieces out to each shareholder for i in range(self.num_shares): # the HashTree is given a list of leaves: 0,1,2,3..n . # These become nodes A+0,A+1,A+2.. 
of the tree, where A=n-1 needed_hash_indices = t.needed_hashes(i, include_leaf=True) hashes = [(hi, t[hi]) for hi in needed_hash_indices] dl.append(self.send_one_share_hash_tree(i, hashes)) return self._gather_responses(dl) def send_one_share_hash_tree(self, shareid, needed_hashes): if shareid not in self.landlords: return defer.succeed(None) sh = self.landlords[shareid] d = sh.put_share_hashes(needed_hashes) d.addErrback(self._remove_shareholder, shareid, "put_share_hashes") return d def send_uri_extension_to_all_shareholders(self): lp = self.log("sending uri_extension", level=log.NOISY) self.set_status("Sending URI Extensions") self.set_encode_and_push_progress(extra=0.8) for k in ('crypttext_root_hash', 'crypttext_hash', ): assert k in self.uri_extension_data uri_extension = uri.pack_extension(self.uri_extension_data) ed = {} for k,v in self.uri_extension_data.items(): if k.endswith("hash"): ed[k] = base32.b2a(v) else: ed[k] = v self.log("uri_extension_data is %s" % (ed,), level=log.NOISY, parent=lp) self.uri_extension_hash = hashutil.uri_extension_hash(uri_extension) dl = [] for shareid in list(self.landlords): dl.append(self.send_uri_extension(shareid, uri_extension)) return self._gather_responses(dl) def send_uri_extension(self, shareid, uri_extension): sh = self.landlords[shareid] d = sh.put_uri_extension(uri_extension) d.addErrback(self._remove_shareholder, shareid, "put_uri_extension") return d def close_all_shareholders(self): self.log("closing shareholders", level=log.NOISY) self.set_status("Closing Shareholders") self.set_encode_and_push_progress(extra=0.9) dl = [] for shareid in list(self.landlords): d = self.landlords[shareid].close() d.addErrback(self._remove_shareholder, shareid, "close") dl.append(d) return self._gather_responses(dl) def done(self, res): self.log("upload done", level=log.OPERATIONAL) self.set_status("Finished") self.set_encode_and_push_progress(extra=1.0) # done now = time.time() h_and_c_elapsed = now - self._start_hashing_and_close_timestamp self._times["hashes_and_close"] = h_and_c_elapsed total_elapsed = now - self._start_total_timestamp self._times["total_encode_and_push"] = total_elapsed # update our sharemap self._shares_placed = set(self.landlords.keys()) return uri.CHKFileVerifierURI(self._storage_index, self.uri_extension_hash, self.required_shares, self.num_shares, self.file_size) def err(self, f): self.log("upload failed", failure=f, level=log.UNUSUAL) self.set_status("Failed") # we need to abort any remaining shareholders, so they'll delete the # partial share, allowing someone else to upload it again. self.log("aborting shareholders", level=log.UNUSUAL) for shareid in list(self.landlords): self.landlords[shareid].abort() if f.check(defer.FirstError): return f.value.subFailure return f def get_shares_placed(self): # return a set of share numbers that were successfully placed. 
return self._shares_placed def get_times(self): # return a dictionary of encode+push timings return self._times def get_uri_extension_data(self): return self.uri_extension_data def get_uri_extension_hash(self): return self.uri_extension_hash allmydata-tahoe-1.10.2/src/allmydata/immutable/offloaded.py0000644000175000017500000006367312556560070022021 0ustar ramram import os, stat, time, weakref from zope.interface import implements from twisted.internet import defer from foolscap.api import Referenceable, DeadReferenceError, eventually import allmydata # for __full_version__ from allmydata import interfaces, uri from allmydata.storage.server import si_b2a from allmydata.immutable import upload from allmydata.immutable.layout import ReadBucketProxy from allmydata.util.assertutil import precondition from allmydata.util import log, observer, fileutil, hashutil, dictutil class NotEnoughWritersError(Exception): pass class CHKCheckerAndUEBFetcher: """I check to see if a file is already present in the grid. I also fetch the URI Extension Block, which is useful for an uploading client who wants to avoid the work of encryption and encoding. I return False if the file is not completely healthy: i.e. if there are less than 'N' shares present. If the file is completely healthy, I return a tuple of (sharemap, UEB_data, UEB_hash). """ def __init__(self, peer_getter, storage_index, logparent=None): self._peer_getter = peer_getter self._found_shares = set() self._storage_index = storage_index self._sharemap = dictutil.DictOfSets() self._readers = set() self._ueb_hash = None self._ueb_data = None self._logparent = logparent def log(self, *args, **kwargs): if 'facility' not in kwargs: kwargs['facility'] = "tahoe.helper.chk.checkandUEBfetch" if 'parent' not in kwargs: kwargs['parent'] = self._logparent return log.msg(*args, **kwargs) def check(self): d = self._get_all_shareholders(self._storage_index) d.addCallback(self._get_uri_extension) d.addCallback(self._done) return d def _get_all_shareholders(self, storage_index): dl = [] for s in self._peer_getter(storage_index): d = s.get_rref().callRemote("get_buckets", storage_index) d.addCallbacks(self._got_response, self._got_error, callbackArgs=(s,)) dl.append(d) return defer.DeferredList(dl) def _got_response(self, buckets, server): # buckets is a dict: maps shum to an rref of the server who holds it shnums_s = ",".join([str(shnum) for shnum in buckets]) self.log("got_response: [%s] has %d shares (%s)" % (server.get_name(), len(buckets), shnums_s), level=log.NOISY) self._found_shares.update(buckets.keys()) for k in buckets: self._sharemap.add(k, server.get_serverid()) self._readers.update( [ (bucket, server) for bucket in buckets.values() ] ) def _got_error(self, f): if f.check(DeadReferenceError): return log.err(f, parent=self._logparent) pass def _get_uri_extension(self, res): # assume that we can pull the UEB from any share. If we get an error, # declare the whole file unavailable. if not self._readers: self.log("no readers, so no UEB", level=log.NOISY) return b,server = self._readers.pop() rbp = ReadBucketProxy(b, server, si_b2a(self._storage_index)) d = rbp.get_uri_extension() d.addCallback(self._got_uri_extension) d.addErrback(self._ueb_error) return d def _got_uri_extension(self, ueb): self.log("_got_uri_extension", level=log.NOISY) self._ueb_hash = hashutil.uri_extension_hash(ueb) self._ueb_data = uri.unpack_extension(ueb) def _ueb_error(self, f): # an error means the file is unavailable, but the overall check # shouldn't fail. 
self.log("UEB fetch failed", failure=f, level=log.WEIRD, umid="sJLKVg") return None def _done(self, res): if self._ueb_data: found = len(self._found_shares) total = self._ueb_data['total_shares'] self.log(format="got %(found)d shares of %(total)d", found=found, total=total, level=log.NOISY) if found < total: # not all shares are present in the grid self.log("not enough to qualify, file not found in grid", level=log.NOISY) return False # all shares are present self.log("all shares present, file is found in grid", level=log.NOISY) return (self._sharemap, self._ueb_data, self._ueb_hash) # no shares are present self.log("unable to find UEB data, file not found in grid", level=log.NOISY) return False class CHKUploadHelper(Referenceable, upload.CHKUploader): """I am the helper-server -side counterpart to AssistedUploader. I handle peer selection, encoding, and share pushing. I read ciphertext from the remote AssistedUploader. """ implements(interfaces.RICHKUploadHelper) VERSION = { "http://allmydata.org/tahoe/protocols/helper/chk-upload/v1" : { }, "application-version": str(allmydata.__full_version__), } def __init__(self, storage_index, helper, storage_broker, secret_holder, incoming_file, encoding_file, log_number): self._storage_index = storage_index self._helper = helper self._incoming_file = incoming_file self._encoding_file = encoding_file self._upload_id = si_b2a(storage_index)[:5] self._log_number = log_number self._upload_status = upload.UploadStatus() self._upload_status.set_helper(False) self._upload_status.set_storage_index(storage_index) self._upload_status.set_status("fetching ciphertext") self._upload_status.set_progress(0, 1.0) self._helper.log("CHKUploadHelper starting for SI %s" % self._upload_id, parent=log_number) self._storage_broker = storage_broker self._secret_holder = secret_holder self._fetcher = CHKCiphertextFetcher(self, incoming_file, encoding_file, self._log_number) self._reader = LocalCiphertextReader(self, storage_index, encoding_file) self._finished_observers = observer.OneShotObserverList() self._started = time.time() d = self._fetcher.when_done() d.addCallback(lambda res: self._reader.start()) d.addCallback(lambda res: self.start_encrypted(self._reader)) d.addCallback(self._finished) d.addErrback(self._failed) def log(self, *args, **kwargs): if 'facility' not in kwargs: kwargs['facility'] = "tahoe.helper.chk" return upload.CHKUploader.log(self, *args, **kwargs) def remote_get_version(self): return self.VERSION def remote_upload(self, reader): # reader is an RIEncryptedUploadable. I am specified to return an # UploadResults dictionary. # Log how much ciphertext we need to get. self.log("deciding whether to upload the file or not", level=log.NOISY) if os.path.exists(self._encoding_file): # we have the whole file, and we might be encoding it (or the # encode/upload might have failed, and we need to restart it). self.log("ciphertext already in place", level=log.UNUSUAL) elif os.path.exists(self._incoming_file): # we have some of the file, but not all of it (otherwise we'd be # encoding). The caller might be useful. self.log("partial ciphertext already present", level=log.UNUSUAL) else: # we don't remember uploading this file self.log("no ciphertext yet", level=log.NOISY) # let our fetcher pull ciphertext from the reader. 
self._fetcher.add_reader(reader) # and also hashes self._reader.add_reader(reader) # and inform the client when the upload has finished return self._finished_observers.when_fired() def _finished(self, ur): assert interfaces.IUploadResults.providedBy(ur), ur vcapstr = ur.get_verifycapstr() precondition(isinstance(vcapstr, str), vcapstr) v = uri.from_string(vcapstr) f_times = self._fetcher.get_times() hur = upload.HelperUploadResults() hur.timings = {"cumulative_fetch": f_times["cumulative_fetch"], "total_fetch": f_times["total"], } for key,val in ur.get_timings().items(): hur.timings[key] = val hur.uri_extension_hash = v.uri_extension_hash hur.ciphertext_fetched = self._fetcher.get_ciphertext_fetched() hur.preexisting_shares = ur.get_preexisting_shares() # hur.sharemap needs to be {shnum: set(serverid)} hur.sharemap = {} for shnum, servers in ur.get_sharemap().items(): hur.sharemap[shnum] = set([s.get_serverid() for s in servers]) # and hur.servermap needs to be {serverid: set(shnum)} hur.servermap = {} for server, shnums in ur.get_servermap().items(): hur.servermap[server.get_serverid()] = set(shnums) hur.pushed_shares = ur.get_pushed_shares() hur.file_size = ur.get_file_size() hur.uri_extension_data = ur.get_uri_extension_data() hur.verifycapstr = vcapstr self._reader.close() os.unlink(self._encoding_file) self._finished_observers.fire(hur) self._helper.upload_finished(self._storage_index, v.size) del self._reader def _failed(self, f): self.log(format="CHKUploadHelper(%(si)s) failed", si=si_b2a(self._storage_index)[:5], failure=f, level=log.UNUSUAL) self._finished_observers.fire(f) self._helper.upload_finished(self._storage_index, 0) del self._reader class AskUntilSuccessMixin: # create me with a _reader array _last_failure = None def add_reader(self, reader): self._readers.append(reader) def call(self, *args, **kwargs): if not self._readers: raise NotEnoughWritersError("ran out of assisted uploaders, last failure was %s" % self._last_failure) rr = self._readers[0] d = rr.callRemote(*args, **kwargs) def _err(f): self._last_failure = f if rr in self._readers: self._readers.remove(rr) self._upload_helper.log("call to assisted uploader %s failed" % rr, failure=f, level=log.UNUSUAL) # we can try again with someone else who's left return self.call(*args, **kwargs) d.addErrback(_err) return d class CHKCiphertextFetcher(AskUntilSuccessMixin): """I use one or more remote RIEncryptedUploadable instances to gather ciphertext on disk. When I'm done, the file I create can be used by a LocalCiphertextReader to satisfy the ciphertext needs of a CHK upload process. I begin pulling ciphertext as soon as a reader is added. I remove readers when they have any sort of error. If the last reader is removed, I fire my when_done() Deferred with a failure. I fire my when_done() Deferred (with None) immediately after I have moved the ciphertext to 'encoded_file'. 
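    As an illustrative sketch of the sequence (not a normative spec):
    add_reader() schedules _start(), which asks a remote reader for
    get_size(), then repeatedly calls read_encrypted() in chunks of at most
    CHUNK_SIZE bytes and appends the results to 'incoming_file'; once the
    expected size has been fetched, that file is renamed to 'encoded_file'
    and the when_done() Deferred fires.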
""" def __init__(self, helper, incoming_file, encoded_file, logparent): self._upload_helper = helper self._incoming_file = incoming_file self._encoding_file = encoded_file self._upload_id = helper._upload_id self._log_parent = logparent self._done_observers = observer.OneShotObserverList() self._readers = [] self._started = False self._f = None self._times = { "cumulative_fetch": 0.0, "total": 0.0, } self._ciphertext_fetched = 0 def log(self, *args, **kwargs): if "facility" not in kwargs: kwargs["facility"] = "tahoe.helper.chkupload.fetch" if "parent" not in kwargs: kwargs["parent"] = self._log_parent return log.msg(*args, **kwargs) def add_reader(self, reader): AskUntilSuccessMixin.add_reader(self, reader) eventually(self._start) def _start(self): if self._started: return self._started = True started = time.time() if os.path.exists(self._encoding_file): self.log("ciphertext already present, bypassing fetch", level=log.UNUSUAL) d = defer.succeed(None) else: # first, find out how large the file is going to be d = self.call("get_size") d.addCallback(self._got_size) d.addCallback(self._start_reading) d.addCallback(self._done) d.addCallback(self._done2, started) d.addErrback(self._failed) def _got_size(self, size): self.log("total size is %d bytes" % size, level=log.NOISY) self._upload_helper._upload_status.set_size(size) self._expected_size = size def _start_reading(self, res): # then find out how much crypttext we have on disk if os.path.exists(self._incoming_file): self._have = os.stat(self._incoming_file)[stat.ST_SIZE] self._upload_helper._helper.count("chk_upload_helper.resumes") self.log("we already have %d bytes" % self._have, level=log.NOISY) else: self._have = 0 self.log("we do not have any ciphertext yet", level=log.NOISY) self.log("starting ciphertext fetch", level=log.NOISY) self._f = open(self._incoming_file, "ab") # now loop to pull the data from the readers d = defer.Deferred() self._loop(d) # this Deferred will be fired once the last byte has been written to # self._f return d # read data in 50kB chunks. We should choose a more considered number # here, possibly letting the client specify it. The goal should be to # keep the RTT*bandwidth to be less than 10% of the chunk size, to reduce # the upload bandwidth lost because this protocol is non-windowing. Too # large, however, means more memory consumption for both ends. Something # that can be transferred in, say, 10 seconds sounds about right. On my # home DSL line (50kBps upstream), that suggests 500kB. Most lines are # slower, maybe 10kBps, which suggests 100kB, and that's a bit more # memory than I want to hang on to, so I'm going to go with 50kB and see # how that works. CHUNK_SIZE = 50*1024 def _loop(self, fire_when_done): # this slightly weird structure is needed because Deferreds don't do # tail-recursion, so it is important to let each one retire promptly. # Simply chaining them will cause a stack overflow at the end of a # transfer that involves more than a few hundred chunks. # 'fire_when_done' lives a long time, but the Deferreds returned by # the inner _fetch() call do not. 
start = time.time() d = defer.maybeDeferred(self._fetch) def _done(finished): elapsed = time.time() - start self._times["cumulative_fetch"] += elapsed if finished: self.log("finished reading ciphertext", level=log.NOISY) fire_when_done.callback(None) else: self._loop(fire_when_done) def _err(f): self.log(format="[%(si)s] ciphertext read failed", si=self._upload_id, failure=f, level=log.UNUSUAL) fire_when_done.errback(f) d.addCallbacks(_done, _err) return None def _fetch(self): needed = self._expected_size - self._have fetch_size = min(needed, self.CHUNK_SIZE) if fetch_size == 0: self._upload_helper._upload_status.set_progress(1, 1.0) return True # all done percent = 0.0 if self._expected_size: percent = 1.0 * (self._have+fetch_size) / self._expected_size self.log(format="fetching [%(si)s] %(start)d-%(end)d of %(total)d (%(percent)d%%)", si=self._upload_id, start=self._have, end=self._have+fetch_size, total=self._expected_size, percent=int(100.0*percent), level=log.NOISY) d = self.call("read_encrypted", self._have, fetch_size) def _got_data(ciphertext_v): for data in ciphertext_v: self._f.write(data) self._have += len(data) self._ciphertext_fetched += len(data) self._upload_helper._helper.count("chk_upload_helper.fetched_bytes", len(data)) self._upload_helper._upload_status.set_progress(1, percent) return False # not done d.addCallback(_got_data) return d def _done(self, res): self._f.close() self._f = None self.log(format="done fetching ciphertext, size=%(size)d", size=os.stat(self._incoming_file)[stat.ST_SIZE], level=log.NOISY) os.rename(self._incoming_file, self._encoding_file) def _done2(self, _ignored, started): self.log("done2", level=log.NOISY) elapsed = time.time() - started self._times["total"] = elapsed self._readers = [] self._done_observers.fire(None) def _failed(self, f): if self._f: self._f.close() self._readers = [] self._done_observers.fire(f) def when_done(self): return self._done_observers.when_fired() def get_times(self): return self._times def get_ciphertext_fetched(self): return self._ciphertext_fetched class LocalCiphertextReader(AskUntilSuccessMixin): implements(interfaces.IEncryptedUploadable) def __init__(self, upload_helper, storage_index, encoding_file): self._readers = [] self._upload_helper = upload_helper self._storage_index = storage_index self._encoding_file = encoding_file self._status = None def start(self): self._upload_helper._upload_status.set_status("pushing") self._size = os.stat(self._encoding_file)[stat.ST_SIZE] self.f = open(self._encoding_file, "rb") def get_size(self): return defer.succeed(self._size) def get_all_encoding_parameters(self): return self.call("get_all_encoding_parameters") def get_storage_index(self): return defer.succeed(self._storage_index) def read_encrypted(self, length, hash_only): assert hash_only is False d = defer.maybeDeferred(self.f.read, length) d.addCallback(lambda data: [data]) return d def close(self): self.f.close() # ??. I'm not sure if it makes sense to forward the close message. return self.call("close") class Helper(Referenceable): implements(interfaces.RIHelper, interfaces.IStatsProducer) # this is the non-distributed version. When we need to have multiple # helpers, this object will become the HelperCoordinator, and will query # the farm of Helpers to see if anyone has the storage_index of interest, # and send the request off to them. If nobody has it, we'll choose a # helper at random. 
name = "helper" VERSION = { "http://allmydata.org/tahoe/protocols/helper/v1" : { }, "application-version": str(allmydata.__full_version__), } MAX_UPLOAD_STATUSES = 10 def __init__(self, basedir, storage_broker, secret_holder, stats_provider, history): self._basedir = basedir self._storage_broker = storage_broker self._secret_holder = secret_holder self._chk_incoming = os.path.join(basedir, "CHK_incoming") self._chk_encoding = os.path.join(basedir, "CHK_encoding") fileutil.make_dirs(self._chk_incoming) fileutil.make_dirs(self._chk_encoding) self._active_uploads = {} self._all_uploads = weakref.WeakKeyDictionary() # for debugging self.stats_provider = stats_provider if stats_provider: stats_provider.register_producer(self) self._counters = {"chk_upload_helper.upload_requests": 0, "chk_upload_helper.upload_already_present": 0, "chk_upload_helper.upload_need_upload": 0, "chk_upload_helper.resumes": 0, "chk_upload_helper.fetched_bytes": 0, "chk_upload_helper.encoded_bytes": 0, } self._history = history def log(self, *args, **kwargs): if 'facility' not in kwargs: kwargs['facility'] = "tahoe.helper" return log.msg(*args, **kwargs) def count(self, key, value=1): if self.stats_provider: self.stats_provider.count(key, value) self._counters[key] += value def get_stats(self): OLD = 86400*2 # 48hours now = time.time() inc_count = inc_size = inc_size_old = 0 enc_count = enc_size = enc_size_old = 0 inc = os.listdir(self._chk_incoming) enc = os.listdir(self._chk_encoding) for f in inc: s = os.stat(os.path.join(self._chk_incoming, f)) size = s[stat.ST_SIZE] mtime = s[stat.ST_MTIME] inc_count += 1 inc_size += size if now - mtime > OLD: inc_size_old += size for f in enc: s = os.stat(os.path.join(self._chk_encoding, f)) size = s[stat.ST_SIZE] mtime = s[stat.ST_MTIME] enc_count += 1 enc_size += size if now - mtime > OLD: enc_size_old += size stats = { 'chk_upload_helper.active_uploads': len(self._active_uploads), 'chk_upload_helper.incoming_count': inc_count, 'chk_upload_helper.incoming_size': inc_size, 'chk_upload_helper.incoming_size_old': inc_size_old, 'chk_upload_helper.encoding_count': enc_count, 'chk_upload_helper.encoding_size': enc_size, 'chk_upload_helper.encoding_size_old': enc_size_old, } stats.update(self._counters) return stats def remote_get_version(self): return self.VERSION def remote_upload_chk(self, storage_index): self.count("chk_upload_helper.upload_requests") lp = self.log(format="helper: upload_chk query for SI %(si)s", si=si_b2a(storage_index)) if storage_index in self._active_uploads: self.log("upload is currently active", parent=lp) uh = self._active_uploads[storage_index] return (None, uh) d = self._check_chk(storage_index, lp) d.addCallback(self._did_chk_check, storage_index, lp) def _err(f): self.log("error while checking for chk-already-in-grid", failure=f, level=log.WEIRD, parent=lp, umid="jDtxZg") return f d.addErrback(_err) return d def _check_chk(self, storage_index, lp): # see if this file is already in the grid lp2 = self.log("doing a quick check+UEBfetch", parent=lp, level=log.NOISY) sb = self._storage_broker c = CHKCheckerAndUEBFetcher(sb.get_servers_for_psi, storage_index, lp2) d = c.check() def _checked(res): if res: (sharemap, ueb_data, ueb_hash) = res self.log("found file in grid", level=log.NOISY, parent=lp) hur = upload.HelperUploadResults() hur.uri_extension_hash = ueb_hash hur.sharemap = sharemap hur.uri_extension_data = ueb_data hur.preexisting_shares = len(sharemap) hur.pushed_shares = 0 return hur return None d.addCallback(_checked) return d def 
_did_chk_check(self, already_present, storage_index, lp): if already_present: # the necessary results are placed in the UploadResults self.count("chk_upload_helper.upload_already_present") self.log("file already found in grid", parent=lp) return (already_present, None) self.count("chk_upload_helper.upload_need_upload") # the file is not present in the grid, by which we mean there are # less than 'N' shares available. self.log("unable to find file in the grid", parent=lp, level=log.NOISY) # We need an upload helper. Check our active uploads again in # case there was a race. if storage_index in self._active_uploads: self.log("upload is currently active", parent=lp) uh = self._active_uploads[storage_index] else: self.log("creating new upload helper", parent=lp) uh = self._make_chk_upload_helper(storage_index, lp) self._active_uploads[storage_index] = uh self._add_upload(uh) return (None, uh) def _make_chk_upload_helper(self, storage_index, lp): si_s = si_b2a(storage_index) incoming_file = os.path.join(self._chk_incoming, si_s) encoding_file = os.path.join(self._chk_encoding, si_s) uh = CHKUploadHelper(storage_index, self, self._storage_broker, self._secret_holder, incoming_file, encoding_file, lp) return uh def _add_upload(self, uh): self._all_uploads[uh] = None if self._history: s = uh.get_upload_status() self._history.notify_helper_upload(s) def upload_finished(self, storage_index, size): # this is called with size=0 if the upload failed self.count("chk_upload_helper.encoded_bytes", size) uh = self._active_uploads[storage_index] del self._active_uploads[storage_index] s = uh.get_upload_status() s.set_active(False) allmydata-tahoe-1.10.2/src/allmydata/immutable/__init__.py0000644000175000017500000000000012556560070021604 0ustar ramramallmydata-tahoe-1.10.2/src/allmydata/immutable/repairer.py0000644000175000017500000001010612556560070021666 0ustar ramramfrom zope.interface import implements from twisted.internet import defer from allmydata.storage.server import si_b2a from allmydata.util import log, consumer from allmydata.util.assertutil import precondition from allmydata.interfaces import IEncryptedUploadable from allmydata.immutable import upload class Repairer(log.PrefixingLogMixin): implements(IEncryptedUploadable) """I generate any shares which were not available and upload them to servers. Which servers? Well, I just use the normal upload process, so any servers that will take shares. In fact, I even believe servers if they say that they already have shares even if attempts to download those shares would fail because the shares are corrupted. My process of uploading replacement shares proceeds in a segment-wise fashion -- first I ask servers if they can hold the new shares, and wait until enough have agreed then I download the first segment of the file and upload the first block of each replacement share, and only after all those blocks have been uploaded do I download the second segment of the file and upload the second block of each replacement share to its respective server. (I do it this way in order to minimize the amount of downloading I have to do and the amount of memory I have to use at any one time.) If any of the servers to which I am uploading replacement shares fails to accept the blocks during this process, then I just stop using that server, abandon any share-uploads that were going to that server, and proceed to finish uploading the remaining shares to their respective servers. 
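In outline, and ignoring the Deferred plumbing, the segment-wise loop described above looks something like this (a sketch only: download_segment, encode_block, num_segments and replacement_writers are assumed placeholders, not the real helpers)::

    for segnum in range(num_segments):
        segment = download_segment(segnum)          # one segment of ciphertext
        for shnum, writer in replacement_writers.items():
            block = encode_block(segment, shnum)    # zfec block for this share
            writer.put_block(segnum, block)         # IStorageBucketWriter call
        # at most one segment (plus its encoded blocks) is held in memory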
At the end of my work, I produce an object which satisfies the ICheckAndRepairResults interface (by firing the deferred that I returned from start() and passing that check-and-repair-results object). Before I send any new request to a server, I always ask the 'monitor' object that was passed into my constructor whether this task has been cancelled (by invoking its raise_if_cancelled() method). """ def __init__(self, filenode, storage_broker, secret_holder, monitor): logprefix = si_b2a(filenode.get_storage_index())[:5] log.PrefixingLogMixin.__init__(self, "allmydata.immutable.repairer", prefix=logprefix) self._filenode = filenode self._storage_broker = storage_broker self._secret_holder = secret_holder self._monitor = monitor self._offset = 0 def start(self): self.log("starting repair") d = self._filenode.get_segment_size() def _got_segsize(segsize): vcap = self._filenode.get_verify_cap() k = vcap.needed_shares N = vcap.total_shares # Per ticket #1212 # (http://tahoe-lafs.org/trac/tahoe-lafs/ticket/1212) happy = 0 self._encodingparams = (k, happy, N, segsize) ul = upload.CHKUploader(self._storage_broker, self._secret_holder) return ul.start(self) # I am the IEncryptedUploadable d.addCallback(_got_segsize) return d # methods to satisfy the IEncryptedUploader interface # (From the perspective of an uploader I am an IEncryptedUploadable.) def set_upload_status(self, upload_status): self.upload_status = upload_status def get_size(self): size = self._filenode.get_size() assert size is not None return defer.succeed(size) def get_all_encoding_parameters(self): return defer.succeed(self._encodingparams) def read_encrypted(self, length, hash_only): """Returns a deferred which eventually fires with the requested ciphertext, as a list of strings.""" precondition(length) # please don't ask to read 0 bytes mc = consumer.MemoryConsumer() d = self._filenode.read(mc, self._offset, length) self._offset += length d.addCallback(lambda ign: mc.chunks) return d def get_storage_index(self): return self._filenode.get_storage_index() def close(self): pass allmydata-tahoe-1.10.2/src/allmydata/blacklist.py0000644000175000017500000001134412556560070020053 0ustar ramram import os from zope.interface import implements from twisted.internet import defer from twisted.python import log as twisted_log from allmydata.interfaces import IFileNode, IFilesystemNode from allmydata.util import base32 from allmydata.util.encodingutil import quote_output class FileProhibited(Exception): """This client has been configured to prohibit access to this object.""" def __init__(self, reason): Exception.__init__(self, "Access Prohibited: %s" % quote_output(reason, encoding='utf-8', quotemarks=False)) self.reason = reason class Blacklist: def __init__(self, blacklist_fn): self.blacklist_fn = blacklist_fn self.last_mtime = None self.entries = {} self.read_blacklist() # sets .last_mtime and .entries def read_blacklist(self): try: current_mtime = os.stat(self.blacklist_fn).st_mtime except EnvironmentError: # unreadable blacklist file means no blacklist self.entries.clear() return try: if self.last_mtime is None or current_mtime > self.last_mtime: self.entries.clear() for line in open(self.blacklist_fn, "r").readlines(): line = line.strip() if not line or line.startswith("#"): continue si_s, reason = line.split(None, 1) si = base32.a2b(si_s) # must be valid base32 self.entries[si] = reason self.last_mtime = current_mtime except Exception, e: twisted_log.err(e, "unparseable blacklist file") raise def check_storageindex(self, si): 
self.read_blacklist() reason = self.entries.get(si, None) if reason is not None: # log this to logs/twistd.log, since web logs go there too twisted_log.msg("blacklist prohibited access to SI %s: %s" % (base32.b2a(si), reason)) return reason class ProhibitedNode: implements(IFileNode) def __init__(self, wrapped_node, reason): assert IFilesystemNode.providedBy(wrapped_node), wrapped_node self.wrapped_node = wrapped_node self.reason = reason def get_cap(self): return self.wrapped_node.get_cap() def get_readcap(self): return self.wrapped_node.get_readcap() def is_readonly(self): return self.wrapped_node.is_readonly() def is_mutable(self): return self.wrapped_node.is_mutable() def is_unknown(self): return self.wrapped_node.is_unknown() def is_allowed_in_immutable_directory(self): return self.wrapped_node.is_allowed_in_immutable_directory() def is_alleged_immutable(self): return self.wrapped_node.is_alleged_immutable() def raise_error(self): # We don't raise an exception here because that would prevent the node from being listed. pass def get_uri(self): return self.wrapped_node.get_uri() def get_write_uri(self): return self.wrapped_node.get_write_uri() def get_readonly_uri(self): return self.wrapped_node.get_readonly_uri() def get_storage_index(self): return self.wrapped_node.get_storage_index() def get_verify_cap(self): return self.wrapped_node.get_verify_cap() def get_repair_cap(self): return self.wrapped_node.get_repair_cap() def get_size(self): return None def get_current_size(self): return defer.succeed(None) def get_size_of_best_version(self): return defer.succeed(None) def check(self, monitor, verify, add_lease): return defer.succeed(None) def check_and_repair(self, monitor, verify, add_lease): return defer.succeed(None) def get_version(self): return None # Omitting any of these methods would fail safe; they are just to ensure correct error reporting. 
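# For reference, each non-blank, non-comment line of the blacklist file read
# by Blacklist.read_blacklist() above is "<base32 storage index> <reason>".
# A simplified standalone restatement of that per-line parse (illustrative
# sketch only):
from allmydata.util import base32

def parse_blacklist_line(line):
    """Return (si, reason), or None for blank lines and '#' comments."""
    line = line.strip()
    if not line or line.startswith("#"):
        return None
    si_s, reason = line.split(None, 1)      # reason is free-form text
    return (base32.a2b(si_s), reason)       # a2b rejects invalid base32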
def get_best_readable_version(self): raise FileProhibited(self.reason) def download_best_version(self): raise FileProhibited(self.reason) def get_best_mutable_version(self): raise FileProhibited(self.reason) def overwrite(self, new_contents): raise FileProhibited(self.reason) def modify(self, modifier_cb): raise FileProhibited(self.reason) def get_servermap(self, mode): raise FileProhibited(self.reason) def download_version(self, servermap, version): raise FileProhibited(self.reason) def upload(self, new_contents, servermap): raise FileProhibited(self.reason) def get_writekey(self): raise FileProhibited(self.reason) def read(self, consumer, offset=0, size=None): raise FileProhibited(self.reason) allmydata-tahoe-1.10.2/src/allmydata/interfaces.py0000644000175000017500000036420612556560070020236 0ustar ramram from zope.interface import Interface from foolscap.api import StringConstraint, ListOf, TupleOf, SetOf, DictOf, \ ChoiceOf, IntegerConstraint, Any, RemoteInterface, Referenceable HASH_SIZE=32 SALT_SIZE=16 SDMF_VERSION=0 MDMF_VERSION=1 Hash = StringConstraint(maxLength=HASH_SIZE, minLength=HASH_SIZE)# binary format 32-byte SHA256 hash Nodeid = StringConstraint(maxLength=20, minLength=20) # binary format 20-byte SHA1 hash FURL = StringConstraint(1000) StorageIndex = StringConstraint(16) URI = StringConstraint(300) # kind of arbitrary MAX_BUCKETS = 256 # per peer -- zfec offers at most 256 shares per file DEFAULT_MAX_SEGMENT_SIZE = 128*1024 ShareData = StringConstraint(None) URIExtensionData = StringConstraint(1000) Number = IntegerConstraint(8) # 2**(8*8) == 16EiB ~= 18e18 ~= 18 exabytes Offset = Number ReadSize = int # the 'int' constraint is 2**31 == 2Gib -- large files are processed in not-so-large increments WriteEnablerSecret = Hash # used to protect mutable share modifications LeaseRenewSecret = Hash # used to protect lease renewal requests LeaseCancelSecret = Hash # was used to protect lease cancellation requests class RIBucketWriter(RemoteInterface): """ Objects of this kind live on the server side. """ def write(offset=Offset, data=ShareData): return None def close(): """ If the data that has been written is incomplete or inconsistent then the server will throw the data away, else it will store it for future retrieval. """ return None def abort(): """Abandon all the data that has been written. """ return None class RIBucketReader(RemoteInterface): def read(offset=Offset, length=ReadSize): return ShareData def advise_corrupt_share(reason=str): """Clients who discover hash failures in shares that they have downloaded from me will use this method to inform me about the failures. I will record their concern so that my operator can manually inspect the shares in question. I return None. This is a wrapper around RIStorageServer.advise_corrupt_share() that is tied to a specific share, and therefore does not need the extra share-identifying arguments. Please see that method for full documentation. """ TestVector = ListOf(TupleOf(Offset, ReadSize, str, str)) # elements are (offset, length, operator, specimen) # operator is one of "lt, le, eq, ne, ge, gt" # nop always passes and is used to fetch data while writing. # you should use length==len(specimen) for everything except nop DataVector = ListOf(TupleOf(Offset, ShareData)) # (offset, data). 
This limits us to 30 writes of 1MiB each per call TestAndWriteVectorsForShares = DictOf(int, TupleOf(TestVector, DataVector, ChoiceOf(None, Offset), # new_length )) ReadVector = ListOf(TupleOf(Offset, ReadSize)) ReadData = ListOf(ShareData) # returns data[offset:offset+length] for each element of TestVector class RIStorageServer(RemoteInterface): __remote_name__ = "RIStorageServer.tahoe.allmydata.com" def get_version(): """ Return a dictionary of version information. """ return DictOf(str, Any()) def allocate_buckets(storage_index=StorageIndex, renew_secret=LeaseRenewSecret, cancel_secret=LeaseCancelSecret, sharenums=SetOf(int, maxLength=MAX_BUCKETS), allocated_size=Offset, canary=Referenceable): """ @param storage_index: the index of the bucket to be created or increfed. @param sharenums: these are the share numbers (probably between 0 and 99) that the sender is proposing to store on this server. @param renew_secret: This is the secret used to protect bucket refresh This secret is generated by the client and stored for later comparison by the server. Each server is given a different secret. @param cancel_secret: This no longer allows lease cancellation, but must still be a unique value identifying the lease. XXX stop relying on it to be unique. @param canary: If the canary is lost before close(), the bucket is deleted. @return: tuple of (alreadygot, allocated), where alreadygot is what we already have and allocated is what we hereby agree to accept. New leases are added for shares in both lists. """ return TupleOf(SetOf(int, maxLength=MAX_BUCKETS), DictOf(int, RIBucketWriter, maxKeys=MAX_BUCKETS)) def add_lease(storage_index=StorageIndex, renew_secret=LeaseRenewSecret, cancel_secret=LeaseCancelSecret): """ Add a new lease on the given bucket. If the renew_secret matches an existing lease, that lease will be renewed instead. If there is no bucket for the given storage_index, return silently. (note that in tahoe-1.3.0 and earlier, IndexError was raised if there was no bucket) """ return Any() # returns None now, but future versions might change def renew_lease(storage_index=StorageIndex, renew_secret=LeaseRenewSecret): """ Renew the lease on a given bucket, resetting the timer to 31 days. Some networks will use this, some will not. If there is no bucket for the given storage_index, IndexError will be raised. For mutable shares, if the given renew_secret does not match an existing lease, IndexError will be raised with a note listing the server-nodeids on the existing leases, so leases on migrated shares can be renewed. For immutable shares, IndexError (without the note) will be raised. """ return Any() def get_buckets(storage_index=StorageIndex): return DictOf(int, RIBucketReader, maxKeys=MAX_BUCKETS) def slot_readv(storage_index=StorageIndex, shares=ListOf(int), readv=ReadVector): """Read a vector from the numbered shares associated with the given storage index. An empty shares list means to return data from all known shares. Returns a dictionary with one key per share.""" return DictOf(int, ReadData) # shnum -> results def slot_testv_and_readv_and_writev(storage_index=StorageIndex, secrets=TupleOf(WriteEnablerSecret, LeaseRenewSecret, LeaseCancelSecret), tw_vectors=TestAndWriteVectorsForShares, r_vector=ReadVector, ): """ General-purpose test-read-and-set operation for mutable slots: (1) For submitted shnums, compare the test vectors against extant shares, or against an empty share for shnums that do not exist. (2) Use the read vectors to extract "old data" from extant shares. 
(3) If all tests in (1) passed, then apply the write vectors (possibly creating new shares). (4) Return whether the tests passed, and the "old data", which does not include any modifications made by the writes. The operation does not interleave with other operations on the same shareset. This method is, um, large. The goal is to allow clients to update all the shares associated with a mutable file in a single round trip. @param storage_index: the index of the bucket to be created or increfed. @param write_enabler: a secret that is stored along with the slot. Writes are accepted from any caller who can present the matching secret. A different secret should be used for each slot*server pair. @param renew_secret: This is the secret used to protect bucket refresh This secret is generated by the client and stored for later comparison by the server. Each server is given a different secret. @param cancel_secret: This no longer allows lease cancellation, but must still be a unique value identifying the lease. XXX stop relying on it to be unique. The 'secrets' argument is a tuple of (write_enabler, renew_secret, cancel_secret). The first is required to perform any write. The latter two are used when allocating new shares. To simply acquire a new lease on existing shares, use an empty testv and an empty writev. Each share can have a separate test vector (i.e. a list of comparisons to perform). If all vectors for all shares pass, then all writes for all shares are recorded. Each comparison is a 4-tuple of (offset, length, operator, specimen), which effectively does a bool( (read(offset, length)) OPERATOR specimen ) and only performs the write if all these evaluate to True. Basic test-and-set uses 'eq'. Write-if-newer uses a seqnum and (offset, length, 'lt', specimen). Write-if-same-or-newer uses 'le'. Reads from the end of the container are truncated, and missing shares behave like empty ones, so to assert that a share doesn't exist (for use when creating a new share), use (0, 1, 'eq', ''). The write vector will be applied to the given share, expanding it if necessary. A write vector applied to a share number that did not exist previously will cause that share to be created. Write vectors must not overlap (if they do, this will either cause an error or apply them in an unspecified order). Duplicate write vectors, with the same offset and data, are currently tolerated but are not desirable. In Tahoe-LAFS v1.8.3 or later (except 1.9.0a1), if you send a write vector whose offset is beyond the end of the current data, the space between the end of the current data and the beginning of the write vector will be filled with zero bytes. In earlier versions the contents of this space was unspecified (and might end up containing secrets). Storage servers with the new zero-filling behavior will advertise a true value for the 'fills-holes-with-zero-bytes' key (under 'http://allmydata.org/tahoe/protocols/storage/v1') in their version information. Each write vector is accompanied by a 'new_length' argument, which can be used to truncate the data. If new_length is not None and it is less than the current size of the data (after applying all write vectors), then the data will be truncated to new_length. If new_length==0, the share will be deleted. In Tahoe-LAFS v1.8.2 and earlier, new_length could also be used to enlarge the file by sending a number larger than the size of the data after applying all write vectors. 
That behavior was not used, and as of Tahoe-LAFS v1.8.3 it no longer works and the new_length is ignored in that case. If a storage client knows that the server supports zero-filling, for example from the 'fills-holes-with-zero-bytes' key in its version information, it can extend the file efficiently by writing a single zero byte just before the new end-of-file. Otherwise it must explicitly write zeroes to all bytes between the old and new end-of-file. In any case it should avoid sending new_length larger than the size of the data after applying all write vectors. The read vector is used to extract data from all known shares, *before* any writes have been applied. The same read vector is used for all shares. This captures the state that was tested by the test vector, for extant shares. This method returns two values: a boolean and a dict. The boolean is True if the write vectors were applied, False if not. The dict is keyed by share number, and each value contains a list of strings, one for each element of the read vector. If the write_enabler is wrong, this will raise BadWriteEnablerError. To enable share migration (using update_write_enabler), the exception will have the nodeid used for the old write enabler embedded in it, in the following string:: The write enabler was recorded by nodeid '%s'. Note that the nodeid here is encoded using the same base32 encoding used by Foolscap and allmydata.util.idlib.nodeid_b2a(). """ return TupleOf(bool, DictOf(int, ReadData)) def advise_corrupt_share(share_type=str, storage_index=StorageIndex, shnum=int, reason=str): """Clients who discover hash failures in shares that they have downloaded from me will use this method to inform me about the failures. I will record their concern so that my operator can manually inspect the shares in question. I return None. 'share_type' is either 'mutable' or 'immutable'. 'storage_index' is a (binary) storage index string, and 'shnum' is the integer share number. 'reason' is a human-readable explanation of the problem, probably including some expected hash values and the computed ones that did not match. Corruption advisories for mutable shares should include a hash of the public key (the same value that appears in the mutable-file verify-cap), since the current share format does not store that on disk. """ class IStorageBucketWriter(Interface): """ Objects of this kind live on the client side. """ def put_block(segmentnum, data): """ @param segmentnum=int @param data=ShareData: For most segments, this data will be 'blocksize' bytes in length. The last segment might be shorter. @return: a Deferred that fires (with None) when the operation completes """ def put_crypttext_hashes(hashes): """ @param hashes=ListOf(Hash) @return: a Deferred that fires (with None) when the operation completes """ def put_block_hashes(blockhashes): """ @param blockhashes=ListOf(Hash) @return: a Deferred that fires (with None) when the operation completes """ def put_share_hashes(sharehashes): """ @param sharehashes=ListOf(TupleOf(int, Hash)) @return: a Deferred that fires (with None) when the operation completes """ def put_uri_extension(data): """This block of data contains integrity-checking information (hashes of plaintext, crypttext, and shares), as well as encoding parameters that are necessary to recover the data. This is a serialized dict mapping strings to other strings. The hash of this data is kept in the URI and verified before any of the data is used. All buckets for a given file contain identical copies of this data. 
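As a concrete illustration of the test-and-write vectors described above for slot_testv_and_readv_and_writev() (a sketch only: 'rref' is assumed to be an RIStorageServer reference already in scope, 'storage_index' and 'secrets' are assumed to be in scope as well, and the data is made up)::

    # write 'DATA' into share 0 only if that share does not exist yet,
    # leaving the container length alone (new_length=None):
    testv = [(0, 1, 'eq', '')]          # "reads as empty" => share is absent
    writev = [(0, 'DATA')]              # (offset, data)
    tw_vectors = {0: (testv, writev, None)}
    readv = [(0, 4)]                    # also read back bytes 0..3, pre-write
    d = rref.callRemote("slot_testv_and_readv_and_writev",
                        storage_index, secrets, tw_vectors, readv)
    # fires with (wrote, read_data): 'wrote' is True iff every test vector
    # passed, and 'read_data' maps each extant share number to a list of
    # strings, one per element of readv, captured before the write.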
The serialization format is specified with the following pseudocode: for k in sorted(dict.keys()): assert re.match(r'^[a-zA-Z_\-]+$', k) write(k + ':' + netstring(dict[k])) @param data=URIExtensionData @return: a Deferred that fires (with None) when the operation completes """ def close(): """Finish writing and close the bucket. The share is not finalized until this method is called: if the uploading client disconnects before calling close(), the partially-written share will be discarded. @return: a Deferred that fires (with None) when the operation completes """ class IStorageBucketReader(Interface): def get_block_data(blocknum, blocksize, size): """Most blocks will be the same size. The last block might be shorter than the others. @param blocknum=int @param blocksize=int @param size=int @return: ShareData """ def get_crypttext_hashes(): """ @return: ListOf(Hash) """ def get_block_hashes(at_least_these=()): """ @param at_least_these=SetOf(int) @return: ListOf(Hash) """ def get_share_hashes(): """ @return: ListOf(TupleOf(int, Hash)) """ def get_uri_extension(): """ @return: URIExtensionData """ class IStorageBroker(Interface): def get_servers_for_psi(peer_selection_index): """ @return: list of IServer instances """ def get_connected_servers(): """ @return: frozenset of connected IServer instances """ def get_known_servers(): """ @return: frozenset of IServer instances """ def get_all_serverids(): """ @return: frozenset of serverid strings """ def get_nickname_for_serverid(serverid): """ @return: unicode nickname, or None """ # methods moved from IntroducerClient, need review def get_all_connections(): """Return a frozenset of (nodeid, service_name, rref) tuples, one for each active connection we've established to a remote service. This is mostly useful for unit tests that need to wait until a certain number of connections have been made.""" def get_all_connectors(): """Return a dict that maps from (nodeid, service_name) to a RemoteServiceConnector instance for all services that we are actively trying to connect to. Each RemoteServiceConnector has the following public attributes:: service_name: the type of service provided, like 'storage' announcement_time: when we first heard about this service last_connect_time: when we last established a connection last_loss_time: when we last lost a connection version: the peer's version, from the most recent connection oldest_supported: the peer's oldest supported version, same rref: the RemoteReference, if connected, otherwise None remote_host: the IAddress, if connected, otherwise None This method is intended for monitoring interfaces, such as a web page that describes connecting and connected peers. """ def get_all_peerids(): """Return a frozenset of all peerids to whom we have a connection (to one or more services) established. Mostly useful for unit tests.""" def get_all_connections_for(service_name): """Return a frozenset of (nodeid, service_name, rref) tuples, one for each active connection that provides the given SERVICE_NAME.""" def get_permuted_peers(service_name, key): """Returns an ordered list of (peerid, rref) tuples, selecting from the connections that provide SERVICE_NAME, using a hash-based permutation keyed by KEY. This randomizes the service list in a repeatable way, to distribute load over many peers. 
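Returning to put_uri_extension() above, its serialization pseudocode corresponds to roughly the following (a sketch; it assumes the conventional '<length>:<bytes>,' netstring encoding, which is not spelled out here)::

    import re

    def netstring(s):
        return "%d:%s," % (len(s), s)

    def serialize_uri_extension(d):
        out = []
        for k in sorted(d.keys()):
            assert re.match(r'^[a-zA-Z_\-]+$', k)
            out.append(k + ':' + netstring(d[k]))
        return ''.join(out)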
""" class IDisplayableServer(Interface): def get_nickname(): pass def get_name(): pass def get_longname(): pass class IServer(IDisplayableServer): """I live in the client, and represent a single server.""" def start_connecting(tub, trigger_cb): pass def get_rref(): """Once a server is connected, I return a RemoteReference. Before a server is connected for the first time, I return None. Note that the rref I return will start producing DeadReferenceErrors once the connection is lost. """ class IMutableSlotWriter(Interface): """ The interface for a writer around a mutable slot on a remote server. """ def set_checkstring(seqnum_or_checkstring, root_hash=None, salt=None): """ Set the checkstring that I will pass to the remote server when writing. @param checkstring A packed checkstring to use. Note that implementations can differ in which semantics they wish to support for set_checkstring -- they can, for example, build the checkstring themselves from its constituents, or some other thing. """ def get_checkstring(): """ Get the checkstring that I think currently exists on the remote server. """ def put_block(data, segnum, salt): """ Add a block and salt to the share. """ def put_encprivkey(encprivkey): """ Add the encrypted private key to the share. """ def put_blockhashes(blockhashes): """ @param blockhashes=list Add the block hash tree to the share. """ def put_sharehashes(sharehashes): """ @param sharehashes=dict Add the share hash chain to the share. """ def get_signable(): """ Return the part of the share that needs to be signed. """ def put_signature(signature): """ Add the signature to the share. """ def put_verification_key(verification_key): """ Add the verification key to the share. """ def finish_publishing(): """ Do anything necessary to finish writing the share to a remote server. I require that no further publishing needs to take place after this method has been called. """ class IURI(Interface): def init_from_string(uri): """Accept a string (as created by my to_string() method) and populate this instance with its data. I am not normally called directly, please use the module-level uri.from_string() function to convert arbitrary URI strings into IURI-providing instances.""" def is_readonly(): """Return False if this URI be used to modify the data. Return True if this URI cannot be used to modify the data.""" def is_mutable(): """Return True if the data can be modified by *somebody* (perhaps someone who has a more powerful URI than this one).""" # TODO: rename to get_read_cap() def get_readonly(): """Return another IURI instance that represents a read-only form of this one. If is_readonly() is True, this returns self.""" def get_verify_cap(): """Return an instance that provides IVerifierURI, which can be used to check on the availability of the file or directory, without providing enough capabilities to actually read or modify the contents. This may return None if the file does not need checking or verification (e.g. LIT URIs). """ def to_string(): """Return a string of printable ASCII characters, suitable for passing into init_from_string.""" class IVerifierURI(Interface, IURI): def init_from_string(uri): """Accept a string (as created by my to_string() method) and populate this instance with its data. 
I am not normally called directly, please use the module-level uri.from_string() function to convert arbitrary URI strings into IURI-providing instances.""" def to_string(): """Return a string of printable ASCII characters, suitable for passing into init_from_string.""" class IDirnodeURI(Interface): """I am a URI that represents a dirnode.""" class IFileURI(Interface): """I am a URI that represents a filenode.""" def get_size(): """Return the length (in bytes) of the file that I represent.""" class IImmutableFileURI(IFileURI): pass class IMutableFileURI(Interface): pass class IDirectoryURI(Interface): pass class IReadonlyDirectoryURI(Interface): pass class CapConstraintError(Exception): """A constraint on a cap was violated.""" class MustBeDeepImmutableError(CapConstraintError): """Mutable children cannot be added to an immutable directory. Also, caps obtained from an immutable directory can trigger this error if they are later found to refer to a mutable object and then used.""" class MustBeReadonlyError(CapConstraintError): """Known write caps cannot be specified in a ro_uri field. Also, caps obtained from a ro_uri field can trigger this error if they are later found to be write caps and then used.""" class MustNotBeUnknownRWError(CapConstraintError): """Cannot add an unknown child cap specified in a rw_uri field.""" class IReadable(Interface): """I represent a readable object -- either an immutable file, or a specific version of a mutable file. """ def is_readonly(): """Return True if this reference provides mutable access to the given file or directory (i.e. if you can modify it), or False if not. Note that even if this reference is read-only, someone else may hold a read-write reference to it. For an IReadable returned by get_best_readable_version(), this will always return True, but for instances of subinterfaces such as IMutableFileVersion, it may return False.""" def is_mutable(): """Return True if this file or directory is mutable (by *somebody*, not necessarily you), False if it is is immutable. Note that a file might be mutable overall, but your reference to it might be read-only. On the other hand, all references to an immutable file will be read-only; there are no read-write references to an immutable file.""" def get_storage_index(): """Return the storage index of the file.""" def get_size(): """Return the length (in bytes) of this readable object.""" def download_to_data(): """Download all of the file contents. I return a Deferred that fires with the contents as a byte string.""" def read(consumer, offset=0, size=None): """Download a portion (possibly all) of the file's contents, making them available to the given IConsumer. Return a Deferred that fires (with the consumer) when the consumer is unregistered (either because the last byte has been given to it, or because the consumer threw an exception during write(), possibly because it no longer wants to receive data). The portion downloaded will start at 'offset' and contain 'size' bytes (or the remainder of the file if size==None). It is an error to read beyond the end of the file: callers must use get_size() and clip any non-default offset= and size= parameters. It is permissible to read zero bytes. The consumer will be used in non-streaming mode: an IPullProducer will be attached to it. The consumer will not receive data right away: several network trips must occur first. 
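For example, a caller that wants a bounded window of the file and must not read past the end can clip its parameters against get_size() first (a sketch; it assumes get_size() here returns the integer length directly, as the description above implies)::

    def read_window(readable, consumer, offset, size):
        filesize = readable.get_size()
        offset = min(offset, filesize)
        size = min(size, filesize - offset)   # reading zero bytes is permissible
        return readable.read(consumer, offset, size)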
The order of events will be:: consumer.registerProducer(p, streaming) (if streaming == False):: consumer does p.resumeProducing() consumer.write(data) consumer does p.resumeProducing() consumer.write(data).. (repeat until all data is written) consumer.unregisterProducer() deferred.callback(consumer) If a download error occurs, or an exception is raised by consumer.registerProducer() or consumer.write(), I will call consumer.unregisterProducer() and then deliver the exception via deferred.errback(). To cancel the download, the consumer should call p.stopProducing(), which will result in an exception being delivered via deferred.errback(). See src/allmydata/util/consumer.py for an example of a simple download-to-memory consumer. """ class IWriteable(Interface): """ I define methods that callers can use to update SDMF and MDMF mutable files on a Tahoe-LAFS grid. """ # XXX: For the moment, we have only this. It is possible that we # want to move overwrite() and modify() in here too. def update(data, offset): """ I write the data from my data argument to the MDMF file, starting at offset. I continue writing data until my data argument is exhausted, appending data to the file as necessary. """ # assert IMutableUploadable.providedBy(data) # to append data: offset=node.get_size_of_best_version() # do we want to support compacting MDMF? # for an MDMF file, this can be done with O(data.get_size()) # memory. For an SDMF file, any modification takes # O(node.get_size_of_best_version()). class IMutableFileVersion(IReadable): """I provide access to a particular version of a mutable file. The access is read/write if I was obtained from a filenode derived from a write cap, or read-only if the filenode was derived from a read cap. """ def get_sequence_number(): """Return the sequence number of this version.""" def get_servermap(): """Return the IMutableFileServerMap instance that was used to create this object. """ def get_writekey(): """Return this filenode's writekey, or None if the node does not have write-capability. This may be used to assist with data structures that need to make certain data available only to writers, such as the read-write child caps in dirnodes. The recommended process is to have reader-visible data be submitted to the filenode in the clear (where it will be encrypted by the filenode using the readkey), but encrypt writer-visible data using this writekey. """ def overwrite(new_contents): """Replace the contents of the mutable file, provided that no other node has published (or is attempting to publish, concurrently) a newer version of the file than this one. I will avoid modifying any share that is different than the version given by get_sequence_number(). However, if another node is writing to the file at the same time as me, I may manage to update some shares while they update others. If I see any evidence of this, I will signal UncoordinatedWriteError, and the file will be left in an inconsistent state (possibly the version you provided, possibly the old version, possibly somebody else's version, and possibly a mix of shares from all of these). The recommended response to UncoordinatedWriteError is to either return it to the caller (since they failed to coordinate their writes), or to attempt some sort of recovery. It may be sufficient to wait a random interval (with exponential backoff) and repeat your operation. If I do not signal UncoordinatedWriteError, then I was able to write the new version without incident. 
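Tying the IReadable.read() description back to something concrete, a minimal download-to-memory consumer in the spirit of the one in src/allmydata/util/consumer.py might look like this (simplified sketch, non-streaming mode only; the class name is made up)::

    from zope.interface import implements
    from twisted.internet.interfaces import IConsumer

    class ChunkCollector:
        implements(IConsumer)
        def __init__(self):
            self.chunks = []
            self.done = False
        def registerProducer(self, producer, streaming):
            self.producer = producer
            if not streaming:
                # IPullProducer: keep asking until unregisterProducer()
                while not self.done:
                    producer.resumeProducing()
        def write(self, data):
            self.chunks.append(data)
        def unregisterProducer(self):
            self.done = True

    # usage: read() fires its Deferred with the consumer, whose .chunks
    # then holds the downloaded bytes in order:
    #   d = readable.read(ChunkCollector())
    #   d.addCallback(lambda c: "".join(c.chunks))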
I return a Deferred that fires (with a PublishStatus object) when the update has completed. """ def modify(modifier_cb): """Modify the contents of the file, by downloading this version, applying the modifier function (or bound method), then uploading the new version. This will succeed as long as no other node publishes a version between the download and the upload. I return a Deferred that fires (with a PublishStatus object) when the update is complete. The modifier callable will be given three arguments: a string (with the old contents), a 'first_time' boolean, and a servermap. As with download_to_data(), the old contents will be from this version, but the modifier can use the servermap to make other decisions (such as refusing to apply the delta if there are multiple parallel versions, or if there is evidence of a newer unrecoverable version). 'first_time' will be True the first time the modifier is called, and False on any subsequent calls. The callable should return a string with the new contents. The callable must be prepared to be called multiple times, and must examine the input string to see if the change that it wants to make is already present in the old version. If it does not need to make any changes, it can either return None, or return its input string. If the modifier raises an exception, it will be returned in the errback. """ # The hierarchy looks like this: # IFilesystemNode # IFileNode # IMutableFileNode # IImmutableFileNode # IDirectoryNode class IFilesystemNode(Interface): def get_cap(): """Return the strongest 'cap instance' associated with this node. (writecap for writeable-mutable files/directories, readcap for immutable or readonly-mutable files/directories). To convert this into a string, call .to_string() on the result.""" def get_readcap(): """Return a readonly cap instance for this node. For immutable or readonly nodes, get_cap() and get_readcap() return the same thing.""" def get_repair_cap(): """Return an IURI instance that can be used to repair the file, or None if this node cannot be repaired (either because it is not distributed, like a LIT file, or because the node does not represent sufficient authority to create a repair-cap, like a read-only RSA mutable file node [which cannot create the correct write-enablers]). """ def get_verify_cap(): """Return an IVerifierURI instance that represents the 'verifiy/refresh capability' for this node. The holder of this capability will be able to renew the lease for this node, protecting it from garbage-collection. They will also be able to ask a server if it holds a share for the file or directory. """ def get_uri(): """Return the URI string corresponding to the strongest cap associated with this node. If this node is read-only, the URI will only offer read-only access. If this node is read-write, the URI will offer read-write access. If you have read-write access to a node and wish to share merely read-only access with others, use get_readonly_uri(). """ def get_write_uri(): """Return the URI string that can be used by others to get write access to this node, if it is writeable. If this is a read-only node, return None.""" def get_readonly_uri(): """Return the URI string that can be used by others to get read-only access to this node. The result is a read-only URI, regardless of whether this node is read-only or read-write. If you have merely read-only access to this node, get_readonly_uri() will return the same thing as get_uri(). 
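To make the modifier contract above concrete: a well-behaved modifier recomputes its change from the old contents on every call and returns None when there is nothing left to do (illustrative sketch; the marker payload and 'mutable_version' are made up)::

    def _append_marker(old_contents, first_time, servermap):
        marker = "### processed\n"
        if old_contents.endswith(marker):
            return None                 # change already present: no-op
        # may be invoked again if a concurrent writer forces a retry,
        # so derive the new contents from old_contents each time
        return old_contents + marker

    d = mutable_version.modify(_append_marker)
    # fires with a PublishStatus object once the new version is published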
""" def get_storage_index(): """Return a string with the (binary) storage index in use on this download. This may be None if there is no storage index (i.e. LIT files and directories).""" def is_readonly(): """Return True if this reference provides mutable access to the given file or directory (i.e. if you can modify it), or False if not. Note that even if this reference is read-only, someone else may hold a read-write reference to it.""" def is_mutable(): """Return True if this file or directory is mutable (by *somebody*, not necessarily you), False if it is is immutable. Note that a file might be mutable overall, but your reference to it might be read-only. On the other hand, all references to an immutable file will be read-only; there are no read-write references to an immutable file. """ def is_unknown(): """Return True if this is an unknown node.""" def is_allowed_in_immutable_directory(): """Return True if this node is allowed as a child of a deep-immutable directory. This is true if either the node is of a known-immutable type, or it is unknown and read-only. """ def raise_error(): """Raise any error associated with this node.""" # XXX: These may not be appropriate outside the context of an IReadable. def get_size(): """Return the length (in bytes) of the data this node represents. For directory nodes, I return the size of the backing store. I return synchronously and do not consult the network, so for mutable objects, I will return the most recently observed size for the object, or None if I don't remember a size. Use get_current_size, which returns a Deferred, if you want more up-to-date information.""" def get_current_size(): """I return a Deferred that fires with the length (in bytes) of the data this node represents. """ class IFileNode(IFilesystemNode): """I am a node that represents a file: a sequence of bytes. I am not a container, like IDirectoryNode.""" def get_best_readable_version(): """Return a Deferred that fires with an IReadable for the 'best' available version of the file. The IReadable provides only read access, even if this filenode was derived from a write cap. For an immutable file, there is only one version. For a mutable file, the 'best' version is the recoverable version with the highest sequence number. If no uncoordinated writes have occurred, and if enough shares are available, then this will be the most recent version that has been uploaded. If no version is recoverable, the Deferred will errback with an UnrecoverableFileError. """ def download_best_version(): """Download the contents of the version that would be returned by get_best_readable_version(). This is equivalent to calling download_to_data() on the IReadable given by that method. I return a Deferred that fires with a byte string when the file has been fully downloaded. To support streaming download, use the 'read' method of IReadable. If no version is recoverable, the Deferred will errback with an UnrecoverableFileError. """ def get_size_of_best_version(): """Find the size of the version that would be returned by get_best_readable_version(). I return a Deferred that fires with an integer. If no version is recoverable, the Deferred will errback with an UnrecoverableFileError. """ class IImmutableFileNode(IFileNode, IReadable): """I am a node representing an immutable file. Immutable files have only one version""" class IMutableFileNode(IFileNode): """I provide access to a 'mutable file', which retains its identity regardless of what contents are put in it. 
The consistency-vs-availability problem means that there might be multiple versions of a file present in the grid, some of which might be unrecoverable (i.e. have fewer than 'k' shares). These versions are loosely ordered: each has a sequence number and a hash, and any version with seqnum=N was uploaded by a node that has seen at least one version with seqnum=N-1. The 'servermap' (an instance of IMutableFileServerMap) is used to describe the versions that are known to be present in the grid, and which servers are hosting their shares. It is used to represent the 'state of the world', and is used for this purpose by my test-and-set operations. Downloading the contents of the mutable file will also return a servermap. Uploading a new version into the mutable file requires a servermap as input, and the semantics of the replace operation is 'replace the file with my new version if it looks like nobody else has changed the file since my previous download'. Because the file is distributed, this is not a perfect test-and-set operation, but it will do its best. If the replace process sees evidence of a simultaneous write, it will signal an UncoordinatedWriteError, so that the caller can take corrective action. Most readers will want to use the 'best' current version of the file, and should use my 'download_best_version()' method. To unconditionally replace the file, callers should use overwrite(). This is the mode that user-visible mutable files will probably use. To apply some delta to the file, call modify() with a callable modifier function that can apply the modification that you want to make. This is the mode that dirnodes will use, since most directory modification operations can be expressed in terms of deltas to the directory state. Three methods are available for users who need to perform more complex operations. The first is get_servermap(), which returns an up-to-date servermap using a specified mode. The second is download_version(), which downloads a specific version (not necessarily the 'best' one). The third is 'upload', which accepts new contents and a servermap (which must have been updated with MODE_WRITE). The upload method will attempt to apply the new contents as long as no other node has modified the file since the servermap was updated. This might be useful to a caller who wants to merge multiple versions into a single new one. Note that each time the servermap is updated, a specific 'mode' is used, which determines how many peers are queried. To use a servermap for my replace() method, that servermap must have been updated in MODE_WRITE. These modes are defined in allmydata.mutable.common, and consist of MODE_READ, MODE_WRITE, MODE_ANYTHING, and MODE_CHECK. Please look in allmydata/mutable/servermap.py for details about the differences. Mutable files are currently limited in size (about 3.5MB max) and can only be retrieved and updated all-at-once, as a single big string. Future versions of our mutable files will remove this restriction. """ def get_best_mutable_version(): """Return a Deferred that fires with an IMutableFileVersion for the 'best' available version of the file. The best version is the recoverable version with the highest sequence number. If no uncoordinated writes have occurred, and if enough shares are available, then this will be the most recent version that has been uploaded. If no version is recoverable, the Deferred will errback with an UnrecoverableFileError. 
""" def overwrite(new_contents): """Unconditionally replace the contents of the mutable file with new ones. This simply chains get_servermap(MODE_WRITE) and upload(). This is only appropriate to use when the new contents of the file are completely unrelated to the old ones, and you do not care about other clients' changes. I return a Deferred that fires (with a PublishStatus object) when the update has completed. """ def modify(modifier_cb): """Modify the contents of the file, by downloading the current version, applying the modifier function (or bound method), then uploading the new version. I return a Deferred that fires (with a PublishStatus object) when the update is complete. The modifier callable will be given three arguments: a string (with the old contents), a 'first_time' boolean, and a servermap. As with download_best_version(), the old contents will be from the best recoverable version, but the modifier can use the servermap to make other decisions (such as refusing to apply the delta if there are multiple parallel versions, or if there is evidence of a newer unrecoverable version). 'first_time' will be True the first time the modifier is called, and False on any subsequent calls. The callable should return a string with the new contents. The callable must be prepared to be called multiple times, and must examine the input string to see if the change that it wants to make is already present in the old version. If it does not need to make any changes, it can either return None, or return its input string. If the modifier raises an exception, it will be returned in the errback. """ def get_servermap(mode): """Return a Deferred that fires with an IMutableFileServerMap instance, updated using the given mode. """ def download_version(servermap, version): """Download a specific version of the file, using the servermap as a guide to where the shares are located. I return a Deferred that fires with the requested contents, or errbacks with UnrecoverableFileError. Note that a servermap that was updated with MODE_ANYTHING or MODE_READ may not know about shares for all versions (those modes stop querying servers as soon as they can fulfil their goals), so you may want to use MODE_CHECK (which checks everything) to get increased visibility. """ def upload(new_contents, servermap): """Replace the contents of the file with new ones. This requires a servermap that was previously updated with MODE_WRITE. I attempt to provide test-and-set semantics, in that I will avoid modifying any share that is different than the version I saw in the servermap. However, if another node is writing to the file at the same time as me, I may manage to update some shares while they update others. If I see any evidence of this, I will signal UncoordinatedWriteError, and the file will be left in an inconsistent state (possibly the version you provided, possibly the old version, possibly somebody else's version, and possibly a mix of shares from all of these). The recommended response to UncoordinatedWriteError is to either return it to the caller (since they failed to coordinate their writes), or to attempt some sort of recovery. It may be sufficient to wait a random interval (with exponential backoff) and repeat your operation. If I do not signal UncoordinatedWriteError, then I was able to write the new version without incident. I return a Deferred that fires (with a PublishStatus object) when the publish has completed. I will update the servermap in-place with the location of all new shares. 
""" def get_writekey(): """Return this filenode's writekey, or None if the node does not have write-capability. This may be used to assist with data structures that need to make certain data available only to writers, such as the read-write child caps in dirnodes. The recommended process is to have reader-visible data be submitted to the filenode in the clear (where it will be encrypted by the filenode using the readkey), but encrypt writer-visible data using this writekey. """ def get_version(): """Returns the mutable file protocol version.""" class NotEnoughSharesError(Exception): """Download was unable to get enough shares""" class NoSharesError(Exception): """Download was unable to get any shares at all.""" class DownloadStopped(Exception): pass class UploadUnhappinessError(Exception): """Upload was unable to satisfy 'servers_of_happiness'""" class UnableToFetchCriticalDownloadDataError(Exception): """I was unable to fetch some piece of critical data that is supposed to be identically present in all shares.""" class NoServersError(Exception): """Upload wasn't given any servers to work with, usually indicating a network or Introducer problem.""" class ExistingChildError(Exception): """A directory node was asked to add or replace a child that already exists, and overwrite= was set to False.""" class NoSuchChildError(Exception): """A directory node was asked to fetch a child that does not exist.""" def __str__(self): # avoid UnicodeEncodeErrors when converting to str return self.__repr__() class ChildOfWrongTypeError(Exception): """An operation was attempted on a child of the wrong type (file or directory).""" class IDirectoryNode(IFilesystemNode): """I represent a filesystem node that is a container, with a name-to-child mapping, holding the tahoe equivalent of a directory. All child names are unicode strings, and all children are some sort of IFilesystemNode (a file, subdirectory, or unknown node). """ def get_uri(): """ The dirnode ('1') URI returned by this method can be used in set_uri() on a different directory ('2') to 'mount' a reference to this directory ('1') under the other ('2'). This URI is just a string, so it can be passed around through email or other out-of-band protocol. """ def get_readonly_uri(): """ The dirnode ('1') URI returned by this method can be used in set_uri() on a different directory ('2') to 'mount' a reference to this directory ('1') under the other ('2'). This URI is just a string, so it can be passed around through email or other out-of-band protocol. """ def list(): """I return a Deferred that fires with a dictionary mapping child name (a unicode string) to (node, metadata_dict) tuples, in which 'node' is an IFilesystemNode and 'metadata_dict' is a dictionary of metadata.""" def has_child(name): """I return a Deferred that fires with a boolean, True if there exists a child of the given name, False if not. The child name must be a unicode string.""" def get(name): """I return a Deferred that fires with a specific named child node, which is an IFilesystemNode. The child name must be a unicode string. I raise NoSuchChildError if I do not have a child by that name.""" def get_metadata_for(name): """I return a Deferred that fires with the metadata dictionary for a specific named child node. The child name must be a unicode string. This metadata is stored in the *edge*, not in the child, so it is attached to the parent dirnode rather than the child node. 
I raise NoSuchChildError if I do not have a child by that name.""" def set_metadata_for(name, metadata): """I replace any existing metadata for the named child with the new metadata. The child name must be a unicode string. This metadata is stored in the *edge*, not in the child, so it is attached to the parent dirnode rather than the child node. I return a Deferred (that fires with this dirnode) when the operation is complete. I raise NoSuchChildError if I do not have a child by that name.""" def get_child_at_path(path): """Transform a child path into an IFilesystemNode. I perform a recursive series of 'get' operations to find the named descendant node. I return a Deferred that fires with the node, or errbacks with NoSuchChildError if the node could not be found. The path can be either a single string (slash-separated) or a list of path-name elements. All elements must be unicode strings. """ def get_child_and_metadata_at_path(path): """Transform a child path into an IFilesystemNode and metadata. I am like get_child_at_path(), but my Deferred fires with a tuple of (node, metadata). The metadata comes from the last edge. If the path is empty, the metadata will be an empty dictionary. """ def set_uri(name, writecap, readcap=None, metadata=None, overwrite=True): """I add a child (by writecap+readcap) at the specific name. I return a Deferred that fires when the operation finishes. If overwrite= is True, I will replace any existing child of the same name, otherwise an existing child will cause me to return ExistingChildError. The child name must be a unicode string. The child caps could be for a file, or for a directory. If you have both the writecap and readcap, you should provide both arguments. If you have only one cap and don't know whether it is read-only, provide it as the writecap argument and leave the readcap as None. If you have only one cap that is known to be read-only, provide it as the readcap argument and leave the writecap as None. The filecaps are typically obtained from an IFilesystemNode with get_uri() and get_readonly_uri(). If metadata= is provided, I will use it as the metadata for the named edge. This will replace any existing metadata. If metadata= is left as the default value of None, I will set ['mtime'] to the current time, and I will set ['ctime'] to the current time if there was not already a child by this name present. This roughly matches the ctime/mtime semantics of traditional filesystems. See the "About the metadata" section of webapi.txt for further information. If this directory node is read-only, the Deferred will errback with a NotWriteableError.""" def set_children(entries, overwrite=True): """Add multiple children (by writecap+readcap) to a directory node. Takes a dictionary, with childname as keys and (writecap, readcap) tuples (or (writecap, readcap, metadata) triples) as values. Returns a Deferred that fires (with this dirnode) when the operation finishes. This is equivalent to calling set_uri() multiple times, but is much more efficient. All child names must be unicode strings. """ def set_node(name, child, metadata=None, overwrite=True): """I add a child at the specific name. I return a Deferred that fires when the operation finishes. This Deferred will fire with the child node that was just added. I will replace any existing child of the same name. The child name must be a unicode string. The 'child' instance must be an instance providing IFilesystemNode. If metadata= is provided, I will use it as the metadata for the named edge.
This will replace any existing metadata. If metadata= is left as the default value of None, I will set ['mtime'] to the current time, and I will set ['ctime'] to the current time if there was not already a child by this name present. This roughly matches the ctime/mtime semantics of traditional filesystems. See the "About the metadata" section of webapi.txt for further information. If this directory node is read-only, the Deferred will errback with a NotWriteableError.""" def set_nodes(entries, overwrite=True): """Add multiple children to a directory node. Takes a dict mapping unicode childname to (child_node, metadata) tuples. If metadata=None, the original metadata is left unmodified. Returns a Deferred that fires (with this dirnode) when the operation finishes. This is equivalent to calling set_node() multiple times, but is much more efficient.""" def add_file(name, uploadable, metadata=None, overwrite=True): """I upload a file (using the given IUploadable), then attach the resulting ImmutableFileNode to the directory at the given name. I set metadata the same way as set_uri and set_node. The child name must be a unicode string. I return a Deferred that fires (with the IFileNode of the uploaded file) when the operation completes.""" def delete(name, must_exist=True, must_be_directory=False, must_be_file=False): """I remove the child at the specific name. I return a Deferred that fires when the operation finishes. The child name must be a unicode string. If must_exist is True and I do not have a child by that name, I raise NoSuchChildError. If must_be_directory is True and the child is a file, or if must_be_file is True and the child is a directory, I raise ChildOfWrongTypeError.""" def create_subdirectory(name, initial_children={}, overwrite=True, mutable=True, mutable_version=None, metadata=None): """I create and attach a directory at the given name. The new directory can be empty, or it can be populated with children according to 'initial_children', which takes a dictionary in the same format as set_nodes (i.e. mapping unicode child name to (childnode, metadata) tuples). The child name must be a unicode string. I return a Deferred that fires (with the new directory node) when the operation finishes.""" def move_child_to(current_child_name, new_parent, new_child_name=None, overwrite=True): """I take one of my children and move it to a new parent. The child is referenced by name. On the new parent, the child will live under 'new_child_name', which defaults to 'current_child_name'. TODO: what should we do about metadata? I return a Deferred that fires when the operation finishes. The child name must be a unicode string. I raise NoSuchChildError if I do not have a child by that name.""" def build_manifest(): """I generate a table of everything reachable from this directory. I also compute deep-stats as described below. I return a Monitor. The Monitor's results will be a dictionary with four elements: res['manifest']: a list of (path, cap) tuples for all nodes (directories and files) reachable from this one. 'path' will be a tuple of unicode strings. The origin dirnode will be represented by an empty path tuple. res['verifycaps']: a list of (printable) verifycap strings, one for each reachable non-LIT node. This is a set: it will contain no duplicates. res['storage-index']: a list of (base32) storage index strings, one for each reachable non-LIT node. This is a set: it will contain no duplicates. res['stats']: a dictionary, the same that is generated by start_deep_stats() below.
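As an illustration only (the helper name is hypothetical, and this assumes the returned Monitor exposes a when_done() Deferred that fires with the results dictionary described above)::

    def print_manifest(dirnode):
        monitor = dirnode.build_manifest()
        d = monitor.when_done()
        def _show(res):
            # each entry is a (path, cap) tuple; 'path' is a tuple of
            # unicode path components relative to the origin dirnode
            for path, cap in res['manifest']:
                print u"/".join(path), cap
            print "distinct verifycaps:", len(res['verifycaps'])
        d.addCallback(_show)
        return d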
The Monitor will also have an .origin_si attribute with the (binary) storage index of the starting point. """ def start_deep_stats(): """Return a Monitor, examining all nodes (directories and files) reachable from this one. The Monitor's results will be a dictionary with the following keys:: count-immutable-files: count of how many CHK files are in the set count-mutable-files: same, for mutable files (does not include directories) count-literal-files: same, for LIT files count-files: sum of the above three count-directories: count of directories size-immutable-files: total bytes for all CHK files in the set size-mutable-files (TODO): same, for current version of all mutable files, does not include directories size-literal-files: same, for LIT files size-directories: size of mutable files used by directories largest-directory: number of bytes in the largest directory largest-directory-children: number of children in the largest directory largest-immutable-file: number of bytes in the largest CHK file size-mutable-files is not yet implemented, because it would involve even more queries than deep_stats does. The Monitor will also have an .origin_si attribute with the (binary) storage index of the starting point. This operation will visit every directory node underneath this one, and can take a long time to run. On a typical workstation with good bandwidth, this can examine roughly 15 directories per second (and takes several minutes of 100% CPU for ~1700 directories). """ class ICodecEncoder(Interface): def set_params(data_size, required_shares, max_shares): """Set up the parameters of this encoder. This prepares the encoder to perform an operation that converts a single block of data into a number of shares, such that a future ICodecDecoder can use a subset of these shares to recover the original data. This operation is invoked by calling encode(). Once the encoding parameters are set up, the encode operation can be invoked multiple times. set_params() prepares the encoder to accept blocks of input data that are exactly 'data_size' bytes in length. The encoder will be prepared to produce 'max_shares' shares for each encode() operation (although see the 'desired_share_ids' to use less CPU). The encoding math will be chosen such that the decoder can get by with as few as 'required_shares' of these shares and still reproduce the original data. For example, set_params(1000, 5, 5) offers no redundancy at all, whereas set_params(1000, 1, 10) provides 10x redundancy. Numerical Restrictions: 'data_size' is required to be an integral multiple of 'required_shares'. In general, the caller should choose required_shares and max_shares based upon their reliability requirements and the number of peers available (the total storage space used is roughly equal to max_shares*data_size/required_shares), then choose data_size to achieve the memory footprint desired (larger data_size means more efficient operation, smaller data_size means smaller memory footprint). In addition, 'max_shares' must be equal to or greater than 'required_shares'. Of course, setting them to be equal causes encode() to degenerate into a particularly slow form of the 'split' utility. See encode() for more details about how these parameters are used. set_params() must be called before any other ICodecEncoder methods may be invoked. """ def get_params(): """Return the 3-tuple of data_size, required_shares, max_shares""" def get_encoder_type(): """Return a short string that describes the type of this encoder. 
There is required to be a global table of encoder classes. This method returns an index into this table; the value at this index is an encoder class, and this encoder is an instance of that class. """ def get_block_size(): """Return the length of the shares that encode() will produce. """ def encode_proposal(data, desired_share_ids=None): """Encode some data. 'data' must be a string (or other buffer object), and len(data) must be equal to the 'data_size' value passed earlier to set_params(). This will return a Deferred that will fire with two lists. The first is a list of shares, each of which is a string (or other buffer object) such that len(share) is the same as what get_share_size() returned earlier. The second is a list of shareids, in which each is an integer. The lengths of the two lists will always be equal to each other. The user should take care to keep each share closely associated with its shareid, as one is useless without the other. The length of this output list will normally be the same as the value provided to the 'max_shares' parameter of set_params(). This may be different if 'desired_share_ids' is provided. 'desired_share_ids', if provided, is required to be a sequence of ints, each of which is required to be >= 0 and < max_shares. If not provided, encode() will produce 'max_shares' shares, as if 'desired_share_ids' were set to range(max_shares). You might use this if you initially thought you were going to use 10 peers, started encoding, and then two of the peers dropped out: you could use desired_share_ids= to skip the work (both memory and CPU) of producing shares for the peers that are no longer available. """ def encode(inshares, desired_share_ids=None): """Encode some data. This may be called multiple times. Each call is independent. inshares is a sequence of length required_shares, containing buffers (i.e. strings), where each buffer contains the next contiguous non-overlapping segment of the input data. Each buffer is required to be the same length, and the sum of the lengths of the buffers is required to be exactly the data_size promised by set_params(). (This implies that the data has to be padded before being passed to encode(), unless of course it already happens to be an even multiple of required_shares in length.) Note: the requirement to break up your data into 'required_shares' chunks of exactly the right length before calling encode() is surprising from point of view of a user who doesn't know how FEC works. It feels like an implementation detail that has leaked outside the abstraction barrier. Is there a use case in which the data to be encoded might already be available in pre-segmented chunks, such that it is faster or less work to make encode() take a list rather than splitting a single string? Yes, there is: suppose you are uploading a file with K=64, N=128, segsize=262,144. Then each in-share will be of size 4096. If you use this .encode() API then your code could first read each successive 4096-byte chunk from the file and store each one in a Python string and store each such Python string in a Python list. Then you could call .encode(), passing that list as "inshares". The encoder would generate the other 64 "secondary shares" and return to you a new list containing references to the same 64 Python strings that you passed in (as the primary shares) plus references to the new 64 Python strings. 
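A sketch of that flow (illustrative only: 'encoder' is assumed to have already had set_params(262144, 64, 128) called on it, and 'f' is an open file positioned at the start of the segment)::

    K, SEGSIZE = 64, 262144
    chunk_size = SEGSIZE // K            # 4096 bytes per in-share
    inshares = [f.read(chunk_size) for _ in range(K)]
    d = encoder.encode(inshares)
    def _done((shares, shareids)):
        # shares[:K] are the primary shares (the same strings passed in);
        # the remainder are the newly generated check shares
        return dict(zip(shareids, shares))
    d.addCallback(_done)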
(You could even imagine that your code could use readv() so that the operating system can arrange to get all of those bytes copied from the file into the Python list of Python strings as efficiently as possible instead of having a loop written in C or in Python to copy the next part of the file into the next string.) On the other hand if you instead use the .encode_proposal() API (above), then your code can first read in all of the 262,144 bytes of the segment from the file into a Python string, then call .encode_proposal() passing the segment data as the "data" argument. The encoder would basically first split the "data" argument into a list of 64 in-shares of 4096 byte each, and then do the same thing that .encode() does. So this would result in a little bit more copying of data and a little bit higher of a "maximum memory usage" during the process, although it might or might not make a practical difference for our current use cases. Note that "inshares" is a strange name for the parameter if you think of the parameter as being just for feeding in data to the codec. It makes more sense if you think of the result of this encoding as being the set of shares from inshares plus an extra set of "secondary shares" (or "check shares"). It is a surprising name! If the API is going to be surprising then the name should be surprising. If we switch to encode_proposal() above then we should also switch to an unsurprising name. 'desired_share_ids', if provided, is required to be a sequence of ints, each of which is required to be >= 0 and < max_shares. If not provided, encode() will produce 'max_shares' shares, as if 'desired_share_ids' were set to range(max_shares). You might use this if you initially thought you were going to use 10 peers, started encoding, and then two of the peers dropped out: you could use desired_share_ids= to skip the work (both memory and CPU) of producing shares for the peers that are no longer available. For each call, encode() will return a Deferred that fires with two lists, one containing shares and the other containing the shareids. The get_share_size() method can be used to determine the length of the share strings returned by encode(). Each shareid is a small integer, exactly as passed into 'desired_share_ids' (or range(max_shares), if desired_share_ids was not provided). The shares and their corresponding shareids are required to be kept together during storage and retrieval. Specifically, the share data is useless by itself: the decoder needs to be told which share is which by providing it with both the shareid and the actual share data. This function will allocate an amount of memory roughly equal to:: (max_shares - required_shares) * get_share_size() When combined with the memory that the caller must allocate to provide the input data, this leads to a memory footprint roughly equal to the size of the resulting encoded shares (i.e. the expansion factor times the size of the input segment). """ # rejected ideas: # # returning a list of (shareidN,shareN) tuples instead of a pair of # lists (shareids..,shares..). Brian thought the tuples would # encourage users to keep the share and shareid together throughout # later processing, Zooko pointed out that the code to iterate # through two lists is not really more complicated than using a list # of tuples and there's also a performance improvement # # having 'data_size' not required to be an integral multiple of # 'required_shares'. 
Doing this would require encode() to perform # padding internally, and we'd prefer to have any padding be done # explicitly by the caller. Yes, it is an abstraction leak, but # hopefully not an onerous one. class ICodecDecoder(Interface): def set_params(data_size, required_shares, max_shares): """Set the params. They have to be exactly the same ones that were used for encoding.""" def get_needed_shares(): """Return the number of shares needed to reconstruct the data. set_params() is required to be called before this.""" def decode(some_shares, their_shareids): """Decode a partial list of shares into data. 'some_shares' is required to be a sequence of buffers of sharedata, a subset of the shares returned by ICodecEncode.encode(). Each share is required to be of the same length. The i'th element of their_shareids is required to be the shareid of the i'th buffer in some_shares. This returns a Deferred that fires with a sequence of buffers. This sequence will contain all of the segments of the original data, in order. The sum of the lengths of all of the buffers will be the 'data_size' value passed into the original ICodecEncode.set_params() call. To get back the single original input block of data, use ''.join(output_buffers), or you may wish to simply write them in order to an output file. Note that some of the elements in the result sequence may be references to the elements of the some_shares input sequence. In particular, this means that if those share objects are mutable (e.g. arrays) and if they are changed, then both the input (the 'some_shares' parameter) and the output (the value given when the deferred is triggered) will change. The length of 'some_shares' is required to be exactly the value of 'required_shares' passed into the original ICodecEncode.set_params() call. """ class IEncoder(Interface): """I take an object that provides IEncryptedUploadable, which provides encrypted data, and a list of shareholders. I then encode, hash, and deliver shares to those shareholders. I will compute all the Merkle hash trees that are necessary to validate the crypttext that eventually comes back from the shareholders. I provide the URI Extension Block Hash, and the encoding parameters, both of which must be included in the URI. I do not choose shareholders; that is left to the IUploader. I must be given a dict of RemoteReferences to storage buckets that are ready and willing to receive data. """ def set_size(size): """Specify the number of bytes that will be encoded. This must be performed before get_serialized_params() can be called. """ def set_encrypted_uploadable(u): """Provide a source of encrypted upload data. 'u' must implement IEncryptedUploadable. When this is called, the IEncryptedUploadable will be queried for its length and the storage_index that should be used. This returns a Deferred that fires with this Encoder instance. This must be performed before start() can be called. """ def get_param(name): """Return an encoding parameter, by name. 'storage_index': return a string with the (16-byte truncated SHA-256 hash) storage index to which these shares should be pushed. 'share_counts': return a tuple describing how many shares are used: (needed_shares, servers_of_happiness, total_shares) 'num_segments': return an int with the number of segments that will be encoded. 'segment_size': return an int with the size of each segment. 'block_size': return the size of the individual blocks that will be delivered to a shareholder's put_block() method.
By knowing this, the shareholder will be able to keep all blocks in a single file and still provide random access when reading them. # TODO: can we avoid exposing this? 'share_size': an int with the size of the data that will be stored on each shareholder. This is aggregate amount of data that will be sent to the shareholder, summed over all the put_block() calls I will ever make. It is useful to determine this size before asking potential shareholders whether they will grant a lease or not, since their answers will depend upon how much space we need. TODO: this might also include some amount of overhead, like the size of all the hashes. We need to decide whether this is useful or not. 'serialized_params': a string with a concise description of the codec name and its parameters. This may be passed into the IUploadable to let it make sure that the same file encoded with different parameters will result in different storage indexes. Once this is called, set_size() and set_params() may not be called. """ def set_shareholders(shareholders, servermap): """Tell the encoder where to put the encoded shares. 'shareholders' must be a dictionary that maps share number (an integer ranging from 0 to n-1) to an instance that provides IStorageBucketWriter. 'servermap' is a dictionary that maps share number (as defined above) to a set of peerids. This must be performed before start() can be called.""" def start(): """Begin the encode/upload process. This involves reading encrypted data from the IEncryptedUploadable, encoding it, uploading the shares to the shareholders, then sending the hash trees. set_encrypted_uploadable() and set_shareholders() must be called before this can be invoked. This returns a Deferred that fires with a verify cap when the upload process is complete. The verifycap, plus the encryption key, is sufficient to construct the read cap. """ class IDecoder(Interface): """I take a list of shareholders and some setup information, then download, validate, decode, and decrypt data from them, writing the results to an output file. I do not locate the shareholders, that is left to the IDownloader. I must be given a dict of RemoteReferences to storage buckets that are ready to send data. """ def setup(outfile): """I take a file-like object (providing write and close) to which all the plaintext data will be written. TODO: producer/consumer . Maybe write() should return a Deferred that indicates when it will accept more data? But probably having the IDecoder be a producer is easier to glue to IConsumer pieces. """ def set_shareholders(shareholders): """I take a dictionary that maps share identifiers (small integers) to RemoteReferences that provide RIBucketReader. This must be called before start().""" def start(): """I start the download. This process involves retrieving data and hash chains from the shareholders, using the hashes to validate the data, decoding the shares into segments, decrypting the segments, then writing the resulting plaintext to the output file. I return a Deferred that will fire (with self) when the download is complete. """ class IDownloadTarget(Interface): # Note that if the IDownloadTarget is also an IConsumer, the downloader # will register itself as a producer. This allows the target to invoke # downloader.pauseProducing, resumeProducing, and stopProducing. def open(size): """Called before any calls to write() or close(). If an error occurs before any data is available, fail() may be called without a previous call to open(). 
'size' is the length of the file being downloaded, in bytes.""" def write(data): """Output some data to the target.""" def close(): """Inform the target that there is no more data to be written.""" def fail(why): """fail() is called to indicate that the download has failed. 'why' is a Failure object indicating what went wrong. No further methods will be invoked on the IDownloadTarget after fail().""" def register_canceller(cb): """The CiphertextDownloader uses this to register a no-argument function that the target can call to cancel the download. Once this canceller is invoked, no further calls to write() or close() will be made.""" def finish(): """When the CiphertextDownloader is done, this finish() function will be called. Whatever it returns will be returned to the invoker of Downloader.download. """ class IDownloader(Interface): def download(uri, target): """Perform a CHK download, sending the data to the given target. 'target' must provide IDownloadTarget. Returns a Deferred that fires (with the results of target.finish) when the download is finished, or errbacks if something went wrong.""" class IEncryptedUploadable(Interface): def set_upload_status(upload_status): """Provide an IUploadStatus object that should be filled with status information. The IEncryptedUploadable is responsible for setting key-determination progress ('chk'), size, storage_index, and ciphertext-fetch progress. It may delegate some of this responsibility to others, in particular to the IUploadable.""" def get_size(): """This behaves just like IUploadable.get_size().""" def get_all_encoding_parameters(): """Return a Deferred that fires with a tuple of (k,happy,n,segment_size). The segment_size will be used as-is, and must match the following constraints: it must be a multiple of k, and it shouldn't be unreasonably larger than the file size (if segment_size is larger than filesize, the difference must be stored as padding). This usually passes through to the IUploadable method of the same name. The encoder strictly obeys the values returned by this method. To make an upload use non-default encoding parameters, you must arrange to control the values that this method returns. """ def get_storage_index(): """Return a Deferred that fires with a 16-byte storage index. """ def read_encrypted(length, hash_only): """This behaves just like IUploadable.read(), but returns crypttext instead of plaintext. If hash_only is True, then this discards the data (and returns an empty list); this improves efficiency when resuming an interrupted upload (where we need to compute the plaintext hashes, but don't need the redundant encrypted data).""" def close(): """Just like IUploadable.close().""" class IUploadable(Interface): def set_upload_status(upload_status): """Provide an IUploadStatus object that should be filled with status information. The IUploadable is responsible for setting key-determination progress ('chk').""" def set_default_encoding_parameters(params): """Set the default encoding parameters, which must be a dict mapping strings to ints. The meaningful keys are 'k', 'happy', 'n', and 'max_segment_size'. These might have an influence on the final encoding parameters returned by get_all_encoding_parameters(), if the Uploadable doesn't have more specific preferences. This call is optional: if it is not used, the Uploadable will use some built-in defaults. If used, this method must be called before any other IUploadable methods to have any effect. 
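For example (the values shown are illustrative, not recommended defaults)::

    uploadable.set_default_encoding_parameters({
        "k": 3,                      # shares needed to reconstruct
        "happy": 7,                  # servers-of-happiness target
        "n": 10,                     # total shares to produce
        "max_segment_size": 128*1024,
    })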
""" def get_size(): """Return a Deferred that will fire with the length of the data to be uploaded, in bytes. This will be called before the data is actually used, to compute encoding parameters. """ def get_all_encoding_parameters(): """Return a Deferred that fires with a tuple of (k,happy,n,segment_size). The segment_size will be used as-is, and must match the following constraints: it must be a multiple of k, and it shouldn't be unreasonably larger than the file size (if segment_size is larger than filesize, the difference must be stored as padding). The relative values of k and n allow some IUploadables to request better redundancy than others (in exchange for consuming more space in the grid). Larger values of segment_size reduce hash overhead, while smaller values reduce memory footprint and cause data to be delivered in smaller pieces (which may provide a smoother and more predictable download experience). The encoder strictly obeys the values returned by this method. To make an upload use non-default encoding parameters, you must arrange to control the values that this method returns. One way to influence them may be to call set_encoding_parameters() before calling get_all_encoding_parameters(). """ def get_encryption_key(): """Return a Deferred that fires with a 16-byte AES key. This key will be used to encrypt the data. The key will also be hashed to derive the StorageIndex. Uploadables that want to achieve convergence should hash their file contents and the serialized_encoding_parameters to form the key (which of course requires a full pass over the data). Uploadables can use the upload.ConvergentUploadMixin class to achieve this automatically. Uploadables that do not care about convergence (or do not wish to make multiple passes over the data) can simply return a strongly-random 16 byte string. get_encryption_key() may be called multiple times: the IUploadable is required to return the same value each time. """ def read(length): """Return a Deferred that fires with a list of strings (perhaps with only a single element) that, when concatenated together, contain the next 'length' bytes of data. If EOF is near, this may provide fewer than 'length' bytes. The total number of bytes provided by read() before it signals EOF must equal the size provided by get_size(). If the data must be acquired through multiple internal read operations, returning a list instead of a single string may help to reduce string copies. However, the length of the concatenated strings must equal the amount of data requested, unless EOF is encountered. Long reads, or short reads without EOF, are not allowed. read() should return the same amount of data as a local disk file read, just in a different shape and asynchronously. 'length' will typically be equal to (min(get_size(),1MB)/req_shares), so a 10kB file means length=3kB, 100kB file means length=30kB, and >=1MB file means length=300kB. This method provides for a single full pass through the data. Later use cases may desire multiple passes or access to only parts of the data (such as a mutable file making small edits-in-place). This API will be expanded once those use cases are better understood. """ def close(): """The upload is finished, and whatever filehandle was in use may be closed.""" class IMutableUploadable(Interface): """ I represent content that is due to be uploaded to a mutable filecap. 
""" # This is somewhat simpler than the IUploadable interface above # because mutable files do not need to be concerned with possibly # generating a CHK, nor with per-file keys. It is a subset of the # methods in IUploadable, though, so we could just as well implement # the mutable uploadables as IUploadables that don't happen to use # those methods (with the understanding that the unused methods will # never be called on such objects) def get_size(): """ Returns a Deferred that fires with the size of the content held by the uploadable. """ def read(length): """ Returns a list of strings that, when concatenated, are the next length bytes of the file, or fewer if there are fewer bytes between the current location and the end of the file. """ def close(): """ The process that used the Uploadable is finished using it, so the uploadable may be closed. """ class IUploadResults(Interface): """I am returned by immutable upload() methods and contain the results of the upload. Note that some of my methods return empty values (0 or an empty dict) when called for non-distributed LIT files.""" def get_file_size(): """Return the file size, in bytes.""" def get_uri(): """Return the (string) URI of the object uploaded, a CHK readcap.""" def get_ciphertext_fetched(): """Return the number of bytes fetched by the helpe for this upload, or 0 if the helper did not need to fetch any bytes (or if there was no helper).""" def get_preexisting_shares(): """Return the number of shares that were already present in the grid.""" def get_pushed_shares(): """Return the number of shares that were uploaded.""" def get_sharemap(): """Return a dict mapping share identifier to set of IServer instances. This indicates which servers were given which shares. For immutable files, the shareid is an integer (the share number, from 0 to N-1). For mutable files, it is a string of the form 'seq%d-%s-sh%d', containing the sequence number, the roothash, and the share number.""" def get_servermap(): """Return dict mapping IServer instance to a set of share numbers.""" def get_timings(): """Return dict of timing information, mapping name to seconds. All times are floats: total : total upload time, start to finish storage_index : time to compute the storage index peer_selection : time to decide which peers will be used contacting_helper : initial helper query to upload/no-upload decision helper_total : initial helper query to helper finished pushing cumulative_fetch : helper waiting for ciphertext requests total_fetch : helper start to last ciphertext response cumulative_encoding : just time spent in zfec cumulative_sending : just time spent waiting for storage servers hashes_and_close : last segment push to shareholder close total_encode_and_push : first encode to shareholder close """ def get_uri_extension_data(): """Return the dict of UEB data created for this file.""" def get_verifycapstr(): """Return the (string) verify-cap URI for the uploaded object.""" class IDownloadResults(Interface): """I am created internally by download() methods. I contain a number of public attributes that contain details about the download process.:: .file_size : the size of the file, in bytes .servers_used : set of server peerids that were used during download .server_problems : dict mapping server peerid to a problem string. Only servers that had problems (bad hashes, disconnects) are listed here. .servermap : dict mapping server peerid to a set of share numbers. Only servers that had any shares are listed here. 
.timings : dict of timing information, mapping name to seconds (float) peer_selection : time to ask servers about shares servers_peer_selection : dict of peerid to DYHB-query time uri_extension : time to fetch a copy of the URI extension block hashtrees : time to fetch the hash trees segments : time to fetch, decode, and deliver segments cumulative_fetch : time spent waiting for storage servers cumulative_decode : just time spent in zfec cumulative_decrypt : just time spent in decryption total : total download time, start to finish fetch_per_server : dict of server to list of per-segment fetch times """ class IUploader(Interface): def upload(uploadable): """Upload the file. 'uploadable' must implement IUploadable. This returns a Deferred that fires with an IUploadResults instance, from which the URI of the file can be obtained as results.uri .""" class ICheckable(Interface): def check(monitor, verify=False, add_lease=False): """Check up on my health, optionally repairing any problems. This returns a Deferred that fires with an instance that provides ICheckResults, or None if the object is non-distributed (i.e. LIT files). The monitor will be checked periodically to see if the operation has been cancelled. If so, no new queries will be sent, and the Deferred will fire (with an OperationCancelledError) immediately. Filenodes and dirnodes (which provide IFilesystemNode) are also checkable. Instances that represent verifier-caps will be checkable but not downloadable. Some objects (like LIT files) do not actually live in the grid, and their checkers return None (non-distributed files are always healthy). If verify=False, a relatively lightweight check will be performed: I will ask all servers if they have a share for me, and I will believe whatever they say. If there are at least N distinct shares on the grid, my results will indicate r.is_healthy()==True. This requires a roundtrip to each server, but does not transfer very much data, so the network bandwidth is fairly low. If verify=True, a more resource-intensive check will be performed: every share will be downloaded, and the hashes will be validated on every bit. I will ignore any shares that failed their hash checks. If there are at least N distinct valid shares on the grid, my results will indicate r.is_healthy()==True. This requires N/k times as much download bandwidth (and server disk IO) as a regular download. If a storage server is holding a corrupt share, or is experiencing memory failures during retrieval, or is malicious or buggy, then verification will detect the problem, but checking will not. If add_lease=True, I will ensure that an up-to-date lease is present on each share. The lease secrets will be derived from my node secret (in BASEDIR/private/secret), so either I will add a new lease to the share, or I will merely renew the lease that I already had. In a future version of the storage-server protocol (once Accounting has been implemented), there may be additional options here to define the kind of lease that is obtained (which account number to claim, etc). TODO: any problems seen during checking will be reported to the health-manager.furl, a centralized object that is responsible for figuring out why files are unhealthy so corrective action can be taken. """ def check_and_repair(monitor, verify=False, add_lease=False): """Like check(), but if the file/directory is not healthy, attempt to repair the damage. Any non-healthy result will cause an immediate repair operation, to generate and upload new shares.
After repair, the file will be as healthy as we can make it. Details about what sort of repair is done will be put in the check-and-repair results. The Deferred will not fire until the repair is complete. This returns a Deferred that fires with an instance of ICheckAndRepairResults.""" class IDeepCheckable(Interface): def start_deep_check(verify=False, add_lease=False): """Check upon the health of me and everything I can reach. This is a recursive form of check(), useable only on dirnodes. I return a Monitor, with results that are an IDeepCheckResults object. TODO: If any of the directories I traverse are unrecoverable, the Monitor will report failure. If any of the files I check upon are unrecoverable, those problems will be reported in the IDeepCheckResults as usual, and the Monitor will not report a failure. """ def start_deep_check_and_repair(verify=False, add_lease=False): """Check upon the health of me and everything I can reach. Repair anything that isn't healthy. This is a recursive form of check_and_repair(), useable only on dirnodes. I return a Monitor, with results that are an IDeepCheckAndRepairResults object. TODO: If any of the directories I traverse are unrecoverable, the Monitor will report failure. If any of the files I check upon are unrecoverable, those problems will be reported in the IDeepCheckResults as usual, and the Monitor will not report a failure. """ class ICheckResults(Interface): """I contain the detailed results of a check/verify operation. """ def get_storage_index(): """Return a string with the (binary) storage index.""" def get_storage_index_string(): """Return a string with the (printable) abbreviated storage index.""" def get_uri(): """Return the (string) URI of the object that was checked.""" def is_healthy(): """Return a boolean, True if the file/dir is fully healthy, False if it is damaged in any way. Non-distributed LIT files always return True.""" def is_recoverable(): """Return a boolean, True if the file/dir can be recovered, False if not. Unrecoverable files are obviously unhealthy. Non-distributed LIT files always return True.""" # the following methods all return None for non-distributed LIT files def get_happiness(): """Return the happiness count of the file.""" def get_encoding_needed(): """Return 'k', the number of shares required for recovery.""" def get_encoding_expected(): """Return 'N', the number of total shares generated.""" def get_share_counter_good(): """Return the number of distinct good shares that were found. For mutable files, this counts shares for the 'best' version.""" def get_share_counter_wrong(): """For mutable files, return the number of shares for versions other than the 'best' one (which is defined as being the recoverable version with the highest sequence number, then the highest roothash). These are either leftover shares from an older version (perhaps on a server that was offline when an update occurred), shares from an unrecoverable newer version, or shares from an alternate current version that results from an uncoordinated write collision. For a healthy file, this will equal 0. For immutable files, this will always equal 0.""" def get_corrupt_shares(): """Return a list of 'share locators', one for each share that was found to be corrupt (integrity failure). Each share locator is a list of (IServer, storage_index, sharenum).""" def get_incompatible_shares(): """Return a list of 'share locators', one for each share that was found to be of an unknown format. 
Each share locator is a list of (IServer, storage_index, sharenum).""" def get_servers_responding(): """Return a list of IServer objects, one for each server that responded to the share query (even if they said they didn't have shares, and even if they said they did have shares but then didn't send them when asked, or dropped the connection, or returned a Failure, and even if they said they did have shares and sent incorrect ones when asked)""" def get_host_counter_good_shares(): """Return the number of distinct storage servers with good shares. If this number is less than get_share_counters()[good], then some shares are doubled up, increasing the correlation of failures. This indicates that one or more shares should be moved to an otherwise unused server, if one is available. """ def get_version_counter_recoverable(): """Return the number of recoverable versions of the file. For a healthy file, this will equal 1.""" def get_version_counter_unrecoverable(): """Return the number of unrecoverable versions of the file. For a healthy file, this will be 0.""" def get_sharemap(): """Return a dict mapping share identifier to list of IServer objects. This indicates which servers are holding which shares. For immutable files, the shareid is an integer (the share number, from 0 to N-1). For mutable files, it is a string of the form 'seq%d-%s-sh%d', containing the sequence number, the roothash, and the share number.""" def get_summary(): """Return a string with a brief (one-line) summary of the results.""" def get_report(): """Return a list of strings with more detailed results.""" class ICheckAndRepairResults(Interface): """I contain the detailed results of a check/verify/repair operation. The IFilesystemNode.check()/verify()/repair() methods all return instances that provide ICheckAndRepairResults. """ def get_storage_index(): """Return a string with the (binary) storage index.""" def get_storage_index_string(): """Return a string with the (printable) abbreviated storage index.""" def get_repair_attempted(): """Return a boolean, True if a repair was attempted. We might not attempt to repair the file because it was healthy, or healthy enough (i.e. some shares were missing but not enough to exceed some threshold), or because we don't know how to repair this object.""" def get_repair_successful(): """Return a boolean, True if repair was attempted and the file/dir was fully healthy afterwards. False if no repair was attempted or if a repair attempt failed.""" def get_pre_repair_results(): """Return an ICheckResults instance that describes the state of the file/dir before any repair was attempted.""" def get_post_repair_results(): """Return an ICheckResults instance that describes the state of the file/dir after any repair was attempted. If no repair was attempted, the pre-repair and post-repair results will be identical.""" class IDeepCheckResults(Interface): """I contain the results of a deep-check operation. This is returned by a call to ICheckable.deep_check(). 
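A sketch of how these results might be consumed (illustrative only; this assumes the Monitor returned by IDeepCheckable.start_deep_check() exposes a when_done() Deferred that fires with an instance of this interface)::

    def summarize_deep_check(dirnode):
        monitor = dirnode.start_deep_check(verify=False)
        d = monitor.when_done()
        def _report(results):
            c = results.get_counters()
            print "objects checked:", c["count-objects-checked"]
            print "objects unhealthy:", c["count-objects-unhealthy"]
            for (server, storage_index, sharenum) in results.get_corrupt_shares():
                print "corrupt share", sharenum, "on server", server
        d.addCallback(_report)
        return d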
""" def get_root_storage_index_string(): """Return the storage index (abbreviated human-readable string) of the first object checked.""" def get_counters(): """Return a dictionary with the following keys:: count-objects-checked: count of how many objects were checked count-objects-healthy: how many of those objects were completely healthy count-objects-unhealthy: how many were damaged in some way count-objects-unrecoverable: how many were unrecoverable count-corrupt-shares: how many shares were found to have corruption, summed over all objects examined """ def get_corrupt_shares(): """Return a set of (IServer, storage_index, sharenum) for all shares that were found to be corrupt. storage_index is binary.""" def get_all_results(): """Return a dictionary mapping pathname (a tuple of strings, ready to be slash-joined) to an ICheckResults instance, one for each object that was checked.""" def get_results_for_storage_index(storage_index): """Retrive the ICheckResults instance for the given (binary) storage index. Raises KeyError if there are no results for that storage index.""" def get_stats(): """Return a dictionary with the same keys as IDirectoryNode.deep_stats().""" class IDeepCheckAndRepairResults(Interface): """I contain the results of a deep-check-and-repair operation. This is returned by a call to ICheckable.deep_check_and_repair(). """ def get_root_storage_index_string(): """Return the storage index (abbreviated human-readable string) of the first object checked.""" def get_counters(): """Return a dictionary with the following keys:: count-objects-checked: count of how many objects were checked count-objects-healthy-pre-repair: how many of those objects were completely healthy (before any repair) count-objects-unhealthy-pre-repair: how many were damaged in some way count-objects-unrecoverable-pre-repair: how many were unrecoverable count-objects-healthy-post-repair: how many of those objects were completely healthy (after any repair) count-objects-unhealthy-post-repair: how many were damaged in some way count-objects-unrecoverable-post-repair: how many were unrecoverable count-repairs-attempted: repairs were attempted on this many objects. The count-repairs- keys will always be provided, however unless repair=true is present, they will all be zero. count-repairs-successful: how many repairs resulted in healthy objects count-repairs-unsuccessful: how many repairs resulted did not results in completely healthy objects count-corrupt-shares-pre-repair: how many shares were found to have corruption, summed over all objects examined (before any repair) count-corrupt-shares-post-repair: how many shares were found to have corruption, summed over all objects examined (after any repair) """ def get_stats(): """Return a dictionary with the same keys as IDirectoryNode.deep_stats().""" def get_corrupt_shares(): """Return a set of (IServer, storage_index, sharenum) for all shares that were found to be corrupt before any repair was attempted. storage_index is binary. """ def get_remaining_corrupt_shares(): """Return a set of (IServer, storage_index, sharenum) for all shares that were found to be corrupt after any repair was completed. storage_index is binary. These are shares that need manual inspection and probably deletion. 
""" def get_all_results(): """Return a dictionary mapping pathname (a tuple of strings, ready to be slash-joined) to an ICheckAndRepairResults instance, one for each object that was checked.""" def get_results_for_storage_index(storage_index): """Retrive the ICheckAndRepairResults instance for the given (binary) storage index. Raises KeyError if there are no results for that storage index.""" class IRepairable(Interface): def repair(check_results): """Attempt to repair the given object. Returns a Deferred that fires with a IRepairResults object. I must be called with an object that implements ICheckResults, as proof that you have actually discovered a problem with this file. I will use the data in the checker results to guide the repair process, such as which servers provided bad data and should therefore be avoided. The ICheckResults object is inside the ICheckAndRepairResults object, which is returned by the ICheckable.check() method:: d = filenode.check(repair=False) def _got_results(check_and_repair_results): check_results = check_and_repair_results.get_pre_repair_results() return filenode.repair(check_results) d.addCallback(_got_results) return d """ class IRepairResults(Interface): """I contain the results of a repair operation.""" def get_successful(): """Returns a boolean: True if the repair made the file healthy, False if not. Repair failure generally indicates a file that has been damaged beyond repair.""" class IClient(Interface): def upload(uploadable): """Upload some data into a CHK, get back the UploadResults for it. @param uploadable: something that implements IUploadable @return: a Deferred that fires with the UploadResults instance. To get the URI for this file, use results.uri . """ def create_mutable_file(contents=""): """Create a new mutable file (with initial) contents, get back the new node instance. @param contents: (bytestring, callable, or None): this provides the initial contents of the mutable file. If 'contents' is a bytestring, it will be used as-is. If 'contents' is a callable, it will be invoked with the new MutableFileNode instance and is expected to return a bytestring with the initial contents of the file (the callable can use node.get_writekey() to decide how to encrypt the initial contents, e.g. for a brand new dirnode with initial children). contents=None is equivalent to an empty string. Using content_maker= is more efficient than creating a mutable file and setting its contents in two separate operations. @return: a Deferred that fires with an IMutableFileNode instance. """ def create_dirnode(initial_children={}): """Create a new unattached dirnode, possibly with initial children. @param initial_children: dict with keys that are unicode child names, and values that are (childnode, metadata) tuples. @return: a Deferred that fires with the new IDirectoryNode instance. """ def create_node_from_uri(uri, rouri): """Create a new IFilesystemNode instance from the uri, synchronously. @param uri: a string or IURI-providing instance, or None. This could be for a LiteralFileNode, a CHK file node, a mutable file node, or a directory node @param rouri: a string or IURI-providing instance, or None. If the main uri is None, I will use the rouri instead. If I recognize the format of the main uri, I will ignore the rouri (because it can be derived from the writecap). @return: an instance that provides IFilesystemNode (or more usefully one of its subclasses). 
File-specifying URIs will result in IFileNode-providing instances, like ImmutableFileNode, LiteralFileNode, or MutableFileNode. Directory-specifying URIs will result in IDirectoryNode-providing instances, like DirectoryNode. """ class INodeMaker(Interface): """The NodeMaker is used to create IFilesystemNode instances. It can accept a filecap/dircap string and return the node right away. It can also create new nodes (i.e. upload a file, or create a mutable file) asynchronously. Once you have one of these nodes, you can use other methods to determine whether it is a file or directory, and to download or modify its contents. The NodeMaker encapsulates all the authorities that these IFilesystemNodes require (like references to the StorageFarmBroker). Each Tahoe process will typically have a single NodeMaker, but unit tests may create simplified/mocked forms for testing purposes. """ def create_from_cap(writecap, readcap=None, deep_immutable=False, name=u""): """I create an IFilesystemNode from the given writecap/readcap. I can only provide nodes for existing file/directory objects: use my other methods to create new objects. I return synchronously.""" def create_mutable_file(contents=None, keysize=None): """I create a new mutable file, and return a Deferred that will fire with the IMutableFileNode instance when it is ready. If contents= is provided (a bytestring), it will be used as the initial contents of the new file, otherwise the file will contain zero bytes. keysize= is for use by unit tests, to create mutable files that are smaller than usual.""" def create_new_mutable_directory(initial_children={}): """I create a new mutable directory, and return a Deferred that will fire with the IDirectoryNode instance when it is ready. If initial_children= is provided (a dict mapping unicode child name to (childnode, metadata_dict) tuples), the directory will be populated with those children, otherwise it will be empty.""" class IClientStatus(Interface): def list_all_uploads(): """Return a list of uploader objects, one for each upload that currently has an object available (tracked with weakrefs). This is intended for debugging purposes.""" def list_active_uploads(): """Return a list of active IUploadStatus objects.""" def list_recent_uploads(): """Return a list of IUploadStatus objects for the most recently started uploads.""" def list_all_downloads(): """Return a list of downloader objects, one for each download that currently has an object available (tracked with weakrefs). This is intended for debugging purposes.""" def list_active_downloads(): """Return a list of active IDownloadStatus objects.""" def list_recent_downloads(): """Return a list of IDownloadStatus objects for the most recently started downloads.""" class IUploadStatus(Interface): def get_started(): """Return a timestamp (float with seconds since epoch) indicating when the operation was started.""" def get_storage_index(): """Return a string with the (binary) storage index in use on this upload. Returns None if the storage index has not yet been calculated.""" def get_size(): """Return an integer with the number of bytes that will eventually be uploaded for this file. Returns None if the size is not yet known. """ def using_helper(): """Return True if this upload is using a Helper, False if not.""" def get_status(): """Return a string describing the current state of the upload process.""" def get_progress(): """Returns a tuple of floats, (chk, ciphertext, encode_and_push), each from 0.0 to 1.0 . 
'chk' describes how much progress has been made towards hashing the file to determine a CHK encryption key: if non-convergent encryption is in use, this will be trivial, otherwise the whole file must be hashed. 'ciphertext' describes how much of the ciphertext has been pushed to the helper, and is '1.0' for non-helper uploads. 'encode_and_push' describes how much of the encode-and-push process has finished: for helper uploads this is dependent upon the helper providing progress reports. It might be reasonable to add all three numbers and report the sum to the user.""" def get_active(): """Return True if the upload is currently active, False if not.""" def get_results(): """Return an instance of UploadResults (which contains timing and sharemap information). Might return None if the upload is not yet finished.""" def get_counter(): """Each upload status gets a unique number: this method returns that number. This provides a handle to this particular upload, so a web page can generate a suitable hyperlink.""" class IDownloadStatus(Interface): def get_started(): """Return a timestamp (float with seconds since epoch) indicating when the operation was started.""" def get_storage_index(): """Return a string with the (binary) storage index in use on this download. This may be None if there is no storage index (i.e. LIT files).""" def get_size(): """Return an integer with the number of bytes that will eventually be retrieved for this file. Returns None if the size is not yet known. """ def using_helper(): """Return True if this download is using a Helper, False if not.""" def get_status(): """Return a string describing the current state of the download process.""" def get_progress(): """Returns a float (from 0.0 to 1.0) describing the amount of the download that has completed. This value will remain at 0.0 until the first byte of plaintext is pushed to the download target.""" def get_active(): """Return True if the download is currently active, False if not.""" def get_counter(): """Each download status gets a unique number: this method returns that number. This provides a handle to this particular download, so a web page can generate a suitable hyperlink.""" class IServermapUpdaterStatus(Interface): pass class IPublishStatus(Interface): pass class IRetrieveStatus(Interface): pass class NotCapableError(Exception): """You have tried to write to a read-only node.""" class BadWriteEnablerError(Exception): pass class RIControlClient(RemoteInterface): def wait_for_client_connections(num_clients=int): """Do not return until we have connections to at least NUM_CLIENTS storage servers. """ # debug stuff def upload_random_data_from_file(size=int, convergence=str): return str def download_to_tempfile_and_delete(uri=str): return None def get_memory_usage(): """Return a dict that describes the amount of memory currently in use. The keys are 'VmPeak', 'VmSize', and 'VmData'. The values are integers, measuring memory consumption in bytes.""" return DictOf(str, int) def speed_test(count=int, size=int, mutable=Any()): """Write 'count' tempfiles to disk, all of the given size. Measure how long (in seconds) it takes to upload them all to the servers. Then measure how long it takes to download all of them. If 'mutable' is 'create', time creation of mutable files. If 'mutable' is 'upload', then time access to the same mutable file instead of creating one. Returns a tuple of (upload_time, download_time).
""" return (float, float) def measure_peer_response_time(): """Send a short message to each connected peer, and measure the time it takes for them to respond to it. This is a rough measure of the application-level round trip time. @return: a dictionary mapping peerid to a float (RTT time in seconds) """ return DictOf(str, float) UploadResults = Any() #DictOf(str, str) class RIEncryptedUploadable(RemoteInterface): __remote_name__ = "RIEncryptedUploadable.tahoe.allmydata.com" def get_size(): return Offset def get_all_encoding_parameters(): return (int, int, int, long) def read_encrypted(offset=Offset, length=ReadSize): return ListOf(str) def close(): return None class RICHKUploadHelper(RemoteInterface): __remote_name__ = "RIUploadHelper.tahoe.allmydata.com" def get_version(): """ Return a dictionary of version information. """ return DictOf(str, Any()) def upload(reader=RIEncryptedUploadable): return UploadResults class RIHelper(RemoteInterface): __remote_name__ = "RIHelper.tahoe.allmydata.com" def get_version(): """ Return a dictionary of version information. """ return DictOf(str, Any()) def upload_chk(si=StorageIndex): """See if a file with a given storage index needs uploading. The helper will ask the appropriate storage servers to see if the file has already been uploaded. If so, the helper will return a set of 'upload results' that includes whatever hashes are needed to build the read-cap, and perhaps a truncated sharemap. If the file has not yet been uploaded (or if it was only partially uploaded), the helper will return an empty upload-results dictionary and also an RICHKUploadHelper object that will take care of the upload process. The client should call upload() on this object and pass it a reference to an RIEncryptedUploadable object that will provide ciphertext. When the upload is finished, the upload() method will finish and return the upload results. """ return (UploadResults, ChoiceOf(RICHKUploadHelper, None)) class RIStatsProvider(RemoteInterface): __remote_name__ = "RIStatsProvider.tahoe.allmydata.com" """ Provides access to statistics and monitoring information. """ def get_stats(): """ returns a dictionary containing 'counters' and 'stats', each a dictionary with string counter/stat name keys, and numeric or None values. counters are monotonically increasing measures of work done, and stats are instantaneous measures (potentially time averaged internally) """ return DictOf(str, DictOf(str, ChoiceOf(float, int, long, None))) class RIStatsGatherer(RemoteInterface): __remote_name__ = "RIStatsGatherer.tahoe.allmydata.com" """ Provides a monitoring service for centralised collection of stats """ def provide(provider=RIStatsProvider, nickname=str): """ @param provider: a stats collector instance that should be polled periodically by the gatherer to collect stats. @param nickname: a name useful to identify the provided client """ return None class IStatsProducer(Interface): def get_stats(): """ returns a dictionary, with str keys representing the names of stats to be monitored, and numeric values. """ class RIKeyGenerator(RemoteInterface): __remote_name__ = "RIKeyGenerator.tahoe.allmydata.com" """ Provides a service offering to make RSA key pairs. """ def get_rsa_key_pair(key_size=int): """ @param key_size: the size of the signature key. @return: tuple(verifying_key, signing_key) """ return TupleOf(str, str) class FileTooLargeError(Exception): pass class IValidatedThingProxy(Interface): def start(): """ Acquire a thing and validate it. 
Return a deferred that is eventually fired with self if the thing is valid or errbacked if it can't be acquired or validated.""" class InsufficientVersionError(Exception): def __init__(self, needed, got): self.needed = needed self.got = got def __repr__(self): return "InsufficientVersionError(need '%s', got %s)" % (self.needed, self.got) class EmptyPathnameComponentError(Exception): """The webapi disallows empty pathname components.""" allmydata-tahoe-1.10.2/src/allmydata/mutable/0000755000175000017500000000000012556560072017161 5ustar ramramallmydata-tahoe-1.10.2/src/allmydata/mutable/servermap.py0000644000175000017500000014667312556560070021556 0ustar ramram import sys, time, copy from zope.interface import implements from itertools import count from twisted.internet import defer from twisted.python import failure from foolscap.api import DeadReferenceError, RemoteException, eventually, \ fireEventually from allmydata.util import base32, hashutil, log, deferredutil from allmydata.util.dictutil import DictOfSets from allmydata.storage.server import si_b2a from allmydata.interfaces import IServermapUpdaterStatus from pycryptopp.publickey import rsa from allmydata.mutable.common import MODE_CHECK, MODE_ANYTHING, MODE_WRITE, \ MODE_READ, MODE_REPAIR, CorruptShareError from allmydata.mutable.layout import SIGNED_PREFIX_LENGTH, MDMFSlotReadProxy class UpdateStatus: implements(IServermapUpdaterStatus) statusid_counter = count(0) def __init__(self): self.timings = {} self.timings["per_server"] = {} self.timings["cumulative_verify"] = 0.0 self.privkey_from = None self.problems = {} self.active = True self.storage_index = None self.mode = "?" self.status = "Not started" self.progress = 0.0 self.counter = self.statusid_counter.next() self.started = time.time() self.finished = None def add_per_server_time(self, server, op, sent, elapsed): assert op in ("query", "late", "privkey") if server not in self.timings["per_server"]: self.timings["per_server"][server] = [] self.timings["per_server"][server].append((op,sent,elapsed)) def get_started(self): return self.started def get_finished(self): return self.finished def get_storage_index(self): return self.storage_index def get_mode(self): return self.mode def get_servermap(self): return self.servermap def get_privkey_from(self): return self.privkey_from def using_helper(self): return False def get_size(self): return "-NA-" def get_status(self): return self.status def get_progress(self): return self.progress def get_active(self): return self.active def get_counter(self): return self.counter def set_storage_index(self, si): self.storage_index = si def set_mode(self, mode): self.mode = mode def set_privkey_from(self, server): self.privkey_from = server def set_status(self, status): self.status = status def set_progress(self, value): self.progress = value def set_active(self, value): self.active = value def set_finished(self, when): self.finished = when class ServerMap: """I record the placement of mutable shares. This object records which shares (of various versions) are located on which servers. One purpose I serve is to inform callers about which versions of the mutable file are recoverable and 'current'. A second purpose is to serve as a state marker for test-and-set operations. I am passed out of retrieval operations and back into publish operations, which means 'publish this new version, but only if nothing has changed since I last retrieved this data'. This reduces the chances of clobbering a simultaneous (uncoordinated) write. 
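    A rough usage sketch (illustrative only; the map-update and publish steps
    are paraphrased here, not exact signatures):

        servermap = ServerMap()
        # ... run a ServermapUpdater over it (e.g. in MODE_WRITE) ...
        best = servermap.best_recoverable_version()
        if best is not None:
            print servermap.summarize_version(best)
        # then hand this same servermap to the publish operation, so the
        # write is conditional on exactly what was observed here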
@ivar _known_shares: a dictionary, mapping a (server, shnum) tuple to a (versionid, timestamp) tuple. Each 'versionid' is a tuple of (seqnum, root_hash, IV, segsize, datalength, k, N, signed_prefix, offsets) @ivar _bad_shares: dict with keys of (server, shnum) tuples, describing shares that I should ignore (because a previous user of the servermap determined that they were invalid). The updater only locates a certain number of shares: if some of these turn out to have integrity problems and are unusable, the caller will need to mark those shares as bad, then re-update the servermap, then try again. The dict maps (server, shnum) tuple to old checkstring. """ def __init__(self): self._known_shares = {} self.unreachable_servers = set() # servers that didn't respond to queries self.reachable_servers = set() # servers that did respond to queries self._problems = [] # mostly for debugging self._bad_shares = {} # maps (server,shnum) to old checkstring self._last_update_mode = None self._last_update_time = 0 self.proxies = {} self.update_data = {} # shnum -> [(verinfo,(blockhashes,start,end)),..] # where blockhashes is a list of bytestrings (the result of # layout.MDMFSlotReadProxy.get_blockhashes), and start/end are both # (block,salt) tuple-of-bytestrings from get_block_and_salt() def copy(self): s = ServerMap() s._known_shares = self._known_shares.copy() # tuple->tuple s.unreachable_servers = set(self.unreachable_servers) s.reachable_servers = set(self.reachable_servers) s._problems = self._problems[:] s._bad_shares = self._bad_shares.copy() # tuple->str s._last_update_mode = self._last_update_mode s._last_update_time = self._last_update_time s.update_data = copy.deepcopy(self.update_data) return s def get_reachable_servers(self): return self.reachable_servers def mark_server_reachable(self, server): self.reachable_servers.add(server) def mark_server_unreachable(self, server): self.unreachable_servers.add(server) def mark_bad_share(self, server, shnum, checkstring): """This share was found to be bad, either in the checkstring or signature (detected during mapupdate), or deeper in the share (detected at retrieve time). Remove it from our list of useful shares, and remember that it is bad so we don't add it back again later. We record the share's old checkstring (which might be corrupted or badly signed) so that a repair operation can do the test-and-set using it as a reference.
""" key = (server, shnum) # record checkstring self._bad_shares[key] = checkstring self._known_shares.pop(key, None) def get_bad_shares(self): # key=(server,shnum) -> checkstring return self._bad_shares def add_new_share(self, server, shnum, verinfo, timestamp): """We've written a new share out, replacing any that was there before.""" key = (server, shnum) self._bad_shares.pop(key, None) self._known_shares[key] = (verinfo, timestamp) def add_problem(self, f): self._problems.append(f) def get_problems(self): return self._problems def set_last_update(self, mode, when): self._last_update_mode = mode self._last_update_time = when def get_last_update(self): return (self._last_update_mode, self._last_update_time) def dump(self, out=sys.stdout): print >>out, "servermap:" for ( (server, shnum), (verinfo, timestamp) ) in self._known_shares.items(): (seqnum, root_hash, IV, segsize, datalength, k, N, prefix, offsets_tuple) = verinfo print >>out, ("[%s]: sh#%d seq%d-%s %d-of-%d len%d" % (server.get_name(), shnum, seqnum, base32.b2a(root_hash)[:4], k, N, datalength)) if self._problems: print >>out, "%d PROBLEMS" % len(self._problems) for f in self._problems: print >>out, str(f) return out def all_servers(self): return set([server for (server, shnum) in self._known_shares]) def all_servers_for_version(self, verinfo): """Return a set of servers that hold shares for the given version.""" return set([server for ( (server, shnum), (verinfo2, timestamp) ) in self._known_shares.items() if verinfo == verinfo2]) def get_known_shares(self): # maps (server,shnum) to (versionid,timestamp) return self._known_shares def make_sharemap(self): """Return a dict that maps shnum to a set of servers that hold it.""" sharemap = DictOfSets() for (server, shnum) in self._known_shares: sharemap.add(shnum, server) return sharemap def make_versionmap(self): """Return a dict that maps versionid to sets of (shnum, server, timestamp) tuples.""" versionmap = DictOfSets() for ( (server, shnum), (verinfo, timestamp) ) in self._known_shares.items(): versionmap.add(verinfo, (shnum, server, timestamp)) return versionmap def debug_shares_on_server(self, server): # used by tests return set([shnum for (s, shnum) in self._known_shares if s == server]) def version_on_server(self, server, shnum): key = (server, shnum) if key in self._known_shares: (verinfo, timestamp) = self._known_shares[key] return verinfo return None def shares_available(self): """Return a dict that maps verinfo to tuples of (num_distinct_shares, k, N) tuples.""" versionmap = self.make_versionmap() all_shares = {} for verinfo, shares in versionmap.items(): s = set() for (shnum, server, timestamp) in shares: s.add(shnum) (seqnum, root_hash, IV, segsize, datalength, k, N, prefix, offsets_tuple) = verinfo all_shares[verinfo] = (len(s), k, N) return all_shares def highest_seqnum(self): available = self.shares_available() seqnums = [verinfo[0] for verinfo in available.keys()] seqnums.append(0) return max(seqnums) def summarize_version(self, verinfo): """Take a versionid, return a string that describes it.""" (seqnum, root_hash, IV, segsize, datalength, k, N, prefix, offsets_tuple) = verinfo return "seq%d-%s" % (seqnum, base32.b2a(root_hash)[:4]) def summarize_versions(self): """Return a string describing which versions we know about.""" versionmap = self.make_versionmap() bits = [] for (verinfo, shares) in versionmap.items(): vstr = self.summarize_version(verinfo) shnums = set([shnum for (shnum, server, timestamp) in shares]) bits.append("%d*%s" % (len(shnums), vstr)) return 
"/".join(bits) def recoverable_versions(self): """Return a set of versionids, one for each version that is currently recoverable.""" versionmap = self.make_versionmap() recoverable_versions = set() for (verinfo, shares) in versionmap.items(): (seqnum, root_hash, IV, segsize, datalength, k, N, prefix, offsets_tuple) = verinfo shnums = set([shnum for (shnum, server, timestamp) in shares]) if len(shnums) >= k: # this one is recoverable recoverable_versions.add(verinfo) return recoverable_versions def unrecoverable_versions(self): """Return a set of versionids, one for each version that is currently unrecoverable.""" versionmap = self.make_versionmap() unrecoverable_versions = set() for (verinfo, shares) in versionmap.items(): (seqnum, root_hash, IV, segsize, datalength, k, N, prefix, offsets_tuple) = verinfo shnums = set([shnum for (shnum, server, timestamp) in shares]) if len(shnums) < k: unrecoverable_versions.add(verinfo) return unrecoverable_versions def best_recoverable_version(self): """Return a single versionid, for the so-called 'best' recoverable version. Sequence number is the primary sort criteria, followed by root hash. Returns None if there are no recoverable versions.""" recoverable = list(self.recoverable_versions()) recoverable.sort() if recoverable: return recoverable[-1] return None def size_of_version(self, verinfo): """Given a versionid (perhaps returned by best_recoverable_version), return the size of the file in bytes.""" (seqnum, root_hash, IV, segsize, datalength, k, N, prefix, offsets_tuple) = verinfo return datalength def unrecoverable_newer_versions(self): # Return a dict of versionid -> health, for versions that are # unrecoverable and have later seqnums than any recoverable versions. # These indicate that a write will lose data. versionmap = self.make_versionmap() healths = {} # maps verinfo to (found,k) unrecoverable = set() highest_recoverable_seqnum = -1 for (verinfo, shares) in versionmap.items(): (seqnum, root_hash, IV, segsize, datalength, k, N, prefix, offsets_tuple) = verinfo shnums = set([shnum for (shnum, server, timestamp) in shares]) healths[verinfo] = (len(shnums),k) if len(shnums) < k: unrecoverable.add(verinfo) else: highest_recoverable_seqnum = max(seqnum, highest_recoverable_seqnum) newversions = {} for verinfo in unrecoverable: (seqnum, root_hash, IV, segsize, datalength, k, N, prefix, offsets_tuple) = verinfo if seqnum > highest_recoverable_seqnum: newversions[verinfo] = healths[verinfo] return newversions def needs_merge(self): # return True if there are multiple recoverable versions with the # same seqnum, meaning that MutableFileNode.read_best_version is not # giving you the whole story, and that using its data to do a # subsequent publish will lose information. recoverable_seqnums = [verinfo[0] for verinfo in self.recoverable_versions()] for seqnum in recoverable_seqnums: if recoverable_seqnums.count(seqnum) > 1: return True return False def get_update_data_for_share_and_verinfo(self, shnum, verinfo): """ I return the update data for the given shnum """ update_data = self.update_data[shnum] update_datum = [i[1] for i in update_data if i[0] == verinfo][0] return update_datum def set_update_data_for_share_and_verinfo(self, shnum, verinfo, data): """ I record the block hash tree for the given shnum. 
""" self.update_data.setdefault(shnum , []).append((verinfo, data)) class ServermapUpdater: def __init__(self, filenode, storage_broker, monitor, servermap, mode=MODE_READ, add_lease=False, update_range=None): """I update a servermap, locating a sufficient number of useful shares and remembering where they are located. """ self._node = filenode self._storage_broker = storage_broker self._monitor = monitor self._servermap = servermap self.mode = mode self._add_lease = add_lease self._running = True self._storage_index = filenode.get_storage_index() self._last_failure = None self._status = UpdateStatus() self._status.set_storage_index(self._storage_index) self._status.set_progress(0.0) self._status.set_mode(mode) self._servers_responded = set() # how much data should we read? # SDMF: # * if we only need the checkstring, then [0:75] # * if we need to validate the checkstring sig, then [543ish:799ish] # * if we need the verification key, then [107:436ish] # * the offset table at [75:107] tells us about the 'ish' # * if we need the encrypted private key, we want [-1216ish:] # * but we can't read from negative offsets # * the offset table tells us the 'ish', also the positive offset # MDMF: # * Checkstring? [0:72] # * If we want to validate the checkstring, then [0:72], [143:?] -- # the offset table will tell us for sure. # * If we need the verification key, we have to consult the offset # table as well. # At this point, we don't know which we are. Our filenode can # tell us, but it might be lying -- in some cases, we're # responsible for telling it which kind of file it is. self._read_size = 4000 if mode == MODE_CHECK: # we use unpack_prefix_and_signature, so we need 1k self._read_size = 1000 self._need_privkey = False if mode in (MODE_WRITE, MODE_REPAIR) and not self._node.get_privkey(): self._need_privkey = True # check+repair: repair requires the privkey, so if we didn't happen # to ask for it during the check, we'll have problems doing the # publish. self.fetch_update_data = False if mode == MODE_WRITE and update_range: # We're updating the servermap in preparation for an # in-place file update, so we need to fetch some additional # data from each share that we find. assert len(update_range) == 2 self.start_segment = update_range[0] self.end_segment = update_range[1] self.fetch_update_data = True prefix = si_b2a(self._storage_index)[:5] self._log_number = log.msg(format="SharemapUpdater(%(si)s): starting (%(mode)s)", si=prefix, mode=mode) def get_status(self): return self._status def log(self, *args, **kwargs): if "parent" not in kwargs: kwargs["parent"] = self._log_number if "facility" not in kwargs: kwargs["facility"] = "tahoe.mutable.mapupdate" return log.msg(*args, **kwargs) def update(self): """Update the servermap to reflect current conditions. Returns a Deferred that fires with the servermap once the update has finished.""" self._started = time.time() self._status.set_active(True) # self._valid_versions is a set of validated verinfo tuples. We just # use it to remember which versions had valid signatures, so we can # avoid re-checking the signatures for each share. self._valid_versions = set() self._done_deferred = defer.Deferred() # first, which servers should be talk to? Any that were in our old # servermap, plus "enough" others. self._queries_completed = 0 sb = self._storage_broker # All of the servers, permuted by the storage index, as usual. 
full_serverlist = list(sb.get_servers_for_psi(self._storage_index)) self.full_serverlist = full_serverlist # for use later, immutable self.extra_servers = full_serverlist[:] # servers are removed as we use them self._good_servers = set() # servers who had some shares self._servers_with_shares = set() #servers that we know have shares now self._empty_servers = set() # servers who don't have any shares self._bad_servers = set() # servers to whom our queries failed k = self._node.get_required_shares() # For what cases can these conditions work? if k is None: # make a guess k = 3 N = self._node.get_total_shares() if N is None: N = 10 self.EPSILON = k # we want to send queries to at least this many servers (although we # might not wait for all of their answers to come back) self.num_servers_to_query = k + self.EPSILON if self.mode in (MODE_CHECK, MODE_REPAIR): # We want to query all of the servers. initial_servers_to_query = list(full_serverlist) must_query = set(initial_servers_to_query) self.extra_servers = [] elif self.mode == MODE_WRITE: # we're planning to replace all the shares, so we want a good # chance of finding them all. We will keep searching until we've # seen epsilon that don't have a share. # We don't query all of the servers because that could take a while. self.num_servers_to_query = N + self.EPSILON initial_servers_to_query, must_query = self._build_initial_querylist() self.required_num_empty_servers = self.EPSILON # TODO: arrange to read lots of data from k-ish servers, to avoid # the extra round trip required to read large directories. This # might also avoid the round trip required to read the encrypted # private key. else: # MODE_READ, MODE_ANYTHING # 2*k servers is good enough. initial_servers_to_query, must_query = self._build_initial_querylist() # this is a set of servers that we are required to get responses # from: they are servers who used to have a share, so we need to know # where they currently stand, even if that means we have to wait for # a silently-lost TCP connection to time out. We remove servers from # this set as we get responses. self._must_query = set(must_query) # now initial_servers_to_query contains the servers that we should # ask, self.must_query contains the servers that we must have heard # from before we can consider ourselves finished, and # self.extra_servers contains the overflow (servers that we should # tap if we don't get enough responses) # I guess that self._must_query is a subset of # initial_servers_to_query? assert must_query.issubset(initial_servers_to_query) self._send_initial_requests(initial_servers_to_query) self._status.timings["initial_queries"] = time.time() - self._started return self._done_deferred def _build_initial_querylist(self): # we send queries to everyone who was already in the sharemap initial_servers_to_query = set(self._servermap.all_servers()) # and we must wait for responses from them must_query = set(initial_servers_to_query) while ((self.num_servers_to_query > len(initial_servers_to_query)) and self.extra_servers): initial_servers_to_query.add(self.extra_servers.pop(0)) return initial_servers_to_query, must_query def _send_initial_requests(self, serverlist): self._status.set_status("Sending %d initial queries" % len(serverlist)) self._queries_outstanding = set() for server in serverlist: self._queries_outstanding.add(server) self._do_query(server, self._storage_index, self._read_size) if not serverlist: # there is nobody to ask, so we need to short-circuit the state # machine. 
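            # (defer.maybeDeferred is used here so that a synchronous
            # exception raised by _check_for_done becomes a failed Deferred
            # and is routed to the _fatal_error errback below, rather than
            # escaping this method.)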
d = defer.maybeDeferred(self._check_for_done, None) d.addErrback(self._fatal_error) # control flow beyond this point: state machine. Receiving responses # from queries is the input. We might send out more queries, or we # might produce a result. return None def _do_query(self, server, storage_index, readsize): self.log(format="sending query to [%(name)s], readsize=%(readsize)d", name=server.get_name(), readsize=readsize, level=log.NOISY) started = time.time() self._queries_outstanding.add(server) d = self._do_read(server, storage_index, [], [(0, readsize)]) d.addCallback(self._got_results, server, readsize, storage_index, started) d.addErrback(self._query_failed, server) # errors that aren't handled by _query_failed (and errors caused by # _query_failed) get logged, but we still want to check for doneness. d.addErrback(log.err) d.addErrback(self._fatal_error) d.addCallback(self._check_for_done) return d def _do_read(self, server, storage_index, shnums, readv): ss = server.get_rref() if self._add_lease: # send an add-lease message in parallel. The results are handled # separately. This is sent before the slot_readv() so that we can # be sure the add_lease is retired by the time slot_readv comes # back (this relies upon our knowledge that the server code for # add_lease is synchronous). renew_secret = self._node.get_renewal_secret(server) cancel_secret = self._node.get_cancel_secret(server) d2 = ss.callRemote("add_lease", storage_index, renew_secret, cancel_secret) # we ignore success d2.addErrback(self._add_lease_failed, server, storage_index) d = ss.callRemote("slot_readv", storage_index, shnums, readv) return d def _got_corrupt_share(self, e, shnum, server, data, lp): """ I am called when a remote server returns a corrupt share in response to one of our queries. By corrupt, I mean a share without a valid signature. I then record the failure, notify the server of the corruption, and record the share as bad. """ f = failure.Failure(e) self.log(format="bad share: %(f_value)s", f_value=str(f), failure=f, parent=lp, level=log.WEIRD, umid="h5llHg") # Notify the server that its share is corrupt. self.notify_server_corruption(server, shnum, str(e)) # By flagging this as a bad server, we won't count any of # the other shares on that server as valid, though if we # happen to find a valid version string amongst those # shares, we'll keep track of it so that we don't need # to validate the signature on those again. self._bad_servers.add(server) self._last_failure = f # XXX: Use the reader for this? 
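        # The first SIGNED_PREFIX_LENGTH bytes of the share are its
        # checkstring (the signed prefix: seqnum, root hash, and related
        # header fields). Recording it via mark_bad_share lets a later
        # repair do its test-and-set against this bad share.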
checkstring = data[:SIGNED_PREFIX_LENGTH] self._servermap.mark_bad_share(server, shnum, checkstring) self._servermap.add_problem(f) def _got_results(self, datavs, server, readsize, storage_index, started): lp = self.log(format="got result from [%(name)s], %(numshares)d shares", name=server.get_name(), numshares=len(datavs)) ss = server.get_rref() now = time.time() elapsed = now - started def _done_processing(ignored=None): self._queries_outstanding.discard(server) self._servermap.mark_server_reachable(server) self._must_query.discard(server) self._queries_completed += 1 if not self._running: self.log("but we're not running, so we'll ignore it", parent=lp) _done_processing() self._status.add_per_server_time(server, "late", started, elapsed) return self._status.add_per_server_time(server, "query", started, elapsed) if datavs: self._good_servers.add(server) else: self._empty_servers.add(server) ds = [] for shnum,datav in datavs.items(): data = datav[0] reader = MDMFSlotReadProxy(ss, storage_index, shnum, data, data_is_everything=(len(data) < readsize)) # our goal, with each response, is to validate the version # information and share data as best we can at this point -- # we do this by validating the signature. To do this, we # need to do the following: # - If we don't already have the public key, fetch the # public key. We use this to validate the signature. if not self._node.get_pubkey(): # fetch and set the public key. d = reader.get_verification_key() d.addCallback(lambda results, shnum=shnum: self._try_to_set_pubkey(results, server, shnum, lp)) # XXX: Make self._pubkey_query_failed? d.addErrback(lambda error, shnum=shnum, data=data: self._got_corrupt_share(error, shnum, server, data, lp)) else: # we already have the public key. d = defer.succeed(None) # Neither of these two branches return anything of # consequence, so the first entry in our deferredlist will # be None. # - Next, we need the version information. We almost # certainly got this by reading the first thousand or so # bytes of the share on the storage server, so we # shouldn't need to fetch anything at this step. d2 = reader.get_verinfo() d2.addErrback(lambda error, shnum=shnum, data=data: self._got_corrupt_share(error, shnum, server, data, lp)) # - Next, we need the signature. For an SDMF share, it is # likely that we fetched this when doing our initial fetch # to get the version information. In MDMF, this lives at # the end of the share, so unless the file is quite small, # we'll need to do a remote fetch to get it. d3 = reader.get_signature() d3.addErrback(lambda error, shnum=shnum, data=data: self._got_corrupt_share(error, shnum, server, data, lp)) # Once we have all three of these responses, we can move on # to validating the signature # Does the node already have a privkey? If not, we'll try to # fetch it here. if self._need_privkey: d4 = reader.get_encprivkey() d4.addCallback(lambda results, shnum=shnum: self._try_to_validate_privkey(results, server, shnum, lp)) d4.addErrback(lambda error, shnum=shnum: self._privkey_query_failed(error, server, shnum, lp)) else: d4 = defer.succeed(None) if self.fetch_update_data: # fetch the block hash tree and first + last segment, as # configured earlier. # Then set them in wherever we happen to want to set # them. ds = [] # XXX: We do this above, too. Is there a good way to # make the two routines share the value without # introducing more roundtrips? 
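                # The four fetches gathered below are: the verinfo (again),
                # the block hash tree, and the first and last segments of
                # the update range, each segment arriving as a (block, salt)
                # pair.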
ds.append(reader.get_verinfo()) ds.append(reader.get_blockhashes()) ds.append(reader.get_block_and_salt(self.start_segment)) ds.append(reader.get_block_and_salt(self.end_segment)) d5 = deferredutil.gatherResults(ds) d5.addCallback(self._got_update_results_one_share, shnum) else: d5 = defer.succeed(None) dl = defer.DeferredList([d, d2, d3, d4, d5]) def _append_proxy(passthrough, shnum=shnum, reader=reader): # Store the proxy (with its cache) keyed by serverid and # version. _, (_,verinfo), _, _, _ = passthrough verinfo = self._make_verinfo_hashable(verinfo) self._servermap.proxies[(verinfo, server.get_serverid(), storage_index, shnum)] = reader return passthrough dl.addCallback(_append_proxy) dl.addBoth(self._turn_barrier) dl.addCallback(lambda results, shnum=shnum: self._got_signature_one_share(results, shnum, server, lp)) dl.addErrback(lambda error, shnum=shnum, data=data: self._got_corrupt_share(error, shnum, server, data, lp)) ds.append(dl) # dl is a deferred list that will fire when all of the shares # that we found on this server are done processing. When dl fires, # we know that processing is done, so we can decrement the # semaphore-like thing that we incremented earlier. dl = defer.DeferredList(ds, fireOnOneErrback=True) # Are we done? Done means that there are no more queries to # send, that there are no outstanding queries, and that we # haven't received any queries that are still processing. If we # are done, self._check_for_done will cause the done deferred # that we returned to our caller to fire, which tells them that # they have a complete servermap, and that we won't be touching # the servermap anymore. dl.addCallback(_done_processing) dl.addCallback(self._check_for_done) dl.addErrback(self._fatal_error) # all done! self.log("_got_results done", parent=lp, level=log.NOISY) return dl def _turn_barrier(self, result): """ I help the servermap updater avoid the recursion limit issues discussed in #237. """ return fireEventually(result) def _try_to_set_pubkey(self, pubkey_s, server, shnum, lp): if self._node.get_pubkey(): return # don't go through this again if we don't have to fingerprint = hashutil.ssk_pubkey_fingerprint_hash(pubkey_s) assert len(fingerprint) == 32 if fingerprint != self._node.get_fingerprint(): raise CorruptShareError(server, shnum, "pubkey doesn't match fingerprint") self._node._populate_pubkey(self._deserialize_pubkey(pubkey_s)) assert self._node.get_pubkey() def notify_server_corruption(self, server, shnum, reason): rref = server.get_rref() rref.callRemoteOnly("advise_corrupt_share", "mutable", self._storage_index, shnum, reason) def _got_signature_one_share(self, results, shnum, server, lp): # It is our job to give versioninfo to our caller. We need to # raise CorruptShareError if the share is corrupt for any # reason, something that our caller will handle. self.log(format="_got_results: got shnum #%(shnum)d from serverid %(name)s", shnum=shnum, name=server.get_name(), level=log.NOISY, parent=lp) if not self._running: # We can't process the results, since we can't touch the # servermap anymore. self.log("but we're not running anymore.") return None _, verinfo, signature, __, ___ = results verinfo = self._make_verinfo_hashable(verinfo[1]) # This tuple uniquely identifies a share on the grid; we use it # to keep track of the ones that we've already seen. 
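        # (Strictly speaking, verinfo names a whole version of the file
        # rather than a single share; the _valid_versions set below is keyed
        # on it so that each version's signature only needs to be checked
        # once.)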
(seqnum, root_hash, saltish, segsize, datalen, k, n, prefix, offsets_tuple) = verinfo if verinfo not in self._valid_versions: # This is a new version tuple, and we need to validate it # against the public key before keeping track of it. assert self._node.get_pubkey() valid = self._node.get_pubkey().verify(prefix, signature[1]) if not valid: raise CorruptShareError(server, shnum, "signature is invalid") # ok, it's a valid verinfo. Add it to the list of validated # versions. self.log(" found valid version %d-%s from %s-sh%d: %d-%d/%d/%d" % (seqnum, base32.b2a(root_hash)[:4], server.get_name(), shnum, k, n, segsize, datalen), parent=lp) self._valid_versions.add(verinfo) # We now know that this is a valid candidate verinfo. Whether or # not this instance of it is valid is a matter for the next # statement; at this point, we just know that if we see this # version info again, that its signature checks out and that # we're okay to skip the signature-checking step. # (server, shnum) are bound in the method invocation. if (server, shnum) in self._servermap.get_bad_shares(): # we've been told that the rest of the data in this share is # unusable, so don't add it to the servermap. self.log("but we've been told this is a bad share", parent=lp, level=log.UNUSUAL) return verinfo # Add the info to our servermap. timestamp = time.time() self._servermap.add_new_share(server, shnum, verinfo, timestamp) self._servers_with_shares.add(server) return verinfo def _make_verinfo_hashable(self, verinfo): (seqnum, root_hash, saltish, segsize, datalen, k, n, prefix, offsets) = verinfo offsets_tuple = tuple( [(key,value) for key,value in offsets.items()] ) verinfo = (seqnum, root_hash, saltish, segsize, datalen, k, n, prefix, offsets_tuple) return verinfo def _got_update_results_one_share(self, results, share): """ I record the update results in results. """ assert len(results) == 4 verinfo, blockhashes, start, end = results verinfo = self._make_verinfo_hashable(verinfo) update_data = (blockhashes, start, end) self._servermap.set_update_data_for_share_and_verinfo(share, verinfo, update_data) def _deserialize_pubkey(self, pubkey_s): verifier = rsa.create_verifying_key_from_string(pubkey_s) return verifier def _try_to_validate_privkey(self, enc_privkey, server, shnum, lp): """ Given a writekey from a remote server, I validate it against the writekey stored in my node. If it is valid, then I set the privkey and encprivkey properties of the node. 
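        Validation here means: decrypt the supplied encprivkey via the node,
        hash the result with ssk_writekey_hash, and accept it only if that
        hash equals the node's existing writekey.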
""" alleged_privkey_s = self._node._decrypt_privkey(enc_privkey) alleged_writekey = hashutil.ssk_writekey_hash(alleged_privkey_s) if alleged_writekey != self._node.get_writekey(): self.log("invalid privkey from %s shnum %d" % (server.get_name(), shnum), parent=lp, level=log.WEIRD, umid="aJVccw") return # it's good self.log("got valid privkey from shnum %d on serverid %s" % (shnum, server.get_name()), parent=lp) privkey = rsa.create_signing_key_from_string(alleged_privkey_s) self._node._populate_encprivkey(enc_privkey) self._node._populate_privkey(privkey) self._need_privkey = False self._status.set_privkey_from(server) def _add_lease_failed(self, f, server, storage_index): # Older versions of Tahoe didn't handle the add-lease message very # well: <=1.1.0 throws a NameError because it doesn't implement # remote_add_lease(), 1.2.0/1.3.0 throw IndexError on unknown buckets # (which is most of them, since we send add-lease to everybody, # before we know whether or not they have any shares for us), and # 1.2.0 throws KeyError even on known buckets due to an internal bug # in the latency-measuring code. # we want to ignore the known-harmless errors and log the others. In # particular we want to log any local errors caused by coding # problems. if f.check(DeadReferenceError): return if f.check(RemoteException): if f.value.failure.check(KeyError, IndexError, NameError): # this may ignore a bit too much, but that only hurts us # during debugging return self.log(format="error in add_lease from [%(name)s]: %(f_value)s", name=server.get_name(), f_value=str(f.value), failure=f, level=log.WEIRD, umid="iqg3mw") return # local errors are cause for alarm log.err(f, format="local error in add_lease to [%(name)s]: %(f_value)s", name=server.get_name(), f_value=str(f.value), level=log.WEIRD, umid="ZWh6HA") def _query_failed(self, f, server): if not self._running: return level = log.WEIRD if f.check(DeadReferenceError): level = log.UNUSUAL self.log(format="error during query: %(f_value)s", f_value=str(f.value), failure=f, level=level, umid="IHXuQg") self._must_query.discard(server) self._queries_outstanding.discard(server) self._bad_servers.add(server) self._servermap.add_problem(f) # a server could be in both ServerMap.reachable_servers and # .unreachable_servers if they responded to our query, but then an # exception was raised in _got_results. 
self._servermap.mark_server_unreachable(server) self._queries_completed += 1 self._last_failure = f def _privkey_query_failed(self, f, server, shnum, lp): self._queries_outstanding.discard(server) if not self._running: return level = log.WEIRD if f.check(DeadReferenceError): level = log.UNUSUAL self.log(format="error during privkey query: %(f_value)s", f_value=str(f.value), failure=f, parent=lp, level=level, umid="McoJ5w") self._servermap.add_problem(f) self._last_failure = f def _check_for_done(self, res): # exit paths: # return self._send_more_queries(outstanding) : send some more queries # return self._done() : all done # return : keep waiting, no new queries lp = self.log(format=("_check_for_done, mode is '%(mode)s', " "%(outstanding)d queries outstanding, " "%(extra)d extra servers available, " "%(must)d 'must query' servers left, " "need_privkey=%(need_privkey)s" ), mode=self.mode, outstanding=len(self._queries_outstanding), extra=len(self.extra_servers), must=len(self._must_query), need_privkey=self._need_privkey, level=log.NOISY, ) if not self._running: self.log("but we're not running", parent=lp, level=log.NOISY) return if self._must_query: # we are still waiting for responses from servers that used to have # a share, so we must continue to wait. No additional queries are # required at this time. self.log("%d 'must query' servers left" % len(self._must_query), level=log.NOISY, parent=lp) return if (not self._queries_outstanding and not self.extra_servers): # all queries have retired, and we have no servers left to ask. No # more progress can be made, therefore we are done. self.log("all queries are retired, no extra servers: done", parent=lp) return self._done() recoverable_versions = self._servermap.recoverable_versions() unrecoverable_versions = self._servermap.unrecoverable_versions() # what is our completion policy? how hard should we work? if self.mode == MODE_ANYTHING: if recoverable_versions: self.log("%d recoverable versions: done" % len(recoverable_versions), parent=lp) return self._done() if self.mode in (MODE_CHECK, MODE_REPAIR): # we used self._must_query, and we know there aren't any # responses still waiting, so that means we must be done self.log("done", parent=lp) return self._done() MAX_IN_FLIGHT = 5 if self.mode == MODE_READ: # if we've queried k+epsilon servers, and we see a recoverable # version, and we haven't seen any unrecoverable higher-seqnum'ed # versions, then we're done. if self._queries_completed < self.num_servers_to_query: self.log(format="%(completed)d completed, %(query)d to query: need more", completed=self._queries_completed, query=self.num_servers_to_query, level=log.NOISY, parent=lp) return self._send_more_queries(MAX_IN_FLIGHT) if not recoverable_versions: self.log("no recoverable versions: need more", level=log.NOISY, parent=lp) return self._send_more_queries(MAX_IN_FLIGHT) highest_recoverable = max(recoverable_versions) highest_recoverable_seqnum = highest_recoverable[0] for unrec_verinfo in unrecoverable_versions: if unrec_verinfo[0] > highest_recoverable_seqnum: # there is evidence of a higher-seqnum version, but we # don't yet see enough shares to recover it. Try harder. # TODO: consider sending more queries. # TODO: consider limiting the search distance self.log("evidence of higher seqnum: need more", level=log.UNUSUAL, parent=lp) return self._send_more_queries(MAX_IN_FLIGHT) # all the unrecoverable versions were old or concurrent with a # recoverable version. Good enough. 
self.log("no higher-seqnum: done", parent=lp) return self._done() if self.mode == MODE_WRITE: # we want to keep querying until we've seen a few that don't have # any shares, to be sufficiently confident that we've seen all # the shares. This is still less work than MODE_CHECK, which asks # every server in the world. if not recoverable_versions: self.log("no recoverable versions: need more", parent=lp, level=log.NOISY) return self._send_more_queries(MAX_IN_FLIGHT) last_found = -1 last_not_responded = -1 num_not_responded = 0 num_not_found = 0 states = [] found_boundary = False for i,server in enumerate(self.full_serverlist): if server in self._bad_servers: # query failed states.append("x") #self.log("loop [%s]: x" % server.get_name() elif server in self._empty_servers: # no shares states.append("0") #self.log("loop [%s]: 0" % server.get_name() if last_found != -1: num_not_found += 1 if num_not_found >= self.EPSILON: self.log("found our boundary, %s" % "".join(states), parent=lp, level=log.NOISY) found_boundary = True break elif server in self._servers_with_shares: # yes shares states.append("1") #self.log("loop [%s]: 1" % server.get_name() last_found = i num_not_found = 0 else: # not responded yet states.append("?") #self.log("loop [%s]: ?" % server.get_name() last_not_responded = i num_not_responded += 1 if found_boundary: # we need to know that we've gotten answers from # everybody to the left of here if last_not_responded == -1: # we're done self.log("have all our answers", parent=lp, level=log.NOISY) # .. unless we're still waiting on the privkey if self._need_privkey: self.log("but we're still waiting for the privkey", parent=lp, level=log.NOISY) # if we found the boundary but we haven't yet found # the privkey, we may need to look further. If # somehow all the privkeys were corrupted (but the # shares were readable), then this is likely to do an # exhaustive search. return self._send_more_queries(MAX_IN_FLIGHT) return self._done() # still waiting for somebody return self._send_more_queries(num_not_responded) # if we hit here, we didn't find our boundary, so we're still # waiting for servers self.log("no boundary yet, %s" % "".join(states), parent=lp, level=log.NOISY) return self._send_more_queries(MAX_IN_FLIGHT) # otherwise, keep up to 5 queries in flight. 
TODO: this is pretty # arbitrary, really I want this to be something like k - # max(known_version_sharecounts) + some extra self.log("catchall: need more", parent=lp, level=log.NOISY) return self._send_more_queries(MAX_IN_FLIGHT) def _send_more_queries(self, num_outstanding): more_queries = [] while True: self.log(format=" there are %(outstanding)d queries outstanding", outstanding=len(self._queries_outstanding), level=log.NOISY) active_queries = len(self._queries_outstanding) + len(more_queries) if active_queries >= num_outstanding: break if not self.extra_servers: break more_queries.append(self.extra_servers.pop(0)) self.log(format="sending %(more)d more queries: %(who)s", more=len(more_queries), who=" ".join(["[%s]" % s.get_name() for s in more_queries]), level=log.NOISY) for server in more_queries: self._do_query(server, self._storage_index, self._read_size) # we'll retrigger when those queries come back def _done(self): if not self._running: self.log("not running; we're already done") return self._running = False now = time.time() elapsed = now - self._started self._status.set_finished(now) self._status.timings["total"] = elapsed self._status.set_progress(1.0) self._status.set_status("Finished") self._status.set_active(False) self._servermap.set_last_update(self.mode, self._started) # the servermap will not be touched after this self.log("servermap: %s" % self._servermap.summarize_versions()) eventually(self._done_deferred.callback, self._servermap) def _fatal_error(self, f): self.log("fatal error", failure=f, level=log.WEIRD, umid="1cNvlw") self._done_deferred.errback(f) allmydata-tahoe-1.10.2/src/allmydata/mutable/checker.py0000644000175000017500000003050712556560070021142 0ustar ramram from allmydata.uri import from_string from allmydata.util import base32, log, dictutil from allmydata.util.happinessutil import servers_of_happiness from allmydata.check_results import CheckAndRepairResults, CheckResults from allmydata.mutable.common import MODE_CHECK, MODE_WRITE, CorruptShareError from allmydata.mutable.servermap import ServerMap, ServermapUpdater from allmydata.mutable.retrieve import Retrieve # for verifying class MutableChecker: SERVERMAP_MODE = MODE_CHECK def __init__(self, node, storage_broker, history, monitor): self._node = node self._storage_broker = storage_broker self._history = history self._monitor = monitor self.bad_shares = [] # list of (server,shnum,failure) self._storage_index = self._node.get_storage_index() self.need_repair = False self.responded = set() # set of (binary) nodeids def check(self, verify=False, add_lease=False): servermap = ServerMap() # Updating the servermap in MODE_CHECK will stand a good chance # of finding all of the shares, and getting a good idea of # recoverability, etc, without verifying. u = ServermapUpdater(self._node, self._storage_broker, self._monitor, servermap, self.SERVERMAP_MODE, add_lease=add_lease) if self._history: self._history.notify_mapupdate(u.get_status()) d = u.update() d.addCallback(self._got_mapupdate_results) if verify: d.addCallback(self._verify_all_shares) d.addCallback(lambda res: servermap) d.addCallback(self._make_checker_results) return d def _got_mapupdate_results(self, servermap): # the file is healthy if there is exactly one recoverable version, it # has at least N distinct shares, and there are no unrecoverable # versions: all existing shares will be for the same version. 
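        # In terms of the checks below: need_repair is set when
        # unrecoverable_versions() is non-empty, when the number of
        # recoverable versions is not exactly one, or when the best
        # recoverable version has fewer than N distinct shares.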
self._monitor.raise_if_cancelled() self.best_version = None num_recoverable = len(servermap.recoverable_versions()) if num_recoverable: self.best_version = servermap.best_recoverable_version() # The file is unhealthy and needs to be repaired if: # - There are unrecoverable versions. if servermap.unrecoverable_versions(): self.need_repair = True # - There isn't a recoverable version. if num_recoverable != 1: self.need_repair = True # - The best recoverable version is missing some shares. if self.best_version: available_shares = servermap.shares_available() (num_distinct_shares, k, N) = available_shares[self.best_version] if num_distinct_shares < N: self.need_repair = True return servermap def _verify_all_shares(self, servermap): # read every byte of each share # # This logic is going to be very nearly the same as the # downloader. I bet we could pass the downloader a flag that # makes it do this, and piggyback onto that instead of # duplicating a bunch of code. # # Like: # r = Retrieve(blah, blah, blah, verify=True) # d = r.download() # (wait, wait, wait, d.callback) # # Then, when it has finished, we can check the servermap (which # we provided to Retrieve) to figure out which shares are bad, # since the Retrieve process will have updated the servermap as # it went along. # # By passing the verify=True flag to the constructor, we are # telling the downloader a few things. # # 1. It needs to download all N shares, not just K shares. # 2. It doesn't need to decrypt or decode the shares, only # verify them. if not self.best_version: return r = Retrieve(self._node, self._storage_broker, servermap, self.best_version, verify=True) d = r.download() d.addCallback(self._process_bad_shares) return d def _process_bad_shares(self, bad_shares): if bad_shares: self.need_repair = True self.bad_shares = bad_shares def _count_shares(self, smap, version): available_shares = smap.shares_available() (num_distinct_shares, k, N) = available_shares[version] counters = {} counters["count-shares-good"] = num_distinct_shares counters["count-shares-needed"] = k counters["count-shares-expected"] = N good_hosts = smap.all_servers_for_version(version) counters["count-good-share-hosts"] = len(good_hosts) vmap = smap.make_versionmap() counters["count-wrong-shares"] = sum([len(shares) for verinfo,shares in vmap.items() if verinfo != version]) return counters def _make_checker_results(self, smap): self._monitor.raise_if_cancelled() healthy = True report = [] summary = [] vmap = smap.make_versionmap() recoverable = smap.recoverable_versions() unrecoverable = smap.unrecoverable_versions() if recoverable: report.append("Recoverable Versions: " + "/".join(["%d*%s" % (len(vmap[v]), smap.summarize_version(v)) for v in recoverable])) if unrecoverable: report.append("Unrecoverable Versions: " + "/".join(["%d*%s" % (len(vmap[v]), smap.summarize_version(v)) for v in unrecoverable])) if smap.unrecoverable_versions(): healthy = False summary.append("some versions are unrecoverable") report.append("Unhealthy: some versions are unrecoverable") if len(recoverable) == 0: healthy = False summary.append("no versions are recoverable") report.append("Unhealthy: no versions are recoverable") if len(recoverable) > 1: healthy = False summary.append("multiple versions are recoverable") report.append("Unhealthy: there are multiple recoverable versions") if recoverable: best_version = smap.best_recoverable_version() report.append("Best Recoverable Version: " + smap.summarize_version(best_version)) counters = self._count_shares(smap, best_version) s 
= counters["count-shares-good"] k = counters["count-shares-needed"] N = counters["count-shares-expected"] if s < N: healthy = False report.append("Unhealthy: best version has only %d shares " "(encoding is %d-of-%d)" % (s, k, N)) summary.append("%d shares (enc %d-of-%d)" % (s, k, N)) elif unrecoverable: healthy = False # find a k and N from somewhere first = list(unrecoverable)[0] # not exactly the best version, but that doesn't matter too much counters = self._count_shares(smap, first) else: # couldn't find anything at all counters = { "count-shares-good": 0, "count-shares-needed": 3, # arbitrary defaults "count-shares-expected": 10, "count-good-share-hosts": 0, "count-wrong-shares": 0, } corrupt_share_locators = [] problems = [] if self.bad_shares: report.append("Corrupt Shares:") summary.append("Corrupt Shares:") for (server, shnum, f) in sorted(self.bad_shares): serverid = server.get_serverid() locator = (server, self._storage_index, shnum) corrupt_share_locators.append(locator) s = "%s-sh%d" % (server.get_name(), shnum) if f.check(CorruptShareError): ft = f.value.reason else: ft = str(f) report.append(" %s: %s" % (s, ft)) summary.append(s) p = (serverid, self._storage_index, shnum, f) problems.append(p) msg = ("CorruptShareError during mutable verify, " "serverid=%(serverid)s, si=%(si)s, shnum=%(shnum)d, " "where=%(where)s") log.msg(format=msg, serverid=server.get_name(), si=base32.b2a(self._storage_index), shnum=shnum, where=ft, level=log.WEIRD, umid="EkK8QA") sharemap = dictutil.DictOfSets() for verinfo in vmap: for (shnum, server, timestamp) in vmap[verinfo]: shareid = "%s-sh%d" % (smap.summarize_version(verinfo), shnum) sharemap.add(shareid, server) if healthy: summary = "Healthy" else: summary = "Unhealthy: " + " ".join(summary) count_happiness = servers_of_happiness(sharemap) cr = CheckResults(from_string(self._node.get_uri()), self._storage_index, healthy=healthy, recoverable=bool(recoverable), count_happiness=count_happiness, count_shares_needed=counters["count-shares-needed"], count_shares_expected=counters["count-shares-expected"], count_shares_good=counters["count-shares-good"], count_good_share_hosts=counters["count-good-share-hosts"], count_recoverable_versions=len(recoverable), count_unrecoverable_versions=len(unrecoverable), servers_responding=list(smap.get_reachable_servers()), sharemap=sharemap, count_wrong_shares=counters["count-wrong-shares"], list_corrupt_shares=corrupt_share_locators, count_corrupt_shares=len(corrupt_share_locators), list_incompatible_shares=[], count_incompatible_shares=0, summary=summary, report=report, share_problems=problems, servermap=smap.copy()) return cr class MutableCheckAndRepairer(MutableChecker): SERVERMAP_MODE = MODE_WRITE # needed to get the privkey def __init__(self, node, storage_broker, history, monitor): MutableChecker.__init__(self, node, storage_broker, history, monitor) self.cr_results = CheckAndRepairResults(self._storage_index) self.need_repair = False def check(self, verify=False, add_lease=False): d = MutableChecker.check(self, verify, add_lease) d.addCallback(self._stash_pre_repair_results) d.addCallback(self._maybe_repair) d.addCallback(lambda res: self.cr_results) return d def _stash_pre_repair_results(self, pre_repair_results): self.cr_results.pre_repair_results = pre_repair_results return pre_repair_results def _maybe_repair(self, pre_repair_results): crr = self.cr_results self._monitor.raise_if_cancelled() if not self.need_repair: crr.post_repair_results = pre_repair_results return if self._node.is_readonly(): # 
ticket #625: we cannot yet repair read-only mutable files crr.post_repair_results = pre_repair_results crr.repair_attempted = False return crr.repair_attempted = True d = self._node.repair(pre_repair_results, monitor=self._monitor) def _repair_finished(rr): crr.repair_successful = rr.get_successful() crr.post_repair_results = self._make_checker_results(rr.servermap) crr.repair_results = rr # TODO? return def _repair_error(f): # I'm not sure if I want to pass through a failure or not. crr.repair_successful = False crr.repair_failure = f # TODO? #crr.post_repair_results = ?? return f d.addCallbacks(_repair_finished, _repair_error) return d allmydata-tahoe-1.10.2/src/allmydata/mutable/retrieve.py0000644000175000017500000012240512556560070021362 0ustar ramram import time from itertools import count from zope.interface import implements from twisted.internet import defer from twisted.python import failure from twisted.internet.interfaces import IPushProducer, IConsumer from foolscap.api import eventually, fireEventually, DeadReferenceError, \ RemoteException from allmydata.interfaces import IRetrieveStatus, NotEnoughSharesError, \ DownloadStopped, MDMF_VERSION, SDMF_VERSION from allmydata.util.assertutil import _assert, precondition from allmydata.util import hashutil, log, mathutil, deferredutil from allmydata.util.dictutil import DictOfSets from allmydata import hashtree, codec from allmydata.storage.server import si_b2a from pycryptopp.cipher.aes import AES from pycryptopp.publickey import rsa from allmydata.mutable.common import CorruptShareError, BadShareError, \ UncoordinatedWriteError from allmydata.mutable.layout import MDMFSlotReadProxy class RetrieveStatus: implements(IRetrieveStatus) statusid_counter = count(0) def __init__(self): self.timings = {} self.timings["fetch_per_server"] = {} self.timings["decode"] = 0.0 self.timings["decrypt"] = 0.0 self.timings["cumulative_verify"] = 0.0 self._problems = {} self.active = True self.storage_index = None self.helper = False self.encoding = ("?","?") self.size = None self.status = "Not started" self.progress = 0.0 self.counter = self.statusid_counter.next() self.started = time.time() def get_started(self): return self.started def get_storage_index(self): return self.storage_index def get_encoding(self): return self.encoding def using_helper(self): return self.helper def get_size(self): return self.size def get_status(self): return self.status def get_progress(self): return self.progress def get_active(self): return self.active def get_counter(self): return self.counter def get_problems(self): return self._problems def add_fetch_timing(self, server, elapsed): if server not in self.timings["fetch_per_server"]: self.timings["fetch_per_server"][server] = [] self.timings["fetch_per_server"][server].append(elapsed) def accumulate_decode_time(self, elapsed): self.timings["decode"] += elapsed def accumulate_decrypt_time(self, elapsed): self.timings["decrypt"] += elapsed def set_storage_index(self, si): self.storage_index = si def set_helper(self, helper): self.helper = helper def set_encoding(self, k, n): self.encoding = (k, n) def set_size(self, size): self.size = size def set_status(self, status): self.status = status def set_progress(self, value): self.progress = value def set_active(self, value): self.active = value def add_problem(self, server, f): serverid = server.get_serverid() self._problems[serverid] = f class Marker: pass class Retrieve: # this class is currently single-use. 
Eventually (in MDMF) we will make # it multi-use, in which case you can call download(range) multiple # times, and each will have a separate response chain. However the # Retrieve object will remain tied to a specific version of the file, and # will use a single ServerMap instance. implements(IPushProducer) def __init__(self, filenode, storage_broker, servermap, verinfo, fetch_privkey=False, verify=False): self._node = filenode _assert(self._node.get_pubkey()) self._storage_broker = storage_broker self._storage_index = filenode.get_storage_index() _assert(self._node.get_readkey()) self._last_failure = None prefix = si_b2a(self._storage_index)[:5] self._log_number = log.msg("Retrieve(%s): starting" % prefix) self._running = True self._decoding = False self._bad_shares = set() self.servermap = servermap self.verinfo = verinfo # TODO: make it possible to use self.verinfo.datalength instead (seqnum, root_hash, IV, segsize, datalength, k, N, prefix, offsets_tuple) = self.verinfo self._data_length = datalength # during repair, we may be called upon to grab the private key, since # it wasn't picked up during a verify=False checker run, and we'll # need it for repair to generate a new version. self._need_privkey = verify or (fetch_privkey and not self._node.get_privkey()) if self._need_privkey: # TODO: Evaluate the need for this. We'll use it if we want # to limit how many queries are on the wire for the privkey # at once. self._privkey_query_markers = [] # one Marker for each time we've # tried to get the privkey. # verify means that we are using the downloader logic to verify all # of our shares. This tells the downloader a few things. # # 1. We need to download all of the shares. # 2. We don't need to decode or decrypt the shares, since our # caller doesn't care about the plaintext, only the # information about which shares are or are not valid. # 3. When we are validating readers, we need to validate the # signature on the prefix. Do we? We already do this in the # servermap update? self._verify = verify self._status = RetrieveStatus() self._status.set_storage_index(self._storage_index) self._status.set_helper(False) self._status.set_progress(0.0) self._status.set_active(True) self._status.set_size(datalength) self._status.set_encoding(k, N) self.readers = {} self._stopped = False self._pause_deferred = None self._offset = None self._read_length = None self.log("got seqnum %d" % self.verinfo[0]) def get_status(self): return self._status def log(self, *args, **kwargs): if "parent" not in kwargs: kwargs["parent"] = self._log_number if "facility" not in kwargs: kwargs["facility"] = "tahoe.mutable.retrieve" return log.msg(*args, **kwargs) def _set_current_status(self, state): seg = "%d/%d" % (self._current_segment, self._last_segment) self._status.set_status("segment %s (%s)" % (seg, state)) ################### # IPushProducer def pauseProducing(self): """ I am called by my download target if we have produced too much data for it to handle. I make the downloader stop producing new data until my resumeProducing method is called. """ if self._pause_deferred is not None: return # fired when the download is unpaused. self._old_status = self._status.get_status() self._set_current_status("paused") self._pause_deferred = defer.Deferred() def resumeProducing(self): """ I am called by my download target once it is ready to begin receiving data again. 
""" if self._pause_deferred is None: return p = self._pause_deferred self._pause_deferred = None self._status.set_status(self._old_status) eventually(p.callback, None) def stopProducing(self): self._stopped = True self.resumeProducing() def _check_for_paused(self, res): """ I am called just before a write to the consumer. I return a Deferred that eventually fires with the data that is to be written to the consumer. If the download has not been paused, the Deferred fires immediately. Otherwise, the Deferred fires when the downloader is unpaused. """ if self._pause_deferred is not None: d = defer.Deferred() self._pause_deferred.addCallback(lambda ignored: d.callback(res)) return d return res def _check_for_stopped(self, res): if self._stopped: raise DownloadStopped("our Consumer called stopProducing()") return res def download(self, consumer=None, offset=0, size=None): precondition(self._verify or IConsumer.providedBy(consumer)) if size is None: size = self._data_length - offset if self._verify: _assert(size == self._data_length, (size, self._data_length)) self.log("starting download") self._done_deferred = defer.Deferred() if consumer: self._consumer = consumer # we provide IPushProducer, so streaming=True, per IConsumer. self._consumer.registerProducer(self, streaming=True) self._started = time.time() self._started_fetching = time.time() if size == 0: # short-circuit the rest of the process self._done() else: self._start_download(consumer, offset, size) return self._done_deferred def _start_download(self, consumer, offset, size): precondition((0 <= offset < self._data_length) and (size > 0) and (offset+size <= self._data_length), (offset, size, self._data_length)) self._offset = offset self._read_length = size self._setup_encoding_parameters() self._setup_download() # The download process beyond this is a state machine. # _add_active_servers will select the servers that we want to use # for the download, and then attempt to start downloading. After # each segment, it will check for doneness, reacting to broken # servers and corrupt shares as necessary. If it runs out of good # servers before downloading all of the segments, _done_deferred # will errback. Otherwise, it will eventually callback with the # contents of the mutable file. self.loop() def loop(self): d = fireEventually(None) # avoid #237 recursion limit problem d.addCallback(lambda ign: self._activate_enough_servers()) d.addCallback(lambda ign: self._download_current_segment()) # when we're done, _download_current_segment will call _done. If we # aren't, it will call loop() again. d.addErrback(self._error) def _setup_download(self): self._status.set_status("Retrieving Shares") # how many shares do we need? (seqnum, root_hash, IV, segsize, datalength, k, N, prefix, offsets_tuple) = self.verinfo # first, which servers can we use? versionmap = self.servermap.make_versionmap() shares = versionmap[self.verinfo] # this sharemap is consumed as we decide to send requests self.remaining_sharemap = DictOfSets() for (shnum, server, timestamp) in shares: self.remaining_sharemap.add(shnum, server) # Reuse the SlotReader from the servermap. 
key = (self.verinfo, server.get_serverid(), self._storage_index, shnum) if key in self.servermap.proxies: reader = self.servermap.proxies[key] else: reader = MDMFSlotReadProxy(server.get_rref(), self._storage_index, shnum, None) reader.server = server self.readers[shnum] = reader if len(self.remaining_sharemap) < k: self._raise_notenoughshareserror() self.shares = {} # maps shnum to validated blocks self._active_readers = [] # list of active readers for this dl. self._block_hash_trees = {} # shnum => hashtree for i in xrange(self._total_shares): # So we don't have to do this later. self._block_hash_trees[i] = hashtree.IncompleteHashTree(self._num_segments) # We need one share hash tree for the entire file; its leaves # are the roots of the block hash trees for the shares that # comprise it, and its root is in the verinfo. self.share_hash_tree = hashtree.IncompleteHashTree(N) self.share_hash_tree.set_hashes({0: root_hash}) def decode(self, blocks_and_salts, segnum): """ I am a helper method that the mutable file update process uses as a shortcut to decode and decrypt the segments that it needs to fetch in order to perform a file update. I take in a collection of blocks and salts, and pick some of those to make a segment with. I return the plaintext associated with that segment. """ # We don't need the block hash trees in this case. self._block_hash_trees = None self._offset = 0 self._read_length = self._data_length self._setup_encoding_parameters() # _decode_blocks() expects the output of a gatherResults that # contains the outputs of _validate_block() (each of which is a dict # mapping shnum to (block,salt) bytestrings). d = self._decode_blocks([blocks_and_salts], segnum) d.addCallback(self._decrypt_segment) return d def _setup_encoding_parameters(self): """ I set up the encoding parameters, including k, n, the number of segments associated with this file, and the segment decoders. """ (seqnum, root_hash, IV, segsize, datalength, k, n, known_prefix, offsets_tuple) = self.verinfo self._required_shares = k self._total_shares = n self._segment_size = segsize #self._data_length = datalength # set during __init__() if not IV: self._version = MDMF_VERSION else: self._version = SDMF_VERSION if datalength and segsize: self._num_segments = mathutil.div_ceil(datalength, segsize) self._tail_data_size = datalength % segsize else: self._num_segments = 0 self._tail_data_size = 0 self._segment_decoder = codec.CRSDecoder() self._segment_decoder.set_params(segsize, k, n) if not self._tail_data_size: self._tail_data_size = segsize self._tail_segment_size = mathutil.next_multiple(self._tail_data_size, self._required_shares) if self._tail_segment_size == self._segment_size: self._tail_decoder = self._segment_decoder else: self._tail_decoder = codec.CRSDecoder() self._tail_decoder.set_params(self._tail_segment_size, self._required_shares, self._total_shares) self.log("got encoding parameters: " "k: %d " "n: %d " "%d segments of %d bytes each (%d byte tail segment)" % \ (k, n, self._num_segments, self._segment_size, self._tail_segment_size)) # Our last task is to tell the downloader where to start and # where to stop. We use three parameters for that: # - self._start_segment: the segment that we need to start # downloading from. # - self._current_segment: the next segment that we need to # download. # - self._last_segment: The last segment that we were asked to # download. # # We say that the download is complete when # self._current_segment > self._last_segment. 
We use # self._start_segment and self._last_segment to know when to # strip things off of segments, and how much to strip. if self._offset: self.log("got offset: %d" % self._offset) # our start segment is the first segment containing the # offset we were given. start = self._offset // self._segment_size _assert(start <= self._num_segments, start=start, num_segments=self._num_segments, offset=self._offset, segment_size=self._segment_size) self._start_segment = start self.log("got start segment: %d" % self._start_segment) else: self._start_segment = 0 # We might want to read only part of the file, and need to figure out # where to stop reading. Our end segment is the last segment # containing part of the segment that we were asked to read. _assert(self._read_length > 0, self._read_length) end_data = self._offset + self._read_length # We don't actually need to read the byte at end_data, but the one # before it. end = (end_data - 1) // self._segment_size _assert(0 <= end < self._num_segments, end=end, num_segments=self._num_segments, end_data=end_data, offset=self._offset, read_length=self._read_length, segment_size=self._segment_size) self._last_segment = end self.log("got end segment: %d" % self._last_segment) self._current_segment = self._start_segment def _activate_enough_servers(self): """ I populate self._active_readers with enough active readers to retrieve the contents of this mutable file. I am called before downloading starts, and (eventually) after each validation error, connection error, or other problem in the download. """ # TODO: It would be cool to investigate other heuristics for # reader selection. For instance, the cost (in time the user # spends waiting for their file) of selecting a really slow server # that happens to have a primary share is probably more than # selecting a really fast server that doesn't have a primary # share. Maybe the servermap could be extended to provide this # information; it could keep track of latency information while # it gathers more important data, and then this routine could # use that to select active readers. # # (these and other questions would be easier to answer with a # robust, configurable tahoe-lafs simulator, which modeled node # failures, differences in node speed, and other characteristics # that we expect storage servers to have. You could have # presets for really stable grids (like allmydata.com), # friendnets, make it easy to configure your own settings, and # then simulate the effect of big changes on these use cases # instead of just reasoning about what the effect might be. Out # of scope for MDMF, though.) # XXX: Why don't format= log messages work here? known_shnums = set(self.remaining_sharemap.keys()) used_shnums = set([r.shnum for r in self._active_readers]) unused_shnums = known_shnums - used_shnums if self._verify: new_shnums = unused_shnums # use them all elif len(self._active_readers) < self._required_shares: # need more shares more = self._required_shares - len(self._active_readers) # We favor lower numbered shares, since FEC is faster with # primary shares than with other shares, and lower-numbered # shares are more likely to be primary than higher numbered # shares. new_shnums = sorted(unused_shnums)[:more] if len(new_shnums) < more: # We don't have enough readers to retrieve the file; fail. 
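# Standalone restatement (illustrative only) of the segment arithmetic worked
# out above: which segments the byte range [offset, offset+length) touches,
# and how much of the first and last plaintext segments to strip.
def segment_range(offset, length, segment_size):
    assert length > 0
    start = offset // segment_size                  # first segment needed
    last = (offset + length - 1) // segment_size    # segment holding the final byte
    head_skip = offset % segment_size               # bytes to drop from the first segment
    tail_keep = (offset + length) % segment_size    # bytes to keep of the last (0 means keep all)
    return start, last, head_skip, tail_keep

# e.g. a 100-byte read at offset 130000 with 128 KiB segments touches only segment 0:
assert segment_range(130000, 100, 131072) == (0, 0, 130000, 130100)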
self._raise_notenoughshareserror() else: new_shnums = [] self.log("adding %d new servers to the active list" % len(new_shnums)) for shnum in new_shnums: reader = self.readers[shnum] self._active_readers.append(reader) self.log("added reader for share %d" % shnum) # Each time we add a reader, we check to see if we need the # private key. If we do, we politely ask for it and then continue # computing. If we find that we haven't gotten it at the end of # segment decoding, then we'll take more drastic measures. if self._need_privkey and not self._node.is_readonly(): d = reader.get_encprivkey() d.addCallback(self._try_to_validate_privkey, reader, reader.server) # XXX: don't just drop the Deferred. We need error-reporting # but not flow-control here. def _try_to_validate_prefix(self, prefix, reader): """ I check that the prefix returned by a candidate server for retrieval matches the prefix that the servermap knows about (and, hence, the prefix that was validated earlier). If it does, I return True, which means that I approve of the use of the candidate server for segment retrieval. If it doesn't, I return False, which means that another server must be chosen. """ (seqnum, root_hash, IV, segsize, datalength, k, N, known_prefix, offsets_tuple) = self.verinfo if known_prefix != prefix: self.log("prefix from share %d doesn't match" % reader.shnum) raise UncoordinatedWriteError("Mismatched prefix -- this could " "indicate an uncoordinated write") # Otherwise, we're okay -- no issues. def _mark_bad_share(self, server, shnum, reader, f): """ I mark the given (server, shnum) as a bad share, which means that it will not be used anywhere else. There are several reasons to want to mark something as a bad share. These include: - A connection error to the server. - A mismatched prefix (that is, a prefix that does not match our local conception of the version information string). - A failing block hash, salt hash, share hash, or other integrity check. This method will ensure that readers that we wish to mark bad (for these reasons or other reasons) are not used for the rest of the download. Additionally, it will attempt to tell the remote server (with no guarantee of success) that its share is corrupt. """ self.log("marking share %d on server %s as bad" % \ (shnum, server.get_name())) prefix = self.verinfo[-2] self.servermap.mark_bad_share(server, shnum, prefix) self._bad_shares.add((server, shnum, f)) self._status.add_problem(server, f) self._last_failure = f # Remove the reader from _active_readers self._active_readers.remove(reader) for shnum in list(self.remaining_sharemap.keys()): self.remaining_sharemap.discard(shnum, reader.server) if f.check(BadShareError): self.notify_server_corruption(server, shnum, str(f.value)) def _download_current_segment(self): """ I download, validate, decode, decrypt, and assemble the segment that this Retrieve is currently responsible for downloading. """ if self._current_segment > self._last_segment: # No more segments to download, we're done. self.log("got plaintext, done") return self._done() elif self._verify and len(self._active_readers) == 0: self.log("no more good shares, no need to keep verifying") return self._done() self.log("on segment %d of %d" % (self._current_segment + 1, self._num_segments)) d = self._process_segment(self._current_segment) d.addCallback(lambda ign: self.loop()) return d def _process_segment(self, segnum): """ I download, validate, decode, and decrypt one segment of the file that this Retrieve is retrieving. 
This means coordinating the process of getting k blocks of that file, validating them, assembling them into one segment with the decoder, and then decrypting them. """ self.log("processing segment %d" % segnum) # TODO: The old code uses a marker. Should this code do that # too? What did the Marker do? # We need to ask each of our active readers for its block and # salt. We will then validate those. If validation is # successful, we will assemble the results into plaintext. ds = [] for reader in self._active_readers: started = time.time() d1 = reader.get_block_and_salt(segnum) d2,d3 = self._get_needed_hashes(reader, segnum) d = deferredutil.gatherResults([d1,d2,d3]) d.addCallback(self._validate_block, segnum, reader, reader.server, started) # _handle_bad_share takes care of recoverable errors (by dropping # that share and returning None). Any other errors (i.e. code # bugs) are passed through and cause the retrieve to fail. d.addErrback(self._handle_bad_share, [reader]) ds.append(d) dl = deferredutil.gatherResults(ds) if self._verify: dl.addCallback(lambda ignored: "") dl.addCallback(self._set_segment) else: dl.addCallback(self._maybe_decode_and_decrypt_segment, segnum) return dl def _maybe_decode_and_decrypt_segment(self, results, segnum): """ I take the results of fetching and validating the blocks from _process_segment. If validation and fetching succeeded without incident, I will proceed with decoding and decryption. Otherwise, I will do nothing. """ self.log("trying to decode and decrypt segment %d" % segnum) # 'results' is the output of a gatherResults set up in # _process_segment(). Each component Deferred will either contain the # non-Failure output of _validate_block() for a single block (i.e. # {segnum:(block,salt)}), or None if _validate_block threw an # exception and _validation_or_decoding_failed handled it (by # dropping that server). if None in results: self.log("some validation operations failed; not proceeding") return defer.succeed(None) self.log("everything looks ok, building segment %d" % segnum) d = self._decode_blocks(results, segnum) d.addCallback(self._decrypt_segment) # check to see whether we've been paused before writing # anything. d.addCallback(self._check_for_paused) d.addCallback(self._check_for_stopped) d.addCallback(self._set_segment) return d def _set_segment(self, segment): """ Given a plaintext segment, I register that segment with the target that is handling the file download. """ self.log("got plaintext for segment %d" % self._current_segment) if self._read_length == 0: self.log("on first+last segment, size=0, using 0 bytes") segment = b"" if self._current_segment == self._last_segment: # trim off the tail wanted = (self._offset + self._read_length) % self._segment_size if wanted != 0: self.log("on the last segment: using first %d bytes" % wanted) segment = segment[:wanted] else: self.log("on the last segment: using all %d bytes" % len(segment)) if self._current_segment == self._start_segment: # Trim off the head, if offset != 0. This should also work if # start==last, because we trim the tail first. skip = self._offset % self._segment_size self.log("on the first segment: skipping first %d bytes" % skip) segment = segment[skip:] if not self._verify: self._consumer.write(segment) else: # we don't care about the plaintext if we are doing a verify. 
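# Sketch of the per-segment fan-out used by _process_segment above: ask each
# active reader for its block in parallel, turn recoverable per-share failures
# into None, and only decode when every component came back clean. fetch,
# validate and decode are stand-ins for the real reader/decoder calls; only
# Twisted's defer module is assumed.
from twisted.internet import defer

def fetch_segment(readers, segnum, fetch, validate, decode):
    ds = []
    for reader in readers:
        d = fetch(reader, segnum)                 # Deferred -> (block, salt)
        d.addCallback(validate, reader, segnum)   # raises on a corrupt block
        # the real code only traps DeadReferenceError/RemoteException/BadShareError;
        # here we simply drop the share and keep going
        d.addErrback(lambda f, r=reader: None)
        ds.append(d)
    dl = defer.gatherResults(ds)
    def _maybe_decode(results):
        if None in results:
            return None    # a share was dropped; the caller retries with new readers
        return decode(results, segnum)
    dl.addCallback(_maybe_decode)
    return dl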
segment = None self._current_segment += 1 def _handle_bad_share(self, f, readers): """ I am called when a block or a salt fails to correctly validate, or when the decryption or decoding operation fails for some reason. I react to this failure by notifying the remote server of corruption, and then removing the remote server from further activity. """ # these are the errors we can tolerate: by giving up on this share # and finding others to replace it. Any other errors (i.e. coding # bugs) are re-raised, causing the download to fail. f.trap(DeadReferenceError, RemoteException, BadShareError) # DeadReferenceError happens when we try to fetch data from a server # that has gone away. RemoteException happens if the server had an # internal error. BadShareError encompasses: (UnknownVersionError, # LayoutInvalid, struct.error) which happen when we get obviously # wrong data, and CorruptShareError which happens later, when we # perform integrity checks on the data. precondition(isinstance(readers, list), readers) bad_shnums = [reader.shnum for reader in readers] self.log("validation or decoding failed on share(s) %s, server(s) %s " ", segment %d: %s" % \ (bad_shnums, readers, self._current_segment, str(f))) for reader in readers: self._mark_bad_share(reader.server, reader.shnum, reader, f) return None def _validate_block(self, results, segnum, reader, server, started): """ I validate a block from one share on a remote server. """ # Grab the part of the block hash tree that is necessary to # validate this block, then generate the block hash root. self.log("validating share %d for segment %d" % (reader.shnum, segnum)) elapsed = time.time() - started self._status.add_fetch_timing(server, elapsed) self._set_current_status("validating blocks") block_and_salt, blockhashes, sharehashes = results block, salt = block_and_salt _assert(type(block) is str, (block, salt)) blockhashes = dict(enumerate(blockhashes)) self.log("the reader gave me the following blockhashes: %s" % \ blockhashes.keys()) self.log("the reader gave me the following sharehashes: %s" % \ sharehashes.keys()) bht = self._block_hash_trees[reader.shnum] if bht.needed_hashes(segnum, include_leaf=True): try: bht.set_hashes(blockhashes) except (hashtree.BadHashError, hashtree.NotEnoughHashesError, \ IndexError), e: raise CorruptShareError(server, reader.shnum, "block hash tree failure: %s" % e) if self._version == MDMF_VERSION: blockhash = hashutil.block_hash(salt + block) else: blockhash = hashutil.block_hash(block) # If this works without an error, then validation is # successful. try: bht.set_hashes(leaves={segnum: blockhash}) except (hashtree.BadHashError, hashtree.NotEnoughHashesError, \ IndexError), e: raise CorruptShareError(server, reader.shnum, "block hash tree failure: %s" % e) # Reaching this point means that we know that this segment # is correct. Now we need to check to see whether the share # hash chain is also correct. # SDMF wrote share hash chains that didn't contain the # leaves, which would be produced from the block hash tree. # So we need to validate the block hash tree first. If # successful, then bht[0] will contain the root for the # shnum, which will be a leaf in the share hash tree, which # will allow us to validate the rest of the tree. 
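# Illustrative check corresponding to the block-hash validation above. Plain
# SHA-256 stands in for allmydata's tagged block_hash(), and a flat dict of
# leaves stands in for the IncompleteHashTree the real code uses.
import hashlib

def block_leaf_hash(block, salt=None):
    h = hashlib.sha256()
    if salt is not None:          # MDMF hashes the per-segment salt with the block
        h.update(salt)
    h.update(block)
    return h.digest()

def validate_block(expected_leaves, segnum, block, salt=None):
    if expected_leaves[segnum] != block_leaf_hash(block, salt):
        raise ValueError("block hash mismatch for segment %d" % segnum)
    return block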
try: self.share_hash_tree.set_hashes(hashes=sharehashes, leaves={reader.shnum: bht[0]}) except (hashtree.BadHashError, hashtree.NotEnoughHashesError, \ IndexError), e: raise CorruptShareError(server, reader.shnum, "corrupt hashes: %s" % e) self.log('share %d is valid for segment %d' % (reader.shnum, segnum)) return {reader.shnum: (block, salt)} def _get_needed_hashes(self, reader, segnum): """ I get the hashes needed to validate segnum from the reader, then return to my caller when this is done. """ bht = self._block_hash_trees[reader.shnum] needed = bht.needed_hashes(segnum, include_leaf=True) # The root of the block hash tree is also a leaf in the share # hash tree. So we don't need to fetch it from the remote # server. In the case of files with one segment, this means that # we won't fetch any block hash tree from the remote server, # since the hash of each share of the file is the entire block # hash tree, and is a leaf in the share hash tree. This is fine, # since any share corruption will be detected in the share hash # tree. #needed.discard(0) self.log("getting blockhashes for segment %d, share %d: %s" % \ (segnum, reader.shnum, str(needed))) # TODO is force_remote necessary here? d1 = reader.get_blockhashes(needed, force_remote=False) if self.share_hash_tree.needed_hashes(reader.shnum): need = self.share_hash_tree.needed_hashes(reader.shnum) self.log("also need sharehashes for share %d: %s" % (reader.shnum, str(need))) d2 = reader.get_sharehashes(need, force_remote=False) else: d2 = defer.succeed({}) # the logic in the next method # expects a dict return d1,d2 def _decode_blocks(self, results, segnum): """ I take a list of k blocks and salts, and decode that into a single encrypted segment. """ # 'results' is one or more dicts (each {shnum:(block,salt)}), and we # want to merge them all blocks_and_salts = {} for d in results: blocks_and_salts.update(d) # All of these blocks should have the same salt; in SDMF, it is # the file-wide IV, while in MDMF it is the per-segment salt. In # either case, we just need to get one of them and use it. # # d.items()[0] is like (shnum, (block, salt)) # d.items()[0][1] is like (block, salt) # d.items()[0][1][1] is the salt. salt = blocks_and_salts.items()[0][1][1] # Next, extract just the blocks from the dict. We'll use the # salt in the next step. 
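# Sketch of the padding/trimming bookkeeping around FEC: before encoding, a
# segment is padded out to a multiple of k equal-sized pieces; after decoding,
# the join of the k primary blocks is trimmed back to the real data size.
# (Pure arithmetic only; zfec itself is not invoked here.)
def pad_segment(data, k):
    piece = -(-len(data) // k)                    # ceil(len/k): size of each piece
    return data + b"\x00" * (piece * k - len(data)), piece

def trim_segment(joined, real_size):
    return joined[:real_size]

padded, piece = pad_segment(b"hello world", 4)
assert (len(padded), piece) == (12, 3)
assert trim_segment(padded, 11) == b"hello world"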
share_and_shareids = [(k, v[0]) for k, v in blocks_and_salts.items()] d2 = dict(share_and_shareids) shareids = [] shares = [] for shareid, share in d2.items(): shareids.append(shareid) shares.append(share) self._set_current_status("decoding") started = time.time() _assert(len(shareids) >= self._required_shares, len(shareids)) # zfec really doesn't want extra shares shareids = shareids[:self._required_shares] shares = shares[:self._required_shares] self.log("decoding segment %d" % segnum) if segnum == self._num_segments - 1: d = defer.maybeDeferred(self._tail_decoder.decode, shares, shareids) else: d = defer.maybeDeferred(self._segment_decoder.decode, shares, shareids) def _process(buffers): segment = "".join(buffers) self.log(format="now decoding segment %(segnum)s of %(numsegs)s", segnum=segnum, numsegs=self._num_segments, level=log.NOISY) self.log(" joined length %d, datalength %d" % (len(segment), self._data_length)) if segnum == self._num_segments - 1: size_to_use = self._tail_data_size else: size_to_use = self._segment_size segment = segment[:size_to_use] self.log(" segment len=%d" % len(segment)) self._status.accumulate_decode_time(time.time() - started) return segment, salt d.addCallback(_process) return d def _decrypt_segment(self, segment_and_salt): """ I take a single segment and its salt, and decrypt it. I return the plaintext of the segment that is in my argument. """ segment, salt = segment_and_salt self._set_current_status("decrypting") self.log("decrypting segment %d" % self._current_segment) started = time.time() key = hashutil.ssk_readkey_data_hash(salt, self._node.get_readkey()) decryptor = AES(key) plaintext = decryptor.process(segment) self._status.accumulate_decrypt_time(time.time() - started) return plaintext def notify_server_corruption(self, server, shnum, reason): rref = server.get_rref() rref.callRemoteOnly("advise_corrupt_share", "mutable", self._storage_index, shnum, reason) def _try_to_validate_privkey(self, enc_privkey, reader, server): alleged_privkey_s = self._node._decrypt_privkey(enc_privkey) alleged_writekey = hashutil.ssk_writekey_hash(alleged_privkey_s) if alleged_writekey != self._node.get_writekey(): self.log("invalid privkey from %s shnum %d" % (reader, reader.shnum), level=log.WEIRD, umid="YIw4tA") if self._verify: self.servermap.mark_bad_share(server, reader.shnum, self.verinfo[-2]) e = CorruptShareError(server, reader.shnum, "invalid privkey") f = failure.Failure(e) self._bad_shares.add((server, reader.shnum, f)) return # it's good self.log("got valid privkey from shnum %d on reader %s" % (reader.shnum, reader)) privkey = rsa.create_signing_key_from_string(alleged_privkey_s) self._node._populate_encprivkey(enc_privkey) self._node._populate_privkey(privkey) self._need_privkey = False def _done(self): """ I am called by _download_current_segment when the download process has finished successfully. After making some useful logging statements, I return the decrypted contents to the owner of this Retrieve object through self._done_deferred. 
""" self._running = False self._status.set_active(False) now = time.time() self._status.timings['total'] = now - self._started self._status.timings['fetch'] = now - self._started_fetching self._status.set_status("Finished") self._status.set_progress(1.0) # remember the encoding parameters, use them again next time (seqnum, root_hash, IV, segsize, datalength, k, N, prefix, offsets_tuple) = self.verinfo self._node._populate_required_shares(k) self._node._populate_total_shares(N) if self._verify: ret = self._bad_shares self.log("done verifying, found %d bad shares" % len(ret)) else: # TODO: upload status here? ret = self._consumer self._consumer.unregisterProducer() eventually(self._done_deferred.callback, ret) def _raise_notenoughshareserror(self): """ I am called when there are not enough active servers left to complete the download. After making some useful logging statements, I throw an exception to that effect to the caller of this Retrieve object through self._done_deferred. """ format = ("ran out of servers: " "have %(have)d of %(total)d segments; " "found %(bad)d bad shares; " "have %(remaining)d remaining shares of the right version; " "encoding %(k)d-of-%(n)d") args = {"have": self._current_segment, "total": self._num_segments, "need": self._last_segment, "k": self._required_shares, "n": self._total_shares, "bad": len(self._bad_shares), "remaining": len(self.remaining_sharemap), } raise NotEnoughSharesError("%s, last failure: %s" % (format % args, str(self._last_failure))) def _error(self, f): # all errors, including NotEnoughSharesError, land here self._running = False self._status.set_active(False) now = time.time() self._status.timings['total'] = now - self._started self._status.timings['fetch'] = now - self._started_fetching self._status.set_status("Failed") eventually(self._done_deferred.errback, f) allmydata-tahoe-1.10.2/src/allmydata/mutable/publish.py0000644000175000017500000015472312556560070021213 0ustar ramram import os, time from StringIO import StringIO from itertools import count from zope.interface import implements from twisted.internet import defer from twisted.python import failure from allmydata.interfaces import IPublishStatus, SDMF_VERSION, MDMF_VERSION, \ IMutableUploadable from allmydata.util import base32, hashutil, mathutil, log from allmydata.util.dictutil import DictOfSets from allmydata import hashtree, codec from allmydata.storage.server import si_b2a from pycryptopp.cipher.aes import AES from foolscap.api import eventually, fireEventually from allmydata.mutable.common import MODE_WRITE, MODE_CHECK, MODE_REPAIR, \ UncoordinatedWriteError, NotEnoughServersError from allmydata.mutable.servermap import ServerMap from allmydata.mutable.layout import get_version_from_checkstring,\ unpack_mdmf_checkstring, \ unpack_sdmf_checkstring, \ MDMFSlotWriteProxy, \ SDMFSlotWriteProxy KiB = 1024 DEFAULT_MAX_SEGMENT_SIZE = 128 * KiB PUSHING_BLOCKS_STATE = 0 PUSHING_EVERYTHING_ELSE_STATE = 1 DONE_STATE = 2 class PublishStatus: implements(IPublishStatus) statusid_counter = count(0) def __init__(self): self.timings = {} self.timings["send_per_server"] = {} self.timings["encrypt"] = 0.0 self.timings["encode"] = 0.0 self.servermap = None self._problems = {} self.active = True self.storage_index = None self.helper = False self.encoding = ("?", "?") self.size = None self.status = "Not started" self.progress = 0.0 self.counter = self.statusid_counter.next() self.started = time.time() def add_per_server_time(self, server, elapsed): if server not in 
self.timings["send_per_server"]: self.timings["send_per_server"][server] = [] self.timings["send_per_server"][server].append(elapsed) def accumulate_encode_time(self, elapsed): self.timings["encode"] += elapsed def accumulate_encrypt_time(self, elapsed): self.timings["encrypt"] += elapsed def get_started(self): return self.started def get_storage_index(self): return self.storage_index def get_encoding(self): return self.encoding def using_helper(self): return self.helper def get_servermap(self): return self.servermap def get_size(self): return self.size def get_status(self): return self.status def get_progress(self): return self.progress def get_active(self): return self.active def get_counter(self): return self.counter def get_problems(self): return self._problems def set_storage_index(self, si): self.storage_index = si def set_helper(self, helper): self.helper = helper def set_servermap(self, servermap): self.servermap = servermap def set_encoding(self, k, n): self.encoding = (k, n) def set_size(self, size): self.size = size def set_status(self, status): self.status = status def set_progress(self, value): self.progress = value def set_active(self, value): self.active = value class LoopLimitExceededError(Exception): pass class Publish: """I represent a single act of publishing the mutable file to the grid. I will only publish my data if the servermap I am using still represents the current state of the world. To make the initial publish, set servermap to None. """ def __init__(self, filenode, storage_broker, servermap): self._node = filenode self._storage_broker = storage_broker self._servermap = servermap self._storage_index = self._node.get_storage_index() self._log_prefix = prefix = si_b2a(self._storage_index)[:5] num = self.log("Publish(%s): starting" % prefix, parent=None) self._log_number = num self._running = True self._first_write_error = None self._last_failure = None self._status = PublishStatus() self._status.set_storage_index(self._storage_index) self._status.set_helper(False) self._status.set_progress(0.0) self._status.set_active(True) self._version = self._node.get_version() assert self._version in (SDMF_VERSION, MDMF_VERSION) def get_status(self): return self._status def log(self, *args, **kwargs): if 'parent' not in kwargs: kwargs['parent'] = self._log_number if "facility" not in kwargs: kwargs["facility"] = "tahoe.mutable.publish" return log.msg(*args, **kwargs) def update(self, data, offset, blockhashes, version): """ I replace the contents of this file with the contents of data, starting at offset. I return a Deferred that fires with None when the replacement has been completed, or with an error if something went wrong during the process. Note that this process will not upload new shares. If the file being updated is in need of repair, callers will have to repair it on their own. """ # How this works: # 1: Make server assignments. We'll assign each share that we know # about on the grid to that server that currently holds that # share, and will not place any new shares. # 2: Setup encoding parameters. Most of these will stay the same # -- datalength will change, as will some of the offsets. # 3. Upload the new segments. # 4. Be done. assert IMutableUploadable.providedBy(data) self.data = data # XXX: Use the MutableFileVersion instead. 
self.datalength = self._node.get_size() if data.get_size() > self.datalength: self.datalength = data.get_size() self.log("starting update") self.log("adding new data of length %d at offset %d" % \ (data.get_size(), offset)) self.log("new data length is %d" % self.datalength) self._status.set_size(self.datalength) self._status.set_status("Started") self._started = time.time() self.done_deferred = defer.Deferred() self._writekey = self._node.get_writekey() assert self._writekey, "need write capability to publish" # first, which servers will we publish to? We require that the # servermap was updated in MODE_WRITE, so we can depend upon the # serverlist computed by that process instead of computing our own. assert self._servermap assert self._servermap.get_last_update()[0] in (MODE_WRITE, MODE_CHECK, MODE_REPAIR) # we will push a version that is one larger than anything present # in the grid, according to the servermap. self._new_seqnum = self._servermap.highest_seqnum() + 1 self._status.set_servermap(self._servermap) self.log(format="new seqnum will be %(seqnum)d", seqnum=self._new_seqnum, level=log.NOISY) # We're updating an existing file, so all of the following # should be available. self.readkey = self._node.get_readkey() self.required_shares = self._node.get_required_shares() assert self.required_shares is not None self.total_shares = self._node.get_total_shares() assert self.total_shares is not None self._status.set_encoding(self.required_shares, self.total_shares) self._pubkey = self._node.get_pubkey() assert self._pubkey self._privkey = self._node.get_privkey() assert self._privkey self._encprivkey = self._node.get_encprivkey() sb = self._storage_broker full_serverlist = list(sb.get_servers_for_psi(self._storage_index)) self.full_serverlist = full_serverlist # for use later, immutable self.bad_servers = set() # servers who have errbacked/refused requests # This will set self.segment_size, self.num_segments, and # self.fec. TODO: Does it know how to do the offset? Probably # not. So do that part next. self.setup_encoding_parameters(offset=offset) # if we experience any surprises (writes which were rejected because # our test vector did not match, or shares which we didn't expect to # see), we set this flag and report an UncoordinatedWriteError at the # end of the publish process. self.surprised = False # we keep track of three tables. The first is our goal: which share # we want to see on which servers. This is initially populated by the # existing servermap. self.goal = set() # pairs of (server, shnum) tuples # the number of outstanding queries: those that are in flight and # may or may not be delivered, accepted, or acknowledged. This is # incremented when a query is sent, and decremented when the response # returns or errbacks. self.num_outstanding = 0 # the third is a table of successes: share which have actually been # placed. These are populated when responses come back with success. # When self.placed == self.goal, we're done. self.placed = set() # (server, shnum) tuples self.bad_share_checkstrings = {} # This is set at the last step of the publishing process. self.versioninfo = "" # we use the servermap to populate the initial goal: this way we will # try to update each existing share in place. Since we're # updating, we ignore damaged and missing shares -- callers must # do a repair to repair and recreate these. self.goal = set(self._servermap.get_known_shares()) # shnum -> set of IMutableSlotWriter self.writers = DictOfSets() # SDMF files are updated differently. 
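# Sketch of the test-and-set discipline the checkstrings above exist for: each
# write is conditional on the share still carrying the (seqnum, roothash, salt)
# we last observed, so a concurrent writer shows up as a mismatch. The real
# protocol is the storage servers' test-and-set writev; SlotSketch and its
# method are invented for this example.
class UncoordinatedWrite(Exception):
    pass

class SlotSketch:
    def __init__(self, checkstring):
        self.checkstring = checkstring       # e.g. (seqnum, root_hash, salt)
        self.data = None

    def conditional_write(self, expected_checkstring, new_checkstring, data):
        if self.checkstring != expected_checkstring:
            raise UncoordinatedWrite("share changed underneath us")
        self.checkstring = new_checkstring
        self.data = data

slot = SlotSketch((3, "roothash-A", "salt-A"))
slot.conditional_write((3, "roothash-A", "salt-A"),    # what the servermap saw
                       (4, "roothash-B", "salt-B"),    # seqnum bumped by one
                       "new share data")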
self._version = MDMF_VERSION writer_class = MDMFSlotWriteProxy # For each (server, shnum) in self.goal, we make a # write proxy for that server. We'll use this to write # shares to the server. for (server,shnum) in self.goal: write_enabler = self._node.get_write_enabler(server) renew_secret = self._node.get_renewal_secret(server) cancel_secret = self._node.get_cancel_secret(server) secrets = (write_enabler, renew_secret, cancel_secret) writer = writer_class(shnum, server.get_rref(), self._storage_index, secrets, self._new_seqnum, self.required_shares, self.total_shares, self.segment_size, self.datalength) self.writers.add(shnum, writer) writer.server = server known_shares = self._servermap.get_known_shares() assert (server, shnum) in known_shares old_versionid, old_timestamp = known_shares[(server,shnum)] (old_seqnum, old_root_hash, old_salt, old_segsize, old_datalength, old_k, old_N, old_prefix, old_offsets_tuple) = old_versionid writer.set_checkstring(old_seqnum, old_root_hash, old_salt) # Our remote shares will not have a complete checkstring until # after we are done writing share data and have started to write # blocks. In the meantime, we need to know what to look for when # writing, so that we can detect UncoordinatedWriteErrors. self._checkstring = self._get_some_writer().get_checkstring() # Now, we start pushing shares. self._status.timings["setup"] = time.time() - self._started # First, we encrypt, encode, and publish the shares that we need # to encrypt, encode, and publish. # Our update process fetched these for us. We need to update # them in place as publishing happens. self.blockhashes = {} # (shnum, [blochashes]) for (i, bht) in blockhashes.iteritems(): # We need to extract the leaves from our old hash tree. old_segcount = mathutil.div_ceil(version[4], version[3]) h = hashtree.IncompleteHashTree(old_segcount) bht = dict(enumerate(bht)) h.set_hashes(bht) leaves = h[h.get_leaf_index(0):] for j in xrange(self.num_segments - len(leaves)): leaves.append(None) assert len(leaves) >= self.num_segments self.blockhashes[i] = leaves # This list will now be the leaves that were set during the # initial upload + enough empty hashes to make it a # power-of-two. If we exceed a power of two boundary, we # should be encoding the file over again, and should not be # here. So, we have #assert len(self.blockhashes[i]) == \ # hashtree.roundup_pow2(self.num_segments), \ # len(self.blockhashes[i]) # XXX: Except this doesn't work. Figure out why. # These are filled in later, after we've modified the block hash # tree suitably. self.sharehash_leaves = None # eventually [sharehashes] self.sharehashes = {} # shnum -> [sharehash leaves necessary to # validate the share] self.log("Starting push") self._state = PUSHING_BLOCKS_STATE self._push() return self.done_deferred def publish(self, newdata): """Publish the filenode's current contents. Returns a Deferred that fires (with None) when the publish has done as much work as it's ever going to do, or errbacks with ConsistencyError if it detects a simultaneous write. """ # 0. Setup encoding parameters, encoder, and other such things. # 1. Encrypt, encode, and publish segments. 
assert IMutableUploadable.providedBy(newdata) self.data = newdata self.datalength = newdata.get_size() #if self.datalength >= DEFAULT_MAX_SEGMENT_SIZE: # self._version = MDMF_VERSION #else: # self._version = SDMF_VERSION self.log("starting publish, datalen is %s" % self.datalength) self._status.set_size(self.datalength) self._status.set_status("Started") self._started = time.time() self.done_deferred = defer.Deferred() self._writekey = self._node.get_writekey() assert self._writekey, "need write capability to publish" # first, which servers will we publish to? We require that the # servermap was updated in MODE_WRITE, so we can depend upon the # serverlist computed by that process instead of computing our own. if self._servermap: assert self._servermap.get_last_update()[0] in (MODE_WRITE, MODE_CHECK, MODE_REPAIR) # we will push a version that is one larger than anything present # in the grid, according to the servermap. self._new_seqnum = self._servermap.highest_seqnum() + 1 else: # If we don't have a servermap, that's because we're doing the # initial publish self._new_seqnum = 1 self._servermap = ServerMap() self._status.set_servermap(self._servermap) self.log(format="new seqnum will be %(seqnum)d", seqnum=self._new_seqnum, level=log.NOISY) # having an up-to-date servermap (or using a filenode that was just # created for the first time) also guarantees that the following # fields are available self.readkey = self._node.get_readkey() self.required_shares = self._node.get_required_shares() assert self.required_shares is not None self.total_shares = self._node.get_total_shares() assert self.total_shares is not None self._status.set_encoding(self.required_shares, self.total_shares) self._pubkey = self._node.get_pubkey() assert self._pubkey self._privkey = self._node.get_privkey() assert self._privkey self._encprivkey = self._node.get_encprivkey() sb = self._storage_broker full_serverlist = list(sb.get_servers_for_psi(self._storage_index)) self.full_serverlist = full_serverlist # for use later, immutable self.bad_servers = set() # servers who have errbacked/refused requests # This will set self.segment_size, self.num_segments, and # self.fec. self.setup_encoding_parameters() # if we experience any surprises (writes which were rejected because # our test vector did not match, or shares which we didn't expect to # see), we set this flag and report an UncoordinatedWriteError at the # end of the publish process. self.surprised = False # we keep track of three tables. The first is our goal: which share # we want to see on which servers. This is initially populated by the # existing servermap. self.goal = set() # pairs of (server, shnum) tuples # the number of outstanding queries: those that are in flight and # may or may not be delivered, accepted, or acknowledged. This is # incremented when a query is sent, and decremented when the response # returns or errbacks. self.num_outstanding = 0 # the third is a table of successes: share which have actually been # placed. These are populated when responses come back with success. # When self.placed == self.goal, we're done. self.placed = set() # (server, shnum) tuples self.bad_share_checkstrings = {} # This is set at the last step of the publishing process. self.versioninfo = "" # we use the servermap to populate the initial goal: this way we will # try to update each existing share in place. self.goal = set(self._servermap.get_known_shares()) # then we add in all the shares that were bad (corrupted, bad # signatures, etc). We want to replace these. 
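# Small restatement (illustrative) of how the initial publish goal is put
# together above: every (server, shnum) the servermap already knows about,
# plus the shares it flagged as bad so they get overwritten, remembering their
# old checkstrings for the conditional write.
def initial_goal(known_shares, bad_shares):
    goal = set(known_shares)                       # {(server, shnum), ...}
    bad_checkstrings = {}
    for (server, shnum), old_checkstring in bad_shares.items():
        goal.add((server, shnum))
        bad_checkstrings[(server, shnum)] = old_checkstring
    return goal, bad_checkstrings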
for key, old_checkstring in self._servermap.get_bad_shares().items(): (server, shnum) = key self.goal.add( (server,shnum) ) self.bad_share_checkstrings[(server,shnum)] = old_checkstring # TODO: Make this part do server selection. self.update_goal() # shnum -> set of IMutableSlotWriter self.writers = DictOfSets() if self._version == MDMF_VERSION: writer_class = MDMFSlotWriteProxy else: writer_class = SDMFSlotWriteProxy # For each (server, shnum) in self.goal, we make a # write proxy for that server. We'll use this to write # shares to the server. for (server,shnum) in self.goal: write_enabler = self._node.get_write_enabler(server) renew_secret = self._node.get_renewal_secret(server) cancel_secret = self._node.get_cancel_secret(server) secrets = (write_enabler, renew_secret, cancel_secret) writer = writer_class(shnum, server.get_rref(), self._storage_index, secrets, self._new_seqnum, self.required_shares, self.total_shares, self.segment_size, self.datalength) self.writers.add(shnum, writer) writer.server = server known_shares = self._servermap.get_known_shares() if (server, shnum) in known_shares: old_versionid, old_timestamp = known_shares[(server,shnum)] (old_seqnum, old_root_hash, old_salt, old_segsize, old_datalength, old_k, old_N, old_prefix, old_offsets_tuple) = old_versionid writer.set_checkstring(old_seqnum, old_root_hash, old_salt) elif (server, shnum) in self.bad_share_checkstrings: old_checkstring = self.bad_share_checkstrings[(server, shnum)] writer.set_checkstring(old_checkstring) # Our remote shares will not have a complete checkstring until # after we are done writing share data and have started to write # blocks. In the meantime, we need to know what to look for when # writing, so that we can detect UncoordinatedWriteErrors. self._checkstring = self._get_some_writer().get_checkstring() # Now, we start pushing shares. self._status.timings["setup"] = time.time() - self._started # First, we encrypt, encode, and publish the shares that we need # to encrypt, encode, and publish. # This will eventually hold the block hash chain for each share # that we publish. We define it this way so that empty publishes # will still have something to write to the remote slot. self.blockhashes = dict([(i, []) for i in xrange(self.total_shares)]) for i in xrange(self.total_shares): blocks = self.blockhashes[i] for j in xrange(self.num_segments): blocks.append(None) self.sharehash_leaves = None # eventually [sharehashes] self.sharehashes = {} # shnum -> [sharehash leaves necessary to # validate the share] self.log("Starting push") self._state = PUSHING_BLOCKS_STATE self._push() return self.done_deferred def _get_some_writer(self): return list(self.writers.values()[0])[0] def _update_status(self): self._status.set_status("Sending Shares: %d placed out of %d, " "%d messages outstanding" % (len(self.placed), len(self.goal), self.num_outstanding)) self._status.set_progress(1.0 * len(self.placed) / len(self.goal)) def setup_encoding_parameters(self, offset=0): if self._version == MDMF_VERSION: segment_size = DEFAULT_MAX_SEGMENT_SIZE # 128 KiB by default else: segment_size = self.datalength # SDMF is only one segment # this must be a multiple of self.required_shares segment_size = mathutil.next_multiple(segment_size, self.required_shares) self.segment_size = segment_size # Calculate the starting segment for the upload. if segment_size: # We use div_ceil instead of integer division here because # it is semantically correct. 
# If datalength isn't an even multiple of segment_size, but # is larger than segment_size, datalength // segment_size # will be the largest number such that num <= datalength and # num % segment_size == 0. But that's not what we want, # because it ignores the extra data. div_ceil will give us # the right number of segments for the data that we're # given. self.num_segments = mathutil.div_ceil(self.datalength, segment_size) self.starting_segment = offset // segment_size else: self.num_segments = 0 self.starting_segment = 0 self.log("building encoding parameters for file") self.log("got segsize %d" % self.segment_size) self.log("got %d segments" % self.num_segments) if self._version == SDMF_VERSION: assert self.num_segments in (0, 1) # SDMF # calculate the tail segment size. if segment_size and self.datalength: self.tail_segment_size = self.datalength % segment_size self.log("got tail segment size %d" % self.tail_segment_size) else: self.tail_segment_size = 0 if self.tail_segment_size == 0 and segment_size: # The tail segment is the same size as the other segments. self.tail_segment_size = segment_size # Make FEC encoders fec = codec.CRSEncoder() fec.set_params(self.segment_size, self.required_shares, self.total_shares) self.piece_size = fec.get_block_size() self.fec = fec if self.tail_segment_size == self.segment_size: self.tail_fec = self.fec else: tail_fec = codec.CRSEncoder() tail_fec.set_params(self.tail_segment_size, self.required_shares, self.total_shares) self.tail_fec = tail_fec self._current_segment = self.starting_segment self.end_segment = self.num_segments - 1 # Now figure out where the last segment should be. if self.data.get_size() != self.datalength: # We're updating a few segments in the middle of a mutable # file, so we don't want to republish the whole thing. # (we don't have enough data to do that even if we wanted # to) end = self.data.get_size() self.end_segment = end // segment_size if end % segment_size == 0: self.end_segment -= 1 self.log("got start segment %d" % self.starting_segment) self.log("got end segment %d" % self.end_segment) def _push(self, ignored=None): """ I manage state transitions. In particular, I see that we still have a good enough number of writers to complete the upload successfully. """ # Can we still successfully publish this file? # TODO: Keep track of outstanding queries before aborting the # process. num_shnums = len(self.writers) if num_shnums < self.required_shares or self.surprised: return self._failure() # Figure out what we need to do next. Each of these needs to # return a deferred so that we don't block execution when this # is first called in the upload method. if self._state == PUSHING_BLOCKS_STATE: return self.push_segment(self._current_segment) elif self._state == PUSHING_EVERYTHING_ELSE_STATE: return self.push_everything_else() # If we make it to this point, we were successful in placing the # file. return self._done() def push_segment(self, segnum): if self.num_segments == 0 and self._version == SDMF_VERSION: self._add_dummy_salts() if segnum > self.end_segment: # We don't have any more segments to push. self._state = PUSHING_EVERYTHING_ELSE_STATE return self._push() d = self._encode_segment(segnum) d.addCallback(self._push_segment, segnum) def _increment_segnum(ign): self._current_segment += 1 # XXX: I don't think we need to do addBoth here -- any errBacks # should be handled within push_segment. 
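# Standalone restatement (illustrative only) of the segment geometry computed
# in setup_encoding_parameters above: round the segment size up to a multiple
# of k, ceil-divide the data length by it, and treat a zero remainder as a
# full-sized tail segment.
def publish_geometry(datalength, segment_size, required_shares):
    segment_size = -(-segment_size // required_shares) * required_shares
    if segment_size and datalength:
        num_segments = -(-datalength // segment_size)        # div_ceil
        tail_segment_size = datalength % segment_size or segment_size
    else:
        num_segments, tail_segment_size = 0, 0
    return segment_size, num_segments, tail_segment_size

# 300 kB of data in 128 KiB segments with k=3:
assert publish_geometry(300000, 131072, 3) == (131073, 3, 37854)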
d.addCallback(_increment_segnum) d.addCallback(self._turn_barrier) d.addCallback(self._push) d.addErrback(self._failure) def _turn_barrier(self, result): """ I help the publish process avoid the recursion limit issues described in #237. """ return fireEventually(result) def _add_dummy_salts(self): """ SDMF files need a salt even if they're empty, or the signature won't make sense. This method adds a dummy salt to each of our SDMF writers so that they can write the signature later. """ salt = os.urandom(16) assert self._version == SDMF_VERSION for shnum, writers in self.writers.iteritems(): for writer in writers: writer.put_salt(salt) def _encode_segment(self, segnum): """ I encrypt and encode the segment segnum. """ started = time.time() if segnum + 1 == self.num_segments: segsize = self.tail_segment_size else: segsize = self.segment_size self.log("Pushing segment %d of %d" % (segnum + 1, self.num_segments)) data = self.data.read(segsize) # XXX: This is dumb. Why return a list? data = "".join(data) assert len(data) == segsize, len(data) salt = os.urandom(16) key = hashutil.ssk_readkey_data_hash(salt, self.readkey) self._status.set_status("Encrypting") enc = AES(key) crypttext = enc.process(data) assert len(crypttext) == len(data) now = time.time() self._status.accumulate_encrypt_time(now - started) started = now # now apply FEC if segnum + 1 == self.num_segments: fec = self.tail_fec else: fec = self.fec self._status.set_status("Encoding") crypttext_pieces = [None] * self.required_shares piece_size = fec.get_block_size() for i in range(len(crypttext_pieces)): offset = i * piece_size piece = crypttext[offset:offset+piece_size] piece = piece + "\x00"*(piece_size - len(piece)) # padding crypttext_pieces[i] = piece assert len(piece) == piece_size d = fec.encode(crypttext_pieces) def _done_encoding(res): elapsed = time.time() - started self._status.accumulate_encode_time(elapsed) return (res, salt) d.addCallback(_done_encoding) return d def _push_segment(self, encoded_and_salt, segnum): """ I push (data, salt) as segment number segnum. """ results, salt = encoded_and_salt shares, shareids = results self._status.set_status("Pushing segment") for i in xrange(len(shares)): sharedata = shares[i] shareid = shareids[i] if self._version == MDMF_VERSION: hashed = salt + sharedata else: hashed = sharedata block_hash = hashutil.block_hash(hashed) self.blockhashes[shareid][segnum] = block_hash # find the writer for this share writers = self.writers[shareid] for writer in writers: writer.put_block(sharedata, segnum, salt) def push_everything_else(self): """ I put everything else associated with a share. """ self._pack_started = time.time() self.push_encprivkey() self.push_blockhashes() self.push_sharehashes() self.push_toplevel_hashes_and_signature() d = self.finish_publishing() def _change_state(ignored): self._state = DONE_STATE d.addCallback(_change_state) d.addCallback(self._push) return d def push_encprivkey(self): encprivkey = self._encprivkey self._status.set_status("Pushing encrypted private key") for shnum, writers in self.writers.iteritems(): for writer in writers: writer.put_encprivkey(encprivkey) def push_blockhashes(self): self.sharehash_leaves = [None] * len(self.blockhashes) self._status.set_status("Building and pushing block hash tree") for shnum, blockhashes in self.blockhashes.iteritems(): t = hashtree.HashTree(blockhashes) self.blockhashes[shnum] = list(t) # set the leaf for future use. 
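# Sketch of the per-segment encryption step in _encode_segment above: derive a
# data key from a fresh salt and the file's readkey, then encrypt with AES.
# pycryptopp's AES runs in CTR mode, so process() performs both directions.
# Plain SHA-256 stands in for allmydata's tagged ssk_readkey_data_hash().
import os, hashlib
from pycryptopp.cipher.aes import AES

def encrypt_segment(readkey, plaintext):
    salt = os.urandom(16)
    key = hashlib.sha256(salt + readkey).digest()[:16]
    return salt, AES(key).process(plaintext)

def decrypt_segment(readkey, salt, crypttext):
    key = hashlib.sha256(salt + readkey).digest()[:16]
    return AES(key).process(crypttext)

salt, ct = encrypt_segment(b"0123456789abcdef", b"some segment plaintext")
assert decrypt_segment(b"0123456789abcdef", salt, ct) == b"some segment plaintext"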
self.sharehash_leaves[shnum] = t[0] writers = self.writers[shnum] for writer in writers: writer.put_blockhashes(self.blockhashes[shnum]) def push_sharehashes(self): self._status.set_status("Building and pushing share hash chain") share_hash_tree = hashtree.HashTree(self.sharehash_leaves) for shnum in xrange(len(self.sharehash_leaves)): needed_indices = share_hash_tree.needed_hashes(shnum) self.sharehashes[shnum] = dict( [ (i, share_hash_tree[i]) for i in needed_indices] ) writers = self.writers[shnum] for writer in writers: writer.put_sharehashes(self.sharehashes[shnum]) self.root_hash = share_hash_tree[0] def push_toplevel_hashes_and_signature(self): # We need to to three things here: # - Push the root hash and salt hash # - Get the checkstring of the resulting layout; sign that. # - Push the signature self._status.set_status("Pushing root hashes and signature") for shnum in xrange(self.total_shares): writers = self.writers[shnum] for writer in writers: writer.put_root_hash(self.root_hash) self._update_checkstring() self._make_and_place_signature() def _update_checkstring(self): """ After putting the root hash, MDMF files will have the checkstring written to the storage server. This means that we can update our copy of the checkstring so we can detect uncoordinated writes. SDMF files will have the same checkstring, so we need not do anything. """ self._checkstring = self._get_some_writer().get_checkstring() def _make_and_place_signature(self): """ I create and place the signature. """ started = time.time() self._status.set_status("Signing prefix") signable = self._get_some_writer().get_signable() self.signature = self._privkey.sign(signable) for (shnum, writers) in self.writers.iteritems(): for writer in writers: writer.put_signature(self.signature) self._status.timings['sign'] = time.time() - started def finish_publishing(self): # We're almost done -- we just need to put the verification key # and the offsets started = time.time() self._status.set_status("Pushing shares") self._started_pushing = started ds = [] verification_key = self._pubkey.serialize() for (shnum, writers) in self.writers.copy().iteritems(): for writer in writers: writer.put_verification_key(verification_key) self.num_outstanding += 1 def _no_longer_outstanding(res): self.num_outstanding -= 1 return res d = writer.finish_publishing() d.addBoth(_no_longer_outstanding) d.addErrback(self._connection_problem, writer) d.addCallback(self._got_write_answer, writer, started) ds.append(d) self._record_verinfo() self._status.timings['pack'] = time.time() - started return defer.DeferredList(ds) def _record_verinfo(self): self.versioninfo = self._get_some_writer().get_verinfo() def _connection_problem(self, f, writer): """ We ran into a connection problem while working with writer, and need to deal with that. 
""" self.log("found problem: %s" % str(f)) self._last_failure = f self.writers.discard(writer.shnum, writer) def log_goal(self, goal, message=""): logmsg = [message] for (shnum, server) in sorted([(s,p) for (p,s) in goal]): logmsg.append("sh%d to [%s]" % (shnum, server.get_name())) self.log("current goal: %s" % (", ".join(logmsg)), level=log.NOISY) self.log("we are planning to push new seqnum=#%d" % self._new_seqnum, level=log.NOISY) def update_goal(self): # if log.recording_noisy if True: self.log_goal(self.goal, "before update: ") # first, remove any bad servers from our goal self.goal = set([ (server, shnum) for (server, shnum) in self.goal if server not in self.bad_servers ]) # find the homeless shares: homefull_shares = set([shnum for (server, shnum) in self.goal]) homeless_shares = set(range(self.total_shares)) - homefull_shares homeless_shares = sorted(list(homeless_shares)) # place them somewhere. We prefer unused servers at the beginning of # the available server list. if not homeless_shares: return # if an old share X is on a node, put the new share X there too. # TODO: 1: redistribute shares to achieve one-per-server, by copying # shares from existing servers to new (less-crowded) ones. The # old shares must still be updated. # TODO: 2: move those shares instead of copying them, to reduce future # update work # this is a bit CPU intensive but easy to analyze. We create a sort # order for each server. If the server is marked as bad, we don't # even put them in the list. Then we care about the number of shares # which have already been assigned to them. After that we care about # their permutation order. old_assignments = DictOfSets() for (server, shnum) in self.goal: old_assignments.add(server, shnum) serverlist = [] for i, server in enumerate(self.full_serverlist): serverid = server.get_serverid() if server in self.bad_servers: continue entry = (len(old_assignments.get(server, [])), i, serverid, server) serverlist.append(entry) serverlist.sort() if not serverlist: raise NotEnoughServersError("Ran out of non-bad servers, " "first_error=%s" % str(self._first_write_error), self._first_write_error) # we then index this serverlist with an integer, because we may have # to wrap. We update the goal as we go. i = 0 for shnum in homeless_shares: (ignored1, ignored2, ignored3, server) = serverlist[i] # if we are forced to send a share to a server that already has # one, we may have two write requests in flight, and the # servermap (which was computed before either request was sent) # won't reflect the new shares, so the second response will be # surprising. There is code in _got_write_answer() to tolerate # this, otherwise it would cause the publish to fail with an # UncoordinatedWriteError. See #546 for details of the trouble # this used to cause. self.goal.add( (server, shnum) ) i += 1 if i >= len(serverlist): i = 0 if True: self.log_goal(self.goal, "after update: ") def _got_write_answer(self, answer, writer, started): if not answer: # SDMF writers only pretend to write when readers set their # blocks, salts, and so on -- they actually just write once, # at the end of the upload process. In fake writes, they # return defer.succeed(None). If we see that, we shouldn't # bother checking it. 
return server = writer.server lp = self.log("_got_write_answer from %s, share %d" % (server.get_name(), writer.shnum)) now = time.time() elapsed = now - started self._status.add_per_server_time(server, elapsed) wrote, read_data = answer surprise_shares = set(read_data.keys()) - set([writer.shnum]) # We need to remove from surprise_shares any shares that we are # knowingly also writing to that server from other writers. # TODO: Precompute this. shares = [] for shnum, writers in self.writers.iteritems(): shares.extend([x.shnum for x in writers if x.server == server]) known_shnums = set(shares) surprise_shares -= known_shnums self.log("found the following surprise shares: %s" % str(surprise_shares)) # Now surprise shares contains all of the shares that we did not # expect to be there. surprised = False for shnum in surprise_shares: # read_data is a dict mapping shnum to checkstring (SIGNED_PREFIX) checkstring = read_data[shnum][0] # What we want to do here is to see if their (seqnum, # roothash, salt) is the same as our (seqnum, roothash, # salt), or the equivalent for MDMF. The best way to do this # is to store a packed representation of our checkstring # somewhere, then not bother unpacking the other # checkstring. if checkstring == self._checkstring: # they have the right share, somehow if (server,shnum) in self.goal: # and we want them to have it, so we probably sent them a # copy in an earlier write. This is ok, and avoids the # #546 problem. continue # They aren't in our goal, but they are still for the right # version. Somebody else wrote them, and it's a convergent # uncoordinated write. Pretend this is ok (don't be # surprised), since I suspect there's a decent chance that # we'll hit this in normal operation. continue else: # the new shares are of a different version if server in self._servermap.get_reachable_servers(): # we asked them about their shares, so we had knowledge # of what they used to have. Any surprising shares must # have come from someone else, so UCW. surprised = True else: # we didn't ask them, and now we've discovered that they # have a share we didn't know about. This indicates that # mapupdate should have worked harder and asked more # servers before concluding that it knew about them all. # signal UCW, but make sure to ask this server next time, # so we'll remember to update it if/when we retry. surprised = True # TODO: ask this server next time. I don't yet have a good # way to do this. Two insufficient possibilities are: # # self._servermap.add_new_share(server, shnum, verinfo, now) # but that requires fetching/validating/parsing the whole # version string, and all we have is the checkstring # self._servermap.mark_bad_share(server, shnum, checkstring) # that will make publish overwrite the share next time, # but it won't re-query the server, and it won't make # mapupdate search further # TODO later: when publish starts, do # servermap.get_best_version(), extract the seqnum, # subtract one, and store as highest-replaceable-seqnum. # Then, if this surprise-because-we-didn't-ask share is # of highest-replaceable-seqnum or lower, we're allowed # to replace it: send out a new writev (or rather add it # to self.goal and loop). pass surprised = True if surprised: self.log("they had shares %s that we didn't know about" % (list(surprise_shares),), parent=lp, level=log.WEIRD, umid="un9CSQ") self.surprised = True if not wrote: # TODO: there are two possibilities.
The first is that the server # is full (or just doesn't want to give us any room), which means # we shouldn't ask them again, but is *not* an indication of an # uncoordinated write. The second is that our testv failed, which # *does* indicate an uncoordinated write. We currently don't have # a way to tell these two apart (in fact, the storage server code # doesn't have the option of refusing our share). # # If the server is full, mark the server as bad (so we don't ask # them again), but don't set self.surprised. The loop() will find # a new server. # # If the testv failed, log it, set self.surprised, but don't # bother adding to self.bad_servers . self.log("our testv failed, so the write did not happen", parent=lp, level=log.WEIRD, umid="8sc26g") self.surprised = True self.bad_servers.add(server) # don't ask them again # use the checkstring to add information to the log message unknown_format = False for (shnum,readv) in read_data.items(): checkstring = readv[0] version = get_version_from_checkstring(checkstring) if version == MDMF_VERSION: (other_seqnum, other_roothash) = unpack_mdmf_checkstring(checkstring) elif version == SDMF_VERSION: (other_seqnum, other_roothash, other_IV) = unpack_sdmf_checkstring(checkstring) else: unknown_format = True expected_version = self._servermap.version_on_server(server, shnum) if expected_version: (seqnum, root_hash, IV, segsize, datalength, k, N, prefix, offsets_tuple) = expected_version msg = ("somebody modified the share on us:" " shnum=%d: I thought they had #%d:R=%s," % (shnum, seqnum, base32.b2a(root_hash)[:4])) if unknown_format: msg += (" but I don't know how to read share" " format %d" % version) else: msg += " but testv reported #%d:R=%s" % \ (other_seqnum, base32.b2a(other_roothash)[:4]) self.log(msg, parent=lp, level=log.NOISY) # if expected_version==None, then we didn't expect to see a # share on that server, and the 'surprise_shares' clause # above will have logged it. return # and update the servermap # self.versioninfo is set during the last phase of publishing. # If we get there, we know that responses correspond to placed # shares, and can safely execute these statements. if self.versioninfo: self.log("wrote successfully: adding new share to servermap") self._servermap.add_new_share(server, writer.shnum, self.versioninfo, started) self.placed.add( (server, writer.shnum) ) self._update_status() # the next method in the deferred chain will check to see if # we're done and successful. return def _done(self): if not self._running: return self._running = False now = time.time() self._status.timings["total"] = now - self._started elapsed = now - self._started_pushing self._status.timings['push'] = elapsed self._status.set_active(False) self.log("Publish done, success") self._status.set_status("Finished") self._status.set_progress(1.0) # Get k and segsize, then give them to the caller. hints = {} hints['segsize'] = self.segment_size hints['k'] = self.required_shares self._node.set_downloader_hints(hints) eventually(self.done_deferred.callback, None) def _failure(self, f=None): if f: self._last_failure = f if not self.surprised: # We ran out of servers msg = "Publish ran out of good servers" if self._last_failure: msg += ", last failure was: %s" % str(self._last_failure) self.log(msg) e = NotEnoughServersError(msg) else: # We ran into shares that we didn't recognize, which means # that we need to return an UncoordinatedWriteError. 
self.log("Publish failed with UncoordinatedWriteError") e = UncoordinatedWriteError() f = failure.Failure(e) eventually(self.done_deferred.callback, f) class MutableFileHandle: """ I am a mutable uploadable built around a filehandle-like object, usually either a StringIO instance or a handle to an actual file. """ implements(IMutableUploadable) def __init__(self, filehandle): # The filehandle is defined as a generally file-like object that # has these two methods. We don't care beyond that. assert hasattr(filehandle, "read") assert hasattr(filehandle, "close") self._filehandle = filehandle # We must start reading at the beginning of the file, or we risk # encountering errors when the data read does not match the size # reported to the uploader. self._filehandle.seek(0) # We have not yet read anything, so our position is 0. self._marker = 0 def get_size(self): """ I return the amount of data in my filehandle. """ if not hasattr(self, "_size"): old_position = self._filehandle.tell() # Seek to the end of the file by seeking 0 bytes from the # file's end self._filehandle.seek(0, os.SEEK_END) self._size = self._filehandle.tell() # Restore the previous position, in case this was called # after a read. self._filehandle.seek(old_position) assert self._filehandle.tell() == old_position assert hasattr(self, "_size") return self._size def pos(self): """ I return the position of my read marker -- i.e., how much data I have already read and returned to callers. """ return self._marker def read(self, length): """ I return some data (up to length bytes) from my filehandle. In most cases, I return length bytes, but sometimes I won't -- for example, if I am asked to read beyond the end of a file, or an error occurs. """ results = self._filehandle.read(length) self._marker += len(results) return [results] def close(self): """ I close the underlying filehandle. Any further operations on the filehandle fail at this point. """ self._filehandle.close() class MutableData(MutableFileHandle): """ I am a mutable uploadable built around a string, which I then cast into a StringIO and treat as a filehandle. """ def __init__(self, s): # Take a string and return a file-like uploadable. assert isinstance(s, str) MutableFileHandle.__init__(self, StringIO(s)) class TransformingUploadable: """ I am an IMutableUploadable that wraps another IMutableUploadable, and some segments that are already on the grid. When I am called to read, I handle merging of boundary segments. """ implements(IMutableUploadable) def __init__(self, data, offset, segment_size, start, end): assert IMutableUploadable.providedBy(data) self._newdata = data self._offset = offset self._segment_size = segment_size self._start = start self._end = end self._read_marker = 0 self._first_segment_offset = offset % segment_size num = self.log("TransformingUploadable: starting", parent=None) self._log_number = num self.log("got fso: %d" % self._first_segment_offset) self.log("got offset: %d" % self._offset) def log(self, *args, **kwargs): if 'parent' not in kwargs: kwargs['parent'] = self._log_number if "facility" not in kwargs: kwargs["facility"] = "tahoe.mutable.transforminguploadable" return log.msg(*args, **kwargs) def get_size(self): return self._offset + self._newdata.get_size() def read(self, length): # We can get data from 3 sources here. # 1. The first of the segments provided to us. # 2. The data that we're replacing things with. # 3. The last of the segments provided to us. # are we in state 0? 
self.log("reading %d bytes" % length) old_start_data = "" old_data_length = self._first_segment_offset - self._read_marker if old_data_length > 0: if old_data_length > length: old_data_length = length self.log("returning %d bytes of old start data" % old_data_length) old_data_end = old_data_length + self._read_marker old_start_data = self._start[self._read_marker:old_data_end] length -= old_data_length else: # otherwise calculations later get screwed up. old_data_length = 0 # Is there enough new data to satisfy this read? If not, we need # to pad the end of the data with data from our last segment. old_end_length = length - \ (self._newdata.get_size() - self._newdata.pos()) old_end_data = "" if old_end_length > 0: self.log("reading %d bytes of old end data" % old_end_length) # TODO: We're not explicitly checking for tail segment size # here. Is that a problem? old_data_offset = (length - old_end_length + \ old_data_length) % self._segment_size self.log("reading at offset %d" % old_data_offset) old_end = old_data_offset + old_end_length old_end_data = self._end[old_data_offset:old_end] length -= old_end_length assert length == self._newdata.get_size() - self._newdata.pos() self.log("reading %d bytes of new data" % length) new_data = self._newdata.read(length) new_data = "".join(new_data) self._read_marker += len(old_start_data + new_data + old_end_data) return old_start_data + new_data + old_end_data def close(self): pass allmydata-tahoe-1.10.2/src/allmydata/mutable/filenode.py0000644000175000017500000013212012556560070021315 0ustar ramram import random from zope.interface import implements from twisted.internet import defer, reactor from foolscap.api import eventually from allmydata.interfaces import IMutableFileNode, ICheckable, ICheckResults, \ NotEnoughSharesError, MDMF_VERSION, SDMF_VERSION, IMutableUploadable, \ IMutableFileVersion, IWriteable from allmydata.util import hashutil, log, consumer, deferredutil, mathutil from allmydata.util.assertutil import precondition from allmydata.uri import WriteableSSKFileURI, ReadonlySSKFileURI, \ WriteableMDMFFileURI, ReadonlyMDMFFileURI from allmydata.monitor import Monitor from pycryptopp.cipher.aes import AES from allmydata.mutable.publish import Publish, MutableData,\ TransformingUploadable from allmydata.mutable.common import MODE_READ, MODE_WRITE, MODE_CHECK, UnrecoverableFileError, \ UncoordinatedWriteError from allmydata.mutable.servermap import ServerMap, ServermapUpdater from allmydata.mutable.retrieve import Retrieve from allmydata.mutable.checker import MutableChecker, MutableCheckAndRepairer from allmydata.mutable.repairer import Repairer class BackoffAgent: # these parameters are copied from foolscap.reconnector, which gets them # from twisted.internet.protocol.ReconnectingClientFactory initialDelay = 1.0 factor = 2.7182818284590451 # (math.e) jitter = 0.11962656492 # molar Planck constant times c, Joule meter/mole maxRetries = 4 def __init__(self): self._delay = self.initialDelay self._count = 0 def delay(self, node, f): self._count += 1 if self._count == 4: return f self._delay = self._delay * self.factor self._delay = random.normalvariate(self._delay, self._delay * self.jitter) d = defer.Deferred() reactor.callLater(self._delay, d.callback, None) return d # use nodemaker.create_mutable_file() to make one of these class MutableFileNode: implements(IMutableFileNode, ICheckable) def __init__(self, storage_broker, secret_holder, default_encoding_parameters, history): self._storage_broker = storage_broker self._secret_holder = 
secret_holder self._default_encoding_parameters = default_encoding_parameters self._history = history self._pubkey = None # filled in upon first read self._privkey = None # filled in if we're mutable # we keep track of the last encoding parameters that we use. These # are updated upon retrieve, and used by publish. If we publish # without ever reading (i.e. overwrite()), then we use these values. self._required_shares = default_encoding_parameters["k"] self._total_shares = default_encoding_parameters["n"] self._sharemap = {} # known shares, shnum-to-[nodeids] self._most_recent_size = None # filled in after __init__ if we're being created for the first time; # filled in by the servermap updater before publishing, otherwise. # set to this default value in case neither of those things happen, # or in case the servermap can't find any shares to tell us what # to publish as. self._protocol_version = None # all users of this MutableFileNode go through the serializer. This # takes advantage of the fact that Deferreds discard the callbacks # that they're done with, so we can keep using the same Deferred # forever without consuming more and more memory. self._serializer = defer.succeed(None) # Starting with MDMF, we can get these from caps if they're # there. Leave them alone for now; they'll be filled in by my # init_from_cap method if necessary. self._downloader_hints = {} def __repr__(self): if hasattr(self, '_uri'): return "<%s %x %s %s>" % (self.__class__.__name__, id(self), self.is_readonly() and 'RO' or 'RW', self._uri.abbrev()) else: return "<%s %x %s %s>" % (self.__class__.__name__, id(self), None, None) def init_from_cap(self, filecap): # we have the URI, but we have not yet retrieved the public # verification key, nor things like 'k' or 'N'. If and when someone # wants to get our contents, we'll pull from shares and fill those # in. if isinstance(filecap, (WriteableMDMFFileURI, ReadonlyMDMFFileURI)): self._protocol_version = MDMF_VERSION elif isinstance(filecap, (ReadonlySSKFileURI, WriteableSSKFileURI)): self._protocol_version = SDMF_VERSION self._uri = filecap self._writekey = None if not filecap.is_readonly() and filecap.is_mutable(): self._writekey = self._uri.writekey self._readkey = self._uri.readkey self._storage_index = self._uri.storage_index self._fingerprint = self._uri.fingerprint # the following values are learned during Retrieval # self._pubkey # self._required_shares # self._total_shares # and these are needed for Publish. They are filled in by Retrieval # if possible, otherwise by the first peer that Publish talks to. self._privkey = None self._encprivkey = None return self def create_with_keys(self, (pubkey, privkey), contents, version=SDMF_VERSION): """Call this to create a brand-new mutable file. It will create the shares, find homes for them, and upload the initial contents (created with the same rules as IClient.create_mutable_file() ). Returns a Deferred that fires (with the MutableFileNode instance you should use) when it completes. 
""" self._pubkey, self._privkey = pubkey, privkey pubkey_s = self._pubkey.serialize() privkey_s = self._privkey.serialize() self._writekey = hashutil.ssk_writekey_hash(privkey_s) self._encprivkey = self._encrypt_privkey(self._writekey, privkey_s) self._fingerprint = hashutil.ssk_pubkey_fingerprint_hash(pubkey_s) if version == MDMF_VERSION: self._uri = WriteableMDMFFileURI(self._writekey, self._fingerprint) self._protocol_version = version elif version == SDMF_VERSION: self._uri = WriteableSSKFileURI(self._writekey, self._fingerprint) self._protocol_version = version self._readkey = self._uri.readkey self._storage_index = self._uri.storage_index initial_contents = self._get_initial_contents(contents) return self._upload(initial_contents, None) def _get_initial_contents(self, contents): if contents is None: return MutableData("") if isinstance(contents, str): return MutableData(contents) if IMutableUploadable.providedBy(contents): return contents assert callable(contents), "%s should be callable, not %s" % \ (contents, type(contents)) return contents(self) def _encrypt_privkey(self, writekey, privkey): enc = AES(writekey) crypttext = enc.process(privkey) return crypttext def _decrypt_privkey(self, enc_privkey): enc = AES(self._writekey) privkey = enc.process(enc_privkey) return privkey def _populate_pubkey(self, pubkey): self._pubkey = pubkey def _populate_required_shares(self, required_shares): self._required_shares = required_shares def _populate_total_shares(self, total_shares): self._total_shares = total_shares def _populate_privkey(self, privkey): self._privkey = privkey def _populate_encprivkey(self, encprivkey): self._encprivkey = encprivkey def get_write_enabler(self, server): seed = server.get_foolscap_write_enabler_seed() assert len(seed) == 20 return hashutil.ssk_write_enabler_hash(self._writekey, seed) def get_renewal_secret(self, server): crs = self._secret_holder.get_renewal_secret() frs = hashutil.file_renewal_secret_hash(crs, self._storage_index) lease_seed = server.get_lease_seed() assert len(lease_seed) == 20 return hashutil.bucket_renewal_secret_hash(frs, lease_seed) def get_cancel_secret(self, server): ccs = self._secret_holder.get_cancel_secret() fcs = hashutil.file_cancel_secret_hash(ccs, self._storage_index) lease_seed = server.get_lease_seed() assert len(lease_seed) == 20 return hashutil.bucket_cancel_secret_hash(fcs, lease_seed) def get_writekey(self): return self._writekey def get_readkey(self): return self._readkey def get_storage_index(self): return self._storage_index def get_fingerprint(self): return self._fingerprint def get_privkey(self): return self._privkey def get_encprivkey(self): return self._encprivkey def get_pubkey(self): return self._pubkey def get_required_shares(self): return self._required_shares def get_total_shares(self): return self._total_shares #################################### # IFilesystemNode def get_size(self): return self._most_recent_size def get_current_size(self): d = self.get_size_of_best_version() d.addCallback(self._stash_size) return d def _stash_size(self, size): self._most_recent_size = size return size def get_cap(self): return self._uri def get_readcap(self): return self._uri.get_readonly() def get_verify_cap(self): return self._uri.get_verify_cap() def get_repair_cap(self): if self._uri.is_readonly(): return None return self._uri def get_uri(self): return self._uri.to_string() def get_write_uri(self): if self.is_readonly(): return None return self._uri.to_string() def get_readonly_uri(self): return 
self._uri.get_readonly().to_string() def get_readonly(self): if self.is_readonly(): return self ro = MutableFileNode(self._storage_broker, self._secret_holder, self._default_encoding_parameters, self._history) ro.init_from_cap(self._uri.get_readonly()) return ro def is_mutable(self): return self._uri.is_mutable() def is_readonly(self): return self._uri.is_readonly() def is_unknown(self): return False def is_allowed_in_immutable_directory(self): return not self._uri.is_mutable() def raise_error(self): pass def __hash__(self): return hash((self.__class__, self._uri)) def __cmp__(self, them): if cmp(type(self), type(them)): return cmp(type(self), type(them)) if cmp(self.__class__, them.__class__): return cmp(self.__class__, them.__class__) return cmp(self._uri, them._uri) ################################# # ICheckable def check(self, monitor, verify=False, add_lease=False): checker = MutableChecker(self, self._storage_broker, self._history, monitor) return checker.check(verify, add_lease) def check_and_repair(self, monitor, verify=False, add_lease=False): checker = MutableCheckAndRepairer(self, self._storage_broker, self._history, monitor) return checker.check(verify, add_lease) ################################# # IRepairable def repair(self, check_results, force=False, monitor=None): assert ICheckResults(check_results) r = Repairer(self, check_results, self._storage_broker, self._history, monitor) d = r.start(force) return d ################################# # IFileNode def get_best_readable_version(self): """ I return a Deferred that fires with a MutableFileVersion representing the best readable version of the file that I represent """ return self.get_readable_version() def get_readable_version(self, servermap=None, version=None): """ I return a Deferred that fires with an MutableFileVersion for my version argument, if there is a recoverable file of that version on the grid. If there is no recoverable version, I fire with an UnrecoverableFileError. If a servermap is provided, I look in there for the requested version. If no servermap is provided, I create and update a new one. If no version is provided, then I return a MutableFileVersion representing the best recoverable version of the file. """ d = self._get_version_from_servermap(MODE_READ, servermap, version) def _build_version((servermap, their_version)): assert their_version in servermap.recoverable_versions() assert their_version in servermap.make_versionmap() mfv = MutableFileVersion(self, servermap, their_version, self._storage_index, self._storage_broker, self._readkey, history=self._history) assert mfv.is_readonly() mfv.set_downloader_hints(self._downloader_hints) # our caller can use this to download the contents of the # mutable file. return mfv return d.addCallback(_build_version) def _get_version_from_servermap(self, mode, servermap=None, version=None): """ I return a Deferred that fires with (servermap, version). This function performs validation and a servermap update. If it returns (servermap, version), the caller can assume that: - servermap was last updated in mode. - version is recoverable, and corresponds to the servermap. If version and servermap are provided to me, I will validate that version exists in the servermap, and that the servermap was updated correctly. If version is not provided, but servermap is, I will validate the servermap and return the best recoverable version that I can find in the servermap. 
If the version is provided but the servermap isn't, I will obtain a servermap that has been updated in the correct mode and validate that version is found and recoverable. If neither servermap nor version are provided, I will obtain a servermap updated in the correct mode, and return the best recoverable version that I can find in there. """ # XXX: wording ^^^^ if servermap and servermap.get_last_update()[0] == mode: d = defer.succeed(servermap) else: d = self._get_servermap(mode) def _get_version(servermap, v): if v and v not in servermap.recoverable_versions(): v = None elif not v: v = servermap.best_recoverable_version() if not v: raise UnrecoverableFileError("no recoverable versions") return (servermap, v) return d.addCallback(_get_version, version) def download_best_version(self): """ I return a Deferred that fires with the contents of the best version of this mutable file. """ return self._do_serialized(self._download_best_version) def _download_best_version(self): """ I am the serialized sibling of download_best_version. """ d = self.get_best_readable_version() d.addCallback(self._record_size) d.addCallback(lambda version: version.download_to_data()) # It is possible that the download will fail because there # aren't enough shares to be had. If so, we will try again after # updating the servermap in MODE_WRITE, which may find more # shares than updating in MODE_READ, as we just did. We can do # this by getting the best mutable version and downloading from # that -- the best mutable version will be a MutableFileVersion # with a servermap that was last updated in MODE_WRITE, as we # want. If this fails, then we give up. def _maybe_retry(failure): failure.trap(NotEnoughSharesError) d = self.get_best_mutable_version() d.addCallback(self._record_size) d.addCallback(lambda version: version.download_to_data()) return d d.addErrback(_maybe_retry) return d def _record_size(self, mfv): """ I record the size of a mutable file version. """ self._most_recent_size = mfv.get_size() return mfv def get_size_of_best_version(self): """ I return the size of the best version of this mutable file. This is equivalent to calling get_size() on the result of get_best_readable_version(). """ d = self.get_best_readable_version() return d.addCallback(lambda mfv: mfv.get_size()) ################################# # IMutableFileNode def get_best_mutable_version(self, servermap=None): """ I return a Deferred that fires with a MutableFileVersion representing the best readable version of the file that I represent. I am like get_best_readable_version, except that I will try to make a writeable version if I can. """ return self.get_mutable_version(servermap=servermap) def get_mutable_version(self, servermap=None, version=None): """ I return a version of this mutable file. I return a Deferred that fires with a MutableFileVersion. If version is provided, the Deferred will fire with a MutableFileVersion initialized with that version. Otherwise, it will fire with the best version that I can recover. If servermap is provided, I will use that to find versions instead of performing my own servermap update. """ if self.is_readonly(): return self.get_readable_version(servermap=servermap, version=version) # get_mutable_version => write intent, so we require that the # servermap is updated in MODE_WRITE d = self._get_version_from_servermap(MODE_WRITE, servermap, version) def _build_version((servermap, smap_version)): # these should have been set by the servermap update.
assert self._secret_holder assert self._writekey mfv = MutableFileVersion(self, servermap, smap_version, self._storage_index, self._storage_broker, self._readkey, self._writekey, self._secret_holder, history=self._history) assert not mfv.is_readonly() mfv.set_downloader_hints(self._downloader_hints) return mfv return d.addCallback(_build_version) # XXX: I'm uncomfortable with the difference between upload and # overwrite, which, FWICT, is basically that you don't have to # do a servermap update before you overwrite. We split them up # that way anyway, so I guess there's no real difficulty in # offering both ways to callers, but it also makes the # public-facing API cluttery, and makes it hard to discern the # right way of doing things. # In general, we leave it to callers to ensure that they aren't # going to cause UncoordinatedWriteErrors when working with # MutableFileVersions. We know that the next three operations # (upload, overwrite, and modify) will all operate on the same # version, so we say that only one of them can be going on at once, # and serialize them to ensure that that actually happens, since as # the caller in this situation it is our job to do that. def overwrite(self, new_contents): """ I overwrite the contents of the best recoverable version of this mutable file with new_contents. This is equivalent to calling overwrite on the result of get_best_mutable_version with new_contents as an argument. I return a Deferred that eventually fires with the results of my replacement process. """ # TODO: Update downloader hints. return self._do_serialized(self._overwrite, new_contents) def _overwrite(self, new_contents): """ I am the serialized sibling of overwrite. """ d = self.get_best_mutable_version() d.addCallback(lambda mfv: mfv.overwrite(new_contents)) d.addCallback(self._did_upload, new_contents.get_size()) return d def upload(self, new_contents, servermap): """ I overwrite the contents of the best recoverable version of this mutable file with new_contents, using servermap instead of creating/updating our own servermap. I return a Deferred that fires with the results of my upload. """ # TODO: Update downloader hints return self._do_serialized(self._upload, new_contents, servermap) def modify(self, modifier, backoffer=None): """ I modify the contents of the best recoverable version of this mutable file with the modifier. This is equivalent to calling modify on the result of get_best_mutable_version. I return a Deferred that eventually fires with an UploadResults instance describing this process. """ # TODO: Update downloader hints. return self._do_serialized(self._modify, modifier, backoffer) def _modify(self, modifier, backoffer): """ I am the serialized sibling of modify. """ d = self.get_best_mutable_version() d.addCallback(lambda mfv: mfv.modify(modifier, backoffer)) return d def download_version(self, servermap, version, fetch_privkey=False): """ Download the specified version of this mutable file. I return a Deferred that fires with the contents of the specified version as a bytestring, or errbacks if the file is not recoverable. """ d = self.get_readable_version(servermap, version) return d.addCallback(lambda mfv: mfv.download_to_data(fetch_privkey)) def get_servermap(self, mode): """ I return a servermap that has been updated in mode. mode should be one of MODE_READ, MODE_WRITE, MODE_CHECK or MODE_ANYTHING. See servermap.py for more on what these mean. 
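For illustration only -- a sketch, assuming the caller already holds a
writeable MutableFileNode named node -- a coordinated write that reuses
the resulting servermap might look like::

    d = node.get_servermap(MODE_WRITE)
    d.addCallback(lambda smap: node.upload(MutableData("new contents"), smap))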
""" return self._do_serialized(self._get_servermap, mode) def _get_servermap(self, mode): """ I am a serialized twin to get_servermap. """ servermap = ServerMap() d = self._update_servermap(servermap, mode) # The servermap will tell us about the most recent size of the # file, so we may as well set that so that callers might get # more data about us. if not self._most_recent_size: d.addCallback(self._get_size_from_servermap) return d def _get_size_from_servermap(self, servermap): """ I extract the size of the best version of this file and record it in self._most_recent_size. I return the servermap that I was given. """ if servermap.recoverable_versions(): v = servermap.best_recoverable_version() size = v[4] # verinfo[4] == size self._most_recent_size = size return servermap def _update_servermap(self, servermap, mode): u = ServermapUpdater(self, self._storage_broker, Monitor(), servermap, mode) if self._history: self._history.notify_mapupdate(u.get_status()) return u.update() #def set_version(self, version): # I can be set in two ways: # 1. When the node is created. # 2. (for an existing share) when the Servermap is updated # before I am read. # assert version in (MDMF_VERSION, SDMF_VERSION) # self._protocol_version = version def get_version(self): return self._protocol_version def _do_serialized(self, cb, *args, **kwargs): # note: to avoid deadlock, this callable is *not* allowed to invoke # other serialized methods within this (or any other) # MutableFileNode. The callable should be a bound method of this same # MFN instance. d = defer.Deferred() self._serializer.addCallback(lambda ignore: cb(*args, **kwargs)) # we need to put off d.callback until this Deferred is finished being # processed. Otherwise the caller's subsequent activities (like, # doing other things with this node) can cause reentrancy problems in # the Deferred code itself self._serializer.addBoth(lambda res: eventually(d.callback, res)) # add a log.err just in case something really weird happens, because # self._serializer stays around forever, therefore we won't see the # usual Unhandled Error in Deferred that would give us a hint. self._serializer.addErrback(log.err) return d def _upload(self, new_contents, servermap): """ A MutableFileNode still has to have some way of getting published initially, which is what I am here for. After that, all publishing, updating, modifying and so on happens through MutableFileVersions. """ assert self._pubkey, "update_servermap must be called before publish" # Define IPublishInvoker with a set_downloader_hints method? # Then have the publisher call that method when it's done publishing? p = Publish(self, self._storage_broker, servermap) if self._history: self._history.notify_publish(p.get_status(), new_contents.get_size()) d = p.publish(new_contents) d.addCallback(self._did_upload, new_contents.get_size()) return d def set_downloader_hints(self, hints): self._downloader_hints = hints def _did_upload(self, res, size): self._most_recent_size = size return res class MutableFileVersion: """ I represent a specific version (most likely the best version) of a mutable file. Since I implement IReadable, instances which hold a reference to an instance of me are guaranteed the ability (absent connection difficulties or unrecoverable versions) to read the file that I represent. Depending on whether I was initialized with a write capability or not, I may also provide callers the ability to overwrite or modify the contents of the mutable file that I reference. 
""" implements(IMutableFileVersion, IWriteable) def __init__(self, node, servermap, version, storage_index, storage_broker, readcap, writekey=None, write_secrets=None, history=None): self._node = node self._servermap = servermap self._version = version self._storage_index = storage_index self._write_secrets = write_secrets self._history = history self._storage_broker = storage_broker #assert isinstance(readcap, IURI) self._readcap = readcap self._writekey = writekey self._serializer = defer.succeed(None) def get_sequence_number(self): """ Get the sequence number of the mutable version that I represent. """ return self._version[0] # verinfo[0] == the sequence number # TODO: Terminology? def get_writekey(self): """ I return a writekey or None if I don't have a writekey. """ return self._writekey def set_downloader_hints(self, hints): """ I set the downloader hints. """ assert isinstance(hints, dict) self._downloader_hints = hints def get_downloader_hints(self): """ I return the downloader hints. """ return self._downloader_hints def overwrite(self, new_contents): """ I overwrite the contents of this mutable file version with the data in new_contents. """ assert not self.is_readonly() return self._do_serialized(self._overwrite, new_contents) def _overwrite(self, new_contents): assert IMutableUploadable.providedBy(new_contents) assert self._servermap.get_last_update()[0] == MODE_WRITE return self._upload(new_contents) def modify(self, modifier, backoffer=None): """I use a modifier callback to apply a change to the mutable file. I implement the following pseudocode:: obtain_mutable_filenode_lock() first_time = True while True: update_servermap(MODE_WRITE) old = retrieve_best_version() new = modifier(old, servermap, first_time) first_time = False if new == old: break try: publish(new) except UncoordinatedWriteError, e: backoffer(e) continue break release_mutable_filenode_lock() The idea is that your modifier function can apply a delta of some sort, and it will be re-run as necessary until it succeeds. The modifier must inspect the old version to see whether its delta has already been applied: if so it should return the contents unmodified. Note that the modifier is required to run synchronously, and must not invoke any methods on this MutableFileNode instance. The backoff-er is a callable that is responsible for inserting a random delay between subsequent attempts, to help competing updates from colliding forever. It is also allowed to give up after a while. The backoffer is given two arguments: this MutableFileNode, and the Failure object that contains the UncoordinatedWriteError. It should return a Deferred that will fire when the next attempt should be made, or return the Failure if the loop should give up. If backoffer=None, a default one is provided which will perform exponential backoff, and give up after 4 tries. Note that the backoffer should not invoke any methods on this MutableFileNode instance, and it needs to be highly conscious of deadlock issues. """ assert not self.is_readonly() return self._do_serialized(self._modify, modifier, backoffer) def _modify(self, modifier, backoffer): if backoffer is None: backoffer = BackoffAgent().delay return self._modify_and_retry(modifier, backoffer, True) def _modify_and_retry(self, modifier, backoffer, first_time): """ I try to apply modifier to the contents of this version of the mutable file. If I succeed, I return an UploadResults instance describing my success. If I fail, I try again after waiting for a little bit. 
""" log.msg("doing modify") if first_time: d = self._update_servermap() else: # We ran into trouble; do MODE_CHECK so we're a little more # careful on subsequent tries. d = self._update_servermap(mode=MODE_CHECK) d.addCallback(lambda ignored: self._modify_once(modifier, first_time)) def _retry(f): f.trap(UncoordinatedWriteError) # Uh oh, it broke. We're allowed to trust the servermap for our # first try, but after that we need to update it. It's # possible that we've failed due to a race with another # uploader, and if the race is to converge correctly, we # need to know about that upload. d2 = defer.maybeDeferred(backoffer, self, f) d2.addCallback(lambda ignored: self._modify_and_retry(modifier, backoffer, False)) return d2 d.addErrback(_retry) return d def _modify_once(self, modifier, first_time): """ I attempt to apply a modifier to the contents of the mutable file. """ assert self._servermap.get_last_update()[0] != MODE_READ # download_to_data is serialized, so we have to call this to # avoid deadlock. d = self._try_to_download_data() def _apply(old_contents): new_contents = modifier(old_contents, self._servermap, first_time) precondition((isinstance(new_contents, str) or new_contents is None), "Modifier function must return a string " "or None") if new_contents is None or new_contents == old_contents: log.msg("no changes") # no changes need to be made if first_time: return # However, since Publish is not automatically doing a # recovery when it observes UCWE, we need to do a second # publish. See #551 for details. We'll basically loop until # we managed an uncontested publish. old_uploadable = MutableData(old_contents) new_contents = old_uploadable else: new_contents = MutableData(new_contents) return self._upload(new_contents) d.addCallback(_apply) return d def is_readonly(self): """ I return True if this MutableFileVersion provides no write access to the file that it encapsulates, and False if it provides the ability to modify the file. """ return self._writekey is None def is_mutable(self): """ I return True, since mutable files are always mutable by somebody. """ return True def get_storage_index(self): """ I return the storage index of the reference that I encapsulate. """ return self._storage_index def get_size(self): """ I return the length, in bytes, of this readable object. """ return self._servermap.size_of_version(self._version) def download_to_data(self, fetch_privkey=False): """ I return a Deferred that fires with the contents of this readable object as a byte string. """ c = consumer.MemoryConsumer() d = self.read(c, fetch_privkey=fetch_privkey) d.addCallback(lambda mc: "".join(mc.chunks)) return d def _try_to_download_data(self): """ I am an unserialized cousin of download_to_data; I am called from the children of modify() to download the data associated with this mutable version. """ c = consumer.MemoryConsumer() # modify will almost certainly write, so we need the privkey. d = self._read(c, fetch_privkey=True) d.addCallback(lambda mc: "".join(mc.chunks)) return d def read(self, consumer, offset=0, size=None, fetch_privkey=False): """ I read a portion (possibly all) of the mutable file that I reference into consumer. """ return self._do_serialized(self._read, consumer, offset, size, fetch_privkey) def _read(self, consumer, offset=0, size=None, fetch_privkey=False): """ I am the serialized companion of read. 
""" r = Retrieve(self._node, self._storage_broker, self._servermap, self._version, fetch_privkey) if self._history: self._history.notify_retrieve(r.get_status()) d = r.download(consumer, offset, size) return d def _do_serialized(self, cb, *args, **kwargs): # note: to avoid deadlock, this callable is *not* allowed to invoke # other serialized methods within this (or any other) # MutableFileNode. The callable should be a bound method of this same # MFN instance. d = defer.Deferred() self._serializer.addCallback(lambda ignore: cb(*args, **kwargs)) # we need to put off d.callback until this Deferred is finished being # processed. Otherwise the caller's subsequent activities (like, # doing other things with this node) can cause reentrancy problems in # the Deferred code itself self._serializer.addBoth(lambda res: eventually(d.callback, res)) # add a log.err just in case something really weird happens, because # self._serializer stays around forever, therefore we won't see the # usual Unhandled Error in Deferred that would give us a hint. self._serializer.addErrback(log.err) return d def _upload(self, new_contents): #assert self._pubkey, "update_servermap must be called before publish" p = Publish(self._node, self._storage_broker, self._servermap) if self._history: self._history.notify_publish(p.get_status(), new_contents.get_size()) d = p.publish(new_contents) d.addCallback(self._did_upload, new_contents.get_size()) return d def _did_upload(self, res, size): self._most_recent_size = size return res def update(self, data, offset): """ Do an update of this mutable file version by inserting data at offset within the file. If offset is the EOF, this is an append operation. I return a Deferred that fires with the results of the update operation when it has completed. In cases where update does not append any data, or where it does not append so many blocks that the block count crosses a power-of-two boundary, this operation will use roughly O(data.get_size()) memory/bandwidth/CPU to perform the update. Otherwise, it must download, re-encode, and upload the entire file again, which will use O(filesize) resources. """ return self._do_serialized(self._update, data, offset) def _update(self, data, offset): """ I update the mutable file version represented by this particular IMutableVersion by inserting the data in data at the offset offset. I return a Deferred that fires when this has been completed. """ new_size = data.get_size() + offset old_size = self.get_size() segment_size = self._version[3] num_old_segments = mathutil.div_ceil(old_size, segment_size) num_new_segments = mathutil.div_ceil(new_size, segment_size) log.msg("got %d old segments, %d new segments" % \ (num_old_segments, num_new_segments)) # We do a whole file re-encode if the file is an SDMF file. if self._version[2]: # version[2] == SDMF salt, which MDMF lacks log.msg("doing re-encode instead of in-place update") return self._do_modify_update(data, offset) # Otherwise, we can replace just the parts that are changing. log.msg("updating in place") d = self._do_update_update(data, offset) d.addCallback(self._decode_and_decrypt_segments, data, offset) d.addCallback(self._build_uploadable_and_finish, data, offset) return d def _do_modify_update(self, data, offset): """ I perform a file update by modifying the contents of the file after downloading it, then reuploading it. I am less efficient than _do_update_update, but am necessary for certain updates. 
""" def m(old, servermap, first_time): start = offset rest = offset + data.get_size() new = old[:start] new += "".join(data.read(data.get_size())) new += old[rest:] return new return self._modify(m, None) def _do_update_update(self, data, offset): """ I start the Servermap update that gets us the data we need to continue the update process. I return a Deferred that fires when the servermap update is done. """ assert IMutableUploadable.providedBy(data) assert self.is_mutable() # offset == self.get_size() is valid and means that we are # appending data to the file. assert offset <= self.get_size() segsize = self._version[3] # We'll need the segment that the data starts in, regardless of # what we'll do later. start_segment = offset // segsize # We only need the end segment if the data we append does not go # beyond the current end-of-file. end_segment = start_segment if offset + data.get_size() < self.get_size(): end_data = offset + data.get_size() # The last byte we touch is the end_data'th byte, which is actually # byte end_data - 1 because bytes are zero-indexed. end_data -= 1 end_segment = end_data // segsize self._start_segment = start_segment self._end_segment = end_segment # Now ask for the servermap to be updated in MODE_WRITE with # this update range. return self._update_servermap(update_range=(start_segment, end_segment)) def _decode_and_decrypt_segments(self, ignored, data, offset): """ After the servermap update, I take the encrypted and encoded data that the servermap fetched while doing its update and transform it into decoded-and-decrypted plaintext that can be used by the new uploadable. I return a Deferred that fires with the segments. """ r = Retrieve(self._node, self._storage_broker, self._servermap, self._version) # decode: takes in our blocks and salts from the servermap, # returns a Deferred that fires with the corresponding plaintext # segments. Does not download -- simply takes advantage of # existing infrastructure within the Retrieve class to avoid # duplicating code. sm = self._servermap # XXX: If the methods in the servermap don't work as # abstractions, you should rewrite them instead of going around # them. update_data = sm.update_data start_segments = {} # shnum -> start segment end_segments = {} # shnum -> end segment blockhashes = {} # shnum -> blockhash tree for (shnum, original_data) in update_data.iteritems(): data = [d[1] for d in original_data if d[0] == self._version] # data is [(blockhashes,start,end)..] # Every data entry in our list should now be share shnum for # a particular version of the mutable file, so all of the # entries should be identical. datum = data[0] assert [x for x in data if x != datum] == [] # datum is (blockhashes,start,end) blockhashes[shnum] = datum[0] start_segments[shnum] = datum[1] # (block,salt) bytestrings end_segments[shnum] = datum[2] d1 = r.decode(start_segments, self._start_segment) d2 = r.decode(end_segments, self._end_segment) d3 = defer.succeed(blockhashes) return deferredutil.gatherResults([d1, d2, d3]) def _build_uploadable_and_finish(self, segments_and_bht, data, offset): """ After the process has the plaintext segments, I build the TransformingUploadable that the publisher will eventually re-upload to the grid. I then invoke the publisher with that uploadable, and return a Deferred when the publish operation has completed without issue. 
""" u = TransformingUploadable(data, offset, self._version[3], segments_and_bht[0], segments_and_bht[1]) p = Publish(self._node, self._storage_broker, self._servermap) return p.update(u, offset, segments_and_bht[2], self._version) def _update_servermap(self, mode=MODE_WRITE, update_range=None): """ I update the servermap. I return a Deferred that fires when the servermap update is done. """ if update_range: u = ServermapUpdater(self._node, self._storage_broker, Monitor(), self._servermap, mode=mode, update_range=update_range) else: u = ServermapUpdater(self._node, self._storage_broker, Monitor(), self._servermap, mode=mode) return u.update() allmydata-tahoe-1.10.2/src/allmydata/mutable/layout.py0000644000175000017500000021312012556560070021045 0ustar ramram import struct from allmydata.mutable.common import NeedMoreDataError, UnknownVersionError, \ BadShareError from allmydata.interfaces import HASH_SIZE, SALT_SIZE, SDMF_VERSION, \ MDMF_VERSION, IMutableSlotWriter from allmydata.util import mathutil from twisted.python import failure from twisted.internet import defer from zope.interface import implements # These strings describe the format of the packed structs they help process. # Here's what they mean: # # PREFIX: # >: Big-endian byte order; the most significant byte is first (leftmost). # B: The container version information; stored as an unsigned 8-bit integer. # This is currently either SDMF_VERSION or MDMF_VERSION. # Q: The sequence number; this is sort of like a revision history for # mutable files; they start at 1 and increase as they are changed after # being uploaded. Stored as an unsigned 64-bit integer. # 32s: The root hash of the share hash tree. We use sha-256d, so we use 32 # bytes to store the value. # 16s: The salt for the readkey. This is a 16-byte random value. # # SIGNED_PREFIX additions, things that are covered by the signature: # B: The "k" encoding parameter. We store this as an unsigned 8-bit # integer, since our erasure coding scheme cannot encode to more than # 255 pieces. # B: The "N" encoding parameter. Stored as an unsigned 8-bit integer for # the same reason as above. # Q: The segment size of the uploaded file. This is an unsigned 64-bit # integer, to allow handling large segments and files. For SDMF the # segment size is the data length plus padding; for MDMF it can be # smaller. # Q: The data length of the uploaded file. Like the segment size field, # it is an unsigned 64-bit integer. # # HEADER additions: # L: The offset of the signature. An unsigned 32-bit integer. # L: The offset of the share hash chain. An unsigned 32-bit integer. # L: The offset of the block hash tree. An unsigned 32-bit integer. # L: The offset of the share data. An unsigned 32-bit integer. # Q: The offset of the encrypted private key. An unsigned 64-bit integer, # to account for the possibility of a lot of share data. # Q: The offset of the EOF. An unsigned 64-bit integer, to account for # the possibility of a lot of share data. # # After all of these, we have the following: # - The verification key: Occupies the space between the end of the header # and the start of the signature (i.e.: data[HEADER_LENGTH:o['signature']]. # - The signature, which goes from the signature offset to the share hash # chain offset. # - The share hash chain, which goes from the share hash chain offset to # the block hash tree offset. # - The share data, which goes from the share data offset to the encrypted # private key offset. # - The encrypted private key offset, which goes until the end of the file. 
# # The block hash tree in this encoding has only one share, so the offset of # the share data will be 32 bits more than the offset of the block hash tree. # Given this, we may need to check to see how many bytes a reasonably sized # block hash tree will take up. PREFIX = ">BQ32s16s" # each version may have a different prefix SIGNED_PREFIX = ">BQ32s16s BBQQ" # this is covered by the signature SIGNED_PREFIX_LENGTH = struct.calcsize(SIGNED_PREFIX) HEADER = ">BQ32s16s BBQQ LLLLQQ" # includes offsets HEADER_LENGTH = struct.calcsize(HEADER) OFFSETS = ">LLLLQQ" OFFSETS_LENGTH = struct.calcsize(OFFSETS) MAX_MUTABLE_SHARE_SIZE = 69105*1000*1000*1000*1000 # 69105 TB, kind of arbitrary # These are still used for some tests of SDMF files. def unpack_header(data): o = {} (version, seqnum, root_hash, IV, k, N, segsize, datalen, o['signature'], o['share_hash_chain'], o['block_hash_tree'], o['share_data'], o['enc_privkey'], o['EOF']) = struct.unpack(HEADER, data[:HEADER_LENGTH]) return (version, seqnum, root_hash, IV, k, N, segsize, datalen, o) def unpack_share(data): assert len(data) >= HEADER_LENGTH o = {} (version, seqnum, root_hash, IV, k, N, segsize, datalen, o['signature'], o['share_hash_chain'], o['block_hash_tree'], o['share_data'], o['enc_privkey'], o['EOF']) = struct.unpack(HEADER, data[:HEADER_LENGTH]) if version != 0: raise UnknownVersionError("got mutable share version %d, but I only understand version 0" % version) if len(data) < o['EOF']: raise NeedMoreDataError(o['EOF'], o['enc_privkey'], o['EOF']-o['enc_privkey']) pubkey = data[HEADER_LENGTH:o['signature']] signature = data[o['signature']:o['share_hash_chain']] share_hash_chain_s = data[o['share_hash_chain']:o['block_hash_tree']] share_hash_format = ">H32s" hsize = struct.calcsize(share_hash_format) if len(share_hash_chain_s) % hsize != 0: raise BadShareError("hash chain is %d bytes, not multiple of %d" % (len(share_hash_chain_s), hsize)) share_hash_chain = [] for i in range(0, len(share_hash_chain_s), hsize): chunk = share_hash_chain_s[i:i+hsize] (hid, h) = struct.unpack(share_hash_format, chunk) share_hash_chain.append( (hid, h) ) share_hash_chain = dict(share_hash_chain) block_hash_tree_s = data[o['block_hash_tree']:o['share_data']] if len(block_hash_tree_s) % 32 != 0: raise BadShareError("block_hash_tree is %d bytes, not multiple of %d" % (len(block_hash_tree_s), 32)) block_hash_tree = [] for i in range(0, len(block_hash_tree_s), 32): block_hash_tree.append(block_hash_tree_s[i:i+32]) share_data = data[o['share_data']:o['enc_privkey']] enc_privkey = data[o['enc_privkey']:o['EOF']] return (seqnum, root_hash, IV, k, N, segsize, datalen, pubkey, signature, share_hash_chain, block_hash_tree, share_data, enc_privkey) def get_version_from_checkstring(checkstring): (t, ) = struct.unpack(">B", checkstring[:1]) return t def unpack_sdmf_checkstring(checkstring): cs_len = struct.calcsize(PREFIX) version, seqnum, root_hash, IV = struct.unpack(PREFIX, checkstring[:cs_len]) assert version == SDMF_VERSION, version return (seqnum, root_hash, IV) def unpack_mdmf_checkstring(checkstring): cs_len = struct.calcsize(MDMFCHECKSTRING) version, seqnum, root_hash = struct.unpack(MDMFCHECKSTRING, checkstring[:cs_len]) assert version == MDMF_VERSION, version return (seqnum, root_hash) def pack_offsets(verification_key_length, signature_length, share_hash_chain_length, block_hash_tree_length, share_data_length, encprivkey_length): post_offset = HEADER_LENGTH offsets = {} o1 = offsets['signature'] = post_offset + verification_key_length o2 = 
offsets['share_hash_chain'] = o1 + signature_length o3 = offsets['block_hash_tree'] = o2 + share_hash_chain_length o4 = offsets['share_data'] = o3 + block_hash_tree_length o5 = offsets['enc_privkey'] = o4 + share_data_length offsets['EOF'] = o5 + encprivkey_length return struct.pack(">LLLLQQ", offsets['signature'], offsets['share_hash_chain'], offsets['block_hash_tree'], offsets['share_data'], offsets['enc_privkey'], offsets['EOF']) def pack_share(prefix, verification_key, signature, share_hash_chain, block_hash_tree, share_data, encprivkey): share_hash_chain_s = "".join([struct.pack(">H32s", i, share_hash_chain[i]) for i in sorted(share_hash_chain.keys())]) for h in block_hash_tree: assert len(h) == 32 block_hash_tree_s = "".join(block_hash_tree) offsets = pack_offsets(len(verification_key), len(signature), len(share_hash_chain_s), len(block_hash_tree_s), len(share_data), len(encprivkey)) final_share = "".join([prefix, offsets, verification_key, signature, share_hash_chain_s, block_hash_tree_s, share_data, encprivkey]) return final_share def pack_prefix(seqnum, root_hash, IV, required_shares, total_shares, segment_size, data_length): prefix = struct.pack(SIGNED_PREFIX, 0, # version, seqnum, root_hash, IV, required_shares, total_shares, segment_size, data_length, ) return prefix class SDMFSlotWriteProxy: implements(IMutableSlotWriter) """ I represent a remote write slot for an SDMF mutable file. I build a share in memory, and then write it in one piece to the remote server. This mimics how SDMF shares were built before MDMF (and the new MDMF uploader), but provides that functionality in a way that allows the MDMF uploader to be built without much special-casing for file format, which makes the uploader code more readable. """ def __init__(self, shnum, rref, # a remote reference to a storage server storage_index, secrets, # (write_enabler, renew_secret, cancel_secret) seqnum, # the sequence number of the mutable file required_shares, total_shares, segment_size, data_length): # the length of the original file self.shnum = shnum self._rref = rref self._storage_index = storage_index self._secrets = secrets self._seqnum = seqnum self._required_shares = required_shares self._total_shares = total_shares self._segment_size = segment_size self._data_length = data_length # This is an SDMF file, so it should have only one segment, so, # modulo padding of the data length, the segment size and the # data length should be the same. expected_segment_size = mathutil.next_multiple(data_length, self._required_shares) assert expected_segment_size == segment_size self._block_size = self._segment_size / self._required_shares # This is meant to mimic how SDMF files were built before MDMF # entered the picture: we generate each share in its entirety, # then push it off to the storage server in one write. When # callers call set_*, they are just populating this dict. # finish_publishing will stitch these pieces together into a # coherent share, and then write the coherent share to the # storage server. self._share_pieces = {} # This tells the write logic what checkstring to use when # writing remote shares. self._testvs = [] self._readvs = [(0, struct.calcsize(PREFIX))] def set_checkstring(self, checkstring_or_seqnum, root_hash=None, salt=None): """ Set the checkstring that I will pass to the remote server when writing. @param checkstring_or_seqnum: A packed checkstring to use, or a sequence number. 
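        For example (an illustrative sketch with hypothetical values): given a
        sequence number plus root_hash and salt, I pack them into the 57-byte
        PREFIX format,

            checkstring = struct.pack(PREFIX, 0, 3, "\x11"*32, "\x22"*16)
            testv = (0, len(checkstring), "eq", checkstring)   # len == 57

        whereas a packed checkstring passed on its own is used verbatim; either
        way the resulting test vector has the shape shown above.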
I will treat this as a checkstr Note that implementations can differ in which semantics they wish to support for set_checkstring -- they can, for example, build the checkstring themselves from its constituents, or some other thing. """ if root_hash and salt: checkstring = struct.pack(PREFIX, 0, checkstring_or_seqnum, root_hash, salt) else: checkstring = checkstring_or_seqnum self._testvs = [(0, len(checkstring), "eq", checkstring)] def get_checkstring(self): """ Get the checkstring that I think currently exists on the remote server. """ if self._testvs: return self._testvs[0][3] return "" def put_block(self, data, segnum, salt): """ Add a block and salt to the share. """ # SDMF files have only one segment assert segnum == 0 assert len(data) == self._block_size assert len(salt) == SALT_SIZE self._share_pieces['sharedata'] = data self._share_pieces['salt'] = salt # TODO: Figure out something intelligent to return. return defer.succeed(None) def put_encprivkey(self, encprivkey): """ Add the encrypted private key to the share. """ self._share_pieces['encprivkey'] = encprivkey return defer.succeed(None) def put_blockhashes(self, blockhashes): """ Add the block hash tree to the share. """ assert isinstance(blockhashes, list) for h in blockhashes: assert len(h) == HASH_SIZE # serialize the blockhashes, then set them. blockhashes_s = "".join(blockhashes) self._share_pieces['block_hash_tree'] = blockhashes_s return defer.succeed(None) def put_sharehashes(self, sharehashes): """ Add the share hash chain to the share. """ assert isinstance(sharehashes, dict) for h in sharehashes.itervalues(): assert len(h) == HASH_SIZE # serialize the sharehashes, then set them. sharehashes_s = "".join([struct.pack(">H32s", i, sharehashes[i]) for i in sorted(sharehashes.keys())]) self._share_pieces['share_hash_chain'] = sharehashes_s return defer.succeed(None) def put_root_hash(self, root_hash): """ Add the root hash to the share. """ assert len(root_hash) == HASH_SIZE self._share_pieces['root_hash'] = root_hash return defer.succeed(None) def put_salt(self, salt): """ Add a salt to an empty SDMF file. """ assert len(salt) == SALT_SIZE self._share_pieces['salt'] = salt self._share_pieces['sharedata'] = "" def get_signable(self): """ Return the part of the share that needs to be signed. SDMF writers need to sign the packed representation of the first eight fields of the remote share, that is: - version number (0) - sequence number - root of the share hash tree - salt - k - n - segsize - datalen This method is responsible for returning that to callers. """ return struct.pack(SIGNED_PREFIX, 0, self._seqnum, self._share_pieces['root_hash'], self._share_pieces['salt'], self._required_shares, self._total_shares, self._segment_size, self._data_length) def put_signature(self, signature): """ Add the signature to the share. """ self._share_pieces['signature'] = signature return defer.succeed(None) def put_verification_key(self, verification_key): """ Add the verification key to the share. """ self._share_pieces['verification_key'] = verification_key return defer.succeed(None) def get_verinfo(self): """ I return my verinfo tuple. This is used by the ServermapUpdater to keep track of versions of mutable files. The verinfo tuple for MDMF files contains: - seqnum - root hash - a blank (nothing) - segsize - datalen - k - n - prefix (the thing that you sign) - a tuple of offsets We include the nonce in MDMF to simplify processing of version information tuples. 
The verinfo tuple for SDMF files is the same, but contains a 16-byte IV instead of a hash of salts. """ return (self._seqnum, self._share_pieces['root_hash'], self._share_pieces['salt'], self._segment_size, self._data_length, self._required_shares, self._total_shares, self.get_signable(), self._get_offsets_tuple()) def _get_offsets_dict(self): post_offset = HEADER_LENGTH offsets = {} verification_key_length = len(self._share_pieces['verification_key']) o1 = offsets['signature'] = post_offset + verification_key_length signature_length = len(self._share_pieces['signature']) o2 = offsets['share_hash_chain'] = o1 + signature_length share_hash_chain_length = len(self._share_pieces['share_hash_chain']) o3 = offsets['block_hash_tree'] = o2 + share_hash_chain_length block_hash_tree_length = len(self._share_pieces['block_hash_tree']) o4 = offsets['share_data'] = o3 + block_hash_tree_length share_data_length = len(self._share_pieces['sharedata']) o5 = offsets['enc_privkey'] = o4 + share_data_length encprivkey_length = len(self._share_pieces['encprivkey']) offsets['EOF'] = o5 + encprivkey_length return offsets def _get_offsets_tuple(self): offsets = self._get_offsets_dict() return tuple([(key, value) for key, value in offsets.items()]) def _pack_offsets(self): offsets = self._get_offsets_dict() return struct.pack(">LLLLQQ", offsets['signature'], offsets['share_hash_chain'], offsets['block_hash_tree'], offsets['share_data'], offsets['enc_privkey'], offsets['EOF']) def finish_publishing(self): """ Do anything necessary to finish writing the share to a remote server. I require that no further publishing needs to take place after this method has been called. """ for k in ["sharedata", "encprivkey", "signature", "verification_key", "share_hash_chain", "block_hash_tree"]: assert k in self._share_pieces, (self.shnum, k, self._share_pieces.keys()) # This is the only method that actually writes something to the # remote server. # First, we need to pack the share into data that we can write # to the remote server in one write. offsets = self._pack_offsets() prefix = self.get_signable() final_share = "".join([prefix, offsets, self._share_pieces['verification_key'], self._share_pieces['signature'], self._share_pieces['share_hash_chain'], self._share_pieces['block_hash_tree'], self._share_pieces['sharedata'], self._share_pieces['encprivkey']]) # Our only data vector is going to be writing the final share, # in its entirely. datavs = [(0, final_share)] if not self._testvs: # Our caller has not provided us with another checkstring # yet, so we assume that we are writing a new share, and set # a test vector that will allow a new share to be written. self._testvs = [] self._testvs.append(tuple([0, 1, "eq", ""])) tw_vectors = {} tw_vectors[self.shnum] = (self._testvs, datavs, None) return self._rref.callRemote("slot_testv_and_readv_and_writev", self._storage_index, self._secrets, tw_vectors, # TODO is it useful to read something? self._readvs) MDMFHEADER = ">BQ32sBBQQ QQQQQQQQ" MDMFHEADERWITHOUTOFFSETS = ">BQ32sBBQQ" MDMFHEADERSIZE = struct.calcsize(MDMFHEADER) MDMFHEADERWITHOUTOFFSETSSIZE = struct.calcsize(MDMFHEADERWITHOUTOFFSETS) MDMFCHECKSTRING = ">BQ32s" MDMFSIGNABLEHEADER = ">BQ32sBBQQ" MDMFOFFSETS = ">QQQQQQQQ" MDMFOFFSETS_LENGTH = struct.calcsize(MDMFOFFSETS) PRIVATE_KEY_SIZE = 1220 SIGNATURE_SIZE = 260 VERIFICATION_KEY_SIZE = 292 # We know we won't have more than 256 shares, and we know that we won't need # to store more than ln2(256) hash-chain nodes to validate, so that's our # bound. 
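# Concretely (an illustrative figure consistent with the constants used here):
# that bound is log_ceil(256, 2) == 8 chain nodes of 2 + 32 == 34 bytes each,
# i.e. 272 bytes reserved below.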
Each node requires 2 bytes of node-number plus 32 bytes of hash. SHARE_HASH_CHAIN_SIZE = (2+HASH_SIZE)*mathutil.log_ceil(256, 2) class MDMFSlotWriteProxy: implements(IMutableSlotWriter) """ I represent a remote write slot for an MDMF mutable file. I abstract away from my caller the details of block and salt management, and the implementation of the on-disk format for MDMF shares. """ # Expected layout, MDMF: # offset: size: name: #-- signed part -- # 0 1 version number (01) # 1 8 sequence number # 9 32 share tree root hash # 41 1 The "k" encoding parameter # 42 1 The "N" encoding parameter # 43 8 The segment size of the uploaded file # 51 8 The data length of the original plaintext #-- end signed part -- # 59 8 The offset of the encrypted private key # 67 8 The offset of the share hash chain # 75 8 The offset of the signature # 83 8 The offset of the verification key # 91 8 The offset of the end of the v. key. # 99 8 The offset of the share data # 107 8 The offset of the block hash tree # 115 8 The offset of EOF # 123 var encrypted private key # var var share hash chain # var var signature # var var verification key # var large share data # var var block hash tree # # We order the fields that way to make smart downloaders -- downloaders # which prempetively read a big part of the share -- possible. # # The checkstring is the first three fields -- the version number, # sequence number, root hash and root salt hash. This is consistent # in meaning to what we have with SDMF files, except now instead of # using the literal salt, we use a value derived from all of the # salts -- the share hash root. # # The salt is stored before the block for each segment. The block # hash tree is computed over the combination of block and salt for # each segment. In this way, we get integrity checking for both # block and salt with the current block hash tree arrangement. # # The ordering of the offsets is different to reflect the dependencies # that we'll run into with an MDMF file. The expected write flow is # something like this: # # 0: Initialize with the sequence number, encoding parameters and # data length. From this, we can deduce the number of segments, # and where they should go.. We can also figure out where the # encrypted private key should go, because we can figure out how # big the share data will be. # # 1: Encrypt, encode, and upload the file in chunks. Do something # like # # put_block(data, segnum, salt) # # to write a block and a salt to the disk. We can do both of # these operations now because we have enough of the offsets to # know where to put them. # # 2: Put the encrypted private key. Use: # # put_encprivkey(encprivkey) # # Now that we know the length of the private key, we can fill # in the offset for the block hash tree. # # 3: We're now in a position to upload the block hash tree for # a share. Put that using something like: # # put_blockhashes(block_hash_tree) # # Note that block_hash_tree is a list of hashes -- we'll take # care of the details of serializing that appropriately. When # we get the block hash tree, we are also in a position to # calculate the offset for the share hash chain, and fill that # into the offsets table. # # 4: We're now in a position to upload the share hash chain for # a share. Do that with something like: # # put_sharehashes(share_hash_chain) # # share_hash_chain should be a dictionary mapping shnums to # 32-byte hashes -- the wrapper handles serialization. # We'll know where to put the signature at this point, also. 
# The root of this tree will be put explicitly in the next # step. # # 5: Before putting the signature, we must first put the # root_hash. Do this with: # # put_root_hash(root_hash). # # In terms of knowing where to put this value, it was always # possible to place it, but it makes sense semantically to # place it after the share hash tree, so that's why you do it # in this order. # # 6: With the root hash put, we can now sign the header. Use: # # get_signable() # # to get the part of the header that you want to sign, and use: # # put_signature(signature) # # to write your signature to the remote server. # # 6: Add the verification key, and finish. Do: # # put_verification_key(key) # # and # # finish_publish() # # Checkstring management: # # To write to a mutable slot, we have to provide test vectors to ensure # that we are writing to the same data that we think we are. These # vectors allow us to detect uncoordinated writes; that is, writes # where both we and some other shareholder are writing to the # mutable slot, and to report those back to the parts of the program # doing the writing. # # With SDMF, this was easy -- all of the share data was written in # one go, so it was easy to detect uncoordinated writes, and we only # had to do it once. With MDMF, not all of the file is written at # once. # # If a share is new, we write out as much of the header as we can # before writing out anything else. This gives other writers a # canary that they can use to detect uncoordinated writes, and, if # they do the same thing, gives us the same canary. We them update # the share. We won't be able to write out two fields of the header # -- the share tree hash and the salt hash -- until we finish # writing out the share. We only require the writer to provide the # initial checkstring, and keep track of what it should be after # updates ourselves. # # If we haven't written anything yet, then on the first write (which # will probably be a block + salt of a share), we'll also write out # the header. On subsequent passes, we'll expect to see the header. # This changes in two places: # # - When we write out the salt hash # - When we write out the root of the share hash tree # # since these values will change the header. It is possible that we # can just make those be written in one operation to minimize # disruption. def __init__(self, shnum, rref, # a remote reference to a storage server storage_index, secrets, # (write_enabler, renew_secret, cancel_secret) seqnum, # the sequence number of the mutable file required_shares, total_shares, segment_size, data_length): # the length of the original file self.shnum = shnum self._rref = rref self._storage_index = storage_index self._seqnum = seqnum self._required_shares = required_shares assert self.shnum >= 0 and self.shnum < total_shares self._total_shares = total_shares # We build up the offset table as we write things. It is the # last thing we write to the remote server. self._offsets = {} self._testvs = [] # This is a list of write vectors that will be sent to our # remote server once we are directed to write things there. self._writevs = [] self._secrets = secrets # The segment size needs to be a multiple of the k parameter -- # any padding should have been carried out by the publisher # already. 
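        # For orientation, the write flow described in the class comment above
        # amounts to roughly this call sequence on a proxy instance (an
        # illustrative sketch: "writer", "sign" and the loop bookkeeping are
        # hypothetical caller-side names, and Deferred plumbing is elided):
        #
        #     writer = MDMFSlotWriteProxy(shnum, rref, storage_index, secrets,
        #                                 seqnum, k, N, segsize, datalen)
        #     for segnum in range(num_segments):
        #         writer.put_block(blocks[segnum], segnum, salts[segnum])
        #     writer.put_encprivkey(encprivkey)
        #     writer.put_blockhashes(block_hash_tree)    # list of 32-byte hashes
        #     writer.put_sharehashes(share_hash_chain)   # dict: shnum -> 32-byte hash
        #     writer.put_root_hash(root_hash)
        #     writer.put_signature(sign(writer.get_signable()))
        #     writer.put_verification_key(verification_key)
        #     d = writer.finish_publishing()             # flushes the queued writevs
        #
        # The fixed-size region laid out further down in this constructor works
        # out to MDMFHEADERSIZE + PRIVATE_KEY_SIZE + SIGNATURE_SIZE +
        # VERIFICATION_KEY_SIZE + SHARE_HASH_CHAIN_SIZE
        # == 123 + 1220 + 260 + 292 + 272 == 2167 bytes, which is where the
        # share data begins.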
assert segment_size % required_shares == 0 self._segment_size = segment_size self._data_length = data_length # These are set later -- we define them here so that we can # check for their existence easily # This is the root of the share hash tree -- the Merkle tree # over the roots of the block hash trees computed for shares in # this upload. self._root_hash = None # We haven't yet written anything to the remote bucket. By # setting this, we tell the _write method as much. The write # method will then know that it also needs to add a write vector # for the checkstring (or what we have of it) to the first write # request. We'll then record that value for future use. If # we're expecting something to be there already, we need to call # set_checkstring before we write anything to tell the first # write about that. self._written = False # When writing data to the storage servers, we get a read vector # for free. We'll read the checkstring, which will help us # figure out what's gone wrong if a write fails. self._readv = [(0, struct.calcsize(MDMFCHECKSTRING))] # We calculate the number of segments because it tells us # where the salt part of the file ends/share segment begins, # and also because it provides a useful amount of bounds checking. self._num_segments = mathutil.div_ceil(self._data_length, self._segment_size) self._block_size = self._segment_size / self._required_shares # We also calculate the share size, to help us with block # constraints later. tail_size = self._data_length % self._segment_size if not tail_size: self._tail_block_size = self._block_size else: self._tail_block_size = mathutil.next_multiple(tail_size, self._required_shares) self._tail_block_size /= self._required_shares # We already know where the sharedata starts; right after the end # of the header (which is defined as the signable part + the offsets) # We can also calculate where the encrypted private key begins # from what we know know. self._actual_block_size = self._block_size + SALT_SIZE data_size = self._actual_block_size * (self._num_segments - 1) data_size += self._tail_block_size data_size += SALT_SIZE self._offsets['enc_privkey'] = MDMFHEADERSIZE # We don't define offsets for these because we want them to be # tightly packed -- this allows us to ignore the responsibility # of padding individual values, and of removing that padding # later. So nonconstant_start is where we start writing # nonconstant data. nonconstant_start = self._offsets['enc_privkey'] nonconstant_start += PRIVATE_KEY_SIZE nonconstant_start += SIGNATURE_SIZE nonconstant_start += VERIFICATION_KEY_SIZE nonconstant_start += SHARE_HASH_CHAIN_SIZE self._offsets['share_data'] = nonconstant_start # Finally, we know how big the share data will be, so we can # figure out where the block hash tree needs to go. # XXX: But this will go away if Zooko wants to make it so that # you don't need to know the size of the file before you start # uploading it. self._offsets['block_hash_tree'] = self._offsets['share_data'] + \ data_size # Done. We can snow start writing. def set_checkstring(self, seqnum_or_checkstring, root_hash=None, salt=None): """ Set checkstring checkstring for the given shnum. This can be invoked in one of two ways. With one argument, I assume that you are giving me a literal checkstring -- e.g., the output of get_checkstring. I will then set that checkstring as it is. This form is used by unit tests. With two arguments, I assume that you are giving me a sequence number and root hash to make a checkstring from. 
In that case, I will build a checkstring and set it for you. This form is used by the publisher. By default, I assume that I am writing new shares to the grid. If you don't explcitly set your own checkstring, I will use one that requires that the remote share not exist. You will want to use this method if you are updating a share in-place; otherwise, writes will fail. """ # You're allowed to overwrite checkstrings with this method; # I assume that users know what they are doing when they call # it. if root_hash: checkstring = struct.pack(MDMFCHECKSTRING, 1, seqnum_or_checkstring, root_hash) else: checkstring = seqnum_or_checkstring if checkstring == "": # We special-case this, since len("") = 0, but we need # length of 1 for the case of an empty share to work on the # storage server, which is what a checkstring that is the # empty string means. self._testvs = [] else: self._testvs = [] self._testvs.append((0, len(checkstring), "eq", checkstring)) def __repr__(self): return "MDMFSlotWriteProxy for share %d" % self.shnum def get_checkstring(self): """ Given a share number, I return a representation of what the checkstring for that share on the server will look like. I am mostly used for tests. """ if self._root_hash: roothash = self._root_hash else: roothash = "\x00" * 32 return struct.pack(MDMFCHECKSTRING, 1, self._seqnum, roothash) def put_block(self, data, segnum, salt): """ I queue a write vector for the data, salt, and segment number provided to me. I return None, as I do not actually cause anything to be written yet. """ if segnum >= self._num_segments: raise LayoutInvalid("I won't overwrite the block hash tree") if len(salt) != SALT_SIZE: raise LayoutInvalid("I was given a salt of size %d, but " "I wanted a salt of size %d") if segnum + 1 == self._num_segments: if len(data) != self._tail_block_size: raise LayoutInvalid("I was given the wrong size block to write") elif len(data) != self._block_size: raise LayoutInvalid("I was given the wrong size block to write") # We want to write at len(MDMFHEADER) + segnum * block_size. offset = self._offsets['share_data'] + \ (self._actual_block_size * segnum) data = salt + data self._writevs.append(tuple([offset, data])) def put_encprivkey(self, encprivkey): """ I queue a write vector for the encrypted private key provided to me. """ assert self._offsets assert self._offsets['enc_privkey'] # You shouldn't re-write the encprivkey after the block hash # tree is written, since that could cause the private key to run # into the block hash tree. Before it writes the block hash # tree, the block hash tree writing method writes the offset of # the share hash chain. So that's a good indicator of whether or # not the block hash tree has been written. if "signature" in self._offsets: raise LayoutInvalid("You can't put the encrypted private key " "after putting the share hash chain") self._offsets['share_hash_chain'] = self._offsets['enc_privkey'] + \ len(encprivkey) self._writevs.append(tuple([self._offsets['enc_privkey'], encprivkey])) def put_blockhashes(self, blockhashes): """ I queue a write vector to put the block hash tree in blockhashes onto the remote server. The encrypted private key must be queued before the block hash tree, since we need to know how large it is to know where the block hash tree should go. The block hash tree must be put before the share hash chain, since its size determines the offset of the share hash chain. 
""" assert self._offsets assert "block_hash_tree" in self._offsets assert isinstance(blockhashes, list) blockhashes_s = "".join(blockhashes) self._offsets['EOF'] = self._offsets['block_hash_tree'] + len(blockhashes_s) self._writevs.append(tuple([self._offsets['block_hash_tree'], blockhashes_s])) def put_sharehashes(self, sharehashes): """ I queue a write vector to put the share hash chain in my argument onto the remote server. The block hash tree must be queued before the share hash chain, since we need to know where the block hash tree ends before we can know where the share hash chain starts. The share hash chain must be put before the signature, since the length of the packed share hash chain determines the offset of the signature. Also, semantically, you must know what the root of the block hash tree is before you can generate a valid signature. """ assert isinstance(sharehashes, dict) assert self._offsets if "share_hash_chain" not in self._offsets: raise LayoutInvalid("You must put the block hash tree before " "putting the share hash chain") # The signature comes after the share hash chain. If the # signature has already been written, we must not write another # share hash chain. The signature writes the verification key # offset when it gets sent to the remote server, so we look for # that. if "verification_key" in self._offsets: raise LayoutInvalid("You must write the share hash chain " "before you write the signature") sharehashes_s = "".join([struct.pack(">H32s", i, sharehashes[i]) for i in sorted(sharehashes.keys())]) self._offsets['signature'] = self._offsets['share_hash_chain'] + \ len(sharehashes_s) self._writevs.append(tuple([self._offsets['share_hash_chain'], sharehashes_s])) def put_root_hash(self, roothash): """ Put the root hash (the root of the share hash tree) in the remote slot. """ # It does not make sense to be able to put the root # hash without first putting the share hashes, since you need # the share hashes to generate the root hash. # # Signature is defined by the routine that places the share hash # chain, so it's a good thing to look for in finding out whether # or not the share hash chain exists on the remote server. if len(roothash) != HASH_SIZE: raise LayoutInvalid("hashes and salts must be exactly %d bytes" % HASH_SIZE) self._root_hash = roothash # To write both of these values, we update the checkstring on # the remote server, which includes them checkstring = self.get_checkstring() self._writevs.append(tuple([0, checkstring])) # This write, if successful, changes the checkstring, so we need # to update our internal checkstring to be consistent with the # one on the server. def get_signable(self): """ Get the first seven fields of the mutable file; the parts that are signed. """ if not self._root_hash: raise LayoutInvalid("You need to set the root hash " "before getting something to " "sign") return struct.pack(MDMFSIGNABLEHEADER, 1, self._seqnum, self._root_hash, self._required_shares, self._total_shares, self._segment_size, self._data_length) def put_signature(self, signature): """ I queue a write vector for the signature of the MDMF share. I require that the root hash and share hash chain have been put to the grid before I will write the signature to the grid. """ if "signature" not in self._offsets: raise LayoutInvalid("You must put the share hash chain " # It does not make sense to put a signature without first # putting the root hash and the salt hash (since otherwise # the signature would be incomplete), so we don't allow that. 
"before putting the signature") if not self._root_hash: raise LayoutInvalid("You must complete the signed prefix " "before computing a signature") # If we put the signature after we put the verification key, we # could end up running into the verification key, and will # probably screw up the offsets as well. So we don't allow that. if "verification_key_end" in self._offsets: raise LayoutInvalid("You can't put the signature after the " "verification key") # The method that writes the verification key defines the EOF # offset before writing the verification key, so look for that. self._offsets['verification_key'] = self._offsets['signature'] +\ len(signature) self._writevs.append(tuple([self._offsets['signature'], signature])) def put_verification_key(self, verification_key): """ I queue a write vector for the verification key. I require that the signature have been written to the storage server before I allow the verification key to be written to the remote server. """ if "verification_key" not in self._offsets: raise LayoutInvalid("You must put the signature before you " "can put the verification key") self._offsets['verification_key_end'] = \ self._offsets['verification_key'] + len(verification_key) assert self._offsets['verification_key_end'] <= self._offsets['share_data'] self._writevs.append(tuple([self._offsets['verification_key'], verification_key])) def _get_offsets_tuple(self): return tuple([(key, value) for key, value in self._offsets.items()]) def get_verinfo(self): return (self._seqnum, self._root_hash, None, self._segment_size, self._data_length, self._required_shares, self._total_shares, self.get_signable(), self._get_offsets_tuple()) def finish_publishing(self): """ I add a write vector for the offsets table, and then cause all of the write vectors that I've dealt with so far to be published to the remote server, ending the write process. """ if "verification_key_end" not in self._offsets: raise LayoutInvalid("You must put the verification key before " "you can publish the offsets") offsets_offset = struct.calcsize(MDMFHEADERWITHOUTOFFSETS) offsets = struct.pack(MDMFOFFSETS, self._offsets['enc_privkey'], self._offsets['share_hash_chain'], self._offsets['signature'], self._offsets['verification_key'], self._offsets['verification_key_end'], self._offsets['share_data'], self._offsets['block_hash_tree'], self._offsets['EOF']) self._writevs.append(tuple([offsets_offset, offsets])) encoding_parameters_offset = struct.calcsize(MDMFCHECKSTRING) params = struct.pack(">BBQQ", self._required_shares, self._total_shares, self._segment_size, self._data_length) self._writevs.append(tuple([encoding_parameters_offset, params])) return self._write(self._writevs) def _write(self, datavs, on_failure=None, on_success=None): """I write the data vectors in datavs to the remote slot.""" tw_vectors = {} if not self._testvs: self._testvs = [] self._testvs.append(tuple([0, 1, "eq", ""])) if not self._written: # Write a new checkstring to the share when we write it, so # that we have something to check later. 
new_checkstring = self.get_checkstring() datavs.append((0, new_checkstring)) def _first_write(): self._written = True self._testvs = [(0, len(new_checkstring), "eq", new_checkstring)] on_success = _first_write tw_vectors[self.shnum] = (self._testvs, datavs, None) d = self._rref.callRemote("slot_testv_and_readv_and_writev", self._storage_index, self._secrets, tw_vectors, self._readv) def _result(results): if isinstance(results, failure.Failure) or not results[0]: # Do nothing; the write was unsuccessful. if on_failure: on_failure() else: if on_success: on_success() return results d.addCallback(_result) return d def _handle_bad_struct(f): # struct.unpack errors mean the server didn't give us enough data, so # this share is bad f.trap(struct.error) raise BadShareError(f.value.args[0]) class MDMFSlotReadProxy: """ I read from a mutable slot filled with data written in the MDMF data format (which is described above). I can be initialized with some amount of data, which I will use (if it is valid) to eliminate some of the need to fetch it from servers. """ def __init__(self, rref, storage_index, shnum, data="", data_is_everything=False): # Start the initialization process. self._rref = rref self._storage_index = storage_index self.shnum = shnum # Before doing anything, the reader is probably going to want to # verify that the signature is correct. To do that, they'll need # the verification key, and the signature. To get those, we'll # need the offset table. So fetch the offset table on the # assumption that that will be the first thing that a reader is # going to do. # The fact that these encoding parameters are None tells us # that we haven't yet fetched them from the remote share, so we # should. We could just not set them, but the checks will be # easier to read if we don't have to use hasattr. self._version_number = None self._sequence_number = None self._root_hash = None # Filled in if we're dealing with an SDMF file. Unused # otherwise. self._salt = None self._required_shares = None self._total_shares = None self._segment_size = None self._data_length = None self._offsets = None # If the user has chosen to initialize us with some data, we'll # try to satisfy subsequent data requests with that data before # asking the storage server for it. self._data = data # If the provided data is known to be complete, then we know there's # nothing to be gained by querying the server, so we should just # partially satisfy requests with what we have. self._data_is_everything = data_is_everything # The way callers interact with cache in the filenode returns # None if there isn't any cached data, but the way we index the # cached data requires a string, so convert None to "". if self._data == None: self._data = "" def _maybe_fetch_offsets_and_header(self, force_remote=False): """ I fetch the offset table and the header from the remote slot if I don't already have them. If I do have them, I do nothing and return an empty Deferred. """ if self._offsets: return defer.succeed(None) # At this point, we may be either SDMF or MDMF. Fetching 107 # bytes will be enough to get header and offsets for both SDMF and # MDMF, though we'll be left with 4 more bytes than we # need if this ends up being MDMF. This is probably less # expensive than the cost of a second roundtrip. 
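        # (Illustrative arithmetic from the format strings defined earlier in
        # this file: struct.calcsize(MDMFHEADER) == 59 + 64 == 123 bytes and
        # struct.calcsize(HEADER) == 107 bytes, so the single 123-byte read
        # below covers either layout, with 16 spare bytes in the SDMF case.)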
readvs = [(0, 123)] d = self._read(readvs, force_remote) d.addCallback(self._process_encoding_parameters) d.addCallback(self._process_offsets) d.addErrback(_handle_bad_struct) return d def _process_encoding_parameters(self, encoding_parameters): if self.shnum not in encoding_parameters: raise BadShareError("no data for shnum %d" % self.shnum) encoding_parameters = encoding_parameters[self.shnum][0] # The first byte is the version number. It will tell us what # to do next. (verno,) = struct.unpack(">B", encoding_parameters[:1]) if verno == MDMF_VERSION: read_size = MDMFHEADERWITHOUTOFFSETSSIZE (verno, seqnum, root_hash, k, n, segsize, datalen) = struct.unpack(MDMFHEADERWITHOUTOFFSETS, encoding_parameters[:read_size]) if segsize == 0 and datalen == 0: # Empty file, no segments. self._num_segments = 0 else: self._num_segments = mathutil.div_ceil(datalen, segsize) elif verno == SDMF_VERSION: read_size = SIGNED_PREFIX_LENGTH (verno, seqnum, root_hash, salt, k, n, segsize, datalen) = struct.unpack(">BQ32s16s BBQQ", encoding_parameters[:SIGNED_PREFIX_LENGTH]) self._salt = salt if segsize == 0 and datalen == 0: # empty file self._num_segments = 0 else: # non-empty SDMF files have one segment. self._num_segments = 1 else: raise UnknownVersionError("You asked me to read mutable file " "version %d, but I only understand " "%d and %d" % (verno, SDMF_VERSION, MDMF_VERSION)) self._version_number = verno self._sequence_number = seqnum self._root_hash = root_hash self._required_shares = k self._total_shares = n self._segment_size = segsize self._data_length = datalen self._block_size = self._segment_size / self._required_shares # We can upload empty files, and need to account for this fact # so as to avoid zero-division and zero-modulo errors. if datalen > 0: tail_size = self._data_length % self._segment_size else: tail_size = 0 if not tail_size: self._tail_block_size = self._block_size else: self._tail_block_size = mathutil.next_multiple(tail_size, self._required_shares) self._tail_block_size /= self._required_shares return encoding_parameters def _process_offsets(self, offsets): if self._version_number == 0: read_size = OFFSETS_LENGTH read_offset = SIGNED_PREFIX_LENGTH end = read_size + read_offset (signature, share_hash_chain, block_hash_tree, share_data, enc_privkey, EOF) = struct.unpack(">LLLLQQ", offsets[read_offset:end]) self._offsets = {} self._offsets['signature'] = signature self._offsets['share_data'] = share_data self._offsets['block_hash_tree'] = block_hash_tree self._offsets['share_hash_chain'] = share_hash_chain self._offsets['enc_privkey'] = enc_privkey self._offsets['EOF'] = EOF elif self._version_number == 1: read_offset = MDMFHEADERWITHOUTOFFSETSSIZE read_length = MDMFOFFSETS_LENGTH end = read_offset + read_length (encprivkey, sharehashes, signature, verification_key, verification_key_end, sharedata, blockhashes, eof) = struct.unpack(MDMFOFFSETS, offsets[read_offset:end]) self._offsets = {} self._offsets['enc_privkey'] = encprivkey self._offsets['block_hash_tree'] = blockhashes self._offsets['share_hash_chain'] = sharehashes self._offsets['signature'] = signature self._offsets['verification_key'] = verification_key self._offsets['verification_key_end']= \ verification_key_end self._offsets['EOF'] = eof self._offsets['share_data'] = sharedata def get_block_and_salt(self, segnum): """ I return (block, salt), where block is the block data and salt is the salt used to encrypt that segment. 
""" d = self._maybe_fetch_offsets_and_header() def _then(ignored): base_share_offset = self._offsets['share_data'] if segnum + 1 > self._num_segments: raise LayoutInvalid("Not a valid segment number") if self._version_number == 0: share_offset = base_share_offset + self._block_size * segnum else: share_offset = base_share_offset + (self._block_size + \ SALT_SIZE) * segnum if segnum + 1 == self._num_segments: data = self._tail_block_size else: data = self._block_size if self._version_number == 1: data += SALT_SIZE readvs = [(share_offset, data)] return readvs d.addCallback(_then) d.addCallback(lambda readvs: self._read(readvs)) def _process_results(results): if self.shnum not in results: raise BadShareError("no data for shnum %d" % self.shnum) if self._version_number == 0: # We only read the share data, but we know the salt from # when we fetched the header data = results[self.shnum] if not data: data = "" else: if len(data) != 1: raise BadShareError("got %d vectors, not 1" % len(data)) data = data[0] salt = self._salt else: data = results[self.shnum] if not data: salt = data = "" else: salt_and_data = results[self.shnum][0] salt = salt_and_data[:SALT_SIZE] data = salt_and_data[SALT_SIZE:] return data, salt d.addCallback(_process_results) return d def get_blockhashes(self, needed=None, force_remote=False): """ I return the block hash tree I take an optional argument, needed, which is a set of indices correspond to hashes that I should fetch. If this argument is missing, I will fetch the entire block hash tree; otherwise, I may attempt to fetch fewer hashes, based on what needed says that I should do. Note that I may fetch as many hashes as I want, so long as the set of hashes that I do fetch is a superset of the ones that I am asked for, so callers should be prepared to tolerate additional hashes. """ # TODO: Return only the parts of the block hash tree necessary # to validate the blocknum provided? # This is a good idea, but it is hard to implement correctly. It # is bad to fetch any one block hash more than once, so we # probably just want to fetch the whole thing at once and then # serve it. if needed == set([]): return defer.succeed([]) d = self._maybe_fetch_offsets_and_header() def _then(ignored): blockhashes_offset = self._offsets['block_hash_tree'] if self._version_number == 1: blockhashes_length = self._offsets['EOF'] - blockhashes_offset else: blockhashes_length = self._offsets['share_data'] - blockhashes_offset readvs = [(blockhashes_offset, blockhashes_length)] return readvs d.addCallback(_then) d.addCallback(lambda readvs: self._read(readvs, force_remote=force_remote)) def _build_block_hash_tree(results): if self.shnum not in results: raise BadShareError("no data for shnum %d" % self.shnum) rawhashes = results[self.shnum][0] results = [rawhashes[i:i+HASH_SIZE] for i in range(0, len(rawhashes), HASH_SIZE)] return results d.addCallback(_build_block_hash_tree) return d def get_sharehashes(self, needed=None, force_remote=False): """ I return the part of the share hash chain placed to validate this share. I take an optional argument, needed. Needed is a set of indices that correspond to the hashes that I should fetch. If needed is not present, I will fetch and return the entire share hash chain. Otherwise, I may fetch and return any part of the share hash chain that is a superset of the part that I am asked to fetch. Callers should be prepared to deal with more hashes than they've asked for. 
""" if needed == set([]): return defer.succeed([]) d = self._maybe_fetch_offsets_and_header() def _make_readvs(ignored): sharehashes_offset = self._offsets['share_hash_chain'] if self._version_number == 0: sharehashes_length = self._offsets['block_hash_tree'] - sharehashes_offset else: sharehashes_length = self._offsets['signature'] - sharehashes_offset readvs = [(sharehashes_offset, sharehashes_length)] return readvs d.addCallback(_make_readvs) d.addCallback(lambda readvs: self._read(readvs, force_remote=force_remote)) def _build_share_hash_chain(results): if self.shnum not in results: raise BadShareError("no data for shnum %d" % self.shnum) sharehashes = results[self.shnum][0] results = [sharehashes[i:i+(HASH_SIZE + 2)] for i in range(0, len(sharehashes), HASH_SIZE + 2)] results = dict([struct.unpack(">H32s", data) for data in results]) return results d.addCallback(_build_share_hash_chain) d.addErrback(_handle_bad_struct) return d def get_encprivkey(self): """ I return the encrypted private key. """ d = self._maybe_fetch_offsets_and_header() def _make_readvs(ignored): privkey_offset = self._offsets['enc_privkey'] if self._version_number == 0: privkey_length = self._offsets['EOF'] - privkey_offset else: privkey_length = self._offsets['share_hash_chain'] - privkey_offset readvs = [(privkey_offset, privkey_length)] return readvs d.addCallback(_make_readvs) d.addCallback(lambda readvs: self._read(readvs)) def _process_results(results): if self.shnum not in results: raise BadShareError("no data for shnum %d" % self.shnum) privkey = results[self.shnum][0] return privkey d.addCallback(_process_results) return d def get_signature(self): """ I return the signature of my share. """ d = self._maybe_fetch_offsets_and_header() def _make_readvs(ignored): signature_offset = self._offsets['signature'] if self._version_number == 1: signature_length = self._offsets['verification_key'] - signature_offset else: signature_length = self._offsets['share_hash_chain'] - signature_offset readvs = [(signature_offset, signature_length)] return readvs d.addCallback(_make_readvs) d.addCallback(lambda readvs: self._read(readvs)) def _process_results(results): if self.shnum not in results: raise BadShareError("no data for shnum %d" % self.shnum) signature = results[self.shnum][0] return signature d.addCallback(_process_results) return d def get_verification_key(self): """ I return the verification key. """ d = self._maybe_fetch_offsets_and_header() def _make_readvs(ignored): if self._version_number == 1: vk_offset = self._offsets['verification_key'] vk_length = self._offsets['verification_key_end'] - vk_offset else: vk_offset = struct.calcsize(">BQ32s16sBBQQLLLLQQ") vk_length = self._offsets['signature'] - vk_offset readvs = [(vk_offset, vk_length)] return readvs d.addCallback(_make_readvs) d.addCallback(lambda readvs: self._read(readvs)) def _process_results(results): if self.shnum not in results: raise BadShareError("no data for shnum %d" % self.shnum) verification_key = results[self.shnum][0] return verification_key d.addCallback(_process_results) return d def get_encoding_parameters(self): """ I return (k, n, segsize, datalen) """ d = self._maybe_fetch_offsets_and_header() d.addCallback(lambda ignored: (self._required_shares, self._total_shares, self._segment_size, self._data_length)) return d def get_seqnum(self): """ I return the sequence number for this share. 
""" d = self._maybe_fetch_offsets_and_header() d.addCallback(lambda ignored: self._sequence_number) return d def get_root_hash(self): """ I return the root of the block hash tree """ d = self._maybe_fetch_offsets_and_header() d.addCallback(lambda ignored: self._root_hash) return d def get_checkstring(self): """ I return the packed representation of the following: - version number - sequence number - root hash - salt hash which my users use as a checkstring to detect other writers. """ d = self._maybe_fetch_offsets_and_header() def _build_checkstring(ignored): if self._salt: checkstring = struct.pack(PREFIX, self._version_number, self._sequence_number, self._root_hash, self._salt) else: checkstring = struct.pack(MDMFCHECKSTRING, self._version_number, self._sequence_number, self._root_hash) return checkstring d.addCallback(_build_checkstring) return d def get_prefix(self, force_remote): d = self._maybe_fetch_offsets_and_header(force_remote) d.addCallback(lambda ignored: self._build_prefix()) return d def _build_prefix(self): # The prefix is another name for the part of the remote share # that gets signed. It consists of everything up to and # including the datalength, packed by struct. if self._version_number == SDMF_VERSION: return struct.pack(SIGNED_PREFIX, self._version_number, self._sequence_number, self._root_hash, self._salt, self._required_shares, self._total_shares, self._segment_size, self._data_length) else: return struct.pack(MDMFSIGNABLEHEADER, self._version_number, self._sequence_number, self._root_hash, self._required_shares, self._total_shares, self._segment_size, self._data_length) def _get_offsets_tuple(self): # The offsets tuple is another component of the version # information tuple. It is basically our offsets dictionary, # itemized and in a tuple. return self._offsets.copy() def get_verinfo(self): """ I return my verinfo tuple. This is used by the ServermapUpdater to keep track of versions of mutable files. The verinfo tuple for MDMF files contains: - seqnum - root hash - a blank (nothing) - segsize - datalen - k - n - prefix (the thing that you sign) - a tuple of offsets We include the nonce in MDMF to simplify processing of version information tuples. The verinfo tuple for SDMF files is the same, but contains a 16-byte IV instead of a hash of salts. """ d = self._maybe_fetch_offsets_and_header() def _build_verinfo(ignored): if self._version_number == SDMF_VERSION: salt_to_use = self._salt else: salt_to_use = None return (self._sequence_number, self._root_hash, salt_to_use, self._segment_size, self._data_length, self._required_shares, self._total_shares, self._build_prefix(), self._get_offsets_tuple()) d.addCallback(_build_verinfo) return d def _read(self, readvs, force_remote=False): unsatisfiable = filter(lambda x: x[0] + x[1] > len(self._data), readvs) # TODO: It's entirely possible to tweak this so that it just # fulfills the requests that it can, and not demand that all # requests are satisfiable before running it. 
if not unsatisfiable or self._data_is_everything: results = [self._data[offset:offset+length] for (offset, length) in readvs] results = {self.shnum: results} return defer.succeed(results) else: return self._rref.callRemote("slot_readv", self._storage_index, [self.shnum], readvs) def is_sdmf(self): """I tell my caller whether or not my remote file is SDMF or MDMF """ d = self._maybe_fetch_offsets_and_header() d.addCallback(lambda ignored: self._version_number == 0) return d class LayoutInvalid(BadShareError): """ This isn't a valid MDMF mutable file """ allmydata-tahoe-1.10.2/src/allmydata/mutable/common.py0000644000175000017500000000457612556560070021035 0ustar ramram MODE_CHECK = "MODE_CHECK" # query all peers MODE_ANYTHING = "MODE_ANYTHING" # one recoverable version MODE_WRITE = "MODE_WRITE" # replace all shares, probably.. not for initial # creation MODE_READ = "MODE_READ" MODE_REPAIR = "MODE_REPAIR" # query all peers, get the privkey class NotWriteableError(Exception): pass class BadShareError(Exception): """This represents an error discovered in a particular share, during retrieve, from which we can recover by using some other share. This does *not* include local coding errors. """ class NeedMoreDataError(BadShareError): def __init__(self, needed_bytes, encprivkey_offset, encprivkey_length): Exception.__init__(self) self.needed_bytes = needed_bytes # up through EOF self.encprivkey_offset = encprivkey_offset self.encprivkey_length = encprivkey_length def __repr__(self): return "" % self.needed_bytes class UncoordinatedWriteError(Exception): def __repr__(self): return ("<%s -- You, oh user, tried to change a file or directory " "at the same time as another process was trying to change it. " " To avoid data loss, don't do this. Please see " "docs/write_coordination.rst for details.>" % (self.__class__.__name__,)) class UnrecoverableFileError(Exception): pass class NotEnoughServersError(Exception): """There were not enough functioning servers available to place shares upon. This might result from all servers being full or having an error, a local bug which causes all server requests to fail in the same way, or from there being zero servers. 
The first error received (if any) is stored in my .first_error attribute.""" def __init__(self, why, first_error=None): Exception.__init__(self, why, first_error) self.first_error = first_error class CorruptShareError(BadShareError): def __init__(self, server, shnum, reason): self.args = (server, shnum, reason) self.server = server self.shnum = shnum self.reason = reason def __str__(self): return "" % (self.__class__.__name__, self.abbrev()) def abbrev(self): return base32.b2a(self.writekey[:5]) def abbrev_si(self): return base32.b2a(self.storage_index)[:5] def is_readonly(self): return False def is_mutable(self): return True def get_readonly(self): return ReadonlySSKFileURI(self.readkey, self.fingerprint) def get_verify_cap(self): return SSKVerifierURI(self.storage_index, self.fingerprint) class ReadonlySSKFileURI(_BaseURI): implements(IURI, IMutableFileURI) BASE_STRING='URI:SSK-RO:' STRING_RE=re.compile('^URI:SSK-RO:'+BASE32STR_128bits+':'+BASE32STR_256bits+'$') def __init__(self, readkey, fingerprint): self.readkey = readkey self.storage_index = hashutil.ssk_storage_index_hash(self.readkey) assert len(self.storage_index) == 16 self.fingerprint = fingerprint @classmethod def init_from_string(cls, uri): mo = cls.STRING_RE.search(uri) if not mo: raise BadURIError("'%s' doesn't look like a %s cap" % (uri, cls)) return cls(base32.a2b(mo.group(1)), base32.a2b(mo.group(2))) def to_string(self): assert isinstance(self.readkey, str) assert isinstance(self.fingerprint, str) return 'URI:SSK-RO:%s:%s' % (base32.b2a(self.readkey), base32.b2a(self.fingerprint)) def __repr__(self): return "<%s %s>" % (self.__class__.__name__, self.abbrev()) def abbrev(self): return base32.b2a(self.readkey[:5]) def abbrev_si(self): return base32.b2a(self.storage_index)[:5] def is_readonly(self): return True def is_mutable(self): return True def get_readonly(self): return self def get_verify_cap(self): return SSKVerifierURI(self.storage_index, self.fingerprint) class SSKVerifierURI(_BaseURI): implements(IVerifierURI) BASE_STRING='URI:SSK-Verifier:' STRING_RE=re.compile('^'+BASE_STRING+BASE32STR_128bits+':'+BASE32STR_256bits+'$') def __init__(self, storage_index, fingerprint): assert len(storage_index) == 16 self.storage_index = storage_index self.fingerprint = fingerprint @classmethod def init_from_string(cls, uri): mo = cls.STRING_RE.search(uri) if not mo: raise BadURIError("'%s' doesn't look like a %s cap" % (uri, cls)) return cls(si_a2b(mo.group(1)), base32.a2b(mo.group(2))) def to_string(self): assert isinstance(self.storage_index, str) assert isinstance(self.fingerprint, str) return 'URI:SSK-Verifier:%s:%s' % (si_b2a(self.storage_index), base32.b2a(self.fingerprint)) def is_readonly(self): return True def is_mutable(self): return False def get_readonly(self): return self def get_verify_cap(self): return self class WriteableMDMFFileURI(_BaseURI): implements(IURI, IMutableFileURI) BASE_STRING='URI:MDMF:' STRING_RE=re.compile('^'+BASE_STRING+BASE32STR_128bits+':'+BASE32STR_256bits+'(:|$)') def __init__(self, writekey, fingerprint): self.writekey = writekey self.readkey = hashutil.ssk_readkey_hash(writekey) self.storage_index = hashutil.ssk_storage_index_hash(self.readkey) assert len(self.storage_index) == 16 self.fingerprint = fingerprint @classmethod def init_from_string(cls, uri): mo = cls.STRING_RE.search(uri) if not mo: raise BadURIError("'%s' doesn't look like a %s cap" % (uri, cls)) return cls(base32.a2b(mo.group(1)), base32.a2b(mo.group(2))) def to_string(self): assert isinstance(self.writekey, str) assert 
isinstance(self.fingerprint, str) ret = 'URI:MDMF:%s:%s' % (base32.b2a(self.writekey), base32.b2a(self.fingerprint)) return ret def __repr__(self): return "<%s %s>" % (self.__class__.__name__, self.abbrev()) def abbrev(self): return base32.b2a(self.writekey[:5]) def abbrev_si(self): return base32.b2a(self.storage_index)[:5] def is_readonly(self): return False def is_mutable(self): return True def get_readonly(self): return ReadonlyMDMFFileURI(self.readkey, self.fingerprint) def get_verify_cap(self): return MDMFVerifierURI(self.storage_index, self.fingerprint) class ReadonlyMDMFFileURI(_BaseURI): implements(IURI, IMutableFileURI) BASE_STRING='URI:MDMF-RO:' STRING_RE=re.compile('^' +BASE_STRING+BASE32STR_128bits+':'+BASE32STR_256bits+'(:|$)') def __init__(self, readkey, fingerprint): self.readkey = readkey self.storage_index = hashutil.ssk_storage_index_hash(self.readkey) assert len(self.storage_index) == 16 self.fingerprint = fingerprint @classmethod def init_from_string(cls, uri): mo = cls.STRING_RE.search(uri) if not mo: raise BadURIError("'%s' doesn't look like a %s cap" % (uri, cls)) return cls(base32.a2b(mo.group(1)), base32.a2b(mo.group(2))) def to_string(self): assert isinstance(self.readkey, str) assert isinstance(self.fingerprint, str) ret = 'URI:MDMF-RO:%s:%s' % (base32.b2a(self.readkey), base32.b2a(self.fingerprint)) return ret def __repr__(self): return "<%s %s>" % (self.__class__.__name__, self.abbrev()) def abbrev(self): return base32.b2a(self.readkey[:5]) def abbrev_si(self): return base32.b2a(self.storage_index)[:5] def is_readonly(self): return True def is_mutable(self): return True def get_readonly(self): return self def get_verify_cap(self): return MDMFVerifierURI(self.storage_index, self.fingerprint) class MDMFVerifierURI(_BaseURI): implements(IVerifierURI) BASE_STRING='URI:MDMF-Verifier:' STRING_RE=re.compile('^'+BASE_STRING+BASE32STR_128bits+':'+BASE32STR_256bits+'(:|$)') def __init__(self, storage_index, fingerprint): assert len(storage_index) == 16 self.storage_index = storage_index self.fingerprint = fingerprint @classmethod def init_from_string(cls, uri): mo = cls.STRING_RE.search(uri) if not mo: raise BadURIError("'%s' doesn't look like a %s cap" % (uri, cls)) return cls(si_a2b(mo.group(1)), base32.a2b(mo.group(2))) def to_string(self): assert isinstance(self.storage_index, str) assert isinstance(self.fingerprint, str) ret = 'URI:MDMF-Verifier:%s:%s' % (si_b2a(self.storage_index), base32.b2a(self.fingerprint)) return ret def is_readonly(self): return True def is_mutable(self): return False def get_readonly(self): return self def get_verify_cap(self): return self class _DirectoryBaseURI(_BaseURI): implements(IURI, IDirnodeURI) def __init__(self, filenode_uri=None): self._filenode_uri = filenode_uri def __repr__(self): return "<%s %s>" % (self.__class__.__name__, self.abbrev()) @classmethod def init_from_string(cls, uri): mo = cls.BASE_STRING_RE.search(uri) if not mo: raise BadURIError("'%s' doesn't look like a %s cap" % (uri, cls)) bits = uri[mo.end():] fn = cls.INNER_URI_CLASS.init_from_string( cls.INNER_URI_CLASS.BASE_STRING+bits) return cls(fn) def to_string(self): fnuri = self._filenode_uri.to_string() mo = re.match(self.INNER_URI_CLASS.BASE_STRING, fnuri) assert mo, fnuri bits = fnuri[mo.end():] return self.BASE_STRING+bits def abbrev(self): return self._filenode_uri.to_string().split(':')[2][:5] def abbrev_si(self): si = self._filenode_uri.get_storage_index() if si is None: return "" return base32.b2a(si)[:5] def is_mutable(self): return True def 
get_filenode_cap(self): return self._filenode_uri def get_verify_cap(self): return DirectoryURIVerifier(self._filenode_uri.get_verify_cap()) def get_storage_index(self): return self._filenode_uri.get_storage_index() class DirectoryURI(_DirectoryBaseURI): implements(IDirectoryURI) BASE_STRING='URI:DIR2:' BASE_STRING_RE=re.compile('^'+BASE_STRING) INNER_URI_CLASS=WriteableSSKFileURI def __init__(self, filenode_uri=None): if filenode_uri: assert not filenode_uri.is_readonly() _DirectoryBaseURI.__init__(self, filenode_uri) def is_readonly(self): return False def get_readonly(self): return ReadonlyDirectoryURI(self._filenode_uri.get_readonly()) class ReadonlyDirectoryURI(_DirectoryBaseURI): implements(IReadonlyDirectoryURI) BASE_STRING='URI:DIR2-RO:' BASE_STRING_RE=re.compile('^'+BASE_STRING) INNER_URI_CLASS=ReadonlySSKFileURI def __init__(self, filenode_uri=None): if filenode_uri: assert filenode_uri.is_readonly() _DirectoryBaseURI.__init__(self, filenode_uri) def is_readonly(self): return True def get_readonly(self): return self class _ImmutableDirectoryBaseURI(_DirectoryBaseURI): def __init__(self, filenode_uri=None): if filenode_uri: assert isinstance(filenode_uri, self.INNER_URI_CLASS), filenode_uri assert not filenode_uri.is_mutable() _DirectoryBaseURI.__init__(self, filenode_uri) def is_readonly(self): return True def is_mutable(self): return False def get_readonly(self): return self class ImmutableDirectoryURI(_ImmutableDirectoryBaseURI): BASE_STRING='URI:DIR2-CHK:' BASE_STRING_RE=re.compile('^'+BASE_STRING) INNER_URI_CLASS=CHKFileURI def get_verify_cap(self): vcap = self._filenode_uri.get_verify_cap() return ImmutableDirectoryURIVerifier(vcap) class LiteralDirectoryURI(_ImmutableDirectoryBaseURI): BASE_STRING='URI:DIR2-LIT:' BASE_STRING_RE=re.compile('^'+BASE_STRING) INNER_URI_CLASS=LiteralFileURI def get_verify_cap(self): # LIT caps have no verifier, since they aren't distributed return None class MDMFDirectoryURI(_DirectoryBaseURI): implements(IDirectoryURI) BASE_STRING='URI:DIR2-MDMF:' BASE_STRING_RE=re.compile('^'+BASE_STRING) INNER_URI_CLASS=WriteableMDMFFileURI def __init__(self, filenode_uri=None): if filenode_uri: assert not filenode_uri.is_readonly() _DirectoryBaseURI.__init__(self, filenode_uri) def is_readonly(self): return False def get_readonly(self): return ReadonlyMDMFDirectoryURI(self._filenode_uri.get_readonly()) def get_verify_cap(self): return MDMFDirectoryURIVerifier(self._filenode_uri.get_verify_cap()) class ReadonlyMDMFDirectoryURI(_DirectoryBaseURI): implements(IReadonlyDirectoryURI) BASE_STRING='URI:DIR2-MDMF-RO:' BASE_STRING_RE=re.compile('^'+BASE_STRING) INNER_URI_CLASS=ReadonlyMDMFFileURI def __init__(self, filenode_uri=None): if filenode_uri: assert filenode_uri.is_readonly() _DirectoryBaseURI.__init__(self, filenode_uri) def is_readonly(self): return True def get_readonly(self): return self def get_verify_cap(self): return MDMFDirectoryURIVerifier(self._filenode_uri.get_verify_cap()) def wrap_dirnode_cap(filecap): if isinstance(filecap, WriteableSSKFileURI): return DirectoryURI(filecap) if isinstance(filecap, ReadonlySSKFileURI): return ReadonlyDirectoryURI(filecap) if isinstance(filecap, CHKFileURI): return ImmutableDirectoryURI(filecap) if isinstance(filecap, LiteralFileURI): return LiteralDirectoryURI(filecap) if isinstance(filecap, WriteableMDMFFileURI): return MDMFDirectoryURI(filecap) if isinstance(filecap, ReadonlyMDMFFileURI): return ReadonlyMDMFDirectoryURI(filecap) raise AssertionError("cannot interpret as a directory cap: %s" % filecap.__class__) 
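# Illustrative usage sketch for wrap_dirnode_cap() above (hypothetical key
# material; real caps come from key generation elsewhere in Tahoe-LAFS):
#
#     writekey = "\x01" * 16       # 128-bit writekey
#     fingerprint = "\x02" * 32    # 256-bit fingerprint
#     filecap = WriteableMDMFFileURI(writekey, fingerprint)
#     dircap = wrap_dirnode_cap(filecap)
#     assert isinstance(dircap, MDMFDirectoryURI)
#     assert dircap.to_string().startswith('URI:DIR2-MDMF:')
#     assert dircap.get_readonly().to_string().startswith('URI:DIR2-MDMF-RO:')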
class MDMFDirectoryURIVerifier(_DirectoryBaseURI):
    implements(IVerifierURI)

    BASE_STRING='URI:DIR2-MDMF-Verifier:'
    BASE_STRING_RE=re.compile('^'+BASE_STRING)
    INNER_URI_CLASS=MDMFVerifierURI

    def __init__(self, filenode_uri=None):
        if filenode_uri:
            _assert(IVerifierURI.providedBy(filenode_uri))
        self._filenode_uri = filenode_uri

    def get_filenode_cap(self):
        return self._filenode_uri

    def is_mutable(self):
        return False

    def is_readonly(self):
        return True

    def get_readonly(self):
        return self


class DirectoryURIVerifier(_DirectoryBaseURI):
    implements(IVerifierURI)

    BASE_STRING='URI:DIR2-Verifier:'
    BASE_STRING_RE=re.compile('^'+BASE_STRING)
    INNER_URI_CLASS=SSKVerifierURI

    def __init__(self, filenode_uri=None):
        if filenode_uri:
            _assert(IVerifierURI.providedBy(filenode_uri))
        self._filenode_uri = filenode_uri

    def get_filenode_cap(self):
        return self._filenode_uri

    def is_mutable(self):
        return False

    def is_readonly(self):
        return True

    def get_readonly(self):
        return self


class ImmutableDirectoryURIVerifier(DirectoryURIVerifier):
    implements(IVerifierURI)
    BASE_STRING='URI:DIR2-CHK-Verifier:'
    BASE_STRING_RE=re.compile('^'+BASE_STRING)
    INNER_URI_CLASS=CHKFileVerifierURI


class UnknownURI:
    def __init__(self, uri, error=None):
        self._uri = uri
        self._error = error

    def to_string(self):
        return self._uri

    def get_readonly(self):
        return None

    def get_error(self):
        return self._error

    def get_verify_cap(self):
        return None


ALLEGED_READONLY_PREFIX = 'ro.'
ALLEGED_IMMUTABLE_PREFIX = 'imm.'

def from_string(u, deep_immutable=False, name=u""):
    if not isinstance(u, str):
        raise TypeError("unknown URI type: %s.." % str(u)[:100])

    # We allow and check ALLEGED_READONLY_PREFIX or ALLEGED_IMMUTABLE_PREFIX
    # on all URIs, even though we would only strictly need to do so for caps of
    # new formats (post Tahoe-LAFS 1.6). URIs that are not consistent with their
    # prefix are treated as unknown. This should be revisited when we add the
    # new cap formats. See ticket #833 comment:31.
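    # For example (illustrative cap strings; the "..." stands for real base32
    # key material, which is not reproduced here):
    #   "imm.URI:CHK:..."   -- accepted in any context: the imm. prefix is
    #                          consistent with an immutable CHK readcap.
    #   "ro.URI:SSK-RO:..." -- accepted unless deep_immutable=True, since the
    #                          target is a readcap to a *mutable* file.
    #   "imm.URI:SSK:..."   -- returned as UnknownURI: an SSK writecap can
    #                          never satisfy an alleged-immutable prefix.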
s = u can_be_mutable = can_be_writeable = not deep_immutable if s.startswith(ALLEGED_IMMUTABLE_PREFIX): can_be_mutable = can_be_writeable = False s = s[len(ALLEGED_IMMUTABLE_PREFIX):] elif s.startswith(ALLEGED_READONLY_PREFIX): can_be_writeable = False s = s[len(ALLEGED_READONLY_PREFIX):] error = None kind = "cap" try: if s.startswith('URI:CHK:'): return CHKFileURI.init_from_string(s) elif s.startswith('URI:CHK-Verifier:'): return CHKFileVerifierURI.init_from_string(s) elif s.startswith('URI:LIT:'): return LiteralFileURI.init_from_string(s) elif s.startswith('URI:SSK:'): if can_be_writeable: return WriteableSSKFileURI.init_from_string(s) kind = "URI:SSK file writecap" elif s.startswith('URI:SSK-RO:'): if can_be_mutable: return ReadonlySSKFileURI.init_from_string(s) kind = "URI:SSK-RO readcap to a mutable file" elif s.startswith('URI:SSK-Verifier:'): return SSKVerifierURI.init_from_string(s) elif s.startswith('URI:MDMF:'): if can_be_writeable: return WriteableMDMFFileURI.init_from_string(s) kind = "URI:MDMF file writecap" elif s.startswith('URI:MDMF-RO:'): if can_be_mutable: return ReadonlyMDMFFileURI.init_from_string(s) kind = "URI:MDMF-RO readcap to a mutable file" elif s.startswith('URI:MDMF-Verifier:'): return MDMFVerifierURI.init_from_string(s) elif s.startswith('URI:DIR2:'): if can_be_writeable: return DirectoryURI.init_from_string(s) kind = "URI:DIR2 directory writecap" elif s.startswith('URI:DIR2-RO:'): if can_be_mutable: return ReadonlyDirectoryURI.init_from_string(s) kind = "URI:DIR2-RO readcap to a mutable directory" elif s.startswith('URI:DIR2-Verifier:'): return DirectoryURIVerifier.init_from_string(s) elif s.startswith('URI:DIR2-CHK:'): return ImmutableDirectoryURI.init_from_string(s) elif s.startswith('URI:DIR2-CHK-Verifier:'): return ImmutableDirectoryURIVerifier.init_from_string(s) elif s.startswith('URI:DIR2-LIT:'): return LiteralDirectoryURI.init_from_string(s) elif s.startswith('URI:DIR2-MDMF:'): if can_be_writeable: return MDMFDirectoryURI.init_from_string(s) kind = "URI:DIR2-MDMF directory writecap" elif s.startswith('URI:DIR2-MDMF-RO:'): if can_be_mutable: return ReadonlyMDMFDirectoryURI.init_from_string(s) kind = "URI:DIR2-MDMF-RO readcap to a mutable directory" elif s.startswith('URI:DIR2-MDMF-Verifier:'): return MDMFDirectoryURIVerifier.init_from_string(s) elif s.startswith('x-tahoe-future-test-writeable:') and not can_be_writeable: # For testing how future writeable caps would behave in read-only contexts. kind = "x-tahoe-future-test-writeable: testing cap" elif s.startswith('x-tahoe-future-test-mutable:') and not can_be_mutable: # For testing how future mutable readcaps would behave in immutable contexts. kind = "x-tahoe-future-test-mutable: testing cap" else: return UnknownURI(u) # We fell through because a constraint was not met. # Prefer to report the most specific constraint. 
        if not can_be_mutable:
            error = MustBeDeepImmutableError(kind + " used in an immutable context", name)
        else:
            error = MustBeReadonlyError(kind + " used in a read-only context", name)
    except BadURIError, e:
        error = e

    return UnknownURI(u, error=error)


def is_uri(s):
    try:
        from_string(s, deep_immutable=False)
        return True
    except (TypeError, AssertionError):
        return False

def is_literal_file_uri(s):
    if not isinstance(s, str):
        return False
    return (s.startswith('URI:LIT:') or
            s.startswith(ALLEGED_READONLY_PREFIX + 'URI:LIT:') or
            s.startswith(ALLEGED_IMMUTABLE_PREFIX + 'URI:LIT:'))

def has_uri_prefix(s):
    if not isinstance(s, str):
        return False
    return (s.startswith("URI:") or
            s.startswith(ALLEGED_READONLY_PREFIX + 'URI:') or
            s.startswith(ALLEGED_IMMUTABLE_PREFIX + 'URI:'))


# These take the same keyword arguments as from_string above.

def from_string_dirnode(s, **kwargs):
    u = from_string(s, **kwargs)
    _assert(IDirnodeURI.providedBy(u))
    return u

registerAdapter(from_string_dirnode, str, IDirnodeURI)

def from_string_filenode(s, **kwargs):
    u = from_string(s, **kwargs)
    _assert(IFileURI.providedBy(u))
    return u

registerAdapter(from_string_filenode, str, IFileURI)

def from_string_mutable_filenode(s, **kwargs):
    u = from_string(s, **kwargs)
    _assert(IMutableFileURI.providedBy(u))
    return u

registerAdapter(from_string_mutable_filenode, str, IMutableFileURI)

def from_string_verifier(s, **kwargs):
    u = from_string(s, **kwargs)
    _assert(IVerifierURI.providedBy(u))
    return u

registerAdapter(from_string_verifier, str, IVerifierURI)


def pack_extension(data):
    pieces = []
    for k in sorted(data.keys()):
        value = data[k]
        if isinstance(value, (int, long)):
            value = "%d" % value
        assert isinstance(value, str), k
        assert re.match(r'^[a-zA-Z_\-]+$', k)
        pieces.append(k + ':' + hashutil.netstring(value))
    uri_extension = ''.join(pieces)
    return uri_extension

def unpack_extension(data):
    d = {}
    while data:
        colon = data.index(':')
        key = data[:colon]
        data = data[colon+1:]

        colon = data.index(':')
        number = data[:colon]
        length = int(number)
        data = data[colon+1:]

        value = data[:length]
        assert data[length] == ','
        data = data[length+1:]

        d[key] = value

    # convert certain things to numbers
    for intkey in ('size', 'segment_size', 'num_segments',
                   'needed_shares', 'total_shares'):
        if intkey in d:
            d[intkey] = int(d[intkey])
    return d


def unpack_extension_readable(data):
    unpacked = unpack_extension(data)
    unpacked["UEB_hash"] = hashutil.uri_extension_hash(data)
    for k in sorted(unpacked.keys()):
        if 'hash' in k:
            unpacked[k] = base32.b2a(unpacked[k])
    return unpacked
allmydata-tahoe-1.10.2/src/allmydata/test/0000755000175000017500000000000012556560072016507 5ustar ramramallmydata-tahoe-1.10.2/src/allmydata/test/test_encodingutil.py0000644000175000017500000004652512556560070022616 0ustar ramram
lumiere_nfc = u"lumi\u00E8re"
Artonwall_nfc = u"\u00C4rtonwall.mp3"
Artonwall_nfd = u"A\u0308rtonwall.mp3"

TEST_FILENAMES = (
    Artonwall_nfc,
    u'test_file',
    u'Blah blah.txt',
)

# The following main helps to generate a test class for other operating
# systems.
if __name__ == "__main__": import sys, os import tempfile import shutil import platform if len(sys.argv) != 2: print "Usage: %s lumire" % sys.argv[0] sys.exit(1) if sys.platform == "win32": try: from allmydata.windows.fixups import initialize except ImportError: print "set PYTHONPATH to the src directory" sys.exit(1) initialize() print print "class MyWeirdOS(EncodingUtil, unittest.TestCase):" print " uname = '%s'" % ' '.join(platform.uname()) print " argv = %s" % repr(sys.argv[1]) print " platform = '%s'" % sys.platform print " filesystem_encoding = '%s'" % sys.getfilesystemencoding() print " io_encoding = '%s'" % sys.stdout.encoding try: tmpdir = tempfile.mkdtemp() for fname in TEST_FILENAMES: open(os.path.join(tmpdir, fname), 'w').close() # Use Unicode API under Windows or MacOS X if sys.platform in ('win32', 'darwin'): dirlist = os.listdir(unicode(tmpdir)) else: dirlist = os.listdir(tmpdir) print " dirlist = %s" % repr(dirlist) except: print " # Oops, I cannot write filenames containing non-ascii characters" print shutil.rmtree(tmpdir) sys.exit(0) import os, sys, locale from twisted.trial import unittest from allmydata.test.common_util import ReallyEqualMixin from allmydata.util import encodingutil, fileutil from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \ unicode_to_output, quote_output, quote_path, quote_local_unicode_path, \ unicode_platform, listdir_unicode, FilenameEncodingError, get_io_encoding, \ get_filesystem_encoding, to_str, from_utf8_or_none, _reload from allmydata.dirnode import normalize from twisted.python import usage class MockStdout(object): pass class EncodingUtilErrors(ReallyEqualMixin, unittest.TestCase): def test_get_io_encoding(self): mock_stdout = MockStdout() self.patch(sys, 'stdout', mock_stdout) mock_stdout.encoding = 'UTF-8' _reload() self.failUnlessReallyEqual(get_io_encoding(), 'utf-8') mock_stdout.encoding = 'cp65001' _reload() self.failUnlessReallyEqual(get_io_encoding(), 'utf-8') mock_stdout.encoding = 'koi8-r' expected = sys.platform == "win32" and 'utf-8' or 'koi8-r' _reload() self.failUnlessReallyEqual(get_io_encoding(), expected) mock_stdout.encoding = 'nonexistent_encoding' if sys.platform == "win32": _reload() self.failUnlessReallyEqual(get_io_encoding(), 'utf-8') else: self.failUnlessRaises(AssertionError, _reload) def test_get_io_encoding_not_from_stdout(self): preferredencoding = 'koi8-r' def call_locale_getpreferredencoding(): return preferredencoding self.patch(locale, 'getpreferredencoding', call_locale_getpreferredencoding) mock_stdout = MockStdout() self.patch(sys, 'stdout', mock_stdout) expected = sys.platform == "win32" and 'utf-8' or 'koi8-r' _reload() self.failUnlessReallyEqual(get_io_encoding(), expected) mock_stdout.encoding = None _reload() self.failUnlessReallyEqual(get_io_encoding(), expected) preferredencoding = None _reload() self.failUnlessReallyEqual(get_io_encoding(), 'utf-8') def test_argv_to_unicode(self): encodingutil.io_encoding = 'utf-8' self.failUnlessRaises(usage.UsageError, argv_to_unicode, lumiere_nfc.encode('latin1')) def test_unicode_to_output(self): encodingutil.io_encoding = 'koi8-r' self.failUnlessRaises(UnicodeEncodeError, unicode_to_output, lumiere_nfc) def test_no_unicode_normalization(self): # Pretend to run on a Unicode platform. # listdir_unicode normalized to NFC in 1.7beta, but now doesn't. 
def call_os_listdir(path): return [Artonwall_nfd] self.patch(os, 'listdir', call_os_listdir) self.patch(sys, 'platform', 'darwin') _reload() self.failUnlessReallyEqual(listdir_unicode(u'/dummy'), [Artonwall_nfd]) # The following tests apply only to platforms that don't store filenames as # Unicode entities on the filesystem. class EncodingUtilNonUnicodePlatform(unittest.TestCase): def setUp(self): # Mock sys.platform because unicode_platform() uses it self.original_platform = sys.platform sys.platform = 'linux' def tearDown(self): sys.platform = self.original_platform _reload() def test_listdir_unicode(self): # What happens if latin1-encoded filenames are encountered on an UTF-8 # filesystem? def call_os_listdir(path): return [ lumiere_nfc.encode('utf-8'), lumiere_nfc.encode('latin1') ] self.patch(os, 'listdir', call_os_listdir) sys_filesystemencoding = 'utf-8' def call_sys_getfilesystemencoding(): return sys_filesystemencoding self.patch(sys, 'getfilesystemencoding', call_sys_getfilesystemencoding) _reload() self.failUnlessRaises(FilenameEncodingError, listdir_unicode, u'/dummy') # We're trying to list a directory whose name cannot be represented in # the filesystem encoding. This should fail. sys_filesystemencoding = 'ascii' _reload() self.failUnlessRaises(FilenameEncodingError, listdir_unicode, u'/' + lumiere_nfc) class EncodingUtil(ReallyEqualMixin): def setUp(self): self.original_platform = sys.platform sys.platform = self.platform def tearDown(self): sys.platform = self.original_platform _reload() def test_argv_to_unicode(self): if 'argv' not in dir(self): return mock_stdout = MockStdout() mock_stdout.encoding = self.io_encoding self.patch(sys, 'stdout', mock_stdout) argu = lumiere_nfc argv = self.argv _reload() self.failUnlessReallyEqual(argv_to_unicode(argv), argu) def test_unicode_to_url(self): self.failUnless(unicode_to_url(lumiere_nfc), "lumi\xc3\xa8re") def test_unicode_to_output(self): if 'argv' not in dir(self): return mock_stdout = MockStdout() mock_stdout.encoding = self.io_encoding self.patch(sys, 'stdout', mock_stdout) _reload() self.failUnlessReallyEqual(unicode_to_output(lumiere_nfc), self.argv) def test_unicode_platform(self): matrix = { 'linux2': False, 'linux3': False, 'openbsd4': False, 'win32': True, 'darwin': True, } _reload() self.failUnlessReallyEqual(unicode_platform(), matrix[self.platform]) def test_listdir_unicode(self): if 'dirlist' not in dir(self): return try: u"test".encode(self.filesystem_encoding) except (LookupError, AttributeError): raise unittest.SkipTest("This platform does not support the '%s' filesystem encoding " "that we are testing for the benefit of a different platform." 
% (self.filesystem_encoding,)) def call_os_listdir(path): return self.dirlist self.patch(os, 'listdir', call_os_listdir) def call_sys_getfilesystemencoding(): return self.filesystem_encoding self.patch(sys, 'getfilesystemencoding', call_sys_getfilesystemencoding) _reload() filenames = listdir_unicode(u'/dummy') self.failUnlessEqual(set([normalize(fname) for fname in filenames]), set(TEST_FILENAMES)) class StdlibUnicode(unittest.TestCase): """This mainly tests that some of the stdlib functions support Unicode paths, but also that listdir_unicode works for valid filenames.""" def skip_if_cannot_represent_filename(self, u): enc = get_filesystem_encoding() if not unicode_platform(): try: u.encode(enc) except UnicodeEncodeError: raise unittest.SkipTest("A non-ASCII filename could not be encoded on this platform.") def test_mkdir_open_exists_abspath_listdir_expanduser(self): self.skip_if_cannot_represent_filename(lumiere_nfc) try: os.mkdir(lumiere_nfc) except EnvironmentError, e: raise unittest.SkipTest("%r\nIt is possible that the filesystem on which this test is being run " "does not support Unicode, even though the platform does." % (e,)) fn = lumiere_nfc + u'/' + lumiere_nfc + u'.txt' open(fn, 'wb').close() self.failUnless(os.path.exists(fn)) self.failUnless(os.path.exists(os.path.join(os.getcwdu(), fn))) filenames = listdir_unicode(lumiere_nfc) # We only require that the listing includes a filename that is canonically equivalent # to lumiere_nfc (on Mac OS X, it will be the NFD equivalent). self.failUnlessIn(lumiere_nfc + ".txt", set([normalize(fname) for fname in filenames])) expanded = fileutil.expanduser(u"~/" + lumiere_nfc) self.failIfIn(u"~", expanded) self.failUnless(expanded.endswith(lumiere_nfc), expanded) def test_open_unrepresentable(self): if unicode_platform(): raise unittest.SkipTest("This test is not applicable to platforms that represent filenames as Unicode.") enc = get_filesystem_encoding() fn = u'\u2621.txt' try: fn.encode(enc) raise unittest.SkipTest("This test cannot be run unless we know a filename that is not representable.") except UnicodeEncodeError: self.failUnlessRaises(UnicodeEncodeError, open, fn, 'wb') class QuoteOutput(ReallyEqualMixin, unittest.TestCase): def tearDown(self): _reload() def _check(self, inp, out, enc, optional_quotes, quote_newlines): out2 = out if optional_quotes: out2 = out2[1:-1] self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quote_newlines=quote_newlines), out) self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2) if out[0:2] == 'b"': pass elif isinstance(inp, str): self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quote_newlines=quote_newlines), out) self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2) else: self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quote_newlines=quote_newlines), out) self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quotemarks=False, quote_newlines=quote_newlines), out2) def _test_quote_output_all(self, enc): def check(inp, out, optional_quotes=False, quote_newlines=None): self._check(inp, out, enc, optional_quotes, quote_newlines) # optional single quotes check("foo", "'foo'", True) check("\\", "'\\'", True) check("$\"`", "'$\"`'", True) check("\n", "'\n'", True, quote_newlines=False) # mandatory single quotes check("\"", "'\"'") # double quotes check("'", "\"'\"") check("\n", "\"\\x0a\"", 
quote_newlines=True) check("\x00", "\"\\x00\"") # invalid Unicode and astral planes check(u"\uFDD0\uFDEF", "\"\\ufdd0\\ufdef\"") check(u"\uDC00\uD800", "\"\\udc00\\ud800\"") check(u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\"") check(u"\uD800\uDC00", "\"\\U00010000\"") check(u"\uD800\uDC01", "\"\\U00010001\"") check(u"\uD801\uDC00", "\"\\U00010400\"") check(u"\uDBFF\uDFFF", "\"\\U0010ffff\"") check(u"'\uDBFF\uDFFF", "\"'\\U0010ffff\"") check(u"\"\uDBFF\uDFFF", "\"\\\"\\U0010ffff\"") # invalid UTF-8 check("\xFF", "b\"\\xff\"") check("\x00\"$\\`\x80\xFF", "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"") def test_quote_output_ascii(self, enc='ascii'): def check(inp, out, optional_quotes=False, quote_newlines=None): self._check(inp, out, enc, optional_quotes, quote_newlines) self._test_quote_output_all(enc) check(u"\u00D7", "\"\\xd7\"") check(u"'\u00D7", "\"'\\xd7\"") check(u"\"\u00D7", "\"\\\"\\xd7\"") check(u"\u2621", "\"\\u2621\"") check(u"'\u2621", "\"'\\u2621\"") check(u"\"\u2621", "\"\\\"\\u2621\"") check(u"\n", "'\n'", True, quote_newlines=False) check(u"\n", "\"\\x0a\"", quote_newlines=True) def test_quote_output_latin1(self, enc='latin1'): def check(inp, out, optional_quotes=False, quote_newlines=None): self._check(inp, out.encode('latin1'), enc, optional_quotes, quote_newlines) self._test_quote_output_all(enc) check(u"\u00D7", u"'\u00D7'", True) check(u"'\u00D7", u"\"'\u00D7\"") check(u"\"\u00D7", u"'\"\u00D7'") check(u"\u00D7\"", u"'\u00D7\"'", True) check(u"\u2621", u"\"\\u2621\"") check(u"'\u2621", u"\"'\\u2621\"") check(u"\"\u2621", u"\"\\\"\\u2621\"") check(u"\n", u"'\n'", True, quote_newlines=False) check(u"\n", u"\"\\x0a\"", quote_newlines=True) def test_quote_output_utf8(self, enc='utf-8'): def check(inp, out, optional_quotes=False, quote_newlines=None): self._check(inp, out.encode('utf-8'), enc, optional_quotes, quote_newlines) self._test_quote_output_all(enc) check(u"\u2621", u"'\u2621'", True) check(u"'\u2621", u"\"'\u2621\"") check(u"\"\u2621", u"'\"\u2621'") check(u"\u2621\"", u"'\u2621\"'", True) check(u"\n", u"'\n'", True, quote_newlines=False) check(u"\n", u"\"\\x0a\"", quote_newlines=True) def test_quote_output_default(self): encodingutil.io_encoding = 'ascii' self.test_quote_output_ascii(None) encodingutil.io_encoding = 'latin1' self.test_quote_output_latin1(None) encodingutil.io_encoding = 'utf-8' self.test_quote_output_utf8(None) class QuotePaths(ReallyEqualMixin, unittest.TestCase): def test_quote_path(self): self.failUnlessReallyEqual(quote_path([u'foo', u'bar']), "'foo/bar'") self.failUnlessReallyEqual(quote_path([u'foo', u'bar'], quotemarks=True), "'foo/bar'") self.failUnlessReallyEqual(quote_path([u'foo', u'bar'], quotemarks=False), "foo/bar") self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar']), '"foo/\\x0abar"') self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar'], quotemarks=True), '"foo/\\x0abar"') self.failUnlessReallyEqual(quote_path([u'foo', u'\nbar'], quotemarks=False), '"foo/\\x0abar"') def win32_other(win32, other): return win32 if sys.platform == "win32" else other self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\C:\\foo"), win32_other("'C:\\foo'", "'\\\\?\\C:\\foo'")) self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\C:\\foo", quotemarks=True), win32_other("'C:\\foo'", "'\\\\?\\C:\\foo'")) self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\C:\\foo", quotemarks=False), win32_other("C:\\foo", "\\\\?\\C:\\foo")) self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\UNC\\foo\\bar"), win32_other("'\\\\foo\\bar'", 
"'\\\\?\\UNC\\foo\\bar'")) self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\UNC\\foo\\bar", quotemarks=True), win32_other("'\\\\foo\\bar'", "'\\\\?\\UNC\\foo\\bar'")) self.failUnlessReallyEqual(quote_local_unicode_path(u"\\\\?\\UNC\\foo\\bar", quotemarks=False), win32_other("\\\\foo\\bar", "\\\\?\\UNC\\foo\\bar")) class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase): uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64' argv = 'lumi\xc3\xa8re' platform = 'linux2' filesystem_encoding = 'UTF-8' io_encoding = 'UTF-8' dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt'] class UbuntuKarmicLatin1(EncodingUtil, unittest.TestCase): uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64' argv = 'lumi\xe8re' platform = 'linux2' filesystem_encoding = 'ISO-8859-1' io_encoding = 'ISO-8859-1' dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3'] class Windows(EncodingUtil, unittest.TestCase): uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD' argv = 'lumi\xc3\xa8re' platform = 'win32' filesystem_encoding = 'mbcs' io_encoding = 'utf-8' dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3'] class MacOSXLeopard(EncodingUtil, unittest.TestCase): uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc' output = 'lumi\xc3\xa8re' platform = 'darwin' filesystem_encoding = 'utf-8' io_encoding = 'UTF-8' dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file'] class MacOSXLeopard7bit(EncodingUtil, unittest.TestCase): uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc' platform = 'darwin' filesystem_encoding = 'utf-8' io_encoding = 'US-ASCII' dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file'] class OpenBSD(EncodingUtil, unittest.TestCase): uname = 'OpenBSD 4.1 GENERIC#187 i386 Intel(R) Celeron(R) CPU 2.80GHz ("GenuineIntel" 686-class)' platform = 'openbsd4' filesystem_encoding = '646' io_encoding = '646' # Oops, I cannot write filenames containing non-ascii characters class TestToFromStr(ReallyEqualMixin, unittest.TestCase): def test_to_str(self): self.failUnlessReallyEqual(to_str("foo"), "foo") self.failUnlessReallyEqual(to_str("lumi\xc3\xa8re"), "lumi\xc3\xa8re") self.failUnlessReallyEqual(to_str("\xFF"), "\xFF") # passes through invalid UTF-8 -- is this what we want? self.failUnlessReallyEqual(to_str(u"lumi\u00E8re"), "lumi\xc3\xa8re") self.failUnlessReallyEqual(to_str(None), None) def test_from_utf8_or_none(self): self.failUnlessRaises(AssertionError, from_utf8_or_none, u"foo") self.failUnlessReallyEqual(from_utf8_or_none("lumi\xc3\xa8re"), u"lumi\u00E8re") self.failUnlessReallyEqual(from_utf8_or_none(None), None) self.failUnlessRaises(UnicodeDecodeError, from_utf8_or_none, "\xFF") allmydata-tahoe-1.10.2/src/allmydata/test/common_web.py0000644000175000017500000000627712556560070021220 0ustar ramram import re from twisted.internet import defer from twisted.web import client from nevow.testutil import FakeRequest from nevow import inevow, context class WebRenderingMixin: # d=page.renderString() or s=page.renderSynchronously() will exercise # docFactory, render_*/data_* . It won't exercise want_json(), or my # renderHTTP() override which tests want_json(). To exercise args=, we # must build a context. Pages which use a return_to= argument need a # context. 
# d=page.renderHTTP(ctx) will exercise my renderHTTP, want_json, and # docFactory/render_*/data_*, but it requires building a context. Since # we're already building a context, it is easy to exercise args= . # so, use at least two d=page.renderHTTP(ctx) per page (one for json, one # for html), then use lots of simple s=page.renderSynchronously() to # exercise the fine details (the ones that don't require args=). def make_context(self, req): ctx = context.RequestContext(tag=req) ctx.remember(req, inevow.IRequest) ctx.remember(None, inevow.IData) ctx = context.WovenContext(parent=ctx, precompile=False) return ctx def render1(self, page, **kwargs): # use this to exercise an overridden renderHTTP, usually for # output=json or render_GET. It always returns a Deferred. req = FakeRequest(**kwargs) req.fields = None ctx = self.make_context(req) d = defer.maybeDeferred(page.renderHTTP, ctx) def _done(res): if isinstance(res, str): return res + req.v return req.v d.addCallback(_done) return d def render2(self, page, **kwargs): # use this to exercise the normal Nevow docFactory rendering. It # returns a string. If one of the render_* methods returns a # Deferred, this will throw an exception. (note that # page.renderString is the Deferred-returning equivalent) req = FakeRequest(**kwargs) req.fields = None ctx = self.make_context(req) return page.renderSynchronously(ctx) def failUnlessIn(self, substring, s): self.failUnless(substring in s, s) def remove_tags(self, s): s = re.sub(r'<[^>]*>', ' ', s) s = re.sub(r'\s+', ' ', s) return s class MyGetter(client.HTTPPageGetter): handleStatus_206 = lambda self: self.handleStatus_200() # PARTIAL_CONTENT handleStatus_304 = lambda self: self.handleStatus_200() # NOT_MODIFIED class HTTPClientHEADFactory(client.HTTPClientFactory): protocol = MyGetter def noPage(self, reason): # Twisted-2.5.0 and earlier had a bug, in which they would raise an # exception when the response to a HEAD request had no body (when in # fact they are defined to never have a body). This was fixed in # Twisted-8.0 . To work around this, we catch the # PartialDownloadError and make it disappear. 
if (reason.check(client.PartialDownloadError) and self.method.upper() == "HEAD"): self.page("") return return client.HTTPClientFactory.noPage(self, reason) class HTTPClientGETFactory(client.HTTPClientFactory): protocol = MyGetter allmydata-tahoe-1.10.2/src/allmydata/test/test_cli_check.py0000644000175000017500000004401612556560070022027 0ustar ramramimport os.path import simplejson from twisted.trial import unittest from cStringIO import StringIO from allmydata import uri from allmydata.util import base32 from allmydata.util.encodingutil import quote_output, to_str from allmydata.mutable.publish import MutableData from allmydata.immutable import upload from allmydata.scripts import debug from .no_network import GridTestMixin from .test_cli import CLITestMixin timeout = 480 # deep_check takes 360s on Zandr's linksys box, others take > 240s class Check(GridTestMixin, CLITestMixin, unittest.TestCase): def test_check(self): self.basedir = "cli/Check/check" self.set_up_grid() c0 = self.g.clients[0] DATA = "data" * 100 DATA_uploadable = MutableData(DATA) d = c0.create_mutable_file(DATA_uploadable) def _stash_uri(n): self.uri = n.get_uri() d.addCallback(_stash_uri) d.addCallback(lambda ign: self.do_cli("check", self.uri)) def _check1((rc, out, err)): self.failUnlessReallyEqual(err, "") self.failUnlessReallyEqual(rc, 0) lines = out.splitlines() self.failUnless("Summary: Healthy" in lines, out) self.failUnless(" good-shares: 10 (encoding is 3-of-10)" in lines, out) d.addCallback(_check1) d.addCallback(lambda ign: self.do_cli("check", "--raw", self.uri)) def _check2((rc, out, err)): self.failUnlessReallyEqual(err, "") self.failUnlessReallyEqual(rc, 0) data = simplejson.loads(out) self.failUnlessReallyEqual(to_str(data["summary"]), "Healthy") self.failUnlessReallyEqual(data["results"]["healthy"], True) d.addCallback(_check2) d.addCallback(lambda ign: c0.upload(upload.Data("literal", convergence=""))) def _stash_lit_uri(n): self.lit_uri = n.get_uri() d.addCallback(_stash_lit_uri) d.addCallback(lambda ign: self.do_cli("check", self.lit_uri)) def _check_lit((rc, out, err)): self.failUnlessReallyEqual(err, "") self.failUnlessReallyEqual(rc, 0) lines = out.splitlines() self.failUnless("Summary: Healthy (LIT)" in lines, out) d.addCallback(_check_lit) d.addCallback(lambda ign: self.do_cli("check", "--raw", self.lit_uri)) def _check_lit_raw((rc, out, err)): self.failUnlessReallyEqual(err, "") self.failUnlessReallyEqual(rc, 0) data = simplejson.loads(out) self.failUnlessReallyEqual(data["results"]["healthy"], True) d.addCallback(_check_lit_raw) d.addCallback(lambda ign: c0.create_immutable_dirnode({}, convergence="")) def _stash_lit_dir_uri(n): self.lit_dir_uri = n.get_uri() d.addCallback(_stash_lit_dir_uri) d.addCallback(lambda ign: self.do_cli("check", self.lit_dir_uri)) d.addCallback(_check_lit) d.addCallback(lambda ign: self.do_cli("check", "--raw", self.lit_uri)) d.addCallback(_check_lit_raw) def _clobber_shares(ignored): # delete one, corrupt a second shares = self.find_uri_shares(self.uri) self.failUnlessReallyEqual(len(shares), 10) os.unlink(shares[0][2]) cso = debug.CorruptShareOptions() cso.stdout = StringIO() cso.parseOptions([shares[1][2]]) storage_index = uri.from_string(self.uri).get_storage_index() self._corrupt_share_line = " server %s, SI %s, shnum %d" % \ (base32.b2a(shares[1][1]), base32.b2a(storage_index), shares[1][0]) debug.corrupt_share(cso) d.addCallback(_clobber_shares) d.addCallback(lambda ign: self.do_cli("check", "--verify", self.uri)) def _check3((rc, out, err)): 
self.failUnlessReallyEqual(err, "") self.failUnlessReallyEqual(rc, 0) lines = out.splitlines() summary = [l for l in lines if l.startswith("Summary")][0] self.failUnless("Summary: Unhealthy: 8 shares (enc 3-of-10)" in summary, summary) self.failUnless(" good-shares: 8 (encoding is 3-of-10)" in lines, out) self.failUnless(" corrupt shares:" in lines, out) self.failUnless(self._corrupt_share_line in lines, out) d.addCallback(_check3) d.addCallback(lambda ign: self.do_cli("check", "--verify", "--raw", self.uri)) def _check3_raw((rc, out, err)): self.failUnlessReallyEqual(err, "") self.failUnlessReallyEqual(rc, 0) data = simplejson.loads(out) self.failUnlessReallyEqual(data["results"]["healthy"], False) self.failUnlessIn("Unhealthy: 8 shares (enc 3-of-10)", data["summary"]) self.failUnlessReallyEqual(data["results"]["count-shares-good"], 8) self.failUnlessReallyEqual(data["results"]["count-corrupt-shares"], 1) self.failUnlessIn("list-corrupt-shares", data["results"]) d.addCallback(_check3_raw) d.addCallback(lambda ign: self.do_cli("check", "--verify", "--repair", self.uri)) def _check4((rc, out, err)): self.failUnlessReallyEqual(err, "") self.failUnlessReallyEqual(rc, 0) lines = out.splitlines() self.failUnless("Summary: not healthy" in lines, out) self.failUnless(" good-shares: 8 (encoding is 3-of-10)" in lines, out) self.failUnless(" corrupt shares:" in lines, out) self.failUnless(self._corrupt_share_line in lines, out) self.failUnless(" repair successful" in lines, out) d.addCallback(_check4) d.addCallback(lambda ign: self.do_cli("check", "--verify", "--repair", self.uri)) def _check5((rc, out, err)): self.failUnlessReallyEqual(err, "") self.failUnlessReallyEqual(rc, 0) lines = out.splitlines() self.failUnless("Summary: healthy" in lines, out) self.failUnless(" good-shares: 10 (encoding is 3-of-10)" in lines, out) self.failIf(" corrupt shares:" in lines, out) d.addCallback(_check5) return d def test_deep_check(self): self.basedir = "cli/Check/deep_check" self.set_up_grid() c0 = self.g.clients[0] self.uris = {} self.fileurls = {} DATA = "data" * 100 quoted_good = quote_output(u"g\u00F6\u00F6d") d = c0.create_dirnode() def _stash_root_and_create_file(n): self.rootnode = n self.rooturi = n.get_uri() return n.add_file(u"g\u00F6\u00F6d", upload.Data(DATA, convergence="")) d.addCallback(_stash_root_and_create_file) def _stash_uri(fn, which): self.uris[which] = fn.get_uri() return fn d.addCallback(_stash_uri, u"g\u00F6\u00F6d") d.addCallback(lambda ign: self.rootnode.add_file(u"small", upload.Data("literal", convergence=""))) d.addCallback(_stash_uri, "small") d.addCallback(lambda ign: c0.create_mutable_file(MutableData(DATA+"1"))) d.addCallback(lambda fn: self.rootnode.set_node(u"mutable", fn)) d.addCallback(_stash_uri, "mutable") d.addCallback(lambda ign: self.do_cli("deep-check", self.rooturi)) def _check1((rc, out, err)): self.failUnlessReallyEqual(err, "") self.failUnlessReallyEqual(rc, 0) lines = out.splitlines() self.failUnless("done: 4 objects checked, 4 healthy, 0 unhealthy" in lines, out) d.addCallback(_check1) # root # root/g\u00F6\u00F6d # root/small # root/mutable d.addCallback(lambda ign: self.do_cli("deep-check", "--verbose", self.rooturi)) def _check2((rc, out, err)): self.failUnlessReallyEqual(err, "") self.failUnlessReallyEqual(rc, 0) lines = out.splitlines() self.failUnless("'': Healthy" in lines, out) self.failUnless("'small': Healthy (LIT)" in lines, out) self.failUnless((quoted_good + ": Healthy") in lines, out) self.failUnless("'mutable': Healthy" in lines, out) 
self.failUnless("done: 4 objects checked, 4 healthy, 0 unhealthy" in lines, out) d.addCallback(_check2) d.addCallback(lambda ign: self.do_cli("stats", self.rooturi)) def _check_stats((rc, out, err)): self.failUnlessReallyEqual(err, "") self.failUnlessReallyEqual(rc, 0) lines = out.splitlines() self.failUnlessIn(" count-immutable-files: 1", lines) self.failUnlessIn(" count-mutable-files: 1", lines) self.failUnlessIn(" count-literal-files: 1", lines) self.failUnlessIn(" count-directories: 1", lines) self.failUnlessIn(" size-immutable-files: 400", lines) self.failUnlessIn("Size Histogram:", lines) self.failUnlessIn(" 4-10 : 1 (10 B, 10 B)", lines) self.failUnlessIn(" 317-1000 : 1 (1000 B, 1000 B)", lines) d.addCallback(_check_stats) def _clobber_shares(ignored): shares = self.find_uri_shares(self.uris[u"g\u00F6\u00F6d"]) self.failUnlessReallyEqual(len(shares), 10) os.unlink(shares[0][2]) shares = self.find_uri_shares(self.uris["mutable"]) cso = debug.CorruptShareOptions() cso.stdout = StringIO() cso.parseOptions([shares[1][2]]) storage_index = uri.from_string(self.uris["mutable"]).get_storage_index() self._corrupt_share_line = " corrupt: server %s, SI %s, shnum %d" % \ (base32.b2a(shares[1][1]), base32.b2a(storage_index), shares[1][0]) debug.corrupt_share(cso) d.addCallback(_clobber_shares) # root # root/g\u00F6\u00F6d [9 shares] # root/small # root/mutable [1 corrupt share] d.addCallback(lambda ign: self.do_cli("deep-check", "--verbose", self.rooturi)) def _check3((rc, out, err)): self.failUnlessReallyEqual(err, "") self.failUnlessReallyEqual(rc, 0) lines = out.splitlines() self.failUnless("'': Healthy" in lines, out) self.failUnless("'small': Healthy (LIT)" in lines, out) self.failUnless("'mutable': Healthy" in lines, out) # needs verifier self.failUnless((quoted_good + ": Not Healthy: 9 shares (enc 3-of-10)") in lines, out) self.failIf(self._corrupt_share_line in lines, out) self.failUnless("done: 4 objects checked, 3 healthy, 1 unhealthy" in lines, out) d.addCallback(_check3) d.addCallback(lambda ign: self.do_cli("deep-check", "--verbose", "--verify", self.rooturi)) def _check4((rc, out, err)): self.failUnlessReallyEqual(err, "") self.failUnlessReallyEqual(rc, 0) lines = out.splitlines() self.failUnless("'': Healthy" in lines, out) self.failUnless("'small': Healthy (LIT)" in lines, out) mutable = [l for l in lines if l.startswith("'mutable'")][0] self.failUnless(mutable.startswith("'mutable': Unhealthy: 9 shares (enc 3-of-10)"), mutable) self.failUnless(self._corrupt_share_line in lines, out) self.failUnless((quoted_good + ": Not Healthy: 9 shares (enc 3-of-10)") in lines, out) self.failUnless("done: 4 objects checked, 2 healthy, 2 unhealthy" in lines, out) d.addCallback(_check4) d.addCallback(lambda ign: self.do_cli("deep-check", "--raw", self.rooturi)) def _check5((rc, out, err)): self.failUnlessReallyEqual(err, "") self.failUnlessReallyEqual(rc, 0) lines = out.splitlines() units = [simplejson.loads(line) for line in lines] # root, small, g\u00F6\u00F6d, mutable, stats self.failUnlessReallyEqual(len(units), 4+1) d.addCallback(_check5) d.addCallback(lambda ign: self.do_cli("deep-check", "--verbose", "--verify", "--repair", self.rooturi)) def _check6((rc, out, err)): self.failUnlessReallyEqual(err, "") self.failUnlessReallyEqual(rc, 0) lines = out.splitlines() self.failUnless("'': healthy" in lines, out) self.failUnless("'small': healthy" in lines, out) self.failUnless("'mutable': not healthy" in lines, out) self.failUnless(self._corrupt_share_line in lines, out) 
self.failUnless((quoted_good + ": not healthy") in lines, out) self.failUnless("done: 4 objects checked" in lines, out) self.failUnless(" pre-repair: 2 healthy, 2 unhealthy" in lines, out) self.failUnless(" 2 repairs attempted, 2 successful, 0 failed" in lines, out) self.failUnless(" post-repair: 4 healthy, 0 unhealthy" in lines,out) d.addCallback(_check6) # now add a subdir, and a file below that, then make the subdir # unrecoverable d.addCallback(lambda ign: self.rootnode.create_subdirectory(u"subdir")) d.addCallback(_stash_uri, "subdir") d.addCallback(lambda fn: fn.add_file(u"subfile", upload.Data(DATA+"2", ""))) d.addCallback(lambda ign: self.delete_shares_numbered(self.uris["subdir"], range(10))) # root # rootg\u00F6\u00F6d/ # root/small # root/mutable # root/subdir [unrecoverable: 0 shares] # root/subfile d.addCallback(lambda ign: self.do_cli("manifest", self.rooturi)) def _manifest_failed((rc, out, err)): self.failIfEqual(rc, 0) self.failUnlessIn("ERROR: UnrecoverableFileError", err) # the fatal directory should still show up, as the last line self.failUnlessIn(" subdir\n", out) d.addCallback(_manifest_failed) d.addCallback(lambda ign: self.do_cli("deep-check", self.rooturi)) def _deep_check_failed((rc, out, err)): self.failIfEqual(rc, 0) self.failUnlessIn("ERROR: UnrecoverableFileError", err) # we want to make sure that the error indication is the last # thing that gets emitted self.failIf("done:" in out, out) d.addCallback(_deep_check_failed) # this test is disabled until the deep-repair response to an # unrepairable directory is fixed. The failure-to-repair should not # throw an exception, but the failure-to-traverse that follows # should throw UnrecoverableFileError. #d.addCallback(lambda ign: # self.do_cli("deep-check", "--repair", self.rooturi)) #def _deep_check_repair_failed((rc, out, err)): # self.failIfEqual(rc, 0) # print err # self.failUnlessIn("ERROR: UnrecoverableFileError", err) # self.failIf("done:" in out, out) #d.addCallback(_deep_check_repair_failed) return d def test_check_without_alias(self): # 'tahoe check' should output a sensible error message if it needs to # find the default alias and can't self.basedir = "cli/Check/check_without_alias" self.set_up_grid() d = self.do_cli("check") def _check((rc, out, err)): self.failUnlessReallyEqual(rc, 1) self.failUnlessIn("error:", err) self.failUnlessReallyEqual(out, "") d.addCallback(_check) d.addCallback(lambda ign: self.do_cli("deep-check")) d.addCallback(_check) return d def test_check_with_nonexistent_alias(self): # 'tahoe check' should output a sensible error message if it needs to # find an alias and can't. 
self.basedir = "cli/Check/check_with_nonexistent_alias" self.set_up_grid() d = self.do_cli("check", "nonexistent:") def _check((rc, out, err)): self.failUnlessReallyEqual(rc, 1) self.failUnlessIn("error:", err) self.failUnlessIn("nonexistent", err) self.failUnlessReallyEqual(out, "") d.addCallback(_check) return d def test_check_with_multiple_aliases(self): self.basedir = "cli/Check/check_with_multiple_aliases" self.set_up_grid() self.uriList = [] c0 = self.g.clients[0] d = c0.create_dirnode() def _stash_uri(n): self.uriList.append(n.get_uri()) d.addCallback(_stash_uri) d = c0.create_dirnode() d.addCallback(_stash_uri) d.addCallback(lambda ign: self.do_cli("check", self.uriList[0], self.uriList[1])) def _check((rc, out, err)): self.failUnlessReallyEqual(rc, 0) self.failUnlessReallyEqual(err, "") #Ensure healthy appears for each uri self.failUnlessIn("Healthy", out[:len(out)/2]) self.failUnlessIn("Healthy", out[len(out)/2:]) d.addCallback(_check) d.addCallback(lambda ign: self.do_cli("check", self.uriList[0], "nonexistent:")) def _check2((rc, out, err)): self.failUnlessReallyEqual(rc, 1) self.failUnlessIn("Healthy", out) self.failUnlessIn("error:", err) self.failUnlessIn("nonexistent", err) d.addCallback(_check2) return d allmydata-tahoe-1.10.2/src/allmydata/test/test_storage_client.py0000644000175000017500000000171612556560070023125 0ustar ramram from twisted.trial import unittest from allmydata.storage_client import NativeStorageServer class NativeStorageServerWithVersion(NativeStorageServer): def __init__(self,version): self.version=version def get_version(self): return self.version class TestNativeStorageServer(unittest.TestCase): def test_get_available_space_new(self): nss = NativeStorageServerWithVersion( { "http://allmydata.org/tahoe/protocols/storage/v1": { "maximum-immutable-share-size": 111, "available-space": 222, } }) self.failUnlessEqual(nss.get_available_space(), 222) def test_get_available_space_old(self): nss = NativeStorageServerWithVersion( { "http://allmydata.org/tahoe/protocols/storage/v1": { "maximum-immutable-share-size": 111, } }) self.failUnlessEqual(nss.get_available_space(), 111) allmydata-tahoe-1.10.2/src/allmydata/test/check_grid.py0000644000175000017500000002023012556560070021136 0ustar ramram""" Test an existing Tahoe grid, both to see if the grid is still running and to see if the client is still compatible with it. This script is suitable for running from a periodic monitoring script, perhaps by an hourly cronjob. This script uses a pre-established client node (configured to connect to the grid being tested) and a pre-established directory (stored as the 'testgrid:' alias in that client node's aliases file). It then performs a number of uploads and downloads to exercise compatibility in various directions (new client vs old data). All operations are performed by invoking various CLI commands through bin/tahoe . The script must be given two arguments: the client node directory, and the location of the bin/tahoe executable. Note that this script does not import anything from tahoe directly, so it doesn't matter what its PYTHONPATH is, as long as the bin/tahoe that it uses is functional. This script expects that the client node will be not running when the script starts, but it will forcibly shut down the node just to be sure. It will shut down the node after the test finishes. 
To set up the client node, do the following: tahoe create-client DIR populate DIR/introducer.furl tahoe start DIR tahoe add-alias -d DIR testgrid `tahoe mkdir -d DIR` pick a 10kB-ish test file, compute its md5sum tahoe put -d DIR FILE testgrid:old.MD5SUM tahoe put -d DIR FILE testgrid:recent.MD5SUM tahoe put -d DIR FILE testgrid:recentdir/recent.MD5SUM echo "" | tahoe put -d DIR --mutable testgrid:log echo "" | tahoe put -d DIR --mutable testgrid:recentlog This script will perform the following steps (the kind of compatibility that is being tested is in [brackets]): read old.* and check the md5sums [confirm that new code can read old files] read all recent.* files and check md5sums [read recent files] delete all recent.* files and verify they're gone [modify an old directory] read recentdir/recent.* files and check [read recent directory] delete recentdir/recent.* and verify [modify recent directory] delete recentdir and verify (keep the directory from growing unboundedly) mkdir recentdir upload random 10kB file to recentdir/recent.MD5SUM (prepare for next time) upload random 10kB file to recent.MD5SUM [new code can upload to old servers] append one-line timestamp to log [read/write old mutable files] append one-line timestamp to recentlog [read/write recent mutable files] delete recentlog upload small header to new mutable recentlog [create mutable files] This script will also keep track of speeds and latencies and will write them in a machine-readable logfile. """ import time, subprocess, md5, os.path, random from twisted.python import usage class GridTesterOptions(usage.Options): optFlags = [ ("no", "n", "Dry run: do not run any commands, just print them."), ] def parseArgs(self, nodedir, tahoe): # Note: does not support Unicode arguments. self.nodedir = os.path.expanduser(nodedir) self.tahoe = os.path.abspath(os.path.expanduser(tahoe)) class CommandFailed(Exception): pass class GridTester: def __init__(self, config): self.config = config self.tahoe = config.tahoe self.nodedir = config.nodedir def command(self, *cmd, **kwargs): expected_rc = kwargs.get("expected_rc", 0) stdin = kwargs.get("stdin", None) if self.config["no"]: return if stdin is not None: p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) (stdout,stderr) = p.communicate(stdin) else: p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) (stdout,stderr) = p.communicate() rc = p.returncode if expected_rc != None and rc != expected_rc: if stderr: print "STDERR:" print stderr raise CommandFailed("command '%s' failed: rc=%d" % (cmd, rc)) return stdout, stderr def cli(self, cmd, *args, **kwargs): print "tahoe", cmd, " ".join(args) stdout, stderr = self.command(self.tahoe, cmd, "-d", self.nodedir, *args, **kwargs) if not kwargs.get("ignore_stderr", False) and stderr != "": raise CommandFailed("command '%s' had stderr: %s" % (" ".join(args), stderr)) return stdout def stop_old_node(self): print "tahoe stop", self.nodedir, "(force)" self.command(self.tahoe, "stop", self.nodedir, expected_rc=None) def start_node(self): print "tahoe start", self.nodedir self.command(self.tahoe, "start", self.nodedir) time.sleep(5) def stop_node(self): print "tahoe stop", self.nodedir self.command(self.tahoe, "stop", self.nodedir) def read_and_check(self, f): expected_md5_s = f[f.find(".")+1:] out = self.cli("get", "testgrid:" + f) got_md5_s = md5.new(out).hexdigest() if got_md5_s != expected_md5_s: raise CommandFailed("%s had md5sum of %s" % (f, got_md5_s)) def delete_and_check(self, 
dirname, f): oldfiles = self.listdir(dirname) if dirname: absfilename = "testgrid:" + dirname + "/" + f else: absfilename = "testgrid:" + f if f not in oldfiles: raise CommandFailed("um, '%s' was supposed to already be in %s" % (f, dirname)) self.cli("rm", absfilename) newfiles = self.listdir(dirname) if f in newfiles: raise CommandFailed("failed to remove '%s' from %s" % (f, dirname)) def listdir(self, dirname): out = self.cli("ls", "testgrid:"+dirname).strip().split("\n") files = [f.strip() for f in out] print " ", files return files def do_test(self): files = self.listdir("") for f in files: if f.startswith("old.") or f.startswith("recent."): self.read_and_check("" + f) for f in files: if f.startswith("recent."): self.delete_and_check("", f) files = self.listdir("recentdir") for f in files: if f.startswith("old.") or f.startswith("recent."): self.read_and_check("recentdir/" + f) for f in files: if f.startswith("recent."): self.delete_and_check("recentdir", f) self.delete_and_check("", "recentdir") self.cli("mkdir", "testgrid:recentdir") fn, data = self.makefile("recent") self.put("recentdir/"+fn, data) files = self.listdir("recentdir") if fn not in files: raise CommandFailed("failed to put %s in recentdir/" % fn) fn, data = self.makefile("recent") self.put(fn, data) files = self.listdir("") if fn not in files: raise CommandFailed("failed to put %s in testgrid:" % fn) self.update("log") self.update("recentlog") self.delete_and_check("", "recentlog") self.put_mutable("recentlog", "Recent Mutable Log Header\n\n") def put(self, fn, data): self.cli("put", "-", "testgrid:"+fn, stdin=data, ignore_stderr=True) def put_mutable(self, fn, data): self.cli("put", "--mutable", "-", "testgrid:"+fn, stdin=data, ignore_stderr=True) def update(self, fn): old = self.cli("get", "testgrid:"+fn) new = old + time.ctime() + "\n" self.put(fn, new) def makefile(self, prefix): size = random.randint(10001, 10100) data = os.urandom(size) md5sum = md5.new(data).hexdigest() fn = prefix + "." 
+ md5sum return fn, data def run(self): self.stop_old_node() self.start_node() try: self.do_test() finally: self.stop_node() def main(): config = GridTesterOptions() config.parseOptions() gt = GridTester(config) gt.run() if __name__ == "__main__": main() allmydata-tahoe-1.10.2/src/allmydata/test/test_crawler.py0000644000175000017500000004057512556560070021570 0ustar ramram import time import os.path from twisted.trial import unittest from twisted.application import service from twisted.internet import defer from foolscap.api import eventually, fireEventually from allmydata.util import fileutil, hashutil, pollmixin from allmydata.storage.server import StorageServer, si_b2a from allmydata.storage.crawler import ShareCrawler, TimeSliceExceeded from allmydata.test.test_storage import FakeCanary from allmydata.test.common_util import StallMixin class BucketEnumeratingCrawler(ShareCrawler): cpu_slice = 500 # make sure it can complete in a single slice slow_start = 0 def __init__(self, *args, **kwargs): ShareCrawler.__init__(self, *args, **kwargs) self.all_buckets = [] self.finished_d = defer.Deferred() def process_bucket(self, cycle, prefix, prefixdir, storage_index_b32): self.all_buckets.append(storage_index_b32) def finished_cycle(self, cycle): eventually(self.finished_d.callback, None) class PacedCrawler(ShareCrawler): cpu_slice = 500 # make sure it can complete in a single slice slow_start = 0 def __init__(self, *args, **kwargs): ShareCrawler.__init__(self, *args, **kwargs) self.countdown = 6 self.all_buckets = [] self.finished_d = defer.Deferred() self.yield_cb = None def process_bucket(self, cycle, prefix, prefixdir, storage_index_b32): self.all_buckets.append(storage_index_b32) self.countdown -= 1 if self.countdown == 0: # force a timeout. We restore it in yielding() self.cpu_slice = -1.0 def yielding(self, sleep_time): self.cpu_slice = 500 if self.yield_cb: self.yield_cb() def finished_cycle(self, cycle): eventually(self.finished_d.callback, None) class ConsumingCrawler(ShareCrawler): cpu_slice = 0.5 allowed_cpu_percentage = 0.5 minimum_cycle_time = 0 slow_start = 0 def __init__(self, *args, **kwargs): ShareCrawler.__init__(self, *args, **kwargs) self.accumulated = 0.0 self.cycles = 0 self.last_yield = 0.0 def process_bucket(self, cycle, prefix, prefixdir, storage_index_b32): start = time.time() time.sleep(0.05) elapsed = time.time() - start self.accumulated += elapsed self.last_yield += elapsed def finished_cycle(self, cycle): self.cycles += 1 def yielding(self, sleep_time): self.last_yield = 0.0 class OneShotCrawler(ShareCrawler): cpu_slice = 500 # make sure it can complete in a single slice slow_start = 0 def __init__(self, *args, **kwargs): ShareCrawler.__init__(self, *args, **kwargs) self.counter = 0 self.finished_d = defer.Deferred() def process_bucket(self, cycle, prefix, prefixdir, storage_index_b32): self.counter += 1 def finished_cycle(self, cycle): self.finished_d.callback(None) self.disownServiceParent() class Basic(unittest.TestCase, StallMixin, pollmixin.PollMixin): def setUp(self): self.s = service.MultiService() self.s.startService() def tearDown(self): return self.s.stopService() def si(self, i): return hashutil.storage_index_hash(str(i)) def rs(self, i, serverid): return hashutil.bucket_renewal_secret_hash(str(i), serverid) def cs(self, i, serverid): return hashutil.bucket_cancel_secret_hash(str(i), serverid) def write(self, i, ss, serverid, tail=0): si = self.si(i) si = si[:-1] + chr(tail) had,made = ss.remote_allocate_buckets(si, self.rs(i, serverid), self.cs(i, 
serverid), set([0]), 99, FakeCanary()) made[0].remote_write(0, "data") made[0].remote_close() return si_b2a(si) def test_immediate(self): self.basedir = "crawler/Basic/immediate" fileutil.make_dirs(self.basedir) serverid = "\x00" * 20 ss = StorageServer(self.basedir, serverid) ss.setServiceParent(self.s) sis = [self.write(i, ss, serverid) for i in range(10)] statefile = os.path.join(self.basedir, "statefile") c = BucketEnumeratingCrawler(ss, statefile, allowed_cpu_percentage=.1) c.load_state() c.start_current_prefix(time.time()) self.failUnlessEqual(sorted(sis), sorted(c.all_buckets)) # make sure the statefile has been returned to the starting point c.finished_d = defer.Deferred() c.all_buckets = [] c.start_current_prefix(time.time()) self.failUnlessEqual(sorted(sis), sorted(c.all_buckets)) # check that a new crawler picks up on the state file properly c2 = BucketEnumeratingCrawler(ss, statefile) c2.load_state() c2.start_current_prefix(time.time()) self.failUnlessEqual(sorted(sis), sorted(c2.all_buckets)) def test_service(self): self.basedir = "crawler/Basic/service" fileutil.make_dirs(self.basedir) serverid = "\x00" * 20 ss = StorageServer(self.basedir, serverid) ss.setServiceParent(self.s) sis = [self.write(i, ss, serverid) for i in range(10)] statefile = os.path.join(self.basedir, "statefile") c = BucketEnumeratingCrawler(ss, statefile) c.setServiceParent(self.s) # it should be legal to call get_state() and get_progress() right # away, even before the first tick is performed. No work should have # been done yet. s = c.get_state() p = c.get_progress() self.failUnlessEqual(s["last-complete-prefix"], None) self.failUnlessEqual(s["current-cycle"], None) self.failUnlessEqual(p["cycle-in-progress"], False) d = c.finished_d def _check(ignored): self.failUnlessEqual(sorted(sis), sorted(c.all_buckets)) d.addCallback(_check) return d def test_paced(self): self.basedir = "crawler/Basic/paced" fileutil.make_dirs(self.basedir) serverid = "\x00" * 20 ss = StorageServer(self.basedir, serverid) ss.setServiceParent(self.s) # put four buckets in each prefixdir sis = [] for i in range(10): for tail in range(4): sis.append(self.write(i, ss, serverid, tail)) statefile = os.path.join(self.basedir, "statefile") c = PacedCrawler(ss, statefile) c.load_state() try: c.start_current_prefix(time.time()) except TimeSliceExceeded: pass # that should stop in the middle of one of the buckets. Since we # aren't using its normal scheduler, we have to save its state # manually. c.save_state() c.cpu_slice = PacedCrawler.cpu_slice self.failUnlessEqual(len(c.all_buckets), 6) c.start_current_prefix(time.time()) # finish it self.failUnlessEqual(len(sis), len(c.all_buckets)) self.failUnlessEqual(sorted(sis), sorted(c.all_buckets)) # make sure the statefile has been returned to the starting point c.finished_d = defer.Deferred() c.all_buckets = [] c.start_current_prefix(time.time()) self.failUnlessEqual(sorted(sis), sorted(c.all_buckets)) del c # start a new crawler, it should start from the beginning c = PacedCrawler(ss, statefile) c.load_state() try: c.start_current_prefix(time.time()) except TimeSliceExceeded: pass # that should stop in the middle of one of the buckets. Since we # aren't using its normal scheduler, we have to save its state # manually. 
c.save_state() c.cpu_slice = PacedCrawler.cpu_slice # a third crawler should pick up from where it left off c2 = PacedCrawler(ss, statefile) c2.all_buckets = c.all_buckets[:] c2.load_state() c2.countdown = -1 c2.start_current_prefix(time.time()) self.failUnlessEqual(len(sis), len(c2.all_buckets)) self.failUnlessEqual(sorted(sis), sorted(c2.all_buckets)) del c, c2 # now stop it at the end of a bucket (countdown=4), to exercise a # different place that checks the time c = PacedCrawler(ss, statefile) c.load_state() c.countdown = 4 try: c.start_current_prefix(time.time()) except TimeSliceExceeded: pass # that should stop at the end of one of the buckets. Again we must # save state manually. c.save_state() c.cpu_slice = PacedCrawler.cpu_slice self.failUnlessEqual(len(c.all_buckets), 4) c.start_current_prefix(time.time()) # finish it self.failUnlessEqual(len(sis), len(c.all_buckets)) self.failUnlessEqual(sorted(sis), sorted(c.all_buckets)) del c # stop it again at the end of the bucket, check that a new checker # picks up correctly c = PacedCrawler(ss, statefile) c.load_state() c.countdown = 4 try: c.start_current_prefix(time.time()) except TimeSliceExceeded: pass # that should stop at the end of one of the buckets. c.save_state() c2 = PacedCrawler(ss, statefile) c2.all_buckets = c.all_buckets[:] c2.load_state() c2.countdown = -1 c2.start_current_prefix(time.time()) self.failUnlessEqual(len(sis), len(c2.all_buckets)) self.failUnlessEqual(sorted(sis), sorted(c2.all_buckets)) del c, c2 def test_paced_service(self): self.basedir = "crawler/Basic/paced_service" fileutil.make_dirs(self.basedir) serverid = "\x00" * 20 ss = StorageServer(self.basedir, serverid) ss.setServiceParent(self.s) sis = [self.write(i, ss, serverid) for i in range(10)] statefile = os.path.join(self.basedir, "statefile") c = PacedCrawler(ss, statefile) did_check_progress = [False] def check_progress(): c.yield_cb = None try: p = c.get_progress() self.failUnlessEqual(p["cycle-in-progress"], True) pct = p["cycle-complete-percentage"] # after 6 buckets, we happen to be at 76.17% complete. As # long as we create shares in deterministic order, this will # continue to be true. self.failUnlessEqual(int(pct), 76) left = p["remaining-sleep-time"] self.failUnless(isinstance(left, float), left) self.failUnless(left > 0.0, left) except Exception, e: did_check_progress[0] = e else: did_check_progress[0] = True c.yield_cb = check_progress c.setServiceParent(self.s) # that should get through 6 buckets, pause for a little while (and # run check_progress()), then resume d = c.finished_d def _check(ignored): if did_check_progress[0] is not True: raise did_check_progress[0] self.failUnless(did_check_progress[0]) self.failUnlessEqual(sorted(sis), sorted(c.all_buckets)) # at this point, the crawler should be sitting in the inter-cycle # timer, which should be pegged at the minumum cycle time self.failUnless(c.timer) self.failUnless(c.sleeping_between_cycles) self.failUnlessEqual(c.current_sleep_time, c.minimum_cycle_time) p = c.get_progress() self.failUnlessEqual(p["cycle-in-progress"], False) naptime = p["remaining-wait-time"] self.failUnless(isinstance(naptime, float), naptime) # min-cycle-time is 300, so this is basically testing that it took # less than 290s to crawl self.failUnless(naptime > 10.0, naptime) soon = p["next-crawl-time"] - time.time() self.failUnless(soon > 10.0, soon) d.addCallback(_check) return d def OFF_test_cpu_usage(self): # this test can't actually assert anything, because too many # buildslave machines are slow. 
        # But on a fast developer machine, it can produce interesting
        # results. So if you care about how well the Crawler is
        # accomplishing its run-slowly goals, re-enable this test and read
        # the stdout when it runs.
        self.basedir = "crawler/Basic/cpu_usage"
        fileutil.make_dirs(self.basedir)
        serverid = "\x00" * 20
        ss = StorageServer(self.basedir, serverid)
        ss.setServiceParent(self.s)

        for i in range(10):
            self.write(i, ss, serverid)

        statefile = os.path.join(self.basedir, "statefile")
        c = ConsumingCrawler(ss, statefile)
        c.setServiceParent(self.s)

        # this will run as fast as it can, consuming about 50ms per call to
        # process_bucket(), limited by the Crawler to about 50% cpu. We let
        # it run for a few seconds, then compare how much time
        # process_bucket() got vs wallclock time. It should get between 10%
        # and 70% CPU. This is dicey, there's about 100ms of overhead per
        # 300ms slice (saving the state file takes about 150-200us, but we
        # do it 1024 times per cycle, one for each [empty] prefixdir),
        # leaving 200ms for actual processing, which is enough to get
        # through 4 buckets each slice, then the crawler sleeps for
        # 300ms/0.5 = 600ms, giving us 900ms wallclock per slice. In 4.0
        # seconds we can do 4.4 slices, giving us about 17 shares, so we
        # merely assert that we've finished at least one cycle in that time.

        # with a short cpu_slice (so we can keep this test down to 4
        # seconds), the overhead is enough to make a nominal 50% usage more
        # like 30%. Forcing sleep_time to 0 only gets us 67% usage.

        start = time.time()
        d = self.stall(delay=4.0)
        def _done(res):
            elapsed = time.time() - start
            percent = 100.0 * c.accumulated / elapsed
            # our buildslaves vary too much in their speeds and load levels,
            # and many of them only manage to hit 7% usage when our target
            # is 50%. So don't assert anything about the results, just log
            # them.
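            # [editor's note: worked example, not part of the original test.
            #  The figures in the comment above come out as follows (these
            #  are the comment's own assumptions, not measurements):
            #
            #      cpu_slice  = 0.300               # seconds of work per slice
            #      sleep_time = cpu_slice / 0.5     # 0.600 s, for a ~50% duty cycle
            #      wallclock  = cpu_slice + sleep_time   # 0.900 s per slice
            #      slices     = 4.0 / wallclock          # ~4.4 slices in the test
            #      buckets    = slices * 4               # ~17 buckets processed
            #
            #  so "about 17 shares" in four seconds is what the nominal 50%
            #  duty cycle predicts, before the measured overhead drags real
            #  usage down toward 30%.]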
print print "crawler: got %d%% percent when trying for 50%%" % percent print "crawler: got %d full cycles" % c.cycles d.addCallback(_done) return d def test_empty_subclass(self): self.basedir = "crawler/Basic/empty_subclass" fileutil.make_dirs(self.basedir) serverid = "\x00" * 20 ss = StorageServer(self.basedir, serverid) ss.setServiceParent(self.s) for i in range(10): self.write(i, ss, serverid) statefile = os.path.join(self.basedir, "statefile") c = ShareCrawler(ss, statefile) c.slow_start = 0 c.setServiceParent(self.s) # we just let it run for a while, to get figleaf coverage of the # empty methods in the base class def _check(): return bool(c.state["last-cycle-finished"] is not None) d = self.poll(_check) def _done(ignored): state = c.get_state() self.failUnless(state["last-cycle-finished"] is not None) d.addCallback(_done) return d def test_oneshot(self): self.basedir = "crawler/Basic/oneshot" fileutil.make_dirs(self.basedir) serverid = "\x00" * 20 ss = StorageServer(self.basedir, serverid) ss.setServiceParent(self.s) for i in range(30): self.write(i, ss, serverid) statefile = os.path.join(self.basedir, "statefile") c = OneShotCrawler(ss, statefile) c.setServiceParent(self.s) d = c.finished_d def _finished_first_cycle(ignored): return fireEventually(c.counter) d.addCallback(_finished_first_cycle) def _check(old_counter): # the crawler should do any work after it's been stopped self.failUnlessEqual(old_counter, c.counter) self.failIf(c.running) self.failIf(c.timer) self.failIf(c.current_sleep_time) s = c.get_state() self.failUnlessEqual(s["last-cycle-finished"], 0) self.failUnlessEqual(s["current-cycle"], None) d.addCallback(_check) return d allmydata-tahoe-1.10.2/src/allmydata/test/test_ftp.py0000644000175000017500000001004212556560070020704 0ustar ramram from twisted.trial import unittest from allmydata.frontends import ftpd from allmydata.immutable import upload from allmydata.mutable import publish from allmydata.test.no_network import GridTestMixin from allmydata.test.common_util import ReallyEqualMixin class Handler(GridTestMixin, ReallyEqualMixin, unittest.TestCase): """ This is a no-network unit test of ftpd.Handler and the abstractions it uses. 
""" FALL_OF_BERLIN_WALL = 626644800 TURN_OF_MILLENIUM = 946684800 def _set_up(self, basedir, num_clients=1, num_servers=10): self.basedir = "ftp/" + basedir self.set_up_grid(num_clients=num_clients, num_servers=num_servers) self.client = self.g.clients[0] self.username = "alice" self.convergence = "" d = self.client.create_dirnode() def _created_root(node): self.root = node self.root_uri = node.get_uri() self.handler = ftpd.Handler(self.client, self.root, self.username, self.convergence) d.addCallback(_created_root) return d def _set_metadata(self, name, metadata): """Set metadata for `name', avoiding MetadataSetter's timestamp reset behavior.""" def _modifier(old_contents, servermap, first_time): children = self.root._unpack_contents(old_contents) children[name] = (children[name][0], metadata) return self.root._pack_contents(children) return self.root._node.modify(_modifier) def _set_up_tree(self): # add immutable file at root immutable = upload.Data("immutable file contents", None) d = self.root.add_file(u"immutable", immutable) # `mtime' and `linkmotime' both set md_both = {'mtime': self.FALL_OF_BERLIN_WALL, 'tahoe': {'linkmotime': self.TURN_OF_MILLENIUM}} d.addCallback(lambda _: self._set_metadata(u"immutable", md_both)) # add link to root from root d.addCallback(lambda _: self.root.set_node(u"loop", self.root)) # `mtime' set, but no `linkmotime' md_just_mtime = {'mtime': self.FALL_OF_BERLIN_WALL, 'tahoe': {}} d.addCallback(lambda _: self._set_metadata(u"loop", md_just_mtime)) # add mutable file at root mutable = publish.MutableData("mutable file contents") d.addCallback(lambda _: self.client.create_mutable_file(mutable)) d.addCallback(lambda node: self.root.set_node(u"mutable", node)) # neither `mtime' nor `linkmotime' set d.addCallback(lambda _: self._set_metadata(u"mutable", {})) return d def _compareDirLists(self, actual, expected): actual_list = sorted(actual) expected_list = sorted(expected) self.failUnlessReallyEqual(len(actual_list), len(expected_list), "%r is wrong length, expecting %r" % ( actual_list, expected_list)) for (a, b) in zip(actual_list, expected_list): (name, meta) = a (expected_name, expected_meta) = b self.failUnlessReallyEqual(name, expected_name) self.failUnlessReallyEqual(meta, expected_meta) def test_list(self): keys = ("size", "directory", "permissions", "hardlinks", "modified", "owner", "group", "unexpected") d = self._set_up("list") d.addCallback(lambda _: self._set_up_tree()) d.addCallback(lambda _: self.handler.list("", keys=keys)) expected_root = [ ('loop', [0, True, ftpd.IntishPermissions(0600), 1, self.FALL_OF_BERLIN_WALL, 'alice', 'alice', '??']), ('immutable', [23, False, ftpd.IntishPermissions(0600), 1, self.TURN_OF_MILLENIUM, 'alice', 'alice', '??']), ('mutable', # timestamp should be 0 if no timestamp metadata is present [0, False, ftpd.IntishPermissions(0600), 1, 0, 'alice', 'alice', '??'])] d.addCallback(lambda root: self._compareDirLists(root, expected_root)) return d allmydata-tahoe-1.10.2/src/allmydata/test/test_json_metadata.py0000644000175000017500000000350512556560070022732 0ustar ramram from twisted.trial.unittest import TestCase from allmydata.web.common import get_filenode_metadata, SDMF_VERSION, MDMF_VERSION class MockFileNode(object): def __init__(self, size, mutable_version=None): self.size = size self.mutable_version = mutable_version def get_size(self): return self.size def is_mutable(self): return self.mutable_version is not None def get_version(self): if self.mutable_version is None: raise AttributeError() return 
self.mutable_version class CommonFixture(object): def test_size_is_0(self): """If get_size doesn't return None the returned metadata must contain "size".""" mockfilenode = MockFileNode(0, mutable_version=self.mutable_version) metadata = get_filenode_metadata(mockfilenode) self.failUnlessEqual(metadata['size'], 0) def test_size_is_1000(self): """1000 is sufficiently large to guarantee the cap is not a literal.""" mockfilenode = MockFileNode(1000, mutable_version=self.mutable_version) metadata = get_filenode_metadata(mockfilenode) self.failUnlessEqual(metadata['size'], 1000) def test_size_is_None(self): """If get_size returns None the returned metadata must not contain "size".""" mockfilenode = MockFileNode(None, mutable_version=self.mutable_version) metadata = get_filenode_metadata(mockfilenode) self.failIfIn('size', metadata) class Test_GetFileNodeMetaData_Immutable(CommonFixture, TestCase): def setUp(self): self.mutable_version = None class Test_GetFileNodeMetaData_SDMF(CommonFixture, TestCase): def setUp(self): self.mutable_version = SDMF_VERSION class Test_GetFileNodeMetaData_MDMF(CommonFixture, TestCase): def setUp(self): self.mutable_version = MDMF_VERSION allmydata-tahoe-1.10.2/src/allmydata/test/test_drop_upload.py0000644000175000017500000002003112556560070022422 0ustar ramram import os, sys from twisted.trial import unittest from twisted.python import filepath, runtime from twisted.internet import defer from allmydata.interfaces import IDirectoryNode, NoSuchChildError from allmydata.util import fake_inotify from allmydata.util.encodingutil import get_filesystem_encoding from allmydata.util.consumer import download_to_data from allmydata.test.no_network import GridTestMixin from allmydata.test.common_util import ReallyEqualMixin, NonASCIIPathMixin from allmydata.test.common import ShouldFailMixin from allmydata.frontends.drop_upload import DropUploader class DropUploadTestMixin(GridTestMixin, ShouldFailMixin, ReallyEqualMixin, NonASCIIPathMixin): """ These tests will be run both with a mock notifier, and (on platforms that support it) with the real INotify. """ def _get_count(self, name): return self.stats_provider.get_stats()["counters"].get(name, 0) def _test(self): self.uploader = None self.set_up_grid() self.local_dir = os.path.join(self.basedir, self.unicode_or_fallback(u"loc\u0101l_dir", u"local_dir")) self.mkdir_nonascii(self.local_dir) self.client = self.g.clients[0] self.stats_provider = self.client.stats_provider d = self.client.create_dirnode() def _made_upload_dir(n): self.failUnless(IDirectoryNode.providedBy(n)) self.upload_dirnode = n self.upload_dircap = n.get_uri() self.uploader = DropUploader(self.client, self.upload_dircap, self.local_dir.encode('utf-8'), inotify=self.inotify) return self.uploader.startService() d.addCallback(_made_upload_dir) # Write something short enough for a LIT file. d.addCallback(lambda ign: self._test_file(u"short", "test")) # Write to the same file again with different data. d.addCallback(lambda ign: self._test_file(u"short", "different")) # Test that temporary files are not uploaded. d.addCallback(lambda ign: self._test_file(u"tempfile", "test", temporary=True)) # Test that we tolerate creation of a subdirectory. d.addCallback(lambda ign: os.mkdir(os.path.join(self.local_dir, u"directory"))) # Write something longer, and also try to test a Unicode name if the fs can represent it. 
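        # [editor's note: clarifying comment, not part of the original test.
        #  unicode_or_fallback(preferred, fallback) -- used here and for
        #  self.local_dir above -- appears to return `preferred` only when
        #  the filesystem encoding can represent it, and `fallback`
        #  otherwise. A minimal sketch of that idea (hypothetical helper,
        #  not Tahoe's actual implementation):
        #
        #      def unicode_or_fallback(preferred, fallback, encoding="utf-8"):
        #          try:
        #              preferred.encode(encoding)
        #              return preferred
        #          except UnicodeEncodeError:
        #              return fallback
        # ]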
name_u = self.unicode_or_fallback(u"l\u00F8ng", u"long") d.addCallback(lambda ign: self._test_file(name_u, "test"*100)) # TODO: test that causes an upload failure. d.addCallback(lambda ign: self.failUnlessReallyEqual(self._get_count('drop_upload.files_failed'), 0)) # Prevent unclean reactor errors. def _cleanup(res): d = defer.succeed(None) if self.uploader is not None: d.addCallback(lambda ign: self.uploader.finish(for_tests=True)) d.addCallback(lambda ign: res) return d d.addBoth(_cleanup) return d def _test_file(self, name_u, data, temporary=False): previously_uploaded = self._get_count('drop_upload.files_uploaded') previously_disappeared = self._get_count('drop_upload.files_disappeared') d = defer.Deferred() # Note: this relies on the fact that we only get one IN_CLOSE_WRITE notification per file # (otherwise we would get a defer.AlreadyCalledError). Should we be relying on that? self.uploader.set_uploaded_callback(d.callback) path_u = os.path.join(self.local_dir, name_u) if sys.platform == "win32": path = filepath.FilePath(path_u) else: path = filepath.FilePath(path_u.encode(get_filesystem_encoding())) # We don't use FilePath.setContent() here because it creates a temporary file that # is renamed into place, which causes events that the test is not expecting. f = open(path.path, "wb") try: if temporary and sys.platform != "win32": os.unlink(path.path) f.write(data) finally: f.close() if temporary and sys.platform == "win32": os.unlink(path.path) self.notify_close_write(path) if temporary: d.addCallback(lambda ign: self.shouldFail(NoSuchChildError, 'temp file not uploaded', None, self.upload_dirnode.get, name_u)) d.addCallback(lambda ign: self.failUnlessReallyEqual(self._get_count('drop_upload.files_disappeared'), previously_disappeared + 1)) else: d.addCallback(lambda ign: self.upload_dirnode.get(name_u)) d.addCallback(download_to_data) d.addCallback(lambda actual_data: self.failUnlessReallyEqual(actual_data, data)) d.addCallback(lambda ign: self.failUnlessReallyEqual(self._get_count('drop_upload.files_uploaded'), previously_uploaded + 1)) d.addCallback(lambda ign: self.failUnlessReallyEqual(self._get_count('drop_upload.files_queued'), 0)) return d class MockTest(DropUploadTestMixin, unittest.TestCase): """This can run on any platform, and even if twisted.internet.inotify can't be imported.""" def test_errors(self): self.basedir = "drop_upload.MockTest.test_errors" self.set_up_grid() errors_dir = os.path.join(self.basedir, "errors_dir") os.mkdir(errors_dir) client = self.g.clients[0] d = client.create_dirnode() def _made_upload_dir(n): self.failUnless(IDirectoryNode.providedBy(n)) upload_dircap = n.get_uri() readonly_dircap = n.get_readonly_uri() self.shouldFail(AssertionError, 'invalid local.directory', 'could not be represented', DropUploader, client, upload_dircap, '\xFF', inotify=fake_inotify) self.shouldFail(AssertionError, 'nonexistent local.directory', 'there is no directory', DropUploader, client, upload_dircap, os.path.join(self.basedir, "Laputa"), inotify=fake_inotify) fp = filepath.FilePath(self.basedir).child('NOT_A_DIR') fp.touch() self.shouldFail(AssertionError, 'non-directory local.directory', 'is not a directory', DropUploader, client, upload_dircap, fp.path, inotify=fake_inotify) self.shouldFail(AssertionError, 'bad upload.dircap', 'does not refer to a directory', DropUploader, client, 'bad', errors_dir, inotify=fake_inotify) self.shouldFail(AssertionError, 'non-directory upload.dircap', 'does not refer to a directory', DropUploader, client, 'URI:LIT:foo', errors_dir, 
inotify=fake_inotify) self.shouldFail(AssertionError, 'readonly upload.dircap', 'is not a writecap to a directory', DropUploader, client, readonly_dircap, errors_dir, inotify=fake_inotify) d.addCallback(_made_upload_dir) return d def test_drop_upload(self): self.inotify = fake_inotify self.basedir = "drop_upload.MockTest.test_drop_upload" return self._test() def notify_close_write(self, path): self.uploader._notifier.event(path, self.inotify.IN_CLOSE_WRITE) class RealTest(DropUploadTestMixin, unittest.TestCase): """This is skipped unless both Twisted and the platform support inotify.""" def test_drop_upload(self): # We should always have runtime.platform.supportsINotify, because we're using # Twisted >= 10.1. if not runtime.platform.supportsINotify(): raise unittest.SkipTest("Drop-upload support can only be tested for-real on an OS that supports inotify or equivalent.") self.inotify = None # use the appropriate inotify for the platform self.basedir = "drop_upload.RealTest.test_drop_upload" return self._test() def notify_close_write(self, path): # Writing to the file causes the notification. pass allmydata-tahoe-1.10.2/src/allmydata/test/test_web.py0000644000175000017500000103522612556560070020704 0ustar ramramimport os.path, re, urllib, time, cgi import simplejson from StringIO import StringIO from twisted.application import service from twisted.trial import unittest from twisted.internet import defer, reactor from twisted.internet.task import Clock from twisted.web import client, error, http from twisted.python import failure, log from foolscap.api import fireEventually, flushEventualQueue from nevow.util import escapeToXML from nevow import rend from allmydata import interfaces, uri, webish, dirnode from allmydata.storage.shares import get_share_file from allmydata.storage_client import StorageFarmBroker, StubServer from allmydata.immutable import upload from allmydata.immutable.downloader.status import DownloadStatus from allmydata.dirnode import DirectoryNode from allmydata.nodemaker import NodeMaker from allmydata.unknown import UnknownNode from allmydata.web import status, common from allmydata.scripts.debug import CorruptShareOptions, corrupt_share from allmydata.util import fileutil, base32, hashutil from allmydata.util.consumer import download_to_data from allmydata.util.netstring import split_netstring from allmydata.util.encodingutil import to_str from allmydata.test.common import FakeCHKFileNode, FakeMutableFileNode, \ create_chk_filenode, WebErrorMixin, ShouldFailMixin, \ make_mutable_file_uri, create_mutable_filenode from allmydata.interfaces import IMutableFileNode, SDMF_VERSION, MDMF_VERSION from allmydata.mutable import servermap, publish, retrieve import allmydata.test.common_util as testutil from allmydata.test.no_network import GridTestMixin from allmydata.test.common_web import HTTPClientGETFactory, \ HTTPClientHEADFactory from allmydata.client import Client, SecretHolder from allmydata.introducer import IntroducerNode # create a fake uploader/downloader, and a couple of fake dirnodes, then # create a webserver that works against them timeout = 480 # Most of these take longer than 240 seconds on Francois's arm box. 
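# [editor's note: illustrative sketch, not part of the original module.
#  The comment above describes the pattern used throughout this file:
#  build an in-memory FakeClient whose uploader, nodemaker and history
#  are the stubs defined below, hand it to webish.WebishServer, and then
#  exercise the resulting HTTP port. WebMixin.setUp() further down does
#  essentially:
#
#      s = FakeClient()
#      s.startService()
#      ws = webish.WebishServer(s, "0")   # "0" = pick any free port
#      ws.setServiceParent(s)
#      base_url = ws.getURL()
#
#  (the real setUp also passes staticdir= and clock=, and strips the
#  trailing "/" from the URL before the tests append their own paths).]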
unknown_rwcap = u"lafs://from_the_future_rw_\u263A".encode('utf-8') unknown_rocap = u"ro.lafs://readonly_from_the_future_ro_\u263A".encode('utf-8') unknown_immcap = u"imm.lafs://immutable_from_the_future_imm_\u263A".encode('utf-8') FAVICON_MARKUP = '' DIR_HTML_TAG = '' class FakeStatsProvider: def get_stats(self): stats = {'stats': {}, 'counters': {}} return stats class FakeNodeMaker(NodeMaker): encoding_params = { 'k': 3, 'n': 10, 'happy': 7, 'max_segment_size':128*1024 # 1024=KiB } def _create_lit(self, cap): return FakeCHKFileNode(cap, self.all_contents) def _create_immutable(self, cap): return FakeCHKFileNode(cap, self.all_contents) def _create_mutable(self, cap): return FakeMutableFileNode(None, None, self.encoding_params, None, self.all_contents).init_from_cap(cap) def create_mutable_file(self, contents="", keysize=None, version=SDMF_VERSION): n = FakeMutableFileNode(None, None, self.encoding_params, None, self.all_contents) return n.create(contents, version=version) class FakeUploader(service.Service): name = "uploader" helper_furl = None helper_connected = False def upload(self, uploadable): d = uploadable.get_size() d.addCallback(lambda size: uploadable.read(size)) def _got_data(datav): data = "".join(datav) n = create_chk_filenode(data, self.all_contents) ur = upload.UploadResults(file_size=len(data), ciphertext_fetched=0, preexisting_shares=0, pushed_shares=10, sharemap={}, servermap={}, timings={}, uri_extension_data={}, uri_extension_hash="fake", verifycapstr="fakevcap") ur.set_uri(n.get_uri()) return ur d.addCallback(_got_data) return d def get_helper_info(self): return (self.helper_furl, self.helper_connected) def build_one_ds(): ds = DownloadStatus("storage_index", 1234) now = time.time() serverA = StubServer(hashutil.tagged_hash("foo", "serverid_a")[:20]) serverB = StubServer(hashutil.tagged_hash("foo", "serverid_b")[:20]) storage_index = hashutil.storage_index_hash("SI") e0 = ds.add_segment_request(0, now) e0.activate(now+0.5) e0.deliver(now+1, 0, 100, 0.5) # when, start,len, decodetime e1 = ds.add_segment_request(1, now+2) e1.error(now+3) # two outstanding requests e2 = ds.add_segment_request(2, now+4) e3 = ds.add_segment_request(3, now+5) del e2,e3 # hush pyflakes # simulate a segment which gets delivered faster than a system clock tick (ticket #1166) e = ds.add_segment_request(4, now) e.activate(now) e.deliver(now, 0, 140, 0.5) e = ds.add_dyhb_request(serverA, now) e.finished([1,2], now+1) e = ds.add_dyhb_request(serverB, now+2) # left unfinished e = ds.add_read_event(0, 120, now) e.update(60, 0.5, 0.1) # bytes, decrypttime, pausetime e.finished(now+1) e = ds.add_read_event(120, 30, now+2) # left unfinished e = ds.add_block_request(serverA, 1, 100, 20, now) e.finished(20, now+1) e = ds.add_block_request(serverB, 1, 120, 30, now+1) # left unfinished # make sure that add_read_event() can come first too ds1 = DownloadStatus(storage_index, 1234) e = ds1.add_read_event(0, 120, now) e.update(60, 0.5, 0.1) # bytes, decrypttime, pausetime e.finished(now+1) return ds class FakeHistory: _all_upload_status = [upload.UploadStatus()] _all_download_status = [build_one_ds()] _all_mapupdate_statuses = [servermap.UpdateStatus()] _all_publish_statuses = [publish.PublishStatus()] _all_retrieve_statuses = [retrieve.RetrieveStatus()] def list_all_upload_statuses(self): return self._all_upload_status def list_all_download_statuses(self): return self._all_download_status def list_all_mapupdate_statuses(self): return self._all_mapupdate_statuses def list_all_publish_statuses(self): return 
self._all_publish_statuses def list_all_retrieve_statuses(self): return self._all_retrieve_statuses def list_all_helper_statuses(self): return [] class FakeDisplayableServer(StubServer): def __init__(self, serverid, nickname): StubServer.__init__(self, serverid) self.announcement = {"my-version": "allmydata-tahoe-fake", "service-name": "storage", "nickname": nickname} def is_connected(self): return True def get_permutation_seed(self): return "" def get_remote_host(self): return "" def get_last_loss_time(self): return None def get_announcement_time(self): return None def get_announcement(self): return self.announcement def get_nickname(self): return self.announcement["nickname"] def get_available_space(self): return 123456 class FakeBucketCounter(object): def get_state(self): return {"last-complete-bucket-count": 0} def get_progress(self): return {"estimated-time-per-cycle": 0, "cycle-in-progress": False, "remaining-wait-time": 0} class FakeLeaseChecker(object): def __init__(self): self.expiration_enabled = False self.mode = "age" self.override_lease_duration = None self.sharetypes_to_expire = {} def get_state(self): return {"history": None} def get_progress(self): return {"estimated-time-per-cycle": 0, "cycle-in-progress": False, "remaining-wait-time": 0} class FakeStorageServer(service.MultiService): name = 'storage' def __init__(self, nodeid, nickname): service.MultiService.__init__(self) self.my_nodeid = nodeid self.nickname = nickname self.bucket_counter = FakeBucketCounter() self.lease_checker = FakeLeaseChecker() def get_stats(self): return {"storage_server.accepting_immutable_shares": False} class FakeClient(Client): def __init__(self): # don't upcall to Client.__init__, since we only want to initialize a # minimal subset service.MultiService.__init__(self) self.all_contents = {} self.nodeid = "fake_nodeid" self.nickname = u"fake_nickname \u263A" self.introducer_furl = "None" self.stats_provider = FakeStatsProvider() self._secret_holder = SecretHolder("lease secret", "convergence secret") self.helper = None self.convergence = "some random string" self.storage_broker = StorageFarmBroker(None, permute_peers=True) # fake knowledge of another server self.storage_broker.test_add_server("other_nodeid", FakeDisplayableServer("other_nodeid", u"other_nickname \u263B")) self.introducer_client = None self.history = FakeHistory() self.uploader = FakeUploader() self.uploader.all_contents = self.all_contents self.uploader.setServiceParent(self) self.blacklist = None self.nodemaker = FakeNodeMaker(None, self._secret_holder, None, self.uploader, None, None, None, None) self.nodemaker.all_contents = self.all_contents self.mutable_file_default = SDMF_VERSION self.addService(FakeStorageServer(self.nodeid, self.nickname)) def get_long_nodeid(self): return "v0-nodeid" def get_long_tubid(self): return "tubid" def startService(self): return service.MultiService.startService(self) def stopService(self): return service.MultiService.stopService(self) MUTABLE_SIZELIMIT = FakeMutableFileNode.MUTABLE_SIZELIMIT class WebMixin(object): def setUp(self): self.s = FakeClient() self.s.startService() self.staticdir = self.mktemp() self.clock = Clock() self.ws = webish.WebishServer(self.s, "0", staticdir=self.staticdir, clock=self.clock) self.ws.setServiceParent(self.s) self.webish_port = self.ws.getPortnum() self.webish_url = self.ws.getURL() assert self.webish_url.endswith("/") self.webish_url = self.webish_url[:-1] # these tests add their own / l = [ self.s.create_dirnode() for x in range(6) ] d = 
defer.DeferredList(l) def _then(res): self.public_root = res[0][1] assert interfaces.IDirectoryNode.providedBy(self.public_root), res self.public_url = "/uri/" + self.public_root.get_uri() self.private_root = res[1][1] foo = res[2][1] self._foo_node = foo self._foo_uri = foo.get_uri() self._foo_readonly_uri = foo.get_readonly_uri() self._foo_verifycap = foo.get_verify_cap().to_string() # NOTE: we ignore the deferred on all set_uri() calls, because we # know the fake nodes do these synchronously self.public_root.set_uri(u"foo", foo.get_uri(), foo.get_readonly_uri()) self.BAR_CONTENTS, n, self._bar_txt_uri = self.makefile(0) foo.set_uri(u"bar.txt", self._bar_txt_uri, self._bar_txt_uri) self._bar_txt_verifycap = n.get_verify_cap().to_string() # sdmf # XXX: Do we ever use this? self.BAZ_CONTENTS, n, self._baz_txt_uri, self._baz_txt_readonly_uri = self.makefile_mutable(0) foo.set_uri(u"baz.txt", self._baz_txt_uri, self._baz_txt_readonly_uri) # mdmf self.QUUX_CONTENTS, n, self._quux_txt_uri, self._quux_txt_readonly_uri = self.makefile_mutable(0, mdmf=True) assert self._quux_txt_uri.startswith("URI:MDMF") foo.set_uri(u"quux.txt", self._quux_txt_uri, self._quux_txt_readonly_uri) foo.set_uri(u"empty", res[3][1].get_uri(), res[3][1].get_readonly_uri()) sub_uri = res[4][1].get_uri() self._sub_uri = sub_uri foo.set_uri(u"sub", sub_uri, sub_uri) sub = self.s.create_node_from_uri(sub_uri) self._sub_node = sub _ign, n, blocking_uri = self.makefile(1) foo.set_uri(u"blockingfile", blocking_uri, blocking_uri) # filenode to test for html encoding issues self._htmlname_unicode = u"<&weirdly'named\"file>>>_