author    | biao716.wang <biao716.wang@samsung.com> | 2020-08-26 11:34:58 +0900
committer | biao716.wang <biao716.wang@samsung.com> | 2020-08-26 11:34:58 +0900
commit    | a5651e772478cb72e7fd02e1d60bbe7a509a4d50 (patch)
tree      | dd6921b15d6285d79e2e42dcbbd132ac066b19d1
parent    | 136d1e028cec5dcddc6ef6ac7302c794fda5f135 (diff)
download  | python-urlgrabber-debian/4.1.0.tar.gz, python-urlgrabber-debian/4.1.0.tar.bz2, python-urlgrabber-debian/4.1.0.zip
Port to Python3 (tag: debian/4.1.0)
Change-Id: I46b8f71dce3d1f009617aa6e969414ad0c6393a6
Signed-off-by: biao716.wang <biao716.wang@samsung.com>
| Mode [old mode] | File | Lines changed |
|---|---|---|
| -rwxr-xr-x [-rw-r--r--] | ChangeLog | 37 |
| -rwxr-xr-x [-rw-r--r--] | LICENSE | 0 |
| -rwxr-xr-x | MANIFEST.in | 9 |
| -rwxr-xr-x [-rw-r--r--] | PKG-INFO | 63 |
| -rwxr-xr-x [-rw-r--r--] | README | 2 |
| -rwxr-xr-x [-rw-r--r--] | TODO | 0 |
| -rw-r--r-- | debian/changelog | 6 |
| -rw-r--r-- | debian/control | 8 |
| -rwxr-xr-x | debian/rules | 4 |
| -rwxr-xr-x [-rw-r--r--] | makefile | 14 |
| -rw-r--r-- | packaging/python-urlgrabber.changes | 206 |
| -rw-r--r-- | packaging/python-urlgrabber.spec | 9 |
| -rwxr-xr-x [-rw-r--r--] | scripts/urlgrabber | 75 |
| -rwxr-xr-x | scripts/urlgrabber-ext-down | 82 |
| -rwxr-xr-x | setup.cfg | 4 |
| -rwxr-xr-x [-rw-r--r--] | setup.py | 99 |
| -rwxr-xr-x [-rw-r--r--] | test/base_test_code.py | 7 |
| -rwxr-xr-x [-rw-r--r--] | test/grabberperf.py | 40 |
| -rwxr-xr-x [-rw-r--r--] | test/munittest.py | 140 |
| -rwxr-xr-x [-rw-r--r--] | test/runtests.py | 20 |
| -rwxr-xr-x [-rw-r--r--] | test/test_byterange.py | 103 |
| -rwxr-xr-x [-rw-r--r--] | test/test_grabber.py | 327 |
| -rwxr-xr-x [-rw-r--r--] | test/test_mirror.py | 207 |
| -rwxr-xr-x [-rw-r--r--] | test/threading/batchgrabber.py | 42 |
| -rwxr-xr-x | urlgrabber.egg-info/PKG-INFO | 31 |
| -rwxr-xr-x | urlgrabber.egg-info/SOURCES.txt | 27 |
| -rwxr-xr-x | urlgrabber.egg-info/dependency_links.txt | 1 |
| -rwxr-xr-x | urlgrabber.egg-info/requires.txt | 3 |
| -rwxr-xr-x | urlgrabber.egg-info/top_level.txt | 1 |
| -rwxr-xr-x [-rw-r--r--] | urlgrabber/__init__.py | 44 |
| -rwxr-xr-x [-rw-r--r--] | urlgrabber/byterange.py | 201 |
| -rwxr-xr-x [-rw-r--r--] | urlgrabber/grabber.py | 1701 |
| -rwxr-xr-x [-rw-r--r--] | urlgrabber/mirror.py | 132 |
| -rwxr-xr-x [-rw-r--r--] | urlgrabber/progress.py | 341 |
34 files changed, 2841 insertions, 1145 deletions
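Most of the diff below applies the same handful of Python 2-to-3 compatibility patterns across the scripts and tests: `print` statements become `print()` calls under `from __future__ import print_function`, `except E, e` becomes `except E as e`, and the private `logging._levelNames` lookup in `scripts/urlgrabber` is replaced with `logging.getLevelName()`. The following is a minimal, illustrative sketch of those patterns only; it is not code taken verbatim from the patch, and the helper name `resolve_level` is invented for the example.

```python
from __future__ import print_function  # makes print() behave the same on Python 2.6+ and 3

import logging
import sys


def resolve_level(spec):
    # Python 3 removed logging._levelNames; getLevelName() maps a name
    # such as "DEBUG" to its numeric value when one is registered.
    if sys.version_info[0] == 2:
        level = logging._levelNames.get(spec)
    else:
        level = logging.getLevelName(spec)
    if not isinstance(level, int):
        level = int(spec)  # fall back to a numeric spec such as "10"
    return level


try:
    print("debug level:", resolve_level("DEBUG"))
except ValueError as e:  # Python 3 syntax; Python 2 code wrote "except ValueError, e"
    print("bad level spec:", e)
```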
diff --git a/ChangeLog b/ChangeLog index 644fbdb..5a72bbf 100644..100755 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,40 @@ +2019-10-08 Neal Gompa <ngompa13@gmail.com> + + * Fix confused license header to clarify licensing + * Fix Python 3 compatibility with urlgrabber-ext-down + * Support HTTP CONNECT with reget. BZ 1585596 + * Fix for usage of _levelNames from logging module + * Fix issue when URLGRABBER_DEBUG is not an integer on Python 3 + * Revise setup.py to remove need for extra setup-time dependencies + * setuptools: Update Development Status to "Production/Stable" + * Bump version to 4.1.0 + +2019-02-25 Neal Gompa <ngompa13@gmail.com> + + * Port to Python 3 + * Add curl_obj option to grabber + * Throw an obvious error message when urlgrabber-ext-down + is missing when attempting to use external downloader + * Use setuptools for setup.py instead of distutils + * bump version to 4.0.0 + +2017-02-02 Valentina Mukhamedzhanova <vmukhame@redhat.com> + + * Add no_cache and retry_no_cache options. + * Work around pycurl dependency in setup.py. + * Don't set speed=0 on a new mirror that 404'd. + * Add a comprehensive error message to pycurl error 77. + * Don't crash on timedhosts parsing error. + * bump version to 3.10.2 + +2013-10-09 Zdenek Pavlas <zpavlas@redhat.com> + + * lots of enahncements and bugfixes + (parallel downloading, mirror profiling, new options) + * updated authors, url + * updated unit tests + * bump version to 3.10 + 2009-09-25 Seth Vidal <skvidal@fedoraproject.org> * urlgrabber/__init__.py: bump version to 3.9.1 diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100755 index 0000000..999fdee --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,9 @@ +recursive-include urlgrabber *.py +recursive-include test *.py +include scripts/urlgrabber +include README +include LICENSE +include TODO +include ChangeLog +include MANIFEST +include makefile
\ No newline at end of file @@ -1,48 +1,31 @@ -Metadata-Version: 1.0 +Metadata-Version: 1.2 Name: urlgrabber -Version: 3.9.1 +Version: 4.1.0 Summary: A high-level cross-protocol url-grabber -Home-page: http://linux.duke.edu/projects/urlgrabber/ -Author: Michael D. Stenner, Ryan Tomayko -Author-email: mstenner@linux.duke.edu, skvidal@fedoraproject.org -License: LGPL -Description: A high-level cross-protocol url-grabber. - - Using urlgrabber, data can be fetched in three basic ways: - - urlgrab(url) copy the file to the local filesystem - urlopen(url) open the remote file and return a file object - (like urllib2.urlopen) - urlread(url) return the contents of the file as a string - - When using these functions (or methods), urlgrabber supports the - following features: - - * identical behavior for http://, ftp://, and file:// urls - * http keepalive - faster downloads of many files by using - only a single connection - * byte ranges - fetch only a portion of the file - * reget - for a urlgrab, resume a partial download - * progress meters - the ability to report download progress - automatically, even when using urlopen! - * throttling - restrict bandwidth usage - * retries - automatically retry a download if it fails. The - number of retries and failure types are configurable. - * authenticated server access for http and ftp - * proxy support - support for authenticated http and ftp proxies - * mirror groups - treat a list of mirrors as a single source, - automatically switching mirrors if there is a failure. - +Home-page: http://urlgrabber.baseurl.org/ +Author: Michael D. Stenner, Ryan Tomayko, Seth Vidal, Zdenek Pavlas +Author-email: mstenner@linux.duke.edu, rtomayko@naeblis.cx, skvidal@fedoraproject.org, zpavlas@redhat.com +Maintainer: Neal Gompa +Maintainer-email: ngompa@fedoraproject.org +License: LGPLv2+ +Description: UNKNOWN +Keywords: urlgrabber yum http ftp Platform: UNKNOWN -Classifier: Development Status :: 4 - Beta -Classifier: Environment :: Console -Classifier: Environment :: Web Environment +Classifier: Development Status :: 5 - Production/Stable Classifier: Intended Audience :: Developers Classifier: Intended Audience :: System Administrators -Classifier: License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL) -Classifier: Operating System :: POSIX -Classifier: Operating System :: POSIX :: Linux -Classifier: Programming Language :: Python Classifier: Topic :: Internet :: File Transfer Protocol (FTP) Classifier: Topic :: Internet :: WWW/HTTP Classifier: Topic :: Software Development :: Libraries :: Python Modules +Classifier: Environment :: Console +Classifier: Environment :: Web Environment +Classifier: License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+) +Classifier: Operating System :: POSIX +Classifier: Operating System :: POSIX :: Linux +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 2 +Classifier: Programming Language :: Python :: 2.6 +Classifier: Programming Language :: Python :: 2.7 +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.6 +Classifier: Programming Language :: Python :: 3.7 @@ -19,7 +19,7 @@ You can build rpms by running python setup.py bdist_rpm The rpms (both source and "binary") will be specific to the current -distrubution/version and may not be portable to others. This is +distribution/version and may not be portable to others. This is because they will be built for the currently installed python. 
keepalive.py and byterange.py are generic urllib2 extension modules and diff --git a/debian/changelog b/debian/changelog index 1e95e82..9c663ba 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +python-urlgrabber (4.1.0) unstable; urgency=low + + * Port to Python 3 + + -- Biao Wang <biao716.wang@samsung.com> Wed, 26 Aug 2020 10:09:30 +0800 + python-urlgrabber (3.9.1) unstable; urgency=low * add debian diff --git a/debian/control b/debian/control index a365ee5..6776271 100644 --- a/debian/control +++ b/debian/control @@ -2,15 +2,15 @@ Source: python-urlgrabber Section: devel Priority: extra Maintainer: Jong-Woo Chae <jonwoo.chae@samsung.com> -Build-Depends: debhelper (>= 7.0.15), python-dev, python-pycurl -Standards-Version: 3.9.1 +Build-Depends: debhelper (>= 7.0.15), python3-six, python3-pycurl, python3-setuptools, dh_python3 +Standards-Version: 4.1.0 Homepage: http://www.tizen.org Package: python-urlgrabber Architecture: all Depends: ${python:Depends}, - python-m2crypto | python3-m2crypto, - python-pycurl + python3-six, + python3-pycurl Description: image creator for Linux distributions The tool createrep is used to ppfarm test diff --git a/debian/rules b/debian/rules index 0fa0995..b77ab76 100755 --- a/debian/rules +++ b/debian/rules @@ -6,7 +6,7 @@ build: build-stamp build-stamp: dh_testdir - python setup.py build + python3 setup.py build clean: dh_testdir dh_testroot @@ -34,7 +34,7 @@ binary-indep: build install dh_strip dh_compress dh_fixperms - dh_python2 + dh_python3 dh_installdeb dh_shlibdeps dh_gencontrol @@ -10,14 +10,14 @@ CLEANFILES = MANIFEST *~ build dist export release daily reference nonexistent_f *.pyc urlgrabber/*.pyc scripts/*.pyc test/*.pyc test/nonexistent_file \ test/reference test/reference.part urlgrabber/*~ ############################################################################## -VERSION = $(shell $(PYTHON) -c 'import $(PY_MODULE); print $(PY_MODULE).__version__') -DATE = $(shell $(PYTHON) -c 'import $(PY_MODULE); print $(PY_MODULE).__date__') +VERSION = $(shell $(PYTHON) -c 'import $(PY_MODULE); print($(PY_MODULE).__version__)') +DATE = $(shell $(PYTHON) -c 'import $(PY_MODULE); print($(PY_MODULE).__date__)') SCM_TAG = release-$(shell echo $(VERSION) | sed -e 's/\./_/g') -PYTHON22 = $(shell /usr/bin/which python2.2 2>/dev/null) -PYTHON23 = $(shell /usr/bin/which python2.3 2>/dev/null) -PYTHON24 = $(shell /usr/bin/which python2.4 2>/dev/null) -PYTHON25 = $(shell /usr/bin/which python2.5 2>/dev/null) -TESTPYTHONS = $(PYTHON22) $(PYTHON23) $(PYTHON24) $(PYTHON25) +PYTHON26 = $(shell /usr/bin/which python2.6 2>/dev/null) +PYTHON27 = $(shell /usr/bin/which python2.7 2>/dev/null) +PYTHON36 = $(shell /usr/bin/which python3.6 2>/dev/null) +PYTHON37 = $(shell /usr/bin/which python3.7 2>/dev/null) +TESTPYTHONS = $(PYTHON26) $(PYTHON27) $(PYTHON36) $(PYTHON37) ############################################################################## default: diff --git a/packaging/python-urlgrabber.changes b/packaging/python-urlgrabber.changes index dc0d37f..c33ecab 100644 --- a/packaging/python-urlgrabber.changes +++ b/packaging/python-urlgrabber.changes @@ -1,4 +1,204 @@ -* Wed Mar 06 2013 Patrick McCarty <patrick.mccarty@linux.intel.com> upstream/3.9.1@c5b017c -- Add packaging -- Imported Upstream version 3.9.1 +------------------------------------------------------------------- +Wed May 20 07:47:54 UTC 2020 - pgajdos@suse.com + +- urlgrabber-ext-down as an alternative + +------------------------------------------------------------------- +Mon May 18 
09:10:43 UTC 2020 - Petr Gajdos <pgajdos@suse.com> + +- %python3_only -> %python_alternative +- urlgrabber-ext-down is expected to reside under /usr/libexec + +------------------------------------------------------------------- +Wed Oct 9 07:16:52 UTC 2019 - Tomáš Chvátal <tchvatal@suse.com> + +- Update to 4.1.0: + * Fix confused license header to clarify licensing + * Fix Python 3 compatibility with urlgrabber-ext-down + * Support HTTP CONNECT with reget. BZ 1585596 + * Fix for usage of _levelNames from logging module + * Fix issue when URLGRABBER_DEBUG is not an integer on Python 3 + * Revise setup.py to remove need for extra setup-time dependencies + * setuptools: Update Development Status to Production/Stable + +------------------------------------------------------------------- +Wed Feb 27 09:39:32 UTC 2019 - Tomáš Chvátal <tchvatal@suse.com> + +- Drop patch grabber_fix.diff that was never upstreamed. Should + not be needed anymore + +------------------------------------------------------------------- +Mon Feb 25 17:44:43 CET 2019 - Matej Cepl <mcepl@suse.com> + +- Update to the upstream version 4.0.0: + * Port to Python 3 rocket + * Add curl_obj option to grabber + * Throw an obvious error message when urlgrabber-ext-down is + missing when attempting to use external downloader + * Use setuptools for setup.py instead of distutils +- Remove merged patches: + * declare-dollar-sign-as-safe-in-urlquote.patch + * python-urlgrabber-3.9.1-set-SSL_VERIFYHOST-correct.dif + * python-urlgrabber-3.9.1-preserve-queryparams-in-urls.patch + +------------------------------------------------------------------- +Tue Dec 4 12:55:41 UTC 2018 - Matej Cepl <mcepl@suse.com> + +- Remove superfluous devel dependency for noarch package + +------------------------------------------------------------------- +Tue May 29 14:23:52 UTC 2018 - mcepl@suse.com + +- Clean SPEC file + +------------------------------------------------------------------- +Thu Aug 24 13:56:44 UTC 2017 - jmatejek@suse.com + +- singlespec auto-conversion + +------------------------------------------------------------------- +Thu Feb 12 13:42:05 CET 2015 - mc@suse.de + +- declare $ sign as a safe character in url paths to prevent + escaping /$RCE/ which lead into problems with token auth + (bnc#902416) + * declare-dollar-sign-as-safe-in-urlquote.patch + * python-urlgrabber-3.9.1-set-SSL_VERIFYHOST-correct.dif +- set curl option SSL_VERIFYHOST correct + +------------------------------------------------------------------- +Tue Sep 16 12:38:07 UTC 2014 - dmacvicar@suse.de + +- Add python-urlgrabber-3.9.1-preserve-queryparams-in-urls.patch + (bnc#896844) + +------------------------------------------------------------------- +Wed Feb 6 18:06:41 UTC 2013 - jmatejek@suse.com + +- Add grabber_fix.diff: Fixed timeout and other errors breaking yum + compatibility (bnc#793650) + +------------------------------------------------------------------- +Mon Oct 1 09:53:26 UTC 2012 - saschpe@suse.de + +- Fixed wrong license header in urlgrabber/__init__.py (bnc#781323) +- Updated upstream URL, the project moved the baseurl.org (yum) + +------------------------------------------------------------------- +Tue Sep 20 11:40:05 UTC 2011 - saschpe@suse.de + +- Update to version 3.9.1: + * cleanup all the old urlgrabber urllib code that's not being used + * delete sslfactory and keepalive fix up the unittests to match existing code + * make sure the value we get back from the parse150 and other calls is + converted to an int before we make it 'size' rhbug: #524705 +- 
Spec file updates: + * Removed authors from description + * Dropped useless python-urlgrabber-2.9.9.patch + * Dropped obsolete python-urlgrabber-3.1.0.patch (upstream changed) + * Require python-pycurl + +------------------------------------------------------------------- +Wed Aug 12 20:10:37 CEST 2009 - matejcik@suse.cz + +- build as noarch on newer distros +- switched filelist to --record-rpm + +------------------------------------------------------------------- +Mon Aug 10 14:06:55 CEST 2009 - coolo@novell.com + +- sync factory and build service + +------------------------------------------------------------------- +Sun Aug 9 08:45:09 CEST 2009 - coolo@novell.com + +- use new python macros + +------------------------------------------------------------------- +Tue May 12 14:14:04 CEST 2009 - poeml@suse.de + +- fix build on 11.1 onwards, where python must be in the + buildrequires in addition to python-devel, because otherwise + urllib2 appears to have no SSL support + +------------------------------------------------------------------- +Fri Sep 26 11:11:32 CEST 2008 - cthiel@suse.de + +- add python to BuildRequires to fix build + +------------------------------------------------------------------- +Tue Feb 19 16:01:09 CET 2008 - cthiel@suse.de + +- fix url parsing error in grabber.py (bnc #362937) + +------------------------------------------------------------------- +Mon Oct 2 13:53:16 CEST 2006 - cthiel@suse.de + +- fix build on older distributions + +------------------------------------------------------------------- +Sun Oct 1 15:34:10 CEST 2006 - cthiel@suse.de + +- update to version 3.1.0 + * various fixes + +------------------------------------------------------------------- +Thu Sep 21 14:26:46 CEST 2006 - cthiel@suse.de + +- fix build with python 2.5 + +------------------------------------------------------------------- +Fri Aug 4 17:25:18 CEST 2006 - cthiel@suse.de + +- update to version 2.9.10 + * Make keepalive, byteranges, etc. work with https. + * Fixed a minor error reporting bug due to changes in python 2.4. + * Catch read errors after the file has been opened. +- removed obsolete urlgrabber-read-error.patch + +------------------------------------------------------------------- +Thu May 25 14:19:34 CEST 2006 - cthiel@suse.de + +- update to version 2.9.9 + * Added tests to make sure that the "quote" option works as advertised + * Significant improvement to URL parsing. Parsing is now broken out into + a separate class (URLParser). It will now (by default) guess whether a + URL is already quoted, properly handle local files and URLs on windows, + and display un-quoted versions of the filename in the progress meter. + * Added a reget progress bar patch from Menno, and fixed the annoying next + _IndexError bug. 
+- added urlgrabber-read-error.patch (from Fedora) +- removed python-urlgrabber-2.9.7-reget.patch (included upstream) + +------------------------------------------------------------------- +Tue Feb 28 16:46:03 CET 2006 - jmatejek@suse.cz + +- updated to reflect python changes due to #149809 + +------------------------------------------------------------------- +Wed Jan 25 21:40:54 CET 2006 - mls@suse.de + +- converted neededforbuild to BuildRequires + +------------------------------------------------------------------- +Mon Oct 31 11:56:02 CET 2005 - dmueller@suse.de + +- don't build as root + +------------------------------------------------------------------- +Wed Oct 26 13:14:27 CEST 2005 - cthiel@suse.de + +- update to version 2.9.7 + +------------------------------------------------------------------- +Tue Sep 13 11:01:26 CEST 2005 - cthiel@suse.de + +- specfile cleanup + +------------------------------------------------------------------- +Sun Aug 14 02:04:14 CEST 2005 - cthiel@suse.de + +- initial package (version 2.9.6) + + diff --git a/packaging/python-urlgrabber.spec b/packaging/python-urlgrabber.spec index 6113fb5..58e8755 100644 --- a/packaging/python-urlgrabber.spec +++ b/packaging/python-urlgrabber.spec @@ -1,6 +1,6 @@ Name: python-urlgrabber Summary: A high-level cross-protocol url-grabber -Version: 3.9.1 +Version: 4.1.0 Release: 0 Group: Development/Libraries License: LGPL-2.1+ @@ -8,10 +8,11 @@ BuildArch: noarch URL: http://urlgrabber.baseurl.org/ Source0: urlgrabber-%{version}.tar.gz Source1001: python-urlgrabber.manifest -BuildRequires: python-devel -BuildRequires: python-pycurl -Requires: python-M2Crypto +BuildRequires: python3-pycurl +BuildRequires: python3-six +BuildRequires: python3-setuptools Requires: python-pycurl +Requires: python-six Provides: urlgrabber = %{version}-%{release} %description diff --git a/scripts/urlgrabber b/scripts/urlgrabber index 518e512..1b1e077 100644..100755 --- a/scripts/urlgrabber +++ b/scripts/urlgrabber @@ -19,6 +19,8 @@ # This file is part of urlgrabber, a high-level cross-protocol url-grabber # Copyright 2002-2006 Michael D. Stenner, Ryan Tomayko +from __future__ import print_function + """NAME urlgrabber - a simple client for the urlgrabber python package @@ -115,6 +117,7 @@ options: including quotes in the case of strings. e.g. 
--user_agent='"foobar/2.0"' + --output FILE -o FILE write output to FILE, otherwise the basename of the url will be used -O print the names of saved files to STDOUT @@ -130,8 +133,6 @@ options: --profile profile the actual fetching and print the results """ -# $Id: urlgrabber,v 1.7 2006/12/08 00:14:16 mstenner Exp $ - import sys import getopt import re @@ -170,12 +171,17 @@ class client_options: return ug_options, ug_defaults def process_command_line(self): - short_options = 'vd:hoOpD' + short_options = 'vd:ho:OpD' long_options = ['profile', 'repeat=', 'verbose=', - 'debug=', 'help', 'progress'] + 'debug=', 'help', 'progress', 'output='] ug_long = [ o + '=' for o in self.ug_options ] - optlist, args = getopt.getopt(sys.argv[1:], short_options, - long_options + ug_long) + try: + optlist, args = getopt.getopt(sys.argv[1:], short_options, + long_options + ug_long) + except getopt.GetoptError as e: + print("Error:", e, file=sys.stderr) + self.help([], ret=1) + self.verbose = 0 self.debug = None self.outputfile = None @@ -193,6 +199,7 @@ class client_options: if o == '--verbose': self.verbose = v if o == '-v': self.verbose += 1 if o == '-o': self.outputfile = v + if o == '--output': self.outputfile = v if o == '-p' or o == '--progress': self.progress = 1 if o == '-d' or o == '--debug': self.debug = v if o == '--profile': self.profile = 1 @@ -202,7 +209,7 @@ class client_options: self.repeat = int(v) if self.repeat < 1: raise ValueError() except ValueError: - print 'ERROR: repeat value must be an int >= 1' + print('ERROR: repeat value must be an int >= 1') sys.exit(1) if o == '-D': self.verbose = 3 @@ -211,20 +218,20 @@ class client_options: if o in ug_dash: try: val = eval(v) - except Exception, e: - print "error processing option value: %s" % v - print e + except Exception as e: + print("error processing option value: %s" % v) + print(e) sys.exit(1) else: self.ugops[o[2:]] = val if len(self.args) > 1 and self.outputfile is not None: - print "ERROR: cannot use -o when grabbing multiple files" + print("ERROR: cannot use -o when grabbing multiple files") sys.exit(1) - def help(self, args): + def help(self, args, ret=0): if not args: - print MAINHELP + print(MAINHELP) else: for a in args: m = getattr(self, 'help_'+a, None) @@ -233,20 +240,20 @@ class client_options: elif a in self.ug_options: self.help_ug_option(a) else: - print 'ERROR: no help on command "%s"' % a - sys.exit(0) + print('ERROR: no help on command "%s"' % a) + sys.exit(ret) def help_doc(self): - print __doc__ + print(__doc__) def help_options(self): width = max(map(len, self.ug_options)) format = ' %-' + str(width) + 's = %s' hformat = ' %-' + str(width) + 's %s' - print hformat % ('OPTION', 'DEFAULT') - print '-'*(width + 20) + print(hformat % ('OPTION', 'DEFAULT')) + print('-'*(width + 20)) for k in self.ug_options: - print format % (k, self.ug_defaults[k]) + print(format % (k, self.ug_defaults[k])) def help_all(self): for k in self.ug_options: @@ -257,21 +264,21 @@ class client_options: m = re.search(r'^( '+option+'.*?)\s*^ {,2}\S', urlgrabber.grabber.__doc__, re.M|re.S) if m: - print m.group(1) + print(m.group(1)) else: - print ' %s: no help found for this option' % option - print '' + print(' %s: no help found for this option' % option) + print('') class ugclient: def __init__(self): op = client_options() self.op = op if op.verbose >= 2 and op.ugops: - print "Module Options:" + print("Module Options:") width = max(map(len, op.ugops.keys())) format = " %-" + str(width) + "s = %s" for k, v in op.ugops.items(): - print format % 
(k, repr(v)) + print(format % (k, repr(v))) if op.debug: self.set_debug_logger(op.debug) @@ -287,22 +294,26 @@ class ugclient: def run(self): for url in self.op.args: - if self.op.verbose: print 'grabbing: %s' % url + if self.op.verbose: print('grabbing: %s' % url) try: for i in range(0, self.op.repeat): f = self.g.urlgrab(url, self.op.outputfile) - if self.op.localfile: print f - except URLGrabError, e: - print e - + if self.op.localfile: print(f) + except URLGrabError as e: + print(e) + sys.exit(1) + def set_debug_logger(self, dbspec): try: dbinfo = dbspec.split(',') import logging - level = logging._levelNames.get(dbinfo[0], None) - if level is None: level = int(dbinfo[0]) + if sys.version_info.major == 2: + level = logging._levelNames.get(dbinfo[0], None) + else: + level = logging.getLevelName(dbinfo[0]) + if level is None or not isinstance(level, int): level = int(dbinfo[0]) if level < 1: raise ValueError() - + formatter = logging.Formatter('%(asctime)s %(message)s') if len(dbinfo) > 1: filename = dbinfo[1] else: filename = '' diff --git a/scripts/urlgrabber-ext-down b/scripts/urlgrabber-ext-down new file mode 100755 index 0000000..40469a7 --- /dev/null +++ b/scripts/urlgrabber-ext-down @@ -0,0 +1,82 @@ +#! /usr/bin/python +# A very simple external downloader +# Copyright 2011-2012 Zdenek Pavlas + +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, +# Boston, MA 02111-1307 USA + +import time, os, errno, sys +import six +from urlgrabber.grabber import \ + _readlines, URLGrabberOptions, _loads, \ + PyCurlFileObject, URLGrabError, _to_utf8 + +def write(fmt, *arg): + buf = fmt % arg + if six.PY3: + buf = buf.encode() + try: + os.write(1, buf) + except OSError as e: + if e.args[0] != errno.EPIPE: raise + sys.exit(1) + +class ProxyProgress: + def start(self, *d1, **d2): + self.next_update = 0 + def update(self, _amount_read): + t = time.time() + if t < self.next_update: return + self.next_update = t + 0.31 + write('%d %d\n', self._id, _amount_read) + +def main(): + import signal + signal.signal(signal.SIGINT, lambda n, f: sys.exit(1)) + cnt = 0 + while True: + lines = _readlines(0) + if not lines: break + for line in lines: + if not isinstance(line, six.string_types): + line = line.decode('utf-8') + cnt += 1 + opts = URLGrabberOptions() + opts._id = cnt + for k in line.split(' '): + k, v = k.split('=', 1) + setattr(opts, k, _loads(v)) + if opts.progress_obj: + opts.progress_obj = ProxyProgress() + opts.progress_obj._id = cnt + + dlsz = dltm = 0 + try: + fo = PyCurlFileObject(_to_utf8(opts.url), opts.filename, opts) + fo._do_grab() + fo.fo.close() + size = fo._amount_read + if fo._tm_last: + dlsz = fo._tm_last[0] - fo._tm_first[0] + dltm = fo._tm_last[1] - fo._tm_first[1] + ug_err = 'OK' + except URLGrabError as e: + size = 0 + ug_err = '%d %d %s' % (e.errno, getattr(e, 'code', 0), e.strerror) + write('%d %d %d %.3f %s\n', opts._id, size, dlsz, dltm, ug_err) + +if __name__ == '__main__': + main() diff --git a/setup.cfg b/setup.cfg new file mode 100755 index 0000000..8bfd5a1 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,4 @@ +[egg_info] +tag_build = +tag_date = 0 + @@ -1,45 +1,58 @@ -# urlgrabber distutils setup -import re as _re -import urlgrabber as _urlgrabber +from setuptools import setup -name = "urlgrabber" -description = "A high-level cross-protocol url-grabber" -long_description = _urlgrabber.__doc__ -license = "LGPL" -version = _urlgrabber.__version__ -_authors = _re.split(r',\s+', _urlgrabber.__author__) -author = ', '.join([_re.sub(r'\s+<.*', r'', _) for _ in _authors]) -author_email = ', '.join([_re.sub(r'(^.*<)|(>.*$)', r'', _) for _ in _authors]) -url = _urlgrabber.__url__ +pkg_name = "urlgrabber" +pkg_version = "4.1.0" -packages = ['urlgrabber'] -package_dir = {'urlgrabber':'urlgrabber'} -scripts = ['scripts/urlgrabber'] -data_files = [('share/doc/' + name + '-' + version, - ['README','LICENSE', 'TODO', 'ChangeLog'])] -options = { 'clean' : { 'all' : 1 } } -classifiers = [ - 'Development Status :: 4 - Beta', - 'Environment :: Console', - 'Environment :: Web Environment', - 'Intended Audience :: Developers', - 'Intended Audience :: System Administrators', - 'License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)', - 'Operating System :: POSIX', - 'Operating System :: POSIX :: Linux', - 'Programming Language :: Python', - 'Topic :: Internet :: File Transfer Protocol (FTP)', - 'Topic :: Internet :: WWW/HTTP', - 'Topic :: Software Development :: Libraries :: Python Modules' - ] - -# load up distutils -if __name__ == '__main__': - config = globals().copy() - keys = config.keys() - for k in keys: - #print '%-20s -> %s' % (k, config[k]) - if k.startswith('_'): del config[k] - - from distutils.core import setup - 
setup(**config) +setup( + name=pkg_name, + version=pkg_version, + license="LGPLv2+", + description="A high-level cross-protocol url-grabber", + keywords="urlgrabber yum http ftp", + # From https://pypi.python.org/pypi?%3Aaction=list_classifiers + classifiers=[ + # Development status + "Development Status :: 5 - Production/Stable", + # Target audience + "Intended Audience :: Developers", + "Intended Audience :: System Administrators", + # Type of software + "Topic :: Internet :: File Transfer Protocol (FTP)", + "Topic :: Internet :: WWW/HTTP", + "Topic :: Software Development :: Libraries :: Python Modules", + # Kind of software + "Environment :: Console", + "Environment :: Web Environment", + # License (must match license field) + "License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)", + # Operating systems supported + "Operating System :: POSIX", + "Operating System :: POSIX :: Linux", + # Supported Python versions + "Programming Language :: Python", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 2.6", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + ], + url="http://urlgrabber.baseurl.org/", + author="Michael D. Stenner, Ryan Tomayko, Seth Vidal, Zdenek Pavlas", + author_email="mstenner@linux.duke.edu, rtomayko@naeblis.cx, skvidal@fedoraproject.org, zpavlas@redhat.com", + maintainer="Neal Gompa", + maintainer_email="ngompa@fedoraproject.org", + packages=["urlgrabber"], + package_dir = {'urlgrabber':'urlgrabber'}, + include_package_data=True, + install_requires=[ + "pycurl", + "six", + "setuptools", + ], + scripts = ['scripts/urlgrabber'], + data_files = [ + ('share/doc/' + pkg_name + '-' + pkg_version, ['README','LICENSE', 'TODO', 'ChangeLog']), + ('libexec', ['scripts/urlgrabber-ext-down']), + ], +) diff --git a/test/base_test_code.py b/test/base_test_code.py index 50c6348..97901be 100644..100755 --- a/test/base_test_code.py +++ b/test/base_test_code.py @@ -1,17 +1,18 @@ from munittest import * -base_http = 'http://www.linux.duke.edu/projects/urlgrabber/test/' +#base_http = 'http://urlgrabber.baseurl.org/test/' +base_http = 'http://in.waw.pl/urlgrabber/test/' base_ftp = 'ftp://localhost/test/' # set to a proftp server only. we're working around a couple of # bugs in their implementation in byterange.py. base_proftp = 'ftp://localhost/test/' -reference_data = ''.join( [str(i)+'\n' for i in range(20000) ] ) +reference_data = ''.join(str(i) + '\n' for i in range(20000)).encode('utf8') ref_http = base_http + 'reference' ref_ftp = base_ftp + 'reference' ref_proftp = base_proftp + 'reference' -short_reference_data = ' '.join( [str(i) for i in range(10) ] ) +short_reference_data = ' '.join(str(i) for i in range(10)).encode('utf8') short_ref_http = base_http + 'short_reference' short_ref_ftp = base_ftp + 'short_reference' diff --git a/test/grabberperf.py b/test/grabberperf.py index 820da2c..31771ae 100644..100755 --- a/test/grabberperf.py +++ b/test/grabberperf.py @@ -11,14 +11,16 @@ # Lesser General Public License for more details. 
# # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko +from __future__ import print_function + import sys import os from os.path import dirname, join as joinpath @@ -46,16 +48,16 @@ def main(): # remove temp files os.unlink(tempsrc) os.unlink(tempdst) - + def setuptemp(size): - if DEBUG: print 'writing %d KB to temporary file (%s).' % (size / 1024, tempsrc) + if DEBUG: print('writing %d KB to temporary file (%s).' % (size / 1024, tempsrc)) file = open(tempsrc, 'w', 1024) chars = '0123456789' for i in range(size): file.write(chars[i % 10]) file.flush() file.close() - + def speedtest(size): setuptemp(size) full_times = [] @@ -65,12 +67,12 @@ def speedtest(size): try: from urlgrabber.progress import text_progress_meter - except ImportError, e: + except ImportError as e: tpm = None - print 'not using progress meter' + print('not using progress meter') else: tpm = text_progress_meter(fo=open('/dev/null', 'w')) - + # to address concerns that the overhead from the progress meter # and throttling slow things down, we do this little test. # @@ -81,17 +83,17 @@ def speedtest(size): # note: it _is_ even slower to direct the progress meter to a real # tty or file, but I'm just interested in the overhead from _this_ # module. - + # get it nicely cached before we start comparing - if DEBUG: print 'pre-caching' + if DEBUG: print('pre-caching') for i in range(100): urlgrab(tempsrc, tempdst, copy_local=1, throttle=None, proxies=proxies) - - if DEBUG: print 'running speed test.' + + if DEBUG: print('running speed test.') reps = 500 for i in range(reps): - if DEBUG: - print '\r%4i/%-4i' % (i+1, reps), + if DEBUG: + print('\r%4i/%-4i' % (i+1, reps), end=' ') sys.stdout.flush() t = time.time() urlgrab(tempsrc, tempdst, @@ -108,7 +110,7 @@ def speedtest(size): t = time.time() in_fo = open(tempsrc) out_fo = open(tempdst, 'wb') - while 1: + while True: s = in_fo.read(1024 * 8) if not s: break out_fo.write(s) @@ -116,9 +118,9 @@ def speedtest(size): out_fo.close() none_times.append(1000 * (time.time() - t)) - if DEBUG: print '\r' + if DEBUG: print('\r') - print "%d KB Results:" % (size / 1024) + print("%d KB Results:" % (size / 1024)) print_result('full', full_times) print_result('raw', raw_times) print_result('none', none_times) @@ -131,7 +133,7 @@ def print_result(label, result_list): for i in result_list: mean += i mean = mean/len(result_list) median = result_list[int(len(result_list)/2)] - print format % (label, mean, median, result_list[0], result_list[-1]) + print(format % (label, mean, median, result_list[0], result_list[-1])) if __name__ == '__main__': main() diff --git a/test/munittest.py b/test/munittest.py index 96230b8..5fdf6f6 100644..100755 --- a/test/munittest.py +++ b/test/munittest.py @@ -1,4 +1,7 @@ #!/usr/bin/env python + +from __future__ import print_function + """ This is a modified version of the unittest module has been modified by Michael D. Stenner from Steve Purcell's version (revision 1.46, as @@ -98,14 +101,20 @@ AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
""" -# $Id: munittest.py,v 1.2 2004/03/31 01:27:24 mstenner Exp $ - import time import sys import traceback -import string import os import types +import unittest + +from six import class_types, string_types + +try: + cmp +except NameError: + def cmp(a, b): + return (a > b) - (a < b) ############################################################################## # Exported classes and functions @@ -113,7 +122,7 @@ import types __all__ = ['TestResult', 'TestCase', 'TestSuite', 'TextTestRunner', 'TestLoader', 'FunctionTestCase', 'main', 'defaultTestLoader'] -# Expose obsolete functions for backwards compatability +# Expose obsolete functions for backwards compatibility __all__.extend(['getTestCaseNames', 'makeSuite', 'findTestCases']) @@ -190,7 +199,7 @@ class TestResult: def _exc_info_to_string(self, err): """Converts a sys.exc_info()-style tuple of values into a string.""" - return string.join(traceback.format_exception(*err), '') + return ''.join(traceback.format_exception(*err)) def __repr__(self): return "<%s run=%i errors=%i failures=%i>" % \ @@ -198,7 +207,7 @@ class TestResult: len(self.failures)) -class TestCase: +class TestCase(unittest.TestCase): """A class whose instances are single test cases. By default, the test code itself should be placed in a method named @@ -241,27 +250,6 @@ class TestCase: interrupt_skips = 0 - def __init__(self, methodName='runTest'): - """Create an instance of the class that will use the named test - method when executed. Raises a ValueError if the instance does - not have a method with the specified name. - """ - try: - self._testMethodName = methodName - testMethod = getattr(self, methodName) - self._testMethodDoc = testMethod.__doc__ - except AttributeError: - raise ValueError, "no such test method in %s: %s" % \ - (self.__class__, methodName) - - def setUp(self): - "Hook method for setting up the test fixture before exercising it." - pass - - def tearDown(self): - "Hook method for deconstructing the test fixture after testing it." - pass - def countTestCases(self): return 1 @@ -276,7 +264,7 @@ class TestCase: the specified test method's docstring. """ doc = self._testMethodDoc - return doc and string.strip(string.split(doc, "\n")[0]) or None + return doc and doc.split('\n')[0].strip() or None def id(self): return "%s.%s" % (_strclass(self.__class__), self._testMethodName) @@ -288,9 +276,6 @@ class TestCase: return "<%s testMethod=%s>" % \ (_strclass(self.__class__), self._testMethodName) - def run(self, result=None): - return self(result) - def __call__(self, result=None): if result is None: result = self.defaultTestResult() result.startTest(self) @@ -361,15 +346,15 @@ class TestCase: def fail(self, msg=None): """Fail immediately, with the given message.""" - raise self.failureException, msg + raise self.failureException(msg) def failIf(self, expr, msg=None): "Fail the test if the expression is true." 
- if expr: raise self.failureException, msg + if expr: raise self.failureException(msg) def failUnless(self, expr, msg=None): """Fail the test unless the expression is true.""" - if not expr: raise self.failureException, msg + if not expr: raise self.failureException(msg) def failUnlessRaises(self, excClass, callableObj, *args, **kwargs): """Fail unless an exception of class excClass is thrown @@ -386,23 +371,21 @@ class TestCase: else: if hasattr(excClass,'__name__'): excName = excClass.__name__ else: excName = str(excClass) - raise self.failureException, excName + raise self.failureException(excName) def failUnlessEqual(self, first, second, msg=None): """Fail if the two objects are unequal as determined by the '==' operator. """ if not first == second: - raise self.failureException, \ - (msg or '%s != %s' % (`first`, `second`)) + raise self.failureException(msg or '%r != %r' % (first, second)) def failIfEqual(self, first, second, msg=None): """Fail if the two objects are equal as determined by the '==' operator. """ if first == second: - raise self.failureException, \ - (msg or '%s == %s' % (`first`, `second`)) + raise self.failureException(msg or '%r == %r' % (first, second)) def failUnlessAlmostEqual(self, first, second, places=7, msg=None): """Fail if the two objects are unequal as determined by their @@ -410,11 +393,10 @@ class TestCase: (default 7) and comparing to zero. Note that decimal places (from zero) is usually not the same - as significant digits (measured from the most signficant digit). + as significant digits (measured from the most significant digit). """ if round(second-first, places) != 0: - raise self.failureException, \ - (msg or '%s != %s within %s places' % (`first`, `second`, `places` )) + raise self.failureException(msg or '%r != %r within %s places' % (first, second, places)) def failIfAlmostEqual(self, first, second, places=7, msg=None): """Fail if the two objects are equal as determined by their @@ -422,11 +404,10 @@ class TestCase: (default 7) and comparing to zero. Note that decimal places (from zero) is usually not the same - as significant digits (measured from the most signficant digit). + as significant digits (measured from the most significant digit). """ if round(second-first, places) == 0: - raise self.failureException, \ - (msg or '%s == %s within %s places' % (`first`, `second`, `places`)) + raise self.failureException(msg or '%r == %r within %r places' % (first, second, places)) assertEqual = assertEquals = failUnlessEqual @@ -442,15 +423,15 @@ class TestCase: def skip(self, msg=None): """Skip the test""" - raise self.skipException, msg + raise self.skipException(msg) def skipIf(self, expr, msg=None): "Skip the test if the expression is true." 
- if expr: raise self.skipException, msg + if expr: raise self.skipException(msg) def skipUnless(self, expr, msg=None): """Skip the test unless the expression is true.""" - if not expr: raise self.skipException, msg + if not expr: raise self.skipException(msg) @@ -467,12 +448,12 @@ class TestSuite: self._tests = [] self.addTests(tests) self.description = description or '(no description)' - + def __repr__(self): return "<%s tests=%s>" % (_strclass(self.__class__), self._tests) __str__ = __repr__ - + def shortDescription(self): return self.description @@ -498,7 +479,7 @@ class TestSuite: def __call__(self, result): try: result.startSuite(self) except AttributeError: pass - + for test in self._tests: if result.shouldStop: break @@ -554,8 +535,7 @@ class FunctionTestCase(TestCase): def shortDescription(self): if self._description is not None: return self._description doc = self._testFunc.__doc__ - return doc and string.strip(string.split(doc, "\n")[0]) or None - + return doc and doc.split('\n')[0].strip() or None ############################################################################## @@ -576,16 +556,16 @@ class TestLoader: instance_list = map(testCaseClass, name_list) description = getattr(testCaseClass, '__doc__') \ or testCaseClass.__name__ - description = (description.splitlines()[0]).strip() + description = description.splitlines()[0].strip() suite = self.suiteClass(instance_list, description) return suite - + def loadTestsFromModule(self, module): """Return a suite of all tests cases contained in the given module""" tests = [] for name in dir(module): obj = getattr(module, name) - if (isinstance(obj, (type, types.ClassType)) and + if (isinstance(obj, class_types) and issubclass(obj, TestCase) and not obj in [TestCase, FunctionTestCase]): tests.append(self.loadTestsFromTestCase(obj)) @@ -603,15 +583,15 @@ class TestLoader: The method optionally resolves the names relative to a given module. 
""" - parts = string.split(name, '.') + parts = name.split('.') if module is None: if not parts: - raise ValueError, "incomplete test name: %s" % name + raise ValueError("incomplete test name: %s" % name) else: parts_copy = parts[:] while parts_copy: try: - module = __import__(string.join(parts_copy,'.')) + module = __import__('.'.join(parts_copy)) break except ImportError: del parts_copy[-1] @@ -622,22 +602,20 @@ class TestLoader: obj = getattr(obj, part) import unittest - if type(obj) == types.ModuleType: + if isinstance(obj, types.ModuleType): return self.loadTestsFromModule(obj) - elif (isinstance(obj, (type, types.ClassType)) and + elif (isinstance(obj, class_types) and issubclass(obj, unittest.TestCase)): return self.loadTestsFromTestCase(obj) - elif type(obj) == types.UnboundMethodType: - return obj.im_class(obj.__name__) + elif isinstance(obj, types.UnboundMethodType): + return obj.__self__.__class__(obj.__name__) elif callable(obj): test = obj() - if not isinstance(test, unittest.TestCase) and \ - not isinstance(test, unittest.TestSuite): - raise ValueError, \ - "calling %s returned %s, not a test" % (obj,test) + if not isinstance(test, (unittest.TestCase, unittest.TestSuite)): + raise ValueError("calling %s returned %s, not a test" % (obj,test)) return test else: - raise ValueError, "don't know how to make test from: %s" % obj + raise ValueError("don't know how to make test from: %s" % obj) def loadTestsFromNames(self, names, module=None): """Return a suite of all tests cases found using the given sequence @@ -651,18 +629,16 @@ class TestLoader: def getTestCaseNames(self, testCaseClass): """Return a sorted sequence of method names found within testCaseClass """ - testFnNames = filter(lambda n,p=self.testMethodPrefix: n[:len(p)] == p, - dir(testCaseClass)) + testFnNames = [n for n in dir(testCaseClass) + if n.startswith(self.testMethodPrefix)] + for baseclass in testCaseClass.__bases__: for testFnName in self.getTestCaseNames(baseclass): if testFnName not in testFnNames: # handle overridden methods testFnNames.append(testFnName) - if self.sortTestMethodsUsing: - testFnNames.sort(self.sortTestMethodsUsing) + testFnNames.sort() return testFnNames - - defaultTestLoader = TestLoader() @@ -740,7 +716,7 @@ class _TextTestResult(TestResult): except AttributeError: desc = '(no description)' self.stream.writeln(desc) self.depth += 1 - + def startTest(self, test): TestResult.startTest(self, test) if self.showAll: @@ -825,8 +801,8 @@ class TextTestRunner: self.stream.writeln() if not result.wasSuccessful(): self.stream.write("FAILED (") - failed, errored, skipped = map(len, \ - (result.failures, result.errors, result.skipped)) + failed, errored, skipped = map(len, + (result.failures, result.errors, result.skipped)) if failed: self.stream.write("failures=%d" % failed) if errored: @@ -869,9 +845,9 @@ Examples: """ def __init__(self, module='__main__', defaultTest=None, argv=None, testRunner=None, testLoader=defaultTestLoader): - if type(module) == type(''): + if isinstance(module, string_types): self.module = __import__(module) - for part in string.split(module,'.')[1:]: + for part in module.split('.')[1:]: self.module = getattr(self.module, part) else: self.module = module @@ -886,8 +862,8 @@ Examples: self.runTests() def usageExit(self, msg=None): - if msg: print msg - print self.USAGE % self.__dict__ + if msg: print(msg) + print(self.USAGE % self.__dict__) sys.exit(2) def parseArgs(self, argv): @@ -910,7 +886,7 @@ Examples: else: self.testNames = (self.defaultTest,) self.createTests() - 
except getopt.error, msg: + except getopt.error as msg: self.usageExit(msg) def createTests(self): diff --git a/test/runtests.py b/test/runtests.py index c48bd1d..5aaac26 100644..100755 --- a/test/runtests.py +++ b/test/runtests.py @@ -1,20 +1,20 @@ #!/usr/bin/python +from __future__ import print_function + """Usage: python runtests.py [OPTIONS] -Quick script to run all unit tests from source directory +Quick script to run all unit tests from source directory (e.g. without having to install.) OPTIONS: - - -d, --descriptions=NUM Set to 0 to turn off printing + + -d, --descriptions=NUM Set to 0 to turn off printing test doc strings as descriptions. -v, --verbosity=NUM Output verbosity level. Defaults to - 2 which is one line of info per test. Set + 2 which is one line of info per test. Set to 1 to get one char of info per test or 0 to disable status output completely. """ - -# $Id: runtests.py,v 1.7 2004/03/31 17:02:00 mstenner Exp $ import sys from os.path import dirname, join as joinpath @@ -31,7 +31,7 @@ def main(): # it's okay to import now that sys.path is setup. import test_grabber, test_byterange, test_mirror suite = TestSuite( (test_grabber.suite(), - test_byterange.suite(), + test_byterange.suite(), test_mirror.suite()) ) suite.description = 'urlgrabber tests' runner = TextTestRunner(stream=sys.stdout, @@ -52,9 +52,9 @@ def parse_args(): elif o in ('-v', '--verbosity'): verbosity = int(a) return (descriptions,verbosity) - + def usage(): - print __doc__ - + print(__doc__) + if __name__ == '__main__': main() diff --git a/test/test_byterange.py b/test/test_byterange.py index 96f1573..dfed311 100644..100755 --- a/test/test_byterange.py +++ b/test/test_byterange.py @@ -11,98 +11,93 @@ # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko -"""byterange.py tests""" +from __future__ import print_function -# $Id: test_byterange.py,v 1.6 2004/03/31 17:02:00 mstenner Exp $ +"""byterange.py tests""" import sys -from StringIO import StringIO +if sys.version_info >= (3,): + # We do an explicit version check here because because python2 + # also has an io module with StringIO, but it is incompatible, + # and returns str instead of unicode somewhere. 
+ from io import StringIO +else: + from cStringIO import StringIO + from urlgrabber.byterange import RangeableFileObject from base_test_code import * class RangeableFileObjectTestCase(TestCase): """Test range.RangeableFileObject class""" - + def setUp(self): # 0 1 2 3 4 5 6 7 8 9 # 0123456789012345678901234567890123456789012345678901234567 890123456789012345678901234567890 self.test = 'Why cannot we write the entire 24 volumes of Encyclopaedia\nBrittanica on the head of a pin?\n' self.fo = StringIO(self.test) self.rfo = RangeableFileObject(self.fo, (20,69)) - + def tearDown(self): pass - + def test_seek(self): """RangeableFileObject.seek()""" self.rfo.seek(11) - self.assertEquals('24', self.rfo.read(2)) + self.assertEqual('24', self.rfo.read(2)) self.rfo.seek(14) - self.assertEquals('volumes', self.rfo.read(7)) + self.assertEqual('volumes', self.rfo.read(7)) self.rfo.seek(1,1) - self.assertEquals('of', self.rfo.read(2)) - - def test_poor_mans_seek(self): - """RangeableFileObject.seek() poor mans version.. - - We just delete the seek method from StringIO so we can - excercise RangeableFileObject when the file object supplied - doesn't support seek. - """ - seek = StringIO.seek - del(StringIO.seek) - self.test_seek() - StringIO.seek = seek - + self.assertEqual('of', self.rfo.read(2)) + def test_read(self): """RangeableFileObject.read()""" - self.assertEquals('the', self.rfo.read(3)) - self.assertEquals(' entire 24 volumes of ', self.rfo.read(22)) - self.assertEquals('Encyclopaedia\nBrittanica', self.rfo.read(50)) - self.assertEquals('', self.rfo.read()) - + self.assertEqual('the', self.rfo.read(3)) + self.assertEqual(' entire 24 volumes of ', self.rfo.read(22)) + self.assertEqual('Encyclopaedia\nBrittanica', self.rfo.read(50)) + self.assertEqual('', self.rfo.read()) + def test_readall(self): """RangeableFileObject.read(): to end of file.""" rfo = RangeableFileObject(StringIO(self.test),(11,)) - self.assertEquals(self.test[11:],rfo.read()) - + self.assertEqual(self.test[11:],rfo.read()) + def test_readline(self): """RangeableFileObject.readline()""" - self.assertEquals('the entire 24 volumes of Encyclopaedia\n', self.rfo.readline()) - self.assertEquals('Brittanica', self.rfo.readline()) - self.assertEquals('', self.rfo.readline()) - + self.assertEqual('the entire 24 volumes of Encyclopaedia\n', self.rfo.readline()) + self.assertEqual('Brittanica', self.rfo.readline()) + self.assertEqual('', self.rfo.readline()) + def test_tell(self): """RangeableFileObject.tell()""" - self.assertEquals(0,self.rfo.tell()) + self.assertEqual(0,self.rfo.tell()) self.rfo.read(5) - self.assertEquals(5,self.rfo.tell()) + self.assertEqual(5,self.rfo.tell()) self.rfo.readline() - self.assertEquals(39,self.rfo.tell()) - + self.assertEqual(39,self.rfo.tell()) + class RangeModuleTestCase(TestCase): """Test module level functions defined in range.py""" def setUp(self): pass - + def tearDown(self): pass - + def test_range_tuple_normalize(self): """byterange.range_tuple_normalize()""" from urlgrabber.byterange import range_tuple_normalize from urlgrabber.byterange import RangeError - tests = ( + tests = ( ((None,50), (0,50)), ((500,600), (500,600)), ((500,), (500,'')), @@ -112,28 +107,28 @@ class RangeModuleTestCase(TestCase): (None, None) ) for test, ex in tests: - self.assertEquals( range_tuple_normalize(test), ex ) - + self.assertEqual( range_tuple_normalize(test), ex ) + try: range_tuple_normalize( (10,8) ) except RangeError: pass else: self.fail("range_tuple_normalize( (10,8) ) should have raised RangeError") - + def 
test_range_header_to_tuple(self): """byterange.range_header_to_tuple()""" from urlgrabber.byterange import range_header_to_tuple - tests = ( + tests = ( ('bytes=500-600', (500,601)), ('bytes=500-', (500,'')), ('bla bla', ()), (None, None) ) for test, ex in tests: - self.assertEquals( range_header_to_tuple(test), ex ) - + self.assertEqual( range_header_to_tuple(test), ex ) + def test_range_tuple_to_header(self): """byterange.range_tuple_to_header()""" from urlgrabber.byterange import range_tuple_to_header - tests = ( + tests = ( ((500,600), 'bytes=500-599'), ((500,''), 'bytes=500-'), ((500,), 'bytes=500-'), @@ -142,16 +137,16 @@ class RangeModuleTestCase(TestCase): (None, None), ) for test, ex in tests: - self.assertEquals( range_tuple_to_header(test), ex ) - + self.assertEqual( range_tuple_to_header(test), ex ) + try: range_tuple_to_header( ('not an int',500) ) except ValueError: pass else: self.fail("range_tuple_to_header( ('not an int',500) ) should have raised ValueError") - + try: range_tuple_to_header( (0,'not an int') ) except ValueError: pass else: self.fail("range_tuple_to_header( (0, 'not an int') ) should have raised ValueError") - + def suite(): tl = TestLoader() return tl.loadTestsFromModule(sys.modules[__name__]) diff --git a/test/test_grabber.py b/test/test_grabber.py index eecdbcf..465e5f5 100644..100755 --- a/test/test_grabber.py +++ b/test/test_grabber.py @@ -11,23 +11,37 @@ # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko -"""grabber.py tests""" +from __future__ import print_function -# $Id: test_grabber.py,v 1.31 2006/12/08 00:14:16 mstenner Exp $ +"""grabber.py tests""" import sys import os -import string, tempfile, random, cStringIO, os -import urllib2 +import tempfile, random, os import socket +from io import BytesIO +from six import string_types + +if sys.version_info >= (3,): + # We do an explicit version check here because because python2 + # also has an io module with StringIO, but it is incompatible, + # and returns str instead of unicode somewhere. 
+ from io import StringIO +else: + from cStringIO import StringIO + +try: + from urllib.request import urlopen, OpenerDirector +except ImportError: + from urllib2 import urlopen, OpenerDirector from base_test_code import * @@ -38,15 +52,14 @@ from urlgrabber.grabber import URLGrabber, URLGrabError, CallbackObject, \ from urlgrabber.progress import text_progress_meter class FileObjectTests(TestCase): - + def setUp(self): self.filename = tempfile.mktemp() - fo = file(self.filename, 'wb') - fo.write(reference_data) - fo.close() + with open(self.filename, 'wb') as fo: + fo.write(reference_data) - self.fo_input = cStringIO.StringIO(reference_data) - self.fo_output = cStringIO.StringIO() + self.fo_input = BytesIO(reference_data) + self.fo_output = BytesIO() (url, parts) = grabber.default_grabber.opts.urlparser.parse( self.filename, grabber.default_grabber.opts) self.wrapper = grabber.PyCurlFileObject( @@ -60,41 +73,39 @@ class FileObjectTests(TestCase): "PYCurlFileObject .read() method" s = self.wrapper.read() self.fo_output.write(s) - self.assert_(reference_data == self.fo_output.getvalue()) + self.assertTrue(reference_data == self.fo_output.getvalue()) def test_readline(self): "PyCurlFileObject .readline() method" - while 1: + while True: s = self.wrapper.readline() self.fo_output.write(s) if not s: break - self.assert_(reference_data == self.fo_output.getvalue()) + self.assertTrue(reference_data == self.fo_output.getvalue()) def test_readlines(self): "PyCurlFileObject .readlines() method" li = self.wrapper.readlines() - self.fo_output.write(string.join(li, '')) - self.assert_(reference_data == self.fo_output.getvalue()) + self.fo_output.write(b''.join(li)) + self.assertTrue(reference_data == self.fo_output.getvalue()) def test_smallread(self): "PyCurlFileObject .read(N) with small N" - while 1: + while True: s = self.wrapper.read(23) self.fo_output.write(s) if not s: break - self.assert_(reference_data == self.fo_output.getvalue()) - + self.assertTrue(reference_data == self.fo_output.getvalue()) + class HTTPTests(TestCase): def test_reference_file(self): - "download refernce file via HTTP" + "download reference file via HTTP" filename = tempfile.mktemp() grabber.urlgrab(ref_http, filename) - fo = file(filename, 'rb') - contents = fo.read() - fo.close() + contents = open(filename, 'rb').read() - self.assert_(contents == reference_data) + self.assertTrue(contents == reference_data) def test_post(self): "do an HTTP post" @@ -109,46 +120,46 @@ class URLGrabberModuleTestCase(TestCase): """Test module level functions defined in grabber.py""" def setUp(self): pass - + def tearDown(self): pass - + def test_urlopen(self): "module-level urlopen() function" fo = urlgrabber.urlopen('http://www.python.org') fo.close() - + def test_urlgrab(self): "module-level urlgrab() function" outfile = tempfile.mktemp() - filename = urlgrabber.urlgrab('http://www.python.org', + filename = urlgrabber.urlgrab('http://www.python.org', filename=outfile) os.unlink(outfile) - + def test_urlread(self): "module-level urlread() function" s = urlgrabber.urlread('http://www.python.org') - + class URLGrabberTestCase(TestCase): """Test grabber.URLGrabber class""" - + def setUp(self): - - self.meter = text_progress_meter( fo=cStringIO.StringIO() ) + + self.meter = text_progress_meter( fo=StringIO() ) pass - + def tearDown(self): pass - + def testKeywordArgs(self): """grabber.URLGrabber.__init__() **kwargs handling. 
- + This is a simple test that just passes some arbitrary values into the URLGrabber constructor and checks that they've been set properly. """ - opener = urllib2.OpenerDirector() + opener = OpenerDirector() g = URLGrabber( progress_obj=self.meter, throttle=0.9, bandwidth=20, @@ -160,78 +171,81 @@ class URLGrabberTestCase(TestCase): proxies={'http' : 'http://www.proxy.com:9090'}, opener=opener ) opts = g.opts - self.assertEquals( opts.progress_obj, self.meter ) - self.assertEquals( opts.throttle, 0.9 ) - self.assertEquals( opts.bandwidth, 20 ) - self.assertEquals( opts.retry, 20 ) - self.assertEquals( opts.retrycodes, [5,6,7] ) - self.assertEquals( opts.copy_local, 1 ) - self.assertEquals( opts.close_connection, 1 ) - self.assertEquals( opts.user_agent, 'test ua/1.0' ) - self.assertEquals( opts.proxies, {'http' : 'http://www.proxy.com:9090'} ) - self.assertEquals( opts.opener, opener ) - - nopts = grabber.URLGrabberOptions(delegate=opts, throttle=0.5, + self.assertEqual( opts.progress_obj, self.meter ) + self.assertEqual( opts.throttle, 0.9 ) + self.assertEqual( opts.bandwidth, 20 ) + self.assertEqual( opts.retry, 20 ) + self.assertEqual( opts.retrycodes, [5,6,7] ) + self.assertEqual( opts.copy_local, 1 ) + self.assertEqual( opts.close_connection, 1 ) + self.assertEqual( opts.user_agent, 'test ua/1.0' ) + self.assertEqual( opts.proxies, {'http' : 'http://www.proxy.com:9090'} ) + self.assertEqual( opts.opener, opener ) + + nopts = grabber.URLGrabberOptions(delegate=opts, throttle=0.5, copy_local=0) - self.assertEquals( nopts.progress_obj, self.meter ) - self.assertEquals( nopts.throttle, 0.5 ) - self.assertEquals( nopts.bandwidth, 20 ) - self.assertEquals( nopts.retry, 20 ) - self.assertEquals( nopts.retrycodes, [5,6,7] ) - self.assertEquals( nopts.copy_local, 0 ) - self.assertEquals( nopts.close_connection, 1 ) - self.assertEquals( nopts.user_agent, 'test ua/1.0' ) - self.assertEquals( nopts.proxies, {'http' : 'http://www.proxy.com:9090'} ) + self.assertEqual( nopts.progress_obj, self.meter ) + self.assertEqual( nopts.throttle, 0.5 ) + self.assertEqual( nopts.bandwidth, 20 ) + self.assertEqual( nopts.retry, 20 ) + self.assertEqual( nopts.retrycodes, [5,6,7] ) + self.assertEqual( nopts.copy_local, 0 ) + self.assertEqual( nopts.close_connection, 1 ) + self.assertEqual( nopts.user_agent, 'test ua/1.0' ) + self.assertEqual( nopts.proxies, {'http' : 'http://www.proxy.com:9090'} ) nopts.opener = None - self.assertEquals( nopts.opener, None ) - + self.assertEqual( nopts.opener, None ) + def test_make_callback(self): """grabber.URLGrabber._make_callback() tests""" def cb(e): pass tup_cb = (cb, ('stuff'), {'some': 'dict'}) g = URLGrabber() - self.assertEquals(g._make_callback(cb), (cb, (), {})) - self.assertEquals(g._make_callback(tup_cb), tup_cb) + self.assertEqual(g._make_callback(cb), (cb, (), {})) + self.assertEqual(g._make_callback(tup_cb), tup_cb) class URLParserTestCase(TestCase): def setUp(self): pass - + def tearDown(self): pass def test_parse_url_with_prefix(self): """grabber.URLParser.parse() with opts.prefix""" - base = 'http://foo.com/dir' - bases = [base, base+'/'] - filename = 'bar/baz' - target = base + '/' + filename - + base = b'http://foo.com/dir' + bases = [base, base + b'/'] + filename = b'bar/baz' + target = base + b'/' + filename + for b in bases: g = URLGrabber(prefix=b) (url, parts) = g.opts.urlparser.parse(filename, g.opts) - self.assertEquals(url, target) + self.assertEqual(url, target) def _test_url(self, urllist): g = URLGrabber() try: quote = urllist[3] except 
IndexError: quote = None g.opts.quote = quote - (url, parts) = g.opts.urlparser.parse(urllist[0], g.opts) - + url = urllist[0].encode('utf8') + expected_url = urllist[1].encode('utf8') + expected_parts = tuple(part.encode('utf8') for part in urllist[2]) + (url, parts) = g.opts.urlparser.parse(url, g.opts) + if 1: - self.assertEquals(url, urllist[1]) - self.assertEquals(parts, urllist[2]) + self.assertEqual(url, expected_url) + self.assertEqual(parts, expected_parts) else: if url == urllist[1] and parts == urllist[2]: - print 'OK: %s' % urllist[0] + print('OK: %s' % urllist[0]) else: - print 'ERROR: %s' % urllist[0] - print ' ' + urllist[1] - print ' ' + url - print ' ' + urllist[2] - print ' ' + parts - + print('ERROR: %s' % urllist[0]) + print(' ' + urllist[1]) + print(' ' + url) + print(' ' + urllist[2]) + print(' ' + parts) + url_tests_all = ( ['http://host.com/path/basename.ext?arg1=val1&arg2=val2#hash', @@ -251,13 +265,13 @@ class URLParserTestCase(TestCase): 'http://host.com/Should%2520Not', ('http', 'host.com', '/Should%2520Not', '', '', ''), 1], ) - + url_tests_posix = ( ['/etc/passwd', 'file:///etc/passwd', ('file', '', '/etc/passwd', '', '', '')], ) - + url_tests_nt = ( [r'\\foo.com\path\file.ext', 'file://foo.com/path/file.ext', @@ -295,7 +309,7 @@ class FailureTestCase(TestCase): self.obj = obj self.args = args self.kwargs = kwargs - + def test_failure_callback_called(self): "failure callback is called on retry" self.failure_callback_called = 0 @@ -303,7 +317,7 @@ class FailureTestCase(TestCase): failure_callback=self._failure_callback) try: g.urlgrab(ref_404) except URLGrabError: pass - self.assertEquals(self.failure_callback_called, 1) + self.assertEqual(self.failure_callback_called, 1) def test_failure_callback_args(self): "failure callback is called with the proper args" @@ -312,14 +326,17 @@ class FailureTestCase(TestCase): failure_callback=fc) try: g.urlgrab(ref_404) except URLGrabError: pass - self.assert_(hasattr(self, 'obj')) - self.assert_(hasattr(self, 'args')) - self.assert_(hasattr(self, 'kwargs')) - self.assertEquals(self.args, ('foo',)) - self.assertEquals(self.kwargs, {'bar': 'baz'}) - self.assert_(isinstance(self.obj, CallbackObject)) - self.assertEquals(self.obj.url, ref_404) - self.assert_(isinstance(self.obj.exception, URLGrabError)) + self.assertTrue(hasattr(self, 'obj')) + self.assertTrue(hasattr(self, 'args')) + self.assertTrue(hasattr(self, 'kwargs')) + self.assertEqual(self.args, ('foo',)) + self.assertEqual(self.kwargs, {'bar': 'baz'}) + self.assertTrue(isinstance(self.obj, CallbackObject)) + url = self.obj.url + if not isinstance(url, string_types): + url = url.decode('utf8') + self.assertEqual(url, ref_404) + self.assertTrue(isinstance(self.obj.exception, URLGrabError)) del self.obj class InterruptTestCase(TestCase): @@ -339,7 +356,7 @@ class InterruptTestCase(TestCase): self.kwargs = kwargs if kwargs.get('exception', None): raise kwargs['exception'] - + def test_interrupt_callback_called(self): "interrupt callback is called on retry" self.interrupt_callback_called = 0 @@ -348,7 +365,7 @@ class InterruptTestCase(TestCase): interrupt_callback=ic) try: g.urlgrab(ref_http) except KeyboardInterrupt: pass - self.assertEquals(self.interrupt_callback_called, 1) + self.assertEqual(self.interrupt_callback_called, 1) def test_interrupt_callback_raises(self): "interrupt callback raises an exception" @@ -366,12 +383,12 @@ class CheckfuncTestCase(TestCase): self.g = grabber.URLGrabber(checkfunc=cf) self.filename = tempfile.mktemp() self.data = 
short_reference_data - + def tearDown(self): try: os.unlink(self.filename) except: pass if hasattr(self, 'obj'): del self.obj - + def _checkfunc(self, obj, *args, **kwargs): self.obj = obj self.args = args @@ -379,37 +396,38 @@ class CheckfuncTestCase(TestCase): if hasattr(obj, 'filename'): # we used urlgrab - fo = file(obj.filename) - data = fo.read() - fo.close() + data = open(obj.filename, 'rb').read() else: # we used urlread data = obj.data if data == self.data: return else: raise URLGrabError(-2, "data doesn't match") - + def _check_common_args(self): "check the args that are common to both urlgrab and urlread" - self.assert_(hasattr(self, 'obj')) - self.assert_(hasattr(self, 'args')) - self.assert_(hasattr(self, 'kwargs')) - self.assertEquals(self.args, ('foo',)) - self.assertEquals(self.kwargs, {'bar': 'baz'}) - self.assert_(isinstance(self.obj, CallbackObject)) - self.assertEquals(self.obj.url, short_ref_http) + self.assertTrue(hasattr(self, 'obj')) + self.assertTrue(hasattr(self, 'args')) + self.assertTrue(hasattr(self, 'kwargs')) + self.assertEqual(self.args, ('foo',)) + self.assertEqual(self.kwargs, {'bar': 'baz'}) + self.assertTrue(isinstance(self.obj, CallbackObject)) + url = self.obj.url + if not isinstance(url, string_types): + url = url.decode() + self.assertEqual(url, short_ref_http) def test_checkfunc_urlgrab_args(self): "check for proper args when used with urlgrab" self.g.urlgrab(short_ref_http, self.filename) self._check_common_args() - self.assertEquals(self.obj.filename, self.filename) + self.assertEqual(self.obj.filename, self.filename) def test_checkfunc_urlread_args(self): "check for proper args when used with urlread" self.g.urlread(short_ref_http) self._check_common_args() - self.assertEquals(self.obj.data, short_reference_data) + self.assertEqual(self.obj.data, short_reference_data) def test_checkfunc_urlgrab_success(self): "check success with urlgrab checkfunc" @@ -425,20 +443,20 @@ class CheckfuncTestCase(TestCase): "check failure with urlgrab checkfunc" self.data = 'other data' self.assertRaises(URLGrabError, self.g.urlgrab, - short_ref_http, self.filename) + ref_404, self.filename) def test_checkfunc_urlread_failure(self): "check failure with urlread checkfunc" self.data = 'other data' self.assertRaises(URLGrabError, self.g.urlread, - short_ref_http) + ref_404) class RegetTestBase: def setUp(self): self.ref = short_reference_data self.grabber = grabber.URLGrabber(reget='check_timestamp') self.filename = tempfile.mktemp() - self.hl = len(self.ref) / 2 + self.hl = len(self.ref) // 2 self.url = 'OVERRIDE THIS' def tearDown(self): @@ -446,16 +464,13 @@ class RegetTestBase: except: pass def _make_half_zero_file(self): - fo = file(self.filename, 'wb') - fo.write('0'*self.hl) - fo.close() + with open(self.filename, 'wb') as fo: + fo.write(b'0' * self.hl) def _read_file(self): - fo = file(self.filename, 'rb') - data = fo.read() - fo.close() + data = open(self.filename, 'rb').read() return data - + class CommonRegetTests(RegetTestBase, TestCase): def test_bad_reget_type(self): "exception raised for illegal reget mode" @@ -469,7 +484,7 @@ class FTPRegetTests(RegetTestBase, TestCase): # this tests to see if the server is available. 
If it's not, # then these tests will be skipped try: - fo = urllib2.urlopen(self.url).close() + fo = urlopen(self.url).close() except IOError: self.skip() @@ -479,52 +494,55 @@ class FTPRegetTests(RegetTestBase, TestCase): self.grabber.urlgrab(self.url, self.filename, reget='simple') data = self._read_file() - self.assertEquals(data[:self.hl], '0'*self.hl) - self.assertEquals(data[self.hl:], self.ref[self.hl:]) + self.assertEqual(data[:self.hl], b'0'*self.hl) + self.assertEqual(data[self.hl:], self.ref[self.hl:]) class HTTPRegetTests(FTPRegetTests): def setUp(self): RegetTestBase.setUp(self) self.url = short_ref_http - + def test_older_check_timestamp(self): + # define this here rather than in the FTP tests because currently, + # we get no timestamp information back from ftp servers. + self._make_half_zero_file() + ts = 1600000000 # set local timestamp to 2020 + os.utime(self.filename, (ts, ts)) + try: - # define this here rather than in the FTP tests because currently, - # we get no timestamp information back from ftp servers. - self._make_half_zero_file() - ts = 1600000000 # set local timestamp to 2020 - os.utime(self.filename, (ts, ts)) self.grabber.urlgrab(self.url, self.filename, reget='check_timestamp') - data = self._read_file() - - self.assertEquals(data[:self.hl], '0'*self.hl) - self.assertEquals(data[self.hl:], self.ref[self.hl:]) except NotImplementedError: self.skip() - + + data = self._read_file() + + self.assertEqual(data[:self.hl], b'0'*self.hl) + self.assertEqual(data[self.hl:], self.ref[self.hl:]) + def test_newer_check_timestamp(self): + # define this here rather than in the FTP tests because currently, + # we get no timestamp information back from ftp servers. + self._make_half_zero_file() + ts = 1 # set local timestamp to 1969 + os.utime(self.filename, (ts, ts)) + try: - # define this here rather than in the FTP tests because currently, - # we get no timestamp information back from ftp servers. 
- self._make_half_zero_file() - ts = 1 # set local timestamp to 1969 - os.utime(self.filename, (ts, ts)) self.grabber.urlgrab(self.url, self.filename, reget='check_timestamp') - data = self._read_file() - - self.assertEquals(data, self.ref) - except: + except NotImplementedError: self.skip() - + + data = self._read_file() + + self.assertEqual(data, self.ref) + class FileRegetTests(HTTPRegetTests): def setUp(self): self.ref = short_reference_data tmp = tempfile.mktemp() - tmpfo = file(tmp, 'wb') - tmpfo.write(self.ref) - tmpfo.close() + with open(tmp, 'wb') as tmpfo: + tmpfo.write(self.ref) self.tmp = tmp - + (url, parts) = grabber.default_grabber.opts.urlparser.parse( tmp, grabber.default_grabber.opts) self.url = url @@ -532,7 +550,7 @@ class FileRegetTests(HTTPRegetTests): self.grabber = grabber.URLGrabber(reget='check_timestamp', copy_local=1) self.filename = tempfile.mktemp() - self.hl = len(self.ref) / 2 + self.hl = len(self.ref) // 2 def tearDown(self): try: os.unlink(self.filename) @@ -544,14 +562,14 @@ class ProFTPDSucksTests(TestCase): def setUp(self): self.url = ref_proftp try: - fo = urllib2.urlopen(self.url).close() + fo = urlopen(self.url).close() except IOError: self.skip() def test_restart_workaround(self): inst = grabber.URLGrabber() rslt = inst.urlread(self.url, range=(500, 1000)) - + class BaseProxyTests(TestCase): good_p = '%s://%s:%s@%s:%i' % (proxy_proto, proxy_user, good_proxy_pass, proxy_host, proxy_port) @@ -591,8 +609,8 @@ class ProxyFTPAuthTests(ProxyHTTPAuthTests): if not self.have_proxy(): self.skip() try: - fo = urllib2.urlopen(self.url).close() - except IOError: + fo = urlopen(self.url).close() + except URLError: self.skip() self.g = URLGrabber() @@ -604,4 +622,3 @@ if __name__ == '__main__': grabber.DEBUG = 0 runner = TextTestRunner(stream=sys.stdout,descriptions=1,verbosity=2) runner.run(suite()) - diff --git a/test/test_mirror.py b/test/test_mirror.py index 70fe069..a175977 100644..100755 --- a/test/test_mirror.py +++ b/test/test_mirror.py @@ -11,9 +11,9 @@ # Lesser General Public License for more details. 
# # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber @@ -21,14 +21,12 @@ """mirror.py tests""" -# $Id: test_mirror.py,v 1.12 2005/10/22 21:57:27 mstenner Exp $ - import sys import os -import string, tempfile, random, cStringIO, os +import tempfile, random, os import urlgrabber.grabber -from urlgrabber.grabber import URLGrabber, URLGrabError +from urlgrabber.grabber import URLGrabber, URLGrabError, URLGrabberOptions import urlgrabber.mirror from urlgrabber.mirror import MirrorGroup, MGRandomStart, MGRandomOrder @@ -53,9 +51,7 @@ class BasicTests(TestCase): url = 'short_reference' self.mg.urlgrab(url, filename) - fo = open(filename) - data = fo.read() - fo.close() + data = open(filename, 'rb').read() self.assertEqual(data, short_reference_data) @@ -87,9 +83,7 @@ class SubclassTests(TestCase): url = 'short_reference' self.mg.urlgrab(url, filename) - fo = open(filename) - data = fo.read() - fo.close() + data = open(filename, 'rb').read() self.assertEqual(data, short_reference_data) @@ -106,8 +100,11 @@ class CallbackTests(TestCase): self.g = URLGrabber() fullmirrors = [base_mirror_url + m + '/' for m in \ (bad_mirrors + good_mirrors)] + if hasattr(urlgrabber.grabber, '_TH'): + # test assumes mirrors are not re-ordered + urlgrabber.grabber._TH.hosts.clear() self.mg = MirrorGroup(self.g, fullmirrors) - + def test_failure_callback(self): "test that MG executes the failure callback correctly" tricky_list = [] @@ -115,9 +112,9 @@ class CallbackTests(TestCase): tl.append(str(cb_obj.exception)) self.mg.failure_callback = failure_callback, (tricky_list, ), {} data = self.mg.urlread('reference') - self.assert_(data == reference_data) - self.assertEquals(tricky_list[0][:25], - '[Errno 14] HTTP Error 403') + self.assertTrue(data == reference_data) + self.assertEqual(tricky_list[0][:25], + '[Errno 14] HTTP Error 404') def test_callback_reraise(self): "test that the callback can correctly re-raise the exception" @@ -152,10 +149,8 @@ class FailoverTests(TestCase): def cb(e, elist=elist): elist.append(e) self.mg.urlgrab(url, filename, failure_callback=cb) - fo = open(filename) - contents = fo.read() - fo.close() - + contents = open(filename, 'rb').read() + # first be sure that the first mirror failed and that the # callback was called self.assertEqual(len(elist), 1) @@ -168,7 +163,8 @@ class FakeGrabber: self.resultlist = resultlist or [] self.index = 0 self.calls = [] - + self.opts = URLGrabberOptions() + def urlgrab(self, url, filename=None, **kwargs): self.calls.append( (url, filename) ) res = self.resultlist[self.index] @@ -187,11 +183,11 @@ class ActionTests(TestCase): def tearDown(self): urlgrabber.mirror.DEBUG = self.db - + def test_defaults(self): 'test default action policy' self.mg.urlgrab('somefile') - expected_calls = [ (m + '/' + 'somefile', None) \ + expected_calls = [ (m.encode('utf8') + b'/somefile', None) for m in self.mirrors[:3] ] expected_logs = \ ['MIRROR: trying somefile -> a/somefile', @@ -203,15 +199,15 @@ class ActionTests(TestCase): 'GR mirrors: [c d e f] 0', 'MAIN mirrors: [a b c d e f] 2', 'MIRROR: trying somefile -> c/somefile'] - - self.assertEquals(self.g.calls, expected_calls) - 
self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs) - + + self.assertEqual(self.g.calls, expected_calls) + self.assertEqual(urlgrabber.mirror.DEBUG.logs, expected_logs) + def test_instance_action(self): 'test the effects of passed-in default_action' self.mg.default_action = {'remove_master': 1} self.mg.urlgrab('somefile') - expected_calls = [ (m + '/' + 'somefile', None) \ + expected_calls = [ (m.encode('utf8') + b'/somefile', None) for m in self.mirrors[:3] ] expected_logs = \ ['MIRROR: trying somefile -> a/somefile', @@ -223,14 +219,14 @@ class ActionTests(TestCase): 'GR mirrors: [c d e f] 0', 'MAIN mirrors: [c d e f] 0', 'MIRROR: trying somefile -> c/somefile'] - - self.assertEquals(self.g.calls, expected_calls) - self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs) - + + self.assertEqual(self.g.calls, expected_calls) + self.assertEqual(urlgrabber.mirror.DEBUG.logs, expected_logs) + def test_method_action(self): 'test the effects of method-level default_action' self.mg.urlgrab('somefile', default_action={'remove_master': 1}) - expected_calls = [ (m + '/' + 'somefile', None) \ + expected_calls = [ (m.encode('utf8') + b'/somefile', None) for m in self.mirrors[:3] ] expected_logs = \ ['MIRROR: trying somefile -> a/somefile', @@ -242,18 +238,18 @@ class ActionTests(TestCase): 'GR mirrors: [c d e f] 0', 'MAIN mirrors: [c d e f] 0', 'MIRROR: trying somefile -> c/somefile'] - - self.assertEquals(self.g.calls, expected_calls) - self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs) - + + self.assertEqual(self.g.calls, expected_calls) + self.assertEqual(urlgrabber.mirror.DEBUG.logs, expected_logs) + def callback(self, e): return {'fail': 1} - + def test_callback_action(self): 'test the effects of a callback-returned action' self.assertRaises(URLGrabError, self.mg.urlgrab, 'somefile', failure_callback=self.callback) - expected_calls = [ (m + '/' + 'somefile', None) \ + expected_calls = [ (m.encode('utf8') + b'/somefile', None) for m in self.mirrors[:1] ] expected_logs = \ ['MIRROR: trying somefile -> a/somefile', @@ -261,9 +257,133 @@ class ActionTests(TestCase): 'GR mirrors: [b c d e f] 0', 'MAIN mirrors: [a b c d e f] 1'] - self.assertEquals(self.g.calls, expected_calls) - self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs) - + self.assertEqual(self.g.calls, expected_calls) + self.assertEqual(urlgrabber.mirror.DEBUG.logs, expected_logs) + +import threading, socket + +class HttpReplyCode(TestCase): + def setUp(self): + # start the server + self.exit = False + self.process = lambda data: None + + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + s.bind(('localhost', 0)); s.listen(1) + self.port = s.getsockname()[1] + + def server(): + while True: + c, a = s.accept() + if self.exit: c.close(); break + data = b'' + while not data.endswith(b'\r\n\r\n'): + data = c.recv(4096) + self.process(data) + c.sendall(b'HTTP/1.1 %d %s\r\n' % self.reply) + if self.content is not None: + c.sendall(b'Content-Length: %d\r\n\r\n' % len(self.content)) + c.sendall(self.content) + c.close() + s.close() + self.exit = False + + self.thread = threading.Thread(target=server) + self.thread.start() + + # create grabber and mirror group objects + def failure(obj): + self.code = getattr(obj.exception, 'code', None) + return {} + self.g = URLGrabber() + self.mg = MirrorGroup(self.g, ['http://localhost:%d' % self.port], + failure_callback = failure) + + def tearDown(self): + # shut down the server + self.exit = True + s = 
socket.socket(socket.AF_INET, socket.SOCK_STREAM) + try: + s.connect(('localhost', self.port)) # wake it up + except ConnectionRefusedError: + # already gone? + pass + s.close() + self.thread.join() + + def test_grab(self): + 'tests the propagation of HTTP reply code' + self.reply = 503, b'Busy' + self.content = None + + # single + self.assertRaises(URLGrabError, self.mg.urlgrab, 'foo') + self.assertEqual(self.code, 503); del self.code + + # multi + err = [] + self.mg.urlgrab('foo', async_=True, failfunc=err.append) + urlgrabber.grabber.parallel_wait() + self.assertEqual([e.exception.errno for e in err], [256]) + self.assertEqual(self.code, 503); del self.code + + def test_range(self): + 'test client-side processing of HTTP ranges' + # server does not process ranges + self.reply = 200, b'OK' + self.content = b'ABCDEF' + + # no range specified + data = self.mg.urlread('foo') + self.assertEqual(data, b'ABCDEF') + + data = self.mg.urlread('foo', range = (3, 5)) + self.assertEqual(data, b'DE') + + def test_retry_no_cache(self): + 'test bypassing proxy cache on failure' + def process(data): + if b'Pragma:no-cache' in data: + self.content = b'version2' + else: + self.content = b'version1' + + def checkfunc_read(obj): + if obj.data == b'version1': + raise URLGrabError(-1, 'Outdated version of foo') + elif obj.data != b'version2': + self.fail('Unexpected file content') + + def checkfunc_grab(obj): + with open('foo') as f: + data = f.read() + if data == 'version1': + raise URLGrabError(-1, 'Outdated version of foo') + elif data != 'version2': + self.fail('Unexpected file content') + + self.process = process + self.reply = 200, b'OK' + + opts = self.g.opts + opts.retry = 3 + opts.retry_no_cache = True + + # single + opts.checkfunc = checkfunc_read + try: + self.mg.urlread('foo') + except URLGrabError as e: + self.fail(str(e)) + + # multi + opts.checkfunc = checkfunc_grab + self.mg.urlgrab('foo', async_=True) + try: + urlgrabber.grabber.parallel_wait() + except URLGrabError as e: + self.fail(str(e)) def suite(): tl = TestLoader() @@ -272,4 +392,3 @@ def suite(): if __name__ == '__main__': runner = TextTestRunner(stream=sys.stdout,descriptions=1,verbosity=2) runner.run(suite()) - diff --git a/test/threading/batchgrabber.py b/test/threading/batchgrabber.py index 076b7ef..4ee71e1 100644..100755 --- a/test/threading/batchgrabber.py +++ b/test/threading/batchgrabber.py @@ -9,20 +9,22 @@ # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko +from __future__ import print_function + """Module for testing urlgrabber under multiple threads. -This module can be used from the command line. Each argument is +This module can be used from the command line. Each argument is a URL to grab. -The BatchURLGrabber class has an interface similar to URLGrabber +The BatchURLGrabber class has an interface similar to URLGrabber but instead of pulling files when urlgrab is called, the request is queued. Calling BatchURLGrabber.batchgrab causes all files to be pulled in multiple threads. @@ -31,7 +33,7 @@ be pulled in multiple threads. 
import os.path, sys if __name__ == '__main__': - print os.path.dirname(sys.argv[0]) + print(os.path.dirname(sys.argv[0])) sys.path.insert(0, (os.path.dirname(sys.argv[0]) or '.') + '/../..') from threading import Thread, Semaphore @@ -48,10 +50,10 @@ class BatchURLGrabber: self.queue = [] self.threads = [] self.sem = Semaphore() - + def urlgrab(self, url, filename=None, **kwargs): self.queue.append( (url, filename, kwargs) ) - + def batchgrab(self): if hasattr(self.grabber.opts.progress_obj, 'start'): self.grabber.opts.progress_obj.start(len(self.queue)) @@ -61,17 +63,17 @@ class BatchURLGrabber: del self.queue[0] thread = Worker(self, url, filename, kwargs) self.threads.append(thread) - if DEBUG: print "starting worker: " + url + if DEBUG: print("starting worker: " + url) thread.start() else: for t in self.threads: if not t.isAlive(): - if DEBUG: print "cleaning up worker: " + t.url + if DEBUG: print("cleaning up worker: " + t.url) self.threads.remove(t) #if len(self.threads) == self.maxthreads: # sleep(0.2) sleep(0.2) - + class Worker(Thread): def __init__(self, parent, url, filename, kwargs): Thread.__init__(self) @@ -79,18 +81,18 @@ class Worker(Thread): self.url = url self.filename = filename self.kwargs = kwargs - + def run(self): - if DEBUG: print "worker thread started." + if DEBUG: print("worker thread started.") grabber = self.parent.grabber progress_obj = grabber.opts.progress_obj if isinstance(progress_obj, MultiFileMeter): self.kwargs['progress_obj'] = progress_obj.newMeter() try: rslt = self.parent.grabber.urlgrab(self.url, self.filename, **self.kwargs) - except URLGrabError, e: - print '%s, %s' % (e, self.url) - + except URLGrabError as e: + print('%s, %s' % (e, self.url)) + def main(): progress_obj = None # uncomment to play with BatchProgressMeter (doesn't work right now) @@ -98,13 +100,13 @@ def main(): g = BatchURLGrabber(keepalive=1, progress_obj=progress_obj) for arg in sys.argv[1:]: g.urlgrab(arg) - if DEBUG: print "before batchgrab" + if DEBUG: print("before batchgrab") try: g.batchgrab() except KeyboardInterrupt: sys.exit(1) - - if DEBUG: print "after batchgrab" - + + if DEBUG: print("after batchgrab") + if __name__ == '__main__': main() diff --git a/urlgrabber.egg-info/PKG-INFO b/urlgrabber.egg-info/PKG-INFO new file mode 100755 index 0000000..20b9966 --- /dev/null +++ b/urlgrabber.egg-info/PKG-INFO @@ -0,0 +1,31 @@ +Metadata-Version: 1.2 +Name: urlgrabber +Version: 4.1.0 +Summary: A high-level cross-protocol url-grabber +Home-page: http://urlgrabber.baseurl.org/ +Author: Michael D. 
Stenner, Ryan Tomayko, Seth Vidal, Zdenek Pavlas +Author-email: mstenner@linux.duke.edu, rtomayko@naeblis.cx, skvidal@fedoraproject.org, zpavlas@redhat.com +Maintainer: Neal Gompa +Maintainer-email: ngompa@fedoraproject.org +License: LGPLv2+ +Description: UNKNOWN +Keywords: urlgrabber yum http ftp +Platform: UNKNOWN +Classifier: Development Status :: 5 - Production/Stable +Classifier: Intended Audience :: Developers +Classifier: Intended Audience :: System Administrators +Classifier: Topic :: Internet :: File Transfer Protocol (FTP) +Classifier: Topic :: Internet :: WWW/HTTP +Classifier: Topic :: Software Development :: Libraries :: Python Modules +Classifier: Environment :: Console +Classifier: Environment :: Web Environment +Classifier: License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+) +Classifier: Operating System :: POSIX +Classifier: Operating System :: POSIX :: Linux +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 2 +Classifier: Programming Language :: Python :: 2.6 +Classifier: Programming Language :: Python :: 2.7 +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.6 +Classifier: Programming Language :: Python :: 3.7 diff --git a/urlgrabber.egg-info/SOURCES.txt b/urlgrabber.egg-info/SOURCES.txt new file mode 100755 index 0000000..35174af --- /dev/null +++ b/urlgrabber.egg-info/SOURCES.txt @@ -0,0 +1,27 @@ +ChangeLog +LICENSE +MANIFEST.in +README +TODO +makefile +setup.py +scripts/urlgrabber +scripts/urlgrabber-ext-down +test/base_test_code.py +test/grabberperf.py +test/munittest.py +test/runtests.py +test/test_byterange.py +test/test_grabber.py +test/test_mirror.py +test/threading/batchgrabber.py +urlgrabber/__init__.py +urlgrabber/byterange.py +urlgrabber/grabber.py +urlgrabber/mirror.py +urlgrabber/progress.py +urlgrabber.egg-info/PKG-INFO +urlgrabber.egg-info/SOURCES.txt +urlgrabber.egg-info/dependency_links.txt +urlgrabber.egg-info/requires.txt +urlgrabber.egg-info/top_level.txt
\ No newline at end of file diff --git a/urlgrabber.egg-info/dependency_links.txt b/urlgrabber.egg-info/dependency_links.txt new file mode 100755 index 0000000..8b13789 --- /dev/null +++ b/urlgrabber.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/urlgrabber.egg-info/requires.txt b/urlgrabber.egg-info/requires.txt new file mode 100755 index 0000000..28d1cd4 --- /dev/null +++ b/urlgrabber.egg-info/requires.txt @@ -0,0 +1,3 @@ +pycurl +six +setuptools diff --git a/urlgrabber.egg-info/top_level.txt b/urlgrabber.egg-info/top_level.txt new file mode 100755 index 0000000..9aa2f20 --- /dev/null +++ b/urlgrabber.egg-info/top_level.txt @@ -0,0 +1 @@ +urlgrabber diff --git a/urlgrabber/__init__.py b/urlgrabber/__init__.py index ddd5204..60f56c3 100644..100755 --- a/urlgrabber/__init__.py +++ b/urlgrabber/__init__.py @@ -1,16 +1,18 @@ -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. # -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Library General Public License for more details. +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. # -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, +# Boston, MA 02111-1307 USA # Copyright 2002-2006 Michael D. Stenner, Ryan Tomayko # Copyright 2009 Red Hat, Inc - pycurl support added by Seth Vidal @@ -44,11 +46,17 @@ following features: automatically switching mirrors if there is a failure. """ -__version__ = '3.9.1' -__date__ = '2009/09/25' -__author__ = 'Michael D. Stenner <mstenner@linux.duke.edu>, ' \ - 'Ryan Tomayko <rtomayko@naeblis.cx>' \ - 'Seth Vidal <skvidal@fedoraproject.org>' -__url__ = 'http://linux.duke.edu/projects/urlgrabber/' +try: + from email import message_from_string + from pkg_resources import get_distribution + pkgInfo = get_distribution(__package__).get_metadata('PKG-INFO') + __metadata__ = message_from_string(pkgInfo) + del pkgInfo -from grabber import urlgrab, urlopen, urlread + __version__ = __metadata__['Version'] + __author__ = __metadata__['Author'] + __url__ = __metadata__['Home-page'] +except: + __author__ = __version__ = __url__ = '<see setup.cfg>' + +from .grabber import urlgrab, urlopen, urlread diff --git a/urlgrabber/byterange.py b/urlgrabber/byterange.py index 3e5f3b7..e341add 100644..100755 --- a/urlgrabber/byterange.py +++ b/urlgrabber/byterange.py @@ -9,9 +9,9 @@ # Lesser General Public License for more details. 
# # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber @@ -19,56 +19,74 @@ import os +import sys import stat import urllib -import urllib2 -import rfc822 +import email +import ftplib +import socket +import sys +import mimetypes + +try: + from urllib.request import BaseHandler, FileHandler, FTPHandler, URLError + from urllib.request import addclosehook, addinfourl + from urllib.request import ftpwrapper as urllib_ftpwrapper + from urllib.parse import splitport, splituser, splitpasswd, splitattr, unquote +except ImportError: + from urllib2 import BaseHandler, FileHandler, FTPHandler, URLError + from urllib2 import ftpwrapper as urllib_ftpwrapper + from urllib import (splitport, splituser, splitpasswd, splitattr, + unquote, addclosehook, addinfourl) DEBUG = None -try: +if sys.version_info >= (3,): + # We do an explicit version check here because because python2 + # also has an io module with StringIO, but it is incompatible, + # and returns str instead of unicode somewhere. + from io import StringIO +else: from cStringIO import StringIO -except ImportError, msg: - from StringIO import StringIO class RangeError(IOError): """Error raised when an unsatisfiable range is requested.""" pass - -class HTTPRangeHandler(urllib2.BaseHandler): + +class HTTPRangeHandler(BaseHandler): """Handler that enables HTTP Range headers. - + This was extremely simple. The Range header is a HTTP feature to - begin with so all this class does is tell urllib2 that the - "206 Partial Content" reponse from the HTTP server is what we + begin with so all this class does is tell urllib2 that the + "206 Partial Content" response from the HTTP server is what we expected. - + Example: import urllib2 import byterange - + range_handler = range.HTTPRangeHandler() - opener = urllib2.build_opener(range_handler) - + opener = urllib.request.build_opener(range_handler) + # install it - urllib2.install_opener(opener) - + urllib.request.install_opener(opener) + # create Request and set Range header - req = urllib2.Request('http://www.python.org/') + req = urllib.request.Request('http://www.python.org/') req.header['Range'] = 'bytes=30-50' - f = urllib2.urlopen(req) + f = urllib.request.urlopen(req) """ - + def http_error_206(self, req, fp, code, msg, hdrs): # 206 Partial Content Response r = urllib.addinfourl(fp, hdrs, req.get_full_url()) r.code = code r.msg = msg return r - + def http_error_416(self, req, fp, code, msg, hdrs): # HTTP's Range Not Satisfiable error - raise RangeError('Requested Range Not Satisfiable') + raise RangeError(9, 'Requested Range Not Satisfiable') class HTTPSRangeHandler(HTTPRangeHandler): """ Range Header support for HTTPS. """ @@ -81,13 +99,13 @@ class HTTPSRangeHandler(HTTPRangeHandler): class RangeableFileObject: """File object wrapper to enable raw range handling. - This was implemented primarilary for handling range - specifications for file:// urls. This object effectively makes - a file object look like it consists only of a range of bytes in + This was implemented primarilary for handling range + specifications for file:// urls. This object effectively makes + a file object look like it consists only of a range of bytes in the stream. 
- + Examples: - # expose 10 bytes, starting at byte position 20, from + # expose 10 bytes, starting at byte position 20, from # /etc/aliases. >>> fo = RangeableFileObject(file('/etc/passwd', 'r'), (20,30)) # seek seeks within the range (to position 23 in this case) @@ -99,11 +117,11 @@ class RangeableFileObject: # byte in the range. the following will return only 7 bytes. >>> fo.read(30) """ - + def __init__(self, fo, rangetup): """Create a RangeableFileObject. - fo -- a file like object. only the read() method need be - supported but supporting an optimized seek() is + fo -- a file like object. only the read() method need be + supported but supporting an optimized seek() is preferable. rangetup -- a (firstbyte,lastbyte) tuple specifying the range to work over. @@ -113,24 +131,24 @@ class RangeableFileObject: (self.firstbyte, self.lastbyte) = range_tuple_normalize(rangetup) self.realpos = 0 self._do_seek(self.firstbyte) - + def __getattr__(self, name): """This effectively allows us to wrap at the instance level. Any attribute not found in _this_ object will be searched for in self.fo. This includes methods.""" if hasattr(self.fo, name): return getattr(self.fo, name) - raise AttributeError, name - + raise AttributeError(name) + def tell(self): """Return the position within the range. - This is different from fo.seek in that position 0 is the + This is different from fo.seek in that position 0 is the first byte position of the range tuple. For example, if this object was created with a range tuple of (500,899), tell() will return 0 when at byte position 500 of the file. """ return (self.realpos - self.firstbyte) - + def seek(self,offset,whence=0): """Seek within the byte range. Positioning is identical to that described under tell(). @@ -143,13 +161,13 @@ class RangeableFileObject: elif whence == 2: # absolute from end of file # XXX: are we raising the right Error here? raise IOError('seek from end of file not supported.') - + # do not allow seek past lastbyte in range if self.lastbyte and (realoffset >= self.lastbyte): realoffset = self.lastbyte - + self._do_seek(realoffset - self.realpos) - + def read(self, size=-1): """Read within the range. This method will limit the size read based on the range. @@ -158,7 +176,7 @@ class RangeableFileObject: rslt = self.fo.read(size) self.realpos += len(rslt) return rslt - + def readline(self, size=-1): """Read lines within the range. This method will limit the size read based on the range. @@ -167,7 +185,7 @@ class RangeableFileObject: rslt = self.fo.readline(size) self.realpos += len(rslt) return rslt - + def _calc_read_size(self, size): """Handles calculating the amount of data to read based on the range. @@ -179,7 +197,7 @@ class RangeableFileObject: else: size = (self.lastbyte - self.realpos) return size - + def _do_seek(self,offset): """Seek based on whether wrapped object supports seek(). offset is relative to the current position (self.realpos). @@ -190,7 +208,7 @@ class RangeableFileObject: else: self.fo.seek(self.realpos + offset) self.realpos+= offset - + def _poor_mans_seek(self,offset): """Seek by calling the wrapped file objects read() method. This is used for file like objects that do not have native @@ -198,7 +216,7 @@ class RangeableFileObject: to manually seek to the desired position. offset -- read this number of bytes from the wrapped file object. - raise RangeError if we encounter EOF before reaching the + raise RangeError if we encounter EOF before reaching the specified offset. 
""" pos = 0 @@ -208,28 +226,26 @@ class RangeableFileObject: bufsize = offset - pos buf = self.fo.read(bufsize) if len(buf) != bufsize: - raise RangeError('Requested Range Not Satisfiable') + raise RangeError(9, 'Requested Range Not Satisfiable') pos+= bufsize -class FileRangeHandler(urllib2.FileHandler): +class FileRangeHandler(FileHandler): """FileHandler subclass that adds Range support. This class handles Range headers exactly like an HTTP server would. """ def open_local_file(self, req): - import mimetypes - import mimetools host = req.get_host() file = req.get_selector() localfile = urllib.url2pathname(file) stats = os.stat(localfile) size = stats[stat.ST_SIZE] - modified = rfc822.formatdate(stats[stat.ST_MTIME]) + modified = email.utils.formatdate(stats[stat.ST_MTIME]) mtype = mimetypes.guess_type(file)[0] if host: host, port = urllib.splitport(host) if port or socket.gethostbyname(host) not in self.get_names(): - raise urllib2.URLError('file not on local host') + raise URLError('file not on local host') fo = open(localfile,'rb') brange = req.headers.get('Range',None) brange = range_header_to_tuple(brange) @@ -238,35 +254,27 @@ class FileRangeHandler(urllib2.FileHandler): (fb,lb) = brange if lb == '': lb = size if fb < 0 or fb > size or lb > size: - raise RangeError('Requested Range Not Satisfiable') + raise RangeError(9, 'Requested Range Not Satisfiable') size = (lb - fb) fo = RangeableFileObject(fo, (fb,lb)) - headers = mimetools.Message(StringIO( + headers = email.message_from_string( 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % - (mtype or 'text/plain', size, modified))) + (mtype or 'text/plain', size, modified)) return urllib.addinfourl(fo, headers, 'file:'+file) -# FTP Range Support +# FTP Range Support # Unfortunately, a large amount of base FTP code had to be copied # from urllib and urllib2 in order to insert the FTP REST command. 
-# Code modifications for range support have been commented as +# Code modifications for range support have been commented as # follows: # -- range support modifications start/end here -from urllib import splitport, splituser, splitpasswd, splitattr, \ - unquote, addclosehook, addinfourl -import ftplib -import socket -import sys -import mimetypes -import mimetools - -class FTPRangeHandler(urllib2.FTPHandler): +class FTPRangeHandler(FTPHandler): def ftp_open(self, req): host = req.get_host() if not host: - raise IOError, ('ftp error', 'no host given') + raise IOError('ftp error', 'no host given') host, port = splitport(host) if port is None: port = ftplib.FTP_PORT @@ -282,11 +290,11 @@ class FTPRangeHandler(urllib2.FTPHandler): host = unquote(host) user = unquote(user or '') passwd = unquote(passwd or '') - + try: host = socket.gethostbyname(host) - except socket.error, msg: - raise urllib2.URLError(msg) + except socket.error as msg: + raise URLError(msg) path, attrs = splitattr(req.get_selector()) dirs = path.split('/') dirs = map(unquote, dirs) @@ -301,34 +309,34 @@ class FTPRangeHandler(urllib2.FTPHandler): if attr.lower() == 'type' and \ value in ('a', 'A', 'i', 'I', 'd', 'D'): type = value.upper() - + # -- range support modifications start here rest = None - range_tup = range_header_to_tuple(req.headers.get('Range',None)) + range_tup = range_header_to_tuple(req.headers.get('Range',None)) assert range_tup != () if range_tup: (fb,lb) = range_tup if fb > 0: rest = fb # -- range support modifications end here - + fp, retrlen = fw.retrfile(file, type, rest) - + # -- range support modifications start here if range_tup: (fb,lb) = range_tup - if lb == '': + if lb == '': if retrlen is None or retrlen == 0: - raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.') + raise RangeError(9, 'Requested Range Not Satisfiable due to unobtainable file length.') lb = retrlen retrlen = lb - fb if retrlen < 0: # beginning of range is larger than file - raise RangeError('Requested Range Not Satisfiable') + raise RangeError(9, 'Requested Range Not Satisfiable') else: retrlen = lb - fb fp = RangeableFileObject(fp, (0,retrlen)) # -- range support modifications end here - + headers = "" mtype = mimetypes.guess_type(req.get_full_url())[0] if mtype: @@ -338,14 +346,14 @@ class FTPRangeHandler(urllib2.FTPHandler): sf = StringIO(headers) headers = mimetools.Message(sf) return addinfourl(fp, headers, req.get_full_url()) - except ftplib.all_errors, msg: - raise IOError, ('ftp error', msg), sys.exc_info()[2] + except ftplib.all_errors as msg: + raise IOError('ftp error', msg).with_traceback(sys.exc_info()[2]) def connect_ftp(self, user, passwd, host, port, dirs): fw = ftpwrapper(user, passwd, host, port, dirs) return fw -class ftpwrapper(urllib.ftpwrapper): +class ftpwrapper(urllib_ftpwrapper): # range support note: # this ftpwrapper code is copied directly from # urllib. The only enhancement is to add the rest @@ -364,22 +372,22 @@ class ftpwrapper(urllib.ftpwrapper): # Use nlst to see if the file exists at all try: self.ftp.nlst(file) - except ftplib.error_perm, reason: - raise IOError, ('ftp error', reason), sys.exc_info()[2] + except ftplib.error_perm as reason: + raise IOError('ftp error', reason).with_traceback(sys.exc_info()[2]) # Restore the transfer mode! 
self.ftp.voidcmd(cmd) # Try to retrieve as a file try: cmd = 'RETR ' + file conn = self.ftp.ntransfercmd(cmd, rest) - except ftplib.error_perm, reason: + except ftplib.error_perm as reason: if str(reason)[:3] == '501': # workaround for REST not supported error fp, retrlen = self.retrfile(file, type) fp = RangeableFileObject(fp, (rest,'')) return (fp, retrlen) elif str(reason)[:3] != '550': - raise IOError, ('ftp error', reason), sys.exc_info()[2] + raise IOError('ftp error', reason).with_traceback(sys.exc_info()[2]) if not conn: # Set transfer mode to ASCII! self.ftp.voidcmd('TYPE A') @@ -400,17 +408,17 @@ class ftpwrapper(urllib.ftpwrapper): _rangere = None def range_header_to_tuple(range_header): """Get a (firstbyte,lastbyte) tuple from a Range header value. - + Range headers have the form "bytes=<firstbyte>-<lastbyte>". This function pulls the firstbyte and lastbyte values and returns a (firstbyte,lastbyte) tuple. If lastbyte is not specified in the header value, it is returned as an empty string in the tuple. - + Return None if range_header is None - Return () if range_header does not conform to the range spec + Return () if range_header does not conform to the range spec pattern. - + """ global _rangere if range_header is None: return None @@ -418,9 +426,9 @@ def range_header_to_tuple(range_header): import re _rangere = re.compile(r'^bytes=(\d{1,})-(\d*)') match = _rangere.match(range_header) - if match: + if match: tup = range_tuple_normalize(match.group(1,2)) - if tup and tup[1]: + if tup and tup[1]: tup = (tup[0],tup[1]+1) return tup return () @@ -433,16 +441,16 @@ def range_tuple_to_header(range_tup): if range_tup is None: return None range_tup = range_tuple_normalize(range_tup) if range_tup: - if range_tup[1]: + if range_tup[1]: range_tup = (range_tup[0],range_tup[1] - 1) return 'bytes=%s-%s' % range_tup - + def range_tuple_normalize(range_tup): """Normalize a (first_byte,last_byte) range tuple. Return a tuple whose first element is guaranteed to be an int - and whose second element will be '' (meaning: the last byte) or + and whose second element will be '' (meaning: the last byte) or an int. Finally, return None if the normalized tuple == (0,'') - as that is equivelant to retrieving the entire file. + as that is equivalent to retrieving the entire file. """ if range_tup is None: return None # handle first byte @@ -452,12 +460,13 @@ def range_tuple_normalize(range_tup): # handle last byte try: lb = range_tup[1] except IndexError: lb = '' - else: + else: if lb is None: lb = '' elif lb != '': lb = int(lb) # check if range is over the entire file if (fb,lb) == (0,''): return None # check that the range is valid - if lb < fb: raise RangeError('Invalid byte range: %s-%s' % (fb,lb)) + if lb != '' and fb >= lb: + raise RangeError(9, 'Invalid byte range: %s-%s' % (fb,lb)) return (fb,lb) diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py index e090e90..b72d089 100644..100755 --- a/urlgrabber/grabber.py +++ b/urlgrabber/grabber.py @@ -9,15 +9,17 @@ # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber # Copyright 2002-2004 Michael D. 
Stenner, Ryan Tomayko # Copyright 2009 Red Hat inc, pycurl code written by Seth Vidal +from __future__ import print_function + """A high-level cross-protocol url-grabber. GENERAL ARGUMENTS (kwargs) @@ -35,7 +37,7 @@ GENERAL ARGUMENTS (kwargs) close_connection = 0 [0|1] tells URLGrabber to close the connection after a file has been - transfered. This is ignored unless the download happens with the + transferred. This is ignored unless the download happens with the http keepalive handler (keepalive=1). Otherwise, the connection is left open for further use. The module level default for this option is 0 (keepalive connections will not be closed). @@ -49,13 +51,46 @@ GENERAL ARGUMENTS (kwargs) progress_obj = None a class instance that supports the following methods: - po.start(filename, url, basename, length, text) + po.start(filename, url, basename, size, now, text) # length will be None if unknown po.update(read) # read == bytes read so far po.end() + multi_progress_obj = None + + a class instance that supports the following methods: + mo.start(total_files, total_size) + mo.newMeter() => meter + mo.removeMeter(meter) + mo.end() + + The 'meter' object is similar to progress_obj, but multiple + instances may be created and updated at the same time. + + When downloading multiple files in parallel and multi_progress_obj + is None progress_obj is used in compatibility mode: finished files + are shown but there's no in-progress display. + + curl_obj = None + + a pycurl.Curl instance to be used instead of the default module-level + instance. + + Note that you don't have to configure the passed instance in any way; + urlgrabber will do all the necessary work. + + This option exists primarily to allow using urlgrabber from multiple + threads in your application, in which case you would want to instantiate a + fresh Curl object for each thread, to avoid race conditions. See the curl + documentation on thread safety for more information: + https://curl.haxx.se/libcurl/c/threadsafe.html + + Note that connection reuse (keepalive=1) is limited to the Curl instance it + was enabled on so if you're using multiple instances in your application, + connections won't be shared among them. + text = None - + specifies alternative text to be passed to the progress meter object. If not given, the default progress meter will use the basename of the file. @@ -68,14 +103,20 @@ GENERAL ARGUMENTS (kwargs) (which can be set on default_grabber.throttle) is used. See BANDWIDTH THROTTLING for more information. - timeout = None + timeout = 300 + + a positive integer expressing the number of seconds to wait before + timing out attempts to connect to a server. If the value is None + or 0, connection attempts will not time out. The timeout is passed + to the underlying pycurl object as its CONNECTTIMEOUT option, see + the curl documentation on CURLOPT_CONNECTTIMEOUT for more information. + http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT - a positive float expressing the number of seconds to wait for socket - operations. If the value is None or 0.0, socket operations will block - forever. Setting this option causes urlgrabber to call the settimeout - method on the Socket object used for the request. See the Python - documentation on settimeout for more information. - http://www.python.org/doc/current/lib/socket-objects.html + minrate = 1000 + + This sets the low speed threshold in bytes per second. 
If the server + is sending data slower than this for at least `timeout' seconds, the + library aborts the connection. bandwidth = 0 @@ -91,11 +132,11 @@ GENERAL ARGUMENTS (kwargs) range to retrieve. Either or both of the values may set to None. If first_byte is None, byte offset 0 is assumed. If last_byte is None, the last byte available is assumed. Note that - the range specification is python-like in that (0,10) will yeild + the range specification is python-like in that (0,10) will yield the first 10 bytes of the file. If set to None, no range will be used. - + reget = None [None|'simple'|'check_timestamp'] whether to attempt to reget a partially-downloaded file. Reget @@ -143,8 +184,18 @@ GENERAL ARGUMENTS (kwargs) note that proxy authentication information may be provided using normal URL constructs: proxies={ 'http' : 'http://user:host@foo:3128' } - Lastly, if proxies is None, the default environment settings will - be used. + + libproxy = False + + Use the libproxy module (if installed) to find proxies. + The libproxy code is only used if the proxies dictionary + does not provide any proxies. + + no_cache = False + + When True, server-side cache will be disabled for http and https + requests. This is equivalent to setting + http_headers = (('Pragma', 'no-cache'),) prefix = None @@ -175,7 +226,7 @@ GENERAL ARGUMENTS (kwargs) option. Note that python 2.2 handles the case of these badly and if you do not use the proper case (shown here), your values will be overridden with the defaults. - + urlparser = URLParser() The URLParser class handles pre-processing of URLs, including @@ -198,6 +249,12 @@ GENERAL ARGUMENTS (kwargs) control, you should probably subclass URLParser and pass it in via the 'urlparser' option. + username = None + username to use for simple http auth - is automatically quoted for special characters + + password = None + password to use for simple http auth - is automatically quoted for special characters + ssl_ca_cert = None this option can be used if M2Crypto is available and will be @@ -209,45 +266,84 @@ GENERAL ARGUMENTS (kwargs) ssl_context = None No-op when using the curl backend (default) - - self.ssl_verify_peer = True + + ssl_verify_peer = True Check the server's certificate to make sure it is valid with what our CA validates - - self.ssl_verify_host = True + + ssl_verify_host = True Check the server's hostname to make sure it matches the certificate DN - self.ssl_key = None + ssl_key = None Path to the key the client should use to connect/authenticate with - self.ssl_key_type = 'PEM' + ssl_key_type = 'PEM' PEM or DER - format of key - - self.ssl_cert = None + + ssl_cert = None Path to the ssl certificate the client should use to to authenticate with - self.ssl_cert_type = 'PEM' + ssl_cert_type = 'PEM' PEM or DER - format of certificate - - self.ssl_key_pass = None + + ssl_key_pass = None password to access the ssl_key - - self.size = None - size (in bytes) or Maximum size of the thing being downloaded. + size = None + + size (in bytes) or Maximum size of the thing being downloaded. This is mostly to keep us from exploding with an endless datastream - - self.max_header_size = 2097152 + + max_header_size = 2097152 Maximum size (in bytes) of the headers. - + + ip_resolve = 'whatever' + + What type of name to IP resolving to use, default is to do both IPV4 and + IPV6. + + async_ = (key, limit) + + When this option is set, the urlgrab() is not processed immediately + but queued. 
parallel_wait() then processes grabs in parallel, limiting + the numer of connections in each 'key' group to at most 'limit'. + + max_connections + + The global connection limit. + + timedhosts + + The filename of the host download statistics. If defined, urlgrabber + will update the stats at the end of every download. At the end of + parallel_wait(), the updated stats are saved. If synchronous grabs + are used, you should call th_save(). + + default_speed, half_life + + These options only affect the async mirror selection code. + The default_speed option sets the speed estimate for mirrors + we have never downloaded from, and defaults to 1 MBps. + + The speed estimate also drifts exponentially from the speed + actually measured to the default speed, with default + period of 30 days. + + ftp_disable_epsv = False + + False, True + + This options disables Extended Passive Mode (the EPSV command) + which does not work correctly on some buggy ftp servers. + RETRY RELATED ARGUMENTS @@ -271,7 +367,7 @@ RETRY RELATED ARGUMENTS retrycodes = urlgrabber.grabber.URLGrabberOptions().retrycodes if 12 not in retrycodes: retrycodes.append(12) - + checkfunc = None a function to do additional checks. This defaults to None, which @@ -302,7 +398,7 @@ RETRY RELATED ARGUMENTS function(obj, 'arg1', 2, kwarg=3) # obj.filename = '/tmp/stuff' # obj.url = 'http://foo.com/stuff' - + NOTE: both the "args" tuple and "kwargs" dict must be present if you use this syntax, but either (or both) can be empty. @@ -313,10 +409,11 @@ RETRY RELATED ARGUMENTS identical to checkfunc, except for the attributes defined in the CallbackObject instance. The attributes for failure_callback are: - exception = the raised exception - url = the url we're trying to fetch - tries = the number of tries so far (including this one) - retry = the value of the retry option + exception = the raised exception + url = the url we're trying to fetch + tries = the number of tries so far (including this one) + retry = the value of the retry option + retry_no_cache = the value of the retry_no_cache option The callback is present primarily to inform the calling program of the failure, but if it raises an exception (including the one it's @@ -328,6 +425,15 @@ RETRY RELATED ARGUMENTS but it cannot (without severe trickiness) prevent the exception from being raised. + failfunc = None + + The callback that gets called when urlgrab request fails. + If defined, urlgrab() calls it instead of raising URLGrabError. + Callback syntax is identical to failure_callback. + + Contrary to failure_callback, it's called only once. It's primary + purpose is to use urlgrab() without a try/except block. + interrupt_callback = None This callback is called if KeyboardInterrupt is received at any @@ -351,7 +457,20 @@ RETRY RELATED ARGUMENTS This callback is very similar to failure_callback. They are passed the same arguments, so you could use the same function for both. - + + retry_no_cache = False + + When True, automatically enable no_cache for future retries if + checkfunc performs an unsuccessful check. + + This option is useful if your application expects a set of files + from the same server to form an atomic unit and you write your + checkfunc to ensure each file being downloaded belongs to such a + unit. If transparent proxy caching is in effect, the files can + become out-of-sync, disrupting the atomicity. Enabling this option + will prevent that, while ensuring that you still enjoy the benefits + of caching when possible. 
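Editor's note: as a rough illustration of how the retry, checkfunc and retry_no_cache options documented above fit together, here is a minimal sketch. The URL, target path and expected size are placeholders for the example and are not part of the commit; the checkfunc tuple form and the negative-errno retry convention follow the option descriptions and the test suite in this patch.

    import os
    from urlgrabber.grabber import URLGrabber, URLGrabError

    def check_size(cb_obj, expected_size):
        # cb_obj is a CallbackObject; .filename is set because urlgrab() was used
        if os.path.getsize(cb_obj.filename) != expected_size:
            # a negative errno asks the grabber to retry (up to 'retry' times)
            raise URLGrabError(-2, 'size mismatch, retrying')

    g = URLGrabber(retry=3,
                   retry_no_cache=True,   # bypass proxy caches on later attempts
                   checkfunc=(check_size, (1048576,), {}))
    try:
        g.urlgrab('http://example.com/some/file.iso', '/tmp/file.iso')
    except URLGrabError as e:
        print('download failed after retries: %s' % e)
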
+ BANDWIDTH THROTTLING urlgrabber supports throttling via two values: throttle and @@ -368,6 +487,11 @@ BANDWIDTH THROTTLING is a float and bandwidth == 0, throttling is disabled. If None, the module-level default (which can be set with set_bandwidth) is used. + Note that when multiple downloads run simultaneously (multiprocessing + or the parallel urlgrab() feature is used) the total bandwidth might + exceed the throttle limit. You may want to also set max_connections=1 + or scale your throttle option down accordingly. + THROTTLING EXAMPLES: Lets say you have a 100 Mbps connection. This is (about) 10^8 bits @@ -411,25 +535,64 @@ BANDWIDTH THROTTLING """ - - import os import sys -import urlparse import time import string import urllib -import urllib2 -import mimetools -import thread import types import stat import pycurl from ftplib import parse150 -from StringIO import StringIO -from httplib import HTTPException -import socket -from byterange import range_tuple_normalize, range_tuple_to_header, RangeError +import socket, select, fcntl +from io import BytesIO +import numbers + +try: + import urllib.parse as urlparse + urlquote, urlunquote = urlparse.quote, urlparse.unquote + from urllib.request import HTTPError, url2pathname, pathname2url +except ImportError: + import urlparse + from urllib2 import HTTPError + urlquote, urlunquote = urllib.quote, urllib.unquote + from urllib import url2pathname, pathname2url + +try: + from http.client import responses, HTTPException +except ImportError: + from httplib import responses, HTTPException + +if sys.version_info >= (3,): + # We do an explicit version check here because because python2 + # also has an io module with StringIO, but it is incompatible, + # and returns str instead of unicode somewhere. + from io import StringIO +else: + from cStringIO import StringIO + +from six import text_type, string_types + +from .byterange import range_tuple_normalize, range_tuple_to_header, RangeError + +try: + import xattr + if not hasattr(xattr, 'set'): + xattr = None # This is a "newer" API. +except ImportError: + xattr = None + +def _bytes_repr(s): + "A wrapper to avoid the b'' that python3 insists on when printing bytes" + if isinstance(s, string_types): + return s + else: + return repr(s)[2:-1] + +def _urlunquote_convert(s): + if not isinstance(s, text_type): + s = s.decode('utf8') + return urlunquote(s) ######################################################################## # MODULE INITIALIZATION @@ -439,6 +602,12 @@ try: except: __version__ = '???' +try: + # this part isn't going to do much - need to talk to gettext + from i18n import _ +except ImportError as msg: + def _(st): return st + ######################################################################## # functions for debugging output. These functions are here because they # are also part of the module initialization. @@ -468,7 +637,7 @@ def _init_default_logger(logspec=None): the form URLGRABBER_DEBUG=level,filename - + where "level" can be either an integer or a log level from the logging module (DEBUG, INFO, etc). If the integer is zero or less, logging will be disabled. Filename is the filename where @@ -481,8 +650,8 @@ def _init_default_logger(logspec=None): URLGRABBER_DEBUG=1,debug.txt # log everything to debug.txt URLGRABBER_DEBUG=WARNING,- # log warning and higher to stdout URLGRABBER_DEBUG=INFO # log info and higher to stderr - - This funtion is called during module initialization. It is not + + This function is called during module initialization. 
It is not intended to be called from outside. The only reason it is a function at all is to keep the module-level namespace tidy and to collect the code into a nice block.''' @@ -492,8 +661,11 @@ def _init_default_logger(logspec=None): logspec = os.environ['URLGRABBER_DEBUG'] dbinfo = logspec.split(',') import logging - level = logging._levelNames.get(dbinfo[0], None) - if level is None: level = int(dbinfo[0]) + if sys.version_info.major == 2: + level = logging._levelNames.get(dbinfo[0], None) + else: + level = logging.getLevelName(dbinfo[0]) + if level is None or not isinstance(level, int): level = int(dbinfo[0]) if level < 1: raise ValueError() formatter = logging.Formatter('%(asctime)s %(message)s') @@ -504,6 +676,7 @@ def _init_default_logger(logspec=None): else: handler = logging.FileHandler(filename) handler.setFormatter(formatter) DBOBJ = logging.getLogger('urlgrabber') + DBOBJ.propagate = False DBOBJ.addHandler(handler) DBOBJ.setLevel(level) except (KeyError, ImportError, ValueError): @@ -512,9 +685,9 @@ def _init_default_logger(logspec=None): def _log_package_state(): if not DEBUG: return - DEBUG.info('urlgrabber version = %s' % __version__) - DEBUG.info('trans function "_" = %s' % _) - + DEBUG.debug('urlgrabber version = %s' % __version__) + DEBUG.debug('trans function "_" = %s' % _) + _init_default_logger() _log_package_state() @@ -527,6 +700,29 @@ def _(st): # END MODULE INITIALIZATION ######################################################################## +######################################################################## +# UTILITY FUNCTIONS +######################################################################## + +# These functions are meant to be utilities for the urlgrabber library to use. + +def _to_utf8(obj, errors='replace'): + '''convert 'unicode' to an encoded utf-8 byte string ''' + # stolen from yum.i18n + if isinstance(obj, text_type): + obj = obj.encode('utf-8', errors) + return obj + +def exception2msg(e): + try: + return str(e) + except UnicodeEncodeError: + # always use byte strings + return text_type(e).encode('utf8') + +######################################################################## +# END UTILITY FUNCTIONS +######################################################################## class URLGrabError(IOError): @@ -551,7 +747,7 @@ class URLGrabError(IOError): 14 - HTTPError (includes .code and .exception attributes) 15 - user abort 16 - error writing to local file - + MirrorGroup error codes (256 -- 511) 256 - No more mirrors left to try @@ -562,7 +758,7 @@ class URLGrabError(IOError): -1 - retry the download, unknown reason Note: to test which group a code is in, you can simply do integer - division by 256: e.errno / 256 + division by 256: e.errno // 256 Negative codes are reserved for use by functions passed in to retrygrab with checkfunc. The value -1 is built in as a generic @@ -606,7 +802,7 @@ def urlgrab(url, filename=None, **kwargs): If filename is none, the basename of the url is used. urlgrab returns the filename of the local file, which may be different from the passed-in filename if the copy_local kwarg == 0. - + See module documentation for a description of possible kwargs. """ return default_grabber.urlgrab(url, filename, **kwargs) @@ -616,7 +812,7 @@ def urlopen(url, **kwargs): If a progress object or throttle specifications exist, then a special file object will be returned that supports them. The file object can be treated like any other file object. - + See module documentation for a description of possible kwargs. 
""" return default_grabber.urlopen(url, **kwargs) @@ -626,7 +822,7 @@ def urlread(url, limit=None, **kwargs): If the limit is exceeded, an exception will be thrown. Note that urlread is NOT intended to be used as a way of saying "I want the first N bytes" but rather 'read the whole file into memory, but don't use too much' - + See module documentation for a description of possible kwargs. """ return default_grabber.urlread(url, limit, **kwargs) @@ -662,37 +858,41 @@ class URLParser: opts.quote = 0 --> do not quote it opts.quote = None --> guess """ + url = _to_utf8(url) quote = opts.quote - + if opts.prefix: url = self.add_prefix(url, opts.prefix) - + parts = urlparse.urlparse(url) (scheme, host, path, parm, query, frag) = parts if not scheme or (len(scheme) == 1 and scheme in string.letters): # if a scheme isn't specified, we guess that it's "file:" - if url[0] not in '/\\': url = os.path.abspath(url) - url = 'file:' + urllib.pathname2url(url) + if url[0] not in b'/\\': url = os.path.abspath(url) + pathname = pathname2url(url) + if not isinstance(pathname, bytes): + pathname = pathname.encode('utf8') + url = b'file:' + pathname parts = urlparse.urlparse(url) quote = 0 # pathname2url quotes, so we won't do it again - - if scheme in ['http', 'https']: + + if scheme in [b'http', b'https']: parts = self.process_http(parts, url) - + if quote is None: quote = self.guess_should_quote(parts) if quote: parts = self.quote(parts) - + url = urlparse.urlunparse(parts) return url, parts def add_prefix(self, url, prefix): - if prefix[-1] == '/' or url[0] == '/': + if prefix.endswith(b'/') or url.startswith(b'/'): url = prefix + url else: - url = prefix + '/' + url + url = prefix + b'/' + url return url def process_http(self, parts, url): @@ -709,8 +909,10 @@ class URLParser: passing into urlgrabber. """ (scheme, host, path, parm, query, frag) = parts - path = urllib.quote(path) - return (scheme, host, path, parm, query, frag) + newpath = urlquote(path, safe='/$') + if not isinstance(path, text_type) and isinstance(newpath, text_type): + newpath = newpath.encode('utf8') + return (scheme, host, newpath, parm, query, frag) hexvals = '0123456789ABCDEF' def guess_should_quote(self, parts): @@ -724,9 +926,11 @@ class URLParser: else -> 1 """ (scheme, host, path, parm, query, frag) = parts + if not isinstance(path, text_type): + path = path.decode('utf8') if ' ' in path: return 1 - ind = string.find(path, '%') + ind = path.find('%') if ind > -1: while ind > -1: if len(path) < ind+3: @@ -735,10 +939,10 @@ class URLParser: if code[0] not in self.hexvals or \ code[1] not in self.hexvals: return 1 - ind = string.find(path, '%', ind+1) + ind = path.find('%', ind+1) return 0 return 1 - + class URLGrabberOptions: """Class to ease kwargs handling.""" @@ -751,70 +955,116 @@ class URLGrabberOptions: if delegate is None: self._set_defaults() self._set_attributes(**kwargs) - + def __getattr__(self, name): if self.delegate and hasattr(self.delegate, name): return getattr(self.delegate, name) - raise AttributeError, name - + raise AttributeError(name) + def raw_throttle(self): - """Calculate raw throttle value from throttle and bandwidth + """Calculate raw throttle value from throttle and bandwidth values. """ - if self.throttle <= 0: + if self.throttle <= 0: return 0 - elif type(self.throttle) == type(0): + elif isinstance(self.throttle, int): return float(self.throttle) else: # throttle is a float return self.bandwidth * self.throttle - + + def find_proxy(self, url, scheme): + """Find the proxy to use for this URL. 
+ Use the proxies dictionary first, then libproxy. + """ + self.proxy = None + if scheme not in ('ftp', 'http', 'https'): + return + + if self.proxies: + proxy = self.proxies.get(scheme) + if proxy is None: + if scheme == 'http': + proxy = self.proxies.get('https') + elif scheme == 'https': + proxy = self.proxies.get('http') + if proxy == '_none_': + proxy = '' + self.proxy = proxy + return + + if self.libproxy: + global _libproxy_cache + if _libproxy_cache is None: + try: + import libproxy + _libproxy_cache = libproxy.ProxyFactory() + except: + _libproxy_cache = False + if _libproxy_cache: + for proxy in _libproxy_cache.getProxies(url): + if proxy.startswith('http://'): + if DEBUG: DEBUG.info('using proxy "%s" for url %s' % (proxy, url)) + self.proxy = proxy + break + def derive(self, **kwargs): """Create a derived URLGrabberOptions instance. This method creates a new instance and overrides the options specified in kwargs. """ return URLGrabberOptions(delegate=self, **kwargs) - + def _set_attributes(self, **kwargs): """Update object attributes with those provided in kwargs.""" self.__dict__.update(kwargs) - if kwargs.has_key('range'): + if 'range' in kwargs: # normalize the supplied range value self.range = range_tuple_normalize(self.range) + if 'async' in kwargs: + self.async_ = self.__dict__.pop('async') if not self.reget in [None, 'simple', 'check_timestamp']: - raise URLGrabError(11, _('Illegal reget mode: %s') \ - % (self.reget, )) + raise URLGrabError(11, _('Illegal reget mode: %s') + % (self.reget,)) def _set_defaults(self): - """Set all options to their default values. + """Set all options to their default values. When adding new options, make sure a default is provided here. """ self.progress_obj = None + self.multi_progress_obj = None + self.curl_obj = None self.throttle = 1.0 self.bandwidth = 0 self.retry = None self.retrycodes = [-1,2,4,5,6,7] self.checkfunc = None + self.failfunc = _do_raise self.copy_local = 0 self.close_connection = 0 self.range = None self.user_agent = 'urlgrabber/%s' % __version__ + self.ip_resolve = None self.keepalive = 1 self.proxies = None + self.libproxy = False + self.proxy = None self.reget = None self.failure_callback = None self.interrupt_callback = None self.prefix = None self.opener = None self.cache_openers = True - self.timeout = None + self.timeout = 300 + self.minrate = None self.text = None self.http_headers = None self.ftp_headers = None self.data = None self.urlparser = URLParser() self.quote = None + self.username = None + self.password = None self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb self.ssl_context = None # no-op in pycurl self.ssl_verify_peer = True # check peer's cert for authenticityb @@ -827,10 +1077,19 @@ class URLGrabberOptions: self.size = None # if we know how big the thing we're getting is going # to be. 
this is ultimately a MAXIMUM size for the file self.max_header_size = 2097152 #2mb seems reasonable for maximum header size - + self.async_ = None # blocking by default + self.mirror_group = None + self.max_connections = 5 + self.timedhosts = None + self.half_life = 30*24*60*60 # 30 days + self.default_speed = 500e3 # 500 kBps + self.ftp_disable_epsv = False + self.no_cache = False + self.retry_no_cache = False + def __repr__(self): return self.format() - + def format(self, indent=' '): keys = self.__dict__.keys() if self.delegate is not None: @@ -838,29 +1097,39 @@ class URLGrabberOptions: keys.sort() s = '{\n' for k in keys: - s = s + indent + '%-15s: %s,\n' % \ - (repr(k), repr(self.__dict__[k])) + s = s + indent + '%-15r: %r,\n' % (k, self.__dict__[k]) if self.delegate: df = self.delegate.format(indent + ' ') s = s + indent + '%-15s: %s\n' % ("'delegate'", df) s = s + indent + '}' return s -class URLGrabber: +def _do_raise(obj): + raise obj.exception + +def _run_callback(cb, obj): + if not cb: + return + if callable(cb): + return cb(obj) + cb, arg, karg = cb + return cb(obj, *arg, **karg) + +class URLGrabber(object): """Provides easy opening of URLs with a variety of options. - + All options are specified as kwargs. Options may be specified when the class is created and may be overridden on a per request basis. - + New objects inherit default values from default_grabber. """ - + def __init__(self, **kwargs): self.opts = URLGrabberOptions(**kwargs) - + def _retry(self, opts, func, *args): tries = 0 - while 1: + while True: # there are only two ways out of this loop. The second has # several "sub-ways" # 1) via the return in the "try" block @@ -872,122 +1141,142 @@ class URLGrabber: # beware of infinite loops :) tries = tries + 1 exception = None - retrycode = None callback = None if DEBUG: DEBUG.info('attempt %i/%s: %s', tries, opts.retry, args[0]) try: - r = apply(func, (opts,) + args, {}) + r = func(opts, *args) if DEBUG: DEBUG.info('success') return r - except URLGrabError, e: + except URLGrabError as e: exception = e callback = opts.failure_callback - retrycode = e.errno - except KeyboardInterrupt, e: + except KeyboardInterrupt as e: exception = e callback = opts.interrupt_callback + if not callback: + raise if DEBUG: DEBUG.info('exception: %s', exception) if callback: if DEBUG: DEBUG.info('calling callback: %s', callback) - cb_func, cb_args, cb_kwargs = self._make_callback(callback) obj = CallbackObject(exception=exception, url=args[0], - tries=tries, retry=opts.retry) - cb_func(obj, *cb_args, **cb_kwargs) + tries=tries, retry=opts.retry, + retry_no_cache=opts.retry_no_cache) + _run_callback(callback, obj) if (opts.retry is None) or (tries == opts.retry): if DEBUG: DEBUG.info('retries exceeded, re-raising') - raise + raise exception + retrycode = getattr(exception, 'errno', None) if (retrycode is not None) and (retrycode not in opts.retrycodes): if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising', retrycode, opts.retrycodes) - raise - - def urlopen(self, url, **kwargs): + raise exception + if retrycode is not None and retrycode < 0 and opts.retry_no_cache: + opts.no_cache = True + + def urlopen(self, url, opts=None, **kwargs): """open the url and return a file object - If a progress object or throttle value specified when this - object was created, then a special file object will be - returned that supports them. 
The file object can be treated + If a progress object or throttle value specified when this + object was created, then a special file object will be + returned that supports them. The file object can be treated like any other file object. """ - opts = self.opts.derive(**kwargs) - if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) - (url,parts) = opts.urlparser.parse(url, opts) + url = _to_utf8(url) + opts = (opts or self.opts).derive(**kwargs) + if DEBUG: DEBUG.debug('combined options: %r' % (opts,)) + (url,parts) = opts.urlparser.parse(url, opts) + opts.find_proxy(url, parts[0]) def retryfunc(opts, url): return PyCurlFileObject(url, filename=None, opts=opts) return self._retry(opts, retryfunc, url) - - def urlgrab(self, url, filename=None, **kwargs): + + def urlgrab(self, url, filename=None, opts=None, **kwargs): """grab the file at <url> and make a local copy at <filename> If filename is none, the basename of the url is used. - urlgrab returns the filename of the local file, which may be + urlgrab returns the filename of the local file, which may be different from the passed-in filename if copy_local == 0. """ - opts = self.opts.derive(**kwargs) - if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) - (url,parts) = opts.urlparser.parse(url, opts) + url = _to_utf8(url) + opts = (opts or self.opts).derive(**kwargs) + if DEBUG: DEBUG.debug('combined options: %r' % (opts,)) + (url,parts) = opts.urlparser.parse(url, opts) (scheme, host, path, parm, query, frag) = parts + opts.find_proxy(url, scheme) if filename is None: - filename = os.path.basename( urllib.unquote(path) ) + filename = os.path.basename(_urlunquote_convert(path)) + if not filename: + # This is better than nothing. + filename = 'index.html' if scheme == 'file' and not opts.copy_local: - # just return the name of the local file - don't make a + # just return the name of the local file - don't make a # copy currently - path = urllib.url2pathname(path) + path = url2pathname(path) if host: path = os.path.normpath('//' + host + path) if not os.path.exists(path): - err = URLGrabError(2, + err = URLGrabError(2, _('Local file does not exist: %s') % (path, )) err.url = url raise err elif not os.path.isfile(path): - err = URLGrabError(3, + err = URLGrabError(3, _('Not a normal file: %s') % (path, )) err.url = url raise err elif not opts.range: if not opts.checkfunc is None: - cb_func, cb_args, cb_kwargs = \ - self._make_callback(opts.checkfunc) - obj = CallbackObject() - obj.filename = path - obj.url = url - apply(cb_func, (obj, )+cb_args, cb_kwargs) + obj = CallbackObject(filename=path, url=url) + _run_callback(opts.checkfunc, obj) return path - + + if opts.async_: + opts.url = url + opts.filename = filename + opts.size = int(opts.size or 0) + _async_queue.append(opts) + return filename + def retryfunc(opts, url, filename): fo = PyCurlFileObject(url, filename, opts) try: fo._do_grab() + if fo._tm_last: + dlsz = fo._tm_last[0] - fo._tm_first[0] + dltm = fo._tm_last[1] - fo._tm_first[1] + _TH.update(url, dlsz, dltm, None) if not opts.checkfunc is None: - cb_func, cb_args, cb_kwargs = \ - self._make_callback(opts.checkfunc) - obj = CallbackObject() - obj.filename = filename - obj.url = url - apply(cb_func, (obj, )+cb_args, cb_kwargs) + obj = CallbackObject(filename=filename, url=url) + _run_callback(opts.checkfunc, obj) finally: fo.close() return filename - - return self._retry(opts, retryfunc, url, filename) - - def urlread(self, url, limit=None, **kwargs): + + try: + return self._retry(opts, retryfunc, url, filename) + 
except URLGrabError as e: + _TH.update(url, 0, 0, e) + opts.exception = e + return _run_callback(opts.failfunc, opts) + + def urlread(self, url, limit=None, opts=None, **kwargs): """read the url into a string, up to 'limit' bytes If the limit is exceeded, an exception will be thrown. Note - that urlread is NOT intended to be used as a way of saying - "I want the first N bytes" but rather 'read the whole file + that urlread is NOT intended to be used as a way of saying + "I want the first N bytes" but rather 'read the whole file into memory, but don't use too much' """ - opts = self.opts.derive(**kwargs) - if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) - (url,parts) = opts.urlparser.parse(url, opts) + url = _to_utf8(url) + opts = (opts or self.opts).derive(**kwargs) + if DEBUG: DEBUG.debug('combined options: %r' % (opts,)) + (url,parts) = opts.urlparser.parse(url, opts) + opts.find_proxy(url, parts[0]) if limit is not None: limit = limit + 1 - + def retryfunc(opts, url, limit): fo = PyCurlFileObject(url, filename=None, opts=opts) s = '' @@ -1000,26 +1289,23 @@ class URLGrabber: else: s = fo.read(limit) if not opts.checkfunc is None: - cb_func, cb_args, cb_kwargs = \ - self._make_callback(opts.checkfunc) - obj = CallbackObject() - obj.data = s - obj.url = url - apply(cb_func, (obj, )+cb_args, cb_kwargs) + obj = CallbackObject(data=s, url=url) + _run_callback(opts.checkfunc, obj) finally: fo.close() return s - + s = self._retry(opts, retryfunc, url, limit) if limit and len(s) > limit: - err = URLGrabError(8, + err = URLGrabError(8, _('Exceeded limit (%i): %s') % (limit, url)) err.url = url raise err return s - + def _make_callback(self, callback_obj): + # not used, left for compatibility if callable(callback_obj): return callback_obj, (), {} else: @@ -1030,10 +1316,10 @@ class URLGrabber: default_grabber = URLGrabber() -class PyCurlFileObject(): +class PyCurlFileObject(object): def __init__(self, url, filename, opts): self.fo = None - self._hdr_dump = '' + self._hdr_dump = b'' self._parsed_hdr = None self.url = url self.scheme = urlparse.urlsplit(self.url)[0] @@ -1042,20 +1328,24 @@ class PyCurlFileObject(): self.reget_time = None self.opts = opts if self.opts.reget == 'check_timestamp': - raise NotImplementedError, "check_timestamp regets are not implemented in this ver of urlgrabber. Please report this." + raise NotImplementedError("check_timestamp regets are not implemented in this ver of urlgrabber. Please report this.") self._complete = False - self._rbuf = '' + self._rbuf = b'' self._rbufsize = 1024*8 self._ttime = time.time() self._tsize = 0 self._amount_read = 0 self._reget_length = 0 + self._range = None self._prog_running = False self._error = (None, None) - self.size = None + self.size = 0 + self._hdr_ended = False + self._tm_first = None + self._tm_last = None self._do_open() - - + + def __getattr__(self, name): """This effectively allows us to wrap at the instance level. 
Any attribute not found in _this_ object will be searched for @@ -1063,48 +1353,93 @@ class PyCurlFileObject(): if hasattr(self.fo, name): return getattr(self.fo, name) - raise AttributeError, name + raise AttributeError(name) def _retrieve(self, buf): try: + tm = self._amount_read + len(buf), time.time() + if self._tm_first is None: + self._tm_first = tm + else: + self._tm_last = tm + if not self._prog_running: if self.opts.progress_obj: size = self.size + self._reget_length - self.opts.progress_obj.start(self._prog_reportname, - urllib.unquote(self.url), - self._prog_basename, + self.opts.progress_obj.start(self._prog_reportname, + _urlunquote_convert(self.url), + self._prog_basename, size=size, text=self.opts.text) self._prog_running = True self.opts.progress_obj.update(self._amount_read) self._amount_read += len(buf) - self.fo.write(buf) + try: + if self._range: + # client-side ranges + pos = self._amount_read - len(buf) + start = self._range[0] - pos + stop = self._range[1] - pos + if start < len(buf) and stop > 0: + self.fo.write(buf[max(start, 0):stop]) + else: + self.fo.write(buf) + except IOError as e: + self._cb_error = URLGrabError(16, exception2msg(e)) + return -1 return len(buf) except KeyboardInterrupt: return -1 - + def _hdr_retrieve(self, buf): - if self._over_max_size(cur=len(self._hdr_dump), + if self._hdr_ended: + self._hdr_dump = b'' + self.size = 0 + self._hdr_ended = False + + if self._over_max_size(cur=len(self._hdr_dump), max_size=self.opts.max_header_size): - return -1 + return -1 try: - self._hdr_dump += buf # we have to get the size before we do the progress obj start # but we can't do that w/o making it do 2 connects, which sucks # so we cheat and stuff it in here in the hdr_retrieve - if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1: - length = buf.split(':')[1] - self.size = int(length) - elif self.scheme in ['ftp']: + if self.scheme in [b'http', b'https']: + if buf.lower().find(b'content-length:') != -1: + length = buf.split(b':')[1] + self.size = int(length) + elif (self.append or self.opts.range) and not self._hdr_dump and b' 200 OK ' in buf: + # reget was attempted but server sends it all + # undo what we did in _build_range() + self.append = False + self.reget_time = None + self._amount_read = 0 + self._reget_length = 0 + self._range = self.opts.range + self.fo.truncate(0) + elif self.scheme in [b'ftp']: s = None - if buf.startswith('213 '): + if buf.startswith(b'213 '): s = buf[3:].strip() - elif buf.startswith('150 '): + if len(s) >= 14: + s = None # ignore MDTM responses + elif buf.startswith(b'150 '): s = parse150(buf) if s: self.size = int(s) - + + if buf.lower().find(b'location') != -1: + location = b':'.join(buf.split(b':')[1:]) + location = location.strip() + self.scheme = urlparse.urlsplit(location)[0] + self.url = location + + self._hdr_dump += buf + if len(self._hdr_dump) != 0 and buf == b'\r\n': + self._hdr_ended = True + if DEBUG: DEBUG.debug('header ended:') + return len(buf) except KeyboardInterrupt: return pycurl.READFUNC_ABORT @@ -1112,12 +1447,14 @@ class PyCurlFileObject(): def _return_hdr_obj(self): if self._parsed_hdr: return self._parsed_hdr - statusend = self._hdr_dump.find('\n') + statusend = self._hdr_dump.find(b'\n') + statusend += 1 # ridiculous as it may seem. 
hdrfp = StringIO() hdrfp.write(self._hdr_dump[statusend:]) - self._parsed_hdr = mimetools.Message(hdrfp) + hdrfp.seek(0) + self._parsed_hdr = email.message_from_string(hdrfp) return self._parsed_hdr - + hdr = property(_return_hdr_obj) http_code = property(fget= lambda self: self.curl_obj.getinfo(pycurl.RESPONSE_CODE)) @@ -1127,6 +1464,9 @@ class PyCurlFileObject(): if not opts: opts = self.opts + # keepalives + if not opts.keepalive: + self.curl_obj.setopt(pycurl.FORBID_REUSE, 1) # defaults we're always going to set self.curl_obj.setopt(pycurl.NOPROGRESS, False) @@ -1136,172 +1476,219 @@ class PyCurlFileObject(): self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update) self.curl_obj.setopt(pycurl.FAILONERROR, True) self.curl_obj.setopt(pycurl.OPT_FILETIME, True) - - if DEBUG: + self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True) + + if DEBUG and DEBUG.level <= 10: self.curl_obj.setopt(pycurl.VERBOSE, True) if opts.user_agent: self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent) - + if opts.ip_resolve: + # Default is: IPRESOLVE_WHATEVER + ipr = opts.ip_resolve.lower() + if ipr == 'whatever': # Do we need this? + self.curl_obj.setopt(pycurl.IPRESOLVE,pycurl.IPRESOLVE_WHATEVER) + if ipr == 'ipv4': + self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4) + if ipr == 'ipv6': + self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V6) + # maybe to be options later self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True) self.curl_obj.setopt(pycurl.MAXREDIRS, 5) - + # timeouts timeout = 300 - if opts.timeout: - timeout = int(opts.timeout) - self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout) + if hasattr(opts, 'timeout'): + timeout = int(opts.timeout or 0) + self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout) + self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, opts.minrate or 1000) + self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout) # ssl options - if self.scheme == 'https': + if self.scheme == b'https': if opts.ssl_ca_cert: # this may do ZERO with nss according to curl docs self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert) self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert) self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer) - self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host) + if opts.ssl_verify_host: # 1 is meaningless to curl + self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, 2) if opts.ssl_key: self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key) if opts.ssl_key_type: self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type) if opts.ssl_cert: self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert) - if opts.ssl_cert_type: + # if we have a client side cert - turn off reuse b/c nss is odd + self.curl_obj.setopt(pycurl.FORBID_REUSE, 1) + if opts.ssl_cert_type: self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type) if opts.ssl_key_pass: self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass) #headers: - if opts.http_headers and self.scheme in ('http', 'https'): + if self.scheme in (b'http', b'https'): headers = [] - for (tag, content) in opts.http_headers: - headers.append('%s:%s' % (tag, content)) - self.curl_obj.setopt(pycurl.HTTPHEADER, headers) + if opts.http_headers is not None: + for (tag, content) in opts.http_headers: + headers.append('%s:%s' % (tag, content)) + if opts.no_cache: + headers.append('Pragma:no-cache') + if headers: + self.curl_obj.setopt(pycurl.HTTPHEADER, headers) # ranges: if opts.range or opts.reget: range_str = self._build_range() if range_str: self.curl_obj.setopt(pycurl.RANGE, range_str) - + # throttle/bandwidth if 
hasattr(opts, 'raw_throttle') and opts.raw_throttle(): self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle())) - - # proxy settings - if opts.proxies: - for (scheme, proxy) in opts.proxies.items(): - if self.scheme in ('ftp'): # only set the ftp proxy for ftp items - if scheme not in ('ftp'): - continue - else: - if proxy == '_none_': proxy = "" - self.curl_obj.setopt(pycurl.PROXY, proxy) - elif self.scheme in ('http', 'https'): - if scheme not in ('http', 'https'): - continue - else: - if proxy == '_none_': proxy = "" - self.curl_obj.setopt(pycurl.PROXY, proxy) - - # FIXME username/password/auth settings + + # proxy + if opts.proxy is not None: + self.curl_obj.setopt(pycurl.PROXY, opts.proxy) + self.curl_obj.setopt(pycurl.PROXYAUTH, + # All but Kerberos. BZ 769254 + pycurl.HTTPAUTH_ANY - pycurl.HTTPAUTH_GSSNEGOTIATE) + + if opts.username and opts.password: + if self.scheme in (b'http', b'https'): + self.curl_obj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY) + + if opts.username and opts.password: + # apparently when applying them as curlopts they do not require quoting of any kind + userpwd = '%s:%s' % (opts.username, opts.password) + self.curl_obj.setopt(pycurl.USERPWD, userpwd) #posts - simple - expects the fields as they are if opts.data: self.curl_obj.setopt(pycurl.POST, True) - self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data)) - + self.curl_obj.setopt(pycurl.POSTFIELDS, _to_utf8(opts.data)) + + # ftp + if opts.ftp_disable_epsv: + self.curl_obj.setopt(pycurl.FTP_USE_EPSV, False) + # our url self.curl_obj.setopt(pycurl.URL, self.url) - - + + def _do_perform(self): if self._complete: return - + try: self.curl_obj.perform() - except pycurl.error, e: + except pycurl.error as e: # XXX - break some of these out a bit more clearly - # to other URLGrabErrors from + # to other URLGrabErrors from # http://curl.haxx.se/libcurl/c/libcurl-errors.html # this covers e.args[0] == 22 pretty well - which will be common - + code = self.http_code errcode = e.args[0] + errurl = _urlunquote_convert(self.url) + if self._error[0]: errcode = self._error[0] - - if errcode == 23 and code >= 200 and code < 299: - err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e)) - err.url = self.url - + + if errcode == 23 and 200 <= code <= 299: # this is probably wrong but ultimately this is what happens # we have a legit http code and a pycurl 'writer failed' code # which almost always means something aborted it from outside # since we cannot know what it is -I'm banking on it being - # a ctrl-c. XXXX - if there's a way of going back two raises to + # a ctrl-c. 
XXXX - if there's a way of going back two raises to # figure out what aborted the pycurl process FIXME - raise KeyboardInterrupt - + raise getattr(self, '_cb_error', KeyboardInterrupt) + elif errcode == 28: - err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e)) - err.url = self.url + err = URLGrabError(12, _('Timeout on %s: %s') % (errurl, e)) + err.url = errurl raise err - elif errcode == 35: - msg = _("problem making ssl connection") - err = URLGrabError(14, msg) - err.url = self.url - raise err - elif errcode == 37: - msg = _("Could not open/read %s") % (self.url) - err = URLGrabError(14, msg) - err.url = self.url - raise err - + elif errcode == 42: - err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e)) - err.url = self.url # this is probably wrong but ultimately this is what happens # we have a legit http code and a pycurl 'writer failed' code # which almost always means something aborted it from outside # since we cannot know what it is -I'm banking on it being - # a ctrl-c. XXXX - if there's a way of going back two raises to + # a ctrl-c. XXXX - if there's a way of going back two raises to # figure out what aborted the pycurl process FIXME raise KeyboardInterrupt - - elif errcode == 58: - msg = _("problem with the local client certificate") - err = URLGrabError(14, msg) - err.url = self.url - raise err - elif errcode == 60: - msg = _("client cert cannot be verified or client cert incorrect") + else: + pyerr2str = { 5 : _("Couldn't resolve proxy"), + 6 : _("Couldn't resolve host"), + 7 : _("Couldn't connect"), + 8 : _("Bad reply to FTP server"), + 9 : _("Access denied"), + 11 : _("Bad reply to FTP pass"), + 13 : _("Bad reply to FTP pasv"), + 14 : _("Bad reply to FTP 227"), + 15 : _("Couldn't get FTP host"), + 17 : _("Couldn't set FTP type"), + 18 : _("Partial file"), + 19 : _("FTP RETR command failed"), + 22 : _("HTTP returned error"), + 23 : _("Write error"), + 25 : _("Upload failed"), + 26 : _("Read error"), + 27 : _("Out of Memory"), + 28 : _("Operation timed out"), + 30 : _("FTP PORT command failed"), + 31 : _("FTP REST command failed"), + 33 : _("Range failed"), + 34 : _("HTTP POST failed"), + 35 : _("SSL CONNECT failed"), + 36 : _("Couldn't resume download"), + 37 : _("Couldn't read file"), + 42 : _("Aborted by callback"), + 47 : _("Too many redirects"), + 51 : _("Peer certificate failed verification"), + 52 : _("Got nothing: SSL certificate expired?"), + 53 : _("SSL engine not found"), + 54 : _("SSL engine set failed"), + 55 : _("Network error send()"), + 56 : _("Network error recv()"), + 58 : _("Local certificate failed"), + 59 : _("SSL set cipher failed"), + 60 : _("Local CA certificate failed"), + 61 : _("HTTP bad transfer encoding"), + 63 : _("Maximum file size exceeded"), + 64 : _("FTP SSL failed"), + 67 : _("Authentication failure"), + 70 : _("Out of disk space on server"), + 73 : _("Remove file exists"), + 77 : _("Problem with the SSL CA cert (path? 
access rights?)"), + } + errstr = str(e.args[1]) or pyerr2str.get(errcode, '<Unknown>') + if code and not 200 <= code <= 299: + scheme = _bytes_repr(self.scheme) + msg = '%s Error %d - %s' % (scheme.upper(), code, + scheme in ('http', 'https') + and responses.get(code) or errstr) + else: + msg = 'curl#%s - "%s"' % (errcode, errstr) + code = errcode + err = URLGrabError(14, msg) - err.url = self.url + err.url = errurl + err.code = code raise err - - elif errcode == 63: - if self._error[1]: - msg = self._error[1] - else: - msg = _("Max download size exceeded on %s") % (self.url) + + else: + if self._error[1]: + msg = self._error[1] err = URLGrabError(14, msg) - err.url = self.url + err.url = _urlunquote_convert(self.url) raise err - - elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it - msg = 'HTTP Error %s : %s ' % (self.http_code, self.url) - else: - msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1])) - code = errcode - err = URLGrabError(14, msg) - err.code = code - err.exception = e - raise err def _do_open(self): - self.curl_obj = _curl_cache + if hasattr(self.opts, 'curl_obj') and self.opts.curl_obj is not None: + self.curl_obj = self.opts.curl_obj + else: + self.curl_obj = _curl_cache self.curl_obj.reset() # reset all old settings away, just in case # setup any ranges self._set_opts() @@ -1310,11 +1697,11 @@ class PyCurlFileObject(): def _add_headers(self): pass - + def _build_range(self): reget_length = 0 rt = None - if self.opts.reget and type(self.filename) in types.StringTypes: + if self.opts.reget and isinstance(self.filename, string_types): # we have reget turned on and we're dumping to a file try: s = os.stat(self.filename) @@ -1325,15 +1712,19 @@ class PyCurlFileObject(): reget_length = s[stat.ST_SIZE] # Set initial length when regetting - self._amount_read = reget_length + self._amount_read = reget_length self._reget_length = reget_length # set where we started from, too rt = reget_length, '' self.append = 1 - + if self.opts.range: rt = self.opts.range - if rt[0]: rt = (rt[0] + reget_length, rt[1]) + + if rt[0] is None: + rt = (0, rt[1]) + rt = (rt[0] + reget_length, rt[1]) + if rt: header = range_tuple_to_header(rt) @@ -1345,10 +1736,10 @@ class PyCurlFileObject(): def _make_request(self, req, opener): #XXXX # This doesn't do anything really, but we could use this - # instead of do_open() to catch a lot of crap errors as + # instead of do_open() to catch a lot of crap errors as # mstenner did before here return (self.fo, self.hdr) - + try: if self.opts.timeout: old_to = socket.getdefaulttimeout() @@ -1360,22 +1751,22 @@ class PyCurlFileObject(): else: fo = opener.open(req) hdr = fo.info() - except ValueError, e: + except ValueError as e: err = URLGrabError(1, _('Bad URL: %s : %s') % (self.url, e, )) err.url = self.url raise err - except RangeError, e: + except RangeError as e: err = URLGrabError(9, _('%s on %s') % (e, self.url)) err.url = self.url raise err - except urllib2.HTTPError, e: + except HTTPError as e: new_e = URLGrabError(14, _('%s on %s') % (e, self.url)) new_e.code = e.code new_e.exception = e new_e.url = self.url raise new_e - except IOError, e: + except IOError as e: if hasattr(e, 'reason') and isinstance(e.reason, socket.timeout): err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e)) err.url = self.url @@ -1385,41 +1776,41 @@ class PyCurlFileObject(): err.url = self.url raise err - except OSError, e: + except OSError as e: err = URLGrabError(5, _('%s on %s') % (e, self.url)) err.url = self.url raise err - except 
HTTPException, e: - err = URLGrabError(7, _('HTTP Exception (%s) on %s: %s') % \ - (e.__class__.__name__, self.url, e)) + except HTTPException as e: + err = URLGrabError(7, _('HTTP Exception (%s) on %s: %s') + % (e.__class__.__name__, self.url, e)) err.url = self.url raise err else: return (fo, hdr) - + def _do_grab(self): """dump the file to a filename or StringIO buffer""" if self._complete: return _was_filename = False - if type(self.filename) in types.StringTypes and self.filename: + if isinstance(self.filename, string_types) and self.filename: _was_filename = True self._prog_reportname = str(self.filename) self._prog_basename = os.path.basename(self.filename) - + if self.append: mode = 'ab' else: mode = 'wb' - if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \ - (self.filename, mode)) + if DEBUG: DEBUG.info('opening local file "%s" with mode %s' + % (self.filename, mode)) try: self.fo = open(self.filename, mode) - except IOError, e: - err = URLGrabError(16, _(\ - 'error opening local file from %s, IOError: %s') % (self.url, e)) + except IOError as e: + err = URLGrabError(16, _('error opening local file from %s, IOError: %s') + % (self.url, e)) err.url = self.url raise err @@ -1427,34 +1818,58 @@ class PyCurlFileObject(): self._prog_reportname = 'MEMORY' self._prog_basename = 'MEMORY' - - self.fo = StringIO() + + self.fo = BytesIO() # if this is to be a tempfile instead.... # it just makes crap in the tempdir #fh, self._temp_name = mkstemp() #self.fo = open(self._temp_name, 'wb') - - self._do_perform() - - + try: + self._do_perform() + except URLGrabError as e: + self.fo.flush() + self.fo.close() + raise e if _was_filename: # close it up self.fo.flush() self.fo.close() + + # Set the URL where we got it from: + if xattr is not None: + # See: http://www.freedesktop.org/wiki/CommonExtendedAttributes + try: + xattr.set(self.filename, 'user.xdg.origin.url', self.url) + except: + pass # URL too long. = IOError ... ignore everything. + # set the time mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME) if mod_time != -1: - os.utime(self.filename, (mod_time, mod_time)) + try: + os.utime(self.filename, (mod_time, mod_time)) + except OSError as e: + err = URLGrabError(16, _('error setting timestamp on file %s from %s, OSError: %s') + % (self.filename, self.url, e)) + err.url = self.url + raise err # re open it - self.fo = open(self.filename, 'r') + try: + self.fo = open(self.filename, 'r') + except IOError as e: + err = URLGrabError(16, _('error opening file from %s, IOError: %s') + % (self.url, e)) + err.url = self.url + raise err + else: #self.fo = open(self._temp_name, 'r') self.fo.seek(0) self._complete = True - + def _fill_buffer(self, amt=None): """fill the buffer to contain at least 'amt' bytes by reading from the underlying file object. If amt is None, then it will @@ -1471,9 +1886,9 @@ class PyCurlFileObject(): # if we've made it here, then we don't have enough in the buffer # and we need to read more. 
- + if not self._complete: self._do_grab() #XXX cheater - change on ranges - + buf = [self._rbuf] bufsize = len(self._rbuf) while amt is None or amt: @@ -1483,23 +1898,23 @@ class PyCurlFileObject(): (time.time() - self._ttime) if diff > 0: time.sleep(diff) self._ttime = time.time() - + # now read some data, up to self._rbufsize if amt is None: readamount = self._rbufsize else: readamount = min(amt, self._rbufsize) try: new = self.fo.read(readamount) - except socket.error, e: + except socket.error as e: err = URLGrabError(4, _('Socket Error on %s: %s') % (self.url, e)) err.url = self.url raise err - except socket.timeout, e: + except socket.timeout as e: raise URLGrabError(12, _('Timeout on %s: %s') % (self.url, e)) err.url = self.url raise err - except IOError, e: + except IOError as e: raise URLGrabError(4, _('IOError on %s: %s') %(self.url, e)) err.url = self.url raise err @@ -1515,7 +1930,7 @@ class PyCurlFileObject(): #if self.opts.progress_obj: # self.opts.progress_obj.update(self._amount_read) - self._rbuf = string.join(buf, '') + self._rbuf = b''.join(buf) return def _progress_update(self, download_total, downloaded, upload_total, uploaded): @@ -1526,35 +1941,31 @@ class PyCurlFileObject(): if self._prog_running: downloaded += self._reget_length self.opts.progress_obj.update(downloaded) - except KeyboardInterrupt: + except (KeyboardInterrupt, IOError): return -1 - + def _over_max_size(self, cur, max_size=None): if not max_size: - max_size = self.size - if self.opts.size: # if we set an opts size use that, no matter what - max_size = self.opts.size + if not self.opts.size: + max_size = self.size + else: + max_size = self.opts.size + if not max_size: return False # if we have None for all of the Max then this is dumb - if cur > max_size + max_size*.10: + + if cur > int(float(max_size) * 1.10): msg = _("Downloaded more than max size for %s: %s > %s") \ % (self.url, cur, max_size) self._error = (pycurl.E_FILESIZE_EXCEEDED, msg) return True return False - - def _to_utf8(self, obj, errors='replace'): - '''convert 'unicode' to an encoded utf-8 byte string ''' - # stolen from yum.i18n - if isinstance(obj, unicode): - obj = obj.encode('utf-8', errors) - return obj - + def read(self, amt=None): self._fill_buffer(amt) if amt is None: - s, self._rbuf = self._rbuf, '' + s, self._rbuf = self._rbuf, b'' else: s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:] return s @@ -1562,13 +1973,13 @@ class PyCurlFileObject(): def readline(self, limit=-1): if not self._complete: self._do_grab() return self.fo.readline() - - i = string.find(self._rbuf, '\n') + + i = self._rbuf.find('\n') while i < 0 and not (0 < limit <= len(self._rbuf)): L = len(self._rbuf) self._fill_buffer(L + self._rbufsize) if not len(self._rbuf) > L: break - i = string.find(self._rbuf, '\n', L) + i = self._rbuf.find('\n', L) if i < 0: i = len(self._rbuf) else: i = i+1 @@ -1581,10 +1992,25 @@ class PyCurlFileObject(): if self._prog_running: self.opts.progress_obj.end(self._amount_read) self.fo.close() - + def geturl(self): + """ Provide the geturl() method, used to be got from + urllib.addinfourl, via. 
urllib.URLopener.* """ + return self.url + +if hasattr(pycurl, 'GLOBAL_ACK_EINTR'): + # fail immediately on ctrl-c + pycurl.global_init(pycurl.GLOBAL_DEFAULT | pycurl.GLOBAL_ACK_EINTR) _curl_cache = pycurl.Curl() # make one and reuse it over and over and over +def reset_curl_obj(): + """To make sure curl has reread the network/dns info we force a reload""" + global _curl_cache + _curl_cache.close() + _curl_cache = pycurl.Curl() + +_libproxy_cache = None + ##################################################################### # DEPRECATED FUNCTIONS @@ -1603,90 +2029,601 @@ def set_progress_obj(new_progress_obj): def set_user_agent(new_user_agent): """Deprecated. Use: default_grabber.user_agent = new_user_agent""" default_grabber.user_agent = new_user_agent - + def retrygrab(url, filename=None, copy_local=0, close_connection=0, progress_obj=None, throttle=None, bandwidth=None, numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None): """Deprecated. Use: urlgrab() with the retry arg instead""" - kwargs = {'copy_local' : copy_local, + kwargs = {'copy_local' : copy_local, 'close_connection' : close_connection, - 'progress_obj' : progress_obj, - 'throttle' : throttle, + 'progress_obj' : progress_obj, + 'throttle' : throttle, 'bandwidth' : bandwidth, 'retry' : numtries, 'retrycodes' : retrycodes, - 'checkfunc' : checkfunc + 'checkfunc' : checkfunc } return urlgrab(url, filename, **kwargs) - + +##################################################################### +# Serializer + parser: A replacement of the rather bulky Json code. +# +# - handles basic python literals, lists and tuples. +# - serialized strings never contain ' ' or '\n' +# +##################################################################### + +def _quoter(c): + if c in '%[(,)] \n': + return '%%%02x' % ord(c) + return c + +def _dumps(v): + if v is None: return 'None' + if v is True: return 'True' + if v is False: return 'False' + if isinstance(v, numbers.Number): + return str(v) + if isinstance(v, (str, text_type, bytes)): + # standarize to str on both py2 to py3 + if sys.version_info < (3,): + if isinstance(v, text_type): + v = v.encode('utf8') + else: + if isinstance(v, bytes): + v = v.decode('utf8') + return "'%s'" % ''.join(map(_quoter, v)) + if isinstance(v, tuple): + return "(%s)" % ','.join(map(_dumps, v)) + if isinstance(v, list): + return "[%s]" % ','.join(map(_dumps, v)) + raise TypeError("Can't serialize %s" % v) + +def _loads(s): + def decode(v): + if v == 'None': return None + if v == 'True': return True + if v == 'False': return False + try: return int(v) + except ValueError: pass + try: return float(v) + except ValueError: pass + if len(v) >= 2 and v[0] == v[-1] == "'": + ret = []; i = 1 + while True: + j = v.find('%', i) + ret.append(v[i:j]) # skips the final "'" + if j == -1: break + ret.append(chr(int(v[j + 1:j + 3], 16))) + i = j + 3 + v = ''.join(ret) + return v + stk = None + l = [] + i = j = 0 + while True: + if j == len(s) or s[j] in ',)]': + if j > i: + l.append(decode(s[i:j])) + if j == len(s): break + if s[j] in ')]': + if s[j] == ')': + l = tuple(l) + stk[0].append(l) + l, stk = stk + i = j = j + 1 + elif s[j] in '[(': + stk = l, stk + l = [] + i = j = j + 1 + else: + j += 1 # safe because '[(,)]' are quoted + if stk: raise ValueError + if len(l) == 1: l = l[0] + return l + + +##################################################################### +# External downloader process +##################################################################### + +def _readlines(fd): + buf = os.read(fd, 4096) + if not buf: 
return None + # whole lines only, no buffering + while not buf.endswith(b'\n'): + buf += os.read(fd, 4096) + return buf[:-1].split(b'\n') + +import subprocess + +class _ExternalDownloader: + def __init__(self): + # raise if urlgrabber-ext-down is not installed so the user gets a + # an obvious error message instead of "[Errno 5] [Errno 2] No such file + # or directory" + if not os.path.exists('/usr/libexec/urlgrabber-ext-down') and os.getenv('URLGRABBER_EXT_DOWN') is None: + raise OSError('"/usr/libexec/urlgrabber-ext-down" is not installed') + urlgrabber_path = (os.getenv('URLGRABBER_EXT_DOWN', None) + or '/usr/libexec/urlgrabber-ext-down') + self.popen = subprocess.Popen( + urlgrabber_path, + stdin = subprocess.PIPE, + stdout = subprocess.PIPE, + ) + self.stdin = self.popen.stdin.fileno() + self.stdout = self.popen.stdout.fileno() + self.running = {} + self.cnt = 0 + + # list of options we pass to downloader + _options = ( + 'url', 'filename', + 'timeout', 'minrate', 'close_connection', 'keepalive', + 'throttle', 'bandwidth', 'range', 'reget', + 'user_agent', 'http_headers', 'ftp_headers', + 'proxy', 'prefix', 'username', 'password', + 'ssl_ca_cert', + 'ssl_cert', 'ssl_cert_type', + 'ssl_key', 'ssl_key_type', + 'ssl_key_pass', + 'ssl_verify_peer', 'ssl_verify_host', + 'size', 'max_header_size', 'ip_resolve', + 'ftp_disable_epsv', + 'no_cache', + ) + + def start(self, opts): + arg = [] + for k in self._options: + v = getattr(opts, k) + if v is None: continue + arg.append('%s=%s' % (k, _dumps(v))) + if opts.progress_obj and opts.multi_progress_obj: + arg.append('progress_obj=True') + arg = ' '.join(arg) + if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url) + + self.cnt += 1 + self.running[self.cnt] = opts + os.write(self.stdin, (arg +'\n').encode('utf8')) + + def perform(self): + ret = [] + lines = _readlines(self.stdout) + if not lines: + if DEBUG: DEBUG.info('downloader died') + raise KeyboardInterrupt + for line in lines: + # parse downloader output + line = line.split(b' ', 6) + _id, size = map(int, line[:2]) + if len(line) == 2: + self.running[_id]._progress.update(size) + continue + # job done + opts = self.running.pop(_id) + if line[4] == b'OK': + ug_err = None + if DEBUG: DEBUG.info('success') + else: + ug_err = URLGrabError(int(line[4]), line[6]) + if line[5] != b'0': + ug_err.code = int(line[5]) + if DEBUG: DEBUG.info('failure: %s', ug_err) + _TH.update(opts.url, int(line[2]), float(line[3]), ug_err, opts.async_[0]) + ret.append((opts, size, ug_err)) + return ret + + def abort(self): + self.popen.stdin.close() + self.popen.stdout.close() + self.popen.wait() + +class _ExternalDownloaderPool: + def __init__(self): + self.epoll = select.epoll() + self.running = {} + self.cache = {} + + def start(self, opts): + host = urlparse.urlsplit(opts.url).netloc + dl = self.cache.pop(host, None) + if not dl: + dl = _ExternalDownloader() + fl = fcntl.fcntl(dl.stdin, fcntl.F_GETFD) + fcntl.fcntl(dl.stdin, fcntl.F_SETFD, fl | fcntl.FD_CLOEXEC) + self.epoll.register(dl.stdout, select.EPOLLIN) + self.running[dl.stdout] = dl + dl.start(opts) + + def perform(self): + ret = [] + for fd, event in self.epoll.poll(): + if event & select.EPOLLHUP: + if DEBUG: DEBUG.info('downloader died') + raise KeyboardInterrupt + assert event & select.EPOLLIN + done = self.running[fd].perform() + if not done: continue + assert len(done) == 1 + ret.extend(done) + + # dl finished, move it to the cache + host = urlparse.urlsplit(done[0][0].url).netloc + if host in self.cache: 
self.cache[host].abort() + self.epoll.unregister(fd) + self.cache[host] = self.running.pop(fd) + return ret + + def abort(self): + for dl in self.running.values(): + self.epoll.unregister(dl.stdout) + dl.abort() + for dl in self.cache.values(): + dl.abort() + + +##################################################################### +# High level async API +##################################################################### + +_async_queue = [] + +def parallel_wait(meter=None): + '''Process queued requests in parallel. + ''' + + # calculate total sizes + meters = {} + for opts in _async_queue: + if opts.progress_obj and opts.multi_progress_obj: + count, total = meters.get(opts.multi_progress_obj) or (0, 0) + meters[opts.multi_progress_obj] = count + 1, total + opts.size + + # start multi-file meters + for meter in meters: + count, total = meters[meter] + meter.start(count, total) + + dl = _ExternalDownloaderPool() + host_con = {} # current host connection counts + single = set() # hosts in single connection mode + retry_queue = [] + + def start(opts, tries): + opts.tries = tries + try: + dl.start(opts) + except OSError as e: + # can't spawn downloader, give up immediately + opts.exception = URLGrabError(5, exception2msg(e)) + _run_callback(opts.failfunc, opts) + return + + key, limit = opts.async_ + host_con[key] = host_con.get(key, 0) + 1 + if opts.progress_obj: + if opts.multi_progress_obj: + opts._progress = opts.multi_progress_obj.newMeter() + opts._progress.start(text=opts.text) + else: + opts._progress = time.time() # no updates + + def perform(): + for opts, size, ug_err in dl.perform(): + key, limit = opts.async_ + host_con[key] -= 1 + + if ug_err is None: + if opts.checkfunc: + try: + _run_callback(opts.checkfunc, opts) + except URLGrabError as e: + ug_err = e + + if opts.progress_obj: + if opts.multi_progress_obj: + if ug_err: + opts._progress.failure(None) + else: + opts.multi_progress_obj.re.total += size - opts.size # correct totals + opts._progress.end(size) + opts.multi_progress_obj.removeMeter(opts._progress) + else: + opts.progress_obj.start(text=opts.text, now=opts._progress) + opts.progress_obj.update(size) + opts.progress_obj.end(size) + del opts._progress + + if ug_err is None: + continue + if limit != 1 and key not in single and ug_err.errno in (12, 14): + # One possible cause is connection-limited server. + # Turn on the max_connections=1 override. BZ 853432 + if DEBUG: DEBUG.info('max_connections(%s) %s => 1', key, limit) + single.add(key) + # When using multi-downloader the parent's _curl_cache + # object is idle. Kill it, as it might use keepalive=1. 
+ reset_curl_obj() + + retry = opts.retry or 0 + if opts.failure_callback: + opts.exception = ug_err + try: + _run_callback(opts.failure_callback, opts) + except URLGrabError as e: + ug_err = e + retry = 0 # no retries + if opts.tries < retry and ug_err.errno in opts.retrycodes: + if ug_err.errno < 0 and opts.retry_no_cache: + opts.no_cache = True + start(opts, opts.tries + 1) # simple retry + continue + + if opts.mirror_group: + mg, errors, failed, removed = opts.mirror_group + errors.append((opts.url, exception2msg(ug_err))) + failed[key] = failed.get(key, 0) + 1 + opts.mirror = key + opts.exception = ug_err + action = mg.default_action or {} + if mg.failure_callback: + opts.tries = len(errors) + action = dict(action) # update only the copy + action.update(_run_callback(mg.failure_callback, opts)) + if not action.get('fail', 0): + # mask this mirror and retry + if action.get('remove', 1): + removed.add(key) + retry_queue.append(opts) + continue + # fail=1 from callback + ug_err.errors = errors + + # urlgrab failed + opts.exception = ug_err + _run_callback(opts.failfunc, opts) + + try: + retry_idx = idx = 0 + while True: + if retry_idx < len(retry_queue): + # retries first + opts = retry_queue[retry_idx] + retry_idx += 1 + elif idx < len(_async_queue): + # handle next request + opts = _async_queue[idx] + idx += 1 + else: + # both queues are empty + if not dl.running: break + perform() + continue + + # check global limit + while len(dl.running) >= default_grabber.opts.max_connections: + perform() + if DEBUG: + DEBUG.info('max_connections: %d/%d', len(dl.running), default_grabber.opts.max_connections) + + if opts.mirror_group: + mg, errors, failed, removed = opts.mirror_group + + # find the best mirror + best = None + best_speed = None + for mirror in mg.mirrors: + key = mirror['mirror'] + if key in removed: continue + + # estimate mirror speed + speed, fail = _TH.estimate(key) + speed /= 1 + host_con.get(key, 0) + + # order by: least failures, private flag, best speed + # ignore 'private' flag if there were failures + private = not fail and mirror.get('kwargs', {}).get('private', False) + speed = -failed.get(key, 0), private, speed + if best is None or speed > best_speed: + best = mirror + best_speed = speed + + if best is None: + opts.exception = URLGrabError(256, _('No more mirrors to try.')) + opts.exception.errors = errors + _run_callback(opts.failfunc, opts) + continue + + # update the grabber object, apply mirror kwargs + grabber = best.get('grabber') or mg.grabber + opts.delegate = grabber.opts.derive(**best.get('kwargs', {})) + + # update the current mirror and limit + key = best['mirror'] + limit = best.get('kwargs', {}).get('max_connections') + opts.async_ = key, limit + + # update URL and proxy + url = mg._join_url(key, opts.relative_url) + url, parts = opts.urlparser.parse(url, opts) + opts.find_proxy(url, parts[0]) + opts.url = url + + # check host limit, then start + key, limit = opts.async_ + if key in single: + limit = 1 + while host_con.get(key, 0) >= (limit or 2): + perform() + if DEBUG: + DEBUG.info('max_connections(%s): %d/%s', key, host_con.get(key, 0), limit) + + start(opts, 1) + except IOError as e: + if e.errno != 4: raise + raise KeyboardInterrupt + + finally: + dl.abort() + for meter in meters: + meter.end() + del _async_queue[:] + _TH.save() + + +##################################################################### +# Host bandwidth estimation +##################################################################### + +class _TH: + hosts = {} + dirty = None + + 
@staticmethod + def load(): + filename = default_grabber.opts.timedhosts + if filename and _TH.dirty is None: + try: + now = int(time.time()) + for line in open(filename): + try: + host, speed, fail, ts = line.rsplit(' ', 3) + _TH.hosts[host] = int(speed), int(fail), min(int(ts), now) + except ValueError: + if DEBUG: DEBUG.info('Error parsing timedhosts: line "%s"', line) + except IOError: pass + _TH.dirty = False + + @staticmethod + def save(): + filename = default_grabber.opts.timedhosts + if filename and _TH.dirty is True: + tmp = '%s.%d' % (filename, os.getpid()) + try: + f = open(tmp, 'w') + for host in _TH.hosts: + f.write(host + ' %d %d %d\n' % _TH.hosts[host]) + f.close() + os.rename(tmp, filename) + except IOError: pass + _TH.dirty = False + + @staticmethod + def update(url, dl_size, dl_time, ug_err, baseurl=None): + # Use hostname from URL. If it's a file:// URL, use baseurl. + # If no baseurl, do not update timedhosts. + host = urlparse.urlsplit(url).netloc.split(b'@')[-1] or baseurl + if not host: return + + _TH.load() + speed, fail, ts = _TH.hosts.get(host) or (0, 0, 0) + now = time.time() + + if ug_err is None: + # defer first update if the file was small. BZ 851178. + if not ts and dl_size < 1e6: return + # k1: the older, the less useful + # k2: <500ms readings are less reliable + # speeds vary, use 10:1 smoothing + k1 = 2**((ts - now) / default_grabber.opts.half_life) + k2 = min(dl_time / .500, 1.0) / 10 + if k2 > 0: + speed = (k1 * speed + k2 * dl_size / dl_time) / (k1 + k2) + fail = 0 + elif getattr(ug_err, 'code', None) == 404: + if not ts: return # 1st update, avoid speed=0 + fail = 0 # alive, at least + else: + fail += 1 # seems dead + + _TH.hosts[host] = speed, fail, now + _TH.dirty = True + + @staticmethod + def estimate(baseurl): + _TH.load() + + # Use just the hostname, unless it's a file:// baseurl. 
+ host = urlparse.urlsplit(baseurl).netloc.split(b'@')[-1] or baseurl + + default_speed = default_grabber.opts.default_speed + try: speed, fail, ts = _TH.hosts[host] + except KeyError: return default_speed, 0 + + speed *= 2**-fail + k = 2**((ts - time.time()) / default_grabber.opts.half_life) + speed = k * speed + (1 - k) * default_speed + return speed, fail + ##################################################################### # TESTING def _main_test(): try: url, filename = sys.argv[1:3] except ValueError: - print 'usage:', sys.argv[0], \ - '<url> <filename> [copy_local=0|1] [close_connection=0|1]' - sys.exit() + print('usage:', sys.argv[0], + '<url> <filename> [copy_local=0|1] [close_connection=0|1]') + sys.exit(2) kwargs = {} for a in sys.argv[3:]: - k, v = string.split(a, '=', 1) + k, v = a.split('=', 1) kwargs[k] = int(v) set_throttle(1.0) set_bandwidth(32 * 1024) - print "throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle, - default_grabber.bandwidth) + print("throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle, + default_grabber.bandwidth)) - try: from progress import text_progress_meter - except ImportError, e: pass + try: from .progress import text_progress_meter + except ImportError as e: pass else: kwargs['progress_obj'] = text_progress_meter() - try: name = apply(urlgrab, (url, filename), kwargs) - except URLGrabError, e: print e - else: print 'LOCAL FILE:', name + try: name = urlgrab(url, filename, **kwargs) + except URLGrabError as e: print(e) + else: print('LOCAL FILE:', name) def _retry_test(): try: url, filename = sys.argv[1:3] except ValueError: - print 'usage:', sys.argv[0], \ - '<url> <filename> [copy_local=0|1] [close_connection=0|1]' - sys.exit() + print('usage:', sys.argv[0], + '<url> <filename> [copy_local=0|1] [close_connection=0|1]') + sys.exit(2) kwargs = {} for a in sys.argv[3:]: - k, v = string.split(a, '=', 1) + k, v = a.split('=', 1) kwargs[k] = int(v) - try: from progress import text_progress_meter - except ImportError, e: pass + try: from .progress import text_progress_meter + except ImportError as e: pass else: kwargs['progress_obj'] = text_progress_meter() def cfunc(filename, hello, there='foo'): - print hello, there + print(hello, there) import random rnum = random.random() if rnum < .5: - print 'forcing retry' + print('forcing retry') raise URLGrabError(-1, 'forcing retry') if rnum < .75: - print 'forcing failure' + print('forcing failure') raise URLGrabError(-2, 'forcing immediate failure') - print 'success' + print('success') return - + kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'}) - try: name = apply(retrygrab, (url, filename), kwargs) - except URLGrabError, e: print e - else: print 'LOCAL FILE:', name + try: name = retrygrab(url, filename, **kwargs) + except URLGrabError as e: print(e) + else: print('LOCAL FILE:', name) def _file_object_test(filename=None): - import cStringIO if filename is None: filename = __file__ - print 'using file "%s" for comparisons' % filename + print('using file "%s" for comparisons' % filename) fo = open(filename) s_input = fo.read() fo.close() @@ -1695,17 +2632,17 @@ def _file_object_test(filename=None): _test_file_object_readall, _test_file_object_readline, _test_file_object_readlines]: - fo_input = cStringIO.StringIO(s_input) - fo_output = cStringIO.StringIO() + fo_input = StringIO(s_input) + fo_output = StringIO() wrapper = PyCurlFileObject(fo_input, None, 0) - print 'testing %-30s ' % testfunc.__name__, + print('testing %-30s ' % testfunc.__name__, end=' ') 
testfunc(wrapper, fo_output) s_output = fo_output.getvalue() - if s_output == s_input: print 'passed' - else: print 'FAILED' - + if s_output == s_input: print('passed') + else: print('FAILED') + def _test_file_object_smallread(wrapper, fo_output): - while 1: + while True: s = wrapper.read(23) fo_output.write(s) if not s: return @@ -1715,14 +2652,14 @@ def _test_file_object_readall(wrapper, fo_output): fo_output.write(s) def _test_file_object_readline(wrapper, fo_output): - while 1: + while True: s = wrapper.readline() fo_output.write(s) if not s: return def _test_file_object_readlines(wrapper, fo_output): li = wrapper.readlines() - fo_output.write(string.join(li, '')) + fo_output.write(''.join(li)) if __name__ == '__main__': _main_test() diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py index dad410b..d95863e 100644..100755 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -9,9 +9,9 @@ # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber @@ -76,6 +76,10 @@ CUSTOMIZATION 'grabber' is omitted, the default grabber will be used. If kwargs are omitted, then (duh) they will not be used. + kwarg 'max_connections' limits the number of concurrent + connections to this mirror. When omitted or set to zero, + the default limit (2) will be used. + 3) Pass keyword arguments when instantiating the mirror group. See, for example, the failure_callback argument. @@ -87,12 +91,29 @@ CUSTOMIZATION """ +import sys import random -import thread # needed for locking to make this threadsafe -from grabber import URLGrabError, CallbackObject, DEBUG +if sys.version_info >= (3,): + # We use a version check because python2 also has _thread + import _thread as thread +else: + import thread + +try: + import urllib.parse as urlparse +except ImportError: + import urlparse + +from six import string_types -def _(st): +from .grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8 +from .grabber import _run_callback, _do_raise +from .grabber import exception2msg +from .grabber import _TH +from .grabber import _bytes_repr + +def _(st): return st class GrabRequest: @@ -126,13 +147,15 @@ class MirrorGroup: files) * if the local list is ever exhausted, a URLGrabError will be - raised (errno=256, no more mirrors) + raised (errno=256, No more mirrors). The 'errors' attribute + holds a list of (full_url, errmsg) tuples. This contains + all URLs tried and the corresponding error messages. OPTIONS In addition to the required arguments "grabber" and "mirrors", MirrorGroup also takes the following optional arguments: - + default_action A dict that describes the actions to be taken upon failure @@ -153,7 +176,8 @@ class MirrorGroup: The 'fail' option will cause immediate failure by re-raising the exception and no further attempts to get the current - download. + download. As in the "No more mirrors" case, the 'errors' + attribute is set in the exception object. This dict can be set at instantiation time, mg = MirrorGroup(grabber, mirrors, default_action={'fail':1}) @@ -162,7 +186,7 @@ class MirrorGroup: or by returning an action dict from the failure_callback return {'fail':0} in increasing precedence. 
- + If all three of these were done, the net result would be: {'increment': 0, # set in method 'increment_master': 1, # class default @@ -180,10 +204,11 @@ class MirrorGroup: etc). Otherwise, it is assumed to be the callable object itself. The callback will be passed a grabber.CallbackObject instance along with args and kwargs (if present). The following - attributes are defined withing the instance: + attributes are defined within the instance: obj.exception = < exception that was raised > obj.mirror = < the mirror that was tried > + obj.tries = < the number of mirror tries so far > obj.relative_url = < url relative to the mirror > obj.url = < full url that failed > # .url is just the combination of .mirror @@ -251,22 +276,34 @@ class MirrorGroup: self.default_action = None self._process_kwargs(kwargs) + # use the same algorithm as parallel downloader to initially sort + # the mirror list (sort by speed, but prefer live private mirrors) + def estimate(m): + speed, fail = _TH.estimate(m['mirror']) + private = not fail and m.get('kwargs', {}).get('private', False) + return private, speed + + # update the initial order. since sorting is stable, the relative + # order of unknown (not used yet) hosts is retained. + self.mirrors.sort(key=estimate, reverse=True) + # if these values are found in **kwargs passed to one of the urlXXX # methods, they will be stripped before getting passed on to the # grabber options = ['default_action', 'failure_callback'] - + def _process_kwargs(self, kwargs): self.failure_callback = kwargs.get('failure_callback') self.default_action = kwargs.get('default_action') - + def _parse_mirrors(self, mirrors): parsed_mirrors = [] for m in mirrors: - if type(m) == type(''): m = {'mirror': m} + if isinstance(m, string_types): + m = {'mirror': _to_utf8(m)} parsed_mirrors.append(m) return parsed_mirrors - + def _load_gr(self, gr): # OVERRIDE IDEAS: # shuffle gr list @@ -280,7 +317,9 @@ class MirrorGroup: # return a random mirror so that multiple mirrors get used # even without failures. if not gr.mirrors: - raise URLGrabError(256, _('No more mirrors to try.')) + e = URLGrabError(256, _('No more mirrors to try.')) + e.errors = gr.errors + raise e return gr.mirrors[gr._next] def _failure(self, gr, cb_obj): @@ -290,7 +329,7 @@ class MirrorGroup: # the callback) cb = gr.kw.get('failure_callback') or self.failure_callback if cb: - if type(cb) == type( () ): + if isinstance(cb, tuple): cb, args, kwargs = cb else: args, kwargs = (), {} @@ -307,7 +346,9 @@ class MirrorGroup: a.update(action) action = a self.increment_mirror(gr, action) - if action and action.get('fail', 0): raise + if action and action.get('fail', 0): + sys.exc_info()[1].errors = gr.errors + raise def increment_mirror(self, gr, action={}): """Tell the mirror object increment the mirror index @@ -323,7 +364,7 @@ class MirrorGroup: urlopen, there's no good way for the mirror group to know that an error occurs mid-download (it's already returned and given you the file object). 
- + remove --- can have several values 0 do not remove the mirror from the list 1 remove the mirror for this download only @@ -345,7 +386,7 @@ class MirrorGroup: self._next += 1 if self._next >= len(self.mirrors): self._next = 0 self._lock.release() - + if action.get('remove', 1): del gr.mirrors[gr._next] elif action.get('increment', 1): @@ -353,9 +394,9 @@ class MirrorGroup: if gr._next >= len(gr.mirrors): gr._next = 0 if DEBUG: - grm = [m['mirror'] for m in gr.mirrors] + grm = [m['mirror'].decode() for m in gr.mirrors] DEBUG.info('GR mirrors: [%s] %i', ' '.join(grm), gr._next) - selfm = [m['mirror'] for m in self.mirrors] + selfm = [m['mirror'].decode() for m in self.mirrors] DEBUG.info('MAIN mirrors: [%s] %i', ' '.join(selfm), self._next) ##################################################################### @@ -366,47 +407,68 @@ class MirrorGroup: # by overriding the configuration methods :) def _join_url(self, base_url, rel_url): - if base_url.endswith('/') or rel_url.startswith('/'): - return base_url + rel_url + (scheme, netloc, path, query, fragid) = urlparse.urlsplit(base_url) + + if isinstance(base_url, bytes): + if not isinstance(rel_url, bytes): + rel_url = rel_url.encode('utf8') + sep = b'' if path.endswith(b'/') or rel_url.startswith(b'/') else b'/' else: - return base_url + '/' + rel_url - + sep = '' if path.endswith('/') or rel_url.startswith('/') else '/' + + return urlparse.urlunsplit((scheme, netloc, path + sep + rel_url, query, fragid)) + def _mirror_try(self, func, url, kw): gr = GrabRequest() gr.func = func gr.url = url gr.kw = dict(kw) self._load_gr(gr) + gr.errors = [] for k in self.options: try: del kw[k] except KeyError: pass - while 1: + tries = 0 + while True: + tries += 1 mirrorchoice = self._get_mirror(gr) fullurl = self._join_url(mirrorchoice['mirror'], gr.url) - kwargs = dict(mirrorchoice.get('kwargs', {})) - kwargs.update(kw) grabber = mirrorchoice.get('grabber') or self.grabber + # apply mirrorchoice kwargs on top of grabber.opts + opts = grabber.opts.derive(**mirrorchoice.get('kwargs', {})) func_ref = getattr(grabber, func) - if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl) + if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', _bytes_repr(url), _bytes_repr(fullurl)) try: - return func_ref( *(fullurl,), **kwargs ) - except URLGrabError, e: + return func_ref( *(fullurl,), opts=opts, **kw ) + except URLGrabError as e: if DEBUG: DEBUG.info('MIRROR: failed') + gr.errors.append((fullurl, exception2msg(e))) obj = CallbackObject() obj.exception = e obj.mirror = mirrorchoice['mirror'] obj.relative_url = gr.url obj.url = fullurl + obj.tries = tries self._failure(gr, obj) def urlgrab(self, url, filename=None, **kwargs): kw = dict(kwargs) kw['filename'] = filename + if kw.get('async_') or kw.get('async'): + # enable mirror failovers in async path + kw['mirror_group'] = self, [], {}, set() + kw['relative_url'] = url + else: + kw.pop('failfunc', None) func = 'urlgrab' - return self._mirror_try(func, url, kw) - + try: + return self._mirror_try(func, url, kw) + except URLGrabError as e: + obj = CallbackObject(url=url, filename=filename, exception=e, **kwargs) + return _run_callback(kwargs.get('failfunc', _do_raise), obj) + def urlopen(self, url, **kwargs): kw = dict(kwargs) func = 'urlopen' @@ -417,7 +479,7 @@ class MirrorGroup: kw['limit'] = limit func = 'urlread' return self._mirror_try(func, url, kw) - + class MGRandomStart(MirrorGroup): """A mirror group that starts at a random mirror in the list. 
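The mirror.py changes above add per-mirror kwargs such as max_connections and private, expose the number of tries to the failure callback, and attach an 'errors' list of (url, message) pairs to the final "No more mirrors" exception. A minimal sketch of how those options fit together follows; the mirror URLs, package path and filenames are hypothetical, and the callback simply drops the failing mirror and keeps going.

# Hypothetical mirrors and paths; a sketch of the options documented in the
# mirror.py changes above (per-mirror kwargs, failure_callback, and the
# 'errors' list attached to the final URLGrabError).
from urlgrabber.grabber import URLGrabber, URLGrabError
from urlgrabber.mirror import MirrorGroup

mirrors = [
    {'mirror': 'http://mirror-a.example.com/repo/',
     'kwargs': {'max_connections': 4}},        # cap parallel connections here
    {'mirror': 'http://mirror-b.example.com/repo/',
     'kwargs': {'private': True}},             # preferred while it has no failures
    'http://mirror-c.example.com/repo/',       # plain strings are accepted too
]

def on_failure(cb_obj):
    # cb_obj carries .exception, .mirror, .tries, .relative_url and .url
    print('try %d: %s failed: %s' % (cb_obj.tries, cb_obj.mirror, cb_obj.exception))
    return {'remove': 1}                       # mask this mirror, keep trying

mg = MirrorGroup(URLGrabber(retry=2), mirrors, failure_callback=on_failure)
try:
    local = mg.urlgrab('packages/foo-1.0-1.noarch.rpm', filename='foo.rpm')
except URLGrabError as e:
    # errno 256 means every mirror was exhausted; e.errors lists (url, message)
    print('giving up: %s' % e)
    for url, msg in getattr(e, 'errors', []):
        print('  %s: %s' % (url, msg))

Returning {'fail': 1} from the callback instead would re-raise immediately, with the same errors list attached to the exception.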
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py index dd07c6a..5b4c450 100644..100755 --- a/urlgrabber/progress.py +++ b/urlgrabber/progress.py @@ -9,23 +9,31 @@ # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko +from __future__ import print_function import sys import time import math -import thread import fcntl import struct import termios +if sys.version_info >= (3,): + # We use a version check because python2 also has _thread + import _thread as thread +else: + import thread + +from six import integer_types, string_types + # Code from http://mail.python.org/pipermail/python-list/2000-May/033365.html def terminal_width(fd=1): """ Get the real terminal width """ @@ -107,7 +115,7 @@ class BaseMeter: self.last_amount_read = 0 self.last_update_time = None self.re = RateEstimator() - + def start(self, filename=None, url=None, basename=None, size=None, now=None, text=None): self.filename = filename @@ -125,7 +133,7 @@ class BaseMeter: self.last_amount_read = 0 self.last_update_time = now self._do_start(now) - + def _do_start(self, now=None): pass @@ -133,8 +141,8 @@ class BaseMeter: # for a real gui, you probably want to override and put a call # to your mainloop iteration function here if now is None: now = time.time() - if (now >= self.last_update_time + self.update_period) or \ - not self.last_update_time: + if (not self.last_update_time or + (now >= self.last_update_time + self.update_period)): self.re.update(amount_read, now) self.last_amount_read = amount_read self.last_update_time = now @@ -152,7 +160,7 @@ class BaseMeter: def _do_end(self, amount_read, now=None): pass - + # This is kind of a hack, but progress is gotten from grabber which doesn't # know about the total size to download. So we do this so we can get the data # out of band here. This will be "fixed" one way or anther soon. @@ -167,7 +175,7 @@ def text_meter_total_size(size, downloaded=0): # # update: No size (minimal: 17 chars) # ----------------------------------- -# <text> <rate> | <current size> <elapsed time> +# <text> <rate> | <current size> <elapsed time> # 8-48 1 8 3 6 1 9 5 # # Order: 1. <text>+<current size> (17) @@ -202,7 +210,7 @@ def text_meter_total_size(size, downloaded=0): # # end # --- -# <text> | <current size> <elapsed time> +# <text> | <current size> <elapsed time> # 8-56 3 6 1 9 5 # # Order: 1. <text> ( 8) @@ -211,6 +219,21 @@ def text_meter_total_size(size, downloaded=0): # 4. + ( 5, total: 32) # +def _term_add_bar(tl, bar_max_length, pc): + blen = bar_max_length + bar = '='*int(blen * pc) + if (blen * pc) - int(blen * pc) >= 0.5: + bar += '-' + return tl.add(' [%-*.*s]' % (blen, blen, bar)) + +def _term_add_end(tl, osize, size): + if osize: # osize should be None or >0, but that's been broken. + if size > osize: # Is ??? better? Really need something to say < vs >. + return tl.add(' !!! '), True + elif size != osize: + return tl.add(' ... 
'), True + return tl.add(' ' * 5), False + class TextMeter(BaseMeter): def __init__(self, fo=sys.stderr): BaseMeter.__init__(self) @@ -218,7 +241,6 @@ class TextMeter(BaseMeter): def _do_update(self, amount_read, now=None): etime = self.re.elapsed_time() - fetime = format_time(etime) fread = format_number(amount_read) #self.size = None if self.text is not None: @@ -234,19 +256,23 @@ class TextMeter(BaseMeter): # Include text + ui_rate in minimal tl = TerminalLine(8, 8+1+8) + if tl._llen > 80: + use_hours = True # For big screens, make it more readable. + else: + use_hours = False ui_size = tl.add(' | %5sB' % fread) if self.size is None: - ui_time = tl.add(' %9s' % fetime) + ui_time = tl.add(' %s' % format_time(etime, use_hours)) ui_end = tl.add(' ' * 5) ui_rate = tl.add(' %5sB/s' % ave_dl) out = '%-*.*s%s%s%s%s\r' % (tl.rest(), tl.rest(), text, ui_rate, ui_size, ui_time, ui_end) else: rtime = self.re.remaining_time() - frtime = format_time(rtime) + frtime = format_time(rtime, use_hours) frac = self.re.fraction_read() - ui_time = tl.add(' %9s' % frtime) + ui_time = tl.add(' %s' % frtime) ui_end = tl.add(' ETA ') if sofar_size is None: @@ -259,13 +285,10 @@ class TextMeter(BaseMeter): ui_rate = tl.add(' %5sB/s' % ave_dl) # Make text grow a bit before we start growing the bar too blen = 4 + tl.rest_split(8 + 8 + 4) - bar = '='*int(blen * frac) - if (blen * frac) - int(blen * frac) >= 0.5: - bar += '-' - ui_bar = tl.add(' [%-*.*s]' % (blen, blen, bar)) - out = '%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text, - ui_sofar_pc, ui_pc, ui_bar, - ui_rate, ui_size, ui_time, ui_end) + ui_bar = _term_add_bar(tl, blen, frac) + out = '\r%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text, + ui_sofar_pc, ui_pc, ui_bar, + ui_rate,ui_size,ui_time, ui_end) self.fo.write(out) self.fo.flush() @@ -274,7 +297,6 @@ class TextMeter(BaseMeter): global _text_meter_total_size global _text_meter_sofar_size - total_time = format_time(self.re.elapsed_time()) total_size = format_number(amount_read) if self.text is not None: text = self.text @@ -282,14 +304,13 @@ class TextMeter(BaseMeter): text = self.basename tl = TerminalLine(8) - ui_size = tl.add(' | %5sB' % total_size) - ui_time = tl.add(' %9s' % total_time) - not_done = self.size is not None and amount_read != self.size - if not_done: - ui_end = tl.add(' ... ') + if tl._llen > 80: + use_hours = True # For big screens, make it more readable. 
else: - ui_end = tl.add(' ' * 5) - + use_hours = False + ui_size = tl.add(' | %5sB' % total_size) + ui_time = tl.add(' %s' % format_time(self.re.elapsed_time(), use_hours)) + ui_end, not_done = _term_add_end(tl, self.size, amount_read) out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text, ui_size, ui_time, ui_end) self.fo.write(out) @@ -331,14 +352,23 @@ class MultiFileHelper(BaseMeter): def message(self, message): self.master.message_meter(self, message) +class _FakeLock: + def acquire(self): + pass + def release(self): + pass + class MultiFileMeter: helperclass = MultiFileHelper - def __init__(self): + def __init__(self, threaded=True): self.meters = [] self.in_progress_meters = [] - self._lock = thread.allocate_lock() + if threaded: + self._lock = thread.allocate_lock() + else: + self._lock = _FakeLock() self.update_period = 0.3 # seconds - + self.numfiles = None self.finished_files = 0 self.failed_files = 0 @@ -369,8 +399,9 @@ class MultiFileMeter: def end(self, now=None): if now is None: now = time.time() + self.re.update(self._amount_read(), now) self._do_end(now) - + def _do_end(self, now): pass @@ -383,10 +414,10 @@ class MultiFileMeter: newmeter = self.helperclass(self) self.meters.append(newmeter) return newmeter - + def removeMeter(self, meter): self.meters.remove(meter) - + ########################################################### # child functions - these should only be called by helpers def start_meter(self, meter, now): @@ -400,15 +431,15 @@ class MultiFileMeter: finally: self._lock.release() self._do_start_meter(meter, now) - + def _do_start_meter(self, meter, now): pass - + def update_meter(self, meter, now): if not meter in self.meters: raise ValueError('attempt to use orphaned meter') - if (now >= self.last_update_time + self.update_period) or \ - not self.last_update_time: + if (not self.last_update_time or + (now >= self.last_update_time + self.update_period)): self.re.update(self._amount_read(), now) self.last_update_time = now self._do_update_meter(meter, now) @@ -466,34 +497,83 @@ class MultiFileMeter: class TextMultiFileMeter(MultiFileMeter): - def __init__(self, fo=sys.stderr): + def __init__(self, fo=sys.stderr, threaded=True): self.fo = fo - MultiFileMeter.__init__(self) + MultiFileMeter.__init__(self, threaded) + self.index_time = self.index = 0 # files: ###/### ###% data: ######/###### ###% time: ##:##:##/##:##:## +# New output, like TextMeter output... +# update: No size (minimal: 17 chars) +# ----------------------------------- +# (<#file>/<#tot files>): <text> <rate> | <current size> <elapsed> +# 8-48 1 8 3 6 1 7-9 5 +# +# update: Size, All files +# ----------------------- +# (<#file>/<#tot files>): <text> <pc> <bar> <rate> | <size> <eta time> ETA +# 8-22 1 3-4 1 6-12 1 8 3 6 1 7-9 1 3 1 +# end +# --- +# <text> | <file size> <file elapsed time> +# 8-56 3 6 1 9 5 def _do_update_meter(self, meter, now): self._lock.acquire() try: - format = "files: %3i/%-3i %3i%% data: %6.6s/%-6.6s %3i%% " \ - "time: %8.8s/%8.8s" df = self.finished_files tf = self.numfiles or 1 - pf = 100 * float(df)/tf + 0.49 + # Don't use "percent of files complete" ... 
+ # pf = 100 * float(df)/tf + 0.49 dd = self.re.last_amount_read - td = self.total_size + td = self.re.total pd = 100 * (self.re.fraction_read() or 0) + 0.49 dt = self.re.elapsed_time() rt = self.re.remaining_time() - if rt is None: tt = None - else: tt = dt + rt - - fdd = format_number(dd) + 'B' - ftd = format_number(td) + 'B' - fdt = format_time(dt, 1) - ftt = format_time(tt, 1) - - out = '%-79.79s' % (format % (df, tf, pf, fdd, ftd, pd, fdt, ftt)) - self.fo.write('\r' + out) + + frac = self.re.fraction_read() or 0 + pf = 100 * frac + ave_dl = format_number(self.re.average_rate()) + + # cycle through active meters + if now > self.index_time: + self.index_time = now + 1.0 + self.index += 1 + if self.index >= len(self.meters): + self.index = 0 + meter = self.meters[self.index] + text = meter.text or meter.basename + if tf > 1: + text = '(%u/%u): %s' % (df+1+self.index, tf, text) + + # Include text + ui_rate in minimal + tl = TerminalLine(8, 8+1+8) + if tl._llen > 80: + use_hours = True # For big screens, make it more readable. + else: + use_hours = False + ui_size = tl.add(' | %5sB' % format_number(dd)) + if not self.re.total: + ui_time = tl.add(' %s' % format_time(dt, use_hours)) + ui_end = tl.add(' ' * 5) + ui_rate = tl.add(' %5sB/s' % ave_dl) + out = '\r%-*.*s%s%s%s%s\r' % (tl.rest(), tl.rest(), text, + ui_rate, ui_size, ui_time, ui_end) + else: + ui_time = tl.add(' %s' % format_time(rt, use_hours)) + ui_end = tl.add(' ETA ') + + ui_sofar_pc = tl.add(' %i%%' % pf, + full_len=len(" (100%)")) + ui_rate = tl.add(' %5sB/s' % ave_dl) + + # Make text grow a bit before we start growing the bar too + blen = 4 + tl.rest_split(8 + 8 + 4) + ui_bar = _term_add_bar(tl, blen, frac) + out = '\r%-*.*s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text, + ui_sofar_pc, ui_bar, + ui_rate, ui_size, ui_time, + ui_end) + self.fo.write(out) self.fo.flush() finally: self._lock.release() @@ -502,25 +582,39 @@ class TextMultiFileMeter(MultiFileMeter): self._lock.acquire() try: format = "%-30.30s %6.6s %8.8s %9.9s" - fn = meter.basename + fn = meter.text or meter.basename size = meter.last_amount_read fsize = format_number(size) + 'B' et = meter.re.elapsed_time() - fet = format_time(et, 1) - frate = format_number(size / et) + 'B/s' - - out = '%-79.79s' % (format % (fn, fsize, fet, frate)) - self.fo.write('\r' + out + '\n') + frate = format_number(et and size / et) + 'B/s' + df = self.finished_files + tf = self.numfiles or 1 + + total_size = format_number(size) + text = meter.text or meter.basename + if tf > 1: + text = '(%u/%u): %s' % (df, tf, text) + + tl = TerminalLine(8) + if tl._llen > 80: + use_hours = True # For big screens, make it more readable. 
+ else: + use_hours = False + ui_size = tl.add(' | %5sB' % total_size) + ui_time = tl.add(' %s' % format_time(et, use_hours)) + ui_end, not_done = _term_add_end(tl, meter.size, size) + out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text, + ui_size, ui_time, ui_end) + self.fo.write(out) finally: self._lock.release() - self._do_update_meter(meter, now) def _do_failure_meter(self, meter, message, now): self._lock.acquire() try: format = "%-30.30s %6.6s %s" - fn = meter.basename - if type(message) in (type(''), type(u'')): + fn = meter.text or meter.basename + if isinstance(message, string_types): message = message.splitlines() if not message: message = [''] out = '%-79s' % (format % (fn, 'FAILED', message[0] or '')) @@ -537,15 +631,6 @@ class TextMultiFileMeter(MultiFileMeter): finally: self._lock.release() - def _do_end(self, now): - self._do_update_meter(None, now) - self._lock.acquire() - try: - self.fo.write('\n') - self.fo.flush() - finally: - self._lock.release() - ###################################################################### # support classes and functions @@ -560,13 +645,17 @@ class RateEstimator: self.last_update_time = now self.last_amount_read = 0 self.ave_rate = None - + def update(self, amount_read, now=None): if now is None: now = time.time() - if amount_read == 0: + # libcurl calls the progress callback when fetching headers + # too, thus amount_read = 0 .. hdr_size .. 0 .. content_size. + # Ocassionally we miss the 2nd zero and report avg speed < 0. + # Handle read_diff < 0 here. BZ 1001767. + if amount_read == 0 or amount_read < self.last_amount_read: # if we just started this file, all bets are off self.last_update_time = now - self.last_amount_read = 0 + self.last_amount_read = amount_read self.ave_rate = None return @@ -576,11 +665,11 @@ class RateEstimator: # First update, on reget is the file size if self.last_amount_read: self.last_update_time = now - self.ave_rate = self._temporal_rolling_ave(\ + self.ave_rate = self._temporal_rolling_ave( time_diff, read_diff, self.ave_rate, self.timescale) self.last_amount_read = amount_read #print 'results', time_diff, read_diff, self.ave_rate - + ##################################################################### # result methods def average_rate(self): @@ -616,14 +705,14 @@ class RateEstimator: epsilon = time_diff / timescale if epsilon > 1: epsilon = 1.0 return self._rolling_ave(time_diff, read_diff, last_ave, epsilon) - + def _rolling_ave(self, time_diff, read_diff, last_ave, epsilon): """perform a "rolling average" iteration a rolling average "folds" new data into an existing average with some weight, epsilon. 
epsilon must be between 0.0 and 1.0 (inclusive) a value of 0.0 means only the old value (initial value) counts, and a value of 1.0 means only the newest value is considered.""" - + try: recent_rate = read_diff / time_diff except ZeroDivisionError: @@ -652,23 +741,25 @@ class RateEstimator: rt = int(rt) if shift <= 0: return rt return float(int(rt) >> shift << shift) - + def format_time(seconds, use_hours=0): if seconds is None or seconds < 0: if use_hours: return '--:--:--' else: return '--:--' + elif seconds == float('inf'): + return 'Infinite' else: seconds = int(seconds) - minutes = seconds / 60 + minutes = seconds // 60 seconds = seconds % 60 if use_hours: - hours = minutes / 60 + hours = minutes // 60 minutes = minutes % 60 return '%02i:%02i:%02i' % (hours, minutes, seconds) else: return '%02i:%02i' % (minutes, seconds) - + def format_number(number, SI=0, space=' '): """Turn numbers into human-readable metric-like numbers""" symbols = ['', # (none) @@ -680,14 +771,14 @@ def format_number(number, SI=0, space=' '): 'E', # exa 'Z', # zetta 'Y'] # yotta - + if SI: step = 1000.0 else: step = 1024.0 thresh = 999 depth = 0 max_depth = len(symbols) - 1 - + # we want numbers between 0 and thresh, but don't exceed the length # of our list. In that event, the formatting will be screwed up, # but it'll still show the right number. @@ -695,7 +786,7 @@ def format_number(number, SI=0, space=' '): depth = depth + 1 number = number / step - if type(number) == type(1) or type(number) == type(1L): + if isinstance(number, integer_types): # it's an int or a long, which means it didn't get divided, # which means it's already short enough format = '%i%s%s' @@ -705,7 +796,7 @@ def format_number(number, SI=0, space=' '): format = '%.1f%s%s' else: format = '%.0f%s%s' - + return(format % (float(number or 0), space, symbols[depth])) def _tst(fn, cur, tot, beg, size, *args): @@ -722,9 +813,77 @@ def _tst(fn, cur, tot, beg, size, *args): time.sleep(delay) tm.end(size) +def _mtst(datas, *args): + print('-' * 79) + tm = TextMultiFileMeter(threaded=False) + + dl_sizes = {} + + num = 0 + total_size = 0 + dl_total_size = 0 + for data in datas: + dl_size = None + if len(data) == 2: + fn, size = data + dl_size = size + if len(data) == 3: + fn, size, dl_size = data + nm = tm.newMeter() + nm.start(fn, "http://www.example.com/path/to/fn/" + fn, fn, size, + text=fn) + num += 1 + assert dl_size is not None + dl_total_size += dl_size + dl_sizes[nm] = dl_size + if size is None or total_size is None: + total_size = None + else: + total_size += size + tm.start(num, total_size) + + num = 0 + off = 0 + for (inc, delay) in args: + off += 1 + while num < ((dl_total_size * off) / len(args)): + num += inc + for nm in tm.meters[:]: + if dl_sizes[nm] <= num: + nm.end(dl_sizes[nm]) + tm.removeMeter(nm) + else: + nm.update(num) + time.sleep(delay) + assert not tm.meters + if __name__ == "__main__": - # (1/2): subversion-1.4.4-7.x86_64.rpm 2.4 MB / 85 kB/s 00:28 - # (2/2): mercurial-0.9.5-6.fc8.x86_64.rpm 924 kB / 106 kB/s 00:08 + # (1/2): subversion-1.4.4-7.x86_64.rpm 2.4 MB / 85 kB/s 00:28 + # (2/2): mercurial-0.9.5-6.fc8.x86_64.rpm 924 kB / 106 kB/s 00:08 + if len(sys.argv) >= 2 and sys.argv[1] == 'multi': + _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000), + ("s-1.0.1-1.fc8.i386.rpm", 5000), + ("m-1.0.1-2.fc8.i386.rpm", 10000)), + (100, 0.33), (500, 0.25), (1000, 0.1)) + + _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000), + ("s-1.0.1-1.fc8.i386.rpm", 5000), + ("m-1.0.1-2.fc8.i386.rpm", None, 10000)), + (100, 0.33), (500, 0.25), (1000, 0.1)) + + 
_mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000), + ("s-1.0.1-1.fc8.i386.rpm", 2500000), + ("m-1.0.1-2.fc8.i386.rpm", 10000)), + (10, 0.2), (50, 0.1), (1000, 0.1)) + + _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000), + ("s-1.0.1-1.fc8.i386.rpm", None, 2500000), + ("m-1.0.1-2.fc8.i386.rpm", None, 10000)), + (10, 0.2), (50, 0.1), (1000, 0.1)) + # (10, 0.2), (100, 0.1), (100, 0.1), (100, 0.25)) + # (10, 0.2), (100, 0.1), (100, 0.1), (100, 0.25)) + sys.exit(0) + if len(sys.argv) >= 2 and sys.argv[1] == 'total': text_meter_total_size(1000 + 10000 + 10000 + 1000000 + 1000000 + 1000000 + 10000 + 10000 + 10000 + 1000000) |