author     biao716.wang <biao716.wang@samsung.com>    2020-08-26 11:34:58 +0900
committer  biao716.wang <biao716.wang@samsung.com>    2020-08-26 11:34:58 +0900
commit     a5651e772478cb72e7fd02e1d60bbe7a509a4d50 (patch)
tree       dd6921b15d6285d79e2e42dcbbd132ac066b19d1
parent     136d1e028cec5dcddc6ef6ac7302c794fda5f135 (diff)
Port to Python3 (tag: debian/4.1.0)

Change-Id: I46b8f71dce3d1f009617aa6e969414ad0c6393a6
Signed-off-by: biao716.wang <biao716.wang@samsung.com>
-rwxr-xr-x [-rw-r--r--]   ChangeLog                                    37
-rwxr-xr-x [-rw-r--r--]   LICENSE                                       0
-rwxr-xr-x                MANIFEST.in                                   9
-rwxr-xr-x [-rw-r--r--]   PKG-INFO                                     63
-rwxr-xr-x [-rw-r--r--]   README                                        2
-rwxr-xr-x [-rw-r--r--]   TODO                                          0
-rw-r--r--                debian/changelog                              6
-rw-r--r--                debian/control                                8
-rwxr-xr-x                debian/rules                                  4
-rwxr-xr-x [-rw-r--r--]   makefile                                     14
-rw-r--r--                packaging/python-urlgrabber.changes         206
-rw-r--r--                packaging/python-urlgrabber.spec              9
-rwxr-xr-x [-rw-r--r--]   scripts/urlgrabber                           75
-rwxr-xr-x                scripts/urlgrabber-ext-down                  82
-rwxr-xr-x                setup.cfg                                     4
-rwxr-xr-x [-rw-r--r--]   setup.py                                     99
-rwxr-xr-x [-rw-r--r--]   test/base_test_code.py                        7
-rwxr-xr-x [-rw-r--r--]   test/grabberperf.py                          40
-rwxr-xr-x [-rw-r--r--]   test/munittest.py                           140
-rwxr-xr-x [-rw-r--r--]   test/runtests.py                             20
-rwxr-xr-x [-rw-r--r--]   test/test_byterange.py                      103
-rwxr-xr-x [-rw-r--r--]   test/test_grabber.py                        327
-rwxr-xr-x [-rw-r--r--]   test/test_mirror.py                         207
-rwxr-xr-x [-rw-r--r--]   test/threading/batchgrabber.py               42
-rwxr-xr-x                urlgrabber.egg-info/PKG-INFO                 31
-rwxr-xr-x                urlgrabber.egg-info/SOURCES.txt              27
-rwxr-xr-x                urlgrabber.egg-info/dependency_links.txt      1
-rwxr-xr-x                urlgrabber.egg-info/requires.txt              3
-rwxr-xr-x                urlgrabber.egg-info/top_level.txt             1
-rwxr-xr-x [-rw-r--r--]   urlgrabber/__init__.py                       44
-rwxr-xr-x [-rw-r--r--]   urlgrabber/byterange.py                     201
-rwxr-xr-x [-rw-r--r--]   urlgrabber/grabber.py                      1701
-rwxr-xr-x [-rw-r--r--]   urlgrabber/mirror.py                        132
-rwxr-xr-x [-rw-r--r--]   urlgrabber/progress.py                      341
34 files changed, 2841 insertions, 1145 deletions
diff --git a/ChangeLog b/ChangeLog
index 644fbdb..5a72bbf 100644..100755
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,40 @@
+2019-10-08 Neal Gompa <ngompa13@gmail.com>
+
+ * Fix confused license header to clarify licensing
+ * Fix Python 3 compatibility with urlgrabber-ext-down
+ * Support HTTP CONNECT with reget. BZ 1585596
+ * Fix for usage of _levelNames from logging module
+ * Fix issue when URLGRABBER_DEBUG is not an integer on Python 3
+ * Revise setup.py to remove need for extra setup-time dependencies
+ * setuptools: Update Development Status to "Production/Stable"
+ * Bump version to 4.1.0
+
+2019-02-25 Neal Gompa <ngompa13@gmail.com>
+
+ * Port to Python 3
+ * Add curl_obj option to grabber
+ * Throw an obvious error message when urlgrabber-ext-down
+ is missing when attempting to use external downloader
+ * Use setuptools for setup.py instead of distutils
+ * bump version to 4.0.0
+
+2017-02-02 Valentina Mukhamedzhanova <vmukhame@redhat.com>
+
+ * Add no_cache and retry_no_cache options.
+ * Work around pycurl dependency in setup.py.
+ * Don't set speed=0 on a new mirror that 404'd.
+ * Add a comprehensive error message to pycurl error 77.
+ * Don't crash on timedhosts parsing error.
+ * bump version to 3.10.2
+
+2013-10-09 Zdenek Pavlas <zpavlas@redhat.com>
+
+ * lots of enahncements and bugfixes
+ (parallel downloading, mirror profiling, new options)
+ * updated authors, url
+ * updated unit tests
+ * bump version to 3.10
+
2009-09-25 Seth Vidal <skvidal@fedoraproject.org>
* urlgrabber/__init__.py: bump version to 3.9.1
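
Nearly every file touched below gains "from __future__ import print_function". As a minimal sketch of what that import buys (standard library only), the same print syntax then runs unchanged on Python 2 and 3:

    # With the __future__ import, print is a function on Python 2 too,
    # so keyword arguments like file= and end= work on both versions.
    from __future__ import print_function
    import sys

    print("error:", "something failed", file=sys.stderr)
    print("no trailing newline", end="")
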
diff --git a/LICENSE b/LICENSE
index 3b20440..3b20440 100644..100755
--- a/LICENSE
+++ b/LICENSE
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100755
index 0000000..999fdee
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,9 @@
+recursive-include urlgrabber *.py
+recursive-include test *.py
+include scripts/urlgrabber
+include README
+include LICENSE
+include TODO
+include ChangeLog
+include MANIFEST
+include makefile \ No newline at end of file
diff --git a/PKG-INFO b/PKG-INFO
index 1368b10..20b9966 100644..100755
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,48 +1,31 @@
-Metadata-Version: 1.0
+Metadata-Version: 1.2
Name: urlgrabber
-Version: 3.9.1
+Version: 4.1.0
Summary: A high-level cross-protocol url-grabber
-Home-page: http://linux.duke.edu/projects/urlgrabber/
-Author: Michael D. Stenner, Ryan Tomayko
-Author-email: mstenner@linux.duke.edu, skvidal@fedoraproject.org
-License: LGPL
-Description: A high-level cross-protocol url-grabber.
-
- Using urlgrabber, data can be fetched in three basic ways:
-
- urlgrab(url) copy the file to the local filesystem
- urlopen(url) open the remote file and return a file object
- (like urllib2.urlopen)
- urlread(url) return the contents of the file as a string
-
- When using these functions (or methods), urlgrabber supports the
- following features:
-
- * identical behavior for http://, ftp://, and file:// urls
- * http keepalive - faster downloads of many files by using
- only a single connection
- * byte ranges - fetch only a portion of the file
- * reget - for a urlgrab, resume a partial download
- * progress meters - the ability to report download progress
- automatically, even when using urlopen!
- * throttling - restrict bandwidth usage
- * retries - automatically retry a download if it fails. The
- number of retries and failure types are configurable.
- * authenticated server access for http and ftp
- * proxy support - support for authenticated http and ftp proxies
- * mirror groups - treat a list of mirrors as a single source,
- automatically switching mirrors if there is a failure.
-
+Home-page: http://urlgrabber.baseurl.org/
+Author: Michael D. Stenner, Ryan Tomayko, Seth Vidal, Zdenek Pavlas
+Author-email: mstenner@linux.duke.edu, rtomayko@naeblis.cx, skvidal@fedoraproject.org, zpavlas@redhat.com
+Maintainer: Neal Gompa
+Maintainer-email: ngompa@fedoraproject.org
+License: LGPLv2+
+Description: UNKNOWN
+Keywords: urlgrabber yum http ftp
Platform: UNKNOWN
-Classifier: Development Status :: 4 - Beta
-Classifier: Environment :: Console
-Classifier: Environment :: Web Environment
+Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: System Administrators
-Classifier: License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)
-Classifier: Operating System :: POSIX
-Classifier: Operating System :: POSIX :: Linux
-Classifier: Programming Language :: Python
Classifier: Topic :: Internet :: File Transfer Protocol (FTP)
Classifier: Topic :: Internet :: WWW/HTTP
Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Environment :: Console
+Classifier: Environment :: Web Environment
+Classifier: License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)
+Classifier: Operating System :: POSIX
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 2
+Classifier: Programming Language :: Python :: 2.6
+Classifier: Programming Language :: Python :: 2.7
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
diff --git a/README b/README
index 5fd378b..2718d3c 100644..100755
--- a/README
+++ b/README
@@ -19,7 +19,7 @@ You can build rpms by running
python setup.py bdist_rpm
The rpms (both source and "binary") will be specific to the current
-distrubution/version and may not be portable to others. This is
+distribution/version and may not be portable to others. This is
because they will be built for the currently installed python.
keepalive.py and byterange.py are generic urllib2 extension modules and
diff --git a/TODO b/TODO
index ad1dc8a..ad1dc8a 100644..100755
--- a/TODO
+++ b/TODO
diff --git a/debian/changelog b/debian/changelog
index 1e95e82..9c663ba 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+python-urlgrabber (4.1.0) unstable; urgency=low
+
+ * Port to Python 3
+
+ -- Biao Wang <biao716.wang@samsung.com> Wed, 26 Aug 2020 10:09:30 +0800
+
python-urlgrabber (3.9.1) unstable; urgency=low
* add debian
diff --git a/debian/control b/debian/control
index a365ee5..6776271 100644
--- a/debian/control
+++ b/debian/control
@@ -2,15 +2,15 @@ Source: python-urlgrabber
Section: devel
Priority: extra
Maintainer: Jong-Woo Chae <jonwoo.chae@samsung.com>
-Build-Depends: debhelper (>= 7.0.15), python-dev, python-pycurl
-Standards-Version: 3.9.1
+Build-Depends: debhelper (>= 7.0.15), python3-six, python3-pycurl, python3-setuptools, dh_python3
+Standards-Version: 4.1.0
Homepage: http://www.tizen.org
Package: python-urlgrabber
Architecture: all
Depends: ${python:Depends},
- python-m2crypto | python3-m2crypto,
- python-pycurl
+ python3-six,
+ python3-pycurl
Description: image creator for Linux distributions
The tool createrep is used to ppfarm test
diff --git a/debian/rules b/debian/rules
index 0fa0995..b77ab76 100755
--- a/debian/rules
+++ b/debian/rules
@@ -6,7 +6,7 @@
build: build-stamp
build-stamp:
dh_testdir
- python setup.py build
+ python3 setup.py build
clean:
dh_testdir
dh_testroot
@@ -34,7 +34,7 @@ binary-indep: build install
dh_strip
dh_compress
dh_fixperms
- dh_python2
+ dh_python3
dh_installdeb
dh_shlibdeps
dh_gencontrol
diff --git a/makefile b/makefile
index caa0f9e..2be1eb9 100644..100755
--- a/makefile
+++ b/makefile
@@ -10,14 +10,14 @@ CLEANFILES = MANIFEST *~ build dist export release daily reference nonexistent_f
*.pyc urlgrabber/*.pyc scripts/*.pyc test/*.pyc test/nonexistent_file \
test/reference test/reference.part urlgrabber/*~
##############################################################################
-VERSION = $(shell $(PYTHON) -c 'import $(PY_MODULE); print $(PY_MODULE).__version__')
-DATE = $(shell $(PYTHON) -c 'import $(PY_MODULE); print $(PY_MODULE).__date__')
+VERSION = $(shell $(PYTHON) -c 'import $(PY_MODULE); print($(PY_MODULE).__version__)')
+DATE = $(shell $(PYTHON) -c 'import $(PY_MODULE); print($(PY_MODULE).__date__)')
SCM_TAG = release-$(shell echo $(VERSION) | sed -e 's/\./_/g')
-PYTHON22 = $(shell /usr/bin/which python2.2 2>/dev/null)
-PYTHON23 = $(shell /usr/bin/which python2.3 2>/dev/null)
-PYTHON24 = $(shell /usr/bin/which python2.4 2>/dev/null)
-PYTHON25 = $(shell /usr/bin/which python2.5 2>/dev/null)
-TESTPYTHONS = $(PYTHON22) $(PYTHON23) $(PYTHON24) $(PYTHON25)
+PYTHON26 = $(shell /usr/bin/which python2.6 2>/dev/null)
+PYTHON27 = $(shell /usr/bin/which python2.7 2>/dev/null)
+PYTHON36 = $(shell /usr/bin/which python3.6 2>/dev/null)
+PYTHON37 = $(shell /usr/bin/which python3.7 2>/dev/null)
+TESTPYTHONS = $(PYTHON26) $(PYTHON27) $(PYTHON36) $(PYTHON37)
##############################################################################
default:
diff --git a/packaging/python-urlgrabber.changes b/packaging/python-urlgrabber.changes
index dc0d37f..c33ecab 100644
--- a/packaging/python-urlgrabber.changes
+++ b/packaging/python-urlgrabber.changes
@@ -1,4 +1,204 @@
-* Wed Mar 06 2013 Patrick McCarty <patrick.mccarty@linux.intel.com> upstream/3.9.1@c5b017c
-- Add packaging
-- Imported Upstream version 3.9.1
+-------------------------------------------------------------------
+Wed May 20 07:47:54 UTC 2020 - pgajdos@suse.com
+
+- urlgrabber-ext-down as an alternative
+
+-------------------------------------------------------------------
+Mon May 18 09:10:43 UTC 2020 - Petr Gajdos <pgajdos@suse.com>
+
+- %python3_only -> %python_alternative
+- urlgrabber-ext-down is expected to reside under /usr/libexec
+
+-------------------------------------------------------------------
+Wed Oct 9 07:16:52 UTC 2019 - Tomáš Chvátal <tchvatal@suse.com>
+
+- Update to 4.1.0:
+ * Fix confused license header to clarify licensing
+ * Fix Python 3 compatibility with urlgrabber-ext-down
+ * Support HTTP CONNECT with reget. BZ 1585596
+ * Fix for usage of _levelNames from logging module
+ * Fix issue when URLGRABBER_DEBUG is not an integer on Python 3
+ * Revise setup.py to remove need for extra setup-time dependencies
+ * setuptools: Update Development Status to Production/Stable
+
+-------------------------------------------------------------------
+Wed Feb 27 09:39:32 UTC 2019 - Tomáš Chvátal <tchvatal@suse.com>
+
+- Drop patch grabber_fix.diff that was never upstreamed. Should
+ not be needed anymore
+
+-------------------------------------------------------------------
+Mon Feb 25 17:44:43 CET 2019 - Matej Cepl <mcepl@suse.com>
+
+- Update to the upstream version 4.0.0:
+ * Port to Python 3 rocket
+ * Add curl_obj option to grabber
+ * Throw an obvious error message when urlgrabber-ext-down is
+ missing when attempting to use external downloader
+ * Use setuptools for setup.py instead of distutils
+- Remove merged patches:
+ * declare-dollar-sign-as-safe-in-urlquote.patch
+ * python-urlgrabber-3.9.1-set-SSL_VERIFYHOST-correct.dif
+ * python-urlgrabber-3.9.1-preserve-queryparams-in-urls.patch
+
+-------------------------------------------------------------------
+Tue Dec 4 12:55:41 UTC 2018 - Matej Cepl <mcepl@suse.com>
+
+- Remove superfluous devel dependency for noarch package
+
+-------------------------------------------------------------------
+Tue May 29 14:23:52 UTC 2018 - mcepl@suse.com
+
+- Clean SPEC file
+
+-------------------------------------------------------------------
+Thu Aug 24 13:56:44 UTC 2017 - jmatejek@suse.com
+
+- singlespec auto-conversion
+
+-------------------------------------------------------------------
+Thu Feb 12 13:42:05 CET 2015 - mc@suse.de
+
+- declare $ sign as a safe character in url paths to prevent
+ escaping /$RCE/ which lead into problems with token auth
+ (bnc#902416)
+ * declare-dollar-sign-as-safe-in-urlquote.patch
+ * python-urlgrabber-3.9.1-set-SSL_VERIFYHOST-correct.dif
+- set curl option SSL_VERIFYHOST correct
+
+-------------------------------------------------------------------
+Tue Sep 16 12:38:07 UTC 2014 - dmacvicar@suse.de
+
+- Add python-urlgrabber-3.9.1-preserve-queryparams-in-urls.patch
+ (bnc#896844)
+
+-------------------------------------------------------------------
+Wed Feb 6 18:06:41 UTC 2013 - jmatejek@suse.com
+
+- Add grabber_fix.diff: Fixed timeout and other errors breaking yum
+ compatibility (bnc#793650)
+
+-------------------------------------------------------------------
+Mon Oct 1 09:53:26 UTC 2012 - saschpe@suse.de
+
+- Fixed wrong license header in urlgrabber/__init__.py (bnc#781323)
+- Updated upstream URL, the project moved the baseurl.org (yum)
+
+-------------------------------------------------------------------
+Tue Sep 20 11:40:05 UTC 2011 - saschpe@suse.de
+
+- Update to version 3.9.1:
+ * cleanup all the old urlgrabber urllib code that's not being used
+ * delete sslfactory and keepalive fix up the unittests to match existing code
+ * make sure the value we get back from the parse150 and other calls is
+ converted to an int before we make it 'size' rhbug: #524705
+- Spec file updates:
+ * Removed authors from description
+ * Dropped useless python-urlgrabber-2.9.9.patch
+ * Dropped obsolete python-urlgrabber-3.1.0.patch (upstream changed)
+ * Require python-pycurl
+
+-------------------------------------------------------------------
+Wed Aug 12 20:10:37 CEST 2009 - matejcik@suse.cz
+
+- build as noarch on newer distros
+- switched filelist to --record-rpm
+
+-------------------------------------------------------------------
+Mon Aug 10 14:06:55 CEST 2009 - coolo@novell.com
+
+- sync factory and build service
+
+-------------------------------------------------------------------
+Sun Aug 9 08:45:09 CEST 2009 - coolo@novell.com
+
+- use new python macros
+
+-------------------------------------------------------------------
+Tue May 12 14:14:04 CEST 2009 - poeml@suse.de
+
+- fix build on 11.1 onwards, where python must be in the
+ buildrequires in addition to python-devel, because otherwise
+ urllib2 appears to have no SSL support
+
+-------------------------------------------------------------------
+Fri Sep 26 11:11:32 CEST 2008 - cthiel@suse.de
+
+- add python to BuildRequires to fix build
+
+-------------------------------------------------------------------
+Tue Feb 19 16:01:09 CET 2008 - cthiel@suse.de
+
+- fix url parsing error in grabber.py (bnc #362937)
+
+-------------------------------------------------------------------
+Mon Oct 2 13:53:16 CEST 2006 - cthiel@suse.de
+
+- fix build on older distributions
+
+-------------------------------------------------------------------
+Sun Oct 1 15:34:10 CEST 2006 - cthiel@suse.de
+
+- update to version 3.1.0
+ * various fixes
+
+-------------------------------------------------------------------
+Thu Sep 21 14:26:46 CEST 2006 - cthiel@suse.de
+
+- fix build with python 2.5
+
+-------------------------------------------------------------------
+Fri Aug 4 17:25:18 CEST 2006 - cthiel@suse.de
+
+- update to version 2.9.10
+ * Make keepalive, byteranges, etc. work with https.
+ * Fixed a minor error reporting bug due to changes in python 2.4.
+ * Catch read errors after the file has been opened.
+- removed obsolete urlgrabber-read-error.patch
+
+-------------------------------------------------------------------
+Thu May 25 14:19:34 CEST 2006 - cthiel@suse.de
+
+- update to version 2.9.9
+ * Added tests to make sure that the "quote" option works as advertised
+ * Significant improvement to URL parsing. Parsing is now broken out into
+ a separate class (URLParser). It will now (by default) guess whether a
+ URL is already quoted, properly handle local files and URLs on windows,
+ and display un-quoted versions of the filename in the progress meter.
+ * Added a reget progress bar patch from Menno, and fixed the annoying next
+ _IndexError bug.
+- added urlgrabber-read-error.patch (from Fedora)
+- removed python-urlgrabber-2.9.7-reget.patch (included upstream)
+
+-------------------------------------------------------------------
+Tue Feb 28 16:46:03 CET 2006 - jmatejek@suse.cz
+
+- updated to reflect python changes due to #149809
+
+-------------------------------------------------------------------
+Wed Jan 25 21:40:54 CET 2006 - mls@suse.de
+
+- converted neededforbuild to BuildRequires
+
+-------------------------------------------------------------------
+Mon Oct 31 11:56:02 CET 2005 - dmueller@suse.de
+
+- don't build as root
+
+-------------------------------------------------------------------
+Wed Oct 26 13:14:27 CEST 2005 - cthiel@suse.de
+
+- update to version 2.9.7
+
+-------------------------------------------------------------------
+Tue Sep 13 11:01:26 CEST 2005 - cthiel@suse.de
+
+- specfile cleanup
+
+-------------------------------------------------------------------
+Sun Aug 14 02:04:14 CEST 2005 - cthiel@suse.de
+
+- initial package (version 2.9.6)
+
+
diff --git a/packaging/python-urlgrabber.spec b/packaging/python-urlgrabber.spec
index 6113fb5..58e8755 100644
--- a/packaging/python-urlgrabber.spec
+++ b/packaging/python-urlgrabber.spec
@@ -1,6 +1,6 @@
Name: python-urlgrabber
Summary: A high-level cross-protocol url-grabber
-Version: 3.9.1
+Version: 4.1.0
Release: 0
Group: Development/Libraries
License: LGPL-2.1+
@@ -8,10 +8,11 @@ BuildArch: noarch
URL: http://urlgrabber.baseurl.org/
Source0: urlgrabber-%{version}.tar.gz
Source1001: python-urlgrabber.manifest
-BuildRequires: python-devel
-BuildRequires: python-pycurl
-Requires: python-M2Crypto
+BuildRequires: python3-pycurl
+BuildRequires: python3-six
+BuildRequires: python3-setuptools
Requires: python-pycurl
+Requires: python-six
Provides: urlgrabber = %{version}-%{release}
%description
diff --git a/scripts/urlgrabber b/scripts/urlgrabber
index 518e512..1b1e077 100644..100755
--- a/scripts/urlgrabber
+++ b/scripts/urlgrabber
@@ -19,6 +19,8 @@
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2006 Michael D. Stenner, Ryan Tomayko
+from __future__ import print_function
+
"""NAME
urlgrabber - a simple client for the urlgrabber python package
@@ -115,6 +117,7 @@ options:
including quotes in the case of strings.
e.g. --user_agent='"foobar/2.0"'
+ --output FILE
-o FILE write output to FILE, otherwise the basename of the
url will be used
-O print the names of saved files to STDOUT
@@ -130,8 +133,6 @@ options:
--profile profile the actual fetching and print the results
"""
-# $Id: urlgrabber,v 1.7 2006/12/08 00:14:16 mstenner Exp $
-
import sys
import getopt
import re
@@ -170,12 +171,17 @@ class client_options:
return ug_options, ug_defaults
def process_command_line(self):
- short_options = 'vd:hoOpD'
+ short_options = 'vd:ho:OpD'
long_options = ['profile', 'repeat=', 'verbose=',
- 'debug=', 'help', 'progress']
+ 'debug=', 'help', 'progress', 'output=']
ug_long = [ o + '=' for o in self.ug_options ]
- optlist, args = getopt.getopt(sys.argv[1:], short_options,
- long_options + ug_long)
+ try:
+ optlist, args = getopt.getopt(sys.argv[1:], short_options,
+ long_options + ug_long)
+ except getopt.GetoptError as e:
+ print("Error:", e, file=sys.stderr)
+ self.help([], ret=1)
+
self.verbose = 0
self.debug = None
self.outputfile = None
@@ -193,6 +199,7 @@ class client_options:
if o == '--verbose': self.verbose = v
if o == '-v': self.verbose += 1
if o == '-o': self.outputfile = v
+ if o == '--output': self.outputfile = v
if o == '-p' or o == '--progress': self.progress = 1
if o == '-d' or o == '--debug': self.debug = v
if o == '--profile': self.profile = 1
@@ -202,7 +209,7 @@ class client_options:
self.repeat = int(v)
if self.repeat < 1: raise ValueError()
except ValueError:
- print 'ERROR: repeat value must be an int >= 1'
+ print('ERROR: repeat value must be an int >= 1')
sys.exit(1)
if o == '-D':
self.verbose = 3
@@ -211,20 +218,20 @@ class client_options:
if o in ug_dash:
try:
val = eval(v)
- except Exception, e:
- print "error processing option value: %s" % v
- print e
+ except Exception as e:
+ print("error processing option value: %s" % v)
+ print(e)
sys.exit(1)
else:
self.ugops[o[2:]] = val
if len(self.args) > 1 and self.outputfile is not None:
- print "ERROR: cannot use -o when grabbing multiple files"
+ print("ERROR: cannot use -o when grabbing multiple files")
sys.exit(1)
- def help(self, args):
+ def help(self, args, ret=0):
if not args:
- print MAINHELP
+ print(MAINHELP)
else:
for a in args:
m = getattr(self, 'help_'+a, None)
@@ -233,20 +240,20 @@ class client_options:
elif a in self.ug_options:
self.help_ug_option(a)
else:
- print 'ERROR: no help on command "%s"' % a
- sys.exit(0)
+ print('ERROR: no help on command "%s"' % a)
+ sys.exit(ret)
def help_doc(self):
- print __doc__
+ print(__doc__)
def help_options(self):
width = max(map(len, self.ug_options))
format = ' %-' + str(width) + 's = %s'
hformat = ' %-' + str(width) + 's %s'
- print hformat % ('OPTION', 'DEFAULT')
- print '-'*(width + 20)
+ print(hformat % ('OPTION', 'DEFAULT'))
+ print('-'*(width + 20))
for k in self.ug_options:
- print format % (k, self.ug_defaults[k])
+ print(format % (k, self.ug_defaults[k]))
def help_all(self):
for k in self.ug_options:
@@ -257,21 +264,21 @@ class client_options:
m = re.search(r'^( '+option+'.*?)\s*^ {,2}\S',
urlgrabber.grabber.__doc__, re.M|re.S)
if m:
- print m.group(1)
+ print(m.group(1))
else:
- print ' %s: no help found for this option' % option
- print ''
+ print(' %s: no help found for this option' % option)
+ print('')
class ugclient:
def __init__(self):
op = client_options()
self.op = op
if op.verbose >= 2 and op.ugops:
- print "Module Options:"
+ print("Module Options:")
width = max(map(len, op.ugops.keys()))
format = " %-" + str(width) + "s = %s"
for k, v in op.ugops.items():
- print format % (k, repr(v))
+ print(format % (k, repr(v)))
if op.debug:
self.set_debug_logger(op.debug)
@@ -287,22 +294,26 @@ class ugclient:
def run(self):
for url in self.op.args:
- if self.op.verbose: print 'grabbing: %s' % url
+ if self.op.verbose: print('grabbing: %s' % url)
try:
for i in range(0, self.op.repeat):
f = self.g.urlgrab(url, self.op.outputfile)
- if self.op.localfile: print f
- except URLGrabError, e:
- print e
-
+ if self.op.localfile: print(f)
+ except URLGrabError as e:
+ print(e)
+ sys.exit(1)
+
def set_debug_logger(self, dbspec):
try:
dbinfo = dbspec.split(',')
import logging
- level = logging._levelNames.get(dbinfo[0], None)
- if level is None: level = int(dbinfo[0])
+ if sys.version_info.major == 2:
+ level = logging._levelNames.get(dbinfo[0], None)
+ else:
+ level = logging.getLevelName(dbinfo[0])
+ if level is None or not isinstance(level, int): level = int(dbinfo[0])
if level < 1: raise ValueError()
-
+
formatter = logging.Formatter('%(asctime)s %(message)s')
if len(dbinfo) > 1: filename = dbinfo[1]
else: filename = ''
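
The set_debug_logger() hunk above drops the private logging._levelNames mapping, which is absent on Python 3. A minimal standalone sketch of the replacement lookup (standard library only; the function name here is illustrative):

    import logging

    def resolve_level(spec):
        # logging.getLevelName('DEBUG') returns the int 10, but for an
        # unknown name it returns the string 'Level <spec>' rather than
        # None -- hence the isinstance(level, int) guard in the patch.
        level = logging.getLevelName(spec)
        if not isinstance(level, int):
            level = int(spec)  # fall back to a numeric spec such as '15'
        return level

    print(resolve_level('DEBUG'))  # 10
    print(resolve_level('15'))     # 15
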
diff --git a/scripts/urlgrabber-ext-down b/scripts/urlgrabber-ext-down
new file mode 100755
index 0000000..40469a7
--- /dev/null
+++ b/scripts/urlgrabber-ext-down
@@ -0,0 +1,82 @@
+#! /usr/bin/python
+# A very simple external downloader
+# Copyright 2011-2012 Zdenek Pavlas
+
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
+# Boston, MA 02111-1307 USA
+
+import time, os, errno, sys
+import six
+from urlgrabber.grabber import \
+ _readlines, URLGrabberOptions, _loads, \
+ PyCurlFileObject, URLGrabError, _to_utf8
+
+def write(fmt, *arg):
+ buf = fmt % arg
+ if six.PY3:
+ buf = buf.encode()
+ try:
+ os.write(1, buf)
+ except OSError as e:
+ if e.args[0] != errno.EPIPE: raise
+ sys.exit(1)
+
+class ProxyProgress:
+ def start(self, *d1, **d2):
+ self.next_update = 0
+ def update(self, _amount_read):
+ t = time.time()
+ if t < self.next_update: return
+ self.next_update = t + 0.31
+ write('%d %d\n', self._id, _amount_read)
+
+def main():
+ import signal
+ signal.signal(signal.SIGINT, lambda n, f: sys.exit(1))
+ cnt = 0
+ while True:
+ lines = _readlines(0)
+ if not lines: break
+ for line in lines:
+ if not isinstance(line, six.string_types):
+ line = line.decode('utf-8')
+ cnt += 1
+ opts = URLGrabberOptions()
+ opts._id = cnt
+ for k in line.split(' '):
+ k, v = k.split('=', 1)
+ setattr(opts, k, _loads(v))
+ if opts.progress_obj:
+ opts.progress_obj = ProxyProgress()
+ opts.progress_obj._id = cnt
+
+ dlsz = dltm = 0
+ try:
+ fo = PyCurlFileObject(_to_utf8(opts.url), opts.filename, opts)
+ fo._do_grab()
+ fo.fo.close()
+ size = fo._amount_read
+ if fo._tm_last:
+ dlsz = fo._tm_last[0] - fo._tm_first[0]
+ dltm = fo._tm_last[1] - fo._tm_first[1]
+ ug_err = 'OK'
+ except URLGrabError as e:
+ size = 0
+ ug_err = '%d %d %s' % (e.errno, getattr(e, 'code', 0), e.strerror)
+ write('%d %d %d %.3f %s\n', opts._id, size, dlsz, dltm, ug_err)
+
+if __name__ == '__main__':
+ main()
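
For orientation: the script reads serialized download options on stdin and reports results on stdout, one line per download, via the final write() call. A hypothetical parent-side parser for that line, shown only to document the field layout (this is not urlgrabber's actual code):

    def parse_status_line(line):
        # Fields mirror write('%d %d %d %.3f %s\n', opts._id, size,
        # dlsz, dltm, ug_err); ug_err itself may contain spaces.
        _id, size, dlsz, dltm, err = line.rstrip('\n').split(' ', 4)
        return int(_id), int(size), int(dlsz), float(dltm), err

    print(parse_status_line('1 2048 2048 0.120 OK\n'))
    # -> (1, 2048, 2048, 0.12, 'OK')
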
diff --git a/setup.cfg b/setup.cfg
new file mode 100755
index 0000000..8bfd5a1
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0
+
diff --git a/setup.py b/setup.py
index d0b87b8..6f6a6bd 100644..100755
--- a/setup.py
+++ b/setup.py
@@ -1,45 +1,58 @@
-# urlgrabber distutils setup
-import re as _re
-import urlgrabber as _urlgrabber
+from setuptools import setup
-name = "urlgrabber"
-description = "A high-level cross-protocol url-grabber"
-long_description = _urlgrabber.__doc__
-license = "LGPL"
-version = _urlgrabber.__version__
-_authors = _re.split(r',\s+', _urlgrabber.__author__)
-author = ', '.join([_re.sub(r'\s+<.*', r'', _) for _ in _authors])
-author_email = ', '.join([_re.sub(r'(^.*<)|(>.*$)', r'', _) for _ in _authors])
-url = _urlgrabber.__url__
+pkg_name = "urlgrabber"
+pkg_version = "4.1.0"
-packages = ['urlgrabber']
-package_dir = {'urlgrabber':'urlgrabber'}
-scripts = ['scripts/urlgrabber']
-data_files = [('share/doc/' + name + '-' + version,
- ['README','LICENSE', 'TODO', 'ChangeLog'])]
-options = { 'clean' : { 'all' : 1 } }
-classifiers = [
- 'Development Status :: 4 - Beta',
- 'Environment :: Console',
- 'Environment :: Web Environment',
- 'Intended Audience :: Developers',
- 'Intended Audience :: System Administrators',
- 'License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)',
- 'Operating System :: POSIX',
- 'Operating System :: POSIX :: Linux',
- 'Programming Language :: Python',
- 'Topic :: Internet :: File Transfer Protocol (FTP)',
- 'Topic :: Internet :: WWW/HTTP',
- 'Topic :: Software Development :: Libraries :: Python Modules'
- ]
-
-# load up distutils
-if __name__ == '__main__':
- config = globals().copy()
- keys = config.keys()
- for k in keys:
- #print '%-20s -> %s' % (k, config[k])
- if k.startswith('_'): del config[k]
-
- from distutils.core import setup
- setup(**config)
+setup(
+ name=pkg_name,
+ version=pkg_version,
+ license="LGPLv2+",
+ description="A high-level cross-protocol url-grabber",
+ keywords="urlgrabber yum http ftp",
+ # From https://pypi.python.org/pypi?%3Aaction=list_classifiers
+ classifiers=[
+ # Development status
+ "Development Status :: 5 - Production/Stable",
+ # Target audience
+ "Intended Audience :: Developers",
+ "Intended Audience :: System Administrators",
+ # Type of software
+ "Topic :: Internet :: File Transfer Protocol (FTP)",
+ "Topic :: Internet :: WWW/HTTP",
+ "Topic :: Software Development :: Libraries :: Python Modules",
+ # Kind of software
+ "Environment :: Console",
+ "Environment :: Web Environment",
+ # License (must match license field)
+ "License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)",
+ # Operating systems supported
+ "Operating System :: POSIX",
+ "Operating System :: POSIX :: Linux",
+ # Supported Python versions
+ "Programming Language :: Python",
+ "Programming Language :: Python :: 2",
+ "Programming Language :: Python :: 2.6",
+ "Programming Language :: Python :: 2.7",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3.6",
+ "Programming Language :: Python :: 3.7",
+ ],
+ url="http://urlgrabber.baseurl.org/",
+ author="Michael D. Stenner, Ryan Tomayko, Seth Vidal, Zdenek Pavlas",
+ author_email="mstenner@linux.duke.edu, rtomayko@naeblis.cx, skvidal@fedoraproject.org, zpavlas@redhat.com",
+ maintainer="Neal Gompa",
+ maintainer_email="ngompa@fedoraproject.org",
+ packages=["urlgrabber"],
+ package_dir = {'urlgrabber':'urlgrabber'},
+ include_package_data=True,
+ install_requires=[
+ "pycurl",
+ "six",
+ "setuptools",
+ ],
+ scripts = ['scripts/urlgrabber'],
+ data_files = [
+ ('share/doc/' + pkg_name + '-' + pkg_version, ['README','LICENSE', 'TODO', 'ChangeLog']),
+ ('libexec', ['scripts/urlgrabber-ext-down']),
+ ],
+)
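
Note that the version is now the literal "4.1.0" instead of urlgrabber.__version__, matching the ChangeLog item "Revise setup.py to remove need for extra setup-time dependencies". A sketch of the failure mode being avoided (the try/except here is illustrative only, not what setup.py does):

    # urlgrabber/__init__.py imports urlgrabber.grabber, which imports
    # pycurl -- so importing the package just to read its version fails
    # on any machine that does not have pycurl yet, i.e. exactly the
    # machine that is about to run setup.py to install it.
    try:
        import urlgrabber
        pkg_version = urlgrabber.__version__
    except ImportError:
        pkg_version = "4.1.0"  # the literal the new setup.py hard-codes
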
diff --git a/test/base_test_code.py b/test/base_test_code.py
index 50c6348..97901be 100644..100755
--- a/test/base_test_code.py
+++ b/test/base_test_code.py
@@ -1,17 +1,18 @@
from munittest import *
-base_http = 'http://www.linux.duke.edu/projects/urlgrabber/test/'
+#base_http = 'http://urlgrabber.baseurl.org/test/'
+base_http = 'http://in.waw.pl/urlgrabber/test/'
base_ftp = 'ftp://localhost/test/'
# set to a proftp server only. we're working around a couple of
# bugs in their implementation in byterange.py.
base_proftp = 'ftp://localhost/test/'
-reference_data = ''.join( [str(i)+'\n' for i in range(20000) ] )
+reference_data = ''.join(str(i) + '\n' for i in range(20000)).encode('utf8')
ref_http = base_http + 'reference'
ref_ftp = base_ftp + 'reference'
ref_proftp = base_proftp + 'reference'
-short_reference_data = ' '.join( [str(i) for i in range(10) ] )
+short_reference_data = ' '.join(str(i) for i in range(10)).encode('utf8')
short_ref_http = base_http + 'short_reference'
short_ref_ftp = base_ftp + 'short_reference'
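
The reference fixtures become bytes because the grabber's read paths return bytes under Python 3, where bytes and str never compare equal. A minimal illustration of the mismatch the .encode('utf8') calls fix:

    data_str = ''.join(str(i) + '\n' for i in range(3))
    data_bytes = data_str.encode('utf8')
    assert data_bytes == b'0\n1\n2\n'
    # On Python 3 there is no implicit coercion, so a str fixture would
    # make every comparison against downloaded bytes fail:
    assert data_bytes != data_str
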
diff --git a/test/grabberperf.py b/test/grabberperf.py
index 820da2c..31771ae 100644..100755
--- a/test/grabberperf.py
+++ b/test/grabberperf.py
@@ -11,14 +11,16 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place, Suite 330,
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
+from __future__ import print_function
+
import sys
import os
from os.path import dirname, join as joinpath
@@ -46,16 +48,16 @@ def main():
# remove temp files
os.unlink(tempsrc)
os.unlink(tempdst)
-
+
def setuptemp(size):
- if DEBUG: print 'writing %d KB to temporary file (%s).' % (size / 1024, tempsrc)
+ if DEBUG: print('writing %d KB to temporary file (%s).' % (size / 1024, tempsrc))
file = open(tempsrc, 'w', 1024)
chars = '0123456789'
for i in range(size):
file.write(chars[i % 10])
file.flush()
file.close()
-
+
def speedtest(size):
setuptemp(size)
full_times = []
@@ -65,12 +67,12 @@ def speedtest(size):
try:
from urlgrabber.progress import text_progress_meter
- except ImportError, e:
+ except ImportError as e:
tpm = None
- print 'not using progress meter'
+ print('not using progress meter')
else:
tpm = text_progress_meter(fo=open('/dev/null', 'w'))
-
+
# to address concerns that the overhead from the progress meter
# and throttling slow things down, we do this little test.
#
@@ -81,17 +83,17 @@ def speedtest(size):
# note: it _is_ even slower to direct the progress meter to a real
# tty or file, but I'm just interested in the overhead from _this_
# module.
-
+
# get it nicely cached before we start comparing
- if DEBUG: print 'pre-caching'
+ if DEBUG: print('pre-caching')
for i in range(100):
urlgrab(tempsrc, tempdst, copy_local=1, throttle=None, proxies=proxies)
-
- if DEBUG: print 'running speed test.'
+
+ if DEBUG: print('running speed test.')
reps = 500
for i in range(reps):
- if DEBUG:
- print '\r%4i/%-4i' % (i+1, reps),
+ if DEBUG:
+ print('\r%4i/%-4i' % (i+1, reps), end=' ')
sys.stdout.flush()
t = time.time()
urlgrab(tempsrc, tempdst,
@@ -108,7 +110,7 @@ def speedtest(size):
t = time.time()
in_fo = open(tempsrc)
out_fo = open(tempdst, 'wb')
- while 1:
+ while True:
s = in_fo.read(1024 * 8)
if not s: break
out_fo.write(s)
@@ -116,9 +118,9 @@ def speedtest(size):
out_fo.close()
none_times.append(1000 * (time.time() - t))
- if DEBUG: print '\r'
+ if DEBUG: print('\r')
- print "%d KB Results:" % (size / 1024)
+ print("%d KB Results:" % (size / 1024))
print_result('full', full_times)
print_result('raw', raw_times)
print_result('none', none_times)
@@ -131,7 +133,7 @@ def print_result(label, result_list):
for i in result_list: mean += i
mean = mean/len(result_list)
median = result_list[int(len(result_list)/2)]
- print format % (label, mean, median, result_list[0], result_list[-1])
+ print(format % (label, mean, median, result_list[0], result_list[-1]))
if __name__ == '__main__':
main()
diff --git a/test/munittest.py b/test/munittest.py
index 96230b8..5fdf6f6 100644..100755
--- a/test/munittest.py
+++ b/test/munittest.py
@@ -1,4 +1,7 @@
#!/usr/bin/env python
+
+from __future__ import print_function
+
"""
This is a modified version of the unittest module has been modified by
Michael D. Stenner from Steve Purcell's version (revision 1.46, as
@@ -98,14 +101,20 @@ AND THERE IS NO OBLIGATION WHATSOEVER TO PROVIDE MAINTENANCE,
SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
"""
-# $Id: munittest.py,v 1.2 2004/03/31 01:27:24 mstenner Exp $
-
import time
import sys
import traceback
-import string
import os
import types
+import unittest
+
+from six import class_types, string_types
+
+try:
+ cmp
+except NameError:
+ def cmp(a, b):
+ return (a > b) - (a < b)
##############################################################################
# Exported classes and functions
@@ -113,7 +122,7 @@ import types
__all__ = ['TestResult', 'TestCase', 'TestSuite', 'TextTestRunner',
'TestLoader', 'FunctionTestCase', 'main', 'defaultTestLoader']
-# Expose obsolete functions for backwards compatability
+# Expose obsolete functions for backwards compatibility
__all__.extend(['getTestCaseNames', 'makeSuite', 'findTestCases'])
@@ -190,7 +199,7 @@ class TestResult:
def _exc_info_to_string(self, err):
"""Converts a sys.exc_info()-style tuple of values into a string."""
- return string.join(traceback.format_exception(*err), '')
+ return ''.join(traceback.format_exception(*err))
def __repr__(self):
return "<%s run=%i errors=%i failures=%i>" % \
@@ -198,7 +207,7 @@ class TestResult:
len(self.failures))
-class TestCase:
+class TestCase(unittest.TestCase):
"""A class whose instances are single test cases.
By default, the test code itself should be placed in a method named
@@ -241,27 +250,6 @@ class TestCase:
interrupt_skips = 0
- def __init__(self, methodName='runTest'):
- """Create an instance of the class that will use the named test
- method when executed. Raises a ValueError if the instance does
- not have a method with the specified name.
- """
- try:
- self._testMethodName = methodName
- testMethod = getattr(self, methodName)
- self._testMethodDoc = testMethod.__doc__
- except AttributeError:
- raise ValueError, "no such test method in %s: %s" % \
- (self.__class__, methodName)
-
- def setUp(self):
- "Hook method for setting up the test fixture before exercising it."
- pass
-
- def tearDown(self):
- "Hook method for deconstructing the test fixture after testing it."
- pass
-
def countTestCases(self):
return 1
@@ -276,7 +264,7 @@ class TestCase:
the specified test method's docstring.
"""
doc = self._testMethodDoc
- return doc and string.strip(string.split(doc, "\n")[0]) or None
+ return doc and doc.split('\n')[0].strip() or None
def id(self):
return "%s.%s" % (_strclass(self.__class__), self._testMethodName)
@@ -288,9 +276,6 @@ class TestCase:
return "<%s testMethod=%s>" % \
(_strclass(self.__class__), self._testMethodName)
- def run(self, result=None):
- return self(result)
-
def __call__(self, result=None):
if result is None: result = self.defaultTestResult()
result.startTest(self)
@@ -361,15 +346,15 @@ class TestCase:
def fail(self, msg=None):
"""Fail immediately, with the given message."""
- raise self.failureException, msg
+ raise self.failureException(msg)
def failIf(self, expr, msg=None):
"Fail the test if the expression is true."
- if expr: raise self.failureException, msg
+ if expr: raise self.failureException(msg)
def failUnless(self, expr, msg=None):
"""Fail the test unless the expression is true."""
- if not expr: raise self.failureException, msg
+ if not expr: raise self.failureException(msg)
def failUnlessRaises(self, excClass, callableObj, *args, **kwargs):
"""Fail unless an exception of class excClass is thrown
@@ -386,23 +371,21 @@ class TestCase:
else:
if hasattr(excClass,'__name__'): excName = excClass.__name__
else: excName = str(excClass)
- raise self.failureException, excName
+ raise self.failureException(excName)
def failUnlessEqual(self, first, second, msg=None):
"""Fail if the two objects are unequal as determined by the '=='
operator.
"""
if not first == second:
- raise self.failureException, \
- (msg or '%s != %s' % (`first`, `second`))
+ raise self.failureException(msg or '%r != %r' % (first, second))
def failIfEqual(self, first, second, msg=None):
"""Fail if the two objects are equal as determined by the '=='
operator.
"""
if first == second:
- raise self.failureException, \
- (msg or '%s == %s' % (`first`, `second`))
+ raise self.failureException(msg or '%r == %r' % (first, second))
def failUnlessAlmostEqual(self, first, second, places=7, msg=None):
"""Fail if the two objects are unequal as determined by their
@@ -410,11 +393,10 @@ class TestCase:
(default 7) and comparing to zero.
Note that decimal places (from zero) is usually not the same
- as significant digits (measured from the most signficant digit).
+ as significant digits (measured from the most significant digit).
"""
if round(second-first, places) != 0:
- raise self.failureException, \
- (msg or '%s != %s within %s places' % (`first`, `second`, `places` ))
+ raise self.failureException(msg or '%r != %r within %s places' % (first, second, places))
def failIfAlmostEqual(self, first, second, places=7, msg=None):
"""Fail if the two objects are equal as determined by their
@@ -422,11 +404,10 @@ class TestCase:
(default 7) and comparing to zero.
Note that decimal places (from zero) is usually not the same
- as significant digits (measured from the most signficant digit).
+ as significant digits (measured from the most significant digit).
"""
if round(second-first, places) == 0:
- raise self.failureException, \
- (msg or '%s == %s within %s places' % (`first`, `second`, `places`))
+ raise self.failureException(msg or '%r == %r within %r places' % (first, second, places))
assertEqual = assertEquals = failUnlessEqual
@@ -442,15 +423,15 @@ class TestCase:
def skip(self, msg=None):
"""Skip the test"""
- raise self.skipException, msg
+ raise self.skipException(msg)
def skipIf(self, expr, msg=None):
"Skip the test if the expression is true."
- if expr: raise self.skipException, msg
+ if expr: raise self.skipException(msg)
def skipUnless(self, expr, msg=None):
"""Skip the test unless the expression is true."""
- if not expr: raise self.skipException, msg
+ if not expr: raise self.skipException(msg)
@@ -467,12 +448,12 @@ class TestSuite:
self._tests = []
self.addTests(tests)
self.description = description or '(no description)'
-
+
def __repr__(self):
return "<%s tests=%s>" % (_strclass(self.__class__), self._tests)
__str__ = __repr__
-
+
def shortDescription(self):
return self.description
@@ -498,7 +479,7 @@ class TestSuite:
def __call__(self, result):
try: result.startSuite(self)
except AttributeError: pass
-
+
for test in self._tests:
if result.shouldStop:
break
@@ -554,8 +535,7 @@ class FunctionTestCase(TestCase):
def shortDescription(self):
if self._description is not None: return self._description
doc = self._testFunc.__doc__
- return doc and string.strip(string.split(doc, "\n")[0]) or None
-
+ return doc and doc.split('\n')[0].strip() or None
##############################################################################
@@ -576,16 +556,16 @@ class TestLoader:
instance_list = map(testCaseClass, name_list)
description = getattr(testCaseClass, '__doc__') \
or testCaseClass.__name__
- description = (description.splitlines()[0]).strip()
+ description = description.splitlines()[0].strip()
suite = self.suiteClass(instance_list, description)
return suite
-
+
def loadTestsFromModule(self, module):
"""Return a suite of all tests cases contained in the given module"""
tests = []
for name in dir(module):
obj = getattr(module, name)
- if (isinstance(obj, (type, types.ClassType)) and
+ if (isinstance(obj, class_types) and
issubclass(obj, TestCase) and
not obj in [TestCase, FunctionTestCase]):
tests.append(self.loadTestsFromTestCase(obj))
@@ -603,15 +583,15 @@ class TestLoader:
The method optionally resolves the names relative to a given module.
"""
- parts = string.split(name, '.')
+ parts = name.split('.')
if module is None:
if not parts:
- raise ValueError, "incomplete test name: %s" % name
+ raise ValueError("incomplete test name: %s" % name)
else:
parts_copy = parts[:]
while parts_copy:
try:
- module = __import__(string.join(parts_copy,'.'))
+ module = __import__('.'.join(parts_copy))
break
except ImportError:
del parts_copy[-1]
@@ -622,22 +602,20 @@ class TestLoader:
obj = getattr(obj, part)
import unittest
- if type(obj) == types.ModuleType:
+ if isinstance(obj, types.ModuleType):
return self.loadTestsFromModule(obj)
- elif (isinstance(obj, (type, types.ClassType)) and
+ elif (isinstance(obj, class_types) and
issubclass(obj, unittest.TestCase)):
return self.loadTestsFromTestCase(obj)
- elif type(obj) == types.UnboundMethodType:
- return obj.im_class(obj.__name__)
+ elif isinstance(obj, types.UnboundMethodType):
+ return obj.__self__.__class__(obj.__name__)
elif callable(obj):
test = obj()
- if not isinstance(test, unittest.TestCase) and \
- not isinstance(test, unittest.TestSuite):
- raise ValueError, \
- "calling %s returned %s, not a test" % (obj,test)
+ if not isinstance(test, (unittest.TestCase, unittest.TestSuite)):
+ raise ValueError("calling %s returned %s, not a test" % (obj,test))
return test
else:
- raise ValueError, "don't know how to make test from: %s" % obj
+ raise ValueError("don't know how to make test from: %s" % obj)
def loadTestsFromNames(self, names, module=None):
"""Return a suite of all tests cases found using the given sequence
@@ -651,18 +629,16 @@ class TestLoader:
def getTestCaseNames(self, testCaseClass):
"""Return a sorted sequence of method names found within testCaseClass
"""
- testFnNames = filter(lambda n,p=self.testMethodPrefix: n[:len(p)] == p,
- dir(testCaseClass))
+ testFnNames = [n for n in dir(testCaseClass)
+ if n.startswith(self.testMethodPrefix)]
+
for baseclass in testCaseClass.__bases__:
for testFnName in self.getTestCaseNames(baseclass):
if testFnName not in testFnNames: # handle overridden methods
testFnNames.append(testFnName)
- if self.sortTestMethodsUsing:
- testFnNames.sort(self.sortTestMethodsUsing)
+ testFnNames.sort()
return testFnNames
-
-
defaultTestLoader = TestLoader()
@@ -740,7 +716,7 @@ class _TextTestResult(TestResult):
except AttributeError: desc = '(no description)'
self.stream.writeln(desc)
self.depth += 1
-
+
def startTest(self, test):
TestResult.startTest(self, test)
if self.showAll:
@@ -825,8 +801,8 @@ class TextTestRunner:
self.stream.writeln()
if not result.wasSuccessful():
self.stream.write("FAILED (")
- failed, errored, skipped = map(len, \
- (result.failures, result.errors, result.skipped))
+ failed, errored, skipped = map(len,
+ (result.failures, result.errors, result.skipped))
if failed:
self.stream.write("failures=%d" % failed)
if errored:
@@ -869,9 +845,9 @@ Examples:
"""
def __init__(self, module='__main__', defaultTest=None,
argv=None, testRunner=None, testLoader=defaultTestLoader):
- if type(module) == type(''):
+ if isinstance(module, string_types):
self.module = __import__(module)
- for part in string.split(module,'.')[1:]:
+ for part in module.split('.')[1:]:
self.module = getattr(self.module, part)
else:
self.module = module
@@ -886,8 +862,8 @@ Examples:
self.runTests()
def usageExit(self, msg=None):
- if msg: print msg
- print self.USAGE % self.__dict__
+ if msg: print(msg)
+ print(self.USAGE % self.__dict__)
sys.exit(2)
def parseArgs(self, argv):
@@ -910,7 +886,7 @@ Examples:
else:
self.testNames = (self.defaultTest,)
self.createTests()
- except getopt.error, msg:
+ except getopt.error as msg:
self.usageExit(msg)
def createTests(self):
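
Two Python 3 removals drive the munittest.py changes above: the cmp() builtin is gone (hence the shim near the top of the file), and list.sort() no longer accepts a comparison function (hence testFnNames.sort(self.sortTestMethodsUsing) becoming a plain testFnNames.sort()). The shim reproduces the old builtin exactly:

    # cmp(a, b) -> -1, 0, or 1; booleans are ints, so subtraction works.
    def cmp(a, b):
        return (a > b) - (a < b)

    assert cmp(1, 2) == -1
    assert cmp(2, 2) == 0
    assert cmp(3, 2) == 1
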
diff --git a/test/runtests.py b/test/runtests.py
index c48bd1d..5aaac26 100644..100755
--- a/test/runtests.py
+++ b/test/runtests.py
@@ -1,20 +1,20 @@
#!/usr/bin/python
+from __future__ import print_function
+
"""Usage: python runtests.py [OPTIONS]
-Quick script to run all unit tests from source directory
+Quick script to run all unit tests from source directory
(e.g. without having to install.)
OPTIONS:
-
- -d, --descriptions=NUM Set to 0 to turn off printing
+
+ -d, --descriptions=NUM Set to 0 to turn off printing
test doc strings as descriptions.
-v, --verbosity=NUM Output verbosity level. Defaults to
- 2 which is one line of info per test. Set
+ 2 which is one line of info per test. Set
to 1 to get one char of info per test
or 0 to disable status output completely.
"""
-
-# $Id: runtests.py,v 1.7 2004/03/31 17:02:00 mstenner Exp $
import sys
from os.path import dirname, join as joinpath
@@ -31,7 +31,7 @@ def main():
# it's okay to import now that sys.path is setup.
import test_grabber, test_byterange, test_mirror
suite = TestSuite( (test_grabber.suite(),
- test_byterange.suite(),
+ test_byterange.suite(),
test_mirror.suite()) )
suite.description = 'urlgrabber tests'
runner = TextTestRunner(stream=sys.stdout,
@@ -52,9 +52,9 @@ def parse_args():
elif o in ('-v', '--verbosity'):
verbosity = int(a)
return (descriptions,verbosity)
-
+
def usage():
- print __doc__
-
+ print(__doc__)
+
if __name__ == '__main__':
main()
diff --git a/test/test_byterange.py b/test/test_byterange.py
index 96f1573..dfed311 100644..100755
--- a/test/test_byterange.py
+++ b/test/test_byterange.py
@@ -11,98 +11,93 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place, Suite 330,
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
-"""byterange.py tests"""
+from __future__ import print_function
-# $Id: test_byterange.py,v 1.6 2004/03/31 17:02:00 mstenner Exp $
+"""byterange.py tests"""
import sys
-from StringIO import StringIO
+if sys.version_info >= (3,):
+ # We do an explicit version check here because because python2
+ # also has an io module with StringIO, but it is incompatible,
+ # and returns str instead of unicode somewhere.
+ from io import StringIO
+else:
+ from cStringIO import StringIO
+
from urlgrabber.byterange import RangeableFileObject
from base_test_code import *
class RangeableFileObjectTestCase(TestCase):
"""Test range.RangeableFileObject class"""
-
+
def setUp(self):
# 0 1 2 3 4 5 6 7 8 9
# 0123456789012345678901234567890123456789012345678901234567 890123456789012345678901234567890
self.test = 'Why cannot we write the entire 24 volumes of Encyclopaedia\nBrittanica on the head of a pin?\n'
self.fo = StringIO(self.test)
self.rfo = RangeableFileObject(self.fo, (20,69))
-
+
def tearDown(self):
pass
-
+
def test_seek(self):
"""RangeableFileObject.seek()"""
self.rfo.seek(11)
- self.assertEquals('24', self.rfo.read(2))
+ self.assertEqual('24', self.rfo.read(2))
self.rfo.seek(14)
- self.assertEquals('volumes', self.rfo.read(7))
+ self.assertEqual('volumes', self.rfo.read(7))
self.rfo.seek(1,1)
- self.assertEquals('of', self.rfo.read(2))
-
- def test_poor_mans_seek(self):
- """RangeableFileObject.seek() poor mans version..
-
- We just delete the seek method from StringIO so we can
- excercise RangeableFileObject when the file object supplied
- doesn't support seek.
- """
- seek = StringIO.seek
- del(StringIO.seek)
- self.test_seek()
- StringIO.seek = seek
-
+ self.assertEqual('of', self.rfo.read(2))
+
def test_read(self):
"""RangeableFileObject.read()"""
- self.assertEquals('the', self.rfo.read(3))
- self.assertEquals(' entire 24 volumes of ', self.rfo.read(22))
- self.assertEquals('Encyclopaedia\nBrittanica', self.rfo.read(50))
- self.assertEquals('', self.rfo.read())
-
+ self.assertEqual('the', self.rfo.read(3))
+ self.assertEqual(' entire 24 volumes of ', self.rfo.read(22))
+ self.assertEqual('Encyclopaedia\nBrittanica', self.rfo.read(50))
+ self.assertEqual('', self.rfo.read())
+
def test_readall(self):
"""RangeableFileObject.read(): to end of file."""
rfo = RangeableFileObject(StringIO(self.test),(11,))
- self.assertEquals(self.test[11:],rfo.read())
-
+ self.assertEqual(self.test[11:],rfo.read())
+
def test_readline(self):
"""RangeableFileObject.readline()"""
- self.assertEquals('the entire 24 volumes of Encyclopaedia\n', self.rfo.readline())
- self.assertEquals('Brittanica', self.rfo.readline())
- self.assertEquals('', self.rfo.readline())
-
+ self.assertEqual('the entire 24 volumes of Encyclopaedia\n', self.rfo.readline())
+ self.assertEqual('Brittanica', self.rfo.readline())
+ self.assertEqual('', self.rfo.readline())
+
def test_tell(self):
"""RangeableFileObject.tell()"""
- self.assertEquals(0,self.rfo.tell())
+ self.assertEqual(0,self.rfo.tell())
self.rfo.read(5)
- self.assertEquals(5,self.rfo.tell())
+ self.assertEqual(5,self.rfo.tell())
self.rfo.readline()
- self.assertEquals(39,self.rfo.tell())
-
+ self.assertEqual(39,self.rfo.tell())
+
class RangeModuleTestCase(TestCase):
"""Test module level functions defined in range.py"""
def setUp(self):
pass
-
+
def tearDown(self):
pass
-
+
def test_range_tuple_normalize(self):
"""byterange.range_tuple_normalize()"""
from urlgrabber.byterange import range_tuple_normalize
from urlgrabber.byterange import RangeError
- tests = (
+ tests = (
((None,50), (0,50)),
((500,600), (500,600)),
((500,), (500,'')),
@@ -112,28 +107,28 @@ class RangeModuleTestCase(TestCase):
(None, None)
)
for test, ex in tests:
- self.assertEquals( range_tuple_normalize(test), ex )
-
+ self.assertEqual( range_tuple_normalize(test), ex )
+
try: range_tuple_normalize( (10,8) )
except RangeError: pass
else: self.fail("range_tuple_normalize( (10,8) ) should have raised RangeError")
-
+
def test_range_header_to_tuple(self):
"""byterange.range_header_to_tuple()"""
from urlgrabber.byterange import range_header_to_tuple
- tests = (
+ tests = (
('bytes=500-600', (500,601)),
('bytes=500-', (500,'')),
('bla bla', ()),
(None, None)
)
for test, ex in tests:
- self.assertEquals( range_header_to_tuple(test), ex )
-
+ self.assertEqual( range_header_to_tuple(test), ex )
+
def test_range_tuple_to_header(self):
"""byterange.range_tuple_to_header()"""
from urlgrabber.byterange import range_tuple_to_header
- tests = (
+ tests = (
((500,600), 'bytes=500-599'),
((500,''), 'bytes=500-'),
((500,), 'bytes=500-'),
@@ -142,16 +137,16 @@ class RangeModuleTestCase(TestCase):
(None, None),
)
for test, ex in tests:
- self.assertEquals( range_tuple_to_header(test), ex )
-
+ self.assertEqual( range_tuple_to_header(test), ex )
+
try: range_tuple_to_header( ('not an int',500) )
except ValueError: pass
else: self.fail("range_tuple_to_header( ('not an int',500) ) should have raised ValueError")
-
+
try: range_tuple_to_header( (0,'not an int') )
except ValueError: pass
else: self.fail("range_tuple_to_header( (0, 'not an int') ) should have raised ValueError")
-
+
def suite():
tl = TestLoader()
return tl.loadTestsFromModule(sys.modules[__name__])
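
As a usage note grounded in the fixtures above: RangeableFileObject wraps any file-like object and exposes only the given (start, stop) byte range, with offsets relative to the range start. A short sketch, runnable against this source tree:

    from io import StringIO
    from urlgrabber.byterange import RangeableFileObject

    fo = StringIO('Why cannot we write the entire 24 volumes of '
                  'Encyclopaedia\nBrittanica on the head of a pin?\n')
    rfo = RangeableFileObject(fo, (20, 69))   # expose chars 20..68
    print(rfo.read(3))   # 'the'
    rfo.seek(11)         # relative to the start of the range
    print(rfo.read(2))   # '24'
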
diff --git a/test/test_grabber.py b/test/test_grabber.py
index eecdbcf..465e5f5 100644..100755
--- a/test/test_grabber.py
+++ b/test/test_grabber.py
@@ -11,23 +11,37 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place, Suite 330,
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
-"""grabber.py tests"""
+from __future__ import print_function
-# $Id: test_grabber.py,v 1.31 2006/12/08 00:14:16 mstenner Exp $
+"""grabber.py tests"""
import sys
import os
-import string, tempfile, random, cStringIO, os
-import urllib2
+import tempfile, random, os
import socket
+from io import BytesIO
+from six import string_types
+
+if sys.version_info >= (3,):
+ # We do an explicit version check here because because python2
+ # also has an io module with StringIO, but it is incompatible,
+ # and returns str instead of unicode somewhere.
+ from io import StringIO
+else:
+ from cStringIO import StringIO
+
+try:
+ from urllib.request import urlopen, OpenerDirector
+except ImportError:
+ from urllib2 import urlopen, OpenerDirector
from base_test_code import *
@@ -38,15 +52,14 @@ from urlgrabber.grabber import URLGrabber, URLGrabError, CallbackObject, \
from urlgrabber.progress import text_progress_meter
class FileObjectTests(TestCase):
-
+
def setUp(self):
self.filename = tempfile.mktemp()
- fo = file(self.filename, 'wb')
- fo.write(reference_data)
- fo.close()
+ with open(self.filename, 'wb') as fo:
+ fo.write(reference_data)
- self.fo_input = cStringIO.StringIO(reference_data)
- self.fo_output = cStringIO.StringIO()
+ self.fo_input = BytesIO(reference_data)
+ self.fo_output = BytesIO()
(url, parts) = grabber.default_grabber.opts.urlparser.parse(
self.filename, grabber.default_grabber.opts)
self.wrapper = grabber.PyCurlFileObject(
@@ -60,41 +73,39 @@ class FileObjectTests(TestCase):
"PYCurlFileObject .read() method"
s = self.wrapper.read()
self.fo_output.write(s)
- self.assert_(reference_data == self.fo_output.getvalue())
+ self.assertTrue(reference_data == self.fo_output.getvalue())
def test_readline(self):
"PyCurlFileObject .readline() method"
- while 1:
+ while True:
s = self.wrapper.readline()
self.fo_output.write(s)
if not s: break
- self.assert_(reference_data == self.fo_output.getvalue())
+ self.assertTrue(reference_data == self.fo_output.getvalue())
def test_readlines(self):
"PyCurlFileObject .readlines() method"
li = self.wrapper.readlines()
- self.fo_output.write(string.join(li, ''))
- self.assert_(reference_data == self.fo_output.getvalue())
+ self.fo_output.write(b''.join(li))
+ self.assertTrue(reference_data == self.fo_output.getvalue())
def test_smallread(self):
"PyCurlFileObject .read(N) with small N"
- while 1:
+ while True:
s = self.wrapper.read(23)
self.fo_output.write(s)
if not s: break
- self.assert_(reference_data == self.fo_output.getvalue())
-
+ self.assertTrue(reference_data == self.fo_output.getvalue())
+
class HTTPTests(TestCase):
def test_reference_file(self):
- "download refernce file via HTTP"
+ "download reference file via HTTP"
filename = tempfile.mktemp()
grabber.urlgrab(ref_http, filename)
- fo = file(filename, 'rb')
- contents = fo.read()
- fo.close()
+ contents = open(filename, 'rb').read()
- self.assert_(contents == reference_data)
+ self.assertTrue(contents == reference_data)
def test_post(self):
"do an HTTP post"
@@ -109,46 +120,46 @@ class URLGrabberModuleTestCase(TestCase):
"""Test module level functions defined in grabber.py"""
def setUp(self):
pass
-
+
def tearDown(self):
pass
-
+
def test_urlopen(self):
"module-level urlopen() function"
fo = urlgrabber.urlopen('http://www.python.org')
fo.close()
-
+
def test_urlgrab(self):
"module-level urlgrab() function"
outfile = tempfile.mktemp()
- filename = urlgrabber.urlgrab('http://www.python.org',
+ filename = urlgrabber.urlgrab('http://www.python.org',
filename=outfile)
os.unlink(outfile)
-
+
def test_urlread(self):
"module-level urlread() function"
s = urlgrabber.urlread('http://www.python.org')
-
+
class URLGrabberTestCase(TestCase):
"""Test grabber.URLGrabber class"""
-
+
def setUp(self):
-
- self.meter = text_progress_meter( fo=cStringIO.StringIO() )
+
+ self.meter = text_progress_meter( fo=StringIO() )
pass
-
+
def tearDown(self):
pass
-
+
def testKeywordArgs(self):
"""grabber.URLGrabber.__init__() **kwargs handling.
-
+
This is a simple test that just passes some arbitrary
values into the URLGrabber constructor and checks that
they've been set properly.
"""
- opener = urllib2.OpenerDirector()
+ opener = OpenerDirector()
g = URLGrabber( progress_obj=self.meter,
throttle=0.9,
bandwidth=20,
@@ -160,78 +171,81 @@ class URLGrabberTestCase(TestCase):
proxies={'http' : 'http://www.proxy.com:9090'},
opener=opener )
opts = g.opts
- self.assertEquals( opts.progress_obj, self.meter )
- self.assertEquals( opts.throttle, 0.9 )
- self.assertEquals( opts.bandwidth, 20 )
- self.assertEquals( opts.retry, 20 )
- self.assertEquals( opts.retrycodes, [5,6,7] )
- self.assertEquals( opts.copy_local, 1 )
- self.assertEquals( opts.close_connection, 1 )
- self.assertEquals( opts.user_agent, 'test ua/1.0' )
- self.assertEquals( opts.proxies, {'http' : 'http://www.proxy.com:9090'} )
- self.assertEquals( opts.opener, opener )
-
- nopts = grabber.URLGrabberOptions(delegate=opts, throttle=0.5,
+ self.assertEqual( opts.progress_obj, self.meter )
+ self.assertEqual( opts.throttle, 0.9 )
+ self.assertEqual( opts.bandwidth, 20 )
+ self.assertEqual( opts.retry, 20 )
+ self.assertEqual( opts.retrycodes, [5,6,7] )
+ self.assertEqual( opts.copy_local, 1 )
+ self.assertEqual( opts.close_connection, 1 )
+ self.assertEqual( opts.user_agent, 'test ua/1.0' )
+ self.assertEqual( opts.proxies, {'http' : 'http://www.proxy.com:9090'} )
+ self.assertEqual( opts.opener, opener )
+
+ nopts = grabber.URLGrabberOptions(delegate=opts, throttle=0.5,
copy_local=0)
- self.assertEquals( nopts.progress_obj, self.meter )
- self.assertEquals( nopts.throttle, 0.5 )
- self.assertEquals( nopts.bandwidth, 20 )
- self.assertEquals( nopts.retry, 20 )
- self.assertEquals( nopts.retrycodes, [5,6,7] )
- self.assertEquals( nopts.copy_local, 0 )
- self.assertEquals( nopts.close_connection, 1 )
- self.assertEquals( nopts.user_agent, 'test ua/1.0' )
- self.assertEquals( nopts.proxies, {'http' : 'http://www.proxy.com:9090'} )
+ self.assertEqual( nopts.progress_obj, self.meter )
+ self.assertEqual( nopts.throttle, 0.5 )
+ self.assertEqual( nopts.bandwidth, 20 )
+ self.assertEqual( nopts.retry, 20 )
+ self.assertEqual( nopts.retrycodes, [5,6,7] )
+ self.assertEqual( nopts.copy_local, 0 )
+ self.assertEqual( nopts.close_connection, 1 )
+ self.assertEqual( nopts.user_agent, 'test ua/1.0' )
+ self.assertEqual( nopts.proxies, {'http' : 'http://www.proxy.com:9090'} )
nopts.opener = None
- self.assertEquals( nopts.opener, None )
-
+ self.assertEqual( nopts.opener, None )
+
def test_make_callback(self):
"""grabber.URLGrabber._make_callback() tests"""
def cb(e): pass
tup_cb = (cb, ('stuff'), {'some': 'dict'})
g = URLGrabber()
- self.assertEquals(g._make_callback(cb), (cb, (), {}))
- self.assertEquals(g._make_callback(tup_cb), tup_cb)
+ self.assertEqual(g._make_callback(cb), (cb, (), {}))
+ self.assertEqual(g._make_callback(tup_cb), tup_cb)
class URLParserTestCase(TestCase):
def setUp(self):
pass
-
+
def tearDown(self):
pass
def test_parse_url_with_prefix(self):
"""grabber.URLParser.parse() with opts.prefix"""
- base = 'http://foo.com/dir'
- bases = [base, base+'/']
- filename = 'bar/baz'
- target = base + '/' + filename
-
+ base = b'http://foo.com/dir'
+ bases = [base, base + b'/']
+ filename = b'bar/baz'
+ target = base + b'/' + filename
+
for b in bases:
g = URLGrabber(prefix=b)
(url, parts) = g.opts.urlparser.parse(filename, g.opts)
- self.assertEquals(url, target)
+ self.assertEqual(url, target)
def _test_url(self, urllist):
g = URLGrabber()
try: quote = urllist[3]
except IndexError: quote = None
g.opts.quote = quote
- (url, parts) = g.opts.urlparser.parse(urllist[0], g.opts)
-
+ url = urllist[0].encode('utf8')
+ expected_url = urllist[1].encode('utf8')
+ expected_parts = tuple(part.encode('utf8') for part in urllist[2])
+ (url, parts) = g.opts.urlparser.parse(url, g.opts)
+
if 1:
- self.assertEquals(url, urllist[1])
- self.assertEquals(parts, urllist[2])
+ self.assertEqual(url, expected_url)
+ self.assertEqual(parts, expected_parts)
else:
if url == urllist[1] and parts == urllist[2]:
- print 'OK: %s' % urllist[0]
+ print('OK: %s' % urllist[0])
else:
- print 'ERROR: %s' % urllist[0]
- print ' ' + urllist[1]
- print ' ' + url
- print ' ' + urllist[2]
- print ' ' + parts
-
+ print('ERROR: %s' % urllist[0])
+ print(' ' + urllist[1])
+ print(' ' + url)
+ print(' ' + urllist[2])
+ print(' ' + parts)
+
url_tests_all = (
['http://host.com/path/basename.ext?arg1=val1&arg2=val2#hash',
@@ -251,13 +265,13 @@ class URLParserTestCase(TestCase):
'http://host.com/Should%2520Not',
('http', 'host.com', '/Should%2520Not', '', '', ''), 1],
)
-
+
url_tests_posix = (
['/etc/passwd',
'file:///etc/passwd',
('file', '', '/etc/passwd', '', '', '')],
)
-
+
url_tests_nt = (
[r'\\foo.com\path\file.ext',
'file://foo.com/path/file.ext',
@@ -295,7 +309,7 @@ class FailureTestCase(TestCase):
self.obj = obj
self.args = args
self.kwargs = kwargs
-
+
def test_failure_callback_called(self):
"failure callback is called on retry"
self.failure_callback_called = 0
@@ -303,7 +317,7 @@ class FailureTestCase(TestCase):
failure_callback=self._failure_callback)
try: g.urlgrab(ref_404)
except URLGrabError: pass
- self.assertEquals(self.failure_callback_called, 1)
+ self.assertEqual(self.failure_callback_called, 1)
def test_failure_callback_args(self):
"failure callback is called with the proper args"
@@ -312,14 +326,17 @@ class FailureTestCase(TestCase):
failure_callback=fc)
try: g.urlgrab(ref_404)
except URLGrabError: pass
- self.assert_(hasattr(self, 'obj'))
- self.assert_(hasattr(self, 'args'))
- self.assert_(hasattr(self, 'kwargs'))
- self.assertEquals(self.args, ('foo',))
- self.assertEquals(self.kwargs, {'bar': 'baz'})
- self.assert_(isinstance(self.obj, CallbackObject))
- self.assertEquals(self.obj.url, ref_404)
- self.assert_(isinstance(self.obj.exception, URLGrabError))
+ self.assertTrue(hasattr(self, 'obj'))
+ self.assertTrue(hasattr(self, 'args'))
+ self.assertTrue(hasattr(self, 'kwargs'))
+ self.assertEqual(self.args, ('foo',))
+ self.assertEqual(self.kwargs, {'bar': 'baz'})
+ self.assertTrue(isinstance(self.obj, CallbackObject))
+ url = self.obj.url
+ if not isinstance(url, string_types):
+ url = url.decode('utf8')
+ self.assertEqual(url, ref_404)
+ self.assertTrue(isinstance(self.obj.exception, URLGrabError))
del self.obj
class InterruptTestCase(TestCase):
@@ -339,7 +356,7 @@ class InterruptTestCase(TestCase):
self.kwargs = kwargs
if kwargs.get('exception', None):
raise kwargs['exception']
-
+
def test_interrupt_callback_called(self):
"interrupt callback is called on retry"
self.interrupt_callback_called = 0
@@ -348,7 +365,7 @@ class InterruptTestCase(TestCase):
interrupt_callback=ic)
try: g.urlgrab(ref_http)
except KeyboardInterrupt: pass
- self.assertEquals(self.interrupt_callback_called, 1)
+ self.assertEqual(self.interrupt_callback_called, 1)
def test_interrupt_callback_raises(self):
"interrupt callback raises an exception"
@@ -366,12 +383,12 @@ class CheckfuncTestCase(TestCase):
self.g = grabber.URLGrabber(checkfunc=cf)
self.filename = tempfile.mktemp()
self.data = short_reference_data
-
+
def tearDown(self):
try: os.unlink(self.filename)
except: pass
if hasattr(self, 'obj'): del self.obj
-
+
def _checkfunc(self, obj, *args, **kwargs):
self.obj = obj
self.args = args
@@ -379,37 +396,38 @@ class CheckfuncTestCase(TestCase):
if hasattr(obj, 'filename'):
# we used urlgrab
- fo = file(obj.filename)
- data = fo.read()
- fo.close()
+ data = open(obj.filename, 'rb').read()
else:
# we used urlread
data = obj.data
if data == self.data: return
else: raise URLGrabError(-2, "data doesn't match")
-
+
def _check_common_args(self):
"check the args that are common to both urlgrab and urlread"
- self.assert_(hasattr(self, 'obj'))
- self.assert_(hasattr(self, 'args'))
- self.assert_(hasattr(self, 'kwargs'))
- self.assertEquals(self.args, ('foo',))
- self.assertEquals(self.kwargs, {'bar': 'baz'})
- self.assert_(isinstance(self.obj, CallbackObject))
- self.assertEquals(self.obj.url, short_ref_http)
+ self.assertTrue(hasattr(self, 'obj'))
+ self.assertTrue(hasattr(self, 'args'))
+ self.assertTrue(hasattr(self, 'kwargs'))
+ self.assertEqual(self.args, ('foo',))
+ self.assertEqual(self.kwargs, {'bar': 'baz'})
+ self.assertTrue(isinstance(self.obj, CallbackObject))
+ url = self.obj.url
+ if not isinstance(url, string_types):
+ url = url.decode()
+ self.assertEqual(url, short_ref_http)
def test_checkfunc_urlgrab_args(self):
"check for proper args when used with urlgrab"
self.g.urlgrab(short_ref_http, self.filename)
self._check_common_args()
- self.assertEquals(self.obj.filename, self.filename)
+ self.assertEqual(self.obj.filename, self.filename)
def test_checkfunc_urlread_args(self):
"check for proper args when used with urlread"
self.g.urlread(short_ref_http)
self._check_common_args()
- self.assertEquals(self.obj.data, short_reference_data)
+ self.assertEqual(self.obj.data, short_reference_data)
def test_checkfunc_urlgrab_success(self):
"check success with urlgrab checkfunc"
@@ -425,20 +443,20 @@ class CheckfuncTestCase(TestCase):
"check failure with urlgrab checkfunc"
self.data = 'other data'
self.assertRaises(URLGrabError, self.g.urlgrab,
- short_ref_http, self.filename)
+ ref_404, self.filename)
def test_checkfunc_urlread_failure(self):
"check failure with urlread checkfunc"
self.data = 'other data'
self.assertRaises(URLGrabError, self.g.urlread,
- short_ref_http)
+ ref_404)
class RegetTestBase:
def setUp(self):
self.ref = short_reference_data
self.grabber = grabber.URLGrabber(reget='check_timestamp')
self.filename = tempfile.mktemp()
- self.hl = len(self.ref) / 2
+ self.hl = len(self.ref) // 2
self.url = 'OVERRIDE THIS'
def tearDown(self):
@@ -446,16 +464,13 @@ class RegetTestBase:
except: pass
def _make_half_zero_file(self):
- fo = file(self.filename, 'wb')
- fo.write('0'*self.hl)
- fo.close()
+ with open(self.filename, 'wb') as fo:
+ fo.write(b'0' * self.hl)
def _read_file(self):
- fo = file(self.filename, 'rb')
- data = fo.read()
- fo.close()
+ data = open(self.filename, 'rb').read()
return data
-
+
class CommonRegetTests(RegetTestBase, TestCase):
def test_bad_reget_type(self):
"exception raised for illegal reget mode"
@@ -469,7 +484,7 @@ class FTPRegetTests(RegetTestBase, TestCase):
# this tests to see if the server is available. If it's not,
# then these tests will be skipped
try:
- fo = urllib2.urlopen(self.url).close()
+ fo = urlopen(self.url).close()
except IOError:
self.skip()
@@ -479,52 +494,55 @@ class FTPRegetTests(RegetTestBase, TestCase):
self.grabber.urlgrab(self.url, self.filename, reget='simple')
data = self._read_file()
- self.assertEquals(data[:self.hl], '0'*self.hl)
- self.assertEquals(data[self.hl:], self.ref[self.hl:])
+ self.assertEqual(data[:self.hl], b'0'*self.hl)
+ self.assertEqual(data[self.hl:], self.ref[self.hl:])
class HTTPRegetTests(FTPRegetTests):
def setUp(self):
RegetTestBase.setUp(self)
self.url = short_ref_http
-
+
def test_older_check_timestamp(self):
+ # define this here rather than in the FTP tests because currently,
+ # we get no timestamp information back from ftp servers.
+ self._make_half_zero_file()
+ ts = 1600000000 # set local timestamp to 2020
+ os.utime(self.filename, (ts, ts))
+
try:
- # define this here rather than in the FTP tests because currently,
- # we get no timestamp information back from ftp servers.
- self._make_half_zero_file()
- ts = 1600000000 # set local timestamp to 2020
- os.utime(self.filename, (ts, ts))
self.grabber.urlgrab(self.url, self.filename, reget='check_timestamp')
- data = self._read_file()
-
- self.assertEquals(data[:self.hl], '0'*self.hl)
- self.assertEquals(data[self.hl:], self.ref[self.hl:])
except NotImplementedError:
self.skip()
-
+
+ data = self._read_file()
+
+ self.assertEqual(data[:self.hl], b'0'*self.hl)
+ self.assertEqual(data[self.hl:], self.ref[self.hl:])
+
def test_newer_check_timestamp(self):
+ # define this here rather than in the FTP tests because currently,
+ # we get no timestamp information back from ftp servers.
+ self._make_half_zero_file()
+ ts = 1 # set local timestamp to 1969
+ os.utime(self.filename, (ts, ts))
+
try:
- # define this here rather than in the FTP tests because currently,
- # we get no timestamp information back from ftp servers.
- self._make_half_zero_file()
- ts = 1 # set local timestamp to 1969
- os.utime(self.filename, (ts, ts))
self.grabber.urlgrab(self.url, self.filename, reget='check_timestamp')
- data = self._read_file()
-
- self.assertEquals(data, self.ref)
- except:
+ except NotImplementedError:
self.skip()
-
+
+ data = self._read_file()
+
+ self.assertEqual(data, self.ref)
+
class FileRegetTests(HTTPRegetTests):
def setUp(self):
self.ref = short_reference_data
tmp = tempfile.mktemp()
- tmpfo = file(tmp, 'wb')
- tmpfo.write(self.ref)
- tmpfo.close()
+ with open(tmp, 'wb') as tmpfo:
+ tmpfo.write(self.ref)
self.tmp = tmp
-
+
(url, parts) = grabber.default_grabber.opts.urlparser.parse(
tmp, grabber.default_grabber.opts)
self.url = url
@@ -532,7 +550,7 @@ class FileRegetTests(HTTPRegetTests):
self.grabber = grabber.URLGrabber(reget='check_timestamp',
copy_local=1)
self.filename = tempfile.mktemp()
- self.hl = len(self.ref) / 2
+ self.hl = len(self.ref) // 2
def tearDown(self):
try: os.unlink(self.filename)
@@ -544,14 +562,14 @@ class ProFTPDSucksTests(TestCase):
def setUp(self):
self.url = ref_proftp
try:
- fo = urllib2.urlopen(self.url).close()
+ fo = urlopen(self.url).close()
except IOError:
self.skip()
def test_restart_workaround(self):
inst = grabber.URLGrabber()
rslt = inst.urlread(self.url, range=(500, 1000))
-
+
class BaseProxyTests(TestCase):
good_p = '%s://%s:%s@%s:%i' % (proxy_proto, proxy_user,
good_proxy_pass, proxy_host, proxy_port)
@@ -591,8 +609,8 @@ class ProxyFTPAuthTests(ProxyHTTPAuthTests):
if not self.have_proxy():
self.skip()
try:
- fo = urllib2.urlopen(self.url).close()
- except IOError:
+ fo = urlopen(self.url).close()
+ except URLError:
self.skip()
self.g = URLGrabber()
@@ -604,4 +622,3 @@ if __name__ == '__main__':
grabber.DEBUG = 0
runner = TextTestRunner(stream=sys.stdout,descriptions=1,verbosity=2)
runner.run(suite())
-
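The reget tests above exercise resuming a partial download. A minimal usage
sketch with a placeholder URL: 'simple' blindly appends the missing tail, while
'check_timestamp' starts over if the remote copy is newer than the local file.

    from urlgrabber.grabber import URLGrabber

    g = URLGrabber(reget='check_timestamp')
    # per-call override of the reget mode is also allowed
    g.urlgrab('http://www.example.com/somefile', 'somefile', reget='simple')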
diff --git a/test/test_mirror.py b/test/test_mirror.py
index 70fe069..a175977 100644..100755
--- a/test/test_mirror.py
+++ b/test/test_mirror.py
@@ -11,9 +11,9 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place, Suite 330,
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
@@ -21,14 +21,12 @@
"""mirror.py tests"""
-# $Id: test_mirror.py,v 1.12 2005/10/22 21:57:27 mstenner Exp $
-
import sys
import os
-import string, tempfile, random, cStringIO, os
+import tempfile, random
import urlgrabber.grabber
-from urlgrabber.grabber import URLGrabber, URLGrabError
+from urlgrabber.grabber import URLGrabber, URLGrabError, URLGrabberOptions
import urlgrabber.mirror
from urlgrabber.mirror import MirrorGroup, MGRandomStart, MGRandomOrder
@@ -53,9 +51,7 @@ class BasicTests(TestCase):
url = 'short_reference'
self.mg.urlgrab(url, filename)
- fo = open(filename)
- data = fo.read()
- fo.close()
+ data = open(filename, 'rb').read()
self.assertEqual(data, short_reference_data)
@@ -87,9 +83,7 @@ class SubclassTests(TestCase):
url = 'short_reference'
self.mg.urlgrab(url, filename)
- fo = open(filename)
- data = fo.read()
- fo.close()
+ data = open(filename, 'rb').read()
self.assertEqual(data, short_reference_data)
@@ -106,8 +100,11 @@ class CallbackTests(TestCase):
self.g = URLGrabber()
fullmirrors = [base_mirror_url + m + '/' for m in \
(bad_mirrors + good_mirrors)]
+ if hasattr(urlgrabber.grabber, '_TH'):
+ # test assumes mirrors are not re-ordered
+ urlgrabber.grabber._TH.hosts.clear()
self.mg = MirrorGroup(self.g, fullmirrors)
-
+
def test_failure_callback(self):
"test that MG executes the failure callback correctly"
tricky_list = []
@@ -115,9 +112,9 @@ class CallbackTests(TestCase):
tl.append(str(cb_obj.exception))
self.mg.failure_callback = failure_callback, (tricky_list, ), {}
data = self.mg.urlread('reference')
- self.assert_(data == reference_data)
- self.assertEquals(tricky_list[0][:25],
- '[Errno 14] HTTP Error 403')
+ self.assertTrue(data == reference_data)
+ self.assertEqual(tricky_list[0][:25],
+ '[Errno 14] HTTP Error 404')
def test_callback_reraise(self):
"test that the callback can correctly re-raise the exception"
@@ -152,10 +149,8 @@ class FailoverTests(TestCase):
def cb(e, elist=elist): elist.append(e)
self.mg.urlgrab(url, filename, failure_callback=cb)
- fo = open(filename)
- contents = fo.read()
- fo.close()
-
+ contents = open(filename, 'rb').read()
+
# first be sure that the first mirror failed and that the
# callback was called
self.assertEqual(len(elist), 1)
@@ -168,7 +163,8 @@ class FakeGrabber:
self.resultlist = resultlist or []
self.index = 0
self.calls = []
-
+ self.opts = URLGrabberOptions()
+
def urlgrab(self, url, filename=None, **kwargs):
self.calls.append( (url, filename) )
res = self.resultlist[self.index]
@@ -187,11 +183,11 @@ class ActionTests(TestCase):
def tearDown(self):
urlgrabber.mirror.DEBUG = self.db
-
+
def test_defaults(self):
'test default action policy'
self.mg.urlgrab('somefile')
- expected_calls = [ (m + '/' + 'somefile', None) \
+ expected_calls = [ (m.encode('utf8') + b'/somefile', None)
for m in self.mirrors[:3] ]
expected_logs = \
['MIRROR: trying somefile -> a/somefile',
@@ -203,15 +199,15 @@ class ActionTests(TestCase):
'GR mirrors: [c d e f] 0',
'MAIN mirrors: [a b c d e f] 2',
'MIRROR: trying somefile -> c/somefile']
-
- self.assertEquals(self.g.calls, expected_calls)
- self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs)
-
+
+ self.assertEqual(self.g.calls, expected_calls)
+ self.assertEqual(urlgrabber.mirror.DEBUG.logs, expected_logs)
+
def test_instance_action(self):
'test the effects of passed-in default_action'
self.mg.default_action = {'remove_master': 1}
self.mg.urlgrab('somefile')
- expected_calls = [ (m + '/' + 'somefile', None) \
+ expected_calls = [ (m.encode('utf8') + b'/somefile', None)
for m in self.mirrors[:3] ]
expected_logs = \
['MIRROR: trying somefile -> a/somefile',
@@ -223,14 +219,14 @@ class ActionTests(TestCase):
'GR mirrors: [c d e f] 0',
'MAIN mirrors: [c d e f] 0',
'MIRROR: trying somefile -> c/somefile']
-
- self.assertEquals(self.g.calls, expected_calls)
- self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs)
-
+
+ self.assertEqual(self.g.calls, expected_calls)
+ self.assertEqual(urlgrabber.mirror.DEBUG.logs, expected_logs)
+
def test_method_action(self):
'test the effects of method-level default_action'
self.mg.urlgrab('somefile', default_action={'remove_master': 1})
- expected_calls = [ (m + '/' + 'somefile', None) \
+ expected_calls = [ (m.encode('utf8') + b'/somefile', None)
for m in self.mirrors[:3] ]
expected_logs = \
['MIRROR: trying somefile -> a/somefile',
@@ -242,18 +238,18 @@ class ActionTests(TestCase):
'GR mirrors: [c d e f] 0',
'MAIN mirrors: [c d e f] 0',
'MIRROR: trying somefile -> c/somefile']
-
- self.assertEquals(self.g.calls, expected_calls)
- self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs)
-
+
+ self.assertEqual(self.g.calls, expected_calls)
+ self.assertEqual(urlgrabber.mirror.DEBUG.logs, expected_logs)
+
def callback(self, e): return {'fail': 1}
-
+
def test_callback_action(self):
'test the effects of a callback-returned action'
self.assertRaises(URLGrabError, self.mg.urlgrab, 'somefile',
failure_callback=self.callback)
- expected_calls = [ (m + '/' + 'somefile', None) \
+ expected_calls = [ (m.encode('utf8') + b'/somefile', None)
for m in self.mirrors[:1] ]
expected_logs = \
['MIRROR: trying somefile -> a/somefile',
@@ -261,9 +257,133 @@ class ActionTests(TestCase):
'GR mirrors: [b c d e f] 0',
'MAIN mirrors: [a b c d e f] 1']
- self.assertEquals(self.g.calls, expected_calls)
- self.assertEquals(urlgrabber.mirror.DEBUG.logs, expected_logs)
-
+ self.assertEqual(self.g.calls, expected_calls)
+ self.assertEqual(urlgrabber.mirror.DEBUG.logs, expected_logs)
+
+import threading, socket
+
+class HttpReplyCode(TestCase):
+ def setUp(self):
+ # start the server
+ self.exit = False
+ self.process = lambda data: None
+
+ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+ s.bind(('localhost', 0)); s.listen(1)
+ self.port = s.getsockname()[1]
+
+ def server():
+ while True:
+ c, a = s.accept()
+ if self.exit: c.close(); break
+ data = b''
+ while not data.endswith(b'\r\n\r\n'):
+                    data += c.recv(4096)
+ self.process(data)
+ c.sendall(b'HTTP/1.1 %d %s\r\n' % self.reply)
+ if self.content is not None:
+ c.sendall(b'Content-Length: %d\r\n\r\n' % len(self.content))
+ c.sendall(self.content)
+ c.close()
+ s.close()
+ self.exit = False
+
+ self.thread = threading.Thread(target=server)
+ self.thread.start()
+
+ # create grabber and mirror group objects
+ def failure(obj):
+ self.code = getattr(obj.exception, 'code', None)
+ return {}
+ self.g = URLGrabber()
+ self.mg = MirrorGroup(self.g, ['http://localhost:%d' % self.port],
+ failure_callback = failure)
+
+ def tearDown(self):
+ # shut down the server
+ self.exit = True
+ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ try:
+ s.connect(('localhost', self.port)) # wake it up
+        except socket.error:  # ConnectionRefusedError is py3-only
+ # already gone?
+ pass
+ s.close()
+ self.thread.join()
+
+ def test_grab(self):
+ 'tests the propagation of HTTP reply code'
+ self.reply = 503, b'Busy'
+ self.content = None
+
+ # single
+ self.assertRaises(URLGrabError, self.mg.urlgrab, 'foo')
+ self.assertEqual(self.code, 503); del self.code
+
+ # multi
+ err = []
+ self.mg.urlgrab('foo', async_=True, failfunc=err.append)
+ urlgrabber.grabber.parallel_wait()
+ self.assertEqual([e.exception.errno for e in err], [256])
+ self.assertEqual(self.code, 503); del self.code
+
+ def test_range(self):
+ 'test client-side processing of HTTP ranges'
+ # server does not process ranges
+ self.reply = 200, b'OK'
+ self.content = b'ABCDEF'
+
+ # no range specified
+ data = self.mg.urlread('foo')
+ self.assertEqual(data, b'ABCDEF')
+
+ data = self.mg.urlread('foo', range = (3, 5))
+ self.assertEqual(data, b'DE')
+
+ def test_retry_no_cache(self):
+ 'test bypassing proxy cache on failure'
+ def process(data):
+ if b'Pragma:no-cache' in data:
+ self.content = b'version2'
+ else:
+ self.content = b'version1'
+
+ def checkfunc_read(obj):
+ if obj.data == b'version1':
+ raise URLGrabError(-1, 'Outdated version of foo')
+ elif obj.data != b'version2':
+ self.fail('Unexpected file content')
+
+ def checkfunc_grab(obj):
+ with open('foo') as f:
+ data = f.read()
+ if data == 'version1':
+ raise URLGrabError(-1, 'Outdated version of foo')
+ elif data != 'version2':
+ self.fail('Unexpected file content')
+
+ self.process = process
+ self.reply = 200, b'OK'
+
+ opts = self.g.opts
+ opts.retry = 3
+ opts.retry_no_cache = True
+
+ # single
+ opts.checkfunc = checkfunc_read
+ try:
+ self.mg.urlread('foo')
+ except URLGrabError as e:
+ self.fail(str(e))
+
+ # multi
+ opts.checkfunc = checkfunc_grab
+ self.mg.urlgrab('foo', async_=True)
+ try:
+ urlgrabber.grabber.parallel_wait()
+ except URLGrabError as e:
+ self.fail(str(e))
def suite():
tl = TestLoader()
@@ -272,4 +392,3 @@ def suite():
if __name__ == '__main__':
runner = TextTestRunner(stream=sys.stdout,descriptions=1,verbosity=2)
runner.run(suite())
-
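For orientation, a sketch of the API these tests drive, with placeholder mirror
URLs: a MirrorGroup wraps a URLGrabber and a mirror list, fails over on error,
and reports each failure through failure_callback using the same
(func, args, kwargs) convention as the grabber callbacks.

    from urlgrabber.grabber import URLGrabber
    from urlgrabber.mirror import MirrorGroup

    def on_failure(cb_obj, errors):
        # cb_obj.exception is the URLGrabError raised by the failed mirror
        errors.append(str(cb_obj.exception))

    errors = []
    g = URLGrabber()
    mg = MirrorGroup(g, ['http://mirror-a.example/', 'http://mirror-b.example/'],
                     failure_callback=(on_failure, (errors,), {}))
    data = mg.urlread('somefile')  # tries mirror-a first, then mirror-b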
diff --git a/test/threading/batchgrabber.py b/test/threading/batchgrabber.py
index 076b7ef..4ee71e1 100644..100755
--- a/test/threading/batchgrabber.py
+++ b/test/threading/batchgrabber.py
@@ -9,20 +9,22 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place, Suite 330,
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
+from __future__ import print_function
+
"""Module for testing urlgrabber under multiple threads.
-This module can be used from the command line. Each argument is
+This module can be used from the command line. Each argument is
a URL to grab.
-The BatchURLGrabber class has an interface similar to URLGrabber
+The BatchURLGrabber class has an interface similar to URLGrabber
but instead of pulling files when urlgrab is called, the request
is queued. Calling BatchURLGrabber.batchgrab causes all files to
be pulled in multiple threads.
@@ -31,7 +33,7 @@ be pulled in multiple threads.
import os.path, sys
if __name__ == '__main__':
- print os.path.dirname(sys.argv[0])
+ print(os.path.dirname(sys.argv[0]))
sys.path.insert(0, (os.path.dirname(sys.argv[0]) or '.') + '/../..')
from threading import Thread, Semaphore
@@ -48,10 +50,10 @@ class BatchURLGrabber:
self.queue = []
self.threads = []
self.sem = Semaphore()
-
+
def urlgrab(self, url, filename=None, **kwargs):
self.queue.append( (url, filename, kwargs) )
-
+
def batchgrab(self):
if hasattr(self.grabber.opts.progress_obj, 'start'):
self.grabber.opts.progress_obj.start(len(self.queue))
@@ -61,17 +63,17 @@ class BatchURLGrabber:
del self.queue[0]
thread = Worker(self, url, filename, kwargs)
self.threads.append(thread)
- if DEBUG: print "starting worker: " + url
+ if DEBUG: print("starting worker: " + url)
thread.start()
else:
for t in self.threads:
-                if not t.isAlive():
+                if not t.is_alive():
- if DEBUG: print "cleaning up worker: " + t.url
+ if DEBUG: print("cleaning up worker: " + t.url)
self.threads.remove(t)
#if len(self.threads) == self.maxthreads:
# sleep(0.2)
sleep(0.2)
-
+
class Worker(Thread):
def __init__(self, parent, url, filename, kwargs):
Thread.__init__(self)
@@ -79,18 +81,18 @@ class Worker(Thread):
self.url = url
self.filename = filename
self.kwargs = kwargs
-
+
def run(self):
- if DEBUG: print "worker thread started."
+ if DEBUG: print("worker thread started.")
grabber = self.parent.grabber
progress_obj = grabber.opts.progress_obj
if isinstance(progress_obj, MultiFileMeter):
self.kwargs['progress_obj'] = progress_obj.newMeter()
try:
rslt = self.parent.grabber.urlgrab(self.url, self.filename, **self.kwargs)
- except URLGrabError, e:
- print '%s, %s' % (e, self.url)
-
+ except URLGrabError as e:
+ print('%s, %s' % (e, self.url))
+
def main():
progress_obj = None
# uncomment to play with BatchProgressMeter (doesn't work right now)
@@ -98,13 +100,13 @@ def main():
g = BatchURLGrabber(keepalive=1, progress_obj=progress_obj)
for arg in sys.argv[1:]:
g.urlgrab(arg)
- if DEBUG: print "before batchgrab"
+ if DEBUG: print("before batchgrab")
try:
g.batchgrab()
except KeyboardInterrupt:
sys.exit(1)
-
- if DEBUG: print "after batchgrab"
-
+
+ if DEBUG: print("after batchgrab")
+
if __name__ == '__main__':
main()
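Usage of the module above, for reference (URLs are placeholders): urlgrab()
only queues a request; batchgrab() drains the queue with worker threads.

    from batchgrabber import BatchURLGrabber  # this module

    g = BatchURLGrabber(keepalive=1)
    g.urlgrab('http://www.example.com/a')
    g.urlgrab('http://www.example.com/b')
    g.batchgrab()  # fetches both files in parallel worker threads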
diff --git a/urlgrabber.egg-info/PKG-INFO b/urlgrabber.egg-info/PKG-INFO
new file mode 100755
index 0000000..20b9966
--- /dev/null
+++ b/urlgrabber.egg-info/PKG-INFO
@@ -0,0 +1,31 @@
+Metadata-Version: 1.2
+Name: urlgrabber
+Version: 4.1.0
+Summary: A high-level cross-protocol url-grabber
+Home-page: http://urlgrabber.baseurl.org/
+Author: Michael D. Stenner, Ryan Tomayko, Seth Vidal, Zdenek Pavlas
+Author-email: mstenner@linux.duke.edu, rtomayko@naeblis.cx, skvidal@fedoraproject.org, zpavlas@redhat.com
+Maintainer: Neal Gompa
+Maintainer-email: ngompa@fedoraproject.org
+License: LGPLv2+
+Description: UNKNOWN
+Keywords: urlgrabber yum http ftp
+Platform: UNKNOWN
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: System Administrators
+Classifier: Topic :: Internet :: File Transfer Protocol (FTP)
+Classifier: Topic :: Internet :: WWW/HTTP
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Environment :: Console
+Classifier: Environment :: Web Environment
+Classifier: License :: OSI Approved :: GNU Lesser General Public License v2 or later (LGPLv2+)
+Classifier: Operating System :: POSIX
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 2
+Classifier: Programming Language :: Python :: 2.6
+Classifier: Programming Language :: Python :: 2.7
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
diff --git a/urlgrabber.egg-info/SOURCES.txt b/urlgrabber.egg-info/SOURCES.txt
new file mode 100755
index 0000000..35174af
--- /dev/null
+++ b/urlgrabber.egg-info/SOURCES.txt
@@ -0,0 +1,27 @@
+ChangeLog
+LICENSE
+MANIFEST.in
+README
+TODO
+makefile
+setup.py
+scripts/urlgrabber
+scripts/urlgrabber-ext-down
+test/base_test_code.py
+test/grabberperf.py
+test/munittest.py
+test/runtests.py
+test/test_byterange.py
+test/test_grabber.py
+test/test_mirror.py
+test/threading/batchgrabber.py
+urlgrabber/__init__.py
+urlgrabber/byterange.py
+urlgrabber/grabber.py
+urlgrabber/mirror.py
+urlgrabber/progress.py
+urlgrabber.egg-info/PKG-INFO
+urlgrabber.egg-info/SOURCES.txt
+urlgrabber.egg-info/dependency_links.txt
+urlgrabber.egg-info/requires.txt
+urlgrabber.egg-info/top_level.txt \ No newline at end of file
diff --git a/urlgrabber.egg-info/dependency_links.txt b/urlgrabber.egg-info/dependency_links.txt
new file mode 100755
index 0000000..8b13789
--- /dev/null
+++ b/urlgrabber.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/urlgrabber.egg-info/requires.txt b/urlgrabber.egg-info/requires.txt
new file mode 100755
index 0000000..28d1cd4
--- /dev/null
+++ b/urlgrabber.egg-info/requires.txt
@@ -0,0 +1,3 @@
+pycurl
+six
+setuptools
diff --git a/urlgrabber.egg-info/top_level.txt b/urlgrabber.egg-info/top_level.txt
new file mode 100755
index 0000000..9aa2f20
--- /dev/null
+++ b/urlgrabber.egg-info/top_level.txt
@@ -0,0 +1 @@
+urlgrabber
diff --git a/urlgrabber/__init__.py b/urlgrabber/__init__.py
index ddd5204..60f56c3 100644..100755
--- a/urlgrabber/__init__.py
+++ b/urlgrabber/__init__.py
@@ -1,16 +1,18 @@
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
-# (at your option) any later version.
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Library General Public License for more details.
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
+# Boston, MA 02111-1307 USA
# Copyright 2002-2006 Michael D. Stenner, Ryan Tomayko
# Copyright 2009 Red Hat, Inc - pycurl support added by Seth Vidal
@@ -44,11 +46,17 @@ following features:
automatically switching mirrors if there is a failure.
"""
-__version__ = '3.9.1'
-__date__ = '2009/09/25'
-__author__ = 'Michael D. Stenner <mstenner@linux.duke.edu>, ' \
- 'Ryan Tomayko <rtomayko@naeblis.cx>' \
- 'Seth Vidal <skvidal@fedoraproject.org>'
-__url__ = 'http://linux.duke.edu/projects/urlgrabber/'
+try:
+ from email import message_from_string
+ from pkg_resources import get_distribution
+ pkgInfo = get_distribution(__package__).get_metadata('PKG-INFO')
+ __metadata__ = message_from_string(pkgInfo)
+ del pkgInfo
-from grabber import urlgrab, urlopen, urlread
+ __version__ = __metadata__['Version']
+ __author__ = __metadata__['Author']
+ __url__ = __metadata__['Home-page']
+except Exception:
+ __author__ = __version__ = __url__ = '<see setup.cfg>'
+
+from .grabber import urlgrab, urlopen, urlread
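With this change, the version is resolved from the installed package metadata
instead of hard-coded attributes, so (once installed) something like the
following is the intended behavior, with '<see setup.cfg>' as the fallback when
the metadata cannot be read:

    import urlgrabber
    print(urlgrabber.__version__)  # '4.1.0' for this release, when installed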
diff --git a/urlgrabber/byterange.py b/urlgrabber/byterange.py
index 3e5f3b7..e341add 100644..100755
--- a/urlgrabber/byterange.py
+++ b/urlgrabber/byterange.py
@@ -9,9 +9,9 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place, Suite 330,
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
@@ -19,56 +19,74 @@
import os
+import sys
import stat
import urllib
-import urllib2
-import rfc822
+import email.utils
+import ftplib
+import socket
+import mimetypes
+
+try:
+ from urllib.request import BaseHandler, FileHandler, FTPHandler, URLError
+    from urllib.request import addclosehook, addinfourl, url2pathname
+ from urllib.request import ftpwrapper as urllib_ftpwrapper
+ from urllib.parse import splitport, splituser, splitpasswd, splitattr, unquote
+except ImportError:
+ from urllib2 import BaseHandler, FileHandler, FTPHandler, URLError
+ from urllib2 import ftpwrapper as urllib_ftpwrapper
+    from urllib import (splitport, splituser, splitpasswd, splitattr,
+                        unquote, addclosehook, addinfourl, url2pathname)
DEBUG = None
-try:
+if sys.version_info >= (3,):
+    # We do an explicit version check here because python2
+ # also has an io module with StringIO, but it is incompatible,
+ # and returns str instead of unicode somewhere.
+ from io import StringIO
+else:
from cStringIO import StringIO
-except ImportError, msg:
- from StringIO import StringIO
class RangeError(IOError):
"""Error raised when an unsatisfiable range is requested."""
pass
-
-class HTTPRangeHandler(urllib2.BaseHandler):
+
+class HTTPRangeHandler(BaseHandler):
"""Handler that enables HTTP Range headers.
-
+
This was extremely simple. The Range header is a HTTP feature to
- begin with so all this class does is tell urllib2 that the
- "206 Partial Content" reponse from the HTTP server is what we
+ begin with so all this class does is tell urllib2 that the
+ "206 Partial Content" response from the HTTP server is what we
expected.
-
+
Example:
-        import urllib2
+        import urllib.request
import byterange
-
+
-        range_handler = range.HTTPRangeHandler()
+        range_handler = byterange.HTTPRangeHandler()
- opener = urllib2.build_opener(range_handler)
-
+ opener = urllib.request.build_opener(range_handler)
+
# install it
- urllib2.install_opener(opener)
-
+ urllib.request.install_opener(opener)
+
# create Request and set Range header
- req = urllib2.Request('http://www.python.org/')
+ req = urllib.request.Request('http://www.python.org/')
-        req.header['Range'] = 'bytes=30-50'
+        req.add_header('Range', 'bytes=30-50')
- f = urllib2.urlopen(req)
+ f = urllib.request.urlopen(req)
"""
-
+
def http_error_206(self, req, fp, code, msg, hdrs):
# 206 Partial Content Response
-        r = urllib.addinfourl(fp, hdrs, req.get_full_url())
+        r = addinfourl(fp, hdrs, req.get_full_url())
r.code = code
r.msg = msg
return r
-
+
def http_error_416(self, req, fp, code, msg, hdrs):
# HTTP's Range Not Satisfiable error
- raise RangeError('Requested Range Not Satisfiable')
+ raise RangeError(9, 'Requested Range Not Satisfiable')
class HTTPSRangeHandler(HTTPRangeHandler):
""" Range Header support for HTTPS. """
@@ -81,13 +99,13 @@ class HTTPSRangeHandler(HTTPRangeHandler):
class RangeableFileObject:
"""File object wrapper to enable raw range handling.
- This was implemented primarilary for handling range
- specifications for file:// urls. This object effectively makes
- a file object look like it consists only of a range of bytes in
+    This was implemented primarily for handling range
+ specifications for file:// urls. This object effectively makes
+ a file object look like it consists only of a range of bytes in
the stream.
-
+
Examples:
- # expose 10 bytes, starting at byte position 20, from
+ # expose 10 bytes, starting at byte position 20, from
# /etc/aliases.
-    >>> fo = RangeableFileObject(file('/etc/passwd', 'r'), (20,30))
+    >>> fo = RangeableFileObject(open('/etc/passwd', 'r'), (20,30))
# seek seeks within the range (to position 23 in this case)
@@ -99,11 +117,11 @@ class RangeableFileObject:
# byte in the range. the following will return only 7 bytes.
>>> fo.read(30)
"""
-
+
def __init__(self, fo, rangetup):
"""Create a RangeableFileObject.
- fo -- a file like object. only the read() method need be
- supported but supporting an optimized seek() is
+ fo -- a file like object. only the read() method need be
+ supported but supporting an optimized seek() is
preferable.
rangetup -- a (firstbyte,lastbyte) tuple specifying the range
to work over.
@@ -113,24 +131,24 @@ class RangeableFileObject:
(self.firstbyte, self.lastbyte) = range_tuple_normalize(rangetup)
self.realpos = 0
self._do_seek(self.firstbyte)
-
+
def __getattr__(self, name):
"""This effectively allows us to wrap at the instance level.
Any attribute not found in _this_ object will be searched for
in self.fo. This includes methods."""
if hasattr(self.fo, name):
return getattr(self.fo, name)
- raise AttributeError, name
-
+ raise AttributeError(name)
+
def tell(self):
"""Return the position within the range.
- This is different from fo.seek in that position 0 is the
+ This is different from fo.seek in that position 0 is the
first byte position of the range tuple. For example, if
this object was created with a range tuple of (500,899),
tell() will return 0 when at byte position 500 of the file.
"""
return (self.realpos - self.firstbyte)
-
+
def seek(self,offset,whence=0):
"""Seek within the byte range.
Positioning is identical to that described under tell().
@@ -143,13 +161,13 @@ class RangeableFileObject:
elif whence == 2: # absolute from end of file
# XXX: are we raising the right Error here?
raise IOError('seek from end of file not supported.')
-
+
# do not allow seek past lastbyte in range
if self.lastbyte and (realoffset >= self.lastbyte):
realoffset = self.lastbyte
-
+
self._do_seek(realoffset - self.realpos)
-
+
def read(self, size=-1):
"""Read within the range.
This method will limit the size read based on the range.
@@ -158,7 +176,7 @@ class RangeableFileObject:
rslt = self.fo.read(size)
self.realpos += len(rslt)
return rslt
-
+
def readline(self, size=-1):
"""Read lines within the range.
This method will limit the size read based on the range.
@@ -167,7 +185,7 @@ class RangeableFileObject:
rslt = self.fo.readline(size)
self.realpos += len(rslt)
return rslt
-
+
def _calc_read_size(self, size):
"""Handles calculating the amount of data to read based on
the range.
@@ -179,7 +197,7 @@ class RangeableFileObject:
else:
size = (self.lastbyte - self.realpos)
return size
-
+
def _do_seek(self,offset):
"""Seek based on whether wrapped object supports seek().
offset is relative to the current position (self.realpos).
@@ -190,7 +208,7 @@ class RangeableFileObject:
else:
self.fo.seek(self.realpos + offset)
self.realpos+= offset
-
+
def _poor_mans_seek(self,offset):
"""Seek by calling the wrapped file objects read() method.
This is used for file like objects that do not have native
@@ -198,7 +216,7 @@ class RangeableFileObject:
to manually seek to the desired position.
offset -- read this number of bytes from the wrapped
file object.
- raise RangeError if we encounter EOF before reaching the
+ raise RangeError if we encounter EOF before reaching the
specified offset.
"""
pos = 0
@@ -208,28 +226,26 @@ class RangeableFileObject:
bufsize = offset - pos
buf = self.fo.read(bufsize)
if len(buf) != bufsize:
- raise RangeError('Requested Range Not Satisfiable')
+ raise RangeError(9, 'Requested Range Not Satisfiable')
pos+= bufsize
-class FileRangeHandler(urllib2.FileHandler):
+class FileRangeHandler(FileHandler):
"""FileHandler subclass that adds Range support.
This class handles Range headers exactly like an HTTP
server would.
"""
def open_local_file(self, req):
- import mimetypes
- import mimetools
host = req.get_host()
file = req.get_selector()
-        localfile = urllib.url2pathname(file)
+        localfile = url2pathname(file)
stats = os.stat(localfile)
size = stats[stat.ST_SIZE]
- modified = rfc822.formatdate(stats[stat.ST_MTIME])
+ modified = email.utils.formatdate(stats[stat.ST_MTIME])
mtype = mimetypes.guess_type(file)[0]
if host:
-            host, port = urllib.splitport(host)
+            host, port = splitport(host)
if port or socket.gethostbyname(host) not in self.get_names():
- raise urllib2.URLError('file not on local host')
+ raise URLError('file not on local host')
fo = open(localfile,'rb')
brange = req.headers.get('Range',None)
brange = range_header_to_tuple(brange)
@@ -238,35 +254,27 @@ class FileRangeHandler(urllib2.FileHandler):
(fb,lb) = brange
if lb == '': lb = size
if fb < 0 or fb > size or lb > size:
- raise RangeError('Requested Range Not Satisfiable')
+ raise RangeError(9, 'Requested Range Not Satisfiable')
size = (lb - fb)
fo = RangeableFileObject(fo, (fb,lb))
- headers = mimetools.Message(StringIO(
+ headers = email.message_from_string(
'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
- (mtype or 'text/plain', size, modified)))
+ (mtype or 'text/plain', size, modified))
-        return addinfourl(fo, headers, 'file:'+file)
+        return addinfourl(fo, headers, 'file:'+file)
-# FTP Range Support
+# FTP Range Support
# Unfortunately, a large amount of base FTP code had to be copied
# from urllib and urllib2 in order to insert the FTP REST command.
-# Code modifications for range support have been commented as
+# Code modifications for range support have been commented as
# follows:
# -- range support modifications start/end here
-from urllib import splitport, splituser, splitpasswd, splitattr, \
- unquote, addclosehook, addinfourl
-import ftplib
-import socket
-import sys
-import mimetypes
-import mimetools
-
-class FTPRangeHandler(urllib2.FTPHandler):
+class FTPRangeHandler(FTPHandler):
def ftp_open(self, req):
host = req.get_host()
if not host:
- raise IOError, ('ftp error', 'no host given')
+ raise IOError('ftp error', 'no host given')
host, port = splitport(host)
if port is None:
port = ftplib.FTP_PORT
@@ -282,11 +290,11 @@ class FTPRangeHandler(urllib2.FTPHandler):
host = unquote(host)
user = unquote(user or '')
passwd = unquote(passwd or '')
-
+
try:
host = socket.gethostbyname(host)
- except socket.error, msg:
- raise urllib2.URLError(msg)
+ except socket.error as msg:
+ raise URLError(msg)
path, attrs = splitattr(req.get_selector())
dirs = path.split('/')
-        dirs = map(unquote, dirs)
+        dirs = list(map(unquote, dirs))
@@ -301,34 +309,34 @@ class FTPRangeHandler(urllib2.FTPHandler):
if attr.lower() == 'type' and \
value in ('a', 'A', 'i', 'I', 'd', 'D'):
type = value.upper()
-
+
# -- range support modifications start here
rest = None
- range_tup = range_header_to_tuple(req.headers.get('Range',None))
+ range_tup = range_header_to_tuple(req.headers.get('Range',None))
assert range_tup != ()
if range_tup:
(fb,lb) = range_tup
if fb > 0: rest = fb
# -- range support modifications end here
-
+
fp, retrlen = fw.retrfile(file, type, rest)
-
+
# -- range support modifications start here
if range_tup:
(fb,lb) = range_tup
- if lb == '':
+ if lb == '':
if retrlen is None or retrlen == 0:
- raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.')
+ raise RangeError(9, 'Requested Range Not Satisfiable due to unobtainable file length.')
lb = retrlen
retrlen = lb - fb
if retrlen < 0:
# beginning of range is larger than file
- raise RangeError('Requested Range Not Satisfiable')
+ raise RangeError(9, 'Requested Range Not Satisfiable')
else:
retrlen = lb - fb
fp = RangeableFileObject(fp, (0,retrlen))
# -- range support modifications end here
-
+
headers = ""
mtype = mimetypes.guess_type(req.get_full_url())[0]
if mtype:
@@ -338,14 +346,14 @@ class FTPRangeHandler(urllib2.FTPHandler):
-            sf = StringIO(headers)
-            headers = mimetools.Message(sf)
+            headers = email.message_from_string(headers)
return addinfourl(fp, headers, req.get_full_url())
- except ftplib.all_errors, msg:
- raise IOError, ('ftp error', msg), sys.exc_info()[2]
+ except ftplib.all_errors as msg:
+ raise IOError('ftp error', msg).with_traceback(sys.exc_info()[2])
def connect_ftp(self, user, passwd, host, port, dirs):
fw = ftpwrapper(user, passwd, host, port, dirs)
return fw
-class ftpwrapper(urllib.ftpwrapper):
+class ftpwrapper(urllib_ftpwrapper):
# range support note:
# this ftpwrapper code is copied directly from
# urllib. The only enhancement is to add the rest
@@ -364,22 +372,22 @@ class ftpwrapper(urllib.ftpwrapper):
# Use nlst to see if the file exists at all
try:
self.ftp.nlst(file)
- except ftplib.error_perm, reason:
- raise IOError, ('ftp error', reason), sys.exc_info()[2]
+ except ftplib.error_perm as reason:
+ raise IOError('ftp error', reason).with_traceback(sys.exc_info()[2])
# Restore the transfer mode!
self.ftp.voidcmd(cmd)
# Try to retrieve as a file
try:
cmd = 'RETR ' + file
conn = self.ftp.ntransfercmd(cmd, rest)
- except ftplib.error_perm, reason:
+ except ftplib.error_perm as reason:
if str(reason)[:3] == '501':
# workaround for REST not supported error
fp, retrlen = self.retrfile(file, type)
fp = RangeableFileObject(fp, (rest,''))
return (fp, retrlen)
elif str(reason)[:3] != '550':
- raise IOError, ('ftp error', reason), sys.exc_info()[2]
+ raise IOError('ftp error', reason).with_traceback(sys.exc_info()[2])
if not conn:
# Set transfer mode to ASCII!
self.ftp.voidcmd('TYPE A')
@@ -400,17 +408,17 @@ class ftpwrapper(urllib.ftpwrapper):
_rangere = None
def range_header_to_tuple(range_header):
"""Get a (firstbyte,lastbyte) tuple from a Range header value.
-
+
Range headers have the form "bytes=<firstbyte>-<lastbyte>". This
function pulls the firstbyte and lastbyte values and returns
a (firstbyte,lastbyte) tuple. If lastbyte is not specified in
the header value, it is returned as an empty string in the
tuple.
-
+
Return None if range_header is None
- Return () if range_header does not conform to the range spec
+ Return () if range_header does not conform to the range spec
pattern.
-
+
"""
global _rangere
if range_header is None: return None
@@ -418,9 +426,9 @@ def range_header_to_tuple(range_header):
import re
_rangere = re.compile(r'^bytes=(\d{1,})-(\d*)')
match = _rangere.match(range_header)
- if match:
+ if match:
tup = range_tuple_normalize(match.group(1,2))
- if tup and tup[1]:
+ if tup and tup[1]:
tup = (tup[0],tup[1]+1)
return tup
return ()
@@ -433,16 +441,16 @@ def range_tuple_to_header(range_tup):
if range_tup is None: return None
range_tup = range_tuple_normalize(range_tup)
if range_tup:
- if range_tup[1]:
+ if range_tup[1]:
range_tup = (range_tup[0],range_tup[1] - 1)
return 'bytes=%s-%s' % range_tup
-
+
def range_tuple_normalize(range_tup):
"""Normalize a (first_byte,last_byte) range tuple.
Return a tuple whose first element is guaranteed to be an int
- and whose second element will be '' (meaning: the last byte) or
+ and whose second element will be '' (meaning: the last byte) or
an int. Finally, return None if the normalized tuple == (0,'')
- as that is equivelant to retrieving the entire file.
+ as that is equivalent to retrieving the entire file.
"""
if range_tup is None: return None
# handle first byte
@@ -452,12 +460,13 @@ def range_tuple_normalize(range_tup):
# handle last byte
try: lb = range_tup[1]
except IndexError: lb = ''
- else:
+ else:
if lb is None: lb = ''
elif lb != '': lb = int(lb)
# check if range is over the entire file
if (fb,lb) == (0,''): return None
# check that the range is valid
- if lb < fb: raise RangeError('Invalid byte range: %s-%s' % (fb,lb))
+ if lb != '' and fb >= lb:
+ raise RangeError(9, 'Invalid byte range: %s-%s' % (fb,lb))
return (fb,lb)
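A short sketch of the RangeableFileObject wrapper touched above, using an
in-memory stream (io.BytesIO stands in for any file-like object):

    from io import BytesIO
    from urlgrabber.byterange import RangeableFileObject

    # expose only bytes 20..29 of the underlying stream; tell()/seek()
    # are relative to the first byte of the range
    fo = RangeableFileObject(BytesIO(b'x' * 100), (20, 30))
    fo.seek(3)                    # seeks to absolute position 23
    assert fo.read() == b'x' * 7  # only 7 bytes remain inside the range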
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
index e090e90..b72d089 100644..100755
--- a/urlgrabber/grabber.py
+++ b/urlgrabber/grabber.py
@@ -9,15 +9,17 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place, Suite 330,
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
# Copyright 2009 Red Hat inc, pycurl code written by Seth Vidal
+from __future__ import print_function
+
"""A high-level cross-protocol url-grabber.
GENERAL ARGUMENTS (kwargs)
@@ -35,7 +37,7 @@ GENERAL ARGUMENTS (kwargs)
close_connection = 0 [0|1]
tells URLGrabber to close the connection after a file has been
- transfered. This is ignored unless the download happens with the
+ transferred. This is ignored unless the download happens with the
http keepalive handler (keepalive=1). Otherwise, the connection
is left open for further use. The module level default for this
option is 0 (keepalive connections will not be closed).
@@ -49,13 +51,46 @@ GENERAL ARGUMENTS (kwargs)
progress_obj = None
a class instance that supports the following methods:
- po.start(filename, url, basename, length, text)
+ po.start(filename, url, basename, size, now, text)
-      # length will be None if unknown
+      # size will be None if unknown
po.update(read) # read == bytes read so far
po.end()
+ multi_progress_obj = None
+
+ a class instance that supports the following methods:
+ mo.start(total_files, total_size)
+ mo.newMeter() => meter
+ mo.removeMeter(meter)
+ mo.end()
+
+ The 'meter' object is similar to progress_obj, but multiple
+ instances may be created and updated at the same time.
+
+ When downloading multiple files in parallel and multi_progress_obj
+ is None progress_obj is used in compatibility mode: finished files
+ are shown but there's no in-progress display.
+
+ curl_obj = None
+
+ a pycurl.Curl instance to be used instead of the default module-level
+ instance.
+
+ Note that you don't have to configure the passed instance in any way;
+ urlgrabber will do all the necessary work.
+
+ This option exists primarily to allow using urlgrabber from multiple
+ threads in your application, in which case you would want to instantiate a
+ fresh Curl object for each thread, to avoid race conditions. See the curl
+ documentation on thread safety for more information:
+ https://curl.haxx.se/libcurl/c/threadsafe.html
+
+ Note that connection reuse (keepalive=1) is limited to the Curl instance it
+ was enabled on so if you're using multiple instances in your application,
+ connections won't be shared among them.
+
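+    A sketch of the per-thread pattern this enables (function name and
+    URL list are illustrative; the grabber configures the handle itself):
+
+      import pycurl
+      from urlgrabber.grabber import URLGrabber
+
+      def fetch_all(urls):
+          # one Curl handle per thread avoids sharing libcurl state
+          g = URLGrabber(curl_obj=pycurl.Curl())
+          for url in urls:
+              g.urlgrab(url)
+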
text = None
-
+
specifies alternative text to be passed to the progress meter
object. If not given, the default progress meter will use the
basename of the file.
@@ -68,14 +103,20 @@ GENERAL ARGUMENTS (kwargs)
(which can be set on default_grabber.throttle) is used. See
BANDWIDTH THROTTLING for more information.
- timeout = None
+ timeout = 300
+
+ a positive integer expressing the number of seconds to wait before
+ timing out attempts to connect to a server. If the value is None
+ or 0, connection attempts will not time out. The timeout is passed
+ to the underlying pycurl object as its CONNECTTIMEOUT option, see
+ the curl documentation on CURLOPT_CONNECTTIMEOUT for more information.
+ http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT
- a positive float expressing the number of seconds to wait for socket
- operations. If the value is None or 0.0, socket operations will block
- forever. Setting this option causes urlgrabber to call the settimeout
- method on the Socket object used for the request. See the Python
- documentation on settimeout for more information.
- http://www.python.org/doc/current/lib/socket-objects.html
+ minrate = 1000
+
+ This sets the low speed threshold in bytes per second. If the server
+ is sending data slower than this for at least `timeout' seconds, the
+ library aborts the connection.
bandwidth = 0
@@ -91,11 +132,11 @@ GENERAL ARGUMENTS (kwargs)
range to retrieve. Either or both of the values may set to
None. If first_byte is None, byte offset 0 is assumed. If
last_byte is None, the last byte available is assumed. Note that
- the range specification is python-like in that (0,10) will yeild
+ the range specification is python-like in that (0,10) will yield
the first 10 bytes of the file.
If set to None, no range will be used.
-
+
reget = None [None|'simple'|'check_timestamp']
whether to attempt to reget a partially-downloaded file. Reget
@@ -143,8 +184,18 @@ GENERAL ARGUMENTS (kwargs)
note that proxy authentication information may be provided using
normal URL constructs:
proxies={ 'http' : 'http://user:host@foo:3128' }
- Lastly, if proxies is None, the default environment settings will
- be used.
+
+ libproxy = False
+
+ Use the libproxy module (if installed) to find proxies.
+ The libproxy code is only used if the proxies dictionary
+ does not provide any proxies.
+
+ no_cache = False
+
+ When True, server-side cache will be disabled for http and https
+ requests. This is equivalent to setting
+ http_headers = (('Pragma', 'no-cache'),)
prefix = None
@@ -175,7 +226,7 @@ GENERAL ARGUMENTS (kwargs)
option. Note that python 2.2 handles the case of these
badly and if you do not use the proper case (shown here), your
values will be overridden with the defaults.
-
+
urlparser = URLParser()
The URLParser class handles pre-processing of URLs, including
@@ -198,6 +249,12 @@ GENERAL ARGUMENTS (kwargs)
control, you should probably subclass URLParser and pass it in via
the 'urlparser' option.
+ username = None
+ username to use for simple http auth - is automatically quoted for special characters
+
+ password = None
+ password to use for simple http auth - is automatically quoted for special characters
+
ssl_ca_cert = None
this option can be used if M2Crypto is available and will be
@@ -209,45 +266,84 @@ GENERAL ARGUMENTS (kwargs)
ssl_context = None
No-op when using the curl backend (default)
-
- self.ssl_verify_peer = True
+
+ ssl_verify_peer = True
Check the server's certificate to make sure it is valid with what our CA validates
-
- self.ssl_verify_host = True
+
+ ssl_verify_host = True
Check the server's hostname to make sure it matches the certificate DN
- self.ssl_key = None
+ ssl_key = None
Path to the key the client should use to connect/authenticate with
- self.ssl_key_type = 'PEM'
+ ssl_key_type = 'PEM'
PEM or DER - format of key
-
- self.ssl_cert = None
+
+ ssl_cert = None
      Path to the ssl certificate the client should use to authenticate with
- self.ssl_cert_type = 'PEM'
+ ssl_cert_type = 'PEM'
PEM or DER - format of certificate
-
- self.ssl_key_pass = None
+
+ ssl_key_pass = None
password to access the ssl_key
-
- self.size = None
- size (in bytes) or Maximum size of the thing being downloaded.
+ size = None
+
+     Maximum size (in bytes) of the thing being downloaded.
This is mostly to keep us from exploding with an endless datastream
-
- self.max_header_size = 2097152
+
+ max_header_size = 2097152
Maximum size (in bytes) of the headers.
-
+
+ ip_resolve = 'whatever'
+
+      What type of name-to-IP resolving to use; the default is to
+      allow both IPv4 and IPv6.
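+
+      For example, to force IPv4-only resolution:
+
+        urlgrab(url, ip_resolve='ipv4')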
+
+ async_ = (key, limit)
+
+ When this option is set, the urlgrab() is not processed immediately
+ but queued. parallel_wait() then processes grabs in parallel, limiting
+      the number of connections in each 'key' group to at most 'limit'.
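+
+      A minimal sketch of queued downloads (the key string and limit
+      are illustrative):
+
+        for url in urls:
+            urlgrab(url, async_=('mirror-group-1', 2))
+        parallel_wait()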
+
+ max_connections
+
+ The global connection limit.
+
+ timedhosts
+
+ The filename of the host download statistics. If defined, urlgrabber
+ will update the stats at the end of every download. At the end of
+ parallel_wait(), the updated stats are saved. If synchronous grabs
+ are used, you should call th_save().
+
+ default_speed, half_life
+
+      These options only affect the async mirror selection code.
+      The default_speed option sets the speed estimate for mirrors
+      we have never downloaded from; it defaults to 500 kBps.
+
+      The speed estimate also drifts exponentially from the speed
+      actually measured back to the default speed, with a default
+      half-life of 30 days.
+
+    ftp_disable_epsv = False
+
+      This option disables Extended Passive Mode (the EPSV command),
+      which does not work correctly on some buggy ftp servers.
+
RETRY RELATED ARGUMENTS
@@ -271,7 +367,7 @@ RETRY RELATED ARGUMENTS
retrycodes = urlgrabber.grabber.URLGrabberOptions().retrycodes
if 12 not in retrycodes:
retrycodes.append(12)
-
+
checkfunc = None
a function to do additional checks. This defaults to None, which
@@ -302,7 +398,7 @@ RETRY RELATED ARGUMENTS
function(obj, 'arg1', 2, kwarg=3)
# obj.filename = '/tmp/stuff'
# obj.url = 'http://foo.com/stuff'
-
+
NOTE: both the "args" tuple and "kwargs" dict must be present if
you use this syntax, but either (or both) can be empty.
@@ -313,10 +409,11 @@ RETRY RELATED ARGUMENTS
identical to checkfunc, except for the attributes defined in the
CallbackObject instance. The attributes for failure_callback are:
- exception = the raised exception
- url = the url we're trying to fetch
- tries = the number of tries so far (including this one)
- retry = the value of the retry option
+ exception = the raised exception
+ url = the url we're trying to fetch
+ tries = the number of tries so far (including this one)
+ retry = the value of the retry option
+ retry_no_cache = the value of the retry_no_cache option
The callback is present primarily to inform the calling program of
the failure, but if it raises an exception (including the one it's
@@ -328,6 +425,15 @@ RETRY RELATED ARGUMENTS
but it cannot (without severe trickiness) prevent the exception
from being raised.
+ failfunc = None
+
+    The callback that gets called when a urlgrab request fails.
+    If defined, urlgrab() calls it instead of raising URLGrabError.
+    Callback syntax is identical to failure_callback.
+
+    Unlike failure_callback, it is called only once.  Its primary
+    purpose is to allow using urlgrab() without a try/except block.
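+
+    A sketch of the intended usage (the print call stands in for real
+    error handling):
+
+      def failfunc(obj):
+          print('download failed: %s' % obj.exception)
+      urlgrab(url, failfunc=failfunc)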
+
interrupt_callback = None
This callback is called if KeyboardInterrupt is received at any
@@ -351,7 +457,20 @@ RETRY RELATED ARGUMENTS
This callback is very similar to failure_callback. They are
passed the same arguments, so you could use the same function for
both.
-
+
+ retry_no_cache = False
+
+ When True, automatically enable no_cache for future retries if
+ checkfunc performs an unsuccessful check.
+
+ This option is useful if your application expects a set of files
+ from the same server to form an atomic unit and you write your
+ checkfunc to ensure each file being downloaded belongs to such a
+ unit. If transparent proxy caching is in effect, the files can
+ become out-of-sync, disrupting the atomicity. Enabling this option
+ will prevent that, while ensuring that you still enjoy the benefits
+ of caching when possible.
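+
+    A sketch of the intended usage (checksum_matches is a hypothetical
+    application-supplied check):
+
+      def checkfunc(obj):
+          if not checksum_matches(obj.filename):
+              raise URLGrabError(-1, 'checksum mismatch')
+      urlgrab(url, checkfunc=checkfunc, retry=3, retry_no_cache=True)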
+
BANDWIDTH THROTTLING
urlgrabber supports throttling via two values: throttle and
@@ -368,6 +487,11 @@ BANDWIDTH THROTTLING
is a float and bandwidth == 0, throttling is disabled. If None, the
module-level default (which can be set with set_bandwidth) is used.
+ Note that when multiple downloads run simultaneously (multiprocessing
+ or the parallel urlgrab() feature is used) the total bandwidth might
+ exceed the throttle limit. You may want to also set max_connections=1
+ or scale your throttle option down accordingly.
+
THROTTLING EXAMPLES:
  Let's say you have a 100 Mbps connection.  This is (about) 10^8 bits
@@ -411,25 +535,64 @@ BANDWIDTH THROTTLING
"""
-
-
import os
import sys
-import urlparse
import time
import string
import urllib
-import urllib2
-import mimetools
-import thread
import types
import stat
import pycurl
from ftplib import parse150
-from StringIO import StringIO
-from httplib import HTTPException
-import socket
-from byterange import range_tuple_normalize, range_tuple_to_header, RangeError
+import socket, select, fcntl
+from io import BytesIO
+import numbers
+import email
+
+try:
+ import urllib.parse as urlparse
+ urlquote, urlunquote = urlparse.quote, urlparse.unquote
+ from urllib.request import HTTPError, url2pathname, pathname2url
+except ImportError:
+ import urlparse
+ from urllib2 import HTTPError
+ urlquote, urlunquote = urllib.quote, urllib.unquote
+ from urllib import url2pathname, pathname2url
+
+try:
+ from http.client import responses, HTTPException
+except ImportError:
+ from httplib import responses, HTTPException
+
+if sys.version_info >= (3,):
+    # We do an explicit version check here because python2 also has
+    # an io module with StringIO, but it is incompatible, and returns
+    # str instead of unicode somewhere.
+ from io import StringIO
+else:
+ from cStringIO import StringIO
+
+from six import text_type, string_types
+
+from .byterange import range_tuple_normalize, range_tuple_to_header, RangeError
+
+try:
+ import xattr
+ if not hasattr(xattr, 'set'):
+ xattr = None # This is a "newer" API.
+except ImportError:
+ xattr = None
+
+def _bytes_repr(s):
+ "A wrapper to avoid the b'' that python3 insists on when printing bytes"
+ if isinstance(s, string_types):
+ return s
+ else:
+ return repr(s)[2:-1]
+
+def _urlunquote_convert(s):
+ if not isinstance(s, text_type):
+ s = s.decode('utf8')
+ return urlunquote(s)
########################################################################
# MODULE INITIALIZATION
@@ -439,6 +602,12 @@ try:
except:
__version__ = '???'
+try:
+ # this part isn't going to do much - need to talk to gettext
+ from i18n import _
+except ImportError as msg:
+ def _(st): return st
+
########################################################################
# functions for debugging output. These functions are here because they
# are also part of the module initialization.
@@ -468,7 +637,7 @@ def _init_default_logger(logspec=None):
the form
URLGRABBER_DEBUG=level,filename
-
+
where "level" can be either an integer or a log level from the
logging module (DEBUG, INFO, etc). If the integer is zero or
less, logging will be disabled. Filename is the filename where
@@ -481,8 +650,8 @@ def _init_default_logger(logspec=None):
URLGRABBER_DEBUG=1,debug.txt # log everything to debug.txt
URLGRABBER_DEBUG=WARNING,- # log warning and higher to stdout
URLGRABBER_DEBUG=INFO # log info and higher to stderr
-
- This funtion is called during module initialization. It is not
+
+ This function is called during module initialization. It is not
intended to be called from outside. The only reason it is a
function at all is to keep the module-level namespace tidy and to
collect the code into a nice block.'''
@@ -492,8 +661,11 @@ def _init_default_logger(logspec=None):
logspec = os.environ['URLGRABBER_DEBUG']
dbinfo = logspec.split(',')
import logging
- level = logging._levelNames.get(dbinfo[0], None)
- if level is None: level = int(dbinfo[0])
+ if sys.version_info.major == 2:
+ level = logging._levelNames.get(dbinfo[0], None)
+ else:
+ level = logging.getLevelName(dbinfo[0])
+ if level is None or not isinstance(level, int): level = int(dbinfo[0])
if level < 1: raise ValueError()
formatter = logging.Formatter('%(asctime)s %(message)s')
@@ -504,6 +676,7 @@ def _init_default_logger(logspec=None):
else: handler = logging.FileHandler(filename)
handler.setFormatter(formatter)
DBOBJ = logging.getLogger('urlgrabber')
+ DBOBJ.propagate = False
DBOBJ.addHandler(handler)
DBOBJ.setLevel(level)
except (KeyError, ImportError, ValueError):
@@ -512,9 +685,9 @@ def _init_default_logger(logspec=None):
def _log_package_state():
if not DEBUG: return
- DEBUG.info('urlgrabber version = %s' % __version__)
- DEBUG.info('trans function "_" = %s' % _)
-
+ DEBUG.debug('urlgrabber version = %s' % __version__)
+ DEBUG.debug('trans function "_" = %s' % _)
+
_init_default_logger()
_log_package_state()
@@ -527,6 +700,29 @@ def _(st):
# END MODULE INITIALIZATION
########################################################################
+########################################################################
+# UTILITY FUNCTIONS
+########################################################################
+
+# These functions are meant to be utilities for the urlgrabber library to use.
+
+def _to_utf8(obj, errors='replace'):
+ '''convert 'unicode' to an encoded utf-8 byte string '''
+ # stolen from yum.i18n
+ if isinstance(obj, text_type):
+ obj = obj.encode('utf-8', errors)
+ return obj
+
+def exception2msg(e):
+ try:
+ return str(e)
+ except UnicodeEncodeError:
+ # always use byte strings
+ return text_type(e).encode('utf8')
+
+########################################################################
+# END UTILITY FUNCTIONS
+########################################################################
class URLGrabError(IOError):
@@ -551,7 +747,7 @@ class URLGrabError(IOError):
14 - HTTPError (includes .code and .exception attributes)
15 - user abort
16 - error writing to local file
-
+
MirrorGroup error codes (256 -- 511)
256 - No more mirrors left to try
@@ -562,7 +758,7 @@ class URLGrabError(IOError):
-1 - retry the download, unknown reason
Note: to test which group a code is in, you can simply do integer
- division by 256: e.errno / 256
+ division by 256: e.errno // 256
Negative codes are reserved for use by functions passed in to
retrygrab with checkfunc. The value -1 is built in as a generic
@@ -606,7 +802,7 @@ def urlgrab(url, filename=None, **kwargs):
If filename is none, the basename of the url is used.
urlgrab returns the filename of the local file, which may be different
from the passed-in filename if the copy_local kwarg == 0.
-
+
See module documentation for a description of possible kwargs.
"""
return default_grabber.urlgrab(url, filename, **kwargs)
@@ -616,7 +812,7 @@ def urlopen(url, **kwargs):
If a progress object or throttle specifications exist, then
a special file object will be returned that supports them.
The file object can be treated like any other file object.
-
+
See module documentation for a description of possible kwargs.
"""
return default_grabber.urlopen(url, **kwargs)
@@ -626,7 +822,7 @@ def urlread(url, limit=None, **kwargs):
If the limit is exceeded, an exception will be thrown. Note that urlread
is NOT intended to be used as a way of saying "I want the first N bytes"
but rather 'read the whole file into memory, but don't use too much'
-
+
See module documentation for a description of possible kwargs.
"""
return default_grabber.urlread(url, limit, **kwargs)
@@ -662,37 +858,41 @@ class URLParser:
opts.quote = 0 --> do not quote it
opts.quote = None --> guess
"""
+ url = _to_utf8(url)
quote = opts.quote
-
+
if opts.prefix:
url = self.add_prefix(url, opts.prefix)
-
+
parts = urlparse.urlparse(url)
(scheme, host, path, parm, query, frag) = parts
-        if not scheme or (len(scheme) == 1 and scheme in string.letters):
+        if not scheme or (len(scheme) == 1 and scheme in string.ascii_letters.encode()):
# if a scheme isn't specified, we guess that it's "file:"
- if url[0] not in '/\\': url = os.path.abspath(url)
- url = 'file:' + urllib.pathname2url(url)
+ if url[0] not in b'/\\': url = os.path.abspath(url)
+ pathname = pathname2url(url)
+ if not isinstance(pathname, bytes):
+ pathname = pathname.encode('utf8')
+ url = b'file:' + pathname
parts = urlparse.urlparse(url)
quote = 0 # pathname2url quotes, so we won't do it again
-
- if scheme in ['http', 'https']:
+
+ if scheme in [b'http', b'https']:
parts = self.process_http(parts, url)
-
+
if quote is None:
quote = self.guess_should_quote(parts)
if quote:
parts = self.quote(parts)
-
+
url = urlparse.urlunparse(parts)
return url, parts
def add_prefix(self, url, prefix):
- if prefix[-1] == '/' or url[0] == '/':
+ if prefix.endswith(b'/') or url.startswith(b'/'):
url = prefix + url
else:
- url = prefix + '/' + url
+ url = prefix + b'/' + url
return url
def process_http(self, parts, url):
@@ -709,8 +909,10 @@ class URLParser:
passing into urlgrabber.
"""
(scheme, host, path, parm, query, frag) = parts
- path = urllib.quote(path)
- return (scheme, host, path, parm, query, frag)
+ newpath = urlquote(path, safe='/$')
+ if not isinstance(path, text_type) and isinstance(newpath, text_type):
+ newpath = newpath.encode('utf8')
+ return (scheme, host, newpath, parm, query, frag)
hexvals = '0123456789ABCDEF'
def guess_should_quote(self, parts):
@@ -724,9 +926,11 @@ class URLParser:
else -> 1
"""
(scheme, host, path, parm, query, frag) = parts
+ if not isinstance(path, text_type):
+ path = path.decode('utf8')
if ' ' in path:
return 1
- ind = string.find(path, '%')
+ ind = path.find('%')
if ind > -1:
while ind > -1:
if len(path) < ind+3:
@@ -735,10 +939,10 @@ class URLParser:
if code[0] not in self.hexvals or \
code[1] not in self.hexvals:
return 1
- ind = string.find(path, '%', ind+1)
+ ind = path.find('%', ind+1)
return 0
return 1
-
+
class URLGrabberOptions:
"""Class to ease kwargs handling."""
@@ -751,70 +955,116 @@ class URLGrabberOptions:
if delegate is None:
self._set_defaults()
self._set_attributes(**kwargs)
-
+
def __getattr__(self, name):
if self.delegate and hasattr(self.delegate, name):
return getattr(self.delegate, name)
- raise AttributeError, name
-
+ raise AttributeError(name)
+
def raw_throttle(self):
- """Calculate raw throttle value from throttle and bandwidth
+ """Calculate raw throttle value from throttle and bandwidth
values.
"""
- if self.throttle <= 0:
+ if self.throttle <= 0:
return 0
- elif type(self.throttle) == type(0):
+ elif isinstance(self.throttle, int):
return float(self.throttle)
else: # throttle is a float
return self.bandwidth * self.throttle
-
+
+ def find_proxy(self, url, scheme):
+ """Find the proxy to use for this URL.
+ Use the proxies dictionary first, then libproxy.
+ """
+ self.proxy = None
+ if scheme not in ('ftp', 'http', 'https'):
+ return
+
+ if self.proxies:
+ proxy = self.proxies.get(scheme)
+ if proxy is None:
+ if scheme == 'http':
+ proxy = self.proxies.get('https')
+ elif scheme == 'https':
+ proxy = self.proxies.get('http')
+ if proxy == '_none_':
+ proxy = ''
+ self.proxy = proxy
+ return
+
+ if self.libproxy:
+ global _libproxy_cache
+ if _libproxy_cache is None:
+ try:
+ import libproxy
+ _libproxy_cache = libproxy.ProxyFactory()
+ except:
+ _libproxy_cache = False
+ if _libproxy_cache:
+ for proxy in _libproxy_cache.getProxies(url):
+ if proxy.startswith('http://'):
+ if DEBUG: DEBUG.info('using proxy "%s" for url %s' % (proxy, url))
+ self.proxy = proxy
+ break
+
def derive(self, **kwargs):
"""Create a derived URLGrabberOptions instance.
This method creates a new instance and overrides the
options specified in kwargs.
"""
return URLGrabberOptions(delegate=self, **kwargs)
-
+
def _set_attributes(self, **kwargs):
"""Update object attributes with those provided in kwargs."""
self.__dict__.update(kwargs)
- if kwargs.has_key('range'):
+ if 'range' in kwargs:
# normalize the supplied range value
self.range = range_tuple_normalize(self.range)
+ if 'async' in kwargs:
+ self.async_ = self.__dict__.pop('async')
if not self.reget in [None, 'simple', 'check_timestamp']:
- raise URLGrabError(11, _('Illegal reget mode: %s') \
- % (self.reget, ))
+ raise URLGrabError(11, _('Illegal reget mode: %s')
+ % (self.reget,))
def _set_defaults(self):
- """Set all options to their default values.
+ """Set all options to their default values.
When adding new options, make sure a default is
provided here.
"""
self.progress_obj = None
+ self.multi_progress_obj = None
+ self.curl_obj = None
self.throttle = 1.0
self.bandwidth = 0
self.retry = None
self.retrycodes = [-1,2,4,5,6,7]
self.checkfunc = None
+ self.failfunc = _do_raise
self.copy_local = 0
self.close_connection = 0
self.range = None
self.user_agent = 'urlgrabber/%s' % __version__
+ self.ip_resolve = None
self.keepalive = 1
self.proxies = None
+ self.libproxy = False
+ self.proxy = None
self.reget = None
self.failure_callback = None
self.interrupt_callback = None
self.prefix = None
self.opener = None
self.cache_openers = True
- self.timeout = None
+ self.timeout = 300
+ self.minrate = None
self.text = None
self.http_headers = None
self.ftp_headers = None
self.data = None
self.urlparser = URLParser()
self.quote = None
+ self.username = None
+ self.password = None
self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
self.ssl_context = None # no-op in pycurl
        self.ssl_verify_peer = True # check peer's cert for authenticity
@@ -827,10 +1077,19 @@ class URLGrabberOptions:
self.size = None # if we know how big the thing we're getting is going
# to be. this is ultimately a MAXIMUM size for the file
self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
-
+ self.async_ = None # blocking by default
+ self.mirror_group = None
+ self.max_connections = 5
+ self.timedhosts = None
+ self.half_life = 30*24*60*60 # 30 days
+ self.default_speed = 500e3 # 500 kBps
+ self.ftp_disable_epsv = False
+ self.no_cache = False
+ self.retry_no_cache = False
+
def __repr__(self):
return self.format()
-
+
def format(self, indent=' '):
-        keys = self.__dict__.keys()
+        keys = list(self.__dict__.keys())
if self.delegate is not None:
@@ -838,29 +1097,39 @@ class URLGrabberOptions:
keys.sort()
s = '{\n'
for k in keys:
- s = s + indent + '%-15s: %s,\n' % \
- (repr(k), repr(self.__dict__[k]))
+ s = s + indent + '%-15r: %r,\n' % (k, self.__dict__[k])
if self.delegate:
df = self.delegate.format(indent + ' ')
s = s + indent + '%-15s: %s\n' % ("'delegate'", df)
s = s + indent + '}'
return s
-class URLGrabber:
+def _do_raise(obj):
+ raise obj.exception
+
+def _run_callback(cb, obj):
+ if not cb:
+ return
+ if callable(cb):
+ return cb(obj)
+ cb, arg, karg = cb
+ return cb(obj, *arg, **karg)
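+
+# _run_callback accepts either a bare callable or a (func, args, kwargs)
+# tuple, so both of these forms work (myfunc is illustrative):
+#   _run_callback(myfunc, obj)
+#   _run_callback((myfunc, ('arg1',), {'kwarg': 3}), obj)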
+
+class URLGrabber(object):
"""Provides easy opening of URLs with a variety of options.
-
+
All options are specified as kwargs. Options may be specified when
the class is created and may be overridden on a per request basis.
-
+
New objects inherit default values from default_grabber.
"""
-
+
def __init__(self, **kwargs):
self.opts = URLGrabberOptions(**kwargs)
-
+
def _retry(self, opts, func, *args):
tries = 0
- while 1:
+ while True:
# there are only two ways out of this loop. The second has
# several "sub-ways"
# 1) via the return in the "try" block
@@ -872,122 +1141,142 @@ class URLGrabber:
# beware of infinite loops :)
tries = tries + 1
exception = None
- retrycode = None
callback = None
if DEBUG: DEBUG.info('attempt %i/%s: %s',
tries, opts.retry, args[0])
try:
- r = apply(func, (opts,) + args, {})
+ r = func(opts, *args)
if DEBUG: DEBUG.info('success')
return r
- except URLGrabError, e:
+ except URLGrabError as e:
exception = e
callback = opts.failure_callback
- retrycode = e.errno
- except KeyboardInterrupt, e:
+ except KeyboardInterrupt as e:
exception = e
callback = opts.interrupt_callback
+ if not callback:
+ raise
if DEBUG: DEBUG.info('exception: %s', exception)
if callback:
if DEBUG: DEBUG.info('calling callback: %s', callback)
- cb_func, cb_args, cb_kwargs = self._make_callback(callback)
obj = CallbackObject(exception=exception, url=args[0],
- tries=tries, retry=opts.retry)
- cb_func(obj, *cb_args, **cb_kwargs)
+ tries=tries, retry=opts.retry,
+ retry_no_cache=opts.retry_no_cache)
+ _run_callback(callback, obj)
if (opts.retry is None) or (tries == opts.retry):
if DEBUG: DEBUG.info('retries exceeded, re-raising')
- raise
+ raise exception
+ retrycode = getattr(exception, 'errno', None)
if (retrycode is not None) and (retrycode not in opts.retrycodes):
if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
retrycode, opts.retrycodes)
- raise
-
- def urlopen(self, url, **kwargs):
+ raise exception
+ if retrycode is not None and retrycode < 0 and opts.retry_no_cache:
+ opts.no_cache = True
+
+ def urlopen(self, url, opts=None, **kwargs):
"""open the url and return a file object
- If a progress object or throttle value specified when this
- object was created, then a special file object will be
- returned that supports them. The file object can be treated
+ If a progress object or throttle value specified when this
+ object was created, then a special file object will be
+ returned that supports them. The file object can be treated
like any other file object.
"""
- opts = self.opts.derive(**kwargs)
- if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
- (url,parts) = opts.urlparser.parse(url, opts)
+ url = _to_utf8(url)
+ opts = (opts or self.opts).derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %r' % (opts,))
+ (url,parts) = opts.urlparser.parse(url, opts)
+ opts.find_proxy(url, parts[0])
def retryfunc(opts, url):
return PyCurlFileObject(url, filename=None, opts=opts)
return self._retry(opts, retryfunc, url)
-
- def urlgrab(self, url, filename=None, **kwargs):
+
+ def urlgrab(self, url, filename=None, opts=None, **kwargs):
"""grab the file at <url> and make a local copy at <filename>
If filename is none, the basename of the url is used.
- urlgrab returns the filename of the local file, which may be
+ urlgrab returns the filename of the local file, which may be
different from the passed-in filename if copy_local == 0.
"""
- opts = self.opts.derive(**kwargs)
- if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
- (url,parts) = opts.urlparser.parse(url, opts)
+ url = _to_utf8(url)
+ opts = (opts or self.opts).derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %r' % (opts,))
+ (url,parts) = opts.urlparser.parse(url, opts)
(scheme, host, path, parm, query, frag) = parts
+ opts.find_proxy(url, scheme)
if filename is None:
- filename = os.path.basename( urllib.unquote(path) )
+ filename = os.path.basename(_urlunquote_convert(path))
+ if not filename:
+ # This is better than nothing.
+ filename = 'index.html'
if scheme == 'file' and not opts.copy_local:
- # just return the name of the local file - don't make a
+ # just return the name of the local file - don't make a
# copy currently
- path = urllib.url2pathname(path)
+ path = url2pathname(path)
if host:
path = os.path.normpath('//' + host + path)
if not os.path.exists(path):
- err = URLGrabError(2,
+ err = URLGrabError(2,
_('Local file does not exist: %s') % (path, ))
err.url = url
raise err
elif not os.path.isfile(path):
- err = URLGrabError(3,
+ err = URLGrabError(3,
_('Not a normal file: %s') % (path, ))
err.url = url
raise err
elif not opts.range:
if not opts.checkfunc is None:
- cb_func, cb_args, cb_kwargs = \
- self._make_callback(opts.checkfunc)
- obj = CallbackObject()
- obj.filename = path
- obj.url = url
- apply(cb_func, (obj, )+cb_args, cb_kwargs)
+ obj = CallbackObject(filename=path, url=url)
+ _run_callback(opts.checkfunc, obj)
return path
-
+
+ if opts.async_:
+ opts.url = url
+ opts.filename = filename
+ opts.size = int(opts.size or 0)
+ _async_queue.append(opts)
+ return filename
+
def retryfunc(opts, url, filename):
fo = PyCurlFileObject(url, filename, opts)
try:
fo._do_grab()
+ if fo._tm_last:
+ dlsz = fo._tm_last[0] - fo._tm_first[0]
+ dltm = fo._tm_last[1] - fo._tm_first[1]
+ _TH.update(url, dlsz, dltm, None)
if not opts.checkfunc is None:
- cb_func, cb_args, cb_kwargs = \
- self._make_callback(opts.checkfunc)
- obj = CallbackObject()
- obj.filename = filename
- obj.url = url
- apply(cb_func, (obj, )+cb_args, cb_kwargs)
+ obj = CallbackObject(filename=filename, url=url)
+ _run_callback(opts.checkfunc, obj)
finally:
fo.close()
return filename
-
- return self._retry(opts, retryfunc, url, filename)
-
- def urlread(self, url, limit=None, **kwargs):
+
+ try:
+ return self._retry(opts, retryfunc, url, filename)
+ except URLGrabError as e:
+ _TH.update(url, 0, 0, e)
+ opts.exception = e
+ return _run_callback(opts.failfunc, opts)
+
+ def urlread(self, url, limit=None, opts=None, **kwargs):
"""read the url into a string, up to 'limit' bytes
If the limit is exceeded, an exception will be thrown. Note
- that urlread is NOT intended to be used as a way of saying
- "I want the first N bytes" but rather 'read the whole file
+ that urlread is NOT intended to be used as a way of saying
+ "I want the first N bytes" but rather 'read the whole file
into memory, but don't use too much'
"""
- opts = self.opts.derive(**kwargs)
- if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
- (url,parts) = opts.urlparser.parse(url, opts)
+ url = _to_utf8(url)
+ opts = (opts or self.opts).derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %r' % (opts,))
+ (url,parts) = opts.urlparser.parse(url, opts)
+ opts.find_proxy(url, parts[0])
if limit is not None:
limit = limit + 1
-
+
def retryfunc(opts, url, limit):
fo = PyCurlFileObject(url, filename=None, opts=opts)
s = ''
@@ -1000,26 +1289,23 @@ class URLGrabber:
else: s = fo.read(limit)
if not opts.checkfunc is None:
- cb_func, cb_args, cb_kwargs = \
- self._make_callback(opts.checkfunc)
- obj = CallbackObject()
- obj.data = s
- obj.url = url
- apply(cb_func, (obj, )+cb_args, cb_kwargs)
+ obj = CallbackObject(data=s, url=url)
+ _run_callback(opts.checkfunc, obj)
finally:
fo.close()
return s
-
+
s = self._retry(opts, retryfunc, url, limit)
if limit and len(s) > limit:
- err = URLGrabError(8,
+ err = URLGrabError(8,
_('Exceeded limit (%i): %s') % (limit, url))
err.url = url
raise err
return s
-
+
def _make_callback(self, callback_obj):
+ # not used, left for compatibility
if callable(callback_obj):
return callback_obj, (), {}
else:
@@ -1030,10 +1316,10 @@ class URLGrabber:
default_grabber = URLGrabber()
-class PyCurlFileObject():
+class PyCurlFileObject(object):
def __init__(self, url, filename, opts):
self.fo = None
- self._hdr_dump = ''
+ self._hdr_dump = b''
self._parsed_hdr = None
self.url = url
self.scheme = urlparse.urlsplit(self.url)[0]
@@ -1042,20 +1328,24 @@ class PyCurlFileObject():
self.reget_time = None
self.opts = opts
if self.opts.reget == 'check_timestamp':
- raise NotImplementedError, "check_timestamp regets are not implemented in this ver of urlgrabber. Please report this."
+            raise NotImplementedError("check_timestamp regets are not implemented in this version of urlgrabber. Please report this.")
self._complete = False
- self._rbuf = ''
+ self._rbuf = b''
self._rbufsize = 1024*8
self._ttime = time.time()
self._tsize = 0
self._amount_read = 0
self._reget_length = 0
+ self._range = None
self._prog_running = False
self._error = (None, None)
- self.size = None
+ self.size = 0
+ self._hdr_ended = False
+ self._tm_first = None
+ self._tm_last = None
self._do_open()
-
-
+
+
def __getattr__(self, name):
"""This effectively allows us to wrap at the instance level.
Any attribute not found in _this_ object will be searched for
@@ -1063,48 +1353,93 @@ class PyCurlFileObject():
if hasattr(self.fo, name):
return getattr(self.fo, name)
- raise AttributeError, name
+ raise AttributeError(name)
def _retrieve(self, buf):
try:
+ tm = self._amount_read + len(buf), time.time()
+ if self._tm_first is None:
+ self._tm_first = tm
+ else:
+ self._tm_last = tm
+
if not self._prog_running:
if self.opts.progress_obj:
size = self.size + self._reget_length
- self.opts.progress_obj.start(self._prog_reportname,
- urllib.unquote(self.url),
- self._prog_basename,
+ self.opts.progress_obj.start(self._prog_reportname,
+ _urlunquote_convert(self.url),
+ self._prog_basename,
size=size,
text=self.opts.text)
self._prog_running = True
self.opts.progress_obj.update(self._amount_read)
self._amount_read += len(buf)
- self.fo.write(buf)
+ try:
+ if self._range:
+ # client-side ranges
+ pos = self._amount_read - len(buf)
+ start = self._range[0] - pos
+ stop = self._range[1] - pos
+ if start < len(buf) and stop > 0:
+ self.fo.write(buf[max(start, 0):stop])
+ else:
+ self.fo.write(buf)
+ except IOError as e:
+ self._cb_error = URLGrabError(16, exception2msg(e))
+ return -1
return len(buf)
except KeyboardInterrupt:
return -1
-
+
def _hdr_retrieve(self, buf):
- if self._over_max_size(cur=len(self._hdr_dump),
+ if self._hdr_ended:
+ self._hdr_dump = b''
+ self.size = 0
+ self._hdr_ended = False
+
+ if self._over_max_size(cur=len(self._hdr_dump),
max_size=self.opts.max_header_size):
- return -1
+ return -1
try:
- self._hdr_dump += buf
# we have to get the size before we do the progress obj start
# but we can't do that w/o making it do 2 connects, which sucks
# so we cheat and stuff it in here in the hdr_retrieve
- if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
- length = buf.split(':')[1]
- self.size = int(length)
- elif self.scheme in ['ftp']:
+ if self.scheme in [b'http', b'https']:
+ if buf.lower().find(b'content-length:') != -1:
+ length = buf.split(b':')[1]
+ self.size = int(length)
+ elif (self.append or self.opts.range) and not self._hdr_dump and b' 200 OK ' in buf:
+ # reget was attempted but server sends it all
+ # undo what we did in _build_range()
+ self.append = False
+ self.reget_time = None
+ self._amount_read = 0
+ self._reget_length = 0
+ self._range = self.opts.range
+ self.fo.truncate(0)
+ elif self.scheme in [b'ftp']:
s = None
- if buf.startswith('213 '):
+ if buf.startswith(b'213 '):
s = buf[3:].strip()
- elif buf.startswith('150 '):
+ if len(s) >= 14:
+ s = None # ignore MDTM responses
+ elif buf.startswith(b'150 '):
s = parse150(buf)
if s:
self.size = int(s)
-
+
+ if buf.lower().find(b'location') != -1:
+ location = b':'.join(buf.split(b':')[1:])
+ location = location.strip()
+ self.scheme = urlparse.urlsplit(location)[0]
+ self.url = location
+
+ self._hdr_dump += buf
+ if len(self._hdr_dump) != 0 and buf == b'\r\n':
+ self._hdr_ended = True
+ if DEBUG: DEBUG.debug('header ended:')
+
return len(buf)
except KeyboardInterrupt:
return pycurl.READFUNC_ABORT
@@ -1112,12 +1447,14 @@ class PyCurlFileObject():
def _return_hdr_obj(self):
if self._parsed_hdr:
return self._parsed_hdr
- statusend = self._hdr_dump.find('\n')
+ statusend = self._hdr_dump.find(b'\n')
+        statusend += 1 # step past the newline that ends the status line
hdrfp = StringIO()
-        hdrfp.write(self._hdr_dump[statusend:])
-        self._parsed_hdr = mimetools.Message(hdrfp)
+        hdrfp.write(self._hdr_dump[statusend:].decode('utf-8', 'replace'))
+        hdrfp.seek(0)
+        self._parsed_hdr = email.message_from_file(hdrfp)
return self._parsed_hdr
-
+
hdr = property(_return_hdr_obj)
http_code = property(fget=
lambda self: self.curl_obj.getinfo(pycurl.RESPONSE_CODE))
@@ -1127,6 +1464,9 @@ class PyCurlFileObject():
if not opts:
opts = self.opts
+ # keepalives
+ if not opts.keepalive:
+ self.curl_obj.setopt(pycurl.FORBID_REUSE, 1)
# defaults we're always going to set
self.curl_obj.setopt(pycurl.NOPROGRESS, False)
@@ -1136,172 +1476,219 @@ class PyCurlFileObject():
self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
self.curl_obj.setopt(pycurl.FAILONERROR, True)
self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
-
- if DEBUG:
+ self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
+
+ if DEBUG and DEBUG.level <= 10:
self.curl_obj.setopt(pycurl.VERBOSE, True)
if opts.user_agent:
self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
-
+ if opts.ip_resolve:
+ # Default is: IPRESOLVE_WHATEVER
+ ipr = opts.ip_resolve.lower()
+ if ipr == 'whatever': # Do we need this?
+ self.curl_obj.setopt(pycurl.IPRESOLVE,pycurl.IPRESOLVE_WHATEVER)
+ if ipr == 'ipv4':
+ self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
+ if ipr == 'ipv6':
+ self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V6)
+
# maybe to be options later
self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
self.curl_obj.setopt(pycurl.MAXREDIRS, 5)
-
+
# timeouts
timeout = 300
- if opts.timeout:
- timeout = int(opts.timeout)
- self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+ if hasattr(opts, 'timeout'):
+ timeout = int(opts.timeout or 0)
+ self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+ self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, opts.minrate or 1000)
+ self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout)
# ssl options
- if self.scheme == 'https':
+ if self.scheme == b'https':
if opts.ssl_ca_cert: # this may do ZERO with nss according to curl docs
self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
- self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host)
+ if opts.ssl_verify_host: # 1 is meaningless to curl
+ self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, 2)
if opts.ssl_key:
self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key)
if opts.ssl_key_type:
self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type)
if opts.ssl_cert:
self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert)
- if opts.ssl_cert_type:
+ # if we have a client side cert - turn off reuse b/c nss is odd
+ self.curl_obj.setopt(pycurl.FORBID_REUSE, 1)
+ if opts.ssl_cert_type:
self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
if opts.ssl_key_pass:
self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass)
#headers:
- if opts.http_headers and self.scheme in ('http', 'https'):
+ if self.scheme in (b'http', b'https'):
headers = []
- for (tag, content) in opts.http_headers:
- headers.append('%s:%s' % (tag, content))
- self.curl_obj.setopt(pycurl.HTTPHEADER, headers)
+ if opts.http_headers is not None:
+ for (tag, content) in opts.http_headers:
+ headers.append('%s:%s' % (tag, content))
+ if opts.no_cache:
+ headers.append('Pragma:no-cache')
+ if headers:
+ self.curl_obj.setopt(pycurl.HTTPHEADER, headers)
# ranges:
if opts.range or opts.reget:
range_str = self._build_range()
if range_str:
self.curl_obj.setopt(pycurl.RANGE, range_str)
-
+
# throttle/bandwidth
if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
-
- # proxy settings
- if opts.proxies:
- for (scheme, proxy) in opts.proxies.items():
- if self.scheme in ('ftp'): # only set the ftp proxy for ftp items
- if scheme not in ('ftp'):
- continue
- else:
- if proxy == '_none_': proxy = ""
- self.curl_obj.setopt(pycurl.PROXY, proxy)
- elif self.scheme in ('http', 'https'):
- if scheme not in ('http', 'https'):
- continue
- else:
- if proxy == '_none_': proxy = ""
- self.curl_obj.setopt(pycurl.PROXY, proxy)
-
- # FIXME username/password/auth settings
+
+ # proxy
+ if opts.proxy is not None:
+ self.curl_obj.setopt(pycurl.PROXY, opts.proxy)
+ self.curl_obj.setopt(pycurl.PROXYAUTH,
+ # All but Kerberos. BZ 769254
+ pycurl.HTTPAUTH_ANY - pycurl.HTTPAUTH_GSSNEGOTIATE)
+
+            if opts.username and opts.password:
+                if self.scheme in (b'http', b'https'):
+                    self.curl_obj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY)
+                # apparently when applied as curlopts these do not require
+                # quoting of any kind
+                userpwd = '%s:%s' % (opts.username, opts.password)
+                self.curl_obj.setopt(pycurl.USERPWD, userpwd)
#posts - simple - expects the fields as they are
if opts.data:
self.curl_obj.setopt(pycurl.POST, True)
- self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data))
-
+ self.curl_obj.setopt(pycurl.POSTFIELDS, _to_utf8(opts.data))
+
+ # ftp
+ if opts.ftp_disable_epsv:
+ self.curl_obj.setopt(pycurl.FTP_USE_EPSV, False)
+
# our url
self.curl_obj.setopt(pycurl.URL, self.url)
-
-
+
+
def _do_perform(self):
if self._complete:
return
-
+
try:
self.curl_obj.perform()
- except pycurl.error, e:
+ except pycurl.error as e:
# XXX - break some of these out a bit more clearly
- # to other URLGrabErrors from
+ # to other URLGrabErrors from
# http://curl.haxx.se/libcurl/c/libcurl-errors.html
# this covers e.args[0] == 22 pretty well - which will be common
-
+
code = self.http_code
errcode = e.args[0]
+ errurl = _urlunquote_convert(self.url)
+
if self._error[0]:
errcode = self._error[0]
-
- if errcode == 23 and code >= 200 and code < 299:
- err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
- err.url = self.url
-
+
+ if errcode == 23 and 200 <= code <= 299:
# this is probably wrong but ultimately this is what happens
# we have a legit http code and a pycurl 'writer failed' code
# which almost always means something aborted it from outside
# since we cannot know what it is -I'm banking on it being
- # a ctrl-c. XXXX - if there's a way of going back two raises to
+ # a ctrl-c. XXXX - if there's a way of going back two raises to
# figure out what aborted the pycurl process FIXME
- raise KeyboardInterrupt
-
+ raise getattr(self, '_cb_error', KeyboardInterrupt)
+
elif errcode == 28:
- err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
- err.url = self.url
+ err = URLGrabError(12, _('Timeout on %s: %s') % (errurl, e))
+ err.url = errurl
raise err
- elif errcode == 35:
- msg = _("problem making ssl connection")
- err = URLGrabError(14, msg)
- err.url = self.url
- raise err
- elif errcode == 37:
- msg = _("Could not open/read %s") % (self.url)
- err = URLGrabError(14, msg)
- err.url = self.url
- raise err
-
+
elif errcode == 42:
- err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
- err.url = self.url
# this is probably wrong but ultimately this is what happens
# we have a legit http code and a pycurl 'writer failed' code
# which almost always means something aborted it from outside
# since we cannot know what it is -I'm banking on it being
- # a ctrl-c. XXXX - if there's a way of going back two raises to
+ # a ctrl-c. XXXX - if there's a way of going back two raises to
# figure out what aborted the pycurl process FIXME
raise KeyboardInterrupt
-
- elif errcode == 58:
- msg = _("problem with the local client certificate")
- err = URLGrabError(14, msg)
- err.url = self.url
- raise err
- elif errcode == 60:
- msg = _("client cert cannot be verified or client cert incorrect")
+ else:
+ pyerr2str = { 5 : _("Couldn't resolve proxy"),
+ 6 : _("Couldn't resolve host"),
+ 7 : _("Couldn't connect"),
+ 8 : _("Bad reply to FTP server"),
+ 9 : _("Access denied"),
+ 11 : _("Bad reply to FTP pass"),
+ 13 : _("Bad reply to FTP pasv"),
+ 14 : _("Bad reply to FTP 227"),
+ 15 : _("Couldn't get FTP host"),
+ 17 : _("Couldn't set FTP type"),
+ 18 : _("Partial file"),
+ 19 : _("FTP RETR command failed"),
+ 22 : _("HTTP returned error"),
+ 23 : _("Write error"),
+ 25 : _("Upload failed"),
+ 26 : _("Read error"),
+ 27 : _("Out of Memory"),
+ 28 : _("Operation timed out"),
+ 30 : _("FTP PORT command failed"),
+ 31 : _("FTP REST command failed"),
+ 33 : _("Range failed"),
+ 34 : _("HTTP POST failed"),
+ 35 : _("SSL CONNECT failed"),
+ 36 : _("Couldn't resume download"),
+ 37 : _("Couldn't read file"),
+ 42 : _("Aborted by callback"),
+ 47 : _("Too many redirects"),
+ 51 : _("Peer certificate failed verification"),
+ 52 : _("Got nothing: SSL certificate expired?"),
+ 53 : _("SSL engine not found"),
+ 54 : _("SSL engine set failed"),
+ 55 : _("Network error send()"),
+ 56 : _("Network error recv()"),
+ 58 : _("Local certificate failed"),
+ 59 : _("SSL set cipher failed"),
+ 60 : _("Local CA certificate failed"),
+ 61 : _("HTTP bad transfer encoding"),
+ 63 : _("Maximum file size exceeded"),
+ 64 : _("FTP SSL failed"),
+ 67 : _("Authentication failure"),
+ 70 : _("Out of disk space on server"),
+ 73 : _("Remove file exists"),
+ 77 : _("Problem with the SSL CA cert (path? access rights?)"),
+ }
+ errstr = str(e.args[1]) or pyerr2str.get(errcode, '<Unknown>')
+ if code and not 200 <= code <= 299:
+ scheme = _bytes_repr(self.scheme)
+ msg = '%s Error %d - %s' % (scheme.upper(), code,
+ scheme in ('http', 'https')
+ and responses.get(code) or errstr)
+ else:
+ msg = 'curl#%s - "%s"' % (errcode, errstr)
+ code = errcode
+
err = URLGrabError(14, msg)
- err.url = self.url
+ err.url = errurl
+ err.code = code
raise err
-
- elif errcode == 63:
- if self._error[1]:
- msg = self._error[1]
- else:
- msg = _("Max download size exceeded on %s") % (self.url)
+
+ else:
+ if self._error[1]:
+ msg = self._error[1]
-            err = URLGrabError(14, msg)
-            err.url = self.url
-            raise err
+                err = URLGrabError(14, msg)
+                err.url = _urlunquote_convert(self.url)
+                raise err
-
- elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
- msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
- else:
- msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
- code = errcode
- err = URLGrabError(14, msg)
- err.code = code
- err.exception = e
- raise err
def _do_open(self):
- self.curl_obj = _curl_cache
+ if hasattr(self.opts, 'curl_obj') and self.opts.curl_obj is not None:
+ self.curl_obj = self.opts.curl_obj
+ else:
+ self.curl_obj = _curl_cache
self.curl_obj.reset() # reset all old settings away, just in case
# setup any ranges
self._set_opts()
@@ -1310,11 +1697,11 @@ class PyCurlFileObject():
def _add_headers(self):
pass
-
+
def _build_range(self):
reget_length = 0
rt = None
- if self.opts.reget and type(self.filename) in types.StringTypes:
+ if self.opts.reget and isinstance(self.filename, string_types):
# we have reget turned on and we're dumping to a file
try:
s = os.stat(self.filename)
@@ -1325,15 +1712,19 @@ class PyCurlFileObject():
reget_length = s[stat.ST_SIZE]
# Set initial length when regetting
- self._amount_read = reget_length
+ self._amount_read = reget_length
self._reget_length = reget_length # set where we started from, too
rt = reget_length, ''
self.append = 1
-
+
if self.opts.range:
rt = self.opts.range
- if rt[0]: rt = (rt[0] + reget_length, rt[1])
+
+ if rt[0] is None:
+ rt = (0, rt[1])
+ rt = (rt[0] + reget_length, rt[1])
+
if rt:
header = range_tuple_to_header(rt)
@@ -1345,10 +1736,10 @@ class PyCurlFileObject():
def _make_request(self, req, opener):
#XXXX
# This doesn't do anything really, but we could use this
- # instead of do_open() to catch a lot of crap errors as
+ # instead of do_open() to catch a lot of crap errors as
# mstenner did before here
return (self.fo, self.hdr)
-
+
try:
if self.opts.timeout:
old_to = socket.getdefaulttimeout()
@@ -1360,22 +1751,22 @@ class PyCurlFileObject():
else:
fo = opener.open(req)
hdr = fo.info()
- except ValueError, e:
+ except ValueError as e:
err = URLGrabError(1, _('Bad URL: %s : %s') % (self.url, e, ))
err.url = self.url
raise err
- except RangeError, e:
+ except RangeError as e:
err = URLGrabError(9, _('%s on %s') % (e, self.url))
err.url = self.url
raise err
- except urllib2.HTTPError, e:
+ except HTTPError as e:
new_e = URLGrabError(14, _('%s on %s') % (e, self.url))
new_e.code = e.code
new_e.exception = e
new_e.url = self.url
raise new_e
- except IOError, e:
+ except IOError as e:
if hasattr(e, 'reason') and isinstance(e.reason, socket.timeout):
err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
err.url = self.url
@@ -1385,41 +1776,41 @@ class PyCurlFileObject():
err.url = self.url
raise err
- except OSError, e:
+ except OSError as e:
err = URLGrabError(5, _('%s on %s') % (e, self.url))
err.url = self.url
raise err
- except HTTPException, e:
- err = URLGrabError(7, _('HTTP Exception (%s) on %s: %s') % \
- (e.__class__.__name__, self.url, e))
+ except HTTPException as e:
+ err = URLGrabError(7, _('HTTP Exception (%s) on %s: %s')
+ % (e.__class__.__name__, self.url, e))
err.url = self.url
raise err
else:
return (fo, hdr)
-
+
def _do_grab(self):
"""dump the file to a filename or StringIO buffer"""
if self._complete:
return
_was_filename = False
- if type(self.filename) in types.StringTypes and self.filename:
+ if isinstance(self.filename, string_types) and self.filename:
_was_filename = True
self._prog_reportname = str(self.filename)
self._prog_basename = os.path.basename(self.filename)
-
+
if self.append: mode = 'ab'
else: mode = 'wb'
- if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
- (self.filename, mode))
+ if DEBUG: DEBUG.info('opening local file "%s" with mode %s'
+ % (self.filename, mode))
try:
self.fo = open(self.filename, mode)
- except IOError, e:
- err = URLGrabError(16, _(\
- 'error opening local file from %s, IOError: %s') % (self.url, e))
+ except IOError as e:
+ err = URLGrabError(16, _('error opening local file from %s, IOError: %s')
+ % (self.url, e))
err.url = self.url
raise err
@@ -1427,34 +1818,58 @@ class PyCurlFileObject():
self._prog_reportname = 'MEMORY'
self._prog_basename = 'MEMORY'
-
- self.fo = StringIO()
+
+ self.fo = BytesIO()
# if this is to be a tempfile instead....
# it just makes crap in the tempdir
#fh, self._temp_name = mkstemp()
#self.fo = open(self._temp_name, 'wb')
-
- self._do_perform()
-
-
+ try:
+ self._do_perform()
+ except URLGrabError as e:
+ self.fo.flush()
+ self.fo.close()
+ raise e
if _was_filename:
# close it up
self.fo.flush()
self.fo.close()
+
+ # Set the URL where we got it from:
+ if xattr is not None:
+ # See: http://www.freedesktop.org/wiki/CommonExtendedAttributes
+ try:
+ xattr.set(self.filename, 'user.xdg.origin.url', self.url)
+ except:
+                    pass # URL too long raises IOError; ignore any xattr failure.
+
# set the time
mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
if mod_time != -1:
- os.utime(self.filename, (mod_time, mod_time))
+ try:
+ os.utime(self.filename, (mod_time, mod_time))
+ except OSError as e:
+ err = URLGrabError(16, _('error setting timestamp on file %s from %s, OSError: %s')
+ % (self.filename, self.url, e))
+ err.url = self.url
+ raise err
# re open it
- self.fo = open(self.filename, 'r')
+ try:
+ self.fo = open(self.filename, 'r')
+ except IOError as e:
+ err = URLGrabError(16, _('error opening file from %s, IOError: %s')
+ % (self.url, e))
+ err.url = self.url
+ raise err
+
else:
#self.fo = open(self._temp_name, 'r')
self.fo.seek(0)
self._complete = True
-
+
def _fill_buffer(self, amt=None):
"""fill the buffer to contain at least 'amt' bytes by reading
from the underlying file object. If amt is None, then it will
@@ -1471,9 +1886,9 @@ class PyCurlFileObject():
# if we've made it here, then we don't have enough in the buffer
# and we need to read more.
-
+
if not self._complete: self._do_grab() #XXX cheater - change on ranges
-
+
buf = [self._rbuf]
bufsize = len(self._rbuf)
while amt is None or amt:
@@ -1483,23 +1898,23 @@ class PyCurlFileObject():
(time.time() - self._ttime)
if diff > 0: time.sleep(diff)
self._ttime = time.time()
-
+
# now read some data, up to self._rbufsize
if amt is None: readamount = self._rbufsize
else: readamount = min(amt, self._rbufsize)
try:
new = self.fo.read(readamount)
- except socket.error, e:
+ except socket.error as e:
err = URLGrabError(4, _('Socket Error on %s: %s') % (self.url, e))
err.url = self.url
raise err
- except socket.timeout, e:
+ except socket.timeout as e:
-            raise URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
+            err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
err.url = self.url
raise err
- except IOError, e:
+ except IOError as e:
-            raise URLGrabError(4, _('IOError on %s: %s') %(self.url, e))
+            err = URLGrabError(4, _('IOError on %s: %s') % (self.url, e))
err.url = self.url
raise err
@@ -1515,7 +1930,7 @@ class PyCurlFileObject():
#if self.opts.progress_obj:
# self.opts.progress_obj.update(self._amount_read)
- self._rbuf = string.join(buf, '')
+ self._rbuf = b''.join(buf)
return
def _progress_update(self, download_total, downloaded, upload_total, uploaded):
@@ -1526,35 +1941,31 @@ class PyCurlFileObject():
if self._prog_running:
downloaded += self._reget_length
self.opts.progress_obj.update(downloaded)
- except KeyboardInterrupt:
+ except (KeyboardInterrupt, IOError):
return -1
-
+
def _over_max_size(self, cur, max_size=None):
if not max_size:
- max_size = self.size
- if self.opts.size: # if we set an opts size use that, no matter what
- max_size = self.opts.size
+ if not self.opts.size:
+ max_size = self.size
+ else:
+ max_size = self.opts.size
+
if not max_size: return False # if we have None for all of the Max then this is dumb
- if cur > max_size + max_size*.10:
+
+ if cur > int(float(max_size) * 1.10):
msg = _("Downloaded more than max size for %s: %s > %s") \
% (self.url, cur, max_size)
self._error = (pycurl.E_FILESIZE_EXCEEDED, msg)
return True
return False
-
- def _to_utf8(self, obj, errors='replace'):
- '''convert 'unicode' to an encoded utf-8 byte string '''
- # stolen from yum.i18n
- if isinstance(obj, unicode):
- obj = obj.encode('utf-8', errors)
- return obj
-
+
def read(self, amt=None):
self._fill_buffer(amt)
if amt is None:
- s, self._rbuf = self._rbuf, ''
+ s, self._rbuf = self._rbuf, b''
else:
s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:]
return s
@@ -1562,13 +1973,13 @@ class PyCurlFileObject():
def readline(self, limit=-1):
if not self._complete: self._do_grab()
return self.fo.readline()
-
- i = string.find(self._rbuf, '\n')
+
+        i = self._rbuf.find(b'\n')
while i < 0 and not (0 < limit <= len(self._rbuf)):
L = len(self._rbuf)
self._fill_buffer(L + self._rbufsize)
if not len(self._rbuf) > L: break
- i = string.find(self._rbuf, '\n', L)
+            i = self._rbuf.find(b'\n', L)
if i < 0: i = len(self._rbuf)
else: i = i+1
@@ -1581,10 +1992,25 @@ class PyCurlFileObject():
if self._prog_running:
self.opts.progress_obj.end(self._amount_read)
self.fo.close()
-
+ def geturl(self):
+ """ Provide the geturl() method, used to be got from
+ urllib.addinfourl, via. urllib.URLopener.* """
+ return self.url
+
+if hasattr(pycurl, 'GLOBAL_ACK_EINTR'):
+ # fail immediately on ctrl-c
+ pycurl.global_init(pycurl.GLOBAL_DEFAULT | pycurl.GLOBAL_ACK_EINTR)
_curl_cache = pycurl.Curl() # make one and reuse it over and over and over
+def reset_curl_obj():
+ """To make sure curl has reread the network/dns info we force a reload"""
+ global _curl_cache
+ _curl_cache.close()
+ _curl_cache = pycurl.Curl()
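+
+# A long-running caller might, for example, invoke reset_curl_obj() after
+# a network or DNS configuration change so that no stale connection state
+# or cached resolver results are reused.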
+
+_libproxy_cache = None
+
#####################################################################
# DEPRECATED FUNCTIONS
@@ -1603,90 +2029,601 @@ def set_progress_obj(new_progress_obj):
def set_user_agent(new_user_agent):
"""Deprecated. Use: default_grabber.user_agent = new_user_agent"""
default_grabber.user_agent = new_user_agent
-
+
def retrygrab(url, filename=None, copy_local=0, close_connection=0,
progress_obj=None, throttle=None, bandwidth=None,
numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None):
"""Deprecated. Use: urlgrab() with the retry arg instead"""
- kwargs = {'copy_local' : copy_local,
+ kwargs = {'copy_local' : copy_local,
'close_connection' : close_connection,
- 'progress_obj' : progress_obj,
- 'throttle' : throttle,
+ 'progress_obj' : progress_obj,
+ 'throttle' : throttle,
'bandwidth' : bandwidth,
'retry' : numtries,
'retrycodes' : retrycodes,
- 'checkfunc' : checkfunc
+ 'checkfunc' : checkfunc
}
return urlgrab(url, filename, **kwargs)
-
+
+#####################################################################
+# Serializer + parser: A replacement of the rather bulky Json code.
+#
+# - handles basic python literals, lists and tuples.
+# - serialized strings never contain ' ' or '\n'
+#
+#####################################################################
+
+def _quoter(c):
+ if c in '%[(,)] \n':
+ return '%%%02x' % ord(c)
+ return c
+
+def _dumps(v):
+ if v is None: return 'None'
+ if v is True: return 'True'
+ if v is False: return 'False'
+ if isinstance(v, numbers.Number):
+ return str(v)
+ if isinstance(v, (str, text_type, bytes)):
+        # standardize to str on both py2 and py3
+ if sys.version_info < (3,):
+ if isinstance(v, text_type):
+ v = v.encode('utf8')
+ else:
+ if isinstance(v, bytes):
+ v = v.decode('utf8')
+ return "'%s'" % ''.join(map(_quoter, v))
+ if isinstance(v, tuple):
+ return "(%s)" % ','.join(map(_dumps, v))
+ if isinstance(v, list):
+ return "[%s]" % ','.join(map(_dumps, v))
+ raise TypeError("Can't serialize %s" % v)
+
+def _loads(s):
+ def decode(v):
+ if v == 'None': return None
+ if v == 'True': return True
+ if v == 'False': return False
+ try: return int(v)
+ except ValueError: pass
+ try: return float(v)
+ except ValueError: pass
+ if len(v) >= 2 and v[0] == v[-1] == "'":
+ ret = []; i = 1
+ while True:
+ j = v.find('%', i)
+ ret.append(v[i:j]) # skips the final "'"
+ if j == -1: break
+ ret.append(chr(int(v[j + 1:j + 3], 16)))
+ i = j + 3
+ v = ''.join(ret)
+ return v
+ stk = None
+ l = []
+ i = j = 0
+ while True:
+ if j == len(s) or s[j] in ',)]':
+ if j > i:
+ l.append(decode(s[i:j]))
+ if j == len(s): break
+ if s[j] in ')]':
+ if s[j] == ')':
+ l = tuple(l)
+ stk[0].append(l)
+ l, stk = stk
+ i = j = j + 1
+ elif s[j] in '[(':
+ stk = l, stk
+ l = []
+ i = j = j + 1
+ else:
+ j += 1 # safe because '[(,)]' are quoted
+ if stk: raise ValueError
+ if len(l) == 1: l = l[0]
+ return l
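+
+# A quick round-trip illustration (the values are arbitrary):
+#
+#   _loads(_dumps(('a b', [1, 2.5, None])))  ->  ('a b', [1, 2.5, None])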
+
+
+#####################################################################
+# External downloader process
+#####################################################################
+
+def _readlines(fd):
+ buf = os.read(fd, 4096)
+ if not buf: return None
+ # whole lines only, no buffering
+ while not buf.endswith(b'\n'):
+ buf += os.read(fd, 4096)
+ return buf[:-1].split(b'\n')
+
+import subprocess
+
+class _ExternalDownloader:
+ def __init__(self):
+        # raise if urlgrabber-ext-down is not installed so the user gets
+        # an obvious error message instead of "[Errno 5] [Errno 2] No such
+        # file or directory"
+ if not os.path.exists('/usr/libexec/urlgrabber-ext-down') and os.getenv('URLGRABBER_EXT_DOWN') is None:
+ raise OSError('"/usr/libexec/urlgrabber-ext-down" is not installed')
+ urlgrabber_path = (os.getenv('URLGRABBER_EXT_DOWN', None)
+ or '/usr/libexec/urlgrabber-ext-down')
+ self.popen = subprocess.Popen(
+ urlgrabber_path,
+ stdin = subprocess.PIPE,
+ stdout = subprocess.PIPE,
+ )
+ self.stdin = self.popen.stdin.fileno()
+ self.stdout = self.popen.stdout.fileno()
+ self.running = {}
+ self.cnt = 0
+
+ # list of options we pass to downloader
+ _options = (
+ 'url', 'filename',
+ 'timeout', 'minrate', 'close_connection', 'keepalive',
+ 'throttle', 'bandwidth', 'range', 'reget',
+ 'user_agent', 'http_headers', 'ftp_headers',
+ 'proxy', 'prefix', 'username', 'password',
+ 'ssl_ca_cert',
+ 'ssl_cert', 'ssl_cert_type',
+ 'ssl_key', 'ssl_key_type',
+ 'ssl_key_pass',
+ 'ssl_verify_peer', 'ssl_verify_host',
+ 'size', 'max_header_size', 'ip_resolve',
+ 'ftp_disable_epsv',
+ 'no_cache',
+ )
+
+ def start(self, opts):
+ arg = []
+ for k in self._options:
+ v = getattr(opts, k)
+ if v is None: continue
+ arg.append('%s=%s' % (k, _dumps(v)))
+ if opts.progress_obj and opts.multi_progress_obj:
+ arg.append('progress_obj=True')
+ arg = ' '.join(arg)
+ if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url)
+
+ self.cnt += 1
+ self.running[self.cnt] = opts
+ os.write(self.stdin, (arg +'\n').encode('utf8'))
+
+ def perform(self):
+ ret = []
+ lines = _readlines(self.stdout)
+ if not lines:
+ if DEBUG: DEBUG.info('downloader died')
+ raise KeyboardInterrupt
+ for line in lines:
+ # parse downloader output
+ line = line.split(b' ', 6)
+ _id, size = map(int, line[:2])
+ if len(line) == 2:
+ self.running[_id]._progress.update(size)
+ continue
+ # job done
+ opts = self.running.pop(_id)
+ if line[4] == b'OK':
+ ug_err = None
+ if DEBUG: DEBUG.info('success')
+ else:
+ ug_err = URLGrabError(int(line[4]), line[6])
+ if line[5] != b'0':
+ ug_err.code = int(line[5])
+ if DEBUG: DEBUG.info('failure: %s', ug_err)
+ _TH.update(opts.url, int(line[2]), float(line[3]), ug_err, opts.async_[0])
+ ret.append((opts, size, ug_err))
+ return ret
+
+ def abort(self):
+ self.popen.stdin.close()
+ self.popen.stdout.close()
+ self.popen.wait()
+
+class _ExternalDownloaderPool:
+ def __init__(self):
+ self.epoll = select.epoll()
+ self.running = {}
+ self.cache = {}
+
+ def start(self, opts):
+ host = urlparse.urlsplit(opts.url).netloc
+ dl = self.cache.pop(host, None)
+ if not dl:
+ dl = _ExternalDownloader()
+ fl = fcntl.fcntl(dl.stdin, fcntl.F_GETFD)
+ fcntl.fcntl(dl.stdin, fcntl.F_SETFD, fl | fcntl.FD_CLOEXEC)
+ self.epoll.register(dl.stdout, select.EPOLLIN)
+ self.running[dl.stdout] = dl
+ dl.start(opts)
+
+ def perform(self):
+ ret = []
+ for fd, event in self.epoll.poll():
+ if event & select.EPOLLHUP:
+ if DEBUG: DEBUG.info('downloader died')
+ raise KeyboardInterrupt
+ assert event & select.EPOLLIN
+ done = self.running[fd].perform()
+ if not done: continue
+ assert len(done) == 1
+ ret.extend(done)
+
+ # dl finished, move it to the cache
+ host = urlparse.urlsplit(done[0][0].url).netloc
+ if host in self.cache: self.cache[host].abort()
+ self.epoll.unregister(fd)
+ self.cache[host] = self.running.pop(fd)
+ return ret
+
+ def abort(self):
+ for dl in self.running.values():
+ self.epoll.unregister(dl.stdout)
+ dl.abort()
+ for dl in self.cache.values():
+ dl.abort()
+
+
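+# The parent talks to urlgrabber-ext-down over a simple line protocol.
+# Requests go out as one 'key=value key=value ...' line per job (built in
+# _ExternalDownloader.start); replies come back whitespace-separated and
+# are parsed in perform():
+#
+#     <id> <size>                                           progress update
+#     <id> <size> <dl_size> <dl_time> OK                    job succeeded
+#     <id> <size> <dl_size> <dl_time> <errno> <code> <msg>  job failed
+#
+# The pool parks a finished downloader in self.cache keyed by netloc, so
+# the next request to the same host can reuse the helper process (and any
+# keepalive connections it holds).
+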
+#####################################################################
+# High level async API
+#####################################################################
+
+_async_queue = []
+
+def parallel_wait(meter=None):
+ '''Process queued requests in parallel.
+ '''
+
+ # calculate total sizes
+ meters = {}
+ for opts in _async_queue:
+ if opts.progress_obj and opts.multi_progress_obj:
+ count, total = meters.get(opts.multi_progress_obj) or (0, 0)
+ meters[opts.multi_progress_obj] = count + 1, total + opts.size
+
+ # start multi-file meters
+ for meter in meters:
+ count, total = meters[meter]
+ meter.start(count, total)
+
+ dl = _ExternalDownloaderPool()
+ host_con = {} # current host connection counts
+ single = set() # hosts in single connection mode
+ retry_queue = []
+
+ def start(opts, tries):
+ opts.tries = tries
+ try:
+ dl.start(opts)
+ except OSError as e:
+ # can't spawn downloader, give up immediately
+ opts.exception = URLGrabError(5, exception2msg(e))
+ _run_callback(opts.failfunc, opts)
+ return
+
+ key, limit = opts.async_
+ host_con[key] = host_con.get(key, 0) + 1
+ if opts.progress_obj:
+ if opts.multi_progress_obj:
+ opts._progress = opts.multi_progress_obj.newMeter()
+ opts._progress.start(text=opts.text)
+ else:
+ opts._progress = time.time() # no updates
+
+ def perform():
+ for opts, size, ug_err in dl.perform():
+ key, limit = opts.async_
+ host_con[key] -= 1
+
+ if ug_err is None:
+ if opts.checkfunc:
+ try:
+ _run_callback(opts.checkfunc, opts)
+ except URLGrabError as e:
+ ug_err = e
+
+ if opts.progress_obj:
+ if opts.multi_progress_obj:
+ if ug_err:
+ opts._progress.failure(None)
+ else:
+ opts.multi_progress_obj.re.total += size - opts.size # correct totals
+ opts._progress.end(size)
+ opts.multi_progress_obj.removeMeter(opts._progress)
+ else:
+ opts.progress_obj.start(text=opts.text, now=opts._progress)
+ opts.progress_obj.update(size)
+ opts.progress_obj.end(size)
+ del opts._progress
+
+ if ug_err is None:
+ continue
+ if limit != 1 and key not in single and ug_err.errno in (12, 14):
+                # One possible cause is a connection-limited server.
+ # Turn on the max_connections=1 override. BZ 853432
+ if DEBUG: DEBUG.info('max_connections(%s) %s => 1', key, limit)
+ single.add(key)
+ # When using multi-downloader the parent's _curl_cache
+ # object is idle. Kill it, as it might use keepalive=1.
+ reset_curl_obj()
+
+ retry = opts.retry or 0
+ if opts.failure_callback:
+ opts.exception = ug_err
+ try:
+ _run_callback(opts.failure_callback, opts)
+ except URLGrabError as e:
+ ug_err = e
+ retry = 0 # no retries
+ if opts.tries < retry and ug_err.errno in opts.retrycodes:
+ if ug_err.errno < 0 and opts.retry_no_cache:
+ opts.no_cache = True
+ start(opts, opts.tries + 1) # simple retry
+ continue
+
+ if opts.mirror_group:
+ mg, errors, failed, removed = opts.mirror_group
+ errors.append((opts.url, exception2msg(ug_err)))
+ failed[key] = failed.get(key, 0) + 1
+ opts.mirror = key
+ opts.exception = ug_err
+ action = mg.default_action or {}
+ if mg.failure_callback:
+ opts.tries = len(errors)
+ action = dict(action) # update only the copy
+ action.update(_run_callback(mg.failure_callback, opts))
+ if not action.get('fail', 0):
+ # mask this mirror and retry
+ if action.get('remove', 1):
+ removed.add(key)
+ retry_queue.append(opts)
+ continue
+ # fail=1 from callback
+ ug_err.errors = errors
+
+ # urlgrab failed
+ opts.exception = ug_err
+ _run_callback(opts.failfunc, opts)
+
+ try:
+ retry_idx = idx = 0
+ while True:
+ if retry_idx < len(retry_queue):
+ # retries first
+ opts = retry_queue[retry_idx]
+ retry_idx += 1
+ elif idx < len(_async_queue):
+ # handle next request
+ opts = _async_queue[idx]
+ idx += 1
+ else:
+ # both queues are empty
+ if not dl.running: break
+ perform()
+ continue
+
+ # check global limit
+ while len(dl.running) >= default_grabber.opts.max_connections:
+ perform()
+ if DEBUG:
+ DEBUG.info('max_connections: %d/%d', len(dl.running), default_grabber.opts.max_connections)
+
+ if opts.mirror_group:
+ mg, errors, failed, removed = opts.mirror_group
+
+ # find the best mirror
+ best = None
+ best_speed = None
+ for mirror in mg.mirrors:
+ key = mirror['mirror']
+ if key in removed: continue
+
+ # estimate mirror speed
+ speed, fail = _TH.estimate(key)
+ speed /= 1 + host_con.get(key, 0)
+
+ # order by: least failures, private flag, best speed
+ # ignore 'private' flag if there were failures
+ private = not fail and mirror.get('kwargs', {}).get('private', False)
+ speed = -failed.get(key, 0), private, speed
+ if best is None or speed > best_speed:
+ best = mirror
+ best_speed = speed
+
+ if best is None:
+ opts.exception = URLGrabError(256, _('No more mirrors to try.'))
+ opts.exception.errors = errors
+ _run_callback(opts.failfunc, opts)
+ continue
+
+ # update the grabber object, apply mirror kwargs
+ grabber = best.get('grabber') or mg.grabber
+ opts.delegate = grabber.opts.derive(**best.get('kwargs', {}))
+
+ # update the current mirror and limit
+ key = best['mirror']
+ limit = best.get('kwargs', {}).get('max_connections')
+ opts.async_ = key, limit
+
+ # update URL and proxy
+ url = mg._join_url(key, opts.relative_url)
+ url, parts = opts.urlparser.parse(url, opts)
+ opts.find_proxy(url, parts[0])
+ opts.url = url
+
+ # check host limit, then start
+ key, limit = opts.async_
+ if key in single:
+ limit = 1
+ while host_con.get(key, 0) >= (limit or 2):
+ perform()
+ if DEBUG:
+ DEBUG.info('max_connections(%s): %d/%s', key, host_con.get(key, 0), limit)
+
+ start(opts, 1)
+ except IOError as e:
+ if e.errno != 4: raise
+ raise KeyboardInterrupt
+
+ finally:
+ dl.abort()
+ for meter in meters:
+ meter.end()
+ del _async_queue[:]
+ _TH.save()
+
+
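+# A minimal sketch of driving the async path (URLs and the failfunc are
+# illustrative; urlgrab() with async_ set only queues the request on
+# _async_queue, and parallel_wait() then performs the downloads):
+#
+#     from urlgrabber.grabber import URLGrabber, parallel_wait
+#
+#     g = URLGrabber()
+#     for url, fn in [('http://example.com/a.rpm', 'a.rpm'),
+#                     ('http://example.com/b.rpm', 'b.rpm')]:
+#         # async_ = (connection pool key, per-host connection limit)
+#         g.urlgrab(url, fn, async_=('example.com', 2),
+#                   failfunc=lambda obj: print('failed:', obj.url))
+#     parallel_wait()
+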
+#####################################################################
+# Host bandwidth estimation
+#####################################################################
+
+class _TH:
+ hosts = {}
+ dirty = None
+
+ @staticmethod
+ def load():
+ filename = default_grabber.opts.timedhosts
+ if filename and _TH.dirty is None:
+ try:
+ now = int(time.time())
+ for line in open(filename):
+ try:
+ host, speed, fail, ts = line.rsplit(' ', 3)
+ _TH.hosts[host] = int(speed), int(fail), min(int(ts), now)
+ except ValueError:
+ if DEBUG: DEBUG.info('Error parsing timedhosts: line "%s"', line)
+ except IOError: pass
+ _TH.dirty = False
+
+ @staticmethod
+ def save():
+ filename = default_grabber.opts.timedhosts
+ if filename and _TH.dirty is True:
+ tmp = '%s.%d' % (filename, os.getpid())
+ try:
+ f = open(tmp, 'w')
+ for host in _TH.hosts:
+ f.write(host + ' %d %d %d\n' % _TH.hosts[host])
+ f.close()
+ os.rename(tmp, filename)
+ except IOError: pass
+ _TH.dirty = False
+
+ @staticmethod
+ def update(url, dl_size, dl_time, ug_err, baseurl=None):
+ # Use hostname from URL. If it's a file:// URL, use baseurl.
+ # If no baseurl, do not update timedhosts.
+ host = urlparse.urlsplit(url).netloc.split(b'@')[-1] or baseurl
+ if not host: return
+
+ _TH.load()
+ speed, fail, ts = _TH.hosts.get(host) or (0, 0, 0)
+ now = time.time()
+
+ if ug_err is None:
+ # defer first update if the file was small. BZ 851178.
+ if not ts and dl_size < 1e6: return
+ # k1: the older, the less useful
+ # k2: <500ms readings are less reliable
+ # speeds vary, use 10:1 smoothing
+ k1 = 2**((ts - now) / default_grabber.opts.half_life)
+ k2 = min(dl_time / .500, 1.0) / 10
+ if k2 > 0:
+ speed = (k1 * speed + k2 * dl_size / dl_time) / (k1 + k2)
+ fail = 0
+ elif getattr(ug_err, 'code', None) == 404:
+ if not ts: return # 1st update, avoid speed=0
+ fail = 0 # alive, at least
+ else:
+ fail += 1 # seems dead
+
+ _TH.hosts[host] = speed, fail, now
+ _TH.dirty = True
+
+ @staticmethod
+ def estimate(baseurl):
+ _TH.load()
+
+ # Use just the hostname, unless it's a file:// baseurl.
+ host = urlparse.urlsplit(baseurl).netloc.split(b'@')[-1] or baseurl
+
+ default_speed = default_grabber.opts.default_speed
+ try: speed, fail, ts = _TH.hosts[host]
+ except KeyError: return default_speed, 0
+
+ speed *= 2**-fail
+ k = 2**((ts - time.time()) / default_grabber.opts.half_life)
+ speed = k * speed + (1 - k) * default_speed
+ return speed, fail
+
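+# The arithmetic in update()/estimate() is easy to spot-check. With
+# illustrative numbers -- a host recorded at 1 MB/s with no failures,
+# whose record is exactly one half_life old, and default_speed=100e3:
+#
+#     speed = 1e6 * 2**-0              # failures would halve the estimate
+#     k = 2**(-1.0)                    # (ts - now) == -half_life, so k = 0.5
+#     k * speed + (1 - k) * 100e3      # -> 550000.0
+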
#####################################################################
# TESTING
def _main_test():
try: url, filename = sys.argv[1:3]
except ValueError:
- print 'usage:', sys.argv[0], \
- '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
- sys.exit()
+ print('usage:', sys.argv[0],
+ '<url> <filename> [copy_local=0|1] [close_connection=0|1]')
+ sys.exit(2)
kwargs = {}
for a in sys.argv[3:]:
- k, v = string.split(a, '=', 1)
+ k, v = a.split('=', 1)
kwargs[k] = int(v)
set_throttle(1.0)
set_bandwidth(32 * 1024)
- print "throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle,
- default_grabber.bandwidth)
+ print("throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle,
+ default_grabber.bandwidth))
- try: from progress import text_progress_meter
- except ImportError, e: pass
+ try: from .progress import text_progress_meter
+ except ImportError as e: pass
else: kwargs['progress_obj'] = text_progress_meter()
- try: name = apply(urlgrab, (url, filename), kwargs)
- except URLGrabError, e: print e
- else: print 'LOCAL FILE:', name
+ try: name = urlgrab(url, filename, **kwargs)
+ except URLGrabError as e: print(e)
+ else: print('LOCAL FILE:', name)
def _retry_test():
try: url, filename = sys.argv[1:3]
except ValueError:
- print 'usage:', sys.argv[0], \
- '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
- sys.exit()
+ print('usage:', sys.argv[0],
+ '<url> <filename> [copy_local=0|1] [close_connection=0|1]')
+ sys.exit(2)
kwargs = {}
for a in sys.argv[3:]:
- k, v = string.split(a, '=', 1)
+ k, v = a.split('=', 1)
kwargs[k] = int(v)
- try: from progress import text_progress_meter
- except ImportError, e: pass
+ try: from .progress import text_progress_meter
+ except ImportError as e: pass
else: kwargs['progress_obj'] = text_progress_meter()
def cfunc(filename, hello, there='foo'):
- print hello, there
+ print(hello, there)
import random
rnum = random.random()
if rnum < .5:
- print 'forcing retry'
+ print('forcing retry')
raise URLGrabError(-1, 'forcing retry')
if rnum < .75:
- print 'forcing failure'
+ print('forcing failure')
raise URLGrabError(-2, 'forcing immediate failure')
- print 'success'
+ print('success')
return
-
+
kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'})
- try: name = apply(retrygrab, (url, filename), kwargs)
- except URLGrabError, e: print e
- else: print 'LOCAL FILE:', name
+ try: name = retrygrab(url, filename, **kwargs)
+ except URLGrabError as e: print(e)
+ else: print('LOCAL FILE:', name)
def _file_object_test(filename=None):
- import cStringIO
if filename is None:
filename = __file__
- print 'using file "%s" for comparisons' % filename
+ print('using file "%s" for comparisons' % filename)
fo = open(filename)
s_input = fo.read()
fo.close()
@@ -1695,17 +2632,17 @@ def _file_object_test(filename=None):
_test_file_object_readall,
_test_file_object_readline,
_test_file_object_readlines]:
- fo_input = cStringIO.StringIO(s_input)
- fo_output = cStringIO.StringIO()
+ fo_input = StringIO(s_input)
+ fo_output = StringIO()
wrapper = PyCurlFileObject(fo_input, None, 0)
- print 'testing %-30s ' % testfunc.__name__,
+ print('testing %-30s ' % testfunc.__name__, end=' ')
testfunc(wrapper, fo_output)
s_output = fo_output.getvalue()
- if s_output == s_input: print 'passed'
- else: print 'FAILED'
-
+ if s_output == s_input: print('passed')
+ else: print('FAILED')
+
def _test_file_object_smallread(wrapper, fo_output):
- while 1:
+ while True:
s = wrapper.read(23)
fo_output.write(s)
if not s: return
@@ -1715,14 +2652,14 @@ def _test_file_object_readall(wrapper, fo_output):
fo_output.write(s)
def _test_file_object_readline(wrapper, fo_output):
- while 1:
+ while True:
s = wrapper.readline()
fo_output.write(s)
if not s: return
def _test_file_object_readlines(wrapper, fo_output):
li = wrapper.readlines()
- fo_output.write(string.join(li, ''))
+ fo_output.write(''.join(li))
if __name__ == '__main__':
_main_test()
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
index dad410b..d95863e 100644..100755
--- a/urlgrabber/mirror.py
+++ b/urlgrabber/mirror.py
@@ -9,9 +9,9 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place, Suite 330,
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
@@ -76,6 +76,10 @@ CUSTOMIZATION
'grabber' is omitted, the default grabber will be used. If
kwargs are omitted, then (duh) they will not be used.
+ kwarg 'max_connections' limits the number of concurrent
+ connections to this mirror. When omitted or set to zero,
+ the default limit (2) will be used.
+
3) Pass keyword arguments when instantiating the mirror group.
See, for example, the failure_callback argument.
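+       The 'max_connections' kwarg described above, for instance, rides
+       along in a mirror entry's kwargs dict (hostnames illustrative):
+
+         mirrors = [
+             {'mirror': 'http://fast.example.com/repo/',
+              'kwargs': {'max_connections': 4}},
+             {'mirror': 'http://slow.example.com/repo/'},  # default limit (2)
+         ]
+         mg = MirrorGroup(grabber, mirrors)
+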
@@ -87,12 +91,29 @@ CUSTOMIZATION
"""
+import sys
import random
-import thread # needed for locking to make this threadsafe
-from grabber import URLGrabError, CallbackObject, DEBUG
+if sys.version_info >= (3,):
+ # We use a version check because python2 also has _thread
+ import _thread as thread
+else:
+ import thread
+
+try:
+ import urllib.parse as urlparse
+except ImportError:
+ import urlparse
+
+from six import string_types
-def _(st):
+from .grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8
+from .grabber import _run_callback, _do_raise
+from .grabber import exception2msg
+from .grabber import _TH
+from .grabber import _bytes_repr
+
+def _(st):
return st
class GrabRequest:
@@ -126,13 +147,15 @@ class MirrorGroup:
files)
* if the local list is ever exhausted, a URLGrabError will be
- raised (errno=256, no more mirrors)
+ raised (errno=256, No more mirrors). The 'errors' attribute
+ holds a list of (full_url, errmsg) tuples. This contains
+ all URLs tried and the corresponding error messages.
OPTIONS
In addition to the required arguments "grabber" and "mirrors",
MirrorGroup also takes the following optional arguments:
-
+
default_action
A dict that describes the actions to be taken upon failure
@@ -153,7 +176,8 @@ class MirrorGroup:
The 'fail' option will cause immediate failure by re-raising
the exception and no further attempts to get the current
- download.
+ download. As in the "No more mirrors" case, the 'errors'
+ attribute is set in the exception object.
This dict can be set at instantiation time,
mg = MirrorGroup(grabber, mirrors, default_action={'fail':1})
@@ -162,7 +186,7 @@ class MirrorGroup:
or by returning an action dict from the failure_callback
return {'fail':0}
in increasing precedence.
-
+
If all three of these were done, the net result would be:
{'increment': 0, # set in method
'increment_master': 1, # class default
@@ -180,10 +204,11 @@ class MirrorGroup:
etc). Otherwise, it is assumed to be the callable object
itself. The callback will be passed a grabber.CallbackObject
instance along with args and kwargs (if present). The following
- attributes are defined withing the instance:
+ attributes are defined within the instance:
obj.exception = < exception that was raised >
obj.mirror = < the mirror that was tried >
+ obj.tries = < the number of mirror tries so far >
obj.relative_url = < url relative to the mirror >
obj.url = < full url that failed >
# .url is just the combination of .mirror
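+          A failure_callback might use these attributes as follows (a
+          sketch; the action keys follow the default_action docs above):
+
+            def mirror_failed(cb_obj):
+                print('%s failed (try %d): %s' % (
+                    cb_obj.mirror, cb_obj.tries, cb_obj.exception))
+                if cb_obj.tries >= 3:
+                    return {'fail': 1}   # re-raise, with .errors attached
+                return {'remove': 1}     # mask this mirror and keep going
+
+            mg = MirrorGroup(grabber, mirrors, failure_callback=mirror_failed)
+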
@@ -251,22 +276,34 @@ class MirrorGroup:
self.default_action = None
self._process_kwargs(kwargs)
+ # use the same algorithm as parallel downloader to initially sort
+ # the mirror list (sort by speed, but prefer live private mirrors)
+ def estimate(m):
+ speed, fail = _TH.estimate(m['mirror'])
+ private = not fail and m.get('kwargs', {}).get('private', False)
+ return private, speed
+
+ # update the initial order. since sorting is stable, the relative
+ # order of unknown (not used yet) hosts is retained.
+ self.mirrors.sort(key=estimate, reverse=True)
+
# if these values are found in **kwargs passed to one of the urlXXX
# methods, they will be stripped before getting passed on to the
# grabber
options = ['default_action', 'failure_callback']
-
+
def _process_kwargs(self, kwargs):
self.failure_callback = kwargs.get('failure_callback')
self.default_action = kwargs.get('default_action')
-
+
def _parse_mirrors(self, mirrors):
parsed_mirrors = []
for m in mirrors:
- if type(m) == type(''): m = {'mirror': m}
+ if isinstance(m, string_types):
+ m = {'mirror': _to_utf8(m)}
parsed_mirrors.append(m)
return parsed_mirrors
-
+
def _load_gr(self, gr):
# OVERRIDE IDEAS:
# shuffle gr list
@@ -280,7 +317,9 @@ class MirrorGroup:
# return a random mirror so that multiple mirrors get used
# even without failures.
if not gr.mirrors:
- raise URLGrabError(256, _('No more mirrors to try.'))
+ e = URLGrabError(256, _('No more mirrors to try.'))
+ e.errors = gr.errors
+ raise e
return gr.mirrors[gr._next]
def _failure(self, gr, cb_obj):
@@ -290,7 +329,7 @@ class MirrorGroup:
# the callback)
cb = gr.kw.get('failure_callback') or self.failure_callback
if cb:
- if type(cb) == type( () ):
+ if isinstance(cb, tuple):
cb, args, kwargs = cb
else:
args, kwargs = (), {}
@@ -307,7 +346,9 @@ class MirrorGroup:
a.update(action)
action = a
self.increment_mirror(gr, action)
- if action and action.get('fail', 0): raise
+ if action and action.get('fail', 0):
+ sys.exc_info()[1].errors = gr.errors
+ raise
def increment_mirror(self, gr, action={}):
"""Tell the mirror object increment the mirror index
@@ -323,7 +364,7 @@ class MirrorGroup:
urlopen, there's no good way for the mirror group to know that
an error occurs mid-download (it's already returned and given
you the file object).
-
+
remove --- can have several values
0 do not remove the mirror from the list
1 remove the mirror for this download only
@@ -345,7 +386,7 @@ class MirrorGroup:
self._next += 1
if self._next >= len(self.mirrors): self._next = 0
self._lock.release()
-
+
if action.get('remove', 1):
del gr.mirrors[gr._next]
elif action.get('increment', 1):
@@ -353,9 +394,9 @@ class MirrorGroup:
if gr._next >= len(gr.mirrors): gr._next = 0
if DEBUG:
- grm = [m['mirror'] for m in gr.mirrors]
+ grm = [m['mirror'].decode() for m in gr.mirrors]
DEBUG.info('GR mirrors: [%s] %i', ' '.join(grm), gr._next)
- selfm = [m['mirror'] for m in self.mirrors]
+ selfm = [m['mirror'].decode() for m in self.mirrors]
DEBUG.info('MAIN mirrors: [%s] %i', ' '.join(selfm), self._next)
#####################################################################
@@ -366,47 +407,68 @@ class MirrorGroup:
# by overriding the configuration methods :)
def _join_url(self, base_url, rel_url):
- if base_url.endswith('/') or rel_url.startswith('/'):
- return base_url + rel_url
+ (scheme, netloc, path, query, fragid) = urlparse.urlsplit(base_url)
+
+ if isinstance(base_url, bytes):
+ if not isinstance(rel_url, bytes):
+ rel_url = rel_url.encode('utf8')
+ sep = b'' if path.endswith(b'/') or rel_url.startswith(b'/') else b'/'
else:
- return base_url + '/' + rel_url
-
+ sep = '' if path.endswith('/') or rel_url.startswith('/') else '/'
+
+ return urlparse.urlunsplit((scheme, netloc, path + sep + rel_url, query, fragid))
+
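+    # _join_url() keeps the base URL's query and fragment intact and only
+    # inserts a '/' when neither side provides one; a str rel_url is
+    # encoded when the base is bytes. E.g. (illustrative URLs):
+    #   mg._join_url('http://example.com/repo', 'pkg/foo.rpm')
+    #     -> 'http://example.com/repo/pkg/foo.rpm'
+    #   mg._join_url(b'http://example.com/repo/', 'pkg/foo.rpm')
+    #     -> b'http://example.com/repo/pkg/foo.rpm'
+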
def _mirror_try(self, func, url, kw):
gr = GrabRequest()
gr.func = func
gr.url = url
gr.kw = dict(kw)
self._load_gr(gr)
+ gr.errors = []
for k in self.options:
try: del kw[k]
except KeyError: pass
- while 1:
+ tries = 0
+ while True:
+ tries += 1
mirrorchoice = self._get_mirror(gr)
fullurl = self._join_url(mirrorchoice['mirror'], gr.url)
- kwargs = dict(mirrorchoice.get('kwargs', {}))
- kwargs.update(kw)
grabber = mirrorchoice.get('grabber') or self.grabber
+ # apply mirrorchoice kwargs on top of grabber.opts
+ opts = grabber.opts.derive(**mirrorchoice.get('kwargs', {}))
func_ref = getattr(grabber, func)
- if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl)
+ if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', _bytes_repr(url), _bytes_repr(fullurl))
try:
- return func_ref( *(fullurl,), **kwargs )
- except URLGrabError, e:
+ return func_ref( *(fullurl,), opts=opts, **kw )
+ except URLGrabError as e:
if DEBUG: DEBUG.info('MIRROR: failed')
+ gr.errors.append((fullurl, exception2msg(e)))
obj = CallbackObject()
obj.exception = e
obj.mirror = mirrorchoice['mirror']
obj.relative_url = gr.url
obj.url = fullurl
+ obj.tries = tries
self._failure(gr, obj)
def urlgrab(self, url, filename=None, **kwargs):
kw = dict(kwargs)
kw['filename'] = filename
+ if kw.get('async_') or kw.get('async'):
+ # enable mirror failovers in async path
+ kw['mirror_group'] = self, [], {}, set()
+ kw['relative_url'] = url
+ else:
+ kw.pop('failfunc', None)
func = 'urlgrab'
- return self._mirror_try(func, url, kw)
-
+ try:
+ return self._mirror_try(func, url, kw)
+ except URLGrabError as e:
+ obj = CallbackObject(url=url, filename=filename, exception=e, **kwargs)
+ return _run_callback(kwargs.get('failfunc', _do_raise), obj)
+
def urlopen(self, url, **kwargs):
kw = dict(kwargs)
func = 'urlopen'
@@ -417,7 +479,7 @@ class MirrorGroup:
kw['limit'] = limit
func = 'urlread'
return self._mirror_try(func, url, kw)
-
+
class MGRandomStart(MirrorGroup):
"""A mirror group that starts at a random mirror in the list.
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py
index dd07c6a..5b4c450 100644..100755
--- a/urlgrabber/progress.py
+++ b/urlgrabber/progress.py
@@ -9,23 +9,31 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place, Suite 330,
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
+from __future__ import print_function
import sys
import time
import math
-import thread
import fcntl
import struct
import termios
+if sys.version_info >= (3,):
+ # We use a version check because python2 also has _thread
+ import _thread as thread
+else:
+ import thread
+
+from six import integer_types, string_types
+
# Code from http://mail.python.org/pipermail/python-list/2000-May/033365.html
def terminal_width(fd=1):
""" Get the real terminal width """
@@ -107,7 +115,7 @@ class BaseMeter:
self.last_amount_read = 0
self.last_update_time = None
self.re = RateEstimator()
-
+
def start(self, filename=None, url=None, basename=None,
size=None, now=None, text=None):
self.filename = filename
@@ -125,7 +133,7 @@ class BaseMeter:
self.last_amount_read = 0
self.last_update_time = now
self._do_start(now)
-
+
def _do_start(self, now=None):
pass
@@ -133,8 +141,8 @@ class BaseMeter:
# for a real gui, you probably want to override and put a call
# to your mainloop iteration function here
if now is None: now = time.time()
- if (now >= self.last_update_time + self.update_period) or \
- not self.last_update_time:
+ if (not self.last_update_time or
+ (now >= self.last_update_time + self.update_period)):
self.re.update(amount_read, now)
self.last_amount_read = amount_read
self.last_update_time = now
@@ -152,7 +160,7 @@ class BaseMeter:
def _do_end(self, amount_read, now=None):
pass
-
+
# This is kind of a hack, but progress is gotten from grabber which doesn't
# know about the total size to download. So we do this so we can get the data
# out of band here. This will be "fixed" one way or another soon.
@@ -167,7 +175,7 @@ def text_meter_total_size(size, downloaded=0):
#
# update: No size (minimal: 17 chars)
# -----------------------------------
-# <text> <rate> | <current size> <elapsed time>
+# <text> <rate> | <current size> <elapsed time>
# 8-48 1 8 3 6 1 9 5
#
# Order: 1. <text>+<current size> (17)
@@ -202,7 +210,7 @@ def text_meter_total_size(size, downloaded=0):
#
# end
# ---
-# <text> | <current size> <elapsed time>
+# <text> | <current size> <elapsed time>
# 8-56 3 6 1 9 5
#
# Order: 1. <text> ( 8)
@@ -211,6 +219,21 @@ def text_meter_total_size(size, downloaded=0):
# 4. + ( 5, total: 32)
#
+def _term_add_bar(tl, bar_max_length, pc):
+ blen = bar_max_length
+ bar = '='*int(blen * pc)
+ if (blen * pc) - int(blen * pc) >= 0.5:
+ bar += '-'
+ return tl.add(' [%-*.*s]' % (blen, blen, bar))
+
+def _term_add_end(tl, osize, size):
+ if osize: # osize should be None or >0, but that's been broken.
+ if size > osize: # Is ??? better? Really need something to say < vs >.
+ return tl.add(' !!! '), True
+ elif size != osize:
+ return tl.add(' ... '), True
+ return tl.add(' ' * 5), False
+
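+# _term_add_bar() rounds to half characters: one '=' per whole step, plus
+# a trailing '-' once the fractional part reaches 0.5. E.g. blen=10 and
+# pc=0.55 yield the bar '=====-', rendered as ' [=====-    ]'.
+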
class TextMeter(BaseMeter):
def __init__(self, fo=sys.stderr):
BaseMeter.__init__(self)
@@ -218,7 +241,6 @@ class TextMeter(BaseMeter):
def _do_update(self, amount_read, now=None):
etime = self.re.elapsed_time()
- fetime = format_time(etime)
fread = format_number(amount_read)
#self.size = None
if self.text is not None:
@@ -234,19 +256,23 @@ class TextMeter(BaseMeter):
# Include text + ui_rate in minimal
tl = TerminalLine(8, 8+1+8)
+ if tl._llen > 80:
+ use_hours = True # For big screens, make it more readable.
+ else:
+ use_hours = False
ui_size = tl.add(' | %5sB' % fread)
if self.size is None:
- ui_time = tl.add(' %9s' % fetime)
+ ui_time = tl.add(' %s' % format_time(etime, use_hours))
ui_end = tl.add(' ' * 5)
ui_rate = tl.add(' %5sB/s' % ave_dl)
out = '%-*.*s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
ui_rate, ui_size, ui_time, ui_end)
else:
rtime = self.re.remaining_time()
- frtime = format_time(rtime)
+ frtime = format_time(rtime, use_hours)
frac = self.re.fraction_read()
- ui_time = tl.add(' %9s' % frtime)
+ ui_time = tl.add(' %s' % frtime)
ui_end = tl.add(' ETA ')
if sofar_size is None:
@@ -259,13 +285,10 @@ class TextMeter(BaseMeter):
ui_rate = tl.add(' %5sB/s' % ave_dl)
# Make text grow a bit before we start growing the bar too
blen = 4 + tl.rest_split(8 + 8 + 4)
- bar = '='*int(blen * frac)
- if (blen * frac) - int(blen * frac) >= 0.5:
- bar += '-'
- ui_bar = tl.add(' [%-*.*s]' % (blen, blen, bar))
- out = '%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
- ui_sofar_pc, ui_pc, ui_bar,
- ui_rate, ui_size, ui_time, ui_end)
+ ui_bar = _term_add_bar(tl, blen, frac)
+ out = '\r%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
+ ui_sofar_pc, ui_pc, ui_bar,
+                                            ui_rate, ui_size, ui_time, ui_end)
self.fo.write(out)
self.fo.flush()
@@ -274,7 +297,6 @@ class TextMeter(BaseMeter):
global _text_meter_total_size
global _text_meter_sofar_size
- total_time = format_time(self.re.elapsed_time())
total_size = format_number(amount_read)
if self.text is not None:
text = self.text
@@ -282,14 +304,13 @@ class TextMeter(BaseMeter):
text = self.basename
tl = TerminalLine(8)
- ui_size = tl.add(' | %5sB' % total_size)
- ui_time = tl.add(' %9s' % total_time)
- not_done = self.size is not None and amount_read != self.size
- if not_done:
- ui_end = tl.add(' ... ')
+ if tl._llen > 80:
+ use_hours = True # For big screens, make it more readable.
else:
- ui_end = tl.add(' ' * 5)
-
+ use_hours = False
+ ui_size = tl.add(' | %5sB' % total_size)
+ ui_time = tl.add(' %s' % format_time(self.re.elapsed_time(), use_hours))
+ ui_end, not_done = _term_add_end(tl, self.size, amount_read)
out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text,
ui_size, ui_time, ui_end)
self.fo.write(out)
@@ -331,14 +352,23 @@ class MultiFileHelper(BaseMeter):
def message(self, message):
self.master.message_meter(self, message)
+class _FakeLock:
+ def acquire(self):
+ pass
+ def release(self):
+ pass
+
class MultiFileMeter:
helperclass = MultiFileHelper
- def __init__(self):
+ def __init__(self, threaded=True):
self.meters = []
self.in_progress_meters = []
- self._lock = thread.allocate_lock()
+ if threaded:
+ self._lock = thread.allocate_lock()
+ else:
+ self._lock = _FakeLock()
self.update_period = 0.3 # seconds
-
+
self.numfiles = None
self.finished_files = 0
self.failed_files = 0
@@ -369,8 +399,9 @@ class MultiFileMeter:
def end(self, now=None):
if now is None: now = time.time()
+ self.re.update(self._amount_read(), now)
self._do_end(now)
-
+
def _do_end(self, now):
pass
@@ -383,10 +414,10 @@ class MultiFileMeter:
newmeter = self.helperclass(self)
self.meters.append(newmeter)
return newmeter
-
+
def removeMeter(self, meter):
self.meters.remove(meter)
-
+
###########################################################
# child functions - these should only be called by helpers
def start_meter(self, meter, now):
@@ -400,15 +431,15 @@ class MultiFileMeter:
finally:
self._lock.release()
self._do_start_meter(meter, now)
-
+
def _do_start_meter(self, meter, now):
pass
-
+
def update_meter(self, meter, now):
if not meter in self.meters:
raise ValueError('attempt to use orphaned meter')
- if (now >= self.last_update_time + self.update_period) or \
- not self.last_update_time:
+ if (not self.last_update_time or
+ (now >= self.last_update_time + self.update_period)):
self.re.update(self._amount_read(), now)
self.last_update_time = now
self._do_update_meter(meter, now)
@@ -466,34 +497,83 @@ class MultiFileMeter:
class TextMultiFileMeter(MultiFileMeter):
- def __init__(self, fo=sys.stderr):
+ def __init__(self, fo=sys.stderr, threaded=True):
self.fo = fo
- MultiFileMeter.__init__(self)
+ MultiFileMeter.__init__(self, threaded)
+ self.index_time = self.index = 0
# files: ###/### ###% data: ######/###### ###% time: ##:##:##/##:##:##
+# New output, like TextMeter output...
+# update: No size (minimal: 17 chars)
+# -----------------------------------
+# (<#file>/<#tot files>): <text> <rate> | <current size> <elapsed>
+# 8-48 1 8 3 6 1 7-9 5
+#
+# update: Size, All files
+# -----------------------
+# (<#file>/<#tot files>): <text> <pc> <bar> <rate> | <size> <eta time> ETA
+# 8-22 1 3-4 1 6-12 1 8 3 6 1 7-9 1 3 1
+# end
+# ---
+# <text> | <file size> <file elapsed time>
+# 8-56 3 6 1 9 5
def _do_update_meter(self, meter, now):
self._lock.acquire()
try:
- format = "files: %3i/%-3i %3i%% data: %6.6s/%-6.6s %3i%% " \
- "time: %8.8s/%8.8s"
df = self.finished_files
tf = self.numfiles or 1
- pf = 100 * float(df)/tf + 0.49
+ # Don't use "percent of files complete" ...
+ # pf = 100 * float(df)/tf + 0.49
dd = self.re.last_amount_read
- td = self.total_size
+ td = self.re.total
pd = 100 * (self.re.fraction_read() or 0) + 0.49
dt = self.re.elapsed_time()
rt = self.re.remaining_time()
- if rt is None: tt = None
- else: tt = dt + rt
-
- fdd = format_number(dd) + 'B'
- ftd = format_number(td) + 'B'
- fdt = format_time(dt, 1)
- ftt = format_time(tt, 1)
-
- out = '%-79.79s' % (format % (df, tf, pf, fdd, ftd, pd, fdt, ftt))
- self.fo.write('\r' + out)
+
+ frac = self.re.fraction_read() or 0
+ pf = 100 * frac
+ ave_dl = format_number(self.re.average_rate())
+
+ # cycle through active meters
+ if now > self.index_time:
+ self.index_time = now + 1.0
+ self.index += 1
+ if self.index >= len(self.meters):
+ self.index = 0
+ meter = self.meters[self.index]
+ text = meter.text or meter.basename
+ if tf > 1:
+ text = '(%u/%u): %s' % (df+1+self.index, tf, text)
+
+ # Include text + ui_rate in minimal
+ tl = TerminalLine(8, 8+1+8)
+ if tl._llen > 80:
+ use_hours = True # For big screens, make it more readable.
+ else:
+ use_hours = False
+ ui_size = tl.add(' | %5sB' % format_number(dd))
+ if not self.re.total:
+ ui_time = tl.add(' %s' % format_time(dt, use_hours))
+ ui_end = tl.add(' ' * 5)
+ ui_rate = tl.add(' %5sB/s' % ave_dl)
+ out = '\r%-*.*s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
+ ui_rate, ui_size, ui_time, ui_end)
+ else:
+ ui_time = tl.add(' %s' % format_time(rt, use_hours))
+ ui_end = tl.add(' ETA ')
+
+ ui_sofar_pc = tl.add(' %i%%' % pf,
+ full_len=len(" (100%)"))
+ ui_rate = tl.add(' %5sB/s' % ave_dl)
+
+ # Make text grow a bit before we start growing the bar too
+ blen = 4 + tl.rest_split(8 + 8 + 4)
+ ui_bar = _term_add_bar(tl, blen, frac)
+ out = '\r%-*.*s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text,
+ ui_sofar_pc, ui_bar,
+ ui_rate, ui_size, ui_time,
+ ui_end)
+ self.fo.write(out)
self.fo.flush()
finally:
self._lock.release()
@@ -502,25 +582,39 @@ class TextMultiFileMeter(MultiFileMeter):
self._lock.acquire()
try:
format = "%-30.30s %6.6s %8.8s %9.9s"
- fn = meter.basename
+ fn = meter.text or meter.basename
size = meter.last_amount_read
fsize = format_number(size) + 'B'
et = meter.re.elapsed_time()
- fet = format_time(et, 1)
- frate = format_number(size / et) + 'B/s'
-
- out = '%-79.79s' % (format % (fn, fsize, fet, frate))
- self.fo.write('\r' + out + '\n')
+ frate = format_number(et and size / et) + 'B/s'
+ df = self.finished_files
+ tf = self.numfiles or 1
+
+ total_size = format_number(size)
+ text = meter.text or meter.basename
+ if tf > 1:
+ text = '(%u/%u): %s' % (df, tf, text)
+
+ tl = TerminalLine(8)
+ if tl._llen > 80:
+ use_hours = True # For big screens, make it more readable.
+ else:
+ use_hours = False
+ ui_size = tl.add(' | %5sB' % total_size)
+ ui_time = tl.add(' %s' % format_time(et, use_hours))
+ ui_end, not_done = _term_add_end(tl, meter.size, size)
+ out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text,
+ ui_size, ui_time, ui_end)
+ self.fo.write(out)
finally:
self._lock.release()
- self._do_update_meter(meter, now)
def _do_failure_meter(self, meter, message, now):
self._lock.acquire()
try:
format = "%-30.30s %6.6s %s"
- fn = meter.basename
- if type(message) in (type(''), type(u'')):
+ fn = meter.text or meter.basename
+ if isinstance(message, string_types):
message = message.splitlines()
if not message: message = ['']
out = '%-79s' % (format % (fn, 'FAILED', message[0] or ''))
@@ -537,15 +631,6 @@ class TextMultiFileMeter(MultiFileMeter):
finally:
self._lock.release()
- def _do_end(self, now):
- self._do_update_meter(None, now)
- self._lock.acquire()
- try:
- self.fo.write('\n')
- self.fo.flush()
- finally:
- self._lock.release()
-
######################################################################
# support classes and functions
@@ -560,13 +645,17 @@ class RateEstimator:
self.last_update_time = now
self.last_amount_read = 0
self.ave_rate = None
-
+
def update(self, amount_read, now=None):
if now is None: now = time.time()
- if amount_read == 0:
+ # libcurl calls the progress callback when fetching headers
+ # too, thus amount_read = 0 .. hdr_size .. 0 .. content_size.
+        # Occasionally we miss the 2nd zero and report avg speed < 0.
+ # Handle read_diff < 0 here. BZ 1001767.
+ if amount_read == 0 or amount_read < self.last_amount_read:
# if we just started this file, all bets are off
self.last_update_time = now
- self.last_amount_read = 0
+ self.last_amount_read = amount_read
self.ave_rate = None
return
@@ -576,11 +665,11 @@ class RateEstimator:
# First update, on reget is the file size
if self.last_amount_read:
self.last_update_time = now
- self.ave_rate = self._temporal_rolling_ave(\
+ self.ave_rate = self._temporal_rolling_ave(
time_diff, read_diff, self.ave_rate, self.timescale)
self.last_amount_read = amount_read
#print 'results', time_diff, read_diff, self.ave_rate
-
+
#####################################################################
# result methods
def average_rate(self):
@@ -616,14 +705,14 @@ class RateEstimator:
epsilon = time_diff / timescale
if epsilon > 1: epsilon = 1.0
return self._rolling_ave(time_diff, read_diff, last_ave, epsilon)
-
+
def _rolling_ave(self, time_diff, read_diff, last_ave, epsilon):
"""perform a "rolling average" iteration
a rolling average "folds" new data into an existing average with
some weight, epsilon. epsilon must be between 0.0 and 1.0 (inclusive)
a value of 0.0 means only the old value (initial value) counts,
and a value of 1.0 means only the newest value is considered."""
-
+
try:
recent_rate = read_diff / time_diff
except ZeroDivisionError:
@@ -652,23 +741,25 @@ class RateEstimator:
rt = int(rt)
if shift <= 0: return rt
return float(int(rt) >> shift << shift)
-
+
def format_time(seconds, use_hours=0):
if seconds is None or seconds < 0:
if use_hours: return '--:--:--'
else: return '--:--'
+ elif seconds == float('inf'):
+ return 'Infinite'
else:
seconds = int(seconds)
- minutes = seconds / 60
+ minutes = seconds // 60
seconds = seconds % 60
if use_hours:
- hours = minutes / 60
+ hours = minutes // 60
minutes = minutes % 60
return '%02i:%02i:%02i' % (hours, minutes, seconds)
else:
return '%02i:%02i' % (minutes, seconds)
-
+
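+# Spot-checks of format_time() against the logic above:
+#   format_time(None)               -> '--:--'
+#   format_time(75)                 -> '01:15'
+#   format_time(3725, use_hours=1)  -> '01:02:05'
+#   format_time(float('inf'))       -> 'Infinite'
+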
def format_number(number, SI=0, space=' '):
"""Turn numbers into human-readable metric-like numbers"""
symbols = ['', # (none)
@@ -680,14 +771,14 @@ def format_number(number, SI=0, space=' '):
'E', # exa
'Z', # zetta
'Y'] # yotta
-
+
if SI: step = 1000.0
else: step = 1024.0
thresh = 999
depth = 0
max_depth = len(symbols) - 1
-
+
# we want numbers between 0 and thresh, but don't exceed the length
# of our list. In that event, the formatting will be screwed up,
# but it'll still show the right number.
@@ -695,7 +786,7 @@ def format_number(number, SI=0, space=' '):
depth = depth + 1
number = number / step
- if type(number) == type(1) or type(number) == type(1L):
+ if isinstance(number, integer_types):
# it's an int or a long, which means it didn't get divided,
# which means it's already short enough
format = '%i%s%s'
@@ -705,7 +796,7 @@ def format_number(number, SI=0, space=' '):
format = '%.1f%s%s'
else:
format = '%.0f%s%s'
-
+
return(format % (float(number or 0), space, symbols[depth]))
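+
+# Spot-checks of format_number() with the default binary step:
+#   format_number(10)         -> '10 '    (small ints stay undivided)
+#   format_number(1024)       -> '1.0 k'
+#   format_number(1536)       -> '1.5 k'
+#   format_number(1000, SI=1) -> '1.0 k'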
def _tst(fn, cur, tot, beg, size, *args):
@@ -722,9 +813,77 @@ def _tst(fn, cur, tot, beg, size, *args):
time.sleep(delay)
tm.end(size)
+def _mtst(datas, *args):
+ print('-' * 79)
+ tm = TextMultiFileMeter(threaded=False)
+
+ dl_sizes = {}
+
+ num = 0
+ total_size = 0
+ dl_total_size = 0
+ for data in datas:
+ dl_size = None
+ if len(data) == 2:
+ fn, size = data
+ dl_size = size
+ if len(data) == 3:
+ fn, size, dl_size = data
+ nm = tm.newMeter()
+ nm.start(fn, "http://www.example.com/path/to/fn/" + fn, fn, size,
+ text=fn)
+ num += 1
+ assert dl_size is not None
+ dl_total_size += dl_size
+ dl_sizes[nm] = dl_size
+ if size is None or total_size is None:
+ total_size = None
+ else:
+ total_size += size
+ tm.start(num, total_size)
+
+ num = 0
+ off = 0
+ for (inc, delay) in args:
+ off += 1
+ while num < ((dl_total_size * off) / len(args)):
+ num += inc
+ for nm in tm.meters[:]:
+ if dl_sizes[nm] <= num:
+ nm.end(dl_sizes[nm])
+ tm.removeMeter(nm)
+ else:
+ nm.update(num)
+ time.sleep(delay)
+ assert not tm.meters
+
if __name__ == "__main__":
- # (1/2): subversion-1.4.4-7.x86_64.rpm 2.4 MB / 85 kB/s 00:28
- # (2/2): mercurial-0.9.5-6.fc8.x86_64.rpm 924 kB / 106 kB/s 00:08
+ # (1/2): subversion-1.4.4-7.x86_64.rpm 2.4 MB / 85 kB/s 00:28
+ # (2/2): mercurial-0.9.5-6.fc8.x86_64.rpm 924 kB / 106 kB/s 00:08
+ if len(sys.argv) >= 2 and sys.argv[1] == 'multi':
+ _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000),
+ ("s-1.0.1-1.fc8.i386.rpm", 5000),
+ ("m-1.0.1-2.fc8.i386.rpm", 10000)),
+ (100, 0.33), (500, 0.25), (1000, 0.1))
+
+ _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000),
+ ("s-1.0.1-1.fc8.i386.rpm", 5000),
+ ("m-1.0.1-2.fc8.i386.rpm", None, 10000)),
+ (100, 0.33), (500, 0.25), (1000, 0.1))
+
+ _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000),
+ ("s-1.0.1-1.fc8.i386.rpm", 2500000),
+ ("m-1.0.1-2.fc8.i386.rpm", 10000)),
+ (10, 0.2), (50, 0.1), (1000, 0.1))
+
+ _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000),
+ ("s-1.0.1-1.fc8.i386.rpm", None, 2500000),
+ ("m-1.0.1-2.fc8.i386.rpm", None, 10000)),
+ (10, 0.2), (50, 0.1), (1000, 0.1))
+ # (10, 0.2), (100, 0.1), (100, 0.1), (100, 0.25))
+ # (10, 0.2), (100, 0.1), (100, 0.1), (100, 0.25))
+ sys.exit(0)
+
if len(sys.argv) >= 2 and sys.argv[1] == 'total':
text_meter_total_size(1000 + 10000 + 10000 + 1000000 + 1000000 +
1000000 + 10000 + 10000 + 10000 + 1000000)