Diffstat (limited to 'urlgrabber')
 -rwxr-xr-x (was -rw-r--r--)  urlgrabber/__init__.py  |   44
 -rwxr-xr-x (was -rw-r--r--)  urlgrabber/byterange.py |  201
 -rwxr-xr-x (was -rw-r--r--)  urlgrabber/grabber.py   | 1701
 -rwxr-xr-x (was -rw-r--r--)  urlgrabber/mirror.py    |  132
 -rwxr-xr-x (was -rw-r--r--)  urlgrabber/progress.py  |  341
 5 files changed, 1797 insertions(+), 622 deletions(-)
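
The diff below ports urlgrabber from Python-2-only modules (urllib2, rfc822, mimetools, cStringIO) to code that runs under both Python 2 and 3. The recurring idiom is a try/except import that prefers the Python 3 module path and falls back to the old one; a minimal standalone sketch of that pattern, using names taken from the byterange.py hunk:

# Sketch only -- the compatibility-import idiom applied throughout this change.
try:                                    # Python 3 locations
    from urllib.request import BaseHandler, FileHandler, FTPHandler, URLError
    from urllib.parse import unquote
except ImportError:                     # Python 2 fallbacks
    from urllib2 import BaseHandler, FileHandler, FTPHandler, URLError
    from urllib import unquote

import sys
if sys.version_info >= (3,):
    from io import StringIO             # py3: io.StringIO handles text
else:
    from cStringIO import StringIO      # py2: cStringIO returns byte strings
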
diff --git a/urlgrabber/__init__.py b/urlgrabber/__init__.py index ddd5204..60f56c3 100644..100755 --- a/urlgrabber/__init__.py +++ b/urlgrabber/__init__.py @@ -1,16 +1,18 @@ -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or -# (at your option) any later version. +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. # -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Library General Public License for more details. +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. # -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, +# Boston, MA 02111-1307 USA # Copyright 2002-2006 Michael D. Stenner, Ryan Tomayko # Copyright 2009 Red Hat, Inc - pycurl support added by Seth Vidal @@ -44,11 +46,17 @@ following features: automatically switching mirrors if there is a failure. """ -__version__ = '3.9.1' -__date__ = '2009/09/25' -__author__ = 'Michael D. Stenner <mstenner@linux.duke.edu>, ' \ - 'Ryan Tomayko <rtomayko@naeblis.cx>' \ - 'Seth Vidal <skvidal@fedoraproject.org>' -__url__ = 'http://linux.duke.edu/projects/urlgrabber/' +try: + from email import message_from_string + from pkg_resources import get_distribution + pkgInfo = get_distribution(__package__).get_metadata('PKG-INFO') + __metadata__ = message_from_string(pkgInfo) + del pkgInfo -from grabber import urlgrab, urlopen, urlread + __version__ = __metadata__['Version'] + __author__ = __metadata__['Author'] + __url__ = __metadata__['Home-page'] +except: + __author__ = __version__ = __url__ = '<see setup.cfg>' + +from .grabber import urlgrab, urlopen, urlread diff --git a/urlgrabber/byterange.py b/urlgrabber/byterange.py index 3e5f3b7..e341add 100644..100755 --- a/urlgrabber/byterange.py +++ b/urlgrabber/byterange.py @@ -9,9 +9,9 @@ # Lesser General Public License for more details. 
# # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber @@ -19,56 +19,74 @@ import os +import sys import stat import urllib -import urllib2 -import rfc822 +import email +import ftplib +import socket +import sys +import mimetypes + +try: + from urllib.request import BaseHandler, FileHandler, FTPHandler, URLError + from urllib.request import addclosehook, addinfourl + from urllib.request import ftpwrapper as urllib_ftpwrapper + from urllib.parse import splitport, splituser, splitpasswd, splitattr, unquote +except ImportError: + from urllib2 import BaseHandler, FileHandler, FTPHandler, URLError + from urllib2 import ftpwrapper as urllib_ftpwrapper + from urllib import (splitport, splituser, splitpasswd, splitattr, + unquote, addclosehook, addinfourl) DEBUG = None -try: +if sys.version_info >= (3,): + # We do an explicit version check here because because python2 + # also has an io module with StringIO, but it is incompatible, + # and returns str instead of unicode somewhere. + from io import StringIO +else: from cStringIO import StringIO -except ImportError, msg: - from StringIO import StringIO class RangeError(IOError): """Error raised when an unsatisfiable range is requested.""" pass - -class HTTPRangeHandler(urllib2.BaseHandler): + +class HTTPRangeHandler(BaseHandler): """Handler that enables HTTP Range headers. - + This was extremely simple. The Range header is a HTTP feature to - begin with so all this class does is tell urllib2 that the - "206 Partial Content" reponse from the HTTP server is what we + begin with so all this class does is tell urllib2 that the + "206 Partial Content" response from the HTTP server is what we expected. - + Example: import urllib2 import byterange - + range_handler = range.HTTPRangeHandler() - opener = urllib2.build_opener(range_handler) - + opener = urllib.request.build_opener(range_handler) + # install it - urllib2.install_opener(opener) - + urllib.request.install_opener(opener) + # create Request and set Range header - req = urllib2.Request('http://www.python.org/') + req = urllib.request.Request('http://www.python.org/') req.header['Range'] = 'bytes=30-50' - f = urllib2.urlopen(req) + f = urllib.request.urlopen(req) """ - + def http_error_206(self, req, fp, code, msg, hdrs): # 206 Partial Content Response r = urllib.addinfourl(fp, hdrs, req.get_full_url()) r.code = code r.msg = msg return r - + def http_error_416(self, req, fp, code, msg, hdrs): # HTTP's Range Not Satisfiable error - raise RangeError('Requested Range Not Satisfiable') + raise RangeError(9, 'Requested Range Not Satisfiable') class HTTPSRangeHandler(HTTPRangeHandler): """ Range Header support for HTTPS. """ @@ -81,13 +99,13 @@ class HTTPSRangeHandler(HTTPRangeHandler): class RangeableFileObject: """File object wrapper to enable raw range handling. - This was implemented primarilary for handling range - specifications for file:// urls. This object effectively makes - a file object look like it consists only of a range of bytes in + This was implemented primarilary for handling range + specifications for file:// urls. This object effectively makes + a file object look like it consists only of a range of bytes in the stream. 
- + Examples: - # expose 10 bytes, starting at byte position 20, from + # expose 10 bytes, starting at byte position 20, from # /etc/aliases. >>> fo = RangeableFileObject(file('/etc/passwd', 'r'), (20,30)) # seek seeks within the range (to position 23 in this case) @@ -99,11 +117,11 @@ class RangeableFileObject: # byte in the range. the following will return only 7 bytes. >>> fo.read(30) """ - + def __init__(self, fo, rangetup): """Create a RangeableFileObject. - fo -- a file like object. only the read() method need be - supported but supporting an optimized seek() is + fo -- a file like object. only the read() method need be + supported but supporting an optimized seek() is preferable. rangetup -- a (firstbyte,lastbyte) tuple specifying the range to work over. @@ -113,24 +131,24 @@ class RangeableFileObject: (self.firstbyte, self.lastbyte) = range_tuple_normalize(rangetup) self.realpos = 0 self._do_seek(self.firstbyte) - + def __getattr__(self, name): """This effectively allows us to wrap at the instance level. Any attribute not found in _this_ object will be searched for in self.fo. This includes methods.""" if hasattr(self.fo, name): return getattr(self.fo, name) - raise AttributeError, name - + raise AttributeError(name) + def tell(self): """Return the position within the range. - This is different from fo.seek in that position 0 is the + This is different from fo.seek in that position 0 is the first byte position of the range tuple. For example, if this object was created with a range tuple of (500,899), tell() will return 0 when at byte position 500 of the file. """ return (self.realpos - self.firstbyte) - + def seek(self,offset,whence=0): """Seek within the byte range. Positioning is identical to that described under tell(). @@ -143,13 +161,13 @@ class RangeableFileObject: elif whence == 2: # absolute from end of file # XXX: are we raising the right Error here? raise IOError('seek from end of file not supported.') - + # do not allow seek past lastbyte in range if self.lastbyte and (realoffset >= self.lastbyte): realoffset = self.lastbyte - + self._do_seek(realoffset - self.realpos) - + def read(self, size=-1): """Read within the range. This method will limit the size read based on the range. @@ -158,7 +176,7 @@ class RangeableFileObject: rslt = self.fo.read(size) self.realpos += len(rslt) return rslt - + def readline(self, size=-1): """Read lines within the range. This method will limit the size read based on the range. @@ -167,7 +185,7 @@ class RangeableFileObject: rslt = self.fo.readline(size) self.realpos += len(rslt) return rslt - + def _calc_read_size(self, size): """Handles calculating the amount of data to read based on the range. @@ -179,7 +197,7 @@ class RangeableFileObject: else: size = (self.lastbyte - self.realpos) return size - + def _do_seek(self,offset): """Seek based on whether wrapped object supports seek(). offset is relative to the current position (self.realpos). @@ -190,7 +208,7 @@ class RangeableFileObject: else: self.fo.seek(self.realpos + offset) self.realpos+= offset - + def _poor_mans_seek(self,offset): """Seek by calling the wrapped file objects read() method. This is used for file like objects that do not have native @@ -198,7 +216,7 @@ class RangeableFileObject: to manually seek to the desired position. offset -- read this number of bytes from the wrapped file object. - raise RangeError if we encounter EOF before reaching the + raise RangeError if we encounter EOF before reaching the specified offset. 
""" pos = 0 @@ -208,28 +226,26 @@ class RangeableFileObject: bufsize = offset - pos buf = self.fo.read(bufsize) if len(buf) != bufsize: - raise RangeError('Requested Range Not Satisfiable') + raise RangeError(9, 'Requested Range Not Satisfiable') pos+= bufsize -class FileRangeHandler(urllib2.FileHandler): +class FileRangeHandler(FileHandler): """FileHandler subclass that adds Range support. This class handles Range headers exactly like an HTTP server would. """ def open_local_file(self, req): - import mimetypes - import mimetools host = req.get_host() file = req.get_selector() localfile = urllib.url2pathname(file) stats = os.stat(localfile) size = stats[stat.ST_SIZE] - modified = rfc822.formatdate(stats[stat.ST_MTIME]) + modified = email.utils.formatdate(stats[stat.ST_MTIME]) mtype = mimetypes.guess_type(file)[0] if host: host, port = urllib.splitport(host) if port or socket.gethostbyname(host) not in self.get_names(): - raise urllib2.URLError('file not on local host') + raise URLError('file not on local host') fo = open(localfile,'rb') brange = req.headers.get('Range',None) brange = range_header_to_tuple(brange) @@ -238,35 +254,27 @@ class FileRangeHandler(urllib2.FileHandler): (fb,lb) = brange if lb == '': lb = size if fb < 0 or fb > size or lb > size: - raise RangeError('Requested Range Not Satisfiable') + raise RangeError(9, 'Requested Range Not Satisfiable') size = (lb - fb) fo = RangeableFileObject(fo, (fb,lb)) - headers = mimetools.Message(StringIO( + headers = email.message_from_string( 'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' % - (mtype or 'text/plain', size, modified))) + (mtype or 'text/plain', size, modified)) return urllib.addinfourl(fo, headers, 'file:'+file) -# FTP Range Support +# FTP Range Support # Unfortunately, a large amount of base FTP code had to be copied # from urllib and urllib2 in order to insert the FTP REST command. 
-# Code modifications for range support have been commented as +# Code modifications for range support have been commented as # follows: # -- range support modifications start/end here -from urllib import splitport, splituser, splitpasswd, splitattr, \ - unquote, addclosehook, addinfourl -import ftplib -import socket -import sys -import mimetypes -import mimetools - -class FTPRangeHandler(urllib2.FTPHandler): +class FTPRangeHandler(FTPHandler): def ftp_open(self, req): host = req.get_host() if not host: - raise IOError, ('ftp error', 'no host given') + raise IOError('ftp error', 'no host given') host, port = splitport(host) if port is None: port = ftplib.FTP_PORT @@ -282,11 +290,11 @@ class FTPRangeHandler(urllib2.FTPHandler): host = unquote(host) user = unquote(user or '') passwd = unquote(passwd or '') - + try: host = socket.gethostbyname(host) - except socket.error, msg: - raise urllib2.URLError(msg) + except socket.error as msg: + raise URLError(msg) path, attrs = splitattr(req.get_selector()) dirs = path.split('/') dirs = map(unquote, dirs) @@ -301,34 +309,34 @@ class FTPRangeHandler(urllib2.FTPHandler): if attr.lower() == 'type' and \ value in ('a', 'A', 'i', 'I', 'd', 'D'): type = value.upper() - + # -- range support modifications start here rest = None - range_tup = range_header_to_tuple(req.headers.get('Range',None)) + range_tup = range_header_to_tuple(req.headers.get('Range',None)) assert range_tup != () if range_tup: (fb,lb) = range_tup if fb > 0: rest = fb # -- range support modifications end here - + fp, retrlen = fw.retrfile(file, type, rest) - + # -- range support modifications start here if range_tup: (fb,lb) = range_tup - if lb == '': + if lb == '': if retrlen is None or retrlen == 0: - raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.') + raise RangeError(9, 'Requested Range Not Satisfiable due to unobtainable file length.') lb = retrlen retrlen = lb - fb if retrlen < 0: # beginning of range is larger than file - raise RangeError('Requested Range Not Satisfiable') + raise RangeError(9, 'Requested Range Not Satisfiable') else: retrlen = lb - fb fp = RangeableFileObject(fp, (0,retrlen)) # -- range support modifications end here - + headers = "" mtype = mimetypes.guess_type(req.get_full_url())[0] if mtype: @@ -338,14 +346,14 @@ class FTPRangeHandler(urllib2.FTPHandler): sf = StringIO(headers) headers = mimetools.Message(sf) return addinfourl(fp, headers, req.get_full_url()) - except ftplib.all_errors, msg: - raise IOError, ('ftp error', msg), sys.exc_info()[2] + except ftplib.all_errors as msg: + raise IOError('ftp error', msg).with_traceback(sys.exc_info()[2]) def connect_ftp(self, user, passwd, host, port, dirs): fw = ftpwrapper(user, passwd, host, port, dirs) return fw -class ftpwrapper(urllib.ftpwrapper): +class ftpwrapper(urllib_ftpwrapper): # range support note: # this ftpwrapper code is copied directly from # urllib. The only enhancement is to add the rest @@ -364,22 +372,22 @@ class ftpwrapper(urllib.ftpwrapper): # Use nlst to see if the file exists at all try: self.ftp.nlst(file) - except ftplib.error_perm, reason: - raise IOError, ('ftp error', reason), sys.exc_info()[2] + except ftplib.error_perm as reason: + raise IOError('ftp error', reason).with_traceback(sys.exc_info()[2]) # Restore the transfer mode! 
self.ftp.voidcmd(cmd) # Try to retrieve as a file try: cmd = 'RETR ' + file conn = self.ftp.ntransfercmd(cmd, rest) - except ftplib.error_perm, reason: + except ftplib.error_perm as reason: if str(reason)[:3] == '501': # workaround for REST not supported error fp, retrlen = self.retrfile(file, type) fp = RangeableFileObject(fp, (rest,'')) return (fp, retrlen) elif str(reason)[:3] != '550': - raise IOError, ('ftp error', reason), sys.exc_info()[2] + raise IOError('ftp error', reason).with_traceback(sys.exc_info()[2]) if not conn: # Set transfer mode to ASCII! self.ftp.voidcmd('TYPE A') @@ -400,17 +408,17 @@ class ftpwrapper(urllib.ftpwrapper): _rangere = None def range_header_to_tuple(range_header): """Get a (firstbyte,lastbyte) tuple from a Range header value. - + Range headers have the form "bytes=<firstbyte>-<lastbyte>". This function pulls the firstbyte and lastbyte values and returns a (firstbyte,lastbyte) tuple. If lastbyte is not specified in the header value, it is returned as an empty string in the tuple. - + Return None if range_header is None - Return () if range_header does not conform to the range spec + Return () if range_header does not conform to the range spec pattern. - + """ global _rangere if range_header is None: return None @@ -418,9 +426,9 @@ def range_header_to_tuple(range_header): import re _rangere = re.compile(r'^bytes=(\d{1,})-(\d*)') match = _rangere.match(range_header) - if match: + if match: tup = range_tuple_normalize(match.group(1,2)) - if tup and tup[1]: + if tup and tup[1]: tup = (tup[0],tup[1]+1) return tup return () @@ -433,16 +441,16 @@ def range_tuple_to_header(range_tup): if range_tup is None: return None range_tup = range_tuple_normalize(range_tup) if range_tup: - if range_tup[1]: + if range_tup[1]: range_tup = (range_tup[0],range_tup[1] - 1) return 'bytes=%s-%s' % range_tup - + def range_tuple_normalize(range_tup): """Normalize a (first_byte,last_byte) range tuple. Return a tuple whose first element is guaranteed to be an int - and whose second element will be '' (meaning: the last byte) or + and whose second element will be '' (meaning: the last byte) or an int. Finally, return None if the normalized tuple == (0,'') - as that is equivelant to retrieving the entire file. + as that is equivalent to retrieving the entire file. """ if range_tup is None: return None # handle first byte @@ -452,12 +460,13 @@ def range_tuple_normalize(range_tup): # handle last byte try: lb = range_tup[1] except IndexError: lb = '' - else: + else: if lb is None: lb = '' elif lb != '': lb = int(lb) # check if range is over the entire file if (fb,lb) == (0,''): return None # check that the range is valid - if lb < fb: raise RangeError('Invalid byte range: %s-%s' % (fb,lb)) + if lb != '' and fb >= lb: + raise RangeError(9, 'Invalid byte range: %s-%s' % (fb,lb)) return (fb,lb) diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py index e090e90..b72d089 100644..100755 --- a/urlgrabber/grabber.py +++ b/urlgrabber/grabber.py @@ -9,15 +9,17 @@ # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber # Copyright 2002-2004 Michael D. 
Stenner, Ryan Tomayko # Copyright 2009 Red Hat inc, pycurl code written by Seth Vidal +from __future__ import print_function + """A high-level cross-protocol url-grabber. GENERAL ARGUMENTS (kwargs) @@ -35,7 +37,7 @@ GENERAL ARGUMENTS (kwargs) close_connection = 0 [0|1] tells URLGrabber to close the connection after a file has been - transfered. This is ignored unless the download happens with the + transferred. This is ignored unless the download happens with the http keepalive handler (keepalive=1). Otherwise, the connection is left open for further use. The module level default for this option is 0 (keepalive connections will not be closed). @@ -49,13 +51,46 @@ GENERAL ARGUMENTS (kwargs) progress_obj = None a class instance that supports the following methods: - po.start(filename, url, basename, length, text) + po.start(filename, url, basename, size, now, text) # length will be None if unknown po.update(read) # read == bytes read so far po.end() + multi_progress_obj = None + + a class instance that supports the following methods: + mo.start(total_files, total_size) + mo.newMeter() => meter + mo.removeMeter(meter) + mo.end() + + The 'meter' object is similar to progress_obj, but multiple + instances may be created and updated at the same time. + + When downloading multiple files in parallel and multi_progress_obj + is None progress_obj is used in compatibility mode: finished files + are shown but there's no in-progress display. + + curl_obj = None + + a pycurl.Curl instance to be used instead of the default module-level + instance. + + Note that you don't have to configure the passed instance in any way; + urlgrabber will do all the necessary work. + + This option exists primarily to allow using urlgrabber from multiple + threads in your application, in which case you would want to instantiate a + fresh Curl object for each thread, to avoid race conditions. See the curl + documentation on thread safety for more information: + https://curl.haxx.se/libcurl/c/threadsafe.html + + Note that connection reuse (keepalive=1) is limited to the Curl instance it + was enabled on so if you're using multiple instances in your application, + connections won't be shared among them. + text = None - + specifies alternative text to be passed to the progress meter object. If not given, the default progress meter will use the basename of the file. @@ -68,14 +103,20 @@ GENERAL ARGUMENTS (kwargs) (which can be set on default_grabber.throttle) is used. See BANDWIDTH THROTTLING for more information. - timeout = None + timeout = 300 + + a positive integer expressing the number of seconds to wait before + timing out attempts to connect to a server. If the value is None + or 0, connection attempts will not time out. The timeout is passed + to the underlying pycurl object as its CONNECTTIMEOUT option, see + the curl documentation on CURLOPT_CONNECTTIMEOUT for more information. + http://curl.haxx.se/libcurl/c/curl_easy_setopt.html#CURLOPTCONNECTTIMEOUT - a positive float expressing the number of seconds to wait for socket - operations. If the value is None or 0.0, socket operations will block - forever. Setting this option causes urlgrabber to call the settimeout - method on the Socket object used for the request. See the Python - documentation on settimeout for more information. - http://www.python.org/doc/current/lib/socket-objects.html + minrate = 1000 + + This sets the low speed threshold in bytes per second. 
If the server + is sending data slower than this for at least `timeout' seconds, the + library aborts the connection. bandwidth = 0 @@ -91,11 +132,11 @@ GENERAL ARGUMENTS (kwargs) range to retrieve. Either or both of the values may set to None. If first_byte is None, byte offset 0 is assumed. If last_byte is None, the last byte available is assumed. Note that - the range specification is python-like in that (0,10) will yeild + the range specification is python-like in that (0,10) will yield the first 10 bytes of the file. If set to None, no range will be used. - + reget = None [None|'simple'|'check_timestamp'] whether to attempt to reget a partially-downloaded file. Reget @@ -143,8 +184,18 @@ GENERAL ARGUMENTS (kwargs) note that proxy authentication information may be provided using normal URL constructs: proxies={ 'http' : 'http://user:host@foo:3128' } - Lastly, if proxies is None, the default environment settings will - be used. + + libproxy = False + + Use the libproxy module (if installed) to find proxies. + The libproxy code is only used if the proxies dictionary + does not provide any proxies. + + no_cache = False + + When True, server-side cache will be disabled for http and https + requests. This is equivalent to setting + http_headers = (('Pragma', 'no-cache'),) prefix = None @@ -175,7 +226,7 @@ GENERAL ARGUMENTS (kwargs) option. Note that python 2.2 handles the case of these badly and if you do not use the proper case (shown here), your values will be overridden with the defaults. - + urlparser = URLParser() The URLParser class handles pre-processing of URLs, including @@ -198,6 +249,12 @@ GENERAL ARGUMENTS (kwargs) control, you should probably subclass URLParser and pass it in via the 'urlparser' option. + username = None + username to use for simple http auth - is automatically quoted for special characters + + password = None + password to use for simple http auth - is automatically quoted for special characters + ssl_ca_cert = None this option can be used if M2Crypto is available and will be @@ -209,45 +266,84 @@ GENERAL ARGUMENTS (kwargs) ssl_context = None No-op when using the curl backend (default) - - self.ssl_verify_peer = True + + ssl_verify_peer = True Check the server's certificate to make sure it is valid with what our CA validates - - self.ssl_verify_host = True + + ssl_verify_host = True Check the server's hostname to make sure it matches the certificate DN - self.ssl_key = None + ssl_key = None Path to the key the client should use to connect/authenticate with - self.ssl_key_type = 'PEM' + ssl_key_type = 'PEM' PEM or DER - format of key - - self.ssl_cert = None + + ssl_cert = None Path to the ssl certificate the client should use to to authenticate with - self.ssl_cert_type = 'PEM' + ssl_cert_type = 'PEM' PEM or DER - format of certificate - - self.ssl_key_pass = None + + ssl_key_pass = None password to access the ssl_key - - self.size = None - size (in bytes) or Maximum size of the thing being downloaded. + size = None + + size (in bytes) or Maximum size of the thing being downloaded. This is mostly to keep us from exploding with an endless datastream - - self.max_header_size = 2097152 + + max_header_size = 2097152 Maximum size (in bytes) of the headers. - + + ip_resolve = 'whatever' + + What type of name to IP resolving to use, default is to do both IPV4 and + IPV6. + + async_ = (key, limit) + + When this option is set, the urlgrab() is not processed immediately + but queued. 
parallel_wait() then processes grabs in parallel, limiting + the numer of connections in each 'key' group to at most 'limit'. + + max_connections + + The global connection limit. + + timedhosts + + The filename of the host download statistics. If defined, urlgrabber + will update the stats at the end of every download. At the end of + parallel_wait(), the updated stats are saved. If synchronous grabs + are used, you should call th_save(). + + default_speed, half_life + + These options only affect the async mirror selection code. + The default_speed option sets the speed estimate for mirrors + we have never downloaded from, and defaults to 1 MBps. + + The speed estimate also drifts exponentially from the speed + actually measured to the default speed, with default + period of 30 days. + + ftp_disable_epsv = False + + False, True + + This options disables Extended Passive Mode (the EPSV command) + which does not work correctly on some buggy ftp servers. + RETRY RELATED ARGUMENTS @@ -271,7 +367,7 @@ RETRY RELATED ARGUMENTS retrycodes = urlgrabber.grabber.URLGrabberOptions().retrycodes if 12 not in retrycodes: retrycodes.append(12) - + checkfunc = None a function to do additional checks. This defaults to None, which @@ -302,7 +398,7 @@ RETRY RELATED ARGUMENTS function(obj, 'arg1', 2, kwarg=3) # obj.filename = '/tmp/stuff' # obj.url = 'http://foo.com/stuff' - + NOTE: both the "args" tuple and "kwargs" dict must be present if you use this syntax, but either (or both) can be empty. @@ -313,10 +409,11 @@ RETRY RELATED ARGUMENTS identical to checkfunc, except for the attributes defined in the CallbackObject instance. The attributes for failure_callback are: - exception = the raised exception - url = the url we're trying to fetch - tries = the number of tries so far (including this one) - retry = the value of the retry option + exception = the raised exception + url = the url we're trying to fetch + tries = the number of tries so far (including this one) + retry = the value of the retry option + retry_no_cache = the value of the retry_no_cache option The callback is present primarily to inform the calling program of the failure, but if it raises an exception (including the one it's @@ -328,6 +425,15 @@ RETRY RELATED ARGUMENTS but it cannot (without severe trickiness) prevent the exception from being raised. + failfunc = None + + The callback that gets called when urlgrab request fails. + If defined, urlgrab() calls it instead of raising URLGrabError. + Callback syntax is identical to failure_callback. + + Contrary to failure_callback, it's called only once. It's primary + purpose is to use urlgrab() without a try/except block. + interrupt_callback = None This callback is called if KeyboardInterrupt is received at any @@ -351,7 +457,20 @@ RETRY RELATED ARGUMENTS This callback is very similar to failure_callback. They are passed the same arguments, so you could use the same function for both. - + + retry_no_cache = False + + When True, automatically enable no_cache for future retries if + checkfunc performs an unsuccessful check. + + This option is useful if your application expects a set of files + from the same server to form an atomic unit and you write your + checkfunc to ensure each file being downloaded belongs to such a + unit. If transparent proxy caching is in effect, the files can + become out-of-sync, disrupting the atomicity. Enabling this option + will prevent that, while ensuring that you still enjoy the benefits + of caching when possible. 
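
The retry-related options above are easiest to see in a short usage sketch. Everything specific here (URL, filename, the emptiness check) is invented for illustration; the keyword arguments are the ones documented in this section:

# Hedged usage sketch of retry, checkfunc, failfunc and retry_no_cache.
import os
from urlgrabber.grabber import urlgrab, URLGrabError

def check_nonempty(cb_obj):
    # urlgrab sets cb_obj.filename and cb_obj.url before calling checkfunc
    if os.path.getsize(cb_obj.filename) == 0:
        # a negative errno means "retry"; with retry_no_cache=True the
        # retries are made with server-side caching disabled
        raise URLGrabError(-1, 'empty download, retrying')

def on_failure(cb_obj):
    # called once, instead of raising, after the retries are exhausted
    print('download failed: %s' % cb_obj.exception)

urlgrab('http://example.com/some/file',   # hypothetical URL
        filename='somefile',
        retry=3,
        checkfunc=check_nonempty,
        failfunc=on_failure,
        retry_no_cache=True)
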
+ BANDWIDTH THROTTLING urlgrabber supports throttling via two values: throttle and @@ -368,6 +487,11 @@ BANDWIDTH THROTTLING is a float and bandwidth == 0, throttling is disabled. If None, the module-level default (which can be set with set_bandwidth) is used. + Note that when multiple downloads run simultaneously (multiprocessing + or the parallel urlgrab() feature is used) the total bandwidth might + exceed the throttle limit. You may want to also set max_connections=1 + or scale your throttle option down accordingly. + THROTTLING EXAMPLES: Lets say you have a 100 Mbps connection. This is (about) 10^8 bits @@ -411,25 +535,64 @@ BANDWIDTH THROTTLING """ - - import os import sys -import urlparse import time import string import urllib -import urllib2 -import mimetools -import thread import types import stat import pycurl from ftplib import parse150 -from StringIO import StringIO -from httplib import HTTPException -import socket -from byterange import range_tuple_normalize, range_tuple_to_header, RangeError +import socket, select, fcntl +from io import BytesIO +import numbers + +try: + import urllib.parse as urlparse + urlquote, urlunquote = urlparse.quote, urlparse.unquote + from urllib.request import HTTPError, url2pathname, pathname2url +except ImportError: + import urlparse + from urllib2 import HTTPError + urlquote, urlunquote = urllib.quote, urllib.unquote + from urllib import url2pathname, pathname2url + +try: + from http.client import responses, HTTPException +except ImportError: + from httplib import responses, HTTPException + +if sys.version_info >= (3,): + # We do an explicit version check here because because python2 + # also has an io module with StringIO, but it is incompatible, + # and returns str instead of unicode somewhere. + from io import StringIO +else: + from cStringIO import StringIO + +from six import text_type, string_types + +from .byterange import range_tuple_normalize, range_tuple_to_header, RangeError + +try: + import xattr + if not hasattr(xattr, 'set'): + xattr = None # This is a "newer" API. +except ImportError: + xattr = None + +def _bytes_repr(s): + "A wrapper to avoid the b'' that python3 insists on when printing bytes" + if isinstance(s, string_types): + return s + else: + return repr(s)[2:-1] + +def _urlunquote_convert(s): + if not isinstance(s, text_type): + s = s.decode('utf8') + return urlunquote(s) ######################################################################## # MODULE INITIALIZATION @@ -439,6 +602,12 @@ try: except: __version__ = '???' +try: + # this part isn't going to do much - need to talk to gettext + from i18n import _ +except ImportError as msg: + def _(st): return st + ######################################################################## # functions for debugging output. These functions are here because they # are also part of the module initialization. @@ -468,7 +637,7 @@ def _init_default_logger(logspec=None): the form URLGRABBER_DEBUG=level,filename - + where "level" can be either an integer or a log level from the logging module (DEBUG, INFO, etc). If the integer is zero or less, logging will be disabled. Filename is the filename where @@ -481,8 +650,8 @@ def _init_default_logger(logspec=None): URLGRABBER_DEBUG=1,debug.txt # log everything to debug.txt URLGRABBER_DEBUG=WARNING,- # log warning and higher to stdout URLGRABBER_DEBUG=INFO # log info and higher to stderr - - This funtion is called during module initialization. It is not + + This function is called during module initialization. 
It is not intended to be called from outside. The only reason it is a function at all is to keep the module-level namespace tidy and to collect the code into a nice block.''' @@ -492,8 +661,11 @@ def _init_default_logger(logspec=None): logspec = os.environ['URLGRABBER_DEBUG'] dbinfo = logspec.split(',') import logging - level = logging._levelNames.get(dbinfo[0], None) - if level is None: level = int(dbinfo[0]) + if sys.version_info.major == 2: + level = logging._levelNames.get(dbinfo[0], None) + else: + level = logging.getLevelName(dbinfo[0]) + if level is None or not isinstance(level, int): level = int(dbinfo[0]) if level < 1: raise ValueError() formatter = logging.Formatter('%(asctime)s %(message)s') @@ -504,6 +676,7 @@ def _init_default_logger(logspec=None): else: handler = logging.FileHandler(filename) handler.setFormatter(formatter) DBOBJ = logging.getLogger('urlgrabber') + DBOBJ.propagate = False DBOBJ.addHandler(handler) DBOBJ.setLevel(level) except (KeyError, ImportError, ValueError): @@ -512,9 +685,9 @@ def _init_default_logger(logspec=None): def _log_package_state(): if not DEBUG: return - DEBUG.info('urlgrabber version = %s' % __version__) - DEBUG.info('trans function "_" = %s' % _) - + DEBUG.debug('urlgrabber version = %s' % __version__) + DEBUG.debug('trans function "_" = %s' % _) + _init_default_logger() _log_package_state() @@ -527,6 +700,29 @@ def _(st): # END MODULE INITIALIZATION ######################################################################## +######################################################################## +# UTILITY FUNCTIONS +######################################################################## + +# These functions are meant to be utilities for the urlgrabber library to use. + +def _to_utf8(obj, errors='replace'): + '''convert 'unicode' to an encoded utf-8 byte string ''' + # stolen from yum.i18n + if isinstance(obj, text_type): + obj = obj.encode('utf-8', errors) + return obj + +def exception2msg(e): + try: + return str(e) + except UnicodeEncodeError: + # always use byte strings + return text_type(e).encode('utf8') + +######################################################################## +# END UTILITY FUNCTIONS +######################################################################## class URLGrabError(IOError): @@ -551,7 +747,7 @@ class URLGrabError(IOError): 14 - HTTPError (includes .code and .exception attributes) 15 - user abort 16 - error writing to local file - + MirrorGroup error codes (256 -- 511) 256 - No more mirrors left to try @@ -562,7 +758,7 @@ class URLGrabError(IOError): -1 - retry the download, unknown reason Note: to test which group a code is in, you can simply do integer - division by 256: e.errno / 256 + division by 256: e.errno // 256 Negative codes are reserved for use by functions passed in to retrygrab with checkfunc. The value -1 is built in as a generic @@ -606,7 +802,7 @@ def urlgrab(url, filename=None, **kwargs): If filename is none, the basename of the url is used. urlgrab returns the filename of the local file, which may be different from the passed-in filename if the copy_local kwarg == 0. - + See module documentation for a description of possible kwargs. """ return default_grabber.urlgrab(url, filename, **kwargs) @@ -616,7 +812,7 @@ def urlopen(url, **kwargs): If a progress object or throttle specifications exist, then a special file object will be returned that supports them. The file object can be treated like any other file object. - + See module documentation for a description of possible kwargs. 
""" return default_grabber.urlopen(url, **kwargs) @@ -626,7 +822,7 @@ def urlread(url, limit=None, **kwargs): If the limit is exceeded, an exception will be thrown. Note that urlread is NOT intended to be used as a way of saying "I want the first N bytes" but rather 'read the whole file into memory, but don't use too much' - + See module documentation for a description of possible kwargs. """ return default_grabber.urlread(url, limit, **kwargs) @@ -662,37 +858,41 @@ class URLParser: opts.quote = 0 --> do not quote it opts.quote = None --> guess """ + url = _to_utf8(url) quote = opts.quote - + if opts.prefix: url = self.add_prefix(url, opts.prefix) - + parts = urlparse.urlparse(url) (scheme, host, path, parm, query, frag) = parts if not scheme or (len(scheme) == 1 and scheme in string.letters): # if a scheme isn't specified, we guess that it's "file:" - if url[0] not in '/\\': url = os.path.abspath(url) - url = 'file:' + urllib.pathname2url(url) + if url[0] not in b'/\\': url = os.path.abspath(url) + pathname = pathname2url(url) + if not isinstance(pathname, bytes): + pathname = pathname.encode('utf8') + url = b'file:' + pathname parts = urlparse.urlparse(url) quote = 0 # pathname2url quotes, so we won't do it again - - if scheme in ['http', 'https']: + + if scheme in [b'http', b'https']: parts = self.process_http(parts, url) - + if quote is None: quote = self.guess_should_quote(parts) if quote: parts = self.quote(parts) - + url = urlparse.urlunparse(parts) return url, parts def add_prefix(self, url, prefix): - if prefix[-1] == '/' or url[0] == '/': + if prefix.endswith(b'/') or url.startswith(b'/'): url = prefix + url else: - url = prefix + '/' + url + url = prefix + b'/' + url return url def process_http(self, parts, url): @@ -709,8 +909,10 @@ class URLParser: passing into urlgrabber. """ (scheme, host, path, parm, query, frag) = parts - path = urllib.quote(path) - return (scheme, host, path, parm, query, frag) + newpath = urlquote(path, safe='/$') + if not isinstance(path, text_type) and isinstance(newpath, text_type): + newpath = newpath.encode('utf8') + return (scheme, host, newpath, parm, query, frag) hexvals = '0123456789ABCDEF' def guess_should_quote(self, parts): @@ -724,9 +926,11 @@ class URLParser: else -> 1 """ (scheme, host, path, parm, query, frag) = parts + if not isinstance(path, text_type): + path = path.decode('utf8') if ' ' in path: return 1 - ind = string.find(path, '%') + ind = path.find('%') if ind > -1: while ind > -1: if len(path) < ind+3: @@ -735,10 +939,10 @@ class URLParser: if code[0] not in self.hexvals or \ code[1] not in self.hexvals: return 1 - ind = string.find(path, '%', ind+1) + ind = path.find('%', ind+1) return 0 return 1 - + class URLGrabberOptions: """Class to ease kwargs handling.""" @@ -751,70 +955,116 @@ class URLGrabberOptions: if delegate is None: self._set_defaults() self._set_attributes(**kwargs) - + def __getattr__(self, name): if self.delegate and hasattr(self.delegate, name): return getattr(self.delegate, name) - raise AttributeError, name - + raise AttributeError(name) + def raw_throttle(self): - """Calculate raw throttle value from throttle and bandwidth + """Calculate raw throttle value from throttle and bandwidth values. """ - if self.throttle <= 0: + if self.throttle <= 0: return 0 - elif type(self.throttle) == type(0): + elif isinstance(self.throttle, int): return float(self.throttle) else: # throttle is a float return self.bandwidth * self.throttle - + + def find_proxy(self, url, scheme): + """Find the proxy to use for this URL. 
+ Use the proxies dictionary first, then libproxy. + """ + self.proxy = None + if scheme not in ('ftp', 'http', 'https'): + return + + if self.proxies: + proxy = self.proxies.get(scheme) + if proxy is None: + if scheme == 'http': + proxy = self.proxies.get('https') + elif scheme == 'https': + proxy = self.proxies.get('http') + if proxy == '_none_': + proxy = '' + self.proxy = proxy + return + + if self.libproxy: + global _libproxy_cache + if _libproxy_cache is None: + try: + import libproxy + _libproxy_cache = libproxy.ProxyFactory() + except: + _libproxy_cache = False + if _libproxy_cache: + for proxy in _libproxy_cache.getProxies(url): + if proxy.startswith('http://'): + if DEBUG: DEBUG.info('using proxy "%s" for url %s' % (proxy, url)) + self.proxy = proxy + break + def derive(self, **kwargs): """Create a derived URLGrabberOptions instance. This method creates a new instance and overrides the options specified in kwargs. """ return URLGrabberOptions(delegate=self, **kwargs) - + def _set_attributes(self, **kwargs): """Update object attributes with those provided in kwargs.""" self.__dict__.update(kwargs) - if kwargs.has_key('range'): + if 'range' in kwargs: # normalize the supplied range value self.range = range_tuple_normalize(self.range) + if 'async' in kwargs: + self.async_ = self.__dict__.pop('async') if not self.reget in [None, 'simple', 'check_timestamp']: - raise URLGrabError(11, _('Illegal reget mode: %s') \ - % (self.reget, )) + raise URLGrabError(11, _('Illegal reget mode: %s') + % (self.reget,)) def _set_defaults(self): - """Set all options to their default values. + """Set all options to their default values. When adding new options, make sure a default is provided here. """ self.progress_obj = None + self.multi_progress_obj = None + self.curl_obj = None self.throttle = 1.0 self.bandwidth = 0 self.retry = None self.retrycodes = [-1,2,4,5,6,7] self.checkfunc = None + self.failfunc = _do_raise self.copy_local = 0 self.close_connection = 0 self.range = None self.user_agent = 'urlgrabber/%s' % __version__ + self.ip_resolve = None self.keepalive = 1 self.proxies = None + self.libproxy = False + self.proxy = None self.reget = None self.failure_callback = None self.interrupt_callback = None self.prefix = None self.opener = None self.cache_openers = True - self.timeout = None + self.timeout = 300 + self.minrate = None self.text = None self.http_headers = None self.ftp_headers = None self.data = None self.urlparser = URLParser() self.quote = None + self.username = None + self.password = None self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb self.ssl_context = None # no-op in pycurl self.ssl_verify_peer = True # check peer's cert for authenticityb @@ -827,10 +1077,19 @@ class URLGrabberOptions: self.size = None # if we know how big the thing we're getting is going # to be. 
this is ultimately a MAXIMUM size for the file self.max_header_size = 2097152 #2mb seems reasonable for maximum header size - + self.async_ = None # blocking by default + self.mirror_group = None + self.max_connections = 5 + self.timedhosts = None + self.half_life = 30*24*60*60 # 30 days + self.default_speed = 500e3 # 500 kBps + self.ftp_disable_epsv = False + self.no_cache = False + self.retry_no_cache = False + def __repr__(self): return self.format() - + def format(self, indent=' '): keys = self.__dict__.keys() if self.delegate is not None: @@ -838,29 +1097,39 @@ class URLGrabberOptions: keys.sort() s = '{\n' for k in keys: - s = s + indent + '%-15s: %s,\n' % \ - (repr(k), repr(self.__dict__[k])) + s = s + indent + '%-15r: %r,\n' % (k, self.__dict__[k]) if self.delegate: df = self.delegate.format(indent + ' ') s = s + indent + '%-15s: %s\n' % ("'delegate'", df) s = s + indent + '}' return s -class URLGrabber: +def _do_raise(obj): + raise obj.exception + +def _run_callback(cb, obj): + if not cb: + return + if callable(cb): + return cb(obj) + cb, arg, karg = cb + return cb(obj, *arg, **karg) + +class URLGrabber(object): """Provides easy opening of URLs with a variety of options. - + All options are specified as kwargs. Options may be specified when the class is created and may be overridden on a per request basis. - + New objects inherit default values from default_grabber. """ - + def __init__(self, **kwargs): self.opts = URLGrabberOptions(**kwargs) - + def _retry(self, opts, func, *args): tries = 0 - while 1: + while True: # there are only two ways out of this loop. The second has # several "sub-ways" # 1) via the return in the "try" block @@ -872,122 +1141,142 @@ class URLGrabber: # beware of infinite loops :) tries = tries + 1 exception = None - retrycode = None callback = None if DEBUG: DEBUG.info('attempt %i/%s: %s', tries, opts.retry, args[0]) try: - r = apply(func, (opts,) + args, {}) + r = func(opts, *args) if DEBUG: DEBUG.info('success') return r - except URLGrabError, e: + except URLGrabError as e: exception = e callback = opts.failure_callback - retrycode = e.errno - except KeyboardInterrupt, e: + except KeyboardInterrupt as e: exception = e callback = opts.interrupt_callback + if not callback: + raise if DEBUG: DEBUG.info('exception: %s', exception) if callback: if DEBUG: DEBUG.info('calling callback: %s', callback) - cb_func, cb_args, cb_kwargs = self._make_callback(callback) obj = CallbackObject(exception=exception, url=args[0], - tries=tries, retry=opts.retry) - cb_func(obj, *cb_args, **cb_kwargs) + tries=tries, retry=opts.retry, + retry_no_cache=opts.retry_no_cache) + _run_callback(callback, obj) if (opts.retry is None) or (tries == opts.retry): if DEBUG: DEBUG.info('retries exceeded, re-raising') - raise + raise exception + retrycode = getattr(exception, 'errno', None) if (retrycode is not None) and (retrycode not in opts.retrycodes): if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising', retrycode, opts.retrycodes) - raise - - def urlopen(self, url, **kwargs): + raise exception + if retrycode is not None and retrycode < 0 and opts.retry_no_cache: + opts.no_cache = True + + def urlopen(self, url, opts=None, **kwargs): """open the url and return a file object - If a progress object or throttle value specified when this - object was created, then a special file object will be - returned that supports them. 
The file object can be treated + If a progress object or throttle value specified when this + object was created, then a special file object will be + returned that supports them. The file object can be treated like any other file object. """ - opts = self.opts.derive(**kwargs) - if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) - (url,parts) = opts.urlparser.parse(url, opts) + url = _to_utf8(url) + opts = (opts or self.opts).derive(**kwargs) + if DEBUG: DEBUG.debug('combined options: %r' % (opts,)) + (url,parts) = opts.urlparser.parse(url, opts) + opts.find_proxy(url, parts[0]) def retryfunc(opts, url): return PyCurlFileObject(url, filename=None, opts=opts) return self._retry(opts, retryfunc, url) - - def urlgrab(self, url, filename=None, **kwargs): + + def urlgrab(self, url, filename=None, opts=None, **kwargs): """grab the file at <url> and make a local copy at <filename> If filename is none, the basename of the url is used. - urlgrab returns the filename of the local file, which may be + urlgrab returns the filename of the local file, which may be different from the passed-in filename if copy_local == 0. """ - opts = self.opts.derive(**kwargs) - if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) - (url,parts) = opts.urlparser.parse(url, opts) + url = _to_utf8(url) + opts = (opts or self.opts).derive(**kwargs) + if DEBUG: DEBUG.debug('combined options: %r' % (opts,)) + (url,parts) = opts.urlparser.parse(url, opts) (scheme, host, path, parm, query, frag) = parts + opts.find_proxy(url, scheme) if filename is None: - filename = os.path.basename( urllib.unquote(path) ) + filename = os.path.basename(_urlunquote_convert(path)) + if not filename: + # This is better than nothing. + filename = 'index.html' if scheme == 'file' and not opts.copy_local: - # just return the name of the local file - don't make a + # just return the name of the local file - don't make a # copy currently - path = urllib.url2pathname(path) + path = url2pathname(path) if host: path = os.path.normpath('//' + host + path) if not os.path.exists(path): - err = URLGrabError(2, + err = URLGrabError(2, _('Local file does not exist: %s') % (path, )) err.url = url raise err elif not os.path.isfile(path): - err = URLGrabError(3, + err = URLGrabError(3, _('Not a normal file: %s') % (path, )) err.url = url raise err elif not opts.range: if not opts.checkfunc is None: - cb_func, cb_args, cb_kwargs = \ - self._make_callback(opts.checkfunc) - obj = CallbackObject() - obj.filename = path - obj.url = url - apply(cb_func, (obj, )+cb_args, cb_kwargs) + obj = CallbackObject(filename=path, url=url) + _run_callback(opts.checkfunc, obj) return path - + + if opts.async_: + opts.url = url + opts.filename = filename + opts.size = int(opts.size or 0) + _async_queue.append(opts) + return filename + def retryfunc(opts, url, filename): fo = PyCurlFileObject(url, filename, opts) try: fo._do_grab() + if fo._tm_last: + dlsz = fo._tm_last[0] - fo._tm_first[0] + dltm = fo._tm_last[1] - fo._tm_first[1] + _TH.update(url, dlsz, dltm, None) if not opts.checkfunc is None: - cb_func, cb_args, cb_kwargs = \ - self._make_callback(opts.checkfunc) - obj = CallbackObject() - obj.filename = filename - obj.url = url - apply(cb_func, (obj, )+cb_args, cb_kwargs) + obj = CallbackObject(filename=filename, url=url) + _run_callback(opts.checkfunc, obj) finally: fo.close() return filename - - return self._retry(opts, retryfunc, url, filename) - - def urlread(self, url, limit=None, **kwargs): + + try: + return self._retry(opts, retryfunc, url, filename) + 
except URLGrabError as e: + _TH.update(url, 0, 0, e) + opts.exception = e + return _run_callback(opts.failfunc, opts) + + def urlread(self, url, limit=None, opts=None, **kwargs): """read the url into a string, up to 'limit' bytes If the limit is exceeded, an exception will be thrown. Note - that urlread is NOT intended to be used as a way of saying - "I want the first N bytes" but rather 'read the whole file + that urlread is NOT intended to be used as a way of saying + "I want the first N bytes" but rather 'read the whole file into memory, but don't use too much' """ - opts = self.opts.derive(**kwargs) - if DEBUG: DEBUG.debug('combined options: %s' % repr(opts)) - (url,parts) = opts.urlparser.parse(url, opts) + url = _to_utf8(url) + opts = (opts or self.opts).derive(**kwargs) + if DEBUG: DEBUG.debug('combined options: %r' % (opts,)) + (url,parts) = opts.urlparser.parse(url, opts) + opts.find_proxy(url, parts[0]) if limit is not None: limit = limit + 1 - + def retryfunc(opts, url, limit): fo = PyCurlFileObject(url, filename=None, opts=opts) s = '' @@ -1000,26 +1289,23 @@ class URLGrabber: else: s = fo.read(limit) if not opts.checkfunc is None: - cb_func, cb_args, cb_kwargs = \ - self._make_callback(opts.checkfunc) - obj = CallbackObject() - obj.data = s - obj.url = url - apply(cb_func, (obj, )+cb_args, cb_kwargs) + obj = CallbackObject(data=s, url=url) + _run_callback(opts.checkfunc, obj) finally: fo.close() return s - + s = self._retry(opts, retryfunc, url, limit) if limit and len(s) > limit: - err = URLGrabError(8, + err = URLGrabError(8, _('Exceeded limit (%i): %s') % (limit, url)) err.url = url raise err return s - + def _make_callback(self, callback_obj): + # not used, left for compatibility if callable(callback_obj): return callback_obj, (), {} else: @@ -1030,10 +1316,10 @@ class URLGrabber: default_grabber = URLGrabber() -class PyCurlFileObject(): +class PyCurlFileObject(object): def __init__(self, url, filename, opts): self.fo = None - self._hdr_dump = '' + self._hdr_dump = b'' self._parsed_hdr = None self.url = url self.scheme = urlparse.urlsplit(self.url)[0] @@ -1042,20 +1328,24 @@ class PyCurlFileObject(): self.reget_time = None self.opts = opts if self.opts.reget == 'check_timestamp': - raise NotImplementedError, "check_timestamp regets are not implemented in this ver of urlgrabber. Please report this." + raise NotImplementedError("check_timestamp regets are not implemented in this ver of urlgrabber. Please report this.") self._complete = False - self._rbuf = '' + self._rbuf = b'' self._rbufsize = 1024*8 self._ttime = time.time() self._tsize = 0 self._amount_read = 0 self._reget_length = 0 + self._range = None self._prog_running = False self._error = (None, None) - self.size = None + self.size = 0 + self._hdr_ended = False + self._tm_first = None + self._tm_last = None self._do_open() - - + + def __getattr__(self, name): """This effectively allows us to wrap at the instance level. 
Any attribute not found in _this_ object will be searched for @@ -1063,48 +1353,93 @@ class PyCurlFileObject(): if hasattr(self.fo, name): return getattr(self.fo, name) - raise AttributeError, name + raise AttributeError(name) def _retrieve(self, buf): try: + tm = self._amount_read + len(buf), time.time() + if self._tm_first is None: + self._tm_first = tm + else: + self._tm_last = tm + if not self._prog_running: if self.opts.progress_obj: size = self.size + self._reget_length - self.opts.progress_obj.start(self._prog_reportname, - urllib.unquote(self.url), - self._prog_basename, + self.opts.progress_obj.start(self._prog_reportname, + _urlunquote_convert(self.url), + self._prog_basename, size=size, text=self.opts.text) self._prog_running = True self.opts.progress_obj.update(self._amount_read) self._amount_read += len(buf) - self.fo.write(buf) + try: + if self._range: + # client-side ranges + pos = self._amount_read - len(buf) + start = self._range[0] - pos + stop = self._range[1] - pos + if start < len(buf) and stop > 0: + self.fo.write(buf[max(start, 0):stop]) + else: + self.fo.write(buf) + except IOError as e: + self._cb_error = URLGrabError(16, exception2msg(e)) + return -1 return len(buf) except KeyboardInterrupt: return -1 - + def _hdr_retrieve(self, buf): - if self._over_max_size(cur=len(self._hdr_dump), + if self._hdr_ended: + self._hdr_dump = b'' + self.size = 0 + self._hdr_ended = False + + if self._over_max_size(cur=len(self._hdr_dump), max_size=self.opts.max_header_size): - return -1 + return -1 try: - self._hdr_dump += buf # we have to get the size before we do the progress obj start # but we can't do that w/o making it do 2 connects, which sucks # so we cheat and stuff it in here in the hdr_retrieve - if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1: - length = buf.split(':')[1] - self.size = int(length) - elif self.scheme in ['ftp']: + if self.scheme in [b'http', b'https']: + if buf.lower().find(b'content-length:') != -1: + length = buf.split(b':')[1] + self.size = int(length) + elif (self.append or self.opts.range) and not self._hdr_dump and b' 200 OK ' in buf: + # reget was attempted but server sends it all + # undo what we did in _build_range() + self.append = False + self.reget_time = None + self._amount_read = 0 + self._reget_length = 0 + self._range = self.opts.range + self.fo.truncate(0) + elif self.scheme in [b'ftp']: s = None - if buf.startswith('213 '): + if buf.startswith(b'213 '): s = buf[3:].strip() - elif buf.startswith('150 '): + if len(s) >= 14: + s = None # ignore MDTM responses + elif buf.startswith(b'150 '): s = parse150(buf) if s: self.size = int(s) - + + if buf.lower().find(b'location') != -1: + location = b':'.join(buf.split(b':')[1:]) + location = location.strip() + self.scheme = urlparse.urlsplit(location)[0] + self.url = location + + self._hdr_dump += buf + if len(self._hdr_dump) != 0 and buf == b'\r\n': + self._hdr_ended = True + if DEBUG: DEBUG.debug('header ended:') + return len(buf) except KeyboardInterrupt: return pycurl.READFUNC_ABORT @@ -1112,12 +1447,14 @@ class PyCurlFileObject(): def _return_hdr_obj(self): if self._parsed_hdr: return self._parsed_hdr - statusend = self._hdr_dump.find('\n') + statusend = self._hdr_dump.find(b'\n') + statusend += 1 # ridiculous as it may seem. 
hdrfp = StringIO() hdrfp.write(self._hdr_dump[statusend:]) - self._parsed_hdr = mimetools.Message(hdrfp) + hdrfp.seek(0) + self._parsed_hdr = email.message_from_string(hdrfp) return self._parsed_hdr - + hdr = property(_return_hdr_obj) http_code = property(fget= lambda self: self.curl_obj.getinfo(pycurl.RESPONSE_CODE)) @@ -1127,6 +1464,9 @@ class PyCurlFileObject(): if not opts: opts = self.opts + # keepalives + if not opts.keepalive: + self.curl_obj.setopt(pycurl.FORBID_REUSE, 1) # defaults we're always going to set self.curl_obj.setopt(pycurl.NOPROGRESS, False) @@ -1136,172 +1476,219 @@ class PyCurlFileObject(): self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update) self.curl_obj.setopt(pycurl.FAILONERROR, True) self.curl_obj.setopt(pycurl.OPT_FILETIME, True) - - if DEBUG: + self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True) + + if DEBUG and DEBUG.level <= 10: self.curl_obj.setopt(pycurl.VERBOSE, True) if opts.user_agent: self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent) - + if opts.ip_resolve: + # Default is: IPRESOLVE_WHATEVER + ipr = opts.ip_resolve.lower() + if ipr == 'whatever': # Do we need this? + self.curl_obj.setopt(pycurl.IPRESOLVE,pycurl.IPRESOLVE_WHATEVER) + if ipr == 'ipv4': + self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4) + if ipr == 'ipv6': + self.curl_obj.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V6) + # maybe to be options later self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True) self.curl_obj.setopt(pycurl.MAXREDIRS, 5) - + # timeouts timeout = 300 - if opts.timeout: - timeout = int(opts.timeout) - self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout) + if hasattr(opts, 'timeout'): + timeout = int(opts.timeout or 0) + self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout) + self.curl_obj.setopt(pycurl.LOW_SPEED_LIMIT, opts.minrate or 1000) + self.curl_obj.setopt(pycurl.LOW_SPEED_TIME, timeout) # ssl options - if self.scheme == 'https': + if self.scheme == b'https': if opts.ssl_ca_cert: # this may do ZERO with nss according to curl docs self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert) self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert) self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer) - self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host) + if opts.ssl_verify_host: # 1 is meaningless to curl + self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, 2) if opts.ssl_key: self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key) if opts.ssl_key_type: self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type) if opts.ssl_cert: self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert) - if opts.ssl_cert_type: + # if we have a client side cert - turn off reuse b/c nss is odd + self.curl_obj.setopt(pycurl.FORBID_REUSE, 1) + if opts.ssl_cert_type: self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type) if opts.ssl_key_pass: self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass) #headers: - if opts.http_headers and self.scheme in ('http', 'https'): + if self.scheme in (b'http', b'https'): headers = [] - for (tag, content) in opts.http_headers: - headers.append('%s:%s' % (tag, content)) - self.curl_obj.setopt(pycurl.HTTPHEADER, headers) + if opts.http_headers is not None: + for (tag, content) in opts.http_headers: + headers.append('%s:%s' % (tag, content)) + if opts.no_cache: + headers.append('Pragma:no-cache') + if headers: + self.curl_obj.setopt(pycurl.HTTPHEADER, headers) # ranges: if opts.range or opts.reget: range_str = self._build_range() if range_str: self.curl_obj.setopt(pycurl.RANGE, range_str) - + # throttle/bandwidth if 
hasattr(opts, 'raw_throttle') and opts.raw_throttle(): self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle())) - - # proxy settings - if opts.proxies: - for (scheme, proxy) in opts.proxies.items(): - if self.scheme in ('ftp'): # only set the ftp proxy for ftp items - if scheme not in ('ftp'): - continue - else: - if proxy == '_none_': proxy = "" - self.curl_obj.setopt(pycurl.PROXY, proxy) - elif self.scheme in ('http', 'https'): - if scheme not in ('http', 'https'): - continue - else: - if proxy == '_none_': proxy = "" - self.curl_obj.setopt(pycurl.PROXY, proxy) - - # FIXME username/password/auth settings + + # proxy + if opts.proxy is not None: + self.curl_obj.setopt(pycurl.PROXY, opts.proxy) + self.curl_obj.setopt(pycurl.PROXYAUTH, + # All but Kerberos. BZ 769254 + pycurl.HTTPAUTH_ANY - pycurl.HTTPAUTH_GSSNEGOTIATE) + + if opts.username and opts.password: + if self.scheme in (b'http', b'https'): + self.curl_obj.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_ANY) + + if opts.username and opts.password: + # apparently when applying them as curlopts they do not require quoting of any kind + userpwd = '%s:%s' % (opts.username, opts.password) + self.curl_obj.setopt(pycurl.USERPWD, userpwd) #posts - simple - expects the fields as they are if opts.data: self.curl_obj.setopt(pycurl.POST, True) - self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data)) - + self.curl_obj.setopt(pycurl.POSTFIELDS, _to_utf8(opts.data)) + + # ftp + if opts.ftp_disable_epsv: + self.curl_obj.setopt(pycurl.FTP_USE_EPSV, False) + # our url self.curl_obj.setopt(pycurl.URL, self.url) - - + + def _do_perform(self): if self._complete: return - + try: self.curl_obj.perform() - except pycurl.error, e: + except pycurl.error as e: # XXX - break some of these out a bit more clearly - # to other URLGrabErrors from + # to other URLGrabErrors from # http://curl.haxx.se/libcurl/c/libcurl-errors.html # this covers e.args[0] == 22 pretty well - which will be common - + code = self.http_code errcode = e.args[0] + errurl = _urlunquote_convert(self.url) + if self._error[0]: errcode = self._error[0] - - if errcode == 23 and code >= 200 and code < 299: - err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e)) - err.url = self.url - + + if errcode == 23 and 200 <= code <= 299: # this is probably wrong but ultimately this is what happens # we have a legit http code and a pycurl 'writer failed' code # which almost always means something aborted it from outside # since we cannot know what it is -I'm banking on it being - # a ctrl-c. XXXX - if there's a way of going back two raises to + # a ctrl-c. 
XXXX - if there's a way of going back two raises to # figure out what aborted the pycurl process FIXME - raise KeyboardInterrupt - + raise getattr(self, '_cb_error', KeyboardInterrupt) + elif errcode == 28: - err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e)) - err.url = self.url + err = URLGrabError(12, _('Timeout on %s: %s') % (errurl, e)) + err.url = errurl raise err - elif errcode == 35: - msg = _("problem making ssl connection") - err = URLGrabError(14, msg) - err.url = self.url - raise err - elif errcode == 37: - msg = _("Could not open/read %s") % (self.url) - err = URLGrabError(14, msg) - err.url = self.url - raise err - + elif errcode == 42: - err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e)) - err.url = self.url # this is probably wrong but ultimately this is what happens # we have a legit http code and a pycurl 'writer failed' code # which almost always means something aborted it from outside # since we cannot know what it is -I'm banking on it being - # a ctrl-c. XXXX - if there's a way of going back two raises to + # a ctrl-c. XXXX - if there's a way of going back two raises to # figure out what aborted the pycurl process FIXME raise KeyboardInterrupt - - elif errcode == 58: - msg = _("problem with the local client certificate") - err = URLGrabError(14, msg) - err.url = self.url - raise err - elif errcode == 60: - msg = _("client cert cannot be verified or client cert incorrect") + else: + pyerr2str = { 5 : _("Couldn't resolve proxy"), + 6 : _("Couldn't resolve host"), + 7 : _("Couldn't connect"), + 8 : _("Bad reply to FTP server"), + 9 : _("Access denied"), + 11 : _("Bad reply to FTP pass"), + 13 : _("Bad reply to FTP pasv"), + 14 : _("Bad reply to FTP 227"), + 15 : _("Couldn't get FTP host"), + 17 : _("Couldn't set FTP type"), + 18 : _("Partial file"), + 19 : _("FTP RETR command failed"), + 22 : _("HTTP returned error"), + 23 : _("Write error"), + 25 : _("Upload failed"), + 26 : _("Read error"), + 27 : _("Out of Memory"), + 28 : _("Operation timed out"), + 30 : _("FTP PORT command failed"), + 31 : _("FTP REST command failed"), + 33 : _("Range failed"), + 34 : _("HTTP POST failed"), + 35 : _("SSL CONNECT failed"), + 36 : _("Couldn't resume download"), + 37 : _("Couldn't read file"), + 42 : _("Aborted by callback"), + 47 : _("Too many redirects"), + 51 : _("Peer certificate failed verification"), + 52 : _("Got nothing: SSL certificate expired?"), + 53 : _("SSL engine not found"), + 54 : _("SSL engine set failed"), + 55 : _("Network error send()"), + 56 : _("Network error recv()"), + 58 : _("Local certificate failed"), + 59 : _("SSL set cipher failed"), + 60 : _("Local CA certificate failed"), + 61 : _("HTTP bad transfer encoding"), + 63 : _("Maximum file size exceeded"), + 64 : _("FTP SSL failed"), + 67 : _("Authentication failure"), + 70 : _("Out of disk space on server"), + 73 : _("Remove file exists"), + 77 : _("Problem with the SSL CA cert (path? 
access rights?)"), + } + errstr = str(e.args[1]) or pyerr2str.get(errcode, '<Unknown>') + if code and not 200 <= code <= 299: + scheme = _bytes_repr(self.scheme) + msg = '%s Error %d - %s' % (scheme.upper(), code, + scheme in ('http', 'https') + and responses.get(code) or errstr) + else: + msg = 'curl#%s - "%s"' % (errcode, errstr) + code = errcode + err = URLGrabError(14, msg) - err.url = self.url + err.url = errurl + err.code = code raise err - - elif errcode == 63: - if self._error[1]: - msg = self._error[1] - else: - msg = _("Max download size exceeded on %s") % (self.url) + + else: + if self._error[1]: + msg = self._error[1] err = URLGrabError(14, msg) - err.url = self.url + err.url = _urlunquote_convert(self.url) raise err - - elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it - msg = 'HTTP Error %s : %s ' % (self.http_code, self.url) - else: - msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1])) - code = errcode - err = URLGrabError(14, msg) - err.code = code - err.exception = e - raise err def _do_open(self): - self.curl_obj = _curl_cache + if hasattr(self.opts, 'curl_obj') and self.opts.curl_obj is not None: + self.curl_obj = self.opts.curl_obj + else: + self.curl_obj = _curl_cache self.curl_obj.reset() # reset all old settings away, just in case # setup any ranges self._set_opts() @@ -1310,11 +1697,11 @@ class PyCurlFileObject(): def _add_headers(self): pass - + def _build_range(self): reget_length = 0 rt = None - if self.opts.reget and type(self.filename) in types.StringTypes: + if self.opts.reget and isinstance(self.filename, string_types): # we have reget turned on and we're dumping to a file try: s = os.stat(self.filename) @@ -1325,15 +1712,19 @@ class PyCurlFileObject(): reget_length = s[stat.ST_SIZE] # Set initial length when regetting - self._amount_read = reget_length + self._amount_read = reget_length self._reget_length = reget_length # set where we started from, too rt = reget_length, '' self.append = 1 - + if self.opts.range: rt = self.opts.range - if rt[0]: rt = (rt[0] + reget_length, rt[1]) + + if rt[0] is None: + rt = (0, rt[1]) + rt = (rt[0] + reget_length, rt[1]) + if rt: header = range_tuple_to_header(rt) @@ -1345,10 +1736,10 @@ class PyCurlFileObject(): def _make_request(self, req, opener): #XXXX # This doesn't do anything really, but we could use this - # instead of do_open() to catch a lot of crap errors as + # instead of do_open() to catch a lot of crap errors as # mstenner did before here return (self.fo, self.hdr) - + try: if self.opts.timeout: old_to = socket.getdefaulttimeout() @@ -1360,22 +1751,22 @@ class PyCurlFileObject(): else: fo = opener.open(req) hdr = fo.info() - except ValueError, e: + except ValueError as e: err = URLGrabError(1, _('Bad URL: %s : %s') % (self.url, e, )) err.url = self.url raise err - except RangeError, e: + except RangeError as e: err = URLGrabError(9, _('%s on %s') % (e, self.url)) err.url = self.url raise err - except urllib2.HTTPError, e: + except HTTPError as e: new_e = URLGrabError(14, _('%s on %s') % (e, self.url)) new_e.code = e.code new_e.exception = e new_e.url = self.url raise new_e - except IOError, e: + except IOError as e: if hasattr(e, 'reason') and isinstance(e.reason, socket.timeout): err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e)) err.url = self.url @@ -1385,41 +1776,41 @@ class PyCurlFileObject(): err.url = self.url raise err - except OSError, e: + except OSError as e: err = URLGrabError(5, _('%s on %s') % (e, self.url)) err.url = self.url raise err - except 
HTTPException, e: - err = URLGrabError(7, _('HTTP Exception (%s) on %s: %s') % \ - (e.__class__.__name__, self.url, e)) + except HTTPException as e: + err = URLGrabError(7, _('HTTP Exception (%s) on %s: %s') + % (e.__class__.__name__, self.url, e)) err.url = self.url raise err else: return (fo, hdr) - + def _do_grab(self): """dump the file to a filename or StringIO buffer""" if self._complete: return _was_filename = False - if type(self.filename) in types.StringTypes and self.filename: + if isinstance(self.filename, string_types) and self.filename: _was_filename = True self._prog_reportname = str(self.filename) self._prog_basename = os.path.basename(self.filename) - + if self.append: mode = 'ab' else: mode = 'wb' - if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \ - (self.filename, mode)) + if DEBUG: DEBUG.info('opening local file "%s" with mode %s' + % (self.filename, mode)) try: self.fo = open(self.filename, mode) - except IOError, e: - err = URLGrabError(16, _(\ - 'error opening local file from %s, IOError: %s') % (self.url, e)) + except IOError as e: + err = URLGrabError(16, _('error opening local file from %s, IOError: %s') + % (self.url, e)) err.url = self.url raise err @@ -1427,34 +1818,58 @@ class PyCurlFileObject(): self._prog_reportname = 'MEMORY' self._prog_basename = 'MEMORY' - - self.fo = StringIO() + + self.fo = BytesIO() # if this is to be a tempfile instead.... # it just makes crap in the tempdir #fh, self._temp_name = mkstemp() #self.fo = open(self._temp_name, 'wb') - - self._do_perform() - - + try: + self._do_perform() + except URLGrabError as e: + self.fo.flush() + self.fo.close() + raise e if _was_filename: # close it up self.fo.flush() self.fo.close() + + # Set the URL where we got it from: + if xattr is not None: + # See: http://www.freedesktop.org/wiki/CommonExtendedAttributes + try: + xattr.set(self.filename, 'user.xdg.origin.url', self.url) + except: + pass # URL too long. = IOError ... ignore everything. + # set the time mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME) if mod_time != -1: - os.utime(self.filename, (mod_time, mod_time)) + try: + os.utime(self.filename, (mod_time, mod_time)) + except OSError as e: + err = URLGrabError(16, _('error setting timestamp on file %s from %s, OSError: %s') + % (self.filename, self.url, e)) + err.url = self.url + raise err # re open it - self.fo = open(self.filename, 'r') + try: + self.fo = open(self.filename, 'r') + except IOError as e: + err = URLGrabError(16, _('error opening file from %s, IOError: %s') + % (self.url, e)) + err.url = self.url + raise err + else: #self.fo = open(self._temp_name, 'r') self.fo.seek(0) self._complete = True - + def _fill_buffer(self, amt=None): """fill the buffer to contain at least 'amt' bytes by reading from the underlying file object. If amt is None, then it will @@ -1471,9 +1886,9 @@ class PyCurlFileObject(): # if we've made it here, then we don't have enough in the buffer # and we need to read more. 
- + if not self._complete: self._do_grab() #XXX cheater - change on ranges - + buf = [self._rbuf] bufsize = len(self._rbuf) while amt is None or amt: @@ -1483,23 +1898,23 @@ class PyCurlFileObject(): (time.time() - self._ttime) if diff > 0: time.sleep(diff) self._ttime = time.time() - + # now read some data, up to self._rbufsize if amt is None: readamount = self._rbufsize else: readamount = min(amt, self._rbufsize) try: new = self.fo.read(readamount) - except socket.error, e: + except socket.error as e: err = URLGrabError(4, _('Socket Error on %s: %s') % (self.url, e)) err.url = self.url raise err - except socket.timeout, e: + except socket.timeout as e: raise URLGrabError(12, _('Timeout on %s: %s') % (self.url, e)) err.url = self.url raise err - except IOError, e: + except IOError as e: raise URLGrabError(4, _('IOError on %s: %s') %(self.url, e)) err.url = self.url raise err @@ -1515,7 +1930,7 @@ class PyCurlFileObject(): #if self.opts.progress_obj: # self.opts.progress_obj.update(self._amount_read) - self._rbuf = string.join(buf, '') + self._rbuf = b''.join(buf) return def _progress_update(self, download_total, downloaded, upload_total, uploaded): @@ -1526,35 +1941,31 @@ class PyCurlFileObject(): if self._prog_running: downloaded += self._reget_length self.opts.progress_obj.update(downloaded) - except KeyboardInterrupt: + except (KeyboardInterrupt, IOError): return -1 - + def _over_max_size(self, cur, max_size=None): if not max_size: - max_size = self.size - if self.opts.size: # if we set an opts size use that, no matter what - max_size = self.opts.size + if not self.opts.size: + max_size = self.size + else: + max_size = self.opts.size + if not max_size: return False # if we have None for all of the Max then this is dumb - if cur > max_size + max_size*.10: + + if cur > int(float(max_size) * 1.10): msg = _("Downloaded more than max size for %s: %s > %s") \ % (self.url, cur, max_size) self._error = (pycurl.E_FILESIZE_EXCEEDED, msg) return True return False - - def _to_utf8(self, obj, errors='replace'): - '''convert 'unicode' to an encoded utf-8 byte string ''' - # stolen from yum.i18n - if isinstance(obj, unicode): - obj = obj.encode('utf-8', errors) - return obj - + def read(self, amt=None): self._fill_buffer(amt) if amt is None: - s, self._rbuf = self._rbuf, '' + s, self._rbuf = self._rbuf, b'' else: s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:] return s @@ -1562,13 +1973,13 @@ class PyCurlFileObject(): def readline(self, limit=-1): if not self._complete: self._do_grab() return self.fo.readline() - - i = string.find(self._rbuf, '\n') + + i = self._rbuf.find('\n') while i < 0 and not (0 < limit <= len(self._rbuf)): L = len(self._rbuf) self._fill_buffer(L + self._rbufsize) if not len(self._rbuf) > L: break - i = string.find(self._rbuf, '\n', L) + i = self._rbuf.find('\n', L) if i < 0: i = len(self._rbuf) else: i = i+1 @@ -1581,10 +1992,25 @@ class PyCurlFileObject(): if self._prog_running: self.opts.progress_obj.end(self._amount_read) self.fo.close() - + def geturl(self): + """ Provide the geturl() method, used to be got from + urllib.addinfourl, via. 
urllib.URLopener.* """ + return self.url + +if hasattr(pycurl, 'GLOBAL_ACK_EINTR'): + # fail immediately on ctrl-c + pycurl.global_init(pycurl.GLOBAL_DEFAULT | pycurl.GLOBAL_ACK_EINTR) _curl_cache = pycurl.Curl() # make one and reuse it over and over and over +def reset_curl_obj(): + """To make sure curl has reread the network/dns info we force a reload""" + global _curl_cache + _curl_cache.close() + _curl_cache = pycurl.Curl() + +_libproxy_cache = None + ##################################################################### # DEPRECATED FUNCTIONS @@ -1603,90 +2029,601 @@ def set_progress_obj(new_progress_obj): def set_user_agent(new_user_agent): """Deprecated. Use: default_grabber.user_agent = new_user_agent""" default_grabber.user_agent = new_user_agent - + def retrygrab(url, filename=None, copy_local=0, close_connection=0, progress_obj=None, throttle=None, bandwidth=None, numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None): """Deprecated. Use: urlgrab() with the retry arg instead""" - kwargs = {'copy_local' : copy_local, + kwargs = {'copy_local' : copy_local, 'close_connection' : close_connection, - 'progress_obj' : progress_obj, - 'throttle' : throttle, + 'progress_obj' : progress_obj, + 'throttle' : throttle, 'bandwidth' : bandwidth, 'retry' : numtries, 'retrycodes' : retrycodes, - 'checkfunc' : checkfunc + 'checkfunc' : checkfunc } return urlgrab(url, filename, **kwargs) - + +##################################################################### +# Serializer + parser: A replacement of the rather bulky Json code. +# +# - handles basic python literals, lists and tuples. +# - serialized strings never contain ' ' or '\n' +# +##################################################################### + +def _quoter(c): + if c in '%[(,)] \n': + return '%%%02x' % ord(c) + return c + +def _dumps(v): + if v is None: return 'None' + if v is True: return 'True' + if v is False: return 'False' + if isinstance(v, numbers.Number): + return str(v) + if isinstance(v, (str, text_type, bytes)): + # standarize to str on both py2 to py3 + if sys.version_info < (3,): + if isinstance(v, text_type): + v = v.encode('utf8') + else: + if isinstance(v, bytes): + v = v.decode('utf8') + return "'%s'" % ''.join(map(_quoter, v)) + if isinstance(v, tuple): + return "(%s)" % ','.join(map(_dumps, v)) + if isinstance(v, list): + return "[%s]" % ','.join(map(_dumps, v)) + raise TypeError("Can't serialize %s" % v) + +def _loads(s): + def decode(v): + if v == 'None': return None + if v == 'True': return True + if v == 'False': return False + try: return int(v) + except ValueError: pass + try: return float(v) + except ValueError: pass + if len(v) >= 2 and v[0] == v[-1] == "'": + ret = []; i = 1 + while True: + j = v.find('%', i) + ret.append(v[i:j]) # skips the final "'" + if j == -1: break + ret.append(chr(int(v[j + 1:j + 3], 16))) + i = j + 3 + v = ''.join(ret) + return v + stk = None + l = [] + i = j = 0 + while True: + if j == len(s) or s[j] in ',)]': + if j > i: + l.append(decode(s[i:j])) + if j == len(s): break + if s[j] in ')]': + if s[j] == ')': + l = tuple(l) + stk[0].append(l) + l, stk = stk + i = j = j + 1 + elif s[j] in '[(': + stk = l, stk + l = [] + i = j = j + 1 + else: + j += 1 # safe because '[(,)]' are quoted + if stk: raise ValueError + if len(l) == 1: l = l[0] + return l + + +##################################################################### +# External downloader process +##################################################################### + +def _readlines(fd): + buf = os.read(fd, 4096) + if not buf: 
return None + # whole lines only, no buffering + while not buf.endswith(b'\n'): + buf += os.read(fd, 4096) + return buf[:-1].split(b'\n') + +import subprocess + +class _ExternalDownloader: + def __init__(self): + # raise if urlgrabber-ext-down is not installed so the user gets a + # an obvious error message instead of "[Errno 5] [Errno 2] No such file + # or directory" + if not os.path.exists('/usr/libexec/urlgrabber-ext-down') and os.getenv('URLGRABBER_EXT_DOWN') is None: + raise OSError('"/usr/libexec/urlgrabber-ext-down" is not installed') + urlgrabber_path = (os.getenv('URLGRABBER_EXT_DOWN', None) + or '/usr/libexec/urlgrabber-ext-down') + self.popen = subprocess.Popen( + urlgrabber_path, + stdin = subprocess.PIPE, + stdout = subprocess.PIPE, + ) + self.stdin = self.popen.stdin.fileno() + self.stdout = self.popen.stdout.fileno() + self.running = {} + self.cnt = 0 + + # list of options we pass to downloader + _options = ( + 'url', 'filename', + 'timeout', 'minrate', 'close_connection', 'keepalive', + 'throttle', 'bandwidth', 'range', 'reget', + 'user_agent', 'http_headers', 'ftp_headers', + 'proxy', 'prefix', 'username', 'password', + 'ssl_ca_cert', + 'ssl_cert', 'ssl_cert_type', + 'ssl_key', 'ssl_key_type', + 'ssl_key_pass', + 'ssl_verify_peer', 'ssl_verify_host', + 'size', 'max_header_size', 'ip_resolve', + 'ftp_disable_epsv', + 'no_cache', + ) + + def start(self, opts): + arg = [] + for k in self._options: + v = getattr(opts, k) + if v is None: continue + arg.append('%s=%s' % (k, _dumps(v))) + if opts.progress_obj and opts.multi_progress_obj: + arg.append('progress_obj=True') + arg = ' '.join(arg) + if DEBUG: DEBUG.info('attempt %i/%s: %s', opts.tries, opts.retry, opts.url) + + self.cnt += 1 + self.running[self.cnt] = opts + os.write(self.stdin, (arg +'\n').encode('utf8')) + + def perform(self): + ret = [] + lines = _readlines(self.stdout) + if not lines: + if DEBUG: DEBUG.info('downloader died') + raise KeyboardInterrupt + for line in lines: + # parse downloader output + line = line.split(b' ', 6) + _id, size = map(int, line[:2]) + if len(line) == 2: + self.running[_id]._progress.update(size) + continue + # job done + opts = self.running.pop(_id) + if line[4] == b'OK': + ug_err = None + if DEBUG: DEBUG.info('success') + else: + ug_err = URLGrabError(int(line[4]), line[6]) + if line[5] != b'0': + ug_err.code = int(line[5]) + if DEBUG: DEBUG.info('failure: %s', ug_err) + _TH.update(opts.url, int(line[2]), float(line[3]), ug_err, opts.async_[0]) + ret.append((opts, size, ug_err)) + return ret + + def abort(self): + self.popen.stdin.close() + self.popen.stdout.close() + self.popen.wait() + +class _ExternalDownloaderPool: + def __init__(self): + self.epoll = select.epoll() + self.running = {} + self.cache = {} + + def start(self, opts): + host = urlparse.urlsplit(opts.url).netloc + dl = self.cache.pop(host, None) + if not dl: + dl = _ExternalDownloader() + fl = fcntl.fcntl(dl.stdin, fcntl.F_GETFD) + fcntl.fcntl(dl.stdin, fcntl.F_SETFD, fl | fcntl.FD_CLOEXEC) + self.epoll.register(dl.stdout, select.EPOLLIN) + self.running[dl.stdout] = dl + dl.start(opts) + + def perform(self): + ret = [] + for fd, event in self.epoll.poll(): + if event & select.EPOLLHUP: + if DEBUG: DEBUG.info('downloader died') + raise KeyboardInterrupt + assert event & select.EPOLLIN + done = self.running[fd].perform() + if not done: continue + assert len(done) == 1 + ret.extend(done) + + # dl finished, move it to the cache + host = urlparse.urlsplit(done[0][0].url).netloc + if host in self.cache: 
self.cache[host].abort() + self.epoll.unregister(fd) + self.cache[host] = self.running.pop(fd) + return ret + + def abort(self): + for dl in self.running.values(): + self.epoll.unregister(dl.stdout) + dl.abort() + for dl in self.cache.values(): + dl.abort() + + +##################################################################### +# High level async API +##################################################################### + +_async_queue = [] + +def parallel_wait(meter=None): + '''Process queued requests in parallel. + ''' + + # calculate total sizes + meters = {} + for opts in _async_queue: + if opts.progress_obj and opts.multi_progress_obj: + count, total = meters.get(opts.multi_progress_obj) or (0, 0) + meters[opts.multi_progress_obj] = count + 1, total + opts.size + + # start multi-file meters + for meter in meters: + count, total = meters[meter] + meter.start(count, total) + + dl = _ExternalDownloaderPool() + host_con = {} # current host connection counts + single = set() # hosts in single connection mode + retry_queue = [] + + def start(opts, tries): + opts.tries = tries + try: + dl.start(opts) + except OSError as e: + # can't spawn downloader, give up immediately + opts.exception = URLGrabError(5, exception2msg(e)) + _run_callback(opts.failfunc, opts) + return + + key, limit = opts.async_ + host_con[key] = host_con.get(key, 0) + 1 + if opts.progress_obj: + if opts.multi_progress_obj: + opts._progress = opts.multi_progress_obj.newMeter() + opts._progress.start(text=opts.text) + else: + opts._progress = time.time() # no updates + + def perform(): + for opts, size, ug_err in dl.perform(): + key, limit = opts.async_ + host_con[key] -= 1 + + if ug_err is None: + if opts.checkfunc: + try: + _run_callback(opts.checkfunc, opts) + except URLGrabError as e: + ug_err = e + + if opts.progress_obj: + if opts.multi_progress_obj: + if ug_err: + opts._progress.failure(None) + else: + opts.multi_progress_obj.re.total += size - opts.size # correct totals + opts._progress.end(size) + opts.multi_progress_obj.removeMeter(opts._progress) + else: + opts.progress_obj.start(text=opts.text, now=opts._progress) + opts.progress_obj.update(size) + opts.progress_obj.end(size) + del opts._progress + + if ug_err is None: + continue + if limit != 1 and key not in single and ug_err.errno in (12, 14): + # One possible cause is connection-limited server. + # Turn on the max_connections=1 override. BZ 853432 + if DEBUG: DEBUG.info('max_connections(%s) %s => 1', key, limit) + single.add(key) + # When using multi-downloader the parent's _curl_cache + # object is idle. Kill it, as it might use keepalive=1. 
+ reset_curl_obj() + + retry = opts.retry or 0 + if opts.failure_callback: + opts.exception = ug_err + try: + _run_callback(opts.failure_callback, opts) + except URLGrabError as e: + ug_err = e + retry = 0 # no retries + if opts.tries < retry and ug_err.errno in opts.retrycodes: + if ug_err.errno < 0 and opts.retry_no_cache: + opts.no_cache = True + start(opts, opts.tries + 1) # simple retry + continue + + if opts.mirror_group: + mg, errors, failed, removed = opts.mirror_group + errors.append((opts.url, exception2msg(ug_err))) + failed[key] = failed.get(key, 0) + 1 + opts.mirror = key + opts.exception = ug_err + action = mg.default_action or {} + if mg.failure_callback: + opts.tries = len(errors) + action = dict(action) # update only the copy + action.update(_run_callback(mg.failure_callback, opts)) + if not action.get('fail', 0): + # mask this mirror and retry + if action.get('remove', 1): + removed.add(key) + retry_queue.append(opts) + continue + # fail=1 from callback + ug_err.errors = errors + + # urlgrab failed + opts.exception = ug_err + _run_callback(opts.failfunc, opts) + + try: + retry_idx = idx = 0 + while True: + if retry_idx < len(retry_queue): + # retries first + opts = retry_queue[retry_idx] + retry_idx += 1 + elif idx < len(_async_queue): + # handle next request + opts = _async_queue[idx] + idx += 1 + else: + # both queues are empty + if not dl.running: break + perform() + continue + + # check global limit + while len(dl.running) >= default_grabber.opts.max_connections: + perform() + if DEBUG: + DEBUG.info('max_connections: %d/%d', len(dl.running), default_grabber.opts.max_connections) + + if opts.mirror_group: + mg, errors, failed, removed = opts.mirror_group + + # find the best mirror + best = None + best_speed = None + for mirror in mg.mirrors: + key = mirror['mirror'] + if key in removed: continue + + # estimate mirror speed + speed, fail = _TH.estimate(key) + speed /= 1 + host_con.get(key, 0) + + # order by: least failures, private flag, best speed + # ignore 'private' flag if there were failures + private = not fail and mirror.get('kwargs', {}).get('private', False) + speed = -failed.get(key, 0), private, speed + if best is None or speed > best_speed: + best = mirror + best_speed = speed + + if best is None: + opts.exception = URLGrabError(256, _('No more mirrors to try.')) + opts.exception.errors = errors + _run_callback(opts.failfunc, opts) + continue + + # update the grabber object, apply mirror kwargs + grabber = best.get('grabber') or mg.grabber + opts.delegate = grabber.opts.derive(**best.get('kwargs', {})) + + # update the current mirror and limit + key = best['mirror'] + limit = best.get('kwargs', {}).get('max_connections') + opts.async_ = key, limit + + # update URL and proxy + url = mg._join_url(key, opts.relative_url) + url, parts = opts.urlparser.parse(url, opts) + opts.find_proxy(url, parts[0]) + opts.url = url + + # check host limit, then start + key, limit = opts.async_ + if key in single: + limit = 1 + while host_con.get(key, 0) >= (limit or 2): + perform() + if DEBUG: + DEBUG.info('max_connections(%s): %d/%s', key, host_con.get(key, 0), limit) + + start(opts, 1) + except IOError as e: + if e.errno != 4: raise + raise KeyboardInterrupt + + finally: + dl.abort() + for meter in meters: + meter.end() + del _async_queue[:] + _TH.save() + + +##################################################################### +# Host bandwidth estimation +##################################################################### + +class _TH: + hosts = {} + dirty = None + + 
@staticmethod + def load(): + filename = default_grabber.opts.timedhosts + if filename and _TH.dirty is None: + try: + now = int(time.time()) + for line in open(filename): + try: + host, speed, fail, ts = line.rsplit(' ', 3) + _TH.hosts[host] = int(speed), int(fail), min(int(ts), now) + except ValueError: + if DEBUG: DEBUG.info('Error parsing timedhosts: line "%s"', line) + except IOError: pass + _TH.dirty = False + + @staticmethod + def save(): + filename = default_grabber.opts.timedhosts + if filename and _TH.dirty is True: + tmp = '%s.%d' % (filename, os.getpid()) + try: + f = open(tmp, 'w') + for host in _TH.hosts: + f.write(host + ' %d %d %d\n' % _TH.hosts[host]) + f.close() + os.rename(tmp, filename) + except IOError: pass + _TH.dirty = False + + @staticmethod + def update(url, dl_size, dl_time, ug_err, baseurl=None): + # Use hostname from URL. If it's a file:// URL, use baseurl. + # If no baseurl, do not update timedhosts. + host = urlparse.urlsplit(url).netloc.split(b'@')[-1] or baseurl + if not host: return + + _TH.load() + speed, fail, ts = _TH.hosts.get(host) or (0, 0, 0) + now = time.time() + + if ug_err is None: + # defer first update if the file was small. BZ 851178. + if not ts and dl_size < 1e6: return + # k1: the older, the less useful + # k2: <500ms readings are less reliable + # speeds vary, use 10:1 smoothing + k1 = 2**((ts - now) / default_grabber.opts.half_life) + k2 = min(dl_time / .500, 1.0) / 10 + if k2 > 0: + speed = (k1 * speed + k2 * dl_size / dl_time) / (k1 + k2) + fail = 0 + elif getattr(ug_err, 'code', None) == 404: + if not ts: return # 1st update, avoid speed=0 + fail = 0 # alive, at least + else: + fail += 1 # seems dead + + _TH.hosts[host] = speed, fail, now + _TH.dirty = True + + @staticmethod + def estimate(baseurl): + _TH.load() + + # Use just the hostname, unless it's a file:// baseurl. 
+ host = urlparse.urlsplit(baseurl).netloc.split(b'@')[-1] or baseurl + + default_speed = default_grabber.opts.default_speed + try: speed, fail, ts = _TH.hosts[host] + except KeyError: return default_speed, 0 + + speed *= 2**-fail + k = 2**((ts - time.time()) / default_grabber.opts.half_life) + speed = k * speed + (1 - k) * default_speed + return speed, fail + ##################################################################### # TESTING def _main_test(): try: url, filename = sys.argv[1:3] except ValueError: - print 'usage:', sys.argv[0], \ - '<url> <filename> [copy_local=0|1] [close_connection=0|1]' - sys.exit() + print('usage:', sys.argv[0], + '<url> <filename> [copy_local=0|1] [close_connection=0|1]') + sys.exit(2) kwargs = {} for a in sys.argv[3:]: - k, v = string.split(a, '=', 1) + k, v = a.split('=', 1) kwargs[k] = int(v) set_throttle(1.0) set_bandwidth(32 * 1024) - print "throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle, - default_grabber.bandwidth) + print("throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle, + default_grabber.bandwidth)) - try: from progress import text_progress_meter - except ImportError, e: pass + try: from .progress import text_progress_meter + except ImportError as e: pass else: kwargs['progress_obj'] = text_progress_meter() - try: name = apply(urlgrab, (url, filename), kwargs) - except URLGrabError, e: print e - else: print 'LOCAL FILE:', name + try: name = urlgrab(url, filename, **kwargs) + except URLGrabError as e: print(e) + else: print('LOCAL FILE:', name) def _retry_test(): try: url, filename = sys.argv[1:3] except ValueError: - print 'usage:', sys.argv[0], \ - '<url> <filename> [copy_local=0|1] [close_connection=0|1]' - sys.exit() + print('usage:', sys.argv[0], + '<url> <filename> [copy_local=0|1] [close_connection=0|1]') + sys.exit(2) kwargs = {} for a in sys.argv[3:]: - k, v = string.split(a, '=', 1) + k, v = a.split('=', 1) kwargs[k] = int(v) - try: from progress import text_progress_meter - except ImportError, e: pass + try: from .progress import text_progress_meter + except ImportError as e: pass else: kwargs['progress_obj'] = text_progress_meter() def cfunc(filename, hello, there='foo'): - print hello, there + print(hello, there) import random rnum = random.random() if rnum < .5: - print 'forcing retry' + print('forcing retry') raise URLGrabError(-1, 'forcing retry') if rnum < .75: - print 'forcing failure' + print('forcing failure') raise URLGrabError(-2, 'forcing immediate failure') - print 'success' + print('success') return - + kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'}) - try: name = apply(retrygrab, (url, filename), kwargs) - except URLGrabError, e: print e - else: print 'LOCAL FILE:', name + try: name = retrygrab(url, filename, **kwargs) + except URLGrabError as e: print(e) + else: print('LOCAL FILE:', name) def _file_object_test(filename=None): - import cStringIO if filename is None: filename = __file__ - print 'using file "%s" for comparisons' % filename + print('using file "%s" for comparisons' % filename) fo = open(filename) s_input = fo.read() fo.close() @@ -1695,17 +2632,17 @@ def _file_object_test(filename=None): _test_file_object_readall, _test_file_object_readline, _test_file_object_readlines]: - fo_input = cStringIO.StringIO(s_input) - fo_output = cStringIO.StringIO() + fo_input = StringIO(s_input) + fo_output = StringIO() wrapper = PyCurlFileObject(fo_input, None, 0) - print 'testing %-30s ' % testfunc.__name__, + print('testing %-30s ' % testfunc.__name__, end=' ') 
testfunc(wrapper, fo_output) s_output = fo_output.getvalue() - if s_output == s_input: print 'passed' - else: print 'FAILED' - + if s_output == s_input: print('passed') + else: print('FAILED') + def _test_file_object_smallread(wrapper, fo_output): - while 1: + while True: s = wrapper.read(23) fo_output.write(s) if not s: return @@ -1715,14 +2652,14 @@ def _test_file_object_readall(wrapper, fo_output): fo_output.write(s) def _test_file_object_readline(wrapper, fo_output): - while 1: + while True: s = wrapper.readline() fo_output.write(s) if not s: return def _test_file_object_readlines(wrapper, fo_output): li = wrapper.readlines() - fo_output.write(string.join(li, '')) + fo_output.write(''.join(li)) if __name__ == '__main__': _main_test() diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py index dad410b..d95863e 100644..100755 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -9,9 +9,9 @@ # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber @@ -76,6 +76,10 @@ CUSTOMIZATION 'grabber' is omitted, the default grabber will be used. If kwargs are omitted, then (duh) they will not be used. + kwarg 'max_connections' limits the number of concurrent + connections to this mirror. When omitted or set to zero, + the default limit (2) will be used. + 3) Pass keyword arguments when instantiating the mirror group. See, for example, the failure_callback argument. @@ -87,12 +91,29 @@ CUSTOMIZATION """ +import sys import random -import thread # needed for locking to make this threadsafe -from grabber import URLGrabError, CallbackObject, DEBUG +if sys.version_info >= (3,): + # We use a version check because python2 also has _thread + import _thread as thread +else: + import thread + +try: + import urllib.parse as urlparse +except ImportError: + import urlparse + +from six import string_types -def _(st): +from .grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8 +from .grabber import _run_callback, _do_raise +from .grabber import exception2msg +from .grabber import _TH +from .grabber import _bytes_repr + +def _(st): return st class GrabRequest: @@ -126,13 +147,15 @@ class MirrorGroup: files) * if the local list is ever exhausted, a URLGrabError will be - raised (errno=256, no more mirrors) + raised (errno=256, No more mirrors). The 'errors' attribute + holds a list of (full_url, errmsg) tuples. This contains + all URLs tried and the corresponding error messages. OPTIONS In addition to the required arguments "grabber" and "mirrors", MirrorGroup also takes the following optional arguments: - + default_action A dict that describes the actions to be taken upon failure @@ -153,7 +176,8 @@ class MirrorGroup: The 'fail' option will cause immediate failure by re-raising the exception and no further attempts to get the current - download. + download. As in the "No more mirrors" case, the 'errors' + attribute is set in the exception object. This dict can be set at instantiation time, mg = MirrorGroup(grabber, mirrors, default_action={'fail':1}) @@ -162,7 +186,7 @@ class MirrorGroup: or by returning an action dict from the failure_callback return {'fail':0} in increasing precedence. 
- + If all three of these were done, the net result would be: {'increment': 0, # set in method 'increment_master': 1, # class default @@ -180,10 +204,11 @@ class MirrorGroup: etc). Otherwise, it is assumed to be the callable object itself. The callback will be passed a grabber.CallbackObject instance along with args and kwargs (if present). The following - attributes are defined withing the instance: + attributes are defined within the instance: obj.exception = < exception that was raised > obj.mirror = < the mirror that was tried > + obj.tries = < the number of mirror tries so far > obj.relative_url = < url relative to the mirror > obj.url = < full url that failed > # .url is just the combination of .mirror @@ -251,22 +276,34 @@ class MirrorGroup: self.default_action = None self._process_kwargs(kwargs) + # use the same algorithm as parallel downloader to initially sort + # the mirror list (sort by speed, but prefer live private mirrors) + def estimate(m): + speed, fail = _TH.estimate(m['mirror']) + private = not fail and m.get('kwargs', {}).get('private', False) + return private, speed + + # update the initial order. since sorting is stable, the relative + # order of unknown (not used yet) hosts is retained. + self.mirrors.sort(key=estimate, reverse=True) + # if these values are found in **kwargs passed to one of the urlXXX # methods, they will be stripped before getting passed on to the # grabber options = ['default_action', 'failure_callback'] - + def _process_kwargs(self, kwargs): self.failure_callback = kwargs.get('failure_callback') self.default_action = kwargs.get('default_action') - + def _parse_mirrors(self, mirrors): parsed_mirrors = [] for m in mirrors: - if type(m) == type(''): m = {'mirror': m} + if isinstance(m, string_types): + m = {'mirror': _to_utf8(m)} parsed_mirrors.append(m) return parsed_mirrors - + def _load_gr(self, gr): # OVERRIDE IDEAS: # shuffle gr list @@ -280,7 +317,9 @@ class MirrorGroup: # return a random mirror so that multiple mirrors get used # even without failures. if not gr.mirrors: - raise URLGrabError(256, _('No more mirrors to try.')) + e = URLGrabError(256, _('No more mirrors to try.')) + e.errors = gr.errors + raise e return gr.mirrors[gr._next] def _failure(self, gr, cb_obj): @@ -290,7 +329,7 @@ class MirrorGroup: # the callback) cb = gr.kw.get('failure_callback') or self.failure_callback if cb: - if type(cb) == type( () ): + if isinstance(cb, tuple): cb, args, kwargs = cb else: args, kwargs = (), {} @@ -307,7 +346,9 @@ class MirrorGroup: a.update(action) action = a self.increment_mirror(gr, action) - if action and action.get('fail', 0): raise + if action and action.get('fail', 0): + sys.exc_info()[1].errors = gr.errors + raise def increment_mirror(self, gr, action={}): """Tell the mirror object increment the mirror index @@ -323,7 +364,7 @@ class MirrorGroup: urlopen, there's no good way for the mirror group to know that an error occurs mid-download (it's already returned and given you the file object). 
- + remove --- can have several values 0 do not remove the mirror from the list 1 remove the mirror for this download only @@ -345,7 +386,7 @@ class MirrorGroup: self._next += 1 if self._next >= len(self.mirrors): self._next = 0 self._lock.release() - + if action.get('remove', 1): del gr.mirrors[gr._next] elif action.get('increment', 1): @@ -353,9 +394,9 @@ class MirrorGroup: if gr._next >= len(gr.mirrors): gr._next = 0 if DEBUG: - grm = [m['mirror'] for m in gr.mirrors] + grm = [m['mirror'].decode() for m in gr.mirrors] DEBUG.info('GR mirrors: [%s] %i', ' '.join(grm), gr._next) - selfm = [m['mirror'] for m in self.mirrors] + selfm = [m['mirror'].decode() for m in self.mirrors] DEBUG.info('MAIN mirrors: [%s] %i', ' '.join(selfm), self._next) ##################################################################### @@ -366,47 +407,68 @@ class MirrorGroup: # by overriding the configuration methods :) def _join_url(self, base_url, rel_url): - if base_url.endswith('/') or rel_url.startswith('/'): - return base_url + rel_url + (scheme, netloc, path, query, fragid) = urlparse.urlsplit(base_url) + + if isinstance(base_url, bytes): + if not isinstance(rel_url, bytes): + rel_url = rel_url.encode('utf8') + sep = b'' if path.endswith(b'/') or rel_url.startswith(b'/') else b'/' else: - return base_url + '/' + rel_url - + sep = '' if path.endswith('/') or rel_url.startswith('/') else '/' + + return urlparse.urlunsplit((scheme, netloc, path + sep + rel_url, query, fragid)) + def _mirror_try(self, func, url, kw): gr = GrabRequest() gr.func = func gr.url = url gr.kw = dict(kw) self._load_gr(gr) + gr.errors = [] for k in self.options: try: del kw[k] except KeyError: pass - while 1: + tries = 0 + while True: + tries += 1 mirrorchoice = self._get_mirror(gr) fullurl = self._join_url(mirrorchoice['mirror'], gr.url) - kwargs = dict(mirrorchoice.get('kwargs', {})) - kwargs.update(kw) grabber = mirrorchoice.get('grabber') or self.grabber + # apply mirrorchoice kwargs on top of grabber.opts + opts = grabber.opts.derive(**mirrorchoice.get('kwargs', {})) func_ref = getattr(grabber, func) - if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl) + if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', _bytes_repr(url), _bytes_repr(fullurl)) try: - return func_ref( *(fullurl,), **kwargs ) - except URLGrabError, e: + return func_ref( *(fullurl,), opts=opts, **kw ) + except URLGrabError as e: if DEBUG: DEBUG.info('MIRROR: failed') + gr.errors.append((fullurl, exception2msg(e))) obj = CallbackObject() obj.exception = e obj.mirror = mirrorchoice['mirror'] obj.relative_url = gr.url obj.url = fullurl + obj.tries = tries self._failure(gr, obj) def urlgrab(self, url, filename=None, **kwargs): kw = dict(kwargs) kw['filename'] = filename + if kw.get('async_') or kw.get('async'): + # enable mirror failovers in async path + kw['mirror_group'] = self, [], {}, set() + kw['relative_url'] = url + else: + kw.pop('failfunc', None) func = 'urlgrab' - return self._mirror_try(func, url, kw) - + try: + return self._mirror_try(func, url, kw) + except URLGrabError as e: + obj = CallbackObject(url=url, filename=filename, exception=e, **kwargs) + return _run_callback(kwargs.get('failfunc', _do_raise), obj) + def urlopen(self, url, **kwargs): kw = dict(kwargs) func = 'urlopen' @@ -417,7 +479,7 @@ class MirrorGroup: kw['limit'] = limit func = 'urlread' return self._mirror_try(func, url, kw) - + class MGRandomStart(MirrorGroup): """A mirror group that starts at a random mirror in the list. 
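[Editor's aside, a usage sketch of the mirror.py changes above: the per-mirror 'max_connections' kwarg, the 'tries' attribute on the failure-callback object, and the 'errors' list attached to the final "No more mirrors" error. The mirror URLs, package name and callback are hypothetical, and error handling is reduced to printing.]

    from urlgrabber.grabber import URLGrabber, URLGrabError
    from urlgrabber.mirror import MirrorGroup

    # Hypothetical mirror list; 'max_connections' caps concurrent
    # connections to that mirror (the default limit is 2).
    mirrors = [
        {'mirror': 'http://mirror-a.example.com/repo/'},
        {'mirror': 'http://mirror-b.example.com/repo/',
         'kwargs': {'max_connections': 1}},
    ]

    def on_failure(cb_obj):
        # cb_obj.tries is the number of mirror tries so far; cb_obj.url,
        # cb_obj.mirror and cb_obj.exception describe the failed attempt.
        print('try %d failed for %s: %s' % (cb_obj.tries, cb_obj.url,
                                            cb_obj.exception))
        return {'remove': 1}    # mask this mirror and move on

    mg = MirrorGroup(URLGrabber(), mirrors, failure_callback=on_failure)
    try:
        mg.urlgrab('packages/foo-1.0-1.noarch.rpm', 'foo-1.0-1.noarch.rpm')
    except URLGrabError as e:
        # errno 256, "No more mirrors to try."; e.errors now holds a list of
        # (full_url, error_message) tuples covering every attempt.
        for url, msg in getattr(e, 'errors', []):
            print('%s: %s' % (url, msg))

[End of aside; the progress.py part of the patch follows.]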
diff --git a/urlgrabber/progress.py b/urlgrabber/progress.py index dd07c6a..5b4c450 100644..100755 --- a/urlgrabber/progress.py +++ b/urlgrabber/progress.py @@ -9,23 +9,31 @@ # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko +from __future__ import print_function import sys import time import math -import thread import fcntl import struct import termios +if sys.version_info >= (3,): + # We use a version check because python2 also has _thread + import _thread as thread +else: + import thread + +from six import integer_types, string_types + # Code from http://mail.python.org/pipermail/python-list/2000-May/033365.html def terminal_width(fd=1): """ Get the real terminal width """ @@ -107,7 +115,7 @@ class BaseMeter: self.last_amount_read = 0 self.last_update_time = None self.re = RateEstimator() - + def start(self, filename=None, url=None, basename=None, size=None, now=None, text=None): self.filename = filename @@ -125,7 +133,7 @@ class BaseMeter: self.last_amount_read = 0 self.last_update_time = now self._do_start(now) - + def _do_start(self, now=None): pass @@ -133,8 +141,8 @@ class BaseMeter: # for a real gui, you probably want to override and put a call # to your mainloop iteration function here if now is None: now = time.time() - if (now >= self.last_update_time + self.update_period) or \ - not self.last_update_time: + if (not self.last_update_time or + (now >= self.last_update_time + self.update_period)): self.re.update(amount_read, now) self.last_amount_read = amount_read self.last_update_time = now @@ -152,7 +160,7 @@ class BaseMeter: def _do_end(self, amount_read, now=None): pass - + # This is kind of a hack, but progress is gotten from grabber which doesn't # know about the total size to download. So we do this so we can get the data # out of band here. This will be "fixed" one way or anther soon. @@ -167,7 +175,7 @@ def text_meter_total_size(size, downloaded=0): # # update: No size (minimal: 17 chars) # ----------------------------------- -# <text> <rate> | <current size> <elapsed time> +# <text> <rate> | <current size> <elapsed time> # 8-48 1 8 3 6 1 9 5 # # Order: 1. <text>+<current size> (17) @@ -202,7 +210,7 @@ def text_meter_total_size(size, downloaded=0): # # end # --- -# <text> | <current size> <elapsed time> +# <text> | <current size> <elapsed time> # 8-56 3 6 1 9 5 # # Order: 1. <text> ( 8) @@ -211,6 +219,21 @@ def text_meter_total_size(size, downloaded=0): # 4. + ( 5, total: 32) # +def _term_add_bar(tl, bar_max_length, pc): + blen = bar_max_length + bar = '='*int(blen * pc) + if (blen * pc) - int(blen * pc) >= 0.5: + bar += '-' + return tl.add(' [%-*.*s]' % (blen, blen, bar)) + +def _term_add_end(tl, osize, size): + if osize: # osize should be None or >0, but that's been broken. + if size > osize: # Is ??? better? Really need something to say < vs >. + return tl.add(' !!! '), True + elif size != osize: + return tl.add(' ... 
'), True + return tl.add(' ' * 5), False + class TextMeter(BaseMeter): def __init__(self, fo=sys.stderr): BaseMeter.__init__(self) @@ -218,7 +241,6 @@ class TextMeter(BaseMeter): def _do_update(self, amount_read, now=None): etime = self.re.elapsed_time() - fetime = format_time(etime) fread = format_number(amount_read) #self.size = None if self.text is not None: @@ -234,19 +256,23 @@ class TextMeter(BaseMeter): # Include text + ui_rate in minimal tl = TerminalLine(8, 8+1+8) + if tl._llen > 80: + use_hours = True # For big screens, make it more readable. + else: + use_hours = False ui_size = tl.add(' | %5sB' % fread) if self.size is None: - ui_time = tl.add(' %9s' % fetime) + ui_time = tl.add(' %s' % format_time(etime, use_hours)) ui_end = tl.add(' ' * 5) ui_rate = tl.add(' %5sB/s' % ave_dl) out = '%-*.*s%s%s%s%s\r' % (tl.rest(), tl.rest(), text, ui_rate, ui_size, ui_time, ui_end) else: rtime = self.re.remaining_time() - frtime = format_time(rtime) + frtime = format_time(rtime, use_hours) frac = self.re.fraction_read() - ui_time = tl.add(' %9s' % frtime) + ui_time = tl.add(' %s' % frtime) ui_end = tl.add(' ETA ') if sofar_size is None: @@ -259,13 +285,10 @@ class TextMeter(BaseMeter): ui_rate = tl.add(' %5sB/s' % ave_dl) # Make text grow a bit before we start growing the bar too blen = 4 + tl.rest_split(8 + 8 + 4) - bar = '='*int(blen * frac) - if (blen * frac) - int(blen * frac) >= 0.5: - bar += '-' - ui_bar = tl.add(' [%-*.*s]' % (blen, blen, bar)) - out = '%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text, - ui_sofar_pc, ui_pc, ui_bar, - ui_rate, ui_size, ui_time, ui_end) + ui_bar = _term_add_bar(tl, blen, frac) + out = '\r%-*.*s%s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text, + ui_sofar_pc, ui_pc, ui_bar, + ui_rate,ui_size,ui_time, ui_end) self.fo.write(out) self.fo.flush() @@ -274,7 +297,6 @@ class TextMeter(BaseMeter): global _text_meter_total_size global _text_meter_sofar_size - total_time = format_time(self.re.elapsed_time()) total_size = format_number(amount_read) if self.text is not None: text = self.text @@ -282,14 +304,13 @@ class TextMeter(BaseMeter): text = self.basename tl = TerminalLine(8) - ui_size = tl.add(' | %5sB' % total_size) - ui_time = tl.add(' %9s' % total_time) - not_done = self.size is not None and amount_read != self.size - if not_done: - ui_end = tl.add(' ... ') + if tl._llen > 80: + use_hours = True # For big screens, make it more readable. 
else: - ui_end = tl.add(' ' * 5) - + use_hours = False + ui_size = tl.add(' | %5sB' % total_size) + ui_time = tl.add(' %s' % format_time(self.re.elapsed_time(), use_hours)) + ui_end, not_done = _term_add_end(tl, self.size, amount_read) out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text, ui_size, ui_time, ui_end) self.fo.write(out) @@ -331,14 +352,23 @@ class MultiFileHelper(BaseMeter): def message(self, message): self.master.message_meter(self, message) +class _FakeLock: + def acquire(self): + pass + def release(self): + pass + class MultiFileMeter: helperclass = MultiFileHelper - def __init__(self): + def __init__(self, threaded=True): self.meters = [] self.in_progress_meters = [] - self._lock = thread.allocate_lock() + if threaded: + self._lock = thread.allocate_lock() + else: + self._lock = _FakeLock() self.update_period = 0.3 # seconds - + self.numfiles = None self.finished_files = 0 self.failed_files = 0 @@ -369,8 +399,9 @@ class MultiFileMeter: def end(self, now=None): if now is None: now = time.time() + self.re.update(self._amount_read(), now) self._do_end(now) - + def _do_end(self, now): pass @@ -383,10 +414,10 @@ class MultiFileMeter: newmeter = self.helperclass(self) self.meters.append(newmeter) return newmeter - + def removeMeter(self, meter): self.meters.remove(meter) - + ########################################################### # child functions - these should only be called by helpers def start_meter(self, meter, now): @@ -400,15 +431,15 @@ class MultiFileMeter: finally: self._lock.release() self._do_start_meter(meter, now) - + def _do_start_meter(self, meter, now): pass - + def update_meter(self, meter, now): if not meter in self.meters: raise ValueError('attempt to use orphaned meter') - if (now >= self.last_update_time + self.update_period) or \ - not self.last_update_time: + if (not self.last_update_time or + (now >= self.last_update_time + self.update_period)): self.re.update(self._amount_read(), now) self.last_update_time = now self._do_update_meter(meter, now) @@ -466,34 +497,83 @@ class MultiFileMeter: class TextMultiFileMeter(MultiFileMeter): - def __init__(self, fo=sys.stderr): + def __init__(self, fo=sys.stderr, threaded=True): self.fo = fo - MultiFileMeter.__init__(self) + MultiFileMeter.__init__(self, threaded) + self.index_time = self.index = 0 # files: ###/### ###% data: ######/###### ###% time: ##:##:##/##:##:## +# New output, like TextMeter output... +# update: No size (minimal: 17 chars) +# ----------------------------------- +# (<#file>/<#tot files>): <text> <rate> | <current size> <elapsed> +# 8-48 1 8 3 6 1 7-9 5 +# +# update: Size, All files +# ----------------------- +# (<#file>/<#tot files>): <text> <pc> <bar> <rate> | <size> <eta time> ETA +# 8-22 1 3-4 1 6-12 1 8 3 6 1 7-9 1 3 1 +# end +# --- +# <text> | <file size> <file elapsed time> +# 8-56 3 6 1 9 5 def _do_update_meter(self, meter, now): self._lock.acquire() try: - format = "files: %3i/%-3i %3i%% data: %6.6s/%-6.6s %3i%% " \ - "time: %8.8s/%8.8s" df = self.finished_files tf = self.numfiles or 1 - pf = 100 * float(df)/tf + 0.49 + # Don't use "percent of files complete" ... 
+ # pf = 100 * float(df)/tf + 0.49 dd = self.re.last_amount_read - td = self.total_size + td = self.re.total pd = 100 * (self.re.fraction_read() or 0) + 0.49 dt = self.re.elapsed_time() rt = self.re.remaining_time() - if rt is None: tt = None - else: tt = dt + rt - - fdd = format_number(dd) + 'B' - ftd = format_number(td) + 'B' - fdt = format_time(dt, 1) - ftt = format_time(tt, 1) - - out = '%-79.79s' % (format % (df, tf, pf, fdd, ftd, pd, fdt, ftt)) - self.fo.write('\r' + out) + + frac = self.re.fraction_read() or 0 + pf = 100 * frac + ave_dl = format_number(self.re.average_rate()) + + # cycle through active meters + if now > self.index_time: + self.index_time = now + 1.0 + self.index += 1 + if self.index >= len(self.meters): + self.index = 0 + meter = self.meters[self.index] + text = meter.text or meter.basename + if tf > 1: + text = '(%u/%u): %s' % (df+1+self.index, tf, text) + + # Include text + ui_rate in minimal + tl = TerminalLine(8, 8+1+8) + if tl._llen > 80: + use_hours = True # For big screens, make it more readable. + else: + use_hours = False + ui_size = tl.add(' | %5sB' % format_number(dd)) + if not self.re.total: + ui_time = tl.add(' %s' % format_time(dt, use_hours)) + ui_end = tl.add(' ' * 5) + ui_rate = tl.add(' %5sB/s' % ave_dl) + out = '\r%-*.*s%s%s%s%s\r' % (tl.rest(), tl.rest(), text, + ui_rate, ui_size, ui_time, ui_end) + else: + ui_time = tl.add(' %s' % format_time(rt, use_hours)) + ui_end = tl.add(' ETA ') + + ui_sofar_pc = tl.add(' %i%%' % pf, + full_len=len(" (100%)")) + ui_rate = tl.add(' %5sB/s' % ave_dl) + + # Make text grow a bit before we start growing the bar too + blen = 4 + tl.rest_split(8 + 8 + 4) + ui_bar = _term_add_bar(tl, blen, frac) + out = '\r%-*.*s%s%s%s%s%s%s\r' % (tl.rest(), tl.rest(), text, + ui_sofar_pc, ui_bar, + ui_rate, ui_size, ui_time, + ui_end) + self.fo.write(out) self.fo.flush() finally: self._lock.release() @@ -502,25 +582,39 @@ class TextMultiFileMeter(MultiFileMeter): self._lock.acquire() try: format = "%-30.30s %6.6s %8.8s %9.9s" - fn = meter.basename + fn = meter.text or meter.basename size = meter.last_amount_read fsize = format_number(size) + 'B' et = meter.re.elapsed_time() - fet = format_time(et, 1) - frate = format_number(size / et) + 'B/s' - - out = '%-79.79s' % (format % (fn, fsize, fet, frate)) - self.fo.write('\r' + out + '\n') + frate = format_number(et and size / et) + 'B/s' + df = self.finished_files + tf = self.numfiles or 1 + + total_size = format_number(size) + text = meter.text or meter.basename + if tf > 1: + text = '(%u/%u): %s' % (df, tf, text) + + tl = TerminalLine(8) + if tl._llen > 80: + use_hours = True # For big screens, make it more readable. 
+ else: + use_hours = False + ui_size = tl.add(' | %5sB' % total_size) + ui_time = tl.add(' %s' % format_time(et, use_hours)) + ui_end, not_done = _term_add_end(tl, meter.size, size) + out = '\r%-*.*s%s%s%s\n' % (tl.rest(), tl.rest(), text, + ui_size, ui_time, ui_end) + self.fo.write(out) finally: self._lock.release() - self._do_update_meter(meter, now) def _do_failure_meter(self, meter, message, now): self._lock.acquire() try: format = "%-30.30s %6.6s %s" - fn = meter.basename - if type(message) in (type(''), type(u'')): + fn = meter.text or meter.basename + if isinstance(message, string_types): message = message.splitlines() if not message: message = [''] out = '%-79s' % (format % (fn, 'FAILED', message[0] or '')) @@ -537,15 +631,6 @@ class TextMultiFileMeter(MultiFileMeter): finally: self._lock.release() - def _do_end(self, now): - self._do_update_meter(None, now) - self._lock.acquire() - try: - self.fo.write('\n') - self.fo.flush() - finally: - self._lock.release() - ###################################################################### # support classes and functions @@ -560,13 +645,17 @@ class RateEstimator: self.last_update_time = now self.last_amount_read = 0 self.ave_rate = None - + def update(self, amount_read, now=None): if now is None: now = time.time() - if amount_read == 0: + # libcurl calls the progress callback when fetching headers + # too, thus amount_read = 0 .. hdr_size .. 0 .. content_size. + # Ocassionally we miss the 2nd zero and report avg speed < 0. + # Handle read_diff < 0 here. BZ 1001767. + if amount_read == 0 or amount_read < self.last_amount_read: # if we just started this file, all bets are off self.last_update_time = now - self.last_amount_read = 0 + self.last_amount_read = amount_read self.ave_rate = None return @@ -576,11 +665,11 @@ class RateEstimator: # First update, on reget is the file size if self.last_amount_read: self.last_update_time = now - self.ave_rate = self._temporal_rolling_ave(\ + self.ave_rate = self._temporal_rolling_ave( time_diff, read_diff, self.ave_rate, self.timescale) self.last_amount_read = amount_read #print 'results', time_diff, read_diff, self.ave_rate - + ##################################################################### # result methods def average_rate(self): @@ -616,14 +705,14 @@ class RateEstimator: epsilon = time_diff / timescale if epsilon > 1: epsilon = 1.0 return self._rolling_ave(time_diff, read_diff, last_ave, epsilon) - + def _rolling_ave(self, time_diff, read_diff, last_ave, epsilon): """perform a "rolling average" iteration a rolling average "folds" new data into an existing average with some weight, epsilon. 
epsilon must be between 0.0 and 1.0 (inclusive) a value of 0.0 means only the old value (initial value) counts, and a value of 1.0 means only the newest value is considered.""" - + try: recent_rate = read_diff / time_diff except ZeroDivisionError: @@ -652,23 +741,25 @@ class RateEstimator: rt = int(rt) if shift <= 0: return rt return float(int(rt) >> shift << shift) - + def format_time(seconds, use_hours=0): if seconds is None or seconds < 0: if use_hours: return '--:--:--' else: return '--:--' + elif seconds == float('inf'): + return 'Infinite' else: seconds = int(seconds) - minutes = seconds / 60 + minutes = seconds // 60 seconds = seconds % 60 if use_hours: - hours = minutes / 60 + hours = minutes // 60 minutes = minutes % 60 return '%02i:%02i:%02i' % (hours, minutes, seconds) else: return '%02i:%02i' % (minutes, seconds) - + def format_number(number, SI=0, space=' '): """Turn numbers into human-readable metric-like numbers""" symbols = ['', # (none) @@ -680,14 +771,14 @@ def format_number(number, SI=0, space=' '): 'E', # exa 'Z', # zetta 'Y'] # yotta - + if SI: step = 1000.0 else: step = 1024.0 thresh = 999 depth = 0 max_depth = len(symbols) - 1 - + # we want numbers between 0 and thresh, but don't exceed the length # of our list. In that event, the formatting will be screwed up, # but it'll still show the right number. @@ -695,7 +786,7 @@ def format_number(number, SI=0, space=' '): depth = depth + 1 number = number / step - if type(number) == type(1) or type(number) == type(1L): + if isinstance(number, integer_types): # it's an int or a long, which means it didn't get divided, # which means it's already short enough format = '%i%s%s' @@ -705,7 +796,7 @@ def format_number(number, SI=0, space=' '): format = '%.1f%s%s' else: format = '%.0f%s%s' - + return(format % (float(number or 0), space, symbols[depth])) def _tst(fn, cur, tot, beg, size, *args): @@ -722,9 +813,77 @@ def _tst(fn, cur, tot, beg, size, *args): time.sleep(delay) tm.end(size) +def _mtst(datas, *args): + print('-' * 79) + tm = TextMultiFileMeter(threaded=False) + + dl_sizes = {} + + num = 0 + total_size = 0 + dl_total_size = 0 + for data in datas: + dl_size = None + if len(data) == 2: + fn, size = data + dl_size = size + if len(data) == 3: + fn, size, dl_size = data + nm = tm.newMeter() + nm.start(fn, "http://www.example.com/path/to/fn/" + fn, fn, size, + text=fn) + num += 1 + assert dl_size is not None + dl_total_size += dl_size + dl_sizes[nm] = dl_size + if size is None or total_size is None: + total_size = None + else: + total_size += size + tm.start(num, total_size) + + num = 0 + off = 0 + for (inc, delay) in args: + off += 1 + while num < ((dl_total_size * off) / len(args)): + num += inc + for nm in tm.meters[:]: + if dl_sizes[nm] <= num: + nm.end(dl_sizes[nm]) + tm.removeMeter(nm) + else: + nm.update(num) + time.sleep(delay) + assert not tm.meters + if __name__ == "__main__": - # (1/2): subversion-1.4.4-7.x86_64.rpm 2.4 MB / 85 kB/s 00:28 - # (2/2): mercurial-0.9.5-6.fc8.x86_64.rpm 924 kB / 106 kB/s 00:08 + # (1/2): subversion-1.4.4-7.x86_64.rpm 2.4 MB / 85 kB/s 00:28 + # (2/2): mercurial-0.9.5-6.fc8.x86_64.rpm 924 kB / 106 kB/s 00:08 + if len(sys.argv) >= 2 and sys.argv[1] == 'multi': + _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000), + ("s-1.0.1-1.fc8.i386.rpm", 5000), + ("m-1.0.1-2.fc8.i386.rpm", 10000)), + (100, 0.33), (500, 0.25), (1000, 0.1)) + + _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000), + ("s-1.0.1-1.fc8.i386.rpm", 5000), + ("m-1.0.1-2.fc8.i386.rpm", None, 10000)), + (100, 0.33), (500, 0.25), (1000, 0.1)) + + 
 _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000),
+ ("s-1.0.1-1.fc8.i386.rpm", 2500000),
+ ("m-1.0.1-2.fc8.i386.rpm", 10000)),
+ (10, 0.2), (50, 0.1), (1000, 0.1))
+
+ _mtst((("sm-1.0.0-1.fc8.i386.rpm", 1000),
+ ("s-1.0.1-1.fc8.i386.rpm", None, 2500000),
+ ("m-1.0.1-2.fc8.i386.rpm", None, 10000)),
+ (10, 0.2), (50, 0.1), (1000, 0.1))
+ # (10, 0.2), (100, 0.1), (100, 0.1), (100, 0.25))
+ # (10, 0.2), (100, 0.1), (100, 0.1), (100, 0.25))
+ sys.exit(0)
+
 if len(sys.argv) >= 2 and sys.argv[1] == 'total':
 text_meter_total_size(1000 + 10000 + 10000 + 1000000 + 1000000 +
 1000000 + 10000 + 10000 + 10000 + 1000000)
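
A note on the rewritten _do_update_meter hunk above: the new code budgets a single terminal row by reserving fixed-width fields first (percentage, bar, rate, size, time/ETA via TerminalLine.add) and then padding or truncating the file name into whatever columns remain with '%-*.*s' % (tl.rest(), tl.rest(), text). The snippet below is a minimal, self-contained sketch of that layout idea only; Line and render are hypothetical stand-ins, not the patch's TerminalLine implementation.

# Hypothetical sketch: reserve fixed-width fields, then give the rest of
# the row to the text column, as the '%-*.*s' formatting above does.
class Line:
    def __init__(self, width=80):
        self._left = width

    def add(self, field):
        self._left -= len(field)   # reserve columns for a fixed field
        return field

    def rest(self):
        return self._left          # columns still available for the text

def render(text, frac, rate, size):
    tl = Line()
    ui_pc   = tl.add(' %i%%' % int(100 * frac))
    ui_rate = tl.add(' %5sB/s' % rate)
    ui_size = tl.add(' | %5sB' % size)
    return '\r%-*.*s%s%s%s' % (tl.rest(), tl.rest(), text,
                               ui_pc, ui_rate, ui_size)

print(render('(1/2): subversion-1.4.4-7.x86_64.rpm', 0.42, '85 k', '1.0 M'))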
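
On the RateEstimator.update hunk: the added guard treats any backwards jump in amount_read like a fresh start, because libcurl reports header progress first and the counter can fall back to zero, which would otherwise yield a negative read_diff and a nonsense average (BZ 1001767). The weight fed to the rolling average comes from _temporal_rolling_ave: epsilon = time_diff / timescale, capped at 1.0, so rapid callbacks barely move the estimate while a long gap lets the newest sample dominate. Below is a small worked example of that weighting with an assumed 5 second timescale; the blend on the last line assumes the usual exponential form, since the actual _rolling_ave update line falls outside the hunks shown.

# Worked example (assumed values; the blend formula is an assumption).
timescale = 5.0                  # assumed smoothing window, seconds
old_ave   = 100 * 1024.0         # average so far: ~100 kB/s

time_diff = 0.5                  # half a second since the last callback
read_diff = 200 * 1024           # 200 kB arrived in that window

epsilon = min(time_diff / timescale, 1.0)   # 0.1 -> old data dominates
recent  = read_diff / time_diff             # 400 kB/s instantaneous rate
new_ave = epsilon * recent + (1.0 - epsilon) * old_ave
print(new_ave / 1024)            # ~130 kB/s: a gentle move toward the burst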
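
Finally, the formatter hunks: format_time now uses floor division (so it returns the same strings under Python 3), reports an infinite estimate as 'Infinite', and still falls back to dashes for unknown times; format_number keeps its 1024-based scaling unless SI=1. With the defaults shown above, the expected outputs are:

print(format_time(28))                    # '00:28'
print(format_time(None))                  # '--:--'
print(format_time(3700, use_hours=1))     # '01:01:40'
print(format_number(85 * 1024))           # '85 k'   (1024-based steps)
print(format_number(2.4 * 1024 * 1024))   # '2.4 M'
print(format_number(512))                 # '512 '   (below the threshold: no prefix)

The __main__ driver above can be used to eyeball the meters from a real terminal, e.g. "python progress.py multi" for the multi-file output or "python progress.py total" for the shared-total case.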