Diffstat (limited to 'urlgrabber/grabber.py')
 -rw-r--r-- urlgrabber/grabber.py | 1730
 1 file changed, 1730 insertions(+), 0 deletions(-)
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
new file mode 100644
index 0000000..e090e90
--- /dev/null
+++ b/urlgrabber/grabber.py
@@ -0,0 +1,1730 @@
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
+# Boston, MA 02111-1307 USA
+
+# This file is part of urlgrabber, a high-level cross-protocol url-grabber
+# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
+# Copyright 2009 Red Hat inc, pycurl code written by Seth Vidal
+
+"""A high-level cross-protocol url-grabber.
+
+GENERAL ARGUMENTS (kwargs)
+
+ Where possible, the module-level default is indicated, and legal
+ values are provided.
+
+ copy_local = 0 [0|1]
+
+ ignored except for file:// urls, in which case it specifies
+ whether urlgrab should still make a copy of the file, or simply
+ point to the existing copy. The module level default for this
+ option is 0.
+
+ close_connection = 0 [0|1]
+
+ tells URLGrabber to close the connection after a file has been
+    transferred. This is ignored unless the download happens with the
+ http keepalive handler (keepalive=1). Otherwise, the connection
+ is left open for further use. The module level default for this
+ option is 0 (keepalive connections will not be closed).
+
+ keepalive = 1 [0|1]
+
+ specifies whether keepalive should be used for HTTP/1.1 servers
+ that support it. The module level default for this option is 1
+ (keepalive is enabled).
+
+ progress_obj = None
+
+    a class instance that supports the following methods:
+      po.start(filename, url, basename, size, text)
+      # size will be None if unknown
+      po.update(read) # read == bytes read so far
+      po.end(read)    # read == total bytes read
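+
+    For example, a minimal progress object might look like this (a
+    sketch; any class providing these three methods will do):
+
+      class SimpleProgress:
+          def start(self, filename=None, url=None, basename=None,
+                    size=None, text=None, **kwargs):
+              self.basename = basename
+          def update(self, read):
+              print '%s: %i bytes' % (self.basename, read)
+          def end(self, read):
+              print '%s: done (%i bytes)' % (self.basename, read)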
+
+ text = None
+
+ specifies alternative text to be passed to the progress meter
+ object. If not given, the default progress meter will use the
+ basename of the file.
+
+ throttle = 1.0
+
+ a number - if it's an int, it's the bytes/second throttle limit.
+ If it's a float, it is first multiplied by bandwidth. If throttle
+ == 0, throttling is disabled. If None, the module-level default
+ (which can be set on default_grabber.throttle) is used. See
+ BANDWIDTH THROTTLING for more information.
+
+ timeout = None
+
+ a positive float expressing the number of seconds to wait for socket
+ operations. If the value is None or 0.0, socket operations will block
+ forever. Setting this option causes urlgrabber to call the settimeout
+ method on the Socket object used for the request. See the Python
+ documentation on settimeout for more information.
+ http://www.python.org/doc/current/lib/socket-objects.html
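+
+    For example, to give up if a socket operation takes longer than
+    30 seconds:
+
+      urlgrab(url, filename, timeout=30.0)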
+
+ bandwidth = 0
+
+ the nominal max bandwidth in bytes/second. If throttle is a float
+ and bandwidth == 0, throttling is disabled. If None, the
+ module-level default (which can be set on
+ default_grabber.bandwidth) is used. See BANDWIDTH THROTTLING for
+ more information.
+
+ range = None
+
+    a tuple of the form (first_byte, last_byte) describing a byte
+    range to retrieve. Either or both of the values may be set to
+    None. If first_byte is None, byte offset 0 is assumed. If
+    last_byte is None, the last byte available is assumed. Note that
+    the range specification is python-like in that (0,10) will yield
+    the first 10 bytes of the file.
+
+ If set to None, no range will be used.
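+
+    For example, to fetch only the first 1024 bytes of a file:
+
+      urlgrab(url, filename, range=(0, 1024))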
+
+ reget = None [None|'simple'|'check_timestamp']
+
+ whether to attempt to reget a partially-downloaded file. Reget
+ only applies to .urlgrab and (obviously) only if there is a
+ partially downloaded file. Reget has two modes:
+
+ 'simple' -- the local file will always be trusted. If there
+ are 100 bytes in the local file, then the download will always
+ begin 100 bytes into the requested file.
+
+ 'check_timestamp' -- the timestamp of the server file will be
+ compared to the timestamp of the local file. ONLY if the
+ local file is newer than or the same age as the server file
+ will reget be used. If the server file is newer, or the
+ timestamp is not returned, the entire file will be fetched.
+
+ NOTE: urlgrabber can do very little to verify that the partial
+ file on disk is identical to the beginning of the remote file.
+ You may want to either employ a custom "checkfunc" or simply avoid
+ using reget in situations where corruption is a concern.
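+
+    For example, to resume an interrupted download, trusting whatever
+    portion is already on disk:
+
+      urlgrab('http://foo.com/big.iso', '/tmp/big.iso', reget='simple')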
+
+ user_agent = 'urlgrabber/VERSION'
+
+ a string, usually of the form 'AGENT/VERSION' that is provided to
+ HTTP servers in the User-agent header. The module level default
+ for this option is "urlgrabber/VERSION".
+
+ http_headers = None
+
+ a tuple of 2-tuples, each containing a header and value. These
+ will be used for http and https requests only. For example, you
+ can do
+ http_headers = (('Pragma', 'no-cache'),)
+
+ ftp_headers = None
+
+ this is just like http_headers, but will be used for ftp requests.
+
+ proxies = None
+
+ a dictionary that maps protocol schemes to proxy hosts. For
+ example, to use a proxy server on host "foo" port 3128 for http
+ and https URLs:
+ proxies={ 'http' : 'http://foo:3128', 'https' : 'http://foo:3128' }
+ note that proxy authentication information may be provided using
+ normal URL constructs:
+      proxies={ 'http' : 'http://user:password@foo:3128' }
+ Lastly, if proxies is None, the default environment settings will
+ be used.
+
+ prefix = None
+
+ a url prefix that will be prepended to all requested urls. For
+ example:
+ g = URLGrabber(prefix='http://foo.com/mirror/')
+ g.urlgrab('some/file.txt')
+ ## this will fetch 'http://foo.com/mirror/some/file.txt'
+ This option exists primarily to allow identical behavior to
+ MirrorGroup (and derived) instances. Note: a '/' will be inserted
+ if necessary, so you cannot specify a prefix that ends with a
+ partial file or directory name.
+
+  opener = None
+
+    No-op when using the curl backend (default)
+
+  cache_openers = True
+
+    No-op when using the curl backend (default)
+
+ data = None
+
+ Only relevant for the HTTP family (and ignored for other
+ protocols), this allows HTTP POSTs. When the data kwarg is
+ present (and not None), an HTTP request will automatically become
+ a POST rather than GET. This is done by direct passthrough to
+ urllib2. If you use this, you may also want to set the
+ 'Content-length' and 'Content-type' headers with the http_headers
+    option. Note that python 2.2 handles the case of these headers
+    badly and if you do not use the proper case (shown here), your
+ values will be overridden with the defaults.
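+
+    For example, to POST a urlencoded form:
+
+      urlgrab(url, filename, data='cheese=edam&crackers=mmm',
+              http_headers=(('Content-type',
+                             'application/x-www-form-urlencoded'),))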
+
+ urlparser = URLParser()
+
+ The URLParser class handles pre-processing of URLs, including
+    auth-handling for user/pass encoded in http urls, file handling
+ (that is, filenames not sent as a URL), and URL quoting. If you
+ want to override any of this behavior, you can pass in a
+ replacement instance. See also the 'quote' option.
+
+ quote = None
+
+ Whether or not to quote the path portion of a url.
+ quote = 1 -> quote the URLs (they're not quoted yet)
+ quote = 0 -> do not quote them (they're already quoted)
+ quote = None -> guess what to do
+
+ This option only affects proper urls like 'file:///etc/passwd'; it
+ does not affect 'raw' filenames like '/etc/passwd'. The latter
+ will always be quoted as they are converted to URLs. Also, only
+ the path part of a url is quoted. If you need more fine-grained
+ control, you should probably subclass URLParser and pass it in via
+ the 'urlparser' option.
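+
+    For example, if your urls are already quoted:
+
+      urlgrab('http://foo.com/some%20file.txt', quote=0)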
+
+ ssl_ca_cert = None
+
+ this option can be used if M2Crypto is available and will be
+ ignored otherwise. If provided, it will be used to create an SSL
+ context. If both ssl_ca_cert and ssl_context are provided, then
+ ssl_context will be ignored and a new context will be created from
+ ssl_ca_cert.
+
+ ssl_context = None
+
+ No-op when using the curl backend (default)
+
+
+  ssl_verify_peer = True
+
+    Check the server's certificate to make sure it is valid against
+    our CA certificates
+
+  ssl_verify_host = True
+
+    Check the server's hostname to make sure it matches the
+    certificate DN
+
+  ssl_key = None
+
+    Path to the key the client should use to connect/authenticate with
+
+  ssl_key_type = 'PEM'
+
+    PEM or DER - format of key
+
+  ssl_cert = None
+
+    Path to the ssl certificate the client should use to
+    authenticate with
+
+  ssl_cert_type = 'PEM'
+
+    PEM or DER - format of certificate
+
+  ssl_key_pass = None
+
+    password to access the ssl_key
+
+  size = None
+
+    Maximum size (in bytes) of the thing being downloaded. This is
+    mostly to keep us from exploding with an endless datastream
+
+  max_header_size = 2097152
+
+    Maximum size (in bytes) of the headers.
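+
+    For example, to bail out if a download grows beyond (roughly) one
+    megabyte:
+
+      urlgrab(url, filename, size=1048576)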
+
+
+RETRY RELATED ARGUMENTS
+
+ retry = None
+
+    the number of times to retry the grab before bailing. If this is
+    zero, it will retry forever. This was intentional... really, it
+    was :). If this value is not supplied, or is supplied but is None,
+    retrying does not occur.
+
+ retrycodes = [-1,2,4,5,6,7]
+
+ a sequence of errorcodes (values of e.errno) for which it should
+ retry. See the doc on URLGrabError for more details on this. You
+ might consider modifying a copy of the default codes rather than
+ building yours from scratch so that if the list is extended in the
+ future (or one code is split into two) you can still enjoy the
+ benefits of the default list. You can do that with something like
+ this:
+
+ retrycodes = urlgrabber.grabber.URLGrabberOptions().retrycodes
+ if 12 not in retrycodes:
+ retrycodes.append(12)
+
+ checkfunc = None
+
+ a function to do additional checks. This defaults to None, which
+ means no additional checking. The function should simply return
+ on a successful check. It should raise URLGrabError on an
+ unsuccessful check. Raising of any other exception will be
+ considered immediate failure and no retries will occur.
+
+ If it raises URLGrabError, the error code will determine the retry
+ behavior. Negative error numbers are reserved for use by these
+ passed in functions, so you can use many negative numbers for
+ different types of failure. By default, -1 results in a retry,
+ but this can be customized with retrycodes.
+
+ If you simply pass in a function, it will be given exactly one
+ argument: a CallbackObject instance with the .url attribute
+ defined and either .filename (for urlgrab) or .data (for urlread).
+ For urlgrab, .filename is the name of the local file. For
+ urlread, .data is the actual string data. If you need other
+ arguments passed to the callback (program state of some sort), you
+ can do so like this:
+
+ checkfunc=(function, ('arg1', 2), {'kwarg': 3})
+
+ if the downloaded file has filename /tmp/stuff, then this will
+ result in this call (for urlgrab):
+
+ function(obj, 'arg1', 2, kwarg=3)
+ # obj.filename = '/tmp/stuff'
+ # obj.url = 'http://foo.com/stuff'
+
+ NOTE: both the "args" tuple and "kwargs" dict must be present if
+ you use this syntax, but either (or both) can be empty.
+
+ failure_callback = None
+
+ The callback that gets called during retries when an attempt to
+ fetch a file fails. The syntax for specifying the callback is
+ identical to checkfunc, except for the attributes defined in the
+ CallbackObject instance. The attributes for failure_callback are:
+
+ exception = the raised exception
+ url = the url we're trying to fetch
+ tries = the number of tries so far (including this one)
+ retry = the value of the retry option
+
+ The callback is present primarily to inform the calling program of
+ the failure, but if it raises an exception (including the one it's
+ passed) that exception will NOT be caught and will therefore cause
+ future retries to be aborted.
+
+ The callback is called for EVERY failure, including the last one.
+ On the last try, the callback can raise an alternate exception,
+ but it cannot (without severe trickiness) prevent the exception
+ from being raised.
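+
+    For example, a callback that simply reports each failed attempt
+    might look like this (a sketch):
+
+      def notify(obj):
+          print 'attempt %i/%s failed: %s' % \
+                (obj.tries, obj.retry, obj.exception)
+
+      urlgrab(url, filename, retry=3, failure_callback=notify)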
+
+ interrupt_callback = None
+
+ This callback is called if KeyboardInterrupt is received at any
+ point in the transfer. Basically, this callback can have three
+ impacts on the fetch process based on the way it exits:
+
+ 1) raise no exception: the current fetch will be aborted, but
+ any further retries will still take place
+
+ 2) raise a URLGrabError: if you're using a MirrorGroup, then
+ this will prompt a failover to the next mirror according to
+ the behavior of the MirrorGroup subclass. It is recommended
+ that you raise URLGrabError with code 15, 'user abort'. If
+ you are NOT using a MirrorGroup subclass, then this is the
+ same as (3).
+
+ 3) raise some other exception (such as KeyboardInterrupt), which
+ will not be caught at either the grabber or mirror levels.
+ That is, it will be raised up all the way to the caller.
+
+ This callback is very similar to failure_callback. They are
+ passed the same arguments, so you could use the same function for
+ both.
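+
+    For example, to turn a ctrl-c into a mirror failover (a sketch):
+
+      def on_interrupt(obj):
+          raise URLGrabError(15, 'user abort')
+
+      urlgrab(url, filename, interrupt_callback=on_interrupt)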
+
+BANDWIDTH THROTTLING
+
+  urlgrabber supports throttling via two values: throttle and
+  bandwidth. Between the two, you can either specify an absolute
+  throttle threshold or specify a threshold as a fraction of maximum
+  available bandwidth.
+
+ throttle is a number - if it's an int, it's the bytes/second
+ throttle limit. If it's a float, it is first multiplied by
+ bandwidth. If throttle == 0, throttling is disabled. If None, the
+ module-level default (which can be set with set_throttle) is used.
+
+ bandwidth is the nominal max bandwidth in bytes/second. If throttle
+ is a float and bandwidth == 0, throttling is disabled. If None, the
+ module-level default (which can be set with set_bandwidth) is used.
+
+ THROTTLING EXAMPLES:
+
+  Let's say you have a 100 Mbps connection. This is (about) 10^8 bits
+ per second, or 12,500,000 Bytes per second. You have a number of
+ throttling options:
+
+ *) set_bandwidth(12500000); set_throttle(0.5) # throttle is a float
+
+ This will limit urlgrab to use half of your available bandwidth.
+
+ *) set_throttle(6250000) # throttle is an int
+
+ This will also limit urlgrab to use half of your available
+ bandwidth, regardless of what bandwidth is set to.
+
+  *) set_bandwidth(6250000); set_throttle(1.0) # float
+
+ Use half your bandwidth
+
+  *) set_bandwidth(6250000); set_throttle(2.0) # float
+
+ Use up to 12,500,000 Bytes per second (your nominal max bandwidth)
+
+  *) set_bandwidth(6250000); set_throttle(0) # throttle = 0
+
+ Disable throttling - this is more efficient than a very large
+ throttle setting.
+
+  *) set_bandwidth(0); set_throttle(1.0) # throttle is float, bandwidth = 0
+
+ Disable throttling - this is the default when the module is loaded.
+
+ SUGGESTED AUTHOR IMPLEMENTATION (THROTTLING)
+
+ While this is flexible, it's not extremely obvious to the user. I
+ suggest you implement a float throttle as a percent to make the
+ distinction between absolute and relative throttling very explicit.
+
+ Also, you may want to convert the units to something more convenient
+ than bytes/second, such as kbps or kB/s, etc.
+
+"""
+
+
+
+import os
+import sys
+import urlparse
+import time
+import string
+import urllib
+import urllib2
+import mimetools
+import thread
+import types
+import stat
+import pycurl
+from ftplib import parse150
+from StringIO import StringIO
+from httplib import HTTPException
+import socket
+from byterange import range_tuple_normalize, range_tuple_to_header, RangeError
+
+########################################################################
+# MODULE INITIALIZATION
+########################################################################
+try:
+ exec('from ' + (__name__.split('.'))[0] + ' import __version__')
+except:
+ __version__ = '???'
+
+########################################################################
+# functions for debugging output. These functions are here because they
+# are also part of the module initialization.
+DEBUG = None
+def set_logger(DBOBJ):
+ """Set the DEBUG object. This is called by _init_default_logger when
+ the environment variable URLGRABBER_DEBUG is set, but can also be
+ called by a calling program. Basically, if the calling program uses
+ the logging module and would like to incorporate urlgrabber logging,
+ then it can do so this way. It's probably not necessary as most
+ internal logging is only for debugging purposes.
+
+ The passed-in object should be a logging.Logger instance. It will
+ be pushed into the keepalive and byterange modules if they're
+ being used. The mirror module pulls this object in on import, so
+    you will need to push it in manually. In fact, you may find it
+ tidier to simply push your logging object (or objects) into each
+ of these modules independently.
+ """
+
+ global DEBUG
+ DEBUG = DBOBJ
+
+def _init_default_logger(logspec=None):
+ '''Examines the environment variable URLGRABBER_DEBUG and creates
+ a logging object (logging.logger) based on the contents. It takes
+ the form
+
+ URLGRABBER_DEBUG=level,filename
+
+ where "level" can be either an integer or a log level from the
+ logging module (DEBUG, INFO, etc). If the integer is zero or
+ less, logging will be disabled. Filename is the filename where
+ logs will be sent. If it is "-", then stdout will be used. If
+ the filename is empty or missing, stderr will be used. If the
+ variable cannot be processed or the logging module cannot be
+ imported (python < 2.3) then logging will be disabled. Here are
+ some examples:
+
+ URLGRABBER_DEBUG=1,debug.txt # log everything to debug.txt
+ URLGRABBER_DEBUG=WARNING,- # log warning and higher to stdout
+ URLGRABBER_DEBUG=INFO # log info and higher to stderr
+
+    This function is called during module initialization. It is not
+ intended to be called from outside. The only reason it is a
+ function at all is to keep the module-level namespace tidy and to
+ collect the code into a nice block.'''
+
+ try:
+ if logspec is None:
+ logspec = os.environ['URLGRABBER_DEBUG']
+ dbinfo = logspec.split(',')
+ import logging
+ level = logging._levelNames.get(dbinfo[0], None)
+ if level is None: level = int(dbinfo[0])
+ if level < 1: raise ValueError()
+
+ formatter = logging.Formatter('%(asctime)s %(message)s')
+ if len(dbinfo) > 1: filename = dbinfo[1]
+ else: filename = ''
+ if filename == '': handler = logging.StreamHandler(sys.stderr)
+ elif filename == '-': handler = logging.StreamHandler(sys.stdout)
+ else: handler = logging.FileHandler(filename)
+ handler.setFormatter(formatter)
+ DBOBJ = logging.getLogger('urlgrabber')
+ DBOBJ.addHandler(handler)
+ DBOBJ.setLevel(level)
+ except (KeyError, ImportError, ValueError):
+ DBOBJ = None
+ set_logger(DBOBJ)
+
+def _log_package_state():
+    if not DEBUG: return
+    DEBUG.info('urlgrabber version = %s' % __version__)
+    DEBUG.info('trans function "_" = %s' % _)
+
+# normally this would be from i18n or something like it ...
+# it must be defined before _log_package_state() runs below, which
+# references it when debugging is enabled
+def _(st):
+    return st
+
+_init_default_logger()
+_log_package_state()
+
+########################################################################
+# END MODULE INITIALIZATION
+########################################################################
+
+
+
+class URLGrabError(IOError):
+ """
+ URLGrabError error codes:
+
+ URLGrabber error codes (0 -- 255)
+ 0 - everything looks good (you should never see this)
+ 1 - malformed url
+ 2 - local file doesn't exist
+ 3 - request for non-file local file (dir, etc)
+ 4 - IOError on fetch
+ 5 - OSError on fetch
+ 6 - no content length header when we expected one
+ 7 - HTTPException
+ 8 - Exceeded read limit (for urlread)
+ 9 - Requested byte range not satisfiable.
+ 10 - Byte range requested, but range support unavailable
+ 11 - Illegal reget mode
+ 12 - Socket timeout
+ 13 - malformed proxy url
+ 14 - HTTPError (includes .code and .exception attributes)
+ 15 - user abort
+ 16 - error writing to local file
+
+ MirrorGroup error codes (256 -- 511)
+ 256 - No more mirrors left to try
+
+ Custom (non-builtin) classes derived from MirrorGroup (512 -- 767)
+ [ this range reserved for application-specific error codes ]
+
+ Retry codes (< 0)
+ -1 - retry the download, unknown reason
+
+ Note: to test which group a code is in, you can simply do integer
+ division by 256: e.errno / 256
+
+ Negative codes are reserved for use by functions passed in to
+ retrygrab with checkfunc. The value -1 is built in as a generic
+ retry code and is already included in the retrycodes list.
+ Therefore, you can create a custom check function that simply
+ returns -1 and the fetch will be re-tried. For more customized
+ retries, you can use other negative number and include them in
+ retry-codes. This is nice for outputting useful messages about
+ what failed.
+
+ You can use these error codes like so:
+ try: urlgrab(url)
+ except URLGrabError, e:
+ if e.errno == 3: ...
+ # or
+ print e.strerror
+ # or simply
+ print e #### print '[Errno %i] %s' % (e.errno, e.strerror)
+ """
+ def __init__(self, *args):
+ IOError.__init__(self, *args)
+ self.url = "No url specified"
+
+class CallbackObject:
+ """Container for returned callback data.
+
+ This is currently a dummy class into which urlgrabber can stuff
+ information for passing to callbacks. This way, the prototype for
+ all callbacks is the same, regardless of the data that will be
+ passed back. Any function that accepts a callback function as an
+ argument SHOULD document what it will define in this object.
+
+ It is possible that this class will have some greater
+ functionality in the future.
+ """
+ def __init__(self, **kwargs):
+ self.__dict__.update(kwargs)
+
+def urlgrab(url, filename=None, **kwargs):
+ """grab the file at <url> and make a local copy at <filename>
+    If filename is None, the basename of the url is used.
+ urlgrab returns the filename of the local file, which may be different
+ from the passed-in filename if the copy_local kwarg == 0.
+
+ See module documentation for a description of possible kwargs.
+ """
+ return default_grabber.urlgrab(url, filename, **kwargs)
+
+def urlopen(url, **kwargs):
+ """open the url and return a file object
+ If a progress object or throttle specifications exist, then
+ a special file object will be returned that supports them.
+ The file object can be treated like any other file object.
+
+ See module documentation for a description of possible kwargs.
+ """
+ return default_grabber.urlopen(url, **kwargs)
+
+def urlread(url, limit=None, **kwargs):
+ """read the url into a string, up to 'limit' bytes
+ If the limit is exceeded, an exception will be thrown. Note that urlread
+ is NOT intended to be used as a way of saying "I want the first N bytes"
+ but rather 'read the whole file into memory, but don't use too much'
+
+ See module documentation for a description of possible kwargs.
+ """
+ return default_grabber.urlread(url, limit, **kwargs)
+
+
+class URLParser:
+ """Process the URLs before passing them to urllib2.
+
+ This class does several things:
+
+ * add any prefix
+ * translate a "raw" file to a proper file: url
+ * handle any http or https auth that's encoded within the url
+ * quote the url
+
+ Only the "parse" method is called directly, and it calls sub-methods.
+
+ An instance of this class is held in the options object, which
+ means that it's easy to change the behavior by sub-classing and
+ passing the replacement in. It need only have a method like:
+
+ url, parts = urlparser.parse(url, opts)
+ """
+
+ def parse(self, url, opts):
+ """parse the url and return the (modified) url and its parts
+
+ Note: a raw file WILL be quoted when it's converted to a URL.
+ However, other urls (ones which come with a proper scheme) may
+ or may not be quoted according to opts.quote
+
+ opts.quote = 1 --> quote it
+ opts.quote = 0 --> do not quote it
+ opts.quote = None --> guess
+ """
+ quote = opts.quote
+
+ if opts.prefix:
+ url = self.add_prefix(url, opts.prefix)
+
+ parts = urlparse.urlparse(url)
+ (scheme, host, path, parm, query, frag) = parts
+
+ if not scheme or (len(scheme) == 1 and scheme in string.letters):
+ # if a scheme isn't specified, we guess that it's "file:"
+ if url[0] not in '/\\': url = os.path.abspath(url)
+ url = 'file:' + urllib.pathname2url(url)
+ parts = urlparse.urlparse(url)
+ quote = 0 # pathname2url quotes, so we won't do it again
+
+ if scheme in ['http', 'https']:
+ parts = self.process_http(parts, url)
+
+ if quote is None:
+ quote = self.guess_should_quote(parts)
+ if quote:
+ parts = self.quote(parts)
+
+ url = urlparse.urlunparse(parts)
+ return url, parts
+
+ def add_prefix(self, url, prefix):
+ if prefix[-1] == '/' or url[0] == '/':
+ url = prefix + url
+ else:
+ url = prefix + '/' + url
+ return url
+
+ def process_http(self, parts, url):
+ (scheme, host, path, parm, query, frag) = parts
+ # TODO: auth-parsing here, maybe? pycurl doesn't really need it
+ return (scheme, host, path, parm, query, frag)
+
+ def quote(self, parts):
+ """quote the URL
+
+ This method quotes ONLY the path part. If you need to quote
+ other parts, you should override this and pass in your derived
+ class. The other alternative is to quote other parts before
+ passing into urlgrabber.
+ """
+ (scheme, host, path, parm, query, frag) = parts
+ path = urllib.quote(path)
+ return (scheme, host, path, parm, query, frag)
+
+ hexvals = '0123456789ABCDEF'
+ def guess_should_quote(self, parts):
+ """
+ Guess whether we should quote a path. This amounts to
+ guessing whether it's already quoted.
+
+ find ' ' -> 1
+ find '%' -> 1
+ find '%XX' -> 0
+ else -> 1
+ """
+ (scheme, host, path, parm, query, frag) = parts
+ if ' ' in path:
+ return 1
+ ind = string.find(path, '%')
+ if ind > -1:
+ while ind > -1:
+ if len(path) < ind+3:
+ return 1
+ code = path[ind+1:ind+3].upper()
+ if code[0] not in self.hexvals or \
+ code[1] not in self.hexvals:
+ return 1
+ ind = string.find(path, '%', ind+1)
+ return 0
+ return 1
+
+class URLGrabberOptions:
+ """Class to ease kwargs handling."""
+
+ def __init__(self, delegate=None, **kwargs):
+ """Initialize URLGrabberOptions object.
+ Set default values for all options and then update options specified
+ in kwargs.
+ """
+ self.delegate = delegate
+ if delegate is None:
+ self._set_defaults()
+ self._set_attributes(**kwargs)
+
+ def __getattr__(self, name):
+ if self.delegate and hasattr(self.delegate, name):
+ return getattr(self.delegate, name)
+ raise AttributeError, name
+
+ def raw_throttle(self):
+ """Calculate raw throttle value from throttle and bandwidth
+ values.
+ """
+ if self.throttle <= 0:
+ return 0
+ elif type(self.throttle) == type(0):
+ return float(self.throttle)
+ else: # throttle is a float
+ return self.bandwidth * self.throttle
+
+ def derive(self, **kwargs):
+ """Create a derived URLGrabberOptions instance.
+ This method creates a new instance and overrides the
+ options specified in kwargs.
+ """
+ return URLGrabberOptions(delegate=self, **kwargs)
+
+ def _set_attributes(self, **kwargs):
+ """Update object attributes with those provided in kwargs."""
+ self.__dict__.update(kwargs)
+ if kwargs.has_key('range'):
+ # normalize the supplied range value
+ self.range = range_tuple_normalize(self.range)
+ if not self.reget in [None, 'simple', 'check_timestamp']:
+ raise URLGrabError(11, _('Illegal reget mode: %s') \
+ % (self.reget, ))
+
+ def _set_defaults(self):
+ """Set all options to their default values.
+ When adding new options, make sure a default is
+ provided here.
+ """
+ self.progress_obj = None
+ self.throttle = 1.0
+ self.bandwidth = 0
+ self.retry = None
+ self.retrycodes = [-1,2,4,5,6,7]
+ self.checkfunc = None
+ self.copy_local = 0
+ self.close_connection = 0
+ self.range = None
+ self.user_agent = 'urlgrabber/%s' % __version__
+ self.keepalive = 1
+ self.proxies = None
+ self.reget = None
+ self.failure_callback = None
+ self.interrupt_callback = None
+ self.prefix = None
+ self.opener = None
+ self.cache_openers = True
+ self.timeout = None
+ self.text = None
+ self.http_headers = None
+ self.ftp_headers = None
+ self.data = None
+ self.urlparser = URLParser()
+ self.quote = None
+ self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
+ self.ssl_context = None # no-op in pycurl
+        self.ssl_verify_peer = True # check peer's cert for authenticity
+        self.ssl_verify_host = True # make sure the cert's hostname matches the host we connect to
+ self.ssl_key = None # client key
+ self.ssl_key_type = 'PEM' #(or DER)
+ self.ssl_cert = None # client cert
+ self.ssl_cert_type = 'PEM' # (or DER)
+ self.ssl_key_pass = None # password to access the key
+ self.size = None # if we know how big the thing we're getting is going
+ # to be. this is ultimately a MAXIMUM size for the file
+ self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
+
+ def __repr__(self):
+ return self.format()
+
+ def format(self, indent=' '):
+ keys = self.__dict__.keys()
+ if self.delegate is not None:
+ keys.remove('delegate')
+ keys.sort()
+ s = '{\n'
+ for k in keys:
+ s = s + indent + '%-15s: %s,\n' % \
+ (repr(k), repr(self.__dict__[k]))
+ if self.delegate:
+ df = self.delegate.format(indent + ' ')
+ s = s + indent + '%-15s: %s\n' % ("'delegate'", df)
+ s = s + indent + '}'
+ return s
+
+class URLGrabber:
+ """Provides easy opening of URLs with a variety of options.
+
+ All options are specified as kwargs. Options may be specified when
+ the class is created and may be overridden on a per request basis.
+
+ New objects inherit default values from default_grabber.
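+
+    For example:
+
+        g = URLGrabber(retry=3, timeout=30.0)
+        g.urlgrab('http://foo.com/file.txt', '/tmp/file.txt')
+        g.urlgrab('http://foo.com/other.txt', timeout=10.0) # override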
+ """
+
+ def __init__(self, **kwargs):
+ self.opts = URLGrabberOptions(**kwargs)
+
+ def _retry(self, opts, func, *args):
+ tries = 0
+ while 1:
+ # there are only two ways out of this loop. The second has
+ # several "sub-ways"
+ # 1) via the return in the "try" block
+ # 2) by some exception being raised
+            #   a) an exception is raised that we don't "except"
+ # b) a callback raises ANY exception
+ # c) we're not retry-ing or have run out of retries
+ # d) the URLGrabError code is not in retrycodes
+ # beware of infinite loops :)
+ tries = tries + 1
+ exception = None
+ retrycode = None
+ callback = None
+ if DEBUG: DEBUG.info('attempt %i/%s: %s',
+ tries, opts.retry, args[0])
+ try:
+ r = apply(func, (opts,) + args, {})
+ if DEBUG: DEBUG.info('success')
+ return r
+ except URLGrabError, e:
+ exception = e
+ callback = opts.failure_callback
+ retrycode = e.errno
+ except KeyboardInterrupt, e:
+ exception = e
+ callback = opts.interrupt_callback
+
+ if DEBUG: DEBUG.info('exception: %s', exception)
+ if callback:
+ if DEBUG: DEBUG.info('calling callback: %s', callback)
+ cb_func, cb_args, cb_kwargs = self._make_callback(callback)
+ obj = CallbackObject(exception=exception, url=args[0],
+ tries=tries, retry=opts.retry)
+ cb_func(obj, *cb_args, **cb_kwargs)
+
+ if (opts.retry is None) or (tries == opts.retry):
+ if DEBUG: DEBUG.info('retries exceeded, re-raising')
+ raise
+
+ if (retrycode is not None) and (retrycode not in opts.retrycodes):
+ if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
+ retrycode, opts.retrycodes)
+ raise
+
+ def urlopen(self, url, **kwargs):
+ """open the url and return a file object
+ If a progress object or throttle value specified when this
+ object was created, then a special file object will be
+ returned that supports them. The file object can be treated
+ like any other file object.
+ """
+ opts = self.opts.derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
+ (url,parts) = opts.urlparser.parse(url, opts)
+ def retryfunc(opts, url):
+ return PyCurlFileObject(url, filename=None, opts=opts)
+ return self._retry(opts, retryfunc, url)
+
+ def urlgrab(self, url, filename=None, **kwargs):
+ """grab the file at <url> and make a local copy at <filename>
+        If filename is None, the basename of the url is used.
+ urlgrab returns the filename of the local file, which may be
+ different from the passed-in filename if copy_local == 0.
+ """
+ opts = self.opts.derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
+ (url,parts) = opts.urlparser.parse(url, opts)
+ (scheme, host, path, parm, query, frag) = parts
+ if filename is None:
+ filename = os.path.basename( urllib.unquote(path) )
+ if scheme == 'file' and not opts.copy_local:
+ # just return the name of the local file - don't make a
+ # copy currently
+ path = urllib.url2pathname(path)
+ if host:
+ path = os.path.normpath('//' + host + path)
+ if not os.path.exists(path):
+ err = URLGrabError(2,
+ _('Local file does not exist: %s') % (path, ))
+ err.url = url
+ raise err
+ elif not os.path.isfile(path):
+ err = URLGrabError(3,
+ _('Not a normal file: %s') % (path, ))
+ err.url = url
+ raise err
+
+ elif not opts.range:
+ if not opts.checkfunc is None:
+ cb_func, cb_args, cb_kwargs = \
+ self._make_callback(opts.checkfunc)
+ obj = CallbackObject()
+ obj.filename = path
+ obj.url = url
+ apply(cb_func, (obj, )+cb_args, cb_kwargs)
+ return path
+
+ def retryfunc(opts, url, filename):
+ fo = PyCurlFileObject(url, filename, opts)
+ try:
+ fo._do_grab()
+ if not opts.checkfunc is None:
+ cb_func, cb_args, cb_kwargs = \
+ self._make_callback(opts.checkfunc)
+ obj = CallbackObject()
+ obj.filename = filename
+ obj.url = url
+ apply(cb_func, (obj, )+cb_args, cb_kwargs)
+ finally:
+ fo.close()
+ return filename
+
+ return self._retry(opts, retryfunc, url, filename)
+
+ def urlread(self, url, limit=None, **kwargs):
+ """read the url into a string, up to 'limit' bytes
+ If the limit is exceeded, an exception will be thrown. Note
+ that urlread is NOT intended to be used as a way of saying
+ "I want the first N bytes" but rather 'read the whole file
+ into memory, but don't use too much'
+ """
+ opts = self.opts.derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
+ (url,parts) = opts.urlparser.parse(url, opts)
+ if limit is not None:
+ limit = limit + 1
+
+ def retryfunc(opts, url, limit):
+ fo = PyCurlFileObject(url, filename=None, opts=opts)
+ s = ''
+ try:
+ # this is an unfortunate thing. Some file-like objects
+ # have a default "limit" of None, while the built-in (real)
+ # file objects have -1. They each break the other, so for
+ # now, we just force the default if necessary.
+ if limit is None: s = fo.read()
+ else: s = fo.read(limit)
+
+ if not opts.checkfunc is None:
+ cb_func, cb_args, cb_kwargs = \
+ self._make_callback(opts.checkfunc)
+ obj = CallbackObject()
+ obj.data = s
+ obj.url = url
+ apply(cb_func, (obj, )+cb_args, cb_kwargs)
+ finally:
+ fo.close()
+ return s
+
+ s = self._retry(opts, retryfunc, url, limit)
+        if limit and len(s) > limit - 1:
+            # limit was incremented by one above; compare against (and
+            # report) the caller's original value
+            err = URLGrabError(8,
+                               _('Exceeded limit (%i): %s') % (limit - 1, url))
+ err.url = url
+ raise err
+
+ return s
+
+ def _make_callback(self, callback_obj):
+ if callable(callback_obj):
+ return callback_obj, (), {}
+ else:
+ return callback_obj
+
+# create the default URLGrabber used by urlXXX functions.
+# NOTE: actual defaults are set in URLGrabberOptions
+default_grabber = URLGrabber()
+
+
+class PyCurlFileObject():
+ def __init__(self, url, filename, opts):
+ self.fo = None
+ self._hdr_dump = ''
+ self._parsed_hdr = None
+ self.url = url
+ self.scheme = urlparse.urlsplit(self.url)[0]
+ self.filename = filename
+ self.append = False
+ self.reget_time = None
+ self.opts = opts
+ if self.opts.reget == 'check_timestamp':
+            raise NotImplementedError, "check_timestamp regets are not implemented in this version of urlgrabber. Please report this."
+ self._complete = False
+ self._rbuf = ''
+ self._rbufsize = 1024*8
+ self._ttime = time.time()
+ self._tsize = 0
+ self._amount_read = 0
+ self._reget_length = 0
+ self._prog_running = False
+ self._error = (None, None)
+ self.size = None
+ self._do_open()
+
+
+ def __getattr__(self, name):
+ """This effectively allows us to wrap at the instance level.
+ Any attribute not found in _this_ object will be searched for
+ in self.fo. This includes methods."""
+
+ if hasattr(self.fo, name):
+ return getattr(self.fo, name)
+ raise AttributeError, name
+
+ def _retrieve(self, buf):
+ try:
+            if not self._prog_running:
+                if self.opts.progress_obj:
+                    # self.size is None if no content-length came back;
+                    # pass None through rather than choking on the add
+                    size = self.size
+                    if size is not None:
+                        size = size + self._reget_length
+                    self.opts.progress_obj.start(self._prog_reportname,
+                                                 urllib.unquote(self.url),
+                                                 self._prog_basename,
+                                                 size=size,
+                                                 text=self.opts.text)
+                    self._prog_running = True
+                    self.opts.progress_obj.update(self._amount_read)
+
+ self._amount_read += len(buf)
+ self.fo.write(buf)
+ return len(buf)
+ except KeyboardInterrupt:
+ return -1
+
+ def _hdr_retrieve(self, buf):
+ if self._over_max_size(cur=len(self._hdr_dump),
+ max_size=self.opts.max_header_size):
+ return -1
+ try:
+ self._hdr_dump += buf
+ # we have to get the size before we do the progress obj start
+ # but we can't do that w/o making it do 2 connects, which sucks
+ # so we cheat and stuff it in here in the hdr_retrieve
+ if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
+ length = buf.split(':')[1]
+ self.size = int(length)
+ elif self.scheme in ['ftp']:
+ s = None
+ if buf.startswith('213 '):
+ s = buf[3:].strip()
+ elif buf.startswith('150 '):
+ s = parse150(buf)
+ if s:
+ self.size = int(s)
+
+ return len(buf)
+ except KeyboardInterrupt:
+ return pycurl.READFUNC_ABORT
+
+ def _return_hdr_obj(self):
+ if self._parsed_hdr:
+ return self._parsed_hdr
+ statusend = self._hdr_dump.find('\n')
+ hdrfp = StringIO()
+ hdrfp.write(self._hdr_dump[statusend:])
+ self._parsed_hdr = mimetools.Message(hdrfp)
+ return self._parsed_hdr
+
+ hdr = property(_return_hdr_obj)
+ http_code = property(fget=
+ lambda self: self.curl_obj.getinfo(pycurl.RESPONSE_CODE))
+
+ def _set_opts(self, opts={}):
+ # XXX
+ if not opts:
+ opts = self.opts
+
+
+ # defaults we're always going to set
+ self.curl_obj.setopt(pycurl.NOPROGRESS, False)
+ self.curl_obj.setopt(pycurl.NOSIGNAL, True)
+ self.curl_obj.setopt(pycurl.WRITEFUNCTION, self._retrieve)
+ self.curl_obj.setopt(pycurl.HEADERFUNCTION, self._hdr_retrieve)
+ self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
+ self.curl_obj.setopt(pycurl.FAILONERROR, True)
+ self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
+
+ if DEBUG:
+ self.curl_obj.setopt(pycurl.VERBOSE, True)
+ if opts.user_agent:
+ self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
+
+ # maybe to be options later
+ self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
+ self.curl_obj.setopt(pycurl.MAXREDIRS, 5)
+
+ # timeouts
+ timeout = 300
+ if opts.timeout:
+ timeout = int(opts.timeout)
+ self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+
+ # ssl options
+ if self.scheme == 'https':
+ if opts.ssl_ca_cert: # this may do ZERO with nss according to curl docs
+ self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
+ self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
+ self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
+ self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host)
+ if opts.ssl_key:
+ self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key)
+ if opts.ssl_key_type:
+ self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type)
+ if opts.ssl_cert:
+ self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert)
+ if opts.ssl_cert_type:
+ self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
+ if opts.ssl_key_pass:
+ self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass)
+
+ #headers:
+ if opts.http_headers and self.scheme in ('http', 'https'):
+ headers = []
+ for (tag, content) in opts.http_headers:
+ headers.append('%s:%s' % (tag, content))
+ self.curl_obj.setopt(pycurl.HTTPHEADER, headers)
+
+ # ranges:
+ if opts.range or opts.reget:
+ range_str = self._build_range()
+ if range_str:
+ self.curl_obj.setopt(pycurl.RANGE, range_str)
+
+ # throttle/bandwidth
+ if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
+ self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
+
+ # proxy settings
+ if opts.proxies:
+ for (scheme, proxy) in opts.proxies.items():
+                if self.scheme in ('ftp',): # only set the ftp proxy for ftp items
+                    if scheme not in ('ftp',):
+                        continue
+ else:
+ if proxy == '_none_': proxy = ""
+ self.curl_obj.setopt(pycurl.PROXY, proxy)
+ elif self.scheme in ('http', 'https'):
+ if scheme not in ('http', 'https'):
+ continue
+ else:
+ if proxy == '_none_': proxy = ""
+ self.curl_obj.setopt(pycurl.PROXY, proxy)
+
+ # FIXME username/password/auth settings
+
+ #posts - simple - expects the fields as they are
+ if opts.data:
+ self.curl_obj.setopt(pycurl.POST, True)
+ self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data))
+
+ # our url
+ self.curl_obj.setopt(pycurl.URL, self.url)
+
+
+ def _do_perform(self):
+ if self._complete:
+ return
+
+ try:
+ self.curl_obj.perform()
+ except pycurl.error, e:
+ # XXX - break some of these out a bit more clearly
+ # to other URLGrabErrors from
+ # http://curl.haxx.se/libcurl/c/libcurl-errors.html
+ # this covers e.args[0] == 22 pretty well - which will be common
+
+ code = self.http_code
+ errcode = e.args[0]
+ if self._error[0]:
+ errcode = self._error[0]
+
+ if errcode == 23 and code >= 200 and code < 299:
+ err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
+ err.url = self.url
+
+ # this is probably wrong but ultimately this is what happens
+ # we have a legit http code and a pycurl 'writer failed' code
+ # which almost always means something aborted it from outside
+ # since we cannot know what it is -I'm banking on it being
+ # a ctrl-c. XXXX - if there's a way of going back two raises to
+ # figure out what aborted the pycurl process FIXME
+ raise KeyboardInterrupt
+
+ elif errcode == 28:
+ err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+ elif errcode == 35:
+ msg = _("problem making ssl connection")
+ err = URLGrabError(14, msg)
+ err.url = self.url
+ raise err
+ elif errcode == 37:
+ msg = _("Could not open/read %s") % (self.url)
+ err = URLGrabError(14, msg)
+ err.url = self.url
+ raise err
+
+ elif errcode == 42:
+ err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
+ err.url = self.url
+ # this is probably wrong but ultimately this is what happens
+ # we have a legit http code and a pycurl 'writer failed' code
+ # which almost always means something aborted it from outside
+ # since we cannot know what it is -I'm banking on it being
+ # a ctrl-c. XXXX - if there's a way of going back two raises to
+ # figure out what aborted the pycurl process FIXME
+ raise KeyboardInterrupt
+
+ elif errcode == 58:
+ msg = _("problem with the local client certificate")
+ err = URLGrabError(14, msg)
+ err.url = self.url
+ raise err
+
+ elif errcode == 60:
+ msg = _("client cert cannot be verified or client cert incorrect")
+ err = URLGrabError(14, msg)
+ err.url = self.url
+ raise err
+
+ elif errcode == 63:
+ if self._error[1]:
+ msg = self._error[1]
+ else:
+ msg = _("Max download size exceeded on %s") % (self.url)
+ err = URLGrabError(14, msg)
+ err.url = self.url
+ raise err
+
+ elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
+ msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
+ else:
+ msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
+ code = errcode
+ err = URLGrabError(14, msg)
+ err.code = code
+ err.exception = e
+ raise err
+
+ def _do_open(self):
+ self.curl_obj = _curl_cache
+ self.curl_obj.reset() # reset all old settings away, just in case
+ # setup any ranges
+ self._set_opts()
+ self._do_grab()
+ return self.fo
+
+ def _add_headers(self):
+ pass
+
+ def _build_range(self):
+ reget_length = 0
+ rt = None
+ if self.opts.reget and type(self.filename) in types.StringTypes:
+ # we have reget turned on and we're dumping to a file
+ try:
+ s = os.stat(self.filename)
+ except OSError:
+ pass
+ else:
+ self.reget_time = s[stat.ST_MTIME]
+ reget_length = s[stat.ST_SIZE]
+
+ # Set initial length when regetting
+ self._amount_read = reget_length
+ self._reget_length = reget_length # set where we started from, too
+
+ rt = reget_length, ''
+ self.append = 1
+
+ if self.opts.range:
+ rt = self.opts.range
+ if rt[0]: rt = (rt[0] + reget_length, rt[1])
+
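+        # range_tuple_to_header returns e.g. 'bytes=0-1023'; pycurl's
+        # RANGE option wants only the '0-1023' part, hence the split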
+ if rt:
+ header = range_tuple_to_header(rt)
+ if header:
+ return header.split('=')[1]
+
+
+
+ def _make_request(self, req, opener):
+ #XXXX
+ # This doesn't do anything really, but we could use this
+ # instead of do_open() to catch a lot of crap errors as
+ # mstenner did before here
+ return (self.fo, self.hdr)
+
+ try:
+ if self.opts.timeout:
+ old_to = socket.getdefaulttimeout()
+ socket.setdefaulttimeout(self.opts.timeout)
+ try:
+ fo = opener.open(req)
+ finally:
+ socket.setdefaulttimeout(old_to)
+ else:
+ fo = opener.open(req)
+ hdr = fo.info()
+ except ValueError, e:
+ err = URLGrabError(1, _('Bad URL: %s : %s') % (self.url, e, ))
+ err.url = self.url
+ raise err
+
+ except RangeError, e:
+ err = URLGrabError(9, _('%s on %s') % (e, self.url))
+ err.url = self.url
+ raise err
+ except urllib2.HTTPError, e:
+ new_e = URLGrabError(14, _('%s on %s') % (e, self.url))
+ new_e.code = e.code
+ new_e.exception = e
+ new_e.url = self.url
+ raise new_e
+ except IOError, e:
+ if hasattr(e, 'reason') and isinstance(e.reason, socket.timeout):
+ err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+ else:
+ err = URLGrabError(4, _('IOError on %s: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
+ except OSError, e:
+ err = URLGrabError(5, _('%s on %s') % (e, self.url))
+ err.url = self.url
+ raise err
+
+ except HTTPException, e:
+ err = URLGrabError(7, _('HTTP Exception (%s) on %s: %s') % \
+ (e.__class__.__name__, self.url, e))
+ err.url = self.url
+ raise err
+
+ else:
+ return (fo, hdr)
+
+ def _do_grab(self):
+ """dump the file to a filename or StringIO buffer"""
+
+ if self._complete:
+ return
+ _was_filename = False
+ if type(self.filename) in types.StringTypes and self.filename:
+ _was_filename = True
+ self._prog_reportname = str(self.filename)
+ self._prog_basename = os.path.basename(self.filename)
+
+ if self.append: mode = 'ab'
+ else: mode = 'wb'
+
+ if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
+ (self.filename, mode))
+ try:
+ self.fo = open(self.filename, mode)
+ except IOError, e:
+ err = URLGrabError(16, _(\
+ 'error opening local file from %s, IOError: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
+ else:
+ self._prog_reportname = 'MEMORY'
+ self._prog_basename = 'MEMORY'
+
+
+ self.fo = StringIO()
+ # if this is to be a tempfile instead....
+ # it just makes crap in the tempdir
+ #fh, self._temp_name = mkstemp()
+ #self.fo = open(self._temp_name, 'wb')
+
+
+ self._do_perform()
+
+
+
+ if _was_filename:
+ # close it up
+ self.fo.flush()
+ self.fo.close()
+ # set the time
+ mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
+ if mod_time != -1:
+ os.utime(self.filename, (mod_time, mod_time))
+ # re open it
+            self.fo = open(self.filename, 'rb')
+ else:
+ #self.fo = open(self._temp_name, 'r')
+ self.fo.seek(0)
+
+ self._complete = True
+
+ def _fill_buffer(self, amt=None):
+ """fill the buffer to contain at least 'amt' bytes by reading
+ from the underlying file object. If amt is None, then it will
+ read until it gets nothing more. It updates the progress meter
+ and throttles after every self._rbufsize bytes."""
+ # the _rbuf test is only in this first 'if' for speed. It's not
+ # logically necessary
+ if self._rbuf and not amt is None:
+ L = len(self._rbuf)
+ if amt > L:
+ amt = amt - L
+ else:
+ return
+
+ # if we've made it here, then we don't have enough in the buffer
+ # and we need to read more.
+
+ if not self._complete: self._do_grab() #XXX cheater - change on ranges
+
+ buf = [self._rbuf]
+ bufsize = len(self._rbuf)
+ while amt is None or amt:
+ # first, delay if necessary for throttling reasons
+ if self.opts.raw_throttle():
+ diff = self._tsize/self.opts.raw_throttle() - \
+ (time.time() - self._ttime)
+ if diff > 0: time.sleep(diff)
+ self._ttime = time.time()
+
+ # now read some data, up to self._rbufsize
+ if amt is None: readamount = self._rbufsize
+ else: readamount = min(amt, self._rbufsize)
+ try:
+ new = self.fo.read(readamount)
+ except socket.error, e:
+ err = URLGrabError(4, _('Socket Error on %s: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
+            except socket.timeout, e:
+                err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
+                err.url = self.url
+                raise err
+
+            except IOError, e:
+                err = URLGrabError(4, _('IOError on %s: %s') % (self.url, e))
+                err.url = self.url
+                raise err
+
+ newsize = len(new)
+ if not newsize: break # no more to read
+
+ if amt: amt = amt - newsize
+ buf.append(new)
+ bufsize = bufsize + newsize
+ self._tsize = newsize
+ self._amount_read = self._amount_read + newsize
+ #if self.opts.progress_obj:
+ # self.opts.progress_obj.update(self._amount_read)
+
+ self._rbuf = string.join(buf, '')
+ return
+
+ def _progress_update(self, download_total, downloaded, upload_total, uploaded):
+ if self._over_max_size(cur=self._amount_read-self._reget_length):
+ return -1
+
+ try:
+ if self._prog_running:
+ downloaded += self._reget_length
+ self.opts.progress_obj.update(downloaded)
+ except KeyboardInterrupt:
+ return -1
+
+ def _over_max_size(self, cur, max_size=None):
+
+ if not max_size:
+ max_size = self.size
+ if self.opts.size: # if we set an opts size use that, no matter what
+ max_size = self.opts.size
+ if not max_size: return False # if we have None for all of the Max then this is dumb
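+        # allow roughly 10% over the expected size before declaring
+        # the download oversized; reported sizes are not always exact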
+ if cur > max_size + max_size*.10:
+
+ msg = _("Downloaded more than max size for %s: %s > %s") \
+ % (self.url, cur, max_size)
+ self._error = (pycurl.E_FILESIZE_EXCEEDED, msg)
+ return True
+ return False
+
+ def _to_utf8(self, obj, errors='replace'):
+ '''convert 'unicode' to an encoded utf-8 byte string '''
+ # stolen from yum.i18n
+ if isinstance(obj, unicode):
+ obj = obj.encode('utf-8', errors)
+ return obj
+
+ def read(self, amt=None):
+ self._fill_buffer(amt)
+ if amt is None:
+ s, self._rbuf = self._rbuf, ''
+ else:
+ s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:]
+ return s
+
+    def readline(self, limit=-1):
+        if not self._complete: self._do_grab()
+        return self.fo.readline()
+
+ def close(self):
+ if self._prog_running:
+ self.opts.progress_obj.end(self._amount_read)
+ self.fo.close()
+
+
+_curl_cache = pycurl.Curl() # make one and reuse it over and over and over
+
+
+#####################################################################
+# DEPRECATED FUNCTIONS
+def set_throttle(new_throttle):
+ """Deprecated. Use: default_grabber.throttle = new_throttle"""
+ default_grabber.throttle = new_throttle
+
+def set_bandwidth(new_bandwidth):
+ """Deprecated. Use: default_grabber.bandwidth = new_bandwidth"""
+ default_grabber.bandwidth = new_bandwidth
+
+def set_progress_obj(new_progress_obj):
+ """Deprecated. Use: default_grabber.progress_obj = new_progress_obj"""
+ default_grabber.progress_obj = new_progress_obj
+
+def set_user_agent(new_user_agent):
+ """Deprecated. Use: default_grabber.user_agent = new_user_agent"""
+ default_grabber.user_agent = new_user_agent
+
+def retrygrab(url, filename=None, copy_local=0, close_connection=0,
+ progress_obj=None, throttle=None, bandwidth=None,
+ numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None):
+ """Deprecated. Use: urlgrab() with the retry arg instead"""
+ kwargs = {'copy_local' : copy_local,
+ 'close_connection' : close_connection,
+ 'progress_obj' : progress_obj,
+ 'throttle' : throttle,
+ 'bandwidth' : bandwidth,
+ 'retry' : numtries,
+ 'retrycodes' : retrycodes,
+ 'checkfunc' : checkfunc
+ }
+ return urlgrab(url, filename, **kwargs)
+
+
+#####################################################################
+# TESTING
+def _main_test():
+ try: url, filename = sys.argv[1:3]
+ except ValueError:
+ print 'usage:', sys.argv[0], \
+ '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
+ sys.exit()
+
+ kwargs = {}
+ for a in sys.argv[3:]:
+ k, v = string.split(a, '=', 1)
+ kwargs[k] = int(v)
+
+ set_throttle(1.0)
+ set_bandwidth(32 * 1024)
+ print "throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle,
+ default_grabber.bandwidth)
+
+ try: from progress import text_progress_meter
+ except ImportError, e: pass
+ else: kwargs['progress_obj'] = text_progress_meter()
+
+ try: name = apply(urlgrab, (url, filename), kwargs)
+ except URLGrabError, e: print e
+ else: print 'LOCAL FILE:', name
+
+
+def _retry_test():
+ try: url, filename = sys.argv[1:3]
+ except ValueError:
+ print 'usage:', sys.argv[0], \
+ '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
+ sys.exit()
+
+ kwargs = {}
+ for a in sys.argv[3:]:
+ k, v = string.split(a, '=', 1)
+ kwargs[k] = int(v)
+
+ try: from progress import text_progress_meter
+ except ImportError, e: pass
+ else: kwargs['progress_obj'] = text_progress_meter()
+
+ def cfunc(filename, hello, there='foo'):
+ print hello, there
+ import random
+ rnum = random.random()
+ if rnum < .5:
+ print 'forcing retry'
+ raise URLGrabError(-1, 'forcing retry')
+ if rnum < .75:
+ print 'forcing failure'
+ raise URLGrabError(-2, 'forcing immediate failure')
+ print 'success'
+ return
+
+ kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'})
+ try: name = apply(retrygrab, (url, filename), kwargs)
+ except URLGrabError, e: print e
+ else: print 'LOCAL FILE:', name
+
+def _file_object_test(filename=None):
+ import cStringIO
+ if filename is None:
+ filename = __file__
+ print 'using file "%s" for comparisons' % filename
+ fo = open(filename)
+ s_input = fo.read()
+ fo.close()
+
+ for testfunc in [_test_file_object_smallread,
+ _test_file_object_readall,
+ _test_file_object_readline,
+ _test_file_object_readlines]:
+ fo_input = cStringIO.StringIO(s_input)
+ fo_output = cStringIO.StringIO()
+ wrapper = PyCurlFileObject(fo_input, None, 0)
+ print 'testing %-30s ' % testfunc.__name__,
+ testfunc(wrapper, fo_output)
+ s_output = fo_output.getvalue()
+ if s_output == s_input: print 'passed'
+ else: print 'FAILED'
+
+def _test_file_object_smallread(wrapper, fo_output):
+ while 1:
+ s = wrapper.read(23)
+ fo_output.write(s)
+ if not s: return
+
+def _test_file_object_readall(wrapper, fo_output):
+ s = wrapper.read()
+ fo_output.write(s)
+
+def _test_file_object_readline(wrapper, fo_output):
+ while 1:
+ s = wrapper.readline()
+ fo_output.write(s)
+ if not s: return
+
+def _test_file_object_readlines(wrapper, fo_output):
+ li = wrapper.readlines()
+ fo_output.write(string.join(li, ''))
+
+if __name__ == '__main__':
+ _main_test()
+ _retry_test()
+ _file_object_test('test')