Diffstat (limited to 'urlgrabber/grabber.py')
 -rw-r--r-- urlgrabber/grabber.py | 1730
 1 file changed, 1730 insertions(+), 0 deletions(-)
diff --git a/urlgrabber/grabber.py b/urlgrabber/grabber.py
new file mode 100644
index 0000000..e090e90
--- /dev/null
+++ b/urlgrabber/grabber.py
@@ -0,0 +1,1730 @@
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
+# Boston, MA 02111-1307 USA
+
+# This file is part of urlgrabber, a high-level cross-protocol url-grabber
+# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
+# Copyright 2009 Red Hat inc, pycurl code written by Seth Vidal
+
+"""A high-level cross-protocol url-grabber.
+
+GENERAL ARGUMENTS (kwargs)
+
+ Where possible, the module-level default is indicated, and legal
+ values are provided.
+
+ copy_local = 0 [0|1]
+
+ ignored except for file:// urls, in which case it specifies
+ whether urlgrab should still make a copy of the file, or simply
+ point to the existing copy. The module level default for this
+ option is 0.
+
+ close_connection = 0 [0|1]
+
+ tells URLGrabber to close the connection after a file has been
+    transferred. This is ignored unless the download happens with the
+ http keepalive handler (keepalive=1). Otherwise, the connection
+ is left open for further use. The module level default for this
+ option is 0 (keepalive connections will not be closed).
+
+ keepalive = 1 [0|1]
+
+ specifies whether keepalive should be used for HTTP/1.1 servers
+ that support it. The module level default for this option is 1
+ (keepalive is enabled).
+
+ progress_obj = None
+
+    a class instance that supports the following methods:
+      po.start(filename, url, basename, size, text)
+      # size will be None if unknown
+      po.update(read) # read == bytes read so far
+      po.end(read)    # read == total bytes read
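+
+    For example, a minimal progress object might look like this (a
+    sketch; any class providing these three methods will do):
+
+      class SimpleProgress:
+          def start(self, filename=None, url=None, basename=None,
+                    size=None, text=None, **kwargs):
+              self.basename = basename
+          def update(self, read):
+              print '%s: %i bytes' % (self.basename, read)
+          def end(self, read):
+              print '%s: done (%i bytes)' % (self.basename, read)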
+
+ text = None
+
+ specifies alternative text to be passed to the progress meter
+ object. If not given, the default progress meter will use the
+ basename of the file.
+
+ throttle = 1.0
+
+ a number - if it's an int, it's the bytes/second throttle limit.
+ If it's a float, it is first multiplied by bandwidth. If throttle
+ == 0, throttling is disabled. If None, the module-level default
+ (which can be set on default_grabber.throttle) is used. See
+ BANDWIDTH THROTTLING for more information.
+
+ timeout = None
+
+ a positive float expressing the number of seconds to wait for socket
+ operations. If the value is None or 0.0, socket operations will block
+ forever. Setting this option causes urlgrabber to call the settimeout
+ method on the Socket object used for the request. See the Python
+ documentation on settimeout for more information.
+ http://www.python.org/doc/current/lib/socket-objects.html
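+
+    For example, to give up if a socket operation takes longer than
+    30 seconds:
+
+      urlgrab(url, filename, timeout=30.0)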
+
+ bandwidth = 0
+
+ the nominal max bandwidth in bytes/second. If throttle is a float
+ and bandwidth == 0, throttling is disabled. If None, the
+ module-level default (which can be set on
+ default_grabber.bandwidth) is used. See BANDWIDTH THROTTLING for
+ more information.
+
+ range = None
+
+    a tuple of the form (first_byte, last_byte) describing a byte
+    range to retrieve. Either or both of the values may be set to
+    None. If first_byte is None, byte offset 0 is assumed. If
+    last_byte is None, the last byte available is assumed. Note that
+    the range specification is python-like in that (0,10) will yield
+    the first 10 bytes of the file.
+
+ If set to None, no range will be used.
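+
+    For example, to fetch only the first 1024 bytes of a file:
+
+      urlgrab(url, filename, range=(0, 1024))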
+
+ reget = None [None|'simple'|'check_timestamp']
+
+ whether to attempt to reget a partially-downloaded file. Reget
+ only applies to .urlgrab and (obviously) only if there is a
+ partially downloaded file. Reget has two modes:
+
+ 'simple' -- the local file will always be trusted. If there
+ are 100 bytes in the local file, then the download will always
+ begin 100 bytes into the requested file.
+
+ 'check_timestamp' -- the timestamp of the server file will be
+ compared to the timestamp of the local file. ONLY if the
+ local file is newer than or the same age as the server file
+ will reget be used. If the server file is newer, or the
+ timestamp is not returned, the entire file will be fetched.
+
+ NOTE: urlgrabber can do very little to verify that the partial
+ file on disk is identical to the beginning of the remote file.
+ You may want to either employ a custom "checkfunc" or simply avoid
+ using reget in situations where corruption is a concern.
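+
+    For example, to resume an interrupted download, trusting whatever
+    portion is already on disk:
+
+      urlgrab('http://foo.com/big.iso', '/tmp/big.iso', reget='simple')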
+
+ user_agent = 'urlgrabber/VERSION'
+
+ a string, usually of the form 'AGENT/VERSION' that is provided to
+ HTTP servers in the User-agent header. The module level default
+ for this option is "urlgrabber/VERSION".
+
+ http_headers = None
+
+ a tuple of 2-tuples, each containing a header and value. These
+ will be used for http and https requests only. For example, you
+ can do
+ http_headers = (('Pragma', 'no-cache'),)
+
+ ftp_headers = None
+
+ this is just like http_headers, but will be used for ftp requests.
+
+ proxies = None
+
+ a dictionary that maps protocol schemes to proxy hosts. For
+ example, to use a proxy server on host "foo" port 3128 for http
+ and https URLs:
+ proxies={ 'http' : 'http://foo:3128', 'https' : 'http://foo:3128' }
+ note that proxy authentication information may be provided using
+ normal URL constructs:
+      proxies={ 'http' : 'http://user:password@foo:3128' }
+ Lastly, if proxies is None, the default environment settings will
+ be used.
+
+ prefix = None
+
+ a url prefix that will be prepended to all requested urls. For
+ example:
+ g = URLGrabber(prefix='http://foo.com/mirror/')
+ g.urlgrab('some/file.txt')
+ ## this will fetch 'http://foo.com/mirror/some/file.txt'
+ This option exists primarily to allow identical behavior to
+ MirrorGroup (and derived) instances. Note: a '/' will be inserted
+ if necessary, so you cannot specify a prefix that ends with a
+ partial file or directory name.
+
+  opener = None
+
+    No-op when using the curl backend (default)
+
+  cache_openers = True
+
+    No-op when using the curl backend (default)
+
+ data = None
+
+ Only relevant for the HTTP family (and ignored for other
+ protocols), this allows HTTP POSTs. When the data kwarg is
+ present (and not None), an HTTP request will automatically become
+ a POST rather than GET. This is done by direct passthrough to
+ urllib2. If you use this, you may also want to set the
+ 'Content-length' and 'Content-type' headers with the http_headers
+    option. Note that python 2.2 handles the case of these headers
+    badly and if you do not use the proper case (shown here), your
+ values will be overridden with the defaults.
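+
+    For example, to POST a urlencoded form:
+
+      urlgrab(url, filename, data='cheese=edam&crackers=mmm',
+              http_headers=(('Content-type',
+                             'application/x-www-form-urlencoded'),))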
+
+ urlparser = URLParser()
+
+ The URLParser class handles pre-processing of URLs, including
+    auth-handling for user/pass encoded in http urls, file handling
+ (that is, filenames not sent as a URL), and URL quoting. If you
+ want to override any of this behavior, you can pass in a
+ replacement instance. See also the 'quote' option.
+
+ quote = None
+
+ Whether or not to quote the path portion of a url.
+ quote = 1 -> quote the URLs (they're not quoted yet)
+ quote = 0 -> do not quote them (they're already quoted)
+ quote = None -> guess what to do
+
+ This option only affects proper urls like 'file:///etc/passwd'; it
+ does not affect 'raw' filenames like '/etc/passwd'. The latter
+ will always be quoted as they are converted to URLs. Also, only
+ the path part of a url is quoted. If you need more fine-grained
+ control, you should probably subclass URLParser and pass it in via
+ the 'urlparser' option.
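+
+    For example, if your urls are already quoted:
+
+      urlgrab('http://foo.com/some%20file.txt', quote=0)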
+
+ ssl_ca_cert = None
+
+ this option can be used if M2Crypto is available and will be
+ ignored otherwise. If provided, it will be used to create an SSL
+ context. If both ssl_ca_cert and ssl_context are provided, then
+ ssl_context will be ignored and a new context will be created from
+ ssl_ca_cert.
+
+ ssl_context = None
+
+ No-op when using the curl backend (default)
+
+
+  ssl_verify_peer = True
+
+    Check the server's certificate to make sure it is valid against
+    our CA certificates
+
+  ssl_verify_host = True
+
+    Check the server's hostname to make sure it matches the
+    certificate DN
+
+  ssl_key = None
+
+    Path to the key the client should use to connect/authenticate with
+
+  ssl_key_type = 'PEM'
+
+    PEM or DER - format of key
+
+  ssl_cert = None
+
+    Path to the ssl certificate the client should use to
+    authenticate with
+
+  ssl_cert_type = 'PEM'
+
+    PEM or DER - format of certificate
+
+  ssl_key_pass = None
+
+    password to access the ssl_key
+
+  size = None
+
+    Maximum size (in bytes) of the thing being downloaded. This is
+    mostly to keep us from exploding with an endless datastream
+
+  max_header_size = 2097152
+
+    Maximum size (in bytes) of the headers.
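+
+    For example, to bail out if a download grows beyond (roughly) one
+    megabyte:
+
+      urlgrab(url, filename, size=1048576)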
+
+
+RETRY RELATED ARGUMENTS
+
+ retry = None
+
+    the number of times to retry the grab before bailing. If this is
+    zero, it will retry forever. This was intentional... really, it
+    was :). If this value is not supplied, or is supplied but is None,
+    retrying does not occur.
+
+ retrycodes = [-1,2,4,5,6,7]
+
+ a sequence of errorcodes (values of e.errno) for which it should
+ retry. See the doc on URLGrabError for more details on this. You
+ might consider modifying a copy of the default codes rather than
+ building yours from scratch so that if the list is extended in the
+ future (or one code is split into two) you can still enjoy the
+ benefits of the default list. You can do that with something like
+ this:
+
+ retrycodes = urlgrabber.grabber.URLGrabberOptions().retrycodes
+ if 12 not in retrycodes:
+ retrycodes.append(12)
+
+ checkfunc = None
+
+ a function to do additional checks. This defaults to None, which
+ means no additional checking. The function should simply return
+ on a successful check. It should raise URLGrabError on an
+ unsuccessful check. Raising of any other exception will be
+ considered immediate failure and no retries will occur.
+
+ If it raises URLGrabError, the error code will determine the retry
+ behavior. Negative error numbers are reserved for use by these
+ passed in functions, so you can use many negative numbers for
+ different types of failure. By default, -1 results in a retry,
+ but this can be customized with retrycodes.
+
+ If you simply pass in a function, it will be given exactly one
+ argument: a CallbackObject instance with the .url attribute
+ defined and either .filename (for urlgrab) or .data (for urlread).
+ For urlgrab, .filename is the name of the local file. For
+ urlread, .data is the actual string data. If you need other
+ arguments passed to the callback (program state of some sort), you
+ can do so like this:
+
+ checkfunc=(function, ('arg1', 2), {'kwarg': 3})
+
+ if the downloaded file has filename /tmp/stuff, then this will
+ result in this call (for urlgrab):
+
+ function(obj, 'arg1', 2, kwarg=3)
+ # obj.filename = '/tmp/stuff'
+ # obj.url = 'http://foo.com/stuff'
+
+ NOTE: both the "args" tuple and "kwargs" dict must be present if
+ you use this syntax, but either (or both) can be empty.
+
+ failure_callback = None
+
+ The callback that gets called during retries when an attempt to
+ fetch a file fails. The syntax for specifying the callback is
+ identical to checkfunc, except for the attributes defined in the
+ CallbackObject instance. The attributes for failure_callback are:
+
+ exception = the raised exception
+ url = the url we're trying to fetch
+ tries = the number of tries so far (including this one)
+ retry = the value of the retry option
+
+ The callback is present primarily to inform the calling program of
+ the failure, but if it raises an exception (including the one it's
+ passed) that exception will NOT be caught and will therefore cause
+ future retries to be aborted.
+
+ The callback is called for EVERY failure, including the last one.
+ On the last try, the callback can raise an alternate exception,
+ but it cannot (without severe trickiness) prevent the exception
+ from being raised.
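+
+    For example, a callback that simply reports each failed attempt
+    might look like this (a sketch):
+
+      def notify(obj):
+          print 'attempt %i/%s failed: %s' % \
+                (obj.tries, obj.retry, obj.exception)
+
+      urlgrab(url, filename, retry=3, failure_callback=notify)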
+
+ interrupt_callback = None
+
+ This callback is called if KeyboardInterrupt is received at any
+ point in the transfer. Basically, this callback can have three
+ impacts on the fetch process based on the way it exits:
+
+ 1) raise no exception: the current fetch will be aborted, but
+ any further retries will still take place
+
+ 2) raise a URLGrabError: if you're using a MirrorGroup, then
+ this will prompt a failover to the next mirror according to
+ the behavior of the MirrorGroup subclass. It is recommended
+ that you raise URLGrabError with code 15, 'user abort'. If
+ you are NOT using a MirrorGroup subclass, then this is the
+ same as (3).
+
+ 3) raise some other exception (such as KeyboardInterrupt), which
+ will not be caught at either the grabber or mirror levels.
+ That is, it will be raised up all the way to the caller.
+
+ This callback is very similar to failure_callback. They are
+ passed the same arguments, so you could use the same function for
+ both.
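+
+    For example, to turn a ctrl-c into a mirror failover (a sketch):
+
+      def on_interrupt(obj):
+          raise URLGrabError(15, 'user abort')
+
+      urlgrab(url, filename, interrupt_callback=on_interrupt)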
+
+BANDWIDTH THROTTLING
+
+  urlgrabber supports throttling via two values: throttle and
+  bandwidth. Between the two, you can either specify an absolute
+  throttle threshold or specify a threshold as a fraction of maximum
+  available bandwidth.
+
+ throttle is a number - if it's an int, it's the bytes/second
+ throttle limit. If it's a float, it is first multiplied by
+ bandwidth. If throttle == 0, throttling is disabled. If None, the
+ module-level default (which can be set with set_throttle) is used.
+
+ bandwidth is the nominal max bandwidth in bytes/second. If throttle
+ is a float and bandwidth == 0, throttling is disabled. If None, the
+ module-level default (which can be set with set_bandwidth) is used.
+
+ THROTTLING EXAMPLES:
+
+  Let's say you have a 100 Mbps connection. This is (about) 10^8 bits
+ per second, or 12,500,000 Bytes per second. You have a number of
+ throttling options:
+
+ *) set_bandwidth(12500000); set_throttle(0.5) # throttle is a float
+
+ This will limit urlgrab to use half of your available bandwidth.
+
+ *) set_throttle(6250000) # throttle is an int
+
+ This will also limit urlgrab to use half of your available
+ bandwidth, regardless of what bandwidth is set to.
+
+  *) set_bandwidth(6250000); set_throttle(1.0) # float
+
+ Use half your bandwidth
+
+  *) set_bandwidth(6250000); set_throttle(2.0) # float
+
+ Use up to 12,500,000 Bytes per second (your nominal max bandwidth)
+
+  *) set_bandwidth(6250000); set_throttle(0) # throttle = 0
+
+ Disable throttling - this is more efficient than a very large
+ throttle setting.
+
+  *) set_bandwidth(0); set_throttle(1.0) # throttle is float, bandwidth = 0
+
+ Disable throttling - this is the default when the module is loaded.
+
+ SUGGESTED AUTHOR IMPLEMENTATION (THROTTLING)
+
+ While this is flexible, it's not extremely obvious to the user. I
+ suggest you implement a float throttle as a percent to make the
+ distinction between absolute and relative throttling very explicit.
+
+ Also, you may want to convert the units to something more convenient
+ than bytes/second, such as kbps or kB/s, etc.
+
+"""
+
+
+
+import os
+import sys
+import urlparse
+import time
+import string
+import urllib
+import urllib2
+import mimetools
+import thread
+import types
+import stat
+import pycurl
+from ftplib import parse150
+from StringIO import StringIO
+from httplib import HTTPException
+import socket
+from byterange import range_tuple_normalize, range_tuple_to_header, RangeError
+
+########################################################################
+# MODULE INITIALIZATION
+########################################################################
+try:
+ exec('from ' + (__name__.split('.'))[0] + ' import __version__')
+except:
+ __version__ = '???'
+
+########################################################################
+# functions for debugging output. These functions are here because they
+# are also part of the module initialization.
+DEBUG = None
+def set_logger(DBOBJ):
+ """Set the DEBUG object. This is called by _init_default_logger when
+ the environment variable URLGRABBER_DEBUG is set, but can also be
+ called by a calling program. Basically, if the calling program uses
+ the logging module and would like to incorporate urlgrabber logging,
+ then it can do so this way. It's probably not necessary as most
+ internal logging is only for debugging purposes.
+
+ The passed-in object should be a logging.Logger instance. It will
+ be pushed into the keepalive and byterange modules if they're
+ being used. The mirror module pulls this object in on import, so
+    you will need to push it in manually. In fact, you may find it
+ tidier to simply push your logging object (or objects) into each
+ of these modules independently.
+ """
+
+ global DEBUG
+ DEBUG = DBOBJ
+
+def _init_default_logger(logspec=None):
+ '''Examines the environment variable URLGRABBER_DEBUG and creates
+ a logging object (logging.logger) based on the contents. It takes
+ the form
+
+ URLGRABBER_DEBUG=level,filename
+
+ where "level" can be either an integer or a log level from the
+ logging module (DEBUG, INFO, etc). If the integer is zero or
+ less, logging will be disabled. Filename is the filename where
+ logs will be sent. If it is "-", then stdout will be used. If
+ the filename is empty or missing, stderr will be used. If the
+ variable cannot be processed or the logging module cannot be
+ imported (python < 2.3) then logging will be disabled. Here are
+ some examples:
+
+ URLGRABBER_DEBUG=1,debug.txt # log everything to debug.txt
+ URLGRABBER_DEBUG=WARNING,- # log warning and higher to stdout
+ URLGRABBER_DEBUG=INFO # log info and higher to stderr
+
+    This function is called during module initialization. It is not
+ intended to be called from outside. The only reason it is a
+ function at all is to keep the module-level namespace tidy and to
+ collect the code into a nice block.'''
+
+ try:
+ if logspec is None:
+ logspec = os.environ['URLGRABBER_DEBUG']
+ dbinfo = logspec.split(',')
+ import logging
+ level = logging._levelNames.get(dbinfo[0], None)
+ if level is None: level = int(dbinfo[0])
+ if level < 1: raise ValueError()
+
+ formatter = logging.Formatter('%(asctime)s %(message)s')
+ if len(dbinfo) > 1: filename = dbinfo[1]
+ else: filename = ''
+ if filename == '': handler = logging.StreamHandler(sys.stderr)
+ elif filename == '-': handler = logging.StreamHandler(sys.stdout)
+ else: handler = logging.FileHandler(filename)
+ handler.setFormatter(formatter)
+ DBOBJ = logging.getLogger('urlgrabber')
+ DBOBJ.addHandler(handler)
+ DBOBJ.setLevel(level)
+ except (KeyError, ImportError, ValueError):
+ DBOBJ = None
+ set_logger(DBOBJ)
+
+def _log_package_state():
+    if not DEBUG: return
+    DEBUG.info('urlgrabber version = %s' % __version__)
+    DEBUG.info('trans function "_" = %s' % _)
+
+# normally this would be from i18n or something like it ...
+# it must be defined before _log_package_state() runs below, which
+# references it when debugging is enabled
+def _(st):
+    return st
+
+_init_default_logger()
+_log_package_state()
+
+########################################################################
+# END MODULE INITIALIZATION
+########################################################################
+
+
+
+class URLGrabError(IOError):
+ """
+ URLGrabError error codes:
+
+ URLGrabber error codes (0 -- 255)
+ 0 - everything looks good (you should never see this)
+ 1 - malformed url
+ 2 - local file doesn't exist
+ 3 - request for non-file local file (dir, etc)
+ 4 - IOError on fetch
+ 5 - OSError on fetch
+ 6 - no content length header when we expected one
+ 7 - HTTPException
+ 8 - Exceeded read limit (for urlread)
+ 9 - Requested byte range not satisfiable.
+ 10 - Byte range requested, but range support unavailable
+ 11 - Illegal reget mode
+ 12 - Socket timeout
+ 13 - malformed proxy url
+ 14 - HTTPError (includes .code and .exception attributes)
+ 15 - user abort
+ 16 - error writing to local file
+
+ MirrorGroup error codes (256 -- 511)
+ 256 - No more mirrors left to try
+
+ Custom (non-builtin) classes derived from MirrorGroup (512 -- 767)
+ [ this range reserved for application-specific error codes ]
+
+ Retry codes (< 0)
+ -1 - retry the download, unknown reason
+
+ Note: to test which group a code is in, you can simply do integer
+ division by 256: e.errno / 256
+
+ Negative codes are reserved for use by functions passed in to
+ retrygrab with checkfunc. The value -1 is built in as a generic
+ retry code and is already included in the retrycodes list.
+ Therefore, you can create a custom check function that simply
+ returns -1 and the fetch will be re-tried. For more customized
+ retries, you can use other negative number and include them in
+ retry-codes. This is nice for outputting useful messages about
+ what failed.
+
+ You can use these error codes like so:
+ try: urlgrab(url)
+ except URLGrabError, e:
+ if e.errno == 3: ...
+ # or
+ print e.strerror
+ # or simply
+ print e #### print '[Errno %i] %s' % (e.errno, e.strerror)
+ """
+ def __init__(self, *args):
+ IOError.__init__(self, *args)
+ self.url = "No url specified"
+
+class CallbackObject:
+ """Container for returned callback data.
+
+ This is currently a dummy class into which urlgrabber can stuff
+ information for passing to callbacks. This way, the prototype for
+ all callbacks is the same, regardless of the data that will be
+ passed back. Any function that accepts a callback function as an
+ argument SHOULD document what it will define in this object.
+
+ It is possible that this class will have some greater
+ functionality in the future.
+ """
+ def __init__(self, **kwargs):
+ self.__dict__.update(kwargs)
+
+def urlgrab(url, filename=None, **kwargs):
+ """grab the file at <url> and make a local copy at <filename>
+    If filename is None, the basename of the url is used.
+ urlgrab returns the filename of the local file, which may be different
+ from the passed-in filename if the copy_local kwarg == 0.
+
+ See module documentation for a description of possible kwargs.
+ """
+ return default_grabber.urlgrab(url, filename, **kwargs)
+
+def urlopen(url, **kwargs):
+ """open the url and return a file object
+ If a progress object or throttle specifications exist, then
+ a special file object will be returned that supports them.
+ The file object can be treated like any other file object.
+
+ See module documentation for a description of possible kwargs.
+ """
+ return default_grabber.urlopen(url, **kwargs)
+
+def urlread(url, limit=None, **kwargs):
+ """read the url into a string, up to 'limit' bytes
+ If the limit is exceeded, an exception will be thrown. Note that urlread
+ is NOT intended to be used as a way of saying "I want the first N bytes"
+ but rather 'read the whole file into memory, but don't use too much'
+
+ See module documentation for a description of possible kwargs.
+ """
+ return default_grabber.urlread(url, limit, **kwargs)
+
+
+class URLParser:
+ """Process the URLs before passing them to urllib2.
+
+ This class does several things:
+
+ * add any prefix
+ * translate a "raw" file to a proper file: url
+ * handle any http or https auth that's encoded within the url
+ * quote the url
+
+ Only the "parse" method is called directly, and it calls sub-methods.
+
+ An instance of this class is held in the options object, which
+ means that it's easy to change the behavior by sub-classing and
+ passing the replacement in. It need only have a method like:
+
+ url, parts = urlparser.parse(url, opts)
+ """
+
+ def parse(self, url, opts):
+ """parse the url and return the (modified) url and its parts
+
+ Note: a raw file WILL be quoted when it's converted to a URL.
+ However, other urls (ones which come with a proper scheme) may
+ or may not be quoted according to opts.quote
+
+ opts.quote = 1 --> quote it
+ opts.quote = 0 --> do not quote it
+ opts.quote = None --> guess
+ """
+ quote = opts.quote
+
+ if opts.prefix:
+ url = self.add_prefix(url, opts.prefix)
+
+ parts = urlparse.urlparse(url)
+ (scheme, host, path, parm, query, frag) = parts
+
+ if not scheme or (len(scheme) == 1 and scheme in string.letters):
+ # if a scheme isn't specified, we guess that it's "file:"
+ if url[0] not in '/\\': url = os.path.abspath(url)
+ url = 'file:' + urllib.pathname2url(url)
+ parts = urlparse.urlparse(url)
+ quote = 0 # pathname2url quotes, so we won't do it again
+
+ if scheme in ['http', 'https']:
+ parts = self.process_http(parts, url)
+
+ if quote is None:
+ quote = self.guess_should_quote(parts)
+ if quote:
+ parts = self.quote(parts)
+
+ url = urlparse.urlunparse(parts)
+ return url, parts
+
+ def add_prefix(self, url, prefix):
+ if prefix[-1] == '/' or url[0] == '/':
+ url = prefix + url
+ else:
+ url = prefix + '/' + url
+ return url
+
+ def process_http(self, parts, url):
+ (scheme, host, path, parm, query, frag) = parts
+ # TODO: auth-parsing here, maybe? pycurl doesn't really need it
+ return (scheme, host, path, parm, query, frag)
+
+ def quote(self, parts):
+ """quote the URL
+
+ This method quotes ONLY the path part. If you need to quote
+ other parts, you should override this and pass in your derived
+ class. The other alternative is to quote other parts before
+ passing into urlgrabber.
+ """
+ (scheme, host, path, parm, query, frag) = parts
+ path = urllib.quote(path)
+ return (scheme, host, path, parm, query, frag)
+
+ hexvals = '0123456789ABCDEF'
+ def guess_should_quote(self, parts):
+ """
+ Guess whether we should quote a path. This amounts to
+ guessing whether it's already quoted.
+
+ find ' ' -> 1
+ find '%' -> 1
+ find '%XX' -> 0
+ else -> 1
+ """
+ (scheme, host, path, parm, query, frag) = parts
+ if ' ' in path:
+ return 1
+ ind = string.find(path, '%')
+ if ind > -1:
+ while ind > -1:
+ if len(path) < ind+3:
+ return 1
+ code = path[ind+1:ind+3].upper()
+ if code[0] not in self.hexvals or \
+ code[1] not in self.hexvals:
+ return 1
+ ind = string.find(path, '%', ind+1)
+ return 0
+ return 1
+
+class URLGrabberOptions:
+ """Class to ease kwargs handling."""
+
+ def __init__(self, delegate=None, **kwargs):
+ """Initialize URLGrabberOptions object.
+ Set default values for all options and then update options specified
+ in kwargs.
+ """
+ self.delegate = delegate
+ if delegate is None:
+ self._set_defaults()
+ self._set_attributes(**kwargs)
+
+ def __getattr__(self, name):
+ if self.delegate and hasattr(self.delegate, name):
+ return getattr(self.delegate, name)
+ raise AttributeError, name
+
+ def raw_throttle(self):
+ """Calculate raw throttle value from throttle and bandwidth
+ values.
+ """
+ if self.throttle <= 0:
+ return 0
+ elif type(self.throttle) == type(0):
+ return float(self.throttle)
+ else: # throttle is a float
+ return self.bandwidth * self.throttle
+
+ def derive(self, **kwargs):
+ """Create a derived URLGrabberOptions instance.
+ This method creates a new instance and overrides the
+ options specified in kwargs.
+ """
+ return URLGrabberOptions(delegate=self, **kwargs)
+
+ def _set_attributes(self, **kwargs):
+ """Update object attributes with those provided in kwargs."""
+ self.__dict__.update(kwargs)
+ if kwargs.has_key('range'):
+ # normalize the supplied range value
+ self.range = range_tuple_normalize(self.range)
+ if not self.reget in [None, 'simple', 'check_timestamp']:
+ raise URLGrabError(11, _('Illegal reget mode: %s') \
+ % (self.reget, ))
+
+ def _set_defaults(self):
+ """Set all options to their default values.
+ When adding new options, make sure a default is
+ provided here.
+ """
+ self.progress_obj = None
+ self.throttle = 1.0
+ self.bandwidth = 0
+ self.retry = None
+ self.retrycodes = [-1,2,4,5,6,7]
+ self.checkfunc = None
+ self.copy_local = 0
+ self.close_connection = 0
+ self.range = None
+ self.user_agent = 'urlgrabber/%s' % __version__
+ self.keepalive = 1
+ self.proxies = None
+ self.reget = None
+ self.failure_callback = None
+ self.interrupt_callback = None
+ self.prefix = None
+ self.opener = None
+ self.cache_openers = True
+ self.timeout = None
+ self.text = None
+ self.http_headers = None
+ self.ftp_headers = None
+ self.data = None
+ self.urlparser = URLParser()
+ self.quote = None
+ self.ssl_ca_cert = None # sets SSL_CAINFO - path to certdb
+ self.ssl_context = None # no-op in pycurl
+        self.ssl_verify_peer = True # check peer's cert for authenticity
+        self.ssl_verify_host = True # make sure the cert's hostname matches the host we connect to
+ self.ssl_key = None # client key
+ self.ssl_key_type = 'PEM' #(or DER)
+ self.ssl_cert = None # client cert
+ self.ssl_cert_type = 'PEM' # (or DER)
+ self.ssl_key_pass = None # password to access the key
+ self.size = None # if we know how big the thing we're getting is going
+ # to be. this is ultimately a MAXIMUM size for the file
+ self.max_header_size = 2097152 #2mb seems reasonable for maximum header size
+
+ def __repr__(self):
+ return self.format()
+
+ def format(self, indent=' '):
+ keys = self.__dict__.keys()
+ if self.delegate is not None:
+ keys.remove('delegate')
+ keys.sort()
+ s = '{\n'
+ for k in keys:
+ s = s + indent + '%-15s: %s,\n' % \
+ (repr(k), repr(self.__dict__[k]))
+ if self.delegate:
+ df = self.delegate.format(indent + ' ')
+ s = s + indent + '%-15s: %s\n' % ("'delegate'", df)
+ s = s + indent + '}'
+ return s
+
+class URLGrabber:
+ """Provides easy opening of URLs with a variety of options.
+
+ All options are specified as kwargs. Options may be specified when
+ the class is created and may be overridden on a per request basis.
+
+ New objects inherit default values from default_grabber.
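+
+    For example:
+
+        g = URLGrabber(retry=3, timeout=30.0)
+        g.urlgrab('http://foo.com/file.txt', '/tmp/file.txt')
+        g.urlgrab('http://foo.com/other.txt', timeout=10.0) # override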
+ """
+
+ def __init__(self, **kwargs):
+ self.opts = URLGrabberOptions(**kwargs)
+
+ def _retry(self, opts, func, *args):
+ tries = 0
+ while 1:
+ # there are only two ways out of this loop. The second has
+ # several "sub-ways"
+ # 1) via the return in the "try" block
+ # 2) by some exception being raised
+            #   a) an exception is raised that we don't "except"
+ # b) a callback raises ANY exception
+ # c) we're not retry-ing or have run out of retries
+ # d) the URLGrabError code is not in retrycodes
+ # beware of infinite loops :)
+ tries = tries + 1
+ exception = None
+ retrycode = None
+ callback = None
+ if DEBUG: DEBUG.info('attempt %i/%s: %s',
+ tries, opts.retry, args[0])
+ try:
+ r = apply(func, (opts,) + args, {})
+ if DEBUG: DEBUG.info('success')
+ return r
+ except URLGrabError, e:
+ exception = e
+ callback = opts.failure_callback
+ retrycode = e.errno
+ except KeyboardInterrupt, e:
+ exception = e
+ callback = opts.interrupt_callback
+
+ if DEBUG: DEBUG.info('exception: %s', exception)
+ if callback:
+ if DEBUG: DEBUG.info('calling callback: %s', callback)
+ cb_func, cb_args, cb_kwargs = self._make_callback(callback)
+ obj = CallbackObject(exception=exception, url=args[0],
+ tries=tries, retry=opts.retry)
+ cb_func(obj, *cb_args, **cb_kwargs)
+
+ if (opts.retry is None) or (tries == opts.retry):
+ if DEBUG: DEBUG.info('retries exceeded, re-raising')
+ raise
+
+ if (retrycode is not None) and (retrycode not in opts.retrycodes):
+ if DEBUG: DEBUG.info('retrycode (%i) not in list %s, re-raising',
+ retrycode, opts.retrycodes)
+ raise
+
+ def urlopen(self, url, **kwargs):
+ """open the url and return a file object
+ If a progress object or throttle value specified when this
+ object was created, then a special file object will be
+ returned that supports them. The file object can be treated
+ like any other file object.
+ """
+ opts = self.opts.derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
+ (url,parts) = opts.urlparser.parse(url, opts)
+ def retryfunc(opts, url):
+ return PyCurlFileObject(url, filename=None, opts=opts)
+ return self._retry(opts, retryfunc, url)
+
+ def urlgrab(self, url, filename=None, **kwargs):
+ """grab the file at <url> and make a local copy at <filename>
+        If filename is None, the basename of the url is used.
+ urlgrab returns the filename of the local file, which may be
+ different from the passed-in filename if copy_local == 0.
+ """
+ opts = self.opts.derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
+ (url,parts) = opts.urlparser.parse(url, opts)
+ (scheme, host, path, parm, query, frag) = parts
+ if filename is None:
+ filename = os.path.basename( urllib.unquote(path) )
+ if scheme == 'file' and not opts.copy_local:
+ # just return the name of the local file - don't make a
+ # copy currently
+ path = urllib.url2pathname(path)
+ if host:
+ path = os.path.normpath('//' + host + path)
+ if not os.path.exists(path):
+ err = URLGrabError(2,
+ _('Local file does not exist: %s') % (path, ))
+ err.url = url
+ raise err
+ elif not os.path.isfile(path):
+ err = URLGrabError(3,
+ _('Not a normal file: %s') % (path, ))
+ err.url = url
+ raise err
+
+ elif not opts.range:
+ if not opts.checkfunc is None:
+ cb_func, cb_args, cb_kwargs = \
+ self._make_callback(opts.checkfunc)
+ obj = CallbackObject()
+ obj.filename = path
+ obj.url = url
+ apply(cb_func, (obj, )+cb_args, cb_kwargs)
+ return path
+
+ def retryfunc(opts, url, filename):
+ fo = PyCurlFileObject(url, filename, opts)
+ try:
+ fo._do_grab()
+ if not opts.checkfunc is None:
+ cb_func, cb_args, cb_kwargs = \
+ self._make_callback(opts.checkfunc)
+ obj = CallbackObject()
+ obj.filename = filename
+ obj.url = url
+ apply(cb_func, (obj, )+cb_args, cb_kwargs)
+ finally:
+ fo.close()
+ return filename
+
+ return self._retry(opts, retryfunc, url, filename)
+
+ def urlread(self, url, limit=None, **kwargs):
+ """read the url into a string, up to 'limit' bytes
+ If the limit is exceeded, an exception will be thrown. Note
+ that urlread is NOT intended to be used as a way of saying
+ "I want the first N bytes" but rather 'read the whole file
+ into memory, but don't use too much'
+ """
+ opts = self.opts.derive(**kwargs)
+ if DEBUG: DEBUG.debug('combined options: %s' % repr(opts))
+ (url,parts) = opts.urlparser.parse(url, opts)
+ if limit is not None:
+ limit = limit + 1
+
+ def retryfunc(opts, url, limit):
+ fo = PyCurlFileObject(url, filename=None, opts=opts)
+ s = ''
+ try:
+ # this is an unfortunate thing. Some file-like objects
+ # have a default "limit" of None, while the built-in (real)
+ # file objects have -1. They each break the other, so for
+ # now, we just force the default if necessary.
+ if limit is None: s = fo.read()
+ else: s = fo.read(limit)
+
+ if not opts.checkfunc is None:
+ cb_func, cb_args, cb_kwargs = \
+ self._make_callback(opts.checkfunc)
+ obj = CallbackObject()
+ obj.data = s
+ obj.url = url
+ apply(cb_func, (obj, )+cb_args, cb_kwargs)
+ finally:
+ fo.close()
+ return s
+
+ s = self._retry(opts, retryfunc, url, limit)
+        if limit and len(s) > limit - 1:
+            # limit was incremented by one above; compare against (and
+            # report) the caller's original value
+            err = URLGrabError(8,
+                               _('Exceeded limit (%i): %s') % (limit - 1, url))
+ err.url = url
+ raise err
+
+ return s
+
+ def _make_callback(self, callback_obj):
+ if callable(callback_obj):
+ return callback_obj, (), {}
+ else:
+ return callback_obj
+
+# create the default URLGrabber used by urlXXX functions.
+# NOTE: actual defaults are set in URLGrabberOptions
+default_grabber = URLGrabber()
+
+
+class PyCurlFileObject():
+ def __init__(self, url, filename, opts):
+ self.fo = None
+ self._hdr_dump = ''
+ self._parsed_hdr = None
+ self.url = url
+ self.scheme = urlparse.urlsplit(self.url)[0]
+ self.filename = filename
+ self.append = False
+ self.reget_time = None
+ self.opts = opts
+ if self.opts.reget == 'check_timestamp':
+            raise NotImplementedError, "check_timestamp regets are not implemented in this version of urlgrabber. Please report this."
+ self._complete = False
+ self._rbuf = ''
+ self._rbufsize = 1024*8
+ self._ttime = time.time()
+ self._tsize = 0
+ self._amount_read = 0
+ self._reget_length = 0
+ self._prog_running = False
+ self._error = (None, None)
+ self.size = None
+ self._do_open()
+
+
+ def __getattr__(self, name):
+ """This effectively allows us to wrap at the instance level.
+ Any attribute not found in _this_ object will be searched for
+ in self.fo. This includes methods."""
+
+ if hasattr(self.fo, name):
+ return getattr(self.fo, name)
+ raise AttributeError, name
+
+ def _retrieve(self, buf):
+ try:
+            if not self._prog_running:
+                if self.opts.progress_obj:
+                    # self.size is None if no content-length came back;
+                    # pass None through rather than choking on the add
+                    size = self.size
+                    if size is not None:
+                        size = size + self._reget_length
+                    self.opts.progress_obj.start(self._prog_reportname,
+                                                 urllib.unquote(self.url),
+                                                 self._prog_basename,
+                                                 size=size,
+                                                 text=self.opts.text)
+                    self._prog_running = True
+                    self.opts.progress_obj.update(self._amount_read)
+
+ self._amount_read += len(buf)
+ self.fo.write(buf)
+ return len(buf)
+ except KeyboardInterrupt:
+ return -1
+
+ def _hdr_retrieve(self, buf):
+ if self._over_max_size(cur=len(self._hdr_dump),
+ max_size=self.opts.max_header_size):
+ return -1
+ try:
+ self._hdr_dump += buf
+ # we have to get the size before we do the progress obj start
+ # but we can't do that w/o making it do 2 connects, which sucks
+ # so we cheat and stuff it in here in the hdr_retrieve
+ if self.scheme in ['http','https'] and buf.lower().find('content-length') != -1:
+ length = buf.split(':')[1]
+ self.size = int(length)
+ elif self.scheme in ['ftp']:
+ s = None
+ if buf.startswith('213 '):
+ s = buf[3:].strip()
+ elif buf.startswith('150 '):
+ s = parse150(buf)
+ if s:
+ self.size = int(s)
+
+ return len(buf)
+ except KeyboardInterrupt:
+ return pycurl.READFUNC_ABORT
+
+ def _return_hdr_obj(self):
+ if self._parsed_hdr:
+ return self._parsed_hdr
+ statusend = self._hdr_dump.find('\n')
+ hdrfp = StringIO()
+ hdrfp.write(self._hdr_dump[statusend:])
+ self._parsed_hdr = mimetools.Message(hdrfp)
+ return self._parsed_hdr
+
+ hdr = property(_return_hdr_obj)
+ http_code = property(fget=
+ lambda self: self.curl_obj.getinfo(pycurl.RESPONSE_CODE))
+
+ def _set_opts(self, opts={}):
+ # XXX
+ if not opts:
+ opts = self.opts
+
+
+ # defaults we're always going to set
+ self.curl_obj.setopt(pycurl.NOPROGRESS, False)
+ self.curl_obj.setopt(pycurl.NOSIGNAL, True)
+ self.curl_obj.setopt(pycurl.WRITEFUNCTION, self._retrieve)
+ self.curl_obj.setopt(pycurl.HEADERFUNCTION, self._hdr_retrieve)
+ self.curl_obj.setopt(pycurl.PROGRESSFUNCTION, self._progress_update)
+ self.curl_obj.setopt(pycurl.FAILONERROR, True)
+ self.curl_obj.setopt(pycurl.OPT_FILETIME, True)
+
+ if DEBUG:
+ self.curl_obj.setopt(pycurl.VERBOSE, True)
+ if opts.user_agent:
+ self.curl_obj.setopt(pycurl.USERAGENT, opts.user_agent)
+
+ # maybe to be options later
+ self.curl_obj.setopt(pycurl.FOLLOWLOCATION, True)
+ self.curl_obj.setopt(pycurl.MAXREDIRS, 5)
+
+ # timeouts
+ timeout = 300
+ if opts.timeout:
+ timeout = int(opts.timeout)
+ self.curl_obj.setopt(pycurl.CONNECTTIMEOUT, timeout)
+
+ # ssl options
+ if self.scheme == 'https':
+ if opts.ssl_ca_cert: # this may do ZERO with nss according to curl docs
+ self.curl_obj.setopt(pycurl.CAPATH, opts.ssl_ca_cert)
+ self.curl_obj.setopt(pycurl.CAINFO, opts.ssl_ca_cert)
+ self.curl_obj.setopt(pycurl.SSL_VERIFYPEER, opts.ssl_verify_peer)
+ self.curl_obj.setopt(pycurl.SSL_VERIFYHOST, opts.ssl_verify_host)
+ if opts.ssl_key:
+ self.curl_obj.setopt(pycurl.SSLKEY, opts.ssl_key)
+ if opts.ssl_key_type:
+ self.curl_obj.setopt(pycurl.SSLKEYTYPE, opts.ssl_key_type)
+ if opts.ssl_cert:
+ self.curl_obj.setopt(pycurl.SSLCERT, opts.ssl_cert)
+ if opts.ssl_cert_type:
+ self.curl_obj.setopt(pycurl.SSLCERTTYPE, opts.ssl_cert_type)
+ if opts.ssl_key_pass:
+ self.curl_obj.setopt(pycurl.SSLKEYPASSWD, opts.ssl_key_pass)
+
+ #headers:
+ if opts.http_headers and self.scheme in ('http', 'https'):
+ headers = []
+ for (tag, content) in opts.http_headers:
+ headers.append('%s:%s' % (tag, content))
+ self.curl_obj.setopt(pycurl.HTTPHEADER, headers)
+
+ # ranges:
+ if opts.range or opts.reget:
+ range_str = self._build_range()
+ if range_str:
+ self.curl_obj.setopt(pycurl.RANGE, range_str)
+
+ # throttle/bandwidth
+ if hasattr(opts, 'raw_throttle') and opts.raw_throttle():
+ self.curl_obj.setopt(pycurl.MAX_RECV_SPEED_LARGE, int(opts.raw_throttle()))
+
+ # proxy settings
+ if opts.proxies:
+ for (scheme, proxy) in opts.proxies.items():
+                if self.scheme in ('ftp',): # only set the ftp proxy for ftp items
+                    if scheme not in ('ftp',):
+                        continue
+ else:
+ if proxy == '_none_': proxy = ""
+ self.curl_obj.setopt(pycurl.PROXY, proxy)
+ elif self.scheme in ('http', 'https'):
+ if scheme not in ('http', 'https'):
+ continue
+ else:
+ if proxy == '_none_': proxy = ""
+ self.curl_obj.setopt(pycurl.PROXY, proxy)
+
+ # FIXME username/password/auth settings
+
+ #posts - simple - expects the fields as they are
+ if opts.data:
+ self.curl_obj.setopt(pycurl.POST, True)
+ self.curl_obj.setopt(pycurl.POSTFIELDS, self._to_utf8(opts.data))
+
+ # our url
+ self.curl_obj.setopt(pycurl.URL, self.url)
+
+
+ def _do_perform(self):
+ if self._complete:
+ return
+
+ try:
+ self.curl_obj.perform()
+ except pycurl.error, e:
+ # XXX - break some of these out a bit more clearly
+ # to other URLGrabErrors from
+ # http://curl.haxx.se/libcurl/c/libcurl-errors.html
+ # this covers e.args[0] == 22 pretty well - which will be common
+
+ code = self.http_code
+ errcode = e.args[0]
+ if self._error[0]:
+ errcode = self._error[0]
+
+ if errcode == 23 and code >= 200 and code < 299:
+ err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
+ err.url = self.url
+
+ # this is probably wrong but ultimately this is what happens
+ # we have a legit http code and a pycurl 'writer failed' code
+ # which almost always means something aborted it from outside
+ # since we cannot know what it is -I'm banking on it being
+ # a ctrl-c. XXXX - if there's a way of going back two raises to
+ # figure out what aborted the pycurl process FIXME
+ raise KeyboardInterrupt
+
+ elif errcode == 28:
+ err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+ elif errcode == 35:
+ msg = _("problem making ssl connection")
+ err = URLGrabError(14, msg)
+ err.url = self.url
+ raise err
+ elif errcode == 37:
+ msg = _("Could not open/read %s") % (self.url)
+ err = URLGrabError(14, msg)
+ err.url = self.url
+ raise err
+
+ elif errcode == 42:
+ err = URLGrabError(15, _('User (or something) called abort %s: %s') % (self.url, e))
+ err.url = self.url
+ # this is probably wrong but ultimately this is what happens
+ # we have a legit http code and a pycurl 'writer failed' code
+ # which almost always means something aborted it from outside
+ # since we cannot know what it is -I'm banking on it being
+ # a ctrl-c. XXXX - if there's a way of going back two raises to
+ # figure out what aborted the pycurl process FIXME
+ raise KeyboardInterrupt
+
+ elif errcode == 58:
+ msg = _("problem with the local client certificate")
+ err = URLGrabError(14, msg)
+ err.url = self.url
+ raise err
+
+ elif errcode == 60:
+ msg = _("client cert cannot be verified or client cert incorrect")
+ err = URLGrabError(14, msg)
+ err.url = self.url
+ raise err
+
+ elif errcode == 63:
+ if self._error[1]:
+ msg = self._error[1]
+ else:
+ msg = _("Max download size exceeded on %s") % (self.url)
+ err = URLGrabError(14, msg)
+ err.url = self.url
+ raise err
+
+ elif str(e.args[1]) == '' and self.http_code != 0: # fake it until you make it
+ msg = 'HTTP Error %s : %s ' % (self.http_code, self.url)
+ else:
+ msg = 'PYCURL ERROR %s - "%s"' % (errcode, str(e.args[1]))
+ code = errcode
+ err = URLGrabError(14, msg)
+ err.code = code
+ err.exception = e
+ raise err
+
+ def _do_open(self):
+ self.curl_obj = _curl_cache
+ self.curl_obj.reset() # reset all old settings away, just in case
+ # setup any ranges
+ self._set_opts()
+ self._do_grab()
+ return self.fo
+
+ def _add_headers(self):
+ pass
+
+ def _build_range(self):
+ reget_length = 0
+ rt = None
+ if self.opts.reget and type(self.filename) in types.StringTypes:
+ # we have reget turned on and we're dumping to a file
+ try:
+ s = os.stat(self.filename)
+ except OSError:
+ pass
+ else:
+ self.reget_time = s[stat.ST_MTIME]
+ reget_length = s[stat.ST_SIZE]
+
+ # Set initial length when regetting
+ self._amount_read = reget_length
+ self._reget_length = reget_length # set where we started from, too
+
+ rt = reget_length, ''
+ self.append = 1
+
+ if self.opts.range:
+ rt = self.opts.range
+ if rt[0]: rt = (rt[0] + reget_length, rt[1])
+
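+        # range_tuple_to_header returns e.g. 'bytes=0-1023'; pycurl's
+        # RANGE option wants only the '0-1023' part, hence the split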
+ if rt:
+ header = range_tuple_to_header(rt)
+ if header:
+ return header.split('=')[1]
+
+
+
+ def _make_request(self, req, opener):
+ #XXXX
+ # This doesn't do anything really, but we could use this
+ # instead of do_open() to catch a lot of crap errors as
+ # mstenner did before here
+ return (self.fo, self.hdr)
+
+ try:
+ if self.opts.timeout:
+ old_to = socket.getdefaulttimeout()
+ socket.setdefaulttimeout(self.opts.timeout)
+ try:
+ fo = opener.open(req)
+ finally:
+ socket.setdefaulttimeout(old_to)
+ else:
+ fo = opener.open(req)
+ hdr = fo.info()
+ except ValueError, e:
+ err = URLGrabError(1, _('Bad URL: %s : %s') % (self.url, e, ))
+ err.url = self.url
+ raise err
+
+ except RangeError, e:
+ err = URLGrabError(9, _('%s on %s') % (e, self.url))
+ err.url = self.url
+ raise err
+ except urllib2.HTTPError, e:
+ new_e = URLGrabError(14, _('%s on %s') % (e, self.url))
+ new_e.code = e.code
+ new_e.exception = e
+ new_e.url = self.url
+ raise new_e
+ except IOError, e:
+ if hasattr(e, 'reason') and isinstance(e.reason, socket.timeout):
+ err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+ else:
+ err = URLGrabError(4, _('IOError on %s: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
+ except OSError, e:
+ err = URLGrabError(5, _('%s on %s') % (e, self.url))
+ err.url = self.url
+ raise err
+
+ except HTTPException, e:
+ err = URLGrabError(7, _('HTTP Exception (%s) on %s: %s') % \
+ (e.__class__.__name__, self.url, e))
+ err.url = self.url
+ raise err
+
+ else:
+ return (fo, hdr)
+
+ def _do_grab(self):
+ """dump the file to a filename or StringIO buffer"""
+
+ if self._complete:
+ return
+ _was_filename = False
+ if type(self.filename) in types.StringTypes and self.filename:
+ _was_filename = True
+ self._prog_reportname = str(self.filename)
+ self._prog_basename = os.path.basename(self.filename)
+
+ if self.append: mode = 'ab'
+ else: mode = 'wb'
+
+ if DEBUG: DEBUG.info('opening local file "%s" with mode %s' % \
+ (self.filename, mode))
+ try:
+ self.fo = open(self.filename, mode)
+ except IOError, e:
+ err = URLGrabError(16, _(\
+ 'error opening local file from %s, IOError: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
+ else:
+ self._prog_reportname = 'MEMORY'
+ self._prog_basename = 'MEMORY'
+
+
+ self.fo = StringIO()
+ # if this is to be a tempfile instead....
+ # it just makes crap in the tempdir
+ #fh, self._temp_name = mkstemp()
+ #self.fo = open(self._temp_name, 'wb')
+
+
+ self._do_perform()
+
+
+
+ if _was_filename:
+ # close it up
+ self.fo.flush()
+ self.fo.close()
+ # set the time
+ mod_time = self.curl_obj.getinfo(pycurl.INFO_FILETIME)
+ if mod_time != -1:
+ os.utime(self.filename, (mod_time, mod_time))
+ # re open it
+            self.fo = open(self.filename, 'rb')
+ else:
+ #self.fo = open(self._temp_name, 'r')
+ self.fo.seek(0)
+
+ self._complete = True
+
+ def _fill_buffer(self, amt=None):
+ """fill the buffer to contain at least 'amt' bytes by reading
+ from the underlying file object. If amt is None, then it will
+ read until it gets nothing more. It updates the progress meter
+ and throttles after every self._rbufsize bytes."""
+ # the _rbuf test is only in this first 'if' for speed. It's not
+ # logically necessary
+ if self._rbuf and not amt is None:
+ L = len(self._rbuf)
+ if amt > L:
+ amt = amt - L
+ else:
+ return
+
+ # if we've made it here, then we don't have enough in the buffer
+ # and we need to read more.
+
+ if not self._complete: self._do_grab() #XXX cheater - change on ranges
+
+ buf = [self._rbuf]
+ bufsize = len(self._rbuf)
+ while amt is None or amt:
+ # first, delay if necessary for throttling reasons
+ if self.opts.raw_throttle():
+ diff = self._tsize/self.opts.raw_throttle() - \
+ (time.time() - self._ttime)
+ if diff > 0: time.sleep(diff)
+ self._ttime = time.time()
+
+ # now read some data, up to self._rbufsize
+ if amt is None: readamount = self._rbufsize
+ else: readamount = min(amt, self._rbufsize)
+ try:
+ new = self.fo.read(readamount)
+ except socket.error, e:
+ err = URLGrabError(4, _('Socket Error on %s: %s') % (self.url, e))
+ err.url = self.url
+ raise err
+
+            except socket.timeout, e:
+                err = URLGrabError(12, _('Timeout on %s: %s') % (self.url, e))
+                err.url = self.url
+                raise err
+
+            except IOError, e:
+                err = URLGrabError(4, _('IOError on %s: %s') % (self.url, e))
+                err.url = self.url
+                raise err
+
+ newsize = len(new)
+ if not newsize: break # no more to read
+
+ if amt: amt = amt - newsize
+ buf.append(new)
+ bufsize = bufsize + newsize
+ self._tsize = newsize
+ self._amount_read = self._amount_read + newsize
+ #if self.opts.progress_obj:
+ # self.opts.progress_obj.update(self._amount_read)
+
+ self._rbuf = string.join(buf, '')
+ return
+
+ def _progress_update(self, download_total, downloaded, upload_total, uploaded):
+ if self._over_max_size(cur=self._amount_read-self._reget_length):
+ return -1
+
+ try:
+ if self._prog_running:
+ downloaded += self._reget_length
+ self.opts.progress_obj.update(downloaded)
+ except KeyboardInterrupt:
+ return -1
+
+ def _over_max_size(self, cur, max_size=None):
+
+ if not max_size:
+ max_size = self.size
+ if self.opts.size: # if we set an opts size use that, no matter what
+ max_size = self.opts.size
+ if not max_size: return False # if we have None for all of the Max then this is dumb
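+        # allow roughly 10% over the expected size before declaring
+        # the download oversized; reported sizes are not always exact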
+ if cur > max_size + max_size*.10:
+
+ msg = _("Downloaded more than max size for %s: %s > %s") \
+ % (self.url, cur, max_size)
+ self._error = (pycurl.E_FILESIZE_EXCEEDED, msg)
+ return True
+ return False
+
+ def _to_utf8(self, obj, errors='replace'):
+ '''convert 'unicode' to an encoded utf-8 byte string '''
+ # stolen from yum.i18n
+ if isinstance(obj, unicode):
+ obj = obj.encode('utf-8', errors)
+ return obj
+
+ def read(self, amt=None):
+ self._fill_buffer(amt)
+ if amt is None:
+ s, self._rbuf = self._rbuf, ''
+ else:
+ s, self._rbuf = self._rbuf[:amt], self._rbuf[amt:]
+ return s
+
+    def readline(self, limit=-1):
+        if not self._complete: self._do_grab()
+        return self.fo.readline()
+
+ def close(self):
+ if self._prog_running:
+ self.opts.progress_obj.end(self._amount_read)
+ self.fo.close()
+
+
+_curl_cache = pycurl.Curl() # make one and reuse it over and over and over
+
+
+#####################################################################
+# DEPRECATED FUNCTIONS
+def set_throttle(new_throttle):
+ """Deprecated. Use: default_grabber.throttle = new_throttle"""
+ default_grabber.throttle = new_throttle
+
+def set_bandwidth(new_bandwidth):
+ """Deprecated. Use: default_grabber.bandwidth = new_bandwidth"""
+ default_grabber.bandwidth = new_bandwidth
+
+def set_progress_obj(new_progress_obj):
+ """Deprecated. Use: default_grabber.progress_obj = new_progress_obj"""
+ default_grabber.progress_obj = new_progress_obj
+
+def set_user_agent(new_user_agent):
+ """Deprecated. Use: default_grabber.user_agent = new_user_agent"""
+ default_grabber.user_agent = new_user_agent
+
+def retrygrab(url, filename=None, copy_local=0, close_connection=0,
+ progress_obj=None, throttle=None, bandwidth=None,
+ numtries=3, retrycodes=[-1,2,4,5,6,7], checkfunc=None):
+ """Deprecated. Use: urlgrab() with the retry arg instead"""
+ kwargs = {'copy_local' : copy_local,
+ 'close_connection' : close_connection,
+ 'progress_obj' : progress_obj,
+ 'throttle' : throttle,
+ 'bandwidth' : bandwidth,
+ 'retry' : numtries,
+ 'retrycodes' : retrycodes,
+ 'checkfunc' : checkfunc
+ }
+ return urlgrab(url, filename, **kwargs)
+
+
+#####################################################################
+# TESTING
+def _main_test():
+ try: url, filename = sys.argv[1:3]
+ except ValueError:
+ print 'usage:', sys.argv[0], \
+ '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
+ sys.exit()
+
+ kwargs = {}
+ for a in sys.argv[3:]:
+ k, v = string.split(a, '=', 1)
+ kwargs[k] = int(v)
+
+ set_throttle(1.0)
+ set_bandwidth(32 * 1024)
+ print "throttle: %s, throttle bandwidth: %s B/s" % (default_grabber.throttle,
+ default_grabber.bandwidth)
+
+ try: from progress import text_progress_meter
+ except ImportError, e: pass
+ else: kwargs['progress_obj'] = text_progress_meter()
+
+ try: name = apply(urlgrab, (url, filename), kwargs)
+ except URLGrabError, e: print e
+ else: print 'LOCAL FILE:', name
+
+
+def _retry_test():
+ try: url, filename = sys.argv[1:3]
+ except ValueError:
+ print 'usage:', sys.argv[0], \
+ '<url> <filename> [copy_local=0|1] [close_connection=0|1]'
+ sys.exit()
+
+ kwargs = {}
+ for a in sys.argv[3:]:
+ k, v = string.split(a, '=', 1)
+ kwargs[k] = int(v)
+
+ try: from progress import text_progress_meter
+ except ImportError, e: pass
+ else: kwargs['progress_obj'] = text_progress_meter()
+
+ def cfunc(filename, hello, there='foo'):
+ print hello, there
+ import random
+ rnum = random.random()
+ if rnum < .5:
+ print 'forcing retry'
+ raise URLGrabError(-1, 'forcing retry')
+ if rnum < .75:
+ print 'forcing failure'
+ raise URLGrabError(-2, 'forcing immediate failure')
+ print 'success'
+ return
+
+ kwargs['checkfunc'] = (cfunc, ('hello',), {'there':'there'})
+ try: name = apply(retrygrab, (url, filename), kwargs)
+ except URLGrabError, e: print e
+ else: print 'LOCAL FILE:', name
+
+def _file_object_test(filename=None):
+ import cStringIO
+ if filename is None:
+ filename = __file__
+ print 'using file "%s" for comparisons' % filename
+ fo = open(filename)
+ s_input = fo.read()
+ fo.close()
+
+ for testfunc in [_test_file_object_smallread,
+ _test_file_object_readall,
+ _test_file_object_readline,
+ _test_file_object_readlines]:
+ fo_input = cStringIO.StringIO(s_input)
+ fo_output = cStringIO.StringIO()
+ wrapper = PyCurlFileObject(fo_input, None, 0)
+ print 'testing %-30s ' % testfunc.__name__,
+ testfunc(wrapper, fo_output)
+ s_output = fo_output.getvalue()
+ if s_output == s_input: print 'passed'
+ else: print 'FAILED'
+
+def _test_file_object_smallread(wrapper, fo_output):
+ while 1:
+ s = wrapper.read(23)
+ fo_output.write(s)
+ if not s: return
+
+def _test_file_object_readall(wrapper, fo_output):
+ s = wrapper.read()
+ fo_output.write(s)
+
+def _test_file_object_readline(wrapper, fo_output):
+ while 1:
+ s = wrapper.readline()
+ fo_output.write(s)
+ if not s: return
+
+def _test_file_object_readlines(wrapper, fo_output):
+ li = wrapper.readlines()
+ fo_output.write(string.join(li, ''))
+
+if __name__ == '__main__':
+ _main_test()
+ _retry_test()
+ _file_object_test('test')