Port to Python3 [debian/4.1.0]

Change-Id: I46b8f71dce3d1f009617aa6e969414ad0c6393a6
Signed-off-by: biao716.wang <biao716.wang@samsung.com>
author: biao716.wang <biao716.wang@samsung.com> 2020-08-26 11:34:58 +0900
committer: biao716.wang <biao716.wang@samsung.com> 2020-08-26 11:34:58 +0900
commit: a5651e772478cb72e7fd02e1d60bbe7a509a4d50 (patch)
tree: dd6921b15d6285d79e2e42dcbbd132ac066b19d1 /urlgrabber/mirror.py
parent: 136d1e028cec5dcddc6ef6ac7302c794fda5f135 (diff)
download: python-urlgrabber-debian/4.1.0.tar.gz
python-urlgrabber-debian/4.1.0.tar.bz2
python-urlgrabber-debian/4.1.0.zip
1 files changed, 97 insertions, 35 deletions
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
index dad410b..d95863e 100644..100755
--- a/urlgrabber/mirror.py
+++ b/urlgrabber/mirror.py
@@ -9,9 +9,9 @@
 #   Lesser General Public License for more details.
 #
 #   You should have received a copy of the GNU Lesser General Public
-#   License along with this library; if not, write to the 
-#      Free Software Foundation, Inc., 
-#      59 Temple Place, Suite 330, 
+#   License along with this library; if not, write to the
+#      Free Software Foundation, Inc.,
+#      59 Temple Place, Suite 330,
 #      Boston, MA  02111-1307  USA
 
 # This file is part of urlgrabber, a high-level cross-protocol url-grabber
@@ -76,6 +76,10 @@ CUSTOMIZATION
        'grabber' is omitted, the default grabber will be used.  If
        kwargs are omitted, then (duh) they will not be used.
 
+       kwarg 'max_connections' limits the number of concurrent
+       connections to this mirror.  When omitted or set to zero,
+       the default limit (2) will be used.
+
     3) Pass keyword arguments when instantiating the mirror group.
        See, for example, the failure_callback argument.
 
@@ -87,12 +91,29 @@ CUSTOMIZATION
 """
 
 
+import sys
 import random
-import thread  # needed for locking to make this threadsafe
 
-from grabber import URLGrabError, CallbackObject, DEBUG
+if sys.version_info >= (3,):
+    # We use a version check because python2 also has _thread
+    import _thread as thread
+else:
+    import thread
+
+try:
+    import urllib.parse as urlparse
+except ImportError:
+    import urlparse
+
+from six import string_types
 
-def _(st): 
+from .grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8
+from .grabber import _run_callback, _do_raise
+from .grabber import exception2msg
+from .grabber import _TH
+from .grabber import _bytes_repr
+
+def _(st):
     return st
 
 class GrabRequest:
@@ -126,13 +147,15 @@ class MirrorGroup:
         files)
 
       * if the local list is ever exhausted, a URLGrabError will be
-        raised (errno=256, no more mirrors)
+        raised (errno=256, No more mirrors).  The 'errors' attribute
+        holds a list of (full_url, errmsg) tuples.  This contains
+        all URLs tried and the corresponding error messages.
 
     OPTIONS
 
       In addition to the required arguments "grabber" and "mirrors",
       MirrorGroup also takes the following optional arguments:
-      
+
       default_action
 
         A dict that describes the actions to be taken upon failure
@@ -153,7 +176,8 @@ class MirrorGroup:
 
         The 'fail' option will cause immediate failure by re-raising
         the exception and no further attempts to get the current
-        download.
+        download.  As in the "No more mirrors" case, the 'errors'
+        attribute is set in the exception object.
 
         This dict can be set at instantiation time,
           mg = MirrorGroup(grabber, mirrors, default_action={'fail':1})
@@ -162,7 +186,7 @@ class MirrorGroup:
         or by returning an action dict from the failure_callback
           return {'fail':0}
         in increasing precedence.
-        
+
         If all three of these were done, the net result would be:
               {'increment': 0,         # set in method
                'increment_master': 1,  # class default
@@ -180,10 +204,11 @@ class MirrorGroup:
         etc).  Otherwise, it is assumed to be the callable object
         itself.  The callback will be passed a grabber.CallbackObject
         instance along with args and kwargs (if present).  The following
-        attributes are defined withing the instance:
+        attributes are defined within the instance:
 
            obj.exception    = < exception that was raised >
            obj.mirror       = < the mirror that was tried >
+           obj.tries        = < the number of mirror tries so far >
            obj.relative_url = < url relative to the mirror >
            obj.url          = < full url that failed >
                               # .url is just the combination of .mirror
@@ -251,22 +276,34 @@ class MirrorGroup:
         self.default_action = None
         self._process_kwargs(kwargs)
 
+        # use the same algorithm as parallel downloader to initially sort
+        # the mirror list (sort by speed, but prefer live private mirrors)
+        def estimate(m):
+            speed, fail = _TH.estimate(m['mirror'])
+            private = not fail and m.get('kwargs', {}).get('private', False)
+            return private, speed
+
+        # update the initial order.  since sorting is stable, the relative
+        # order of unknown (not used yet) hosts is retained.
+        self.mirrors.sort(key=estimate, reverse=True)
+
     # if these values are found in **kwargs passed to one of the urlXXX
     # methods, they will be stripped before getting passed on to the
     # grabber
     options = ['default_action', 'failure_callback']
-    
+
     def _process_kwargs(self, kwargs):
         self.failure_callback = kwargs.get('failure_callback')
         self.default_action   = kwargs.get('default_action')
-       
+
     def _parse_mirrors(self, mirrors):
         parsed_mirrors = []
         for m in mirrors:
-            if type(m) == type(''): m = {'mirror': m}
+            if isinstance(m, string_types):
+                m = {'mirror': _to_utf8(m)}
             parsed_mirrors.append(m)
         return parsed_mirrors
-    
+
     def _load_gr(self, gr):
         # OVERRIDE IDEAS:
         #   shuffle gr list
@@ -280,7 +317,9 @@ class MirrorGroup:
         #   return a random mirror so that multiple mirrors get used
         #   even without failures.
         if not gr.mirrors:
-            raise URLGrabError(256, _('No more mirrors to try.'))
+            e = URLGrabError(256, _('No more mirrors to try.'))
+            e.errors = gr.errors
+            raise e
         return gr.mirrors[gr._next]
 
     def _failure(self, gr, cb_obj):
@@ -290,7 +329,7 @@ class MirrorGroup:
         #                       the callback)
         cb = gr.kw.get('failure_callback') or self.failure_callback
         if cb:
-            if type(cb) == type( () ):
+            if isinstance(cb, tuple):
                 cb, args, kwargs = cb
             else:
                 args, kwargs = (), {}
@@ -307,7 +346,9 @@ class MirrorGroup:
         a.update(action)
         action = a
         self.increment_mirror(gr, action)
-        if action and action.get('fail', 0): raise
+        if action and action.get('fail', 0):
+            sys.exc_info()[1].errors = gr.errors
+            raise
 
     def increment_mirror(self, gr, action={}):
         """Tell the mirror object increment the mirror index
@@ -323,7 +364,7 @@ class MirrorGroup:
         urlopen, there's no good way for the mirror group to know that
         an error occurs mid-download (it's already returned and given
         you the file object).
-        
+
         remove  ---  can have several values
            0   do not remove the mirror from the list
            1   remove the mirror for this download only
@@ -345,7 +386,7 @@ class MirrorGroup:
                 self._next += 1
             if self._next >= len(self.mirrors): self._next = 0
         self._lock.release()
-        
+
         if action.get('remove', 1):
             del gr.mirrors[gr._next]
         elif action.get('increment', 1):
@@ -353,9 +394,9 @@ class MirrorGroup:
         if gr._next >= len(gr.mirrors): gr._next = 0
 
         if DEBUG:
-            grm = [m['mirror'] for m in gr.mirrors]
+            grm = [m['mirror'].decode() for m in gr.mirrors]
             DEBUG.info('GR   mirrors: [%s] %i', ' '.join(grm), gr._next)
-            selfm = [m['mirror'] for m in self.mirrors]
+            selfm = [m['mirror'].decode() for m in self.mirrors]
             DEBUG.info('MAIN mirrors: [%s] %i', ' '.join(selfm), self._next)
 
     #####################################################################
@@ -366,47 +407,68 @@ class MirrorGroup:
     # by overriding the configuration methods :)
 
     def _join_url(self, base_url, rel_url):
-        if base_url.endswith('/') or rel_url.startswith('/'):
-            return base_url + rel_url
+        (scheme, netloc, path, query, fragid) = urlparse.urlsplit(base_url)
+
+        if isinstance(base_url, bytes):
+            if not isinstance(rel_url, bytes):
+                rel_url = rel_url.encode('utf8')
+            sep = b'' if path.endswith(b'/') or rel_url.startswith(b'/') else b'/'
         else:
-            return base_url + '/' + rel_url
-        
+            sep = '' if path.endswith('/') or rel_url.startswith('/') else '/'
+
+        return urlparse.urlunsplit((scheme, netloc, path + sep + rel_url, query, fragid))
+
     def _mirror_try(self, func, url, kw):
         gr = GrabRequest()
         gr.func = func
         gr.url  = url
         gr.kw   = dict(kw)
         self._load_gr(gr)
+        gr.errors = []
 
         for k in self.options:
             try: del kw[k]
             except KeyError: pass
 
-        while 1:
+        tries = 0
+        while True:
+            tries += 1
             mirrorchoice = self._get_mirror(gr)
             fullurl = self._join_url(mirrorchoice['mirror'], gr.url)
-            kwargs = dict(mirrorchoice.get('kwargs', {}))
-            kwargs.update(kw)
             grabber = mirrorchoice.get('grabber') or self.grabber
+            # apply mirrorchoice kwargs on top of grabber.opts
+            opts = grabber.opts.derive(**mirrorchoice.get('kwargs', {}))
             func_ref = getattr(grabber, func)
-            if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl)
+            if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', _bytes_repr(url), _bytes_repr(fullurl))
             try:
-                return func_ref( *(fullurl,), **kwargs )
-            except URLGrabError, e:
+                return func_ref( *(fullurl,), opts=opts, **kw )
+            except URLGrabError as e:
                 if DEBUG: DEBUG.info('MIRROR: failed')
+                gr.errors.append((fullurl, exception2msg(e)))
                 obj = CallbackObject()
                 obj.exception = e
                 obj.mirror = mirrorchoice['mirror']
                 obj.relative_url = gr.url
                 obj.url = fullurl
+                obj.tries = tries
                 self._failure(gr, obj)
 
     def urlgrab(self, url, filename=None, **kwargs):
         kw = dict(kwargs)
         kw['filename'] = filename
+        if kw.get('async_') or kw.get('async'):
+            # enable mirror failovers in async path
+            kw['mirror_group'] = self, [], {}, set()
+            kw['relative_url'] = url
+        else:
+            kw.pop('failfunc', None)
         func = 'urlgrab'
-        return self._mirror_try(func, url, kw)
-    
+        try:
+            return self._mirror_try(func, url, kw)
+        except URLGrabError as e:
+            obj = CallbackObject(url=url, filename=filename, exception=e, **kwargs)
+            return _run_callback(kwargs.get('failfunc', _do_raise), obj)
+
     def urlopen(self, url, **kwargs):
         kw = dict(kwargs)
         func = 'urlopen'
@@ -417,7 +479,7 @@ class MirrorGroup:
         kw['limit'] = limit
         func = 'urlread'
         return self._mirror_try(func, url, kw)
-            
+
 
 class MGRandomStart(MirrorGroup):
     """A mirror group that starts at a random mirror in the list.
author	biao716.wang <biao716.wang@samsung.com>	2020-08-26 11:34:58 +0900
committer	biao716.wang <biao716.wang@samsung.com>	2020-08-26 11:34:58 +0900
commit	a5651e772478cb72e7fd02e1d60bbe7a509a4d50 (patch)
tree	dd6921b15d6285d79e2e42dcbbd132ac066b19d1 /urlgrabber/mirror.py
parent	136d1e028cec5dcddc6ef6ac7302c794fda5f135 (diff)
download	python-urlgrabber-debian/4.1.0.tar.gz python-urlgrabber-debian/4.1.0.tar.bz2 python-urlgrabber-debian/4.1.0.zip