diff options
author | biao716.wang <biao716.wang@samsung.com> | 2020-08-26 11:34:58 +0900 |
---|---|---|
committer | biao716.wang <biao716.wang@samsung.com> | 2020-08-26 11:34:58 +0900 |
commit | a5651e772478cb72e7fd02e1d60bbe7a509a4d50 (patch) | |
tree | dd6921b15d6285d79e2e42dcbbd132ac066b19d1 /urlgrabber/mirror.py | |
parent | 136d1e028cec5dcddc6ef6ac7302c794fda5f135 (diff) | |
download | python-urlgrabber-debian/4.1.0.tar.gz python-urlgrabber-debian/4.1.0.tar.bz2 python-urlgrabber-debian/4.1.0.zip |
Port to Python3 (ref: debian/4.1.0)
Change-Id: I46b8f71dce3d1f009617aa6e969414ad0c6393a6
Signed-off-by: biao716.wang <biao716.wang@samsung.com>
Diffstat (limited to 'urlgrabber/mirror.py')
-rwxr-xr-x[-rw-r--r--] | urlgrabber/mirror.py | 132 |
1 files changed, 97 insertions, 35 deletions
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py index dad410b..d95863e 100644..100755 --- a/urlgrabber/mirror.py +++ b/urlgrabber/mirror.py @@ -9,9 +9,9 @@ # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public -# License along with this library; if not, write to the -# Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, +# License along with this library; if not, write to the +# Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, # Boston, MA 02111-1307 USA # This file is part of urlgrabber, a high-level cross-protocol url-grabber @@ -76,6 +76,10 @@ CUSTOMIZATION 'grabber' is omitted, the default grabber will be used. If kwargs are omitted, then (duh) they will not be used. + kwarg 'max_connections' limits the number of concurrent + connections to this mirror. When omitted or set to zero, + the default limit (2) will be used. + 3) Pass keyword arguments when instantiating the mirror group. See, for example, the failure_callback argument. @@ -87,12 +91,29 @@ CUSTOMIZATION """ +import sys import random -import thread # needed for locking to make this threadsafe -from grabber import URLGrabError, CallbackObject, DEBUG +if sys.version_info >= (3,): + # We use a version check because python2 also has _thread + import _thread as thread +else: + import thread + +try: + import urllib.parse as urlparse +except ImportError: + import urlparse + +from six import string_types -def _(st): +from .grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8 +from .grabber import _run_callback, _do_raise +from .grabber import exception2msg +from .grabber import _TH +from .grabber import _bytes_repr + +def _(st): return st class GrabRequest: @@ -126,13 +147,15 @@ class MirrorGroup: files) * if the local list is ever exhausted, a URLGrabError will be - raised (errno=256, no more mirrors) + raised (errno=256, No more mirrors). 
The 'errors' attribute + holds a list of (full_url, errmsg) tuples. This contains + all URLs tried and the corresponding error messages. OPTIONS In addition to the required arguments "grabber" and "mirrors", MirrorGroup also takes the following optional arguments: - + default_action A dict that describes the actions to be taken upon failure @@ -153,7 +176,8 @@ class MirrorGroup: The 'fail' option will cause immediate failure by re-raising the exception and no further attempts to get the current - download. + download. As in the "No more mirrors" case, the 'errors' + attribute is set in the exception object. This dict can be set at instantiation time, mg = MirrorGroup(grabber, mirrors, default_action={'fail':1}) @@ -162,7 +186,7 @@ class MirrorGroup: or by returning an action dict from the failure_callback return {'fail':0} in increasing precedence. - + If all three of these were done, the net result would be: {'increment': 0, # set in method 'increment_master': 1, # class default @@ -180,10 +204,11 @@ class MirrorGroup: etc). Otherwise, it is assumed to be the callable object itself. The callback will be passed a grabber.CallbackObject instance along with args and kwargs (if present). The following - attributes are defined withing the instance: + attributes are defined within the instance: obj.exception = < exception that was raised > obj.mirror = < the mirror that was tried > + obj.tries = < the number of mirror tries so far > obj.relative_url = < url relative to the mirror > obj.url = < full url that failed > # .url is just the combination of .mirror @@ -251,22 +276,34 @@ class MirrorGroup: self.default_action = None self._process_kwargs(kwargs) + # use the same algorithm as parallel downloader to initially sort + # the mirror list (sort by speed, but prefer live private mirrors) + def estimate(m): + speed, fail = _TH.estimate(m['mirror']) + private = not fail and m.get('kwargs', {}).get('private', False) + return private, speed + + # update the initial order. 
since sorting is stable, the relative + # order of unknown (not used yet) hosts is retained. + self.mirrors.sort(key=estimate, reverse=True) + # if these values are found in **kwargs passed to one of the urlXXX # methods, they will be stripped before getting passed on to the # grabber options = ['default_action', 'failure_callback'] - + def _process_kwargs(self, kwargs): self.failure_callback = kwargs.get('failure_callback') self.default_action = kwargs.get('default_action') - + def _parse_mirrors(self, mirrors): parsed_mirrors = [] for m in mirrors: - if type(m) == type(''): m = {'mirror': m} + if isinstance(m, string_types): + m = {'mirror': _to_utf8(m)} parsed_mirrors.append(m) return parsed_mirrors - + def _load_gr(self, gr): # OVERRIDE IDEAS: # shuffle gr list @@ -280,7 +317,9 @@ class MirrorGroup: # return a random mirror so that multiple mirrors get used # even without failures. if not gr.mirrors: - raise URLGrabError(256, _('No more mirrors to try.')) + e = URLGrabError(256, _('No more mirrors to try.')) + e.errors = gr.errors + raise e return gr.mirrors[gr._next] def _failure(self, gr, cb_obj): @@ -290,7 +329,7 @@ class MirrorGroup: # the callback) cb = gr.kw.get('failure_callback') or self.failure_callback if cb: - if type(cb) == type( () ): + if isinstance(cb, tuple): cb, args, kwargs = cb else: args, kwargs = (), {} @@ -307,7 +346,9 @@ class MirrorGroup: a.update(action) action = a self.increment_mirror(gr, action) - if action and action.get('fail', 0): raise + if action and action.get('fail', 0): + sys.exc_info()[1].errors = gr.errors + raise def increment_mirror(self, gr, action={}): """Tell the mirror object increment the mirror index @@ -323,7 +364,7 @@ class MirrorGroup: urlopen, there's no good way for the mirror group to know that an error occurs mid-download (it's already returned and given you the file object). 
- + remove --- can have several values 0 do not remove the mirror from the list 1 remove the mirror for this download only @@ -345,7 +386,7 @@ class MirrorGroup: self._next += 1 if self._next >= len(self.mirrors): self._next = 0 self._lock.release() - + if action.get('remove', 1): del gr.mirrors[gr._next] elif action.get('increment', 1): @@ -353,9 +394,9 @@ class MirrorGroup: if gr._next >= len(gr.mirrors): gr._next = 0 if DEBUG: - grm = [m['mirror'] for m in gr.mirrors] + grm = [m['mirror'].decode() for m in gr.mirrors] DEBUG.info('GR mirrors: [%s] %i', ' '.join(grm), gr._next) - selfm = [m['mirror'] for m in self.mirrors] + selfm = [m['mirror'].decode() for m in self.mirrors] DEBUG.info('MAIN mirrors: [%s] %i', ' '.join(selfm), self._next) ##################################################################### @@ -366,47 +407,68 @@ class MirrorGroup: # by overriding the configuration methods :) def _join_url(self, base_url, rel_url): - if base_url.endswith('/') or rel_url.startswith('/'): - return base_url + rel_url + (scheme, netloc, path, query, fragid) = urlparse.urlsplit(base_url) + + if isinstance(base_url, bytes): + if not isinstance(rel_url, bytes): + rel_url = rel_url.encode('utf8') + sep = b'' if path.endswith(b'/') or rel_url.startswith(b'/') else b'/' else: - return base_url + '/' + rel_url - + sep = '' if path.endswith('/') or rel_url.startswith('/') else '/' + + return urlparse.urlunsplit((scheme, netloc, path + sep + rel_url, query, fragid)) + def _mirror_try(self, func, url, kw): gr = GrabRequest() gr.func = func gr.url = url gr.kw = dict(kw) self._load_gr(gr) + gr.errors = [] for k in self.options: try: del kw[k] except KeyError: pass - while 1: + tries = 0 + while True: + tries += 1 mirrorchoice = self._get_mirror(gr) fullurl = self._join_url(mirrorchoice['mirror'], gr.url) - kwargs = dict(mirrorchoice.get('kwargs', {})) - kwargs.update(kw) grabber = mirrorchoice.get('grabber') or self.grabber + # apply mirrorchoice kwargs on top of grabber.opts + 
opts = grabber.opts.derive(**mirrorchoice.get('kwargs', {})) func_ref = getattr(grabber, func) - if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl) + if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', _bytes_repr(url), _bytes_repr(fullurl)) try: - return func_ref( *(fullurl,), **kwargs ) - except URLGrabError, e: + return func_ref( *(fullurl,), opts=opts, **kw ) + except URLGrabError as e: if DEBUG: DEBUG.info('MIRROR: failed') + gr.errors.append((fullurl, exception2msg(e))) obj = CallbackObject() obj.exception = e obj.mirror = mirrorchoice['mirror'] obj.relative_url = gr.url obj.url = fullurl + obj.tries = tries self._failure(gr, obj) def urlgrab(self, url, filename=None, **kwargs): kw = dict(kwargs) kw['filename'] = filename + if kw.get('async_') or kw.get('async'): + # enable mirror failovers in async path + kw['mirror_group'] = self, [], {}, set() + kw['relative_url'] = url + else: + kw.pop('failfunc', None) func = 'urlgrab' - return self._mirror_try(func, url, kw) - + try: + return self._mirror_try(func, url, kw) + except URLGrabError as e: + obj = CallbackObject(url=url, filename=filename, exception=e, **kwargs) + return _run_callback(kwargs.get('failfunc', _do_raise), obj) + def urlopen(self, url, **kwargs): kw = dict(kwargs) func = 'urlopen' @@ -417,7 +479,7 @@ class MirrorGroup: kw['limit'] = limit func = 'urlread' return self._mirror_try(func, url, kw) - + class MGRandomStart(MirrorGroup): """A mirror group that starts at a random mirror in the list. |