summary | refs | log | tree | commit | diff
path: root/urlgrabber/mirror.py
diff options
context:
space:
mode:
author: biao716.wang <biao716.wang@samsung.com> 2020-08-26 11:34:58 +0900
committer: biao716.wang <biao716.wang@samsung.com> 2020-08-26 11:34:58 +0900
commit: a5651e772478cb72e7fd02e1d60bbe7a509a4d50 (patch)
tree: dd6921b15d6285d79e2e42dcbbd132ac066b19d1 /urlgrabber/mirror.py
parent: 136d1e028cec5dcddc6ef6ac7302c794fda5f135 (diff)
download: python-urlgrabber-debian/4.1.0.tar.gz
python-urlgrabber-debian/4.1.0.tar.bz2
python-urlgrabber-debian/4.1.0.zip
Port to Python3 (ref: debian/4.1.0)

Change-Id: I46b8f71dce3d1f009617aa6e969414ad0c6393a6
Signed-off-by: biao716.wang <biao716.wang@samsung.com>
Diffstat (limited to 'urlgrabber/mirror.py')
-rwxr-xr-x [-rw-r--r--] urlgrabber/mirror.py | 132
1 files changed, 97 insertions, 35 deletions
diff --git a/urlgrabber/mirror.py b/urlgrabber/mirror.py
index dad410b..d95863e 100644..100755
--- a/urlgrabber/mirror.py
+++ b/urlgrabber/mirror.py
@@ -9,9 +9,9 @@
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc.,
-# 59 Temple Place, Suite 330,
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330,
# Boston, MA 02111-1307 USA
# This file is part of urlgrabber, a high-level cross-protocol url-grabber
@@ -76,6 +76,10 @@ CUSTOMIZATION
'grabber' is omitted, the default grabber will be used. If
kwargs are omitted, then (duh) they will not be used.
+ kwarg 'max_connections' limits the number of concurrent
+ connections to this mirror. When omitted or set to zero,
+ the default limit (2) will be used.
+
3) Pass keyword arguments when instantiating the mirror group.
See, for example, the failure_callback argument.
@@ -87,12 +91,29 @@ CUSTOMIZATION
"""
+import sys
import random
-import thread # needed for locking to make this threadsafe
-from grabber import URLGrabError, CallbackObject, DEBUG
+if sys.version_info >= (3,):
+ # We use a version check because python2 also has _thread
+ import _thread as thread
+else:
+ import thread
+
+try:
+ import urllib.parse as urlparse
+except ImportError:
+ import urlparse
+
+from six import string_types
-def _(st):
+from .grabber import URLGrabError, CallbackObject, DEBUG, _to_utf8
+from .grabber import _run_callback, _do_raise
+from .grabber import exception2msg
+from .grabber import _TH
+from .grabber import _bytes_repr
+
+def _(st):
return st
class GrabRequest:
@@ -126,13 +147,15 @@ class MirrorGroup:
files)
* if the local list is ever exhausted, a URLGrabError will be
- raised (errno=256, no more mirrors)
+ raised (errno=256, No more mirrors). The 'errors' attribute
+ holds a list of (full_url, errmsg) tuples. This contains
+ all URLs tried and the corresponding error messages.
OPTIONS
In addition to the required arguments "grabber" and "mirrors",
MirrorGroup also takes the following optional arguments:
-
+
default_action
A dict that describes the actions to be taken upon failure
@@ -153,7 +176,8 @@ class MirrorGroup:
The 'fail' option will cause immediate failure by re-raising
the exception and no further attempts to get the current
- download.
+ download. As in the "No more mirrors" case, the 'errors'
+ attribute is set in the exception object.
This dict can be set at instantiation time,
mg = MirrorGroup(grabber, mirrors, default_action={'fail':1})
@@ -162,7 +186,7 @@ class MirrorGroup:
or by returning an action dict from the failure_callback
return {'fail':0}
in increasing precedence.
-
+
If all three of these were done, the net result would be:
{'increment': 0, # set in method
'increment_master': 1, # class default
@@ -180,10 +204,11 @@ class MirrorGroup:
etc). Otherwise, it is assumed to be the callable object
itself. The callback will be passed a grabber.CallbackObject
instance along with args and kwargs (if present). The following
- attributes are defined withing the instance:
+ attributes are defined within the instance:
obj.exception = < exception that was raised >
obj.mirror = < the mirror that was tried >
+ obj.tries = < the number of mirror tries so far >
obj.relative_url = < url relative to the mirror >
obj.url = < full url that failed >
# .url is just the combination of .mirror
@@ -251,22 +276,34 @@ class MirrorGroup:
self.default_action = None
self._process_kwargs(kwargs)
+ # use the same algorithm as parallel downloader to initially sort
+ # the mirror list (sort by speed, but prefer live private mirrors)
+ def estimate(m):
+ speed, fail = _TH.estimate(m['mirror'])
+ private = not fail and m.get('kwargs', {}).get('private', False)
+ return private, speed
+
+ # update the initial order. since sorting is stable, the relative
+ # order of unknown (not used yet) hosts is retained.
+ self.mirrors.sort(key=estimate, reverse=True)
+
# if these values are found in **kwargs passed to one of the urlXXX
# methods, they will be stripped before getting passed on to the
# grabber
options = ['default_action', 'failure_callback']
-
+
def _process_kwargs(self, kwargs):
self.failure_callback = kwargs.get('failure_callback')
self.default_action = kwargs.get('default_action')
-
+
def _parse_mirrors(self, mirrors):
parsed_mirrors = []
for m in mirrors:
- if type(m) == type(''): m = {'mirror': m}
+ if isinstance(m, string_types):
+ m = {'mirror': _to_utf8(m)}
parsed_mirrors.append(m)
return parsed_mirrors
-
+
def _load_gr(self, gr):
# OVERRIDE IDEAS:
# shuffle gr list
@@ -280,7 +317,9 @@ class MirrorGroup:
# return a random mirror so that multiple mirrors get used
# even without failures.
if not gr.mirrors:
- raise URLGrabError(256, _('No more mirrors to try.'))
+ e = URLGrabError(256, _('No more mirrors to try.'))
+ e.errors = gr.errors
+ raise e
return gr.mirrors[gr._next]
def _failure(self, gr, cb_obj):
@@ -290,7 +329,7 @@ class MirrorGroup:
# the callback)
cb = gr.kw.get('failure_callback') or self.failure_callback
if cb:
- if type(cb) == type( () ):
+ if isinstance(cb, tuple):
cb, args, kwargs = cb
else:
args, kwargs = (), {}
@@ -307,7 +346,9 @@ class MirrorGroup:
a.update(action)
action = a
self.increment_mirror(gr, action)
- if action and action.get('fail', 0): raise
+ if action and action.get('fail', 0):
+ sys.exc_info()[1].errors = gr.errors
+ raise
def increment_mirror(self, gr, action={}):
"""Tell the mirror object increment the mirror index
@@ -323,7 +364,7 @@ class MirrorGroup:
urlopen, there's no good way for the mirror group to know that
an error occurs mid-download (it's already returned and given
you the file object).
-
+
remove --- can have several values
0 do not remove the mirror from the list
1 remove the mirror for this download only
@@ -345,7 +386,7 @@ class MirrorGroup:
self._next += 1
if self._next >= len(self.mirrors): self._next = 0
self._lock.release()
-
+
if action.get('remove', 1):
del gr.mirrors[gr._next]
elif action.get('increment', 1):
@@ -353,9 +394,9 @@ class MirrorGroup:
if gr._next >= len(gr.mirrors): gr._next = 0
if DEBUG:
- grm = [m['mirror'] for m in gr.mirrors]
+ grm = [m['mirror'].decode() for m in gr.mirrors]
DEBUG.info('GR mirrors: [%s] %i', ' '.join(grm), gr._next)
- selfm = [m['mirror'] for m in self.mirrors]
+ selfm = [m['mirror'].decode() for m in self.mirrors]
DEBUG.info('MAIN mirrors: [%s] %i', ' '.join(selfm), self._next)
#####################################################################
@@ -366,47 +407,68 @@ class MirrorGroup:
# by overriding the configuration methods :)
def _join_url(self, base_url, rel_url):
- if base_url.endswith('/') or rel_url.startswith('/'):
- return base_url + rel_url
+ (scheme, netloc, path, query, fragid) = urlparse.urlsplit(base_url)
+
+ if isinstance(base_url, bytes):
+ if not isinstance(rel_url, bytes):
+ rel_url = rel_url.encode('utf8')
+ sep = b'' if path.endswith(b'/') or rel_url.startswith(b'/') else b'/'
else:
- return base_url + '/' + rel_url
-
+ sep = '' if path.endswith('/') or rel_url.startswith('/') else '/'
+
+ return urlparse.urlunsplit((scheme, netloc, path + sep + rel_url, query, fragid))
+
def _mirror_try(self, func, url, kw):
gr = GrabRequest()
gr.func = func
gr.url = url
gr.kw = dict(kw)
self._load_gr(gr)
+ gr.errors = []
for k in self.options:
try: del kw[k]
except KeyError: pass
- while 1:
+ tries = 0
+ while True:
+ tries += 1
mirrorchoice = self._get_mirror(gr)
fullurl = self._join_url(mirrorchoice['mirror'], gr.url)
- kwargs = dict(mirrorchoice.get('kwargs', {}))
- kwargs.update(kw)
grabber = mirrorchoice.get('grabber') or self.grabber
+ # apply mirrorchoice kwargs on top of grabber.opts
+ opts = grabber.opts.derive(**mirrorchoice.get('kwargs', {}))
func_ref = getattr(grabber, func)
- if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', url, fullurl)
+ if DEBUG: DEBUG.info('MIRROR: trying %s -> %s', _bytes_repr(url), _bytes_repr(fullurl))
try:
- return func_ref( *(fullurl,), **kwargs )
- except URLGrabError, e:
+ return func_ref( *(fullurl,), opts=opts, **kw )
+ except URLGrabError as e:
if DEBUG: DEBUG.info('MIRROR: failed')
+ gr.errors.append((fullurl, exception2msg(e)))
obj = CallbackObject()
obj.exception = e
obj.mirror = mirrorchoice['mirror']
obj.relative_url = gr.url
obj.url = fullurl
+ obj.tries = tries
self._failure(gr, obj)
def urlgrab(self, url, filename=None, **kwargs):
kw = dict(kwargs)
kw['filename'] = filename
+ if kw.get('async_') or kw.get('async'):
+ # enable mirror failovers in async path
+ kw['mirror_group'] = self, [], {}, set()
+ kw['relative_url'] = url
+ else:
+ kw.pop('failfunc', None)
func = 'urlgrab'
- return self._mirror_try(func, url, kw)
-
+ try:
+ return self._mirror_try(func, url, kw)
+ except URLGrabError as e:
+ obj = CallbackObject(url=url, filename=filename, exception=e, **kwargs)
+ return _run_callback(kwargs.get('failfunc', _do_raise), obj)
+
def urlopen(self, url, **kwargs):
kw = dict(kwargs)
func = 'urlopen'
@@ -417,7 +479,7 @@ class MirrorGroup:
kw['limit'] = limit
func = 'urlread'
return self._mirror_try(func, url, kw)
-
+
class MGRandomStart(MirrorGroup):
"""A mirror group that starts at a random mirror in the list.