test/threading/batchgrabber.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110

#   This library is free software; you can redistribute it and/or
#   modify it under the terms of the GNU Lesser General Public
#   License as published by the Free Software Foundation; either
#   version 2.1 of the License, or (at your option) any later version.
#
#   This library is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#   Lesser General Public License for more details.
#
#   You should have received a copy of the GNU Lesser General Public
#   License along with this library; if not, write to the 
#      Free Software Foundation, Inc., 
#      59 Temple Place, Suite 330, 
#      Boston, MA  02111-1307  USA

# This file is part of urlgrabber, a high-level cross-protocol url-grabber
# Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko

"""Module for testing urlgrabber under multiple threads.

This module can be used from the command line. Each argument is 
a URL to grab.

The BatchURLGrabber class has an interface similar to URLGrabber 
but instead of pulling files when urlgrab is called, the request
is queued. Calling BatchURLGrabber.batchgrab causes all files to
be pulled in multiple threads.

"""

import os.path, sys
if __name__ == '__main__':
  # When run as a script, echo the directory the script was started from
  # and put the directory two levels above it at the front of sys.path
  # (presumably so the in-tree urlgrabber package wins over an installed
  # one -- this must happen before the urlgrabber imports that follow).
  script_dir = os.path.dirname(sys.argv[0])
  print(script_dir)
  sys.path.insert(0, (script_dir or '.') + '/../..')

from threading import Thread, Semaphore
from urlgrabber.grabber import URLGrabber, URLGrabError
from urlgrabber.progress import MultiFileMeter, TextMultiFileMeter
from time import sleep, time

DEBUG=0

class BatchURLGrabber:
  """Queue URL fetches and run them concurrently on worker threads.

  urlgrab() only records a request; nothing is downloaded until
  batchgrab() is called, which starts one Worker thread per request
  (at most `maxthreads` at a time) and blocks until all have finished.
  """

  def __init__(self, maxthreads=5, **kwargs):
    """Create an empty batch.

    maxthreads -- upper bound on the number of workers running at once.
    kwargs     -- passed through unchanged to the shared URLGrabber.
    """
    # Honour the caller's limit; it used to be overwritten with a literal 5.
    self.maxthreads = maxthreads
    self.grabber = URLGrabber(**kwargs)
    self.queue = []    # pending (url, filename, kwargs) tuples, oldest first
    self.threads = []  # workers that were started and not yet reaped
    self.sem = Semaphore()  # not used inside this class; kept for callers

  def urlgrab(self, url, filename=None, **kwargs):
    """Queue one request; it is fetched by a later batchgrab() call.

    The extra keyword arguments are stored with the request and handed
    to the grabber's urlgrab() by the worker that processes it.
    """
    self.queue.append( (url, filename, kwargs) )

  def batchgrab(self):
    """Process every queued request and return once all workers are done.

    If the grabber's progress object has a `start` method it is called
    first with the number of queued requests.  Requests are started in
    queue order; while the thread limit is reached (or the queue is
    empty) the method reaps finished workers and polls every 0.2 seconds.
    """
    progress_obj = self.grabber.opts.progress_obj
    if hasattr(progress_obj, 'start'):
      progress_obj.start(len(self.queue))
    # A limit below 1 would never start a worker and the loop below
    # would spin forever, so always allow at least one thread.
    limit = max(1, self.maxthreads)
    while self.queue or self.threads:
      if self.queue and len(self.threads) < limit:
        url, filename, kwargs = self.queue.pop(0)
        thread = Worker(self, url, filename, kwargs)
        self.threads.append(thread)
        if DEBUG: print("starting worker: " + url)
        thread.start()
      else:
        self._reap_finished()
        sleep(0.2)

  def _reap_finished(self):
    """Remove every worker whose thread has exited from self.threads."""
    # Build the survivor list instead of calling remove() while iterating:
    # removing during iteration skips the element after each removal, so
    # finished workers could linger for extra polling rounds.
    still_running = []
    for t in self.threads:
      if t.isAlive():
        still_running.append(t)
      elif DEBUG:
        print("cleaning up worker: " + t.url)
    self.threads[:] = still_running
        
class Worker(Thread):
  """Thread that fetches a single queued request via the parent's grabber."""

  def __init__(self, parent, url, filename, kwargs):
    """Store the request; the download happens in run().

    parent   -- object whose `grabber` attribute performs the fetch.
    url      -- URL to fetch.
    filename -- passed to urlgrab() as the second positional argument.
    kwargs   -- dict of extra keyword arguments for urlgrab().
    """
    Thread.__init__(self)
    self.parent, self.url = parent, url
    self.filename, self.kwargs = filename, kwargs

  def run(self):
    """Fetch the URL, printing (not raising) any URLGrabError."""
    if DEBUG: print("worker thread started.")
    meter = self.parent.grabber.opts.progress_obj
    # With a multi-file meter, each worker gets a meter of its own,
    # passed to urlgrab() through the stored keyword arguments.
    if isinstance(meter, MultiFileMeter):
      self.kwargs['progress_obj'] = meter.newMeter()
    try:
      self.parent.grabber.urlgrab(self.url, self.filename, **self.kwargs)
    except URLGrabError:
      # sys.exc_info() yields the active exception instance and is
      # spelled the same way on old and new interpreters.
      err = sys.exc_info()[1]
      print('%s, %s' % (err, self.url))
      
def main():
  """Queue every command-line argument as a URL and fetch them in one batch.

  Exits with status 1 if the batch is interrupted from the keyboard.
  """
  meter = None
  # Uncomment to experiment with the multi-file text progress meter
  # (noted by the original author as not working right now):
  # meter = TextMultiFileMeter()
  batch = BatchURLGrabber(keepalive=1, progress_obj=meter)
  for url in sys.argv[1:]:
    batch.urlgrab(url)

  if DEBUG: print("before batchgrab")
  try:
    batch.batchgrab()
  except KeyboardInterrupt:
    sys.exit(1)

  if DEBUG: print("after batchgrab")
  
# Run the batch download only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
  main()