Diffstat (limited to 'tsp/scripts/crawler.py')
 -rwxr-xr-x  tsp/scripts/crawler.py  63
 1 file changed, 38 insertions(+), 25 deletions(-)
diff --git a/tsp/scripts/crawler.py b/tsp/scripts/crawler.py
index 59429e2..66cf0ec 100755
--- a/tsp/scripts/crawler.py
+++ b/tsp/scripts/crawler.py
@@ -21,40 +21,36 @@
 import os
 import urllib2
 import time
+import argparse
 
 import bs4
 
 discovered_urls = 'modified_urls'
 dispatched_urls = 'dispatched_urls'
 
-root = os.environ.get('ROOT', 'http://download.tizen.org/prerelease/tizen/common/')
-seeds = (root,)
-
-
-def crawl(seeds):
+def crawl(url):
     visited = set()
-    for url in seeds:
-        visited.add(url)
+    visited.add(url)
 
-        h = urllib2.build_opener()
-        h.addheaders = [('User-agent', 'Prerelease Crawler')]
+    h = urllib2.build_opener()
+    h.addheaders = [('User-agent', 'Prerelease Crawler')]
 
-        try:
-            resp = h.open(url)
-        except urllib2.HTTPError as e:
-            print 'Failed to access {url}: {code} - {reason}'\
-                .format(url=url, code=e.code, reason=e.reason)
+    try:
+        resp = h.open(url)
+    except urllib2.HTTPError as e:
+        print 'Failed to access {url}: {code} - {reason}'\
+            .format(url=url, code=e.code, reason=e.reason)
 
-        html = str(resp.read())
-        soup = bs4.BeautifulSoup(html, 'lxml')
-        links = soup('a')
+    html = str(resp.read())
+    soup = bs4.BeautifulSoup(html, 'lxml')
+    links = soup('a')
 
-        discovered = set()
-        for link in links:
-            if link not in discovered and link not in visited and link not in seeds:
-                if link.string.startswith('tizen-common'):
-                    discovered.add(url + link['href'])
+    discovered = set()
+    for link in links:
+        if link not in discovered and link not in visited:
+            if link.string.startswith('tizen-common'):
+                discovered.add(url + link['href'])
 
     return discovered
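
One behavior the rewrite carries over from the old crawl(): when h.open(url) raises urllib2.HTTPError, the except branch only prints the failure, so control falls through to resp.read() with resp unbound and the function dies with a NameError. A minimal sketch of a guard, not part of this commit:

    try:
        resp = h.open(url)
    except urllib2.HTTPError as e:
        print 'Failed to access {url}: {code} - {reason}'\
            .format(url=url, code=e.code, reason=e.reason)
        return set()  # hypothetical early return; nothing below can run without resp
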
@@ -81,17 +77,34 @@ def get_modified_paths(discovered, timestamp):
             break
     return ret
 
+def parse_arguments():
+    """parse_arguments() -> args
+
+    Parse the command-line options given, returning
+    the parsed arguments.
+    """
+
+    parser = argparse.ArgumentParser(description="Crawler for download.tizen.org")
+
+    parser.add_argument("url", type=str,
+                        help='URL of prerelease or snapshot to crawl.')
+
+    args = parser.parse_args()
+
+    return args
+
 
 if '__main__' == __name__:
-    snapshots = crawl(seeds)
+    args = parse_arguments()
+    snapshots = crawl(args.url)
     timestamp_file = 'timestamp'
-    if "snapshots" in root:
+    if "snapshots" in args.url:
         timestamp_file = 'timestamp_snapshot'
         discovered = snapshots
     else:
         discovered = set()
         for snapshot in snapshots:
-            discovered |= crawl((snapshot,))
+            discovered |= crawl(snapshot)
 
     if os.path.exists(dispatched_urls):
         with open(dispatched_urls, 'r') as f:
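
With ROOT and the seeds tuple gone, the crawler now takes its starting URL as a positional argument. A usage sketch, reusing the old default URL for illustration:

    $ python crawler.py http://download.tizen.org/prerelease/tizen/common/

Passing a URL that contains "snapshots" makes the script record timestamps in timestamp_snapshot and treat the first crawl's results as the final discovery set, skipping the per-snapshot second pass.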