diff options
author | Aleksander Mistewicz <a.mistewicz@samsung.com> | 2016-07-07 13:51:40 +0200 |
---|---|---|
committer | Aleksander Mistewicz <a.mistewicz@samsung.com> | 2016-10-07 17:29:30 +0200 |
commit | fc48025902c14cf59d0ab43010aa49e703824b0e (patch) | |
tree | cb64a70f970e96e92cd80678c534ad29854969e1 /tsp | |
parent | 77f8c41a1eecb1da3be3fed9e7dabf04d2dc73f9 (diff) | |
download | major-fc48025902c14cf59d0ab43010aa49e703824b0e.tar.gz major-fc48025902c14cf59d0ab43010aa49e703824b0e.tar.bz2 major-fc48025902c14cf59d0ab43010aa49e703824b0e.zip |
Replace env variable with argparse in crawler.py
Change-Id: I4a299d124df4692670d46726177073980aefdc88
Signed-off-by: Aleksander Mistewicz <a.mistewicz@samsung.com>
Diffstat (limited to 'tsp')
-rwxr-xr-x | tsp/jobs/watcher.sh | 10 |
-rwxr-xr-x | tsp/scripts/crawler.py | 63 |
2 files changed, 42 insertions, 31 deletions
diff --git a/tsp/jobs/watcher.sh b/tsp/jobs/watcher.sh index b7160e4..47884cb 100755 --- a/tsp/jobs/watcher.sh +++ b/tsp/jobs/watcher.sh @@ -25,9 +25,7 @@ cd "${WS_WATCHER}" touch dispatched_urls touch timestamp touch timestamp_snapshot -nr=$(tspoll -L PRERELEASE_WATCHER timeout 120 sh -c "ROOT=\"http://download.tizen.org/prerelease/tizen/common/\" \ - ${TSP_DIR}/scripts/crawler.py") -nr=$(tspoll -D $nr -L DOWNLOAD_TRIGGER sh -c "${TSP_DIR}/jobs/trigger_downloads.sh") -nr=$(tspoll -D $nr -L SNAPSHOT_WATCHER timeout 120 sh -c "ROOT=\"http://download.tizen.org/snapshots/tizen/common/\" \ - ${TSP_DIR}/scripts/crawler.py") -tspoll -D $nr -L DOWNLOAD_TRIGGER sh -c "${TSP_DIR}/jobs/trigger_downloads.sh" +nr=$(tspoll -L PRERELEASE_WATCHER timeout 120 "${TSP_DIR}/scripts/crawler.py" "http://download.tizen.org/prerelease/tizen/common/") +nr=$(tspoll -D $nr -L DOWNLOAD_TRIGGER sh "${TSP_DIR}/jobs/trigger_downloads.sh") +nr=$(tspoll -D $nr -L SNAPSHOT_WATCHER timeout 120 "${TSP_DIR}/scripts/crawler.py" "http://download.tizen.org/snapshots/tizen/common/") +tspoll -D $nr -L DOWNLOAD_TRIGGER sh "${TSP_DIR}/jobs/trigger_downloads.sh" diff --git a/tsp/scripts/crawler.py b/tsp/scripts/crawler.py index 59429e2..66cf0ec 100755 --- a/tsp/scripts/crawler.py +++ b/tsp/scripts/crawler.py @@ -21,40 +21,36 @@ import os import urllib2 import time +import argparse import bs4 discovered_urls = 'modified_urls' dispatched_urls = 'dispatched_urls' -root = os.environ.get('ROOT', 'http://download.tizen.org/prerelease/tizen/common/') -seeds = (root,) - - -def crawl(seeds): +def crawl(url): visited = set() - for url in seeds: - visited.add(url) + visited.add(url) - h = urllib2.build_opener() - h.addheaders = [('User-agent', 'Prerelease Crawler')] + h = urllib2.build_opener() + h.addheaders = [('User-agent', 'Prerelease Crawler')] - try: - resp = h.open(url) - except urllib2.HTTPError as e: - print 'Failed to access {url}: {code} - {reason}'\ - .format(url=url, code=e.code, reason=e.reason) + try: + 
resp = h.open(url) + except urllib2.HTTPError as e: + print 'Failed to access {url}: {code} - {reason}'\ + .format(url=url, code=e.code, reason=e.reason) - html = str(resp.read()) - soup = bs4.BeautifulSoup(html, 'lxml') - links = soup('a') + html = str(resp.read()) + soup = bs4.BeautifulSoup(html, 'lxml') + links = soup('a') - discovered = set() - for link in links: - if link not in discovered and link not in visited and link not in seeds: - if link.string.startswith('tizen-common'): - discovered.add(url + link['href']) + discovered = set() + for link in links: + if link not in discovered and link not in visited: + if link.string.startswith('tizen-common'): + discovered.add(url + link['href']) return discovered @@ -81,17 +77,34 @@ def get_modified_paths(discovered, timestamp): break return ret +def parse_arguments(): + """parse_arguments() -> args + + Parse any command-line options given returning both + the parsed options and arguments. + """ + + parser = argparse.ArgumentParser(description="Crawler for download.tizen.org") + + parser.add_argument("url", type=str, + help='URL of prerelease or snapshot to crawl.') + + args = parser.parse_args() + + return args + if '__main__' == __name__: - snapshots = crawl(seeds) + args = parse_arguments() + snapshots = crawl(args.url) timestamp_file = 'timestamp' - if "snapshots" in root: + if "snapshots" in args.url: timestamp_file = 'timestamp_snapshot' discovered = snapshots else: discovered = set() for snapshot in snapshots: - discovered |= crawl((snapshot,)) + discovered |= crawl(snapshot) if os.path.exists(dispatched_urls): with open(dispatched_urls, 'r') as f: |