diff options
Diffstat (limited to 'tsp/scripts/crawler.py')
-rwxr-xr-x | tsp/scripts/crawler.py | 43 |
1 files changed, 21 insertions, 22 deletions
diff --git a/tsp/scripts/crawler.py b/tsp/scripts/crawler.py index 59436c8..e2e7d81 100755 --- a/tsp/scripts/crawler.py +++ b/tsp/scripts/crawler.py @@ -19,7 +19,7 @@ # @author Pawel Wieczorek <p.wieczorek2@samsung.com> import os -import urllib2 +import requests import time import argparse import logging @@ -35,25 +35,20 @@ def crawl(url): visited.add(url) - h = urllib2.build_opener() - h.addheaders = [('User-agent', 'Prerelease Crawler')] + headers = {'User-agent': 'Prerelease Crawler'} + r = requests.get(url, headers=headers) - try: - resp = h.open(url) - except urllib2.HTTPError as e: - print 'Failed to access {url}: {code} - {reason}'\ - .format(url=url, code=e.code, reason=e.reason) - - html = str(resp.read()) - soup = bs4.BeautifulSoup(html, 'lxml') - links = soup('a') + links = set() + soup = bs4.BeautifulSoup(r.text, 'html.parser') + for link in soup.find_all('a'): + links.add(link.get('href')) discovered = set() for link in links: if link not in discovered and link not in visited: - if link.string.startswith('tizen-common'): - logging.debug("Add link to discovered: %s", link['href']) - discovered.add(url + link['href']) + if link.startswith('tizen-common'): + logging.debug("Add link to discovered: %s", link) + discovered.add(url + link) return discovered @@ -68,6 +63,8 @@ def get_modified_paths(discovered, timestamp): else: return discovered logging.info("Previous timestamp: %s", stamp) + s = requests.Session() + s.headers.update({"If-Modified-Since": stamp}) for url in discovered: logging.debug("Check for MD5SUMS change: %s", url) md5sums_urls = [url + 'images/arm-wayland/common-wayland-3parts-armv7l-odroidu3/MD5SUMS',\ @@ -78,15 +75,17 @@ def get_modified_paths(discovered, timestamp): url + 'images/arm-wayland/common-headless-3parts-armv7l-artik/MD5SUMS'] change = False for md5sums_url in md5sums_urls: - try: - u = urllib2.urlopen(urllib2.Request(md5sums_url, headers={"If-Modified-Since": stamp})) - except urllib2.HTTPError as e: - if e.code == 404: - logging.debug("MD5SUMS missing: %s", md5sums_url) - break - else: + r = s.get(md5sums_url) + if r.status_code == requests.codes.ok: logging.debug("MD5SUMS changed: %s", md5sums_url) change = True + elif r.status_code == 404: + logging.debug("MD5SUMS missing: %s", md5sums_url) + break + elif r.status_code == 304: + logging.debug("MD5SUMS unchanged: %s", md5sums_url) + else: + logging.warn("MD5SUMS error: %s", md5sums_url) if change: logging.info("Add to dispatch: %s", url) ret.add(url) |