summary | refs | log | tree | commit | diff
path: root/tsp/scripts/crawler.py
diff options
context:
space:
mode:
Diffstat (limited to 'tsp/scripts/crawler.py')
-rwxr-xr-x tsp/scripts/crawler.py 43
1 files changed, 21 insertions, 22 deletions
diff --git a/tsp/scripts/crawler.py b/tsp/scripts/crawler.py
index 59436c8..e2e7d81 100755
--- a/tsp/scripts/crawler.py
+++ b/tsp/scripts/crawler.py
@@ -19,7 +19,7 @@
# @author Pawel Wieczorek <p.wieczorek2@samsung.com>
import os
-import urllib2
+import requests
import time
import argparse
import logging
@@ -35,25 +35,20 @@ def crawl(url):
visited.add(url)
- h = urllib2.build_opener()
- h.addheaders = [('User-agent', 'Prerelease Crawler')]
+ headers = {'User-agent': 'Prerelease Crawler'}
+ r = requests.get(url, headers=headers)
- try:
- resp = h.open(url)
- except urllib2.HTTPError as e:
- print 'Failed to access {url}: {code} - {reason}'\
- .format(url=url, code=e.code, reason=e.reason)
-
- html = str(resp.read())
- soup = bs4.BeautifulSoup(html, 'lxml')
- links = soup('a')
+ links = set()
+ soup = bs4.BeautifulSoup(r.text, 'html.parser')
+ for link in soup.find_all('a'):
+ links.add(link.get('href'))
discovered = set()
for link in links:
if link not in discovered and link not in visited:
- if link.string.startswith('tizen-common'):
- logging.debug("Add link to discovered: %s", link['href'])
- discovered.add(url + link['href'])
+ if link.startswith('tizen-common'):
+ logging.debug("Add link to discovered: %s", link)
+ discovered.add(url + link)
return discovered
@@ -68,6 +63,8 @@ def get_modified_paths(discovered, timestamp):
else:
return discovered
logging.info("Previous timestamp: %s", stamp)
+ s = requests.Session()
+ s.headers.update({"If-Modified-Since": stamp})
for url in discovered:
logging.debug("Check for MD5SUMS change: %s", url)
md5sums_urls = [url + 'images/arm-wayland/common-wayland-3parts-armv7l-odroidu3/MD5SUMS',\
@@ -78,15 +75,17 @@ def get_modified_paths(discovered, timestamp):
url + 'images/arm-wayland/common-headless-3parts-armv7l-artik/MD5SUMS']
change = False
for md5sums_url in md5sums_urls:
- try:
- u = urllib2.urlopen(urllib2.Request(md5sums_url, headers={"If-Modified-Since": stamp}))
- except urllib2.HTTPError as e:
- if e.code == 404:
- logging.debug("MD5SUMS missing: %s", md5sums_url)
- break
- else:
+ r = s.get(md5sums_url)
+ if r.status_code == requests.codes.ok:
logging.debug("MD5SUMS changed: %s", md5sums_url)
change = True
+ elif r.status_code == 404:
+ logging.debug("MD5SUMS missing: %s", md5sums_url)
+ break
+ elif r.status_code == 304:
+ logging.debug("MD5SUMS unchanged: %s", md5sums_url)
+ else:
+ logging.warn("MD5SUMS error: %s", md5sums_url)
if change:
logging.info("Add to dispatch: %s", url)
ret.add(url)