summary | refs | log | tree | commit | diff
path: root/tsp
diff options
context:
space:
mode:
Diffstat (limited to 'tsp')
-rwxr-xr-x tsp/scripts/crawler.py 40
1 files changed, 26 insertions, 14 deletions
diff --git a/tsp/scripts/crawler.py b/tsp/scripts/crawler.py
index e2e7d81..f2cc309 100755
--- a/tsp/scripts/crawler.py
+++ b/tsp/scripts/crawler.py
@@ -29,29 +29,45 @@ import bs4
discovered_urls = 'modified_urls'
dispatched_urls = 'dispatched_urls'
+def get_links(session, url):  # fetch url via session and return the set of href values of its <a> tags
+ main = session.get(url, timeout=120)
+ soup = bs4.BeautifulSoup(main.text, 'html.parser')
+ links = set()
+ for link in soup.find_all('a', href=True):  # href=True skips anchors without href, so the set never contains None
+ links.add(link['href'])
+ return links
+
def crawl(url):
logging.info("crawl: %s", url)
visited = set()
visited.add(url)
- headers = {'User-agent': 'Prerelease Crawler'}
- r = requests.get(url, headers=headers)
-
- links = set()
- soup = bs4.BeautifulSoup(r.text, 'html.parser')
- for link in soup.find_all('a'):
- links.add(link.get('href'))
+ s = requests.Session()
+ s.headers.update({'User-agent': 'Prerelease Crawler'})
+ links = get_links(s, url)
discovered = set()
for link in links:
if link not in discovered and link not in visited:
- if link.startswith('tizen-common'):
+ if link and link.startswith('tizen-'):  # guard: a missing or empty href must not raise AttributeError
logging.debug("Add link to discovered: %s", link)
discovered.add(url + link)
return discovered
+def get_urls2check(session, md5sums, url, level=2):  # follow relative directory links up to `level` deep; at level 0 add url + "MD5SUMS" to md5sums
+ links = get_links(session, url)
+ for link in links:
+ if link and not link.startswith("/") and link.endswith("/"):  # guard against a missing href; only relative links ending in "/" are descended into
+ if level > 0:
+ logging.debug("Step into: %s", link)
+ get_urls2check(session, md5sums, url + link, level - 1)
+ else:
+ logging.debug("Reached depth limit, ignore: %s", link)
+ if level == 0:
+ md5sums.add(url + "MD5SUMS")
+
def get_modified_paths(discovered, timestamp):
logging.info("get_modified_paths")
ret = set()
@@ -67,12 +83,8 @@ def get_modified_paths(discovered, timestamp):
s.headers.update({"If-Modified-Since": stamp})
for url in discovered:
logging.debug("Check for MD5SUMS change: %s", url)
- md5sums_urls = [url + 'images/arm-wayland/common-wayland-3parts-armv7l-odroidu3/MD5SUMS',\
- url + 'images/x86_64-wayland/common-wayland-efi-x86_64/MD5SUMS',\
- url + 'images/ia32-wayland/common-wayland-efi-i586/MD5SUMS',\
- url + 'images/x86_64-wayland/common-minimal-mbr-x86_64/MD5SUMS',\
- url + 'images/arm-wayland/common-wayland-3parts-armv7l-artik/MD5SUMS',\
- url + 'images/arm-wayland/common-headless-3parts-armv7l-artik/MD5SUMS']
+ md5sums_urls = set()
+ get_urls2check(s, md5sums_urls, url + "images/")
change = False
for md5sums_url in md5sums_urls:
r = s.get(md5sums_url)