diff options
Diffstat (limited to 'tsp')
-rwxr-xr-x | tsp/scripts/crawler.py | 40 |
1 files changed, 26 insertions, 14 deletions
diff --git a/tsp/scripts/crawler.py b/tsp/scripts/crawler.py index e2e7d81..f2cc309 100755 --- a/tsp/scripts/crawler.py +++ b/tsp/scripts/crawler.py @@ -29,29 +29,45 @@ import bs4 discovered_urls = 'modified_urls' dispatched_urls = 'dispatched_urls' +def get_links(session, url): + main = session.get(url, timeout=120) + soup = bs4.BeautifulSoup(main.text, 'html.parser') + links = set() + for link in soup.find_all('a'): + links.add(link.get('href')) + return links + def crawl(url): logging.info("crawl: %s", url) visited = set() visited.add(url) - headers = {'User-agent': 'Prerelease Crawler'} - r = requests.get(url, headers=headers) - - links = set() - soup = bs4.BeautifulSoup(r.text, 'html.parser') - for link in soup.find_all('a'): - links.add(link.get('href')) + s = requests.Session() + s.headers.update({'User-agent': 'Prerelease Crawler'}) + links = get_links(s, url) discovered = set() for link in links: if link not in discovered and link not in visited: - if link.startswith('tizen-common'): + if link.startswith('tizen-'): logging.debug("Add link to discovered: %s", link) discovered.add(url + link) return discovered +def get_urls2check(session, md5sums, url, level=2): + links = get_links(session, url) + for link in links: + if not link.startswith("/") and link.endswith("/"): + if level > 0: + logging.debug("Step into: %s", link) + get_urls2check(session, md5sums, url + link, level - 1) + else: + logging.debug("Reached depth limit, ignore: %s", link) + if level == 0: + md5sums.add(url + "MD5SUMS") + def get_modified_paths(discovered, timestamp): logging.info("get_modified_paths") ret = set() @@ -67,12 +83,8 @@ def get_modified_paths(discovered, timestamp): s.headers.update({"If-Modified-Since": stamp}) for url in discovered: logging.debug("Check for MD5SUMS change: %s", url) - md5sums_urls = [url + 'images/arm-wayland/common-wayland-3parts-armv7l-odroidu3/MD5SUMS',\ - url + 'images/x86_64-wayland/common-wayland-efi-x86_64/MD5SUMS',\ - url + 'images/ia32-wayland/common-wayland-efi-i586/MD5SUMS',\ - url + 'images/x86_64-wayland/common-minimal-mbr-x86_64/MD5SUMS',\ - url + 'images/arm-wayland/common-wayland-3parts-armv7l-artik/MD5SUMS',\ - url + 'images/arm-wayland/common-headless-3parts-armv7l-artik/MD5SUMS'] + md5sums_urls = set() + get_urls2check(s, md5sums_urls, url + "images/") change = False for md5sums_url in md5sums_urls: r = s.get(md5sums_url) |