summaryrefslogtreecommitdiff
path: root/tsp/scripts/crawler.py
diff options
context:
space:
mode:
authorAleksander Mistewicz <a.mistewicz@samsung.com>2016-06-09 11:44:57 +0200
committerAleksander Mistewicz <a.mistewicz@samsung.com>2016-08-24 10:54:50 +0200
commitf98afdbe644bd48c6b72e3352e698901f923dcfb (patch)
treeb697bed43f6d1c4d36b8c35407e27889db32b9eb /tsp/scripts/crawler.py
parent2320429ce2eace8ea847487428dbd2cbc665a7be (diff)
downloadmajor-f98afdbe644bd48c6b72e3352e698901f923dcfb.tar.gz
major-f98afdbe644bd48c6b72e3352e698901f923dcfb.tar.bz2
major-f98afdbe644bd48c6b72e3352e698901f923dcfb.zip
Modify tsp behaviour to comply with OBS changes
Use filenames that won't confuse future developer

Change-Id: I3c0928045ab169b20333c94f762d6be64f58a17c
Signed-off-by: Aleksander Mistewicz <a.mistewicz@samsung.com>
Diffstat (limited to 'tsp/scripts/crawler.py')
-rwxr-xr-xtsp/scripts/crawler.py44
1 file changed, 35 insertions, 9 deletions
diff --git a/tsp/scripts/crawler.py b/tsp/scripts/crawler.py
index c61510a..59429e2 100755
--- a/tsp/scripts/crawler.py
+++ b/tsp/scripts/crawler.py
@@ -20,10 +20,11 @@
import os
import urllib2
+import time
import bs4
-new_urls = 'new_urls'
+discovered_urls = 'modified_urls'
dispatched_urls = 'dispatched_urls'
root = os.environ.get('ROOT', 'http://download.tizen.org/prerelease/tizen/common/')
@@ -57,26 +58,51 @@ def crawl(seeds):
return discovered
+def get_modified_paths(discovered, timestamp):
+ ret = set()
+ str_time = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime(time.time()))
+ if os.path.exists(dispatched_urls):
+ with open(timestamp, 'r') as f:
+ stamp = f.read();
+ else:
+ return discovered
+ with open(timestamp, 'w') as f:
+ f.write(str_time)
+ for url in discovered:
+ for md5sums_url in [url + 'images/arm-wayland/common-wayland-3parts-armv7l-odroidu3/MD5SUMS',\
+ url + 'images/x86_64-wayland/common-wayland-efi-x86_64/MD5SUMS',\
+ url + 'images/ia32-wayland/common-wayland-efi-i586/MD5SUMS']:
+ try:
+ u = urllib2.urlopen(urllib2.Request(md5sums_url, headers={"If-Modified-Since": stamp}))
+ except urllib2.HTTPError as e:
+ pass
+ else:
+ ret.add(url)
+ break
+ return ret
+
if '__main__' == __name__:
snapshots = crawl(seeds)
+ timestamp_file = 'timestamp'
if "snapshots" in root:
- new = snapshots
+ timestamp_file = 'timestamp_snapshot'
+ discovered = snapshots
else:
- new = set()
+ discovered = set()
for snapshot in snapshots:
- new |= crawl((snapshot,))
+ discovered |= crawl((snapshot,))
if os.path.exists(dispatched_urls):
with open(dispatched_urls, 'r') as f:
dispatched = set([url.rstrip() for url in f.readlines()])
- # save new URLs for dispatching download requests
- new -= dispatched
- with open(new_urls, 'w') as f:
- f.write('\n'.join(new))
+ # save discovered URLs for dispatching download requests
+ modified = get_modified_paths(discovered, timestamp_file)
+ with open(discovered_urls, 'w') as f:
+ f.write('\n'.join(modified))
# save all URLs for storing download history
- dispatched |= new
+ dispatched |= modified
with open(dispatched_urls, 'w') as f:
f.write('\n'.join(dispatched))