summaryrefslogtreecommitdiff
path: root/tsp/scripts/crawler.py
diff options
context:
space:
mode:
authorAleksander Mistewicz <a.mistewicz@samsung.com>2016-06-09 11:44:57 +0200
committerAleksander Mistewicz <a.mistewicz@samsung.com>2016-08-24 10:54:50 +0200
commitf98afdbe644bd48c6b72e3352e698901f923dcfb (patch)
treeb697bed43f6d1c4d36b8c35407e27889db32b9eb /tsp/scripts/crawler.py
parent2320429ce2eace8ea847487428dbd2cbc665a7be (diff)
downloadmajor-f98afdbe644bd48c6b72e3352e698901f923dcfb.tar.gz
major-f98afdbe644bd48c6b72e3352e698901f923dcfb.tar.bz2
major-f98afdbe644bd48c6b72e3352e698901f923dcfb.zip
Modify tsp behaviour to comply with OBS changes
Use filenames that won't confuse future developer

Change-Id: I3c0928045ab169b20333c94f762d6be64f58a17c
Signed-off-by: Aleksander Mistewicz <a.mistewicz@samsung.com>
Diffstat (limited to 'tsp/scripts/crawler.py')
-rwxr-xr-xtsp/scripts/crawler.py44
1 file changed, 35 insertions, 9 deletions
diff --git a/tsp/scripts/crawler.py b/tsp/scripts/crawler.py
index c61510a..59429e2 100755
--- a/tsp/scripts/crawler.py
+++ b/tsp/scripts/crawler.py
@@ -20,10 +20,11 @@
import os
import urllib2
+import time
import bs4
-new_urls = 'new_urls'
+discovered_urls = 'modified_urls'
dispatched_urls = 'dispatched_urls'
root = os.environ.get('ROOT', 'http://download.tizen.org/prerelease/tizen/common/')
@@ -57,26 +58,51 @@ def crawl(seeds):
return discovered
+def get_modified_paths(discovered, timestamp):
+ ret = set()
+ str_time = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime(time.time()))
+ if os.path.exists(dispatched_urls):
+ with open(timestamp, 'r') as f:
+ stamp = f.read();
+ else:
+ return discovered
+ with open(timestamp, 'w') as f:
+ f.write(str_time)
+ for url in discovered:
+ for md5sums_url in [url + 'images/arm-wayland/common-wayland-3parts-armv7l-odroidu3/MD5SUMS',\
+ url + 'images/x86_64-wayland/common-wayland-efi-x86_64/MD5SUMS',\
+ url + 'images/ia32-wayland/common-wayland-efi-i586/MD5SUMS']:
+ try:
+ u = urllib2.urlopen(urllib2.Request(md5sums_url, headers={"If-Modified-Since": stamp}))
+ except urllib2.HTTPError as e:
+ pass
+ else:
+ ret.add(url)
+ break
+ return ret
+
if '__main__' == __name__:
snapshots = crawl(seeds)
+ timestamp_file = 'timestamp'
if "snapshots" in root:
- new = snapshots
+ timestamp_file = 'timestamp_snapshot'
+ discovered = snapshots
else:
- new = set()
+ discovered = set()
for snapshot in snapshots:
- new |= crawl((snapshot,))
+ discovered |= crawl((snapshot,))
if os.path.exists(dispatched_urls):
with open(dispatched_urls, 'r') as f:
dispatched = set([url.rstrip() for url in f.readlines()])
- # save new URLs for dispatching download requests
- new -= dispatched
- with open(new_urls, 'w') as f:
- f.write('\n'.join(new))
+ # save discovered URLs for dispatching download requests
+ modified = get_modified_paths(discovered, timestamp_file)
+ with open(discovered_urls, 'w') as f:
+ f.write('\n'.join(modified))
# save all URLs for storing download history
- dispatched |= new
+ dispatched |= modified
with open(dispatched_urls, 'w') as f:
f.write('\n'.join(dispatched))