summaryrefslogtreecommitdiff
path: root/tsp/scripts
diff options
context:
space:
mode:
authorPawel Wieczorek <p.wieczorek2@samsung.com>2016-06-13 11:46:38 +0200
committerPawel Wieczorek <p.wieczorek2@samsung.com>2016-06-13 13:06:07 +0200
commita93c9fc7842cc93cd55dd4d7549f93707f39a9c1 (patch)
treeb8ace9c18c466e0b2bb6e9eea7f05a85c73f9a6c /tsp/scripts
parentdd04b625b6c5b109b8d3f94f0fc60a845af1a89e (diff)
downloadmajor-a93c9fc7842cc93cd55dd4d7549f93707f39a9c1.tar.gz
major-a93c9fc7842cc93cd55dd4d7549f93707f39a9c1.tar.bz2
major-a93c9fc7842cc93cd55dd4d7549f93707f39a9c1.zip
Add prerelease crawler script
This patch introduces a script for obtaining information about new prerelease projects. The root for crawling is taken from the 'ROOT' environment variable. If it is not set, the script falls back to: "http://download.tizen.org/prerelease/tizen/common/". Change-Id: I363332fcf3f24cc81ac81d56af0f0204788653ab
Diffstat (limited to 'tsp/scripts')
-rwxr-xr-xtsp/scripts/prerelease_crawler.py79
1 files changed, 79 insertions, 0 deletions
diff --git a/tsp/scripts/prerelease_crawler.py b/tsp/scripts/prerelease_crawler.py
new file mode 100755
index 0000000..31125f5
--- /dev/null
+++ b/tsp/scripts/prerelease_crawler.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright (c) 2016 Samsung Electronics Co., Ltd All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+##
+# @author Pawel Wieczorek <p.wieczorek2@samsung.com>
+
+import os
+import urllib2
+
+import bs4
+
+new_urls = 'new_urls'
+dispatched_urls = 'dispatched_urls'
+
+root = os.environ.get('ROOT', 'http://download.tizen.org/prerelease/tizen/common/')
+seeds = (root,)
+
+
+def crawl(seeds):
+ visited = set()
+
+ for url in seeds:
+ visited.add(url)
+
+ h = urllib2.build_opener()
+ h.addheaders = [('User-agent', 'Prerelease Crawler')]
+
+ try:
+ resp = h.open(url)
+ except urllib2.HTTPError as e:
+ print 'Failed to access {url}: {code} - {reason}'\
+ .format(url=url, code=e.code, reason=e.reason)
+
+ html = str(resp.read())
+ soup = bs4.BeautifulSoup(html, 'lxml')
+ links = soup('a')
+
+ discovered = set()
+ for link in links:
+ if link not in discovered and link not in visited and link not in seeds:
+ if link.string.startswith('tizen-common'):
+ discovered.add(url + link['href'])
+
+ return discovered
+
+if '__main__' == __name__:
+ snapshots = crawl(seeds)
+
+ new = set()
+ for snapshot in snapshots:
+ new |= crawl((snapshot,))
+
+ if os.path.exists(dispatched_urls):
+ with open(dispatched_urls, 'r') as f:
+ dispatched = set([url.rstrip() for url in f.readlines()])
+
+ # save new URLs for dispatching download requests
+ new -= dispatched
+ with open(new_urls, 'w') as f:
+ f.write('\n'.join(new))
+
+ # save all URLs for storing download history
+ dispatched |= new
+ with open(dispatched_urls, 'w') as f:
+ f.write('\n'.join(dispatched))