summaryrefslogtreecommitdiff
path: root/tsp/scripts
diff options
context:
space:
mode:
authorPawel Wieczorek <p.wieczorek2@samsung.com>2016-06-13 11:46:38 +0200
committerPawel Wieczorek <p.wieczorek2@samsung.com>2016-06-13 13:06:07 +0200
commita93c9fc7842cc93cd55dd4d7549f93707f39a9c1 (patch)
treeb8ace9c18c466e0b2bb6e9eea7f05a85c73f9a6c /tsp/scripts
parentdd04b625b6c5b109b8d3f94f0fc60a845af1a89e (diff)
downloadmajor-a93c9fc7842cc93cd55dd4d7549f93707f39a9c1.tar.gz
major-a93c9fc7842cc93cd55dd4d7549f93707f39a9c1.tar.bz2
major-a93c9fc7842cc93cd55dd4d7549f93707f39a9c1.zip
Add prerelease crawler script
This patch introduces a script for obtaining information about new prerelease projects. The root for crawling is taken from the 'ROOT' environment variable. If it is not set, the script falls back to: "http://download.tizen.org/prerelease/tizen/common/". Change-Id: I363332fcf3f24cc81ac81d56af0f0204788653ab
Diffstat (limited to 'tsp/scripts')
-rwxr-xr-xtsp/scripts/prerelease_crawler.py79
1 files changed, 79 insertions, 0 deletions
diff --git a/tsp/scripts/prerelease_crawler.py b/tsp/scripts/prerelease_crawler.py
new file mode 100755
index 0000000..31125f5
--- /dev/null
+++ b/tsp/scripts/prerelease_crawler.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright (c) 2016 Samsung Electronics Co., Ltd All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+##
+# @author Pawel Wieczorek <p.wieczorek2@samsung.com>
+
+import os
+import urllib2
+
+import bs4
+
+new_urls = 'new_urls'
+dispatched_urls = 'dispatched_urls'
+
+root = os.environ.get('ROOT', 'http://download.tizen.org/prerelease/tizen/common/')
+seeds = (root,)
+
+
+def crawl(seeds):
+ visited = set()
+
+ for url in seeds:
+ visited.add(url)
+
+ h = urllib2.build_opener()
+ h.addheaders = [('User-agent', 'Prerelease Crawler')]
+
+ try:
+ resp = h.open(url)
+ except urllib2.HTTPError as e:
+ print 'Failed to access {url}: {code} - {reason}'\
+ .format(url=url, code=e.code, reason=e.reason)
+
+ html = str(resp.read())
+ soup = bs4.BeautifulSoup(html, 'lxml')
+ links = soup('a')
+
+ discovered = set()
+ for link in links:
+ if link not in discovered and link not in visited and link not in seeds:
+ if link.string.startswith('tizen-common'):
+ discovered.add(url + link['href'])
+
+ return discovered
+
+if '__main__' == __name__:
+ snapshots = crawl(seeds)
+
+ new = set()
+ for snapshot in snapshots:
+ new |= crawl((snapshot,))
+
+ if os.path.exists(dispatched_urls):
+ with open(dispatched_urls, 'r') as f:
+ dispatched = set([url.rstrip() for url in f.readlines()])
+
+ # save new URLs for dispatching download requests
+ new -= dispatched
+ with open(new_urls, 'w') as f:
+ f.write('\n'.join(new))
+
+ # save all URLs for storing download history
+ dispatched |= new
+ with open(dispatched_urls, 'w') as f:
+ f.write('\n'.join(dispatched))