#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2016 Samsung Electronics Co., Ltd All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##
# @author Pawel Wieczorek <p.wieczorek2@samsung.com>
import os
import urllib2
import time
import argparse
import logging
import bs4
discovered_urls = 'modified_urls'
dispatched_urls = 'dispatched_urls'
def crawl(url):
logging.info("crawl: %s", url)
visited = set()
visited.add(url)
h = urllib2.build_opener()
h.addheaders = [('User-agent', 'Prerelease Crawler')]
try:
resp = h.open(url)
except urllib2.HTTPError as e:
print 'Failed to access {url}: {code} - {reason}'\
.format(url=url, code=e.code, reason=e.reason)
html = str(resp.read())
soup = bs4.BeautifulSoup(html, 'lxml')
links = soup('a')
discovered = set()
for link in links:
if link not in discovered and link not in visited:
if link.string.startswith('tizen-common'):
logging.debug("Add link to discovered: %s", link['href'])
discovered.add(url + link['href'])
return discovered
def get_modified_paths(discovered, timestamp):
    """Return the subset of *discovered* URLs whose MD5SUMS files changed.

    discovered -- set of base URLs to probe.
    timestamp  -- path of the file holding the previous check time
                  (HTTP-date string); rewritten with the current time.

    On the first run (no timestamp file yet) every discovered URL is
    returned and a baseline timestamp is recorded.
    """
    logging.info("get_modified_paths")
    ret = set()
    str_time = time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                             time.gmtime(time.time()))
    logging.info("Next timestamp: %s", str_time)
    # Bug fix: the original tested os.path.exists(dispatched_urls) but then
    # opened *timestamp*, crashing when only the history file existed.
    if not os.path.exists(timestamp):
        # Record a baseline so the next run is incremental; the original
        # never wrote one here, re-dispatching everything each run.
        with open(timestamp, 'w') as f:
            f.write(str_time)
        return discovered
    with open(timestamp, 'r') as f:
        stamp = f.read()
    logging.info("Previous timestamp: %s", stamp)
    for url in discovered:
        logging.debug("Check for MD5SUMS change: %s", url)
        md5sums_urls = [
            url + 'images/arm-wayland/common-wayland-3parts-armv7l-odroidu3/MD5SUMS',
            url + 'images/x86_64-wayland/common-wayland-efi-x86_64/MD5SUMS',
            url + 'images/ia32-wayland/common-wayland-efi-i586/MD5SUMS']
        change = False
        for md5sums_url in md5sums_urls:
            try:
                urllib2.urlopen(urllib2.Request(
                    md5sums_url, headers={"If-Modified-Since": stamp}))
            except urllib2.HTTPError as e:
                # 304 (unchanged) also lands here and is deliberately
                # ignored; only a missing file aborts this URL's probes.
                if e.code == 404:
                    logging.debug("MD5SUMS missing: %s", md5sums_url)
                    break
            else:
                logging.debug("MD5SUMS changed: %s", md5sums_url)
                change = True
        if change:
            logging.info("Add to dispatch: %s", url)
            ret.add(url)
    with open(timestamp, 'w') as f:
        f.write(str_time)
    return ret
def parse_arguments():
    """Parse the command-line options and return the resulting namespace.

    Expects one positional URL argument plus an optional -l/--log
    verbosity level.
    """
    cli = argparse.ArgumentParser(
        description="Crawler for download.tizen.org")
    cli.add_argument("url", type=str,
                     help='URL of prerelease or snapshot to crawl.')
    cli.add_argument("-l", "--log",
                     action="store", dest="loglevel",
                     help="Verbosity level")
    return cli.parse_args()
if '__main__' == __name__:
    args = parse_arguments()
    if args.loglevel:
        numeric_level = getattr(logging, args.loglevel.upper(), None)
        if not isinstance(numeric_level, int):
            raise ValueError('Invalid log level: %s' % args.loglevel)
        logging.basicConfig(format='%(asctime)s %(message)s',
                            level=numeric_level)
    logging.debug("Begin")
    snapshots = crawl(args.url)
    timestamp_file = 'timestamp'
    if "snapshots" in args.url:
        timestamp_file = 'timestamp_snapshot'
        discovered = snapshots
    else:
        # Prerelease pages are one level deeper: crawl each listed
        # snapshot to collect the actual image directories.
        discovered = set()
        for snapshot in snapshots:
            discovered |= crawl(snapshot)
    # Bug fix: 'dispatched' was only bound when the history file already
    # existed, so the first run crashed with NameError at the union below.
    dispatched = set()
    if os.path.exists(dispatched_urls):
        with open(dispatched_urls, 'r') as f:
            dispatched = set(url.rstrip() for url in f)
    # save discovered URLs for dispatching download requests
    modified = get_modified_paths(discovered, timestamp_file)
    with open(discovered_urls, 'w') as f:
        f.write('\n'.join(modified))
    # save all URLs for storing download history
    dispatched |= modified
    with open(dispatched_urls, 'w') as f:
        f.write('\n'.join(dispatched))
    logging.debug("End")