1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
|
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2016 Samsung Electronics Co., Ltd All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##
# @author Pawel Wieczorek <p.wieczorek2@samsung.com>
import os
import requests
import time
import argparse
import logging
import bs4
# Output file: URLs whose MD5SUMS changed in the current run (to dispatch).
discovered_urls = 'modified_urls'
# Output file: accumulated history of every URL ever dispatched.
dispatched_urls = 'dispatched_urls'
def get_links(session, url):
    """Fetch *url* with *session* and return the href targets of all anchors.

    :param session: requests.Session used for the GET (120 s timeout)
    :param url: page whose <a> tags are scanned
    :return: set of href attribute values (may include None for bare <a> tags)
    """
    page = session.get(url, timeout=120)
    parsed = bs4.BeautifulSoup(page.text, 'html.parser')
    return {anchor.get('href') for anchor in parsed.find_all('a')}
def crawl(url):
    """Return absolute URLs of 'tizen-*' links found on the page at *url*.

    Each matching relative href is joined onto *url*; other links are
    ignored.
    """
    logging.info("crawl: %s", url)
    visited = {url}
    session = requests.Session()
    session.headers.update({'User-agent': 'Prerelease Crawler'})
    found = set()
    for href in get_links(session, url):
        if href in found or href in visited:
            continue
        if href.startswith('tizen-'):
            logging.debug("Add link to discovered: %s", href)
            found.add(url + href)
    return found
def get_urls2check(session, md5sums, url, level=2):
    """Recursively walk directory listings under *url* collecting MD5SUMS URLs.

    Descends into relative sub-directories (hrefs that end with '/' but do
    not start with '/') while *level* is positive; once *level* hits 0 the
    MD5SUMS file of the current directory is added to the *md5sums* set.

    :param session: requests.Session used for listing pages
    :param md5sums: set mutated in place with discovered MD5SUMS URLs
    :param url: directory URL to scan (expected to end with '/')
    :param level: remaining recursion depth
    """
    for href in get_links(session, url):
        if href.startswith("/") or not href.endswith("/"):
            continue
        if level > 0:
            logging.debug("Step into: %s", href)
            get_urls2check(session, md5sums, url + href, level - 1)
        else:
            logging.debug("Reached depth limit, ignore: %s", href)
    if level == 0:
        md5sums.add(url + "MD5SUMS")
def get_modified_paths(discovered, timestamp):
    """Return the subset of *discovered* whose MD5SUMS changed since last run.

    Probes each URL's 'images/' tree with an If-Modified-Since request built
    from the stamp stored in the *timestamp* file, then rewrites that file
    with the current time.

    :param discovered: set of base URLs to check
    :param timestamp: path of the file holding the previous HTTP-date stamp
    :return: URLs whose MD5SUMS files were modified; the whole *discovered*
             set when no previous stamp file exists (first run)
    """
    logging.info("get_modified_paths")
    ret = set()
    str_time = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime(time.time()))
    logging.info("Next timestamp: %s", str_time)
    # BUG FIX: the original tested os.path.exists(dispatched_urls) but then
    # opened the *timestamp* file, crashing whenever the history file existed
    # without a stamp file (or vice versa). Test the file we actually open.
    if not os.path.exists(timestamp):
        return discovered
    with open(timestamp, 'r') as f:
        stamp = f.read()
    logging.info("Previous timestamp: %s", stamp)
    s = requests.Session()
    s.headers.update({"If-Modified-Since": stamp})
    for url in discovered:
        logging.debug("Check for MD5SUMS change: %s", url)
        md5sums_urls = set()
        get_urls2check(s, md5sums_urls, url + "images/")
        change = False
        for md5sums_url in md5sums_urls:
            r = s.get(md5sums_url)
            if r.status_code == requests.codes.ok:
                # 200 means the file was modified since the stamp.
                logging.debug("MD5SUMS changed: %s", md5sums_url)
                change = True
            elif r.status_code == 404:
                # Incomplete snapshot: stop probing this URL entirely.
                logging.debug("MD5SUMS missing: %s", md5sums_url)
                break
            elif r.status_code == 304:
                logging.debug("MD5SUMS unchanged: %s", md5sums_url)
            else:
                # logging.warn() is a deprecated alias; use warning().
                logging.warning("MD5SUMS error: %s", md5sums_url)
        if change:
            logging.info("Add to dispatch: %s", url)
            ret.add(url)
    with open(timestamp, 'w') as f:
        f.write(str_time)
    return ret
def parse_arguments():
    """parse_arguments() -> args

    Build the command-line interface and return the parsed namespace:
    a positional 'url', an optional '-l/--log' verbosity level and an
    optional '-t/--timestamp' reference-timestamp file (default
    'timestamp').
    """
    parser = argparse.ArgumentParser(description="Crawler for download.tizen.org")
    parser.add_argument("url", type=str,
                        help='URL of prerelease or snapshot to crawl.')
    parser.add_argument("-l", "--log",
                        action="store", dest="loglevel",
                        help="Verbosity level")
    parser.add_argument("-t", "--timestamp", default="timestamp",
                        help="File to read a reference timestamp from")
    return parser.parse_args()
if '__main__' == __name__:
    args = parse_arguments()
    if args.loglevel:
        numeric_level = getattr(logging, args.loglevel.upper(), None)
        if not isinstance(numeric_level, int):
            raise ValueError('Invalid log level: %s' % args.loglevel)
        logging.basicConfig(format='%(asctime)s %(message)s', level=numeric_level)
    logging.debug("Begin")

    snapshots = crawl(args.url)
    timestamp_file = args.timestamp

    if "snapshots" in args.url:
        discovered = snapshots
    else:
        # Prerelease pages are one level deeper: crawl each snapshot too.
        discovered = set()
        for snapshot in snapshots:
            discovered |= crawl(snapshot)

    # BUG FIX: 'dispatched' was only bound when the history file already
    # existed, raising NameError at the '|=' below on the very first run.
    # Default to an empty history instead.
    dispatched = set()
    if os.path.exists(dispatched_urls):
        with open(dispatched_urls, 'r') as f:
            dispatched = {url.rstrip() for url in f}

    # save discovered URLs for dispatching download requests
    modified = get_modified_paths(discovered, timestamp_file)
    with open(discovered_urls, 'w') as f:
        f.write('\n'.join(modified) + '\n')

    # save all URLs for storing download history
    dispatched |= modified
    with open(dispatched_urls, 'w') as f:
        f.write('\n'.join(dispatched))

    logging.debug("End")
|