tsp/scripts/crawler.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (c) 2016 Samsung Electronics Co., Ltd All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

##
# @author Pawel Wieczorek <p.wieczorek2@samsung.com>

import os
import urllib2
import time
import argparse
import logging

import bs4

# Path of the file that receives, one per line, the URLs reported as modified
# by the current run.  Note that the on-disk name is 'modified_urls' even
# though the variable is called discovered_urls.
discovered_urls = 'modified_urls'
# Path of the history file: it is read at startup, merged with the URLs
# reported by the current run and written back.  get_modified_paths() also
# tests for its existence before reading the previous timestamp.
dispatched_urls = 'dispatched_urls'

def crawl(url):
    """Fetch the page at ``url`` and collect its 'tizen-common' links.

    The page is requested with the 'Prerelease Crawler' User-agent and parsed
    with BeautifulSoup (lxml parser).  Every anchor whose text starts with
    'tizen-common' contributes ``url + href`` to the result.

    Arguments:
        url -- address of the page to scan.  Each href is appended to it
               verbatim, so it presumably has to end with '/' (confirm
               against the server's listing format).

    Returns a set of URL strings, possibly empty.

    Raises urllib2.HTTPError when the server answers with an error status.
    The failure is logged first and then re-raised so the run is aborted with
    the real cause instead of tripping over an unbound response object.
    """
    logging.info("crawl: %s", url)

    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Prerelease Crawler')]

    try:
        resp = opener.open(url)
    except urllib2.HTTPError as e:
        logging.error('Failed to access %s: %s - %s', url, e.code, e.reason)
        # Abort rather than return a partial result: the caller advances its
        # timestamp after every completed run, so silently skipping a page
        # could hide later changes below it.
        raise

    try:
        html = str(resp.read())
    finally:
        # Release the connection even if reading the body fails.
        resp.close()

    soup = bs4.BeautifulSoup(html, 'lxml')

    discovered = set()
    for link in soup('a'):
        text = link.string
        href = link.get('href')
        # .string is None for anchors that wrap several child nodes and the
        # href attribute may be absent; neither can name a release directory.
        if not text or not href:
            continue
        if text.startswith('tizen-common'):
            logging.debug("Add link to discovered: %s", href)
            # The set itself takes care of duplicates.
            discovered.add(url + href)

    return discovered

def get_modified_paths(discovered, timestamp):
    """Return the URLs from ``discovered`` whose MD5SUMS files have changed.

    For every URL the three known MD5SUMS locations are requested with an
    If-Modified-Since header carrying the time stored by the previous run.
    A URL is reported when at least one of those requests succeeds.  A 404
    answer stops the checks for that URL (results already obtained for it
    are kept); any other HTTP error, such as 304 Not Modified, just means
    "no change" for that file.

    Arguments:
        discovered -- iterable of base URLs; the MD5SUMS paths are appended
                      verbatim, so each is presumably expected to end with
                      '/' (confirm against crawl() output).
        timestamp  -- path of the file holding the previous run's time; it
                      is overwritten with the current time before returning.

    Returns ``discovered`` itself when there is no previous state (history
    file or timestamp file missing), otherwise a new set with the changed
    URLs.
    """
    logging.info("get_modified_paths")
    # NOTE: %a and %b follow the process locale; HTTP dates need English names.
    str_time = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime())
    logging.info("Next timestamp: %s", str_time)

    # Without both the history and the timestamp file there is nothing to
    # compare against: treat everything as modified.  The timestamp is
    # recorded here as well, otherwise the next run would find the history
    # file but fail to open a timestamp file that was never created.
    if not (os.path.exists(dispatched_urls) and os.path.exists(timestamp)):
        with open(timestamp, 'w') as f:
            f.write(str_time)
        return discovered

    with open(timestamp, 'r') as f:
        # Strip stray whitespace/newlines so the value is a clean header.
        stamp = f.read().strip()
    logging.info("Previous timestamp: %s", stamp)

    md5sums_suffixes = (
        'images/arm-wayland/common-wayland-3parts-armv7l-odroidu3/MD5SUMS',
        'images/x86_64-wayland/common-wayland-efi-x86_64/MD5SUMS',
        'images/ia32-wayland/common-wayland-efi-i586/MD5SUMS',
    )

    ret = set()
    for url in discovered:
        logging.debug("Check for MD5SUMS change: %s", url)
        change = False
        for suffix in md5sums_suffixes:
            md5sums_url = url + suffix
            request = urllib2.Request(md5sums_url,
                                      headers={"If-Modified-Since": stamp})
            try:
                u = urllib2.urlopen(request)
            except urllib2.HTTPError as e:
                if e.code == 404:
                    logging.debug("MD5SUMS missing: %s", md5sums_url)
                    break
                # 304 Not Modified (or any other error status): no change.
                logging.debug("MD5SUMS not modified (HTTP %s): %s",
                              e.code, md5sums_url)
            else:
                # Only the status matters; release the connection right away.
                u.close()
                logging.debug("MD5SUMS changed: %s", md5sums_url)
                change = True
        if change:
            logging.info("Add to dispatch: %s", url)
            ret.add(url)

    with open(timestamp, 'w') as f:
        f.write(str_time)
    return ret

def parse_arguments():
    """parse_arguments() -> args

    Build the command-line parser and parse the process arguments.

    Recognized arguments:
        url        -- positional, URL of prerelease or snapshot to crawl
        -l, --log  -- optional verbosity level, stored as ``loglevel``
                      (None when the option is not given)

    Returns the argparse namespace carrying ``url`` and ``loglevel``.
    """
    cli = argparse.ArgumentParser(
        description="Crawler for download.tizen.org")

    # Mandatory positional: where to start crawling.
    cli.add_argument(
        "url",
        type=str,
        help='URL of prerelease or snapshot to crawl.')

    # Optional logging verbosity; validated by the caller, not here.
    cli.add_argument(
        "-l", "--log",
        dest="loglevel",
        action="store",
        help="Verbosity level")

    return cli.parse_args()

if '__main__' == __name__:
    # Script entry point: crawl the given URL, work out which discovered
    # URLs are new or modified, and update the two bookkeeping files.
    args = parse_arguments()
    if args.loglevel:
        # Translate the textual level (e.g. "debug") into logging's constant.
        numeric_level = getattr(logging, args.loglevel.upper(), None)
        if not isinstance(numeric_level, int):
            raise ValueError('Invalid log level: %s' % args.loglevel)
        logging.basicConfig(format='%(asctime)s %(message)s',level=numeric_level)
    logging.debug("Begin")
    snapshots = crawl(args.url)
    timestamp_file = 'timestamp'

    if "snapshots" in args.url:
        # URLs containing "snapshots" are used as crawled and keep their own
        # timestamp file.
        timestamp_file = 'timestamp_snapshot'
        discovered = snapshots
    else:
        # Otherwise every link found is crawled one level deeper.
        discovered = set()
        for snapshot in snapshots:
            discovered |= crawl(snapshot)

    # Start from an empty history so that the very first run (no history
    # file yet) does not fail when the history is merged below.
    dispatched = set()
    if os.path.exists(dispatched_urls):
        with open(dispatched_urls, 'r') as f:
            # Skip blank lines so they do not end up as '' entries.
            dispatched = set(line.rstrip() for line in f if line.strip())

    # save discovered URLs for dispatching download requests
    modified = get_modified_paths(discovered, timestamp_file)
    with open(discovered_urls, 'w') as f:
        f.write('\n'.join(modified))

    # save all URLs for storing download history
    dispatched |= modified
    with open(dispatched_urls, 'w') as f:
        f.write('\n'.join(dispatched))
    logging.debug("End")