path: root/tsp/scripts/crawler.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (c) 2016 Samsung Electronics Co., Ltd All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

##
# @author Pawel Wieczorek <p.wieczorek2@samsung.com>

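"""Crawler for download.tizen.org: walks a prerelease or snapshot tree and
records which image directories have changed MD5SUMS since the previous run."""
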
import os
import requests
import time
import argparse
import logging

import bs4

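# Output files: 'modified_urls' lists URLs whose MD5SUMS changed in this run
# (used to dispatch download requests), 'dispatched_urls' accumulates every
# URL handed over so far (download history).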
discovered_urls = 'modified_urls'
dispatched_urls = 'dispatched_urls'

def get_links(session, url):
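    """Return the set of href targets of all links on the page at url."""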
    main = session.get(url, timeout=120)
    soup = bs4.BeautifulSoup(main.text, 'html.parser')
    links = set()
    for link in soup.find_all('a', href=True):
        links.add(link.get('href'))
    return links

def crawl(url):
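    """Return absolute URLs of 'tizen-*' entries linked from url."""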
    logging.info("crawl: %s", url)
    visited = set()

    visited.add(url)

    s = requests.Session()
    s.headers.update({'User-agent': 'Prerelease Crawler'})
    links = get_links(s, url)

    discovered = set()
    for link in links:
        if url + link not in discovered and url + link not in visited:
            if link.startswith('tizen-'):
                logging.debug("Add link to discovered: %s", link)
                discovered.add(url + link)

    return discovered

def get_urls2check(session, md5sums, url, level=2):
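    """Recursively collect URLs of MD5SUMS files below url.

    Relative subdirectory links are followed until level reaches zero; at
    that depth the directory's MD5SUMS URL is added to the md5sums set.
    """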
    links = get_links(session, url)
    for link in links:
        if not link.startswith("/") and link.endswith("/"):
            if level > 0:
                logging.debug("Step into: %s", link)
                get_urls2check(session, md5sums, url + link, level - 1)
            else:
                logging.debug("Reached depth limit, ignore: %s", link)
    if level == 0:
        md5sums.add(url + "MD5SUMS")

def get_modified_paths(discovered, timestamp):
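    """Return the subset of discovered URLs whose MD5SUMS changed.

    Sends If-Modified-Since requests based on the stamp read from the
    timestamp file, then rewrites that file with the current time.
    """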
    logging.info("get_modified_paths")
    ret = set()
    str_time = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime(time.time()))
    logging.info("Next timestamp: %s", str_time)
    if os.path.exists(timestamp):
        with open(timestamp, 'r') as f:
            stamp = f.read()
    else:
        return discovered
    logging.info("Previous timestamp: %s", stamp)
    s = requests.Session()
    s.headers.update({"If-Modified-Since": stamp})
    for url in discovered:
        logging.debug("Check for MD5SUMS change: %s", url)
        md5sums_urls = set()
        get_urls2check(s, md5sums_urls, url + "images/")
        change = False
        for md5sums_url in md5sums_urls:
            r = s.get(md5sums_url, timeout=120)
            if r.status_code == requests.codes.ok:
                logging.debug("MD5SUMS changed: %s", md5sums_url)
                change = True
            elif r.status_code == 404:
                logging.debug("MD5SUMS missing: %s", md5sums_url)
                break
            elif r.status_code == 304:
                logging.debug("MD5SUMS unchanged: %s", md5sums_url)
            else:
                logging.warning("MD5SUMS error: %s", md5sums_url)
        if change:
            logging.info("Add to dispatch: %s", url)
            ret.add(url)
    with open(timestamp, 'w') as f:
        f.write(str_time)
    return ret

def parse_arguments():
    """parse_arguments() -> args

    Parse any command-line options given returning both
    the parsed options and arguments.
    """

    parser = argparse.ArgumentParser(description="Crawler for download.tizen.org")

    parser.add_argument("url", type=str,
            help='URL of prerelease or snapshot to crawl.')

    parser.add_argument("-l", "--log",
            action="store", dest="loglevel",
            help="Verbosity level")

    parser.add_argument("-t", "--timestamp", default="timestamp",
            help="File to read a reference timestamp from")

    args = parser.parse_args()

    return args

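# Example invocation (the URL is illustrative; pass any prerelease or snapshot
# index on download.tizen.org):
#   ./crawler.py -l debug -t timestamp http://download.tizen.org/prerelease/tizen/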
if __name__ == '__main__':
    args = parse_arguments()
    if args.loglevel:
        numeric_level = getattr(logging, args.loglevel.upper(), None)
        if not isinstance(numeric_level, int):
            raise ValueError('Invalid log level: %s' % args.loglevel)
        logging.basicConfig(format='%(asctime)s %(message)s', level=numeric_level)
    logging.debug("Begin")
    snapshots = crawl(args.url)
    timestamp_file = args.timestamp

    if "snapshots" in args.url:
        discovered = snapshots
    else:
        discovered = set()
        for snapshot in snapshots:
            discovered |= crawl(snapshot)

    if os.path.exists(dispatched_urls):
        with open(dispatched_urls, 'r') as f:
            dispatched = set(url.rstrip() for url in f)
    else:
        dispatched = set()

    # save discovered URLs for dispatching download requests
    modified = get_modified_paths(discovered, timestamp_file)
    with open(discovered_urls, 'w') as f:
        f.write('\n'.join(modified) + '\n')

    # save all URLs for storing download history
    dispatched |= modified
    with open(dispatched_urls, 'w') as f:
        f.write('\n'.join(dispatched) + '\n')
    logging.debug("End")