#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2016 Samsung Electronics Co., Ltd All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

##
# @author Aleksander Mistewicz

import os
import subprocess
import time
import re
import requests
import argparse
import logging
import bs4
import threading
import signal

__version__ = "0.0.1"
__license__ = "APACHE-2.0"
__author__ = "Aleksander Mistewicz"
__author_email__ = "a.mistewicz@samsung.com"

USAGE = "%prog "
AGENT = "%s/%s" % (__name__, __version__)


class ImageVersion(object):

    def __init__(self, url):
        names = re.findall(r'tizen\-.{0,7}\w+\d{8}\.\d+', url)
        if len(names) >= 1:
            self.name = names[0]
        else:
            raise ValueError
        versions = re.findall(r'\d{8}\.\d+', url)
        if len(versions) == 3:
            if versions[0] != versions[1]:
                raise ValueError
            self.snapshot = versions[0]
            self.submission = versions[2]
        elif len(versions) == 1:
            self.snapshot = versions[0]
            self.submission = None
        else:
            raise ValueError

    def get_version(self):
        if self.submission:
            return '.'.join([self.snapshot, self.submission])
        else:
            return self.snapshot

    def get_snapshot(self):
        return self.snapshot

    def get_submission(self):
        return self.submission

    def get_name(self):
        if self.submission:
            return '.'.join([self.name, self.submission])
        else:
            return self.name

    def is_prerelease(self):
        if self.submission:
            return True
        else:
            return False
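
# Illustrative note (not part of the original script): ImageVersion expects the URL
# to carry one "YYYYMMDD.N" token for a snapshot and three for a prerelease (the
# first two naming the snapshot, the third the submission). The URL below is
# hypothetical and only sketches the expected shape:
#
#   iv = ImageVersion("http://download.tizen.org/prerelease/tizen/common/"
#                     "tizen-common_20160607.3/20160607.3.20160608.5/images/")
#   iv.get_snapshot()    # "20160607.3"
#   iv.get_submission()  # "20160608.5"
#   iv.get_version()     # "20160607.3.20160608.5"
#   iv.is_prerelease()   # True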

class Crawler(object):

    @classmethod
    def get_links(cls, session, url):
        # Fetch a directory listing page and collect all hrefs found in it.
        main = session.get(url, timeout=120)
        soup = bs4.BeautifulSoup(main.text, 'html.parser')
        links = set()
        for link in soup.find_all('a'):
            links.add(link.get('href'))
        return links

    @classmethod
    def get_targets(cls, url):
        url += "images/"
        s = requests.Session()
        return cls.crawl_targets(s, url)

    @classmethod
    def crawl_targets(cls, session, url):
        # Recursively descend into subdirectories; a directory that contains
        # an MD5SUMS file is treated as an image directory.
        links = Crawler.get_links(session, url)
        discovered = set()
        for link in links:
            if not link.startswith("/") and link.endswith("/") and "../" not in link:
                logging.debug("Add link to discovered: %s", link)
                discovered |= cls.crawl_targets(session, url + link)
            else:
                if link == "MD5SUMS":
                    discovered.add(url)
        return discovered

    @classmethod
    def crawl_images(cls, session, url):
        # Collect the downloadable files of a single image directory.
        links = Crawler.get_links(session, url)
        discovered = set()
        for link in links:
            if link == "MD5SUMS" \
                    or link.endswith(".tar.gz") \
                    or link.endswith(".ks") \
                    or link.endswith(".packages") \
                    or link.endswith(".xml") \
                    or link.endswith(".bmap") \
                    or link.endswith(".raw.bz2") \
                    or link.endswith("-default") \
                    or link.endswith(".log"):
                discovered.add(url + link)
        return discovered


class Downloader(threading.Thread):

    def __init__(self, work, img_ver, session, url):
        threading.Thread.__init__(self)
        self.work = work
        self.url = url
        self.session = session
        self.img_ver = img_ver
        self.is_prerelease = img_ver.is_prerelease()
        m = re.search(r'.*/(.*)/$', url)
        self.name = m.group(1)
        try:
            os.mkdir(self.name)
        except OSError as e:
            logging.warning("mkdir %s: %s" % (self.name, e.strerror))
        self.diff_report_filename = self.name + "/diff.report"

    def run(self):
        logging.info("Start downloader: %s" % self.name)
        self.files = Crawler.crawl_images(self.session, self.url)
        logging.debug(self.files)
        # Filled in below; every image directory is expected to provide both files.
        pre_url = None
        md5sums = None
        for url in frozenset(self.files):
            if url.endswith(".packages"):
                pre_url = url
                self.files.discard(url)
            elif url.endswith("/MD5SUMS"):
                md5sums = url
                self.files.discard(url)
        if not self.is_prerelease:
            self.write_diff_for_snapshot()
        else:
            # Replace prerelease with snapshots
            snap_url = re.sub('prerelease', 'snapshots', pre_url)
            # Remove prerelease subdirectory
            snap_url = re.sub("/[^/]*" + self.img_ver.get_submission() + "/", '/', snap_url)
            # Remove SR from filename
            snap_url = re.sub(r"\." + self.img_ver.get_submission(), '', snap_url)
            logging.info("snap: %s" % snap_url)
            snap = self.session.get(snap_url, timeout=120)
            pre = self.session.get(pre_url, timeout=120)
            if self.check_diff(pre.text, snap.text):
                return
        while self.work.is_set():
            # Download all files with wget in parallel, resuming partial downloads.
            sub_dwns = set()
            for url in self.files:
                sub_dwns.add(subprocess.Popen(["wget", "-cq", url], cwd=self.name))
            for sub_dwn in sub_dwns:
                sub_dwn.wait()
            if md5sums:
                r = self.session.head(md5sums)
                if r.status_code == requests.codes.ok:
                    if self.check_md5(md5sums):
                        break
                elif r.status_code == 404:
                    logging.debug("MD5SUMS missing: %s", md5sums)
                    break
                else:
                    logging.warning("MD5SUMS error: %s", md5sums)
            if self.work.is_set():
                time.sleep(10)
            else:
                break
        logging.info("Stop downloader: %s" % self.name)

    def check_diff(self, pre_pkgs, snap_pkgs):
        logging.debug("Checking diff")
        set_snap_pkgs = set(snap_pkgs.splitlines())
        set_pre_pkgs = set(pre_pkgs.splitlines())
        diff = set_pre_pkgs ^ set_snap_pkgs
        with open(self.diff_report_filename, 'w') as f:
            ret = (len(diff) == 0)
            if ret:
                s = 'Images are identical'
            else:
                s = '\n'.join(diff)
            logging.info(s)
            f.write(s)
        return ret

    def write_diff_for_snapshot(self):
        logging.debug("Write diff for snapshot image")
        with open(self.diff_report_filename, 'w') as f:
            f.write('Snapshot')

    def check_md5(self, md5sum_url):
        logging.debug("Checking md5sum")
        md5_file = "MD5SUMS"
        md5_path = self.name + "/" + md5_file
        subprocess.call(["wget", md5sum_url, "-qO", md5_path])
        # Drop entries for files that are intentionally not verified.
        subprocess.call(["sed", "-e", r"/\(ks\|json\|log\|xml\|-default\|packages\)/d",
                         "-i", md5_path])
        p = subprocess.Popen(["md5sum", "-c", md5_file], cwd=self.name)
        p.wait()
        ret = p.returncode
        if not ret:
            logging.info("Checksum OK")
        else:
            logging.warning("Checksum FAILED\nRemoving files mentioned in md5sums file")
            with open(md5_path, 'r') as f:
                for line in f:
                    try:
                        # md5sum separates hash and name with whitespace.
                        filename = line.split()
                        if len(filename) != 2:
                            logging.warning("Unexpected number of substrings [%d]: %s",
                                            len(filename), line)
                            break
                        os.remove(self.name + "/" + filename[1])
                    except OSError as e:
                        logging.warning("rm: %s" % e.strerror)
            os.remove(md5_path)
            return False
        return True
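
# Sketch (an assumption, not part of the original flow): a single Downloader can be
# driven by hand with a shared threading.Event acting as the stop flag; clearing the
# event makes the retry loop in run() exit after the current wget pass. The target
# directory URL below is hypothetical.
#
#   work = threading.Event()
#   work.set()
#   url = ("http://download.tizen.org/snapshots/tizen/common/"
#          "tizen-common_20160607.1/images/arm-wayland/common-wayland-3parts-armv7l/")
#   dwn = Downloader(work, ImageVersion(url), requests.Session(), url)
#   dwn.start()
#   dwn.join()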

class ImageDownloader(object):

    def __init__(self, url, dry, mapping):
        self.url = url
        self.dry = dry
        self.mapping = mapping
        self.img_ver = ImageVersion(url)
        self.diff_report_filename = "diff.report"
        logging.debug('snapshot number: %s', self.img_ver.get_snapshot())
        logging.debug('version number: %s', self.img_ver.get_version())
        self.urls = Crawler.get_targets(self.url)
        # Postcondition
        logging.debug("Files to download: %s", self.urls)
        if self.dry:
            logging.debug("Skipping run")
        else:
            self.create_projectconf("N/A", "N/A")
            self.run()

    def run(self):
        logging.debug("Dispatching downloaders...")
        s = requests.Session()
        downloaders = set()

        # Shared stop flag for all downloader threads; SIGINT clears it.
        work = threading.Event()
        work.set()

        def handler(signum, frame):
            logging.info("SIGINT")
            work.clear()

        signal.signal(signal.SIGINT, handler)

        if self.mapping:
            with open(self.mapping, 'r') as f:
                mapped = f.read().splitlines()
            logging.debug(mapped)
            for url in self.urls:
                for image in mapped:
                    if image in url:
                        dwn = Downloader(work, self.img_ver, s, url)
                        dwn.start()
                        downloaders.add(dwn)
                        break
        else:
            for url in self.urls:
                dwn = Downloader(work, self.img_ver, s, url)
                dwn.start()
                downloaders.add(dwn)
        for dwn in downloaders:
            dwn.join()

    def create_projectconf(self, arch, target_name):
        logging.debug("Create project.conf file for: %s %s", arch, target_name)
        if self.dry:
            return
        prjconf = [self.img_ver.get_name(), arch, target_name]
        with open("project.conf", 'w') as f:
            f.write('\n'.join(prjconf) + '\n')


def parse_arguments():
    parser = argparse.ArgumentParser(description="Image downloader for download.tizen.org")
    parser.add_argument("url", metavar='<url>', type=str,
                        help='URL of prerelease or snapshot to download images from.')
    parser.add_argument("-m", "--map", type=str,
                        help='Path to file with list of images to download')
    parser.add_argument("-d", "--dry-run", action="store_true", dest="dry",
                        help="Dry run - do not actually download images")
    parser.add_argument("-l", "--log", action="store", dest="loglevel",
                        help="Verbosity level")
    args = parser.parse_args()
    return args


def main():
    args = parse_arguments()
    if args.loglevel:
        numeric_level = getattr(logging, args.loglevel.upper(), None)
        if not isinstance(numeric_level, int):
            raise ValueError('Invalid log level: %s' % args.loglevel)
        logging.basicConfig(format='%(asctime)s %(message)s', level=numeric_level)
    logging.debug("Begin")
    ImageDownloader(args.url, args.dry, args.map)
    logging.debug("End")


if __name__ == '__main__':
    main()
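
# Example invocation (the script name and URL are illustrative only; any snapshot or
# prerelease page whose images/ subtree contains per-image MD5SUMS files should work):
#
#   ./downloader.py -l info \
#       http://download.tizen.org/snapshots/tizen/common/tizen-common_20160607.1/
#
# With -d/--dry-run the image directories are only crawled and logged, nothing is
# downloaded; with -m FILE only targets whose URL contains a line from FILE are fetched.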