diff options
author | Milan Verespej <m.verespej@partner.samsung.com> | 2019-06-18 13:40:08 +0200 |
---|---|---|
committer | Milan Verespej <m.verespej@partner.samsung.com> | 2019-06-18 18:41:41 +0200 |
commit | 2e1328a8867190f203043fb5758dc8117ba3d673 (patch) | |
tree | 485de0595f1dcd275f8c7d9d14a1137bbe3e04f6 | |
parent | f2f06700b7b9ad99c9c182fc01ee5cc0782ead78 (diff) |
Refactor http files download
The original download scripts got out of hand.
This series of commits improves the code style
(removes duplicated code, etc.)
Issue-ID: OOM-1803
Change-Id: I7b82c1711d27fe450430fbe6d962a450301b0be0
Signed-off-by: Milan Verespej <m.verespej@partner.samsung.com>
-rw-r--r-- | build/download/concurrent_downloader.py | 77 | ||||
-rw-r--r-- | build/download/downloader.py | 126 | ||||
-rw-r--r-- | build/download/http_downloader.py | 144 | ||||
-rw-r--r-- | build/download/http_file.py | 49 |
4 files changed, 396 insertions, 0 deletions
class ConcurrentDownloader(AbstractDownloader, ABC):
    """
    Downloader base class that processes the missing items in a thread pool.

    Subclasses implement ``_download_item``; ``download`` fans the missing
    items out to worker threads and reports progress through the module logger.
    """

    def __init__(self, list_type, *list_args, workers=None):
        """
        :param list_type: name of the resource type, used in log messages
        :param list_args: list descriptors forwarded unchanged to AbstractDownloader
        :param workers: value passed as ``max_workers`` to ThreadPoolExecutor
                        (None keeps the executor's own default)
        """
        super().__init__(list_type, *list_args)
        self._workers = workers

    @abstractmethod
    def _download_item(self, item):
        """
        Download item from list
        :param item: item to be downloaded (one entry of ``self._missing.items()``)
        """

    def download(self):
        """
        Download all missing items concurrently.

        Returns immediately when ``_initial_log`` reports nothing to download.
        :raises RuntimeError: when at least one item could not be downloaded
        """
        if not self._initial_log():
            return
        items_left = len(self._missing)
        try:
            # run_concurrent yields once per successfully processed item
            for _ in self.run_concurrent(self._download_item, self._missing.items()):
                items_left -= 1
                log.info('{} {} left to download.'.format(items_left, self._list_type))
        except RuntimeError:
            log.error('{} {} were not downloaded.'.format(items_left, self._list_type))
            # bare raise keeps the original traceback intact
            raise

    def run_concurrent(self, fn, iterable, *args):
        """
        Run function concurrently for every element of iterable.

        This is a generator: it yields (None) each time one call finishes
        without an exception, in completion order.
        :param fn: function to run, called as ``fn(item, *args)``
        :param iterable: iterable to process
        :param args: additional arguments for function (fn)
        :raises RuntimeError: after all calls finished, if any of them raised
        """
        with concurrent.futures.ThreadPoolExecutor(max_workers=self._workers) as executor:
            futures = [executor.submit(fn, item, *args) for item in iterable]
            failures = 0

            for future in concurrent.futures.as_completed(futures):
                error = future.exception()
                # Compare against None explicitly: an exception object may
                # legitimately evaluate as falsy (e.g. defines __len__/__bool__).
                if error is None:
                    yield
                    continue
                failures += 1
                # Keep the cause visible at this layer without changing the
                # default (INFO) output of callers that already log failures.
                log.debug('Concurrent task failed: %r', error)
            if failures:
                raise RuntimeError('One or more errors occurred')
class AbstractDownloader(ABC):
    """
    Common base for downloaders driven by list files.

    Keeps a mapping of every listed item to the value supplied with its list
    (``_data_list``) and the subset of it that is still missing (``_missing``).
    """

    def __init__(self, list_type, *list_args):
        """
        :param list_type: name of the resource type, used in log messages
        :param list_args: indexable descriptors; element 0 is the path of a
                          list file, element 1 is the value stored for every
                          item read from that file
        """
        self._list_type = list_type
        self._data_list = {item: list_arg[1] for list_arg in list_args
                           for item in self._load_list(list_arg[0])}
        # missing() stores its result in self._missing itself
        self.missing()

    @property
    def list_type(self):
        """
        Type of resource in list
        """
        return self._list_type

    @staticmethod
    def _load_list(path):
        """
        Load list from file.

        Lines are stripped; empty lines and lines starting with '#' are skipped.
        :param path: path to file
        :return: set of items in list
        """
        with open(path, 'r') as f:
            stripped = (line.strip() for line in f)
            return {item for item in stripped
                    if item and not item.startswith('#')}

    @staticmethod
    def _check_table(header, alignment_dict, data):
        """
        General method to generate table
        :param header: header of the table
        :param alignment_dict: dictionary with alignment for columns
        :param data: iterable of rows of table (rows are added in sorted order)
        :return: table formatted data (prettytable.PrettyTable)
        """
        table = prettytable.PrettyTable(header)

        for column, alignment in alignment_dict.items():
            table.align[column] = alignment

        for row in sorted(data):
            table.add_row(row)

        return table

    @abstractmethod
    def download(self):
        """
        Download resources from lists
        """

    @abstractmethod
    def _is_missing(self, item):
        """
        Check if item is not downloaded
        :param item: key of ``_data_list`` to check
        :return: truthy when the item still has to be downloaded
        """

    def missing(self):
        """
        Check for missing data (not downloaded).

        Re-evaluates every known item and refreshes ``self._missing``.
        :return: dictionary of missing items (item -> value from ``_data_list``)
        """
        self._missing = {item: dst for item, dst in self._data_list.items()
                         if self._is_missing(item)}
        return self._missing

    def _log_existing(self):
        """
        Log items that are already downloaded.
        """
        # Sorted so the log output is stable between runs (set order is not).
        for item in sorted(self._merged_lists()):
            if item not in self._missing:
                log.info('File or directory present: {}'.format(item))

    def _merged_lists(self):
        """
        Get all item names in one set
        :return: set with all items
        """
        return set(self._data_list)

    def _initial_log(self):
        """
        Log initial info for download.
        :return: True if download is necessary False if everything is already downloaded
        """
        self._log_existing()
        items_left = len(self._missing)
        class_name = type(self).__name__
        if not items_left:
            log.info('{}: Everything seems to be present no download necessary.'.format(class_name))
            return False
        log.info('{}: Initializing download {} {} are not present.'.format(class_name, items_left,
                                                                           self._list_type))
        return True
class HttpDownloader(ConcurrentDownloader):
    """
    Concurrent downloader for plain http(s) files.

    Every list entry is a URI (scheme optional); the file is stored under the
    destination directory in a path derived from the URI without its scheme.
    """

    def __init__(self, *list_args, workers=None):
        """
        :param list_args: list descriptors forwarded to ConcurrentDownloader
        :param workers: maximum number of parallel downloads
        """
        super().__init__('http files', *list_args, workers=workers)

    @property
    def check_table(self):
        """
        Table with information what items from lists are downloaded
        """
        # refresh the missing set so the table reflects the current disk state
        self.missing()
        header = ['Name', 'Downloaded']
        rows = ((item, item not in self._missing) for item in self._data_list)
        return self._check_table(header, {'Name': 'l'}, rows)

    @staticmethod
    def _dst_path(dst_dir, uri):
        """
        Build the local path of a downloaded file.
        :param dst_dir: destination directory
        :param uri: uri of the file; everything up to the last '//' is dropped
        :return: path as string
        """
        return '{}/{}'.format(dst_dir, uri.rsplit('//')[-1])

    @staticmethod
    def _make_get_request(url, timeout=60):
        """
        Run http get request
        :param url: url to request
        :param timeout: seconds passed to requests as ``timeout`` so that a
                        stalled server cannot block a worker thread forever
        :return: requests.Response
        :raises requests.HTTPError: on 4xx/5xx responses (raise_for_status)
        """
        req = requests.get(url, timeout=timeout)
        req.raise_for_status()
        return req

    def _is_missing(self, item):
        """
        Check if item is missing (not downloaded)
        :param item: item to check
        :return: boolean
        """
        return not os.path.isfile(self._dst_path(self._data_list[item], item))

    @retry(stop_max_attempt_number=5, wait_fixed=2000)
    def _get_file(self, file_uri):
        """
        Get http file from uri
        :param file_uri: uri of the file, 'http://' is assumed when no
                         http(s) scheme is given
        :return: file content
        """
        # Test for the full scheme: a bare startswith('http') would also match
        # scheme-less hosts such as 'httpd.apache.org/...'.
        if not file_uri.startswith(('http://', 'https://')):
            file_uri = 'http://' + file_uri
        file_req = self._make_get_request(file_uri)
        return file_req.content

    def _download_item(self, item):
        """
        Download http file
        :param item: http file to be downloaded (tuple: (uri, dst_dir))
        """
        uri, dst_dir = item[0], item[1]
        log.info('Downloading: {}'.format(uri))
        dst_path = self._dst_path(dst_dir, uri)
        try:
            f = http_file.HttpFile(uri, self._get_file(uri), dst_path)
            f.save_to_file()
        except Exception as err:
            log.exception('Error downloading: {}: {}'.format(uri, err))
            # do not leave a partially written file behind; it would be
            # treated as already downloaded on the next run
            if os.path.isfile(dst_path):
                os.remove(dst_path)
            raise
        log.info('Downloaded: {}'.format(f.name))


def run_cli():
    """
    Run as cli tool

    Exits with status 0 after printing the table in check mode and with
    status 1 when the download raised RuntimeError.
    """
    parser = argparse.ArgumentParser(description='Download http files from list')
    parser.add_argument('file_list', metavar='file-list',
                        help='File with list of http files to download')
    parser.add_argument('--output-dir', '-o', default=os.getcwd(),
                        help='Destination directory for saving')
    parser.add_argument('--check', '-c', action='store_true', default=False,
                        help='Check mode')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Turn on debug output')
    parser.add_argument('--workers', type=int, default=None,
                        help='Set maximum workers for parallel download (default: cores * 5)')

    args = parser.parse_args()

    if args.debug:
        logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    else:
        logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s')

    downloader = HttpDownloader([args.file_list, args.output_dir], workers=args.workers)

    if args.check:
        log.info('Check mode. No download will be executed.')
        log.info(downloader.check_table)
        sys.exit(0)

    timer_start = timeit.default_timer()
    try:
        downloader.download()
    except RuntimeError:
        sys.exit(1)
    finally:
        # runs on success and on failure (SystemExit passes through finally)
        log.info('Downloading finished in {}'.format(
            datetime.timedelta(seconds=timeit.default_timer() - timer_start)))


if __name__ == '__main__':
    run_cli()
class HttpFile:
    """
    File to be saved

    Holds the downloaded content together with its name and the local path
    it should be written to.
    """

    def __init__(self, name, content, dst):
        """
        :param name: name of the file (returned by the ``name`` property)
        :param content: bytes-like content, written in binary mode
        :param dst: destination path of the file on disk
        """
        self._name = name
        self._content = content
        self._dst = dst

    @property
    def name(self):
        """
        Name of the file
        """
        return self._name

    def save_to_file(self):
        """
        Save it to disk

        Missing parent directories of the destination are created first.
        :raises OSError: when the directory or file cannot be created/written
        """
        dst_dir = os.path.dirname(self._dst)
        # dirname is '' for a bare file name; makedirs('') would raise.
        if dst_dir:
            # exist_ok avoids the check-then-create race when several
            # threads save files into the same new directory.
            os.makedirs(dst_dir, exist_ok=True)
        with open(self._dst, 'wb') as dst_file:
            dst_file.write(self._content)