summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMilan Verespej <m.verespej@partner.samsung.com>2019-06-18 13:40:08 +0200
committerMilan Verespej <m.verespej@partner.samsung.com>2019-06-18 18:41:41 +0200
commit2e1328a8867190f203043fb5758dc8117ba3d673 (patch)
tree485de0595f1dcd275f8c7d9d14a1137bbe3e04f6
parentf2f06700b7b9ad99c9c182fc01ee5cc0782ead78 (diff)
Refactor http files download
Original download scripts got out of hand. This series of commits improves style of code (code duplicates, etc.) Issue-ID: OOM-1803 Change-Id: I7b82c1711d27fe450430fbe6d962a450301b0be0 Signed-off-by: Milan Verespej <m.verespej@partner.samsung.com>
-rw-r--r--build/download/concurrent_downloader.py77
-rw-r--r--build/download/downloader.py126
-rw-r--r--build/download/http_downloader.py144
-rw-r--r--build/download/http_file.py49
4 files changed, 396 insertions, 0 deletions
diff --git a/build/download/concurrent_downloader.py b/build/download/concurrent_downloader.py
new file mode 100644
index 00000000..c84dac86
--- /dev/null
+++ b/build/download/concurrent_downloader.py
@@ -0,0 +1,77 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# COPYRIGHT NOTICE STARTS HERE
+
+# Copyright 2019 © Samsung Electronics Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# COPYRIGHT NOTICE ENDS HERE
+
+import concurrent.futures
+import logging
+from abc import ABC, abstractmethod
+
+from downloader import AbstractDownloader
+
+log = logging.getLogger(__name__)
+
+
+class ConcurrentDownloader(AbstractDownloader, ABC):
+    """
+    Base class for downloaders that fetch their missing items in a thread pool.
+
+    Subclasses implement _download_item; list loading and the bookkeeping of
+    missing items are inherited from AbstractDownloader.
+    """
+
+    def __init__(self, list_type, *list_args, workers=None):
+        """
+        :param list_type: name of the resource type, used in log messages
+        :param list_args: list arguments handed over to AbstractDownloader
+        :param workers: max_workers given to ThreadPoolExecutor
+                        (None leaves the choice to the executor)
+        """
+        super().__init__(list_type, *list_args)
+        self._workers = workers
+
+    @abstractmethod
+    def _download_item(self, item):
+        """
+        Download item from list
+        :param item: item to be downloaded, one (key, value) pair of the
+                     dictionary of missing items
+        """
+
+    def download(self):
+        """
+        Download all missing items concurrently.
+        Returns without downloading when _initial_log reports nothing is missing.
+        :raises RuntimeError: if at least one item could not be downloaded
+        """
+        if not self._initial_log():
+            return
+        items_left = len(self._missing)
+        try:
+            for _ in self.run_concurrent(self._download_item, self._missing.items()):
+                # One yield per successfully processed item.
+                items_left -= 1
+                log.info('{} {} left to download.'.format(items_left, self._list_type))
+        except RuntimeError:
+            log.error('{} {} were not downloaded.'.format(items_left, self._list_type))
+            # Bare raise re-raises the active exception as it is.
+            raise
+
+    def run_concurrent(self, fn, iterable, *args):
+        """
+        Run function concurrently for iterable
+        This is a generator: it yields (None) once for every call of fn that
+        finished without raising.
+        :param fn: function to run
+        :param iterable: iterable to process
+        :param args: arguments for function (fn)
+        :raises RuntimeError: after all calls finished, if any of them raised;
+                              the first failure seen is chained as its cause
+        """
+        with concurrent.futures.ThreadPoolExecutor(max_workers=self._workers) as executor:
+            futures = [executor.submit(fn, item, *args) for item in iterable]
+            first_error = None
+
+            for future in concurrent.futures.as_completed(futures):
+                error = future.exception()
+                if error is None:
+                    yield
+                elif first_error is None:
+                    # Keep going so the remaining items are still processed,
+                    # but remember why the run failed.
+                    first_error = error
+        if first_error is not None:
+            raise RuntimeError('One or more errors occurred') from first_error
diff --git a/build/download/downloader.py b/build/download/downloader.py
new file mode 100644
index 00000000..b8e9ed50
--- /dev/null
+++ b/build/download/downloader.py
@@ -0,0 +1,126 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# COPYRIGHT NOTICE STARTS HERE
+
+# Copyright 2019 © Samsung Electronics Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# COPYRIGHT NOTICE ENDS HERE
+
+import logging
+from abc import ABC, abstractmethod
+
+import prettytable
+
+log = logging.getLogger(__name__)
+
+
+class AbstractDownloader(ABC):
+    """
+    Common base of downloaders: loads item lists and keeps track of the
+    items that are not downloaded yet.
+    """
+
+    def __init__(self, list_type, *list_args):
+        """
+        :param list_type: name of the resource type (used in log messages)
+        :param list_args: sequences whose index 0 is the path of a list file
+                          and whose index 1 is the value stored for every
+                          item of that list
+        """
+        self._list_type = list_type
+        entries = {}
+        for list_arg in list_args:
+            for item in self._load_list(list_arg[0]):
+                entries[item] = list_arg[1]
+        self._data_list = entries
+        self._missing = self.missing()
+
+    @property
+    def list_type(self):
+        """
+        Name of the kind of resource the lists contain
+        """
+        return self._list_type
+
+    @staticmethod
+    def _load_list(path):
+        """
+        Read a list file.
+        Lines are stripped; empty lines and lines starting with '#' are skipped.
+        :param path: path to file
+        :return: set of items in list
+        """
+        items = set()
+        with open(path, 'r') as list_file:
+            for line in list_file:
+                entry = line.strip()
+                if entry and not entry.startswith('#'):
+                    items.add(entry)
+        return items
+
+    @staticmethod
+    def _check_table(header, alignment_dict, data):
+        """
+        Build a table from rows of data
+        :param header: header of the table
+        :param alignment_dict: dictionary with alignment for columns
+        :param data: iterable of rows of table (added in sorted order)
+        :return: table formatted data
+        """
+        table = prettytable.PrettyTable(header)
+
+        for column, alignment in alignment_dict.items():
+            table.align[column] = alignment
+
+        rows = list(data)
+        rows.sort()
+        for row in rows:
+            table.add_row(row)
+
+        return table
+
+    @abstractmethod
+    def download(self):
+        """
+        Download the resources named in the lists
+        """
+
+    @abstractmethod
+    def _is_missing(self, item):
+        """
+        Tell whether item still has to be downloaded
+        """
+
+    def missing(self):
+        """
+        Recompute which items are not downloaded and remember the result.
+        :return: dictionary of missing items
+        """
+        still_missing = {}
+        for item, dst in self._data_list.items():
+            if self._is_missing(item):
+                still_missing[item] = dst
+        self._missing = still_missing
+        return still_missing
+
+    def _log_existing(self):
+        """
+        Write a log line for every item that is not missing.
+        """
+        for item in self._merged_lists():
+            if item in self._missing:
+                continue
+            log.info('File or directory present: {}'.format(item))
+
+    def _merged_lists(self):
+        """
+        Collect the names of all items
+        :return: set with all items
+        """
+        return set(self._data_list)
+
+    def _initial_log(self):
+        """
+        Log the state before a download starts.
+        :return: True if download is necessary False if everything is already downloaded
+        """
+        self._log_existing()
+        class_name = type(self).__name__
+        items_left = len(self._missing)
+        if items_left:
+            log.info('{}: Initializing download {} {} are not present.'.format(class_name, items_left,
+                                                                               self._list_type))
+            return True
+        log.info('{}: Everything seems to be present no download necessary.'.format(class_name))
+        return False
diff --git a/build/download/http_downloader.py b/build/download/http_downloader.py
new file mode 100644
index 00000000..69adc4dd
--- /dev/null
+++ b/build/download/http_downloader.py
@@ -0,0 +1,144 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# COPYRIGHT NOTICE STARTS HERE
+
+# Copyright 2019 © Samsung Electronics Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# COPYRIGHT NOTICE ENDS HERE
+
+import argparse
+import datetime
+import logging
+import os
+import sys
+import timeit
+
+import requests
+from retrying import retry
+
+import http_file
+from concurrent_downloader import ConcurrentDownloader
+
+log = logging.getLogger(__name__)
+
+
+class HttpDownloader(ConcurrentDownloader):
+    """
+    Concurrent downloader of files reachable over http.
+    """
+
+    # Seconds requests waits for the server before giving up. Without a
+    # timeout a stalled server would block a worker thread forever.
+    _REQUEST_TIMEOUT = 60
+
+    def __init__(self, *list_args, workers=None):
+        """
+        :param list_args: list arguments handed over to the base class
+        :param workers: maximum number of worker threads
+        """
+        super().__init__('http files', *list_args, workers=workers)
+
+    @property
+    def check_table(self):
+        """
+        Table with information what items from lists are downloaded
+        The missing items are recomputed before the table is built.
+        """
+        self.missing()
+        header = ['Name', 'Downloaded']
+        return self._check_table(header, {'Name': 'l'},
+                                 ((item, item not in self._missing) for item
+                                  in self._data_list))
+
+    @staticmethod
+    def _make_get_request(url, timeout=_REQUEST_TIMEOUT):
+        """
+        Run http get request
+        :param url: url to request
+        :param timeout: timeout in seconds passed to requests.get
+        :return: requests.Response
+        :raises requests.HTTPError: when raise_for_status rejects the response
+        """
+        req = requests.get(url, timeout=timeout)
+        req.raise_for_status()
+        return req
+
+    @staticmethod
+    def _destination_path(dst_dir, uri):
+        """
+        Build the path a file from uri is stored at.
+        Only the part of the uri after its last '//' is kept, which drops a
+        leading scheme such as 'http://'.
+        :param dst_dir: destination directory
+        :param uri: uri of the file
+        :return: path of the file below dst_dir
+        """
+        return '{}/{}'.format(dst_dir, uri.rsplit('//')[-1])
+
+    def _is_missing(self, item):
+        """
+        Check if item is missing (not downloaded)
+        :param item: item to check
+        :return: boolean
+        """
+        return not os.path.isfile(self._destination_path(self._data_list[item], item))
+
+    @retry(stop_max_attempt_number=5, wait_fixed=2000)
+    def _get_file(self, file_uri):
+        """
+        Get http file from uri
+        The retry decorator allows up to 5 attempts with a fixed wait of
+        2000 ms between them.
+        :param file_uri: uri of the file; 'http://' is prepended when it has
+                         no http(s) scheme
+        :return: file content
+        """
+        # Test for the complete scheme: a bare 'http' prefix would also match
+        # scheme-less hosts such as 'httpd.example.org/file'.
+        if not file_uri.startswith(('http://', 'https://')):
+            file_uri = 'http://' + file_uri
+        file_req = self._make_get_request(file_uri)
+        return file_req.content
+
+    def _download_item(self, item):
+        """
+        Download http file
+        A partially written destination file is removed when the download fails.
+        :param item: http file to be downloaded (tuple: (uri, dst_dir))
+        """
+        uri, dst_dir = item
+        log.info('Downloading: {}'.format(uri))
+        dst_path = self._destination_path(dst_dir, uri)
+        try:
+            f = http_file.HttpFile(uri, self._get_file(uri), dst_path)
+            f.save_to_file()
+        except Exception as err:
+            log.exception('Error downloading: {}: {}'.format(uri, err))
+            if os.path.isfile(dst_path):
+                os.remove(dst_path)
+            # Re-raise so the failure is reported for this item.
+            raise
+        log.info('Downloaded: {}'.format(f.name))
+
+
+def run_cli():
+    """
+    Command line entry point.
+
+    Parses the arguments, configures logging to stdout and then either logs
+    the check table (check mode, exit code 0) or runs the download. The
+    process exits with code 1 when the download raises RuntimeError.
+    """
+    cli = argparse.ArgumentParser(description='Download http files from list')
+    cli.add_argument('file_list', metavar='file-list',
+                     help='File with list of http files to download')
+    cli.add_argument('--output-dir', '-o', default=os.getcwd(),
+                     help='Destination directory for saving')
+    cli.add_argument('--check', '-c', action='store_true', default=False,
+                     help='Check mode')
+    cli.add_argument('--debug', action='store_true', default=False,
+                     help='Turn on debug output')
+    cli.add_argument('--workers', type=int, default=None,
+                     help='Set maximum workers for parallel download (default: cores * 5)')
+
+    options = cli.parse_args()
+
+    # Debug output keeps the default record format, normal output only
+    # shows the message itself.
+    log_config = {'stream': sys.stdout, 'level': logging.DEBUG}
+    if not options.debug:
+        log_config.update(level=logging.INFO, format='%(message)s')
+    logging.basicConfig(**log_config)
+
+    downloader = HttpDownloader([options.file_list, options.output_dir],
+                                workers=options.workers)
+
+    if options.check:
+        log.info('Check mode. No download will be executed.')
+        log.info(downloader.check_table)
+        sys.exit(0)
+
+    started_at = timeit.default_timer()
+    try:
+        downloader.download()
+    except RuntimeError:
+        sys.exit(1)
+    finally:
+        # Runs on success as well as on failure.
+        elapsed = datetime.timedelta(seconds=timeit.default_timer() - started_at)
+        log.info('Downloading finished in {}'.format(elapsed))
+
+
+if __name__ == '__main__':
+    run_cli()
diff --git a/build/download/http_file.py b/build/download/http_file.py
new file mode 100644
index 00000000..397f0930
--- /dev/null
+++ b/build/download/http_file.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+
+# COPYRIGHT NOTICE STARTS HERE
+
+# Copyright 2019 © Samsung Electronics Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# COPYRIGHT NOTICE ENDS HERE
+
+import os
+
+
+class HttpFile:
+    """
+    File to be saved
+
+    Holds the name, the content and the destination path of one file.
+    """
+
+    def __init__(self, name, content, dst):
+        """
+        :param name: name of the file (returned by the name property)
+        :param content: content of the file; written in binary mode, so it
+                        has to be bytes-like
+        :param dst: path the content is written to
+        """
+        self._name = name
+        self._content = content
+        self._dst = dst
+
+    @property
+    def name(self):
+        """
+        Name of the file
+        """
+        return self._name
+
+    def save_to_file(self):
+        """
+        Save it to disk
+        Missing parent directories of the destination are created first.
+        """
+        dst_dir = os.path.dirname(self._dst)
+        # dirname is '' for a bare file name and makedirs('') would raise.
+        if dst_dir:
+            # exist_ok: another download thread may create the same directory
+            # between a check and the creation, which must not be an error.
+            os.makedirs(dst_dir, exist_ok=True)
+        with open(self._dst, 'wb') as dst_file:
+            dst_file.write(self._content)