diff options
Diffstat (limited to 'imagescanner/imagescanner/tasks.py')
-rw-r--r-- | imagescanner/imagescanner/tasks.py | 156 |
1 files changed, 122 insertions, 34 deletions
diff --git a/imagescanner/imagescanner/tasks.py b/imagescanner/imagescanner/tasks.py index 3610373..61abf15 100644 --- a/imagescanner/imagescanner/tasks.py +++ b/imagescanner/imagescanner/tasks.py @@ -42,19 +42,23 @@ import re import hashlib import datetime from subprocess import run +from xml.etree import ElementTree from celery import Celery import requests -from . import STATUSFILE, LOGS_PATH +from . import config from .in_temp_dir import in_temp_dir - +from .regexdispatch import regexdispatch celery_app = Celery( broker='redis://redis', backend='redis://redis', ) -repo_re = re.compile(r'.*\.git$') -direct_re = re.compile(r'http.*\.(?:img|iso|qcow2)(?:\.gz)?$') -image_re = re.compile(r'.*\.(?:img|iso|qcow2)(?:\.gz)?$') + +# direct_re will match URLs pointing directly to an image to download, over +# http and https connections, and will capture the hostname and filename in +# named groups. This includes URLs to S3 and RadosGW endpoints. +# +image_re = re.compile(r'.*\.(?:img|iso|qcow2?)(?:\.gz)?$') SLACK_TOKEN = os.getenv('SLACK_TOKEN') DOMAIN = os.getenv('DOMAIN') @@ -70,7 +74,8 @@ def sha256(path): @celery_app.task(queue='scans', ignore_result=True) @in_temp_dir() -def request_scan(source, path, recipients): +def request_scan(source, path, recipients=None, jenkins_job_name=None, + checklist_uuid=None): """Retrieve and scan all partitions of (an) image(s), and notify of the results. @@ -88,6 +93,13 @@ def request_scan(source, path, recipients): complete. Currently, this may include Slack usernames and Slack channels. + jenkins_job_name: + The name of the jenkins job that should be built to process the scan + results. + + checklist_uuid: + The UUID of the checklist that should be passed to the jenkins job. + This function assumes the current working directory is a safe workarea for retrieving and manipulating images, but is decorated with in_temp_dir which changes to a new temporary directory upon invocation. @@ -96,7 +108,7 @@ def request_scan(source, path, recipients): # TODO printing to a status file is archaic and messy; let's use the python # logging framework or storing status in redis instead. - with STATUSFILE.open('w') as statusfile: + with config.STATUSFILE.open('w') as statusfile: print( "Processing request {source} {path} in {workspace}".format( @@ -105,7 +117,8 @@ def request_scan(source, path, recipients): flush=True) for image in retrieve_images(source, path): - print("- Image file: {}...".format(image), + print( + "- Image file: {}...".format(image), file=statusfile, flush=True) if not os.path.exists(image): raise ValueError("Path not found: {}".format(image)) @@ -113,12 +126,12 @@ def request_scan(source, path, recipients): print("-- Checksumming...", file=statusfile, flush=True) checksum = sha256(image) - print("-- Scanning...", - file=statusfile, flush=True) - logfile = LOGS_PATH / 'SecurityValidation-{}.txt'.format(checksum) + print("-- Scanning...", file=statusfile, flush=True) + logfile = config.LOGS_PATH / 'SecurityValidation-{}.txt'.format( + checksum) - #for partition in image_partitions(): - # result = scan_partition(partition) + # for partition in image_partitions(): + # result = scan_partition(partition) with open(logfile, 'w') as fd: print(datetime.datetime.utcnow().ctime(), "UTC", file=fd) print("Launching image scan for {} from {} {}".format( @@ -130,25 +143,55 @@ def request_scan(source, path, recipients): stderr=fd, ) - print("-- Scheduling notification (exit code:{})...".format(result.returncode), - file=statusfile, flush=True) + if recipients: + print( + "-- Scheduling notification (exit code: {})..." + .format(result.returncode), file=statusfile, flush=True) + + slack_notify.delay( + status="Success" if result.returncode == 0 else "Failure", + source=source, + filename=image, + checksum=checksum, + recipients=recipients, + ) - slack_notify.delay( - status="Success" if result.returncode == 0 else "Failure", - source=source, - filename=image, - checksum=checksum, - recipients=recipients, - ) + elif checklist_uuid and jenkins_job_name: + print( + "-- Triggering Jenkins job {} for checklist {}" + .format(jenkins_job_name, checklist_uuid), file=statusfile, + flush=True) + + jenkins_notify.delay( + jenkins_job_name, + status=result.returncode, + checksum=checksum, + checklist_uuid=checklist_uuid, + ) + + else: + print( + "-- Skipping notification (exit code was: {})." + .format(result.returncode), file=statusfile, flush=True) print("-- Done.", file=statusfile, flush=True) print("- All images processed.", file=statusfile, flush=True) + +@regexdispatch def retrieve_images(source, path): """Generate the filenames of one or multiple disk images as they are retrieved from _source_. + Source may be one of several types of source, so we dispatch to an + appropriate function to deal with it: + + - a git url to a repo containing disk images + - a normal https url directly to a single disk image + - an https url directly to a single disk image in a radosgw (s3) bucket + - an https url to a radosgw (s3) bucket containing disk images + See the docstring for request_scan for documentation of the source and path arguments. @@ -156,15 +199,11 @@ def retrieve_images(source, path): retrieving and manipulating images. """ - if repo_re.match(source): - return retrieve_images_git(source, path) - elif direct_re.match(source): - return retrieve_image_direct(source) - else: - raise ValueError("Unknown source format {}".format(source)) + raise ValueError("Unknown source type %s" % source) -def retrieve_images_git(source, path): +@retrieve_images.register(r'.*\.git$') +def _ri_git(source, path, **kwargs): run(['/usr/bin/git', 'clone', '--depth', '1', '--single-branch', @@ -188,22 +227,57 @@ def retrieve_images_git(source, path): yield os.path.join(root, name) -def retrieve_image_direct(source): - filename = re.search(r'[^/]*$', source).group(0) +# FIXME this regex won't properly detect URLs with query-strings. +@retrieve_images.register(r'''(?x) # this is a "verbose" regex + https?:// # match an http or https url + (?P<hostname> # capture the hostname: + [^/:]* # anything up to the first / or : + ) + .* # any number of path components + /(?P<filename> # capture the filename after the last / + [^/]* # anything not a / + \.(?:img|iso|qcow2?) # with one of these three extensions + (?:\.gz)? # optionally also compressed + )$''') +def _ri_direct(source, path=None, hostname=None, filename=None, **kwargs): + auth = config.AUTHS.get(hostname) with open(filename, 'wb') as fd: - r = requests.get(source, stream=True) + r = requests.get(source, stream=True, auth=auth) for chunk in r.iter_content(chunk_size=4096): fd.write(chunk) yield filename -# FIXME the slack notification should go into a different queue than the image -# requests so they don't get blocked by the scans. +@retrieve_images.register(r'''(?x) # this is a "verbose" regex + https?:// # match an http or https url + (?P<hostname> # capture the hostname: + [^/:]* # anything up to the first / or : + ) + .* # any number of path components + /$ # ending with a slash + ''') +def _ri_bucket(source, path=None, hostname=None, filename=None, **kwargs): + """We assume that an HTTP(s) URL ending in / is a radosgw bucket.""" + auth = config.AUTHS.get(hostname) + # We could request ?format=json but the output is malformed; all but one + # filename is truncated. + response = requests.get(source, {'format': 'xml'}, auth=auth) + keys = ElementTree.fromstring(response.text).iter( + '{http://s3.amazonaws.com/doc/2006-03-01/}Key') + filenames = [x.text for x in keys] + for filename in filenames: + if image_re.match(filename): + yield from retrieve_images(source + filename) + + @celery_app.task(ignore_result=True) def slack_notify(status, source, filename, checksum, recipients): if not SLACK_TOKEN: print("No Slack token defined; skipping notification.") return + if not recipients: + print("No recipients specified; skipping notification.") + return # TODO replace this handrolled code with a nice slack client library @@ -234,3 +308,17 @@ def slack_notify(status, source, filename, checksum, recipients): "https://hooks.slack.com/services/%s" % SLACK_TOKEN, json=dict(payload, channel=recipient), ) + + +@celery_app.task(ignore_result=True) +def jenkins_notify(name, status, checksum, checklist_uuid): + # The frontend does not need the jenkins library, so we perform the import + # it from within the worker task. + from jenkins import Jenkins + server = Jenkins(**config.JENKINS) + logurl = "http://{}/imagescanner/result/{}".format(DOMAIN, checksum) + server.build_job(name, { + "checklist_uuid": checklist_uuid, + "status": status, + "logurl": logurl, + }) |