1 files changed, 122 insertions, 34 deletions
diff --git a/imagescanner/imagescanner/tasks.py b/imagescanner/imagescanner/tasks.py
index 3610373..61abf15 100644
--- a/imagescanner/imagescanner/tasks.py
+++ b/imagescanner/imagescanner/tasks.py
@@ -42,19 +42,23 @@ import re
 import hashlib
 import datetime
 from subprocess import run
+from xml.etree import ElementTree
 from celery import Celery
 import requests
-from . import STATUSFILE, LOGS_PATH
+from . import config
 from .in_temp_dir import in_temp_dir
-
+from .regexdispatch import regexdispatch
 
 celery_app = Celery(
     broker='redis://redis',
     backend='redis://redis',
     )
-repo_re = re.compile(r'.*\.git$')
-direct_re = re.compile(r'http.*\.(?:img|iso|qcow2)(?:\.gz)?$')
-image_re = re.compile(r'.*\.(?:img|iso|qcow2)(?:\.gz)?$')
+
+# image_re matches file names that end in an image extension (.img, .iso,
+# .qcow or .qcow2), optionally followed by .gz. Direct-download URLs are no
+# longer matched by a module-level direct_re; that pattern is now registered
+# on retrieve_images (see _ri_direct).
+image_re = re.compile(r'.*\.(?:img|iso|qcow2?)(?:\.gz)?$')
 SLACK_TOKEN = os.getenv('SLACK_TOKEN')
 DOMAIN = os.getenv('DOMAIN')
 
@@ -70,7 +74,8 @@ def sha256(path):
 
 @celery_app.task(queue='scans', ignore_result=True)
 @in_temp_dir()
-def request_scan(source, path, recipients):
+def request_scan(source, path, recipients=None, jenkins_job_name=None,
+                 checklist_uuid=None):
     """Retrieve and scan all partitions of (an) image(s), and notify of the
     results.
 
@@ -88,6 +93,13 @@ def request_scan(source, path, recipients):
         complete. Currently, this may include Slack usernames and Slack
         channels.
 
+    jenkins_job_name:
+        The name of the jenkins job that should be built to process the scan
+        results.
+
+    checklist_uuid:
+        The UUID of the checklist that should be passed to the jenkins job.
+
     This function assumes the current working directory is a safe workarea for
     retrieving and manipulating images, but is decorated with in_temp_dir which
     changes to a new temporary directory upon invocation.
@@ -96,7 +108,7 @@ def request_scan(source, path, recipients):
 
     # TODO printing to a status file is archaic and messy; let's use the python
     # logging framework or storing status in redis instead.
-    with STATUSFILE.open('w') as statusfile:
+    with config.STATUSFILE.open('w') as statusfile:
 
         print(
             "Processing request {source} {path} in {workspace}".format(
@@ -105,7 +117,8 @@ def request_scan(source, path, recipients):
             flush=True)
 
         for image in retrieve_images(source, path):
-            print("- Image file: {}...".format(image),
+            print(
+                "- Image file: {}...".format(image),
                 file=statusfile, flush=True)
             if not os.path.exists(image):
                 raise ValueError("Path not found: {}".format(image))
@@ -113,12 +126,12 @@ def request_scan(source, path, recipients):
             print("-- Checksumming...", file=statusfile, flush=True)
             checksum = sha256(image)
 
-            print("-- Scanning...",
-                file=statusfile, flush=True)
-            logfile = LOGS_PATH / 'SecurityValidation-{}.txt'.format(checksum)
+            print("-- Scanning...", file=statusfile, flush=True)
+            logfile = config.LOGS_PATH / 'SecurityValidation-{}.txt'.format(
+                checksum)
 
-            #for partition in image_partitions():
-            #    result = scan_partition(partition)
+            # for partition in image_partitions():
+            #     result = scan_partition(partition)
             with open(logfile, 'w') as fd:
                 print(datetime.datetime.utcnow().ctime(), "UTC", file=fd)
                 print("Launching image scan for {} from {} {}".format(
@@ -130,25 +143,55 @@ def request_scan(source, path, recipients):
                     stderr=fd,
                     )
 
-            print("-- Scheduling notification (exit code:{})...".format(result.returncode),
-                file=statusfile, flush=True)
+            if recipients:
+                print(
+                    "-- Scheduling notification (exit code: {})..."
+                    .format(result.returncode), file=statusfile, flush=True)
+
+                slack_notify.delay(
+                    status="Success" if result.returncode == 0 else "Failure",
+                    source=source,
+                    filename=image,
+                    checksum=checksum,
+                    recipients=recipients,
+                    )
 
-            slack_notify.delay(
-                status="Success" if result.returncode == 0 else "Failure",
-                source=source,
-                filename=image,
-                checksum=checksum,
-                recipients=recipients,
-                )
+            elif checklist_uuid and jenkins_job_name:
+                print(
+                    "-- Triggering Jenkins job {} for checklist {}"
+                    .format(jenkins_job_name, checklist_uuid), file=statusfile,
+                    flush=True)
+
+                jenkins_notify.delay(
+                    jenkins_job_name,
+                    status=result.returncode,
+                    checksum=checksum,
+                    checklist_uuid=checklist_uuid,
+                    )
+
+            else:
+                print(
+                    "-- Skipping notification (exit code was: {})."
+                    .format(result.returncode), file=statusfile, flush=True)
 
             print("-- Done.", file=statusfile, flush=True)
 
         print("- All images processed.", file=statusfile, flush=True)
 
+
+@regexdispatch
 def retrieve_images(source, path):
     """Generate the filenames of one or multiple disk images as they are
     retrieved from _source_.
 
+    Source may be one of several types of source, so we dispatch to an
+    appropriate function to deal with it:
+
+    - a git url to a repo containing disk images
+    - a normal https url directly to a single disk image
+    - an https url directly to a single disk image in a radosgw (s3) bucket
+    - an https url to a radosgw (s3) bucket containing disk images
+
     See the docstring for request_scan for documentation of the source and path
     arguments.
 
@@ -156,15 +199,11 @@ def retrieve_images(source, path):
     retrieving and manipulating images.
 
     """
-    if repo_re.match(source):
-        return retrieve_images_git(source, path)
-    elif direct_re.match(source):
-        return retrieve_image_direct(source)
-    else:
-        raise ValueError("Unknown source format {}".format(source))
+    raise ValueError("Unknown source type %s" % source)
 
 
-def retrieve_images_git(source, path):
+@retrieve_images.register(r'.*\.git$')
+def _ri_git(source, path, **kwargs):
     run(['/usr/bin/git', 'clone',
          '--depth', '1',
          '--single-branch',
@@ -188,22 +227,57 @@ def retrieve_images_git(source, path):
                 yield os.path.join(root, name)
 
 
-def retrieve_image_direct(source):
-    filename = re.search(r'[^/]*$', source).group(0)
+# FIXME this regex won't properly detect URLs with query-strings.
+@retrieve_images.register(r'''(?x)  # this is a "verbose" regex
+    https?://                   # match an http or https url
+    (?P<hostname>               # capture the hostname:
+        [^/:]*                  #   anything up to the first / or :
+    )
+    .*                          # any number of path components
+    /(?P<filename>              # capture the filename after the last /
+        [^/]*                   #   anything not a /
+        \.(?:img|iso|qcow2?)    #   with one of these three extensions
+        (?:\.gz)?               #   optionally also compressed
+    )$''')
+def _ri_direct(source, path=None, hostname=None, filename=None, **kwargs):
+    auth = config.AUTHS.get(hostname)
     with open(filename, 'wb') as fd:
-        r = requests.get(source, stream=True)
+        r = requests.get(source, stream=True, auth=auth)
         for chunk in r.iter_content(chunk_size=4096):
             fd.write(chunk)
     yield filename
 
 
-# FIXME the slack notification should go into a different queue than the image
-# requests so they don't get blocked by the scans.
+@retrieve_images.register(r'''(?x)  # this is a "verbose" regex
+    https?://                   # match an http or https url
+    (?P<hostname>               # capture the hostname:
+        [^/:]*                  #   anything up to the first / or :
+    )
+    .*                          # any number of path components
+    /$                          # ending with a slash
+    ''')
+def _ri_bucket(source, path=None, hostname=None, filename=None, **kwargs):
+    """We assume that an HTTP(s) URL ending in / is a radosgw bucket."""
+    auth = config.AUTHS.get(hostname)
+    # We could request ?format=json but the output is malformed; all but one
+    # filename is truncated.
+    response = requests.get(source, {'format': 'xml'}, auth=auth)
+    keys = ElementTree.fromstring(response.text).iter(
+        '{http://s3.amazonaws.com/doc/2006-03-01/}Key')
+    filenames = [x.text for x in keys]
+    for filename in filenames:
+        if image_re.match(filename):
+            yield from retrieve_images(source + filename)
+
+
 @celery_app.task(ignore_result=True)
 def slack_notify(status, source, filename, checksum, recipients):
     if not SLACK_TOKEN:
         print("No Slack token defined; skipping notification.")
         return
+    if not recipients:
+        print("No recipients specified; skipping notification.")
+        return
 
     # TODO replace this handrolled code with a nice slack client library
 
@@ -234,3 +308,17 @@ def slack_notify(status, source, filename, checksum, recipients):
             "https://hooks.slack.com/services/%s" % SLACK_TOKEN,
             json=dict(payload, channel=recipient),
             )
+
+
+@celery_app.task(ignore_result=True)
+def jenkins_notify(name, status, checksum, checklist_uuid):
+    # The frontend does not need the jenkins library, so we import it from
+    # within the worker task.
+    from jenkins import Jenkins
+    server = Jenkins(**config.JENKINS)
+    logurl = "http://{}/imagescanner/result/{}".format(DOMAIN, checksum)
+    server.build_job(name, {
+        "checklist_uuid": checklist_uuid,
+        "status": status,
+        "logurl": logurl,
+        })