summary | refs | log | tree | commit | diff | stats
path: root/imagescanner/imagescanner/tasks.py
diff options
context:
space:
mode:
Diffstat (limited to 'imagescanner/imagescanner/tasks.py')
-rw-r--r-- imagescanner/imagescanner/tasks.py 156
1 files changed, 122 insertions, 34 deletions
diff --git a/imagescanner/imagescanner/tasks.py b/imagescanner/imagescanner/tasks.py
index 3610373..61abf15 100644
--- a/imagescanner/imagescanner/tasks.py
+++ b/imagescanner/imagescanner/tasks.py
@@ -42,19 +42,23 @@ import re
import hashlib
import datetime
from subprocess import run
+from xml.etree import ElementTree
from celery import Celery
import requests
-from . import STATUSFILE, LOGS_PATH
+from . import config
from .in_temp_dir import in_temp_dir
-
+from .regexdispatch import regexdispatch
celery_app = Celery(
broker='redis://redis',
backend='redis://redis',
)
-repo_re = re.compile(r'.*\.git$')
-direct_re = re.compile(r'http.*\.(?:img|iso|qcow2)(?:\.gz)?$')
-image_re = re.compile(r'.*\.(?:img|iso|qcow2)(?:\.gz)?$')
+
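+# The pattern registered for _ri_direct below matches URLs pointing directly
+# to an image to download, over http and https connections, and captures the
+# hostname and filename in named groups. This includes URLs to S3 and RadosGW
+# endpoints.
+#
+# image_re matches any name ending in a supported disk image extension,
+# optionally followed by .gz.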
+image_re = re.compile(r'.*\.(?:img|iso|qcow2?)(?:\.gz)?$')
SLACK_TOKEN = os.getenv('SLACK_TOKEN')
DOMAIN = os.getenv('DOMAIN')
@@ -70,7 +74,8 @@ def sha256(path):
@celery_app.task(queue='scans', ignore_result=True)
@in_temp_dir()
-def request_scan(source, path, recipients):
+def request_scan(source, path, recipients=None, jenkins_job_name=None,
+ checklist_uuid=None):
"""Retrieve and scan all partitions of (an) image(s), and notify of the
results.
@@ -88,6 +93,13 @@ def request_scan(source, path, recipients):
complete. Currently, this may include Slack usernames and Slack
channels.
+ jenkins_job_name:
+ The name of the jenkins job that should be built to process the scan
+ results.
+
+ checklist_uuid:
+ The UUID of the checklist that should be passed to the jenkins job.
+
This function assumes the current working directory is a safe workarea for
retrieving and manipulating images, but is decorated with in_temp_dir which
changes to a new temporary directory upon invocation.
@@ -96,7 +108,7 @@ def request_scan(source, path, recipients):
# TODO printing to a status file is archaic and messy; let's use the python
# logging framework or store status in redis instead.
- with STATUSFILE.open('w') as statusfile:
+ with config.STATUSFILE.open('w') as statusfile:
print(
"Processing request {source} {path} in {workspace}".format(
@@ -105,7 +117,8 @@ def request_scan(source, path, recipients):
flush=True)
for image in retrieve_images(source, path):
- print("- Image file: {}...".format(image),
+ print(
+ "- Image file: {}...".format(image),
file=statusfile, flush=True)
if not os.path.exists(image):
raise ValueError("Path not found: {}".format(image))
@@ -113,12 +126,12 @@ def request_scan(source, path, recipients):
print("-- Checksumming...", file=statusfile, flush=True)
checksum = sha256(image)
- print("-- Scanning...",
- file=statusfile, flush=True)
- logfile = LOGS_PATH / 'SecurityValidation-{}.txt'.format(checksum)
+ print("-- Scanning...", file=statusfile, flush=True)
+ logfile = config.LOGS_PATH / 'SecurityValidation-{}.txt'.format(
+ checksum)
- #for partition in image_partitions():
- # result = scan_partition(partition)
+ # for partition in image_partitions():
+ # result = scan_partition(partition)
with open(logfile, 'w') as fd:
print(datetime.datetime.utcnow().ctime(), "UTC", file=fd)
print("Launching image scan for {} from {} {}".format(
@@ -130,25 +143,55 @@ def request_scan(source, path, recipients):
stderr=fd,
)
- print("-- Scheduling notification (exit code:{})...".format(result.returncode),
- file=statusfile, flush=True)
+ if recipients:
+ print(
+ "-- Scheduling notification (exit code: {})..."
+ .format(result.returncode), file=statusfile, flush=True)
+
+ slack_notify.delay(
+ status="Success" if result.returncode == 0 else "Failure",
+ source=source,
+ filename=image,
+ checksum=checksum,
+ recipients=recipients,
+ )
- slack_notify.delay(
- status="Success" if result.returncode == 0 else "Failure",
- source=source,
- filename=image,
- checksum=checksum,
- recipients=recipients,
- )
+ elif checklist_uuid and jenkins_job_name:
+ print(
+ "-- Triggering Jenkins job {} for checklist {}"
+ .format(jenkins_job_name, checklist_uuid), file=statusfile,
+ flush=True)
+
+ jenkins_notify.delay(
+ jenkins_job_name,
+ status=result.returncode,
+ checksum=checksum,
+ checklist_uuid=checklist_uuid,
+ )
+
+ else:
+ print(
+ "-- Skipping notification (exit code was: {})."
+ .format(result.returncode), file=statusfile, flush=True)
print("-- Done.", file=statusfile, flush=True)
print("- All images processed.", file=statusfile, flush=True)
+
+@regexdispatch
def retrieve_images(source, path):
"""Generate the filenames of one or multiple disk images as they are
retrieved from _source_.
+ _source_ may be any of several kinds of location, so we dispatch to an
+ appropriate function to deal with it:
+
+ - a git url to a repo containing disk images
+ - a normal http(s) url directly to a single disk image
+ - an http(s) url directly to a single disk image in a radosgw (s3) bucket
+ - an http(s) url to a radosgw (s3) bucket containing disk images
+
See the docstring for request_scan for documentation of the source and path
arguments.
@@ -156,15 +199,11 @@ def retrieve_images(source, path):
retrieving and manipulating images.
"""
- if repo_re.match(source):
- return retrieve_images_git(source, path)
- elif direct_re.match(source):
- return retrieve_image_direct(source)
- else:
- raise ValueError("Unknown source format {}".format(source))
+ raise ValueError("Unknown source type %s" % source)
-def retrieve_images_git(source, path):
+@retrieve_images.register(r'.*\.git$')
+def _ri_git(source, path, **kwargs):
run(['/usr/bin/git', 'clone',
'--depth', '1',
'--single-branch',
@@ -188,22 +227,57 @@ def retrieve_images_git(source, path):
yield os.path.join(root, name)
-def retrieve_image_direct(source):
- filename = re.search(r'[^/]*$', source).group(0)
+# FIXME this regex won't properly detect URLs with query-strings.
+@retrieve_images.register(r'''(?x) # this is a "verbose" regex
+ https?:// # match an http or https url
+ (?P<hostname> # capture the hostname:
+ [^/:]* # anything up to the first / or :
+ )
+ .* # any number of path components
+ /(?P<filename> # capture the filename after the last /
+ [^/]* # anything not a /
+ \.(?:img|iso|qcow2?) # with one of these three extensions
+ (?:\.gz)? # optionally also compressed
+ )$''')
+def _ri_direct(source, path=None, hostname=None, filename=None, **kwargs):
+ auth = config.AUTHS.get(hostname)
with open(filename, 'wb') as fd:
- r = requests.get(source, stream=True)
+ r = requests.get(source, stream=True, auth=auth)
for chunk in r.iter_content(chunk_size=4096):
fd.write(chunk)
yield filename
-# FIXME the slack notification should go into a different queue than the image
-# requests so they don't get blocked by the scans.
+@retrieve_images.register(r'''(?x) # this is a "verbose" regex
+ https?:// # match an http or https url
+ (?P<hostname> # capture the hostname:
+ [^/:]* # anything up to the first / or :
+ )
+ .* # any number of path components
+ /$ # ending with a slash
+ ''')
+def _ri_bucket(source, path=None, hostname=None, filename=None, **kwargs):
+ """We assume that an HTTP(s) URL ending in / is a radosgw bucket."""
+ auth = config.AUTHS.get(hostname)
+ # We could request ?format=json but the output is malformed; all but one
+ # filename is truncated.
+ response = requests.get(source, {'format': 'xml'}, auth=auth)
+ keys = ElementTree.fromstring(response.text).iter(
+ '{http://s3.amazonaws.com/doc/2006-03-01/}Key')
+ filenames = [x.text for x in keys]
+ for filename in filenames:
+ if image_re.match(filename):
+ yield from retrieve_images(source + filename)
+
+
@celery_app.task(ignore_result=True)
def slack_notify(status, source, filename, checksum, recipients):
if not SLACK_TOKEN:
print("No Slack token defined; skipping notification.")
return
+ if not recipients:
+ print("No recipients specified; skipping notification.")
+ return
# TODO replace this handrolled code with a nice slack client library
@@ -234,3 +308,17 @@ def slack_notify(status, source, filename, checksum, recipients):
"https://hooks.slack.com/services/%s" % SLACK_TOKEN,
json=dict(payload, channel=recipient),
)
+
+
+@celery_app.task(ignore_result=True)
+def jenkins_notify(name, status, checksum, checklist_uuid):
+ # The frontend does not need the jenkins library, so we import it from
+ # within the worker task rather than at module level.
+ from jenkins import Jenkins
+ server = Jenkins(**config.JENKINS)
+ logurl = "http://{}/imagescanner/result/{}".format(DOMAIN, checksum)
+ server.build_job(name, {
+ "checklist_uuid": checklist_uuid,
+ "status": status,
+ "logurl": logurl,
+ })