summaryrefslogtreecommitdiffstats
path: root/vnfs/DAaaS
diff options
context:
space:
mode:
authorMarco Platania <platania@research.att.com>2019-04-11 17:55:23 +0000
committerGerrit Code Review <gerrit@onap.org>2019-04-11 17:55:23 +0000
commit4d91730a54e81986d6fc9df4b55de04cfe0eae8b (patch)
tree018a822bd5cebd83bb56e26efabd7049aa403383 /vnfs/DAaaS
parent858a95e48507cde00977408ee6cf8feddc13e063 (diff)
parent9d98267fe7046c304a71f98e89de6774d09f094e (diff)
Merge "Add helm charts for horovod based app"
Diffstat (limited to 'vnfs/DAaaS')
-rw-r--r--vnfs/DAaaS/applications/charts/sample-horovod-app/Charts.yml5
-rw-r--r--vnfs/DAaaS/applications/charts/sample-horovod-app/Dockerfile0
-rw-r--r--vnfs/DAaaS/applications/charts/sample-horovod-app/README.md162
-rw-r--r--vnfs/DAaaS/applications/charts/sample-horovod-app/keras_mnist_advanced_modified.py127
-rw-r--r--vnfs/DAaaS/applications/charts/sample-horovod-app/sample_values.yaml44
-rw-r--r--vnfs/DAaaS/applications/charts/sample-horovod-app/templates/NOTES.txt5
-rw-r--r--vnfs/DAaaS/applications/charts/sample-horovod-app/templates/_helpers.tpl32
-rw-r--r--vnfs/DAaaS/applications/charts/sample-horovod-app/templates/config.yaml130
-rw-r--r--vnfs/DAaaS/applications/charts/sample-horovod-app/templates/job-service.yaml19
-rw-r--r--vnfs/DAaaS/applications/charts/sample-horovod-app/templates/job.yaml126
-rw-r--r--vnfs/DAaaS/applications/charts/sample-horovod-app/templates/secrets.yaml15
-rw-r--r--vnfs/DAaaS/applications/charts/sample-horovod-app/templates/statefulset-service.yaml19
-rw-r--r--vnfs/DAaaS/applications/charts/sample-horovod-app/templates/statefulset.yaml115
13 files changed, 799 insertions, 0 deletions
diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/Charts.yml b/vnfs/DAaaS/applications/charts/sample-horovod-app/Charts.yml
new file mode 100644
index 00000000..50b52b98
--- /dev/null
+++ b/vnfs/DAaaS/applications/charts/sample-horovod-app/Charts.yml
@@ -0,0 +1,5 @@
+apiVersion: v1
+appVersion: "1.0"
+description: "A sample horovod application which runs the MNIST application using Tensorflow as backend"
+name: sample-horovod-app
+version: 0.1.0
diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/Dockerfile b/vnfs/DAaaS/applications/charts/sample-horovod-app/Dockerfile
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/vnfs/DAaaS/applications/charts/sample-horovod-app/Dockerfile
diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/README.md b/vnfs/DAaaS/applications/charts/sample-horovod-app/README.md
new file mode 100644
index 00000000..08e7691f
--- /dev/null
+++ b/vnfs/DAaaS/applications/charts/sample-horovod-app/README.md
@@ -0,0 +1,162 @@
+# Horovod
+
+[Horovod](https://eng.uber.com/horovod/) is a distributed training framework for TensorFlow, and it's provided by UBER. The goal of Horovod is to make distributed Deep Learning fast and easy to use. And it provides [Horovod in Docker](https://github.com/uber/horovod/blob/master/docs/docker.md) to streamline the installation process.
+
+## Introduction
+
+This chart bootstraps Horovod, a distributed TensorFlow training framework, on a Kubernetes cluster using the Helm package manager. It deploys the Horovod workers as a StatefulSet and the Horovod master as a Job, and then discovers the host list automatically.
+
+## Prerequisites
+
+- Kubernetes cluster v1.8+
+
+## Build Docker Image
+
+You can use the Dockerfile provided along with this package. The benefit of this Dockerfile is that it contains many additional packages that data science engineers usually require, such as spark, tensorflow, pytorch, matplotlib, nltk,
+keras, h5py, pyarrow.
+
+Before building the docker image, first build and make a Spark distribution following the instructions in http://spark.apache.org/docs/latest/building-spark.html
+If this docker file is being used in the context of building your images from a Spark distribution, the docker build command should be invoked from the top level directory of the Spark distribution. E.g.:
+
+```
+docker build -t spark:latest -f kubernetes/dockerfiles/spark/ubuntu18.04/Dockerfile .
+```
+
+Once you build the spark image, go inside the spark package and place the file "keras_mnist_advanced_modified.py" in the directory: examples/src/main/python/tensorflow/. Create the 'tensorflow' directory if it doesn't exist.
+We do this because the file keras_mnist_advanced_modified.py is optimized for running on CPU and we want it to be automatically present in the final docker image that we build.
+
+```
+docker build -t spark-tf-keras-horovod-pytorch:latest -f kubernetes/dockerfiles/spark/ubuntu18.04/Dockerfile .
+```
+
+## Prepare ssh keys
+
+```
+# Setup ssh key
+export SSH_KEY_DIR=`mktemp -d`
+cd $SSH_KEY_DIR
+yes | ssh-keygen -N "" -f id_rsa
+```
+
+## Create the values.yaml
+
+To run Horovod with GPU, you can create `values.yaml` like below
+
+```
+# cat << EOF > ~/values.yaml
+---
+ssh:
+ useSecrets: true
+ hostKey: |-
+$(cat $SSH_KEY_DIR/id_rsa | sed 's/^/ /g')
+
+ hostKeyPub: |-
+$(cat $SSH_KEY_DIR/id_rsa.pub | sed 's/^/ /g')
+
+worker:
+ number: 2
+ image:
+ repository: uber/horovod
+ tag: 0.12.1-tf1.8.0-py3.5
+master:
+ image:
+ repository: uber/horovod
+ tag: 0.12.1-tf1.8.0-py3.5
+ args:
+ - "mpirun -np 3 --hostfile /horovod/generated/hostfile --mca orte_keep_fqdn_hostnames t --allow-run-as-root --display-map --tag-output --timestamp-output sh -c '/opt/conda/envs/tf_env/bin/python /opt/spark/examples/src/main/python/tensorflow/keras_mnist_advanced_modified.py'"
+EOF
+```
+
+For most cases, the overlay network impacts the Horovod performance greatly, so we should apply `Host Network` solution. To run Horovod with Host Network and GPU, you can create `values.yaml` like below
+
+
+```
+# cat << EOF > ~/values.yaml
+---
+useHostNetwork: true
+
+ssh:
+ useSecrets: true
+ port: 32222
+ hostKey: |-
+$(cat $SSH_KEY_DIR/id_rsa | sed 's/^/ /g')
+
+ hostKeyPub: |-
+$(cat $SSH_KEY_DIR/id_rsa.pub | sed 's/^/ /g')
+
+
+worker:
+ number: 2
+ image:
+ repository: uber/horovod
+ tag: 0.12.1-tf1.8.0-py3.5
+master:
+ image:
+ repository: uber/horovod
+ tag: 0.12.1-tf1.8.0-py3.5
+ args:
+ - "mpirun -np 3 --hostfile /horovod/generated/hostfile --mca orte_keep_fqdn_hostnames t --allow-run-as-root --display-map --tag-output --timestamp-output sh -c '/opt/conda/envs/tf_env/bin/python /opt/spark/examples/src/main/python/tensorflow/keras_mnist_advanced_modified.py'"
+EOF
+```
+
+```
+NOTE: A sample values file (sample_values.yaml) is provided for reference. After applying the above changes, you should have a values.yaml similar to it.
+```
+
+> notice: the difference is that you should set `useHostNetwork` as true, then set another ssh port rather than `22`
+
+## Installing the Chart
+
+To install the chart with the release name `mnist`:
+
+```bash
+$ helm install --values ~/values.yaml --name mnist stable/horovod
+```
+
+## Uninstalling the Chart
+
+To uninstall/delete the `mnist` deployment:
+
+```bash
+$ helm delete mnist
+```
+
+The command removes all the Kubernetes components associated with the chart and
+deletes the release.
+
+## Upgrading an existing Release to a new major version
+A major chart version change (like v1.2.3 -> v2.0.0) indicates that there is an
+incompatible breaking change needing manual actions.
+
+### 1.0.0
+This version removes the `chart` label from the `spec.selector.matchLabels`
+which is immutable since `StatefulSet apps/v1beta2`. It has been inadvertently
+added, causing any subsequent upgrade to fail. See https://github.com/helm/charts/issues/7726.
+
+In order to upgrade, delete the Horovod StatefulSet before upgrading, supposing your Release is named `my-release`:
+
+```bash
+$ kubectl delete statefulsets.apps --cascade=false my-release
+```
+
+## Configuration
+
+The following table lists the configurable parameters of the Horovod
+chart and their default values.
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `useHostNetwork` | Host network | `false` |
+| `ssh.port` | The ssh port | `22` |
+| `ssh.useSecrets` | Determine if using the secrets for ssh | `false` |
+| `worker.number`| The worker's number | `5` |
+| `worker.image.repository` | horovod worker image | `uber/horovod` |
+| `worker.image.pullPolicy` | `pullPolicy` for the worker | `IfNotPresent` |
+| `worker.image.tag` | `tag` for the worker | `0.12.1-tf1.8.0-py3.5` |
+| `resources`| pod resource requests & limits| `{}`|
+| `worker.env` | worker's environment variables | `{}` |
+| `master.image.repository` | horovod master image | `uber/horovod` |
+| `master.image.tag` | `tag` for the master | `0.12.1-tf1.8.0-py3.5` |
+| `master.image.pullPolicy` | image pullPolicy for the master image| `IfNotPresent` |
+| `master.args` | master's args | `{}` |
+| `master.env` | master's environment variables | `{}` |
diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/keras_mnist_advanced_modified.py b/vnfs/DAaaS/applications/charts/sample-horovod-app/keras_mnist_advanced_modified.py
new file mode 100644
index 00000000..03425ff7
--- /dev/null
+++ b/vnfs/DAaaS/applications/charts/sample-horovod-app/keras_mnist_advanced_modified.py
@@ -0,0 +1,127 @@
+from __future__ import print_function
+import keras
+from keras.datasets import mnist
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Flatten
+from keras.layers import Conv2D, MaxPooling2D
+from keras.preprocessing.image import ImageDataGenerator
+from keras import backend as K
+import tensorflow as tf
+import horovod.keras as hvd
+
+# Horovod: initialize Horovod.
+hvd.init()
+
+# Horovod: GPU pinning (one GPU per process, selected by local rank) is disabled so the script runs on CPU; uncomment the two gpu_options lines below to enable it.
+config = tf.ConfigProto()
+#config.gpu_options.allow_growth = True
+#config.gpu_options.visible_device_list = str(hvd.local_rank())
+K.set_session(tf.Session(config=config))
+
+batch_size = 128
+num_classes = 10
+
+# Enough epochs to demonstrate learning rate warmup and the reduction of
+# learning rate when training plateaus.
+epochs = 24
+
+# Input image dimensions
+img_rows, img_cols = 28, 28
+
+# The data, shuffled and split between train and test sets
+(x_train, y_train), (x_test, y_test) = mnist.load_data()
+
+# Determine how many batches are there in train and test sets
+train_batches = len(x_train) // batch_size
+test_batches = len(x_test) // batch_size
+
+if K.image_data_format() == 'channels_first':
+ x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
+ x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
+ input_shape = (1, img_rows, img_cols)
+else:
+ x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
+ x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
+ input_shape = (img_rows, img_cols, 1)
+
+x_train = x_train.astype('float32')
+x_test = x_test.astype('float32')
+x_train /= 255
+x_test /= 255
+print('x_train shape:', x_train.shape)
+print(x_train.shape[0], 'train samples')
+print(x_test.shape[0], 'test samples')
+
+# Convert class vectors to binary class matrices
+y_train = keras.utils.to_categorical(y_train, num_classes)
+y_test = keras.utils.to_categorical(y_test, num_classes)
+
+model = Sequential()
+model.add(Conv2D(32, kernel_size=(3, 3),
+ activation='relu',
+ input_shape=input_shape))
+model.add(Conv2D(64, (3, 3), activation='relu'))
+model.add(MaxPooling2D(pool_size=(2, 2)))
+model.add(Dropout(0.25))
+model.add(Flatten())
+model.add(Dense(128, activation='relu'))
+model.add(Dropout(0.5))
+model.add(Dense(num_classes, activation='softmax'))
+
+# Horovod: adjust learning rate based on number of GPUs.
+opt = keras.optimizers.Adadelta(lr=1.0 * hvd.size())
+
+# Horovod: add Horovod Distributed Optimizer.
+opt = hvd.DistributedOptimizer(opt)
+
+model.compile(loss=keras.losses.categorical_crossentropy,
+ optimizer=opt,
+ metrics=['accuracy'])
+
+callbacks = [
+ # Horovod: broadcast initial variable states from rank 0 to all other processes.
+ # This is necessary to ensure consistent initialization of all workers when
+ # training is started with random weights or restored from a checkpoint.
+ hvd.callbacks.BroadcastGlobalVariablesCallback(0),
+
+ # Horovod: average metrics among workers at the end of every epoch.
+ #
+ # Note: This callback must be in the list before the ReduceLROnPlateau,
+ # TensorBoard or other metrics-based callbacks.
+ hvd.callbacks.MetricAverageCallback(),
+
+ # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
+ # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
+ # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
+ hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1),
+
+ # Reduce the learning rate if training plateaus.
+ keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1),
+]
+
+# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
+if hvd.rank() == 0:
+ callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))
+
+# Set up ImageDataGenerators to do data augmentation for the training images.
+train_gen = ImageDataGenerator(rotation_range=8, width_shift_range=0.08, shear_range=0.3,
+ height_shift_range=0.08, zoom_range=0.08)
+test_gen = ImageDataGenerator()
+
+# Train the model.
+# Horovod: the training will randomly sample 1 / N batches of training data and
+# 3 / N batches of validation data on every worker, where N is the number of workers.
+# Over-sampling of validation data helps to increase probability that every validation
+# example will be evaluated.
+model.fit_generator(train_gen.flow(x_train, y_train, batch_size=batch_size),
+ steps_per_epoch=train_batches // hvd.size(),
+ callbacks=callbacks,
+ epochs=epochs,
+ verbose=1,
+ validation_data=test_gen.flow(x_test, y_test, batch_size=batch_size),
+ validation_steps=3 * test_batches // hvd.size())
+
+# Evaluate the model on the full data set.
+score = model.evaluate(x_test, y_test, verbose=0)
+print('Test loss:', score[0])
+print('Test accuracy:', score[1])
diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/sample_values.yaml b/vnfs/DAaaS/applications/charts/sample-horovod-app/sample_values.yaml
new file mode 100644
index 00000000..6ac31359
--- /dev/null
+++ b/vnfs/DAaaS/applications/charts/sample-horovod-app/sample_values.yaml
@@ -0,0 +1,44 @@
+---
+#useHostNetwork: true
+
+ssh:
+ useSecrets: true
+ hostKey: |-
+ -----BEGIN RSA PRIVATE KEY-----
+ ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey
+ ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey
+ ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey
+ ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey
+ ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey
+ ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey
+ ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey
+ ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey
+ ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey
+ ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey
+ ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey
+ ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey
+ ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey
+ ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey
+ -----END RSA PRIVATE KEY-----
+
+ hostKeyPub: |-
+ ssh-rsa ThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKey
+ ThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKey
+ ThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKey
+ ThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKey user@openSource
+
+resources: {}
+
+worker:
+ number: 2
+ image:
+ repository: spark-tf-keras-horovod-pytorch
+ tag: latest
+ pullPolicy: Never
+master:
+ image:
+ repository: spark-tf-keras-horovod-pytorch
+ tag: latest
+ pullPolicy: Never
+ args:
+ - "mpirun -np 3 --hostfile /horovod/generated/hostfile --mca orte_keep_fqdn_hostnames t --allow-run-as-root --display-map --tag-output --timestamp-output sh -c '/opt/conda/envs/tf_env/bin/python /opt/spark/examples/src/main/python/tensorflow/keras_mnist_advanced_modified.py'"
diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/NOTES.txt b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/NOTES.txt
new file mode 100644
index 00000000..774555ae
--- /dev/null
+++ b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/NOTES.txt
@@ -0,0 +1,5 @@
+1. Get the application URL by running these commands:
+
+*** NOTE: It may take a few minutes for the statefulset to be available
+
+*** you can watch the status of statefulset by running 'kubectl get sts --namespace {{ .Release.Namespace }} -w {{ template "horovod.fullname" . }}' *** \ No newline at end of file
diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/_helpers.tpl b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/_helpers.tpl
new file mode 100644
index 00000000..02071c0f
--- /dev/null
+++ b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/_helpers.tpl
@@ -0,0 +1,32 @@
+{{/* vim: set filetype=mustache: */}}
+{{/*
+Expand the name of the chart.
+*/}}
+{{- define "horovod.name" -}}
+{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{/*
+Create a default fully qualified app name.
+We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
+If release name contains chart name it will be used as a full name.
+*/}}
+{{- define "horovod.fullname" -}}
+{{- if .Values.fullnameOverride -}}
+{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- $name := default .Chart.Name .Values.nameOverride -}}
+{{- if contains $name .Release.Name -}}
+{{- .Release.Name | trunc 63 | trimSuffix "-" -}}
+{{- else -}}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+{{- end -}}
+{{- end -}}
+
+{{/*
+Create chart name and version as used by the chart label.
+*/}}
+{{- define "horovod.chart" -}}
+{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/config.yaml b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/config.yaml
new file mode 100644
index 00000000..ae93c445
--- /dev/null
+++ b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/config.yaml
@@ -0,0 +1,130 @@
+{{- $workerNum := .Values.worker.number -}}
+{{- $name := include "horovod.fullname" . }}
+{{- /*
+Slots per host in the generated hostfile: the "nvidia.com/gpu" entry of
+.Values.resources when it is set, otherwise 1. This is computed in a single
+declaration because a ":=" inside an "if" block only creates a new variable
+scoped to that block and would leave the outer $slots at 1.
+*/}}
+{{- $slots := default 1 (index .Values.resources "nvidia.com/gpu") }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: {{ template "horovod.fullname" . }}
+ labels:
+ heritage: {{ .Release.Service | quote }}
+ release: {{ .Release.Name | quote }}
+ chart: {{ template "horovod.chart" . }}
+ app: {{ template "horovod.fullname" . }}
+data:
+ hostfile.config: |
+ {{ $name }}-master slots={{ $slots }}
+ {{- range $i, $none := until (int $workerNum) }}
+ {{ $name }}-{{ $i }}.{{ $name }} slots={{ $slots }}
+ {{- end }}
+ ssh.readiness: |
+ #!/bin/bash
+ set -xev
+ ssh localhost ls
+ master.run: |
+ #!/bin/bash
+ set -x
+ sleep 5
+
+ mkdir -p /root/.ssh
+ rm -f /root/.ssh/config
+ touch /root/.ssh/config
+
+ if [ "$USESECRETS" == "true" ];then
+ set +e
+ yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa
+ yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys
+ set -e
+ fi
+
+ if [ -n "$SSHPORT" ]; then
+ echo "Port $SSHPORT" > /root/.ssh/config
+ sed -i "s/^Port.*/Port $SSHPORT /g" /etc/ssh/sshd_config
+ fi
+ echo "StrictHostKeyChecking no" >> /root/.ssh/config
+ /usr/sbin/sshd
+
+ if [ $# -eq 0 ]; then
+ sleep infinity
+ else
+ bash -c "$*"
+ fi
+ sleep 300
+ master.waitWorkerReady: |
+ #!/bin/bash
+ set -xev
+ # Recreate /root/.ssh/config for the outgoing worker checks. Strict host key
+ # checking is always disabled (as master.run and worker.run do); the custom
+ # port is added only when SSHPORT is set.
+ function updateSSHPort() {
+ mkdir -p /root/.ssh
+ rm -f /root/.ssh/config
+ touch /root/.ssh/config
+
+ if [ -n "$SSHPORT" ]; then
+ echo "Port $SSHPORT" > /root/.ssh/config
+ fi
+ echo "StrictHostKeyChecking no" >> /root/.ssh/config
+ }
+
+ function runCheckSSH() {
+ if [[ "$USESECRETS" == "true" ]];then
+ set +e
+ yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa
+ yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys
+ set -e
+ fi
+
+ for i in `cat $1 | awk '{print $(1)}'`;do
+ if [[ "$i" != *"master" ]];then
+ retry 30 ssh -o ConnectTimeout=2 -q $i exit
+ fi
+ done
+ }
+
+ # Usage: retry RETRY_NUMBER COMMAND...
+ # Runs COMMAND until it succeeds or RETRY_NUMBER attempts have failed,
+ # sleeping 1 second after each failure. It then runs COMMAND one final time
+ # and exits the script with status 1 if that final run fails.
+ function retry()
+ {
+ local n=0;local try=$1
+ local cmd="${@: 2}"
+ # NOTE(review): the usage message is printed but execution continues.
+ [[ $# -le 1 ]] && {
+ echo "Usage $0 <retry_number> <Command>";
+ }
+ # Turn off errexit so a failing attempt does not abort the script.
+ set +e
+ until [[ $n -ge $try ]]
+ do
+ $cmd && break || {
+ echo "Command Fail.."
+ ((n++))
+ echo "retry $n :: [$cmd]"
+ sleep 1;
+ }
+ done
+ # The command is executed once more here, including after a successful
+ # attempt broke out of the loop above.
+ $cmd
+ if [ $? -ne 0 ]; then
+ exit 1
+ fi
+ set -e
+ }
+ updateSSHPort
+ runCheckSSH $1
+ worker.run: |
+ #!/bin/bash
+ set -x
+
+ mkdir -p /root/.ssh
+ rm -f /root/.ssh/config
+ touch /root/.ssh/config
+
+ if [[ "$USESECRETS" == "true" ]];then
+ set +e
+ yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa
+ yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys
+ set -e
+ fi
+
+ if [ -n "$SSHPORT" ]; then
+ echo "Port $SSHPORT" > /root/.ssh/config
+ sed -i "s/^Port.*/Port $SSHPORT /g" /etc/ssh/sshd_config
+ fi
+ echo "StrictHostKeyChecking no" >> /root/.ssh/config
+
+ /usr/sbin/sshd -D
diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/job-service.yaml b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/job-service.yaml
new file mode 100644
index 00000000..e7b05c26
--- /dev/null
+++ b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/job-service.yaml
@@ -0,0 +1,19 @@
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{ template "horovod.fullname" . }}-master
+ labels:
+ app: {{ template "horovod.name" . }}
+ chart: {{ template "horovod.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ clusterIP: None
+ ports:
+ - name: ssh
+ port: {{ .Values.ssh.port }}
+ targetPort: {{ .Values.ssh.port }}
+ selector:
+ app: {{ template "horovod.name" . }}
+ release: {{ .Release.Name }}
+ role: master
diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/job.yaml b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/job.yaml
new file mode 100644
index 00000000..4e59b277
--- /dev/null
+++ b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/job.yaml
@@ -0,0 +1,126 @@
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: {{ template "horovod.fullname" . }}
+ labels:
+ app: {{ template "horovod.name" . }}
+ chart: {{ template "horovod.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+ role: master
+spec:
+ template:
+ metadata:
+ labels:
+ app: {{ template "horovod.name" . }}
+ release: {{ .Release.Name }}
+ role: master
+ spec:
+ {{- if .Values.useHostNetwork }}
+ hostNetwork: {{ .Values.useHostNetwork }}
+ dnsPolicy: ClusterFirstWithHostNet
+ {{- end }}
+ {{- if .Values.useHostPID }}
+ hostPID: {{ .Values.useHostPID }}
+ {{- end }}
+ restartPolicy: OnFailure
+ volumes:
+ - name: {{ template "horovod.fullname" . }}-cm
+ configMap:
+ name: {{ template "horovod.fullname" . }}
+ items:
+ - key: hostfile.config
+ path: hostfile
+ mode: 438
+ - key: master.waitWorkerReady
+ path: waitWorkersReady.sh
+ mode: 365
+ - key: master.run
+ path: run.sh
+ mode: 365
+ {{- if .Values.ssh.useSecrets }}
+ - name: {{ template "horovod.fullname" . }}-secret
+ secret:
+ secretName: {{ template "horovod.fullname" . }}
+ defaultMode: 448
+ items:
+ - key: host-key
+ path: id_rsa
+ - key: host-key-pub
+ path: authorized_keys
+ {{- end }}
+{{- if .Values.volumes }}
+{{ toYaml .Values.volumes | indent 6 }}
+{{- end }}
+ containers:
+ - name: horovod-master
+ image: "{{ .Values.master.image.repository }}:{{ .Values.master.image.tag }}"
+ imagePullPolicy: {{ .Values.master.image.pullPolicy }}
+ env:
+ - name: SSHPORT
+ value: "{{ .Values.ssh.port }}"
+ {{- if .Values.ssh.useSecrets }}
+ - name: USESECRETS
+ value: "{{ .Values.ssh.useSecrets }}"
+ {{- end }}
+ {{- if .Values.master.env }}
+ {{- range $key, $value := .Values.master.env }}
+ - name: "{{ $key }}"
+ value: "{{ $value }}"
+ {{- end }}
+ {{- end }}
+{{- if .Values.master.privileged }}
+ securityContext:
+ privileged: true
+{{- end }}
+ ports:
+ - containerPort: {{ .Values.ssh.port }}
+ volumeMounts:
+ - name: {{ template "horovod.fullname" . }}-cm
+ mountPath: /horovod/generated
+ {{- if .Values.ssh.useSecrets }}
+ - name: {{ template "horovod.fullname" . }}-secret
+ readOnly: true
+ mountPath: "/etc/secret-volume"
+ {{- end }}
+{{- if .Values.volumeMounts }}
+{{ toYaml .Values.volumeMounts | indent 8 }}
+{{- end }}
+ command:
+ - /horovod/generated/run.sh
+ args:
+{{ toYaml .Values.master.args | indent 10 }}
+ resources:
+{{ toYaml .Values.resources | indent 10 }}
+{{- if .Values.ssh.useSecrets }}
+ initContainers:
+ - name: wait-workers
+ image: "{{ .Values.master.image.repository }}:{{ .Values.master.image.tag }}"
+ imagePullPolicy: {{ .Values.master.image.pullPolicy }}
+ env:
+ - name: SSHPORT
+ value: "{{ .Values.ssh.port }}"
+ {{- if .Values.ssh.useSecrets }}
+ - name: USESECRETS
+ value: "{{ .Values.ssh.useSecrets }}"
+ {{- end }}
+ {{- if .Values.master.env }}
+ {{- range $key, $value := .Values.master.env }}
+ - name: "{{ $key }}"
+ value: "{{ $value }}"
+ {{- end }}
+ {{- end }}
+ command:
+ - /horovod/generated/waitWorkersReady.sh
+ args:
+ - /horovod/generated/hostfile
+ volumeMounts:
+ - name: {{ template "horovod.fullname" . }}-cm
+ mountPath: /horovod/generated
+ {{- if .Values.ssh.useSecrets }}
+ - name: {{ template "horovod.fullname" . }}-secret
+ readOnly: true
+ mountPath: "/etc/secret-volume"
+ {{- end }}
+{{- end }}
diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/secrets.yaml b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/secrets.yaml
new file mode 100644
index 00000000..c9853ed0
--- /dev/null
+++ b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/secrets.yaml
@@ -0,0 +1,15 @@
+{{- if .Values.ssh.useSecrets }}
+apiVersion: v1
+kind: Secret
+metadata:
+ name: {{ template "horovod.fullname" . }}
+ labels:
+ app: {{ template "horovod.name" . }}
+ chart: {{ template "horovod.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+type: Opaque
+data:
+ host-key: {{ .Values.ssh.hostKey | b64enc | quote }}
+ host-key-pub: {{ .Values.ssh.hostKeyPub | b64enc | quote }}
+{{- end }} \ No newline at end of file
diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/statefulset-service.yaml b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/statefulset-service.yaml
new file mode 100644
index 00000000..d0216a86
--- /dev/null
+++ b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/statefulset-service.yaml
@@ -0,0 +1,19 @@
+apiVersion: v1
+kind: Service
+metadata:
+ name: {{ template "horovod.fullname" . }}
+ labels:
+ app: {{ template "horovod.name" . }}
+ chart: {{ template "horovod.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+spec:
+ clusterIP: None
+ ports:
+ - name: ssh
+ port: {{ .Values.ssh.port }}
+ targetPort: {{ .Values.ssh.port }}
+ selector:
+ app: {{ template "horovod.name" . }}
+ release: {{ .Release.Name }}
+ role: worker
diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/statefulset.yaml b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/statefulset.yaml
new file mode 100644
index 00000000..1d3f7577
--- /dev/null
+++ b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/statefulset.yaml
@@ -0,0 +1,115 @@
+apiVersion: apps/v1beta2
+kind: StatefulSet
+metadata:
+ name: {{ template "horovod.fullname" . }}
+ labels:
+ app: {{ template "horovod.name" . }}
+ chart: {{ template "horovod.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+ role: worker
+spec:
+ selector:
+ matchLabels:
+ app: {{ template "horovod.name" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+ role: worker
+ serviceName: {{ template "horovod.fullname" . }}
+ podManagementPolicy: {{ .Values.worker.podManagementPolicy }}
+ replicas: {{.Values.worker.number}}
+ template:
+ metadata:
+ labels:
+ app: {{ template "horovod.name" . }}
+ chart: {{ template "horovod.chart" . }}
+ release: {{ .Release.Name }}
+ heritage: {{ .Release.Service }}
+ role: worker
+ spec:
+ # Pod-level settings only; the label selector belongs to the StatefulSet
+ # spec and is not a valid pod spec field.
+ {{- if .Values.useHostNetwork }}
+ hostNetwork: {{ .Values.useHostNetwork }}
+ dnsPolicy: ClusterFirstWithHostNet
+ {{- end }}
+ {{- if .Values.useHostPID }}
+ hostPID: {{ .Values.useHostPID }}
+ {{- end }}
+ volumes:
+ - name: {{ template "horovod.fullname" . }}-cm
+ configMap:
+ name: {{ template "horovod.fullname" . }}
+ items:
+ - key: hostfile.config
+ path: hostfile
+ mode: 438
+ - key: ssh.readiness
+ path: check.sh
+ mode: 365
+ - key: worker.run
+ path: run.sh
+ mode: 365
+ {{- if .Values.ssh.useSecrets }}
+ - name: {{ template "horovod.fullname" . }}-secret
+ secret:
+ secretName: {{ template "horovod.fullname" . }}
+ defaultMode: 448
+ items:
+ - key: host-key
+ path: id_rsa
+ - key: host-key-pub
+ path: authorized_keys
+ {{- end }}
+{{- if .Values.volumes }}
+{{ toYaml .Values.volumes | indent 6 }}
+{{- end }}
+ containers:
+ - name: worker
+ image: "{{ .Values.worker.image.repository }}:{{ .Values.worker.image.tag }}"
+ imagePullPolicy: {{ .Values.worker.image.pullPolicy }}
+ env:
+ - name: SSHPORT
+ value: "{{ .Values.ssh.port }}"
+ {{- if .Values.ssh.useSecrets }}
+ - name: USESECRETS
+ value: "{{ .Values.ssh.useSecrets }}"
+ {{- end }}
+ {{- /* Worker pods take their extra environment from worker.env; master.env is for the master Job. */}}
+ {{- if .Values.worker.env }}
+ {{- range $key, $value := .Values.worker.env }}
+ - name: "{{ $key }}"
+ value: "{{ $value }}"
+ {{- end }}
+ {{- end }}
+{{- if .Values.worker.privileged }}
+ securityContext:
+ privileged: true
+{{- end }}
+ ports:
+ - containerPort: {{ .Values.ssh.port }}
+ volumeMounts:
+ - name: {{ template "horovod.fullname" . }}-cm
+ mountPath: /horovod/generated
+ {{- if .Values.ssh.useSecrets }}
+ - name: {{ template "horovod.fullname" . }}-secret
+ readOnly: true
+ mountPath: "/etc/secret-volume"
+ {{- end }}
+{{- if .Values.volumeMounts }}
+{{ toYaml .Values.volumeMounts | indent 8 }}
+{{- end }}
+ command:
+ - /horovod/generated/run.sh
+{{- if .Values.ssh.useSecrets }}
+ readinessProbe:
+ exec:
+ command:
+ - /horovod/generated/check.sh
+ initialDelaySeconds: 1
+ periodSeconds: 2
+{{- end }}
+ resources:
+{{ toYaml .Values.resources | indent 10 }}