diff options
author | Dileep Ranganathan <dileep.ranganathan@intel.com> | 2019-05-30 12:38:37 -0700 |
---|---|---|
committer | Dileep Ranganathan <dileep.ranganathan@intel.com> | 2019-05-30 21:11:52 +0000 |
commit | 3d5a3e06530c1250d48f7d838c619f3bfbcd019d (patch) | |
tree | 349e370c43ce7318b3f7eb7736345de6872cbef2 /vnfs/DAaaS/sample-apps/training/sample-horovod-app | |
parent | 31802660dfe74a8671ae29789f0018f0f887ea1a (diff) |
Refactor Distributed Analytics project structure
Modified the project structure to improve maintainability and to add future CI and
integration test support.
Change-Id: Id30bfb1f83f23785a6b5f99e81f42f752d59c0f8
Issue-ID: ONAPARC-280
Signed-off-by: Dileep Ranganathan <dileep.ranganathan@intel.com>
Diffstat (limited to 'vnfs/DAaaS/sample-apps/training/sample-horovod-app')
14 files changed, 1047 insertions, 0 deletions
diff --git a/vnfs/DAaaS/sample-apps/training/sample-horovod-app/Chart.yaml b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/Chart.yaml new file mode 100644 index 00000000..3ce06e28 --- /dev/null +++ b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +appVersion: "1.0" +description: "A sample horovod application which runs the MNIST application using Tensorflow as backend" +name: sample-horovod-app-keras-mnist-advanced +version: 0.1.0 diff --git a/vnfs/DAaaS/sample-apps/training/sample-horovod-app/Dockerfile b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/Dockerfile new file mode 100644 index 00000000..5b8f5636 --- /dev/null +++ b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/Dockerfile @@ -0,0 +1,143 @@ +# Copyright (c) 2019 Intel Corporation +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+
+# Ported kubernetes spark image to Ubuntu
+#
+# NOTE: this image runs as root on purpose. The chart's run scripts start
+# /usr/sbin/sshd and write to /root/.ssh, and the sample values launch mpirun
+# with --allow-run-as-root.
+
+FROM ubuntu:18.04
+
+# Install jdk and generate the en_US.UTF-8 locale.
+# "update" and "install" share one layer so that a cached package index is
+# never combined with a newer install step; apt-get is used because apt does
+# not offer a stable command-line interface for scripts.
+RUN apt-get update -yqq && \
+    apt-get install -y locales openjdk-8-jdk && \
+    rm -rf /var/lib/apt/lists/* && \
+    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
+
+# Install all the essentials
+RUN apt-get update --fix-missing && \
+    apt-get install -y numactl wget curl bzip2 nmap vim ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 \
+    git mercurial subversion build-essential openssh-server openssh-client net-tools && \
+    mkdir -p /var/run/sshd && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Each variable gets its own ENV instruction because later ones expand
+# earlier ones (JAVA_HOME inside PATH).
+ENV LANG=en_US.utf8
+ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
+ENV PATH=$JAVA_HOME/bin:$PATH
+ENV PATH=/opt/conda/bin:/opt/spark/bin:$PATH
+ENV OPENMPI_VERSION=3.1
+
+# Install openMPI ${OPENMPI_VERSION}.2 from source.
+# The build directory is derived from OPENMPI_VERSION, like the download URL,
+# so the two cannot drift apart when the version is bumped.
+RUN mkdir /tmp/openmpi && \
+    cd /tmp/openmpi && \
+    wget --quiet https://www.open-mpi.org/software/ompi/v${OPENMPI_VERSION}/downloads/openmpi-${OPENMPI_VERSION}.2.tar.gz -O openmpi.tar.gz && \
+    tar zxf openmpi.tar.gz && \
+    cd openmpi-${OPENMPI_VERSION}.2 && \
+    ./configure --enable-orterun-prefix-by-default && \
+    make -j $(nproc) all && \
+    make install && \
+    ldconfig && \
+    rm -rf /tmp/openmpi
+
+# Install miniconda
+# NOTE(review): "Miniconda3-latest" is not pinned, so rebuilds may pick up a
+# different conda/python; consider pinning a specific installer.
+RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \
+    /bin/bash ~/miniconda.sh -b -p /opt/conda && \
+    rm ~/miniconda.sh && \
+    ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
+    echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \
+    echo "conda activate base" >> ~/.bashrc
+
+# Install tf & keras using conda in the virtual_environment:tf_env
+# -y is passed to every conda command so the build never waits on a prompt.
+SHELL ["/bin/bash", "-c"]
+RUN conda update -y -n base -c defaults conda && \
+    conda create -y -n tf_env
+RUN conda install -n tf_env -y -c anaconda \
+    pip tensorflow keras nltk pyarrow
+RUN conda install -n tf_env -y -c anaconda h5py
+
+RUN conda install -n tf_env -y -c pytorch pytorch-cpu
+RUN conda install -n tf_env -y -c conda-forge matplotlib
+
+RUN echo "conda activate tf_env" >> ~/.bashrc && \
+    conda install -n tf_env -y -c conda-forge clangdev
+
+# No "RUN source ~/.bashrc" here: every RUN starts a fresh shell, so sourcing
+# in its own layer has no effect. The tf_env pip is called by absolute path.
+RUN HOROVOD_WITH_TENSORFLOW=1 /opt/conda/envs/tf_env/bin/pip install --no-cache-dir horovod
+
+# openMPI sane defaults:
+RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \
+    echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf && \
+    echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf
+
+# Allow OpenSSH to talk to containers without asking for confirmation
+RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \
+    echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+
+# Install tini
+# The release is pinned through a build argument instead of being scraped from
+# the GitHub "latest" page, which keeps the build reproducible; override with
+# --build-arg TINI_VERSION=<x.y.z>. curl and dpkg are already present (curl is
+# installed with the essentials above), so no apt-get step is needed here.
+ARG TINI_VERSION=0.18.0
+RUN curl -fsSL "https://github.com/krallin/tini/releases/download/v${TINI_VERSION}/tini_${TINI_VERSION}.deb" -o tini.deb && \
+    dpkg -i tini.deb && \
+    rm tini.deb
+
+# This is needed to match the original entrypoint.sh file.
+RUN cp /usr/bin/tini /sbin
+
+# Begin: Installing spark
+ARG spark_jars=jars
+ARG img_path=kubernetes/dockerfiles
+ARG k8s_tests=kubernetes/tests
+
+# Before building the docker image, first build and make a Spark distribution following
+# the instructions in http://spark.apache.org/docs/latest/building-spark.html.
+# If this docker file is being used in the context of building your images from a Spark
+# distribution, the docker build command should be invoked from the top level directory
+# of the Spark distribution. E.g.:
+# docker build -t spark:latest -f kubernetes/dockerfiles/spark/ubuntu18.04/Dockerfile .
+
+# Prepare the Spark directories and make bash the default /bin/sh.
+RUN mkdir -p /opt/spark && \
+    mkdir -p /opt/spark/work-dir && \
+    touch /opt/spark/RELEASE && \
+    rm /bin/sh && \
+    ln -sv /bin/bash /bin/sh && \
+    echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
+    chgrp root /etc/passwd && chmod ug+rw /etc/passwd
+
+
+COPY ${spark_jars} /opt/spark/jars
+COPY bin /opt/spark/bin
+COPY sbin /opt/spark/sbin
+COPY ${img_path}/spark/entrypoint.sh /opt/
+
+COPY ${k8s_tests} /opt/spark/tests
+COPY data /opt/spark/data
+ENV SPARK_HOME=/opt/spark
+
+RUN mkdir /opt/spark/python
+COPY python/pyspark /opt/spark/python/pyspark
+COPY python/lib /opt/spark/python/lib
+# NOTE(review): ENV does not expand the "*" below; the literal pattern is kept
+# exactly as in the original file - confirm that consumers handle it.
+ENV PYTHONPATH=/opt/spark/python/lib/pyspark.zip:/opt/spark/python/lib/py4j-*.zip
+ENV PATH=/opt/conda/envs/tf_env/bin:$PATH
+
+RUN echo "export PATH=/opt/conda/envs/tf_env/bin:$PATH" >> ~/.bashrc
+# echo "activate tf_env\n" >> ~/.bashrc
+# pip resolves to the tf_env pip because of the PATH set above.
+RUN pip install --no-cache-dir petastorm
+COPY examples /opt/spark/examples
+WORKDIR /opt/spark/work-dir
+
+ENTRYPOINT [ "/opt/entrypoint.sh" ]
+
+# End: Installing spark
diff --git a/vnfs/DAaaS/sample-apps/training/sample-horovod-app/README.md b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/README.md new file mode 100644 index 00000000..08e7691f --- /dev/null +++ b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/README.md @@ -0,0 +1,162 @@ +# Horovod + +[Horovod](https://eng.uber.com/horovod/) is a distributed training 
framework for TensorFlow, and it's provided by UBER. The goal of Horovod is to make distributed Deep Learning fast and easy to use. And it provides [Horovod in Docker](https://github.com/uber/horovod/blob/master/docs/docker.md) to streamline the installation process. + +## Introduction + +This chart bootstraps Horovod, which is a Distributed TensorFlow Framework, on a Kubernetes cluster using the Helm Package Manager. It deploys Horovod workers as statefulsets, and the Horovod master as a job, then discovers the host list automatically. + +## Prerequisites + +- Kubernetes cluster v1.8+ + +## Build Docker Image + +You can use the Dockerfile provided along with this package. The benefit of this Dockerfile is that it contains many additional packages that data science engineers usually require, like spark, tensorflow, pytorch, matplotlib, nltk, +keras, h5py, pyarrow. + +Before building the docker image, first build and make a Spark distribution following the instructions in http://spark.apache.org/docs/latest/building-spark.html +If this docker file is being used in the context of building your images from a Spark distribution, the docker build command should be invoked from the top level directory of the Spark distribution. E.g.: + +``` +docker build -t spark:latest -f kubernetes/dockerfiles/spark/ubuntu18.04/Dockerfile . +``` + +Once you build the spark image, go inside the spark package and place the file "keras_mnist_advanced_modified.py" in the directory: examples/src/main/python/tensorflow/. Create the 'tensorflow' directory if it doesn't exist. +We do this because the file keras_mnist_advanced_modified.py is optimized for running on CPU and we want this file to be automatically present in the final docker image that we build. + +``` +docker build -t spark-tf-keras-horovod-pytorch:latest -f kubernetes/dockerfiles/spark/ubuntu18.04/Dockerfile . 
+``` + +## Prepare ssh keys + +``` +# Setup ssh key +export SSH_KEY_DIR=`mktemp -d` +cd $SSH_KEY_DIR +yes | ssh-keygen -N "" -f id_rsa +``` + +## Create the values.yaml + +To run Horovod with GPU, you can create `values.yaml` like below + +``` +# cat << EOF > ~/values.yaml +--- +ssh: + useSecrets: true + hostKey: |- +$(cat $SSH_KEY_DIR/id_rsa | sed 's/^/ /g') + + hostKeyPub: |- +$(cat $SSH_KEY_DIR/id_rsa.pub | sed 's/^/ /g') + +worker: + number: 2 + image: + repository: uber/horovod + tag: 0.12.1-tf1.8.0-py3.5 +master: + image: + repository: uber/horovod + tag: 0.12.1-tf1.8.0-py3.5 + args: + - "mpirun -np 3 --hostfile /horovod/generated/hostfile --mca orte_keep_fqdn_hostnames t --allow-run-as-root --display-map --tag-output --timestamp-output sh -c '/opt/conda/envs/tf_env/bin/python /opt/spark/examples/src/main/python/tensorflow/keras_mnist_advanced_modified.py'" +EOF +``` + +For most cases, the overlay network impacts the Horovod performance greatly, so we should apply `Host Network` solution. To run Horovod with Host Network and GPU, you can create `values.yaml` like below + + +``` +# cat << EOF > ~/values.yaml +--- +useHostNetwork: true + +ssh: + useSecrets: true + port: 32222 + hostKey: |- +$(cat $SSH_KEY_DIR/id_rsa | sed 's/^/ /g') + + hostKeyPub: |- +$(cat $SSH_KEY_DIR/id_rsa.pub | sed 's/^/ /g') + + +worker: + number: 2 + image: + repository: uber/horovod + tag: 0.12.1-tf1.8.0-py3.5 +master: + image: + repository: uber/horovod + tag: 0.12.1-tf1.8.0-py3.5 + args: + - "mpirun -np 3 --hostfile /horovod/generated/hostfile --mca orte_keep_fqdn_hostnames t --allow-run-as-root --display-map --tag-output --timestamp-output sh -c '/opt/conda/envs/tf_env/bin/python /opt/spark/examples/src/main/python/tensorflow/keras_mnist_advanced_modified.py'" +EOF +``` + +``` +NOTE: A sample values.yaml is provided for reference. After adding the above changes, we should have a values.yml similar to that. 
+``` + +> notice: the difference is that you should set `useHostNetwork` as true, then set another ssh port rather than `22` + +## Installing the Chart + +To install the chart with the release name `mnist`: + +```bash +$ helm install --values ~/values.yaml --name mnist stable/horovod +``` + +## Uninstalling the Chart + +To uninstall/delete the `mnist` deployment: + +```bash +$ helm delete mnist +``` + +The command removes all the Kubernetes components associated with the chart and +deletes the release. + +## Upgrading an existing Release to a new major version +A major chart version change (like v1.2.3 -> v2.0.0) indicates that there is an +incompatible breaking change needing manual actions. + +### 1.0.0 +This version removes the `chart` label from the `spec.selector.matchLabels` +which is immutable since `StatefulSet apps/v1beta2`. It has been inadvertently +added, causing any subsequent upgrade to fail. See https://github.com/helm/charts/issues/7726. + +In order to upgrade, delete the Horovod StatefulSet before upgrading, supposing your Release is named `my-release`: + +```bash +$ kubectl delete statefulsets.apps --cascade=false my-release +``` + +## Configuration + +The following table lists the configurable parameters of the Horovod +chart and their default values. 
+ +| Parameter | Description | Default | +|-----------|-------------|---------| +| `useHostNetwork` | Host network | `false` | +| `ssh.port` | The ssh port | `22` | +| `ssh.useSecrets` | Determine if using the secrets for ssh | `false` | +| `worker.number`| The worker's number | `5` | +| `worker.image.repository` | horovod worker image | `uber/horovod` | +| `worker.image.pullPolicy` | `pullPolicy` for the worker | `IfNotPresent` | +| `worker.image.tag` | `tag` for the worker | `0.12.1-tf1.8.0-py3.5` | +| `resources`| pod resource requests & limits| `{}`| +| `worker.env` | worker's environment variables | `{}` | +| `master.image.repository` | horovod master image | `uber/horovod` | +| `master.image.tag` | `tag` for the master | `0.12.1-tf1.8.0-py3.5` | +| `master.image.pullPolicy` | image pullPolicy for the master image| `IfNotPresent` | +| `master.args` | master's args | `{}` | +| `master.env` | master's environment variables | `{}` | diff --git a/vnfs/DAaaS/sample-apps/training/sample-horovod-app/keras_mnist_advanced_modified.py b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/keras_mnist_advanced_modified.py new file mode 100644 index 00000000..fa39cb6a --- /dev/null +++ b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/keras_mnist_advanced_modified.py @@ -0,0 +1,169 @@ +# Horovod + Keras MNIST training example; rank 0 exports the trained model to MODEL_BASE_PATH. +from __future__ import print_function +import keras +import os +from tensorflow.keras.datasets import mnist +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import Dense, Dropout, Flatten +from tensorflow.keras.layers import Conv2D, MaxPooling2D +from tensorflow.keras.preprocessing.image import ImageDataGenerator +from tensorflow.keras import backend as K +from tensorflow_estimator.python.estimator.export import export as export_helpers +from tensorflow.python.saved_model import builder as saved_model_builder +from tensorflow.python.saved_model import tag_constants, signature_constants +from tensorflow.python.saved_model.signature_def_utils_impl import 
predict_signature_def +import tensorflow as tf +import horovod.tensorflow.keras as hvd + + +# Horovod: initialize Horovod. +hvd.init() + +# Horovod: pin GPU to be used to process local rank (one GPU per process) +# NOTE: the two gpu_options lines below are commented out, so no GPU is pinned and the session uses a default ConfigProto. +config = tf.ConfigProto() +#config.gpu_options.allow_growth = True +#config.gpu_options.visible_device_list = str(hvd.local_rank()) +K.set_session(tf.Session(config=config)) + +batch_size = 128 +num_classes = 10 + +# Enough epochs to demonstrate learning rate warmup and the reduction of +# learning rate when training plateaus. +epochs = 24 + +# Input image dimensions +img_rows, img_cols = 28, 28 + +# The data, shuffled and split between train and test sets +(x_train, y_train), (x_test, y_test) = mnist.load_data() + +# Determine how many batches are there in train and test sets +train_batches = len(x_train) // batch_size +test_batches = len(x_test) // batch_size + +if K.image_data_format() == 'channels_first': + x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) + x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) + input_shape = (1, img_rows, img_cols) +else: + x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) + x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) + input_shape = (img_rows, img_cols, 1) + +x_train = x_train.astype('float32') +x_test = x_test.astype('float32') +x_train /= 255 +x_test /= 255 +print('x_train shape:', x_train.shape) +print(x_train.shape[0], 'train samples') +print(x_test.shape[0], 'test samples') + +# Convert class vectors to binary class matrices +y_train = tf.keras.utils.to_categorical(y_train, num_classes) +y_test = tf.keras.utils.to_categorical(y_test, num_classes) + +model = Sequential() +model.add(Conv2D(32, kernel_size=(3, 3), + activation='relu', + input_shape=input_shape)) +model.add(Conv2D(64, (3, 3), activation='relu')) +model.add(MaxPooling2D(pool_size=(2, 2))) +model.add(Dropout(0.25)) +model.add(Flatten()) +model.add(Dense(128, activation='relu')) 
+model.add(Dropout(0.5)) +model.add(Dense(num_classes, activation='softmax')) + +# Horovod: adjust learning rate based on number of GPUs. +opt = tf.keras.optimizers.Adadelta(lr=1.0 * hvd.size()) + +# Horovod: add Horovod Distributed Optimizer. +opt = hvd.DistributedOptimizer(opt) + +model.compile(loss=tf.keras.losses.categorical_crossentropy, + optimizer=opt, + metrics=['accuracy']) + +callbacks = [ + # Horovod: broadcast initial variable states from rank 0 to all other processes. + # This is necessary to ensure consistent initialization of all workers when + # training is started with random weights or restored from a checkpoint. + hvd.callbacks.BroadcastGlobalVariablesCallback(0), + + # Horovod: average metrics among workers at the end of every epoch. + # + # Note: This callback must be in the list before the ReduceLROnPlateau, + # TensorBoard or other metrics-based callbacks. + hvd.callbacks.MetricAverageCallback(), + + # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final + # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during + # the first five epochs. See https://arxiv.org/abs/1706.02677 for details. + hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1), + + # Reduce the learning rate if training plateaus. + tf.keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1), +] + +# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. +if hvd.rank() == 0: + callbacks.append(tf.keras.callbacks.ModelCheckpoint( + './checkpoint-{epoch}.h5')) + +# Set up ImageDataGenerators to do data augmentation for the training images. +train_gen = ImageDataGenerator(rotation_range=8, width_shift_range=0.08, shear_range=0.3, + height_shift_range=0.08, zoom_range=0.08) +test_gen = ImageDataGenerator() + +# Train the model. 
+# Horovod: the training will randomly sample 1 / N batches of training data and +# 3 / N batches of validation data on every worker, where N is the number of workers. +# Over-sampling of validation data helps to increase probability that every validation +# example will be evaluated. +model.fit_generator(train_gen.flow(x_train, y_train, batch_size=batch_size), + steps_per_epoch=train_batches // hvd.size(), + callbacks=callbacks, + epochs=epochs, + verbose=1, + validation_data=test_gen.flow( + x_test, y_test, batch_size=batch_size), + validation_steps=3 * test_batches // hvd.size()) + +# Evaluate the model on the full data set. +score = model.evaluate(x_test, y_test, verbose=0) +print('Test loss:', score[0]) +print('Test accuracy:', score[1]) + +# Save Model to Minio +if hvd.rank() == 0: + print('Model Summary') + model.summary() + print('Exporting trained model to Minio Model Repo') + # MODEL_BASE_PATH must be set in the environment; os.environ[...] raises KeyError otherwise. + # sample_values.yaml sets it to "s3://models/mnist/export/". + base_path = os.environ['MODEL_BASE_PATH'] + + # Option 1(Preferred) - Using Keras api and Tensorflow v1.13 version + saved_model_path = tf.contrib.saved_model.save_keras_model(model, base_path) + print('Model Saved to {} Using new Keras API!!!'.format(saved_model_path)) + # Option 2 - Tensorflow v1.13+ Builder saved_model api. 
+ # builder = saved_model_builder.SavedModelBuilder(base_path) + + # print(model.input) + # print(model.outputs) + + # signature = predict_signature_def(inputs={"inputs": model.input}, + # outputs={t.name:t for t in model.outputs}) + # print(signature) + # K.set_learning_phase(0) + # with K.get_session() as sess: + # builder.add_meta_graph_and_variables(sess=sess, + # tags=[tag_constants.SERVING], + # signature_def_map={'predict': signature}) + # builder.save() + # print('Model Saved to S3 Using Builder!!!') + + # Option 3 - Tensorflow v1.13 Will be deprecated in Tensorflow v2 + # NOTE(review): export_path is not defined anywhere in this script; presumably base_path is meant - confirm before enabling. + # tf.saved_model.simple_save( + # keras.backend.get_session(), + # export_path, + # inputs={'input_image': model.input}, + # outputs={t.name: t for t in model.outputs}) diff --git a/vnfs/DAaaS/sample-apps/training/sample-horovod-app/sample_values.yaml b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/sample_values.yaml new file mode 100644 index 00000000..7030dd24 --- /dev/null +++ b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/sample_values.yaml @@ -0,0 +1,62 @@ +--- +#useHostNetwork: true + +ssh: + useSecrets: true + port: 22 + hostKey: |- + -----BEGIN RSA PRIVATE KEY----- + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + 
ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + -----END RSA PRIVATE KEY----- + + hostKeyPub: |- + ssh-rsa ThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKey + ThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKey + ThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKey + ThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKey user@openSource + +resources: {} + +worker: + number: 2 + image: + repository: spark-tf-keras-horovod-pytorch + tag: latest + pullPolicy: Never +master: + image: + repository: spark-tf-keras-horovod-pytorch + tag: latest + pullPolicy: Never + args: + - "mpirun -np 3 --hostfile /horovod/generated/hostfile --mca orte_keep_fqdn_hostnames t --allow-run-as-root --display-map --tag-output --timestamp-output sh -c '/opt/conda/envs/tf_env/bin/python /opt/spark/examples/src/main/python/tensorflow/keras_mnist_advanced_modified.py'" + +## Model repository information (Minio) +minio: + existingSecret: "" + accessKey: "onapdaas" + secretKey: "onapsecretdaas" + environment: + AWS_REGION: "us-west-1" + S3_REGION: "us-west-1" + S3_ENDPOINT: "minio.edge1.svc.cluster.local:9000" + AWS_ENDPOINT_URL: "http://minio.edge1.svc.cluster.local:9000" + S3_USE_HTTPS: 0 + S3_VERIFY_SSL: 0 + AWS_LOG_LEVEL: 3 + TF_CPP_MIN_LOG_LEVEL: 3 + MODEL_NAME: "mnist" + MODEL_BASE_PATH: "s3://models/mnist/export/" diff --git a/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/NOTES.txt b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/NOTES.txt new file mode 100644 index 00000000..774555ae --- /dev/null +++ b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/NOTES.txt @@ -0,0 +1,5 @@ +1. 
Get the application URL by running these commands: + +*** NOTE: It may take a few minutes for the statefulset to be available + +*** you can watch the status of statefulset by running 'kubectl get sts --namespace {{ .Release.Namespace }} -w {{ template "horovod.fullname" . }}' ***
\ No newline at end of file diff --git a/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/_helpers.tpl b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/_helpers.tpl new file mode 100644 index 00000000..02071c0f --- /dev/null +++ b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/_helpers.tpl @@ -0,0 +1,32 @@ +{{/* vim: set filetype=mustache: */}} +{{/* +Expand the name of the chart. +*/}} +{{- define "horovod.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "horovod.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "horovod.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} diff --git a/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/config.yaml b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/config.yaml new file mode 100644 index 00000000..ae93c445 --- /dev/null +++ b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/config.yaml @@ -0,0 +1,130 @@ +{{- $workerNum := .Values.worker.number -}} +{{- $name := include "horovod.fullname" . 
}} +{{- /* +Slots per host: the "nvidia.com/gpu" entry of .Values.resources when it is set, otherwise 1. A single declaration is used because re-declaring $slots with := inside an if block only creates a block-local variable and leaves the outer value at 1. +*/ -}} +{{- $slots := default 1 (index .Values.resources "nvidia.com/gpu") }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ template "horovod.fullname" . }} + labels: + heritage: {{ .Release.Service | quote }} + release: {{ .Release.Name | quote }} + chart: {{ template "horovod.chart" . }} + app: {{ template "horovod.fullname" . }} +data: + hostfile.config: | + {{ $name }}-master slots={{ $slots }} + {{- range $i, $none := until (int $workerNum) }} + {{ $name }}-{{ $i }}.{{ $name }} slots={{ $slots }} + {{- end }} + ssh.readiness: | + #!/bin/bash + set -xev + ssh localhost ls + master.run: | + #!/bin/bash + set -x + sleep 5 + + mkdir -p /root/.ssh + rm -f /root/.ssh/config + touch /root/.ssh/config + + if [ "$USESECRETS" == "true" ];then + set +e + yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa + yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys + set -e + fi + + if [ -n "$SSHPORT" ]; then + echo "Port $SSHPORT" > /root/.ssh/config + sed -i "s/^Port.*/Port $SSHPORT /g" /etc/ssh/sshd_config + fi + echo "StrictHostKeyChecking no" >> /root/.ssh/config + /usr/sbin/sshd + + if [ $# -eq 0 ]; then + sleep infinity + else + bash -c "$*" + fi + sleep 300 + master.waitWorkerReady: | + #!/bin/bash + set -xev + function updateSSHPort() { + mkdir -p /root/.ssh + rm -f /root/.ssh/config + touch /root/.ssh/config + + if [ -n "$SSHPORT" ]; then + echo "Port $SSHPORT" > /root/.ssh/config + echo "StrictHostKeyChecking no" >> /root/.ssh/config + fi + } + + function runCheckSSH() { + if [[ "$USESECRETS" == "true" ]];then + set +e + yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa + yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys + set -e + fi + + for i in `cat $1 | awk '{print $(1)}'`;do + if [[ "$i" != *"master" ]];then + retry 30 ssh -o ConnectTimeout=2 -q $i exit + fi + done + } + + function retry() + { + local n=0;local try=$1 + local cmd="${@: 2}" + 
[[ $# -le 1 ]] && { + echo "Usage $0 <retry_number> <Command>"; + } + set +e + until [[ $n -ge $try ]] + do + $cmd && break || { + echo "Command Fail.." + ((n++)) + echo "retry $n :: [$cmd]" + sleep 1; + } + done + $cmd + if [ $? -ne 0 ]; then + exit 1 + fi + set -e + } + updateSSHPort + runCheckSSH $1 + worker.run: | + #!/bin/bash + set -x + + mkdir -p /root/.ssh + rm -f /root/.ssh/config + touch /root/.ssh/config + + if [[ "$USESECRETS" == "true" ]];then + set +e + yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa + yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys + set -e + fi + + if [ -n "$SSHPORT" ]; then + echo "Port $SSHPORT" > /root/.ssh/config + sed -i "s/^Port.*/Port $SSHPORT /g" /etc/ssh/sshd_config + fi + echo "StrictHostKeyChecking no" >> /root/.ssh/config + + /usr/sbin/sshd -D diff --git a/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/job-service.yaml b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/job-service.yaml new file mode 100644 index 00000000..e7b05c26 --- /dev/null +++ b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/job-service.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ template "horovod.fullname" . }}-master + labels: + app: {{ template "horovod.name" . }} + chart: {{ template "horovod.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + clusterIP: None + ports: + - name: ssh + port: {{ .Values.ssh.port }} + targetPort: {{ .Values.ssh.port }} + selector: + app: {{ template "horovod.name" . 
}} + release: {{ .Release.Name }} + role: master diff --git a/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/job.yaml b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/job.yaml new file mode 100644 index 00000000..da42ded8 --- /dev/null +++ b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/job.yaml @@ -0,0 +1,140 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ template "horovod.fullname" . }} + labels: + app: {{ template "horovod.name" . }} + chart: {{ template "horovod.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} + role: master +spec: + template: + metadata: + labels: + app: {{ template "horovod.name" . }} + release: {{ .Release.Name }} + role: master + spec: + {{- if .Values.useHostNetwork }} + hostNetwork: {{ .Values.useHostNetwork }} + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + {{- if .Values.useHostPID }} + hostPID: {{ .Values.useHostPID }} + {{- end }} + restartPolicy: OnFailure + volumes: + - name: {{ template "horovod.fullname" . }}-cm + configMap: + name: {{ template "horovod.fullname" . }} + items: + - key: hostfile.config + path: hostfile + mode: 438 # decimal 438 == octal 0666 (read/write for everyone) + - key: master.waitWorkerReady + path: waitWorkersReady.sh + mode: 365 # decimal 365 == octal 0555 (read/execute for everyone) + - key: master.run + path: run.sh + mode: 365 # decimal 365 == octal 0555 (read/execute for everyone) + {{- if .Values.ssh.useSecrets }} + - name: {{ template "horovod.fullname" . }}-secret + secret: + secretName: {{ template "horovod.fullname" . 
}} + defaultMode: 448 # decimal 448 == octal 0700 (owner only) + items: + - key: host-key + path: id_rsa + - key: host-key-pub + path: authorized_keys + {{- end }} +{{- if .Values.volumes }} +{{ toYaml .Values.volumes | indent 6 }} +{{- end }} + containers: + - name: horovod-master + image: "{{ .Values.master.image.repository }}:{{ .Values.master.image.tag }}" + imagePullPolicy: {{ .Values.master.image.pullPolicy }} + env: + - name: SSHPORT + value: "{{ .Values.ssh.port }}" + {{- if .Values.ssh.useSecrets }} + - name: USESECRETS + value: "{{ .Values.ssh.useSecrets }}" + {{- end }} + {{- if .Values.master.env }} + {{- range $key, $value := .Values.master.env }} + - name: "{{ $key }}" + value: "{{ $value }}" + {{- end }} + {{- end }} + # Minio credentials come from the user-supplied secret when minio.existingSecret is set, otherwise from the "<fullname>-minio" secret. + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: {{ if .Values.minio.existingSecret }}{{ .Values.minio.existingSecret }}{{ else }}{{ template "horovod.fullname" . }}-minio{{ end }} + key: accesskey + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: {{ if .Values.minio.existingSecret }}{{ .Values.minio.existingSecret }}{{ else }}{{ template "horovod.fullname" . }}-minio{{ end }} + key: secretkey + {{- range $key, $val := .Values.minio.environment }} + - name: {{ $key }} + value: {{ $val | quote }} + {{- end}} +{{- if .Values.master.privileged }} + securityContext: + privileged: true +{{- end }} + ports: + - containerPort: {{ .Values.ssh.port }} + volumeMounts: + - name: {{ template "horovod.fullname" . }}-cm + mountPath: /horovod/generated + {{- if .Values.ssh.useSecrets }} + - name: {{ template "horovod.fullname" . 
}}-secret + readOnly: true + mountPath: "/etc/secret-volume" + {{- end }} +{{- if .Values.volumeMounts }} +{{ toYaml .Values.volumeMounts | indent 8 }} +{{- end }} + command: + - /horovod/generated/run.sh + args: +{{ toYaml .Values.master.args | indent 10 }} + resources: +{{ toYaml .Values.resources | indent 10 }} +{{- if .Values.ssh.useSecrets }} + # wait-workers runs waitWorkersReady.sh against the generated hostfile before the master container starts; it is only rendered when ssh.useSecrets is true. + initContainers: + - name: wait-workers + image: "{{ .Values.master.image.repository }}:{{ .Values.master.image.tag }}" + imagePullPolicy: {{ .Values.master.image.pullPolicy }} + env: + - name: SSHPORT + value: "{{ .Values.ssh.port }}" + {{- if .Values.ssh.useSecrets }} + - name: USESECRETS + value: "{{ .Values.ssh.useSecrets }}" + {{- end }} + {{- if .Values.master.env }} + {{- range $key, $value := .Values.master.env }} + - name: "{{ $key }}" + value: "{{ $value }}" + {{- end }} + {{- end }} + command: + - /horovod/generated/waitWorkersReady.sh + args: + - /horovod/generated/hostfile + volumeMounts: + - name: {{ template "horovod.fullname" . }}-cm + mountPath: /horovod/generated + {{- if .Values.ssh.useSecrets }} + - name: {{ template "horovod.fullname" . }}-secret + readOnly: true + mountPath: "/etc/secret-volume" + {{- end }} +{{- end }} diff --git a/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/minio-secrets.yaml b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/minio-secrets.yaml new file mode 100644 index 00000000..c99abe67 --- /dev/null +++ b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/minio-secrets.yaml @@ -0,0 +1,31 @@ +{{/* +# Copyright 2019 Intel Corporation, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/}} + +{{- if not .Values.minio.existingSecret }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ template "horovod.fullname" . }}-minio + labels: + app: {{ template "horovod.name" . }} + chart: {{ template "horovod.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +type: Opaque +data: + accesskey: {{ .Values.minio.accessKey | b64enc }} + secretkey: {{ .Values.minio.secretKey | b64enc }} +{{- end }} diff --git a/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/secrets.yaml b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/secrets.yaml new file mode 100644 index 00000000..c9853ed0 --- /dev/null +++ b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/secrets.yaml @@ -0,0 +1,15 @@ +{{- if .Values.ssh.useSecrets }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ template "horovod.fullname" . }} + labels: + app: {{ template "horovod.name" . }} + chart: {{ template "horovod.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +type: Opaque +data: + host-key: {{ .Values.ssh.hostKey | b64enc | quote }} + host-key-pub: {{ .Values.ssh.hostKeyPub | b64enc | quote }} +{{- end }}
\ No newline at end of file
diff --git a/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/statefulset-service.yaml b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/statefulset-service.yaml
new file mode 100644
index 00000000..d0216a86
--- /dev/null
+++ b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/statefulset-service.yaml
@@ -0,0 +1,20 @@
+# Service with clusterIP: None that selects the role=worker pods and exposes the SSH port.
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ template "horovod.fullname" . }}
+  labels:
+    app: {{ template "horovod.name" . }}
+    chart: {{ template "horovod.chart" . }}
+    release: {{ .Release.Name }}
+    heritage: {{ .Release.Service }}
+spec:
+  clusterIP: None
+  ports:
+  - name: ssh
+    port: {{ .Values.ssh.port }}
+    targetPort: {{ .Values.ssh.port }}
+  selector:
+    app: {{ template "horovod.name" . }}
+    release: {{ .Release.Name }}
+    role: worker
diff --git a/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/statefulset.yaml b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/statefulset.yaml
new file mode 100644
index 00000000..1d3f7577
--- /dev/null
+++ b/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/statefulset.yaml
@@ -0,0 +1,112 @@
+# Horovod worker StatefulSet; each pod runs /horovod/generated/run.sh (ConfigMap key worker.run).
+# NOTE(review): the worker container env below is populated from master.env - confirm this is intended.
+apiVersion: apps/v1  # apps/v1beta2 is no longer served as of Kubernetes 1.16
+kind: StatefulSet
+metadata:
+  name: {{ template "horovod.fullname" . }}
+  labels:
+    app: {{ template "horovod.name" . }}
+    chart: {{ template "horovod.chart" . }}
+    release: {{ .Release.Name }}
+    heritage: {{ .Release.Service }}
+    role: worker
+spec:
+  selector:
+    matchLabels:
+      app: {{ template "horovod.name" . }}
+      release: {{ .Release.Name }}
+      heritage: {{ .Release.Service }}
+      role: worker
+  serviceName: {{ template "horovod.fullname" . }}
+  podManagementPolicy: {{ .Values.worker.podManagementPolicy }}
+  replicas: {{.Values.worker.number}}
+  template:
+    metadata:
+      labels:
+        app: {{ template "horovod.name" . }}
+        chart: {{ template "horovod.chart" . }}
+        release: {{ .Release.Name }}
+        heritage: {{ .Release.Service }}
+        role: worker
+    spec:
+      {{- if .Values.useHostNetwork }}
+      hostNetwork: {{ .Values.useHostNetwork }}
+      dnsPolicy: ClusterFirstWithHostNet
+      {{- end }}
+      {{- if .Values.useHostPID }}
+      hostPID: {{ .Values.useHostPID }}
+      {{- end }}
+      volumes:
+      - name: {{ template "horovod.fullname" . }}-cm
+        configMap:
+          name: {{ template "horovod.fullname" . }}
+          items:
+          - key: hostfile.config
+            path: hostfile
+            mode: 438  # decimal for octal 0666
+          - key: ssh.readiness
+            path: check.sh
+            mode: 365  # decimal for octal 0555
+          - key: worker.run
+            path: run.sh
+            mode: 365  # decimal for octal 0555
+      {{- if .Values.ssh.useSecrets }}
+      - name: {{ template "horovod.fullname" . }}-secret
+        secret:
+          secretName: {{ template "horovod.fullname" . }}
+          defaultMode: 448  # decimal for octal 0700
+          items:
+          - key: host-key
+            path: id_rsa
+          - key: host-key-pub
+            path: authorized_keys
+      {{- end }}
+{{- if .Values.volumes }}
+{{ toYaml .Values.volumes | indent 6 }}
+{{- end }}
+      containers:
+      - name: worker
+        image: "{{ .Values.worker.image.repository }}:{{ .Values.worker.image.tag }}"
+        imagePullPolicy: {{ .Values.worker.image.pullPolicy }}
+        env:
+          - name: SSHPORT
+            value: {{ .Values.ssh.port | quote }}
+          {{- if .Values.ssh.useSecrets }}
+          - name: USESECRETS
+            value: {{ .Values.ssh.useSecrets | quote }}
+          {{- end }}
+          {{- if .Values.master.env }}
+          {{- range $key, $value := .Values.master.env }}
+          - name: {{ $key | quote }}
+            value: {{ $value | quote }}
+          {{- end }}
+          {{- end }}
+{{- if .Values.worker.privileged }}
+        securityContext:
+          privileged: true
+{{- end }}
+        ports:
+        - containerPort: {{ .Values.ssh.port }}
+        volumeMounts:
+        - name: {{ template "horovod.fullname" . }}-cm
+          mountPath: /horovod/generated
+        {{- if .Values.ssh.useSecrets }}
+        - name: {{ template "horovod.fullname" . }}-secret
+          readOnly: true
+          mountPath: "/etc/secret-volume"
+        {{- end }}
+{{- if .Values.volumeMounts }}
+{{ toYaml .Values.volumeMounts | indent 8 }}
+{{- end }}
+        command:
+        - /horovod/generated/run.sh
+{{- if .Values.ssh.useSecrets }}
+        readinessProbe:
+          exec:
+            command:
+            - /horovod/generated/check.sh
+          initialDelaySeconds: 1
+          periodSeconds: 2
+{{- end }}
+        resources:
+{{ toYaml .Values.resources | indent 10 }}
|