From 2fd13e2728c1b2e1c290ff2df32b6420d1f1c45f Mon Sep 17 00:00:00 2001 From: Rajamohan Raj Date: Sat, 20 Apr 2019 00:54:45 +0000 Subject: Fixing some helm lint issues. Change-Id: I6d62bcd10c60c422aaeb146078aee1b162838926 Issue-ID: ONAPARC-450 Signed-off-by: Rajamohan Raj --- vnfs/DAaaS/applications/Chart.yaml | 5 + vnfs/DAaaS/applications/Charts.yaml | 5 - .../charts/sample-horovod-app/Charts.yml | 5 - .../charts/sample-horovod-app/Dockerfile | 142 ------------------ .../charts/sample-horovod-app/README.md | 162 --------------------- .../keras_mnist_advanced_modified.py | 127 ---------------- .../charts/sample-horovod-app/sample_values.yaml | 44 ------ .../charts/sample-horovod-app/templates/NOTES.txt | 5 - .../sample-horovod-app/templates/_helpers.tpl | 32 ---- .../sample-horovod-app/templates/config.yaml | 130 ----------------- .../sample-horovod-app/templates/job-service.yaml | 19 --- .../charts/sample-horovod-app/templates/job.yaml | 126 ---------------- .../sample-horovod-app/templates/secrets.yaml | 15 -- .../templates/statefulset-service.yaml | 19 --- .../sample-horovod-app/templates/statefulset.yaml | 115 --------------- .../charts/sample-spark-app/.helmignore | 22 --- .../charts/sample-spark-app/Chart.yaml | 5 - .../charts/sample-spark-app/Dockerfile | 133 ----------------- .../sample-spark-app/templates/SampleSparkApp.yaml | 43 ------ .../charts/sample-spark-app/templates/_helpers.tpl | 32 ---- .../charts/sample-spark-app/values.yaml | 57 -------- .../applications/sample-horovod-app/Chart.yaml | 5 + .../applications/sample-horovod-app/Dockerfile | 142 ++++++++++++++++++ .../applications/sample-horovod-app/README.md | 162 +++++++++++++++++++++ .../keras_mnist_advanced_modified.py | 127 ++++++++++++++++ .../sample-horovod-app/sample_values.yaml | 44 ++++++ .../sample-horovod-app/templates/NOTES.txt | 5 + .../sample-horovod-app/templates/_helpers.tpl | 32 ++++ .../sample-horovod-app/templates/config.yaml | 130 +++++++++++++++++ .../sample-horovod-app/templates/job-service.yaml | 19 +++ .../sample-horovod-app/templates/job.yaml | 126 ++++++++++++++++ .../sample-horovod-app/templates/secrets.yaml | 15 ++ .../templates/statefulset-service.yaml | 19 +++ .../sample-horovod-app/templates/statefulset.yaml | 115 +++++++++++++++ .../applications/sample-spark-app/.helmignore | 22 +++ .../DAaaS/applications/sample-spark-app/Chart.yaml | 5 + .../DAaaS/applications/sample-spark-app/Dockerfile | 133 +++++++++++++++++ .../sample-spark-app/templates/SampleSparkApp.yaml | 43 ++++++ .../sample-spark-app/templates/_helpers.tpl | 32 ++++ .../applications/sample-spark-app/values.yaml | 57 ++++++++ 40 files changed, 1238 insertions(+), 1238 deletions(-) create mode 100644 vnfs/DAaaS/applications/Chart.yaml delete mode 100644 vnfs/DAaaS/applications/Charts.yaml delete mode 100644 vnfs/DAaaS/applications/charts/sample-horovod-app/Charts.yml delete mode 100644 vnfs/DAaaS/applications/charts/sample-horovod-app/Dockerfile delete mode 100644 vnfs/DAaaS/applications/charts/sample-horovod-app/README.md delete mode 100644 vnfs/DAaaS/applications/charts/sample-horovod-app/keras_mnist_advanced_modified.py delete mode 100644 vnfs/DAaaS/applications/charts/sample-horovod-app/sample_values.yaml delete mode 100644 vnfs/DAaaS/applications/charts/sample-horovod-app/templates/NOTES.txt delete mode 100644 vnfs/DAaaS/applications/charts/sample-horovod-app/templates/_helpers.tpl delete mode 100644 vnfs/DAaaS/applications/charts/sample-horovod-app/templates/config.yaml delete mode 100644 
vnfs/DAaaS/applications/charts/sample-horovod-app/templates/job-service.yaml delete mode 100644 vnfs/DAaaS/applications/charts/sample-horovod-app/templates/job.yaml delete mode 100644 vnfs/DAaaS/applications/charts/sample-horovod-app/templates/secrets.yaml delete mode 100644 vnfs/DAaaS/applications/charts/sample-horovod-app/templates/statefulset-service.yaml delete mode 100644 vnfs/DAaaS/applications/charts/sample-horovod-app/templates/statefulset.yaml delete mode 100644 vnfs/DAaaS/applications/charts/sample-spark-app/.helmignore delete mode 100644 vnfs/DAaaS/applications/charts/sample-spark-app/Chart.yaml delete mode 100644 vnfs/DAaaS/applications/charts/sample-spark-app/Dockerfile delete mode 100644 vnfs/DAaaS/applications/charts/sample-spark-app/templates/SampleSparkApp.yaml delete mode 100644 vnfs/DAaaS/applications/charts/sample-spark-app/templates/_helpers.tpl delete mode 100644 vnfs/DAaaS/applications/charts/sample-spark-app/values.yaml create mode 100644 vnfs/DAaaS/applications/sample-horovod-app/Chart.yaml create mode 100644 vnfs/DAaaS/applications/sample-horovod-app/Dockerfile create mode 100644 vnfs/DAaaS/applications/sample-horovod-app/README.md create mode 100644 vnfs/DAaaS/applications/sample-horovod-app/keras_mnist_advanced_modified.py create mode 100644 vnfs/DAaaS/applications/sample-horovod-app/sample_values.yaml create mode 100644 vnfs/DAaaS/applications/sample-horovod-app/templates/NOTES.txt create mode 100644 vnfs/DAaaS/applications/sample-horovod-app/templates/_helpers.tpl create mode 100644 vnfs/DAaaS/applications/sample-horovod-app/templates/config.yaml create mode 100644 vnfs/DAaaS/applications/sample-horovod-app/templates/job-service.yaml create mode 100644 vnfs/DAaaS/applications/sample-horovod-app/templates/job.yaml create mode 100644 vnfs/DAaaS/applications/sample-horovod-app/templates/secrets.yaml create mode 100644 vnfs/DAaaS/applications/sample-horovod-app/templates/statefulset-service.yaml create mode 100644 vnfs/DAaaS/applications/sample-horovod-app/templates/statefulset.yaml create mode 100644 vnfs/DAaaS/applications/sample-spark-app/.helmignore create mode 100644 vnfs/DAaaS/applications/sample-spark-app/Chart.yaml create mode 100644 vnfs/DAaaS/applications/sample-spark-app/Dockerfile create mode 100644 vnfs/DAaaS/applications/sample-spark-app/templates/SampleSparkApp.yaml create mode 100644 vnfs/DAaaS/applications/sample-spark-app/templates/_helpers.tpl create mode 100644 vnfs/DAaaS/applications/sample-spark-app/values.yaml diff --git a/vnfs/DAaaS/applications/Chart.yaml b/vnfs/DAaaS/applications/Chart.yaml new file mode 100644 index 00000000..803e19aa --- /dev/null +++ b/vnfs/DAaaS/applications/Chart.yaml @@ -0,0 +1,5 @@ + apiVersion: v1 + appVersion: "1.0" + description: Helm chart for sample applications that use the components that the training-core framework deploys. + name: sample-applications + version: 0.1.0 diff --git a/vnfs/DAaaS/applications/Charts.yaml b/vnfs/DAaaS/applications/Charts.yaml deleted file mode 100644 index 803e19aa..00000000 --- a/vnfs/DAaaS/applications/Charts.yaml +++ /dev/null @@ -1,5 +0,0 @@ - apiVersion: v1 - appVersion: "1.0" - description: Helm chart for sample applications that use the components that the training-core framework deploys. 
- name: sample-applications - version: 0.1.0 diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/Charts.yml b/vnfs/DAaaS/applications/charts/sample-horovod-app/Charts.yml deleted file mode 100644 index 50b52b98..00000000 --- a/vnfs/DAaaS/applications/charts/sample-horovod-app/Charts.yml +++ /dev/null @@ -1,5 +0,0 @@ -apiVersion: v1 -appVersion: "1.0" -description: "A sample horovod application which runs the MNIST application using Tensorflow as backend" -name: sample-horovod-app -version: 0.1.0 diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/Dockerfile b/vnfs/DAaaS/applications/charts/sample-horovod-app/Dockerfile deleted file mode 100644 index 8bdcf5b6..00000000 --- a/vnfs/DAaaS/applications/charts/sample-horovod-app/Dockerfile +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) 2019 Intel Corporation -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Ported kubernetes spark image to Ubuntu - -FROM ubuntu:18.04 - -# Install jdk -RUN apt update -yqq -RUN apt install -y locales openjdk-8-jdk && rm -rf /var/lib/apt/lists/* \ - && localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 - -# Install all the essentials -RUN apt-get update --fix-missing && \ - apt-get install -y numactl wget curl bzip2 nmap vim ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 \ - git mercurial subversion build-essential openssh-server openssh-client net-tools && \ - mkdir -p /var/run/sshd && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -ENV LANG en_US.utf8 -ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64 -ENV PATH $JAVA_HOME/bin:$PATH -ENV PATH /opt/conda/bin:/opt/spark/bin:$PATH -ENV OPENMPI_VERSION 3.1 - -# Install openMPI -RUN mkdir /tmp/openmpi && \ - cd /tmp/openmpi && \ - wget --quiet https://www.open-mpi.org/software/ompi/v${OPENMPI_VERSION}/downloads/openmpi-${OPENMPI_VERSION}.2.tar.gz -O openmpi.tar.gz && \ - tar zxf openmpi.tar.gz && \ - cd openmpi-3.1.2 && \ - ./configure --enable-orterun-prefix-by-default && \ - make -j $(nproc) all && \ - make install && \ - ldconfig && \ - rm -rf /tmp/openmpi - -# Install miniconda -RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \ - /bin/bash ~/miniconda.sh -b -p /opt/conda && \ - rm ~/miniconda.sh && \ - ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ - echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ - echo "conda activate base" >> ~/.bashrc - -# Install tf & keras using conda in the virtual_environment:tf_env -SHELL ["/bin/bash", "-c"] -RUN conda update -n base -c defaults conda && \ - conda create -n tf_env -RUN conda install -n tf_env -y -c anaconda \ - pip tensorflow keras nltk pyarrow -RUN conda install -n tf_env -y -c anaconda h5py - -RUN conda install -n tf_env -y -c pytorch pytorch-cpu -RUN conda install -n tf_env -y -c conda-forge matplotlib - -RUN echo "conda activate tf_env" >> ~/.bashrc && \ - conda install -n tf_env -y -c conda-forge clangdev - -RUN source ~/.bashrc -RUN HOROVOD_WITH_TENSORFLOW=1 /opt/conda/envs/tf_env/bin/pip install --no-cache-dir horovod - -# openMPI sane defaults: -RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ - echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf && \ - echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf - -# Allow OpenSSH to talk to containers without asking for confirmation -RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \ - echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ - mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config - -# Install tini -RUN apt-get install -y curl grep sed dpkg && \ - TINI_VERSION=`curl https://github.com/krallin/tini/releases/latest | grep -o "/v.*\"" | sed 's:^..\(.*\).$:\1:'` && echo ${TINI_VERSION} && \ - curl -L "https://github.com/krallin/tini/releases/download/v${TINI_VERSION}/tini_${TINI_VERSION}.deb" > tini.deb && \ - dpkg -i tini.deb && \ - rm tini.deb && \ - apt clean - -# This is needed to match the original entrypoint.sh file. -RUN cp /usr/bin/tini /sbin - -# Begin: Installing spark -ARG spark_jars=jars -ARG img_path=kubernetes/dockerfiles -ARG k8s_tests=kubernetes/tests - -# Before building the docker image, first build and make a Spark distribution following -# the instructions in http://spark.apache.org/docs/latest/building-spark.html. -# If this docker file is being used in the context of building your images from a Spark -# distribution, the docker build command should be invoked from the top level directory -# of the Spark distribution. E.g.: -# docker build -t spark:latest -f kubernetes/dockerfiles/spark/ubuntu18.04/Dockerfile . 
- -RUN mkdir -p /opt/spark && \ - mkdir -p /opt/spark/work-dir && \ - touch /opt/spark/RELEASE && \ - rm /bin/sh && \ - ln -sv /bin/bash /bin/sh && \ - echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \ - chgrp root /etc/passwd && chmod ug+rw /etc/passwd - - -COPY ${spark_jars} /opt/spark/jars -COPY bin /opt/spark/bin -COPY sbin /opt/spark/sbin -COPY ${img_path}/spark/entrypoint.sh /opt/ -COPY examples /opt/spark/examples -COPY ${k8s_tests} /opt/spark/tests -COPY data /opt/spark/data -ENV SPARK_HOME /opt/spark - -RUN mkdir /opt/spark/python -COPY python/pyspark /opt/spark/python/pyspark -COPY python/lib /opt/spark/python/lib -ENV PYTHONPATH /opt/spark/python/lib/pyspark.zip:/opt/spark/python/lib/py4j-*.zip -ENV PATH /opt/conda/envs/tf_env/bin:$PATH - -RUN echo "export PATH=/opt/conda/envs/tf_env/bin:$PATH" >> ~/.bashrc -# echo "activate tf_env\n" >> ~/.bashrc -RUN pip install petastorm -WORKDIR /opt/spark/work-dir - -ENTRYPOINT [ "/opt/entrypoint.sh" ] - -# End: Installing spark diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/README.md b/vnfs/DAaaS/applications/charts/sample-horovod-app/README.md deleted file mode 100644 index 08e7691f..00000000 --- a/vnfs/DAaaS/applications/charts/sample-horovod-app/README.md +++ /dev/null @@ -1,162 +0,0 @@ -# Horovod - -[Horovod](https://eng.uber.com/horovod/) is a distributed training framework for TensorFlow, and it is provided by Uber. The goal of Horovod is to make distributed Deep Learning fast and easy to use. It also provides [Horovod in Docker](https://github.com/uber/horovod/blob/master/docs/docker.md) to streamline the installation process. - -## Introduction - -This chart bootstraps Horovod, a distributed TensorFlow framework, on a Kubernetes cluster using the Helm package manager. It deploys Horovod workers as StatefulSets and the Horovod master as a Job, then discovers the host list automatically. - -## Prerequisites - -- Kubernetes cluster v1.8+ - -## Build Docker Image - -You can use the Dockerfile provided along with this package. The benefit of this Dockerfile is that it contains many additional packages that data science engineers usually require, such as Spark, TensorFlow, PyTorch, matplotlib, NLTK, -Keras, h5py, and PyArrow. - -Before building the Docker image, first build and make a Spark distribution following the instructions in http://spark.apache.org/docs/latest/building-spark.html -If this Dockerfile is being used in the context of building your images from a Spark distribution, the docker build command should be invoked from the top level directory of the Spark distribution. E.g.: - -``` -docker build -t spark:latest -f kubernetes/dockerfiles/spark/ubuntu18.04/Dockerfile . -``` - -Once you build the Spark image, go inside the Spark package and place the file "keras_mnist_advanced_modified.py" in the directory examples/src/main/python/tensorflow/. Create the 'tensorflow' directory if it doesn't exist. -We do this because the file keras_mnist_advanced_modified.py is optimized for running on CPUs, and we want this file to be automatically present in the final Docker image that we build. - -``` -docker build -t spark-tf-keras-horovod-pytorch:latest -f kubernetes/dockerfiles/spark/ubuntu18.04/Dockerfile . 
-``` - -## Prepare ssh keys - -``` -# Setup ssh key -export SSH_KEY_DIR=`mktemp -d` -cd $SSH_KEY_DIR -yes | ssh-keygen -N "" -f id_rsa -``` - -## Create the values.yaml - -To run Horovod with GPU, you can create a `values.yaml` like the one below - -``` -# cat << EOF > ~/values.yaml ---- -ssh: - useSecrets: true - hostKey: |- -$(cat $SSH_KEY_DIR/id_rsa | sed 's/^/ /g') - - hostKeyPub: |- -$(cat $SSH_KEY_DIR/id_rsa.pub | sed 's/^/ /g') - -worker: - number: 2 - image: - repository: uber/horovod - tag: 0.12.1-tf1.8.0-py3.5 -master: - image: - repository: uber/horovod - tag: 0.12.1-tf1.8.0-py3.5 - args: - - "mpirun -np 3 --hostfile /horovod/generated/hostfile --mca orte_keep_fqdn_hostnames t --allow-run-as-root --display-map --tag-output --timestamp-output sh -c '/opt/conda/envs/tf_env/bin/python /opt/spark/examples/src/main/python/tensorflow/keras_mnist_advanced_modified.py'" -EOF -``` - -In most cases, the overlay network greatly impacts Horovod performance, so we should apply the `Host Network` solution. To run Horovod with Host Network and GPU, you can create a `values.yaml` like the one below - - -``` -# cat << EOF > ~/values.yaml ---- -useHostNetwork: true - -ssh: - useSecrets: true - port: 32222 - hostKey: |- -$(cat $SSH_KEY_DIR/id_rsa | sed 's/^/ /g') - - hostKeyPub: |- -$(cat $SSH_KEY_DIR/id_rsa.pub | sed 's/^/ /g') - - -worker: - number: 2 - image: - repository: uber/horovod - tag: 0.12.1-tf1.8.0-py3.5 -master: - image: - repository: uber/horovod - tag: 0.12.1-tf1.8.0-py3.5 - args: - - "mpirun -np 3 --hostfile /horovod/generated/hostfile --mca orte_keep_fqdn_hostnames t --allow-run-as-root --display-map --tag-output --timestamp-output sh -c '/opt/conda/envs/tf_env/bin/python /opt/spark/examples/src/main/python/tensorflow/keras_mnist_advanced_modified.py'" -EOF -``` - -``` -NOTE: A sample values.yaml is provided for reference. After making the above changes, you should have a values.yaml similar to it. -``` - -> Note: the difference is that you should set `useHostNetwork` to `true`, then choose an SSH port other than `22` - -## Installing the Chart - -To install the chart with the release name `mnist`: - -```bash -$ helm install --values ~/values.yaml --name mnist stable/horovod -``` - -## Uninstalling the Chart - -To uninstall/delete the `mnist` deployment: - -```bash -$ helm delete mnist -``` - -The command removes all the Kubernetes components associated with the chart and -deletes the release. - -## Upgrading an existing Release to a new major version -A major chart version change (like v1.2.3 -> v2.0.0) indicates that there is an -incompatible breaking change that requires manual action. - -### 1.0.0 -This version removes the `chart` label from the `spec.selector.matchLabels` -which is immutable since `StatefulSet apps/v1beta2`. It had been inadvertently -added, causing any subsequent upgrade to fail. See https://github.com/helm/charts/issues/7726. - -In order to upgrade, delete the Horovod StatefulSet before upgrading, supposing your Release is named `my-release`: - -```bash -$ kubectl delete statefulsets.apps --cascade=false my-release -``` - -## Configuration - -The following table lists the configurable parameters of the Horovod -chart and their default values. 
- -| Parameter | Description | Default | -|-----------|-------------|---------| -| `useHostNetwork` | Host network | `false` | -| `ssh.port` | The ssh port | `22` | -| `ssh.useSecrets` | Whether to use secrets for SSH | `false` | -| `worker.number`| Number of workers | `5` | -| `worker.image.repository` | horovod worker image | `uber/horovod` | -| `worker.image.pullPolicy` | `pullPolicy` for the worker | `IfNotPresent` | -| `worker.image.tag` | `tag` for the worker | `0.12.1-tf1.8.0-py3.5` | -| `resources`| pod resource requests & limits| `{}`| -| `worker.env` | worker's environment variables | `{}` | -| `master.image.repository` | horovod master image | `uber/horovod` | -| `master.image.tag` | `tag` for the master | `0.12.1-tf1.8.0-py3.5` | -| `master.image.pullPolicy` | image pullPolicy for the master image| `IfNotPresent` | -| `master.args` | master's args | `{}` | -| `master.env` | master's environment variables | `{}` | diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/keras_mnist_advanced_modified.py b/vnfs/DAaaS/applications/charts/sample-horovod-app/keras_mnist_advanced_modified.py deleted file mode 100644 index 03425ff7..00000000 --- a/vnfs/DAaaS/applications/charts/sample-horovod-app/keras_mnist_advanced_modified.py +++ /dev/null @@ -1,127 +0,0 @@ -from __future__ import print_function -import keras -from keras.datasets import mnist -from keras.models import Sequential -from keras.layers import Dense, Dropout, Flatten -from keras.layers import Conv2D, MaxPooling2D -from keras.preprocessing.image import ImageDataGenerator -from keras import backend as K -import tensorflow as tf -import horovod.keras as hvd - -# Horovod: initialize Horovod. -hvd.init() - -# Horovod: pin GPU to be used to process local rank (one GPU per process) -config = tf.ConfigProto() -#config.gpu_options.allow_growth = True -#config.gpu_options.visible_device_list = str(hvd.local_rank()) -K.set_session(tf.Session(config=config)) - -batch_size = 128 -num_classes = 10 - -# Enough epochs to demonstrate learning rate warmup and the reduction of -# learning rate when training plateaus. 
-epochs = 24 - -# Input image dimensions -img_rows, img_cols = 28, 28 - -# The data, shuffled and split between train and test sets -(x_train, y_train), (x_test, y_test) = mnist.load_data() - -# Determine how many batches there are in the train and test sets -train_batches = len(x_train) // batch_size -test_batches = len(x_test) // batch_size - -if K.image_data_format() == 'channels_first': - x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols) - x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols) - input_shape = (1, img_rows, img_cols) -else: - x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1) - x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1) - input_shape = (img_rows, img_cols, 1) - -x_train = x_train.astype('float32') -x_test = x_test.astype('float32') -x_train /= 255 -x_test /= 255 -print('x_train shape:', x_train.shape) -print(x_train.shape[0], 'train samples') -print(x_test.shape[0], 'test samples') - -# Convert class vectors to binary class matrices -y_train = keras.utils.to_categorical(y_train, num_classes) -y_test = keras.utils.to_categorical(y_test, num_classes) - -model = Sequential() -model.add(Conv2D(32, kernel_size=(3, 3), - activation='relu', - input_shape=input_shape)) -model.add(Conv2D(64, (3, 3), activation='relu')) -model.add(MaxPooling2D(pool_size=(2, 2))) -model.add(Dropout(0.25)) -model.add(Flatten()) -model.add(Dense(128, activation='relu')) -model.add(Dropout(0.5)) -model.add(Dense(num_classes, activation='softmax')) - -# Horovod: adjust learning rate based on number of GPUs. -opt = keras.optimizers.Adadelta(lr=1.0 * hvd.size()) - -# Horovod: add Horovod Distributed Optimizer. -opt = hvd.DistributedOptimizer(opt) - -model.compile(loss=keras.losses.categorical_crossentropy, - optimizer=opt, - metrics=['accuracy']) - -callbacks = [ - # Horovod: broadcast initial variable states from rank 0 to all other processes. - # This is necessary to ensure consistent initialization of all workers when - # training is started with random weights or restored from a checkpoint. - hvd.callbacks.BroadcastGlobalVariablesCallback(0), - - # Horovod: average metrics among workers at the end of every epoch. - # - # Note: This callback must be in the list before the ReduceLROnPlateau, - # TensorBoard or other metrics-based callbacks. - hvd.callbacks.MetricAverageCallback(), - - # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final - # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during - # the first five epochs. See https://arxiv.org/abs/1706.02677 for details. - hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1), - - # Reduce the learning rate if training plateaus. - keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1), -] - -# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. -if hvd.rank() == 0: - callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5')) - -# Set up ImageDataGenerators to do data augmentation for the training images. -train_gen = ImageDataGenerator(rotation_range=8, width_shift_range=0.08, shear_range=0.3, - height_shift_range=0.08, zoom_range=0.08) -test_gen = ImageDataGenerator() - -# Train the model. -# Horovod: the training will randomly sample 1 / N batches of training data and -# 3 / N batches of validation data on every worker, where N is the number of workers. 
-# Over-sampling of validation data helps to increase probability that every validation -# example will be evaluated. -model.fit_generator(train_gen.flow(x_train, y_train, batch_size=batch_size), - steps_per_epoch=train_batches // hvd.size(), - callbacks=callbacks, - epochs=epochs, - verbose=1, - validation_data=test_gen.flow(x_test, y_test, batch_size=batch_size), - validation_steps=3 * test_batches // hvd.size()) - -# Evaluate the model on the full data set. -score = model.evaluate(x_test, y_test, verbose=0) -print('Test loss:', score[0]) -print('Test accuracy:', score[1]) diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/sample_values.yaml b/vnfs/DAaaS/applications/charts/sample-horovod-app/sample_values.yaml deleted file mode 100644 index 6ac31359..00000000 --- a/vnfs/DAaaS/applications/charts/sample-horovod-app/sample_values.yaml +++ /dev/null @@ -1,44 +0,0 @@ ---- -#useHostNetwork: true - -ssh: - useSecrets: true - hostKey: |- - -----BEGIN RSA PRIVATE KEY----- - ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey - ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey - ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey - ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey - ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey - ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey - ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey - ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey - ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey - ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey - ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey - ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey - ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey - ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey - -----END RSA PRIVATE KEY----- - - hostKeyPub: |- - ssh-rsa ThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKey - ThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKey - ThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKey - ThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKey user@openSource - -resources: {} - -worker: - number: 2 - image: - repository: spark-tf-keras-horovod-pytorch - tag: latest - pullPolicy: Never -master: - image: - repository: spark-tf-keras-horovod-pytorch - tag: latest - pullPolicy: Never - args: - - "mpirun -np 3 --hostfile /horovod/generated/hostfile --mca orte_keep_fqdn_hostnames t --allow-run-as-root --display-map --tag-output --timestamp-output sh -c '/opt/conda/envs/tf_env/bin/python /opt/spark/examples/src/main/python/tensorflow/keras_mnist_advanced_modified.py'" diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/NOTES.txt b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/NOTES.txt deleted file mode 100644 index 774555ae..00000000 --- a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/NOTES.txt +++ /dev/null @@ -1,5 +0,0 @@ -1. Get the application URL by running these commands: - -*** NOTE: It may take a few minutes for the statefulset to be available - -*** you can watch the status of statefulset by running 'kubectl get sts --namespace {{ .Release.Namespace }} -w {{ template "horovod.fullname" . 
}}' *** \ No newline at end of file diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/_helpers.tpl b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/_helpers.tpl deleted file mode 100644 index 02071c0f..00000000 --- a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/_helpers.tpl +++ /dev/null @@ -1,32 +0,0 @@ -{{/* vim: set filetype=mustache: */}} -{{/* -Expand the name of the chart. -*/}} -{{- define "horovod.name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} -{{- end -}} - -{{/* -Create a default fully qualified app name. -We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). -If release name contains chart name it will be used as a full name. -*/}} -{{- define "horovod.fullname" -}} -{{- if .Values.fullnameOverride -}} -{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} -{{- else -}} -{{- $name := default .Chart.Name .Values.nameOverride -}} -{{- if contains $name .Release.Name -}} -{{- .Release.Name | trunc 63 | trimSuffix "-" -}} -{{- else -}} -{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} -{{- end -}} -{{- end -}} -{{- end -}} - -{{/* -Create chart name and version as used by the chart label. -*/}} -{{- define "horovod.chart" -}} -{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} -{{- end -}} diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/config.yaml b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/config.yaml deleted file mode 100644 index ae93c445..00000000 --- a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/config.yaml +++ /dev/null @@ -1,130 +0,0 @@ -{{- $workerNum := .Values.worker.number -}} -{{- $name := include "horovod.fullname" . }} -{{- $slots := 1 }} -{{- if index .Values.resources "nvidia.com/gpu" }} -{{- $slots := index .Values.resources "nvidia.com/gpu" }} -{{- end }} -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ template "horovod.fullname" . }} - labels: - heritage: {{ .Release.Service | quote }} - release: {{ .Release.Name | quote }} - chart: {{ template "horovod.chart" . }} - app: {{ template "horovod.fullname" . 
}} -data: - hostfile.config: | - {{ $name }}-master slots={{ $slots }} - {{- range $i, $none := until (int $workerNum) }} - {{ $name }}-{{ $i }}.{{ $name }} slots={{ $slots }} - {{- end }} - ssh.readiness: | - #!/bin/bash - set -xev - ssh localhost ls - master.run: | - #!/bin/bash - set -x - sleep 5 - - mkdir -p /root/.ssh - rm -f /root/.ssh/config - touch /root/.ssh/config - - if [ "$USESECRETS" == "true" ];then - set +e - yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa - yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys - set -e - fi - - if [ -n "$SSHPORT" ]; then - echo "Port $SSHPORT" > /root/.ssh/config - sed -i "s/^Port.*/Port $SSHPORT /g" /etc/ssh/sshd_config - fi - echo "StrictHostKeyChecking no" >> /root/.ssh/config - /usr/sbin/sshd - - if [ $# -eq 0 ]; then - sleep infinity - else - bash -c "$*" - fi - sleep 300 - master.waitWorkerReady: | - #!/bin/bash - set -xev - function updateSSHPort() { - mkdir -p /root/.ssh - rm -f /root/.ssh/config - touch /root/.ssh/config - - if [ -n "$SSHPORT" ]; then - echo "Port $SSHPORT" > /root/.ssh/config - echo "StrictHostKeyChecking no" >> /root/.ssh/config - fi - } - - function runCheckSSH() { - if [[ "$USESECRETS" == "true" ]];then - set +e - yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa - yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys - set -e - fi - - for i in `cat $1 | awk '{print $(1)}'`;do - if [[ "$i" != *"master" ]];then - retry 30 ssh -o ConnectTimeout=2 -q $i exit - fi - done - } - - function retry() - { - local n=0;local try=$1 - local cmd="${@: 2}" - [[ $# -le 1 ]] && { - echo "Usage $0 "; - } - set +e - until [[ $n -ge $try ]] - do - $cmd && break || { - echo "Command Fail.." - ((n++)) - echo "retry $n :: [$cmd]" - sleep 1; - } - done - $cmd - if [ $? -ne 0 ]; then - exit 1 - fi - set -e - } - updateSSHPort - runCheckSSH $1 - worker.run: | - #!/bin/bash - set -x - - mkdir -p /root/.ssh - rm -f /root/.ssh/config - touch /root/.ssh/config - - if [[ "$USESECRETS" == "true" ]];then - set +e - yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa - yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys - set -e - fi - - if [ -n "$SSHPORT" ]; then - echo "Port $SSHPORT" > /root/.ssh/config - sed -i "s/^Port.*/Port $SSHPORT /g" /etc/ssh/sshd_config - fi - echo "StrictHostKeyChecking no" >> /root/.ssh/config - - /usr/sbin/sshd -D diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/job-service.yaml b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/job-service.yaml deleted file mode 100644 index e7b05c26..00000000 --- a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/job-service.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: {{ template "horovod.fullname" . }}-master - labels: - app: {{ template "horovod.name" . }} - chart: {{ template "horovod.chart" . }} - release: {{ .Release.Name }} - heritage: {{ .Release.Service }} -spec: - clusterIP: None - ports: - - name: ssh - port: {{ .Values.ssh.port }} - targetPort: {{ .Values.ssh.port }} - selector: - app: {{ template "horovod.name" . 
}} - release: {{ .Release.Name }} - role: master diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/job.yaml b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/job.yaml deleted file mode 100644 index 4e59b277..00000000 --- a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/job.yaml +++ /dev/null @@ -1,126 +0,0 @@ ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: {{ template "horovod.fullname" . }} - labels: - app: {{ template "horovod.name" . }} - chart: {{ template "horovod.chart" . }} - release: {{ .Release.Name }} - heritage: {{ .Release.Service }} - role: master -spec: - template: - metadata: - labels: - app: {{ template "horovod.name" . }} - release: {{ .Release.Name }} - role: master - spec: - {{- if .Values.useHostNetwork }} - hostNetwork: {{ .Values.useHostNetwork }} - dnsPolicy: ClusterFirstWithHostNet - {{- end }} - {{- if .Values.useHostPID }} - hostPID: {{ .Values.useHostPID }} - {{- end }} - restartPolicy: OnFailure - volumes: - - name: {{ template "horovod.fullname" . }}-cm - configMap: - name: {{ template "horovod.fullname" . }} - items: - - key: hostfile.config - path: hostfile - mode: 438 - - key: master.waitWorkerReady - path: waitWorkersReady.sh - mode: 365 - - key: master.run - path: run.sh - mode: 365 - {{- if .Values.ssh.useSecrets }} - - name: {{ template "horovod.fullname" . }}-secret - secret: - secretName: {{ template "horovod.fullname" . }} - defaultMode: 448 - items: - - key: host-key - path: id_rsa - - key: host-key-pub - path: authorized_keys - {{- end }} -{{- if .Values.volumes }} -{{ toYaml .Values.volumes | indent 6 }} -{{- end }} - containers: - - name: horovod-master - image: "{{ .Values.master.image.repository }}:{{ .Values.master.image.tag }}" - imagePullPolicy: {{ .Values.master.image.pullPolicy }} - env: - - name: SSHPORT - value: "{{ .Values.ssh.port }}" - {{- if .Values.ssh.useSecrets }} - - name: USESECRETS - value: "{{ .Values.ssh.useSecrets }}" - {{- end }} - {{- if .Values.master.env }} - {{- range $key, $value := .Values.master.env }} - - name: "{{ $key }}" - value: "{{ $value }}" - {{- end }} - {{- end }} -{{- if .Values.master.privileged }} - securityContext: - privileged: true -{{- end }} - ports: - - containerPort: {{ .Values.ssh.port }} - volumeMounts: - - name: {{ template "horovod.fullname" . }}-cm - mountPath: /horovod/generated - {{- if .Values.ssh.useSecrets }} - - name: {{ template "horovod.fullname" . }}-secret - readOnly: true - mountPath: "/etc/secret-volume" - {{- end }} -{{- if .Values.volumeMounts }} -{{ toYaml .Values.volumeMounts | indent 8 }} -{{- end }} - command: - - /horovod/generated/run.sh - args: -{{ toYaml .Values.master.args | indent 10 }} - resources: -{{ toYaml .Values.resources | indent 10 }} -{{- if .Values.ssh.useSecrets }} - initContainers: - - name: wait-workers - image: "{{ .Values.master.image.repository }}:{{ .Values.master.image.tag }}" - imagePullPolicy: {{ .Values.master.image.pullPolicy }} - env: - - name: SSHPORT - value: "{{ .Values.ssh.port }}" - {{- if .Values.ssh.useSecrets }} - - name: USESECRETS - value: "{{ .Values.ssh.useSecrets }}" - {{- end }} - {{- if .Values.master.env }} - {{- range $key, $value := .Values.master.env }} - - name: "{{ $key }}" - value: "{{ $value }}" - {{- end }} - {{- end }} - command: - - /horovod/generated/waitWorkersReady.sh - args: - - /horovod/generated/hostfile - volumeMounts: - - name: {{ template "horovod.fullname" . 
}}-cm - mountPath: /horovod/generated - {{- if .Values.ssh.useSecrets }} - - name: {{ template "horovod.fullname" . }}-secret - readOnly: true - mountPath: "/etc/secret-volume" - {{- end }} -{{- end }} diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/secrets.yaml b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/secrets.yaml deleted file mode 100644 index c9853ed0..00000000 --- a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/secrets.yaml +++ /dev/null @@ -1,15 +0,0 @@ -{{- if .Values.ssh.useSecrets }} -apiVersion: v1 -kind: Secret -metadata: - name: {{ template "horovod.fullname" . }} - labels: - app: {{ template "horovod.name" . }} - chart: {{ template "horovod.chart" . }} - release: {{ .Release.Name }} - heritage: {{ .Release.Service }} -type: Opaque -data: - host-key: {{ .Values.ssh.hostKey | b64enc | quote }} - host-key-pub: {{ .Values.ssh.hostKeyPub | b64enc | quote }} -{{- end }} \ No newline at end of file diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/statefulset-service.yaml b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/statefulset-service.yaml deleted file mode 100644 index d0216a86..00000000 --- a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/statefulset-service.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: {{ template "horovod.fullname" . }} - labels: - app: {{ template "horovod.name" . }} - chart: {{ template "horovod.chart" . }} - release: {{ .Release.Name }} - heritage: {{ .Release.Service }} -spec: - clusterIP: None - ports: - - name: ssh - port: {{ .Values.ssh.port }} - targetPort: {{ .Values.ssh.port }} - selector: - app: {{ template "horovod.name" . }} - release: {{ .Release.Name }} - role: worker diff --git a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/statefulset.yaml b/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/statefulset.yaml deleted file mode 100644 index 1d3f7577..00000000 --- a/vnfs/DAaaS/applications/charts/sample-horovod-app/templates/statefulset.yaml +++ /dev/null @@ -1,115 +0,0 @@ -apiVersion: apps/v1beta2 -kind: StatefulSet -metadata: - name: {{ template "horovod.fullname" . }} - labels: - app: {{ template "horovod.name" . }} - chart: {{ template "horovod.chart" . }} - release: {{ .Release.Name }} - heritage: {{ .Release.Service }} - role: worker -spec: - selector: - matchLabels: - app: {{ template "horovod.name" . }} - release: {{ .Release.Name }} - heritage: {{ .Release.Service }} - role: worker - serviceName: {{ template "horovod.fullname" . }} - podManagementPolicy: {{ .Values.worker.podManagementPolicy }} - replicas: {{.Values.worker.number}} - template: - metadata: - labels: - app: {{ template "horovod.name" . }} - chart: {{ template "horovod.chart" . }} - release: {{ .Release.Name }} - heritage: {{ .Release.Service }} - role: worker - spec: - selector: - matchLabels: - app: {{ template "horovod.name" . }} - release: {{ .Release.Name }} - role: worker - {{- if .Values.useHostNetwork }} - hostNetwork: {{ .Values.useHostNetwork }} - dnsPolicy: ClusterFirstWithHostNet - {{- end }} - {{- if .Values.useHostPID }} - hostPID: {{ .Values.useHostPID }} - {{- end }} - volumes: - - name: {{ template "horovod.fullname" . }}-cm - configMap: - name: {{ template "horovod.fullname" . 
}} - items: - - key: hostfile.config - path: hostfile - mode: 438 - - key: ssh.readiness - path: check.sh - mode: 365 - - key: worker.run - path: run.sh - mode: 365 - {{- if .Values.ssh.useSecrets }} - - name: {{ template "horovod.fullname" . }}-secret - secret: - secretName: {{ template "horovod.fullname" . }} - defaultMode: 448 - items: - - key: host-key - path: id_rsa - - key: host-key-pub - path: authorized_keys - {{- end }} -{{- if .Values.volumes }} -{{ toYaml .Values.volumes | indent 6 }} -{{- end }} - containers: - - name: worker - image: "{{ .Values.worker.image.repository }}:{{ .Values.worker.image.tag }}" - imagePullPolicy: {{ .Values.worker.image.pullPolicy }} - env: - - name: SSHPORT - value: "{{ .Values.ssh.port }}" - {{- if .Values.ssh.useSecrets }} - - name: USESECRETS - value: "{{ .Values.ssh.useSecrets }}" - {{- end }} - {{- if .Values.master.env }} - {{- range $key, $value := .Values.master.env }} - - name: "{{ $key }}" - value: "{{ $value }}" - {{- end }} - {{- end }} -{{- if .Values.worker.privileged }} - securityContext: - privileged: true -{{- end }} - ports: - - containerPort: {{ .Values.ssh.port }} - volumeMounts: - - name: {{ template "horovod.fullname" . }}-cm - mountPath: /horovod/generated - {{- if .Values.ssh.useSecrets }} - - name: {{ template "horovod.fullname" . }}-secret - readOnly: true - mountPath: "/etc/secret-volume" - {{- end }} -{{- if .Values.volumeMounts }} -{{ toYaml .Values.volumeMounts | indent 8 }} -{{- end }} - command: - - /horovod/generated/run.sh -{{- if .Values.ssh.useSecrets }} - readinessProbe: - exec: - command: - - /horovod/generated/check.sh - initialDelaySeconds: 1 - periodSeconds: 2 -{{- end }} - resources: -{{ toYaml .Values.resources | indent 10 }} diff --git a/vnfs/DAaaS/applications/charts/sample-spark-app/.helmignore b/vnfs/DAaaS/applications/charts/sample-spark-app/.helmignore deleted file mode 100644 index 50af0317..00000000 --- a/vnfs/DAaaS/applications/charts/sample-spark-app/.helmignore +++ /dev/null @@ -1,22 +0,0 @@ -# Patterns to ignore when building packages. -# This supports shell glob matching, relative path matching, and -# negation (prefixed with !). Only one pattern per line. -.DS_Store -# Common VCS dirs -.git/ -.gitignore -.bzr/ -.bzrignore -.hg/ -.hgignore -.svn/ -# Common backup files -*.swp -*.bak -*.tmp -*~ -# Various IDEs -.project -.idea/ -*.tmproj -.vscode/ diff --git a/vnfs/DAaaS/applications/charts/sample-spark-app/Chart.yaml b/vnfs/DAaaS/applications/charts/sample-spark-app/Chart.yaml deleted file mode 100644 index 42ed0400..00000000 --- a/vnfs/DAaaS/applications/charts/sample-spark-app/Chart.yaml +++ /dev/null @@ -1,5 +0,0 @@ -apiVersion: v1 -appVersion: "1.0" -description: A sample spark application which finds the top users from the apache logs which is stored in the remote hdfs-k8s cluster -name: sample-spark-app-apache-log-analysis -version: 0.1.0 diff --git a/vnfs/DAaaS/applications/charts/sample-spark-app/Dockerfile b/vnfs/DAaaS/applications/charts/sample-spark-app/Dockerfile deleted file mode 100644 index cd42d4c7..00000000 --- a/vnfs/DAaaS/applications/charts/sample-spark-app/Dockerfile +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2019 Intel Corporation -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. 
-# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Ported kubernetes spark image to Ubuntu - -FROM ubuntu:18.04 - -# Install jdk -RUN apt update -yqq -RUN apt install -y locales openjdk-8-jdk && rm -rf /var/lib/apt/lists/* \ - && localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 - -# Install all the essentials -RUN apt-get update --fix-missing && \ - apt-get install -y numactl wget curl bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 \ - git mercurial subversion build-essential openssh-server openssh-client net-tools && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -ENV LANG en_US.utf8 -ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64 -ENV PATH $JAVA_HOME/bin:$PATH -ENV PATH /opt/conda/bin:$PATH -ENV OPENMPI_VERSION 3.1 - -# Install openMPI -RUN mkdir /tmp/openmpi && \ - cd /tmp/openmpi && \ - wget --quiet https://www.open-mpi.org/software/ompi/v${OPENMPI_VERSION}/downloads/openmpi-${OPENMPI_VERSION}.2.tar.gz -O openmpi.tar.gz && \ - tar zxf openmpi.tar.gz && \ - cd openmpi-3.1.2 && \ - ./configure --enable-orterun-prefix-by-default && \ - make -j $(nproc) all && \ - make install && \ - ldconfig && \ - rm -rf /tmp/openmpi - -# Install miniconda -RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \ - /bin/bash ~/miniconda.sh -b -p /opt/conda && \ - rm ~/miniconda.sh && \ - ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ - echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ - echo "conda activate base" >> ~/.bashrc - -# Install tf & keras using conda in the virtual_environment:tf_env -SHELL ["/bin/bash", "-c"] -RUN conda update -n base -c defaults conda && \ - conda create -n tf_env -RUN conda install -n tf_env -y -c anaconda \ - pip tensorflow keras nltk - -RUN echo "conda activate tf_env" >> ~/.bashrc && \ - conda install -n tf_env -y -c conda-forge clangdev - -RUN source ~/.bashrc -RUN HOROVOD_WITH_TENSORFLOW=1 /opt/conda/envs/tf_env/bin/pip install --no-cache-dir horovod - -# openMPI sane defaults: -RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ - echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf && \ - echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf - -# Allow OpenSSH to talk to containers without asking for confirmation -RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \ - echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ - mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config - -# Install tini -RUN apt-get install -y curl grep sed dpkg && \ - TINI_VERSION=`curl https://github.com/krallin/tini/releases/latest | grep -o "/v.*\"" | sed 's:^..\(.*\).$:\1:'` && echo ${TINI_VERSION} && \ - curl -L "https://github.com/krallin/tini/releases/download/v${TINI_VERSION}/tini_${TINI_VERSION}.deb" > tini.deb && \ - dpkg -i tini.deb && \ - rm tini.deb && \ - apt clean - -# This is needed to match the original entrypoint.sh file. -RUN cp /usr/bin/tini /sbin - -# Begin: Installing spark -ARG spark_jars=jars -ARG img_path=kubernetes/dockerfiles -ARG k8s_tests=kubernetes/tests - -# Before building the docker image, first build and make a Spark distribution following -# the instructions in http://spark.apache.org/docs/latest/building-spark.html. -# If this docker file is being used in the context of building your images from a Spark -# distribution, the docker build command should be invoked from the top level directory -# of the Spark distribution. E.g.: -# docker build -t spark:latest -f kubernetes/dockerfiles/spark/ubuntu18.04/Dockerfile . 
- -RUN mkdir -p /opt/spark && \ - mkdir -p /opt/spark/work-dir && \ - touch /opt/spark/RELEASE && \ - rm /bin/sh && \ - ln -sv /bin/bash /bin/sh && \ - echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \ - chgrp root /etc/passwd && chmod ug+rw /etc/passwd - - -COPY ${spark_jars} /opt/spark/jars -COPY bin /opt/spark/bin -COPY sbin /opt/spark/sbin -COPY ${img_path}/spark/entrypoint.sh /opt/ -COPY examples /opt/spark/examples -COPY ${k8s_tests} /opt/spark/tests -COPY data /opt/spark/data -ENV SPARK_HOME /opt/spark - -RUN mkdir /opt/spark/python -COPY python/pyspark /opt/spark/python/pyspark -COPY python/lib /opt/spark/python/lib -ENV PYTHONPATH /opt/spark/python/lib/pyspark.zip:/opt/spark/python/lib/py4j-*.zip - -WORKDIR /opt/spark/work-dir - -ENTRYPOINT [ "/opt/entrypoint.sh" ] - -# End: Installing spark diff --git a/vnfs/DAaaS/applications/charts/sample-spark-app/templates/SampleSparkApp.yaml b/vnfs/DAaaS/applications/charts/sample-spark-app/templates/SampleSparkApp.yaml deleted file mode 100644 index f728f82e..00000000 --- a/vnfs/DAaaS/applications/charts/sample-spark-app/templates/SampleSparkApp.yaml +++ /dev/null @@ -1,43 +0,0 @@ -apiVersion: "sparkoperator.k8s.io/v1beta1" -kind: SparkApplication -metadata: - name: {{ .Values.nameOfTheSparkApp }} - namespace: {{ .Release.Namespace }} -spec: - type: {{ .Values.programmingLanguageType }} - mode: {{ .Values.modeOfSparkApp | default "cluster" }} - image: {{ quote .Values.image }} - imagePullPolicy: {{ .Values.imagePullPolicy | default "IfNotPresent" }} - mainClass: {{ .Values.mainClassOfTheSparkApp }} - mainApplicationFile: {{ .Values.mainApplicationFileOfTheSparkApp }} - arguments: - {{- range .Values.argumentsOfTheSparkProgram }} - - {{ . }} - {{ end }} - hadoopConfigMap: {{ .Values.hadoopConfigMap }} - restartPolicy: - type: {{ .Values.restartPolicy | default "Never" }} - volumes: - - name: {{ quote .Values.volumesName | default "test-volume" }} - hostpath: - path: {{ quote .Values.hostpath | default "/tmp" }} - type: {{ .Values.hostpathType | default "Directory" }} - driver: - cores: {{ .Values.driverCores | default 0.1 }} - coreLimit: {{ quote .Values.driverCoreLimit | default "200m" }} - memory: {{ quote .Values.driverMemory | default "1024m" }} - labels: - version: 2.4.0 - serviceAccount: spark - volumeMounts: - - name: {{ quote .Values.driverVolumeMountsName | default "test-volume" }} - mountPath: {{ quote .Values.driverVolumeMountPath | default "/tmp" }} - executor: - cores: {{ .Values.executorCores | default 1 }} - instances: {{ .Values.executorInstances | default 1 }} - memory: {{ quote .Values.executorMemory | default "512m" }} - labels: - version: 2.4.0 - volumeMounts: - - name: {{ quote .Values.executorVolumeMountsName | default "test-volume" }} - mountPath: {{ quote .Values.executorVolumeMountPath | default "/tmp" }} diff --git a/vnfs/DAaaS/applications/charts/sample-spark-app/templates/_helpers.tpl b/vnfs/DAaaS/applications/charts/sample-spark-app/templates/_helpers.tpl deleted file mode 100644 index 6f51811d..00000000 --- a/vnfs/DAaaS/applications/charts/sample-spark-app/templates/_helpers.tpl +++ /dev/null @@ -1,32 +0,0 @@ -{{/* vim: set filetype=mustache: */}} -{{/* -Expand the name of the chart. -*/}} -{{- define "sample-spark-app.name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} -{{- end -}} - -{{/* -Create a default fully qualified app name. -We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 
-If release name contains chart name it will be used as a full name. -*/}} -{{- define "sample-spark-app.fullname" -}} -{{- if .Values.fullnameOverride -}} -{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} -{{- else -}} -{{- $name := default .Chart.Name .Values.nameOverride -}} -{{- if contains $name .Release.Name -}} -{{- .Release.Name | trunc 63 | trimSuffix "-" -}} -{{- else -}} -{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} -{{- end -}} -{{- end -}} -{{- end -}} - -{{/* -Create chart name and version as used by the chart label. -*/}} -{{- define "sample-spark-app.chart" -}} -{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} -{{- end -}} diff --git a/vnfs/DAaaS/applications/charts/sample-spark-app/values.yaml b/vnfs/DAaaS/applications/charts/sample-spark-app/values.yaml deleted file mode 100644 index afb48d67..00000000 --- a/vnfs/DAaaS/applications/charts/sample-spark-app/values.yaml +++ /dev/null @@ -1,57 +0,0 @@ -# Default values for sample-spark-app. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. - - -#===========================KUBERNETES POD RELATED CONFIGs======================== -image: spark-tf-keras-horo:latest -imagePullPolicy: Never -restartPolicy: Never -volumesName: test-volume -hostpath: /tmp -hostpathType: Directory - - - -#============================SPARK APP RELATED CONFIGs============================= - -nameOfTheSparkApp: spark-apache-logs2 -# Python or Scala supported. -programmingLanguageType: Scala -modeOfSparkApp: cluster -mainClassOfTheSparkApp: ApacheLogAnalysis -# can be http path, s3 path, minio path -mainApplicationFileOfTheSparkApp: https://github.com/mohanraj1311/ApacheLogAnalysisJar/raw/master/analysisofapachelogs_2.11-0.1.jar -argumentsOfTheSparkProgram: - - hdfs://hdfs-1-namenode-1.hdfs-1-namenode.hdfs1.svc.cluster.local:8020/data/apache-logs - - - -#============================SPARK DRIVER RELATED CONFIGs========================= -driverCores: 0.1 -driverCoreLimit: 200m -driverMemory: 1024m -driverVolumeMountsName: test-volume -driverVolumeMountPath: /tmp - - - -#============================SPARK EXECUTOR RELATED CONFIGs======================= -executorCores: 1 -executorInstances: 1 -executorMemory: 512m -executorVolumeMountsName: test-volume -executorVolumeMountPath: /tmp - - - -#===========================HADOOP RELATED CONFIGs=============================== -# config map of the hdfs -hadoopConfigMap: hdfs-1-config - - -################################################################################### - - - - diff --git a/vnfs/DAaaS/applications/sample-horovod-app/Chart.yaml b/vnfs/DAaaS/applications/sample-horovod-app/Chart.yaml new file mode 100644 index 00000000..3ce06e28 --- /dev/null +++ b/vnfs/DAaaS/applications/sample-horovod-app/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +appVersion: "1.0" +description: "A sample horovod application which runs the MNIST application using Tensorflow as backend" +name: sample-horovod-app-keras-mnist-advanced +version: 0.1.0 diff --git a/vnfs/DAaaS/applications/sample-horovod-app/Dockerfile b/vnfs/DAaaS/applications/sample-horovod-app/Dockerfile new file mode 100644 index 00000000..8bdcf5b6 --- /dev/null +++ b/vnfs/DAaaS/applications/sample-horovod-app/Dockerfile @@ -0,0 +1,142 @@ +# Copyright (c) 2019 Intel Corporation +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Ported kubernetes spark image to Ubuntu + +FROM ubuntu:18.04 + +# Install jdk +RUN apt update -yqq +RUN apt install -y locales openjdk-8-jdk && rm -rf /var/lib/apt/lists/* \ + && localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 + +# Install all the essentials +RUN apt-get update --fix-missing && \ + apt-get install -y numactl wget curl bzip2 nmap vim ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 \ + git mercurial subversion build-essential openssh-server openssh-client net-tools && \ + mkdir -p /var/run/sshd && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +ENV LANG en_US.utf8 +ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64 +ENV PATH $JAVA_HOME/bin:$PATH +ENV PATH /opt/conda/bin:/opt/spark/bin:$PATH +ENV OPENMPI_VERSION 3.1 + +# Install openMPI +RUN mkdir /tmp/openmpi && \ + cd /tmp/openmpi && \ + wget --quiet https://www.open-mpi.org/software/ompi/v${OPENMPI_VERSION}/downloads/openmpi-${OPENMPI_VERSION}.2.tar.gz -O openmpi.tar.gz && \ + tar zxf openmpi.tar.gz && \ + cd openmpi-3.1.2 && \ + ./configure --enable-orterun-prefix-by-default && \ + make -j $(nproc) all && \ + make install && \ + ldconfig && \ + rm -rf /tmp/openmpi + +# Install miniconda +RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \ + /bin/bash ~/miniconda.sh -b -p /opt/conda && \ + rm ~/miniconda.sh && \ + ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ + echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ + echo "conda activate base" >> ~/.bashrc + +# Install tf & keras using conda in the virtual_environment:tf_env +SHELL ["/bin/bash", "-c"] +RUN conda update -n base -c defaults conda && \ + conda create -n tf_env +RUN conda install -n tf_env -y -c anaconda \ + pip tensorflow keras nltk pyarrow +RUN conda install -n tf_env -y -c anaconda h5py + +RUN conda install -n tf_env -y -c pytorch pytorch-cpu +RUN conda install -n tf_env -y -c conda-forge matplotlib + +RUN echo "conda activate tf_env" >> ~/.bashrc && \ + conda install -n tf_env -y -c conda-forge clangdev + +RUN source ~/.bashrc +RUN HOROVOD_WITH_TENSORFLOW=1 /opt/conda/envs/tf_env/bin/pip install --no-cache-dir horovod + +# openMPI sane defaults: +RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ + echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf && \ + echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf + +# Allow OpenSSH to talk to containers without asking for confirmation +RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \ + echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ + mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config + +# Install tini +RUN apt-get install -y curl grep sed dpkg && \ + TINI_VERSION=`curl https://github.com/krallin/tini/releases/latest | grep -o "/v.*\"" | sed 's:^..\(.*\).$:\1:'` && echo ${TINI_VERSION} && \ + curl -L "https://github.com/krallin/tini/releases/download/v${TINI_VERSION}/tini_${TINI_VERSION}.deb" > tini.deb && \ + dpkg -i tini.deb && \ + rm tini.deb && \ + apt clean + +# This is needed to match the original entrypoint.sh file. +RUN cp /usr/bin/tini /sbin + +# Begin: Installing spark +ARG spark_jars=jars +ARG img_path=kubernetes/dockerfiles +ARG k8s_tests=kubernetes/tests + +# Before building the docker image, first build and make a Spark distribution following +# the instructions in http://spark.apache.org/docs/latest/building-spark.html. +# If this docker file is being used in the context of building your images from a Spark +# distribution, the docker build command should be invoked from the top level directory +# of the Spark distribution. E.g.: +# docker build -t spark:latest -f kubernetes/dockerfiles/spark/ubuntu18.04/Dockerfile . 
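+ +# The RUN below lays out the Spark install: it creates /opt/spark and its +# work-dir, adds an empty RELEASE marker file, replaces /bin/sh with bash, +# restricts su to wheel-group members via pam_wheel, and makes /etc/passwd +# group-writable so the entrypoint can append a passwd entry when the +# container runs under an arbitrary non-root UID.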
+
+RUN mkdir -p /opt/spark && \
+    mkdir -p /opt/spark/work-dir && \
+    touch /opt/spark/RELEASE && \
+    rm /bin/sh && \
+    ln -sv /bin/bash /bin/sh && \
+    echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
+    chgrp root /etc/passwd && chmod ug+rw /etc/passwd
+
+
+COPY ${spark_jars} /opt/spark/jars
+COPY bin /opt/spark/bin
+COPY sbin /opt/spark/sbin
+COPY ${img_path}/spark/entrypoint.sh /opt/
+COPY examples /opt/spark/examples
+COPY ${k8s_tests} /opt/spark/tests
+COPY data /opt/spark/data
+ENV SPARK_HOME /opt/spark
+
+RUN mkdir /opt/spark/python
+COPY python/pyspark /opt/spark/python/pyspark
+COPY python/lib /opt/spark/python/lib
+ENV PYTHONPATH /opt/spark/python/lib/pyspark.zip:/opt/spark/python/lib/py4j-*.zip
+ENV PATH /opt/conda/envs/tf_env/bin:$PATH
+
+RUN echo "export PATH=/opt/conda/envs/tf_env/bin:$PATH" >> ~/.bashrc
+# echo "activate tf_env\n" >> ~/.bashrc
+RUN pip install petastorm
+WORKDIR /opt/spark/work-dir
+
+ENTRYPOINT [ "/opt/entrypoint.sh" ]
+
+# End: Installing spark
diff --git a/vnfs/DAaaS/applications/sample-horovod-app/README.md b/vnfs/DAaaS/applications/sample-horovod-app/README.md
new file mode 100644
index 00000000..08e7691f
--- /dev/null
+++ b/vnfs/DAaaS/applications/sample-horovod-app/README.md
@@ -0,0 +1,162 @@
+# Horovod
+
+[Horovod](https://eng.uber.com/horovod/) is a distributed training framework for TensorFlow developed by Uber. The goal of Horovod is to make distributed deep learning fast and easy to use. Uber also provides [Horovod in Docker](https://github.com/uber/horovod/blob/master/docs/docker.md) to streamline the installation process.
+
+## Introduction
+
+This chart bootstraps Horovod, a distributed TensorFlow training framework, on a Kubernetes cluster using the Helm package manager. It deploys the Horovod workers as a StatefulSet and the Horovod master as a Job, and then discovers the host list automatically.
+
+## Prerequisites
+
+- Kubernetes cluster v1.8+
+
+## Build Docker Image
+
+You can use the Dockerfile provided along with this package. Its benefit is that it bundles many of the additional packages that data science engineers usually require, such as Spark, TensorFlow, PyTorch, Matplotlib, NLTK, Keras, h5py, and PyArrow.
+
+Before building the docker image, first build and make a Spark distribution following the instructions in http://spark.apache.org/docs/latest/building-spark.html.
+If this Dockerfile is being used in the context of building your images from a Spark distribution, the docker build command should be invoked from the top-level directory of the Spark distribution. E.g.:
+
+```
+docker build -t spark:latest -f kubernetes/dockerfiles/spark/ubuntu18.04/Dockerfile .
+```
+
+Once you have built the Spark image, go inside the Spark package and place the file "keras_mnist_advanced_modified.py" in the directory examples/src/main/python/tensorflow/, creating the 'tensorflow' directory if it doesn't exist (a shell sketch of these steps is included in the code block below).
+We do this because keras_mnist_advanced_modified.py is optimized to run on CPUs, and we want the file to be automatically present in the final docker image that we build.
+
+```
+docker build -t spark-tf-keras-horovod-pytorch:latest -f kubernetes/dockerfiles/spark/ubuntu18.04/Dockerfile .
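+# A sketch of the file placement described above; run these before the build command
+# at the top of this block (the source path of keras_mnist_advanced_modified.py is an
+# assumption -- adjust it to wherever you keep the file):
+#   mkdir -p examples/src/main/python/tensorflow
+#   cp /path/to/keras_mnist_advanced_modified.py examples/src/main/python/tensorflow/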
+```
+
+## Prepare SSH keys
+
+```
+# Setup ssh key
+export SSH_KEY_DIR=`mktemp -d`
+cd $SSH_KEY_DIR
+yes | ssh-keygen -N "" -f id_rsa
+```
+
+## Create the values.yaml
+
+To run Horovod with GPU, you can create a `values.yaml` like the one below:
+
+```
+# cat << EOF > ~/values.yaml
+---
+ssh:
+  useSecrets: true
+  hostKey: |-
+$(cat $SSH_KEY_DIR/id_rsa | sed 's/^/    /g')
+
+  hostKeyPub: |-
+$(cat $SSH_KEY_DIR/id_rsa.pub | sed 's/^/    /g')
+
+worker:
+  number: 2
+  image:
+    repository: uber/horovod
+    tag: 0.12.1-tf1.8.0-py3.5
+master:
+  image:
+    repository: uber/horovod
+    tag: 0.12.1-tf1.8.0-py3.5
+  args:
+    - "mpirun -np 3 --hostfile /horovod/generated/hostfile --mca orte_keep_fqdn_hostnames t --allow-run-as-root --display-map --tag-output --timestamp-output sh -c '/opt/conda/envs/tf_env/bin/python /opt/spark/examples/src/main/python/tensorflow/keras_mnist_advanced_modified.py'"
+EOF
+```
+
+In most cases, the overlay network greatly impacts Horovod performance, so the `Host Network` solution should be applied. To run Horovod with Host Network and GPU, you can create a `values.yaml` like the one below:
+
+
+```
+# cat << EOF > ~/values.yaml
+---
+useHostNetwork: true
+
+ssh:
+  useSecrets: true
+  port: 32222
+  hostKey: |-
+$(cat $SSH_KEY_DIR/id_rsa | sed 's/^/    /g')
+
+  hostKeyPub: |-
+$(cat $SSH_KEY_DIR/id_rsa.pub | sed 's/^/    /g')
+
+
+worker:
+  number: 2
+  image:
+    repository: uber/horovod
+    tag: 0.12.1-tf1.8.0-py3.5
+master:
+  image:
+    repository: uber/horovod
+    tag: 0.12.1-tf1.8.0-py3.5
+  args:
+    - "mpirun -np 3 --hostfile /horovod/generated/hostfile --mca orte_keep_fqdn_hostnames t --allow-run-as-root --display-map --tag-output --timestamp-output sh -c '/opt/conda/envs/tf_env/bin/python /opt/spark/examples/src/main/python/tensorflow/keras_mnist_advanced_modified.py'"
+EOF
+```
+
+```
+NOTE: A sample values.yaml is provided for reference. After making the changes above, you should end up with a values.yaml similar to it.
+```
+
+> Notice: the difference is that you should set `useHostNetwork` to true and use an SSH port other than `22`.
+
+## Installing the Chart
+
+To install the chart with the release name `mnist`:
+
+```bash
+$ helm install --values ~/values.yaml --name mnist stable/horovod
+```
+
+## Uninstalling the Chart
+
+To uninstall/delete the `mnist` deployment:
+
+```bash
+$ helm delete mnist
+```
+
+The command removes all the Kubernetes components associated with the chart and
+deletes the release.
+
+## Upgrading an existing Release to a new major version
+A major chart version change (like v1.2.3 -> v2.0.0) indicates that there is an
+incompatible breaking change that requires manual action.
+
+### 1.0.0
+This version removes the `chart` label from `spec.selector.matchLabels`,
+which is immutable since `StatefulSet apps/v1beta2`. It had been inadvertently
+added, causing any subsequent upgrade to fail. See https://github.com/helm/charts/issues/7726.
+
+To upgrade, delete the Horovod StatefulSet before upgrading. Assuming your release is named `my-release`:
+
+```bash
+$ kubectl delete statefulsets.apps --cascade=false my-release
+```
+
+## Configuration
+
+The following table lists the configurable parameters of the Horovod
+chart and their default values.
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `useHostNetwork` | Whether to use the host network | `false` |
+| `ssh.port` | The SSH port | `22` |
+| `ssh.useSecrets` | Whether to use secrets for SSH | `false` |
+| `worker.number` | Number of workers | `5` |
+| `worker.image.repository` | Horovod worker image | `uber/horovod` |
+| `worker.image.pullPolicy` | `pullPolicy` for the worker | `IfNotPresent` |
+| `worker.image.tag` | `tag` for the worker | `0.12.1-tf1.8.0-py3.5` |
+| `resources` | Pod resource requests & limits | `{}` |
+| `worker.env` | Worker environment variables | `{}` |
+| `master.image.repository` | Horovod master image | `uber/horovod` |
+| `master.image.tag` | `tag` for the master | `0.12.1-tf1.8.0-py3.5` |
+| `master.image.pullPolicy` | Image `pullPolicy` for the master image | `IfNotPresent` |
+| `master.args` | Master args | `{}` |
+| `master.env` | Master environment variables | `{}` |
diff --git a/vnfs/DAaaS/applications/sample-horovod-app/keras_mnist_advanced_modified.py b/vnfs/DAaaS/applications/sample-horovod-app/keras_mnist_advanced_modified.py
new file mode 100644
index 00000000..03425ff7
--- /dev/null
+++ b/vnfs/DAaaS/applications/sample-horovod-app/keras_mnist_advanced_modified.py
@@ -0,0 +1,127 @@
+from __future__ import print_function
+import keras
+from keras.datasets import mnist
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Flatten
+from keras.layers import Conv2D, MaxPooling2D
+from keras.preprocessing.image import ImageDataGenerator
+from keras import backend as K
+import tensorflow as tf
+import horovod.keras as hvd
+
+# Horovod: initialize Horovod.
+hvd.init()
+
+# Horovod: pin GPU to be used to process local rank (one GPU per process).
+# The GPU pinning lines are commented out because this script is tuned for CPU-only runs.
+config = tf.ConfigProto()
+#config.gpu_options.allow_growth = True
+#config.gpu_options.visible_device_list = str(hvd.local_rank())
+K.set_session(tf.Session(config=config))
+
+batch_size = 128
+num_classes = 10
+
+# Enough epochs to demonstrate learning rate warmup and the reduction of
+# learning rate when training plateaus.
+epochs = 24
+
+# Input image dimensions
+img_rows, img_cols = 28, 28
+
+# The data, shuffled and split between train and test sets
+(x_train, y_train), (x_test, y_test) = mnist.load_data()
+
+# Determine how many batches there are in the train and test sets
+train_batches = len(x_train) // batch_size
+test_batches = len(x_test) // batch_size
+
+if K.image_data_format() == 'channels_first':
+    x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
+    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
+    input_shape = (1, img_rows, img_cols)
+else:
+    x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
+    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
+    input_shape = (img_rows, img_cols, 1)
+
+x_train = x_train.astype('float32')
+x_test = x_test.astype('float32')
+x_train /= 255
+x_test /= 255
+print('x_train shape:', x_train.shape)
+print(x_train.shape[0], 'train samples')
+print(x_test.shape[0], 'test samples')
+
+# Convert class vectors to binary class matrices
+y_train = keras.utils.to_categorical(y_train, num_classes)
+y_test = keras.utils.to_categorical(y_test, num_classes)
+
+model = Sequential()
+model.add(Conv2D(32, kernel_size=(3, 3),
+                 activation='relu',
+                 input_shape=input_shape))
+model.add(Conv2D(64, (3, 3), activation='relu'))
+model.add(MaxPooling2D(pool_size=(2, 2)))
+model.add(Dropout(0.25))
+model.add(Flatten())
+model.add(Dense(128, activation='relu'))
+model.add(Dropout(0.5))
+model.add(Dense(num_classes, activation='softmax'))
+
+# Horovod: adjust learning rate based on number of GPUs.
+opt = keras.optimizers.Adadelta(lr=1.0 * hvd.size())
+
+# Horovod: add Horovod Distributed Optimizer.
+opt = hvd.DistributedOptimizer(opt)
+
+model.compile(loss=keras.losses.categorical_crossentropy,
+              optimizer=opt,
+              metrics=['accuracy'])
+
+callbacks = [
+    # Horovod: broadcast initial variable states from rank 0 to all other processes.
+    # This is necessary to ensure consistent initialization of all workers when
+    # training is started with random weights or restored from a checkpoint.
+    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
+
+    # Horovod: average metrics among workers at the end of every epoch.
+    #
+    # Note: This callback must be in the list before the ReduceLROnPlateau,
+    # TensorBoard or other metrics-based callbacks.
+    hvd.callbacks.MetricAverageCallback(),
+
+    # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
+    # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
+    # the first five epochs. See https://arxiv.org/abs/1706.02677 for details.
+    hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=5, verbose=1),
+
+    # Reduce the learning rate if training plateaus.
+    keras.callbacks.ReduceLROnPlateau(patience=10, verbose=1),
+]
+
+# Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
+if hvd.rank() == 0:
+    callbacks.append(keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))
+
+# Set up ImageDataGenerators to do data augmentation for the training images.
+train_gen = ImageDataGenerator(rotation_range=8, width_shift_range=0.08, shear_range=0.3,
+                               height_shift_range=0.08, zoom_range=0.08)
+test_gen = ImageDataGenerator()
+
+# Train the model.
+# Horovod: the training will randomly sample 1 / N batches of training data and
+# 3 / N batches of validation data on every worker, where N is the number of workers.
+# Over-sampling of validation data helps to increase probability that every validation +# example will be evaluated. +model.fit_generator(train_gen.flow(x_train, y_train, batch_size=batch_size), + steps_per_epoch=train_batches // hvd.size(), + callbacks=callbacks, + epochs=epochs, + verbose=1, + validation_data=test_gen.flow(x_test, y_test, batch_size=batch_size), + validation_steps=3 * test_batches // hvd.size()) + +# Evaluate the model on the full data set. +score = model.evaluate(x_test, y_test, verbose=0) +print('Test loss:', score[0]) +print('Test accuracy:', score[1]) diff --git a/vnfs/DAaaS/applications/sample-horovod-app/sample_values.yaml b/vnfs/DAaaS/applications/sample-horovod-app/sample_values.yaml new file mode 100644 index 00000000..6ac31359 --- /dev/null +++ b/vnfs/DAaaS/applications/sample-horovod-app/sample_values.yaml @@ -0,0 +1,44 @@ +--- +#useHostNetwork: true + +ssh: + useSecrets: true + hostKey: |- + -----BEGIN RSA PRIVATE KEY----- + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + ThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKeyThisIsPrivateKey + -----END RSA PRIVATE KEY----- + + hostKeyPub: |- + ssh-rsa ThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKey + ThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKey + ThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKey + ThisIsPublicKeyThisIsPublicKeyThisIsPublicKeyThisIsPublicKey user@openSource + +resources: {} + +worker: + number: 2 + image: + repository: spark-tf-keras-horovod-pytorch + tag: latest + pullPolicy: Never +master: + image: + repository: spark-tf-keras-horovod-pytorch + tag: latest + pullPolicy: Never + args: + - "mpirun -np 3 --hostfile /horovod/generated/hostfile --mca orte_keep_fqdn_hostnames t --allow-run-as-root --display-map --tag-output --timestamp-output sh -c '/opt/conda/envs/tf_env/bin/python /opt/spark/examples/src/main/python/tensorflow/keras_mnist_advanced_modified.py'" diff --git a/vnfs/DAaaS/applications/sample-horovod-app/templates/NOTES.txt b/vnfs/DAaaS/applications/sample-horovod-app/templates/NOTES.txt new file mode 100644 index 00000000..774555ae --- /dev/null +++ b/vnfs/DAaaS/applications/sample-horovod-app/templates/NOTES.txt @@ -0,0 +1,5 @@ +1. Get the application URL by running these commands: + +*** NOTE: It may take a few minutes for the statefulset to be available + +*** you can watch the status of statefulset by running 'kubectl get sts --namespace {{ .Release.Namespace }} -w {{ template "horovod.fullname" . 
}}' *** \ No newline at end of file diff --git a/vnfs/DAaaS/applications/sample-horovod-app/templates/_helpers.tpl b/vnfs/DAaaS/applications/sample-horovod-app/templates/_helpers.tpl new file mode 100644 index 00000000..02071c0f --- /dev/null +++ b/vnfs/DAaaS/applications/sample-horovod-app/templates/_helpers.tpl @@ -0,0 +1,32 @@ +{{/* vim: set filetype=mustache: */}} +{{/* +Expand the name of the chart. +*/}} +{{- define "horovod.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "horovod.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "horovod.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} diff --git a/vnfs/DAaaS/applications/sample-horovod-app/templates/config.yaml b/vnfs/DAaaS/applications/sample-horovod-app/templates/config.yaml new file mode 100644 index 00000000..ae93c445 --- /dev/null +++ b/vnfs/DAaaS/applications/sample-horovod-app/templates/config.yaml @@ -0,0 +1,130 @@ +{{- $workerNum := .Values.worker.number -}} +{{- $name := include "horovod.fullname" . }} +{{- $slots := 1 }} +{{- if index .Values.resources "nvidia.com/gpu" }} +{{- $slots := index .Values.resources "nvidia.com/gpu" }} +{{- end }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ template "horovod.fullname" . }} + labels: + heritage: {{ .Release.Service | quote }} + release: {{ .Release.Name | quote }} + chart: {{ template "horovod.chart" . }} + app: {{ template "horovod.fullname" . 
}} +data: + hostfile.config: | + {{ $name }}-master slots={{ $slots }} + {{- range $i, $none := until (int $workerNum) }} + {{ $name }}-{{ $i }}.{{ $name }} slots={{ $slots }} + {{- end }} + ssh.readiness: | + #!/bin/bash + set -xev + ssh localhost ls + master.run: | + #!/bin/bash + set -x + sleep 5 + + mkdir -p /root/.ssh + rm -f /root/.ssh/config + touch /root/.ssh/config + + if [ "$USESECRETS" == "true" ];then + set +e + yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa + yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys + set -e + fi + + if [ -n "$SSHPORT" ]; then + echo "Port $SSHPORT" > /root/.ssh/config + sed -i "s/^Port.*/Port $SSHPORT /g" /etc/ssh/sshd_config + fi + echo "StrictHostKeyChecking no" >> /root/.ssh/config + /usr/sbin/sshd + + if [ $# -eq 0 ]; then + sleep infinity + else + bash -c "$*" + fi + sleep 300 + master.waitWorkerReady: | + #!/bin/bash + set -xev + function updateSSHPort() { + mkdir -p /root/.ssh + rm -f /root/.ssh/config + touch /root/.ssh/config + + if [ -n "$SSHPORT" ]; then + echo "Port $SSHPORT" > /root/.ssh/config + echo "StrictHostKeyChecking no" >> /root/.ssh/config + fi + } + + function runCheckSSH() { + if [[ "$USESECRETS" == "true" ]];then + set +e + yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa + yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys + set -e + fi + + for i in `cat $1 | awk '{print $(1)}'`;do + if [[ "$i" != *"master" ]];then + retry 30 ssh -o ConnectTimeout=2 -q $i exit + fi + done + } + + function retry() + { + local n=0;local try=$1 + local cmd="${@: 2}" + [[ $# -le 1 ]] && { + echo "Usage $0 "; + } + set +e + until [[ $n -ge $try ]] + do + $cmd && break || { + echo "Command Fail.." + ((n++)) + echo "retry $n :: [$cmd]" + sleep 1; + } + done + $cmd + if [ $? -ne 0 ]; then + exit 1 + fi + set -e + } + updateSSHPort + runCheckSSH $1 + worker.run: | + #!/bin/bash + set -x + + mkdir -p /root/.ssh + rm -f /root/.ssh/config + touch /root/.ssh/config + + if [[ "$USESECRETS" == "true" ]];then + set +e + yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa + yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys + set -e + fi + + if [ -n "$SSHPORT" ]; then + echo "Port $SSHPORT" > /root/.ssh/config + sed -i "s/^Port.*/Port $SSHPORT /g" /etc/ssh/sshd_config + fi + echo "StrictHostKeyChecking no" >> /root/.ssh/config + + /usr/sbin/sshd -D diff --git a/vnfs/DAaaS/applications/sample-horovod-app/templates/job-service.yaml b/vnfs/DAaaS/applications/sample-horovod-app/templates/job-service.yaml new file mode 100644 index 00000000..e7b05c26 --- /dev/null +++ b/vnfs/DAaaS/applications/sample-horovod-app/templates/job-service.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ template "horovod.fullname" . }}-master + labels: + app: {{ template "horovod.name" . }} + chart: {{ template "horovod.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} +spec: + clusterIP: None + ports: + - name: ssh + port: {{ .Values.ssh.port }} + targetPort: {{ .Values.ssh.port }} + selector: + app: {{ template "horovod.name" . }} + release: {{ .Release.Name }} + role: master diff --git a/vnfs/DAaaS/applications/sample-horovod-app/templates/job.yaml b/vnfs/DAaaS/applications/sample-horovod-app/templates/job.yaml new file mode 100644 index 00000000..4e59b277 --- /dev/null +++ b/vnfs/DAaaS/applications/sample-horovod-app/templates/job.yaml @@ -0,0 +1,126 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ template "horovod.fullname" . 
}} + labels: + app: {{ template "horovod.name" . }} + chart: {{ template "horovod.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} + role: master +spec: + template: + metadata: + labels: + app: {{ template "horovod.name" . }} + release: {{ .Release.Name }} + role: master + spec: + {{- if .Values.useHostNetwork }} + hostNetwork: {{ .Values.useHostNetwork }} + dnsPolicy: ClusterFirstWithHostNet + {{- end }} + {{- if .Values.useHostPID }} + hostPID: {{ .Values.useHostPID }} + {{- end }} + restartPolicy: OnFailure + volumes: + - name: {{ template "horovod.fullname" . }}-cm + configMap: + name: {{ template "horovod.fullname" . }} + items: + - key: hostfile.config + path: hostfile + mode: 438 + - key: master.waitWorkerReady + path: waitWorkersReady.sh + mode: 365 + - key: master.run + path: run.sh + mode: 365 + {{- if .Values.ssh.useSecrets }} + - name: {{ template "horovod.fullname" . }}-secret + secret: + secretName: {{ template "horovod.fullname" . }} + defaultMode: 448 + items: + - key: host-key + path: id_rsa + - key: host-key-pub + path: authorized_keys + {{- end }} +{{- if .Values.volumes }} +{{ toYaml .Values.volumes | indent 6 }} +{{- end }} + containers: + - name: horovod-master + image: "{{ .Values.master.image.repository }}:{{ .Values.master.image.tag }}" + imagePullPolicy: {{ .Values.master.image.pullPolicy }} + env: + - name: SSHPORT + value: "{{ .Values.ssh.port }}" + {{- if .Values.ssh.useSecrets }} + - name: USESECRETS + value: "{{ .Values.ssh.useSecrets }}" + {{- end }} + {{- if .Values.master.env }} + {{- range $key, $value := .Values.master.env }} + - name: "{{ $key }}" + value: "{{ $value }}" + {{- end }} + {{- end }} +{{- if .Values.master.privileged }} + securityContext: + privileged: true +{{- end }} + ports: + - containerPort: {{ .Values.ssh.port }} + volumeMounts: + - name: {{ template "horovod.fullname" . }}-cm + mountPath: /horovod/generated + {{- if .Values.ssh.useSecrets }} + - name: {{ template "horovod.fullname" . }}-secret + readOnly: true + mountPath: "/etc/secret-volume" + {{- end }} +{{- if .Values.volumeMounts }} +{{ toYaml .Values.volumeMounts | indent 8 }} +{{- end }} + command: + - /horovod/generated/run.sh + args: +{{ toYaml .Values.master.args | indent 10 }} + resources: +{{ toYaml .Values.resources | indent 10 }} +{{- if .Values.ssh.useSecrets }} + initContainers: + - name: wait-workers + image: "{{ .Values.master.image.repository }}:{{ .Values.master.image.tag }}" + imagePullPolicy: {{ .Values.master.image.pullPolicy }} + env: + - name: SSHPORT + value: "{{ .Values.ssh.port }}" + {{- if .Values.ssh.useSecrets }} + - name: USESECRETS + value: "{{ .Values.ssh.useSecrets }}" + {{- end }} + {{- if .Values.master.env }} + {{- range $key, $value := .Values.master.env }} + - name: "{{ $key }}" + value: "{{ $value }}" + {{- end }} + {{- end }} + command: + - /horovod/generated/waitWorkersReady.sh + args: + - /horovod/generated/hostfile + volumeMounts: + - name: {{ template "horovod.fullname" . }}-cm + mountPath: /horovod/generated + {{- if .Values.ssh.useSecrets }} + - name: {{ template "horovod.fullname" . 
}}-secret
+          readOnly: true
+          mountPath: "/etc/secret-volume"
+        {{- end }}
+{{- end }}
diff --git a/vnfs/DAaaS/applications/sample-horovod-app/templates/secrets.yaml b/vnfs/DAaaS/applications/sample-horovod-app/templates/secrets.yaml
new file mode 100644
index 00000000..c9853ed0
--- /dev/null
+++ b/vnfs/DAaaS/applications/sample-horovod-app/templates/secrets.yaml
@@ -0,0 +1,15 @@
+{{- if .Values.ssh.useSecrets }}
+apiVersion: v1
+kind: Secret
+metadata:
+  name: {{ template "horovod.fullname" . }}
+  labels:
+    app: {{ template "horovod.name" . }}
+    chart: {{ template "horovod.chart" . }}
+    release: {{ .Release.Name }}
+    heritage: {{ .Release.Service }}
+type: Opaque
+data:
+  host-key: {{ .Values.ssh.hostKey | b64enc | quote }}
+  host-key-pub: {{ .Values.ssh.hostKeyPub | b64enc | quote }}
+{{- end }}
\ No newline at end of file
diff --git a/vnfs/DAaaS/applications/sample-horovod-app/templates/statefulset-service.yaml b/vnfs/DAaaS/applications/sample-horovod-app/templates/statefulset-service.yaml
new file mode 100644
index 00000000..d0216a86
--- /dev/null
+++ b/vnfs/DAaaS/applications/sample-horovod-app/templates/statefulset-service.yaml
@@ -0,0 +1,19 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ template "horovod.fullname" . }}
+  labels:
+    app: {{ template "horovod.name" . }}
+    chart: {{ template "horovod.chart" . }}
+    release: {{ .Release.Name }}
+    heritage: {{ .Release.Service }}
+spec:
+  clusterIP: None
+  ports:
+  - name: ssh
+    port: {{ .Values.ssh.port }}
+    targetPort: {{ .Values.ssh.port }}
+  selector:
+    app: {{ template "horovod.name" . }}
+    release: {{ .Release.Name }}
+    role: worker
diff --git a/vnfs/DAaaS/applications/sample-horovod-app/templates/statefulset.yaml b/vnfs/DAaaS/applications/sample-horovod-app/templates/statefulset.yaml
new file mode 100644
index 00000000..1d3f7577
--- /dev/null
+++ b/vnfs/DAaaS/applications/sample-horovod-app/templates/statefulset.yaml
@@ -0,0 +1,115 @@
+apiVersion: apps/v1beta2
+kind: StatefulSet
+metadata:
+  name: {{ template "horovod.fullname" . }}
+  labels:
+    app: {{ template "horovod.name" . }}
+    chart: {{ template "horovod.chart" . }}
+    release: {{ .Release.Name }}
+    heritage: {{ .Release.Service }}
+    role: worker
+spec:
+  selector:
+    matchLabels:
+      app: {{ template "horovod.name" . }}
+      release: {{ .Release.Name }}
+      heritage: {{ .Release.Service }}
+      role: worker
+  serviceName: {{ template "horovod.fullname" . }}
+  podManagementPolicy: {{ .Values.worker.podManagementPolicy }}
+  replicas: {{ .Values.worker.number }}
+  template:
+    metadata:
+      labels:
+        app: {{ template "horovod.name" . }}
+        chart: {{ template "horovod.chart" . }}
+        release: {{ .Release.Name }}
+        heritage: {{ .Release.Service }}
+        role: worker
+    spec:
+      {{- if .Values.useHostNetwork }}
+      hostNetwork: {{ .Values.useHostNetwork }}
+      dnsPolicy: ClusterFirstWithHostNet
+      {{- end }}
+      {{- if .Values.useHostPID }}
+      hostPID: {{ .Values.useHostPID }}
+      {{- end }}
+      volumes:
+      - name: {{ template "horovod.fullname" . }}-cm
+        configMap:
+          name: {{ template "horovod.fullname" . }}
+          items:
+          - key: hostfile.config
+            path: hostfile
+            mode: 438
+          - key: ssh.readiness
+            path: check.sh
+            mode: 365
+          - key: worker.run
+            path: run.sh
+            mode: 365
+      {{- if .Values.ssh.useSecrets }}
+      - name: {{ template "horovod.fullname" . }}-secret
+        secret:
+          secretName: {{ template "horovod.fullname" . }}
+          defaultMode: 448
+          items:
+          - key: host-key
+            path: id_rsa
+          - key: host-key-pub
+            path: authorized_keys
+      {{- end }}
+{{- if .Values.volumes }}
+{{ toYaml .Values.volumes | indent 6 }}
+{{- end }}
+      containers:
+      - name: worker
+        image: "{{ .Values.worker.image.repository }}:{{ .Values.worker.image.tag }}"
+        imagePullPolicy: {{ .Values.worker.image.pullPolicy }}
+        env:
+        - name: SSHPORT
+          value: "{{ .Values.ssh.port }}"
+        {{- if .Values.ssh.useSecrets }}
+        - name: USESECRETS
+          value: "{{ .Values.ssh.useSecrets }}"
+        {{- end }}
+        {{- if .Values.worker.env }}
+        {{- range $key, $value := .Values.worker.env }}
+        - name: "{{ $key }}"
+          value: "{{ $value }}"
+        {{- end }}
+        {{- end }}
+{{- if .Values.worker.privileged }}
+        securityContext:
+          privileged: true
+{{- end }}
+        ports:
+        - containerPort: {{ .Values.ssh.port }}
+        volumeMounts:
+        - name: {{ template "horovod.fullname" . }}-cm
+          mountPath: /horovod/generated
+        {{- if .Values.ssh.useSecrets }}
+        - name: {{ template "horovod.fullname" . }}-secret
+          readOnly: true
+          mountPath: "/etc/secret-volume"
+        {{- end }}
+{{- if .Values.volumeMounts }}
+{{ toYaml .Values.volumeMounts | indent 8 }}
+{{- end }}
+        command:
+        - /horovod/generated/run.sh
+{{- if .Values.ssh.useSecrets }}
+        readinessProbe:
+          exec:
+            command:
+            - /horovod/generated/check.sh
+          initialDelaySeconds: 1
+          periodSeconds: 2
+{{- end }}
+        resources:
+{{ toYaml .Values.resources | indent 10 }}
diff --git a/vnfs/DAaaS/applications/sample-spark-app/.helmignore b/vnfs/DAaaS/applications/sample-spark-app/.helmignore
new file mode 100644
index 00000000..50af0317
--- /dev/null
+++ b/vnfs/DAaaS/applications/sample-spark-app/.helmignore
@@ -0,0 +1,22 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/vnfs/DAaaS/applications/sample-spark-app/Chart.yaml b/vnfs/DAaaS/applications/sample-spark-app/Chart.yaml
new file mode 100644
index 00000000..42ed0400
--- /dev/null
+++ b/vnfs/DAaaS/applications/sample-spark-app/Chart.yaml
@@ -0,0 +1,5 @@
+apiVersion: v1
+appVersion: "1.0"
+description: A sample Spark application that finds the top users from Apache logs stored in a remote hdfs-k8s cluster
+name: sample-spark-app-apache-log-analysis
+version: 0.1.0
diff --git a/vnfs/DAaaS/applications/sample-spark-app/Dockerfile b/vnfs/DAaaS/applications/sample-spark-app/Dockerfile
new file mode 100644
index 00000000..cd42d4c7
--- /dev/null
+++ b/vnfs/DAaaS/applications/sample-spark-app/Dockerfile
@@ -0,0 +1,133 @@
+# Copyright (c) 2019 Intel Corporation
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Ported kubernetes spark image to Ubuntu + +FROM ubuntu:18.04 + +# Install jdk +RUN apt update -yqq +RUN apt install -y locales openjdk-8-jdk && rm -rf /var/lib/apt/lists/* \ + && localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 + +# Install all the essentials +RUN apt-get update --fix-missing && \ + apt-get install -y numactl wget curl bzip2 ca-certificates libglib2.0-0 libxext6 libsm6 libxrender1 \ + git mercurial subversion build-essential openssh-server openssh-client net-tools && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +ENV LANG en_US.utf8 +ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64 +ENV PATH $JAVA_HOME/bin:$PATH +ENV PATH /opt/conda/bin:$PATH +ENV OPENMPI_VERSION 3.1 + +# Install openMPI +RUN mkdir /tmp/openmpi && \ + cd /tmp/openmpi && \ + wget --quiet https://www.open-mpi.org/software/ompi/v${OPENMPI_VERSION}/downloads/openmpi-${OPENMPI_VERSION}.2.tar.gz -O openmpi.tar.gz && \ + tar zxf openmpi.tar.gz && \ + cd openmpi-3.1.2 && \ + ./configure --enable-orterun-prefix-by-default && \ + make -j $(nproc) all && \ + make install && \ + ldconfig && \ + rm -rf /tmp/openmpi + +# Install miniconda +RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh && \ + /bin/bash ~/miniconda.sh -b -p /opt/conda && \ + rm ~/miniconda.sh && \ + ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ + echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ + echo "conda activate base" >> ~/.bashrc + +# Install tf & keras using conda in the virtual_environment:tf_env +SHELL ["/bin/bash", "-c"] +RUN conda update -n base -c defaults conda && \ + conda create -n tf_env +RUN conda install -n tf_env -y -c anaconda \ + pip tensorflow keras nltk + +RUN echo "conda activate tf_env" >> ~/.bashrc && \ + conda install -n tf_env -y -c conda-forge clangdev + +RUN source ~/.bashrc +RUN HOROVOD_WITH_TENSORFLOW=1 /opt/conda/envs/tf_env/bin/pip install --no-cache-dir horovod + +# openMPI sane defaults: +RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \ + echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf && \ + echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf + +# Allow OpenSSH to talk to containers without asking for confirmation +RUN cat /etc/ssh/ssh_config | grep -v StrictHostKeyChecking > /etc/ssh/ssh_config.new && \ + echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \ + mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config + +# Install tini +RUN apt-get install -y curl grep sed dpkg && \ + TINI_VERSION=`curl https://github.com/krallin/tini/releases/latest | grep -o "/v.*\"" | sed 's:^..\(.*\).$:\1:'` && echo ${TINI_VERSION} && \ + curl -L "https://github.com/krallin/tini/releases/download/v${TINI_VERSION}/tini_${TINI_VERSION}.deb" > tini.deb && \ + dpkg -i tini.deb && \ + rm tini.deb && \ + apt clean + +# This is needed to match the original entrypoint.sh file. +RUN cp /usr/bin/tini /sbin + +# Begin: Installing spark +ARG spark_jars=jars +ARG img_path=kubernetes/dockerfiles +ARG k8s_tests=kubernetes/tests + +# Before building the docker image, first build and make a Spark distribution following +# the instructions in http://spark.apache.org/docs/latest/building-spark.html. 
+
+# If this docker file is being used in the context of building your images from a Spark
+# distribution, the docker build command should be invoked from the top level directory
+# of the Spark distribution. E.g.:
+# docker build -t spark:latest -f kubernetes/dockerfiles/spark/ubuntu18.04/Dockerfile .
+
+RUN mkdir -p /opt/spark && \
+    mkdir -p /opt/spark/work-dir && \
+    touch /opt/spark/RELEASE && \
+    rm /bin/sh && \
+    ln -sv /bin/bash /bin/sh && \
+    echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su && \
+    chgrp root /etc/passwd && chmod ug+rw /etc/passwd
+
+
+COPY ${spark_jars} /opt/spark/jars
+COPY bin /opt/spark/bin
+COPY sbin /opt/spark/sbin
+COPY ${img_path}/spark/entrypoint.sh /opt/
+COPY examples /opt/spark/examples
+COPY ${k8s_tests} /opt/spark/tests
+COPY data /opt/spark/data
+ENV SPARK_HOME /opt/spark
+
+RUN mkdir /opt/spark/python
+COPY python/pyspark /opt/spark/python/pyspark
+COPY python/lib /opt/spark/python/lib
+ENV PYTHONPATH /opt/spark/python/lib/pyspark.zip:/opt/spark/python/lib/py4j-*.zip
+
+WORKDIR /opt/spark/work-dir
+
+ENTRYPOINT [ "/opt/entrypoint.sh" ]
+
+# End: Installing spark
diff --git a/vnfs/DAaaS/applications/sample-spark-app/templates/SampleSparkApp.yaml b/vnfs/DAaaS/applications/sample-spark-app/templates/SampleSparkApp.yaml
new file mode 100644
index 00000000..f728f82e
--- /dev/null
+++ b/vnfs/DAaaS/applications/sample-spark-app/templates/SampleSparkApp.yaml
@@ -0,0 +1,43 @@
+apiVersion: "sparkoperator.k8s.io/v1beta1"
+kind: SparkApplication
+metadata:
+  name: {{ .Values.nameOfTheSparkApp }}
+  namespace: {{ .Release.Namespace }}
+spec:
+  type: {{ .Values.programmingLanguageType }}
+  mode: {{ .Values.modeOfSparkApp | default "cluster" }}
+  image: {{ quote .Values.image }}
+  imagePullPolicy: {{ .Values.imagePullPolicy | default "IfNotPresent" }}
+  mainClass: {{ .Values.mainClassOfTheSparkApp }}
+  mainApplicationFile: {{ .Values.mainApplicationFileOfTheSparkApp }}
+  arguments:
+    {{- range .Values.argumentsOfTheSparkProgram }}
+    - {{ . }}
+    {{ end }}
+  hadoopConfigMap: {{ .Values.hadoopConfigMap }}
+  restartPolicy:
+    type: {{ .Values.restartPolicy | default "Never" }}
+  volumes:
+    - name: {{ .Values.volumesName | default "test-volume" | quote }}
+      hostPath:
+        path: {{ .Values.hostpath | default "/tmp" | quote }}
+        type: {{ .Values.hostpathType | default "Directory" }}
+  driver:
+    cores: {{ .Values.driverCores | default 0.1 }}
+    coreLimit: {{ .Values.driverCoreLimit | default "200m" | quote }}
+    memory: {{ .Values.driverMemory | default "1024m" | quote }}
+    labels:
+      version: 2.4.0
+    serviceAccount: spark
+    volumeMounts:
+      - name: {{ .Values.driverVolumeMountsName | default "test-volume" | quote }}
+        mountPath: {{ .Values.driverVolumeMountPath | default "/tmp" | quote }}
+  executor:
+    cores: {{ .Values.executorCores | default 1 }}
+    instances: {{ .Values.executorInstances | default 1 }}
+    memory: {{ .Values.executorMemory | default "512m" | quote }}
+    labels:
+      version: 2.4.0
+    volumeMounts:
+      - name: {{ .Values.executorVolumeMountsName | default "test-volume" | quote }}
+        mountPath: {{ .Values.executorVolumeMountPath | default "/tmp" | quote }}
diff --git a/vnfs/DAaaS/applications/sample-spark-app/templates/_helpers.tpl b/vnfs/DAaaS/applications/sample-spark-app/templates/_helpers.tpl
new file mode 100644
index 00000000..6f51811d
--- /dev/null
+++ b/vnfs/DAaaS/applications/sample-spark-app/templates/_helpers.tpl
@@ -0,0 +1,32 @@
+{{/* vim: set filetype=mustache: */}}
+{{/*
+Expand the name of the chart.
+*/}} +{{- define "sample-spark-app.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "sample-spark-app.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "sample-spark-app.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} diff --git a/vnfs/DAaaS/applications/sample-spark-app/values.yaml b/vnfs/DAaaS/applications/sample-spark-app/values.yaml new file mode 100644 index 00000000..afb48d67 --- /dev/null +++ b/vnfs/DAaaS/applications/sample-spark-app/values.yaml @@ -0,0 +1,57 @@ +# Default values for sample-spark-app. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + + +#===========================KUBERNETES POD RELATED CONFIGs======================== +image: spark-tf-keras-horo:latest +imagePullPolicy: Never +restartPolicy: Never +volumesName: test-volume +hostpath: /tmp +hostpathType: Directory + + + +#============================SPARK APP RELATED CONFIGs============================= + +nameOfTheSparkApp: spark-apache-logs2 +# Python or Scala supported. +programmingLanguageType: Scala +modeOfSparkApp: cluster +mainClassOfTheSparkApp: ApacheLogAnalysis +# can be http path, s3 path, minio path +mainApplicationFileOfTheSparkApp: https://github.com/mohanraj1311/ApacheLogAnalysisJar/raw/master/analysisofapachelogs_2.11-0.1.jar +argumentsOfTheSparkProgram: + - hdfs://hdfs-1-namenode-1.hdfs-1-namenode.hdfs1.svc.cluster.local:8020/data/apache-logs + + + +#============================SPARK DRIVER RELATED CONFIGs========================= +driverCores: 0.1 +driverCoreLimit: 200m +driverMemory: 1024m +driverVolumeMountsName: test-volume +driverVolumeMountPath: /tmp + + + +#============================SPARK EXECUTOR RELATED CONFIGs======================= +executorCores: 1 +executorInstances: 1 +executorMemory: 512m +executorVolumeMountsName: test-volume +executorVolumeMountPath: /tmp + + + +#===========================HADOOP RELATED CONFIGs=============================== +# config map of the hdfs +hadoopConfigMap: hdfs-1-config + + +################################################################################### + + + + -- cgit 1.2.3-korg