{{- /*
ConfigMap with the Horovod hostfile and the ssh helper scripts used by the
master and worker pods.

Template variables:
  $workerNum  number of worker entries to list in the hostfile
              (.Values.worker.number).
  $name       result of the "horovod.fullname" helper; used to build host names.
  $slots      slots advertised per host: the "nvidia.com/gpu" entry of
              .Values.resources when it is set and non-zero, otherwise 1.

$slots is declared exactly once. Declaring it again with ":=" inside an "if"
only creates a new variable scoped to that "if", leaving the outer value at 1.
*/ -}}
{{- $workerNum := .Values.worker.number -}}
{{- $name := include "horovod.fullname" . }}
{{- $slots := default 1 (index .Values.resources "nvidia.com/gpu") }}
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ template "horovod.fullname" . }}
  labels:
    heritage: {{ .Release.Service | quote }}
    release: {{ .Release.Name | quote }}
    chart: {{ template "horovod.chart" . }}
    app: {{ template "horovod.fullname" . }}
data:
  # One line per host: the master first, then one entry per worker index.
  hostfile.config: |
    {{ $name }}-master slots={{ $slots }}
    {{- range $i, $none := until (int $workerNum) }}
    {{ $name }}-{{ $i }}.{{ $name }} slots={{ $slots }}
    {{- end }}
  # Succeeds only when an ssh session to localhost can run a command.
  ssh.readiness: |
    #!/bin/bash
    set -xev
    ssh localhost ls
  # Master entry point: prepares root's ssh config, starts sshd in the
  # background, then runs the command passed as arguments (or idles).
  master.run: |
    #!/bin/bash
    set -x
    sleep 5

    # Start from an empty ssh client config.
    mkdir -p /root/.ssh
    rm -f /root/.ssh/config
    touch /root/.ssh/config

    if [ "$USESECRETS" == "true" ];then
      # A failing copy must not abort the script, so errexit is off here.
      # NOTE: the trailing "set -e" leaves errexit enabled for the rest of
      # the script whenever this branch is taken.
      set +e
      yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa
      yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys
      set -e
    fi

    if [ -n "$SSHPORT" ]; then
      # Use the same port for the ssh client and for existing "Port" lines
      # in the sshd configuration.
      echo "Port $SSHPORT" > /root/.ssh/config
      sed -i "s/^Port.*/Port $SSHPORT /g" /etc/ssh/sshd_config
    fi
    echo "StrictHostKeyChecking no" >> /root/.ssh/config
    /usr/sbin/sshd

    if [ $# -eq 0 ]; then
      sleep infinity
    else
      bash -c "$*"
    fi
    sleep 300
  # Blocks until every non-master host listed in the hostfile ($1) accepts an
  # ssh connection; exits 1 when a host never becomes reachable.
  master.waitWorkerReady: |
    #!/bin/bash
    set -xev

    # Recreate root's ssh client config. Host key checking is disabled
    # unconditionally, as in master.run and worker.run, so the probes below
    # cannot fail on an unknown host key when SSHPORT is unset.
    function updateSSHPort() {
      mkdir -p /root/.ssh
      rm -f /root/.ssh/config
      touch /root/.ssh/config

      if [ -n "$SSHPORT" ]; then
        echo "Port $SSHPORT" > /root/.ssh/config
      fi
      echo "StrictHostKeyChecking no" >> /root/.ssh/config
    }

    # Probe each host in the first column of the hostfile given as $1,
    # skipping entries whose name ends in "master".
    function runCheckSSH() {
      if [[ "$USESECRETS" == "true" ]];then
        # A failing copy must not abort the script.
        set +e
        yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa
        yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys
        set -e
      fi

      local host
      for host in $(awk '{print $1}' "$1"); do
        if [[ "$host" != *"master" ]];then
          retry 30 ssh -o ConnectTimeout=2 -q "$host" exit
        fi
      done
    }

    # retry ATTEMPTS COMMAND [ARG...]
    # Runs COMMAND up to ATTEMPTS times, one second apart. Returns 0 as soon
    # as it succeeds and terminates the script with status 1 when every
    # attempt fails. The command runs as an "if" condition, so errexit does
    # not fire on a failed attempt.
    function retry() {
      if [[ $# -le 1 ]]; then
        echo "Usage: retry ATTEMPTS COMMAND [ARG...]"
        return 1
      fi

      local try=$1
      local -a cmd=("${@:2}")
      local n=0

      until [[ $n -ge $try ]]; do
        if "${cmd[@]}"; then
          return 0
        fi
        echo "Command Fail.."
        n=$((n + 1))
        echo "retry $n :: [${cmd[*]}]"
        sleep 1
      done

      exit 1
    }

    updateSSHPort
    runCheckSSH "$1"
  # Worker entry point: prepares root's ssh config and runs sshd in the
  # foreground.
  worker.run: |
    #!/bin/bash
    set -x

    # Start from an empty ssh client config.
    mkdir -p /root/.ssh
    rm -f /root/.ssh/config
    touch /root/.ssh/config

    if [[ "$USESECRETS" == "true" ]];then
      # A failing copy must not abort the script, so errexit is off here.
      # NOTE: the trailing "set -e" leaves errexit enabled afterwards.
      set +e
      yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa
      yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys
      set -e
    fi

    if [ -n "$SSHPORT" ]; then
      # Use the same port for the ssh client and for existing "Port" lines
      # in the sshd configuration.
      echo "Port $SSHPORT" > /root/.ssh/config
      sed -i "s/^Port.*/Port $SSHPORT /g" /etc/ssh/sshd_config
    fi
    echo "StrictHostKeyChecking no" >> /root/.ssh/config
    /usr/sbin/sshd -D