aboutsummaryrefslogtreecommitdiffstats
path: root/vnfs/DAaaS/sample-apps/training/sample-horovod-app/templates/config.yaml
blob: 70d18eab252689f9d1bcc357d30b961029129071 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
{{- /*
Template-scoped values shared by the ConfigMap below.
  $workerNum - value of .Values.worker.number; drives the worker hostfile loop.
  $name      - result of the "horovod.fullname" helper; prefix of every host entry.
  $slots     - slots advertised per host: the "nvidia.com/gpu" entry of
               .Values.resources when it is set to a non-empty value, else 1.
*/ -}}
{{- $workerNum := .Values.worker.number -}}
{{- $name := include "horovod.fullname" . }}
{{- /*
$slots is declared exactly once, using `default`. Re-declaring it with := inside
an if/end block would create a new variable that goes out of scope at the
matching end, leaving the outer $slots permanently at 1.
*/ -}}
{{- $slots := default 1 (index .Values.resources "nvidia.com/gpu") }}
# ConfigMap holding the generated hostfile and the shell scripts listed under `data`.
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "horovod.fullname" . }}
  labels:
    # Labels derived from the chart helpers and the release object.
    app: {{ include "horovod.fullname" . }}
    chart: {{ include "horovod.chart" . }}
    release: {{ .Release.Name | quote }}
    heritage: {{ .Release.Service | quote }}
data:
  # hostfile.config: one "<host> slots=<n>" line for the master, followed by one
  # line per worker index in the range 0 .. worker number - 1.
  hostfile.config: |
    {{ $name }}-master slots={{ $slots }}
    {{- range $idx := until ($workerNum | int) }}
    {{ $name }}-{{ $idx }}.{{ $name }} slots={{ $slots }}
    {{- end }}
  # ssh.readiness: exits 0 only when `ssh localhost ls` succeeds
  # (presumably wired up as a readiness check; the consumer is not visible here).
  ssh.readiness: |
    #!/bin/bash
    set -o xtrace -o errexit -o verbose
    ssh localhost ls
  # master.run: prepares the root SSH client config, starts sshd in the
  # background, then runs the command passed as arguments (or idles forever when
  # none is given) and finally sleeps for 300 seconds.
  master.run: |
    #!/bin/bash
    set -o xtrace
    sleep 5

    ssh_dir=/root/.ssh
    ssh_config="${ssh_dir}/config"

    # Start from an empty SSH client config.
    mkdir -p "${ssh_dir}"
    rm -f "${ssh_config}"
    touch "${ssh_config}"

    if [[ "${USESECRETS}" == "true" ]]; then
      # The key copies are allowed to fail. NOTE: errexit was not enabled before
      # this point, so the closing `set -o errexit` leaves it enabled for the
      # remainder of the script.
      set +o errexit
      for key_file in id_rsa authorized_keys; do
        yes | cp "/etc/secret-volume/${key_file}" "${ssh_dir}/${key_file}"
      done
      set -o errexit
    fi

    # Use the same custom port for the SSH client and for sshd.
    if [[ -n "${SSHPORT}" ]]; then
      echo "Port ${SSHPORT}" > "${ssh_config}"
      sed -i "s/^Port.*/Port ${SSHPORT} /g" /etc/ssh/sshd_config
    fi
    echo "StrictHostKeyChecking no" >> "${ssh_config}"
    /usr/sbin/sshd

    # With arguments: run them as a single shell command line; without: idle.
    if [[ $# -gt 0 ]]; then
      bash -c "$*"
    else
      sleep infinity
    fi
    sleep 300
  # master.waitWorkerReady: takes a hostfile path as its first argument and
  # blocks until every host in it whose name does not end in "master" accepts an
  # SSH connection. Exits 1 when a host is still unreachable after the retries.
  master.waitWorkerReady: |
    #!/bin/bash
    set -xev

    # Recreate /root/.ssh/config for the SSH client.
    function updateSSHPort() {
      mkdir -p /root/.ssh
      rm -f /root/.ssh/config
      touch /root/.ssh/config

      if [ -n "$SSHPORT" ]; then
        echo "Port $SSHPORT" > /root/.ssh/config
      fi
      # Written unconditionally, as master.run and worker.run do; otherwise the
      # non-interactive ssh probes below fail host-key verification whenever
      # SSHPORT is unset.
      echo "StrictHostKeyChecking no" >> /root/.ssh/config
    }

    # runCheckSSH <hostfile>: probe every non-master host named in the first
    # column of <hostfile>.
    function runCheckSSH() {
      local hostfile="$1"
      if [[ -z "$hostfile" || ! -r "$hostfile" ]]; then
        echo "Usage: $0 <hostfile>" >&2
        exit 1
      fi

      if [[ "$USESECRETS" == "true" ]]; then
        # Best effort: a failed copy must not abort the script under errexit.
        yes | cp /etc/secret-volume/id_rsa /root/.ssh/id_rsa || true
        yes | cp /etc/secret-volume/authorized_keys /root/.ssh/authorized_keys || true
      fi

      local host
      for host in $(awk '{print $1}' "$hostfile"); do
        if [[ "$host" != *"master" ]]; then
          retry 30 ssh -o ConnectTimeout=2 -q "$host" exit
        fi
      done
    }

    # retry <retry_number> <command> [args...]
    # Runs the command once and, while it keeps failing, up to <retry_number>
    # more times with a one-second pause in between. Returns 0 on the first
    # success; exits the whole script with status 1 when every attempt failed.
    function retry() {
      if [[ $# -le 1 ]]; then
        echo "Usage $0 <retry_number> <Command>"
        return 1
      fi
      local try=$1
      shift
      # Keep the command as an array so each argument survives unchanged.
      local cmd=("$@")
      local n=0
      # Running the command as the loop condition keeps errexit from firing on
      # a failed attempt, so no set +e / set -e toggling is needed.
      until "${cmd[@]}"; do
        if [[ $n -ge $try ]]; then
          exit 1
        fi
        echo "Command Fail.."
        n=$((n + 1))
        echo "retry $n :: [${cmd[*]}]"
        sleep 1
      done
    }

    updateSSHPort
    runCheckSSH "$1"
  # worker.run: prepares the root SSH client config and then runs sshd in the
  # foreground (-D), so the script lasts as long as sshd does.
  worker.run: |
    #!/bin/bash
    set -o xtrace

    ssh_dir=/root/.ssh
    ssh_config="${ssh_dir}/config"

    # Start from an empty SSH client config.
    mkdir -p "${ssh_dir}"
    rm -f "${ssh_config}"
    touch "${ssh_config}"

    if [[ "${USESECRETS}" == "true" ]]; then
      # The key copies are allowed to fail. NOTE: errexit was not enabled before
      # this point, so the closing `set -o errexit` leaves it enabled for the
      # remainder of the script.
      set +o errexit
      for key_file in id_rsa authorized_keys; do
        yes | cp "/etc/secret-volume/${key_file}" "${ssh_dir}/${key_file}"
      done
      set -o errexit
    fi

    # Use the same custom port for the SSH client and for sshd.
    if [[ -n "${SSHPORT}" ]]; then
      echo "Port ${SSHPORT}" > "${ssh_config}"
      sed -i "s/^Port.*/Port ${SSHPORT} /g" /etc/ssh/sshd_config
    fi
    echo "StrictHostKeyChecking no" >> "${ssh_config}"

    /usr/sbin/sshd -D