From 2cfc1f2d67103726140b32b5a5c555f7a66636be Mon Sep 17 00:00:00 2001 From: Todd Malsbary Date: Thu, 17 Jun 2021 17:10:38 -0700 Subject: Move topology-manager configuration to kubespray The steps performed by the existing ansible playbook can be performed directly by kubespray. In addition, fix and enable the topology-manager.sh test. Issue-ID: MULTICLOUD-1324 Signed-off-by: Todd Malsbary Change-Id: Iee2197c1fc3e35288796399cccff0d3ae0925a6c --- .../playbooks/configure-topology-manager.yml | 66 ------------ kud/deployment_infra/playbooks/kud-vars.yml | 7 -- kud/hosting_providers/containerized/installer.sh | 2 +- .../inventory/group_vars/k8s-cluster.yml | 5 + kud/hosting_providers/vagrant/installer.sh | 6 +- .../vagrant/inventory/group_vars/k8s-cluster.yml | 5 + kud/tests/topology-manager-sriov.sh | 112 +++++++++++++++++++++ kud/tests/topology-manager.sh | 16 +-- 8 files changed, 135 insertions(+), 84 deletions(-) delete mode 100644 kud/deployment_infra/playbooks/configure-topology-manager.yml create mode 100755 kud/tests/topology-manager-sriov.sh (limited to 'kud') diff --git a/kud/deployment_infra/playbooks/configure-topology-manager.yml b/kud/deployment_infra/playbooks/configure-topology-manager.yml deleted file mode 100644 index 012bc8b0..00000000 --- a/kud/deployment_infra/playbooks/configure-topology-manager.yml +++ /dev/null @@ -1,66 +0,0 @@ ---- -# SPDX-license-identifier: Apache-2.0 -############################################################################## -# Copyright (c) 2020 -# All rights reserved. 
This program and the accompanying materials -# are made available under the terms of the Apache License, Version 2.0 -# which accompanies this distribution, and is available at -# http://www.apache.org/licenses/LICENSE-2.0 -############################################################################## - -- hosts: kube-node - tasks: - - name: Load kud variables - include_vars: - file: kud-vars.yml - - - name: creating kubelet config - become: yes - blockinfile: - path: "{{ kubernetes_config_file }}" - marker: "# {mark} OpenNESS configuration - General" - create: yes - block: | - featureGates: - TopologyManager: {{ False if topology_manager.policy == 'none' else True }} - notify: - - enable and restart kubelet - - - name: customize kubelet config - CPU Manager - become: yes - blockinfile: - path: "{{ kubernetes_config_file }}" - marker: "# {mark} OpenNESS configuration - CPU Manager" - block: | - cpuManagerPolicy: {{ cpu_manager.policy }} - state: "{{ 'present' if cpu_manager.policy == 'static' else 'absent' }}" - notify: - - remove cpu manager checkpoint file - - enable and restart kubelet - - - name: customize kubelet config - Topology Manager - become: yes - blockinfile: - path: "{{ kubernetes_config_file }}" - marker: "# {mark} OpenNESS configuration - Topology Manager" - block: | - topologyManagerPolicy: {{ topology_manager.policy }} - state: "{{ 'absent' if topology_manager.policy == 'none' else 'present' }}" - notify: - - enable and restart kubelet - - handlers: - - name: enable and restart kubelet - become: yes - systemd: - name: kubelet - daemon_reload: yes - enabled: yes - masked: no - state: restarted - - - name: remove cpu manager checkpoint file - become: yes - file: - path: "{{ cpu_manager.checkpoint_file }}" - state: absent diff --git a/kud/deployment_infra/playbooks/kud-vars.yml b/kud/deployment_infra/playbooks/kud-vars.yml index 24a9ef98..35057f5b 100644 --- a/kud/deployment_infra/playbooks/kud-vars.yml +++ 
b/kud/deployment_infra/playbooks/kud-vars.yml @@ -78,13 +78,6 @@ optane_ipmctl_version: 02.00.00.3474 optane_ipmctl_url: "https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/ipmctl/{{ optane_package }}.tar.xz" optane_ipmctl_package: ipmctl_02.00.00.3474+really01.00.00.3469.orig -kubernetes_config_file: "/etc/kubernetes/kubelet-config.yaml" -cpu_manager: - policy: "static" # Options: none (disabled), static (default) - checkpoint_file: "/var/lib/kubelet/cpu_manager_state" -topology_manager: - policy: "best-effort" # Options: none (disabled), best-effort (default), restricted, single-numa-node - emco_git_url: "https://github.com/open-ness/EMCO.git" emco_repository: "integratedcloudnative/" emco_version: "openness-21.03" diff --git a/kud/hosting_providers/containerized/installer.sh b/kud/hosting_providers/containerized/installer.sh index 427850ab..7365a14f 100755 --- a/kud/hosting_providers/containerized/installer.sh +++ b/kud/hosting_providers/containerized/installer.sh @@ -230,7 +230,7 @@ function install_host_artifacts { done mkdir -p ${host_addons_dir}/tests - for test in _common _common_test _functions multus ovn4nfv nfd sriov-network qat cmk; do + for test in _common _common_test _functions topology-manager-sriov multus ovn4nfv nfd sriov-network qat cmk; do cp ${kud_tests}/${test}.sh ${host_addons_dir}/tests done diff --git a/kud/hosting_providers/containerized/inventory/group_vars/k8s-cluster.yml b/kud/hosting_providers/containerized/inventory/group_vars/k8s-cluster.yml index 7d0404a5..a13d8412 100644 --- a/kud/hosting_providers/containerized/inventory/group_vars/k8s-cluster.yml +++ b/kud/hosting_providers/containerized/inventory/group_vars/k8s-cluster.yml @@ -124,3 +124,8 @@ podsecuritypolicy_restricted_spec: # This will fail if allowed-unsafe-sysctls is not set accordingly in kubelet flags allowedUnsafeSysctls: - '*' + +# Customize kubelet config of CPU and topology manager +kubelet_node_config_extra_args: + cpuManagerPolicy: "static" # Options: none 
(disabled), static (default) + topologyManagerPolicy: "best-effort" # Options: none (disabled), best-effort (default), restricted, single-numa-node diff --git a/kud/hosting_providers/vagrant/installer.sh b/kud/hosting_providers/vagrant/installer.sh index c88dc9e6..39da50e7 100755 --- a/kud/hosting_providers/vagrant/installer.sh +++ b/kud/hosting_providers/vagrant/installer.sh @@ -186,7 +186,7 @@ function install_addons { # The order of KUD_ADDONS is important: some plugins (sriov, qat) # require nfd to be enabled. Some addons are not currently supported with containerd if [ "${container_runtime}" == "docker" ]; then - kud_addons=${KUD_ADDONS:-topology-manager virtlet ovn4nfv nfd sriov \ + kud_addons=${KUD_ADDONS:-virtlet ovn4nfv nfd sriov \ qat optane cmk} elif [ "${container_runtime}" == "containerd" ]; then kud_addons=${KUD_ADDONS:-ovn4nfv nfd} @@ -216,7 +216,9 @@ function install_addons { popd fi # Run other plugin tests - for addon in ${kud_addons}; do + # The topology-manager is added to the tests here as it is + # enabled via kubelet config, not an addon + for addon in topology-manager ${kud_addons}; do pushd $kud_tests bash ${addon}.sh || failed_kud_tests="${failed_kud_tests} ${addon}" popd diff --git a/kud/hosting_providers/vagrant/inventory/group_vars/k8s-cluster.yml b/kud/hosting_providers/vagrant/inventory/group_vars/k8s-cluster.yml index 7803f27a..bf6f8c84 100644 --- a/kud/hosting_providers/vagrant/inventory/group_vars/k8s-cluster.yml +++ b/kud/hosting_providers/vagrant/inventory/group_vars/k8s-cluster.yml @@ -121,3 +121,8 @@ podsecuritypolicy_restricted_spec: # This will fail if allowed-unsafe-sysctls is not set accordingly in kubelet flags allowedUnsafeSysctls: - '*' + +# Customize kubelet config of CPU and topology manager +kubelet_node_config_extra_args: + cpuManagerPolicy: "static" # Options: none (disabled), static (default) + topologyManagerPolicy: "best-effort" # Options: none (disabled), best-effort (default), restricted, single-numa-node diff 
--git a/kud/tests/topology-manager-sriov.sh b/kud/tests/topology-manager-sriov.sh new file mode 100755 index 00000000..447a7c83 --- /dev/null +++ b/kud/tests/topology-manager-sriov.sh @@ -0,0 +1,112 @@ +#!/bin/bash +# SPDX-license-identifier: Apache-2.0 +############################################################################## +# Copyright (c) 2020 +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## + +set -o errexit +set -o nounset +set -o pipefail + +source _common.sh +source _functions.sh + +sriov_capable_nodes=$(kubectl get nodes -o json | jq -r '.items[] | select((.status.capacity."intel.com/intel_sriov_nic"!=null) and ((.status.capacity."intel.com/intel_sriov_nic"|tonumber)>=2)) | .metadata.name') +if [ -z "$sriov_capable_nodes" ]; then + echo "Ethernet adaptor version is not set. Topology manager test case cannot run on this machine" + exit 0 +else + echo "NIC card specs match. Topology manager option avaiable for this version." 
+fi + +pod_name=pod-topology-manager +csar_id=bd55cccc-bf34-11ea-b3de-0242ac130004 + +function create_pod_yaml { + local csar_id=$1 + _checks_args $csar_id + pushd ${CSAR_DIR}/${csar_id} + + cat << POD > $pod_name.yaml +kind: Pod +apiVersion: v1 +metadata: + name: $pod_name + annotations: + k8s.v1.cni.cncf.io/networks: sriov-intel +spec: + containers: + - name: $pod_name + image: docker.io/centos/tools:latest + command: + - /sbin/init + resources: + limits: + cpu: "1" + memory: "500Mi" + intel.com/intel_sriov_nic: '1' + requests: + cpu: "1" + memory: "500Mi" + intel.com/intel_sriov_nic: '1' +POD + popd +} + +create_pod_yaml ${csar_id} +kubectl delete pod $pod_name --ignore-not-found=true --now --wait +kubectl create -f ${CSAR_DIR}/${csar_id}/$pod_name.yaml --validate=false + +status_phase="" +while [[ $status_phase != "Running" ]]; do + new_phase=$(kubectl get pods $pod_name | awk 'NR==2{print $3}') + if [[ $new_phase != $status_phase ]]; then + echo "$(date +%H:%M:%S) - $pod_name : $new_phase" + status_phase=$new_phase + fi + if [[ $new_phase == "Running" ]]; then + echo "Pod is up and running.." 
+ fi + if [[ $new_phase == "Err"* ]]; then + exit 1 + fi +done + +uid=$(kubectl get pod pod-topology-manager -o jsonpath='{.metadata.uid}') +node_name=$(kubectl get pod $pod_name -o jsonpath='{.spec.nodeName}') +node_ip=$(kubectl get node $node_name -o jsonpath='{.status.addresses[].address}') + +apt-get install -y jq +cpu_core=$(ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $node_ip -- cat /var/lib/kubelet/cpu_manager_state | jq -r --arg UID "${uid}" --arg POD_NAME "${pod_name}" '.entries[$UID][$POD_NAME]') +numa_node_number=$(ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $node_ip -- lscpu | grep "NUMA node(s)" | awk -F ':' '{print $2}') +for (( node=0; node<$numa_node_number; node++ )); do + ranges=$(ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $node_ip -- lscpu | grep "NUMA node"$node | awk -F ':' '{print $2}') + ranges=(${ranges//,/ }) + for range in ${ranges[@]}; do + min=$(echo $range | awk -F '-' '{print $1}') + max=$(echo $range | awk -F '-' '{print $2}') + if [ $cpu_core -ge $min ] && [ $cpu_core -le $max ]; then + cpu_numa_node=$node + fi + done +done + +vf_pci=$(kubectl exec -it $pod_name -- env | grep PCIDEVICE_INTEL_COM_INTEL_SRIOV_NIC | awk -F '=' '{print $2}' | sed 's/\r//g') +vf_numa_node=$(ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $node_ip -- cat /sys/bus/pci/devices/$vf_pci/numa_node) + +echo "The allocated cpu core is:" $cpu_core +echo "The numa node of the allocated cpu core is:" $cpu_numa_node +echo "The PCI address of the allocated vf is:" $vf_pci +echo "The numa node of the allocated vf is:" $vf_numa_node +if [ $cpu_numa_node == $vf_numa_node ]; then + echo "The allocated cpu core and vf are on the same numa node" +else + echo "The allocated cpu core and vf are on different numa nodes" +fi + +kubectl delete pod $pod_name --now +echo "Test complete." 
diff --git a/kud/tests/topology-manager.sh b/kud/tests/topology-manager.sh index 5c9f900d..b1126aac 100755 --- a/kud/tests/topology-manager.sh +++ b/kud/tests/topology-manager.sh @@ -76,15 +76,15 @@ while [[ $status_phase != "Running" ]]; do fi done -container_id=$(kubectl describe pod $pod_name | grep "Container ID" | awk '{print $3}' ) -container_id=${container_id#docker://} -container_id=${container_id:0:12} +uid=$(kubectl get pod pod-topology-manager -o jsonpath='{.metadata.uid}') +node_name=$(kubectl get pod $pod_name -o jsonpath='{.spec.nodeName}') +node_ip=$(kubectl get node $node_name -o jsonpath='{.status.addresses[].address}') apt-get install -y jq -cpu_core=$(cat /var/lib/kubelet/cpu_manager_state | jq -r .| grep ${container_id} | awk -F ':' '{print $2}'| awk -F '"' '{print $2}') -numa_node_number=$(lscpu | grep "NUMA node(s)" | awk -F ':' '{print $2}') +cpu_core=$(ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $node_ip -- cat /var/lib/kubelet/cpu_manager_state | jq -r --arg UID "${uid}" --arg POD_NAME "${pod_name}" '.entries[$UID][$POD_NAME]') +numa_node_number=$(ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $node_ip -- lscpu | grep "NUMA node(s)" | awk -F ':' '{print $2}') for (( node=0; node<$numa_node_number; node++ )); do - ranges=$(lscpu | grep "NUMA node"$node | awk -F ':' '{print $2}') + ranges=$(ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $node_ip -- lscpu | grep "NUMA node"$node | awk -F ':' '{print $2}') ranges=(${ranges//,/ }) for range in ${ranges[@]}; do min=$(echo $range | awk -F '-' '{print $1}') @@ -95,8 +95,8 @@ for (( node=0; node<$numa_node_number; node++ )); do done done -vf_pci=$(kubectl exec -it $pod_name env | grep PCIDEVICE_INTEL_COM_INTEL_SRIOV_700 | awk -F '=' '{print $2}' | sed 's/\r//g') -vf_numa_node=$(cat /sys/bus/pci/devices/$vf_pci/numa_node) +vf_pci=$(kubectl exec -it $pod_name -- env | grep PCIDEVICE_INTEL_COM_INTEL_SRIOV_700 | awk -F '=' '{print $2}' | sed 
's/\r//g') +vf_numa_node=$(ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null $node_ip -- cat /sys/bus/pci/devices/$vf_pci/numa_node) echo "The allocated cpu core is:" $cpu_core echo "The numa node of the allocated cpu core is:" $cpu_numa_node -- cgit 1.2.3-korg