From fc44df7c95e50ceefb89cc18b1af088d1142fd3c Mon Sep 17 00:00:00 2001 From: Wei Zhou Date: Sat, 9 Dec 2023 07:03:05 +0100 Subject: [PATCH] CKS: create HA cluster with 3 control VMs instead of 2 (#8297) This PR fixes the test failures with CKS HA-cluster upgrade. In production, the CKS HA cluster should have at least 3 control VMs as well. The etcd cluster requires 3 members to achieve reliable HA. The etcd daemon in control VMs uses the Raft protocol to determine the roles of nodes. During an upgrade of a CKS HA cluster, etcd becomes unreliable if there are only 2 control VMs. --- .../actionworkers/KubernetesClusterUpgradeWorker.java | 2 +- .../src/main/resources/script/upgrade-kubernetes.sh | 2 +- test/integration/smoke/test_kubernetes_clusters.py | 10 +++++----- ui/src/views/compute/CreateKubernetesCluster.vue | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/actionworkers/KubernetesClusterUpgradeWorker.java b/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/actionworkers/KubernetesClusterUpgradeWorker.java index 9b7b6ca47c2..14f5760d5ae 100644 --- a/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/actionworkers/KubernetesClusterUpgradeWorker.java +++ b/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/actionworkers/KubernetesClusterUpgradeWorker.java @@ -91,7 +91,7 @@ public class KubernetesClusterUpgradeWorker extends KubernetesClusterActionWorke } try { result = SshHelper.sshExecute(publicIpAddress, sshPort, getControlNodeLoginUser(), sshKeyFile, null, - String.format("sudo /opt/bin/kubectl drain %s --ignore-daemonsets --delete-local-data", hostName), + String.format("sudo /opt/bin/kubectl drain %s --ignore-daemonsets --delete-emptydir-data", hostName), 10000, 10000, 60000); } catch (Exception e) { logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to 
upgrade Kubernetes cluster : %s, unable to drain Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e); diff --git a/plugins/integrations/kubernetes-service/src/main/resources/script/upgrade-kubernetes.sh b/plugins/integrations/kubernetes-service/src/main/resources/script/upgrade-kubernetes.sh index 84f764de93e..c092f53359d 100755 --- a/plugins/integrations/kubernetes-service/src/main/resources/script/upgrade-kubernetes.sh +++ b/plugins/integrations/kubernetes-service/src/main/resources/script/upgrade-kubernetes.sh @@ -137,7 +137,7 @@ if [ -d "$BINARIES_DIR" ]; then systemctl stop kubelet cp -a ${BINARIES_DIR}/k8s/{kubelet,kubectl} /opt/bin - chmod +x {kubelet,kubectl} + chmod +x /opt/bin/{kubelet,kubectl} systemctl daemon-reload systemctl restart containerd diff --git a/test/integration/smoke/test_kubernetes_clusters.py b/test/integration/smoke/test_kubernetes_clusters.py index e53f71871d8..dc8d42f444c 100644 --- a/test/integration/smoke/test_kubernetes_clusters.py +++ b/test/integration/smoke/test_kubernetes_clusters.py @@ -526,7 +526,7 @@ class TestKubernetesCluster(cloudstackTestCase): @attr(tags=["advanced", "smoke"], required_hardware="true") @skipTestIf("hypervisorNotSupported") def test_07_deploy_kubernetes_ha_cluster(self): - """Test to deploy a new Kubernetes cluster + """Test to deploy a new HA Kubernetes cluster # Validate the following: # 1. 
createKubernetesCluster should return valid info for new cluster @@ -537,14 +537,14 @@ class TestKubernetesCluster(cloudstackTestCase): if self.default_network: self.skipTest("HA cluster on shared network requires external ip address, skipping it") global k8s_cluster - k8s_cluster = self.getValidKubernetesCluster(1, 2) + k8s_cluster = self.getValidKubernetesCluster(1, 3) self.debug("HA Kubernetes cluster with ID: %s successfully deployed" % k8s_cluster.id) return @attr(tags=["advanced", "smoke"], required_hardware="true") @skipTestIf("hypervisorNotSupported") def test_08_upgrade_kubernetes_ha_cluster(self): - """Test to upgrade a Kubernetes cluster to newer version + """Test to upgrade a HA Kubernetes cluster to newer version # Validate the following: # 1. upgradeKubernetesCluster should return valid info for the cluster @@ -554,7 +554,7 @@ class TestKubernetesCluster(cloudstackTestCase): if self.default_network: self.skipTest("HA cluster on shared network requires external ip address, skipping it") global k8s_cluster - k8s_cluster = self.getValidKubernetesCluster(1, 2, version=self.kubernetes_version_v1) + k8s_cluster = self.getValidKubernetesCluster(1, 3, version=self.kubernetes_version_v1) time.sleep(self.services["sleep"]) self.debug("Upgrading HA Kubernetes cluster with ID: %s" % k8s_cluster.id) @@ -581,7 +581,7 @@ class TestKubernetesCluster(cloudstackTestCase): if self.default_network: self.skipTest("HA cluster on shared network requires external ip address, skipping it") global k8s_cluster - k8s_cluster = self.getValidKubernetesCluster(1, 2) + k8s_cluster = self.getValidKubernetesCluster(1, 3) self.debug("Deleting Kubernetes cluster with ID: %s" % k8s_cluster.id) return diff --git a/ui/src/views/compute/CreateKubernetesCluster.vue b/ui/src/views/compute/CreateKubernetesCluster.vue index 1c70dece6e1..908fc58013e 100644 --- a/ui/src/views/compute/CreateKubernetesCluster.vue +++ b/ui/src/views/compute/CreateKubernetesCluster.vue @@ -278,7 +278,7 @@ export 
default { initForm () { this.formRef = ref() this.form = reactive({ - controlnodes: 2, + controlnodes: 3, size: 1, noderootdisksize: 8 })