From 69e8ebc03fc705be91c244ebd2d44fca2c5f5593 Mon Sep 17 00:00:00 2001
From: Wei Zhou
Date: Tue, 6 Feb 2024 11:14:10 +0100
Subject: [PATCH] CKS: retry if unable to drain node or unable to upgrade k8s
 node (#8402)

* CKS: retry if unable to drain node or unable to upgrade k8s node

I tried the CKS upgrade 16 times; 11 of 16 upgrades succeeded.

2 of 16 upgrades failed with
```
error: unable to drain node "testcluster-of7974-node-18c8c33c2c3" due to error:[error when evicting pods/"cloud-controller-manager-5b8fc87665-5nwlh" -n "kube-system": Post "https://10.0.66.18:6443/api/v1/namespaces/kube-system/pods/cloud-controller-manager-5b8fc87665-5nwlh/eviction": unexpected EOF, error when evicting pods/"coredns-5d78c9869d-h5nkz" -n "kube-system": Post "https://10.0.66.18:6443/api/v1/namespaces/kube-system/pods/coredns-5d78c9869d-h5nkz/eviction": unexpected EOF], continuing command...
```

3 of 16 upgrades failed with
```
Error from server: error when retrieving current configuration of:
Resource: "rbac.authorization.k8s.io/v1, Resource=roles", GroupVersionKind: "rbac.authorization.k8s.io/v1, Kind=Role"
Name: "kubernetes-dashboard", Namespace: "kubernetes-dashboard"
from server for: "/mnt/k8sdisk//dashboard.yaml": etcdserver: leader changed
```

* CKS: remove tests of creating/deleting HA clusters, as they are covered by the upgrade test

* Update PR 8402 as suggested

* test: remove the CKS cluster if it fails to create or verify
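Both failure modes above are transient, so the fix wraps the node-drain and node-upgrade steps in a bounded retry loop controlled by the new `cloud.kubernetes.cluster.upgrade.retries` setting (default 3, i.e. at most 4 attempts per step); intermediate failures are only logged, and the cluster is failed only when the final attempt also fails. A minimal self-contained sketch of that pattern follows; the `runWithRetries` helper and its names are illustrative only, not part of this patch, and the real code also retries when the SSH command returns an unsuccessful result, not just on exceptions:

```java
// Illustrative sketch only (not part of this patch): a step is attempted
// (retries + 1) times; transient failures are logged, the last one escalates.
import java.util.concurrent.Callable;

public class RetrySketch {

    static <T> T runWithRetries(Callable<T> step, int retries, String errorMessage) throws Exception {
        for (int retry = retries; retry >= 0; retry--) {
            try {
                return step.call();                      // success: stop retrying
            } catch (Exception e) {
                if (retry > 0) {
                    // intermediate failure: log and try again
                    System.err.printf("%s due to %s, retries left: %d%n", errorMessage, e, retry);
                } else {
                    throw e;                             // retries exhausted: fail the upgrade
                }
            }
        }
        throw new IllegalStateException("unreachable");  // loop always returns or throws
    }

    public static void main(String[] args) throws Exception {
        // With retries = 3 (the default of cloud.kubernetes.cluster.upgrade.retries),
        // a flaky step is attempted up to 4 times before the upgrade is failed.
        String out = runWithRetries(() -> "node drained", 3, "unable to drain node");
        System.out.println(out);
    }
}
```

Because the setting is a dynamic global setting, the retry count can be raised at runtime for environments that need more headroom, while the existing upgrade timeout still bounds the total time spent.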
---
 .../cluster/KubernetesClusterManagerImpl.java |  1 +
 .../cluster/KubernetesClusterService.java     |  6 ++
 .../KubernetesClusterUpgradeWorker.java       | 61 +++++++++++++------
 .../smoke/test_kubernetes_clusters.py         | 48 +++------------
 4 files changed, 59 insertions(+), 57 deletions(-)

diff --git a/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterManagerImpl.java b/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterManagerImpl.java
index 19c66c1ba0b..4bd6ec65fc3 100644
--- a/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterManagerImpl.java
+++ b/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterManagerImpl.java
@@ -1647,6 +1647,7 @@ public class KubernetesClusterManagerImpl extends ManagerBase implements Kuberne
                 KubernetesClusterStartTimeout,
                 KubernetesClusterScaleTimeout,
                 KubernetesClusterUpgradeTimeout,
+                KubernetesClusterUpgradeRetries,
                 KubernetesClusterExperimentalFeaturesEnabled,
                 KubernetesMaxClusterSize
         };
diff --git a/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterService.java b/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterService.java
index 420f3552720..12240a47eca 100644
--- a/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterService.java
+++ b/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterService.java
@@ -65,6 +65,12 @@ public interface KubernetesClusterService extends PluggableService, Configurable
             "Timeout interval (in seconds) in which upgrade operation for a Kubernetes cluster should be completed. Not strictly obeyed while upgrade is in progress on a node",
             true,
             KubernetesServiceEnabled.key());
+    static final ConfigKey<Integer> KubernetesClusterUpgradeRetries = new ConfigKey<Integer>("Advanced", Integer.class,
+            "cloud.kubernetes.cluster.upgrade.retries",
+            "3",
+            "The number of retries if fail to upgrade kubernetes cluster due to some reasons (e.g. drain node, etcdserver leader changed)",
+            true,
+            KubernetesServiceEnabled.key());
     static final ConfigKey<Boolean> KubernetesClusterExperimentalFeaturesEnabled = new ConfigKey<Boolean>("Advanced", Boolean.class,
             "cloud.kubernetes.cluster.experimental.features.enabled",
             "false",
diff --git a/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/actionworkers/KubernetesClusterUpgradeWorker.java b/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/actionworkers/KubernetesClusterUpgradeWorker.java
index 14f5760d5ae..d418e20f58f 100644
--- a/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/actionworkers/KubernetesClusterUpgradeWorker.java
+++ b/plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/actionworkers/KubernetesClusterUpgradeWorker.java
@@ -77,39 +77,62 @@ public class KubernetesClusterUpgradeWorker extends KubernetesClusterActionWorke
     }

     private void upgradeKubernetesClusterNodes() {
-        Pair<Boolean, String> result = null;
         for (int i = 0; i < clusterVMs.size(); ++i) {
             UserVm vm = clusterVMs.get(i);
             String hostName = vm.getHostName();
             if (StringUtils.isNotEmpty(hostName)) {
                 hostName = hostName.toLowerCase();
             }
-            result = null;
+            Pair<Boolean, String> result;
             if (LOGGER.isInfoEnabled()) {
                 LOGGER.info(String.format("Upgrading node on VM %s in Kubernetes cluster %s with Kubernetes version(%s) ID: %s",
                     vm.getDisplayName(), kubernetesCluster.getName(), upgradeVersion.getSemanticVersion(), upgradeVersion.getUuid()));
             }
-            try {
-                result = SshHelper.sshExecute(publicIpAddress, sshPort, getControlNodeLoginUser(), sshKeyFile, null,
-                        String.format("sudo /opt/bin/kubectl drain %s --ignore-daemonsets --delete-emptydir-data", hostName),
-                        10000, 10000, 60000);
-            } catch (Exception e) {
-                logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to drain Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e);
-            }
-            if (!result.first()) {
-                logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to drain Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
+            String errorMessage = String.format("Failed to upgrade Kubernetes cluster : %s, unable to drain Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName());
+            for (int retry = KubernetesClusterService.KubernetesClusterUpgradeRetries.value(); retry >= 0; retry--) {
+                try {
+                    result = SshHelper.sshExecute(publicIpAddress, sshPort, getControlNodeLoginUser(), sshKeyFile, null,
+                            String.format("sudo /opt/bin/kubectl drain %s --ignore-daemonsets --delete-emptydir-data", hostName),
+                            10000, 10000, 60000);
+                    if (result.first()) {
+                        break;
+                    }
+                    if (retry > 0) {
+                        LOGGER.error(String.format("%s, retries left: %s", errorMessage, retry));
+                    } else {
+                        logTransitStateDetachIsoAndThrow(Level.ERROR, errorMessage, kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
+                    }
+                } catch (Exception e) {
+                    if (retry > 0) {
+                        LOGGER.error(String.format("%s due to %s, retries left: %s", errorMessage, e, retry));
+                    } else {
+                        logTransitStateDetachIsoAndThrow(Level.ERROR, errorMessage, kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e);
+                    }
+                }
             }
             if (System.currentTimeMillis() > upgradeTimeoutTime) {
                 logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, upgrade action timed out", kubernetesCluster.getName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
             }
-            try {
-                deployProvider();
-                result = runInstallScriptOnVM(vm, i);
-            } catch (Exception e) {
-                logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to upgrade Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e);
-            }
-            if (!result.first()) {
-                logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to upgrade Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
+            errorMessage = String.format("Failed to upgrade Kubernetes cluster : %s, unable to upgrade Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName());
+            for (int retry = KubernetesClusterService.KubernetesClusterUpgradeRetries.value(); retry >= 0; retry--) {
+                try {
+                    deployProvider();
+                    result = runInstallScriptOnVM(vm, i);
+                    if (result.first()) {
+                        break;
+                    }
+                    if (retry > 0) {
+                        LOGGER.error(String.format("%s, retries left: %s", errorMessage, retry));
+                    } else {
+                        logTransitStateDetachIsoAndThrow(Level.ERROR, errorMessage, kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
+                    }
+                } catch (Exception e) {
+                    if (retry > 0) {
+                        LOGGER.error(String.format("%s due to %s, retries left: %s", errorMessage, e, retry));
+                    } else {
+                        logTransitStateDetachIsoAndThrow(Level.ERROR, errorMessage, kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e);
+                    }
+                }
             }
             if (System.currentTimeMillis() > upgradeTimeoutTime) {
                 logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, upgrade action timed out", kubernetesCluster.getName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
diff --git a/test/integration/smoke/test_kubernetes_clusters.py b/test/integration/smoke/test_kubernetes_clusters.py
index dc8d42f444c..0c5b96a7712 100644
--- a/test/integration/smoke/test_kubernetes_clusters.py
+++ b/test/integration/smoke/test_kubernetes_clusters.py
@@ -278,13 +278,15 @@ class TestKubernetesCluster(cloudstackTestCase):
         cls.apiclient.deleteKubernetesSupportedVersion(deleteKubernetesSupportedVersionCmd)

     @classmethod
-    def listKubernetesCluster(cls, cluster_id = None):
+    def listKubernetesCluster(cls, cluster_id = None, cluster_name = None):
         listKubernetesClustersCmd = listKubernetesClusters.listKubernetesClustersCmd()
         listKubernetesClustersCmd.listall = True
         if cluster_id != None:
             listKubernetesClustersCmd.id = cluster_id
+        if cluster_name != None:
+            listKubernetesClustersCmd.name = cluster_name
         clusterResponse = cls.apiclient.listKubernetesClusters(listKubernetesClustersCmd)
-        if cluster_id != None and clusterResponse != None:
+        if (cluster_id != None or cluster_name != None) and clusterResponse != None:
             return clusterResponse[0]
         return clusterResponse
@@ -523,24 +525,6 @@ class TestKubernetesCluster(cloudstackTestCase):

         return

-    @attr(tags=["advanced", "smoke"], required_hardware="true")
-    @skipTestIf("hypervisorNotSupported")
-    def test_07_deploy_kubernetes_ha_cluster(self):
-        """Test to deploy a new HA Kubernetes cluster
-
-        # Validate the following:
-        # 1. createKubernetesCluster should return valid info for new cluster
-        # 2. The Cloud Database contains the valid information
-        """
-        if self.setup_failed == True:
-            self.fail("Setup incomplete")
-        if self.default_network:
-            self.skipTest("HA cluster on shared network requires external ip address, skipping it")
-        global k8s_cluster
-        k8s_cluster = self.getValidKubernetesCluster(1, 3)
-        self.debug("HA Kubernetes cluster with ID: %s successfully deployed" % k8s_cluster.id)
-        return
-
     @attr(tags=["advanced", "smoke"], required_hardware="true")
     @skipTestIf("hypervisorNotSupported")
     def test_08_upgrade_kubernetes_ha_cluster(self):
@@ -568,24 +552,6 @@ class TestKubernetesCluster(cloudstackTestCase):
         self.debug("Kubernetes cluster with ID: %s successfully upgraded" % k8s_cluster.id)
         return

-    @attr(tags=["advanced", "smoke"], required_hardware="true")
-    @skipTestIf("hypervisorNotSupported")
-    def test_09_delete_kubernetes_ha_cluster(self):
-        """Test to delete a HA Kubernetes cluster
-
-        # Validate the following:
-        # 1. deleteKubernetesCluster should delete an existing HA Kubernetes cluster
-        """
-        if self.setup_failed == True:
-            self.fail("Setup incomplete")
-        if self.default_network:
-            self.skipTest("HA cluster on shared network requires external ip address, skipping it")
-        global k8s_cluster
-        k8s_cluster = self.getValidKubernetesCluster(1, 3)
-
-        self.debug("Deleting Kubernetes cluster with ID: %s" % k8s_cluster.id)
-        return
-
     @attr(tags=["advanced", "smoke"], required_hardware="true")
     @skipTestIf("hypervisorNotSupported")
     def test_10_vpc_tier_kubernetes_cluster(self):
@@ -739,8 +705,14 @@ class TestKubernetesCluster(cloudstackTestCase):
             cluster = self.createKubernetesCluster(name, version.id, size, control_nodes)
             self.verifyKubernetesCluster(cluster, name, version.id, size, control_nodes)
         except Exception as ex:
+            cluster = self.listKubernetesCluster(cluster_name = name)
+            if cluster != None:
+                self.deleteKubernetesClusterAndVerify(cluster.id, False, True)
             self.fail("Kubernetes cluster deployment failed: %s" % ex)
         except AssertionError as err:
+            cluster = self.listKubernetesCluster(cluster_name = name)
+            if cluster != None:
+                self.deleteKubernetesClusterAndVerify(cluster.id, False, True)
             self.fail("Kubernetes cluster deployment failed during cluster verification: %s" % err)
         return cluster