Mirror of https://github.com/apache/cloudstack.git
CKS: retry if unable to drain node or unable to upgrade k8s node (#8402)

* CKS: retry if unable to drain node or unable to upgrade k8s node

  I tried CKS upgrade 16 times, 11 of 16 upgrades succeeded.

  2 of 16 upgrades failed due to

  ```
  error: unable to drain node "testcluster-of7974-node-18c8c33c2c3" due to error:[error when evicting pods/"cloud-controller-manager-5b8fc87665-5nwlh" -n "kube-system": Post "https://10.0.66.18:6443/api/v1/namespaces/kube-system/pods/cloud-controller-manager-5b8fc87665-5nwlh/eviction": unexpected EOF, error when evicting pods/"coredns-5d78c9869d-h5nkz" -n "kube-system": Post "https://10.0.66.18:6443/api/v1/namespaces/kube-system/pods/coredns-5d78c9869d-h5nkz/eviction": unexpected EOF], continuing command...
  ```

  3 of 16 upgrades failed due to

  ```
  Error from server: error when retrieving current configuration of:
  Resource: "rbac.authorization.k8s.io/v1, Resource=roles", GroupVersionKind: "rbac.authorization.k8s.io/v1, Kind=Role"
  Name: "kubernetes-dashboard", Namespace: "kubernetes-dashboard"
  from server for: "/mnt/k8sdisk//dashboard.yaml": etcdserver: leader changed
  ```

* CKS: remove tests of creating/deleting HA clusters as they are covered by the upgrade test

* Update PR 8402 as suggested

* test: remove CKS cluster if fail to create or verify
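Both failure modes quoted above look transient: the eviction calls die with `unexpected EOF` while control-plane pods are being moved, and `etcdserver: leader changed` resolves once a new leader is elected, so repeating the step usually succeeds. The change therefore wraps the two per-node steps, the `kubectl drain` and the node upgrade script, in a bounded retry loop driven by a new global setting, `cloud.kubernetes.cluster.upgrade.retries`. A minimal sketch of that loop shape, assuming a step that either returns a success flag or throws; class and method names here are illustrative, not the committed code:

```java
// Sketch only: bounded retry around a step that can fail softly (returns false)
// or hard (throws). The committed worker writes the two loops out inline and,
// as the helper's name suggests, calls logTransitStateDetachIsoAndThrow() on
// the final failure so the cluster state is transitioned and the ISO detached.
import java.util.concurrent.Callable;

public class RetrySketch {

    static void retryOrFail(int retries, String errorMessage, Callable<Boolean> step) {
        Exception lastError = null;
        for (int retry = retries; retry >= 0; retry--) {
            try {
                if (Boolean.TRUE.equals(step.call())) {
                    return;                       // step succeeded, stop retrying
                }
                lastError = null;                 // soft failure, no exception
            } catch (Exception e) {
                lastError = e;                    // hard failure, remember the cause
            }
            if (retry > 0) {
                System.err.println(errorMessage
                        + (lastError != null ? " due to " + lastError : "")
                        + ", retries left: " + retry);
            }
        }
        throw new RuntimeException(errorMessage, lastError);  // retries exhausted
    }

    public static void main(String[] args) {
        // Example: a step that fails twice and then succeeds.
        int[] calls = {0};
        retryOrFail(3, "unable to drain node", () -> ++calls[0] >= 3);
        System.out.println("drained after " + calls[0] + " attempts");
    }
}
```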
parent b2e29931e8
commit 69e8ebc03f
@@ -1647,6 +1647,7 @@ public class KubernetesClusterManagerImpl extends ManagerBase implements Kuberne
            KubernetesClusterStartTimeout,
            KubernetesClusterScaleTimeout,
            KubernetesClusterUpgradeTimeout,
            KubernetesClusterUpgradeRetries,
            KubernetesClusterExperimentalFeaturesEnabled,
            KubernetesMaxClusterSize
        };

@@ -65,6 +65,12 @@ public interface KubernetesClusterService extends PluggableService, Configurable
            "Timeout interval (in seconds) in which upgrade operation for a Kubernetes cluster should be completed. Not strictly obeyed while upgrade is in progress on a node",
            true,
            KubernetesServiceEnabled.key());
    static final ConfigKey<Integer> KubernetesClusterUpgradeRetries = new ConfigKey<Integer>("Advanced", Integer.class,
            "cloud.kubernetes.cluster.upgrade.retries",
            "3",
            "The number of retries if fail to upgrade kubernetes cluster due to some reasons (e.g. drain node, etcdserver leader changed)",
            true,
            KubernetesServiceEnabled.key());
    static final ConfigKey<Boolean> KubernetesClusterExperimentalFeaturesEnabled = new ConfigKey<Boolean>("Advanced", Boolean.class,
            "cloud.kubernetes.cluster.experimental.features.enabled",
            "false",

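The new `ConfigKey` sits next to the existing CKS settings and reuses the same trailing arguments (`true` and `KubernetesServiceEnabled.key()`) as the timeout keys above it; the upgrade worker reads it with `KubernetesClusterUpgradeRetries.value()` at the start of each retry loop. One detail worth spelling out: the loop counts down from the configured value to zero inclusive, so the default of "3" means up to four attempts per step, not three. A trivial check of that arithmetic (illustration only, not project code):

```java
// Illustration only: how the configured retry count maps to attempts.
// The worker's loops run "for (retry = value; retry >= 0; retry--)", so the
// default of "3" allows one initial attempt plus three retries.
public class RetryCountDemo {
    public static void main(String[] args) {
        int configuredRetries = 3;   // default of cloud.kubernetes.cluster.upgrade.retries
        int attempts = 0;
        for (int retry = configuredRetries; retry >= 0; retry--) {
            attempts++;              // one drain (or install-script) attempt per pass
        }
        System.out.println("attempts = " + attempts);   // prints 4
    }
}
```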
@@ -77,39 +77,62 @@ public class KubernetesClusterUpgradeWorker extends KubernetesClusterActionWorke
    }

    private void upgradeKubernetesClusterNodes() {
        Pair<Boolean, String> result = null;
        for (int i = 0; i < clusterVMs.size(); ++i) {
            UserVm vm = clusterVMs.get(i);
            String hostName = vm.getHostName();
            if (StringUtils.isNotEmpty(hostName)) {
                hostName = hostName.toLowerCase();
            }
            result = null;
            Pair<Boolean, String> result;
            if (LOGGER.isInfoEnabled()) {
                LOGGER.info(String.format("Upgrading node on VM %s in Kubernetes cluster %s with Kubernetes version(%s) ID: %s",
                        vm.getDisplayName(), kubernetesCluster.getName(), upgradeVersion.getSemanticVersion(), upgradeVersion.getUuid()));
            }
            try {
                result = SshHelper.sshExecute(publicIpAddress, sshPort, getControlNodeLoginUser(), sshKeyFile, null,
                        String.format("sudo /opt/bin/kubectl drain %s --ignore-daemonsets --delete-emptydir-data", hostName),
                        10000, 10000, 60000);
            } catch (Exception e) {
                logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to drain Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e);
            }
            if (!result.first()) {
                logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to drain Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
            String errorMessage = String.format("Failed to upgrade Kubernetes cluster : %s, unable to drain Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName());
            for (int retry = KubernetesClusterService.KubernetesClusterUpgradeRetries.value(); retry >= 0; retry--) {
                try {
                    result = SshHelper.sshExecute(publicIpAddress, sshPort, getControlNodeLoginUser(), sshKeyFile, null,
                            String.format("sudo /opt/bin/kubectl drain %s --ignore-daemonsets --delete-emptydir-data", hostName),
                            10000, 10000, 60000);
                    if (result.first()) {
                        break;
                    }
                    if (retry > 0) {
                        LOGGER.error(String.format("%s, retries left: %s", errorMessage, retry));
                    } else {
                        logTransitStateDetachIsoAndThrow(Level.ERROR, errorMessage, kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
                    }
                } catch (Exception e) {
                    if (retry > 0) {
                        LOGGER.error(String.format("%s due to %s, retries left: %s", errorMessage, e, retry));
                    } else {
                        logTransitStateDetachIsoAndThrow(Level.ERROR, errorMessage, kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e);
                    }
                }
            }
            if (System.currentTimeMillis() > upgradeTimeoutTime) {
                logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, upgrade action timed out", kubernetesCluster.getName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
            }
            try {
                deployProvider();
                result = runInstallScriptOnVM(vm, i);
            } catch (Exception e) {
                logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to upgrade Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e);
            }
            if (!result.first()) {
                logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to upgrade Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
            errorMessage = String.format("Failed to upgrade Kubernetes cluster : %s, unable to upgrade Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName());
            for (int retry = KubernetesClusterService.KubernetesClusterUpgradeRetries.value(); retry >= 0; retry--) {
                try {
                    deployProvider();
                    result = runInstallScriptOnVM(vm, i);
                    if (result.first()) {
                        break;
                    }
                    if (retry > 0) {
                        LOGGER.error(String.format("%s, retries left: %s", errorMessage, retry));
                    } else {
                        logTransitStateDetachIsoAndThrow(Level.ERROR, errorMessage, kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
                    }
                } catch (Exception e) {
                    if (retry > 0) {
                        LOGGER.error(String.format("%s due to %s, retries left: %s", errorMessage, e, retry));
                    } else {
                        logTransitStateDetachIsoAndThrow(Level.ERROR, errorMessage, kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e);
                    }
                }
            }
            if (System.currentTimeMillis() > upgradeTimeoutTime) {
                logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, upgrade action timed out", kubernetesCluster.getName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);

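One consequence of the placement above: the `upgradeTimeoutTime` check runs only after a retry loop finishes, never between attempts, so a persistently failing node exhausts all of its attempts before the timeout can trip. Assuming the 60000 ms argument passed to `SshHelper.sshExecute` is the per-command wait (an assumption, not something stated in the diff), a back-of-the-envelope worst case for one node's drain loop with the default setting:

```java
// Rough worst-case estimate, illustration only. Assumes the 60000 ms argument
// to SshHelper.sshExecute is the per-command wait and that every attempt runs
// to that limit; real drains normally return much sooner.
public class DrainWorstCase {
    public static void main(String[] args) {
        int retries = 3;                 // cloud.kubernetes.cluster.upgrade.retries default
        long perAttemptMs = 60_000L;     // drain command wait passed to sshExecute
        long worstCaseMs = (retries + 1) * perAttemptMs;
        System.out.println("worst case per node: " + worstCaseMs / 1000 + " s"); // 240 s
    }
}
```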
@@ -278,13 +278,15 @@ class TestKubernetesCluster(cloudstackTestCase):
        cls.apiclient.deleteKubernetesSupportedVersion(deleteKubernetesSupportedVersionCmd)

    @classmethod
    def listKubernetesCluster(cls, cluster_id = None):
    def listKubernetesCluster(cls, cluster_id = None, cluster_name = None):
        listKubernetesClustersCmd = listKubernetesClusters.listKubernetesClustersCmd()
        listKubernetesClustersCmd.listall = True
        if cluster_id != None:
            listKubernetesClustersCmd.id = cluster_id
        if cluster_name != None:
            listKubernetesClustersCmd.name = cluster_name
        clusterResponse = cls.apiclient.listKubernetesClusters(listKubernetesClustersCmd)
        if cluster_id != None and clusterResponse != None:
        if (cluster_id != None or cluster_name != None) and clusterResponse != None:
            return clusterResponse[0]
        return clusterResponse

@@ -523,24 +525,6 @@ class TestKubernetesCluster(cloudstackTestCase):

        return

    @attr(tags=["advanced", "smoke"], required_hardware="true")
    @skipTestIf("hypervisorNotSupported")
    def test_07_deploy_kubernetes_ha_cluster(self):
        """Test to deploy a new HA Kubernetes cluster

        # Validate the following:
        # 1. createKubernetesCluster should return valid info for new cluster
        # 2. The Cloud Database contains the valid information
        """
        if self.setup_failed == True:
            self.fail("Setup incomplete")
        if self.default_network:
            self.skipTest("HA cluster on shared network requires external ip address, skipping it")
        global k8s_cluster
        k8s_cluster = self.getValidKubernetesCluster(1, 3)
        self.debug("HA Kubernetes cluster with ID: %s successfully deployed" % k8s_cluster.id)
        return

    @attr(tags=["advanced", "smoke"], required_hardware="true")
    @skipTestIf("hypervisorNotSupported")
    def test_08_upgrade_kubernetes_ha_cluster(self):
@@ -568,24 +552,6 @@ class TestKubernetesCluster(cloudstackTestCase):
        self.debug("Kubernetes cluster with ID: %s successfully upgraded" % k8s_cluster.id)
        return

    @attr(tags=["advanced", "smoke"], required_hardware="true")
    @skipTestIf("hypervisorNotSupported")
    def test_09_delete_kubernetes_ha_cluster(self):
        """Test to delete a HA Kubernetes cluster

        # Validate the following:
        # 1. deleteKubernetesCluster should delete an existing HA Kubernetes cluster
        """
        if self.setup_failed == True:
            self.fail("Setup incomplete")
        if self.default_network:
            self.skipTest("HA cluster on shared network requires external ip address, skipping it")
        global k8s_cluster
        k8s_cluster = self.getValidKubernetesCluster(1, 3)

        self.debug("Deleting Kubernetes cluster with ID: %s" % k8s_cluster.id)
        return

    @attr(tags=["advanced", "smoke"], required_hardware="true")
    @skipTestIf("hypervisorNotSupported")
    def test_10_vpc_tier_kubernetes_cluster(self):
@@ -739,8 +705,14 @@ class TestKubernetesCluster(cloudstackTestCase):
            cluster = self.createKubernetesCluster(name, version.id, size, control_nodes)
            self.verifyKubernetesCluster(cluster, name, version.id, size, control_nodes)
        except Exception as ex:
            cluster = self.listKubernetesCluster(cluster_name = name)
            if cluster != None:
                self.deleteKubernetesClusterAndVerify(cluster.id, False, True)
            self.fail("Kubernetes cluster deployment failed: %s" % ex)
        except AssertionError as err:
            cluster = self.listKubernetesCluster(cluster_name = name)
            if cluster != None:
                self.deleteKubernetesClusterAndVerify(cluster.id, False, True)
            self.fail("Kubernetes cluster deployment failed during cluster verification: %s" % err)
        return cluster
