From e86f671c8e9318309f802603e53f39cd4fd61a69 Mon Sep 17 00:00:00 2001 From: Nicolas Vazquez Date: Thu, 23 May 2019 09:13:17 -0300 Subject: [PATCH] KVM: Fix agents dont reconnect post maintenance (#3239) * Keep connection alive when on maintenance * Refactor cancel maintenance and unit tests * Add marvin tests * Refactor * Changing the way we get ssh credentials * Add check on SSH restart and improve marvin tests --- agent/src/com/cloud/agent/Agent.java | 4 +- .../com/cloud/resource/ResourceManager.java | 5 + .../com/cloud/agent/manager/AgentAttache.java | 3 +- .../src/com/cloud/configuration/Config.java | 8 - .../cloud/resource/ResourceManagerImpl.java | 96 +++-- .../resource/ResourceManagerImplTest.java | 105 ++++- .../smoke/test_host_maintenance.py | 402 ++++++++++++++++-- 7 files changed, 532 insertions(+), 91 deletions(-) diff --git a/agent/src/com/cloud/agent/Agent.java b/agent/src/com/cloud/agent/Agent.java index df0448dab22..500724dd5a3 100644 --- a/agent/src/com/cloud/agent/Agent.java +++ b/agent/src/com/cloud/agent/Agent.java @@ -606,9 +606,7 @@ public class Agent implements HandlerFactory, IAgentControl { System.exit(1); return; } else if (cmd instanceof MaintainCommand) { - s_logger.debug("Received maintainCommand"); - cancelTasks(); - _reconnectAllowed = false; + s_logger.debug("Received maintainCommand, do not cancel current tasks"); answer = new MaintainAnswer((MaintainCommand)cmd); } else if (cmd instanceof AgentControlCommand) { answer = null; diff --git a/engine/components-api/src/com/cloud/resource/ResourceManager.java b/engine/components-api/src/com/cloud/resource/ResourceManager.java index 720a980f4e7..b66f7923b4d 100755 --- a/engine/components-api/src/com/cloud/resource/ResourceManager.java +++ b/engine/components-api/src/com/cloud/resource/ResourceManager.java @@ -52,6 +52,11 @@ public interface ResourceManager extends ResourceService, Configurable { "Number of retries when preparing a host into Maintenance Mode is faulty before failing", 
true, ConfigKey.Scope.Cluster); + ConfigKey KvmSshToAgentEnabled = new ConfigKey<>("Advanced", Boolean.class, + "kvm.ssh.to.agent","true", + "Number of retries when preparing a host into Maintenance Mode is faulty before failing", + false); + /** * Register a listener for different types of resource life cycle events. * There can only be one type of listener per type of host. diff --git a/engine/orchestration/src/com/cloud/agent/manager/AgentAttache.java b/engine/orchestration/src/com/cloud/agent/manager/AgentAttache.java index a3838e1e6ef..4f03acd19fd 100644 --- a/engine/orchestration/src/com/cloud/agent/manager/AgentAttache.java +++ b/engine/orchestration/src/com/cloud/agent/manager/AgentAttache.java @@ -32,6 +32,7 @@ import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import com.cloud.agent.api.ModifySshKeysCommand; +import com.cloud.agent.api.ModifyStoragePoolCommand; import org.apache.cloudstack.managed.context.ManagedContextRunnable; import org.apache.log4j.Logger; @@ -115,7 +116,7 @@ public abstract class AgentAttache { StopCommand.class.toString(), CheckVirtualMachineCommand.class.toString(), PingTestCommand.class.toString(), CheckHealthCommand.class.toString(), ReadyCommand.class.toString(), ShutdownCommand.class.toString(), SetupCommand.class.toString(), CleanupNetworkRulesCmd.class.toString(), CheckNetworkCommand.class.toString(), PvlanSetupCommand.class.toString(), CheckOnHostCommand.class.toString(), - ModifyTargetsCommand.class.toString(), ModifySshKeysCommand.class.toString()}; + ModifyTargetsCommand.class.toString(), ModifySshKeysCommand.class.toString(), ModifyStoragePoolCommand.class.toString()}; protected final static String[] s_commandsNotAllowedInConnectingMode = new String[] { StartCommand.class.toString(), CreateCommand.class.toString() }; static { Arrays.sort(s_commandsAllowedInMaintenanceMode); diff --git a/server/src/com/cloud/configuration/Config.java b/server/src/com/cloud/configuration/Config.java 
index 98bacf2b907..098d5d7701f 100644 --- a/server/src/com/cloud/configuration/Config.java +++ b/server/src/com/cloud/configuration/Config.java @@ -1211,14 +1211,6 @@ public enum Config { KvmPublicNetwork("Hidden", ManagementServer.class, String.class, "kvm.public.network.device", null, "Specify the public bridge on host for public network", null), KvmPrivateNetwork("Hidden", ManagementServer.class, String.class, "kvm.private.network.device", null, "Specify the private bridge on host for private network", null), KvmGuestNetwork("Hidden", ManagementServer.class, String.class, "kvm.guest.network.device", null, "Specify the private bridge on host for private network", null), - KvmSshToAgentEnabled( - "Advanced", - ManagementServer.class, - Boolean.class, - "kvm.ssh.to.agent", - "true", - "Specify whether or not the management server is allowed to SSH into KVM Agents", - null), // Hyperv HypervPublicNetwork( diff --git a/server/src/com/cloud/resource/ResourceManagerImpl.java b/server/src/com/cloud/resource/ResourceManagerImpl.java index 59a7fa85d7a..34a5196d839 100755 --- a/server/src/com/cloud/resource/ResourceManagerImpl.java +++ b/server/src/com/cloud/resource/ResourceManagerImpl.java @@ -31,6 +31,7 @@ import java.util.concurrent.ConcurrentHashMap; import javax.inject.Inject; import javax.naming.ConfigurationException; +import com.cloud.utils.Pair; import com.cloud.vm.dao.UserVmDetailsDao; import org.apache.cloudstack.framework.config.ConfigKey; import org.apache.commons.lang.ObjectUtils; @@ -2344,46 +2345,77 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, } } + handleAgentIfNotConnected(host, vms_migrating); + try { resourceStateTransitTo(host, ResourceState.Event.AdminCancelMaintenance, _nodeId); _agentMgr.pullAgentOutMaintenance(hostId); retryHostMaintenance.remove(hostId); - - // for kvm, need to log into kvm host, restart cloudstack-agent - if ((host.getHypervisorType() == HypervisorType.KVM && !vms_migrating) || 
host.getHypervisorType() == HypervisorType.LXC) { - - final boolean sshToAgent = Boolean.parseBoolean(_configDao.getValue(Config.KvmSshToAgentEnabled.key())); - if (!sshToAgent) { - s_logger.info("Configuration tells us not to SSH into Agents. Please restart the Agent (" + hostId + ") manually"); - return true; - } - - _hostDao.loadDetails(host); - final String password = host.getDetail("password"); - final String username = host.getDetail("username"); - if (password == null || username == null) { - s_logger.debug("Can't find password/username"); - return false; - } - final com.trilead.ssh2.Connection connection = SSHCmdHelper.acquireAuthorizedConnection(host.getPrivateIpAddress(), 22, username, password); - if (connection == null) { - s_logger.debug("Failed to connect to host: " + host.getPrivateIpAddress()); - return false; - } - - try { - SSHCmdHelper.SSHCmdResult result = SSHCmdHelper.sshExecuteCmdOneShot(connection, "service cloudstack-agent restart"); - s_logger.debug("cloudstack-agent restart result: " + result.toString()); - } catch (final SshException e) { - return false; - } - } - - return true; } catch (final NoTransitionException e) { s_logger.debug("Cannot transmit host " + host.getId() + "to Enabled state", e); return false; } + + return true; + + } + + /** + * Handle agent (if available) if its not connected before cancelling maintenance. + * Agent must be connected before cancelling maintenance. + * If the host status is not Up: + * - If kvm.ssh.to.agent is true, then SSH into the host and restart the agent. 
+ * - If kvm.ssh.to.agent is false, then fail cancelling maintenance + */ + protected void handleAgentIfNotConnected(HostVO host, boolean vmsMigrating) { + final boolean isAgentOnHost = host.getHypervisorType() == HypervisorType.KVM || + host.getHypervisorType() == HypervisorType.LXC; + if (!isAgentOnHost || vmsMigrating || host.getStatus() == Status.Up) { + return; + } + final boolean sshToAgent = Boolean.parseBoolean(_configDao.getValue(KvmSshToAgentEnabled.key())); + if (sshToAgent) { + Pair credentials = getHostCredentials(host); + connectAndRestartAgentOnHost(host, credentials.first(), credentials.second()); + } else { + throw new CloudRuntimeException("SSH access is disabled, cannot cancel maintenance mode as " + + "host agent is not connected"); + } + } + + /** + * Get host credentials + * @throws CloudRuntimeException if username or password are not found + */ + protected Pair getHostCredentials(HostVO host) { + _hostDao.loadDetails(host); + final String password = host.getDetail("password"); + final String username = host.getDetail("username"); + if (password == null || username == null) { + throw new CloudRuntimeException("SSH to agent is enabled, but username/password credentials are not found"); + } + return new Pair<>(username, password); + } + + /** + * Connect to the host and restart the agent via SSH. 
Assumes kvm.ssh.to.agent = true and host status is not Up + */ + protected void connectAndRestartAgentOnHost(HostVO host, String username, String password) { + final com.trilead.ssh2.Connection connection = SSHCmdHelper.acquireAuthorizedConnection( + host.getPrivateIpAddress(), 22, username, password); + if (connection == null) { + throw new CloudRuntimeException("SSH to agent is enabled, but failed to connect to host: " + host.getPrivateIpAddress()); + } + try { + SSHCmdHelper.SSHCmdResult result = SSHCmdHelper.sshExecuteCmdOneShot( + connection, "service cloudstack-agent restart"); + if (result.getReturnCode() != 0) { + throw new CloudRuntimeException("Could not restart agent on host " + host.getId() + " due to: " + result.getStdErr()); + } + s_logger.debug("cloudstack-agent restart result: " + result.toString()); + } catch (final SshException e) { + throw new CloudRuntimeException("SSH to agent is enabled, but agent restart failed", e); + } } private boolean cancelMaintenance(final long hostId) { diff --git a/server/test/com/cloud/resource/ResourceManagerImplTest.java b/server/test/com/cloud/resource/ResourceManagerImplTest.java index 6d14410fc1c..7d1a0fe0163 100644 --- a/server/test/com/cloud/resource/ResourceManagerImplTest.java +++ b/server/test/com/cloud/resource/ResourceManagerImplTest.java @@ -25,13 +25,20 @@ import com.cloud.event.ActionEventUtils; import com.cloud.ha.HighAvailabilityManager; import com.cloud.host.Host; import com.cloud.host.HostVO; +import com.cloud.host.Status; import com.cloud.host.dao.HostDao; import com.cloud.hypervisor.Hypervisor; import com.cloud.storage.StorageManager; +import com.cloud.utils.Pair; +import com.cloud.utils.exception.CloudRuntimeException; import com.cloud.utils.fsm.NoTransitionException; +import com.cloud.utils.ssh.SSHCmdHelper; +import com.cloud.utils.ssh.SshException; import com.cloud.vm.VMInstanceVO; import com.cloud.vm.dao.UserVmDetailsDao; import com.cloud.vm.dao.VMInstanceDao; +import 
com.trilead.ssh2.Connection; +import org.apache.cloudstack.framework.config.dao.ConfigurationDao; import org.junit.Assert; import org.junit.Before; import org.junit.Test; @@ -56,12 +63,13 @@ import static org.mockito.Matchers.anyBoolean; import static org.mockito.Matchers.anyLong; import static org.mockito.Matchers.anyString; import static org.mockito.Matchers.eq; +import static org.mockito.Mockito.never; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; @RunWith(PowerMockRunner.class) -@PrepareForTest({ActionEventUtils.class, ResourceManagerImpl.class}) +@PrepareForTest({ActionEventUtils.class, ResourceManagerImpl.class, SSHCmdHelper.class}) public class ResourceManagerImplTest { @Mock @@ -78,6 +86,8 @@ public class ResourceManagerImplTest { private HostDao hostDao; @Mock private VMInstanceDao vmInstanceDao; + @Mock + private ConfigurationDao configurationDao; @Spy @InjectMocks @@ -99,7 +109,13 @@ public class ResourceManagerImplTest { @Mock private GetVncPortCommand getVncPortCommandVm2; + @Mock + private Connection sshConnection; + private static long hostId = 1L; + private static final String hostUsername = "user"; + private static final String hostPassword = "password"; + private static final String hostPrivateIp = "192.168.1.10"; private static long vm1Id = 1L; private static String vm1InstanceName = "i-1-VM"; @@ -117,9 +133,13 @@ public class ResourceManagerImplTest { when(host.getType()).thenReturn(Host.Type.Routing); when(host.getId()).thenReturn(hostId); when(host.getResourceState()).thenReturn(ResourceState.Enabled); - when(host.getHypervisorType()).thenReturn(Hypervisor.HypervisorType.VMware); + when(host.getHypervisorType()).thenReturn(Hypervisor.HypervisorType.KVM); when(host.getClusterId()).thenReturn(1L); when(hostDao.findById(hostId)).thenReturn(host); + when(host.getDetail("username")).thenReturn(hostUsername); + when(host.getDetail("password")).thenReturn(hostPassword); + 
when(host.getStatus()).thenReturn(Status.Up); + when(host.getPrivateIpAddress()).thenReturn(hostPrivateIp); when(vm1.getId()).thenReturn(vm1Id); when(vm2.getId()).thenReturn(vm2Id); when(vm1.getInstanceName()).thenReturn(vm1InstanceName); @@ -138,6 +158,15 @@ public class ResourceManagerImplTest { PowerMockito.whenNew(GetVncPortCommand.class).withArguments(vm2Id, vm2InstanceName).thenReturn(getVncPortCommandVm2); when(agentManager.easySend(eq(hostId), eq(getVncPortCommandVm1))).thenReturn(getVncPortAnswerVm1); when(agentManager.easySend(eq(hostId), eq(getVncPortCommandVm2))).thenReturn(getVncPortAnswerVm2); + + PowerMockito.mockStatic(SSHCmdHelper.class); + BDDMockito.given(SSHCmdHelper.acquireAuthorizedConnection(eq(hostPrivateIp), eq(22), + eq(hostUsername), eq(hostPassword))).willReturn(sshConnection); + BDDMockito.given(SSHCmdHelper.sshExecuteCmdOneShot(eq(sshConnection), + eq("service cloudstack-agent restart"))). + willReturn(new SSHCmdHelper.SSHCmdResult(0,"","")); + + when(configurationDao.getValue(ResourceManager.KvmSshToAgentEnabled.key())).thenReturn("true"); } @Test @@ -206,4 +235,76 @@ public class ResourceManagerImplTest { verify(resourceManager, times(retries + 1)).isHostInMaintenance(host, failedMigrations, new ArrayList<>(), failedMigrations); verify(resourceManager).setHostIntoErrorInMaintenance(host, failedMigrations); } + + @Test(expected = CloudRuntimeException.class) + public void testGetHostCredentialsMissingParameter() { + when(host.getDetail("password")).thenReturn(null); + resourceManager.getHostCredentials(host); + } + + @Test + public void testGetHostCredentials() { + Pair credentials = resourceManager.getHostCredentials(host); + Assert.assertNotNull(credentials); + Assert.assertEquals(hostUsername, credentials.first()); + Assert.assertEquals(hostPassword, credentials.second()); + } + + @Test(expected = CloudRuntimeException.class) + public void testConnectAndRestartAgentOnHostCannotConnect() { + 
BDDMockito.given(SSHCmdHelper.acquireAuthorizedConnection(eq(hostPrivateIp), eq(22), + eq(hostUsername), eq(hostPassword))).willReturn(null); + resourceManager.connectAndRestartAgentOnHost(host, hostUsername, hostPassword); + } + + @Test(expected = CloudRuntimeException.class) + public void testConnectAndRestartAgentOnHostCannotRestart() throws Exception { + BDDMockito.given(SSHCmdHelper.sshExecuteCmdOneShot(eq(sshConnection), + eq("service cloudstack-agent restart"))).willThrow(new SshException("exception")); + resourceManager.connectAndRestartAgentOnHost(host, hostUsername, hostPassword); + } + + @Test + public void testConnectAndRestartAgentOnHost() { + resourceManager.connectAndRestartAgentOnHost(host, hostUsername, hostPassword); + } + + @Test + public void testHandleAgentSSHEnabledNotConnectedAgent() { + when(host.getStatus()).thenReturn(Status.Disconnected); + resourceManager.handleAgentIfNotConnected(host, false); + verify(resourceManager).getHostCredentials(eq(host)); + verify(resourceManager).connectAndRestartAgentOnHost(eq(host), eq(hostUsername), eq(hostPassword)); + } + + @Test + public void testHandleAgentSSHEnabledConnectedAgent() { + when(host.getStatus()).thenReturn(Status.Up); + resourceManager.handleAgentIfNotConnected(host, false); + verify(resourceManager, never()).getHostCredentials(eq(host)); + verify(resourceManager, never()).connectAndRestartAgentOnHost(eq(host), eq(hostUsername), eq(hostPassword)); + } + + @Test(expected = CloudRuntimeException.class) + public void testHandleAgentSSHDisabledNotConnectedAgent() { + when(host.getStatus()).thenReturn(Status.Disconnected); + when(configurationDao.getValue(ResourceManager.KvmSshToAgentEnabled.key())).thenReturn("false"); + resourceManager.handleAgentIfNotConnected(host, false); + } + + @Test + public void testHandleAgentSSHDisabledConnectedAgent() { + when(host.getStatus()).thenReturn(Status.Up); + when(configurationDao.getValue(ResourceManager.KvmSshToAgentEnabled.key())).thenReturn("false"); 
+ resourceManager.handleAgentIfNotConnected(host, false); + verify(resourceManager, never()).getHostCredentials(eq(host)); + verify(resourceManager, never()).connectAndRestartAgentOnHost(eq(host), eq(hostUsername), eq(hostPassword)); + } + + @Test + public void testHandleAgentVMsMigrating() { + resourceManager.handleAgentIfNotConnected(host, true); + verify(resourceManager, never()).getHostCredentials(eq(host)); + verify(resourceManager, never()).connectAndRestartAgentOnHost(eq(host), eq(hostUsername), eq(hostPassword)); + } } diff --git a/test/integration/smoke/test_host_maintenance.py b/test/integration/smoke/test_host_maintenance.py index 7fc2139e3bb..c7cd9d3998f 100644 --- a/test/integration/smoke/test_host_maintenance.py +++ b/test/integration/smoke/test_host_maintenance.py @@ -18,15 +18,14 @@ """ # Import Local Modules -from marvin.codes import FAILED from marvin.cloudstackTestCase import * -from marvin.cloudstackAPI import * from marvin.lib.utils import * from marvin.lib.base import * -from marvin.lib.common import * +from marvin.lib.common import (get_zone, get_pod, get_template) from nose.plugins.attrib import attr - -from time import sleep +from marvin.lib.decoratorGenerators import skipTestIf +from distutils.util import strtobool +from marvin.sshClient import SshClient _multiprocess_shared_ = False @@ -45,37 +44,6 @@ class TestHostMaintenance(cloudstackTestCase): self.zone = get_zone(self.apiclient, self.testClient.getZoneForTests()) self.pod = get_pod(self.apiclient, self.zone.id) self.cleanup = [] - self.services = { - "service_offering": { - "name": "Ultra Tiny Instance", - "displaytext": "Ultra Tiny Instance", - "cpunumber": 1, - "cpuspeed": 100, - "memory": 128, - }, - "vm": { - "username": "root", - "password": "password", - "ssh_port": 22, - # Hypervisor type should be same as - # hypervisor type of cluster - "privateport": 22, - "publicport": 22, - "protocol": 'TCP', - }, - "natrule": { - "privateport": 22, - "publicport": 22, - "startport": 22, 
- "endport": 22, - "protocol": "TCP", - "cidrlist": '0.0.0.0/0', - }, - "ostype": 'CentOS 5.3 (64-bit)', - "sleep": 60, - "timeout": 10, - } - def tearDown(self): try: @@ -89,38 +57,54 @@ class TestHostMaintenance(cloudstackTestCase): def createVMs(self, hostId, number): - self.template = get_test_template( + self.template = get_template( self.apiclient, self.zone.id, self.hypervisor ) if self.template == FAILED: - assert False, "get_test_template() failed to return template" + assert False, "get_template() failed to return template" self.logger.debug("Using template %s " % self.template.id) self.service_offering = ServiceOffering.create( self.apiclient, - self.services["service_offering"] + self.services["service_offerings"]["tiny"] ) self.logger.debug("Using service offering %s " % self.service_offering.id) - + self.network_offering = NetworkOffering.create( + self.apiclient, + self.services["l2-network_offering"], + ) + self.network_offering.update(self.apiclient, state='Enabled') + self.services["network"]["networkoffering"] = self.network_offering.id + self.l2_network = Network.create( + self.apiclient, + self.services["l2-network"], + zoneid=self.zone.id, + networkofferingid=self.network_offering.id + ) + vms=[] for i in range(0, number): - self.services["vm"]["zoneid"] = self.zone.id - self.services["vm"]["template"] = self.template.id - self.services["vm"]["displayname"] = 'vm' + str(i) - self.services["vm"]["hypervisor"] = self.hypervisor + self.services["virtual_machine"]["zoneid"] = self.zone.id + self.services["virtual_machine"]["template"] = self.template.id + self.services["virtual_machine"]["displayname"] = 'vm' + str(i) + self.services["virtual_machine"]["hypervisor"] = self.hypervisor vm = VirtualMachine.create( self.apiclient, - self.services["vm"], + self.services["virtual_machine"], serviceofferingid=self.service_offering.id, + networkids=self.l2_network.id, hostid=hostId ) vms.append(vm) self.cleanup.append(vm) self.logger.debug("VM create = 
{}".format(vm.id)) + self.cleanup.append(self.l2_network) + self.cleanup.append(self.network_offering) + self.cleanup.append(self.service_offering) return vms def checkVmMigratingOnHost(self, hostId): @@ -290,3 +274,331 @@ class TestHostMaintenance(cloudstackTestCase): return +class TestHostMaintenanceAgents(cloudstackTestCase): + + @classmethod + def setUpClass(cls): + cls.testClient = super(TestHostMaintenanceAgents, cls).getClsTestClient() + cls.apiclient = cls.testClient.getApiClient() + cls.hypervisor = cls.testClient.getHypervisorInfo() + cls.dbclient = cls.testClient.getDbConnection() + cls.zone = get_zone(cls.apiclient, cls.testClient.getZoneForTests()) + cls.pod = get_pod(cls.apiclient, cls.zone.id) + cls.services = cls.testClient.getParsedTestDataConfig() + + cls.logger = logging.getLogger('TestHMAgents') + cls.stream_handler = logging.StreamHandler() + cls.logger.setLevel(logging.DEBUG) + cls.logger.addHandler(cls.stream_handler) + + cls._cleanup = [] + cls.hypervisorNotSupported = False + if cls.hypervisor.lower() not in ['kvm', 'lxc']: + cls.hypervisorNotSupported = True + + if not cls.hypervisorNotSupported: + cls.initialsshvalue = cls.is_ssh_enabled() + cls.template = get_template( + cls.apiclient, + cls.zone.id, + cls.hypervisor + ) + cls.services["virtual_machine"]["zoneid"] = cls.zone.id + cls.services["virtual_machine"]["template"] = cls.template.id + cls.services["virtual_machine"]["hypervisor"] = cls.hypervisor + cls.service_offering = ServiceOffering.create( + cls.apiclient, + cls.services["service_offerings"]["tiny"] + ) + cls._cleanup.append(cls.service_offering) + cls.network_offering = NetworkOffering.create( + cls.apiclient, + cls.services["l2-network_offering"], + ) + cls.network_offering.update(cls.apiclient, state='Enabled') + cls.services["network"]["networkoffering"] = cls.network_offering.id + cls.l2_network = Network.create( + cls.apiclient, + cls.services["l2-network"], + zoneid=cls.zone.id, + 
networkofferingid=cls.network_offering.id + ) + cls._cleanup.append(cls.l2_network) + cls._cleanup.append(cls.network_offering) + + cls.hostConfig = cls.config.__dict__["zones"][0].__dict__["pods"][0].__dict__["clusters"][0].__dict__["hosts"][0].__dict__ + + + @classmethod + def tearDownClass(cls): + try: + if not cls.hypervisorNotSupported: + # Revert setting value to the original + cls.set_ssh_enabled(cls.initialsshvalue) + cleanup_resources(cls.apiclient, cls._cleanup) + except Exception as e: + raise Exception("Warning: Exception during cleanup : %s" % e) + + def setUp(self): + if not self.hypervisorNotSupported: + self.host = self.get_enabled_host_connected_agent() + self.cleanup = [] + + def tearDown(self): + try: + cleanup_resources(self.apiclient, self.cleanup) + except Exception as e: + raise Exception("Warning: Exception during cleanup : %s" % e) + + + @classmethod + def is_ssh_enabled(cls): + conf = Configurations.list(cls.apiclient, name="kvm.ssh.to.agent") + if not conf: + return False + else: + return bool(strtobool(conf[0].value)) if conf[0].value else False + + @classmethod + def updateConfiguration(self, name, value): + cmd = updateConfiguration.updateConfigurationCmd() + cmd.name = name + cmd.value = value + self.apiclient.updateConfiguration(cmd) + + @classmethod + def set_ssh_enabled(cls, on): + value = "true" if on else "false" + cls.updateConfiguration('kvm.ssh.to.agent', value) + + def prepare_host_for_maintenance(self, hostid): + cmd = prepareHostForMaintenance.prepareHostForMaintenanceCmd() + cmd.id = hostid + self.apiclient.prepareHostForMaintenance(cmd) + self.logger.debug('Host with id %s is in prepareHostForMaintenance' % hostid) + + def wait_until_host_is_in_state(self, hostid, resourcestate, interval=3, retries=20): + def check_resource_state(): + response = Host.list( + self.apiclient, + id=hostid + ) + if isinstance(response, list): + if response[0].resourcestate == resourcestate: + self.logger.debug('Host with id %s is in resource 
state = %s' % (hostid, resourcestate)) + return True, None + return False, None + + done, _ = wait_until(interval, retries, check_resource_state) + if not done: + raise Exception("Failed to wait for host %s to be on resource state %s" % (hostid, resourcestate)) + return True + + def wait_until_agent_is_in_state(self, hostid, state, interval=3, retries=20): + def check_agent_state(): + response = Host.list( + self.apiclient, + id=hostid + ) + if isinstance(response, list): + if response[0].state == state: + self.logger.debug('Host agent with id %s is in state = %s' % (hostid, state)) + return True, None + return False, None + + done, _ = wait_until(interval, retries, check_agent_state) + if not done: + raise Exception("Failed to wait for host agent %s to be on state %s" % (hostid, state)) + return True + + def cancel_host_maintenance(self, hostid): + cmd = cancelHostMaintenance.cancelHostMaintenanceCmd() + cmd.id = hostid + self.apiclient.cancelHostMaintenance(cmd) + self.logger.debug('Host with id %s is cancelling maintenance' % hostid) + + def get_enabled_host_connected_agent(self): + hosts = Host.list( + self.apiclient, + type='Routing', + zoneid=self.zone.id, + podid=self.pod.id, + hypervisor=self.hypervisor, + resourcestate='Enabled', + state='Up' + ) + if len(hosts) < 2: + raise unittest.SkipTest("Cancel host maintenance must be tested for 2 or more hosts") + return hosts[0] + + def deploy_vm_on_host(self, hostid): + return VirtualMachine.create( + self.apiclient, + self.services["virtual_machine"], + serviceofferingid=self.service_offering.id, + networkids=self.l2_network.id, + hostid=hostid + ) + + def assert_host_is_functional_after_cancelling_maintenance(self, hostid): + self.wait_until_agent_is_in_state(hostid, "Up") + self.logger.debug('Deploying VM on host %s' % hostid) + vm = self.deploy_vm_on_host(hostid) + self.assertEqual( + vm.state, + "Running", + "Check VM is running on the host" + ) + self.cleanup.append(vm) + + def 
revert_host_state_on_failure(self, host): + cmd = updateHost.updateHostCmd() + cmd.id = host.id + cmd.allocationstate = "Enable" + response = self.apiclient.updateHost(cmd) + self.assertEqual(response.resourcestate, "Enabled") + + @skipTestIf("hypervisorNotSupported") + @attr(tags=["advanced", "advancedns", "smoke", "basic", "eip", "sg"], required_hardware="true") + def test_01_cancel_host_maintenance_ssh_enabled_agent_connected(self): + """ + Test cancel maintenance when: 'kvm.ssh.to.agent' = true, agent state = 'Up' + + 1) Put host on Maintenance + 2) Cancel maintenance on host + 4) Assert agent is still connected after cancelling maintenance + 3) Deploy VM on the host after cancelling maintenance + """ + + if not self.is_ssh_enabled(): + self.set_ssh_enabled(True) + + try: + self.prepare_host_for_maintenance(self.host.id) + self.wait_until_host_is_in_state(self.host.id, "Maintenance") + self.cancel_host_maintenance(self.host.id) + self.wait_until_host_is_in_state(self.host.id, "Enabled") + self.assert_host_is_functional_after_cancelling_maintenance(self.host.id) + except Exception as e: + self.revert_host_state_on_failure(self.host) + self.fail(e) + + def get_ssh_client(self, ip, username, password, retries=10): + """ Setup ssh client connection and return connection """ + + try: + ssh_client = SshClient(ip, 22, username, password, retries) + except Exception as e: + raise unittest.SkipTest("Unable to create ssh connection: " % e) + + self.assertIsNotNone( + ssh_client, "Failed to setup ssh connection to ip=%s" % ip) + + return ssh_client + + @skipTestIf("hypervisorNotSupported") + @attr(tags=["boris", "advancedns", "smoke", "basic", "eip", "sg"], required_hardware="true") + def test_02_cancel_host_maintenance_ssh_enabled_agent_disconnected(self): + """ + Test cancel maintenance when: 'kvm.ssh.to.agent' = true, agent state != 'Up' + + 1) Put host on maintenance + 2) SSH into host and stop cloudstack-agent service - host gets Disconnected + 3) Cancel maintenance 
on host + 4) Assert agent is connected after cancelling maintenance + 5) Deploy VM on the host + """ + + if not self.is_ssh_enabled(): + self.set_ssh_enabled(True) + # username, password = self.get_host_credentials(self.host.id) + username = self.hostConfig["username"] + password = self.hostConfig["password"] + + try: + self.prepare_host_for_maintenance(self.host.id) + self.wait_until_host_is_in_state(self.host.id, "Maintenance") + + ssh_client = self.get_ssh_client(self.host.ipaddress, self.hostConfig["username"], + self.hostConfig["password"]) + ssh_client.execute("service cloudstack-agent stop") + self.wait_until_agent_is_in_state(self.host.id, "Disconnected") + + self.cancel_host_maintenance(self.host.id) + self.wait_until_host_is_in_state(self.host.id, "Enabled") + + self.assert_host_is_functional_after_cancelling_maintenance(self.host.id) + except Exception as e: + self.revert_host_state_on_failure(self.host) + self.fail(e) + + @skipTestIf("hypervisorNotSupported") + @attr(tags=["advanced", "advancedns", "smoke", "basic", "eip", "sg"], required_hardware="true") + def test_03_cancel_host_maintenance_ssh_disabled_agent_connected(self): + """ + Test cancel maintenance when: 'kvm.ssh.to.agent' = false, agent state = 'Up' + + 1) Put host on Maintenance + 2) Cancel maintenance on host + 4) Assert agent is still connected after cancelling maintenance + 3) Deploy VM on the host after cancelling maintenance + """ + + if self.is_ssh_enabled(): + self.set_ssh_enabled(False) + + try: + self.prepare_host_for_maintenance(self.host.id) + self.wait_until_host_is_in_state(self.host.id, "Maintenance") + self.cancel_host_maintenance(self.host.id) + self.wait_until_host_is_in_state(self.host.id, "Enabled") + self.assert_host_is_functional_after_cancelling_maintenance(self.host.id) + except Exception as e: + self.revert_host_state_on_failure(self.host) + self.fail(e) + + @skipTestIf("hypervisorNotSupported") + @attr(tags=["advanced", "advancedns", "smoke", "basic", "eip", "sg"], 
required_hardware="true") + def test_04_cancel_host_maintenance_ssh_disabled_agent_disconnected(self): + """ + Test cancel maintenance when: 'kvm.ssh.to.agent' = false, agent state != 'Up' + + 1) Put host on maintenance + 2) SSH into host (if possible) and stop cloudstack-agent service - host gets Disconnected. + Skip test if not possible to SSH into host + 3) Cancel maintenance on host - assert cannot cancel maintenance on disconnected host (exception thrown) + 4) SSH into host and start cloudstack-agent service - host gets connected + 5) Cancel maintenance on host + 6) Assert agent is connected after cancelling maintenance + 7) Deploy VM on the host + """ + + if self.is_ssh_enabled(): + self.set_ssh_enabled(False) + + try: + self.prepare_host_for_maintenance(self.host.id) + self.wait_until_host_is_in_state(self.host.id, "Maintenance") + + ssh_client = self.get_ssh_client(self.host.ipaddress, self.hostConfig["username"], + self.hostConfig["password"]) + ssh_client.execute("service cloudstack-agent stop") + self.wait_until_agent_is_in_state(self.host.id, "Disconnected") + except Exception as e: + self.revert_host_state_on_failure(self.host) + self.fail(e) + + self.assertRaises(Exception, self.cancel_host_maintenance, self.host.id) + + try: + ssh_client = self.get_ssh_client(self.host.ipaddress, self.hostConfig["username"], + self.hostConfig["password"]) + ssh_client.execute("service cloudstack-agent start") + self.wait_until_agent_is_in_state(self.host.id, "Up") + + self.cancel_host_maintenance(self.host.id) + self.wait_until_host_is_in_state(self.host.id, "Enabled") + self.assert_host_is_functional_after_cancelling_maintenance(self.host.id) + except Exception as e: + self.revert_host_state_on_failure(self.host) + self.fail(e)