From 3b11663d87e3f3e643d37e1ba1338e9402a2622a Mon Sep 17 00:00:00 2001 From: Vishesh Date: Thu, 26 Oct 2023 20:24:36 +0530 Subject: [PATCH 1/3] Fix failure on agent reconnection (#8089) --- .../src/main/java/com/cloud/agent/Agent.java | 4 + .../java/com/cloud/agent/api/PingAnswer.java | 13 +- .../cloud/agent/manager/AgentManagerImpl.java | 154 +++++++++++------- test/integration/smoke/test_host_ping.py | 102 ++++++++++++ 4 files changed, 216 insertions(+), 57 deletions(-) create mode 100644 test/integration/smoke/test_host_ping.py diff --git a/agent/src/main/java/com/cloud/agent/Agent.java b/agent/src/main/java/com/cloud/agent/Agent.java index e9213ca9b8c..e8558931cbf 100644 --- a/agent/src/main/java/com/cloud/agent/Agent.java +++ b/agent/src/main/java/com/cloud/agent/Agent.java @@ -40,6 +40,7 @@ import java.util.concurrent.atomic.AtomicInteger; import javax.naming.ConfigurationException; +import com.cloud.agent.api.PingAnswer; import com.cloud.utils.NumbersUtil; import org.apache.cloudstack.agent.lb.SetupMSListAnswer; import org.apache.cloudstack.agent.lb.SetupMSListCommand; @@ -822,6 +823,9 @@ public class Agent implements HandlerFactory, IAgentControl { listener.processControlResponse(response, (AgentControlAnswer)answer); } } + } else if (answer instanceof PingAnswer && (((PingAnswer) answer).isSendStartup()) && _reconnectAllowed) { + s_logger.info("Management server requested startup command to reinitialize the agent"); + sendStartup(link); } else { setLastPingResponseTime(); } diff --git a/core/src/main/java/com/cloud/agent/api/PingAnswer.java b/core/src/main/java/com/cloud/agent/api/PingAnswer.java index 35242380739..6353b121583 100644 --- a/core/src/main/java/com/cloud/agent/api/PingAnswer.java +++ b/core/src/main/java/com/cloud/agent/api/PingAnswer.java @@ -22,15 +22,26 @@ package com.cloud.agent.api; public class PingAnswer extends Answer { private PingCommand _command = null; + private boolean sendStartup = false; + protected PingAnswer() { } - public PingAnswer(PingCommand cmd) { + public PingAnswer(PingCommand cmd, boolean sendStartup) { super(cmd); _command = cmd; + this.sendStartup = sendStartup; } public PingCommand getCommand() { return _command; } + + public boolean isSendStartup() { + return sendStartup; + } + + public void setSendStartup(boolean sendStartup) { + this.sendStartup = sendStartup; + } } diff --git a/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java b/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java index b74c11cf138..26e871877a3 100644 --- a/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java +++ b/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java @@ -40,6 +40,7 @@ import javax.naming.ConfigurationException; import com.cloud.configuration.Config; import com.cloud.utils.NumbersUtil; +import com.cloud.utils.db.GlobalLock; import org.apache.cloudstack.agent.lb.IndirectAgentLB; import org.apache.cloudstack.ca.CAManager; import org.apache.cloudstack.engine.orchestration.service.NetworkOrchestrationService; @@ -798,49 +799,65 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl return true; } + protected Status getNextStatusOnDisconnection(Host host, final Status.Event event) { + final Status currentStatus = host.getStatus(); + Status nextStatus; + if (currentStatus == Status.Down || currentStatus == Status.Alert || currentStatus == Status.Removed) { + if (s_logger.isDebugEnabled()) { + s_logger.debug(String.format("Host %s is already %s", host.getUuid(), currentStatus)); + } + nextStatus = currentStatus; + } else { + try { + nextStatus = currentStatus.getNextStatus(event); + } catch (final NoTransitionException e) { + final String err = String.format("Cannot find next status for %s as current status is %s for agent %s", event, currentStatus, host.getUuid()); + s_logger.debug(err); + throw new CloudRuntimeException(err); + } + + if (s_logger.isDebugEnabled()) { + s_logger.debug(String.format("The next status of agent %s is %s, current status is %s", host.getUuid(), nextStatus, currentStatus)); + } + } + return nextStatus; + } + protected boolean handleDisconnectWithoutInvestigation(final AgentAttache attache, final Status.Event event, final boolean transitState, final boolean removeAgent) { final long hostId = attache.getId(); - s_logger.info("Host " + hostId + " is disconnecting with event " + event); - Status nextStatus = null; - final HostVO host = _hostDao.findById(hostId); - if (host == null) { - s_logger.warn("Can't find host with " + hostId); - nextStatus = Status.Removed; - } else { - final Status currentStatus = host.getStatus(); - if (currentStatus == Status.Down || currentStatus == Status.Alert || currentStatus == Status.Removed) { - if (s_logger.isDebugEnabled()) { - s_logger.debug("Host " + hostId + " is already " + currentStatus); - } - nextStatus = currentStatus; - } else { - try { - nextStatus = currentStatus.getNextStatus(event); - } catch (final NoTransitionException e) { - final String err = "Cannot find next status for " + event + " as current status is " + currentStatus + " for agent " + hostId; - s_logger.debug(err); - throw new CloudRuntimeException(err); + boolean result = false; + GlobalLock joinLock = getHostJoinLock(hostId); + if (joinLock.lock(60)) { + try { + s_logger.info(String.format("Host %d is disconnecting with event %s", hostId, event)); + Status nextStatus = null; + final HostVO host = _hostDao.findById(hostId); + if (host == null) { + s_logger.warn(String.format("Can't find host with %d", hostId)); + nextStatus = Status.Removed; + } else { + nextStatus = getNextStatusOnDisconnection(host, event); + caService.purgeHostCertificate(host); } if (s_logger.isDebugEnabled()) { - s_logger.debug("The next status of agent " + hostId + "is " + nextStatus + ", current status is " + currentStatus); + s_logger.debug(String.format("Deregistering link for %d with state %s", hostId, nextStatus)); } + + removeAgent(attache, nextStatus); + + if (host != null && transitState) { + // update the state for host in DB as per the event + disconnectAgent(host, event, _nodeId); + } + } finally { + joinLock.unlock(); } - caService.purgeHostCertificate(host); + result = true; } - - if (s_logger.isDebugEnabled()) { - s_logger.debug("Deregistering link for " + hostId + " with state " + nextStatus); - } - - removeAgent(attache, nextStatus); - // update the DB - if (host != null && transitState) { - disconnectAgent(host, event, _nodeId); - } - - return true; + joinLock.releaseRef(); + return result; } protected boolean handleDisconnectWithInvestigation(final AgentAttache attache, Status.Event event) { @@ -1101,26 +1118,23 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl return attache; } - private AgentAttache handleConnectedAgent(final Link link, final StartupCommand[] startup, final Request request) { - AgentAttache attache = null; - ReadyCommand ready = null; - try { - final List agentMSHostList = new ArrayList<>(); - String lbAlgorithm = null; - if (startup != null && startup.length > 0) { - final String agentMSHosts = startup[0].getMsHostList(); - if (StringUtils.isNotEmpty(agentMSHosts)) { - String[] msHosts = agentMSHosts.split("@"); - if (msHosts.length > 1) { - lbAlgorithm = msHosts[1]; - } - agentMSHostList.addAll(Arrays.asList(msHosts[0].split(","))); + private AgentAttache sendReadyAndGetAttache(HostVO host, ReadyCommand ready, Link link, StartupCommand[] startup) throws ConnectionException { + final List agentMSHostList = new ArrayList<>(); + String lbAlgorithm = null; + if (startup != null && startup.length > 0) { + final String agentMSHosts = startup[0].getMsHostList(); + if (StringUtils.isNotEmpty(agentMSHosts)) { + String[] msHosts = agentMSHosts.split("@"); + if (msHosts.length > 1) { + lbAlgorithm = msHosts[1]; } + agentMSHostList.addAll(Arrays.asList(msHosts[0].split(","))); } - - final HostVO host = _resourceMgr.createHostVOForConnectedAgent(startup); - if (host != null) { - ready = new ReadyCommand(host.getDataCenterId(), host.getId(), NumbersUtil.enableHumanReadableSizes); + } + AgentAttache attache = null; + GlobalLock joinLock = getHostJoinLock(host.getId()); + if (joinLock.lock(60)) { + try { if (!indirectAgentLB.compareManagementServerList(host.getId(), host.getDataCenterId(), agentMSHostList, lbAlgorithm)) { final List newMSList = indirectAgentLB.getManagementServerList(host.getId(), host.getDataCenterId(), null); @@ -1132,6 +1146,24 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl attache = createAttacheForConnect(host, link); attache = notifyMonitorsOfConnection(attache, startup, false); + } finally { + joinLock.unlock(); + } + } else { + throw new ConnectionException(true, "Unable to acquire lock on host " + host.getUuid()); + } + joinLock.releaseRef(); + return attache; + } + + private AgentAttache handleConnectedAgent(final Link link, final StartupCommand[] startup, final Request request) { + AgentAttache attache = null; + ReadyCommand ready = null; + try { + final HostVO host = _resourceMgr.createHostVOForConnectedAgent(startup); + if (host != null) { + ready = new ReadyCommand(host.getDataCenterId(), host.getId(), NumbersUtil.enableHumanReadableSizes); + attache = sendReadyAndGetAttache(host, ready, link, startup); } } catch (final Exception e) { s_logger.debug("Failed to handle host connection: ", e); @@ -1265,6 +1297,8 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl connectAgent(link, cmds, request); } return; + } else if (cmd instanceof StartupCommand) { + connectAgent(link, cmds, request); } final long hostId = attache.getId(); @@ -1318,13 +1352,14 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl handleCommands(attache, request.getSequence(), new Command[] {cmd}); if (cmd instanceof PingCommand) { final long cmdHostId = ((PingCommand)cmd).getHostId(); + boolean requestStartupCommand = false; + final HostVO host = _hostDao.findById(Long.valueOf(cmdHostId)); + boolean gatewayAccessible = true; // if the router is sending a ping, verify the // gateway was pingable if (cmd instanceof PingRoutingCommand) { - final boolean gatewayAccessible = ((PingRoutingCommand)cmd).isGatewayAccessible(); - final HostVO host = _hostDao.findById(Long.valueOf(cmdHostId)); - + gatewayAccessible = ((PingRoutingCommand)cmd).isGatewayAccessible(); if (host != null) { if (!gatewayAccessible) { // alert that host lost connection to @@ -1342,7 +1377,10 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl s_logger.debug("Not processing " + PingRoutingCommand.class.getSimpleName() + " for agent id=" + cmdHostId + "; can't find the host in the DB"); } } - answer = new PingAnswer((PingCommand)cmd); + if (host!= null && host.getStatus() != Status.Up && gatewayAccessible) { + requestStartupCommand = true; + } + answer = new PingAnswer((PingCommand)cmd, requestStartupCommand); } else if (cmd instanceof ReadyAnswer) { final HostVO host = _hostDao.findById(attache.getId()); if (host == null) { @@ -1864,4 +1902,8 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl sendCommandToAgents(hostsPerZone, params); } } + + private GlobalLock getHostJoinLock(Long hostId) { + return GlobalLock.getInternLock(String.format("%s-%s", "Host-Join", hostId)); + } } diff --git a/test/integration/smoke/test_host_ping.py b/test/integration/smoke/test_host_ping.py new file mode 100644 index 00000000000..9de77f9b771 --- /dev/null +++ b/test/integration/smoke/test_host_ping.py @@ -0,0 +1,102 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" Check state transition of host from Alert to Up on Ping +""" + +# Import Local Modules +from marvin.cloudstackTestCase import * +from marvin.lib.common import * +from marvin.lib.utils import * +from nose.plugins.attrib import attr + +_multiprocess_shared_ = False + + +class TestHostPing(cloudstackTestCase): + + def setUp(self, handler=logging.StreamHandler()): + self.logger = logging.getLogger('TestHM') + self.stream_handler = handler + self.logger.setLevel(logging.DEBUG) + self.logger.addHandler(self.stream_handler) + self.apiclient = self.testClient.getApiClient() + self.hypervisor = self.testClient.getHypervisorInfo() + self.mgtSvrDetails = self.config.__dict__["mgtSvr"][0].__dict__ + self.dbConnection = self.testClient.getDbConnection() + self.services = self.testClient.getParsedTestDataConfig() + self.zone = get_zone(self.apiclient, self.testClient.getZoneForTests()) + self.pod = get_pod(self.apiclient, self.zone.id) + self.cleanup = [] + + def tearDown(self): + super(TestHostPing, self).tearDown() + + def checkHostStateInCloudstack(self, state, host_id): + try: + listHost = Host.list( + self.apiclient, + type='Routing', + zoneid=self.zone.id, + podid=self.pod.id, + id=host_id + ) + self.assertEqual( + isinstance(listHost, list), + True, + "Check if listHost returns a valid response" + ) + + self.assertEqual( + len(listHost), + 1, + "Check if listHost returns a host" + ) + self.logger.debug(" Host state is %s " % listHost[0].state) + if listHost[0].state == state: + return True, 1 + else: + return False, 1 + except Exception as e: + self.logger.debug("Got exception %s" % e) + return False, 1 + + @attr( + tags=[ + "advanced", + "advancedns", + "smoke", + "basic"], + required_hardware="true") + def test_01_host_ping_on_alert(self): + listHost = Host.list( + self.apiclient, + type='Routing', + zoneid=self.zone.id, + podid=self.pod.id, + ) + for host in listHost: + self.logger.debug('Hypervisor = {}'.format(host.id)) + + hostToTest = listHost[0] + sql_query = "UPDATE host SET status = 'Alert' WHERE uuid = '" + hostToTest.id + "'" + self.dbConnection.execute(sql_query) + + hostUpInCloudstack = wait_until(30, 8, self.checkHostStateInCloudstack, "Up", hostToTest.id) + + if not (hostUpInCloudstack): + raise self.fail("Host is not up %s, in cloudstack so failing test " % (hostToTest.ipaddress)) + return From 1e133d05c7a804da7caccffaa16e066e4754254b Mon Sep 17 00:00:00 2001 From: Harikrishna Date: Fri, 3 Nov 2023 13:37:11 +0530 Subject: [PATCH 2/3] kvm: Handle the failures when setting up memory balloon stats period for KVM VMs (#8049) --- .../resource/LibvirtComputingResource.java | 4 +-- .../kvm/resource/LibvirtDomainXMLParser.java | 25 +++++++++++++------ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResource.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResource.java index 265e786e8f8..e239443197d 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResource.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtComputingResource.java @@ -1313,8 +1313,8 @@ public class LibvirtComputingResource extends ServerResourceBase implements Serv } s_logger.debug(String.format("The memory balloon stats period [%s] has been set successfully for the VM (Libvirt Domain) with ID [%s] and name [%s].", currentVmBalloonStatsPeriod, vmId, dm.getName())); - } catch (final LibvirtException e) { - s_logger.warn("Failed to set up memory balloon stats period." + e.getMessage()); + } catch (final Exception e) { + s_logger.warn(String.format("Failed to set up memory balloon stats period for the VM %s with exception %s", parser.getName(), e.getMessage())); } } } diff --git a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtDomainXMLParser.java b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtDomainXMLParser.java index cb650e24b2f..80a35a04f93 100644 --- a/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtDomainXMLParser.java +++ b/plugins/hypervisors/kvm/src/main/java/com/cloud/hypervisor/kvm/resource/LibvirtDomainXMLParser.java @@ -59,6 +59,8 @@ public class LibvirtDomainXMLParser { private Integer vncPort; private String desc; + private String name; + public boolean parseDomainXML(String domXML) { DocumentBuilder builder; try { @@ -71,6 +73,7 @@ public class LibvirtDomainXMLParser { Element rootElement = doc.getDocumentElement(); desc = getTagValue("description", rootElement); + name = getTagValue("name", rootElement); Element devices = (Element)rootElement.getElementsByTagName("devices").item(0); NodeList disks = devices.getElementsByTagName("disk"); @@ -303,15 +306,19 @@ public class LibvirtDomainXMLParser { String path = getTagValue("backend", rng); String bytes = getAttrValue("rate", "bytes", rng); String period = getAttrValue("rate", "period", rng); - - if (StringUtils.isEmpty(backendModel)) { - def = new RngDef(path, Integer.parseInt(bytes), Integer.parseInt(period)); + if (StringUtils.isAnyEmpty(bytes, period)) { + s_logger.debug(String.format("Bytes and period in the rng section should not be null, please check the VM %s", name)); } else { - def = new RngDef(path, RngBackendModel.valueOf(backendModel.toUpperCase()), - Integer.parseInt(bytes), Integer.parseInt(period)); + if (StringUtils.isEmpty(backendModel)) { + def = new RngDef(path, Integer.parseInt(bytes), Integer.parseInt(period)); + } else { + def = new RngDef(path, RngBackendModel.valueOf(backendModel.toUpperCase()), + Integer.parseInt(bytes), Integer.parseInt(period)); + } + } + if (def != null) { + rngDefs.add(def); } - - rngDefs.add(def); } NodeList watchDogs = devices.getElementsByTagName("watchdog"); @@ -418,4 +425,8 @@ public class LibvirtDomainXMLParser { public String getDescription() { return desc; } + + public String getName() { + return name; + } } From e6f048bc2e830f43ef925dcec0374babfa410afd Mon Sep 17 00:00:00 2001 From: Wei Zhou Date: Fri, 3 Nov 2023 10:01:12 +0100 Subject: [PATCH 3/3] CKS: fix wrong format of cluster size on UI (#8182) --- ui/src/components/view/ListView.vue | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ui/src/components/view/ListView.vue b/ui/src/components/view/ListView.vue index abb62b345bc..286d9d16d2a 100644 --- a/ui/src/components/view/ListView.vue +++ b/ui/src/components/view/ListView.vue @@ -194,7 +194,10 @@ {{ text }}