Trigger out of band VM state update via libvirt event when VM stops (#7963)

* Trigger out of band VM state update via libvirt event when VM stops

* Add License headers, refactor nested try

---------

Co-authored-by: Marcus Sorensen <mls@apple.com>
This commit is contained in:
Marcus Sorensen 2023-09-28 00:42:03 -06:00 committed by GitHub
parent 348a63dc98
commit 3694667f50
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 165 additions and 10 deletions

View File

@ -40,6 +40,8 @@ import java.util.concurrent.atomic.AtomicInteger;
import javax.naming.ConfigurationException;
import com.cloud.resource.AgentStatusUpdater;
import com.cloud.resource.ResourceStatusUpdater;
import com.cloud.utils.NumbersUtil;
import org.apache.cloudstack.agent.lb.SetupMSListAnswer;
import org.apache.cloudstack.agent.lb.SetupMSListCommand;
@ -100,7 +102,7 @@ import com.cloud.utils.script.Script;
* For more configuration options, see the individual types.
*
**/
public class Agent implements HandlerFactory, IAgentControl {
public class Agent implements HandlerFactory, IAgentControl, AgentStatusUpdater {
protected static Logger s_logger = Logger.getLogger(Agent.class);
public enum ExitStatus {
@ -409,6 +411,20 @@ public class Agent implements HandlerFactory, IAgentControl {
}
}
public void triggerUpdate() {
PingCommand command = _resource.getCurrentStatus(getId());
command.setOutOfBand(true);
s_logger.debug("Sending out of band ping");
final Request request = new Request(_id, -1, command, false);
request.setSequence(getNextSequence());
try {
_link.send(request.toBytes());
} catch (final ClosedChannelException e) {
s_logger.warn("Unable to send ping update: " + request.toString());
}
}
protected void cancelTasks() {
synchronized (_watchList) {
for (final WatchTask task : _watchList) {
@ -461,6 +477,10 @@ public class Agent implements HandlerFactory, IAgentControl {
} catch (final ClosedChannelException e) {
s_logger.warn("Unable to send request: " + request.toString());
}
if (_resource instanceof ResourceStatusUpdater) {
((ResourceStatusUpdater) _resource).registerStatusUpdater(this);
}
}
}

View File

@ -24,6 +24,7 @@ import com.cloud.host.Host;
public class PingCommand extends Command {
Host.Type hostType;
long hostId;
boolean outOfBand;
protected PingCommand() {
}
@ -33,6 +34,12 @@ public class PingCommand extends Command {
hostId = id;
}
public PingCommand(Host.Type type, long id, boolean oob) {
hostType = type;
hostId = id;
outOfBand = oob;
}
public Host.Type getHostType() {
return hostType;
}
@ -41,6 +48,10 @@ public class PingCommand extends Command {
return hostId;
}
public boolean getOutOfBand() { return outOfBand; }
public void setOutOfBand(boolean oob) { this.outOfBand = oob; }
@Override
public boolean executeInSequence() {
return false;

View File

@ -0,0 +1,27 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package com.cloud.resource;
/**
* AgentStatusUpdater is an agent with triggerable update functionality
*/
public interface AgentStatusUpdater {
/**
* Trigger the sending of an update (Ping).
*/
void triggerUpdate();
}

View File

@ -0,0 +1,29 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package com.cloud.resource;
/**
* ResourceStatusUpdater is a resource that can trigger out of band status updates
*/
public interface ResourceStatusUpdater {
/**
* Register an AgentStatusUpdater to use for triggering out of band updates.
*
* @param updater The object to call triggerUpdate() on
*/
void registerStatusUpdater(AgentStatusUpdater updater);
}

View File

@ -3727,7 +3727,7 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
if (cmd instanceof PingRoutingCommand) {
final PingRoutingCommand ping = (PingRoutingCommand)cmd;
if (ping.getHostVmStateReport() != null) {
_syncMgr.processHostVmStatePingReport(agentId, ping.getHostVmStateReport());
_syncMgr.processHostVmStatePingReport(agentId, ping.getHostVmStateReport(), ping.getOutOfBand());
}
scanStalledVMInTransitionStateOnUpHost(agentId);

View File

@ -27,7 +27,7 @@ public interface VirtualMachinePowerStateSync {
void processHostVmStateReport(long hostId, Map<String, HostVmStateReportEntry> report);
// to adapt legacy ping report
void processHostVmStatePingReport(long hostId, Map<String, HostVmStateReportEntry> report);
void processHostVmStatePingReport(long hostId, Map<String, HostVmStateReportEntry> report, boolean force);
Map<Long, VirtualMachine.PowerState> convertVmStateReport(Map<String, HostVmStateReportEntry> states);
}

View File

@ -55,19 +55,19 @@ public class VirtualMachinePowerStateSyncImpl implements VirtualMachinePowerStat
s_logger.debug("Process host VM state report. host: " + hostId);
Map<Long, VirtualMachine.PowerState> translatedInfo = convertVmStateReport(report);
processReport(hostId, translatedInfo);
processReport(hostId, translatedInfo, false);
}
@Override
public void processHostVmStatePingReport(long hostId, Map<String, HostVmStateReportEntry> report) {
public void processHostVmStatePingReport(long hostId, Map<String, HostVmStateReportEntry> report, boolean force) {
if (s_logger.isDebugEnabled())
s_logger.debug("Process host VM state report from ping process. host: " + hostId);
Map<Long, VirtualMachine.PowerState> translatedInfo = convertVmStateReport(report);
processReport(hostId, translatedInfo);
processReport(hostId, translatedInfo, force);
}
private void processReport(long hostId, Map<Long, VirtualMachine.PowerState> translatedInfo) {
private void processReport(long hostId, Map<Long, VirtualMachine.PowerState> translatedInfo, boolean force) {
if (s_logger.isDebugEnabled()) {
s_logger.debug("Process VM state report. host: " + hostId + ", number of records in report: " + translatedInfo.size());
@ -117,7 +117,7 @@ public class VirtualMachinePowerStateSyncImpl implements VirtualMachinePowerStat
// Make sure powerState is up to date for missing VMs
try {
if (!_instanceDao.isPowerStateUpToDate(instance.getId())) {
if (!force && !_instanceDao.isPowerStateUpToDate(instance.getId())) {
s_logger.warn("Detected missing VM but power state is outdated, wait for another process report run for VM id: " + instance.getId());
_instanceDao.resetVmPowerStateTracking(instance.getId());
continue;
@ -150,7 +150,7 @@ public class VirtualMachinePowerStateSyncImpl implements VirtualMachinePowerStat
long milliSecondsSinceLastStateUpdate = currentTime.getTime() - vmStateUpdateTime.getTime();
if (milliSecondsSinceLastStateUpdate > milliSecondsGracefullPeriod) {
if (force || milliSecondsSinceLastStateUpdate > milliSecondsGracefullPeriod) {
s_logger.debug("vm id: " + instance.getId() + " - time since last state update(" + milliSecondsSinceLastStateUpdate + "ms) has passed graceful period");
// this is were a race condition might have happened if we don't re-fetch the instance;

View File

@ -83,6 +83,7 @@ import org.libvirt.DomainInfo;
import org.libvirt.DomainInfo.DomainState;
import org.libvirt.DomainInterfaceStats;
import org.libvirt.DomainSnapshot;
import org.libvirt.Library;
import org.libvirt.LibvirtException;
import org.libvirt.MemoryStatistic;
import org.libvirt.Network;
@ -90,6 +91,9 @@ import org.libvirt.SchedParameter;
import org.libvirt.SchedUlongParameter;
import org.libvirt.Secret;
import org.libvirt.VcpuInfo;
import org.libvirt.event.DomainEvent;
import org.libvirt.event.DomainEventDetail;
import org.libvirt.event.StoppedDetail;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
@ -97,6 +101,7 @@ import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import com.cloud.agent.api.Answer;
import com.cloud.agent.api.Command;
import com.cloud.agent.api.HostVmStateReportEntry;
@ -175,6 +180,8 @@ import com.cloud.network.Networks.BroadcastDomainType;
import com.cloud.network.Networks.IsolationType;
import com.cloud.network.Networks.RouterPrivateIpStrategy;
import com.cloud.network.Networks.TrafficType;
import com.cloud.resource.AgentStatusUpdater;
import com.cloud.resource.ResourceStatusUpdater;
import com.cloud.resource.RequestWrapper;
import com.cloud.resource.ServerResource;
import com.cloud.resource.ServerResourceBase;
@ -224,11 +231,12 @@ import com.google.gson.Gson;
* private mac addresses for domrs | mac address | start + 126 || ||
* pool | the parent of the storage pool hierarchy * }
**/
public class LibvirtComputingResource extends ServerResourceBase implements ServerResource, VirtualRouterDeployer {
public class LibvirtComputingResource extends ServerResourceBase implements ServerResource, VirtualRouterDeployer, ResourceStatusUpdater {
protected static Logger s_logger = Logger.getLogger(LibvirtComputingResource.class);
private static final String CONFIG_VALUES_SEPARATOR = ",";
private static final String LEGACY = "legacy";
private static final String SECURE = "secure";
@ -457,6 +465,7 @@ public class LibvirtComputingResource extends ServerResourceBase implements Serv
protected CPUStat cpuStat = new CPUStat();
protected MemStat memStat = new MemStat(dom0MinMem, dom0OvercommitMem);
private final LibvirtUtilitiesHelper libvirtUtilitiesHelper = new LibvirtUtilitiesHelper();
private AgentStatusUpdater _agentStatusUpdater;
protected Boolean enableManuallySettingCpuTopologyOnKvmVm = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.ENABLE_MANUALLY_SETTING_CPU_TOPOLOGY_ON_KVM_VM);
@ -481,6 +490,11 @@ public class LibvirtComputingResource extends ServerResourceBase implements Serv
return hypervisorQemuVersion;
}
@Override
public void registerStatusUpdater(AgentStatusUpdater updater) {
_agentStatusUpdater = updater;
}
@Override
public ExecutionResult executeInVR(final String routerIp, final String script, final String args) {
return executeInVR(routerIp, script, args, timeout);
@ -3590,9 +3604,63 @@ public class LibvirtComputingResource extends ServerResourceBase implements Serv
} catch (final CloudRuntimeException e) {
s_logger.debug("Unable to initialize local storage pool: " + e);
}
setupLibvirtEventListener();
return sscmd;
}
private void setupLibvirtEventListener() {
final Thread libvirtListenerThread = new Thread(() -> {
try {
Library.runEventLoop();
} catch (LibvirtException e) {
s_logger.error("LibvirtException was thrown in event loop: ", e);
} catch (InterruptedException e) {
s_logger.error("Libvirt event loop was interrupted: ", e);
}
});
try {
libvirtListenerThread.setDaemon(true);
libvirtListenerThread.start();
Connect conn = LibvirtConnection.getConnection();
conn.addLifecycleListener(this::onDomainLifecycleChange);
s_logger.debug("Set up the libvirt domain event lifecycle listener");
} catch (LibvirtException e) {
s_logger.error("Failed to get libvirt connection for domain event lifecycle", e);
}
}
private int onDomainLifecycleChange(Domain domain, DomainEvent domainEvent) {
try {
s_logger.debug(String.format("Got event lifecycle change on Domain %s, event %s", domain.getName(), domainEvent));
if (domainEvent != null) {
switch (domainEvent.getType()) {
case STOPPED:
/* libvirt-destroyed VMs have detail StoppedDetail.DESTROYED, self shutdown guests are StoppedDetail.SHUTDOWN
* Checking for this helps us differentiate between events where cloudstack or admin stopped the VM vs guest
* initiated, and avoid pushing extra updates for actions we are initiating without a need for extra tracking */
DomainEventDetail detail = domainEvent.getDetail();
if (StoppedDetail.SHUTDOWN.equals(detail) || StoppedDetail.CRASHED.equals(detail)) {
s_logger.info("Triggering out of band status update due to completed self-shutdown or crash of VM");
_agentStatusUpdater.triggerUpdate();
} else {
s_logger.debug("Event detail: " + detail);
}
break;
default:
s_logger.debug(String.format("No handling for event %s", domainEvent));
}
}
} catch (LibvirtException e) {
s_logger.error("Libvirt exception while processing lifecycle event", e);
} catch (Throwable e) {
s_logger.error("Error during lifecycle", e);
}
return 0;
}
public String diskUuidToSerial(String uuid) {
String uuidWithoutHyphen = uuid.replace("-","");
return uuidWithoutHyphen.substring(0, Math.min(uuidWithoutHyphen.length(), 20));