From 33a37da9ec7ef75fa42b9c6f2ca3a8f0ebf072bc Mon Sep 17 00:00:00 2001 From: Abhishek Kumar Date: Tue, 28 Jan 2025 14:39:31 +0530 Subject: [PATCH] server: investigate pending HA work when executing in new MS session (#10167) For HA work items that are created for host state change, checks must be done when execution is called in a new management server session. A new column, reason, has been added in cloud.op_ha_work table to track the reason for HA work. When HighAvailabilityManager starts it finds and puts all pending HA work items in Investigating state. During execution of the HA work if it is found in investigating state, checks are done to verify if the work is still valid. If the jobs is found to be invalid it is cancelled. Signed-off-by: Abhishek Kumar --- .../com/cloud/ha/HighAvailabilityManager.java | 21 ++++-- .../cloud/agent/manager/AgentManagerImpl.java | 2 +- .../META-INF/db/schema-42000to42010.sql | 3 + .../src/main/java/com/cloud/ha/HaWorkVO.java | 15 +++- .../cloud/ha/HighAvailabilityManagerImpl.java | 74 ++++++++++++++++--- .../com/cloud/ha/dao/HighAvailabilityDao.java | 2 + .../cloud/ha/dao/HighAvailabilityDaoImpl.java | 30 +++++++- .../cloud/resource/ResourceManagerImpl.java | 30 ++++---- .../provider/host/HAAbstractHostProvider.java | 2 +- .../ha/HighAvailabilityManagerImplTest.java | 66 +++++++++++++++-- .../ha/dao/HighAvailabilityDaoImplTest.java | 59 ++++++++++++++- 11 files changed, 260 insertions(+), 44 deletions(-) diff --git a/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java b/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java index 728f5a2b180..ddc8153d739 100644 --- a/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java +++ b/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java @@ -84,6 +84,13 @@ public interface HighAvailabilityManager extends Manager { HA; // Restart a VM. } + enum ReasonType { + Unknown, + HostMaintenance, + HostDown, + HostDegraded; + } + enum Step { Scheduled, Investigating, Fencing, Stopping, Restarting, Migrating, Cancelled, Done, Error, } @@ -92,7 +99,7 @@ public interface HighAvailabilityManager extends Manager { * Investigate why a host has disconnected and migrate the VMs on it * if necessary. * - * @param host - the host that has disconnected. + * @param hostId - the id of the host that has disconnected. */ Status investigate(long hostId); @@ -109,17 +116,19 @@ public interface HighAvailabilityManager extends Manager { * @param investigate must be investigated before we do anything with this vm. */ void scheduleRestart(VMInstanceVO vm, boolean investigate); + void scheduleRestart(VMInstanceVO vm, boolean investigate, ReasonType reasonType); void cancelDestroy(VMInstanceVO vm, Long hostId); - boolean scheduleDestroy(VMInstanceVO vm, long hostId); + boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType reasonType); /** * Schedule restarts for all vms running on the host. * @param host host. - * @param investigate TODO + * @param investigate whether to investigate + * @param reasonType reason for HA work */ - void scheduleRestartForVmsOnHost(HostVO host, boolean investigate); + void scheduleRestartForVmsOnHost(HostVO host, boolean investigate, ReasonType reasonType); /** * Schedule the vm for migration. @@ -128,6 +137,7 @@ public interface HighAvailabilityManager extends Manager { * @return true if schedule worked. */ boolean scheduleMigration(VMInstanceVO vm); + boolean scheduleMigration(VMInstanceVO vm, ReasonType reasonType); List findTakenMigrationWork(); @@ -140,10 +150,11 @@ public interface HighAvailabilityManager extends Manager { * 3. Check if a VM has been stopped: WorkType.CheckStop * * @param vm virtual machine to stop. - * @param host host the virtual machine is on. + * @param hostId the id of the host the virtual machine is on. * @param type which type of stop is requested. */ boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type); + boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type, ReasonType reasonType); void cancelScheduledMigrations(HostVO host); diff --git a/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java b/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java index 09fb211fedf..32180a91909 100644 --- a/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java +++ b/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java @@ -989,7 +989,7 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl handleDisconnectWithoutInvestigation(attache, event, true, true); host = _hostDao.findById(hostId); // Maybe the host magically reappeared? if (host != null && host.getStatus() == Status.Down) { - _haMgr.scheduleRestartForVmsOnHost(host, true); + _haMgr.scheduleRestartForVmsOnHost(host, true, HighAvailabilityManager.ReasonType.HostDown); } return true; } diff --git a/engine/schema/src/main/resources/META-INF/db/schema-42000to42010.sql b/engine/schema/src/main/resources/META-INF/db/schema-42000to42010.sql index 8b70cce3404..976ef217832 100644 --- a/engine/schema/src/main/resources/META-INF/db/schema-42000to42010.sql +++ b/engine/schema/src/main/resources/META-INF/db/schema-42000to42010.sql @@ -35,3 +35,6 @@ CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.volumes', 'last_id', 'bigint(20) uns -- Add used_iops column to support IOPS data in storage stats CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.storage_pool', 'used_iops', 'bigint unsigned DEFAULT NULL COMMENT "IOPS currently in use for this storage pool" '); + +-- Add reason column for op_ha_work +CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.op_ha_work', 'reason', 'varchar(32) DEFAULT NULL COMMENT "Reason for the HA work"'); diff --git a/server/src/main/java/com/cloud/ha/HaWorkVO.java b/server/src/main/java/com/cloud/ha/HaWorkVO.java index f5a36e3bb1a..8307f4d9317 100644 --- a/server/src/main/java/com/cloud/ha/HaWorkVO.java +++ b/server/src/main/java/com/cloud/ha/HaWorkVO.java @@ -86,6 +86,10 @@ public class HaWorkVO implements InternalIdentity { @Column(name = "tried") int timesTried; + @Column(name = "reason") + @Enumerated(value = EnumType.STRING) + private HighAvailabilityManager.ReasonType reasonType; + protected HaWorkVO() { } @@ -179,7 +183,7 @@ public class HaWorkVO implements InternalIdentity { } public HaWorkVO(final long instanceId, final VirtualMachine.Type type, final WorkType workType, final Step step, final long hostId, final State previousState, - final int timesTried, final long updated) { + final int timesTried, final long updated, HighAvailabilityManager.ReasonType reasonType) { this.workType = workType; this.type = type; this.instanceId = instanceId; @@ -191,6 +195,7 @@ public class HaWorkVO implements InternalIdentity { this.step = step; this.timeToTry = System.currentTimeMillis() >> 10; this.updateTime = updated; + this.reasonType = reasonType; } @Override @@ -207,4 +212,12 @@ public class HaWorkVO implements InternalIdentity { .append("]") .toString(); } + + public HighAvailabilityManager.ReasonType getReasonType() { + return reasonType; + } + + public void setReasonType(HighAvailabilityManager.ReasonType reasonType) { + this.reasonType = reasonType; + } } diff --git a/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java b/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java index e10bd47a067..2ce803756fe 100644 --- a/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java +++ b/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java @@ -19,6 +19,7 @@ package com.cloud.ha; import static org.apache.cloudstack.framework.config.ConfigKey.Scope.Zone; import java.util.ArrayList; +import java.util.Arrays; import java.util.Date; import java.util.HashMap; import java.util.List; @@ -43,6 +44,7 @@ import org.apache.cloudstack.framework.config.dao.ConfigurationDao; import org.apache.cloudstack.managed.context.ManagedContext; import org.apache.cloudstack.managed.context.ManagedContextRunnable; import org.apache.cloudstack.management.ManagementServerHost; +import org.apache.logging.log4j.ThreadContext; import com.cloud.agent.AgentManager; import com.cloud.alert.AlertManager; @@ -90,7 +92,6 @@ import com.cloud.vm.VirtualMachine; import com.cloud.vm.VirtualMachineManager; import com.cloud.vm.VirtualMachineProfile; import com.cloud.vm.dao.VMInstanceDao; -import org.apache.logging.log4j.ThreadContext; /** * HighAvailabilityManagerImpl coordinates the HA process. VMs are registered with the HA Manager for HA. The request is stored @@ -133,6 +134,9 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur protected static ConfigKey VmHaAlertsEnabled = new ConfigKey<>("Advanced", Boolean.class, "vm.ha.alerts.enabled", "true", "Enable/Disable alerts for the VM HA operations, it is enabled by default.", true, Zone); + protected static final List CancellableWorkReasonTypes = + Arrays.asList(ReasonType.HostMaintenance, ReasonType.HostDown, ReasonType.HostDegraded); + WorkerThread[] _workers; boolean _stopped; long _timeToSleep; @@ -269,8 +273,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur } @Override - public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate) { - + public void scheduleRestartForVmsOnHost(final HostVO host, boolean investigate, ReasonType reasonType) { if (host.getType() != Host.Type.Routing) { return; } @@ -337,12 +340,12 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur logger.debug("VM {} is not on down host {} it is on other host {} VM HA is done", vm, host, hostId); continue; } - scheduleRestart(vm, investigate); + scheduleRestart(vm, investigate, reasonType); } } @Override - public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) { + public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type, ReasonType reasonType) { assert (type == WorkType.CheckStop || type == WorkType.ForceStop || type == WorkType.Stop); if (_haDao.hasBeenScheduled(vm.getId(), type)) { @@ -359,7 +362,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur return false; } - HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated()); + HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), type, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated(), reasonType); _haDao.persist(work); if (logger.isDebugEnabled()) { logger.debug("Scheduled " + work); @@ -368,6 +371,11 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur return true; } + @Override + public boolean scheduleStop(VMInstanceVO vm, long hostId, WorkType type) { + return scheduleStop(vm, hostId, type, null); + } + protected void wakeupWorkers() { logger.debug("Wakeup workers HA"); for (WorkerThread worker : _workers) { @@ -376,7 +384,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur } @Override - public boolean scheduleMigration(final VMInstanceVO vm) { + public boolean scheduleMigration(final VMInstanceVO vm, ReasonType reasonType) { if (vm.getHostId() == null) { return false; } @@ -390,7 +398,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur return false; } - final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated()); + final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated(), reasonType); _haDao.persist(work); logger.info("Scheduled migration work of VM {} from host {} with HAWork {}", vm, _hostDao.findById(vm.getHostId()), work); wakeupWorkers(); @@ -398,7 +406,12 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur } @Override - public void scheduleRestart(VMInstanceVO vm, boolean investigate) { + public boolean scheduleMigration(final VMInstanceVO vm) { + return scheduleMigration(vm, null); + } + + @Override + public void scheduleRestart(VMInstanceVO vm, boolean investigate, ReasonType reasonType) { if (!VmHaEnabled.valueIn(vm.getDataCenterId())) { String message = String.format("Unable to schedule restart for the VM %s (%d), VM high availability manager is disabled.", vm.getName(), vm.getId()); if (logger.isDebugEnabled()) { @@ -490,7 +503,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur } HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.HA, investigate ? Step.Investigating : Step.Scheduled, - hostId != null ? hostId : 0L, vm.getState(), timesTried, vm.getUpdated()); + hostId != null ? hostId : 0L, vm.getState(), timesTried, vm.getUpdated(), reasonType); _haDao.persist(work); if (logger.isInfoEnabled()) { @@ -500,6 +513,11 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur wakeupWorkers(); } + @Override + public void scheduleRestart(VMInstanceVO vm, boolean investigate) { + scheduleRestart(vm, investigate, null); + } + private void startVm(VirtualMachine vm, Map params, DeploymentPlanner planner) throws InsufficientCapacityException, ResourceUnavailableException, ConcurrentOperationException, OperationTimedoutException { @@ -561,6 +579,9 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur logger.info("Unable to find vm: " + vmId); return null; } + if (checkAndCancelWorkIfNeeded(work)) { + return null; + } logger.info("HA on " + vm); if (vm.getState() != work.getPreviousState() || vm.getUpdated() != work.getUpdateTime()) { @@ -762,6 +783,22 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur return (System.currentTimeMillis() >> 10) + _restartRetryInterval; } + protected boolean checkAndCancelWorkIfNeeded(final HaWorkVO work) { + if (!Step.Investigating.equals(work.getStep())) { + return false; + } + if (!CancellableWorkReasonTypes.contains(work.getReasonType())) { + return false; + } + Status hostStatus = investigate(work.getHostId()); + if (!Status.Up.equals(hostStatus)) { + return false; + } + logger.debug("Cancelling {} as it is not needed anymore", () -> work); + work.setStep(Step.Cancelled); + return true; + } + public Long migrate(final HaWorkVO work) { long vmId = work.getInstanceId(); long srcHostId = work.getHostId(); @@ -772,6 +809,9 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur logger.info("Unable to find vm: " + vmId + ", skipping migrate."); return null; } + if (checkAndCancelWorkIfNeeded(work)) { + return null; + } logger.info("Migration attempt: for VM {}from host {}. Starting attempt: {}/{} times.", vm, srcHost, 1 + work.getTimesTried(), _maxRetries); try { work.setStep(Step.Migrating); @@ -791,7 +831,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur } @Override - public boolean scheduleDestroy(VMInstanceVO vm, long hostId) { + public boolean scheduleDestroy(VMInstanceVO vm, long hostId, ReasonType reasonType) { if (!VmHaEnabled.valueIn(vm.getDataCenterId())) { String message = String.format("Unable to schedule destroy for the VM %s (%d) on host %d, VM high availability manager is disabled.", vm.getName(), vm.getId(), hostId); if (logger.isDebugEnabled()) { @@ -801,7 +841,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur return false; } - final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated()); + final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Destroy, Step.Scheduled, hostId, vm.getState(), 0, vm.getUpdated(), reasonType); _haDao.persist(work); if (logger.isDebugEnabled()) { logger.debug("Scheduled " + work.toString()); @@ -838,6 +878,9 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur logger.info("No longer can find VM " + work.getInstanceId() + ". Throwing away " + work); return null; } + if (checkAndCancelWorkIfNeeded(work)) { + return null; + } boolean expunge = VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType()) || VirtualMachine.Type.ConsoleProxy.equals(vm.getType()); if (!expunge && VirtualMachine.State.Destroyed.equals(work.getPreviousState())) { @@ -872,6 +915,9 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur work.setStep(Step.Done); return null; } + if (checkAndCancelWorkIfNeeded(work)) { + return null; + } logger.info("Stopping " + vm); try { if (work.getWorkType() == WorkType.Stop) { @@ -1057,6 +1103,8 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur public boolean start() { _stopped = false; + _haDao.markPendingWorksAsInvestigating(); + for (final WorkerThread thread : _workers) { thread.start(); } @@ -1074,6 +1122,8 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur _executor.shutdown(); + _haDao.markServerPendingWorksAsInvestigating(_msServer.getId()); + return true; } diff --git a/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java b/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java index f6539105d78..42c8aabe41a 100644 --- a/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java +++ b/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java @@ -86,4 +86,6 @@ public interface HighAvailabilityDao extends GenericDao { List listPendingMigrationsForVm(long vmId); int expungeByVmList(List vmIds, Long batchSize); + void markPendingWorksAsInvestigating(); + void markServerPendingWorksAsInvestigating(long managementServerId); } diff --git a/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java b/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java index c722c6376c1..00b62e0d601 100644 --- a/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java +++ b/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java @@ -31,12 +31,13 @@ import com.cloud.utils.db.SearchBuilder; import com.cloud.utils.db.SearchCriteria; import com.cloud.utils.db.SearchCriteria.Op; import com.cloud.utils.db.TransactionLegacy; +import com.cloud.utils.db.UpdateBuilder; import com.cloud.utils.exception.CloudRuntimeException; @Component public class HighAvailabilityDaoImpl extends GenericDaoBase implements HighAvailabilityDao { - private final SearchBuilder TBASearch; + protected SearchBuilder TBASearch; private final SearchBuilder PreviousInstanceSearch; private final SearchBuilder UntakenMigrationSearch; private final SearchBuilder CleanupSearch; @@ -270,4 +271,31 @@ public class HighAvailabilityDaoImpl extends GenericDaoBase impl sc.setParameters("vmIds", vmIds.toArray()); return batchExpunge(sc, batchSize); } + + protected void updatePendingWorkToInvestigating(SearchCriteria sc) { + HaWorkVO haWorkVO = createForUpdate(); + haWorkVO.setStep(Step.Investigating); + UpdateBuilder updateBuilder = getUpdateBuilder(haWorkVO); + update(updateBuilder, sc, null); + } + + @Override + public void markPendingWorksAsInvestigating() { + final SearchCriteria sc = TBASearch.create(); + sc.setParameters("time", System.currentTimeMillis() >> 10); + sc.setParameters("step", Step.Done, Step.Cancelled); + updatePendingWorkToInvestigating(sc); + } + + @Override + public void markServerPendingWorksAsInvestigating(long managementServerId) { + SearchBuilder sb = createSearchBuilder(); + sb.and("server", sb.entity().getServerId(), Op.EQ); + sb.and("step", sb.entity().getStep(), Op.NIN); + sb.done(); + SearchCriteria sc = sb.create(); + sc.setParameters("server", managementServerId); + sc.setParameters("step", Step.Done, Step.Cancelled); + updatePendingWorkToInvestigating(sc); + } } diff --git a/server/src/main/java/com/cloud/resource/ResourceManagerImpl.java b/server/src/main/java/com/cloud/resource/ResourceManagerImpl.java index 50116905bfe..1349e03f205 100755 --- a/server/src/main/java/com/cloud/resource/ResourceManagerImpl.java +++ b/server/src/main/java/com/cloud/resource/ResourceManagerImpl.java @@ -37,16 +37,6 @@ import java.util.stream.Collectors; import javax.inject.Inject; import javax.naming.ConfigurationException; -import com.cloud.alert.AlertManager; -import com.cloud.cpu.CPU; -import com.cloud.exception.StorageConflictException; -import com.cloud.exception.StorageUnavailableException; -import com.cloud.ha.HighAvailabilityManagerImpl; -import com.cloud.host.HostTagVO; -import com.cloud.storage.Volume; -import com.cloud.storage.VolumeVO; -import com.cloud.storage.dao.VolumeDao; -import com.cloud.hypervisor.HypervisorGuru; import org.apache.cloudstack.alert.AlertService; import org.apache.cloudstack.annotation.AnnotationService; import org.apache.cloudstack.annotation.dao.AnnotationDao; @@ -93,6 +83,7 @@ import com.cloud.agent.api.UpdateHostPasswordCommand; import com.cloud.agent.api.VgpuTypesInfo; import com.cloud.agent.api.to.GPUDeviceTO; import com.cloud.agent.transport.Request; +import com.cloud.alert.AlertManager; import com.cloud.capacity.Capacity; import com.cloud.capacity.CapacityManager; import com.cloud.capacity.CapacityState; @@ -101,6 +92,7 @@ import com.cloud.capacity.dao.CapacityDao; import com.cloud.cluster.ClusterManager; import com.cloud.configuration.Config; import com.cloud.configuration.ConfigurationManager; +import com.cloud.cpu.CPU; import com.cloud.dc.ClusterDetailsDao; import com.cloud.dc.ClusterDetailsVO; import com.cloud.dc.ClusterVO; @@ -134,6 +126,8 @@ import com.cloud.exception.InvalidParameterValueException; import com.cloud.exception.PermissionDeniedException; import com.cloud.exception.ResourceInUseException; import com.cloud.exception.ResourceUnavailableException; +import com.cloud.exception.StorageConflictException; +import com.cloud.exception.StorageUnavailableException; import com.cloud.gpu.GPU; import com.cloud.gpu.HostGpuGroupsVO; import com.cloud.gpu.VGPUTypesVO; @@ -141,10 +135,12 @@ import com.cloud.gpu.dao.HostGpuGroupsDao; import com.cloud.gpu.dao.VGPUTypesDao; import com.cloud.ha.HighAvailabilityManager; import com.cloud.ha.HighAvailabilityManager.WorkType; +import com.cloud.ha.HighAvailabilityManagerImpl; import com.cloud.host.DetailVO; import com.cloud.host.Host; import com.cloud.host.Host.Type; import com.cloud.host.HostStats; +import com.cloud.host.HostTagVO; import com.cloud.host.HostVO; import com.cloud.host.Status; import com.cloud.host.Status.Event; @@ -153,6 +149,7 @@ import com.cloud.host.dao.HostDetailsDao; import com.cloud.host.dao.HostTagsDao; import com.cloud.hypervisor.Hypervisor; import com.cloud.hypervisor.Hypervisor.HypervisorType; +import com.cloud.hypervisor.HypervisorGuru; import com.cloud.hypervisor.kvm.discoverer.KvmDummyResourceBase; import com.cloud.network.dao.IPAddressDao; import com.cloud.network.dao.IPAddressVO; @@ -170,10 +167,13 @@ import com.cloud.storage.StoragePoolHostVO; import com.cloud.storage.StoragePoolStatus; import com.cloud.storage.StorageService; import com.cloud.storage.VMTemplateVO; +import com.cloud.storage.Volume; +import com.cloud.storage.VolumeVO; import com.cloud.storage.dao.DiskOfferingDao; import com.cloud.storage.dao.GuestOSCategoryDao; import com.cloud.storage.dao.StoragePoolHostDao; import com.cloud.storage.dao.VMTemplateDao; +import com.cloud.storage.dao.VolumeDao; import com.cloud.user.Account; import com.cloud.user.AccountManager; import com.cloud.utils.Ternary; @@ -1348,7 +1348,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, if (VirtualMachine.Type.SecondaryStorageVm.equals(vm.getType()) || VirtualMachine.Type.ConsoleProxy.equals(vm.getType())) { logger.error("Maintenance: VM is of type {}. Destroying VM {} immediately instead of migration.", vm.getType(), vm); - _haMgr.scheduleDestroy(vm, host.getId()); + _haMgr.scheduleDestroy(vm, host.getId(), HighAvailabilityManager.ReasonType.HostMaintenance); return; } logger.error("Maintenance: No hosts available for migrations. Scheduling shutdown for VM {} instead of migration.", vm); @@ -1405,10 +1405,10 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, handleVmForLastHostOrWithVGpu(host, vm); } else if (HypervisorType.LXC.equals(host.getHypervisorType()) && VirtualMachine.Type.User.equals(vm.getType())){ //Migration is not supported for LXC Vms. Schedule restart instead. - _haMgr.scheduleRestart(vm, false); + _haMgr.scheduleRestart(vm, false, HighAvailabilityManager.ReasonType.HostMaintenance); } else if (userVmManager.isVMUsingLocalStorage(vm)) { if (isMaintenanceLocalStrategyForceStop()) { - _haMgr.scheduleStop(vm, hostId, WorkType.ForceStop); + _haMgr.scheduleStop(vm, hostId, WorkType.ForceStop, HighAvailabilityManager.ReasonType.HostMaintenance); } else if (isMaintenanceLocalStrategyMigrate()) { migrateAwayVmWithVolumes(host, vm); } else if (!isMaintenanceLocalStrategyDefault()){ @@ -1421,7 +1421,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, } } else { logger.info("Maintenance: scheduling migration of VM {} from host {}", vm, host); - _haMgr.scheduleMigration(vm); + _haMgr.scheduleMigration(vm, HighAvailabilityManager.ReasonType.HostMaintenance); } } } @@ -1637,7 +1637,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, for (VMInstanceVO vm : allVmsOnHost) { State vmState = vm.getState(); if (vmState == State.Starting || vmState == State.Running || vmState == State.Stopping) { - _haMgr.scheduleRestart(vm, false); + _haMgr.scheduleRestart(vm, false, HighAvailabilityManager.ReasonType.HostDegraded); } } } diff --git a/server/src/main/java/org/apache/cloudstack/ha/provider/host/HAAbstractHostProvider.java b/server/src/main/java/org/apache/cloudstack/ha/provider/host/HAAbstractHostProvider.java index 5907c1864ad..2d77e6f9d20 100644 --- a/server/src/main/java/org/apache/cloudstack/ha/provider/host/HAAbstractHostProvider.java +++ b/server/src/main/java/org/apache/cloudstack/ha/provider/host/HAAbstractHostProvider.java @@ -74,7 +74,7 @@ public abstract class HAAbstractHostProvider extends AdapterBase implements HAPr try { logger.debug("Trying to disconnect the host without investigation and scheduling HA for the VMs on host {}", host); agentManager.disconnectWithoutInvestigation(host.getId(), Event.HostDown); - oldHighAvailabilityManager.scheduleRestartForVmsOnHost((HostVO)host, true); + oldHighAvailabilityManager.scheduleRestartForVmsOnHost((HostVO)host, true, HighAvailabilityManager.ReasonType.HostDown); } catch (Exception e) { logger.error("Failed to disconnect host and schedule HA restart of VMs after fencing the host: ", e); } diff --git a/server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java b/server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java index 53ae5d2279e..24714b72388 100644 --- a/server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java +++ b/server/src/test/java/com/cloud/ha/HighAvailabilityManagerImplTest.java @@ -135,6 +135,9 @@ public class HighAvailabilityManagerImplTest { @Mock UserVmManager userVmManager; + @Mock + private HaWorkVO mockWork; + HighAvailabilityManagerImpl highAvailabilityManager; HighAvailabilityManagerImpl highAvailabilityManagerSpy; static Method processWorkMethod = null; @@ -185,7 +188,7 @@ public class HighAvailabilityManagerImplTest { highAvailabilityManager.VmHaEnabled = haEnabled; Mockito.when(highAvailabilityManager.VmHaEnabled.valueIn(1L)).thenReturn(true); - highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true); + highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true, HighAvailabilityManager.ReasonType.HostDown); } @Test @@ -193,7 +196,7 @@ public class HighAvailabilityManagerImplTest { Mockito.when(hostVO.getType()).thenReturn(Host.Type.Routing); Mockito.when(hostVO.getHypervisorType()).thenReturn(HypervisorType.VMware); - highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true); + highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true, HighAvailabilityManager.ReasonType.HostDown); } @Test @@ -206,7 +209,7 @@ public class HighAvailabilityManagerImplTest { highAvailabilityManager.VmHaEnabled = haEnabled; Mockito.when(highAvailabilityManager.VmHaEnabled.valueIn(1L)).thenReturn(false); - highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true); + highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true, HighAvailabilityManager.ReasonType.HostDown); } @Test @@ -240,7 +243,7 @@ public class HighAvailabilityManagerImplTest { highAvailabilityManager.VmHaEnabled = haEnabled; Mockito.when(highAvailabilityManager.VmHaEnabled.valueIn(1L)).thenReturn(true); - highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true); + highAvailabilityManager.scheduleRestartForVmsOnHost(hostVO, true, HighAvailabilityManager.ReasonType.HostDown); } @Test @@ -336,7 +339,7 @@ public class HighAvailabilityManagerImplTest { Mockito.when(vm.getState()).thenReturn(VirtualMachine.State.Running); Mockito.when(_haDao.persist((HaWorkVO)Mockito.any())).thenReturn(Mockito.mock(HaWorkVO.class)); - assertTrue(highAvailabilityManager.scheduleDestroy(vm, 1L)); + assertTrue(highAvailabilityManager.scheduleDestroy(vm, 1L, HighAvailabilityManager.ReasonType.HostMaintenance)); } @Test @@ -348,7 +351,7 @@ public class HighAvailabilityManagerImplTest { highAvailabilityManager.VmHaEnabled = haEnabled; Mockito.when(highAvailabilityManager.VmHaEnabled.valueIn(1L)).thenReturn(false); - assertFalse(highAvailabilityManager.scheduleDestroy(vm, 1L)); + assertFalse(highAvailabilityManager.scheduleDestroy(vm, 1L, HighAvailabilityManager.ReasonType.HostMaintenance)); } @Test @@ -402,7 +405,7 @@ public class HighAvailabilityManagerImplTest { private void processWorkWithRetryCount(int count, Step expectedStep) { assertNotNull(processWorkMethod); - HaWorkVO work = new HaWorkVO(1l, VirtualMachine.Type.User, WorkType.Migration, Step.Scheduled, 1l, VirtualMachine.State.Running, count, 12345678l); + HaWorkVO work = new HaWorkVO(1l, VirtualMachine.Type.User, WorkType.Migration, Step.Scheduled, 1l, VirtualMachine.State.Running, count, 12345678l, null); Mockito.doReturn(12345678l).when(highAvailabilityManagerSpy).migrate(work); try { processWorkMethod.invoke(highAvailabilityManagerSpy, work); @@ -425,4 +428,53 @@ public class HighAvailabilityManagerImplTest { public void processWorkWithRetryCountNotExceeded() { processWorkWithRetryCount(3, Step.Scheduled); } + + @Test + public void testCheckAndCancelWorkIfNeeded_Success() { + Mockito.when(mockWork.getStep()).thenReturn(Step.Investigating); + Mockito.when(mockWork.getReasonType()).thenReturn(HighAvailabilityManager.ReasonType.HostMaintenance); + Mockito.when(mockWork.getHostId()).thenReturn(1L); + Mockito.doReturn(Status.Up).when(highAvailabilityManagerSpy).investigate(1L); + Mockito.doNothing().when(mockWork).setStep(Step.Cancelled); + boolean result = highAvailabilityManagerSpy.checkAndCancelWorkIfNeeded(mockWork); + assertTrue(result); + Mockito.verify(mockWork).setStep(Step.Cancelled); + } + + @Test + public void testCheckAndCancelWorkIfNeeded_StepNotInvestigating() { + Mockito.when(mockWork.getStep()).thenReturn(Step.Cancelled); + boolean result = highAvailabilityManagerSpy.checkAndCancelWorkIfNeeded(mockWork); + assertFalse(result); + Mockito.verify(mockWork, Mockito.never()).setStep(Mockito.any()); + } + + private void runInvalidReasonCheckAndCancelWorkIfNeeded(HighAvailabilityManager.ReasonType reasonType) { + Mockito.when(mockWork.getStep()).thenReturn(Step.Investigating); + Mockito.when(mockWork.getReasonType()).thenReturn(reasonType); + boolean result = highAvailabilityManagerSpy.checkAndCancelWorkIfNeeded(mockWork); + assertFalse(result); + Mockito.verify(mockWork, Mockito.never()).setStep(Mockito.any()); + } + + @Test + public void testCheckAndCancelWorkIfNeeded_InvalidReasonType() { + runInvalidReasonCheckAndCancelWorkIfNeeded(HighAvailabilityManager.ReasonType.Unknown); + } + + @Test + public void testCheckAndCancelWorkIfNeeded_NullReasonType() { + runInvalidReasonCheckAndCancelWorkIfNeeded(null); + } + + @Test + public void testCheckAndCancelWorkIfNeeded_HostStatusNotUp() { + Mockito.when(mockWork.getStep()).thenReturn(Step.Investigating); + Mockito.when(mockWork.getReasonType()).thenReturn(HighAvailabilityManager.ReasonType.HostDown); + Mockito.when(mockWork.getHostId()).thenReturn(1L); + Mockito.doReturn(Status.Down).when(highAvailabilityManagerSpy).investigate(1L); + boolean result = highAvailabilityManagerSpy.checkAndCancelWorkIfNeeded(mockWork); + assertFalse(result); + Mockito.verify(mockWork, Mockito.never()).setStep(Mockito.any()); + } } diff --git a/server/src/test/java/com/cloud/ha/dao/HighAvailabilityDaoImplTest.java b/server/src/test/java/com/cloud/ha/dao/HighAvailabilityDaoImplTest.java index 783497740fd..c2b95474d0b 100644 --- a/server/src/test/java/com/cloud/ha/dao/HighAvailabilityDaoImplTest.java +++ b/server/src/test/java/com/cloud/ha/dao/HighAvailabilityDaoImplTest.java @@ -22,20 +22,25 @@ import java.util.List; import org.junit.Assert; import org.junit.Test; import org.junit.runner.RunWith; +import org.mockito.MockedStatic; import org.mockito.Mockito; import org.mockito.Spy; import org.mockito.junit.MockitoJUnitRunner; import org.mockito.stubbing.Answer; import com.cloud.ha.HaWorkVO; +import com.cloud.ha.HighAvailabilityManager; +import com.cloud.utils.db.GenericDaoBase; import com.cloud.utils.db.SearchBuilder; import com.cloud.utils.db.SearchCriteria; +import com.cloud.utils.db.UpdateBuilder; +import com.cloud.vm.VirtualMachine; @RunWith(MockitoJUnitRunner.class) public class HighAvailabilityDaoImplTest { @Spy - HighAvailabilityDaoImpl highAvailabilityDaoImpl; + HighAvailabilityDaoImpl highAvailabilityDaoImpl = new HighAvailabilityDaoImpl(); @Test public void testExpungeByVmListNoVms() { @@ -65,4 +70,56 @@ public class HighAvailabilityDaoImplTest { Mockito.verify(highAvailabilityDaoImpl, Mockito.times(1)) .batchExpunge(sc, batchSize); } + + @Test + public void testMarkPendingWorksAsInvestigating() throws Exception { + SearchBuilder mockTBASearch = Mockito.mock(SearchBuilder.class); + highAvailabilityDaoImpl.TBASearch = mockTBASearch; + SearchCriteria mockSearchCriteria = Mockito.mock(SearchCriteria.class); + UpdateBuilder mockUpdateBuilder = Mockito.mock(UpdateBuilder.class); + Mockito.when(mockTBASearch.create()).thenReturn(mockSearchCriteria); + Mockito.doNothing().when(mockSearchCriteria).setParameters(Mockito.eq("time"), Mockito.anyLong()); + Mockito.doNothing().when(mockSearchCriteria).setParameters(Mockito.eq("step"), Mockito.eq(HighAvailabilityManager.Step.Done), Mockito.eq(HighAvailabilityManager.Step.Cancelled)); + HaWorkVO haWorkVO = new HaWorkVO(1L, VirtualMachine.Type.User, null, + null, 1L, null, 0, 0, + HighAvailabilityManager.ReasonType.HostMaintenance); + Mockito.when(highAvailabilityDaoImpl.createForUpdate()).thenReturn(haWorkVO); + try(MockedStatic genericDaoBaseMockedStatic = Mockito.mockStatic(GenericDaoBase.class)) { + genericDaoBaseMockedStatic.when(() -> GenericDaoBase.getUpdateBuilder(Mockito.any())).thenReturn(mockUpdateBuilder); + Mockito.doReturn(5).when(highAvailabilityDaoImpl).update(Mockito.any(UpdateBuilder.class), Mockito.any(), Mockito.nullable(Integer.class)); + highAvailabilityDaoImpl.markPendingWorksAsInvestigating(); + Mockito.verify(mockTBASearch).create(); + Mockito.verify(mockSearchCriteria).setParameters(Mockito.eq("time"), Mockito.anyLong()); + Mockito.verify(mockSearchCriteria).setParameters(Mockito.eq("step"), Mockito.eq(HighAvailabilityManager.Step.Done), Mockito.eq(HighAvailabilityManager.Step.Cancelled)); + Assert.assertEquals(HighAvailabilityManager.Step.Investigating, haWorkVO.getStep()); // Ensure the step is set correctly + Mockito.verify(highAvailabilityDaoImpl).update(Mockito.eq(mockUpdateBuilder), Mockito.eq(mockSearchCriteria), Mockito.isNull()); + } + } + + @Test + public void testMarkServerPendingWorksAsInvestigating() { + SearchBuilder mockSearch = Mockito.mock(SearchBuilder.class); + Mockito.doReturn(Mockito.mock(HaWorkVO.class)).when(mockSearch).entity(); + Mockito.doReturn(mockSearch).when(highAvailabilityDaoImpl).createSearchBuilder(); + SearchCriteria mockSearchCriteria = Mockito.mock(SearchCriteria.class); + UpdateBuilder mockUpdateBuilder = Mockito.mock(UpdateBuilder.class); + Mockito.when(mockSearch.create()).thenReturn(mockSearchCriteria); + Mockito.doNothing().when(mockSearchCriteria).setParameters(Mockito.eq("server"), Mockito.eq(1L)); + Mockito.doNothing().when(mockSearchCriteria).setParameters(Mockito.eq("step"), Mockito.eq(HighAvailabilityManager.Step.Done), Mockito.eq(HighAvailabilityManager.Step.Cancelled)); + HaWorkVO haWorkVO = new HaWorkVO(1L, VirtualMachine.Type.User, null, + null, 1L, null, 0, 0, + HighAvailabilityManager.ReasonType.HostMaintenance); + Mockito.when(highAvailabilityDaoImpl.createForUpdate()).thenReturn(haWorkVO); + Mockito.when(highAvailabilityDaoImpl.createForUpdate()).thenReturn(haWorkVO); + try(MockedStatic genericDaoBaseMockedStatic = Mockito.mockStatic(GenericDaoBase.class)) { + genericDaoBaseMockedStatic.when(() -> GenericDaoBase.getUpdateBuilder(Mockito.any())).thenReturn(mockUpdateBuilder); + Mockito.doReturn(5).when(highAvailabilityDaoImpl).update(Mockito.any(UpdateBuilder.class), Mockito.any(), Mockito.nullable(Integer.class)); + highAvailabilityDaoImpl.markServerPendingWorksAsInvestigating(1L); + Mockito.verify(mockSearch).create(); + Mockito.verify(mockSearchCriteria).setParameters(Mockito.eq("server"), Mockito.eq(1L)); + Mockito.verify(mockSearchCriteria).setParameters(Mockito.eq("step"), Mockito.eq(HighAvailabilityManager.Step.Done), Mockito.eq(HighAvailabilityManager.Step.Cancelled)); + Assert.assertEquals(HighAvailabilityManager.Step.Investigating, haWorkVO.getStep()); // Ensure the step is set correctly + Mockito.verify(highAvailabilityDaoImpl).update(Mockito.eq(mockUpdateBuilder), Mockito.eq(mockSearchCriteria), Mockito.isNull()); + } + } }