From e2183ed666d202cb5c83b3a640160bea52ab18fa Mon Sep 17 00:00:00 2001 From: Wei Zhou Date: Fri, 14 May 2021 19:44:39 +0200 Subject: [PATCH] forceha: fix two issues when (1)stop vm from inside (2) force remove host (#4647) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * forceha: fix vm is not started if it is powered off from inside steps to reproduce the issue (1) make sure force.ha is true in global settings. if not, change it to true, and restart mgt server (2) create a service offering in which ha is not enabled (3) create a vm (4) log into the vm, and power off via cli. expected result: vm is started again by cloudstack actual result: vm is not started. * forceha: fix vms are still running if host is force-removed the host can be force-removed; however, vms are stopped in cloudstack but not stopped on the host ``` (localcloud) 🐱 > delete host id="a5625393-444d-4d0a-b31d-62baf88a8be1" forced=true { "success": true }``` after some minutes, vms are still running on host ``` root@mgt01:~# ssh node63 virsh list Id Name State --------------------------- 1 i-2-19-VM running 2 i-2-11-VM running ``` error messages are ``` Cannot transmit host 2 to Enabled state com.cloud.utils.fsm.NoTransitionException: No next resource state found for current state = Enabled event = DeleteHost at com.cloud.resource.ResourceManagerImpl.resourceStateTransitTo(ResourceManagerImpl.java:1216) at com.cloud.resource.ResourceManagerImpl$1.doInTransactionWithoutResult(ResourceManagerImpl.java:907) ``` * forceha: Make ForceHA dynamic --- .../com/cloud/resource/ResourceState.java | 1 + .../com/cloud/ha/HighAvailabilityManager.java | 2 +- .../cloud/vm/VirtualMachineManagerImpl.java | 2 +- .../cloud/ha/HighAvailabilityManagerImpl.java | 6 +-- .../cloud/resource/ResourceManagerImpl.java | 53 ++++++++++--------- 5 files changed, 33 insertions(+), 31 deletions(-) diff --git a/api/src/main/java/com/cloud/resource/ResourceState.java 
b/api/src/main/java/com/cloud/resource/ResourceState.java index 9b3bafe28db..6e0fa909230 100644 --- a/api/src/main/java/com/cloud/resource/ResourceState.java +++ b/api/src/main/java/com/cloud/resource/ResourceState.java @@ -114,6 +114,7 @@ public enum ResourceState { s_fsm.addTransition(ResourceState.Enabled, Event.Disable, ResourceState.Disabled); s_fsm.addTransition(ResourceState.Enabled, Event.AdminAskMaintenance, ResourceState.PrepareForMaintenance); s_fsm.addTransition(ResourceState.Enabled, Event.InternalEnterMaintenance, ResourceState.Maintenance); + s_fsm.addTransition(ResourceState.Enabled, Event.DeleteHost, ResourceState.Disabled); s_fsm.addTransition(ResourceState.Disabled, Event.Enable, ResourceState.Enabled); s_fsm.addTransition(ResourceState.Disabled, Event.Disable, ResourceState.Disabled); s_fsm.addTransition(ResourceState.Disabled, Event.InternalCreated, ResourceState.Disabled); diff --git a/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java b/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java index 18ceddbf68d..1dd999dad97 100644 --- a/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java +++ b/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java @@ -32,7 +32,7 @@ import java.util.List; */ public interface HighAvailabilityManager extends Manager { - ConfigKey ForceHA = new ConfigKey<>("Advanced", Boolean.class, "force.ha", "false", + public ConfigKey ForceHA = new ConfigKey<>("Advanced", Boolean.class, "force.ha", "false", "Force High-Availability to happen even if the VM says no.", true, Cluster); ConfigKey HAWorkers = new ConfigKey<>("Advanced", Integer.class, "ha.workers", "5", diff --git a/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachineManagerImpl.java b/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachineManagerImpl.java index 2fb0a622653..830e8a1e792 100755 --- 
a/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachineManagerImpl.java +++ b/engine/orchestration/src/main/java/com/cloud/vm/VirtualMachineManagerImpl.java @@ -4498,7 +4498,7 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac String.format("VM %s is at %s and we received a %s report while there is no pending jobs on it" , vm.getInstanceName(), vm.getState(), vm.getPowerState())); } - if(vm.isHaEnabled() && vm.getState() == State.Running + if((HighAvailabilityManager.ForceHA.value() || vm.isHaEnabled()) && vm.getState() == State.Running && HaVmRestartHostUp.value() && vm.getHypervisorType() != HypervisorType.VMware && vm.getHypervisorType() != HypervisorType.Hyperv) { diff --git a/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java b/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java index b05e008546b..cde5594215e 100644 --- a/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java +++ b/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java @@ -197,7 +197,6 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur int _maxRetries; long _timeBetweenFailures; long _timeBetweenCleanups; - boolean _forceHA; String _haTag = null; protected HighAvailabilityManagerImpl() { @@ -364,7 +363,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur alertType = AlertManager.AlertType.ALERT_TYPE_SSVM; } - if (!(_forceHA || vm.isHaEnabled())) { + if (!(ForceHA.value() || vm.isHaEnabled())) { String hostDesc = "id:" + vm.getHostId() + ", availability zone id:" + vm.getDataCenterId() + ", pod id:" + vm.getPodIdToDeployIn(); _alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), "VM (name: " + vm.getHostName() + ", id: " + vm.getId() + ") stopped unexpectedly on host " + hostDesc, "Virtual Machine " + vm.getHostName() + " (id: " + vm.getId() + ") running on host [" + vm.getHostId() + @@ -569,7 +568,7 @@ public 
class HighAvailabilityManagerImpl extends ManagerBase implements Configur vm = _itMgr.findById(vm.getId()); - if (!_forceHA && !vm.isHaEnabled()) { + if (!ForceHA.value() && !vm.isHaEnabled()) { if (s_logger.isDebugEnabled()) { s_logger.debug("VM is not HA enabled so we're done."); } @@ -861,7 +860,6 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur _workers[i] = new WorkerThread("HA-Worker-" + i); } - _forceHA = ForceHA.value(); _timeToSleep = TimeToSleep.value() * SECONDS_TO_MILLISECONDS_FACTOR; _maxRetries = MigrationMaxRetries.value(); _timeBetweenFailures = TimeBetweenFailures.value() * SECONDS_TO_MILLISECONDS_FACTOR; diff --git a/server/src/main/java/com/cloud/resource/ResourceManagerImpl.java b/server/src/main/java/com/cloud/resource/ResourceManagerImpl.java index 6945d6f4368..c8fe5780cf5 100755 --- a/server/src/main/java/com/cloud/resource/ResourceManagerImpl.java +++ b/server/src/main/java/com/cloud/resource/ResourceManagerImpl.java @@ -2325,34 +2325,32 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, s_logger.debug("Deleting Host: " + host.getId() + " Guid:" + host.getGuid()); } - if (forceDestroyStorage) { + final StoragePoolVO storagePool = _storageMgr.findLocalStorageOnHost(host.getId()); + if (forceDestroyStorage && storagePool != null) { // put local storage into mainenance mode, will set all the VMs on // this local storage into stopped state - final StoragePoolVO storagePool = _storageMgr.findLocalStorageOnHost(host.getId()); - if (storagePool != null) { - if (storagePool.getStatus() == StoragePoolStatus.Up || storagePool.getStatus() == StoragePoolStatus.ErrorInMaintenance) { - try { - final StoragePool pool = _storageSvr.preparePrimaryStorageForMaintenance(storagePool.getId()); - if (pool == null) { - s_logger.debug("Failed to set primary storage into maintenance mode"); + if (storagePool.getStatus() == StoragePoolStatus.Up || storagePool.getStatus() == 
StoragePoolStatus.ErrorInMaintenance) { + try { + final StoragePool pool = _storageSvr.preparePrimaryStorageForMaintenance(storagePool.getId()); + if (pool == null) { + s_logger.debug("Failed to set primary storage into maintenance mode"); - throw new UnableDeleteHostException("Failed to set primary storage into maintenance mode"); - } - } catch (final Exception e) { - s_logger.debug("Failed to set primary storage into maintenance mode, due to: " + e.toString()); - throw new UnableDeleteHostException("Failed to set primary storage into maintenance mode, due to: " + e.toString()); + throw new UnableDeleteHostException("Failed to set primary storage into maintenance mode"); } + } catch (final Exception e) { + s_logger.debug("Failed to set primary storage into maintenance mode, due to: " + e.toString()); + throw new UnableDeleteHostException("Failed to set primary storage into maintenance mode, due to: " + e.toString()); } + } - final List vmsOnLocalStorage = _storageMgr.listByStoragePool(storagePool.getId()); - for (final VMInstanceVO vm : vmsOnLocalStorage) { - try { - _vmMgr.destroy(vm.getUuid(), false); - } catch (final Exception e) { - final String errorMsg = "There was an error Destory the vm: " + vm + " as a part of hostDelete id=" + host.getId(); - s_logger.debug(errorMsg, e); - throw new UnableDeleteHostException(errorMsg + "," + e.getMessage()); - } + final List vmsOnLocalStorage = _storageMgr.listByStoragePool(storagePool.getId()); + for (final VMInstanceVO vm : vmsOnLocalStorage) { + try { + _vmMgr.destroy(vm.getUuid(), false); + } catch (final Exception e) { + final String errorMsg = "There was an error Destory the vm: " + vm + " as a part of hostDelete id=" + host.getId(); + s_logger.debug(errorMsg, e); + throw new UnableDeleteHostException(errorMsg + "," + e.getMessage()); } } } else { @@ -2362,17 +2360,22 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, if (isForced) { // Stop HA disabled vms and HA enabled vms in 
Stopping state // Restart HA enabled vms + try { + resourceStateTransitTo(host, ResourceState.Event.DeleteHost, host.getId()); + } catch (final NoTransitionException e) { + s_logger.debug("Cannot transmit host " + host.getId() + " to Disabled state", e); + } for (final VMInstanceVO vm : vms) { - if (!vm.isHaEnabled() || vm.getState() == State.Stopping) { + if ((! HighAvailabilityManager.ForceHA.value() && !vm.isHaEnabled()) || vm.getState() == State.Stopping) { s_logger.debug("Stopping vm: " + vm + " as a part of deleteHost id=" + host.getId()); try { - _vmMgr.advanceStop(vm.getUuid(), false); + _haMgr.scheduleStop(vm, host.getId(), WorkType.Stop); } catch (final Exception e) { final String errorMsg = "There was an error stopping the vm: " + vm + " as a part of hostDelete id=" + host.getId(); s_logger.debug(errorMsg, e); throw new UnableDeleteHostException(errorMsg + "," + e.getMessage()); } - } else if (vm.isHaEnabled() && (vm.getState() == State.Running || vm.getState() == State.Starting)) { + } else if ((HighAvailabilityManager.ForceHA.value() || vm.isHaEnabled()) && (vm.getState() == State.Running || vm.getState() == State.Starting)) { s_logger.debug("Scheduling restart for vm: " + vm + " " + vm.getState() + " on the host id=" + host.getId()); _haMgr.scheduleRestart(vm, false); }