forceha: fix two issues when (1)stop vm from inside (2) force remove host (#4647)

* forceha: fix vm is not started if it is poweroff from inside

steps to reproduce the issue
(1) make sure force.ha is true in global setting. if not, change it to true, and restart mgt server
(2) create a service offering , ha is not enabled
(3) create a vm
(4) log into the vm, and power off via cli.

expected result: vm is started again by cloudstack
actual result: vm is not started.

* forceha: fix vms are still running if host is force-removed

when host can be force removed, however vms are stopped in cloudstack, but not stopped on host
```
(localcloud) 🐱 > delete host id="a5625393-444d-4d0a-b31d-62baf88a8be1" forced=true
{
  "success": true
}```

after some minutes, vms are still runnning on host
```
root@mgt01:~# ssh node63 virsh list
 Id   Name        State
---------------------------
 1    i-2-19-VM   running
 2    i-2-11-VM   running
```

error message are
```
Cannot transmit host 2 to Enabled state
com.cloud.utils.fsm.NoTransitionException: No next resource state found for current state = Enabled event = DeleteHost
        at com.cloud.resource.ResourceManagerImpl.resourceStateTransitTo(ResourceManagerImpl.java:1216)
        at com.cloud.resource.ResourceManagerImpl$1.doInTransactionWithoutResult(ResourceManagerImpl.java:907)
```

* forceha: Make ForceHA dynamic
This commit is contained in:
Wei Zhou 2021-05-14 19:44:39 +02:00 committed by GitHub
parent 6b1c94ea3e
commit e2183ed666
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 33 additions and 31 deletions

View File

@ -114,6 +114,7 @@ public enum ResourceState {
s_fsm.addTransition(ResourceState.Enabled, Event.Disable, ResourceState.Disabled);
s_fsm.addTransition(ResourceState.Enabled, Event.AdminAskMaintenance, ResourceState.PrepareForMaintenance);
s_fsm.addTransition(ResourceState.Enabled, Event.InternalEnterMaintenance, ResourceState.Maintenance);
s_fsm.addTransition(ResourceState.Enabled, Event.DeleteHost, ResourceState.Disabled);
s_fsm.addTransition(ResourceState.Disabled, Event.Enable, ResourceState.Enabled);
s_fsm.addTransition(ResourceState.Disabled, Event.Disable, ResourceState.Disabled);
s_fsm.addTransition(ResourceState.Disabled, Event.InternalCreated, ResourceState.Disabled);

View File

@ -32,7 +32,7 @@ import java.util.List;
*/
public interface HighAvailabilityManager extends Manager {
ConfigKey<Boolean> ForceHA = new ConfigKey<>("Advanced", Boolean.class, "force.ha", "false",
public ConfigKey<Boolean> ForceHA = new ConfigKey<>("Advanced", Boolean.class, "force.ha", "false",
"Force High-Availability to happen even if the VM says no.", true, Cluster);
ConfigKey<Integer> HAWorkers = new ConfigKey<>("Advanced", Integer.class, "ha.workers", "5",

View File

@ -4498,7 +4498,7 @@ public class VirtualMachineManagerImpl extends ManagerBase implements VirtualMac
String.format("VM %s is at %s and we received a %s report while there is no pending jobs on it"
, vm.getInstanceName(), vm.getState(), vm.getPowerState()));
}
if(vm.isHaEnabled() && vm.getState() == State.Running
if((HighAvailabilityManager.ForceHA.value() || vm.isHaEnabled()) && vm.getState() == State.Running
&& HaVmRestartHostUp.value()
&& vm.getHypervisorType() != HypervisorType.VMware
&& vm.getHypervisorType() != HypervisorType.Hyperv) {

View File

@ -197,7 +197,6 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
int _maxRetries;
long _timeBetweenFailures;
long _timeBetweenCleanups;
boolean _forceHA;
String _haTag = null;
protected HighAvailabilityManagerImpl() {
@ -364,7 +363,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
alertType = AlertManager.AlertType.ALERT_TYPE_SSVM;
}
if (!(_forceHA || vm.isHaEnabled())) {
if (!(ForceHA.value() || vm.isHaEnabled())) {
String hostDesc = "id:" + vm.getHostId() + ", availability zone id:" + vm.getDataCenterId() + ", pod id:" + vm.getPodIdToDeployIn();
_alertMgr.sendAlert(alertType, vm.getDataCenterId(), vm.getPodIdToDeployIn(), "VM (name: " + vm.getHostName() + ", id: " + vm.getId() +
") stopped unexpectedly on host " + hostDesc, "Virtual Machine " + vm.getHostName() + " (id: " + vm.getId() + ") running on host [" + vm.getHostId() +
@ -569,7 +568,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
vm = _itMgr.findById(vm.getId());
if (!_forceHA && !vm.isHaEnabled()) {
if (!ForceHA.value() && !vm.isHaEnabled()) {
if (s_logger.isDebugEnabled()) {
s_logger.debug("VM is not HA enabled so we're done.");
}
@ -861,7 +860,6 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements Configur
_workers[i] = new WorkerThread("HA-Worker-" + i);
}
_forceHA = ForceHA.value();
_timeToSleep = TimeToSleep.value() * SECONDS_TO_MILLISECONDS_FACTOR;
_maxRetries = MigrationMaxRetries.value();
_timeBetweenFailures = TimeBetweenFailures.value() * SECONDS_TO_MILLISECONDS_FACTOR;

View File

@ -2325,34 +2325,32 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
s_logger.debug("Deleting Host: " + host.getId() + " Guid:" + host.getGuid());
}
if (forceDestroyStorage) {
final StoragePoolVO storagePool = _storageMgr.findLocalStorageOnHost(host.getId());
if (forceDestroyStorage && storagePool != null) {
// put local storage into mainenance mode, will set all the VMs on
// this local storage into stopped state
final StoragePoolVO storagePool = _storageMgr.findLocalStorageOnHost(host.getId());
if (storagePool != null) {
if (storagePool.getStatus() == StoragePoolStatus.Up || storagePool.getStatus() == StoragePoolStatus.ErrorInMaintenance) {
try {
final StoragePool pool = _storageSvr.preparePrimaryStorageForMaintenance(storagePool.getId());
if (pool == null) {
s_logger.debug("Failed to set primary storage into maintenance mode");
if (storagePool.getStatus() == StoragePoolStatus.Up || storagePool.getStatus() == StoragePoolStatus.ErrorInMaintenance) {
try {
final StoragePool pool = _storageSvr.preparePrimaryStorageForMaintenance(storagePool.getId());
if (pool == null) {
s_logger.debug("Failed to set primary storage into maintenance mode");
throw new UnableDeleteHostException("Failed to set primary storage into maintenance mode");
}
} catch (final Exception e) {
s_logger.debug("Failed to set primary storage into maintenance mode, due to: " + e.toString());
throw new UnableDeleteHostException("Failed to set primary storage into maintenance mode, due to: " + e.toString());
throw new UnableDeleteHostException("Failed to set primary storage into maintenance mode");
}
} catch (final Exception e) {
s_logger.debug("Failed to set primary storage into maintenance mode, due to: " + e.toString());
throw new UnableDeleteHostException("Failed to set primary storage into maintenance mode, due to: " + e.toString());
}
}
final List<VMInstanceVO> vmsOnLocalStorage = _storageMgr.listByStoragePool(storagePool.getId());
for (final VMInstanceVO vm : vmsOnLocalStorage) {
try {
_vmMgr.destroy(vm.getUuid(), false);
} catch (final Exception e) {
final String errorMsg = "There was an error Destory the vm: " + vm + " as a part of hostDelete id=" + host.getId();
s_logger.debug(errorMsg, e);
throw new UnableDeleteHostException(errorMsg + "," + e.getMessage());
}
final List<VMInstanceVO> vmsOnLocalStorage = _storageMgr.listByStoragePool(storagePool.getId());
for (final VMInstanceVO vm : vmsOnLocalStorage) {
try {
_vmMgr.destroy(vm.getUuid(), false);
} catch (final Exception e) {
final String errorMsg = "There was an error Destory the vm: " + vm + " as a part of hostDelete id=" + host.getId();
s_logger.debug(errorMsg, e);
throw new UnableDeleteHostException(errorMsg + "," + e.getMessage());
}
}
} else {
@ -2362,17 +2360,22 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
if (isForced) {
// Stop HA disabled vms and HA enabled vms in Stopping state
// Restart HA enabled vms
try {
resourceStateTransitTo(host, ResourceState.Event.DeleteHost, host.getId());
} catch (final NoTransitionException e) {
s_logger.debug("Cannot transmit host " + host.getId() + " to Disabled state", e);
}
for (final VMInstanceVO vm : vms) {
if (!vm.isHaEnabled() || vm.getState() == State.Stopping) {
if ((! HighAvailabilityManager.ForceHA.value() && !vm.isHaEnabled()) || vm.getState() == State.Stopping) {
s_logger.debug("Stopping vm: " + vm + " as a part of deleteHost id=" + host.getId());
try {
_vmMgr.advanceStop(vm.getUuid(), false);
_haMgr.scheduleStop(vm, host.getId(), WorkType.Stop);
} catch (final Exception e) {
final String errorMsg = "There was an error stopping the vm: " + vm + " as a part of hostDelete id=" + host.getId();
s_logger.debug(errorMsg, e);
throw new UnableDeleteHostException(errorMsg + "," + e.getMessage());
}
} else if (vm.isHaEnabled() && (vm.getState() == State.Running || vm.getState() == State.Starting)) {
} else if ((HighAvailabilityManager.ForceHA.value() || vm.isHaEnabled()) && (vm.getState() == State.Running || vm.getState() == State.Starting)) {
s_logger.debug("Scheduling restart for vm: " + vm + " " + vm.getState() + " on the host id=" + host.getId());
_haMgr.scheduleRestart(vm, false);
}