diff --git a/api/src/main/java/com/cloud/resource/ResourceState.java b/api/src/main/java/com/cloud/resource/ResourceState.java index d952afa0b7d..9b3bafe28db 100644 --- a/api/src/main/java/com/cloud/resource/ResourceState.java +++ b/api/src/main/java/com/cloud/resource/ResourceState.java @@ -16,23 +16,33 @@ // under the License. package com.cloud.resource; +import java.util.Arrays; import java.util.List; import java.util.Set; import com.cloud.utils.fsm.StateMachine; public enum ResourceState { - Creating, Enabled, Disabled, PrepareForMaintenance, ErrorInMaintenance, Maintenance, Error; + Creating, + Enabled, + Disabled, + ErrorInPrepareForMaintenance, + PrepareForMaintenance, + ErrorInMaintenance, + Maintenance, + Error; public enum Event { InternalCreated("Resource is created"), Enable("Admin enables"), Disable("Admin disables"), - AdminAskMaintenace("Admin asks to enter maintenance"), + AdminAskMaintenance("Admin asks to enter maintenance"), AdminCancelMaintenance("Admin asks to cancel maintenance"), InternalEnterMaintenance("Resource enters maintenance"), UpdatePassword("Admin updates password of host"), - UnableToMigrate("Management server migrates VM failed"), + UnableToMigrate("Migration of VM failed, such as from scheduled HAWork"), + UnableToMaintain("Management server has exhausted all legal operations and attempts to put into maintenance has failed"), + ErrorsCorrected("Errors were corrected on a resource attempting to enter maintenance but encountered errors"), Error("An internal error happened"), DeleteHost("Admin delete a host"), @@ -84,6 +94,16 @@ public enum ResourceState { return strs; } + public static boolean isMaintenanceState(ResourceState state) { + return Arrays.asList(ResourceState.Maintenance, ResourceState.ErrorInMaintenance, + ResourceState.PrepareForMaintenance, ResourceState.ErrorInPrepareForMaintenance).contains(state); + } + + public static boolean canAttemptMaintenance(ResourceState state) { + return 
!Arrays.asList(ResourceState.Maintenance, ResourceState.PrepareForMaintenance, + ResourceState.ErrorInPrepareForMaintenance).contains(state); + } + protected static final StateMachine s_fsm = new StateMachine(); static { s_fsm.addTransition(null, Event.InternalCreated, ResourceState.Enabled); @@ -92,22 +112,31 @@ public enum ResourceState { s_fsm.addTransition(ResourceState.Enabled, Event.Enable, ResourceState.Enabled); s_fsm.addTransition(ResourceState.Enabled, Event.InternalCreated, ResourceState.Enabled); s_fsm.addTransition(ResourceState.Enabled, Event.Disable, ResourceState.Disabled); - s_fsm.addTransition(ResourceState.Enabled, Event.AdminAskMaintenace, ResourceState.PrepareForMaintenance); + s_fsm.addTransition(ResourceState.Enabled, Event.AdminAskMaintenance, ResourceState.PrepareForMaintenance); s_fsm.addTransition(ResourceState.Enabled, Event.InternalEnterMaintenance, ResourceState.Maintenance); s_fsm.addTransition(ResourceState.Disabled, Event.Enable, ResourceState.Enabled); s_fsm.addTransition(ResourceState.Disabled, Event.Disable, ResourceState.Disabled); s_fsm.addTransition(ResourceState.Disabled, Event.InternalCreated, ResourceState.Disabled); s_fsm.addTransition(ResourceState.PrepareForMaintenance, Event.InternalEnterMaintenance, ResourceState.Maintenance); s_fsm.addTransition(ResourceState.PrepareForMaintenance, Event.AdminCancelMaintenance, ResourceState.Enabled); - s_fsm.addTransition(ResourceState.PrepareForMaintenance, Event.UnableToMigrate, ResourceState.ErrorInMaintenance); + s_fsm.addTransition(ResourceState.PrepareForMaintenance, Event.UnableToMigrate, ResourceState.ErrorInPrepareForMaintenance); + s_fsm.addTransition(ResourceState.PrepareForMaintenance, Event.UnableToMaintain, ResourceState.ErrorInMaintenance); s_fsm.addTransition(ResourceState.PrepareForMaintenance, Event.InternalCreated, ResourceState.PrepareForMaintenance); s_fsm.addTransition(ResourceState.Maintenance, Event.AdminCancelMaintenance, ResourceState.Enabled); 
s_fsm.addTransition(ResourceState.Maintenance, Event.InternalCreated, ResourceState.Maintenance); s_fsm.addTransition(ResourceState.Maintenance, Event.DeleteHost, ResourceState.Disabled); + s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.InternalCreated, ResourceState.ErrorInPrepareForMaintenance); + s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.Disable, ResourceState.Disabled); + s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.DeleteHost, ResourceState.Disabled); + s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.InternalEnterMaintenance, ResourceState.Maintenance); + s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.AdminCancelMaintenance, ResourceState.Enabled); + s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.UnableToMigrate, ResourceState.ErrorInPrepareForMaintenance); + s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.UnableToMaintain, ResourceState.ErrorInMaintenance); + s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.ErrorsCorrected, ResourceState.PrepareForMaintenance); s_fsm.addTransition(ResourceState.ErrorInMaintenance, Event.InternalCreated, ResourceState.ErrorInMaintenance); + s_fsm.addTransition(ResourceState.ErrorInMaintenance, Event.AdminAskMaintenance, ResourceState.PrepareForMaintenance); s_fsm.addTransition(ResourceState.ErrorInMaintenance, Event.Disable, ResourceState.Disabled); s_fsm.addTransition(ResourceState.ErrorInMaintenance, Event.DeleteHost, ResourceState.Disabled); - s_fsm.addTransition(ResourceState.ErrorInMaintenance, Event.InternalEnterMaintenance, ResourceState.Maintenance); s_fsm.addTransition(ResourceState.ErrorInMaintenance, Event.AdminCancelMaintenance, ResourceState.Enabled); s_fsm.addTransition(ResourceState.Error, Event.InternalCreated, ResourceState.Error); s_fsm.addTransition(ResourceState.Disabled, Event.DeleteHost, ResourceState.Disabled); diff 
--git a/api/src/main/java/org/apache/cloudstack/api/command/admin/host/PrepareForMaintenanceCmd.java b/api/src/main/java/org/apache/cloudstack/api/command/admin/host/PrepareForMaintenanceCmd.java index e49aabc49d4..f60812821d6 100644 --- a/api/src/main/java/org/apache/cloudstack/api/command/admin/host/PrepareForMaintenanceCmd.java +++ b/api/src/main/java/org/apache/cloudstack/api/command/admin/host/PrepareForMaintenanceCmd.java @@ -16,8 +16,6 @@ // under the License. package org.apache.cloudstack.api.command.admin.host; -import org.apache.log4j.Logger; - import org.apache.cloudstack.api.APICommand; import org.apache.cloudstack.api.ApiCommandJobType; import org.apache.cloudstack.api.ApiConstants; @@ -27,10 +25,12 @@ import org.apache.cloudstack.api.Parameter; import org.apache.cloudstack.api.ServerApiException; import org.apache.cloudstack.api.response.HostResponse; import org.apache.cloudstack.context.CallContext; +import org.apache.log4j.Logger; import com.cloud.event.EventTypes; import com.cloud.host.Host; import com.cloud.user.Account; +import com.cloud.utils.exception.CloudRuntimeException; @APICommand(name = "prepareHostForMaintenance", description = "Prepares a host for maintenance.", responseObject = HostResponse.class, requestHasSensitiveInfo = false, responseHasSensitiveInfo = false) @@ -99,13 +99,17 @@ public class PrepareForMaintenanceCmd extends BaseAsyncCmd { @Override public void execute() { - Host result = _resourceService.maintain(this); - if (result != null) { - HostResponse response = _responseGenerator.createHostResponse(result); - response.setResponseName("host"); - this.setResponseObject(response); - } else { - throw new ServerApiException(ApiErrorCode.INTERNAL_ERROR, "Failed to prepare host for maintenance"); + try { + Host result = _resourceService.maintain(this); + if (result != null) { + HostResponse response = _responseGenerator.createHostResponse(result); + response.setResponseName("host"); + this.setResponseObject(response); + } else { + 
throw new ServerApiException(ApiErrorCode.INTERNAL_ERROR, "Failed to prepare host for maintenance"); + } + } catch (CloudRuntimeException exception) { + throw new ServerApiException(ApiErrorCode.INTERNAL_ERROR, "Failed to prepare host for maintenance due to: " + exception.getMessage()); } } } diff --git a/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java b/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java index ecfb6f65701..8c63b3a6777 100644 --- a/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java +++ b/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java @@ -102,6 +102,7 @@ public interface HighAvailabilityManager extends Manager { boolean hasPendingHaWork(long vmId); + boolean hasPendingMigrationsWork(long vmId); /** * @return */ diff --git a/engine/components-api/src/main/java/com/cloud/resource/ResourceManager.java b/engine/components-api/src/main/java/com/cloud/resource/ResourceManager.java index b66f7923b4d..387fa7f6415 100755 --- a/engine/components-api/src/main/java/com/cloud/resource/ResourceManager.java +++ b/engine/components-api/src/main/java/com/cloud/resource/ResourceManager.java @@ -47,11 +47,6 @@ import org.apache.cloudstack.framework.config.Configurable; */ public interface ResourceManager extends ResourceService, Configurable { - ConfigKey HostMaintenanceRetries = new ConfigKey<>("Advanced", Integer.class, - "host.maintenance.retries","20", - "Number of retries when preparing a host into Maintenance Mode is faulty before failing", - true, ConfigKey.Scope.Cluster); - ConfigKey KvmSshToAgentEnabled = new ConfigKey<>("Advanced", Boolean.class, "kvm.ssh.to.agent","true", "Number of retries when preparing a host into Maintenance Mode is faulty before failing", @@ -97,7 +92,7 @@ public interface ResourceManager extends ResourceService, Configurable { boolean umanageHost(long hostId); - boolean maintenanceFailed(long hostId); + boolean 
migrateAwayFailed(long hostId, long vmId); public boolean maintain(final long hostId) throws AgentUnavailableException; diff --git a/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java b/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java index 60911319e33..2ee583d1db7 100644 --- a/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java +++ b/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java @@ -1583,7 +1583,7 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl final HostVO h = sc.find(); if (h != null) { final ResourceState resourceState = h.getResourceState(); - if (resourceState == ResourceState.Disabled || resourceState == ResourceState.Maintenance || resourceState == ResourceState.ErrorInMaintenance) { + if (resourceState == ResourceState.Disabled || resourceState == ResourceState.Maintenance) { /* * Host is in non-operation state, so no investigation and direct put agent to Disconnected */ @@ -1605,7 +1605,9 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl } final QueryBuilder sc = QueryBuilder.create(HostVO.class); - sc.and(sc.entity().getResourceState(), Op.IN, ResourceState.PrepareForMaintenance, ResourceState.ErrorInMaintenance); + sc.and(sc.entity().getResourceState(), Op.IN, + ResourceState.PrepareForMaintenance, + ResourceState.ErrorInPrepareForMaintenance); final List hosts = sc.list(); for (final HostVO host : hosts) { diff --git a/engine/orchestration/src/main/java/org/apache/cloudstack/engine/datacenter/entity/api/db/EngineHostVO.java b/engine/orchestration/src/main/java/org/apache/cloudstack/engine/datacenter/entity/api/db/EngineHostVO.java index be1484f0bde..846b4157786 100644 --- a/engine/orchestration/src/main/java/org/apache/cloudstack/engine/datacenter/entity/api/db/EngineHostVO.java +++ 
b/engine/orchestration/src/main/java/org/apache/cloudstack/engine/datacenter/entity/api/db/EngineHostVO.java @@ -16,16 +16,10 @@ // under the License. package org.apache.cloudstack.engine.datacenter.entity.api.db; -import com.cloud.host.Status; -import com.cloud.hypervisor.Hypervisor.HypervisorType; -import com.cloud.resource.ResourceState; -import com.cloud.storage.Storage.StoragePoolType; -import com.cloud.utils.NumbersUtil; -import com.cloud.utils.db.GenericDao; -import com.cloud.utils.db.StateMachine; -import org.apache.cloudstack.api.Identity; -import org.apache.cloudstack.engine.datacenter.entity.api.DataCenterResourceEntity.State; -import org.apache.cloudstack.engine.datacenter.entity.api.DataCenterResourceEntity.State.Event; +import java.util.Date; +import java.util.List; +import java.util.Map; +import java.util.UUID; import javax.persistence.Column; import javax.persistence.DiscriminatorColumn; @@ -42,10 +36,18 @@ import javax.persistence.Table; import javax.persistence.Temporal; import javax.persistence.TemporalType; import javax.persistence.Transient; -import java.util.Date; -import java.util.List; -import java.util.Map; -import java.util.UUID; + +import org.apache.cloudstack.api.Identity; +import org.apache.cloudstack.engine.datacenter.entity.api.DataCenterResourceEntity.State; +import org.apache.cloudstack.engine.datacenter.entity.api.DataCenterResourceEntity.State.Event; + +import com.cloud.host.Status; +import com.cloud.hypervisor.Hypervisor.HypervisorType; +import com.cloud.resource.ResourceState; +import com.cloud.storage.Storage.StoragePoolType; +import com.cloud.utils.NumbersUtil; +import com.cloud.utils.db.GenericDao; +import com.cloud.utils.db.StateMachine; @Entity @Table(name = "host") @@ -730,7 +732,7 @@ public class EngineHostVO implements EngineHost, Identity { @Override public boolean isInMaintenanceStates() { - return (getResourceState() == ResourceState.Maintenance || getResourceState() == ResourceState.ErrorInMaintenance || 
getResourceState() == ResourceState.PrepareForMaintenance); + return ResourceState.isMaintenanceState(getResourceState()); } public long getUpdated() { diff --git a/engine/schema/src/main/java/com/cloud/host/HostVO.java b/engine/schema/src/main/java/com/cloud/host/HostVO.java index 7fd1e710185..f23435945d5 100644 --- a/engine/schema/src/main/java/com/cloud/host/HostVO.java +++ b/engine/schema/src/main/java/com/cloud/host/HostVO.java @@ -16,12 +16,11 @@ // under the License. package com.cloud.host; -import com.cloud.agent.api.VgpuTypesInfo; -import com.cloud.hypervisor.Hypervisor.HypervisorType; -import com.cloud.resource.ResourceState; -import com.cloud.storage.Storage.StoragePoolType; -import com.cloud.utils.NumbersUtil; -import com.cloud.utils.db.GenericDao; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; import javax.persistence.Column; import javax.persistence.DiscriminatorColumn; @@ -38,11 +37,13 @@ import javax.persistence.Table; import javax.persistence.Temporal; import javax.persistence.TemporalType; import javax.persistence.Transient; -import java.util.Date; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.UUID; + +import com.cloud.agent.api.VgpuTypesInfo; +import com.cloud.hypervisor.Hypervisor.HypervisorType; +import com.cloud.resource.ResourceState; +import com.cloud.storage.Storage.StoragePoolType; +import com.cloud.utils.NumbersUtil; +import com.cloud.utils.db.GenericDao; @Entity @Table(name = "host") @@ -714,9 +715,8 @@ public class HostVO implements Host { @Override public boolean isInMaintenanceStates() { - return (getResourceState() == ResourceState.Maintenance || getResourceState() == ResourceState.ErrorInMaintenance || getResourceState() == ResourceState.PrepareForMaintenance); + return ResourceState.isMaintenanceState(getResourceState()); } - @Override public boolean isDisabled() { return (getResourceState() == 
ResourceState.Disabled); diff --git a/engine/schema/src/main/resources/META-INF/db/schema-41300to41400-cleanup.sql b/engine/schema/src/main/resources/META-INF/db/schema-41300to41400-cleanup.sql index 57c4a611f0e..fe84077f72b 100644 --- a/engine/schema/src/main/resources/META-INF/db/schema-41300to41400-cleanup.sql +++ b/engine/schema/src/main/resources/META-INF/db/schema-41300to41400-cleanup.sql @@ -19,3 +19,4 @@ -- Schema upgrade cleanup from 4.13.0.0 to 4.14.0.0 --; +DELETE FROM `cloud`.`configuration` WHERE name = 'host.maintenance.retries'; diff --git a/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java b/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java index 49211f5eba3..348a4179f3f 100644 --- a/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java +++ b/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java @@ -28,17 +28,19 @@ import java.util.concurrent.TimeUnit; import javax.inject.Inject; import javax.naming.ConfigurationException; -import org.apache.log4j.Logger; -import org.apache.log4j.NDC; import org.apache.cloudstack.engine.orchestration.service.VolumeOrchestrationService; +import org.apache.cloudstack.framework.config.ConfigKey; +import org.apache.cloudstack.framework.config.Configurable; import org.apache.cloudstack.framework.config.dao.ConfigurationDao; import org.apache.cloudstack.managed.context.ManagedContext; import org.apache.cloudstack.managed.context.ManagedContextRunnable; +import org.apache.cloudstack.management.ManagementServerHost; +import org.apache.log4j.Logger; +import org.apache.log4j.NDC; import com.cloud.agent.AgentManager; import com.cloud.alert.AlertManager; import com.cloud.cluster.ClusterManagerListener; -import org.apache.cloudstack.management.ManagementServerHost; import com.cloud.configuration.Config; import com.cloud.dc.ClusterDetailsDao; import com.cloud.dc.DataCenterVO; @@ -101,9 +103,14 @@ import com.cloud.vm.dao.VMInstanceDao; * ha.retry.wait | time to wait 
before retrying the work item | seconds | 120 || || stop.retry.wait | time to wait * before retrying the stop | seconds | 120 || * } **/ -public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvailabilityManager, ClusterManagerListener { +public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvailabilityManager, ClusterManagerListener, Configurable { protected static final Logger s_logger = Logger.getLogger(HighAvailabilityManagerImpl.class); + private ConfigKey MaxRetries = new ConfigKey<>("Advanced", Integer.class, + "max.retries","5", + "Total number of attempts for trying migration of a VM.", + true, ConfigKey.Scope.Cluster); + WorkerThread[] _workers; boolean _stopped; long _timeToSleep; @@ -314,6 +321,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai if (vm.getHostId() != null) { final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated()); _haDao.persist(work); + s_logger.info("Scheduled migration work of VM " + vm.getUuid() + " from host " + _hostDao.findById(vm.getHostId()) + " with HAWork " + work); wakeupWorkers(); } return true; @@ -629,23 +637,32 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai public Long migrate(final HaWorkVO work) { long vmId = work.getInstanceId(); - long srcHostId = work.getHostId(); + + VMInstanceVO vm = _instanceDao.findById(vmId); + if (vm == null) { + s_logger.info("Unable to find vm: " + vmId + ", skipping migrate."); + return null; + } + s_logger.info("Migration attempt: for VM " + vm.getUuid() + "from host id " + srcHostId + + ". 
Starting attempt: " + (1 + work.getTimesTried()) + "/" + _maxRetries + " times."); try { work.setStep(Step.Migrating); _haDao.update(work.getId(), work); - VMInstanceVO vm = _instanceDao.findById(vmId); - if (vm == null) { - return null; - } // First try starting the vm with its original planner, if it doesn't succeed send HAPlanner as its an emergency. _itMgr.migrateAway(vm.getUuid(), srcHostId); return null; } catch (InsufficientServerCapacityException e) { - s_logger.warn("Insufficient capacity for migrating a VM."); - _resourceMgr.maintenanceFailed(srcHostId); + s_logger.warn("Migration attempt: Insufficient capacity for migrating a VM " + + vm.getUuid() + " from source host id " + srcHostId + + ". Exception: " + e.getMessage()); + _resourceMgr.migrateAwayFailed(srcHostId, vmId); return (System.currentTimeMillis() >> 10) + _migrateRetryInterval; + } catch (Exception e) { + s_logger.warn("Migration attempt: Unexpected exception occurred when attempting migration of " + + vm.getUuid() + e.getMessage()); + throw e; } } @@ -744,7 +761,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai @Override public void cancelScheduledMigrations(final HostVO host) { WorkType type = host.getType() == HostVO.Type.Storage ? 
WorkType.Stop : WorkType.Migration; - + s_logger.info("Canceling all scheduled migrations from host " + host.getUuid()); _haDao.deleteMigrationWorkItems(host.getId(), type, _serverId); } @@ -762,7 +779,6 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai } private void rescheduleWork(final HaWorkVO work, final long nextTime) { - s_logger.info("Rescheduling work " + work + " to try again at " + new Date(nextTime << 10)); work.setTimeToTry(nextTime); work.setTimesTried(work.getTimesTried() + 1); work.setServerId(null); @@ -803,7 +819,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai } if (nextTime == null) { - s_logger.info("Completed work " + work); + s_logger.info("Completed work " + work + ". Took " + (work.getTimesTried() + 1) + "/" + _maxRetries + " attempts."); work.setStep(Step.Done); } else { rescheduleWork(work, nextTime.longValue()); @@ -819,12 +835,18 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai VMInstanceVO vm = _instanceDao.findById(work.getInstanceId()); work.setUpdateTime(vm.getUpdated()); work.setPreviousState(vm.getState()); + } finally { + if (!Step.Done.equals(work.getStep())) { + if (work.getTimesTried() >= _maxRetries) { + s_logger.warn("Giving up, retried max " + work.getTimesTried() + "/" + _maxRetries + " times for work: " + work); + work.setStep(Step.Done); + } else { + s_logger.warn("Rescheduling work " + work + " to try again at " + new Date(work.getTimeToTry() << 10) + + ". Finished attempt " + work.getTimesTried() + "/" + _maxRetries + " times."); + } + } + _haDao.update(work.getId(), work); } - if (!Step.Done.equals(work.getStep()) && work.getTimesTried() >= _maxRetries) { - s_logger.warn("Giving up, retried max. 
times for work: " + work); - work.setStep(Step.Done); - } - _haDao.update(work.getId(), work); } @Override @@ -908,6 +930,16 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai return true; } + @Override + public String getConfigComponentName() { + return HighAvailabilityManagerImpl.class.getSimpleName(); + } + + @Override + public ConfigKey[] getConfigKeys() { + return new ConfigKey[] {MaxRetries}; + } + protected class CleanupTask extends ManagedContextRunnable { @Override protected void runInContext() { @@ -1004,4 +1036,18 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai List haWorks = _haDao.listPendingHaWorkForVm(vmId); return haWorks.size() > 0; } + + @Override + public boolean hasPendingMigrationsWork(long vmId) { + List haWorks = _haDao.listPendingMigrationsForVm(vmId); + for (HaWorkVO work : haWorks) { + if (work.getTimesTried() <= _maxRetries) { + return true; + } else { + s_logger.warn("HAWork Job of migration type " + work + " found in database which has max " + + "retries more than " + _maxRetries + " but still not in Done, Cancelled, or Error State"); + } + } + return false; + } } diff --git a/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java b/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java index 85135bb9794..e8a3e17f805 100644 --- a/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java +++ b/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDao.java @@ -83,4 +83,6 @@ public interface HighAvailabilityDao extends GenericDao { List listRunningHaWorkForVm(long vmId); List listPendingHaWorkForVm(long vmId); + + List listPendingMigrationsForVm(long vmId); } diff --git a/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java b/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java index 3d11eb04f53..56e24c36ec7 100644 --- a/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java +++ 
b/server/src/main/java/com/cloud/ha/dao/HighAvailabilityDaoImpl.java @@ -48,6 +48,7 @@ public class HighAvailabilityDaoImpl extends GenericDaoBase impl private final SearchBuilder FutureHaWorkSearch; private final SearchBuilder RunningHaWorkSearch; private final SearchBuilder PendingHaWorkSearch; + private final SearchBuilder MigratingWorkSearch; protected HighAvailabilityDaoImpl() { super(); @@ -112,6 +113,12 @@ public class HighAvailabilityDaoImpl extends GenericDaoBase impl PendingHaWorkSearch.and("type", PendingHaWorkSearch.entity().getType(), Op.EQ); PendingHaWorkSearch.and("step", PendingHaWorkSearch.entity().getStep(), Op.NIN); PendingHaWorkSearch.done(); + + MigratingWorkSearch = createSearchBuilder(); + MigratingWorkSearch.and("instance", MigratingWorkSearch.entity().getInstanceId(), Op.EQ); + MigratingWorkSearch.and("workType", MigratingWorkSearch.entity().getWorkType(), Op.EQ); + MigratingWorkSearch.and("step", MigratingWorkSearch.entity().getStep(), Op.NIN); + MigratingWorkSearch.done(); } @Override @@ -124,6 +131,16 @@ public class HighAvailabilityDaoImpl extends GenericDaoBase impl return search(sc, null); } + @Override + public List listPendingMigrationsForVm(long vmId) { + SearchCriteria sc = MigratingWorkSearch.create(); + sc.setParameters("instance", vmId); + sc.setParameters("workType", WorkType.Migration); + sc.setParameters("step", Step.Done, Step.Error, Step.Cancelled); + + return search(sc, null); + } + @Override public List listRunningHaWorkForVm(long vmId) { SearchCriteria sc = RunningHaWorkSearch.create(); diff --git a/server/src/main/java/com/cloud/resource/ResourceManagerImpl.java b/server/src/main/java/com/cloud/resource/ResourceManagerImpl.java index d07a4383d01..29f7e68e08c 100755 --- a/server/src/main/java/com/cloud/resource/ResourceManagerImpl.java +++ b/server/src/main/java/com/cloud/resource/ResourceManagerImpl.java @@ -26,7 +26,6 @@ import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Random; 
-import java.util.concurrent.ConcurrentHashMap; import javax.inject.Inject; import javax.naming.ConfigurationException; @@ -274,8 +273,6 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, private SearchBuilder _gpuAvailability; - private Map retryHostMaintenance = new ConcurrentHashMap<>(); - private void insertListener(final Integer event, final ResourceListener listener) { List lst = _lifeCycleListeners.get(event); if (lst == null) { @@ -1165,6 +1162,10 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, throw new InvalidParameterValueException("Host with id " + hostId.toString() + " doesn't exist"); } + if (!ResourceState.isMaintenanceState(host.getResourceState())) { + throw new CloudRuntimeException("Cannot perform cancelMaintenance when resource state is " + host.getResourceState() + ", hostId = " + hostId); + } + processResourceEvent(ResourceListener.EVENT_CANCEL_MAINTENANCE_BEFORE, hostId); final boolean success = cancelMaintenance(hostId); processResourceEvent(ResourceListener.EVENT_CANCEL_MAINTENANCE_AFTER, hostId); @@ -1212,6 +1213,12 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, private boolean doMaintain(final long hostId) { final HostVO host = _hostDao.findById(hostId); + s_logger.info("Maintenance: attempting maintenance of host " + host.getUuid()); + ResourceState hostState = host.getResourceState(); + if (!ResourceState.canAttemptMaintenance(hostState)) { + throw new CloudRuntimeException("Cannot perform maintain when resource state is " + hostState + ", hostId = " + hostId); + } + final MaintainAnswer answer = (MaintainAnswer)_agentMgr.easySend(hostId, new MaintainCommand()); if (answer == null || !answer.getResult()) { s_logger.warn("Unable to send MaintainCommand to host: " + hostId); @@ -1219,7 +1226,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, } try { - resourceStateTransitTo(host, 
ResourceState.Event.AdminAskMaintenace, _nodeId); + resourceStateTransitTo(host, ResourceState.Event.AdminAskMaintenance, _nodeId); } catch (final NoTransitionException e) { final String err = "Cannot transmit resource state of host " + host.getId() + " to " + ResourceState.Maintenance; s_logger.debug(err, e); @@ -1228,7 +1235,6 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, ActionEventUtils.onStartedActionEvent(CallContext.current().getCallingUserId(), CallContext.current().getCallingAccountId(), EventTypes.EVENT_MAINTENANCE_PREPARE, "starting maintenance for host " + hostId, true, 0); _agentMgr.pullAgentToMaintenance(hostId); - setHostMaintenanceRetries(host); /* TODO: move below to listener */ if (host.getType() == Host.Type.Routing) { @@ -1244,11 +1250,13 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, || _serviceOfferingDetailsDao.findDetail(vm.getServiceOfferingId(), GPU.Keys.vgpuType.toString()) != null) { // Migration is not supported for VGPU Vms so stop them. // for the last host in this cluster, stop all the VMs + s_logger.error("Maintenance: No hosts available for migrations. Scheduling shutdown instead of migrations."); _haMgr.scheduleStop(vm, hostId, WorkType.ForceStop); } else if (HypervisorType.LXC.equals(host.getHypervisorType()) && VirtualMachine.Type.User.equals(vm.getType())){ //Migration is not supported for LXC Vms. Schedule restart instead. 
_haMgr.scheduleRestart(vm, false); } else { + s_logger.info("Maintenance: scheduling migration of VM " + vm.getUuid() + " from host " + host.getUuid()); _haMgr.scheduleMigration(vm); } } @@ -1256,19 +1264,9 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, return true; } - /** - * Set retries for transiting the host into Maintenance - */ - protected void setHostMaintenanceRetries(HostVO host) { - Integer retries = HostMaintenanceRetries.valueIn(host.getClusterId()); - retryHostMaintenance.put(host.getId(), retries); - s_logger.debug(String.format("Setting the host %s (%s) retries for Maintenance mode: %s", - host.getId(), host.getName(), retries)); - } - @Override public boolean maintain(final long hostId) throws AgentUnavailableException { - final Boolean result = propagateResourceEvent(hostId, ResourceState.Event.AdminAskMaintenace); + final Boolean result = propagateResourceEvent(hostId, ResourceState.Event.AdminAskMaintenance); if (result != null) { return result; } @@ -1285,13 +1283,29 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, s_logger.debug("Unable to find host " + hostId); throw new InvalidParameterValueException("Unable to find host with ID: " + hostId + ". Please specify a valid host ID."); } + if (!ResourceState.canAttemptMaintenance(host.getResourceState())) { + throw new CloudRuntimeException("Host is already in state " + host.getResourceState() + ". 
Cannot recall for maintenance until resolved."); + } - if (_hostDao.countBy(host.getClusterId(), ResourceState.PrepareForMaintenance, ResourceState.ErrorInMaintenance) > 0) { - throw new InvalidParameterValueException("There are other servers in PrepareForMaintenance OR ErrorInMaintenance STATUS in cluster " + host.getClusterId()); + if (_hostDao.countBy(host.getClusterId(), ResourceState.PrepareForMaintenance, ResourceState.ErrorInPrepareForMaintenance) > 0) { + throw new CloudRuntimeException("There are other servers attempting migrations for maintenance. " + + "Found hosts in PrepareForMaintenance OR ErrorInPrepareForMaintenance STATUS in cluster " + host.getClusterId()); } if (_storageMgr.isLocalStorageActiveOnHost(host.getId())) { - throw new InvalidParameterValueException("There are active VMs using the host's local storage pool. Please stop all VMs on this host that use local storage."); + throw new CloudRuntimeException("There are active VMs using the host's local storage pool. Please stop all VMs on this host that use local storage."); + } + List migratingInVMs = _vmDao.findByHostInStates(hostId, State.Migrating); + if (migratingInVMs.size() > 0) { + throw new CloudRuntimeException("Host contains incoming VMs migrating. Please wait for them to complete before putting to maintenance."); + } + + if (_vmDao.findByHostInStates(hostId, State.Starting, State.Stopping).size() > 0) { + throw new CloudRuntimeException("Host contains VMs in starting/stopping state. Please wait for them to complete before putting to maintenance."); + } + + if (_vmDao.findByHostInStates(hostId, State.Error, State.Unknown).size() > 0) { + throw new CloudRuntimeException("Host contains VMs in error/unknown/shutdown state. Please fix errors to proceed."); } try { @@ -1331,19 +1345,6 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, } } - /** - * Set host into ErrorInMaintenance state, as errors occurred during VM migrations. 
Do the following: - * - Cancel scheduled migrations for those which have already failed - * - Configure VNC access for VMs (KVM hosts only) - */ - protected boolean setHostIntoErrorInMaintenance(HostVO host, List failedMigrations) throws NoTransitionException { - s_logger.debug("Unable to migrate " + failedMigrations.size() + " VM(s) from host " + host.getUuid()); - _haMgr.cancelScheduledMigrations(host); - configureVncAccessForKVMHostFailedMigrations(host, failedMigrations); - resourceStateTransitTo(host, ResourceState.Event.UnableToMigrate, _nodeId); - return false; - } - /** * Safely transit host into Maintenance mode */ @@ -1357,31 +1358,104 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, } /** - * Return true if host goes into Maintenance mode, only when: - * - No Running, Migrating or Failed migrations (host_id = last_host_id) for the host + * Set host into ErrorInMaintenance state, as errors occurred during VM migrations. Do the following: + * - Cancel scheduled migrations for those which have already failed + * - Configure VNC access for VMs (KVM hosts only) */ - protected boolean isHostInMaintenance(HostVO host, List runningVms, List migratingVms, List failedMigrations) throws NoTransitionException { - if (CollectionUtils.isEmpty(runningVms) && CollectionUtils.isEmpty(migratingVms)) { - return CollectionUtils.isEmpty(failedMigrations) ? 
- setHostIntoMaintenance(host) : - setHostIntoErrorInMaintenance(host, failedMigrations); - } else if (retryHostMaintenance.containsKey(host.getId())) { - Integer retriesLeft = retryHostMaintenance.get(host.getId()); - if (retriesLeft != null) { - if (retriesLeft <= 0) { - retryHostMaintenance.remove(host.getId()); - s_logger.debug(String.format("No retries left while preparing KVM host %s (%s) for Maintenance, " + - "please investigate this connection.", - host.getId(), host.getName())); - return setHostIntoErrorInMaintenance(host, failedMigrations); - } - retriesLeft--; - retryHostMaintenance.put(host.getId(), retriesLeft); - s_logger.debug(String.format("Retries left preparing KVM host %s (%s) for Maintenance: %s", - host.getId(), host.getName(), retriesLeft)); + protected boolean setHostIntoErrorInMaintenance(HostVO host, List errorVms) throws NoTransitionException { + s_logger.debug("Unable to migrate / fix errors for " + errorVms.size() + " VM(s) from host " + host.getUuid()); + _haMgr.cancelScheduledMigrations(host); + configureVncAccessForKVMHostFailedMigrations(host, errorVms); + resourceStateTransitTo(host, ResourceState.Event.UnableToMaintain, _nodeId); + return false; + } + + protected boolean setHostIntoErrorInPrepareForMaintenance(HostVO host, List errorVms) throws NoTransitionException { + s_logger.debug("Host " + host.getUuid() + " entering in PrepareForMaintenanceWithErrors state"); + configureVncAccessForKVMHostFailedMigrations(host, errorVms); + resourceStateTransitTo(host, ResourceState.Event.UnableToMigrate, _nodeId); + return false; + } + + protected boolean setHostIntoPrepareForMaintenanceAfterErrorsFixed(HostVO host) throws NoTransitionException { + s_logger.debug("Host " + host.getUuid() + " entering in PrepareForMaintenance state as any previous corrections have been fixed"); + resourceStateTransitTo(host, ResourceState.Event.ErrorsCorrected, _nodeId); + return false; + } + + /** + * Return true if host goes into Maintenance mode. 
There are various possibilities for VMs' states + * on a host. We need to track the various VM states on each run and accordingly transit to the + * appropriate state. + * + * We change states as follows - + * 1. If there are no VMs in running, migrating, starting, stopping, error, unknown states we can move + * to maintenance state. Note that there cannot be incoming migrations as the API Call prepare for + * maintenance checks incoming migrations before starting. + * 2. If there are errors (like migrating VMs, error VMs, etc) we mark as ErrorInPrepareForMaintenance but + * don't stop remaining migrations/ongoing legitimate operations. + * 3. If all migration retries, legitimate operations have finished we check for VMs on the host and if + * there are still VMs in error state or in running state or failed migrations we mark the host as + * ErrorInMaintenance state. + * 4. Lastly if there are no errors or failed migrations or running VMs but there are still pending + * legitimate operations and the host was in ErrorInPrepareForMaintenance, we push the host back + * to PrepareForMaintenance state. + */ + protected boolean attemptMaintain(HostVO host) throws NoTransitionException { + final long hostId = host.getId(); + + s_logger.info("Attempting maintenance for host " + host.getName()); + + // Step 0: First gather if VMs have pending HAWork for migration with retries left.
+ final List allVmsOnHost = _vmDao.listByHostId(hostId); + final boolean hasMigratingAwayVms = CollectionUtils.isNotEmpty(_vmDao.listVmsMigratingFromHost(hostId)); + boolean hasPendingMigrationRetries = false; + for (VMInstanceVO vmInstanceVO : allVmsOnHost) { + if (_haMgr.hasPendingMigrationsWork(vmInstanceVO.getId())) { + s_logger.info("Attempting maintenance for " + host + " found pending migration for VM " + vmInstanceVO); + hasPendingMigrationRetries = true; + break; } } + // Step 1: If there are no VMs in migrating, running, starting, stopping, error or unknown state we can safely move the host to maintenance. + if (!hasMigratingAwayVms && CollectionUtils.isEmpty(_vmDao.findByHostInStates(host.getId(), + State.Migrating, State.Running, State.Starting, State.Stopping, State.Error, State.Unknown))) { + if (hasPendingMigrationRetries) { + s_logger.error("There should not be pending retries VMs for this host as there are no running, migrating," + + "starting, stopping, error or unknown states on host " + host); + } + return setHostIntoMaintenance(host); + } + + // Step 2: Gather relevant VMs' states on the host and then based on them we can determine if + final List failedMigrations = new ArrayList<>(_vmDao.listNonMigratingVmsByHostEqualsLastHost(hostId)); + final List errorVms = new ArrayList<>(_vmDao.findByHostInStates(hostId, State.Unknown, State.Error)); + final boolean hasRunningVms = CollectionUtils.isNotEmpty(_vmDao.findByHostInStates(hostId, State.Running)); + final boolean hasFailedMigrations = CollectionUtils.isNotEmpty(failedMigrations); + final boolean hasVmsInFailureStates = CollectionUtils.isNotEmpty(errorVms); + final boolean hasStoppingVms = CollectionUtils.isNotEmpty(_vmDao.findByHostInStates(hostId, State.Stopping)); + errorVms.addAll(failedMigrations); + + // Step 3: If there are no pending migration retries but host still has running VMs or, + // host has VMs in failure state / failed migrations we move the host to ErrorInMaintenance state. 
+ if ((!hasPendingMigrationRetries && !hasMigratingAwayVms && hasRunningVms) || + (!hasRunningVms && !hasMigratingAwayVms && hasVmsInFailureStates)) { + return setHostIntoErrorInMaintenance(host, errorVms); + } + + // Step 4: IF there are pending migrations or ongoing retries left or stopping VMs and there were errors or failed + // migrations we put the host into ErrorInPrepareForMaintenance + if ((hasPendingMigrationRetries || hasMigratingAwayVms || hasStoppingVms) && (hasVmsInFailureStates || hasFailedMigrations)) { + return setHostIntoErrorInPrepareForMaintenance(host, errorVms); + } + + // Step 5: If there were previously errors found, but not anymore it means the operator has fixed errors and we put + // the host into PrepareForMaintenance state. + if (host.getResourceState() == ResourceState.ErrorInPrepareForMaintenance) { + return setHostIntoPrepareForMaintenanceAfterErrorsFixed(host); + } + return false; } @@ -1392,14 +1466,10 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, try { if (host.getType() != Host.Type.Storage) { - final List vos = _vmDao.listByHostId(hostId); - final List vosMigrating = _vmDao.listVmsMigratingFromHost(hostId); - final List failedVmMigrations = _vmDao.listNonMigratingVmsByHostEqualsLastHost(hostId); - - hostInMaintenance = isHostInMaintenance(host, vos, vosMigrating, failedVmMigrations); + hostInMaintenance = attemptMaintain(host); } } catch (final NoTransitionException e) { - s_logger.debug("Cannot transmit host " + host.getId() + "to Maintenance state", e); + s_logger.debug("Cannot transmit host " + host.getId() + " to Maintenance state", e); } return hostInMaintenance; } @@ -2327,8 +2397,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, * TODO: think twice about returning true or throwing out exception, I * really prefer to exception that always exposes bugs */ - if (host.getResourceState() != ResourceState.PrepareForMaintenance && host.getResourceState() 
!= ResourceState.Maintenance && - host.getResourceState() != ResourceState.ErrorInMaintenance) { + if (!ResourceState.isMaintenanceState(host.getResourceState())) { throw new CloudRuntimeException("Cannot perform cancelMaintenance when resource state is " + host.getResourceState() + ", hostId = " + hostId); } @@ -2349,7 +2418,6 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, try { resourceStateTransitTo(host, ResourceState.Event.AdminCancelMaintenance, _nodeId); _agentMgr.pullAgentOutMaintenance(hostId); - retryHostMaintenance.remove(hostId); } catch (final NoTransitionException e) { s_logger.debug("Cannot transmit host " + host.getId() + "to Enabled state", e); return false; @@ -2433,7 +2501,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, @Override public boolean executeUserRequest(final long hostId, final ResourceState.Event event) throws AgentUnavailableException { - if (event == ResourceState.Event.AdminAskMaintenace) { + if (event == ResourceState.Event.AdminAskMaintenance) { return doMaintain(hostId); } else if (event == ResourceState.Event.AdminCancelMaintenance) { return doCancelMaintenance(hostId); @@ -2561,7 +2629,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, return null; } - s_logger.debug("Propagating resource request event:" + event.toString() + " to agent:" + agentId); + s_logger.debug("Propagating resource request event:" + event.toString() + " to agent:" + agentId); final Command[] cmds = new Command[1]; cmds[0] = new PropagateResourceEventCommand(agentId, event); @@ -2580,7 +2648,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, } @Override - public boolean maintenanceFailed(final long hostId) { + public boolean migrateAwayFailed(final long hostId, final long vmId) { final HostVO host = _hostDao.findById(hostId); if (host == null) { if (s_logger.isDebugEnabled()) { @@ -2589,6 +2657,8 @@ public class 
ResourceManagerImpl extends ManagerBase implements ResourceManager, return false; } else { try { + s_logger.warn("Migration of VM " + _vmDao.findById(vmId) + " failed from host " + _hostDao.findById(hostId) + + ". Emitting event UnableToMigrate."); return resourceStateTransitTo(host, ResourceState.Event.UnableToMigrate, _nodeId); } catch (final NoTransitionException e) { s_logger.debug("No next resource state for host " + host.getId() + " while current state is " + host.getResourceState() + " with event " + @@ -2704,7 +2774,11 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, sc.and(sc.entity().getDataCenterId(), Op.EQ, dcId); } sc.and(sc.entity().getType(), Op.EQ, type); - sc.and(sc.entity().getResourceState(), Op.NIN, ResourceState.Maintenance, ResourceState.ErrorInMaintenance, ResourceState.PrepareForMaintenance, + sc.and(sc.entity().getResourceState(), Op.NIN, + ResourceState.Maintenance, + ResourceState.ErrorInMaintenance, + ResourceState.ErrorInPrepareForMaintenance, + ResourceState.PrepareForMaintenance, ResourceState.Error); return sc.list(); } @@ -2981,6 +3055,6 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager, @Override public ConfigKey[] getConfigKeys() { - return new ConfigKey[] {HostMaintenanceRetries}; + return new ConfigKey[0]; } } diff --git a/server/src/main/java/com/cloud/server/StatsCollector.java b/server/src/main/java/com/cloud/server/StatsCollector.java index b2ccfe274a5..ac1ae952cac 100644 --- a/server/src/main/java/com/cloud/server/StatsCollector.java +++ b/server/src/main/java/com/cloud/server/StatsCollector.java @@ -1556,7 +1556,11 @@ public class StatsCollector extends ManagerBase implements ComponentMethodInterc private SearchCriteria createSearchCriteriaForHostTypeRoutingStateUpAndNotInMaintenance() { SearchCriteria sc = _hostDao.createSearchCriteria(); sc.addAnd("status", SearchCriteria.Op.EQ, Status.Up.toString()); - sc.addAnd("resourceState", SearchCriteria.Op.NIN, 
ResourceState.Maintenance, ResourceState.PrepareForMaintenance, ResourceState.ErrorInMaintenance); + sc.addAnd("resourceState", SearchCriteria.Op.NIN, + ResourceState.Maintenance, + ResourceState.PrepareForMaintenance, + ResourceState.ErrorInPrepareForMaintenance, + ResourceState.ErrorInMaintenance); sc.addAnd("type", SearchCriteria.Op.EQ, Host.Type.Routing.toString()); return sc; } diff --git a/server/src/main/java/com/cloud/servlet/ConsoleProxyServlet.java b/server/src/main/java/com/cloud/servlet/ConsoleProxyServlet.java index 5a6c84f1479..ae9b5c548e5 100644 --- a/server/src/main/java/com/cloud/servlet/ConsoleProxyServlet.java +++ b/server/src/main/java/com/cloud/servlet/ConsoleProxyServlet.java @@ -35,21 +35,16 @@ import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import javax.servlet.http.HttpSession; -import com.cloud.resource.ResourceState; +import org.apache.cloudstack.framework.security.keys.KeysManager; import org.apache.commons.codec.binary.Base64; import org.apache.log4j.Logger; import org.springframework.stereotype.Component; import org.springframework.web.context.support.SpringBeanAutowiringSupport; -import com.cloud.vm.VmDetailConstants; -import com.google.gson.Gson; -import com.google.gson.GsonBuilder; - -import org.apache.cloudstack.framework.security.keys.KeysManager; - import com.cloud.exception.PermissionDeniedException; import com.cloud.host.HostVO; import com.cloud.hypervisor.Hypervisor; +import com.cloud.resource.ResourceState; import com.cloud.server.ManagementServer; import com.cloud.storage.GuestOSVO; import com.cloud.user.Account; @@ -64,7 +59,10 @@ import com.cloud.utils.db.TransactionLegacy; import com.cloud.vm.UserVmDetailVO; import com.cloud.vm.VirtualMachine; import com.cloud.vm.VirtualMachineManager; +import com.cloud.vm.VmDetailConstants; import com.cloud.vm.dao.UserVmDetailsDao; +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; /** * Thumbnail access : 
/console?cmd=thumbnail&vm=xxx&w=xxx&h=xxx @@ -420,14 +418,24 @@ public class ConsoleProxyServlet extends HttpServlet { StringBuffer sb = new StringBuffer(rootUrl); String host = hostVo.getPrivateIpAddress(); - Pair portInfo; - if (hostVo.getResourceState().equals(ResourceState.ErrorInMaintenance)) { + Pair portInfo = null; + if (hostVo.getHypervisorType() == Hypervisor.HypervisorType.KVM && + (hostVo.getResourceState().equals(ResourceState.ErrorInMaintenance) || + hostVo.getResourceState().equals(ResourceState.ErrorInPrepareForMaintenance))) { UserVmDetailVO detailAddress = _userVmDetailsDao.findDetail(vm.getId(), VmDetailConstants.KVM_VNC_ADDRESS); UserVmDetailVO detailPort = _userVmDetailsDao.findDetail(vm.getId(), VmDetailConstants.KVM_VNC_PORT); - portInfo = new Pair<>(detailAddress.getValue(), Integer.valueOf(detailPort.getValue())); - } else { + if (detailAddress != null && detailPort != null) { + portInfo = new Pair<>(detailAddress.getValue(), Integer.valueOf(detailPort.getValue())); + } else { + s_logger.warn("KVM Host in ErrorInMaintenance/ErrorInPrepareForMaintenance but " + + "no VNC Address/Port was available. 
Falling back to default one from MS."); + } + } + + if (portInfo == null) { portInfo = _ms.getVncPort(vm); } + if (s_logger.isDebugEnabled()) s_logger.debug("Port info " + portInfo.first()); diff --git a/server/src/test/java/com/cloud/resource/MockResourceManagerImpl.java b/server/src/test/java/com/cloud/resource/MockResourceManagerImpl.java index 82a1e923cb1..26cd820fd67 100755 --- a/server/src/test/java/com/cloud/resource/MockResourceManagerImpl.java +++ b/server/src/test/java/com/cloud/resource/MockResourceManagerImpl.java @@ -32,6 +32,7 @@ import org.apache.cloudstack.api.command.admin.host.PrepareForMaintenanceCmd; import org.apache.cloudstack.api.command.admin.host.ReconnectHostCmd; import org.apache.cloudstack.api.command.admin.host.UpdateHostCmd; import org.apache.cloudstack.api.command.admin.host.UpdateHostPasswordCmd; +import org.apache.cloudstack.framework.config.ConfigKey; import com.cloud.agent.api.StartupCommand; import com.cloud.agent.api.StartupRoutingCommand; @@ -56,7 +57,6 @@ import com.cloud.org.Cluster; import com.cloud.resource.ResourceState.Event; import com.cloud.utils.component.ManagerBase; import com.cloud.utils.fsm.NoTransitionException; -import org.apache.cloudstack.framework.config.ConfigKey; public class MockResourceManagerImpl extends ManagerBase implements ResourceManager { @@ -307,10 +307,10 @@ public class MockResourceManagerImpl extends ManagerBase implements ResourceMana } /* (non-Javadoc) - * @see com.cloud.resource.ResourceManager#maintenanceFailed(long) + * @see com.cloud.resource.ResourceManager#migrateAwayFailed(long) */ @Override - public boolean maintenanceFailed(final long hostId) { + public boolean migrateAwayFailed(final long hostId, final long vmId) { // TODO Auto-generated method stub return false; } diff --git a/server/src/test/java/com/cloud/resource/ResourceManagerImplTest.java b/server/src/test/java/com/cloud/resource/ResourceManagerImplTest.java index 7d1a0fe0163..6faa83bc910 100644 --- 
a/server/src/test/java/com/cloud/resource/ResourceManagerImplTest.java +++ b/server/src/test/java/com/cloud/resource/ResourceManagerImplTest.java @@ -17,6 +17,39 @@ package com.cloud.resource; +import static com.cloud.resource.ResourceState.Event.ErrorsCorrected; +import static com.cloud.resource.ResourceState.Event.InternalEnterMaintenance; +import static com.cloud.resource.ResourceState.Event.UnableToMaintain; +import static com.cloud.resource.ResourceState.Event.UnableToMigrate; +import static org.mockito.Matchers.any; +import static org.mockito.Matchers.anyBoolean; +import static org.mockito.Matchers.anyLong; +import static org.mockito.Matchers.anyObject; +import static org.mockito.Matchers.anyString; +import static org.mockito.Matchers.eq; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.cloudstack.framework.config.dao.ConfigurationDao; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.BDDMockito; +import org.mockito.InjectMocks; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; +import org.mockito.Spy; +import org.powermock.api.mockito.PowerMockito; +import org.powermock.core.classloader.annotations.PrepareForTest; +import org.powermock.modules.junit4.PowerMockRunner; + import com.cloud.agent.AgentManager; import com.cloud.agent.api.GetVncPortAnswer; import com.cloud.agent.api.GetVncPortCommand; @@ -35,38 +68,10 @@ import com.cloud.utils.fsm.NoTransitionException; import com.cloud.utils.ssh.SSHCmdHelper; import com.cloud.utils.ssh.SshException; import com.cloud.vm.VMInstanceVO; +import com.cloud.vm.VirtualMachine; import com.cloud.vm.dao.UserVmDetailsDao; import com.cloud.vm.dao.VMInstanceDao; import com.trilead.ssh2.Connection; -import 
org.apache.cloudstack.framework.config.dao.ConfigurationDao; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.mockito.BDDMockito; -import org.mockito.InjectMocks; -import org.mockito.Mock; -import org.mockito.MockitoAnnotations; -import org.mockito.Spy; -import org.powermock.api.mockito.PowerMockito; -import org.powermock.core.classloader.annotations.PrepareForTest; -import org.powermock.modules.junit4.PowerMockRunner; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -import static com.cloud.resource.ResourceState.Event.InternalEnterMaintenance; -import static com.cloud.resource.ResourceState.Event.UnableToMigrate; -import static org.mockito.Matchers.any; -import static org.mockito.Matchers.anyBoolean; -import static org.mockito.Matchers.anyLong; -import static org.mockito.Matchers.anyString; -import static org.mockito.Matchers.eq; -import static org.mockito.Mockito.never; -import static org.mockito.Mockito.times; -import static org.mockito.Mockito.verify; -import static org.mockito.Mockito.when; @RunWith(PowerMockRunner.class) @PrepareForTest({ActionEventUtils.class, ResourceManagerImpl.class, SSHCmdHelper.class}) @@ -170,38 +175,98 @@ public class ResourceManagerImplTest { } @Test - public void testCheckAndMaintainEnterMaintenanceMode() throws NoTransitionException { + public void testCheckAndMaintainEnterMaintenanceModeNoVms() throws NoTransitionException { + // Test entering into maintenance with no VMs running on host. 
boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId); - verify(resourceManager).isHostInMaintenance(host, new ArrayList<>(), new ArrayList<>(), new ArrayList<>()); + verify(resourceManager).attemptMaintain(host); verify(resourceManager).setHostIntoMaintenance(host); + verify(resourceManager, never()).setHostIntoErrorInPrepareForMaintenance(anyObject(), anyObject()); + verify(resourceManager, never()).setHostIntoErrorInMaintenance(anyObject(), anyObject()); + verify(resourceManager, never()).setHostIntoPrepareForMaintenanceAfterErrorsFixed(anyObject()); verify(resourceManager).resourceStateTransitTo(eq(host), eq(InternalEnterMaintenance), anyLong()); + Assert.assertTrue(enterMaintenanceMode); } + @Test + public void testCheckAndMaintainProceedsWithPrepareForMaintenanceRunningVms() throws NoTransitionException { + // Test proceeding through with no events if pending migrating works / retries left. + setupRunningVMs(); + setupPendingMigrationRetries(); + verifyNoChangeInMaintenance(); + } + @Test public void testCheckAndMaintainErrorInMaintenanceRunningVms() throws NoTransitionException { - when(vmInstanceDao.listByHostId(hostId)).thenReturn(Arrays.asList(vm1, vm2)); - boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId); - verify(resourceManager).isHostInMaintenance(host, Arrays.asList(vm1, vm2), new ArrayList<>(), new ArrayList<>()); - Assert.assertFalse(enterMaintenanceMode); + // Test entering into ErrorInMaintenance when no pending migrations etc, and due to - Running VMs + setupRunningVMs(); + setupNoPendingMigrationRetries(); + verifyErrorInMaintenanceCalls(); } @Test - public void testCheckAndMaintainErrorInMaintenanceMigratingVms() throws NoTransitionException { - when(vmInstanceDao.listVmsMigratingFromHost(hostId)).thenReturn(Arrays.asList(vm1, vm2)); - boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId); - verify(resourceManager).isHostInMaintenance(host, new ArrayList<>(), Arrays.asList(vm1, vm2), 
new ArrayList<>()); - Assert.assertFalse(enterMaintenanceMode); + public void testCheckAndMaintainErrorInMaintenanceWithErrorVms() throws NoTransitionException { + // Test entering into ErrorInMaintenance when no pending migrations etc, and due to - no migrating but error VMs + setupErrorVms(); + setupNoPendingMigrationRetries(); + verifyErrorInMaintenanceCalls(); } @Test - public void testCheckAndMaintainErrorInMaintenanceFailedMigrations() throws NoTransitionException { - when(vmInstanceDao.listNonMigratingVmsByHostEqualsLastHost(hostId)).thenReturn(Arrays.asList(vm1, vm2)); - boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId); - verify(resourceManager).isHostInMaintenance(host, new ArrayList<>(), new ArrayList<>(), Arrays.asList(vm1, vm2)); - verify(resourceManager).setHostIntoErrorInMaintenance(host, Arrays.asList(vm1, vm2)); - verify(resourceManager).resourceStateTransitTo(eq(host), eq(UnableToMigrate), anyLong()); - Assert.assertFalse(enterMaintenanceMode); + public void testCheckAndMaintainErrorInPrepareForMaintenanceFailedMigrationsPendingRetries() throws NoTransitionException { + // Test entering into ErrorInPrepareForMaintenance when pending migrations retries and due to - Failed Migrations + setupFailedMigrations(); + setupPendingMigrationRetries(); + when(vmInstanceDao.findByHostInStates(hostId, VirtualMachine.State.Running)).thenReturn(Arrays.asList(vm2)); + verifyErrorInPrepareForMaintenanceCalls(); + } + + @Test + public void testCheckAndMaintainErrorInPrepareForMaintenanceWithErrorVmsPendingRetries() throws NoTransitionException { + // Test entering into ErrorInMaintenance when pending migrations retries due to - no migrating but error VMs + setupErrorVms(); + setupPendingMigrationRetries(); + when(vmInstanceDao.listVmsMigratingFromHost(hostId)).thenReturn(Arrays.asList(vm2)); + verifyErrorInPrepareForMaintenanceCalls(); + } + + @Test + public void testCheckAndMaintainErrorInPrepareForMaintenanceFailedMigrationsAndMigratingVms() 
throws NoTransitionException { + // Test entering into ErrorInPrepareForMaintenance when no pending migrations retries + // but executing migration and due to - Failed Migrations + setupFailedMigrations(); + setupNoPendingMigrationRetries(); + when(vmInstanceDao.listVmsMigratingFromHost(hostId)).thenReturn(Arrays.asList(vm2)); + verifyErrorInPrepareForMaintenanceCalls(); + } + + @Test + public void testCheckAndMaintainErrorInPrepareForMaintenanceWithErrorVmsAndMigratingVms() throws NoTransitionException { + // Test entering into ErrorInPrepareForMaintenance when no pending migrations retries + // but executing migration and due to - Error Vms + setupErrorVms(); + setupNoPendingMigrationRetries(); + when(vmInstanceDao.listVmsMigratingFromHost(hostId)).thenReturn(Arrays.asList(vm2)); + verifyErrorInPrepareForMaintenanceCalls(); + } + + @Test + public void testCheckAndMaintainErrorInPrepareForMaintenanceFailedMigrationsAndStoppingVms() throws NoTransitionException { + // Test entering into ErrorInPrepareForMaintenance when no pending migrations retries + // but stopping VMs and due to - Failed Migrations + setupFailedMigrations(); + setupNoPendingMigrationRetries(); + when(vmInstanceDao.findByHostInStates(hostId, VirtualMachine.State.Stopping)).thenReturn(Arrays.asList(vm2)); + verifyErrorInPrepareForMaintenanceCalls(); + } + + @Test + public void testCheckAndMaintainReturnsToPrepareForMaintenanceRunningVms() throws NoTransitionException { + // Test switching back to PrepareForMaintenance + when(host.getResourceState()).thenReturn(ResourceState.ErrorInPrepareForMaintenance); + setupRunningVMs(); + setupPendingMigrationRetries(); + verifyReturnToPrepareForMaintenanceCalls(); } @Test @@ -219,23 +284,6 @@ public class ResourceManagerImplTest { verify(agentManager).pullAgentToMaintenance(hostId); } - @Test - public void testCheckAndMaintainErrorInMaintenanceRetries() throws NoTransitionException { - resourceManager.setHostMaintenanceRetries(host); - - List 
failedMigrations = Arrays.asList(vm1, vm2); - when(vmInstanceDao.listByHostId(host.getId())).thenReturn(failedMigrations); - when(vmInstanceDao.listNonMigratingVmsByHostEqualsLastHost(host.getId())).thenReturn(failedMigrations); - - Integer retries = ResourceManager.HostMaintenanceRetries.valueIn(host.getClusterId()); - for (int i = 0; i <= retries; i++) { - resourceManager.checkAndMaintain(host.getId()); - } - - verify(resourceManager, times(retries + 1)).isHostInMaintenance(host, failedMigrations, new ArrayList<>(), failedMigrations); - verify(resourceManager).setHostIntoErrorInMaintenance(host, failedMigrations); - } - @Test(expected = CloudRuntimeException.class) public void testGetHostCredentialsMissingParameter() { when(host.getDetail("password")).thenReturn(null); @@ -307,4 +355,76 @@ public class ResourceManagerImplTest { verify(resourceManager, never()).getHostCredentials(eq(host)); verify(resourceManager, never()).connectAndRestartAgentOnHost(eq(host), eq(hostUsername), eq(hostPassword)); } + + private void setupNoPendingMigrationRetries() { + when(haManager.hasPendingMigrationsWork(vm1.getId())).thenReturn(false); + when(haManager.hasPendingMigrationsWork(vm2.getId())).thenReturn(false); + } + + private void setupRunningVMs() { + when(vmInstanceDao.listByHostId(hostId)).thenReturn(Arrays.asList(vm1, vm2)); + when(vmInstanceDao.findByHostInStates(hostId, VirtualMachine.State.Migrating, VirtualMachine.State.Running, VirtualMachine.State.Starting, VirtualMachine.State.Stopping, VirtualMachine.State.Error, VirtualMachine.State.Unknown)).thenReturn(Arrays.asList(vm1, vm2)); + when(vmInstanceDao.findByHostInStates(hostId, VirtualMachine.State.Running)).thenReturn(Arrays.asList(vm1, vm2)); + } + + private void setupPendingMigrationRetries() { + when(haManager.hasPendingMigrationsWork(vm1.getId())).thenReturn(true); + when(haManager.hasPendingMigrationsWork(vm2.getId())).thenReturn(false); + } + + private void setupFailedMigrations() { + 
when(vmInstanceDao.listByHostId(hostId)).thenReturn(Arrays.asList(vm1, vm2)); + when(vmInstanceDao.findByHostInStates(hostId, VirtualMachine.State.Migrating, VirtualMachine.State.Running, VirtualMachine.State.Starting, VirtualMachine.State.Stopping, VirtualMachine.State.Error, VirtualMachine.State.Unknown)).thenReturn(Arrays.asList(vm1, vm2)); + when(vmInstanceDao.listNonMigratingVmsByHostEqualsLastHost(hostId)).thenReturn(Arrays.asList(vm1)); + } + + private void setupErrorVms() { + when(vmInstanceDao.listByHostId(hostId)).thenReturn(Arrays.asList(vm1, vm2)); + when(vmInstanceDao.findByHostInStates(hostId, VirtualMachine.State.Migrating, VirtualMachine.State.Running, VirtualMachine.State.Starting, VirtualMachine.State.Stopping, VirtualMachine.State.Error, VirtualMachine.State.Unknown)).thenReturn(Arrays.asList(vm1, vm2)); + when(vmInstanceDao.findByHostInStates(hostId, VirtualMachine.State.Unknown, VirtualMachine.State.Error)).thenReturn(Arrays.asList(vm1)); + } + + private void verifyErrorInMaintenanceCalls() throws NoTransitionException { + boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId); + verify(resourceManager).attemptMaintain(host); + verify(resourceManager).setHostIntoErrorInMaintenance(eq(host), anyObject()); + verify(resourceManager, never()).setHostIntoMaintenance(anyObject()); + verify(resourceManager, never()).setHostIntoErrorInPrepareForMaintenance(anyObject(), anyObject()); + verify(resourceManager, never()).setHostIntoPrepareForMaintenanceAfterErrorsFixed(anyObject()); + verify(resourceManager).resourceStateTransitTo(eq(host), eq(UnableToMaintain), anyLong()); + Assert.assertFalse(enterMaintenanceMode); + } + + private void verifyErrorInPrepareForMaintenanceCalls() throws NoTransitionException { + boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId); + verify(resourceManager).attemptMaintain(host); + verify(resourceManager).setHostIntoErrorInPrepareForMaintenance(eq(host), anyObject()); + 
verify(resourceManager, never()).setHostIntoMaintenance(anyObject()); + verify(resourceManager, never()).setHostIntoErrorInMaintenance(anyObject(), anyObject()); + verify(resourceManager, never()).setHostIntoPrepareForMaintenanceAfterErrorsFixed(anyObject()); + verify(resourceManager).resourceStateTransitTo(eq(host), eq(UnableToMigrate), anyLong()); + Assert.assertFalse(enterMaintenanceMode); + } + + private void verifyReturnToPrepareForMaintenanceCalls() throws NoTransitionException { + boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId); + verify(resourceManager).attemptMaintain(host); + verify(resourceManager).setHostIntoPrepareForMaintenanceAfterErrorsFixed(eq(host)); + verify(resourceManager).resourceStateTransitTo(eq(host), eq(ErrorsCorrected), anyLong()); + verify(resourceManager, never()).setHostIntoMaintenance(anyObject()); + verify(resourceManager, never()).setHostIntoErrorInPrepareForMaintenance(anyObject(), anyObject()); + verify(resourceManager, never()).setHostIntoErrorInMaintenance(anyObject(), anyObject()); + Assert.assertFalse(enterMaintenanceMode); + } + + private void verifyNoChangeInMaintenance() throws NoTransitionException { + boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId); + verify(resourceManager).attemptMaintain(host); + verify(resourceManager, never()).setHostIntoMaintenance(anyObject()); + verify(resourceManager, never()).setHostIntoErrorInPrepareForMaintenance(anyObject(), anyObject()); + verify(resourceManager, never()).setHostIntoErrorInMaintenance(anyObject(), anyObject()); + verify(resourceManager, never()).setHostIntoPrepareForMaintenanceAfterErrorsFixed(anyObject()); + verify(resourceManager, never()).resourceStateTransitTo(anyObject(), any(), anyLong()); + Assert.assertFalse(enterMaintenanceMode); + } } diff --git a/test/integration/smoke/test_host_maintenance.py b/test/integration/smoke/test_host_maintenance.py index c7cd9d3998f..33536214d80 100644 --- 
a/test/integration/smoke/test_host_maintenance.py +++ b/test/integration/smoke/test_host_maintenance.py @@ -21,16 +21,75 @@ from marvin.cloudstackTestCase import * from marvin.lib.utils import * from marvin.lib.base import * -from marvin.lib.common import (get_zone, get_pod, get_template) +from marvin.lib.common import (get_zone, get_pod, get_template, list_ssvms) from nose.plugins.attrib import attr from marvin.lib.decoratorGenerators import skipTestIf from distutils.util import strtobool from marvin.sshClient import SshClient _multiprocess_shared_ = False +MIN_VMS_FOR_TEST = 3 + +class TestHostMaintenanceBase(cloudstackTestCase): + def get_ssh_client(self, ip, username, password, retries=10): + """ Setup ssh client connection and return connection """ + try: + ssh_client = SshClient(ip, 22, username, password, retries) + except Exception as e: + raise unittest.SkipTest("Unable to create ssh connection: " % e) + + self.assertIsNotNone( + ssh_client, "Failed to setup ssh connection to ip=%s" % ip) + + return ssh_client + + def wait_until_host_is_in_state(self, hostid, resourcestate, interval=3, retries=20): + def check_resource_state(): + response = Host.list( + self.apiclient, + id=hostid + ) + if isinstance(response, list): + if response[0].resourcestate == resourcestate: + self.logger.debug('Host with id %s is in resource state = %s' % (hostid, resourcestate)) + return True, None + else: + self.logger.debug("Waiting for host " + hostid + + " to reach state " + resourcestate + + ", with current state " + response[0].resourcestate) + return False, None + + done, _ = wait_until(interval, retries, check_resource_state) + if not done: + raise Exception("Failed to wait for host %s to be on resource state %s" % (hostid, resourcestate)) + return True + + def prepare_host_for_maintenance(self, hostid): + self.logger.debug("Sending Host with id %s to prepareHostForMaintenance" % hostid) + cmd = prepareHostForMaintenance.prepareHostForMaintenanceCmd() + cmd.id = hostid + 
response = self.apiclient.prepareHostForMaintenance(cmd) + self.logger.debug("Host with id %s is in prepareHostForMaintenance" % hostid) + self.logger.debug(response) + return response + + def cancel_host_maintenance(self, hostid): + self.logger.debug("Canceling Host with id %s from maintain" % hostid) + cmd = cancelHostMaintenance.cancelHostMaintenanceCmd() + cmd.id = hostid + res = self.apiclient.cancelHostMaintenance(cmd) + self.logger.debug("Host with id %s is cancelling maintenance" % hostid) + return res + + def revert_host_state_on_failure(self, hostId): + cmd = updateHost.updateHostCmd() + cmd.id = hostId + cmd.allocationstate = "Enable" + response = self.apiclient.updateHost(cmd) + self.assertEqual(response.resourcestate, "Enabled") -class TestHostMaintenance(cloudstackTestCase): +class TestHostMaintenance(TestHostMaintenanceBase): def setUp(self): self.logger = logging.getLogger('TestHM') @@ -44,6 +103,8 @@ class TestHostMaintenance(cloudstackTestCase): self.zone = get_zone(self.apiclient, self.testClient.getZoneForTests()) self.pod = get_pod(self.apiclient, self.zone.id) self.cleanup = [] + self.hostConfig = self.config.__dict__["zones"][0].__dict__["pods"][0].__dict__["clusters"][0].__dict__["hosts"][0].__dict__ + def tearDown(self): try: @@ -55,7 +116,7 @@ class TestHostMaintenance(cloudstackTestCase): return - def createVMs(self, hostId, number): + def createVMs(self, hostId, number, offering_key="tiny"): self.template = get_template( self.apiclient, @@ -70,7 +131,7 @@ class TestHostMaintenance(cloudstackTestCase): self.service_offering = ServiceOffering.create( self.apiclient, - self.services["service_offerings"]["tiny"] + self.services["service_offerings"][offering_key] ) self.logger.debug("Using service offering %s " % self.service_offering.id) self.network_offering = NetworkOffering.create( @@ -106,7 +167,32 @@ class TestHostMaintenance(cloudstackTestCase): self.cleanup.append(self.network_offering) self.cleanup.append(self.service_offering) 
return vms - + + def checkAllVmsRunningOnHost(self, hostId): + listVms1 = VirtualMachine.list( + self.apiclient, + hostid=hostId + ) + + if (listVms1 is not None): + self.logger.debug('Vms found to test all running = {} '.format(len(listVms1))) + for vm in listVms1: + if (vm.state != "Running"): + self.logger.debug('VirtualMachine on Host with id = {} is in {}'.format(vm.id, vm.state)) + return (False, None) + + response = list_ssvms( + self.apiclient, + hostid=hostId + ) + if isinstance(response, list): + for systemvm in response: + if systemvm.state != 'Running': + self.logger.debug("Found not running VM {}".format(systemvm.name)) + return (False, None) + + return (True, None) + def checkVmMigratingOnHost(self, hostId): vm_migrating=False listVms1 = VirtualMachine.list( @@ -118,60 +204,60 @@ class TestHostMaintenance(cloudstackTestCase): self.logger.debug('Vms found = {} '.format(len(listVms1))) for vm in listVms1: if (vm.state == "Migrating"): - self.logger.debug('VirtualMachine on Hyp id = {} is in {}'.format(vm.id, vm.state)) + self.logger.debug('VirtualMachine on Host with id = {} is in {}'.format(vm.id, vm.state)) vm_migrating=True break return (vm_migrating, None) - def checkNoVmMigratingOnHost(self, hostId): - no_vm_migrating=True + def migrationsFinished(self, hostId): + migrations_finished=True listVms1 = VirtualMachine.list( self.apiclient, hostid=hostId ) if (listVms1 is not None): - self.logger.debug('Vms found = {} '.format(len(listVms1))) - for vm in listVms1: - if (vm.state == "Migrating"): - self.logger.debug('VirtualMachine on Hyp id = {} is in {}'.format(vm.id, vm.state)) - no_vm_migrating=False - break + numVms = len(listVms1) + migrations_finished = (numVms == 0) + + return (migrations_finished, None) - return (no_vm_migrating, None) - def noOfVMsOnHost(self, hostId): listVms = VirtualMachine.list( self.apiclient, hostid=hostId ) no_of_vms=0 + self.logger.debug("Counting VMs on host " + hostId) if (listVms is not None): for vm in listVms: - 
self.logger.debug('VirtualMachine on Hyp 1 = {}'.format(vm.id)) + self.logger.debug("VirtualMachine on Host " + hostId + " = " + vm.id) no_of_vms=no_of_vms+1 - + self.logger.debug("Found VMs on host " + str(no_of_vms)) return no_of_vms - - def hostPrepareAndCancelMaintenance(self, target_host_id, other_host_id, checkVMMigration): - - cmd = prepareHostForMaintenance.prepareHostForMaintenanceCmd() - cmd.id = target_host_id - response = self.apiclient.prepareHostForMaintenance(cmd) - - self.logger.debug('Host with id {} is in prepareHostForMaintenance'.format(target_host_id)) - - vm_migrating = wait_until(1, 10, checkVMMigration, other_host_id) - - cmd = cancelHostMaintenance.cancelHostMaintenanceCmd() - cmd.id = target_host_id - response = self.apiclient.cancelHostMaintenance(cmd) - - self.logger.debug('Host with id {} is in cancelHostMaintenance'.format(target_host_id) ) - - return vm_migrating - + + def hostPrepareAndCancelMaintenance(self, target_host_id, other_host_id): + # Wait for all VMs to complete any pending migrations. 
+ if not wait_until(3, 100, self.checkAllVmsRunningOnHost, target_host_id) or \ + not wait_until(3, 100, self.checkAllVmsRunningOnHost, other_host_id): + raise Exception("Failed to wait for all VMs to reach running state to execute test") + + self.prepare_host_for_maintenance(target_host_id) + migrations_finished = wait_until(5, 200, self.migrationsFinished, target_host_id) + + self.wait_until_host_is_in_state(target_host_id, "Maintenance", 5, 200) + + vm_count_after_maintenance = self.noOfVMsOnHost(target_host_id) + + self.cancel_host_maintenance(target_host_id) + self.wait_until_host_is_in_state(target_host_id, "Enabled", 5, 200) + + if vm_count_after_maintenance != 0: + self.fail("Host to put to maintenance still has VMs running") + + return migrations_finished + @attr( tags=[ "advanced", @@ -182,42 +268,45 @@ class TestHostMaintenance(cloudstackTestCase): "sg"], required_hardware="true") def test_01_cancel_host_maintenace_with_no_migration_jobs(self): + """ + Tests if putting a host with no migrations (0 VMs) work back and forth + + 1) Verify if there are at least 2 hosts in enabled state. 
+ 2) Put the host into maintenance verify success + 3) Put the other host into maintenance, verify success + """ listHost = Host.list( self.apiclient, type='Routing', zoneid=self.zone.id, podid=self.pod.id, + hypervisor=self.hypervisor, + resourcestate='Enabled', + state='Up' ) for host in listHost: - self.logger.debug('1 Hypervisor = {}'.format(host.id)) - - - if (len(listHost) < 2): - raise unittest.SkipTest("Cancel host maintenance when VMs are migrating should be tested for 2 or more hosts"); - return + self.logger.debug('Found Host = {}'.format(host.id)) + + + if (len(listHost) < 2): + raise unittest.SkipTest("Canceling tests for host maintenance as we need 2 or more hosts up and enabled") - vm_migrating=False - try: - vm_migrating = self.hostPrepareAndCancelMaintenance(listHost[0].id, listHost[1].id, self.checkNoVmMigratingOnHost) - - vm_migrating = self.hostPrepareAndCancelMaintenance(listHost[1].id, listHost[0].id, self.checkNoVmMigratingOnHost) - + migrations_finished = self.hostPrepareAndCancelMaintenance(listHost[0].id, listHost[1].id) + + if migrations_finished: + self.hostPrepareAndCancelMaintenance(listHost[1].id, listHost[0].id) + else: + raise unittest.SkipTest("VMs are still migrating so reverse migration /maintenace skipped") + except Exception as e: + self.revert_host_state_on_failure(listHost[0].id) + self.revert_host_state_on_failure(listHost[1].id) self.logger.debug("Exception {}".format(e)) - self.fail("Cancel host maintenance failed {}".format(e[0])) - - - if (vm_migrating == True): - raise unittest.SkipTest("VMs are migrating and the test will not be able to check the conditions the test is intended for"); - - - return + self.fail("Host maintenance test failed {}".format(e[0])) - - @attr( tags=[ "advanced", @@ -228,53 +317,125 @@ class TestHostMaintenance(cloudstackTestCase): "sg"], required_hardware="true") def test_02_cancel_host_maintenace_with_migration_jobs(self): - + """ + Tests if putting a host with migrations (3 VMs) work back and 
forth + + 1) Verify if there are at least 2 hosts in enabled state. + 2) Deploy VMs if needed + 3) Put the host into maintenance verify success -ensure existing host has zero running VMs + 4) Put the other host into maintenance, verify success just as step 3 + """ listHost = Host.list( self.apiclient, type='Routing', zoneid=self.zone.id, podid=self.pod.id, + hypervisor=self.hypervisor, + resourcestate='Enabled', + state='Up' ) for host in listHost: - self.logger.debug('2 Hypervisor = {}'.format(host.id)) - - if (len(listHost) != 2): - raise unittest.SkipTest("Cancel host maintenance when VMs are migrating can only be tested with 2 hosts"); - return + self.logger.debug('Found Host = {}'.format(host.id)) + + if (len(listHost) < 2): + raise unittest.SkipTest("Canceling tests for host maintenance as we need 2 or more hosts up and enabled") - no_of_vms = self.noOfVMsOnHost(listHost[0].id) - + no_of_vms = no_of_vms + self.noOfVMsOnHost(listHost[1].id) - - if no_of_vms < 5: + + if no_of_vms < MIN_VMS_FOR_TEST: self.logger.debug("Create VMs as there are not enough vms to check host maintenance") - no_vm_req = 5 - no_of_vms + no_vm_req = MIN_VMS_FOR_TEST - no_of_vms if (no_vm_req > 0): self.logger.debug("Creating vms = {}".format(no_vm_req)) self.vmlist = self.createVMs(listHost[0].id, no_vm_req) - - vm_migrating=False - + try: - - vm_migrating = self.hostPrepareAndCancelMaintenance(listHost[0].id, listHost[1].id, self.checkVmMigratingOnHost) - - vm_migrating = self.hostPrepareAndCancelMaintenance(listHost[1].id, listHost[0].id, self.checkVmMigratingOnHost) - + migrations_finished = self.hostPrepareAndCancelMaintenance(listHost[0].id, listHost[1].id) + + if migrations_finished: + self.hostPrepareAndCancelMaintenance(listHost[1].id, listHost[0].id) + else: + raise unittest.SkipTest("VMs are still migrating so reverse migration /maintenace skipped") + except Exception as e: + self.revert_host_state_on_failure(listHost[0].id) + self.revert_host_state_on_failure(listHost[1].id) 
self.logger.debug("Exception {}".format(e)) - self.fail("Cancel host maintenance failed {}".format(e[0])) - + self.fail("Host maintenance test failed {}".format(e[0])) - if (vm_migrating == False): - raise unittest.SkipTest("No VM is migrating and the test will not be able to check the conditions the test is intended for"); - - - return + @attr( + tags=[ + "advanced", + "advancedns", + "smoke", + "basic", + "eip", + "sg"], + required_hardware="true") + def test_03_cancel_host_maintenace_with_migration_jobs_failure(self): + """ + Tests if putting a host with impossible migrations (2 VMs) work pushes to ErrorInMaintenance state + + 1) Verify if there are at least 2 hosts in enabled state. + 2) Tag the host and deploy tagged VMs which cannot be migrated to other host without tags + 3) Put the host into maintenance verify it fails with it reaching ErrorInMaintenance + """ + listHost = Host.list( + self.apiclient, + type='Routing', + zoneid=self.zone.id, + podid=self.pod.id, + hypervisor=self.hypervisor, + resourcestate='Enabled', + state='Up' + ) + + for host in listHost: + self.logger.debug('Found Host = {}'.format(host.id)) + + if (len(listHost) < 2): + raise unittest.SkipTest("Canceling tests for host maintenance as we need 2 or more hosts up and enabled") + + target_host_id = listHost[0].id + + try: + Host.update(self.apiclient, + id=target_host_id, + hosttags=self.services["service_offerings"]["taggedsmall"]["hosttags"]) + + no_of_vms = self.noOfVMsOnHost(target_host_id) + + # Need only 2 VMs for this case. 
+ if no_of_vms < 2: + self.logger.debug("Create VMs as there are not enough vms to check host maintenance") + no_vm_req = 2 - no_of_vms + if (no_vm_req > 0): + self.logger.debug("Creating vms = {}".format(no_vm_req)) + self.vmlist = self.createVMs(listHost[0].id, no_vm_req, "taggedsmall") + + # Attempt putting host in maintenance and check if ErrorInMaintenance state is reached + self.prepare_host_for_maintenance(target_host_id) + error_in_maintenance_reached = self.wait_until_host_is_in_state(target_host_id, "ErrorInMaintenance", 5, 300) + + self.cancel_host_maintenance(target_host_id) + self.wait_until_host_is_in_state(target_host_id, "Enabled", 5, 200) + + Host.update(self.apiclient, id=target_host_id, hosttags="") + + if not error_in_maintenance_reached: + self.fail("Error in maintenance state should have reached after ports block") + + except Exception as e: + self.revert_host_state_on_failure(listHost[0].id) + self.revert_host_state_on_failure(listHost[1].id) + Host.update(self.apiclient, id=target_host_id, hosttags="") + self.logger.debug("Exception {}".format(e)) + self.fail("Host maintenance test failed {}".format(e[0])) -class TestHostMaintenanceAgents(cloudstackTestCase): +class TestHostMaintenanceAgents(TestHostMaintenanceBase): @classmethod def setUpClass(cls): @@ -371,29 +532,6 @@ class TestHostMaintenanceAgents(cloudstackTestCase): value = "true" if on else "false" cls.updateConfiguration('kvm.ssh.to.agent', value) - def prepare_host_for_maintenance(self, hostid): - cmd = prepareHostForMaintenance.prepareHostForMaintenanceCmd() - cmd.id = hostid - self.apiclient.prepareHostForMaintenance(cmd) - self.logger.debug('Host with id %s is in prepareHostForMaintenance' % hostid) - - def wait_until_host_is_in_state(self, hostid, resourcestate, interval=3, retries=20): - def check_resource_state(): - response = Host.list( - self.apiclient, - id=hostid - ) - if isinstance(response, list): - if response[0].resourcestate == resourcestate: - 
self.logger.debug('Host with id %s is in resource state = %s' % (hostid, resourcestate)) - return True, None - return False, None - - done, _ = wait_until(interval, retries, check_resource_state) - if not done: - raise Exception("Failed to wait for host %s to be on resource state %s" % (hostid, resourcestate)) - return True - def wait_until_agent_is_in_state(self, hostid, state, interval=3, retries=20): def check_agent_state(): response = Host.list( @@ -411,12 +549,6 @@ class TestHostMaintenanceAgents(cloudstackTestCase): raise Exception("Failed to wait for host agent %s to be on state %s" % (hostid, state)) return True - def cancel_host_maintenance(self, hostid): - cmd = cancelHostMaintenance.cancelHostMaintenanceCmd() - cmd.id = hostid - self.apiclient.cancelHostMaintenance(cmd) - self.logger.debug('Host with id %s is cancelling maintenance' % hostid) - def get_enabled_host_connected_agent(self): hosts = Host.list( self.apiclient, @@ -428,7 +560,7 @@ class TestHostMaintenanceAgents(cloudstackTestCase): state='Up' ) if len(hosts) < 2: - raise unittest.SkipTest("Cancel host maintenance must be tested for 2 or more hosts") + raise unittest.SkipTest("Host maintenance tests must be tested for 2 or more hosts") return hosts[0] def deploy_vm_on_host(self, hostid): @@ -451,13 +583,6 @@ class TestHostMaintenanceAgents(cloudstackTestCase): ) self.cleanup.append(vm) - def revert_host_state_on_failure(self, host): - cmd = updateHost.updateHostCmd() - cmd.id = host.id - cmd.allocationstate = "Enable" - response = self.apiclient.updateHost(cmd) - self.assertEqual(response.resourcestate, "Enabled") - @skipTestIf("hypervisorNotSupported") @attr(tags=["advanced", "advancedns", "smoke", "basic", "eip", "sg"], required_hardware="true") def test_01_cancel_host_maintenance_ssh_enabled_agent_connected(self): @@ -480,22 +605,9 @@ class TestHostMaintenanceAgents(cloudstackTestCase): self.wait_until_host_is_in_state(self.host.id, "Enabled") 
self.assert_host_is_functional_after_cancelling_maintenance(self.host.id) except Exception as e: - self.revert_host_state_on_failure(self.host) + self.revert_host_state_on_failure(self.host.id) self.fail(e) - def get_ssh_client(self, ip, username, password, retries=10): - """ Setup ssh client connection and return connection """ - - try: - ssh_client = SshClient(ip, 22, username, password, retries) - except Exception as e: - raise unittest.SkipTest("Unable to create ssh connection: " % e) - - self.assertIsNotNone( - ssh_client, "Failed to setup ssh connection to ip=%s" % ip) - - return ssh_client - @skipTestIf("hypervisorNotSupported") @attr(tags=["boris", "advancedns", "smoke", "basic", "eip", "sg"], required_hardware="true") def test_02_cancel_host_maintenance_ssh_enabled_agent_disconnected(self): @@ -529,7 +641,7 @@ class TestHostMaintenanceAgents(cloudstackTestCase): self.assert_host_is_functional_after_cancelling_maintenance(self.host.id) except Exception as e: - self.revert_host_state_on_failure(self.host) + self.revert_host_state_on_failure(self.host.id) self.fail(e) @skipTestIf("hypervisorNotSupported") @@ -554,7 +666,7 @@ class TestHostMaintenanceAgents(cloudstackTestCase): self.wait_until_host_is_in_state(self.host.id, "Enabled") self.assert_host_is_functional_after_cancelling_maintenance(self.host.id) except Exception as e: - self.revert_host_state_on_failure(self.host) + self.revert_host_state_on_failure(self.host.id) self.fail(e) @skipTestIf("hypervisorNotSupported") @@ -585,7 +697,7 @@ class TestHostMaintenanceAgents(cloudstackTestCase): ssh_client.execute("service cloudstack-agent stop") self.wait_until_agent_is_in_state(self.host.id, "Disconnected") except Exception as e: - self.revert_host_state_on_failure(self.host) + self.revert_host_state_on_failure(self.host.id) self.fail(e) self.assertRaises(Exception, self.cancel_host_maintenance, self.host.id) @@ -600,5 +712,5 @@ class TestHostMaintenanceAgents(cloudstackTestCase): 
self.wait_until_host_is_in_state(self.host.id, "Enabled") self.assert_host_is_functional_after_cancelling_maintenance(self.host.id) except Exception as e: - self.revert_host_state_on_failure(self.host) + self.revert_host_state_on_failure(self.host.id) self.fail(e) diff --git a/ui/scripts/metrics.js b/ui/scripts/metrics.js index 2784eab2d8d..da95b98e03b 100644 --- a/ui/scripts/metrics.js +++ b/ui/scripts/metrics.js @@ -679,6 +679,7 @@ 'Down': 'off', 'Removed': 'off', 'ErrorInMaintenance': 'off', + 'ErrorInPrepareForMaintenance': 'warning', 'PrepareForMaintenance': 'warning', 'CancelMaintenance': 'warning', 'Maintenance': 'warning', diff --git a/ui/scripts/system.js b/ui/scripts/system.js index 5ef561ff5c0..1a73e64f3df 100755 --- a/ui/scripts/system.js +++ b/ui/scripts/system.js @@ -17162,7 +17162,8 @@ title: 'label.outofbandmanagement.action.issue', desc: function(args) { var host = args.context.hosts[0]; - if (host.resourcestate == 'Maintenance' || host.resourcestate == 'PrepareForMaintenance' || host.resourcestate == 'ErrorInMaintenance') { + if (host.resourcestate == 'Maintenance' || host.resourcestate == 'PrepareForMaintenance' || + host.resourcestate == 'ErrorInPrepareForMaintenance' || host.resourcestate == 'ErrorInMaintenance') { return _l('message.outofbandmanagement.action.maintenance'); } }, @@ -17776,6 +17777,7 @@ 'Down': 'off', 'Removed': 'off', 'ErrorInMaintenance': 'off', + 'ErrorInPrepareForMaintenance': 'warning', 'PrepareForMaintenance': 'warning', 'CancelMaintenance': 'warning', 'Maintenance': 'warning', @@ -21975,7 +21977,7 @@ allowedActions.push("edit"); allowedActions.push("enableMaintenanceMode"); allowedActions.push("cancelMaintenanceMode"); - } else if (jsonObj.resourcestate == "PrepareForMaintenance") { + } else if (jsonObj.resourcestate == "PrepareForMaintenance" || jsonObj.resourcestate == 'ErrorInPrepareForMaintenance') { allowedActions.push("edit"); allowedActions.push("cancelMaintenanceMode"); } else if (jsonObj.resourcestate == 
"Maintenance") { @@ -22029,7 +22031,7 @@ } else if (jsonObj.state == "ErrorInMaintenance") { allowedActions.push("enableMaintenanceMode"); allowedActions.push("cancelMaintenanceMode"); - } else if (jsonObj.state == "PrepareForMaintenance") { + } else if (jsonObj.state == "PrepareForMaintenance" || jsonObj.resourcestate == "ErrorInPrepareForMaintenance") { allowedActions.push("cancelMaintenanceMode"); } else if (jsonObj.state == "Maintenance") { allowedActions.push("cancelMaintenanceMode");