Better tracking of host maintenance and handling of migration jobs (#3425)

* Service layer changes for new way of tracking maintenance progress

* Fixes after offline code review

* Fix marvin tests

* Change state name and add documentation

* Fix test

* Fix and add more unit tests for different cases

* Fix and enhance Marvin Tests

* Fixes for corner cases

* More fixes and logging

* UI fixes

* Some minor changes and reducing VMs on host for more contained tests

* Fixed ssh client auth problem causing test failure

* Code review changes + fixes + some more logging

* Fix flaky tests by adding delays between host states

* Added fetching only enabled hosts for tests

* Make port blocking KVM specific and refactor to handle failure

* Make migrations fail due to a tagged host instead of port blocking

* Added additional check for migrating VMs

* Refactor to use single place for methods checking maintenance states
This commit is contained in:
Anurag Awasthi 2019-12-19 21:06:20 +05:30 committed by Andrija Panic
parent cf6e616d5b
commit 4b43c2684f
19 changed files with 780 additions and 360 deletions

View File

@ -16,23 +16,33 @@
// under the License.
package com.cloud.resource;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import com.cloud.utils.fsm.StateMachine;
public enum ResourceState {
Creating, Enabled, Disabled, PrepareForMaintenance, ErrorInMaintenance, Maintenance, Error;
Creating,
Enabled,
Disabled,
ErrorInPrepareForMaintenance,
PrepareForMaintenance,
ErrorInMaintenance,
Maintenance,
Error;
public enum Event {
InternalCreated("Resource is created"),
Enable("Admin enables"),
Disable("Admin disables"),
AdminAskMaintenace("Admin asks to enter maintenance"),
AdminAskMaintenance("Admin asks to enter maintenance"),
AdminCancelMaintenance("Admin asks to cancel maintenance"),
InternalEnterMaintenance("Resource enters maintenance"),
UpdatePassword("Admin updates password of host"),
UnableToMigrate("Management server migrates VM failed"),
UnableToMigrate("Migration of VM failed, such as from scheduled HAWork"),
UnableToMaintain("Management server has exhausted all legal operations and attempts to put into maintenance has failed"),
ErrorsCorrected("Errors were corrected on a resource attempting to enter maintenance but encountered errors"),
Error("An internal error happened"),
DeleteHost("Admin delete a host"),
@ -84,6 +94,16 @@ public enum ResourceState {
return strs;
}
/**
 * Tells whether a resource state is one of the maintenance-related states:
 * Maintenance, ErrorInMaintenance, PrepareForMaintenance or ErrorInPrepareForMaintenance.
 *
 * @param state the state to classify; {@code null} yields {@code false}
 * @return {@code true} if {@code state} is one of the four maintenance-related states
 */
public static boolean isMaintenanceState(ResourceState state) {
    // Enum constants are singletons, so identity comparison matches the equals-based lookup.
    return state == ResourceState.Maintenance
            || state == ResourceState.ErrorInMaintenance
            || state == ResourceState.PrepareForMaintenance
            || state == ResourceState.ErrorInPrepareForMaintenance;
}
/**
 * Tells whether a new maintenance attempt may be started from the given state.
 * An attempt is refused only for Maintenance, PrepareForMaintenance and
 * ErrorInPrepareForMaintenance; every other state (including {@code null}) is accepted.
 *
 * @param state the current resource state
 * @return {@code false} if {@code state} is one of the three refusing states, {@code true} otherwise
 */
public static boolean canAttemptMaintenance(ResourceState state) {
    final List<ResourceState> refusingStates = Arrays.asList(
            ResourceState.Maintenance,
            ResourceState.PrepareForMaintenance,
            ResourceState.ErrorInPrepareForMaintenance);
    if (refusingStates.contains(state)) {
        return false;
    }
    return true;
}
protected static final StateMachine<ResourceState, Event> s_fsm = new StateMachine<ResourceState, Event>();
static {
s_fsm.addTransition(null, Event.InternalCreated, ResourceState.Enabled);
@ -92,22 +112,31 @@ public enum ResourceState {
s_fsm.addTransition(ResourceState.Enabled, Event.Enable, ResourceState.Enabled);
s_fsm.addTransition(ResourceState.Enabled, Event.InternalCreated, ResourceState.Enabled);
s_fsm.addTransition(ResourceState.Enabled, Event.Disable, ResourceState.Disabled);
s_fsm.addTransition(ResourceState.Enabled, Event.AdminAskMaintenace, ResourceState.PrepareForMaintenance);
s_fsm.addTransition(ResourceState.Enabled, Event.AdminAskMaintenance, ResourceState.PrepareForMaintenance);
s_fsm.addTransition(ResourceState.Enabled, Event.InternalEnterMaintenance, ResourceState.Maintenance);
s_fsm.addTransition(ResourceState.Disabled, Event.Enable, ResourceState.Enabled);
s_fsm.addTransition(ResourceState.Disabled, Event.Disable, ResourceState.Disabled);
s_fsm.addTransition(ResourceState.Disabled, Event.InternalCreated, ResourceState.Disabled);
s_fsm.addTransition(ResourceState.PrepareForMaintenance, Event.InternalEnterMaintenance, ResourceState.Maintenance);
s_fsm.addTransition(ResourceState.PrepareForMaintenance, Event.AdminCancelMaintenance, ResourceState.Enabled);
s_fsm.addTransition(ResourceState.PrepareForMaintenance, Event.UnableToMigrate, ResourceState.ErrorInMaintenance);
s_fsm.addTransition(ResourceState.PrepareForMaintenance, Event.UnableToMigrate, ResourceState.ErrorInPrepareForMaintenance);
s_fsm.addTransition(ResourceState.PrepareForMaintenance, Event.UnableToMaintain, ResourceState.ErrorInMaintenance);
s_fsm.addTransition(ResourceState.PrepareForMaintenance, Event.InternalCreated, ResourceState.PrepareForMaintenance);
s_fsm.addTransition(ResourceState.Maintenance, Event.AdminCancelMaintenance, ResourceState.Enabled);
s_fsm.addTransition(ResourceState.Maintenance, Event.InternalCreated, ResourceState.Maintenance);
s_fsm.addTransition(ResourceState.Maintenance, Event.DeleteHost, ResourceState.Disabled);
s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.InternalCreated, ResourceState.ErrorInPrepareForMaintenance);
s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.Disable, ResourceState.Disabled);
s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.DeleteHost, ResourceState.Disabled);
s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.InternalEnterMaintenance, ResourceState.Maintenance);
s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.AdminCancelMaintenance, ResourceState.Enabled);
s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.UnableToMigrate, ResourceState.ErrorInPrepareForMaintenance);
s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.UnableToMaintain, ResourceState.ErrorInMaintenance);
s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.ErrorsCorrected, ResourceState.PrepareForMaintenance);
s_fsm.addTransition(ResourceState.ErrorInMaintenance, Event.InternalCreated, ResourceState.ErrorInMaintenance);
s_fsm.addTransition(ResourceState.ErrorInMaintenance, Event.AdminAskMaintenance, ResourceState.PrepareForMaintenance);
s_fsm.addTransition(ResourceState.ErrorInMaintenance, Event.Disable, ResourceState.Disabled);
s_fsm.addTransition(ResourceState.ErrorInMaintenance, Event.DeleteHost, ResourceState.Disabled);
s_fsm.addTransition(ResourceState.ErrorInMaintenance, Event.InternalEnterMaintenance, ResourceState.Maintenance);
s_fsm.addTransition(ResourceState.ErrorInMaintenance, Event.AdminCancelMaintenance, ResourceState.Enabled);
s_fsm.addTransition(ResourceState.Error, Event.InternalCreated, ResourceState.Error);
s_fsm.addTransition(ResourceState.Disabled, Event.DeleteHost, ResourceState.Disabled);

View File

@ -16,8 +16,6 @@
// under the License.
package org.apache.cloudstack.api.command.admin.host;
import org.apache.log4j.Logger;
import org.apache.cloudstack.api.APICommand;
import org.apache.cloudstack.api.ApiCommandJobType;
import org.apache.cloudstack.api.ApiConstants;
@ -27,10 +25,12 @@ import org.apache.cloudstack.api.Parameter;
import org.apache.cloudstack.api.ServerApiException;
import org.apache.cloudstack.api.response.HostResponse;
import org.apache.cloudstack.context.CallContext;
import org.apache.log4j.Logger;
import com.cloud.event.EventTypes;
import com.cloud.host.Host;
import com.cloud.user.Account;
import com.cloud.utils.exception.CloudRuntimeException;
@APICommand(name = "prepareHostForMaintenance", description = "Prepares a host for maintenance.", responseObject = HostResponse.class,
requestHasSensitiveInfo = false, responseHasSensitiveInfo = false)
@ -99,13 +99,17 @@ public class PrepareForMaintenanceCmd extends BaseAsyncCmd {
@Override
public void execute() {
Host result = _resourceService.maintain(this);
if (result != null) {
HostResponse response = _responseGenerator.createHostResponse(result);
response.setResponseName("host");
this.setResponseObject(response);
} else {
throw new ServerApiException(ApiErrorCode.INTERNAL_ERROR, "Failed to prepare host for maintenance");
try {
Host result = _resourceService.maintain(this);
if (result != null) {
HostResponse response = _responseGenerator.createHostResponse(result);
response.setResponseName("host");
this.setResponseObject(response);
} else {
throw new ServerApiException(ApiErrorCode.INTERNAL_ERROR, "Failed to prepare host for maintenance");
}
} catch (CloudRuntimeException exception) {
throw new ServerApiException(ApiErrorCode.INTERNAL_ERROR, "Failed to prepare host for maintenance due to: " + exception.getMessage());
}
}
}

View File

@ -102,6 +102,7 @@ public interface HighAvailabilityManager extends Manager {
boolean hasPendingHaWork(long vmId);
boolean hasPendingMigrationsWork(long vmId);
/**
* @return
*/

View File

@ -47,11 +47,6 @@ import org.apache.cloudstack.framework.config.Configurable;
*/
public interface ResourceManager extends ResourceService, Configurable {
ConfigKey<Integer> HostMaintenanceRetries = new ConfigKey<>("Advanced", Integer.class,
"host.maintenance.retries","20",
"Number of retries when preparing a host into Maintenance Mode is faulty before failing",
true, ConfigKey.Scope.Cluster);
ConfigKey<Boolean> KvmSshToAgentEnabled = new ConfigKey<>("Advanced", Boolean.class,
"kvm.ssh.to.agent","true",
"Number of retries when preparing a host into Maintenance Mode is faulty before failing",
@ -97,7 +92,7 @@ public interface ResourceManager extends ResourceService, Configurable {
boolean umanageHost(long hostId);
boolean maintenanceFailed(long hostId);
boolean migrateAwayFailed(long hostId, long vmId);
public boolean maintain(final long hostId) throws AgentUnavailableException;

View File

@ -1583,7 +1583,7 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
final HostVO h = sc.find();
if (h != null) {
final ResourceState resourceState = h.getResourceState();
if (resourceState == ResourceState.Disabled || resourceState == ResourceState.Maintenance || resourceState == ResourceState.ErrorInMaintenance) {
if (resourceState == ResourceState.Disabled || resourceState == ResourceState.Maintenance) {
/*
* Host is in non-operation state, so no investigation and direct put agent to Disconnected
*/
@ -1605,7 +1605,9 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
}
final QueryBuilder<HostVO> sc = QueryBuilder.create(HostVO.class);
sc.and(sc.entity().getResourceState(), Op.IN, ResourceState.PrepareForMaintenance, ResourceState.ErrorInMaintenance);
sc.and(sc.entity().getResourceState(), Op.IN,
ResourceState.PrepareForMaintenance,
ResourceState.ErrorInPrepareForMaintenance);
final List<HostVO> hosts = sc.list();
for (final HostVO host : hosts) {

View File

@ -16,16 +16,10 @@
// under the License.
package org.apache.cloudstack.engine.datacenter.entity.api.db;
import com.cloud.host.Status;
import com.cloud.hypervisor.Hypervisor.HypervisorType;
import com.cloud.resource.ResourceState;
import com.cloud.storage.Storage.StoragePoolType;
import com.cloud.utils.NumbersUtil;
import com.cloud.utils.db.GenericDao;
import com.cloud.utils.db.StateMachine;
import org.apache.cloudstack.api.Identity;
import org.apache.cloudstack.engine.datacenter.entity.api.DataCenterResourceEntity.State;
import org.apache.cloudstack.engine.datacenter.entity.api.DataCenterResourceEntity.State.Event;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import javax.persistence.Column;
import javax.persistence.DiscriminatorColumn;
@ -42,10 +36,18 @@ import javax.persistence.Table;
import javax.persistence.Temporal;
import javax.persistence.TemporalType;
import javax.persistence.Transient;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import org.apache.cloudstack.api.Identity;
import org.apache.cloudstack.engine.datacenter.entity.api.DataCenterResourceEntity.State;
import org.apache.cloudstack.engine.datacenter.entity.api.DataCenterResourceEntity.State.Event;
import com.cloud.host.Status;
import com.cloud.hypervisor.Hypervisor.HypervisorType;
import com.cloud.resource.ResourceState;
import com.cloud.storage.Storage.StoragePoolType;
import com.cloud.utils.NumbersUtil;
import com.cloud.utils.db.GenericDao;
import com.cloud.utils.db.StateMachine;
@Entity
@Table(name = "host")
@ -730,7 +732,7 @@ public class EngineHostVO implements EngineHost, Identity {
@Override
public boolean isInMaintenanceStates() {
return (getResourceState() == ResourceState.Maintenance || getResourceState() == ResourceState.ErrorInMaintenance || getResourceState() == ResourceState.PrepareForMaintenance);
return ResourceState.isMaintenanceState(getResourceState());
}
public long getUpdated() {

View File

@ -16,12 +16,11 @@
// under the License.
package com.cloud.host;
import com.cloud.agent.api.VgpuTypesInfo;
import com.cloud.hypervisor.Hypervisor.HypervisorType;
import com.cloud.resource.ResourceState;
import com.cloud.storage.Storage.StoragePoolType;
import com.cloud.utils.NumbersUtil;
import com.cloud.utils.db.GenericDao;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import javax.persistence.Column;
import javax.persistence.DiscriminatorColumn;
@ -38,11 +37,13 @@ import javax.persistence.Table;
import javax.persistence.Temporal;
import javax.persistence.TemporalType;
import javax.persistence.Transient;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import com.cloud.agent.api.VgpuTypesInfo;
import com.cloud.hypervisor.Hypervisor.HypervisorType;
import com.cloud.resource.ResourceState;
import com.cloud.storage.Storage.StoragePoolType;
import com.cloud.utils.NumbersUtil;
import com.cloud.utils.db.GenericDao;
@Entity
@Table(name = "host")
@ -714,9 +715,8 @@ public class HostVO implements Host {
@Override
public boolean isInMaintenanceStates() {
return (getResourceState() == ResourceState.Maintenance || getResourceState() == ResourceState.ErrorInMaintenance || getResourceState() == ResourceState.PrepareForMaintenance);
return ResourceState.isMaintenanceState(getResourceState());
}
@Override
public boolean isDisabled() {
return (getResourceState() == ResourceState.Disabled);

View File

@ -19,3 +19,4 @@
-- Schema upgrade cleanup from 4.13.0.0 to 4.14.0.0
--;
DELETE FROM `cloud`.`configuration` WHERE name = 'host.maintenance.retries';

View File

@ -28,17 +28,19 @@ import java.util.concurrent.TimeUnit;
import javax.inject.Inject;
import javax.naming.ConfigurationException;
import org.apache.log4j.Logger;
import org.apache.log4j.NDC;
import org.apache.cloudstack.engine.orchestration.service.VolumeOrchestrationService;
import org.apache.cloudstack.framework.config.ConfigKey;
import org.apache.cloudstack.framework.config.Configurable;
import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
import org.apache.cloudstack.managed.context.ManagedContext;
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
import org.apache.cloudstack.management.ManagementServerHost;
import org.apache.log4j.Logger;
import org.apache.log4j.NDC;
import com.cloud.agent.AgentManager;
import com.cloud.alert.AlertManager;
import com.cloud.cluster.ClusterManagerListener;
import org.apache.cloudstack.management.ManagementServerHost;
import com.cloud.configuration.Config;
import com.cloud.dc.ClusterDetailsDao;
import com.cloud.dc.DataCenterVO;
@ -101,9 +103,14 @@ import com.cloud.vm.dao.VMInstanceDao;
* ha.retry.wait | time to wait before retrying the work item | seconds | 120 || || stop.retry.wait | time to wait
* before retrying the stop | seconds | 120 || * }
**/
public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvailabilityManager, ClusterManagerListener {
public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvailabilityManager, ClusterManagerListener, Configurable {
protected static final Logger s_logger = Logger.getLogger(HighAvailabilityManagerImpl.class);
// Configuration key "max.retries" in the "Advanced" category, with string default "5" and
// cluster scope; its description is the total number of attempts for migrating a VM.
// It is exposed to the configuration framework through getConfigKeys().
// NOTE(review): the retry checks in this class read the _maxRetries field; how that field
// is populated from this key is not visible here - confirm they stay in sync.
private ConfigKey<Integer> MaxRetries = new ConfigKey<>("Advanced", Integer.class,
"max.retries","5",
"Total number of attempts for trying migration of a VM.",
true, ConfigKey.Scope.Cluster);
WorkerThread[] _workers;
boolean _stopped;
long _timeToSleep;
@ -314,6 +321,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
if (vm.getHostId() != null) {
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated());
_haDao.persist(work);
s_logger.info("Scheduled migration work of VM " + vm.getUuid() + " from host " + _hostDao.findById(vm.getHostId()) + " with HAWork " + work);
wakeupWorkers();
}
return true;
@ -629,23 +637,32 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
public Long migrate(final HaWorkVO work) {
long vmId = work.getInstanceId();
long srcHostId = work.getHostId();
VMInstanceVO vm = _instanceDao.findById(vmId);
if (vm == null) {
s_logger.info("Unable to find vm: " + vmId + ", skipping migrate.");
return null;
}
s_logger.info("Migration attempt: for VM " + vm.getUuid() + "from host id " + srcHostId +
". Starting attempt: " + (1 + work.getTimesTried()) + "/" + _maxRetries + " times.");
try {
work.setStep(Step.Migrating);
_haDao.update(work.getId(), work);
VMInstanceVO vm = _instanceDao.findById(vmId);
if (vm == null) {
return null;
}
// First try starting the vm with its original planner, if it doesn't succeed send HAPlanner as its an emergency.
_itMgr.migrateAway(vm.getUuid(), srcHostId);
return null;
} catch (InsufficientServerCapacityException e) {
s_logger.warn("Insufficient capacity for migrating a VM.");
_resourceMgr.maintenanceFailed(srcHostId);
s_logger.warn("Migration attempt: Insufficient capacity for migrating a VM " +
vm.getUuid() + " from source host id " + srcHostId +
". Exception: " + e.getMessage());
_resourceMgr.migrateAwayFailed(srcHostId, vmId);
return (System.currentTimeMillis() >> 10) + _migrateRetryInterval;
} catch (Exception e) {
s_logger.warn("Migration attempt: Unexpected exception occurred when attempting migration of " +
vm.getUuid() + e.getMessage());
throw e;
}
}
@ -744,7 +761,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
/**
 * Deletes the scheduled work items of this management server for the given host.
 * Work of type Stop is deleted when the host is a storage host, work of type Migration otherwise.
 *
 * @param host the host whose scheduled work items are removed
 */
@Override
public void cancelScheduledMigrations(final HostVO host) {
    final WorkType workTypeToCancel;
    if (host.getType() == HostVO.Type.Storage) {
        workTypeToCancel = WorkType.Stop;
    } else {
        workTypeToCancel = WorkType.Migration;
    }
    s_logger.info("Canceling all scheduled migrations from host " + host.getUuid());
    _haDao.deleteMigrationWorkItems(host.getId(), workTypeToCancel, _serverId);
}
@ -762,7 +779,6 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
}
private void rescheduleWork(final HaWorkVO work, final long nextTime) {
s_logger.info("Rescheduling work " + work + " to try again at " + new Date(nextTime << 10));
work.setTimeToTry(nextTime);
work.setTimesTried(work.getTimesTried() + 1);
work.setServerId(null);
@ -803,7 +819,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
}
if (nextTime == null) {
s_logger.info("Completed work " + work);
s_logger.info("Completed work " + work + ". Took " + (work.getTimesTried() + 1) + "/" + _maxRetries + " attempts.");
work.setStep(Step.Done);
} else {
rescheduleWork(work, nextTime.longValue());
@ -819,12 +835,18 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
VMInstanceVO vm = _instanceDao.findById(work.getInstanceId());
work.setUpdateTime(vm.getUpdated());
work.setPreviousState(vm.getState());
} finally {
if (!Step.Done.equals(work.getStep())) {
if (work.getTimesTried() >= _maxRetries) {
s_logger.warn("Giving up, retried max " + work.getTimesTried() + "/" + _maxRetries + " times for work: " + work);
work.setStep(Step.Done);
} else {
s_logger.warn("Rescheduling work " + work + " to try again at " + new Date(work.getTimeToTry() << 10) +
". Finished attempt " + work.getTimesTried() + "/" + _maxRetries + " times.");
}
}
_haDao.update(work.getId(), work);
}
if (!Step.Done.equals(work.getStep()) && work.getTimesTried() >= _maxRetries) {
s_logger.warn("Giving up, retried max. times for work: " + work);
work.setStep(Step.Done);
}
_haDao.update(work.getId(), work);
}
@Override
@ -908,6 +930,16 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
return true;
}
/**
 * @return the simple class name of {@code HighAvailabilityManagerImpl}, used as the
 *         component name for its configuration keys
 */
@Override
public String getConfigComponentName() {
    final Class<?> componentClass = HighAvailabilityManagerImpl.class;
    return componentClass.getSimpleName();
}
/**
 * @return a one-element array holding the {@code MaxRetries} configuration key
 */
@Override
public ConfigKey<?>[] getConfigKeys() {
    final ConfigKey<?>[] keys = new ConfigKey<?>[1];
    keys[0] = MaxRetries;
    return keys;
}
protected class CleanupTask extends ManagedContextRunnable {
@Override
protected void runInContext() {
@ -1004,4 +1036,18 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
List<HaWorkVO> haWorks = _haDao.listPendingHaWorkForVm(vmId);
return haWorks.size() > 0;
}
/**
 * Checks whether the VM still has migration work that counts as pending.
 * A work item returned by the DAO counts as pending while its number of tries does not
 * exceed {@code _maxRetries}; items beyond that limit are only logged as a warning.
 *
 * @param vmId id of the VM instance
 * @return {@code true} as soon as one pending item within the retry limit is found,
 *         {@code false} otherwise
 */
@Override
public boolean hasPendingMigrationsWork(long vmId) {
    final List<HaWorkVO> pendingMigrations = _haDao.listPendingMigrationsForVm(vmId);
    for (final HaWorkVO pending : pendingMigrations) {
        if (pending.getTimesTried() > _maxRetries) {
            // Stale item: over the retry limit but never moved to a terminal step.
            s_logger.warn("HAWork Job of migration type " + pending + " found in database which has max retries more than "
                    + _maxRetries + " but still not in Done, Cancelled, or Error State");
            continue;
        }
        return true;
    }
    return false;
}
}

View File

@ -83,4 +83,6 @@ public interface HighAvailabilityDao extends GenericDao<HaWorkVO, Long> {
List<HaWorkVO> listRunningHaWorkForVm(long vmId);
List<HaWorkVO> listPendingHaWorkForVm(long vmId);
List<HaWorkVO> listPendingMigrationsForVm(long vmId);
}

View File

@ -48,6 +48,7 @@ public class HighAvailabilityDaoImpl extends GenericDaoBase<HaWorkVO, Long> impl
private final SearchBuilder<HaWorkVO> FutureHaWorkSearch;
private final SearchBuilder<HaWorkVO> RunningHaWorkSearch;
private final SearchBuilder<HaWorkVO> PendingHaWorkSearch;
private final SearchBuilder<HaWorkVO> MigratingWorkSearch;
protected HighAvailabilityDaoImpl() {
super();
@ -112,6 +113,12 @@ public class HighAvailabilityDaoImpl extends GenericDaoBase<HaWorkVO, Long> impl
PendingHaWorkSearch.and("type", PendingHaWorkSearch.entity().getType(), Op.EQ);
PendingHaWorkSearch.and("step", PendingHaWorkSearch.entity().getStep(), Op.NIN);
PendingHaWorkSearch.done();
MigratingWorkSearch = createSearchBuilder();
MigratingWorkSearch.and("instance", MigratingWorkSearch.entity().getInstanceId(), Op.EQ);
MigratingWorkSearch.and("workType", MigratingWorkSearch.entity().getWorkType(), Op.EQ);
MigratingWorkSearch.and("step", MigratingWorkSearch.entity().getStep(), Op.NIN);
MigratingWorkSearch.done();
}
@Override
@ -124,6 +131,16 @@ public class HighAvailabilityDaoImpl extends GenericDaoBase<HaWorkVO, Long> impl
return search(sc, null);
}
/**
 * Lists the Migration work items of a VM whose step is none of Done, Error or Cancelled.
 *
 * @param vmId id of the VM instance
 * @return the matching work items, as returned by the search
 */
@Override
public List<HaWorkVO> listPendingMigrationsForVm(long vmId) {
    final SearchCriteria<HaWorkVO> criteria = MigratingWorkSearch.create();
    criteria.setParameters("instance", vmId);
    criteria.setParameters("workType", WorkType.Migration);
    // "step" is bound with a NOT IN operator in MigratingWorkSearch, so these steps are excluded.
    criteria.setParameters("step", Step.Done, Step.Error, Step.Cancelled);
    return search(criteria, null);
}
@Override
public List<HaWorkVO> listRunningHaWorkForVm(long vmId) {
SearchCriteria<HaWorkVO> sc = RunningHaWorkSearch.create();

View File

@ -26,7 +26,6 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.ConcurrentHashMap;
import javax.inject.Inject;
import javax.naming.ConfigurationException;
@ -274,8 +273,6 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
private SearchBuilder<HostGpuGroupsVO> _gpuAvailability;
private Map<Long,Integer> retryHostMaintenance = new ConcurrentHashMap<>();
private void insertListener(final Integer event, final ResourceListener listener) {
List<ResourceListener> lst = _lifeCycleListeners.get(event);
if (lst == null) {
@ -1165,6 +1162,10 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
throw new InvalidParameterValueException("Host with id " + hostId.toString() + " doesn't exist");
}
if (!ResourceState.isMaintenanceState(host.getResourceState())) {
throw new CloudRuntimeException("Cannot perform cancelMaintenance when resource state is " + host.getResourceState() + ", hostId = " + hostId);
}
processResourceEvent(ResourceListener.EVENT_CANCEL_MAINTENANCE_BEFORE, hostId);
final boolean success = cancelMaintenance(hostId);
processResourceEvent(ResourceListener.EVENT_CANCEL_MAINTENANCE_AFTER, hostId);
@ -1212,6 +1213,12 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
private boolean doMaintain(final long hostId) {
final HostVO host = _hostDao.findById(hostId);
s_logger.info("Maintenance: attempting maintenance of host " + host.getUuid());
ResourceState hostState = host.getResourceState();
if (!ResourceState.canAttemptMaintenance(hostState)) {
throw new CloudRuntimeException("Cannot perform maintain when resource state is " + hostState + ", hostId = " + hostId);
}
final MaintainAnswer answer = (MaintainAnswer)_agentMgr.easySend(hostId, new MaintainCommand());
if (answer == null || !answer.getResult()) {
s_logger.warn("Unable to send MaintainCommand to host: " + hostId);
@ -1219,7 +1226,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
}
try {
resourceStateTransitTo(host, ResourceState.Event.AdminAskMaintenace, _nodeId);
resourceStateTransitTo(host, ResourceState.Event.AdminAskMaintenance, _nodeId);
} catch (final NoTransitionException e) {
final String err = "Cannot transmit resource state of host " + host.getId() + " to " + ResourceState.Maintenance;
s_logger.debug(err, e);
@ -1228,7 +1235,6 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
ActionEventUtils.onStartedActionEvent(CallContext.current().getCallingUserId(), CallContext.current().getCallingAccountId(), EventTypes.EVENT_MAINTENANCE_PREPARE, "starting maintenance for host " + hostId, true, 0);
_agentMgr.pullAgentToMaintenance(hostId);
setHostMaintenanceRetries(host);
/* TODO: move below to listener */
if (host.getType() == Host.Type.Routing) {
@ -1244,11 +1250,13 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
|| _serviceOfferingDetailsDao.findDetail(vm.getServiceOfferingId(), GPU.Keys.vgpuType.toString()) != null) {
// Migration is not supported for VGPU Vms so stop them.
// for the last host in this cluster, stop all the VMs
s_logger.error("Maintenance: No hosts available for migrations. Scheduling shutdown instead of migrations.");
_haMgr.scheduleStop(vm, hostId, WorkType.ForceStop);
} else if (HypervisorType.LXC.equals(host.getHypervisorType()) && VirtualMachine.Type.User.equals(vm.getType())){
//Migration is not supported for LXC Vms. Schedule restart instead.
_haMgr.scheduleRestart(vm, false);
} else {
s_logger.info("Maintenance: scheduling migration of VM " + vm.getUuid() + " from host " + host.getUuid());
_haMgr.scheduleMigration(vm);
}
}
@ -1256,19 +1264,9 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
return true;
}
/**
* Set retries for transiting the host into Maintenance
*/
protected void setHostMaintenanceRetries(HostVO host) {
Integer retries = HostMaintenanceRetries.valueIn(host.getClusterId());
retryHostMaintenance.put(host.getId(), retries);
s_logger.debug(String.format("Setting the host %s (%s) retries for Maintenance mode: %s",
host.getId(), host.getName(), retries));
}
@Override
public boolean maintain(final long hostId) throws AgentUnavailableException {
final Boolean result = propagateResourceEvent(hostId, ResourceState.Event.AdminAskMaintenace);
final Boolean result = propagateResourceEvent(hostId, ResourceState.Event.AdminAskMaintenance);
if (result != null) {
return result;
}
@ -1285,13 +1283,29 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
s_logger.debug("Unable to find host " + hostId);
throw new InvalidParameterValueException("Unable to find host with ID: " + hostId + ". Please specify a valid host ID.");
}
if (!ResourceState.canAttemptMaintenance(host.getResourceState())) {
throw new CloudRuntimeException("Host is already in state " + host.getResourceState() + ". Cannot recall for maintenance until resolved.");
}
if (_hostDao.countBy(host.getClusterId(), ResourceState.PrepareForMaintenance, ResourceState.ErrorInMaintenance) > 0) {
throw new InvalidParameterValueException("There are other servers in PrepareForMaintenance OR ErrorInMaintenance STATUS in cluster " + host.getClusterId());
if (_hostDao.countBy(host.getClusterId(), ResourceState.PrepareForMaintenance, ResourceState.ErrorInPrepareForMaintenance) > 0) {
throw new CloudRuntimeException("There are other servers attempting migrations for maintenance. " +
"Found hosts in PrepareForMaintenance OR ErrorInPrepareForMaintenance STATUS in cluster " + host.getClusterId());
}
if (_storageMgr.isLocalStorageActiveOnHost(host.getId())) {
throw new InvalidParameterValueException("There are active VMs using the host's local storage pool. Please stop all VMs on this host that use local storage.");
throw new CloudRuntimeException("There are active VMs using the host's local storage pool. Please stop all VMs on this host that use local storage.");
}
List<VMInstanceVO> migratingInVMs = _vmDao.findByHostInStates(hostId, State.Migrating);
if (migratingInVMs.size() > 0) {
throw new CloudRuntimeException("Host contains incoming VMs migrating. Please wait for them to complete before putting to maintenance.");
}
if (_vmDao.findByHostInStates(hostId, State.Starting, State.Stopping).size() > 0) {
throw new CloudRuntimeException("Host contains VMs in starting/stopping state. Please wait for them to complete before putting to maintenance.");
}
if (_vmDao.findByHostInStates(hostId, State.Error, State.Unknown).size() > 0) {
throw new CloudRuntimeException("Host contains VMs in error/unknown/shutdown state. Please fix errors to proceed.");
}
try {
@ -1331,19 +1345,6 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
}
}
/**
* Set host into ErrorInMaintenance state, as errors occurred during VM migrations. Do the following:
* - Cancel scheduled migrations for those which have already failed
* - Configure VNC access for VMs (KVM hosts only)
*/
protected boolean setHostIntoErrorInMaintenance(HostVO host, List<VMInstanceVO> failedMigrations) throws NoTransitionException {
s_logger.debug("Unable to migrate " + failedMigrations.size() + " VM(s) from host " + host.getUuid());
_haMgr.cancelScheduledMigrations(host);
configureVncAccessForKVMHostFailedMigrations(host, failedMigrations);
resourceStateTransitTo(host, ResourceState.Event.UnableToMigrate, _nodeId);
return false;
}
/**
* Safely transit host into Maintenance mode
*/
@ -1357,31 +1358,104 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
}
/**
* Return true if host goes into Maintenance mode, only when:
* - No Running, Migrating or Failed migrations (host_id = last_host_id) for the host
* Set host into ErrorInMaintenance state, as errors occurred during VM migrations. Do the following:
* - Cancel scheduled migrations for those which have already failed
* - Configure VNC access for VMs (KVM hosts only)
*/
protected boolean isHostInMaintenance(HostVO host, List<VMInstanceVO> runningVms, List<VMInstanceVO> migratingVms, List<VMInstanceVO> failedMigrations) throws NoTransitionException {
if (CollectionUtils.isEmpty(runningVms) && CollectionUtils.isEmpty(migratingVms)) {
return CollectionUtils.isEmpty(failedMigrations) ?
setHostIntoMaintenance(host) :
setHostIntoErrorInMaintenance(host, failedMigrations);
} else if (retryHostMaintenance.containsKey(host.getId())) {
Integer retriesLeft = retryHostMaintenance.get(host.getId());
if (retriesLeft != null) {
if (retriesLeft <= 0) {
retryHostMaintenance.remove(host.getId());
s_logger.debug(String.format("No retries left while preparing KVM host %s (%s) for Maintenance, " +
"please investigate this connection.",
host.getId(), host.getName()));
return setHostIntoErrorInMaintenance(host, failedMigrations);
}
retriesLeft--;
retryHostMaintenance.put(host.getId(), retriesLeft);
s_logger.debug(String.format("Retries left preparing KVM host %s (%s) for Maintenance: %s",
host.getId(), host.getName(), retriesLeft));
protected boolean setHostIntoErrorInMaintenance(HostVO host, List<VMInstanceVO> errorVms) throws NoTransitionException {
    // Log the number of VMs that block maintenance on this host.
    final int blockedCount = errorVms.size();
    final String hostUuid = host.getUuid();
    s_logger.debug("Unable to migrate / fix errors for " + blockedCount + " VM(s) from host " + hostUuid);

    // Cancel what is still scheduled for the host and configure VNC access for the affected VMs.
    _haMgr.cancelScheduledMigrations(host);
    configureVncAccessForKVMHostFailedMigrations(host, errorVms);

    // Fire the UnableToMaintain event; the caller is always told the host is not in maintenance.
    resourceStateTransitTo(host, ResourceState.Event.UnableToMaintain, _nodeId);
    return false;
}
/**
 * Fires the UnableToMigrate event for the host after configuring VNC access for the given VMs.
 * Unlike {@code setHostIntoErrorInMaintenance}, scheduled migrations are not cancelled here.
 *
 * @param host     host that hit errors while being prepared for maintenance
 * @param errorVms VMs handed to the VNC access configuration
 * @return always false, as the host did not reach Maintenance
 * @throws NoTransitionException declared by the state transition this method delegates to
 */
protected boolean setHostIntoErrorInPrepareForMaintenance(HostVO host, List<VMInstanceVO> errorVms) throws NoTransitionException {
    // Use the actual ResourceState name so the log line can be matched against the host's state.
    s_logger.debug("Host " + host.getUuid() + " entering in ErrorInPrepareForMaintenance state");
    configureVncAccessForKVMHostFailedMigrations(host, errorVms);
    resourceStateTransitTo(host, ResourceState.Event.UnableToMigrate, _nodeId);
    return false;
}
/**
 * Fires the ErrorsCorrected event for the host.
 *
 * @param host host whose earlier errors are no longer present
 * @return always false, as the host is not in Maintenance after this call
 * @throws NoTransitionException declared by the state transition this method delegates to
 */
protected boolean setHostIntoPrepareForMaintenanceAfterErrorsFixed(HostVO host) throws NoTransitionException {
    final String hostUuid = host.getUuid();
    final String message = "Host " + hostUuid + " entering in PrepareForMaintenance state as any previous corrections have been fixed";
    s_logger.debug(message);

    resourceStateTransitTo(host, ResourceState.Event.ErrorsCorrected, _nodeId);
    return false;
}
/**
 * Attempt to move the host into Maintenance mode. There are various possibilities for VMs' states
 * on a host. We need to track the various VM states on each run and accordingly transit to the
 * appropriate state.
 *
 * We change states as follows -
 * 1. If there are no VMs in running, migrating, starting, stopping, error, unknown states we can move
 *    to maintenance state. Note that there cannot be incoming migrations as the API Call prepare for
 *    maintenance checks incoming migrations before starting.
 * 2. If no migration retries are pending and nothing is migrating away but VMs are still running, or
 *    if nothing is running or migrating away but there are VMs in error/unknown state, we mark the
 *    host as ErrorInMaintenance.
 * 3. If there are errors (failed migrations, VMs in error/unknown state) while migration retries,
 *    ongoing migrations or stopping VMs are still in progress, we mark the host as
 *    ErrorInPrepareForMaintenance but don't stop remaining migrations/ongoing legitimate operations.
 * 4. Lastly if none of the above applies and the host was in ErrorInPrepareForMaintenance, we push
 *    the host back to PrepareForMaintenance state.
 *
 * @param host host being prepared for maintenance
 * @return the result of {@code setHostIntoMaintenance(host)} when case 1 applies, false otherwise
 * @throws NoTransitionException declared by the state-transition helpers this method delegates to
 */
protected boolean attemptMaintain(HostVO host) throws NoTransitionException {
    final long hostId = host.getId();

    s_logger.info("Attempting maintenance for host " + host.getName());

    // Step 0: First gather if VMs have pending HAWork for migration with retries left.
    final List<VMInstanceVO> allVmsOnHost = _vmDao.listByHostId(hostId);
    final boolean hasMigratingAwayVms = CollectionUtils.isNotEmpty(_vmDao.listVmsMigratingFromHost(hostId));
    boolean hasPendingMigrationRetries = false;
    for (VMInstanceVO vmInstanceVO : allVmsOnHost) {
        if (_haMgr.hasPendingMigrationsWork(vmInstanceVO.getId())) {
            s_logger.info("Attempting maintenance for " + host + " found pending migration for VM " + vmInstanceVO);
            hasPendingMigrationRetries = true;
            break;
        }
    }

    // Step 1: If there are no VMs in migrating, running, starting, stopping, error or unknown state we can safely move the host to maintenance.
    if (!hasMigratingAwayVms && CollectionUtils.isEmpty(_vmDao.findByHostInStates(hostId,
            State.Migrating, State.Running, State.Starting, State.Stopping, State.Error, State.Unknown))) {
        if (hasPendingMigrationRetries) {
            // Inconsistent bookkeeping: HA work is pending although no VM is left in any of those states.
            s_logger.error("There should not be pending retries VMs for this host as there are no running, migrating, " +
                    "starting, stopping, error or unknown states on host " + host);
        }
        return setHostIntoMaintenance(host);
    }

    // Step 2: Gather relevant VMs' states on the host; based on them we determine which state
    // the host has to transit to.
    final List<VMInstanceVO> failedMigrations = new ArrayList<>(_vmDao.listNonMigratingVmsByHostEqualsLastHost(hostId));
    final List<VMInstanceVO> errorVms = new ArrayList<>(_vmDao.findByHostInStates(hostId, State.Unknown, State.Error));
    final boolean hasRunningVms = CollectionUtils.isNotEmpty(_vmDao.findByHostInStates(hostId, State.Running));
    final boolean hasFailedMigrations = CollectionUtils.isNotEmpty(failedMigrations);
    final boolean hasVmsInFailureStates = CollectionUtils.isNotEmpty(errorVms);
    final boolean hasStoppingVms = CollectionUtils.isNotEmpty(_vmDao.findByHostInStates(hostId, State.Stopping));
    // From here on errorVms also carries the failed migrations; the flags above were computed
    // before the merge, so hasVmsInFailureStates only reflects VMs in Unknown/Error state.
    errorVms.addAll(failedMigrations);

    // Step 3: If there are no pending migration retries and nothing is migrating away but the host
    // still has running VMs, or nothing is running or migrating away but the host has VMs in failure
    // states, we move the host to ErrorInMaintenance state.
    if ((!hasPendingMigrationRetries && !hasMigratingAwayVms && hasRunningVms) ||
            (!hasRunningVms && !hasMigratingAwayVms && hasVmsInFailureStates)) {
        return setHostIntoErrorInMaintenance(host, errorVms);
    }

    // Step 4: If there are pending migration retries, ongoing migrations or stopping VMs and there
    // were errors or failed migrations we put the host into ErrorInPrepareForMaintenance.
    if ((hasPendingMigrationRetries || hasMigratingAwayVms || hasStoppingVms) && (hasVmsInFailureStates || hasFailedMigrations)) {
        return setHostIntoErrorInPrepareForMaintenance(host, errorVms);
    }

    // Step 5: If there were previously errors found, but not anymore it means the operator has fixed errors and we put
    // the host into PrepareForMaintenance state.
    if (host.getResourceState() == ResourceState.ErrorInPrepareForMaintenance) {
        return setHostIntoPrepareForMaintenanceAfterErrorsFixed(host);
    }

    // Nothing to change on this run; the host stays in its current state.
    return false;
}
@ -1392,14 +1466,10 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
try {
if (host.getType() != Host.Type.Storage) {
final List<VMInstanceVO> vos = _vmDao.listByHostId(hostId);
final List<VMInstanceVO> vosMigrating = _vmDao.listVmsMigratingFromHost(hostId);
final List<VMInstanceVO> failedVmMigrations = _vmDao.listNonMigratingVmsByHostEqualsLastHost(hostId);
hostInMaintenance = isHostInMaintenance(host, vos, vosMigrating, failedVmMigrations);
hostInMaintenance = attemptMaintain(host);
}
} catch (final NoTransitionException e) {
s_logger.debug("Cannot transmit host " + host.getId() + "to Maintenance state", e);
s_logger.debug("Cannot transmit host " + host.getId() + " to Maintenance state", e);
}
return hostInMaintenance;
}
@ -2327,8 +2397,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
* TODO: think twice about returning true or throwing out exception, I
* really prefer to exception that always exposes bugs
*/
if (host.getResourceState() != ResourceState.PrepareForMaintenance && host.getResourceState() != ResourceState.Maintenance &&
host.getResourceState() != ResourceState.ErrorInMaintenance) {
if (!ResourceState.isMaintenanceState(host.getResourceState())) {
throw new CloudRuntimeException("Cannot perform cancelMaintenance when resource state is " + host.getResourceState() + ", hostId = " + hostId);
}
@ -2349,7 +2418,6 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
try {
resourceStateTransitTo(host, ResourceState.Event.AdminCancelMaintenance, _nodeId);
_agentMgr.pullAgentOutMaintenance(hostId);
retryHostMaintenance.remove(hostId);
} catch (final NoTransitionException e) {
s_logger.debug("Cannot transmit host " + host.getId() + "to Enabled state", e);
return false;
@ -2433,7 +2501,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
@Override
public boolean executeUserRequest(final long hostId, final ResourceState.Event event) throws AgentUnavailableException {
if (event == ResourceState.Event.AdminAskMaintenace) {
if (event == ResourceState.Event.AdminAskMaintenance) {
return doMaintain(hostId);
} else if (event == ResourceState.Event.AdminCancelMaintenance) {
return doCancelMaintenance(hostId);
@ -2561,7 +2629,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
return null;
}
s_logger.debug("Propagating resource request event:" + event.toString() + " to agent:" + agentId);
s_logger.debug("Propagating resource request event:" + event.toString() + " to agent:" + agentId);
final Command[] cmds = new Command[1];
cmds[0] = new PropagateResourceEventCommand(agentId, event);
@ -2580,7 +2648,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
}
@Override
public boolean maintenanceFailed(final long hostId) {
public boolean migrateAwayFailed(final long hostId, final long vmId) {
final HostVO host = _hostDao.findById(hostId);
if (host == null) {
if (s_logger.isDebugEnabled()) {
@ -2589,6 +2657,8 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
return false;
} else {
try {
s_logger.warn("Migration of VM " + _vmDao.findById(vmId) + " failed from host " + _hostDao.findById(hostId) +
". Emitting event UnableToMigrate.");
return resourceStateTransitTo(host, ResourceState.Event.UnableToMigrate, _nodeId);
} catch (final NoTransitionException e) {
s_logger.debug("No next resource state for host " + host.getId() + " while current state is " + host.getResourceState() + " with event " +
@ -2704,7 +2774,11 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
sc.and(sc.entity().getDataCenterId(), Op.EQ, dcId);
}
sc.and(sc.entity().getType(), Op.EQ, type);
sc.and(sc.entity().getResourceState(), Op.NIN, ResourceState.Maintenance, ResourceState.ErrorInMaintenance, ResourceState.PrepareForMaintenance,
sc.and(sc.entity().getResourceState(), Op.NIN,
ResourceState.Maintenance,
ResourceState.ErrorInMaintenance,
ResourceState.ErrorInPrepareForMaintenance,
ResourceState.PrepareForMaintenance,
ResourceState.Error);
return sc.list();
}
@ -2981,6 +3055,6 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
@Override
public ConfigKey<?>[] getConfigKeys() {
return new ConfigKey<?>[] {HostMaintenanceRetries};
return new ConfigKey[0];
}
}

View File

@ -1556,7 +1556,11 @@ public class StatsCollector extends ManagerBase implements ComponentMethodInterc
private SearchCriteria<HostVO> createSearchCriteriaForHostTypeRoutingStateUpAndNotInMaintenance() {
SearchCriteria<HostVO> sc = _hostDao.createSearchCriteria();
sc.addAnd("status", SearchCriteria.Op.EQ, Status.Up.toString());
sc.addAnd("resourceState", SearchCriteria.Op.NIN, ResourceState.Maintenance, ResourceState.PrepareForMaintenance, ResourceState.ErrorInMaintenance);
sc.addAnd("resourceState", SearchCriteria.Op.NIN,
ResourceState.Maintenance,
ResourceState.PrepareForMaintenance,
ResourceState.ErrorInPrepareForMaintenance,
ResourceState.ErrorInMaintenance);
sc.addAnd("type", SearchCriteria.Op.EQ, Host.Type.Routing.toString());
return sc;
}

View File

@ -35,21 +35,16 @@ import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import javax.servlet.http.HttpSession;
import com.cloud.resource.ResourceState;
import org.apache.cloudstack.framework.security.keys.KeysManager;
import org.apache.commons.codec.binary.Base64;
import org.apache.log4j.Logger;
import org.springframework.stereotype.Component;
import org.springframework.web.context.support.SpringBeanAutowiringSupport;
import com.cloud.vm.VmDetailConstants;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import org.apache.cloudstack.framework.security.keys.KeysManager;
import com.cloud.exception.PermissionDeniedException;
import com.cloud.host.HostVO;
import com.cloud.hypervisor.Hypervisor;
import com.cloud.resource.ResourceState;
import com.cloud.server.ManagementServer;
import com.cloud.storage.GuestOSVO;
import com.cloud.user.Account;
@ -64,7 +59,10 @@ import com.cloud.utils.db.TransactionLegacy;
import com.cloud.vm.UserVmDetailVO;
import com.cloud.vm.VirtualMachine;
import com.cloud.vm.VirtualMachineManager;
import com.cloud.vm.VmDetailConstants;
import com.cloud.vm.dao.UserVmDetailsDao;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
/**
* Thumbnail access : /console?cmd=thumbnail&vm=xxx&w=xxx&h=xxx
@ -420,14 +418,24 @@ public class ConsoleProxyServlet extends HttpServlet {
StringBuffer sb = new StringBuffer(rootUrl);
String host = hostVo.getPrivateIpAddress();
Pair<String, Integer> portInfo;
if (hostVo.getResourceState().equals(ResourceState.ErrorInMaintenance)) {
Pair<String, Integer> portInfo = null;
if (hostVo.getHypervisorType() == Hypervisor.HypervisorType.KVM &&
(hostVo.getResourceState().equals(ResourceState.ErrorInMaintenance) ||
hostVo.getResourceState().equals(ResourceState.ErrorInPrepareForMaintenance))) {
UserVmDetailVO detailAddress = _userVmDetailsDao.findDetail(vm.getId(), VmDetailConstants.KVM_VNC_ADDRESS);
UserVmDetailVO detailPort = _userVmDetailsDao.findDetail(vm.getId(), VmDetailConstants.KVM_VNC_PORT);
portInfo = new Pair<>(detailAddress.getValue(), Integer.valueOf(detailPort.getValue()));
} else {
if (detailAddress != null && detailPort != null) {
portInfo = new Pair<>(detailAddress.getValue(), Integer.valueOf(detailPort.getValue()));
} else {
s_logger.warn("KVM Host in ErrorInMaintenance/ErrorInPrepareForMaintenance but " +
"no VNC Address/Port was available. Falling back to default one from MS.");
}
}
if (portInfo == null) {
portInfo = _ms.getVncPort(vm);
}
if (s_logger.isDebugEnabled())
s_logger.debug("Port info " + portInfo.first());

View File

@ -32,6 +32,7 @@ import org.apache.cloudstack.api.command.admin.host.PrepareForMaintenanceCmd;
import org.apache.cloudstack.api.command.admin.host.ReconnectHostCmd;
import org.apache.cloudstack.api.command.admin.host.UpdateHostCmd;
import org.apache.cloudstack.api.command.admin.host.UpdateHostPasswordCmd;
import org.apache.cloudstack.framework.config.ConfigKey;
import com.cloud.agent.api.StartupCommand;
import com.cloud.agent.api.StartupRoutingCommand;
@ -56,7 +57,6 @@ import com.cloud.org.Cluster;
import com.cloud.resource.ResourceState.Event;
import com.cloud.utils.component.ManagerBase;
import com.cloud.utils.fsm.NoTransitionException;
import org.apache.cloudstack.framework.config.ConfigKey;
public class MockResourceManagerImpl extends ManagerBase implements ResourceManager {
@ -307,10 +307,10 @@ public class MockResourceManagerImpl extends ManagerBase implements ResourceMana
}
/* (non-Javadoc)
* @see com.cloud.resource.ResourceManager#maintenanceFailed(long)
* @see com.cloud.resource.ResourceManager#migrateAwayFailed(long)
*/
@Override
public boolean maintenanceFailed(final long hostId) {
public boolean migrateAwayFailed(final long hostId, final long vmId) {
// TODO Auto-generated method stub
return false;
}

View File

@ -17,6 +17,39 @@
package com.cloud.resource;
import static com.cloud.resource.ResourceState.Event.ErrorsCorrected;
import static com.cloud.resource.ResourceState.Event.InternalEnterMaintenance;
import static com.cloud.resource.ResourceState.Event.UnableToMaintain;
import static com.cloud.resource.ResourceState.Event.UnableToMigrate;
import static org.mockito.Matchers.any;
import static org.mockito.Matchers.anyBoolean;
import static org.mockito.Matchers.anyLong;
import static org.mockito.Matchers.anyObject;
import static org.mockito.Matchers.anyString;
import static org.mockito.Matchers.eq;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.mockito.BDDMockito;
import org.mockito.InjectMocks;
import org.mockito.Mock;
import org.mockito.MockitoAnnotations;
import org.mockito.Spy;
import org.powermock.api.mockito.PowerMockito;
import org.powermock.core.classloader.annotations.PrepareForTest;
import org.powermock.modules.junit4.PowerMockRunner;
import com.cloud.agent.AgentManager;
import com.cloud.agent.api.GetVncPortAnswer;
import com.cloud.agent.api.GetVncPortCommand;
@ -35,38 +68,10 @@ import com.cloud.utils.fsm.NoTransitionException;
import com.cloud.utils.ssh.SSHCmdHelper;
import com.cloud.utils.ssh.SshException;
import com.cloud.vm.VMInstanceVO;
import com.cloud.vm.VirtualMachine;
import com.cloud.vm.dao.UserVmDetailsDao;
import com.cloud.vm.dao.VMInstanceDao;
import com.trilead.ssh2.Connection;
import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.mockito.BDDMockito;
import org.mockito.InjectMocks;
import org.mockito.Mock;
import org.mockito.MockitoAnnotations;
import org.mockito.Spy;
import org.powermock.api.mockito.PowerMockito;
import org.powermock.core.classloader.annotations.PrepareForTest;
import org.powermock.modules.junit4.PowerMockRunner;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import static com.cloud.resource.ResourceState.Event.InternalEnterMaintenance;
import static com.cloud.resource.ResourceState.Event.UnableToMigrate;
import static org.mockito.Matchers.any;
import static org.mockito.Matchers.anyBoolean;
import static org.mockito.Matchers.anyLong;
import static org.mockito.Matchers.anyString;
import static org.mockito.Matchers.eq;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
@RunWith(PowerMockRunner.class)
@PrepareForTest({ActionEventUtils.class, ResourceManagerImpl.class, SSHCmdHelper.class})
@ -170,38 +175,98 @@ public class ResourceManagerImplTest {
}
@Test
public void testCheckAndMaintainEnterMaintenanceMode() throws NoTransitionException {
public void testCheckAndMaintainEnterMaintenanceModeNoVms() throws NoTransitionException {
// Test entering into maintenance with no VMs running on host.
boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId);
verify(resourceManager).isHostInMaintenance(host, new ArrayList<>(), new ArrayList<>(), new ArrayList<>());
verify(resourceManager).attemptMaintain(host);
verify(resourceManager).setHostIntoMaintenance(host);
verify(resourceManager, never()).setHostIntoErrorInPrepareForMaintenance(anyObject(), anyObject());
verify(resourceManager, never()).setHostIntoErrorInMaintenance(anyObject(), anyObject());
verify(resourceManager, never()).setHostIntoPrepareForMaintenanceAfterErrorsFixed(anyObject());
verify(resourceManager).resourceStateTransitTo(eq(host), eq(InternalEnterMaintenance), anyLong());
Assert.assertTrue(enterMaintenanceMode);
}
@Test
public void testCheckAndMaintainProceedsWithPrepareForMaintenanceRunningVms() throws NoTransitionException {
    // Running VMs plus a pending migration retry: the check must complete without emitting
    // any resource state event.
    setupPendingMigrationRetries();
    setupRunningVMs();

    verifyNoChangeInMaintenance();
}
@Test
public void testCheckAndMaintainErrorInMaintenanceRunningVms() throws NoTransitionException {
when(vmInstanceDao.listByHostId(hostId)).thenReturn(Arrays.asList(vm1, vm2));
boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId);
verify(resourceManager).isHostInMaintenance(host, Arrays.asList(vm1, vm2), new ArrayList<>(), new ArrayList<>());
Assert.assertFalse(enterMaintenanceMode);
// Test entering into ErrorInMaintenance when no pending migrations etc, and due to - Running VMs
setupRunningVMs();
setupNoPendingMigrationRetries();
verifyErrorInMaintenanceCalls();
}
@Test
public void testCheckAndMaintainErrorInMaintenanceMigratingVms() throws NoTransitionException {
when(vmInstanceDao.listVmsMigratingFromHost(hostId)).thenReturn(Arrays.asList(vm1, vm2));
boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId);
verify(resourceManager).isHostInMaintenance(host, new ArrayList<>(), Arrays.asList(vm1, vm2), new ArrayList<>());
Assert.assertFalse(enterMaintenanceMode);
public void testCheckAndMaintainErrorInMaintenanceWithErrorVms() throws NoTransitionException {
// Test entering into ErrorInMaintenance when no pending migrations etc, and due to - no migrating but error VMs
setupErrorVms();
setupNoPendingMigrationRetries();
verifyErrorInMaintenanceCalls();
}
@Test
public void testCheckAndMaintainErrorInMaintenanceFailedMigrations() throws NoTransitionException {
when(vmInstanceDao.listNonMigratingVmsByHostEqualsLastHost(hostId)).thenReturn(Arrays.asList(vm1, vm2));
boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId);
verify(resourceManager).isHostInMaintenance(host, new ArrayList<>(), new ArrayList<>(), Arrays.asList(vm1, vm2));
verify(resourceManager).setHostIntoErrorInMaintenance(host, Arrays.asList(vm1, vm2));
verify(resourceManager).resourceStateTransitTo(eq(host), eq(UnableToMigrate), anyLong());
Assert.assertFalse(enterMaintenanceMode);
public void testCheckAndMaintainErrorInPrepareForMaintenanceFailedMigrationsPendingRetries() throws NoTransitionException {
// Test entering into ErrorInPrepareForMaintenance when pending migrations retries and due to - Failed Migrations
setupFailedMigrations();
setupPendingMigrationRetries();
when(vmInstanceDao.findByHostInStates(hostId, VirtualMachine.State.Running)).thenReturn(Arrays.asList(vm2));
verifyErrorInPrepareForMaintenanceCalls();
}
@Test
public void testCheckAndMaintainErrorInPrepareForMaintenanceWithErrorVmsPendingRetries() throws NoTransitionException {
    // A VM in error state while a migration retry is pending and another VM is migrating away:
    // the host is expected to go to ErrorInPrepareForMaintenance.
    final List<VMInstanceVO> migratingAway = Arrays.asList(vm2);
    when(vmInstanceDao.listVmsMigratingFromHost(hostId)).thenReturn(migratingAway);
    setupPendingMigrationRetries();
    setupErrorVms();

    verifyErrorInPrepareForMaintenanceCalls();
}
@Test
public void testCheckAndMaintainErrorInPrepareForMaintenanceFailedMigrationsAndMigratingVms() throws NoTransitionException {
    // No retries are pending, but one migration is still executing while another has failed:
    // the host is expected to go to ErrorInPrepareForMaintenance.
    final List<VMInstanceVO> migratingAway = Arrays.asList(vm2);
    when(vmInstanceDao.listVmsMigratingFromHost(hostId)).thenReturn(migratingAway);
    setupNoPendingMigrationRetries();
    setupFailedMigrations();

    verifyErrorInPrepareForMaintenanceCalls();
}
@Test
public void testCheckAndMaintainErrorInPrepareForMaintenanceWithErrorVmsAndMigratingVms() throws NoTransitionException {
    // No retries are pending, but one migration is still executing while a VM is in error state:
    // the host is expected to go to ErrorInPrepareForMaintenance.
    final List<VMInstanceVO> migratingAway = Arrays.asList(vm2);
    when(vmInstanceDao.listVmsMigratingFromHost(hostId)).thenReturn(migratingAway);
    setupNoPendingMigrationRetries();
    setupErrorVms();

    verifyErrorInPrepareForMaintenanceCalls();
}
@Test
public void testCheckAndMaintainErrorInPrepareForMaintenanceFailedMigrationsAndStoppingVms() throws NoTransitionException {
    // No retries are pending, but a VM is still stopping while a migration has failed:
    // the host is expected to go to ErrorInPrepareForMaintenance.
    final List<VMInstanceVO> stoppingVms = Arrays.asList(vm2);
    when(vmInstanceDao.findByHostInStates(hostId, VirtualMachine.State.Stopping)).thenReturn(stoppingVms);
    setupNoPendingMigrationRetries();
    setupFailedMigrations();

    verifyErrorInPrepareForMaintenanceCalls();
}
@Test
public void testCheckAndMaintainReturnsToPrepareForMaintenanceRunningVms() throws NoTransitionException {
    // Host currently reports ErrorInPrepareForMaintenance, VMs are running and a retry is
    // pending: the host is expected to be switched back via the ErrorsCorrected event.
    setupPendingMigrationRetries();
    setupRunningVMs();
    when(host.getResourceState()).thenReturn(ResourceState.ErrorInPrepareForMaintenance);

    verifyReturnToPrepareForMaintenanceCalls();
}
@Test
@ -219,23 +284,6 @@ public class ResourceManagerImplTest {
verify(agentManager).pullAgentToMaintenance(hostId);
}
@Test
public void testCheckAndMaintainErrorInMaintenanceRetries() throws NoTransitionException {
resourceManager.setHostMaintenanceRetries(host);
List<VMInstanceVO> failedMigrations = Arrays.asList(vm1, vm2);
when(vmInstanceDao.listByHostId(host.getId())).thenReturn(failedMigrations);
when(vmInstanceDao.listNonMigratingVmsByHostEqualsLastHost(host.getId())).thenReturn(failedMigrations);
Integer retries = ResourceManager.HostMaintenanceRetries.valueIn(host.getClusterId());
for (int i = 0; i <= retries; i++) {
resourceManager.checkAndMaintain(host.getId());
}
verify(resourceManager, times(retries + 1)).isHostInMaintenance(host, failedMigrations, new ArrayList<>(), failedMigrations);
verify(resourceManager).setHostIntoErrorInMaintenance(host, failedMigrations);
}
@Test(expected = CloudRuntimeException.class)
public void testGetHostCredentialsMissingParameter() {
when(host.getDetail("password")).thenReturn(null);
@ -307,4 +355,76 @@ public class ResourceManagerImplTest {
verify(resourceManager, never()).getHostCredentials(eq(host));
verify(resourceManager, never()).connectAndRestartAgentOnHost(eq(host), eq(hostUsername), eq(hostPassword));
}
private void setupNoPendingMigrationRetries() {
    // Neither test VM has migration work pending in the HA manager.
    final long firstVmId = vm1.getId();
    when(haManager.hasPendingMigrationsWork(firstVmId)).thenReturn(false);

    final long secondVmId = vm2.getId();
    when(haManager.hasPendingMigrationsWork(secondVmId)).thenReturn(false);
}
private void setupRunningVMs() {
    // Both test VMs live on the host and are reported as running.
    final List<VMInstanceVO> bothVms = Arrays.asList(vm1, vm2);

    when(vmInstanceDao.listByHostId(hostId)).thenReturn(bothVms);
    when(vmInstanceDao.findByHostInStates(hostId,
            VirtualMachine.State.Migrating,
            VirtualMachine.State.Running,
            VirtualMachine.State.Starting,
            VirtualMachine.State.Stopping,
            VirtualMachine.State.Error,
            VirtualMachine.State.Unknown)).thenReturn(bothVms);
    when(vmInstanceDao.findByHostInStates(hostId, VirtualMachine.State.Running)).thenReturn(bothVms);
}
private void setupPendingMigrationRetries() {
    // Only the first test VM still has migration work pending in the HA manager.
    final long pendingVmId = vm1.getId();
    when(haManager.hasPendingMigrationsWork(pendingVmId)).thenReturn(true);

    final long idleVmId = vm2.getId();
    when(haManager.hasPendingMigrationsWork(idleVmId)).thenReturn(false);
}
private void setupFailedMigrations() {
    // Both test VMs live on the host; the first one is reported as a failed migration.
    final List<VMInstanceVO> bothVms = Arrays.asList(vm1, vm2);
    final List<VMInstanceVO> failedMigrations = Arrays.asList(vm1);

    when(vmInstanceDao.listByHostId(hostId)).thenReturn(bothVms);
    when(vmInstanceDao.findByHostInStates(hostId,
            VirtualMachine.State.Migrating,
            VirtualMachine.State.Running,
            VirtualMachine.State.Starting,
            VirtualMachine.State.Stopping,
            VirtualMachine.State.Error,
            VirtualMachine.State.Unknown)).thenReturn(bothVms);
    when(vmInstanceDao.listNonMigratingVmsByHostEqualsLastHost(hostId)).thenReturn(failedMigrations);
}
private void setupErrorVms() {
    // Both test VMs live on the host; the first one is reported in Unknown/Error state.
    final List<VMInstanceVO> bothVms = Arrays.asList(vm1, vm2);
    final List<VMInstanceVO> vmsInError = Arrays.asList(vm1);

    when(vmInstanceDao.listByHostId(hostId)).thenReturn(bothVms);
    when(vmInstanceDao.findByHostInStates(hostId,
            VirtualMachine.State.Migrating,
            VirtualMachine.State.Running,
            VirtualMachine.State.Starting,
            VirtualMachine.State.Stopping,
            VirtualMachine.State.Error,
            VirtualMachine.State.Unknown)).thenReturn(bothVms);
    when(vmInstanceDao.findByHostInStates(hostId, VirtualMachine.State.Unknown, VirtualMachine.State.Error)).thenReturn(vmsInError);
}
private void verifyErrorInMaintenanceCalls() throws NoTransitionException {
    // Run one maintenance check and keep its outcome for the final assertion.
    final boolean enteredMaintenance = resourceManager.checkAndMaintain(hostId);

    // Expected path: ErrorInMaintenance helper invoked and UnableToMaintain event fired.
    verify(resourceManager).attemptMaintain(host);
    verify(resourceManager).setHostIntoErrorInMaintenance(eq(host), anyObject());
    verify(resourceManager).resourceStateTransitTo(eq(host), eq(UnableToMaintain), anyLong());

    // No other transition helper may have been used.
    verify(resourceManager, never()).setHostIntoMaintenance(anyObject());
    verify(resourceManager, never()).setHostIntoErrorInPrepareForMaintenance(anyObject(), anyObject());
    verify(resourceManager, never()).setHostIntoPrepareForMaintenanceAfterErrorsFixed(anyObject());

    Assert.assertFalse(enteredMaintenance);
}
private void verifyErrorInPrepareForMaintenanceCalls() throws NoTransitionException {
    // Run one maintenance check and keep its outcome for the final assertion.
    final boolean enteredMaintenance = resourceManager.checkAndMaintain(hostId);

    // Expected path: ErrorInPrepareForMaintenance helper invoked and UnableToMigrate event fired.
    verify(resourceManager).attemptMaintain(host);
    verify(resourceManager).setHostIntoErrorInPrepareForMaintenance(eq(host), anyObject());
    verify(resourceManager).resourceStateTransitTo(eq(host), eq(UnableToMigrate), anyLong());

    // No other transition helper may have been used.
    verify(resourceManager, never()).setHostIntoMaintenance(anyObject());
    verify(resourceManager, never()).setHostIntoErrorInMaintenance(anyObject(), anyObject());
    verify(resourceManager, never()).setHostIntoPrepareForMaintenanceAfterErrorsFixed(anyObject());

    Assert.assertFalse(enteredMaintenance);
}
private void verifyReturnToPrepareForMaintenanceCalls() throws NoTransitionException {
    // Run one maintenance check and keep its outcome for the final assertion.
    final boolean enteredMaintenance = resourceManager.checkAndMaintain(hostId);

    // Expected path: "errors fixed" helper invoked and ErrorsCorrected event fired.
    verify(resourceManager).attemptMaintain(host);
    verify(resourceManager).setHostIntoPrepareForMaintenanceAfterErrorsFixed(eq(host));
    verify(resourceManager).resourceStateTransitTo(eq(host), eq(ErrorsCorrected), anyLong());

    // No other transition helper may have been used.
    verify(resourceManager, never()).setHostIntoErrorInMaintenance(anyObject(), anyObject());
    verify(resourceManager, never()).setHostIntoErrorInPrepareForMaintenance(anyObject(), anyObject());
    verify(resourceManager, never()).setHostIntoMaintenance(anyObject());

    Assert.assertFalse(enteredMaintenance);
}
private void verifyNoChangeInMaintenance() throws NoTransitionException {
    // Run one maintenance check and keep its outcome for the final assertion.
    final boolean enteredMaintenance = resourceManager.checkAndMaintain(hostId);

    verify(resourceManager).attemptMaintain(host);

    // Neither a transition helper nor a state event may have been triggered.
    verify(resourceManager, never()).resourceStateTransitTo(anyObject(), any(), anyLong());
    verify(resourceManager, never()).setHostIntoPrepareForMaintenanceAfterErrorsFixed(anyObject());
    verify(resourceManager, never()).setHostIntoErrorInMaintenance(anyObject(), anyObject());
    verify(resourceManager, never()).setHostIntoErrorInPrepareForMaintenance(anyObject(), anyObject());
    verify(resourceManager, never()).setHostIntoMaintenance(anyObject());

    Assert.assertFalse(enteredMaintenance);
}
}

View File

@ -21,16 +21,75 @@
from marvin.cloudstackTestCase import *
from marvin.lib.utils import *
from marvin.lib.base import *
from marvin.lib.common import (get_zone, get_pod, get_template)
from marvin.lib.common import (get_zone, get_pod, get_template, list_ssvms)
from nose.plugins.attrib import attr
from marvin.lib.decoratorGenerators import skipTestIf
from distutils.util import strtobool
from marvin.sshClient import SshClient
_multiprocess_shared_ = False
MIN_VMS_FOR_TEST = 3
class TestHostMaintenanceBase(cloudstackTestCase):
    """Shared helpers for host maintenance tests: ssh access, polling a host's
    resource state, and issuing the prepare/cancel maintenance and re-enable calls."""

    def get_ssh_client(self, ip, username, password, retries=10):
        """ Setup ssh client connection and return connection.

        Skips the running test (unittest.SkipTest) when the connection cannot be created.
        """
        try:
            ssh_client = SshClient(ip, 22, username, password, retries)
        except Exception as e:
            # The message needs a %s placeholder for the "%" operator to embed the error.
            raise unittest.SkipTest("Unable to create ssh connection: %s" % e)

        self.assertIsNotNone(
            ssh_client, "Failed to setup ssh connection to ip=%s" % ip)

        return ssh_client

    def wait_until_host_is_in_state(self, hostid, resourcestate, interval=3, retries=20):
        """Poll the host until its resourcestate equals `resourcestate`.

        Returns True on success and raises Exception when wait_until reports failure.
        """
        def check_resource_state():
            response = Host.list(
                self.apiclient,
                id=hostid
            )
            # Only inspect the first entry when the API actually returned one.
            if isinstance(response, list) and len(response) > 0:
                if response[0].resourcestate == resourcestate:
                    self.logger.debug('Host with id %s is in resource state = %s' % (hostid, resourcestate))
                    return True, None
                # %-formatting keeps the log call safe when a value is not a string.
                self.logger.debug("Waiting for host %s to reach state %s, with current state %s"
                                  % (hostid, resourcestate, response[0].resourcestate))
            # Always hand back a 2-tuple; presumably wait_until unpacks the callback
            # result the same way its own result is unpacked below - confirm.
            return False, None

        done, _ = wait_until(interval, retries, check_resource_state)
        if not done:
            raise Exception("Failed to wait for host %s to be on resource state %s" % (hostid, resourcestate))
        return True

    def prepare_host_for_maintenance(self, hostid):
        """Send prepareHostForMaintenance for the host and return the API response."""
        self.logger.debug("Sending Host with id %s to prepareHostForMaintenance" % hostid)
        cmd = prepareHostForMaintenance.prepareHostForMaintenanceCmd()
        cmd.id = hostid
        response = self.apiclient.prepareHostForMaintenance(cmd)
        self.logger.debug("Host with id %s is in prepareHostForMaintenance" % hostid)
        self.logger.debug(response)
        return response

    def cancel_host_maintenance(self, hostid):
        """Send cancelHostMaintenance for the host and return the API response."""
        self.logger.debug("Canceling Host with id %s from maintain" % hostid)
        cmd = cancelHostMaintenance.cancelHostMaintenanceCmd()
        cmd.id = hostid
        res = self.apiclient.cancelHostMaintenance(cmd)
        self.logger.debug("Host with id %s is cancelling maintenance" % hostid)
        return res

    def revert_host_state_on_failure(self, hostId):
        """Set the host's allocation state to "Enable" and assert it reports "Enabled"."""
        cmd = updateHost.updateHostCmd()
        cmd.id = hostId
        cmd.allocationstate = "Enable"
        response = self.apiclient.updateHost(cmd)
        self.assertEqual(response.resourcestate, "Enabled")
class TestHostMaintenance(cloudstackTestCase):
class TestHostMaintenance(TestHostMaintenanceBase):
def setUp(self):
self.logger = logging.getLogger('TestHM')
@ -44,6 +103,8 @@ class TestHostMaintenance(cloudstackTestCase):
self.zone = get_zone(self.apiclient, self.testClient.getZoneForTests())
self.pod = get_pod(self.apiclient, self.zone.id)
self.cleanup = []
self.hostConfig = self.config.__dict__["zones"][0].__dict__["pods"][0].__dict__["clusters"][0].__dict__["hosts"][0].__dict__
def tearDown(self):
try:
@ -55,7 +116,7 @@ class TestHostMaintenance(cloudstackTestCase):
return
def createVMs(self, hostId, number):
def createVMs(self, hostId, number, offering_key="tiny"):
self.template = get_template(
self.apiclient,
@ -70,7 +131,7 @@ class TestHostMaintenance(cloudstackTestCase):
self.service_offering = ServiceOffering.create(
self.apiclient,
self.services["service_offerings"]["tiny"]
self.services["service_offerings"][offering_key]
)
self.logger.debug("Using service offering %s " % self.service_offering.id)
self.network_offering = NetworkOffering.create(
@ -106,7 +167,32 @@ class TestHostMaintenance(cloudstackTestCase):
self.cleanup.append(self.network_offering)
self.cleanup.append(self.service_offering)
return vms
def checkAllVmsRunningOnHost(self, hostId):
    """
    wait_until-style callback: report whether every VM listed for the
    host is in the "Running" state.

    Checks the VirtualMachine listing first, then the list_ssvms
    listing. Returns (False, None) at the first VM that is not Running,
    otherwise (True, None). A listing that is None / not a list is
    treated as having nothing to check.
    """
    user_vms = VirtualMachine.list(self.apiclient, hostid=hostId)
    if user_vms is not None:
        self.logger.debug('Vms found to test all running = {} '.format(len(user_vms)))
        for user_vm in user_vms:
            if user_vm.state == "Running":
                continue
            self.logger.debug('VirtualMachine on Host with id = {} is in {}'.format(user_vm.id, user_vm.state))
            return (False, None)

    system_vms = list_ssvms(self.apiclient, hostid=hostId)
    if isinstance(system_vms, list):
        for system_vm in system_vms:
            if system_vm.state == 'Running':
                continue
            self.logger.debug("Found not running VM {}".format(system_vm.name))
            return (False, None)

    return (True, None)
def checkVmMigratingOnHost(self, hostId):
vm_migrating=False
listVms1 = VirtualMachine.list(
@ -118,60 +204,60 @@ class TestHostMaintenance(cloudstackTestCase):
self.logger.debug('Vms found = {} '.format(len(listVms1)))
for vm in listVms1:
if (vm.state == "Migrating"):
self.logger.debug('VirtualMachine on Hyp id = {} is in {}'.format(vm.id, vm.state))
self.logger.debug('VirtualMachine on Host with id = {} is in {}'.format(vm.id, vm.state))
vm_migrating=True
break
return (vm_migrating, None)
def checkNoVmMigratingOnHost(self, hostId):
no_vm_migrating=True
def migrationsFinished(self, hostId):
migrations_finished=True
listVms1 = VirtualMachine.list(
self.apiclient,
hostid=hostId
)
if (listVms1 is not None):
self.logger.debug('Vms found = {} '.format(len(listVms1)))
for vm in listVms1:
if (vm.state == "Migrating"):
self.logger.debug('VirtualMachine on Hyp id = {} is in {}'.format(vm.id, vm.state))
no_vm_migrating=False
break
numVms = len(listVms1)
migrations_finished = (numVms == 0)
return (migrations_finished, None)
return (no_vm_migrating, None)
def noOfVMsOnHost(self, hostId):
listVms = VirtualMachine.list(
self.apiclient,
hostid=hostId
)
no_of_vms=0
self.logger.debug("Counting VMs on host " + hostId)
if (listVms is not None):
for vm in listVms:
self.logger.debug('VirtualMachine on Hyp 1 = {}'.format(vm.id))
self.logger.debug("VirtualMachine on Host " + hostId + " = " + vm.id)
no_of_vms=no_of_vms+1
self.logger.debug("Found VMs on host " + str(no_of_vms))
return no_of_vms
def hostPrepareAndCancelMaintenance(self, target_host_id, other_host_id, checkVMMigration):
    """
    Put target_host_id into prepare-for-maintenance, poll the supplied
    checkVMMigration callback against other_host_id, then cancel the
    maintenance on the target host.

    Returns whatever wait_until returned for the callback, unmodified.
    """
    prepare_cmd = prepareHostForMaintenance.prepareHostForMaintenanceCmd()
    prepare_cmd.id = target_host_id
    self.apiclient.prepareHostForMaintenance(prepare_cmd)
    self.logger.debug('Host with id {} is in prepareHostForMaintenance'.format(target_host_id))

    # Poll once per second, at most 10 times, while the host is preparing.
    wait_outcome = wait_until(1, 10, checkVMMigration, other_host_id)

    cancel_cmd = cancelHostMaintenance.cancelHostMaintenanceCmd()
    cancel_cmd.id = target_host_id
    self.apiclient.cancelHostMaintenance(cancel_cmd)
    self.logger.debug('Host with id {} is in cancelHostMaintenance'.format(target_host_id))

    return wait_outcome
def hostPrepareAndCancelMaintenance(self, target_host_id, other_host_id):
    """
    Drive one host through a full maintenance cycle and back.

    Steps:
      1) Wait until all VMs on both hosts are Running.
      2) Prepare target_host_id for maintenance and wait for its VMs
         to migrate away.
      3) Wait for the host to reach "Maintenance", count remaining VMs.
      4) Cancel maintenance and wait for the host to be "Enabled".

    Returns True if the migrationsFinished poll succeeded, else False.
    Raises Exception if the VMs never all reach Running, and fails the
    test if the host still had VMs once it was in Maintenance.
    """
    # wait_until returns a (done, value) tuple, which is always truthy;
    # unpack it so a timed-out wait is actually detected.
    for host_id in (target_host_id, other_host_id):
        all_running, _ = wait_until(3, 100, self.checkAllVmsRunningOnHost, host_id)
        if not all_running:
            raise Exception("Failed to wait for all VMs to reach running state to execute test")

    self.prepare_host_for_maintenance(target_host_id)
    migrations_finished, _ = wait_until(5, 200, self.migrationsFinished, target_host_id)

    self.wait_until_host_is_in_state(target_host_id, "Maintenance", 5, 200)

    # Sample the VM count while the host is still in Maintenance, but
    # only fail after the host has been re-enabled.
    vm_count_after_maintenance = self.noOfVMsOnHost(target_host_id)

    self.cancel_host_maintenance(target_host_id)
    self.wait_until_host_is_in_state(target_host_id, "Enabled", 5, 200)

    if vm_count_after_maintenance != 0:
        self.fail("Host to put to maintenance still has VMs running")

    return migrations_finished
@attr(
tags=[
"advanced",
@ -182,42 +268,45 @@ class TestHostMaintenance(cloudstackTestCase):
"sg"],
required_hardware="true")
def test_01_cancel_host_maintenace_with_no_migration_jobs(self):
"""
Tests if putting a host with no migrations (0 VMs) work back and forth
1) Verify if there are at least 2 hosts in enabled state.
2) Put the host into maintenance verify success
3) Put the other host into maintenance, verify success
"""
listHost = Host.list(
self.apiclient,
type='Routing',
zoneid=self.zone.id,
podid=self.pod.id,
hypervisor=self.hypervisor,
resourcestate='Enabled',
state='Up'
)
for host in listHost:
self.logger.debug('1 Hypervisor = {}'.format(host.id))
if (len(listHost) < 2):
raise unittest.SkipTest("Cancel host maintenance when VMs are migrating should be tested for 2 or more hosts");
return
self.logger.debug('Found Host = {}'.format(host.id))
if (len(listHost) < 2):
raise unittest.SkipTest("Canceling tests for host maintenance as we need 2 or more hosts up and enabled")
vm_migrating=False
try:
vm_migrating = self.hostPrepareAndCancelMaintenance(listHost[0].id, listHost[1].id, self.checkNoVmMigratingOnHost)
vm_migrating = self.hostPrepareAndCancelMaintenance(listHost[1].id, listHost[0].id, self.checkNoVmMigratingOnHost)
migrations_finished = self.hostPrepareAndCancelMaintenance(listHost[0].id, listHost[1].id)
if migrations_finished:
self.hostPrepareAndCancelMaintenance(listHost[1].id, listHost[0].id)
else:
raise unittest.SkipTest("VMs are still migrating so reverse migration /maintenace skipped")
except Exception as e:
self.revert_host_state_on_failure(listHost[0].id)
self.revert_host_state_on_failure(listHost[1].id)
self.logger.debug("Exception {}".format(e))
self.fail("Cancel host maintenance failed {}".format(e[0]))
if (vm_migrating == True):
raise unittest.SkipTest("VMs are migrating and the test will not be able to check the conditions the test is intended for");
return
self.fail("Host maintenance test failed {}".format(e[0]))
@attr(
tags=[
"advanced",
@ -228,53 +317,125 @@ class TestHostMaintenance(cloudstackTestCase):
"sg"],
required_hardware="true")
def test_02_cancel_host_maintenace_with_migration_jobs(self):
"""
Tests if putting a host with migrations (3 VMs) work back and forth
1) Verify if there are at least 2 hosts in enabled state.
2) Deploy VMs if needed
3) Put the host into maintenance verify success -ensure existing host has zero running VMs
4) Put the other host into maintenance, verify success just as step 3
"""
listHost = Host.list(
self.apiclient,
type='Routing',
zoneid=self.zone.id,
podid=self.pod.id,
hypervisor=self.hypervisor,
resourcestate='Enabled',
state='Up'
)
for host in listHost:
self.logger.debug('2 Hypervisor = {}'.format(host.id))
if (len(listHost) != 2):
raise unittest.SkipTest("Cancel host maintenance when VMs are migrating can only be tested with 2 hosts");
return
self.logger.debug('Found Host = {}'.format(host.id))
if (len(listHost) < 2):
raise unittest.SkipTest("Canceling tests for host maintenance as we need 2 or more hosts up and enabled")
no_of_vms = self.noOfVMsOnHost(listHost[0].id)
no_of_vms = no_of_vms + self.noOfVMsOnHost(listHost[1].id)
if no_of_vms < 5:
if no_of_vms < MIN_VMS_FOR_TEST:
self.logger.debug("Create VMs as there are not enough vms to check host maintenance")
no_vm_req = 5 - no_of_vms
no_vm_req = MIN_VMS_FOR_TEST - no_of_vms
if (no_vm_req > 0):
self.logger.debug("Creating vms = {}".format(no_vm_req))
self.vmlist = self.createVMs(listHost[0].id, no_vm_req)
vm_migrating=False
try:
vm_migrating = self.hostPrepareAndCancelMaintenance(listHost[0].id, listHost[1].id, self.checkVmMigratingOnHost)
vm_migrating = self.hostPrepareAndCancelMaintenance(listHost[1].id, listHost[0].id, self.checkVmMigratingOnHost)
migrations_finished = self.hostPrepareAndCancelMaintenance(listHost[0].id, listHost[1].id)
if migrations_finished:
self.hostPrepareAndCancelMaintenance(listHost[1].id, listHost[0].id)
else:
raise unittest.SkipTest("VMs are still migrating so reverse migration /maintenace skipped")
except Exception as e:
self.revert_host_state_on_failure(listHost[0].id)
self.revert_host_state_on_failure(listHost[1].id)
self.logger.debug("Exception {}".format(e))
self.fail("Cancel host maintenance failed {}".format(e[0]))
self.fail("Host maintenance test failed {}".format(e[0]))
if (vm_migrating == False):
raise unittest.SkipTest("No VM is migrating and the test will not be able to check the conditions the test is intended for");
return
@attr(
    tags=[
        "advanced",
        "advancedns",
        "smoke",
        "basic",
        "eip",
        "sg"],
    required_hardware="true")
def test_03_cancel_host_maintenace_with_migration_jobs_failure(self):
    """
    Tests if putting a host with impossible migrations (2 VMs) work pushes to ErrorInMaintenance state
    1) Verify if there are at least 2 hosts in enabled state.
    2) Tag the host and deploy tagged VMs which cannot be migrated to other host without tags
    3) Put the host into maintenance verify it fails with it reaching ErrorInMaintenance
    """
    listHost = Host.list(
        self.apiclient,
        type='Routing',
        zoneid=self.zone.id,
        podid=self.pod.id,
        hypervisor=self.hypervisor,
        resourcestate='Enabled',
        state='Up'
    )

    for host in listHost:
        self.logger.debug('Found Host = {}'.format(host.id))

    if (len(listHost) < 2):
        raise unittest.SkipTest("Canceling tests for host maintenance as we need 2 or more hosts up and enabled")

    target_host_id = listHost[0].id

    try:
        # Tag only the target host so VMs deployed with the tagged
        # offering have no other host to migrate to.
        Host.update(self.apiclient,
                    id=target_host_id,
                    hosttags=self.services["service_offerings"]["taggedsmall"]["hosttags"])

        no_of_vms = self.noOfVMsOnHost(target_host_id)

        # Need only 2 VMs for this case.
        if no_of_vms < 2:
            self.logger.debug("Create VMs as there are not enough vms to check host maintenance")
            no_vm_req = 2 - no_of_vms
            self.logger.debug("Creating vms = {}".format(no_vm_req))
            self.vmlist = self.createVMs(listHost[0].id, no_vm_req, "taggedsmall")

        # Attempt putting host in maintenance and check if ErrorInMaintenance state is reached
        self.prepare_host_for_maintenance(target_host_id)
        error_in_maintenance_reached = self.wait_until_host_is_in_state(target_host_id, "ErrorInMaintenance", 5, 300)

        self.cancel_host_maintenance(target_host_id)
        self.wait_until_host_is_in_state(target_host_id, "Enabled", 5, 200)

        # Remove the tag again so later tests see an untagged host.
        Host.update(self.apiclient, id=target_host_id, hosttags="")

        if not error_in_maintenance_reached:
            self.fail("Host should have reached ErrorInMaintenance as its tagged VMs cannot be migrated")

    except Exception as e:
        self.revert_host_state_on_failure(listHost[0].id)
        self.revert_host_state_on_failure(listHost[1].id)
        Host.update(self.apiclient, id=target_host_id, hosttags="")
        self.logger.debug("Exception {}".format(e))
        # Format the exception itself; indexing it (e[0]) breaks on
        # exceptions without args and would mask the real failure.
        self.fail("Host maintenance test failed {}".format(e))
class TestHostMaintenanceAgents(cloudstackTestCase):
class TestHostMaintenanceAgents(TestHostMaintenanceBase):
@classmethod
def setUpClass(cls):
@ -371,29 +532,6 @@ class TestHostMaintenanceAgents(cloudstackTestCase):
value = "true" if on else "false"
cls.updateConfiguration('kvm.ssh.to.agent', value)
def prepare_host_for_maintenance(self, hostid):
    """
    Issue the prepareHostForMaintenance API call for the given host and
    log that it was sent. The API response is discarded.
    """
    request = prepareHostForMaintenance.prepareHostForMaintenanceCmd()
    request.id = hostid
    self.apiclient.prepareHostForMaintenance(request)

    self.logger.debug('Host with id %s is in prepareHostForMaintenance' % hostid)
def wait_until_host_is_in_state(self, hostid, resourcestate, interval=3, retries=20):
    """
    Poll Host.list for hostid until its resourcestate equals the
    requested value.

    Polling is delegated to wait_until(interval, retries, callback).
    Returns True once the state is observed; raises Exception if
    wait_until reports that it never was.
    """
    def check_resource_state():
        hosts = Host.list(self.apiclient, id=hostid)
        # Only a list response is inspected; anything else counts as
        # "not there yet".
        if isinstance(hosts, list) and hosts[0].resourcestate == resourcestate:
            self.logger.debug('Host with id %s is in resource state = %s' % (hostid, resourcestate))
            return True, None
        return False, None

    reached, _ = wait_until(interval, retries, check_resource_state)
    if reached:
        return True
    raise Exception("Failed to wait for host %s to be on resource state %s" % (hostid, resourcestate))
def wait_until_agent_is_in_state(self, hostid, state, interval=3, retries=20):
def check_agent_state():
response = Host.list(
@ -411,12 +549,6 @@ class TestHostMaintenanceAgents(cloudstackTestCase):
raise Exception("Failed to wait for host agent %s to be on state %s" % (hostid, state))
return True
def cancel_host_maintenance(self, hostid):
    """
    Issue the cancelHostMaintenance API call for the given host and log
    that it was sent. The API response is discarded.
    """
    request = cancelHostMaintenance.cancelHostMaintenanceCmd()
    request.id = hostid
    self.apiclient.cancelHostMaintenance(request)

    self.logger.debug('Host with id %s is cancelling maintenance' % hostid)
def get_enabled_host_connected_agent(self):
hosts = Host.list(
self.apiclient,
@ -428,7 +560,7 @@ class TestHostMaintenanceAgents(cloudstackTestCase):
state='Up'
)
if len(hosts) < 2:
raise unittest.SkipTest("Cancel host maintenance must be tested for 2 or more hosts")
raise unittest.SkipTest("Host maintenance tests must be tested for 2 or more hosts")
return hosts[0]
def deploy_vm_on_host(self, hostid):
@ -451,13 +583,6 @@ class TestHostMaintenanceAgents(cloudstackTestCase):
)
self.cleanup.append(vm)
def revert_host_state_on_failure(self, host):
    """
    Send an updateHost call with allocationstate "Enable" for host.id
    and assert that the response reports resource state "Enabled".
    """
    enable_request = updateHost.updateHostCmd()
    enable_request.id = host.id
    enable_request.allocationstate = "Enable"

    updated_host = self.apiclient.updateHost(enable_request)
    self.assertEqual(updated_host.resourcestate, "Enabled")
@skipTestIf("hypervisorNotSupported")
@attr(tags=["advanced", "advancedns", "smoke", "basic", "eip", "sg"], required_hardware="true")
def test_01_cancel_host_maintenance_ssh_enabled_agent_connected(self):
@ -480,22 +605,9 @@ class TestHostMaintenanceAgents(cloudstackTestCase):
self.wait_until_host_is_in_state(self.host.id, "Enabled")
self.assert_host_is_functional_after_cancelling_maintenance(self.host.id)
except Exception as e:
self.revert_host_state_on_failure(self.host)
self.revert_host_state_on_failure(self.host.id)
self.fail(e)
def get_ssh_client(self, ip, username, password, retries=10):
    """
    Setup ssh client connection and return connection.

    Opens an SshClient to ip on port 22 with the given credentials and
    retry count. If the connection cannot be created, the test is
    skipped (unittest.SkipTest) with the underlying error in the
    message.
    """
    try:
        ssh_client = SshClient(ip, 22, username, password, retries)
    except Exception as e:
        # The format string needs a %s placeholder; without it the
        # "%" operator raises TypeError instead of skipping the test.
        raise unittest.SkipTest("Unable to create ssh connection: %s" % e)

    self.assertIsNotNone(
        ssh_client, "Failed to setup ssh connection to ip=%s" % ip)

    return ssh_client
@skipTestIf("hypervisorNotSupported")
@attr(tags=["boris", "advancedns", "smoke", "basic", "eip", "sg"], required_hardware="true")
def test_02_cancel_host_maintenance_ssh_enabled_agent_disconnected(self):
@ -529,7 +641,7 @@ class TestHostMaintenanceAgents(cloudstackTestCase):
self.assert_host_is_functional_after_cancelling_maintenance(self.host.id)
except Exception as e:
self.revert_host_state_on_failure(self.host)
self.revert_host_state_on_failure(self.host.id)
self.fail(e)
@skipTestIf("hypervisorNotSupported")
@ -554,7 +666,7 @@ class TestHostMaintenanceAgents(cloudstackTestCase):
self.wait_until_host_is_in_state(self.host.id, "Enabled")
self.assert_host_is_functional_after_cancelling_maintenance(self.host.id)
except Exception as e:
self.revert_host_state_on_failure(self.host)
self.revert_host_state_on_failure(self.host.id)
self.fail(e)
@skipTestIf("hypervisorNotSupported")
@ -585,7 +697,7 @@ class TestHostMaintenanceAgents(cloudstackTestCase):
ssh_client.execute("service cloudstack-agent stop")
self.wait_until_agent_is_in_state(self.host.id, "Disconnected")
except Exception as e:
self.revert_host_state_on_failure(self.host)
self.revert_host_state_on_failure(self.host.id)
self.fail(e)
self.assertRaises(Exception, self.cancel_host_maintenance, self.host.id)
@ -600,5 +712,5 @@ class TestHostMaintenanceAgents(cloudstackTestCase):
self.wait_until_host_is_in_state(self.host.id, "Enabled")
self.assert_host_is_functional_after_cancelling_maintenance(self.host.id)
except Exception as e:
self.revert_host_state_on_failure(self.host)
self.revert_host_state_on_failure(self.host.id)
self.fail(e)

View File

@ -679,6 +679,7 @@
'Down': 'off',
'Removed': 'off',
'ErrorInMaintenance': 'off',
'ErrorInPrepareForMaintenance': 'warning',
'PrepareForMaintenance': 'warning',
'CancelMaintenance': 'warning',
'Maintenance': 'warning',

View File

@ -17162,7 +17162,8 @@
title: 'label.outofbandmanagement.action.issue',
desc: function(args) {
var host = args.context.hosts[0];
if (host.resourcestate == 'Maintenance' || host.resourcestate == 'PrepareForMaintenance' || host.resourcestate == 'ErrorInMaintenance') {
if (host.resourcestate == 'Maintenance' || host.resourcestate == 'PrepareForMaintenance' ||
host.resourcestate == 'ErrorInPrepareForMaintenance' || host.resourcestate == 'ErrorInMaintenance') {
return _l('message.outofbandmanagement.action.maintenance');
}
},
@ -17776,6 +17777,7 @@
'Down': 'off',
'Removed': 'off',
'ErrorInMaintenance': 'off',
'ErrorInPrepareForMaintenance': 'warning',
'PrepareForMaintenance': 'warning',
'CancelMaintenance': 'warning',
'Maintenance': 'warning',
@ -21975,7 +21977,7 @@
allowedActions.push("edit");
allowedActions.push("enableMaintenanceMode");
allowedActions.push("cancelMaintenanceMode");
} else if (jsonObj.resourcestate == "PrepareForMaintenance") {
} else if (jsonObj.resourcestate == "PrepareForMaintenance" || jsonObj.resourcestate == 'ErrorInPrepareForMaintenance') {
allowedActions.push("edit");
allowedActions.push("cancelMaintenanceMode");
} else if (jsonObj.resourcestate == "Maintenance") {
@ -22029,7 +22031,7 @@
} else if (jsonObj.state == "ErrorInMaintenance") {
allowedActions.push("enableMaintenanceMode");
allowedActions.push("cancelMaintenanceMode");
} else if (jsonObj.state == "PrepareForMaintenance") {
} else if (jsonObj.state == "PrepareForMaintenance" || jsonObj.resourcestate == "ErrorInPrepareForMaintenance") {
allowedActions.push("cancelMaintenanceMode");
} else if (jsonObj.state == "Maintenance") {
allowedActions.push("cancelMaintenanceMode");