Mirror of https://github.com/apache/cloudstack.git (synced 2025-10-26 08:42:29 +01:00)
Better tracking host maintenance and handling of migration jobs (#3425)
* Service layer changes for the new way of tracking maintenance progress
* Fixes after offline code review
* Fix Marvin tests
* Change state name and add documentation
* Fix test
* Fix and add more unit tests for different cases
* Fix and enhance Marvin tests
* Fixes for corner cases
* More fixes and logging
* UI fixes
* Some minor changes and reducing VMs on host for more contained tests
* Fixed SSH client auth problem causing test failure
* Code review changes + fixes + some more logging
* Fix flaky tests by adding delays between host states
* Added fetching only enabled hosts for tests
* Make port blocking KVM specific and refactor to handle failure
* Make failing migrations due to tagged host instead of port blocking
* Added additional check for migrating VMs
* Refactor to use single place for methods checking maintenance states
Parent: cf6e616d5b
Commit: 4b43c2684f
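The diff below adds a new ErrorInPrepareForMaintenance resource state and centralizes the maintenance-state checks in two helpers, ResourceState.isMaintenanceState() and ResourceState.canAttemptMaintenance(). As orientation before reading the diff, here is a minimal sketch (not part of the commit; the wrapper class and method names are illustrative) of how calling code is expected to use those helpers:

```java
import com.cloud.resource.ResourceState;

public class MaintenanceStateCheckSketch {
    // True for any state in the maintenance workflow, including the new
    // ErrorInPrepareForMaintenance state introduced by this change.
    static boolean isUndergoingMaintenance(ResourceState state) {
        return ResourceState.isMaintenanceState(state);
    }

    // Guard used before asking a host to enter maintenance: the request is
    // rejected while the host is already in, or preparing for, maintenance.
    static void ensureMaintenanceCanBeRequested(ResourceState state) {
        if (!ResourceState.canAttemptMaintenance(state)) {
            throw new IllegalStateException("Host is already in state " + state);
        }
    }
}
```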
@@ -16,23 +16,33 @@
// under the License.
package com.cloud.resource;

import java.util.Arrays;
import java.util.List;
import java.util.Set;

import com.cloud.utils.fsm.StateMachine;

public enum ResourceState {
Creating, Enabled, Disabled, PrepareForMaintenance, ErrorInMaintenance, Maintenance, Error;
Creating,
Enabled,
Disabled,
ErrorInPrepareForMaintenance,
PrepareForMaintenance,
ErrorInMaintenance,
Maintenance,
Error;

public enum Event {
InternalCreated("Resource is created"),
Enable("Admin enables"),
Disable("Admin disables"),
AdminAskMaintenace("Admin asks to enter maintenance"),
AdminAskMaintenance("Admin asks to enter maintenance"),
AdminCancelMaintenance("Admin asks to cancel maintenance"),
InternalEnterMaintenance("Resource enters maintenance"),
UpdatePassword("Admin updates password of host"),
UnableToMigrate("Management server migrates VM failed"),
UnableToMigrate("Migration of VM failed, such as from scheduled HAWork"),
UnableToMaintain("Management server has exhausted all legal operations and attempts to put into maintenance has failed"),
ErrorsCorrected("Errors were corrected on a resource attempting to enter maintenance but encountered errors"),
Error("An internal error happened"),
DeleteHost("Admin delete a host"),

@@ -84,6 +94,16 @@ public enum ResourceState {
return strs;
}

public static boolean isMaintenanceState(ResourceState state) {
return Arrays.asList(ResourceState.Maintenance, ResourceState.ErrorInMaintenance,
ResourceState.PrepareForMaintenance, ResourceState.ErrorInPrepareForMaintenance).contains(state);
}

public static boolean canAttemptMaintenance(ResourceState state) {
return !Arrays.asList(ResourceState.Maintenance, ResourceState.PrepareForMaintenance,
ResourceState.ErrorInPrepareForMaintenance).contains(state);
}

protected static final StateMachine<ResourceState, Event> s_fsm = new StateMachine<ResourceState, Event>();
static {
s_fsm.addTransition(null, Event.InternalCreated, ResourceState.Enabled);
@@ -92,22 +112,31 @@ public enum ResourceState {
s_fsm.addTransition(ResourceState.Enabled, Event.Enable, ResourceState.Enabled);
s_fsm.addTransition(ResourceState.Enabled, Event.InternalCreated, ResourceState.Enabled);
s_fsm.addTransition(ResourceState.Enabled, Event.Disable, ResourceState.Disabled);
s_fsm.addTransition(ResourceState.Enabled, Event.AdminAskMaintenace, ResourceState.PrepareForMaintenance);
s_fsm.addTransition(ResourceState.Enabled, Event.AdminAskMaintenance, ResourceState.PrepareForMaintenance);
s_fsm.addTransition(ResourceState.Enabled, Event.InternalEnterMaintenance, ResourceState.Maintenance);
s_fsm.addTransition(ResourceState.Disabled, Event.Enable, ResourceState.Enabled);
s_fsm.addTransition(ResourceState.Disabled, Event.Disable, ResourceState.Disabled);
s_fsm.addTransition(ResourceState.Disabled, Event.InternalCreated, ResourceState.Disabled);
s_fsm.addTransition(ResourceState.PrepareForMaintenance, Event.InternalEnterMaintenance, ResourceState.Maintenance);
s_fsm.addTransition(ResourceState.PrepareForMaintenance, Event.AdminCancelMaintenance, ResourceState.Enabled);
s_fsm.addTransition(ResourceState.PrepareForMaintenance, Event.UnableToMigrate, ResourceState.ErrorInMaintenance);
s_fsm.addTransition(ResourceState.PrepareForMaintenance, Event.UnableToMigrate, ResourceState.ErrorInPrepareForMaintenance);
s_fsm.addTransition(ResourceState.PrepareForMaintenance, Event.UnableToMaintain, ResourceState.ErrorInMaintenance);
s_fsm.addTransition(ResourceState.PrepareForMaintenance, Event.InternalCreated, ResourceState.PrepareForMaintenance);
s_fsm.addTransition(ResourceState.Maintenance, Event.AdminCancelMaintenance, ResourceState.Enabled);
s_fsm.addTransition(ResourceState.Maintenance, Event.InternalCreated, ResourceState.Maintenance);
s_fsm.addTransition(ResourceState.Maintenance, Event.DeleteHost, ResourceState.Disabled);
s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.InternalCreated, ResourceState.ErrorInPrepareForMaintenance);
s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.Disable, ResourceState.Disabled);
s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.DeleteHost, ResourceState.Disabled);
s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.InternalEnterMaintenance, ResourceState.Maintenance);
s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.AdminCancelMaintenance, ResourceState.Enabled);
s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.UnableToMigrate, ResourceState.ErrorInPrepareForMaintenance);
s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.UnableToMaintain, ResourceState.ErrorInMaintenance);
s_fsm.addTransition(ResourceState.ErrorInPrepareForMaintenance, Event.ErrorsCorrected, ResourceState.PrepareForMaintenance);
s_fsm.addTransition(ResourceState.ErrorInMaintenance, Event.InternalCreated, ResourceState.ErrorInMaintenance);
s_fsm.addTransition(ResourceState.ErrorInMaintenance, Event.AdminAskMaintenance, ResourceState.PrepareForMaintenance);
s_fsm.addTransition(ResourceState.ErrorInMaintenance, Event.Disable, ResourceState.Disabled);
s_fsm.addTransition(ResourceState.ErrorInMaintenance, Event.DeleteHost, ResourceState.Disabled);
s_fsm.addTransition(ResourceState.ErrorInMaintenance, Event.InternalEnterMaintenance, ResourceState.Maintenance);
s_fsm.addTransition(ResourceState.ErrorInMaintenance, Event.AdminCancelMaintenance, ResourceState.Enabled);
s_fsm.addTransition(ResourceState.Error, Event.InternalCreated, ResourceState.Error);
s_fsm.addTransition(ResourceState.Disabled, Event.DeleteHost, ResourceState.Disabled);

@@ -16,8 +16,6 @@
// under the License.
package org.apache.cloudstack.api.command.admin.host;

import org.apache.log4j.Logger;

import org.apache.cloudstack.api.APICommand;
import org.apache.cloudstack.api.ApiCommandJobType;
import org.apache.cloudstack.api.ApiConstants;
@@ -27,10 +25,12 @@ import org.apache.cloudstack.api.Parameter;
import org.apache.cloudstack.api.ServerApiException;
import org.apache.cloudstack.api.response.HostResponse;
import org.apache.cloudstack.context.CallContext;
import org.apache.log4j.Logger;

import com.cloud.event.EventTypes;
import com.cloud.host.Host;
import com.cloud.user.Account;
import com.cloud.utils.exception.CloudRuntimeException;

@APICommand(name = "prepareHostForMaintenance", description = "Prepares a host for maintenance.", responseObject = HostResponse.class,
requestHasSensitiveInfo = false, responseHasSensitiveInfo = false)
@@ -99,13 +99,17 @@ public class PrepareForMaintenanceCmd extends BaseAsyncCmd {

@Override
public void execute() {
Host result = _resourceService.maintain(this);
if (result != null) {
HostResponse response = _responseGenerator.createHostResponse(result);
response.setResponseName("host");
this.setResponseObject(response);
} else {
throw new ServerApiException(ApiErrorCode.INTERNAL_ERROR, "Failed to prepare host for maintenance");
try {
Host result = _resourceService.maintain(this);
if (result != null) {
HostResponse response = _responseGenerator.createHostResponse(result);
response.setResponseName("host");
this.setResponseObject(response);
} else {
throw new ServerApiException(ApiErrorCode.INTERNAL_ERROR, "Failed to prepare host for maintenance");
}
} catch (CloudRuntimeException exception) {
throw new ServerApiException(ApiErrorCode.INTERNAL_ERROR, "Failed to prepare host for maintenance due to: " + exception.getMessage());
}
}
}

@@ -102,6 +102,7 @@ public interface HighAvailabilityManager extends Manager {

boolean hasPendingHaWork(long vmId);

boolean hasPendingMigrationsWork(long vmId);
/**
* @return
*/

@@ -47,11 +47,6 @@ import org.apache.cloudstack.framework.config.Configurable;
*/
public interface ResourceManager extends ResourceService, Configurable {

ConfigKey<Integer> HostMaintenanceRetries = new ConfigKey<>("Advanced", Integer.class,
"host.maintenance.retries","20",
"Number of retries when preparing a host into Maintenance Mode is faulty before failing",
true, ConfigKey.Scope.Cluster);

ConfigKey<Boolean> KvmSshToAgentEnabled = new ConfigKey<>("Advanced", Boolean.class,
"kvm.ssh.to.agent","true",
"Number of retries when preparing a host into Maintenance Mode is faulty before failing",
@@ -97,7 +92,7 @@ public interface ResourceManager extends ResourceService, Configurable {

boolean umanageHost(long hostId);

boolean maintenanceFailed(long hostId);
boolean migrateAwayFailed(long hostId, long vmId);

public boolean maintain(final long hostId) throws AgentUnavailableException;

@@ -1583,7 +1583,7 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
final HostVO h = sc.find();
if (h != null) {
final ResourceState resourceState = h.getResourceState();
if (resourceState == ResourceState.Disabled || resourceState == ResourceState.Maintenance || resourceState == ResourceState.ErrorInMaintenance) {
if (resourceState == ResourceState.Disabled || resourceState == ResourceState.Maintenance) {
/*
* Host is in non-operation state, so no investigation and direct put agent to Disconnected
*/
@@ -1605,7 +1605,9 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
}

final QueryBuilder<HostVO> sc = QueryBuilder.create(HostVO.class);
sc.and(sc.entity().getResourceState(), Op.IN, ResourceState.PrepareForMaintenance, ResourceState.ErrorInMaintenance);
sc.and(sc.entity().getResourceState(), Op.IN,
ResourceState.PrepareForMaintenance,
ResourceState.ErrorInPrepareForMaintenance);
final List<HostVO> hosts = sc.list();

for (final HostVO host : hosts) {

@@ -16,16 +16,10 @@
// under the License.
package org.apache.cloudstack.engine.datacenter.entity.api.db;

import com.cloud.host.Status;
import com.cloud.hypervisor.Hypervisor.HypervisorType;
import com.cloud.resource.ResourceState;
import com.cloud.storage.Storage.StoragePoolType;
import com.cloud.utils.NumbersUtil;
import com.cloud.utils.db.GenericDao;
import com.cloud.utils.db.StateMachine;
import org.apache.cloudstack.api.Identity;
import org.apache.cloudstack.engine.datacenter.entity.api.DataCenterResourceEntity.State;
import org.apache.cloudstack.engine.datacenter.entity.api.DataCenterResourceEntity.State.Event;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import javax.persistence.Column;
import javax.persistence.DiscriminatorColumn;
@@ -42,10 +36,18 @@ import javax.persistence.Table;
import javax.persistence.Temporal;
import javax.persistence.TemporalType;
import javax.persistence.Transient;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import org.apache.cloudstack.api.Identity;
import org.apache.cloudstack.engine.datacenter.entity.api.DataCenterResourceEntity.State;
import org.apache.cloudstack.engine.datacenter.entity.api.DataCenterResourceEntity.State.Event;

import com.cloud.host.Status;
import com.cloud.hypervisor.Hypervisor.HypervisorType;
import com.cloud.resource.ResourceState;
import com.cloud.storage.Storage.StoragePoolType;
import com.cloud.utils.NumbersUtil;
import com.cloud.utils.db.GenericDao;
import com.cloud.utils.db.StateMachine;

@Entity
@Table(name = "host")
@@ -730,7 +732,7 @@ public class EngineHostVO implements EngineHost, Identity {

@Override
public boolean isInMaintenanceStates() {
return (getResourceState() == ResourceState.Maintenance || getResourceState() == ResourceState.ErrorInMaintenance || getResourceState() == ResourceState.PrepareForMaintenance);
return ResourceState.isMaintenanceState(getResourceState());
}

public long getUpdated() {

@@ -16,12 +16,11 @@
// under the License.
package com.cloud.host;

import com.cloud.agent.api.VgpuTypesInfo;
import com.cloud.hypervisor.Hypervisor.HypervisorType;
import com.cloud.resource.ResourceState;
import com.cloud.storage.Storage.StoragePoolType;
import com.cloud.utils.NumbersUtil;
import com.cloud.utils.db.GenericDao;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import javax.persistence.Column;
import javax.persistence.DiscriminatorColumn;
@@ -38,11 +37,13 @@ import javax.persistence.Table;
import javax.persistence.Temporal;
import javax.persistence.TemporalType;
import javax.persistence.Transient;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;

import com.cloud.agent.api.VgpuTypesInfo;
import com.cloud.hypervisor.Hypervisor.HypervisorType;
import com.cloud.resource.ResourceState;
import com.cloud.storage.Storage.StoragePoolType;
import com.cloud.utils.NumbersUtil;
import com.cloud.utils.db.GenericDao;

@Entity
@Table(name = "host")
@@ -714,9 +715,8 @@ public class HostVO implements Host {

@Override
public boolean isInMaintenanceStates() {
return (getResourceState() == ResourceState.Maintenance || getResourceState() == ResourceState.ErrorInMaintenance || getResourceState() == ResourceState.PrepareForMaintenance);
return ResourceState.isMaintenanceState(getResourceState());
}

@Override
public boolean isDisabled() {
return (getResourceState() == ResourceState.Disabled);

@@ -19,3 +19,4 @@
-- Schema upgrade cleanup from 4.13.0.0 to 4.14.0.0
--;

DELETE FROM `cloud`.`configuration` WHERE name = 'host.maintenance.retries';

@@ -28,17 +28,19 @@ import java.util.concurrent.TimeUnit;
import javax.inject.Inject;
import javax.naming.ConfigurationException;

import org.apache.log4j.Logger;
import org.apache.log4j.NDC;
import org.apache.cloudstack.engine.orchestration.service.VolumeOrchestrationService;
import org.apache.cloudstack.framework.config.ConfigKey;
import org.apache.cloudstack.framework.config.Configurable;
import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
import org.apache.cloudstack.managed.context.ManagedContext;
import org.apache.cloudstack.managed.context.ManagedContextRunnable;
import org.apache.cloudstack.management.ManagementServerHost;
import org.apache.log4j.Logger;
import org.apache.log4j.NDC;

import com.cloud.agent.AgentManager;
import com.cloud.alert.AlertManager;
import com.cloud.cluster.ClusterManagerListener;
import org.apache.cloudstack.management.ManagementServerHost;
import com.cloud.configuration.Config;
import com.cloud.dc.ClusterDetailsDao;
import com.cloud.dc.DataCenterVO;
@@ -101,9 +103,14 @@ import com.cloud.vm.dao.VMInstanceDao;
* ha.retry.wait | time to wait before retrying the work item | seconds | 120 || || stop.retry.wait | time to wait
* before retrying the stop | seconds | 120 || * }
**/
public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvailabilityManager, ClusterManagerListener {
public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvailabilityManager, ClusterManagerListener, Configurable {

protected static final Logger s_logger = Logger.getLogger(HighAvailabilityManagerImpl.class);
private ConfigKey<Integer> MaxRetries = new ConfigKey<>("Advanced", Integer.class,
"max.retries","5",
"Total number of attempts for trying migration of a VM.",
true, ConfigKey.Scope.Cluster);

WorkerThread[] _workers;
boolean _stopped;
long _timeToSleep;
@@ -314,6 +321,7 @@
if (vm.getHostId() != null) {
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.Migration, Step.Scheduled, vm.getHostId(), vm.getState(), 0, vm.getUpdated());
_haDao.persist(work);
s_logger.info("Scheduled migration work of VM " + vm.getUuid() + " from host " + _hostDao.findById(vm.getHostId()) + " with HAWork " + work);
wakeupWorkers();
}
return true;
@@ -629,23 +637,32 @@

public Long migrate(final HaWorkVO work) {
long vmId = work.getInstanceId();

long srcHostId = work.getHostId();

VMInstanceVO vm = _instanceDao.findById(vmId);
if (vm == null) {
s_logger.info("Unable to find vm: " + vmId + ", skipping migrate.");
return null;
}
s_logger.info("Migration attempt: for VM " + vm.getUuid() + "from host id " + srcHostId +
". Starting attempt: " + (1 + work.getTimesTried()) + "/" + _maxRetries + " times.");
try {
work.setStep(Step.Migrating);
_haDao.update(work.getId(), work);

VMInstanceVO vm = _instanceDao.findById(vmId);
if (vm == null) {
return null;
}
// First try starting the vm with its original planner, if it doesn't succeed send HAPlanner as its an emergency.
_itMgr.migrateAway(vm.getUuid(), srcHostId);
return null;
} catch (InsufficientServerCapacityException e) {
s_logger.warn("Insufficient capacity for migrating a VM.");
_resourceMgr.maintenanceFailed(srcHostId);
s_logger.warn("Migration attempt: Insufficient capacity for migrating a VM " +
vm.getUuid() + " from source host id " + srcHostId +
". Exception: " + e.getMessage());
_resourceMgr.migrateAwayFailed(srcHostId, vmId);
return (System.currentTimeMillis() >> 10) + _migrateRetryInterval;
} catch (Exception e) {
s_logger.warn("Migration attempt: Unexpected exception occurred when attempting migration of " +
vm.getUuid() + e.getMessage());
throw e;
}
}

@@ -744,7 +761,7 @@
@Override
public void cancelScheduledMigrations(final HostVO host) {
WorkType type = host.getType() == HostVO.Type.Storage ? WorkType.Stop : WorkType.Migration;

s_logger.info("Canceling all scheduled migrations from host " + host.getUuid());
_haDao.deleteMigrationWorkItems(host.getId(), type, _serverId);
}

@@ -762,7 +779,6 @@
}

private void rescheduleWork(final HaWorkVO work, final long nextTime) {
s_logger.info("Rescheduling work " + work + " to try again at " + new Date(nextTime << 10));
work.setTimeToTry(nextTime);
work.setTimesTried(work.getTimesTried() + 1);
work.setServerId(null);
@@ -803,7 +819,7 @@
}

if (nextTime == null) {
s_logger.info("Completed work " + work);
s_logger.info("Completed work " + work + ". Took " + (work.getTimesTried() + 1) + "/" + _maxRetries + " attempts.");
work.setStep(Step.Done);
} else {
rescheduleWork(work, nextTime.longValue());
@@ -819,12 +835,18 @@
VMInstanceVO vm = _instanceDao.findById(work.getInstanceId());
work.setUpdateTime(vm.getUpdated());
work.setPreviousState(vm.getState());
} finally {
if (!Step.Done.equals(work.getStep())) {
if (work.getTimesTried() >= _maxRetries) {
s_logger.warn("Giving up, retried max " + work.getTimesTried() + "/" + _maxRetries + " times for work: " + work);
work.setStep(Step.Done);
} else {
s_logger.warn("Rescheduling work " + work + " to try again at " + new Date(work.getTimeToTry() << 10) +
". Finished attempt " + work.getTimesTried() + "/" + _maxRetries + " times.");
}
}
_haDao.update(work.getId(), work);
}
if (!Step.Done.equals(work.getStep()) && work.getTimesTried() >= _maxRetries) {
s_logger.warn("Giving up, retried max. times for work: " + work);
work.setStep(Step.Done);
}
_haDao.update(work.getId(), work);
}

@Override
@@ -908,6 +930,16 @@
return true;
}

@Override
public String getConfigComponentName() {
return HighAvailabilityManagerImpl.class.getSimpleName();
}

@Override
public ConfigKey<?>[] getConfigKeys() {
return new ConfigKey<?>[] {MaxRetries};
}

protected class CleanupTask extends ManagedContextRunnable {
@Override
protected void runInContext() {
@@ -1004,4 +1036,18 @@
List<HaWorkVO> haWorks = _haDao.listPendingHaWorkForVm(vmId);
return haWorks.size() > 0;
}

@Override
public boolean hasPendingMigrationsWork(long vmId) {
List<HaWorkVO> haWorks = _haDao.listPendingMigrationsForVm(vmId);
for (HaWorkVO work : haWorks) {
if (work.getTimesTried() <= _maxRetries) {
return true;
} else {
s_logger.warn("HAWork Job of migration type " + work + " found in database which has max " +
"retries more than " + _maxRetries + " but still not in Done, Cancelled, or Error State");
}
}
return false;
}
}

@@ -83,4 +83,6 @@ public interface HighAvailabilityDao extends GenericDao<HaWorkVO, Long> {
List<HaWorkVO> listRunningHaWorkForVm(long vmId);

List<HaWorkVO> listPendingHaWorkForVm(long vmId);

List<HaWorkVO> listPendingMigrationsForVm(long vmId);
}

@@ -48,6 +48,7 @@ public class HighAvailabilityDaoImpl extends GenericDaoBase<HaWorkVO, Long> impl
private final SearchBuilder<HaWorkVO> FutureHaWorkSearch;
private final SearchBuilder<HaWorkVO> RunningHaWorkSearch;
private final SearchBuilder<HaWorkVO> PendingHaWorkSearch;
private final SearchBuilder<HaWorkVO> MigratingWorkSearch;

protected HighAvailabilityDaoImpl() {
super();
@@ -112,6 +113,12 @@
PendingHaWorkSearch.and("type", PendingHaWorkSearch.entity().getType(), Op.EQ);
PendingHaWorkSearch.and("step", PendingHaWorkSearch.entity().getStep(), Op.NIN);
PendingHaWorkSearch.done();

MigratingWorkSearch = createSearchBuilder();
MigratingWorkSearch.and("instance", MigratingWorkSearch.entity().getInstanceId(), Op.EQ);
MigratingWorkSearch.and("workType", MigratingWorkSearch.entity().getWorkType(), Op.EQ);
MigratingWorkSearch.and("step", MigratingWorkSearch.entity().getStep(), Op.NIN);
MigratingWorkSearch.done();
}

@Override
@@ -124,6 +131,16 @@
return search(sc, null);
}

@Override
public List<HaWorkVO> listPendingMigrationsForVm(long vmId) {
SearchCriteria<HaWorkVO> sc = MigratingWorkSearch.create();
sc.setParameters("instance", vmId);
sc.setParameters("workType", WorkType.Migration);
sc.setParameters("step", Step.Done, Step.Error, Step.Cancelled);

return search(sc, null);
}

@Override
public List<HaWorkVO> listRunningHaWorkForVm(long vmId) {
SearchCriteria<HaWorkVO> sc = RunningHaWorkSearch.create();

@@ -26,7 +26,6 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.ConcurrentHashMap;

import javax.inject.Inject;
import javax.naming.ConfigurationException;
@@ -274,8 +273,6 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,

private SearchBuilder<HostGpuGroupsVO> _gpuAvailability;

private Map<Long,Integer> retryHostMaintenance = new ConcurrentHashMap<>();

private void insertListener(final Integer event, final ResourceListener listener) {
List<ResourceListener> lst = _lifeCycleListeners.get(event);
if (lst == null) {
@@ -1165,6 +1162,10 @@
throw new InvalidParameterValueException("Host with id " + hostId.toString() + " doesn't exist");
}

if (!ResourceState.isMaintenanceState(host.getResourceState())) {
throw new CloudRuntimeException("Cannot perform cancelMaintenance when resource state is " + host.getResourceState() + ", hostId = " + hostId);
}

processResourceEvent(ResourceListener.EVENT_CANCEL_MAINTENANCE_BEFORE, hostId);
final boolean success = cancelMaintenance(hostId);
processResourceEvent(ResourceListener.EVENT_CANCEL_MAINTENANCE_AFTER, hostId);
@@ -1212,6 +1213,12 @@

private boolean doMaintain(final long hostId) {
final HostVO host = _hostDao.findById(hostId);
s_logger.info("Maintenance: attempting maintenance of host " + host.getUuid());
ResourceState hostState = host.getResourceState();
if (!ResourceState.canAttemptMaintenance(hostState)) {
throw new CloudRuntimeException("Cannot perform maintain when resource state is " + hostState + ", hostId = " + hostId);
}

final MaintainAnswer answer = (MaintainAnswer)_agentMgr.easySend(hostId, new MaintainCommand());
if (answer == null || !answer.getResult()) {
s_logger.warn("Unable to send MaintainCommand to host: " + hostId);
@@ -1219,7 +1226,7 @@
}

try {
resourceStateTransitTo(host, ResourceState.Event.AdminAskMaintenace, _nodeId);
resourceStateTransitTo(host, ResourceState.Event.AdminAskMaintenance, _nodeId);
} catch (final NoTransitionException e) {
final String err = "Cannot transmit resource state of host " + host.getId() + " to " + ResourceState.Maintenance;
s_logger.debug(err, e);
@@ -1228,7 +1235,6 @@

ActionEventUtils.onStartedActionEvent(CallContext.current().getCallingUserId(), CallContext.current().getCallingAccountId(), EventTypes.EVENT_MAINTENANCE_PREPARE, "starting maintenance for host " + hostId, true, 0);
_agentMgr.pullAgentToMaintenance(hostId);
setHostMaintenanceRetries(host);

/* TODO: move below to listener */
if (host.getType() == Host.Type.Routing) {
@@ -1244,11 +1250,13 @@
|| _serviceOfferingDetailsDao.findDetail(vm.getServiceOfferingId(), GPU.Keys.vgpuType.toString()) != null) {
// Migration is not supported for VGPU Vms so stop them.
// for the last host in this cluster, stop all the VMs
s_logger.error("Maintenance: No hosts available for migrations. Scheduling shutdown instead of migrations.");
_haMgr.scheduleStop(vm, hostId, WorkType.ForceStop);
} else if (HypervisorType.LXC.equals(host.getHypervisorType()) && VirtualMachine.Type.User.equals(vm.getType())){
//Migration is not supported for LXC Vms. Schedule restart instead.
_haMgr.scheduleRestart(vm, false);
} else {
s_logger.info("Maintenance: scheduling migration of VM " + vm.getUuid() + " from host " + host.getUuid());
_haMgr.scheduleMigration(vm);
}
}
@@ -1256,19 +1264,9 @@
return true;
}

/**
* Set retries for transiting the host into Maintenance
*/
protected void setHostMaintenanceRetries(HostVO host) {
Integer retries = HostMaintenanceRetries.valueIn(host.getClusterId());
retryHostMaintenance.put(host.getId(), retries);
s_logger.debug(String.format("Setting the host %s (%s) retries for Maintenance mode: %s",
host.getId(), host.getName(), retries));
}

@Override
public boolean maintain(final long hostId) throws AgentUnavailableException {
final Boolean result = propagateResourceEvent(hostId, ResourceState.Event.AdminAskMaintenace);
final Boolean result = propagateResourceEvent(hostId, ResourceState.Event.AdminAskMaintenance);
if (result != null) {
return result;
}
@@ -1285,13 +1283,29 @@
s_logger.debug("Unable to find host " + hostId);
throw new InvalidParameterValueException("Unable to find host with ID: " + hostId + ". Please specify a valid host ID.");
}
if (!ResourceState.canAttemptMaintenance(host.getResourceState())) {
throw new CloudRuntimeException("Host is already in state " + host.getResourceState() + ". Cannot recall for maintenance until resolved.");
}

if (_hostDao.countBy(host.getClusterId(), ResourceState.PrepareForMaintenance, ResourceState.ErrorInMaintenance) > 0) {
throw new InvalidParameterValueException("There are other servers in PrepareForMaintenance OR ErrorInMaintenance STATUS in cluster " + host.getClusterId());
if (_hostDao.countBy(host.getClusterId(), ResourceState.PrepareForMaintenance, ResourceState.ErrorInPrepareForMaintenance) > 0) {
throw new CloudRuntimeException("There are other servers attempting migrations for maintenance. " +
"Found hosts in PrepareForMaintenance OR ErrorInPrepareForMaintenance STATUS in cluster " + host.getClusterId());
}

if (_storageMgr.isLocalStorageActiveOnHost(host.getId())) {
throw new InvalidParameterValueException("There are active VMs using the host's local storage pool. Please stop all VMs on this host that use local storage.");
throw new CloudRuntimeException("There are active VMs using the host's local storage pool. Please stop all VMs on this host that use local storage.");
}
List<VMInstanceVO> migratingInVMs = _vmDao.findByHostInStates(hostId, State.Migrating);
if (migratingInVMs.size() > 0) {
throw new CloudRuntimeException("Host contains incoming VMs migrating. Please wait for them to complete before putting to maintenance.");
}

if (_vmDao.findByHostInStates(hostId, State.Starting, State.Stopping).size() > 0) {
throw new CloudRuntimeException("Host contains VMs in starting/stopping state. Please wait for them to complete before putting to maintenance.");
}

if (_vmDao.findByHostInStates(hostId, State.Error, State.Unknown).size() > 0) {
throw new CloudRuntimeException("Host contains VMs in error/unknown/shutdown state. Please fix errors to proceed.");
}

try {
@@ -1331,19 +1345,6 @@
}
}

/**
* Set host into ErrorInMaintenance state, as errors occurred during VM migrations. Do the following:
* - Cancel scheduled migrations for those which have already failed
* - Configure VNC access for VMs (KVM hosts only)
*/
protected boolean setHostIntoErrorInMaintenance(HostVO host, List<VMInstanceVO> failedMigrations) throws NoTransitionException {
s_logger.debug("Unable to migrate " + failedMigrations.size() + " VM(s) from host " + host.getUuid());
_haMgr.cancelScheduledMigrations(host);
configureVncAccessForKVMHostFailedMigrations(host, failedMigrations);
resourceStateTransitTo(host, ResourceState.Event.UnableToMigrate, _nodeId);
return false;
}

/**
* Safely transit host into Maintenance mode
*/
@@ -1357,31 +1358,104 @@
}

/**
* Return true if host goes into Maintenance mode, only when:
* - No Running, Migrating or Failed migrations (host_id = last_host_id) for the host
* Set host into ErrorInMaintenance state, as errors occurred during VM migrations. Do the following:
* - Cancel scheduled migrations for those which have already failed
* - Configure VNC access for VMs (KVM hosts only)
*/
protected boolean isHostInMaintenance(HostVO host, List<VMInstanceVO> runningVms, List<VMInstanceVO> migratingVms, List<VMInstanceVO> failedMigrations) throws NoTransitionException {
if (CollectionUtils.isEmpty(runningVms) && CollectionUtils.isEmpty(migratingVms)) {
return CollectionUtils.isEmpty(failedMigrations) ?
setHostIntoMaintenance(host) :
setHostIntoErrorInMaintenance(host, failedMigrations);
} else if (retryHostMaintenance.containsKey(host.getId())) {
Integer retriesLeft = retryHostMaintenance.get(host.getId());
if (retriesLeft != null) {
if (retriesLeft <= 0) {
retryHostMaintenance.remove(host.getId());
s_logger.debug(String.format("No retries left while preparing KVM host %s (%s) for Maintenance, " +
"please investigate this connection.",
host.getId(), host.getName()));
return setHostIntoErrorInMaintenance(host, failedMigrations);
}
retriesLeft--;
retryHostMaintenance.put(host.getId(), retriesLeft);
s_logger.debug(String.format("Retries left preparing KVM host %s (%s) for Maintenance: %s",
host.getId(), host.getName(), retriesLeft));
protected boolean setHostIntoErrorInMaintenance(HostVO host, List<VMInstanceVO> errorVms) throws NoTransitionException {
s_logger.debug("Unable to migrate / fix errors for " + errorVms.size() + " VM(s) from host " + host.getUuid());
_haMgr.cancelScheduledMigrations(host);
configureVncAccessForKVMHostFailedMigrations(host, errorVms);
resourceStateTransitTo(host, ResourceState.Event.UnableToMaintain, _nodeId);
return false;
}

protected boolean setHostIntoErrorInPrepareForMaintenance(HostVO host, List<VMInstanceVO> errorVms) throws NoTransitionException {
s_logger.debug("Host " + host.getUuid() + " entering in PrepareForMaintenanceWithErrors state");
configureVncAccessForKVMHostFailedMigrations(host, errorVms);
resourceStateTransitTo(host, ResourceState.Event.UnableToMigrate, _nodeId);
return false;
}

protected boolean setHostIntoPrepareForMaintenanceAfterErrorsFixed(HostVO host) throws NoTransitionException {
s_logger.debug("Host " + host.getUuid() + " entering in PrepareForMaintenance state as any previous corrections have been fixed");
resourceStateTransitTo(host, ResourceState.Event.ErrorsCorrected, _nodeId);
return false;
}

/**
* Return true if host goes into Maintenance mode. There are various possibilities for VMs' states
* on a host. We need to track the various VM states on each run and accordingly transit to the
* appropriate state.
*
* We change states as follows -
* 1. If there are no VMs in running, migrating, starting, stopping, error, unknown states we can move
* to maintenance state. Note that there cannot be incoming migrations as the API Call prepare for
* maintenance checks incoming migrations before starting.
* 2. If there are errors (like migrating VMs, error VMs, etc.) we mark as ErrorInPrepareForMaintenance but
* don't stop remaining migrations/ongoing legitimate operations.
* 3. If all migration retries, legitimate operations have finished we check for VMs on the host and if
* there are still VMs in error state or in running state or failed migrations we mark the VM as
* ErrorInMaintenance state.
* 4. Lastly if there are no errors or failed migrations or running VMs but there are still pending
* legitimate operations and the host was in ErrorInPrepareForMaintenance, we push the host back
* to PrepareForMaintenance state.
*/
protected boolean attemptMaintain(HostVO host) throws NoTransitionException {
final long hostId = host.getId();

s_logger.info("Attempting maintenance for host " + host.getName());

// Step 0: First gather if VMs have pending HAWork for migration with retries left.
final List<VMInstanceVO> allVmsOnHost = _vmDao.listByHostId(hostId);
final boolean hasMigratingAwayVms = CollectionUtils.isNotEmpty(_vmDao.listVmsMigratingFromHost(hostId));
boolean hasPendingMigrationRetries = false;
for (VMInstanceVO vmInstanceVO : allVmsOnHost) {
if (_haMgr.hasPendingMigrationsWork(vmInstanceVO.getId())) {
s_logger.info("Attempting maintenance for " + host + " found pending migration for VM " + vmInstanceVO);
hasPendingMigrationRetries = true;
break;
}
}

// Step 1: If there are no VMs in migrating, running, starting, stopping, error or unknown state we can safely move the host to maintenance.
if (!hasMigratingAwayVms && CollectionUtils.isEmpty(_vmDao.findByHostInStates(host.getId(),
State.Migrating, State.Running, State.Starting, State.Stopping, State.Error, State.Unknown))) {
if (hasPendingMigrationRetries) {
s_logger.error("There should not be pending retries VMs for this host as there are no running, migrating," +
"starting, stopping, error or unknown states on host " + host);
}
return setHostIntoMaintenance(host);
}

// Step 2: Gather relevant VMs' states on the host and then based on them we can determine if
final List<VMInstanceVO> failedMigrations = new ArrayList<>(_vmDao.listNonMigratingVmsByHostEqualsLastHost(hostId));
final List<VMInstanceVO> errorVms = new ArrayList<>(_vmDao.findByHostInStates(hostId, State.Unknown, State.Error));
final boolean hasRunningVms = CollectionUtils.isNotEmpty(_vmDao.findByHostInStates(hostId, State.Running));
final boolean hasFailedMigrations = CollectionUtils.isNotEmpty(failedMigrations);
final boolean hasVmsInFailureStates = CollectionUtils.isNotEmpty(errorVms);
final boolean hasStoppingVms = CollectionUtils.isNotEmpty(_vmDao.findByHostInStates(hostId, State.Stopping));
errorVms.addAll(failedMigrations);

// Step 3: If there are no pending migration retries but host still has running VMs or,
// host has VMs in failure state / failed migrations we move the host to ErrorInMaintenance state.
if ((!hasPendingMigrationRetries && !hasMigratingAwayVms && hasRunningVms) ||
(!hasRunningVms && !hasMigratingAwayVms && hasVmsInFailureStates)) {
return setHostIntoErrorInMaintenance(host, errorVms);
}

// Step 4: If there are pending migrations or ongoing retries left or stopping VMs and there were errors or failed
// migrations we put the host into ErrorInPrepareForMaintenance.
if ((hasPendingMigrationRetries || hasMigratingAwayVms || hasStoppingVms) && (hasVmsInFailureStates || hasFailedMigrations)) {
return setHostIntoErrorInPrepareForMaintenance(host, errorVms);
}

// Step 5: If there were previously errors found, but not anymore it means the operator has fixed errors and we put
// the host into PrepareForMaintenance state.
if (host.getResourceState() == ResourceState.ErrorInPrepareForMaintenance) {
return setHostIntoPrepareForMaintenanceAfterErrorsFixed(host);
}

return false;
}

@@ -1392,14 +1466,10 @@

try {
if (host.getType() != Host.Type.Storage) {
final List<VMInstanceVO> vos = _vmDao.listByHostId(hostId);
final List<VMInstanceVO> vosMigrating = _vmDao.listVmsMigratingFromHost(hostId);
final List<VMInstanceVO> failedVmMigrations = _vmDao.listNonMigratingVmsByHostEqualsLastHost(hostId);

hostInMaintenance = isHostInMaintenance(host, vos, vosMigrating, failedVmMigrations);
hostInMaintenance = attemptMaintain(host);
}
} catch (final NoTransitionException e) {
s_logger.debug("Cannot transmit host " + host.getId() + "to Maintenance state", e);
s_logger.debug("Cannot transmit host " + host.getId() + " to Maintenance state", e);
}
return hostInMaintenance;
}
@@ -2327,8 +2397,7 @@
* TODO: think twice about returning true or throwing out exception, I
* really prefer to exception that always exposes bugs
*/
if (host.getResourceState() != ResourceState.PrepareForMaintenance && host.getResourceState() != ResourceState.Maintenance &&
host.getResourceState() != ResourceState.ErrorInMaintenance) {
if (!ResourceState.isMaintenanceState(host.getResourceState())) {
throw new CloudRuntimeException("Cannot perform cancelMaintenance when resource state is " + host.getResourceState() + ", hostId = " + hostId);
}

@@ -2349,7 +2418,6 @@
try {
resourceStateTransitTo(host, ResourceState.Event.AdminCancelMaintenance, _nodeId);
_agentMgr.pullAgentOutMaintenance(hostId);
retryHostMaintenance.remove(hostId);
} catch (final NoTransitionException e) {
s_logger.debug("Cannot transmit host " + host.getId() + "to Enabled state", e);
return false;
@@ -2433,7 +2501,7 @@

@Override
public boolean executeUserRequest(final long hostId, final ResourceState.Event event) throws AgentUnavailableException {
if (event == ResourceState.Event.AdminAskMaintenace) {
if (event == ResourceState.Event.AdminAskMaintenance) {
return doMaintain(hostId);
} else if (event == ResourceState.Event.AdminCancelMaintenance) {
return doCancelMaintenance(hostId);
@@ -2561,7 +2629,7 @@
return null;
}

s_logger.debug("Propagating resource request event:" + event.toString() + " to agent:" + agentId);
s_logger.debug("Propagating resource request event:" + event.toString() + " to agent:" + agentId);
final Command[] cmds = new Command[1];
cmds[0] = new PropagateResourceEventCommand(agentId, event);

@@ -2580,7 +2648,7 @@
}

@Override
public boolean maintenanceFailed(final long hostId) {
public boolean migrateAwayFailed(final long hostId, final long vmId) {
final HostVO host = _hostDao.findById(hostId);
if (host == null) {
if (s_logger.isDebugEnabled()) {
@@ -2589,6 +2657,8 @@
return false;
} else {
try {
s_logger.warn("Migration of VM " + _vmDao.findById(vmId) + " failed from host " + _hostDao.findById(hostId) +
". Emitting event UnableToMigrate.");
return resourceStateTransitTo(host, ResourceState.Event.UnableToMigrate, _nodeId);
} catch (final NoTransitionException e) {
s_logger.debug("No next resource state for host " + host.getId() + " while current state is " + host.getResourceState() + " with event " +
@@ -2704,7 +2774,11 @@
sc.and(sc.entity().getDataCenterId(), Op.EQ, dcId);
}
sc.and(sc.entity().getType(), Op.EQ, type);
sc.and(sc.entity().getResourceState(), Op.NIN, ResourceState.Maintenance, ResourceState.ErrorInMaintenance, ResourceState.PrepareForMaintenance,
sc.and(sc.entity().getResourceState(), Op.NIN,
ResourceState.Maintenance,
ResourceState.ErrorInMaintenance,
ResourceState.ErrorInPrepareForMaintenance,
ResourceState.PrepareForMaintenance,
ResourceState.Error);
return sc.list();
}
@@ -2981,6 +3055,6 @@

@Override
public ConfigKey<?>[] getConfigKeys() {
return new ConfigKey<?>[] {HostMaintenanceRetries};
return new ConfigKey[0];
}
}

@@ -1556,7 +1556,11 @@ public class StatsCollector extends ManagerBase implements ComponentMethodInterc
private SearchCriteria<HostVO> createSearchCriteriaForHostTypeRoutingStateUpAndNotInMaintenance() {
SearchCriteria<HostVO> sc = _hostDao.createSearchCriteria();
sc.addAnd("status", SearchCriteria.Op.EQ, Status.Up.toString());
sc.addAnd("resourceState", SearchCriteria.Op.NIN, ResourceState.Maintenance, ResourceState.PrepareForMaintenance, ResourceState.ErrorInMaintenance);
sc.addAnd("resourceState", SearchCriteria.Op.NIN,
ResourceState.Maintenance,
ResourceState.PrepareForMaintenance,
ResourceState.ErrorInPrepareForMaintenance,
ResourceState.ErrorInMaintenance);
sc.addAnd("type", SearchCriteria.Op.EQ, Host.Type.Routing.toString());
return sc;
}

@@ -35,21 +35,16 @@ import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import javax.servlet.http.HttpSession;

import com.cloud.resource.ResourceState;
import org.apache.cloudstack.framework.security.keys.KeysManager;
import org.apache.commons.codec.binary.Base64;
import org.apache.log4j.Logger;
import org.springframework.stereotype.Component;
import org.springframework.web.context.support.SpringBeanAutowiringSupport;

import com.cloud.vm.VmDetailConstants;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;

import org.apache.cloudstack.framework.security.keys.KeysManager;

import com.cloud.exception.PermissionDeniedException;
import com.cloud.host.HostVO;
import com.cloud.hypervisor.Hypervisor;
import com.cloud.resource.ResourceState;
import com.cloud.server.ManagementServer;
import com.cloud.storage.GuestOSVO;
import com.cloud.user.Account;
@@ -64,7 +59,10 @@ import com.cloud.utils.db.TransactionLegacy;
import com.cloud.vm.UserVmDetailVO;
import com.cloud.vm.VirtualMachine;
import com.cloud.vm.VirtualMachineManager;
import com.cloud.vm.VmDetailConstants;
import com.cloud.vm.dao.UserVmDetailsDao;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;

/**
* Thumbnail access : /console?cmd=thumbnail&vm=xxx&w=xxx&h=xxx
@@ -420,14 +418,24 @@ public class ConsoleProxyServlet extends HttpServlet {
StringBuffer sb = new StringBuffer(rootUrl);
String host = hostVo.getPrivateIpAddress();

Pair<String, Integer> portInfo;
if (hostVo.getResourceState().equals(ResourceState.ErrorInMaintenance)) {
Pair<String, Integer> portInfo = null;
if (hostVo.getHypervisorType() == Hypervisor.HypervisorType.KVM &&
(hostVo.getResourceState().equals(ResourceState.ErrorInMaintenance) ||
hostVo.getResourceState().equals(ResourceState.ErrorInPrepareForMaintenance))) {
UserVmDetailVO detailAddress = _userVmDetailsDao.findDetail(vm.getId(), VmDetailConstants.KVM_VNC_ADDRESS);
UserVmDetailVO detailPort = _userVmDetailsDao.findDetail(vm.getId(), VmDetailConstants.KVM_VNC_PORT);
portInfo = new Pair<>(detailAddress.getValue(), Integer.valueOf(detailPort.getValue()));
} else {
if (detailAddress != null && detailPort != null) {
portInfo = new Pair<>(detailAddress.getValue(), Integer.valueOf(detailPort.getValue()));
} else {
s_logger.warn("KVM Host in ErrorInMaintenance/ErrorInPrepareForMaintenance but " +
"no VNC Address/Port was available. Falling back to default one from MS.");
}
}

if (portInfo == null) {
portInfo = _ms.getVncPort(vm);
}

if (s_logger.isDebugEnabled())
s_logger.debug("Port info " + portInfo.first());

@@ -32,6 +32,7 @@ import org.apache.cloudstack.api.command.admin.host.PrepareForMaintenanceCmd;
import org.apache.cloudstack.api.command.admin.host.ReconnectHostCmd;
import org.apache.cloudstack.api.command.admin.host.UpdateHostCmd;
import org.apache.cloudstack.api.command.admin.host.UpdateHostPasswordCmd;
import org.apache.cloudstack.framework.config.ConfigKey;

import com.cloud.agent.api.StartupCommand;
import com.cloud.agent.api.StartupRoutingCommand;
@@ -56,7 +57,6 @@ import com.cloud.org.Cluster;
import com.cloud.resource.ResourceState.Event;
import com.cloud.utils.component.ManagerBase;
import com.cloud.utils.fsm.NoTransitionException;
import org.apache.cloudstack.framework.config.ConfigKey;

public class MockResourceManagerImpl extends ManagerBase implements ResourceManager {

@@ -307,10 +307,10 @@ public class MockResourceManagerImpl extends ManagerBase implements ResourceMana
}

/* (non-Javadoc)
* @see com.cloud.resource.ResourceManager#maintenanceFailed(long)
* @see com.cloud.resource.ResourceManager#migrateAwayFailed(long)
*/
@Override
public boolean maintenanceFailed(final long hostId) {
public boolean migrateAwayFailed(final long hostId, final long vmId) {
// TODO Auto-generated method stub
return false;
}

@@ -17,6 +17,39 @@

package com.cloud.resource;

import static com.cloud.resource.ResourceState.Event.ErrorsCorrected;
import static com.cloud.resource.ResourceState.Event.InternalEnterMaintenance;
import static com.cloud.resource.ResourceState.Event.UnableToMaintain;
import static com.cloud.resource.ResourceState.Event.UnableToMigrate;
import static org.mockito.Matchers.any;
import static org.mockito.Matchers.anyBoolean;
import static org.mockito.Matchers.anyLong;
import static org.mockito.Matchers.anyObject;
import static org.mockito.Matchers.anyString;
import static org.mockito.Matchers.eq;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.mockito.BDDMockito;
import org.mockito.InjectMocks;
import org.mockito.Mock;
import org.mockito.MockitoAnnotations;
import org.mockito.Spy;
import org.powermock.api.mockito.PowerMockito;
import org.powermock.core.classloader.annotations.PrepareForTest;
import org.powermock.modules.junit4.PowerMockRunner;

import com.cloud.agent.AgentManager;
import com.cloud.agent.api.GetVncPortAnswer;
import com.cloud.agent.api.GetVncPortCommand;
@@ -35,38 +68,10 @@ import com.cloud.utils.fsm.NoTransitionException;
import com.cloud.utils.ssh.SSHCmdHelper;
import com.cloud.utils.ssh.SshException;
import com.cloud.vm.VMInstanceVO;
import com.cloud.vm.VirtualMachine;
import com.cloud.vm.dao.UserVmDetailsDao;
import com.cloud.vm.dao.VMInstanceDao;
import com.trilead.ssh2.Connection;
import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.mockito.BDDMockito;
import org.mockito.InjectMocks;
import org.mockito.Mock;
import org.mockito.MockitoAnnotations;
import org.mockito.Spy;
import org.powermock.api.mockito.PowerMockito;
import org.powermock.core.classloader.annotations.PrepareForTest;
import org.powermock.modules.junit4.PowerMockRunner;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import static com.cloud.resource.ResourceState.Event.InternalEnterMaintenance;
import static com.cloud.resource.ResourceState.Event.UnableToMigrate;
import static org.mockito.Matchers.any;
import static org.mockito.Matchers.anyBoolean;
import static org.mockito.Matchers.anyLong;
import static org.mockito.Matchers.anyString;
import static org.mockito.Matchers.eq;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

@RunWith(PowerMockRunner.class)
@PrepareForTest({ActionEventUtils.class, ResourceManagerImpl.class, SSHCmdHelper.class})
@@ -170,38 +175,98 @@ public class ResourceManagerImplTest {
}

@Test
public void testCheckAndMaintainEnterMaintenanceMode() throws NoTransitionException {
public void testCheckAndMaintainEnterMaintenanceModeNoVms() throws NoTransitionException {
// Test entering into maintenance with no VMs running on host.
boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId);
verify(resourceManager).isHostInMaintenance(host, new ArrayList<>(), new ArrayList<>(), new ArrayList<>());
verify(resourceManager).attemptMaintain(host);
verify(resourceManager).setHostIntoMaintenance(host);
verify(resourceManager, never()).setHostIntoErrorInPrepareForMaintenance(anyObject(), anyObject());
verify(resourceManager, never()).setHostIntoErrorInMaintenance(anyObject(), anyObject());
verify(resourceManager, never()).setHostIntoPrepareForMaintenanceAfterErrorsFixed(anyObject());
verify(resourceManager).resourceStateTransitTo(eq(host), eq(InternalEnterMaintenance), anyLong());

Assert.assertTrue(enterMaintenanceMode);
}

@Test
public void testCheckAndMaintainProceedsWithPrepareForMaintenanceRunningVms() throws NoTransitionException {
// Test proceeding through with no events if pending migrating works / retries left.
setupRunningVMs();
setupPendingMigrationRetries();
verifyNoChangeInMaintenance();
}

@Test
public void testCheckAndMaintainErrorInMaintenanceRunningVms() throws NoTransitionException {
when(vmInstanceDao.listByHostId(hostId)).thenReturn(Arrays.asList(vm1, vm2));
boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId);
verify(resourceManager).isHostInMaintenance(host, Arrays.asList(vm1, vm2), new ArrayList<>(), new ArrayList<>());
Assert.assertFalse(enterMaintenanceMode);
// Test entering into ErrorInMaintenance when no pending migrations etc, and due to - Running VMs
setupRunningVMs();
setupNoPendingMigrationRetries();
verifyErrorInMaintenanceCalls();
}

@Test
public void testCheckAndMaintainErrorInMaintenanceMigratingVms() throws NoTransitionException {
when(vmInstanceDao.listVmsMigratingFromHost(hostId)).thenReturn(Arrays.asList(vm1, vm2));
boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId);
verify(resourceManager).isHostInMaintenance(host, new ArrayList<>(), Arrays.asList(vm1, vm2), new ArrayList<>());
Assert.assertFalse(enterMaintenanceMode);
public void testCheckAndMaintainErrorInMaintenanceWithErrorVms() throws NoTransitionException {
// Test entering into ErrorInMaintenance when no pending migrations etc, and due to - no migrating but error VMs
setupErrorVms();
setupNoPendingMigrationRetries();
verifyErrorInMaintenanceCalls();
}

@Test
public void testCheckAndMaintainErrorInMaintenanceFailedMigrations() throws NoTransitionException {
when(vmInstanceDao.listNonMigratingVmsByHostEqualsLastHost(hostId)).thenReturn(Arrays.asList(vm1, vm2));
boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId);
verify(resourceManager).isHostInMaintenance(host, new ArrayList<>(), new ArrayList<>(), Arrays.asList(vm1, vm2));
verify(resourceManager).setHostIntoErrorInMaintenance(host, Arrays.asList(vm1, vm2));
|
||||
verify(resourceManager).resourceStateTransitTo(eq(host), eq(UnableToMigrate), anyLong());
|
||||
Assert.assertFalse(enterMaintenanceMode);
|
||||
public void testCheckAndMaintainErrorInPrepareForMaintenanceFailedMigrationsPendingRetries() throws NoTransitionException {
|
||||
// Test entering into ErrorInPrepareForMaintenance when pending migrations retries and due to - Failed Migrations
|
||||
setupFailedMigrations();
|
||||
setupPendingMigrationRetries();
|
||||
when(vmInstanceDao.findByHostInStates(hostId, VirtualMachine.State.Running)).thenReturn(Arrays.asList(vm2));
|
||||
verifyErrorInPrepareForMaintenanceCalls();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCheckAndMaintainErrorInPrepareForMaintenanceWithErrorVmsPendingRetries() throws NoTransitionException {
|
||||
// Test entering into ErrorInMaintenance when pending migrations retries due to - no migrating but error VMs
|
||||
setupErrorVms();
|
||||
setupPendingMigrationRetries();
|
||||
when(vmInstanceDao.listVmsMigratingFromHost(hostId)).thenReturn(Arrays.asList(vm2));
|
||||
verifyErrorInPrepareForMaintenanceCalls();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCheckAndMaintainErrorInPrepareForMaintenanceFailedMigrationsAndMigratingVms() throws NoTransitionException {
|
||||
// Test entering into ErrorInPrepareForMaintenance when no pending migrations retries
|
||||
// but executing migration and due to - Failed Migrations
|
||||
setupFailedMigrations();
|
||||
setupNoPendingMigrationRetries();
|
||||
when(vmInstanceDao.listVmsMigratingFromHost(hostId)).thenReturn(Arrays.asList(vm2));
|
||||
verifyErrorInPrepareForMaintenanceCalls();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCheckAndMaintainErrorInPrepareForMaintenanceWithErrorVmsAndMigratingVms() throws NoTransitionException {
|
||||
// Test entering into ErrorInPrepareForMaintenance when no pending migrations retries
|
||||
// but executing migration and due to - Error Vms
|
||||
setupErrorVms();
|
||||
setupNoPendingMigrationRetries();
|
||||
when(vmInstanceDao.listVmsMigratingFromHost(hostId)).thenReturn(Arrays.asList(vm2));
|
||||
verifyErrorInPrepareForMaintenanceCalls();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCheckAndMaintainErrorInPrepareForMaintenanceFailedMigrationsAndStoppingVms() throws NoTransitionException {
|
||||
// Test entering into ErrorInPrepareForMaintenance when no pending migrations retries
|
||||
// but stopping VMs and due to - Failed Migrations
|
||||
setupFailedMigrations();
|
||||
setupNoPendingMigrationRetries();
|
||||
when(vmInstanceDao.findByHostInStates(hostId, VirtualMachine.State.Stopping)).thenReturn(Arrays.asList(vm2));
|
||||
verifyErrorInPrepareForMaintenanceCalls();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCheckAndMaintainReturnsToPrepareForMaintenanceRunningVms() throws NoTransitionException {
|
||||
// Test switching back to PrepareForMaintenance
|
||||
when(host.getResourceState()).thenReturn(ResourceState.ErrorInPrepareForMaintenance);
|
||||
setupRunningVMs();
|
||||
setupPendingMigrationRetries();
|
||||
verifyReturnToPrepareForMaintenanceCalls();
|
||||
}
|
||||
|
||||
@Test
|
||||
@ -219,23 +284,6 @@ public class ResourceManagerImplTest {
|
||||
verify(agentManager).pullAgentToMaintenance(hostId);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCheckAndMaintainErrorInMaintenanceRetries() throws NoTransitionException {
|
||||
resourceManager.setHostMaintenanceRetries(host);
|
||||
|
||||
List<VMInstanceVO> failedMigrations = Arrays.asList(vm1, vm2);
|
||||
when(vmInstanceDao.listByHostId(host.getId())).thenReturn(failedMigrations);
|
||||
when(vmInstanceDao.listNonMigratingVmsByHostEqualsLastHost(host.getId())).thenReturn(failedMigrations);
|
||||
|
||||
Integer retries = ResourceManager.HostMaintenanceRetries.valueIn(host.getClusterId());
|
||||
for (int i = 0; i <= retries; i++) {
|
||||
resourceManager.checkAndMaintain(host.getId());
|
||||
}
|
||||
|
||||
verify(resourceManager, times(retries + 1)).isHostInMaintenance(host, failedMigrations, new ArrayList<>(), failedMigrations);
|
||||
verify(resourceManager).setHostIntoErrorInMaintenance(host, failedMigrations);
|
||||
}
|
||||
|
||||
@Test(expected = CloudRuntimeException.class)
|
||||
public void testGetHostCredentialsMissingParameter() {
|
||||
when(host.getDetail("password")).thenReturn(null);
|
||||
@ -307,4 +355,76 @@ public class ResourceManagerImplTest {
|
||||
verify(resourceManager, never()).getHostCredentials(eq(host));
|
||||
verify(resourceManager, never()).connectAndRestartAgentOnHost(eq(host), eq(hostUsername), eq(hostPassword));
|
||||
}
|
||||
|
||||
private void setupNoPendingMigrationRetries() {
|
||||
when(haManager.hasPendingMigrationsWork(vm1.getId())).thenReturn(false);
|
||||
when(haManager.hasPendingMigrationsWork(vm2.getId())).thenReturn(false);
|
||||
}
|
||||
|
||||
private void setupRunningVMs() {
|
||||
when(vmInstanceDao.listByHostId(hostId)).thenReturn(Arrays.asList(vm1, vm2));
|
||||
when(vmInstanceDao.findByHostInStates(hostId, VirtualMachine.State.Migrating, VirtualMachine.State.Running, VirtualMachine.State.Starting, VirtualMachine.State.Stopping, VirtualMachine.State.Error, VirtualMachine.State.Unknown)).thenReturn(Arrays.asList(vm1, vm2));
|
||||
when(vmInstanceDao.findByHostInStates(hostId, VirtualMachine.State.Running)).thenReturn(Arrays.asList(vm1, vm2));
|
||||
}
|
||||
|
||||
private void setupPendingMigrationRetries() {
|
||||
when(haManager.hasPendingMigrationsWork(vm1.getId())).thenReturn(true);
|
||||
when(haManager.hasPendingMigrationsWork(vm2.getId())).thenReturn(false);
|
||||
}
|
||||
|
||||
private void setupFailedMigrations() {
|
||||
when(vmInstanceDao.listByHostId(hostId)).thenReturn(Arrays.asList(vm1, vm2));
|
||||
when(vmInstanceDao.findByHostInStates(hostId, VirtualMachine.State.Migrating, VirtualMachine.State.Running, VirtualMachine.State.Starting, VirtualMachine.State.Stopping, VirtualMachine.State.Error, VirtualMachine.State.Unknown)).thenReturn(Arrays.asList(vm1, vm2));
|
||||
when(vmInstanceDao.listNonMigratingVmsByHostEqualsLastHost(hostId)).thenReturn(Arrays.asList(vm1));
|
||||
}
|
||||
|
||||
private void setupErrorVms() {
|
||||
when(vmInstanceDao.listByHostId(hostId)).thenReturn(Arrays.asList(vm1, vm2));
|
||||
when(vmInstanceDao.findByHostInStates(hostId, VirtualMachine.State.Migrating, VirtualMachine.State.Running, VirtualMachine.State.Starting, VirtualMachine.State.Stopping, VirtualMachine.State.Error, VirtualMachine.State.Unknown)).thenReturn(Arrays.asList(vm1, vm2));
|
||||
when(vmInstanceDao.findByHostInStates(hostId, VirtualMachine.State.Unknown, VirtualMachine.State.Error)).thenReturn(Arrays.asList(vm1));
|
||||
}
|
||||
|
||||
private void verifyErrorInMaintenanceCalls() throws NoTransitionException {
|
||||
boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId);
|
||||
verify(resourceManager).attemptMaintain(host);
|
||||
verify(resourceManager).setHostIntoErrorInMaintenance(eq(host), anyObject());
|
||||
verify(resourceManager, never()).setHostIntoMaintenance(anyObject());
|
||||
verify(resourceManager, never()).setHostIntoErrorInPrepareForMaintenance(anyObject(), anyObject());
|
||||
verify(resourceManager, never()).setHostIntoPrepareForMaintenanceAfterErrorsFixed(anyObject());
|
||||
verify(resourceManager).resourceStateTransitTo(eq(host), eq(UnableToMaintain), anyLong());
|
||||
Assert.assertFalse(enterMaintenanceMode);
|
||||
}
|
||||
|
||||
private void verifyErrorInPrepareForMaintenanceCalls() throws NoTransitionException {
|
||||
boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId);
|
||||
verify(resourceManager).attemptMaintain(host);
|
||||
verify(resourceManager).setHostIntoErrorInPrepareForMaintenance(eq(host), anyObject());
|
||||
verify(resourceManager, never()).setHostIntoMaintenance(anyObject());
|
||||
verify(resourceManager, never()).setHostIntoErrorInMaintenance(anyObject(), anyObject());
|
||||
verify(resourceManager, never()).setHostIntoPrepareForMaintenanceAfterErrorsFixed(anyObject());
|
||||
verify(resourceManager).resourceStateTransitTo(eq(host), eq(UnableToMigrate), anyLong());
|
||||
Assert.assertFalse(enterMaintenanceMode);
|
||||
}
|
||||
|
||||
private void verifyReturnToPrepareForMaintenanceCalls() throws NoTransitionException {
|
||||
boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId);
|
||||
verify(resourceManager).attemptMaintain(host);
|
||||
verify(resourceManager).setHostIntoPrepareForMaintenanceAfterErrorsFixed(eq(host));
|
||||
verify(resourceManager).resourceStateTransitTo(eq(host), eq(ErrorsCorrected), anyLong());
|
||||
verify(resourceManager, never()).setHostIntoMaintenance(anyObject());
|
||||
verify(resourceManager, never()).setHostIntoErrorInPrepareForMaintenance(anyObject(), anyObject());
|
||||
verify(resourceManager, never()).setHostIntoErrorInMaintenance(anyObject(), anyObject());
|
||||
Assert.assertFalse(enterMaintenanceMode);
|
||||
}
|
||||
|
||||
private void verifyNoChangeInMaintenance() throws NoTransitionException {
|
||||
boolean enterMaintenanceMode = resourceManager.checkAndMaintain(hostId);
|
||||
verify(resourceManager).attemptMaintain(host);
|
||||
verify(resourceManager, never()).setHostIntoMaintenance(anyObject());
|
||||
verify(resourceManager, never()).setHostIntoErrorInPrepareForMaintenance(anyObject(), anyObject());
|
||||
verify(resourceManager, never()).setHostIntoErrorInMaintenance(anyObject(), anyObject());
|
||||
verify(resourceManager, never()).setHostIntoPrepareForMaintenanceAfterErrorsFixed(anyObject());
|
||||
verify(resourceManager, never()).resourceStateTransitTo(anyObject(), any(), anyLong());
|
||||
Assert.assertFalse(enterMaintenanceMode);
|
||||
}
|
||||
}
|
||||
|
||||
@ -21,16 +21,75 @@
|
||||
from marvin.cloudstackTestCase import *
|
||||
from marvin.lib.utils import *
|
||||
from marvin.lib.base import *
|
||||
from marvin.lib.common import (get_zone, get_pod, get_template)
|
||||
from marvin.lib.common import (get_zone, get_pod, get_template, list_ssvms)
|
||||
from nose.plugins.attrib import attr
|
||||
from marvin.lib.decoratorGenerators import skipTestIf
|
||||
from distutils.util import strtobool
|
||||
from marvin.sshClient import SshClient
|
||||
|
||||
_multiprocess_shared_ = False
|
||||
MIN_VMS_FOR_TEST = 3
|
||||
|
||||
class TestHostMaintenanceBase(cloudstackTestCase):
|
||||
def get_ssh_client(self, ip, username, password, retries=10):
|
||||
""" Setup ssh client connection and return connection """
|
||||
try:
|
||||
ssh_client = SshClient(ip, 22, username, password, retries)
|
||||
except Exception as e:
|
||||
raise unittest.SkipTest("Unable to create ssh connection: " % e)
|
||||
|
||||
self.assertIsNotNone(
|
||||
ssh_client, "Failed to setup ssh connection to ip=%s" % ip)
|
||||
|
||||
return ssh_client
|
||||
|
||||
def wait_until_host_is_in_state(self, hostid, resourcestate, interval=3, retries=20):
|
||||
def check_resource_state():
|
||||
response = Host.list(
|
||||
self.apiclient,
|
||||
id=hostid
|
||||
)
|
||||
if isinstance(response, list):
|
||||
if response[0].resourcestate == resourcestate:
|
||||
self.logger.debug('Host with id %s is in resource state = %s' % (hostid, resourcestate))
|
||||
return True, None
|
||||
else:
|
||||
self.logger.debug("Waiting for host " + hostid +
|
||||
" to reach state " + resourcestate +
|
||||
", with current state " + response[0].resourcestate)
|
||||
return False, None
|
||||
|
||||
done, _ = wait_until(interval, retries, check_resource_state)
|
||||
if not done:
|
||||
raise Exception("Failed to wait for host %s to be on resource state %s" % (hostid, resourcestate))
|
||||
return True
|
||||
|
||||
def prepare_host_for_maintenance(self, hostid):
|
||||
self.logger.debug("Sending Host with id %s to prepareHostForMaintenance" % hostid)
|
||||
cmd = prepareHostForMaintenance.prepareHostForMaintenanceCmd()
|
||||
cmd.id = hostid
|
||||
response = self.apiclient.prepareHostForMaintenance(cmd)
|
||||
self.logger.debug("Host with id %s is in prepareHostForMaintenance" % hostid)
|
||||
self.logger.debug(response)
|
||||
return response
|
||||
|
||||
def cancel_host_maintenance(self, hostid):
|
||||
self.logger.debug("Canceling Host with id %s from maintain" % hostid)
|
||||
cmd = cancelHostMaintenance.cancelHostMaintenanceCmd()
|
||||
cmd.id = hostid
|
||||
res = self.apiclient.cancelHostMaintenance(cmd)
|
||||
self.logger.debug("Host with id %s is cancelling maintenance" % hostid)
|
||||
return res
|
||||
|
||||
def revert_host_state_on_failure(self, hostId):
|
||||
cmd = updateHost.updateHostCmd()
|
||||
cmd.id = hostId
|
||||
cmd.allocationstate = "Enable"
|
||||
response = self.apiclient.updateHost(cmd)
|
||||
self.assertEqual(response.resourcestate, "Enabled")
|
||||
|
||||
|
||||
class TestHostMaintenance(cloudstackTestCase):
|
||||
class TestHostMaintenance(TestHostMaintenanceBase):
|
||||
|
||||
def setUp(self):
|
||||
self.logger = logging.getLogger('TestHM')
|
||||
@ -44,6 +103,8 @@ class TestHostMaintenance(cloudstackTestCase):
|
||||
self.zone = get_zone(self.apiclient, self.testClient.getZoneForTests())
|
||||
self.pod = get_pod(self.apiclient, self.zone.id)
|
||||
self.cleanup = []
|
||||
self.hostConfig = self.config.__dict__["zones"][0].__dict__["pods"][0].__dict__["clusters"][0].__dict__["hosts"][0].__dict__
|
||||
|
||||
|
||||
def tearDown(self):
|
||||
try:
|
||||
@ -55,7 +116,7 @@ class TestHostMaintenance(cloudstackTestCase):
|
||||
|
||||
return
|
||||
|
||||
def createVMs(self, hostId, number):
|
||||
def createVMs(self, hostId, number, offering_key="tiny"):
|
||||
|
||||
self.template = get_template(
|
||||
self.apiclient,
|
||||
@ -70,7 +131,7 @@ class TestHostMaintenance(cloudstackTestCase):
|
||||
|
||||
self.service_offering = ServiceOffering.create(
|
||||
self.apiclient,
|
||||
self.services["service_offerings"]["tiny"]
|
||||
self.services["service_offerings"][offering_key]
|
||||
)
|
||||
self.logger.debug("Using service offering %s " % self.service_offering.id)
|
||||
self.network_offering = NetworkOffering.create(
|
||||
@ -106,7 +167,32 @@ class TestHostMaintenance(cloudstackTestCase):
|
||||
self.cleanup.append(self.network_offering)
|
||||
self.cleanup.append(self.service_offering)
|
||||
return vms
|
||||
|
||||
|
||||
def checkAllVmsRunningOnHost(self, hostId):
|
||||
listVms1 = VirtualMachine.list(
|
||||
self.apiclient,
|
||||
hostid=hostId
|
||||
)
|
||||
|
||||
if (listVms1 is not None):
|
||||
self.logger.debug('Vms found to test all running = {} '.format(len(listVms1)))
|
||||
for vm in listVms1:
|
||||
if (vm.state != "Running"):
|
||||
self.logger.debug('VirtualMachine on Host with id = {} is in {}'.format(vm.id, vm.state))
|
||||
return (False, None)
|
||||
|
||||
response = list_ssvms(
|
||||
self.apiclient,
|
||||
hostid=hostId
|
||||
)
|
||||
if isinstance(response, list):
|
||||
for systemvm in response:
|
||||
if systemvm.state != 'Running':
|
||||
self.logger.debug("Found not running VM {}".format(systemvm.name))
|
||||
return (False, None)
|
||||
|
||||
return (True, None)
|
||||
|
||||
def checkVmMigratingOnHost(self, hostId):
|
||||
vm_migrating=False
|
||||
listVms1 = VirtualMachine.list(
|
||||
@ -118,60 +204,60 @@ class TestHostMaintenance(cloudstackTestCase):
|
||||
self.logger.debug('Vms found = {} '.format(len(listVms1)))
|
||||
for vm in listVms1:
|
||||
if (vm.state == "Migrating"):
|
||||
self.logger.debug('VirtualMachine on Hyp id = {} is in {}'.format(vm.id, vm.state))
|
||||
self.logger.debug('VirtualMachine on Host with id = {} is in {}'.format(vm.id, vm.state))
|
||||
vm_migrating=True
|
||||
break
|
||||
|
||||
return (vm_migrating, None)
|
||||
|
||||
def checkNoVmMigratingOnHost(self, hostId):
|
||||
no_vm_migrating=True
|
||||
def migrationsFinished(self, hostId):
|
||||
migrations_finished=True
|
||||
listVms1 = VirtualMachine.list(
|
||||
self.apiclient,
|
||||
hostid=hostId
|
||||
)
|
||||
|
||||
if (listVms1 is not None):
|
||||
self.logger.debug('Vms found = {} '.format(len(listVms1)))
|
||||
for vm in listVms1:
|
||||
if (vm.state == "Migrating"):
|
||||
self.logger.debug('VirtualMachine on Hyp id = {} is in {}'.format(vm.id, vm.state))
|
||||
no_vm_migrating=False
|
||||
break
|
||||
numVms = len(listVms1)
|
||||
migrations_finished = (numVms == 0)
|
||||
|
||||
return (migrations_finished, None)
|
||||
|
||||
return (no_vm_migrating, None)
|
||||
|
||||
def noOfVMsOnHost(self, hostId):
|
||||
listVms = VirtualMachine.list(
|
||||
self.apiclient,
|
||||
hostid=hostId
|
||||
)
|
||||
no_of_vms=0
|
||||
self.logger.debug("Counting VMs on host " + hostId)
|
||||
if (listVms is not None):
|
||||
for vm in listVms:
|
||||
self.logger.debug('VirtualMachine on Hyp 1 = {}'.format(vm.id))
|
||||
self.logger.debug("VirtualMachine on Host " + hostId + " = " + vm.id)
|
||||
no_of_vms=no_of_vms+1
|
||||
|
||||
self.logger.debug("Found VMs on host " + str(no_of_vms))
|
||||
return no_of_vms
|
||||
|
||||
def hostPrepareAndCancelMaintenance(self, target_host_id, other_host_id, checkVMMigration):
|
||||
|
||||
cmd = prepareHostForMaintenance.prepareHostForMaintenanceCmd()
|
||||
cmd.id = target_host_id
|
||||
response = self.apiclient.prepareHostForMaintenance(cmd)
|
||||
|
||||
self.logger.debug('Host with id {} is in prepareHostForMaintenance'.format(target_host_id))
|
||||
|
||||
vm_migrating = wait_until(1, 10, checkVMMigration, other_host_id)
|
||||
|
||||
cmd = cancelHostMaintenance.cancelHostMaintenanceCmd()
|
||||
cmd.id = target_host_id
|
||||
response = self.apiclient.cancelHostMaintenance(cmd)
|
||||
|
||||
self.logger.debug('Host with id {} is in cancelHostMaintenance'.format(target_host_id) )
|
||||
|
||||
return vm_migrating
|
||||
|
||||
|
||||
def hostPrepareAndCancelMaintenance(self, target_host_id, other_host_id):
|
||||
# Wait for all VMs to complete any pending migrations.
|
||||
if not wait_until(3, 100, self.checkAllVmsRunningOnHost, target_host_id) or \
|
||||
not wait_until(3, 100, self.checkAllVmsRunningOnHost, other_host_id):
|
||||
raise Exception("Failed to wait for all VMs to reach running state to execute test")
|
||||
|
||||
self.prepare_host_for_maintenance(target_host_id)
|
||||
migrations_finished = wait_until(5, 200, self.migrationsFinished, target_host_id)
|
||||
|
||||
self.wait_until_host_is_in_state(target_host_id, "Maintenance", 5, 200)
|
||||
|
||||
vm_count_after_maintenance = self.noOfVMsOnHost(target_host_id)
|
||||
|
||||
self.cancel_host_maintenance(target_host_id)
|
||||
self.wait_until_host_is_in_state(target_host_id, "Enabled", 5, 200)
|
||||
|
||||
if vm_count_after_maintenance != 0:
|
||||
self.fail("Host to put to maintenance still has VMs running")
|
||||
|
||||
return migrations_finished
|
||||
|
||||
@attr(
|
||||
tags=[
|
||||
"advanced",
|
||||
@ -182,42 +268,45 @@ class TestHostMaintenance(cloudstackTestCase):
|
||||
"sg"],
|
||||
required_hardware="true")
|
||||
def test_01_cancel_host_maintenace_with_no_migration_jobs(self):
|
||||
"""
|
||||
Tests if putting a host with no migrations (0 VMs) work back and forth
|
||||
|
||||
1) Verify if there are at least 2 hosts in enabled state.
|
||||
2) Put the host into maintenance verify success
|
||||
3) Put the other host into maintenance, verify success
|
||||
"""
|
||||
listHost = Host.list(
|
||||
self.apiclient,
|
||||
type='Routing',
|
||||
zoneid=self.zone.id,
|
||||
podid=self.pod.id,
|
||||
hypervisor=self.hypervisor,
|
||||
resourcestate='Enabled',
|
||||
state='Up'
|
||||
)
|
||||
for host in listHost:
|
||||
self.logger.debug('1 Hypervisor = {}'.format(host.id))
|
||||
|
||||
|
||||
if (len(listHost) < 2):
|
||||
raise unittest.SkipTest("Cancel host maintenance when VMs are migrating should be tested for 2 or more hosts");
|
||||
return
|
||||
self.logger.debug('Found Host = {}'.format(host.id))
|
||||
|
||||
|
||||
if (len(listHost) < 2):
|
||||
raise unittest.SkipTest("Canceling tests for host maintenance as we need 2 or more hosts up and enabled")
|
||||
|
||||
vm_migrating=False
|
||||
|
||||
try:
|
||||
|
||||
vm_migrating = self.hostPrepareAndCancelMaintenance(listHost[0].id, listHost[1].id, self.checkNoVmMigratingOnHost)
|
||||
|
||||
vm_migrating = self.hostPrepareAndCancelMaintenance(listHost[1].id, listHost[0].id, self.checkNoVmMigratingOnHost)
|
||||
|
||||
migrations_finished = self.hostPrepareAndCancelMaintenance(listHost[0].id, listHost[1].id)
|
||||
|
||||
if migrations_finished:
|
||||
self.hostPrepareAndCancelMaintenance(listHost[1].id, listHost[0].id)
|
||||
else:
|
||||
raise unittest.SkipTest("VMs are still migrating so reverse migration /maintenace skipped")
|
||||
|
||||
except Exception as e:
|
||||
self.revert_host_state_on_failure(listHost[0].id)
|
||||
self.revert_host_state_on_failure(listHost[1].id)
|
||||
self.logger.debug("Exception {}".format(e))
|
||||
self.fail("Cancel host maintenance failed {}".format(e[0]))
|
||||
|
||||
|
||||
if (vm_migrating == True):
|
||||
raise unittest.SkipTest("VMs are migrating and the test will not be able to check the conditions the test is intended for");
|
||||
|
||||
|
||||
return
|
||||
self.fail("Host maintenance test failed {}".format(e[0]))
|
||||
|
||||
|
||||
|
||||
|
||||
@attr(
|
||||
tags=[
|
||||
"advanced",
|
||||
@ -228,53 +317,125 @@ class TestHostMaintenance(cloudstackTestCase):
|
||||
"sg"],
|
||||
required_hardware="true")
|
||||
def test_02_cancel_host_maintenace_with_migration_jobs(self):
|
||||
|
||||
"""
|
||||
Tests if putting a host with migrations (3 VMs) work back and forth
|
||||
|
||||
1) Verify if there are at least 2 hosts in enabled state.
|
||||
2) Deploy VMs if needed
|
||||
3) Put the host into maintenance verify success -ensure existing host has zero running VMs
|
||||
4) Put the other host into maintenance, verify success just as step 3
|
||||
"""
|
||||
listHost = Host.list(
|
||||
self.apiclient,
|
||||
type='Routing',
|
||||
zoneid=self.zone.id,
|
||||
podid=self.pod.id,
|
||||
hypervisor=self.hypervisor,
|
||||
resourcestate='Enabled',
|
||||
state='Up'
|
||||
)
|
||||
for host in listHost:
|
||||
self.logger.debug('2 Hypervisor = {}'.format(host.id))
|
||||
|
||||
if (len(listHost) != 2):
|
||||
raise unittest.SkipTest("Cancel host maintenance when VMs are migrating can only be tested with 2 hosts");
|
||||
return
|
||||
self.logger.debug('Found Host = {}'.format(host.id))
|
||||
|
||||
if (len(listHost) < 2):
|
||||
raise unittest.SkipTest("Canceling tests for host maintenance as we need 2 or more hosts up and enabled")
|
||||
|
||||
|
||||
no_of_vms = self.noOfVMsOnHost(listHost[0].id)
|
||||
|
||||
|
||||
no_of_vms = no_of_vms + self.noOfVMsOnHost(listHost[1].id)
|
||||
|
||||
if no_of_vms < 5:
|
||||
|
||||
if no_of_vms < MIN_VMS_FOR_TEST:
|
||||
self.logger.debug("Create VMs as there are not enough vms to check host maintenance")
|
||||
no_vm_req = 5 - no_of_vms
|
||||
no_vm_req = MIN_VMS_FOR_TEST - no_of_vms
|
||||
if (no_vm_req > 0):
|
||||
self.logger.debug("Creating vms = {}".format(no_vm_req))
|
||||
self.vmlist = self.createVMs(listHost[0].id, no_vm_req)
|
||||
|
||||
vm_migrating=False
|
||||
|
||||
|
||||
try:
|
||||
|
||||
vm_migrating = self.hostPrepareAndCancelMaintenance(listHost[0].id, listHost[1].id, self.checkVmMigratingOnHost)
|
||||
|
||||
vm_migrating = self.hostPrepareAndCancelMaintenance(listHost[1].id, listHost[0].id, self.checkVmMigratingOnHost)
|
||||
|
||||
migrations_finished = self.hostPrepareAndCancelMaintenance(listHost[0].id, listHost[1].id)
|
||||
|
||||
if migrations_finished:
|
||||
self.hostPrepareAndCancelMaintenance(listHost[1].id, listHost[0].id)
|
||||
else:
|
||||
raise unittest.SkipTest("VMs are still migrating so reverse migration /maintenace skipped")
|
||||
|
||||
except Exception as e:
|
||||
self.revert_host_state_on_failure(listHost[0].id)
|
||||
self.revert_host_state_on_failure(listHost[1].id)
|
||||
self.logger.debug("Exception {}".format(e))
|
||||
self.fail("Cancel host maintenance failed {}".format(e[0]))
|
||||
|
||||
self.fail("Host maintenance test failed {}".format(e[0]))
|
||||
|
||||
if (vm_migrating == False):
|
||||
raise unittest.SkipTest("No VM is migrating and the test will not be able to check the conditions the test is intended for");
|
||||
|
||||
|
||||
return
|
||||
@attr(
|
||||
tags=[
|
||||
"advanced",
|
||||
"advancedns",
|
||||
"smoke",
|
||||
"basic",
|
||||
"eip",
|
||||
"sg"],
|
||||
required_hardware="true")
|
||||
def test_03_cancel_host_maintenace_with_migration_jobs_failure(self):
|
||||
"""
|
||||
Tests if putting a host with impossible migrations (2 VMs) work pushes to ErrorInMaintenance state
|
||||
|
||||
1) Verify if there are at least 2 hosts in enabled state.
|
||||
2) Tag the host and deploy tagged VMs which cannot be migrated to other host without tags
|
||||
3) Put the host into maintenance verify it fails with it reaching ErrorInMaintenance
|
||||
"""
|
||||
listHost = Host.list(
|
||||
self.apiclient,
|
||||
type='Routing',
|
||||
zoneid=self.zone.id,
|
||||
podid=self.pod.id,
|
||||
hypervisor=self.hypervisor,
|
||||
resourcestate='Enabled',
|
||||
state='Up'
|
||||
)
|
||||
|
||||
for host in listHost:
|
||||
self.logger.debug('Found Host = {}'.format(host.id))
|
||||
|
||||
if (len(listHost) < 2):
|
||||
raise unittest.SkipTest("Canceling tests for host maintenance as we need 2 or more hosts up and enabled")
|
||||
|
||||
target_host_id = listHost[0].id
|
||||
|
||||
try:
|
||||
Host.update(self.apiclient,
|
||||
id=target_host_id,
|
||||
hosttags=self.services["service_offerings"]["taggedsmall"]["hosttags"])
|
||||
|
||||
no_of_vms = self.noOfVMsOnHost(target_host_id)
|
||||
|
||||
# Need only 2 VMs for this case.
|
||||
if no_of_vms < 2:
|
||||
self.logger.debug("Create VMs as there are not enough vms to check host maintenance")
|
||||
no_vm_req = 2 - no_of_vms
|
||||
if (no_vm_req > 0):
|
||||
self.logger.debug("Creating vms = {}".format(no_vm_req))
|
||||
self.vmlist = self.createVMs(listHost[0].id, no_vm_req, "taggedsmall")
|
||||
|
||||
# Attempt putting host in maintenance and check if ErrorInMaintenance state is reached
|
||||
self.prepare_host_for_maintenance(target_host_id)
|
||||
error_in_maintenance_reached = self.wait_until_host_is_in_state(target_host_id, "ErrorInMaintenance", 5, 300)
|
||||
|
||||
self.cancel_host_maintenance(target_host_id)
|
||||
self.wait_until_host_is_in_state(target_host_id, "Enabled", 5, 200)
|
||||
|
||||
Host.update(self.apiclient, id=target_host_id, hosttags="")
|
||||
|
||||
if not error_in_maintenance_reached:
|
||||
self.fail("Error in maintenance state should have reached after ports block")
|
||||
|
||||
except Exception as e:
|
||||
self.revert_host_state_on_failure(listHost[0].id)
|
||||
self.revert_host_state_on_failure(listHost[1].id)
|
||||
Host.update(self.apiclient, id=target_host_id, hosttags="")
|
||||
self.logger.debug("Exception {}".format(e))
|
||||
self.fail("Host maintenance test failed {}".format(e[0]))
|
||||
|
||||
|
||||
class TestHostMaintenanceAgents(cloudstackTestCase):
|
||||
class TestHostMaintenanceAgents(TestHostMaintenanceBase):
|
||||
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
@ -371,29 +532,6 @@ class TestHostMaintenanceAgents(cloudstackTestCase):
|
||||
value = "true" if on else "false"
|
||||
cls.updateConfiguration('kvm.ssh.to.agent', value)
|
||||
|
||||
def prepare_host_for_maintenance(self, hostid):
|
||||
cmd = prepareHostForMaintenance.prepareHostForMaintenanceCmd()
|
||||
cmd.id = hostid
|
||||
self.apiclient.prepareHostForMaintenance(cmd)
|
||||
self.logger.debug('Host with id %s is in prepareHostForMaintenance' % hostid)
|
||||
|
||||
def wait_until_host_is_in_state(self, hostid, resourcestate, interval=3, retries=20):
|
||||
def check_resource_state():
|
||||
response = Host.list(
|
||||
self.apiclient,
|
||||
id=hostid
|
||||
)
|
||||
if isinstance(response, list):
|
||||
if response[0].resourcestate == resourcestate:
|
||||
self.logger.debug('Host with id %s is in resource state = %s' % (hostid, resourcestate))
|
||||
return True, None
|
||||
return False, None
|
||||
|
||||
done, _ = wait_until(interval, retries, check_resource_state)
|
||||
if not done:
|
||||
raise Exception("Failed to wait for host %s to be on resource state %s" % (hostid, resourcestate))
|
||||
return True
|
||||
|
||||
def wait_until_agent_is_in_state(self, hostid, state, interval=3, retries=20):
|
||||
def check_agent_state():
|
||||
response = Host.list(
|
||||
@ -411,12 +549,6 @@ class TestHostMaintenanceAgents(cloudstackTestCase):
|
||||
raise Exception("Failed to wait for host agent %s to be on state %s" % (hostid, state))
|
||||
return True
|
||||
|
||||
def cancel_host_maintenance(self, hostid):
|
||||
cmd = cancelHostMaintenance.cancelHostMaintenanceCmd()
|
||||
cmd.id = hostid
|
||||
self.apiclient.cancelHostMaintenance(cmd)
|
||||
self.logger.debug('Host with id %s is cancelling maintenance' % hostid)
|
||||
|
||||
def get_enabled_host_connected_agent(self):
|
||||
hosts = Host.list(
|
||||
self.apiclient,
|
||||
@ -428,7 +560,7 @@ class TestHostMaintenanceAgents(cloudstackTestCase):
|
||||
state='Up'
|
||||
)
|
||||
if len(hosts) < 2:
|
||||
raise unittest.SkipTest("Cancel host maintenance must be tested for 2 or more hosts")
|
||||
raise unittest.SkipTest("Host maintenance tests must be tested for 2 or more hosts")
|
||||
return hosts[0]
|
||||
|
||||
def deploy_vm_on_host(self, hostid):
|
||||
@ -451,13 +583,6 @@ class TestHostMaintenanceAgents(cloudstackTestCase):
|
||||
)
|
||||
self.cleanup.append(vm)
|
||||
|
||||
def revert_host_state_on_failure(self, host):
|
||||
cmd = updateHost.updateHostCmd()
|
||||
cmd.id = host.id
|
||||
cmd.allocationstate = "Enable"
|
||||
response = self.apiclient.updateHost(cmd)
|
||||
self.assertEqual(response.resourcestate, "Enabled")
|
||||
|
||||
@skipTestIf("hypervisorNotSupported")
|
||||
@attr(tags=["advanced", "advancedns", "smoke", "basic", "eip", "sg"], required_hardware="true")
|
||||
def test_01_cancel_host_maintenance_ssh_enabled_agent_connected(self):
|
||||
@ -480,22 +605,9 @@ class TestHostMaintenanceAgents(cloudstackTestCase):
|
||||
self.wait_until_host_is_in_state(self.host.id, "Enabled")
|
||||
self.assert_host_is_functional_after_cancelling_maintenance(self.host.id)
|
||||
except Exception as e:
|
||||
self.revert_host_state_on_failure(self.host)
|
||||
self.revert_host_state_on_failure(self.host.id)
|
||||
self.fail(e)
|
||||
|
||||
def get_ssh_client(self, ip, username, password, retries=10):
|
||||
""" Setup ssh client connection and return connection """
|
||||
|
||||
try:
|
||||
ssh_client = SshClient(ip, 22, username, password, retries)
|
||||
except Exception as e:
|
||||
raise unittest.SkipTest("Unable to create ssh connection: " % e)
|
||||
|
||||
self.assertIsNotNone(
|
||||
ssh_client, "Failed to setup ssh connection to ip=%s" % ip)
|
||||
|
||||
return ssh_client
|
||||
|
||||
@skipTestIf("hypervisorNotSupported")
|
||||
@attr(tags=["boris", "advancedns", "smoke", "basic", "eip", "sg"], required_hardware="true")
|
||||
def test_02_cancel_host_maintenance_ssh_enabled_agent_disconnected(self):
|
||||
@ -529,7 +641,7 @@ class TestHostMaintenanceAgents(cloudstackTestCase):
|
||||
|
||||
self.assert_host_is_functional_after_cancelling_maintenance(self.host.id)
|
||||
except Exception as e:
|
||||
self.revert_host_state_on_failure(self.host)
|
||||
self.revert_host_state_on_failure(self.host.id)
|
||||
self.fail(e)
|
||||
|
||||
@skipTestIf("hypervisorNotSupported")
|
||||
@ -554,7 +666,7 @@ class TestHostMaintenanceAgents(cloudstackTestCase):
|
||||
self.wait_until_host_is_in_state(self.host.id, "Enabled")
|
||||
self.assert_host_is_functional_after_cancelling_maintenance(self.host.id)
|
||||
except Exception as e:
|
||||
self.revert_host_state_on_failure(self.host)
|
||||
self.revert_host_state_on_failure(self.host.id)
|
||||
self.fail(e)
|
||||
|
||||
@skipTestIf("hypervisorNotSupported")
|
||||
@ -585,7 +697,7 @@ class TestHostMaintenanceAgents(cloudstackTestCase):
|
||||
ssh_client.execute("service cloudstack-agent stop")
|
||||
self.wait_until_agent_is_in_state(self.host.id, "Disconnected")
|
||||
except Exception as e:
|
||||
self.revert_host_state_on_failure(self.host)
|
||||
self.revert_host_state_on_failure(self.host.id)
|
||||
self.fail(e)
|
||||
|
||||
self.assertRaises(Exception, self.cancel_host_maintenance, self.host.id)
|
||||
@ -600,5 +712,5 @@ class TestHostMaintenanceAgents(cloudstackTestCase):
|
||||
self.wait_until_host_is_in_state(self.host.id, "Enabled")
|
||||
self.assert_host_is_functional_after_cancelling_maintenance(self.host.id)
|
||||
except Exception as e:
|
||||
self.revert_host_state_on_failure(self.host)
|
||||
self.revert_host_state_on_failure(self.host.id)
|
||||
self.fail(e)
|
||||
|
||||
@ -679,6 +679,7 @@
|
||||
'Down': 'off',
|
||||
'Removed': 'off',
|
||||
'ErrorInMaintenance': 'off',
|
||||
'ErrorInPrepareForMaintenance': 'warning',
|
||||
'PrepareForMaintenance': 'warning',
|
||||
'CancelMaintenance': 'warning',
|
||||
'Maintenance': 'warning',
|
||||
|
||||
@ -17162,7 +17162,8 @@
|
||||
title: 'label.outofbandmanagement.action.issue',
|
||||
desc: function(args) {
|
||||
var host = args.context.hosts[0];
|
||||
if (host.resourcestate == 'Maintenance' || host.resourcestate == 'PrepareForMaintenance' || host.resourcestate == 'ErrorInMaintenance') {
|
||||
if (host.resourcestate == 'Maintenance' || host.resourcestate == 'PrepareForMaintenance' ||
|
||||
host.resourcestate == 'ErrorInPrepareForMaintenance' || host.resourcestate == 'ErrorInMaintenance') {
|
||||
return _l('message.outofbandmanagement.action.maintenance');
|
||||
}
|
||||
},
|
||||
@ -17776,6 +17777,7 @@
|
||||
'Down': 'off',
|
||||
'Removed': 'off',
|
||||
'ErrorInMaintenance': 'off',
|
||||
'ErrorInPrepareForMaintenance': 'warning',
|
||||
'PrepareForMaintenance': 'warning',
|
||||
'CancelMaintenance': 'warning',
|
||||
'Maintenance': 'warning',
|
||||
@ -21975,7 +21977,7 @@
|
||||
allowedActions.push("edit");
|
||||
allowedActions.push("enableMaintenanceMode");
|
||||
allowedActions.push("cancelMaintenanceMode");
|
||||
} else if (jsonObj.resourcestate == "PrepareForMaintenance") {
|
||||
} else if (jsonObj.resourcestate == "PrepareForMaintenance" || jsonObj.resourcestate == 'ErrorInPrepareForMaintenance') {
|
||||
allowedActions.push("edit");
|
||||
allowedActions.push("cancelMaintenanceMode");
|
||||
} else if (jsonObj.resourcestate == "Maintenance") {
|
||||
@ -22029,7 +22031,7 @@
|
||||
} else if (jsonObj.state == "ErrorInMaintenance") {
|
||||
allowedActions.push("enableMaintenanceMode");
|
||||
allowedActions.push("cancelMaintenanceMode");
|
||||
} else if (jsonObj.state == "PrepareForMaintenance") {
|
||||
} else if (jsonObj.state == "PrepareForMaintenance" || jsonObj.resourcestate == "ErrorInPrepareForMaintenance") {
|
||||
allowedActions.push("cancelMaintenanceMode");
|
||||
} else if (jsonObj.state == "Maintenance") {
|
||||
allowedActions.push("cancelMaintenanceMode");
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user