mirror of
https://github.com/apache/cloudstack.git
synced 2025-10-26 08:42:29 +01:00
server: Prevent corner case for infinite PrepareForMaintenance (#3095)
A corner case was found on 4.11.2 for #2493, leading to an infinite loop in the PrepareForMaintenance state. To prevent such cases, in which failed migrations are detected but still running on the host, this feature adds a new cluster setting, host.maintenance.retries, which is the number of retries before marking the host as ErrorInMaintenance if migration errors persist. How Has This Been Tested? - 2 KVM hosts, pick one which has running VMs as H - Block migration ports on H to simulate failures on migrations: iptables -I OUTPUT -j REJECT -m state --state NEW -m tcp -p tcp --dport 49152:49215 -m comment --comment 'test block migrations' iptables -I OUTPUT -j REJECT -m state --state NEW -m tcp -p tcp --dport 16509 -m comment --comment 'test block migrations' - Put host H in Maintenance - Observe that the host stays indefinitely in the PrepareForMaintenance state (after this fix it goes into ErrorInMaintenance after retrying host.maintenance.retries times)
This commit is contained in:
parent
68b4b84101
commit
13c81a8ee4
@ -38,12 +38,20 @@ import com.cloud.host.Status;
|
|||||||
import com.cloud.hypervisor.Hypervisor.HypervisorType;
|
import com.cloud.hypervisor.Hypervisor.HypervisorType;
|
||||||
import com.cloud.resource.ResourceState.Event;
|
import com.cloud.resource.ResourceState.Event;
|
||||||
import com.cloud.utils.fsm.NoTransitionException;
|
import com.cloud.utils.fsm.NoTransitionException;
|
||||||
|
import org.apache.cloudstack.framework.config.ConfigKey;
|
||||||
|
import org.apache.cloudstack.framework.config.Configurable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* ResourceManager manages how physical resources are organized within the
|
* ResourceManager manages how physical resources are organized within the
|
||||||
* CloudStack. It also manages the life cycle of the physical resources.
|
* CloudStack. It also manages the life cycle of the physical resources.
|
||||||
*/
|
*/
|
||||||
public interface ResourceManager extends ResourceService {
|
public interface ResourceManager extends ResourceService, Configurable {
|
||||||
|
|
||||||
|
ConfigKey<Integer> HostMaintenanceRetries = new ConfigKey<>("Advanced", Integer.class,
|
||||||
|
"host.maintenance.retries","20",
|
||||||
|
"Number of retries when preparing a host into Maintenance Mode is faulty before failing",
|
||||||
|
true, ConfigKey.Scope.Cluster);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Register a listener for different types of resource life cycle events.
|
* Register a listener for different types of resource life cycle events.
|
||||||
* There can only be one type of listener per type of host.
|
* There can only be one type of listener per type of host.
|
||||||
|
|||||||
@ -26,11 +26,13 @@ import java.util.Iterator;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
import javax.inject.Inject;
|
import javax.inject.Inject;
|
||||||
import javax.naming.ConfigurationException;
|
import javax.naming.ConfigurationException;
|
||||||
|
|
||||||
import com.cloud.vm.dao.UserVmDetailsDao;
|
import com.cloud.vm.dao.UserVmDetailsDao;
|
||||||
|
import org.apache.cloudstack.framework.config.ConfigKey;
|
||||||
import org.apache.commons.lang.ObjectUtils;
|
import org.apache.commons.lang.ObjectUtils;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
@ -271,6 +273,8 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
|
|||||||
|
|
||||||
private SearchBuilder<HostGpuGroupsVO> _gpuAvailability;
|
private SearchBuilder<HostGpuGroupsVO> _gpuAvailability;
|
||||||
|
|
||||||
|
private Map<Long,Integer> retryHostMaintenance = new ConcurrentHashMap<>();
|
||||||
|
|
||||||
private void insertListener(final Integer event, final ResourceListener listener) {
|
private void insertListener(final Integer event, final ResourceListener listener) {
|
||||||
List<ResourceListener> lst = _lifeCycleListeners.get(event);
|
List<ResourceListener> lst = _lifeCycleListeners.get(event);
|
||||||
if (lst == null) {
|
if (lst == null) {
|
||||||
@ -1224,6 +1228,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
|
|||||||
|
|
||||||
ActionEventUtils.onStartedActionEvent(CallContext.current().getCallingUserId(), CallContext.current().getCallingAccountId(), EventTypes.EVENT_MAINTENANCE_PREPARE, "starting maintenance for host " + hostId, true, 0);
|
ActionEventUtils.onStartedActionEvent(CallContext.current().getCallingUserId(), CallContext.current().getCallingAccountId(), EventTypes.EVENT_MAINTENANCE_PREPARE, "starting maintenance for host " + hostId, true, 0);
|
||||||
_agentMgr.pullAgentToMaintenance(hostId);
|
_agentMgr.pullAgentToMaintenance(hostId);
|
||||||
|
setHostMaintenanceRetries(host);
|
||||||
|
|
||||||
/* TODO: move below to listener */
|
/* TODO: move below to listener */
|
||||||
if (host.getType() == Host.Type.Routing) {
|
if (host.getType() == Host.Type.Routing) {
|
||||||
@ -1251,6 +1256,16 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set retries for transiting the host into Maintenance
|
||||||
|
*/
|
||||||
|
protected void setHostMaintenanceRetries(HostVO host) {
|
||||||
|
Integer retries = HostMaintenanceRetries.valueIn(host.getClusterId());
|
||||||
|
retryHostMaintenance.put(host.getId(), retries);
|
||||||
|
s_logger.debug(String.format("Setting the host %s (%s) retries for Maintenance mode: %s",
|
||||||
|
host.getId(), host.getName(), retries));
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean maintain(final long hostId) throws AgentUnavailableException {
|
public boolean maintain(final long hostId) throws AgentUnavailableException {
|
||||||
final Boolean result = propagateResourceEvent(hostId, ResourceState.Event.AdminAskMaintenace);
|
final Boolean result = propagateResourceEvent(hostId, ResourceState.Event.AdminAskMaintenace);
|
||||||
@ -1350,7 +1365,23 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
|
|||||||
return CollectionUtils.isEmpty(failedMigrations) ?
|
return CollectionUtils.isEmpty(failedMigrations) ?
|
||||||
setHostIntoMaintenance(host) :
|
setHostIntoMaintenance(host) :
|
||||||
setHostIntoErrorInMaintenance(host, failedMigrations);
|
setHostIntoErrorInMaintenance(host, failedMigrations);
|
||||||
|
} else if (retryHostMaintenance.containsKey(host.getId())) {
|
||||||
|
Integer retriesLeft = retryHostMaintenance.get(host.getId());
|
||||||
|
if (retriesLeft != null) {
|
||||||
|
if (retriesLeft <= 0) {
|
||||||
|
retryHostMaintenance.remove(host.getId());
|
||||||
|
s_logger.debug(String.format("No retries left while preparing KVM host %s (%s) for Maintenance, " +
|
||||||
|
"please investigate this connection.",
|
||||||
|
host.getId(), host.getName()));
|
||||||
|
return setHostIntoErrorInMaintenance(host, failedMigrations);
|
||||||
}
|
}
|
||||||
|
retriesLeft--;
|
||||||
|
retryHostMaintenance.put(host.getId(), retriesLeft);
|
||||||
|
s_logger.debug(String.format("Retries left preparing KVM host %s (%s) for Maintenance: %s",
|
||||||
|
host.getId(), host.getName(), retriesLeft));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2316,6 +2347,7 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
|
|||||||
try {
|
try {
|
||||||
resourceStateTransitTo(host, ResourceState.Event.AdminCancelMaintenance, _nodeId);
|
resourceStateTransitTo(host, ResourceState.Event.AdminCancelMaintenance, _nodeId);
|
||||||
_agentMgr.pullAgentOutMaintenance(hostId);
|
_agentMgr.pullAgentOutMaintenance(hostId);
|
||||||
|
retryHostMaintenance.remove(hostId);
|
||||||
|
|
||||||
// for kvm, need to log into kvm host, restart cloudstack-agent
|
// for kvm, need to log into kvm host, restart cloudstack-agent
|
||||||
if ((host.getHypervisorType() == HypervisorType.KVM && !vms_migrating) || host.getHypervisorType() == HypervisorType.LXC) {
|
if ((host.getHypervisorType() == HypervisorType.KVM && !vms_migrating) || host.getHypervisorType() == HypervisorType.LXC) {
|
||||||
@ -2924,4 +2956,13 @@ public class ResourceManagerImpl extends ManagerBase implements ResourceManager,
|
|||||||
return super.start();
|
return super.start();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getConfigComponentName() {
|
||||||
|
return ResourceManagerImpl.class.getSimpleName();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ConfigKey<?>[] getConfigKeys() {
|
||||||
|
return new ConfigKey<?>[] {HostMaintenanceRetries};
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -56,6 +56,7 @@ import com.cloud.org.Cluster;
|
|||||||
import com.cloud.resource.ResourceState.Event;
|
import com.cloud.resource.ResourceState.Event;
|
||||||
import com.cloud.utils.component.ManagerBase;
|
import com.cloud.utils.component.ManagerBase;
|
||||||
import com.cloud.utils.fsm.NoTransitionException;
|
import com.cloud.utils.fsm.NoTransitionException;
|
||||||
|
import org.apache.cloudstack.framework.config.ConfigKey;
|
||||||
|
|
||||||
public class MockResourceManagerImpl extends ManagerBase implements ResourceManager {
|
public class MockResourceManagerImpl extends ManagerBase implements ResourceManager {
|
||||||
|
|
||||||
@ -625,4 +626,14 @@ public class MockResourceManagerImpl extends ManagerBase implements ResourceMana
|
|||||||
// TODO Auto-generated method stub
|
// TODO Auto-generated method stub
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String getConfigComponentName() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ConfigKey<?>[] getConfigKeys() {
|
||||||
|
return new ConfigKey[0];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -118,6 +118,7 @@ public class ResourceManagerImplTest {
|
|||||||
when(host.getId()).thenReturn(hostId);
|
when(host.getId()).thenReturn(hostId);
|
||||||
when(host.getResourceState()).thenReturn(ResourceState.Enabled);
|
when(host.getResourceState()).thenReturn(ResourceState.Enabled);
|
||||||
when(host.getHypervisorType()).thenReturn(Hypervisor.HypervisorType.VMware);
|
when(host.getHypervisorType()).thenReturn(Hypervisor.HypervisorType.VMware);
|
||||||
|
when(host.getClusterId()).thenReturn(1L);
|
||||||
when(hostDao.findById(hostId)).thenReturn(host);
|
when(hostDao.findById(hostId)).thenReturn(host);
|
||||||
when(vm1.getId()).thenReturn(vm1Id);
|
when(vm1.getId()).thenReturn(vm1Id);
|
||||||
when(vm2.getId()).thenReturn(vm2Id);
|
when(vm2.getId()).thenReturn(vm2Id);
|
||||||
@ -188,4 +189,21 @@ public class ResourceManagerImplTest {
|
|||||||
verify(userVmDetailsDao).addDetail(eq(vm2Id), eq("kvm.vnc.port"), eq(String.valueOf(vm2VncPort)), anyBoolean());
|
verify(userVmDetailsDao).addDetail(eq(vm2Id), eq("kvm.vnc.port"), eq(String.valueOf(vm2VncPort)), anyBoolean());
|
||||||
verify(agentManager).pullAgentToMaintenance(hostId);
|
verify(agentManager).pullAgentToMaintenance(hostId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCheckAndMaintainErrorInMaintenanceRetries() throws NoTransitionException {
|
||||||
|
resourceManager.setHostMaintenanceRetries(host);
|
||||||
|
|
||||||
|
List<VMInstanceVO> failedMigrations = Arrays.asList(vm1, vm2);
|
||||||
|
when(vmInstanceDao.listByHostId(host.getId())).thenReturn(failedMigrations);
|
||||||
|
when(vmInstanceDao.listNonMigratingVmsByHostEqualsLastHost(host.getId())).thenReturn(failedMigrations);
|
||||||
|
|
||||||
|
Integer retries = ResourceManager.HostMaintenanceRetries.valueIn(host.getClusterId());
|
||||||
|
for (int i = 0; i <= retries; i++) {
|
||||||
|
resourceManager.checkAndMaintain(host.getId());
|
||||||
|
}
|
||||||
|
|
||||||
|
verify(resourceManager, times(retries + 1)).isHostInMaintenance(host, failedMigrations, new ArrayList<>(), failedMigrations);
|
||||||
|
verify(resourceManager).setHostIntoErrorInMaintenance(host, failedMigrations);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user