Merge pull request #760 from koushik-das/CLOUDSTACK-8785

CLOUDSTACK-8785: Proper enforcement of retry count (max.retries) for all work types handled by the HighAvailability manager
The retry count is now properly enforced for all work types in the HA manager. Some of the code has also been reorganized to make it easier to test.

Signed-off-by: Koushik Das <koushik@apache.org>
Koushik Das committed 2015-09-01 17:58:45 +05:30
commit f732c7d1e9
2 changed files with 103 additions and 67 deletions
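Before the diffs, a minimal sketch of the behaviour this commit establishes: every HA work type now flows through a single processWork() path that reschedules unfinished work via rescheduleWork() and marks it Done once the retry counter reaches max.retries, instead of the limit only being checked on the restart path. The class, field and constant names below are simplified stand-ins for illustration, not the actual CloudStack types; only the control flow mirrors the patch.

public class RetryEnforcementSketch {

    enum Step { Scheduled, Done }

    // Simplified stand-in for HaWorkVO.
    static class WorkItem {
        Step step = Step.Scheduled;
        int timesTried;
        long timeToTry;   // stored as milliseconds >> 10, i.e. roughly seconds
    }

    static final int MAX_RETRIES = 5;        // plays the role of the max.retries setting
    static final long RETRY_INTERVAL = 600;  // plays the role of the *.retry.interval settings

    // Mirrors rescheduleWork(): push the item into the future and count the attempt.
    static void reschedule(WorkItem work, long nextTime) {
        work.timeToTry = nextTime;
        work.timesTried++;
    }

    // Mirrors processWork(): a null nextTime means the work finished; otherwise it is
    // rescheduled, and the retry cap is applied uniformly at the end, on every path.
    static void process(WorkItem work, Long nextTime) {
        if (nextTime == null) {
            work.step = Step.Done;
        } else {
            reschedule(work, nextTime);
        }
        if (work.step != Step.Done && work.timesTried >= MAX_RETRIES) {
            work.step = Step.Done;   // give up after retrying the maximum number of times
        }
    }

    public static void main(String[] args) {
        WorkItem work = new WorkItem();
        long now = System.currentTimeMillis() >> 10;   // same coarse time encoding as the patch
        while (work.step != Step.Done) {
            process(work, now + RETRY_INTERVAL);        // pretend every attempt fails
        }
        System.out.println("tries=" + work.timesTried + ", step=" + work.step);  // tries=5, step=Done
    }
}

Driving the sketch with a work item that never succeeds stops after exactly MAX_RETRIES attempts, which is what the new unit tests in the second file assert: a work item already tried 5 times (the configured limit) ends up Done, while one tried 3 times stays Scheduled.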

HighAvailabilityManagerImpl.java

@@ -383,10 +383,10 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
         }
         List<HaWorkVO> items = _haDao.findPreviousHA(vm.getId());
-        int maxRetries = 0;
+        int timesTried = 0;
         for (HaWorkVO item : items) {
-            if (maxRetries < item.getTimesTried() && !item.canScheduleNew(_timeBetweenFailures)) {
-                maxRetries = item.getTimesTried();
+            if (timesTried < item.getTimesTried() && !item.canScheduleNew(_timeBetweenFailures)) {
+                timesTried = item.getTimesTried();
                 break;
             }
         }
@@ -396,7 +396,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
         }
         HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.HA, investigate ? Step.Investigating : Step.Scheduled,
-            hostId != null ? hostId : 0L, vm.getState(), maxRetries + 1, vm.getUpdated());
+            hostId != null ? hostId : 0L, vm.getState(), timesTried, vm.getUpdated());
         _haDao.persist(work);
         if (s_logger.isInfoEnabled()) {
@@ -407,7 +407,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
     }
-    protected Long restart(HaWorkVO work) {
+    protected Long restart(final HaWorkVO work) {
         List<HaWorkVO> items = _haDao.listFutureHaWorkForVm(work.getInstanceId(), work.getId());
         if (items.size() > 0) {
             StringBuilder str = new StringBuilder("Cancelling this work item because newer ones have been scheduled. Work Ids = [");
@@ -571,11 +571,6 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
             return null;
         }
-        if (work.getTimesTried() > _maxRetries) {
-            s_logger.warn("Retried to max times so deleting: " + vmId);
-            return null;
-        }
         try {
             HashMap<VirtualMachineProfile.Param, Object> params = new HashMap<VirtualMachineProfile.Param, Object>();
             if (_haTag != null) {
@@ -663,7 +658,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
         _haDao.delete(vm.getId(), WorkType.Destroy);
     }
-    protected Long destroyVM(HaWorkVO work) {
+    protected Long destroyVM(final HaWorkVO work) {
         final VirtualMachine vm = _itMgr.findById(work.getInstanceId());
         s_logger.info("Destroying " + vm.toString());
         try {
@@ -690,7 +685,6 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
             s_logger.debug("concurrent operation: " + e.getMessage());
         }
-        work.setTimesTried(work.getTimesTried() + 1);
         return (System.currentTimeMillis() >> 10) + _stopRetryInterval;
     }
@@ -738,10 +732,6 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
             s_logger.debug("operation timed out: " + e.getMessage());
         }
-        work.setTimesTried(work.getTimesTried() + 1);
-        if (s_logger.isDebugEnabled()) {
-            s_logger.debug("Stop was unsuccessful. Rescheduling");
-        }
         return (System.currentTimeMillis() >> 10) + _stopRetryInterval;
     }
@@ -765,6 +755,56 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
         return vms;
     }

+    private void rescheduleWork(final HaWorkVO work, final long nextTime) {
+        s_logger.info("Rescheduling work " + work + " to try again at " + new Date(nextTime << 10));
+        work.setTimeToTry(nextTime);
+        work.setTimesTried(work.getTimesTried() + 1);
+        work.setServerId(null);
+        work.setDateTaken(null);
+    }
+
+    private void processWork(final HaWorkVO work) {
+        try {
+            final WorkType wt = work.getWorkType();
+            Long nextTime = null;
+            if (wt == WorkType.Migration) {
+                nextTime = migrate(work);
+            } else if (wt == WorkType.HA) {
+                nextTime = restart(work);
+            } else if (wt == WorkType.Stop || wt == WorkType.CheckStop || wt == WorkType.ForceStop) {
+                nextTime = stopVM(work);
+            } else if (wt == WorkType.Destroy) {
+                nextTime = destroyVM(work);
+            } else {
+                assert false : "How did we get here with " + wt.toString();
+                return;
+            }
+
+            if (nextTime == null) {
+                s_logger.info("Completed work " + work);
+                work.setStep(Step.Done);
+            } else {
+                rescheduleWork(work, nextTime.longValue());
+            }
+        } catch (Exception e) {
+            s_logger.warn("Encountered unhandled exception during HA process, reschedule work", e);
+            long nextTime = (System.currentTimeMillis() >> 10) + _restartRetryInterval;
+            rescheduleWork(work, nextTime);
+            // if restart failed in the middle due to exception, VM state may has been changed
+            // recapture into the HA worker so that it can really continue in it next turn
+            VMInstanceVO vm = _instanceDao.findById(work.getInstanceId());
+            work.setUpdateTime(vm.getUpdated());
+            work.setPreviousState(vm.getState());
+        }
+
+        if (!Step.Done.equals(work.getStep()) && work.getTimesTried() >= _maxRetries) {
+            s_logger.warn("Giving up, retried max. times for work: " + work);
+            work.setStep(Step.Done);
+        }
+
+        _haDao.update(work.getId(), work);
+    }
+
     @Override
     public boolean configure(final String name, final Map<String, Object> xmlParams) throws ConfigurationException {
         _serverId = _msServer.getId();
@@ -881,7 +921,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
        private void runWithContext() {
            HaWorkVO work = null;
            try {
-                s_logger.trace("Checking the database");
+                s_logger.trace("Checking the database for work");
                work = _haDao.take(_serverId);
                if (work == null) {
                    try {
@@ -896,56 +936,8 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
                }
                NDC.push("work-" + work.getId());
-                s_logger.info("Processing " + work);
-                try {
-                    final WorkType wt = work.getWorkType();
-                    Long nextTime = null;
-                    if (wt == WorkType.Migration) {
-                        nextTime = migrate(work);
-                    } else if (wt == WorkType.HA) {
-                        nextTime = restart(work);
-                    } else if (wt == WorkType.Stop || wt == WorkType.CheckStop || wt == WorkType.ForceStop) {
-                        nextTime = stopVM(work);
-                    } else if (wt == WorkType.Destroy) {
-                        nextTime = destroyVM(work);
-                    } else {
-                        assert false : "How did we get here with " + wt.toString();
-                        return;
-                    }
-                    if (nextTime == null) {
-                        s_logger.info("Completed " + work);
-                        work.setStep(Step.Done);
-                    } else {
-                        s_logger.info("Rescheduling " + work + " to try again at " + new Date(nextTime << 10));
-                        work.setTimeToTry(nextTime);
-                        work.setTimesTried(work.getTimesTried() + 1);
-                        work.setServerId(null);
-                        work.setDateTaken(null);
-                    }
-                } catch (Exception e) {
-                    s_logger.warn("Encountered unhandled exception during HA process, reschedule retry", e);
-                    long nextTime = (System.currentTimeMillis() >> 10) + _restartRetryInterval;
-                    s_logger.info("Rescheduling " + work + " to try again at " + new Date(nextTime << 10));
-                    work.setTimeToTry(nextTime);
-                    work.setTimesTried(work.getTimesTried() + 1);
-                    work.setServerId(null);
-                    work.setDateTaken(null);
-                    // if restart failed in the middle due to exception, VM state may has been changed
-                    // recapture into the HA worker so that it can really continue in it next turn
-                    VMInstanceVO vm = _instanceDao.findById(work.getInstanceId());
-                    work.setUpdateTime(vm.getUpdated());
-                    work.setPreviousState(vm.getState());
-                    if (!Step.Done.equals(work.getStep()) && work.getTimesTried() >= _maxRetries) {
-                        s_logger.warn("Giving up, retries max times for work: " + work);
-                        work.setStep(Step.Done);
-                    }
-                }
-                _haDao.update(work.getId(), work);
+                s_logger.info("Processing work " + work);
+                processWork(work);
            } catch (final Throwable th) {
                s_logger.error("Caught this throwable, ", th);
            } finally {
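One detail from the hunks above worth spelling out: the HA manager stores time-to-try values as System.currentTimeMillis() >> 10, i.e. milliseconds divided by 1024 (roughly seconds), which is why rescheduleWork() shifts left by 10 again before building a java.util.Date for its log message. A small plain-Java illustration of that round trip (no CloudStack types involved):

    long nowMs = System.currentTimeMillis();
    long coarse = nowMs >> 10;        // milliseconds / 1024, roughly seconds
    long backToMs = coarse << 10;     // back to milliseconds, losing the low 10 bits (under ~1 s)
    System.out.println(new java.util.Date(backToMs));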

HighAvailabilityManagerImplTest.java

@@ -16,11 +16,14 @@
 // under the License.
 package com.cloud.ha;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 import java.lang.reflect.Array;
 import java.lang.reflect.Field;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
@@ -31,6 +34,7 @@ import org.apache.cloudstack.engine.orchestration.service.VolumeOrchestrationSer
 import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
 import org.apache.cloudstack.managed.context.ManagedContext;
 import org.junit.Before;
+import org.junit.BeforeClass;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.mockito.Mock;
@@ -44,6 +48,8 @@ import com.cloud.dc.DataCenterVO;
 import com.cloud.dc.HostPodVO;
 import com.cloud.dc.dao.DataCenterDao;
 import com.cloud.dc.dao.HostPodDao;
+import com.cloud.ha.HighAvailabilityManager.Step;
+import com.cloud.ha.HighAvailabilityManager.WorkType;
 import com.cloud.ha.dao.HighAvailabilityDao;
 import com.cloud.host.Host;
 import com.cloud.host.HostVO;
@@ -106,6 +112,17 @@ public class HighAvailabilityManagerImplTest {
     HostVO hostVO;

     HighAvailabilityManagerImpl highAvailabilityManager;
+    HighAvailabilityManagerImpl highAvailabilityManagerSpy;
+    static Method processWorkMethod = null;
+
+    @BeforeClass
+    public static void initOnce() {
+        try {
+            processWorkMethod = HighAvailabilityManagerImpl.class.getDeclaredMethod("processWork", HaWorkVO.class);
+            processWorkMethod.setAccessible(true);
+        } catch (NoSuchMethodException e) {
+        }
+    }

     @Before
     public void setup() throws IllegalArgumentException,
@@ -123,8 +140,12 @@ public class HighAvailabilityManagerImplTest {
                        injectField.set(highAvailabilityManager, obj);
                    }
                }
+            } else if (injectField.getName().equals("_maxRetries")) {
+                injectField.setAccessible(true);
+                injectField.set(highAvailabilityManager, 5);
            }
        }
+        highAvailabilityManagerSpy = Mockito.spy(highAvailabilityManager);
    }

    @Test
@@ -201,4 +222,27 @@ public class HighAvailabilityManagerImplTest {
         assertNull(highAvailabilityManager.investigate(1l));
     }

+    private void processWorkWithRetryCount(int count, Step expectedStep) {
+        assertNotNull(processWorkMethod);
+        HaWorkVO work = new HaWorkVO(1l, VirtualMachine.Type.User, WorkType.Migration, Step.Scheduled, 1l, VirtualMachine.State.Running, count, 12345678l);
+        Mockito.doReturn(12345678l).when(highAvailabilityManagerSpy).migrate(work);
+        try {
+            processWorkMethod.invoke(highAvailabilityManagerSpy, work);
+        } catch (IllegalAccessException e) {
+        } catch (IllegalArgumentException e) {
+        } catch (InvocationTargetException e) {
+        }
+        assertTrue(work.getStep() == expectedStep);
+    }
+
+    @Test
+    public void processWorkWithRetryCountExceeded() {
+        processWorkWithRetryCount(5, Step.Done); // max retry count is 5
+    }
+
+    @Test
+    public void processWorkWithRetryCountNotExceeded() {
+        processWorkWithRetryCount(3, Step.Scheduled);
+    }
 }