Merge pull request #760 from koushik-das/CLOUDSTACK-8785

CLOUDSTACK-8785: Proper enforcement of retry count (max.retries) for all work types handled by the HighAvailability manager
The retry count is now properly enforced for all work types in the HA manager. Some of the code has also been reorganized to make it easier to test.

Signed-off-by: Koushik Das <koushik@apache.org>
Koushik Das committed 2015-09-01 17:58:45 +05:30
commit f732c7d1e9
2 changed files with 103 additions and 67 deletions
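Before the diffs, a minimal sketch of the behaviour this commit establishes: every HA work type now flows through a single processWork() path that reschedules unfinished work via rescheduleWork() and marks it Done once the retry counter reaches max.retries, instead of the limit only being checked on the restart path. The class, field and constant names below are simplified stand-ins for illustration, not the actual CloudStack types; only the control flow mirrors the patch.

public class RetryEnforcementSketch {

    enum Step { Scheduled, Done }

    // Simplified stand-in for HaWorkVO.
    static class WorkItem {
        Step step = Step.Scheduled;
        int timesTried;
        long timeToTry;   // stored as milliseconds >> 10, i.e. roughly seconds
    }

    static final int MAX_RETRIES = 5;        // plays the role of the max.retries setting
    static final long RETRY_INTERVAL = 600;  // plays the role of the *.retry.interval settings

    // Mirrors rescheduleWork(): push the item into the future and count the attempt.
    static void reschedule(WorkItem work, long nextTime) {
        work.timeToTry = nextTime;
        work.timesTried++;
    }

    // Mirrors processWork(): a null nextTime means the work finished; otherwise it is
    // rescheduled, and the retry cap is applied uniformly at the end, on every path.
    static void process(WorkItem work, Long nextTime) {
        if (nextTime == null) {
            work.step = Step.Done;
        } else {
            reschedule(work, nextTime);
        }
        if (work.step != Step.Done && work.timesTried >= MAX_RETRIES) {
            work.step = Step.Done;   // give up after retrying the maximum number of times
        }
    }

    public static void main(String[] args) {
        WorkItem work = new WorkItem();
        long now = System.currentTimeMillis() >> 10;   // same coarse time encoding as the patch
        while (work.step != Step.Done) {
            process(work, now + RETRY_INTERVAL);        // pretend every attempt fails
        }
        System.out.println("tries=" + work.timesTried + ", step=" + work.step);  // tries=5, step=Done
    }
}

Driving the sketch with a work item that never succeeds stops after exactly MAX_RETRIES attempts, which is what the new unit tests in the second file assert: a work item already tried 5 times (the configured limit) ends up Done, while one tried 3 times stays Scheduled.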

HighAvailabilityManagerImpl.java

@@ -383,10 +383,10 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
         }
         List<HaWorkVO> items = _haDao.findPreviousHA(vm.getId());
-        int maxRetries = 0;
+        int timesTried = 0;
         for (HaWorkVO item : items) {
-            if (maxRetries < item.getTimesTried() && !item.canScheduleNew(_timeBetweenFailures)) {
-                maxRetries = item.getTimesTried();
+            if (timesTried < item.getTimesTried() && !item.canScheduleNew(_timeBetweenFailures)) {
+                timesTried = item.getTimesTried();
                 break;
             }
         }
@@ -396,7 +396,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
         }
         HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.HA, investigate ? Step.Investigating : Step.Scheduled,
-            hostId != null ? hostId : 0L, vm.getState(), maxRetries + 1, vm.getUpdated());
+            hostId != null ? hostId : 0L, vm.getState(), timesTried, vm.getUpdated());
         _haDao.persist(work);
         if (s_logger.isInfoEnabled()) {
@@ -407,7 +407,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
     }
-    protected Long restart(HaWorkVO work) {
+    protected Long restart(final HaWorkVO work) {
         List<HaWorkVO> items = _haDao.listFutureHaWorkForVm(work.getInstanceId(), work.getId());
         if (items.size() > 0) {
             StringBuilder str = new StringBuilder("Cancelling this work item because newer ones have been scheduled. Work Ids = [");
@@ -571,11 +571,6 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
             return null;
         }
-        if (work.getTimesTried() > _maxRetries) {
-            s_logger.warn("Retried to max times so deleting: " + vmId);
-            return null;
-        }
         try {
             HashMap<VirtualMachineProfile.Param, Object> params = new HashMap<VirtualMachineProfile.Param, Object>();
             if (_haTag != null) {
@@ -663,7 +658,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
         _haDao.delete(vm.getId(), WorkType.Destroy);
     }
-    protected Long destroyVM(HaWorkVO work) {
+    protected Long destroyVM(final HaWorkVO work) {
         final VirtualMachine vm = _itMgr.findById(work.getInstanceId());
         s_logger.info("Destroying " + vm.toString());
         try {
@@ -690,7 +685,6 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
             s_logger.debug("concurrent operation: " + e.getMessage());
         }
-        work.setTimesTried(work.getTimesTried() + 1);
         return (System.currentTimeMillis() >> 10) + _stopRetryInterval;
     }
@@ -738,10 +732,6 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
             s_logger.debug("operation timed out: " + e.getMessage());
         }
-        work.setTimesTried(work.getTimesTried() + 1);
-        if (s_logger.isDebugEnabled()) {
-            s_logger.debug("Stop was unsuccessful. Rescheduling");
-        }
         return (System.currentTimeMillis() >> 10) + _stopRetryInterval;
     }
@@ -765,6 +755,56 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
         return vms;
     }

+    private void rescheduleWork(final HaWorkVO work, final long nextTime) {
+        s_logger.info("Rescheduling work " + work + " to try again at " + new Date(nextTime << 10));
+        work.setTimeToTry(nextTime);
+        work.setTimesTried(work.getTimesTried() + 1);
+        work.setServerId(null);
+        work.setDateTaken(null);
+    }
+
+    private void processWork(final HaWorkVO work) {
+        try {
+            final WorkType wt = work.getWorkType();
+            Long nextTime = null;
+            if (wt == WorkType.Migration) {
+                nextTime = migrate(work);
+            } else if (wt == WorkType.HA) {
+                nextTime = restart(work);
+            } else if (wt == WorkType.Stop || wt == WorkType.CheckStop || wt == WorkType.ForceStop) {
+                nextTime = stopVM(work);
+            } else if (wt == WorkType.Destroy) {
+                nextTime = destroyVM(work);
+            } else {
+                assert false : "How did we get here with " + wt.toString();
+                return;
+            }
+
+            if (nextTime == null) {
+                s_logger.info("Completed work " + work);
+                work.setStep(Step.Done);
+            } else {
+                rescheduleWork(work, nextTime.longValue());
+            }
+        } catch (Exception e) {
+            s_logger.warn("Encountered unhandled exception during HA process, reschedule work", e);
+            long nextTime = (System.currentTimeMillis() >> 10) + _restartRetryInterval;
+            rescheduleWork(work, nextTime);
+            // if restart failed in the middle due to exception, VM state may has been changed
+            // recapture into the HA worker so that it can really continue in it next turn
+            VMInstanceVO vm = _instanceDao.findById(work.getInstanceId());
+            work.setUpdateTime(vm.getUpdated());
+            work.setPreviousState(vm.getState());
+        }
+
+        if (!Step.Done.equals(work.getStep()) && work.getTimesTried() >= _maxRetries) {
+            s_logger.warn("Giving up, retried max. times for work: " + work);
+            work.setStep(Step.Done);
+        }
+
+        _haDao.update(work.getId(), work);
+    }
+
     @Override
     public boolean configure(final String name, final Map<String, Object> xmlParams) throws ConfigurationException {
         _serverId = _msServer.getId();
@@ -881,7 +921,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
        private void runWithContext() {
            HaWorkVO work = null;
            try {
-                s_logger.trace("Checking the database");
+                s_logger.trace("Checking the database for work");
                work = _haDao.take(_serverId);
                if (work == null) {
                    try {
@@ -896,56 +936,8 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
                }
                NDC.push("work-" + work.getId());
-                s_logger.info("Processing " + work);
-                try {
-                    final WorkType wt = work.getWorkType();
-                    Long nextTime = null;
-                    if (wt == WorkType.Migration) {
-                        nextTime = migrate(work);
-                    } else if (wt == WorkType.HA) {
-                        nextTime = restart(work);
-                    } else if (wt == WorkType.Stop || wt == WorkType.CheckStop || wt == WorkType.ForceStop) {
-                        nextTime = stopVM(work);
-                    } else if (wt == WorkType.Destroy) {
-                        nextTime = destroyVM(work);
-                    } else {
-                        assert false : "How did we get here with " + wt.toString();
-                        return;
-                    }
-                    if (nextTime == null) {
-                        s_logger.info("Completed " + work);
-                        work.setStep(Step.Done);
-                    } else {
-                        s_logger.info("Rescheduling " + work + " to try again at " + new Date(nextTime << 10));
-                        work.setTimeToTry(nextTime);
-                        work.setTimesTried(work.getTimesTried() + 1);
-                        work.setServerId(null);
-                        work.setDateTaken(null);
-                    }
-                } catch (Exception e) {
-                    s_logger.warn("Encountered unhandled exception during HA process, reschedule retry", e);
-                    long nextTime = (System.currentTimeMillis() >> 10) + _restartRetryInterval;
-                    s_logger.info("Rescheduling " + work + " to try again at " + new Date(nextTime << 10));
-                    work.setTimeToTry(nextTime);
-                    work.setTimesTried(work.getTimesTried() + 1);
-                    work.setServerId(null);
-                    work.setDateTaken(null);
-                    // if restart failed in the middle due to exception, VM state may has been changed
-                    // recapture into the HA worker so that it can really continue in it next turn
-                    VMInstanceVO vm = _instanceDao.findById(work.getInstanceId());
-                    work.setUpdateTime(vm.getUpdated());
-                    work.setPreviousState(vm.getState());
-                    if (!Step.Done.equals(work.getStep()) && work.getTimesTried() >= _maxRetries) {
-                        s_logger.warn("Giving up, retries max times for work: " + work);
-                        work.setStep(Step.Done);
-                    }
-                }
-                _haDao.update(work.getId(), work);
+                s_logger.info("Processing work " + work);
+                processWork(work);
            } catch (final Throwable th) {
                s_logger.error("Caught this throwable, ", th);
            } finally {
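One detail from the hunks above worth spelling out: the HA manager stores time-to-try values as System.currentTimeMillis() >> 10, i.e. milliseconds divided by 1024 (roughly seconds), which is why rescheduleWork() shifts left by 10 again before building a java.util.Date for its log message. A small plain-Java illustration of that round trip (no CloudStack types involved):

    long nowMs = System.currentTimeMillis();
    long coarse = nowMs >> 10;        // milliseconds / 1024, roughly seconds
    long backToMs = coarse << 10;     // back to milliseconds, losing the low 10 bits (under ~1 s)
    System.out.println(new java.util.Date(backToMs));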

HighAvailabilityManagerImplTest.java

@@ -16,11 +16,14 @@
 // under the License.
 package com.cloud.ha;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 import java.lang.reflect.Array;
 import java.lang.reflect.Field;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
@@ -31,6 +34,7 @@ import org.apache.cloudstack.engine.orchestration.service.VolumeOrchestrationSer
 import org.apache.cloudstack.framework.config.dao.ConfigurationDao;
 import org.apache.cloudstack.managed.context.ManagedContext;
 import org.junit.Before;
+import org.junit.BeforeClass;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.mockito.Mock;
@@ -44,6 +48,8 @@ import com.cloud.dc.DataCenterVO;
 import com.cloud.dc.HostPodVO;
 import com.cloud.dc.dao.DataCenterDao;
 import com.cloud.dc.dao.HostPodDao;
+import com.cloud.ha.HighAvailabilityManager.Step;
+import com.cloud.ha.HighAvailabilityManager.WorkType;
 import com.cloud.ha.dao.HighAvailabilityDao;
 import com.cloud.host.Host;
 import com.cloud.host.HostVO;
@@ -106,6 +112,17 @@ public class HighAvailabilityManagerImplTest {
     HostVO hostVO;

     HighAvailabilityManagerImpl highAvailabilityManager;
+    HighAvailabilityManagerImpl highAvailabilityManagerSpy;
+    static Method processWorkMethod = null;
+
+    @BeforeClass
+    public static void initOnce() {
+        try {
+            processWorkMethod = HighAvailabilityManagerImpl.class.getDeclaredMethod("processWork", HaWorkVO.class);
+            processWorkMethod.setAccessible(true);
+        } catch (NoSuchMethodException e) {
+        }
+    }

     @Before
     public void setup() throws IllegalArgumentException,
@@ -123,8 +140,12 @@ public class HighAvailabilityManagerImplTest {
                        injectField.set(highAvailabilityManager, obj);
                    }
                }
+            } else if (injectField.getName().equals("_maxRetries")) {
+                injectField.setAccessible(true);
+                injectField.set(highAvailabilityManager, 5);
            }
        }
+        highAvailabilityManagerSpy = Mockito.spy(highAvailabilityManager);
    }

    @Test
@@ -201,4 +222,27 @@ public class HighAvailabilityManagerImplTest {
         assertNull(highAvailabilityManager.investigate(1l));
     }

+    private void processWorkWithRetryCount(int count, Step expectedStep) {
+        assertNotNull(processWorkMethod);
+        HaWorkVO work = new HaWorkVO(1l, VirtualMachine.Type.User, WorkType.Migration, Step.Scheduled, 1l, VirtualMachine.State.Running, count, 12345678l);
+        Mockito.doReturn(12345678l).when(highAvailabilityManagerSpy).migrate(work);
+        try {
+            processWorkMethod.invoke(highAvailabilityManagerSpy, work);
+        } catch (IllegalAccessException e) {
+        } catch (IllegalArgumentException e) {
+        } catch (InvocationTargetException e) {
+        }
+        assertTrue(work.getStep() == expectedStep);
+    }
+
+    @Test
+    public void processWorkWithRetryCountExceeded() {
+        processWorkWithRetryCount(5, Step.Done); // max retry count is 5
+    }
+
+    @Test
+    public void processWorkWithRetryCountNotExceeded() {
+        processWorkWithRetryCount(3, Step.Scheduled);
+    }
 }