bug 10094: The problem was that we added code that won't add any more HA work items if the VM already has one. However, that is wrong. HA Manager stores the existing snapshot of the VM state machine. Before working on HA for a VM, it checks to see if that snapshot has been changed. So by not scheduling HA work, we've effectively made HA not work under multi-failure situations. I've fixed this by removing that code and instead, at the time of performing HA, doing a quick check to see if there is work underway for the same VM and work scheduled in the future for that VM. If there is work scheduled in the future, then we simply cancel the current work. If there is already work underway, then we retry again in 1 minute.

This commit is contained in:
Alex Huang 2011-06-12 09:18:21 -07:00
parent 44d15d38b0
commit 6137f216b1
3 changed files with 81 additions and 15 deletions

View File

@ -293,36 +293,50 @@ public class HighAvailabilityManagerImpl implements HighAvailabilityManager, Clu
}
}
final List<HaWorkVO> items = _haDao.findPreviousHA(vm.getId());
List<HaWorkVO> items = _haDao.findPreviousHA(vm.getId());
int maxRetries = 0;
boolean NeedToAddNew = true;
for (final HaWorkVO item : items) {
for (HaWorkVO item : items) {
if (maxRetries < item.getTimesTried() && !item.canScheduleNew(_timeBetweenFailures)) {
maxRetries = item.getTimesTried();
break;
}
}
for (final HaWorkVO item : items) {
if (!(item.getStep() == Step.Error || item.getStep() == Step.Done || item.getStep() == Step.Cancelled)) {
NeedToAddNew = false;
}
}
if (NeedToAddNew) {
final HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.HA, investigate ? Step.Investigating : Step.Scheduled, hostId, vm.getState(), maxRetries + 1, vm.getUpdated());
_haDao.persist(work);
}
HaWorkVO work = new HaWorkVO(vm.getId(), vm.getType(), WorkType.HA, investigate ? Step.Investigating : Step.Scheduled, hostId, vm.getState(), maxRetries + 1, vm.getUpdated());
_haDao.persist(work);
if (s_logger.isInfoEnabled()) {
s_logger.info("Schedule vm for HA: " + vm.toString());
s_logger.info("Schedule vm for HA: " + vm);
}
wakeupWorkers();
}
protected Long restart(final HaWorkVO work) {
final long vmId = work.getInstanceId();
protected Long restart(HaWorkVO work) {
List<HaWorkVO> items = _haDao.listFutureHaWorkForVm(work.getInstanceId(), work.getId());
if (items.size() > 0) {
StringBuilder str = new StringBuilder("Cancelling this work item because newer ones have been scheduled. Work Ids = [");
for (HaWorkVO item : items) {
str.append(item.getId()).append(", ");
}
str.delete(str.length() - 2, str.length()).append("]");
s_logger.info(str.toString());
return null;
}
items = _haDao.listRunningHaWorkForVm(work.getInstanceId());
if (items.size() > 0) {
StringBuilder str = new StringBuilder("Waiting because there's HA work being executed on an item currently. Work Ids =[");
for (HaWorkVO item : items) {
str.append(item.getId()).append(", ");
}
str.delete(str.length() - 2, str.length()).append("]");
s_logger.info(str.toString());
return (System.currentTimeMillis() >> 10) + _investigateRetryInterval;
}
long vmId = work.getInstanceId();
VMInstanceVO vm = _itMgr.findById(work.getType(), work.getInstanceId());
if (vm == null) {

View File

@ -65,4 +65,21 @@ public interface HighAvailabilityDao extends GenericDao<HaWorkVO, Long> {
boolean hasBeenScheduled(long instanceId, WorkType type);
int releaseWorkItems(long nodeId);
/**
 * Looks for HA work items that were scheduled for a VM after a given work item.
 * The DAO implementation matches items of work type HA that belong to the VM
 * and whose id is strictly greater than {@code workId}.
 *
 * @param vmId id of the virtual machine whose work items are searched.
 * @param workId id of the reference work item; only items with a greater id match.
 * @return list of matching work items.
 */
List<HaWorkVO> listFutureHaWorkForVm(long vmId, long workId);
/**
 * Looks for HA work items that are being run right now for a VM.
 * The DAO implementation matches items of work type HA that belong to the VM,
 * have a non-null taken date, and whose step is none of Done, Error or Cancelled.
 *
 * @param vmId id of the virtual machine whose work items are searched.
 * @return list of matching work items.
 */
List<HaWorkVO> listRunningHaWorkForVm(long vmId);
}

View File

@ -47,6 +47,8 @@ public class HighAvailabilityDaoImpl extends GenericDaoBase<HaWorkVO, Long> impl
private final SearchBuilder<HaWorkVO> PreviousWorkSearch;
private final SearchBuilder<HaWorkVO> TakenWorkSearch;
private final SearchBuilder<HaWorkVO> ReleaseSearch;
private final SearchBuilder<HaWorkVO> FutureHaWorkSearch;
private final SearchBuilder<HaWorkVO> RunningHaWorkSearch;
protected HighAvailabilityDaoImpl() {
super();
@ -91,6 +93,39 @@ public class HighAvailabilityDaoImpl extends GenericDaoBase<HaWorkVO, Long> impl
ReleaseSearch.and("step", ReleaseSearch.entity().getStep(), Op.NIN);
ReleaseSearch.and("taken", ReleaseSearch.entity().getDateTaken(), Op.NNULL);
ReleaseSearch.done();
FutureHaWorkSearch = createSearchBuilder();
FutureHaWorkSearch.and("instance", FutureHaWorkSearch.entity().getInstanceId(), Op.EQ);
FutureHaWorkSearch.and("type", FutureHaWorkSearch.entity().getType(), Op.EQ);
FutureHaWorkSearch.and("id", FutureHaWorkSearch.entity().getId(), Op.GT);
FutureHaWorkSearch.done();
RunningHaWorkSearch = createSearchBuilder();
RunningHaWorkSearch.and("instance", RunningHaWorkSearch.entity().getInstanceId(), Op.EQ);
RunningHaWorkSearch.and("type", RunningHaWorkSearch.entity().getType(), Op.EQ);
RunningHaWorkSearch.and("taken", RunningHaWorkSearch.entity().getDateTaken(), Op.NNULL);
RunningHaWorkSearch.and("step", RunningHaWorkSearch.entity().getStep(), Op.NIN);
RunningHaWorkSearch.done();
}
/**
 * Lists the HA work items for a VM that match RunningHaWorkSearch: work type HA,
 * a non-null taken date, and a step other than Done, Error or Cancelled.
 *
 * @param vmId id of the virtual machine whose work items are searched.
 * @return the work items returned by the search.
 */
@Override
public List<HaWorkVO> listRunningHaWorkForVm(long vmId) {
    final SearchCriteria<HaWorkVO> criteria = RunningHaWorkSearch.create();

    // Restrict the search to HA work belonging to this VM.
    criteria.setParameters("instance", vmId);
    criteria.setParameters("type", WorkType.HA);

    // These are the steps excluded by the NIN condition on "step".
    criteria.setParameters("step", Step.Done, Step.Error, Step.Cancelled);

    return search(criteria, null);
}
/**
 * Lists the HA work items for a VM that match FutureHaWorkSearch: work type HA
 * and an id strictly greater than the given work item id.
 *
 * @param vmId id of the virtual machine whose work items are searched.
 * @param workId id of the reference work item; only items with a greater id match.
 * @return the work items returned by the search.
 */
@Override
public List<HaWorkVO> listFutureHaWorkForVm(long vmId, long workId) {
    final SearchCriteria<HaWorkVO> criteria = FutureHaWorkSearch.create();

    // Restrict the search to HA work belonging to this VM.
    criteria.setParameters("instance", vmId);
    criteria.setParameters("type", HighAvailabilityManager.WorkType.HA);

    // Lower bound for the GT condition on "id".
    criteria.setParameters("id", workId);

    return search(criteria, null);
}
@Override