CLOUDSTACK-7589: VM not Starting and always stuck in Stopped state after

management server restarts.

(cherry picked from commit 7cdb67dcf1ec4158ec0ab4c2fa868cc63121bbb5)
This commit is contained in:
Min Chen 2014-09-19 15:12:09 -07:00 committed by David Nalley
parent 9bbcef4540
commit b979c6f0c7
3 changed files with 32 additions and 40 deletions

View File

@ -835,24 +835,6 @@ public class AsyncJobManagerImpl extends ManagerBase implements AsyncJobManager,
return ManagementServerNode.getManagementServerId();
}
private void cleanupPendingJobs(List<SyncQueueItemVO> l) {
for (SyncQueueItemVO item : l) {
if (s_logger.isInfoEnabled()) {
s_logger.info("Discard left-over queue item: " + item.toString());
}
String contentType = item.getContentType();
if (contentType != null && contentType.equalsIgnoreCase(SyncQueueItem.AsyncJobContentType)) {
Long jobId = item.getContentId();
if (jobId != null) {
s_logger.warn("Mark job as failed as its correspoding queue-item has been discarded. job id: " + jobId);
completeAsyncJob(jobId, JobInfo.Status.FAILED, 0, "Execution was cancelled because of server shutdown");
}
}
_queueMgr.purgeItem(item.getId());
}
}
@DB
protected List<Long> wakeupByJoinedJobCompletion(long joinedJobId) {
SearchCriteria<Long> joinJobSC = JoinJobSearch.create("joinJobId", joinedJobId);
@ -967,6 +949,22 @@ public class AsyncJobManagerImpl extends ManagerBase implements AsyncJobManager,
return true;
}
private void cleanupLeftOverJobs(final long msid) {
try {
Transaction.execute(new TransactionCallbackNoReturn() {
@Override
public void doInTransactionWithoutResult(TransactionStatus status) {
// purge sync queue item running on this ms node
_queueMgr.cleanupActiveQueueItems(msid, true);
// reset job status for all jobs running on this ms node
_jobDao.resetJobProcess(msid, ApiErrorCode.INTERNAL_ERROR.getHttpCode(), "job cancelled because of management server restart or shutdown");
}
});
} catch (Throwable e) {
s_logger.warn("Unexpected exception in cleaning up left over jobs for mamagement server node " + msid, e);
}
}
@Override
public void onManagementNodeJoined(List<? extends ManagementServerHost> nodeList, long selfNodeId) {
}
@ -974,18 +972,7 @@ public class AsyncJobManagerImpl extends ManagerBase implements AsyncJobManager,
@Override
public void onManagementNodeLeft(List<? extends ManagementServerHost> nodeList, long selfNodeId) {
for (final ManagementServerHost msHost : nodeList) {
try {
Transaction.execute(new TransactionCallbackNoReturn() {
@Override
public void doInTransactionWithoutResult(TransactionStatus status) {
List<SyncQueueItemVO> items = _queueMgr.getActiveQueueItems(msHost.getId(), true);
cleanupPendingJobs(items);
_jobDao.resetJobProcess(msHost.getId(), ApiErrorCode.INTERNAL_ERROR.getHttpCode(), "job cancelled because of management server restart");
}
});
} catch (Throwable e) {
s_logger.warn("Unexpected exception ", e);
}
cleanupLeftOverJobs(msHost.getId());
}
}
@ -995,15 +982,7 @@ public class AsyncJobManagerImpl extends ManagerBase implements AsyncJobManager,
@Override
public boolean start() {
try {
_jobDao.cleanupPseduoJobs(getMsid());
List<SyncQueueItemVO> l = _queueMgr.getActiveQueueItems(getMsid(), false);
cleanupPendingJobs(l);
_jobDao.resetJobProcess(getMsid(), ApiErrorCode.INTERNAL_ERROR.getHttpCode(), "job cancelled because of management server restart");
} catch (Throwable e) {
s_logger.error("Unexpected exception " + e.getMessage(), e);
}
cleanupLeftOverJobs(getMsid());
_heartbeatScheduler.scheduleAtFixedRate(getHeartbeatTask(), HEARTBEAT_INTERVAL, HEARTBEAT_INTERVAL, TimeUnit.MILLISECONDS);
_heartbeatScheduler.scheduleAtFixedRate(getGCTask(), GC_INTERVAL, GC_INTERVAL, TimeUnit.MILLISECONDS);

View File

@ -36,4 +36,6 @@ public interface SyncQueueManager extends Manager {
public List<SyncQueueItemVO> getBlockedQueueItems(long thresholdMs, boolean exclusive);
void purgeAsyncJobQueueItemId(long asyncJobId);
public void cleanupActiveQueueItems(Long msid, boolean exclusive);
}

View File

@ -26,7 +26,6 @@ import org.apache.log4j.Logger;
import org.apache.cloudstack.framework.jobs.dao.SyncQueueDao;
import org.apache.cloudstack.framework.jobs.dao.SyncQueueItemDao;
import com.cloud.utils.DateUtil;
import com.cloud.utils.component.ManagerBase;
import com.cloud.utils.db.DB;
@ -260,4 +259,16 @@ public class SyncQueueManagerImpl extends ManagerBase implements SyncQueueManage
purgeItem(itemId);
}
}
@Override
public void cleanupActiveQueueItems(Long msid, boolean exclusive) {
List<SyncQueueItemVO> l = getActiveQueueItems(msid, false);
for (SyncQueueItemVO item : l) {
if (s_logger.isInfoEnabled()) {
s_logger.info("Discard left-over queue item: " + item.toString());
}
purgeItem(item.getId());
}
}
}