From b979c6f0c7c50f0ade0987bcc52543f8f9edc23b Mon Sep 17 00:00:00 2001 From: Min Chen Date: Fri, 19 Sep 2014 15:12:09 -0700 Subject: [PATCH] CLOUDSTACK-7589: VM not Starting and always stuck in Stopped state after management server restarts. (cherry picked from commit 7cdb67dcf1ec4158ec0ab4c2fa868cc63121bbb5) --- .../jobs/impl/AsyncJobManagerImpl.java | 57 ++++++------------- .../framework/jobs/impl/SyncQueueManager.java | 2 + .../jobs/impl/SyncQueueManagerImpl.java | 13 ++++- 3 files changed, 32 insertions(+), 40 deletions(-) diff --git a/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/AsyncJobManagerImpl.java b/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/AsyncJobManagerImpl.java index 7d374da3142..4c4d3c21f72 100644 --- a/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/AsyncJobManagerImpl.java +++ b/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/AsyncJobManagerImpl.java @@ -835,24 +835,6 @@ public class AsyncJobManagerImpl extends ManagerBase implements AsyncJobManager, return ManagementServerNode.getManagementServerId(); } - private void cleanupPendingJobs(List l) { - for (SyncQueueItemVO item : l) { - if (s_logger.isInfoEnabled()) { - s_logger.info("Discard left-over queue item: " + item.toString()); - } - - String contentType = item.getContentType(); - if (contentType != null && contentType.equalsIgnoreCase(SyncQueueItem.AsyncJobContentType)) { - Long jobId = item.getContentId(); - if (jobId != null) { - s_logger.warn("Mark job as failed as its correspoding queue-item has been discarded. 
job id: " + jobId); - completeAsyncJob(jobId, JobInfo.Status.FAILED, 0, "Execution was cancelled because of server shutdown"); - } - } - _queueMgr.purgeItem(item.getId()); - } - } - @DB protected List wakeupByJoinedJobCompletion(long joinedJobId) { SearchCriteria joinJobSC = JoinJobSearch.create("joinJobId", joinedJobId); @@ -967,6 +949,22 @@ public class AsyncJobManagerImpl extends ManagerBase implements AsyncJobManager, return true; } + /** Cleans up work left behind by management server node {@code msid}: inside one transaction it asks the sync queue manager to purge that node's active queue items, then resets the node's job records with an INTERNAL_ERROR result code. Best-effort: any Throwable is logged at WARN and not rethrown, so callers (start-up and node-left handling) continue regardless. */ private void cleanupLeftOverJobs(final long msid) { + try { + Transaction.execute(new TransactionCallbackNoReturn() { + @Override + public void doInTransactionWithoutResult(TransactionStatus status) { + // purge sync queue item running on this ms node + _queueMgr.cleanupActiveQueueItems(msid, true); + // reset job status for all jobs running on this ms node + _jobDao.resetJobProcess(msid, ApiErrorCode.INTERNAL_ERROR.getHttpCode(), "job cancelled because of management server restart or shutdown"); + } + }); + } catch (Throwable e) { + s_logger.warn("Unexpected exception in cleaning up left over jobs for management server node " + msid, e); + } + } + @Override public void onManagementNodeJoined(List nodeList, long selfNodeId) { } @@ -974,18 +972,7 @@ public class AsyncJobManagerImpl extends ManagerBase implements AsyncJobManager, @Override public void onManagementNodeLeft(List nodeList, long selfNodeId) { for (final ManagementServerHost msHost : nodeList) { - try { - Transaction.execute(new TransactionCallbackNoReturn() { - @Override - public void doInTransactionWithoutResult(TransactionStatus status) { - List items = _queueMgr.getActiveQueueItems(msHost.getId(), true); - cleanupPendingJobs(items); - _jobDao.resetJobProcess(msHost.getId(), ApiErrorCode.INTERNAL_ERROR.getHttpCode(), "job cancelled because of management server restart"); - } - }); - } catch (Throwable e) { - s_logger.warn("Unexpected exception ", e); - } + cleanupLeftOverJobs(msHost.getId()); } } @@ -995,15 +982,7 @@ public class AsyncJobManagerImpl extends 
ManagerBase implements AsyncJobManager, @Override public boolean start() { - try { - _jobDao.cleanupPseduoJobs(getMsid()); - - List l = _queueMgr.getActiveQueueItems(getMsid(), false); - cleanupPendingJobs(l); - _jobDao.resetJobProcess(getMsid(), ApiErrorCode.INTERNAL_ERROR.getHttpCode(), "job cancelled because of management server restart"); - } catch (Throwable e) { - s_logger.error("Unexpected exception " + e.getMessage(), e); - } + cleanupLeftOverJobs(getMsid()); _heartbeatScheduler.scheduleAtFixedRate(getHeartbeatTask(), HEARTBEAT_INTERVAL, HEARTBEAT_INTERVAL, TimeUnit.MILLISECONDS); _heartbeatScheduler.scheduleAtFixedRate(getGCTask(), GC_INTERVAL, GC_INTERVAL, TimeUnit.MILLISECONDS); diff --git a/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/SyncQueueManager.java b/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/SyncQueueManager.java index b521ffe5df1..32d84647a2d 100644 --- a/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/SyncQueueManager.java +++ b/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/SyncQueueManager.java @@ -36,4 +36,6 @@ public interface SyncQueueManager extends Manager { public List getBlockedQueueItems(long thresholdMs, boolean exclusive); void purgeAsyncJobQueueItemId(long asyncJobId); + + public void cleanupActiveQueueItems(Long msid, boolean exclusive); } diff --git a/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/SyncQueueManagerImpl.java b/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/SyncQueueManagerImpl.java index 5160e05db77..1cfec4dba41 100644 --- a/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/SyncQueueManagerImpl.java +++ b/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/SyncQueueManagerImpl.java @@ -26,7 +26,6 @@ import org.apache.log4j.Logger; import org.apache.cloudstack.framework.jobs.dao.SyncQueueDao; import org.apache.cloudstack.framework.jobs.dao.SyncQueueItemDao; - import com.cloud.utils.DateUtil; import 
com.cloud.utils.component.ManagerBase; import com.cloud.utils.db.DB; @@ -260,4 +259,16 @@ public class SyncQueueManagerImpl extends ManagerBase implements SyncQueueManage purgeItem(itemId); } } + + /** Purges every queue item returned by {@code getActiveQueueItems(msid, exclusive)}, logging each discarded item at INFO when that level is enabled. @param msid management server id whose active queue items are discarded. @param exclusive forwarded unchanged to {@code getActiveQueueItems}; presumably selects a locking read there (NOTE(review): confirm against the DAO). */ @Override + public void cleanupActiveQueueItems(Long msid, boolean exclusive) { + List<SyncQueueItemVO> l = getActiveQueueItems(msid, exclusive); + for (SyncQueueItemVO item : l) { + if (s_logger.isInfoEnabled()) { + s_logger.info("Discard left-over queue item: " + item.toString()); + } + purgeItem(item.getId()); + } + } + }