diff --git a/engine/storage/volume/src/org/apache/cloudstack/storage/volume/VolumeServiceImpl.java b/engine/storage/volume/src/org/apache/cloudstack/storage/volume/VolumeServiceImpl.java index 2e72286b222..6a9dd74eaf7 100644 --- a/engine/storage/volume/src/org/apache/cloudstack/storage/volume/VolumeServiceImpl.java +++ b/engine/storage/volume/src/org/apache/cloudstack/storage/volume/VolumeServiceImpl.java @@ -1993,8 +1993,13 @@ public class VolumeServiceImpl implements VolumeService { SnapshotInfo snapshot = null; try { snapshot = snapshotMgr.takeSnapshot(volume); + } catch (CloudRuntimeException cre) { + s_logger.error("Take snapshot: " + volume.getId() + " failed", cre); + throw cre; } catch (Exception e) { - s_logger.debug("Take snapshot: " + volume.getId() + " failed", e); + if(s_logger.isDebugEnabled()) { + s_logger.debug("unknown exception while taking snapshot for volume " + volume.getId() + " was caught", e); + } throw new CloudRuntimeException("Failed to take snapshot", e); } diff --git a/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/AsyncJobManagerImpl.java b/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/AsyncJobManagerImpl.java index 121246bc11c..aa38258c4b4 100644 --- a/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/AsyncJobManagerImpl.java +++ b/framework/jobs/src/org/apache/cloudstack/framework/jobs/impl/AsyncJobManagerImpl.java @@ -85,9 +85,9 @@ import com.cloud.vm.dao.VMInstanceDao; public class AsyncJobManagerImpl extends ManagerBase implements AsyncJobManager, ClusterManagerListener, Configurable { // Advanced - private static final ConfigKey JobExpireMinutes = new ConfigKey("Advanced", Long.class, "job.expire.minutes", "1440", + public static final ConfigKey JobExpireMinutes = new ConfigKey("Advanced", Long.class, "job.expire.minutes", "1440", "Time (in minutes) for async-jobs to be kept in system", true, ConfigKey.Scope.Global); - private static final ConfigKey JobCancelThresholdMinutes = new 
ConfigKey("Advanced", Long.class, "job.cancel.threshold.minutes", "60", + public static final ConfigKey JobCancelThresholdMinutes = new ConfigKey("Advanced", Long.class, "job.cancel.threshold.minutes", "60", "Time (in minutes) for async-jobs to be forcely cancelled if it has been in process for long", true, ConfigKey.Scope.Global); private static final ConfigKey VmJobLockTimeout = new ConfigKey("Advanced", Integer.class, "vm.job.lock.timeout", "1800", diff --git a/plugins/hypervisors/vmware/src/com/cloud/hypervisor/vmware/manager/VmwareManager.java b/plugins/hypervisors/vmware/src/com/cloud/hypervisor/vmware/manager/VmwareManager.java index 8a3b201f352..12c48fee026 100644 --- a/plugins/hypervisors/vmware/src/com/cloud/hypervisor/vmware/manager/VmwareManager.java +++ b/plugins/hypervisors/vmware/src/com/cloud/hypervisor/vmware/manager/VmwareManager.java @@ -35,6 +35,9 @@ public interface VmwareManager { public static final ConfigKey s_vmwareNicHotplugWaitTimeout = new ConfigKey("Advanced", Long.class, "vmware.nic.hotplug.wait.timeout", "15000", "Wait timeout (milli seconds) for hot plugged NIC of VM to be detected by guest OS.", false, ConfigKey.Scope.Global); + public static final ConfigKey s_vmwareCleanOldWorderVMs = new ConfigKey("Advanced", Boolean.class, "vmware.clean.old.worker.vms", "false", + "If a worker vm is older then twice the 'job.expire.minutes' + 'job.cancel.threshold.minutes' , remove it.", true, ConfigKey.Scope.Global); + String composeWorkerName(); String getSystemVMIsoFileNameOnDatastore(); diff --git a/plugins/hypervisors/vmware/src/com/cloud/hypervisor/vmware/manager/VmwareManagerImpl.java b/plugins/hypervisors/vmware/src/com/cloud/hypervisor/vmware/manager/VmwareManagerImpl.java index 8688235019d..0fc7ca55eb5 100644 --- a/plugins/hypervisors/vmware/src/com/cloud/hypervisor/vmware/manager/VmwareManagerImpl.java +++ b/plugins/hypervisors/vmware/src/com/cloud/hypervisor/vmware/manager/VmwareManagerImpl.java @@ -22,6 +22,8 @@ import java.net.URI; 
import java.net.URISyntaxException; import java.net.URL; import java.rmi.RemoteException; +import java.time.Duration; +import java.time.Instant; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -35,6 +37,7 @@ import java.util.concurrent.TimeUnit; import javax.inject.Inject; import javax.naming.ConfigurationException; +import org.apache.cloudstack.framework.jobs.impl.AsyncJobManagerImpl; import org.apache.log4j.Logger; import com.vmware.vim25.AboutInfo; @@ -128,6 +131,7 @@ import com.cloud.vm.DomainRouterVO; public class VmwareManagerImpl extends ManagerBase implements VmwareManager, VmwareStorageMount, Listener, VmwareDatacenterService, Configurable { private static final Logger s_logger = Logger.getLogger(VmwareManagerImpl.class); + private static final long SECONDS_PER_MINUTE = 60; private static final int STARTUP_DELAY = 60000; // 60 seconds private static final long DEFAULT_HOST_SCAN_INTERVAL = 600000; // every 10 minutes private long _hostScanInterval = DEFAULT_HOST_SCAN_INTERVAL; @@ -212,7 +216,7 @@ public class VmwareManagerImpl extends ManagerBase implements VmwareManager, Vmw @Override public ConfigKey[] getConfigKeys() { - return new ConfigKey[] {s_vmwareNicHotplugWaitTimeout}; + return new ConfigKey[] {s_vmwareNicHotplugWaitTimeout, s_vmwareCleanOldWorderVMs}; } @Override @@ -534,7 +538,7 @@ public class VmwareManagerImpl extends ManagerBase implements VmwareManager, Vmw return false; } - Long.parseLong(tokens[0]); + long startTick = Long.parseLong(tokens[0]); long msid = Long.parseLong(tokens[1]); long runid = Long.parseLong(tokens[2]); @@ -550,15 +554,22 @@ public class VmwareManagerImpl extends ManagerBase implements VmwareManager, Vmw return true; } - // disable time-out check until we have found out a VMware API that can check if - // there are pending tasks on the subject VM - /* - if(System.currentTimeMillis() - startTick > _hungWorkerTimeout) { - if(s_logger.isInfoEnabled()) - s_logger.info("Worker VM expired, 
seconds elapsed: " + (System.currentTimeMillis() - startTick) / 1000); - return true; - } - */ + // this time-out check was disabled + // "until we have found out a VMware API that can check if there are pending tasks on the subject VM" + // but as we expire jobs and those stale worker VMs stay around until an MS reboot we opt in to have them removed anyway + Instant start = Instant.ofEpochMilli(startTick); + Instant end = start.plusSeconds(2 * (AsyncJobManagerImpl.JobExpireMinutes.value() + AsyncJobManagerImpl.JobCancelThresholdMinutes.value()) * SECONDS_PER_MINUTE); + Instant now = Instant.now(); + if(s_vmwareCleanOldWorderVMs.value() && now.isAfter(end)) { + if(s_logger.isInfoEnabled()) { + s_logger.info("Worker VM expired, seconds elapsed: " + Duration.between(start,now).getSeconds()); + } + return true; + } + if (s_logger.isTraceEnabled()) { + s_logger.trace("Worker VM with tag '" + workerTag + "' does not need recycling, yet." + + "But in " + Duration.between(now,end).getSeconds() + " seconds, though"); + } return false; } diff --git a/server/src/com/cloud/storage/snapshot/SnapshotManagerImpl.java b/server/src/com/cloud/storage/snapshot/SnapshotManagerImpl.java index 92937ba35c6..f3ea88bdcbe 100755 --- a/server/src/com/cloud/storage/snapshot/SnapshotManagerImpl.java +++ b/server/src/com/cloud/storage/snapshot/SnapshotManagerImpl.java @@ -1114,8 +1114,17 @@ public class SnapshotManagerImpl extends MutualExclusiveIdsManagerBase implement } catch (Exception e) { s_logger.debug("post process snapshot failed", e); } + } catch (CloudRuntimeException cre) { + if(s_logger.isDebugEnabled()) { + s_logger.debug("Failed to create snapshot" + cre.getLocalizedMessage()); + } + _resourceLimitMgr.decrementResourceCount(snapshotOwner.getId(), ResourceType.snapshot); + _resourceLimitMgr.decrementResourceCount(snapshotOwner.getId(), ResourceType.secondary_storage, new Long(volume.getSize())); + throw cre; } catch (Exception e) { - s_logger.debug("Failed to create snapshot", 
e); + if(s_logger.isDebugEnabled()) { + s_logger.debug("Failed to create snapshot", e); + } _resourceLimitMgr.decrementResourceCount(snapshotOwner.getId(), ResourceType.snapshot); _resourceLimitMgr.decrementResourceCount(snapshotOwner.getId(), ResourceType.secondary_storage, new Long(volume.getSize())); throw new CloudRuntimeException("Failed to create snapshot", e);