mirror of
https://github.com/apache/cloudstack.git
synced 2025-10-26 08:42:29 +01:00
Merge pull request #2030 from shapeblue/snapshot-housekeeping
CLOUDSTACK-9864 cleanup stale worker VMs after job expiry time
This commit is contained in:
commit
57628b2dd0
@ -1993,8 +1993,13 @@ public class VolumeServiceImpl implements VolumeService {
|
||||
SnapshotInfo snapshot = null;
|
||||
try {
|
||||
snapshot = snapshotMgr.takeSnapshot(volume);
|
||||
} catch (CloudRuntimeException cre) {
|
||||
s_logger.error("Take snapshot: " + volume.getId() + " failed", cre);
|
||||
throw cre;
|
||||
} catch (Exception e) {
|
||||
s_logger.debug("Take snapshot: " + volume.getId() + " failed", e);
|
||||
if(s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("unknown exception while taking snapshot for volume " + volume.getId() + " was caught", e);
|
||||
}
|
||||
throw new CloudRuntimeException("Failed to take snapshot", e);
|
||||
}
|
||||
|
||||
|
||||
@ -85,9 +85,9 @@ import com.cloud.vm.dao.VMInstanceDao;
|
||||
|
||||
public class AsyncJobManagerImpl extends ManagerBase implements AsyncJobManager, ClusterManagerListener, Configurable {
|
||||
// Advanced
|
||||
private static final ConfigKey<Long> JobExpireMinutes = new ConfigKey<Long>("Advanced", Long.class, "job.expire.minutes", "1440",
|
||||
public static final ConfigKey<Long> JobExpireMinutes = new ConfigKey<Long>("Advanced", Long.class, "job.expire.minutes", "1440",
|
||||
"Time (in minutes) for async-jobs to be kept in system", true, ConfigKey.Scope.Global);
|
||||
private static final ConfigKey<Long> JobCancelThresholdMinutes = new ConfigKey<Long>("Advanced", Long.class, "job.cancel.threshold.minutes", "60",
|
||||
public static final ConfigKey<Long> JobCancelThresholdMinutes = new ConfigKey<Long>("Advanced", Long.class, "job.cancel.threshold.minutes", "60",
|
||||
"Time (in minutes) for async-jobs to be forcely cancelled if it has been in process for long", true, ConfigKey.Scope.Global);
|
||||
private static final ConfigKey<Integer> VmJobLockTimeout = new ConfigKey<Integer>("Advanced",
|
||||
Integer.class, "vm.job.lock.timeout", "1800",
|
||||
|
||||
@ -35,6 +35,9 @@ public interface VmwareManager {
|
||||
public static final ConfigKey<Long> s_vmwareNicHotplugWaitTimeout = new ConfigKey<Long>("Advanced", Long.class, "vmware.nic.hotplug.wait.timeout", "15000",
|
||||
"Wait timeout (milli seconds) for hot plugged NIC of VM to be detected by guest OS.", false, ConfigKey.Scope.Global);
|
||||
|
||||
public static final ConfigKey<Boolean> s_vmwareCleanOldWorderVMs = new ConfigKey<Boolean>("Advanced", Boolean.class, "vmware.clean.old.worker.vms", "false",
|
||||
"If a worker vm is older then twice the 'job.expire.minutes' + 'job.cancel.threshold.minutes' , remove it.", true, ConfigKey.Scope.Global);
|
||||
|
||||
String composeWorkerName();
|
||||
|
||||
String getSystemVMIsoFileNameOnDatastore();
|
||||
|
||||
@ -22,6 +22,8 @@ import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.net.URL;
|
||||
import java.rmi.RemoteException;
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
@ -35,6 +37,7 @@ import java.util.concurrent.TimeUnit;
|
||||
import javax.inject.Inject;
|
||||
import javax.naming.ConfigurationException;
|
||||
|
||||
import org.apache.cloudstack.framework.jobs.impl.AsyncJobManagerImpl;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import com.vmware.vim25.AboutInfo;
|
||||
@ -128,6 +131,7 @@ import com.cloud.vm.DomainRouterVO;
|
||||
public class VmwareManagerImpl extends ManagerBase implements VmwareManager, VmwareStorageMount, Listener, VmwareDatacenterService, Configurable {
|
||||
private static final Logger s_logger = Logger.getLogger(VmwareManagerImpl.class);
|
||||
|
||||
private static final long SECONDS_PER_MINUTE = 60;
|
||||
private static final int STARTUP_DELAY = 60000; // 60 seconds
|
||||
private static final long DEFAULT_HOST_SCAN_INTERVAL = 600000; // every 10 minutes
|
||||
private long _hostScanInterval = DEFAULT_HOST_SCAN_INTERVAL;
|
||||
@ -212,7 +216,7 @@ public class VmwareManagerImpl extends ManagerBase implements VmwareManager, Vmw
|
||||
|
||||
@Override
|
||||
public ConfigKey<?>[] getConfigKeys() {
|
||||
return new ConfigKey<?>[] {s_vmwareNicHotplugWaitTimeout};
|
||||
return new ConfigKey<?>[] {s_vmwareNicHotplugWaitTimeout, s_vmwareCleanOldWorderVMs};
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -534,7 +538,7 @@ public class VmwareManagerImpl extends ManagerBase implements VmwareManager, Vmw
|
||||
return false;
|
||||
}
|
||||
|
||||
Long.parseLong(tokens[0]);
|
||||
long startTick = Long.parseLong(tokens[0]);
|
||||
long msid = Long.parseLong(tokens[1]);
|
||||
long runid = Long.parseLong(tokens[2]);
|
||||
|
||||
@ -550,15 +554,22 @@ public class VmwareManagerImpl extends ManagerBase implements VmwareManager, Vmw
|
||||
return true;
|
||||
}
|
||||
|
||||
// disable time-out check until we have found out a VMware API that can check if
|
||||
// there are pending tasks on the subject VM
|
||||
/*
|
||||
if(System.currentTimeMillis() - startTick > _hungWorkerTimeout) {
|
||||
if(s_logger.isInfoEnabled())
|
||||
s_logger.info("Worker VM expired, seconds elapsed: " + (System.currentTimeMillis() - startTick) / 1000);
|
||||
return true;
|
||||
}
|
||||
*/
|
||||
// this time-out check was disabled
|
||||
// "until we have found out a VMware API that can check if there are pending tasks on the subject VM"
|
||||
// but as we expire jobs and those stale worker VMs stay around until an MS reboot, we opt in to have them removed anyway
|
||||
Instant start = Instant.ofEpochMilli(startTick);
|
||||
Instant end = start.plusSeconds(2 * (AsyncJobManagerImpl.JobExpireMinutes.value() + AsyncJobManagerImpl.JobCancelThresholdMinutes.value()) * SECONDS_PER_MINUTE);
|
||||
Instant now = Instant.now();
|
||||
if(s_vmwareCleanOldWorderVMs.value() && now.isAfter(end)) {
|
||||
if(s_logger.isInfoEnabled()) {
|
||||
s_logger.info("Worker VM expired, seconds elapsed: " + Duration.between(start,now).getSeconds());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
if (s_logger.isTraceEnabled()) {
|
||||
s_logger.trace("Worker VM with tag '" + workerTag + "' does not need recycling, yet." +
|
||||
"But in " + Duration.between(now,end).getSeconds() + " seconds, though");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@ -1114,8 +1114,17 @@ public class SnapshotManagerImpl extends MutualExclusiveIdsManagerBase implement
|
||||
} catch (Exception e) {
|
||||
s_logger.debug("post process snapshot failed", e);
|
||||
}
|
||||
} catch (CloudRuntimeException cre) {
|
||||
if(s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Failed to create snapshot" + cre.getLocalizedMessage());
|
||||
}
|
||||
_resourceLimitMgr.decrementResourceCount(snapshotOwner.getId(), ResourceType.snapshot);
|
||||
_resourceLimitMgr.decrementResourceCount(snapshotOwner.getId(), ResourceType.secondary_storage, new Long(volume.getSize()));
|
||||
throw cre;
|
||||
} catch (Exception e) {
|
||||
s_logger.debug("Failed to create snapshot", e);
|
||||
if(s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Failed to create snapshot", e);
|
||||
}
|
||||
_resourceLimitMgr.decrementResourceCount(snapshotOwner.getId(), ResourceType.snapshot);
|
||||
_resourceLimitMgr.decrementResourceCount(snapshotOwner.getId(), ResourceType.secondary_storage, new Long(volume.getSize()));
|
||||
throw new CloudRuntimeException("Failed to create snapshot", e);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user