Merge pull request #2030 from shapeblue/snapshot-housekeeping

CLOUDSTACK-9864 cleanup stale worker VMs after job expiry time
This commit is contained in:
Rajani Karuturi 2017-05-02 11:08:50 +05:30 committed by GitHub
commit 57628b2dd0
5 changed files with 43 additions and 15 deletions

View File

@ -1993,8 +1993,13 @@ public class VolumeServiceImpl implements VolumeService {
SnapshotInfo snapshot = null;
try {
snapshot = snapshotMgr.takeSnapshot(volume);
} catch (CloudRuntimeException cre) {
s_logger.error("Take snapshot: " + volume.getId() + " failed", cre);
throw cre;
} catch (Exception e) {
s_logger.debug("Take snapshot: " + volume.getId() + " failed", e);
if(s_logger.isDebugEnabled()) {
s_logger.debug("unknown exception while taking snapshot for volume " + volume.getId() + " was caught", e);
}
throw new CloudRuntimeException("Failed to take snapshot", e);
}

View File

@ -85,9 +85,9 @@ import com.cloud.vm.dao.VMInstanceDao;
public class AsyncJobManagerImpl extends ManagerBase implements AsyncJobManager, ClusterManagerListener, Configurable {
// Advanced
private static final ConfigKey<Long> JobExpireMinutes = new ConfigKey<Long>("Advanced", Long.class, "job.expire.minutes", "1440",
public static final ConfigKey<Long> JobExpireMinutes = new ConfigKey<Long>("Advanced", Long.class, "job.expire.minutes", "1440",
"Time (in minutes) for async-jobs to be kept in system", true, ConfigKey.Scope.Global);
private static final ConfigKey<Long> JobCancelThresholdMinutes = new ConfigKey<Long>("Advanced", Long.class, "job.cancel.threshold.minutes", "60",
public static final ConfigKey<Long> JobCancelThresholdMinutes = new ConfigKey<Long>("Advanced", Long.class, "job.cancel.threshold.minutes", "60",
"Time (in minutes) for async-jobs to be forcely cancelled if it has been in process for long", true, ConfigKey.Scope.Global);
private static final ConfigKey<Integer> VmJobLockTimeout = new ConfigKey<Integer>("Advanced",
Integer.class, "vm.job.lock.timeout", "1800",

View File

@ -35,6 +35,9 @@ public interface VmwareManager {
public static final ConfigKey<Long> s_vmwareNicHotplugWaitTimeout = new ConfigKey<Long>("Advanced", Long.class, "vmware.nic.hotplug.wait.timeout", "15000",
"Wait timeout (milli seconds) for hot plugged NIC of VM to be detected by guest OS.", false, ConfigKey.Scope.Global);
public static final ConfigKey<Boolean> s_vmwareCleanOldWorderVMs = new ConfigKey<Boolean>("Advanced", Boolean.class, "vmware.clean.old.worker.vms", "false",
"If a worker vm is older then twice the 'job.expire.minutes' + 'job.cancel.threshold.minutes' , remove it.", true, ConfigKey.Scope.Global);
String composeWorkerName();
String getSystemVMIsoFileNameOnDatastore();

View File

@ -22,6 +22,8 @@ import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.rmi.RemoteException;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@ -35,6 +37,7 @@ import java.util.concurrent.TimeUnit;
import javax.inject.Inject;
import javax.naming.ConfigurationException;
import org.apache.cloudstack.framework.jobs.impl.AsyncJobManagerImpl;
import org.apache.log4j.Logger;
import com.vmware.vim25.AboutInfo;
@ -128,6 +131,7 @@ import com.cloud.vm.DomainRouterVO;
public class VmwareManagerImpl extends ManagerBase implements VmwareManager, VmwareStorageMount, Listener, VmwareDatacenterService, Configurable {
private static final Logger s_logger = Logger.getLogger(VmwareManagerImpl.class);
private static final long SECONDS_PER_MINUTE = 60;
private static final int STARTUP_DELAY = 60000; // 60 seconds
private static final long DEFAULT_HOST_SCAN_INTERVAL = 600000; // every 10 minutes
private long _hostScanInterval = DEFAULT_HOST_SCAN_INTERVAL;
@ -212,7 +216,7 @@ public class VmwareManagerImpl extends ManagerBase implements VmwareManager, Vmw
@Override
public ConfigKey<?>[] getConfigKeys() {
return new ConfigKey<?>[] {s_vmwareNicHotplugWaitTimeout};
return new ConfigKey<?>[] {s_vmwareNicHotplugWaitTimeout, s_vmwareCleanOldWorderVMs};
}
@Override
@ -534,7 +538,7 @@ public class VmwareManagerImpl extends ManagerBase implements VmwareManager, Vmw
return false;
}
Long.parseLong(tokens[0]);
long startTick = Long.parseLong(tokens[0]);
long msid = Long.parseLong(tokens[1]);
long runid = Long.parseLong(tokens[2]);
@ -550,15 +554,22 @@ public class VmwareManagerImpl extends ManagerBase implements VmwareManager, Vmw
return true;
}
// disable time-out check until we have found out a VMware API that can check if
// there are pending tasks on the subject VM
/*
if(System.currentTimeMillis() - startTick > _hungWorkerTimeout) {
if(s_logger.isInfoEnabled())
s_logger.info("Worker VM expired, seconds elapsed: " + (System.currentTimeMillis() - startTick) / 1000);
return true;
}
*/
// this time-out check was disabled
// "until we have found out a VMware API that can check if there are pending tasks on the subject VM"
// but as we expire jobs and those stale worker VMs stay around until an MS reboot we opt in to have them removed anyway
Instant start = Instant.ofEpochMilli(startTick);
Instant end = start.plusSeconds(2 * (AsyncJobManagerImpl.JobExpireMinutes.value() + AsyncJobManagerImpl.JobCancelThresholdMinutes.value()) * SECONDS_PER_MINUTE);
Instant now = Instant.now();
if(s_vmwareCleanOldWorderVMs.value() && now.isAfter(end)) {
if(s_logger.isInfoEnabled()) {
s_logger.info("Worker VM expired, seconds elapsed: " + Duration.between(start,now).getSeconds());
}
return true;
}
if (s_logger.isTraceEnabled()) {
s_logger.trace("Worker VM with tag '" + workerTag + "' does not need recycling, yet." +
"But in " + Duration.between(now,end).getSeconds() + " seconds, though");
}
return false;
}

View File

@ -1114,8 +1114,17 @@ public class SnapshotManagerImpl extends MutualExclusiveIdsManagerBase implement
} catch (Exception e) {
s_logger.debug("post process snapshot failed", e);
}
} catch (CloudRuntimeException cre) {
if(s_logger.isDebugEnabled()) {
s_logger.debug("Failed to create snapshot" + cre.getLocalizedMessage());
}
_resourceLimitMgr.decrementResourceCount(snapshotOwner.getId(), ResourceType.snapshot);
_resourceLimitMgr.decrementResourceCount(snapshotOwner.getId(), ResourceType.secondary_storage, new Long(volume.getSize()));
throw cre;
} catch (Exception e) {
s_logger.debug("Failed to create snapshot", e);
if(s_logger.isDebugEnabled()) {
s_logger.debug("Failed to create snapshot", e);
}
_resourceLimitMgr.decrementResourceCount(snapshotOwner.getId(), ResourceType.snapshot);
_resourceLimitMgr.decrementResourceCount(snapshotOwner.getId(), ResourceType.secondary_storage, new Long(volume.getSize()));
throw new CloudRuntimeException("Failed to create snapshot", e);