// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. package com.cloud.resource; import java.util.ArrayList; import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; import javax.inject.Inject; import javax.naming.ConfigurationException; import org.apache.cloudstack.affinity.AffinityGroupProcessor; import org.apache.cloudstack.api.ApiCommandResourceType; import org.apache.cloudstack.api.command.admin.cluster.UpdateClusterCmd; import org.apache.cloudstack.api.command.admin.host.PrepareForMaintenanceCmd; import org.apache.cloudstack.api.command.admin.resource.StartRollingMaintenanceCmd; import org.apache.cloudstack.context.CallContext; import org.apache.cloudstack.framework.config.ConfigKey; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.lang3.ObjectUtils; import com.cloud.agent.AgentManager; import com.cloud.agent.api.Answer; import com.cloud.agent.api.RollingMaintenanceAnswer; import com.cloud.agent.api.RollingMaintenanceCommand; import com.cloud.alert.AlertManager; import com.cloud.capacity.CapacityManager; import com.cloud.dc.ClusterDetailsDao; import com.cloud.dc.ClusterDetailsVO; import com.cloud.deploy.DeployDestination; import com.cloud.event.ActionEventUtils; import com.cloud.event.EventVO; import com.cloud.exception.AgentUnavailableException; import com.cloud.exception.InvalidParameterValueException; import com.cloud.exception.OperationTimedoutException; import com.cloud.host.Host; import com.cloud.host.HostTagVO; import com.cloud.host.HostVO; import com.cloud.host.Status; import com.cloud.host.dao.HostDao; import com.cloud.host.dao.HostTagsDao; import com.cloud.hypervisor.Hypervisor; import com.cloud.org.Cluster; import com.cloud.org.Grouping; import com.cloud.service.ServiceOfferingVO; import com.cloud.service.dao.ServiceOfferingDao; import com.cloud.utils.Pair; import com.cloud.utils.StringUtils; import com.cloud.utils.Ternary; import com.cloud.utils.component.ManagerBase; import com.cloud.utils.exception.CloudRuntimeException; import com.cloud.vm.UserVmDetailVO; import com.cloud.vm.VMInstanceVO; import com.cloud.vm.VirtualMachine.State; import com.cloud.vm.VirtualMachineProfileImpl; import com.cloud.vm.VmDetailConstants; import com.cloud.vm.dao.UserVmDetailsDao; import com.cloud.vm.dao.VMInstanceDao; public class RollingMaintenanceManagerImpl extends ManagerBase implements RollingMaintenanceManager { @Inject private HostDao hostDao; @Inject private AgentManager agentManager; @Inject private ResourceManager resourceManager; @Inject private CapacityManager capacityManager; @Inject private VMInstanceDao vmInstanceDao; @Inject protected UserVmDetailsDao userVmDetailsDao; @Inject private ServiceOfferingDao serviceOfferingDao; @Inject private ClusterDetailsDao clusterDetailsDao; @Inject private HostTagsDao hostTagsDao; @Inject private AlertManager alertManager; protected List _affinityProcessors; public void setAffinityGroupProcessors(List affinityProcessors) { _affinityProcessors = affinityProcessors; } private Pair> getResourceTypeAndIdPair(List podIds, List clusterIds, List zoneIds, List hostIds) { Pair> pair = CollectionUtils.isNotEmpty(podIds) ? new Pair<>(ResourceType.Pod, podIds) : CollectionUtils.isNotEmpty(clusterIds) ? new Pair<>(ResourceType.Cluster, clusterIds) : CollectionUtils.isNotEmpty(zoneIds) ? new Pair<>(ResourceType.Zone, zoneIds) : CollectionUtils.isNotEmpty(hostIds) ? new Pair<>(ResourceType.Host, hostIds) : null; if (pair == null) { throw new CloudRuntimeException("Parameters podId, clusterId, zoneId, hostId are mutually exclusive, " + "please set only one of them"); } return pair; } @Override public boolean configure(String name, Map params) throws ConfigurationException { return true; } private void updateCluster(long clusterId, String allocationState) { Cluster cluster = resourceManager.getCluster(clusterId); if (cluster == null) { throw new InvalidParameterValueException("Unable to find the cluster by id=" + clusterId); } UpdateClusterCmd updateClusterCmd = new UpdateClusterCmd(); updateClusterCmd.setId(clusterId); updateClusterCmd.setAllocationState(allocationState); resourceManager.updateCluster(updateClusterCmd); } private void generateReportAndFinishingEvent(StartRollingMaintenanceCmd cmd, boolean success, String details, List hostsUpdated, List hostsSkipped) { Pair> pair = getResourceTypeIdPair(cmd); ResourceType entity = pair.first(); List ids = pair.second(); String cmdResourceType = ApiCommandResourceType.fromString(entity.name()) != null ? ApiCommandResourceType.fromString(entity.name()).toString() : null; String description = String.format("Success: %s, details: %s, hosts updated: %s, hosts skipped: %s", success, details, generateReportHostsUpdated(hostsUpdated), generateReportHostsSkipped(hostsSkipped)); ActionEventUtils.onCompletedActionEvent(CallContext.current().getCallingUserId(), CallContext.current().getCallingAccountId(), EventVO.LEVEL_INFO, cmd.getEventType(), "Completed rolling maintenance for entity " + entity + " with IDs: " + ids + " - " + description, ids.get(0), cmdResourceType, 0); } private String generateReportHostsUpdated(List hostsUpdated) { StringBuilder stringBuilder = new StringBuilder(); stringBuilder.append(hostsUpdated.size()); return stringBuilder.toString(); } private String generateReportHostsSkipped(List hostsSkipped) { StringBuilder stringBuilder = new StringBuilder(); stringBuilder.append(hostsSkipped.size()); return stringBuilder.toString(); } @Override public Ternary, List>> startRollingMaintenance(StartRollingMaintenanceCmd cmd) { Pair> pair = getResourceTypeAndIdPair(cmd.getPodIds(), cmd.getClusterIds(), cmd.getZoneIds(), cmd.getHostIds()); ResourceType type = pair.first(); List ids = pair.second(); int timeout = cmd.getTimeout() == null ? KvmRollingMaintenanceStageTimeout.value() : cmd.getTimeout(); String payload = cmd.getPayload(); Boolean forced = cmd.getForced(); Set disabledClusters = new HashSet<>(); Map hostsToAvoidMaintenance = new HashMap<>(); boolean success = false; String details = null; List hostsUpdated = new ArrayList<>(); List hostsSkipped = new ArrayList<>(); if (timeout <= KvmRollingMaintenancePingInterval.value()) { return new Ternary<>(success, "The timeout value provided must be greater or equal than the ping interval " + "defined in '" + KvmRollingMaintenancePingInterval.key() + "'", new Pair<>(hostsUpdated, hostsSkipped)); } try { Map> hostsByCluster = getHostsByClusterForRollingMaintenance(type, ids); for (Long clusterId : hostsByCluster.keySet()) { Cluster cluster = resourceManager.getCluster(clusterId); List hosts = hostsByCluster.get(clusterId); if (!isMaintenanceAllowedByVMStates(cluster, hosts, hostsSkipped)) { if (forced) { continue; } success = false; details = "VMs in invalid states in cluster: " + cluster.getUuid(); return new Ternary<>(success, details, new Pair<>(hostsUpdated, hostsSkipped)); } disableClusterIfEnabled(cluster, disabledClusters); logger.debug("State checks on the hosts in the cluster"); performStateChecks(cluster, hosts, forced, hostsSkipped); logger.debug("Checking hosts capacity before attempting rolling maintenance"); performCapacityChecks(cluster, hosts, forced); logger.debug("Attempting pre-flight stages on each host before starting rolling maintenance"); performPreFlightChecks(hosts, timeout, payload, forced, hostsToAvoidMaintenance); for (Host host: hosts) { Ternary hostResult = startRollingMaintenanceHostInCluster(cluster, host, timeout, payload, forced, hostsToAvoidMaintenance, hostsUpdated, hostsSkipped); if (hostResult.second()) { continue; } if (hostResult.first()) { success = false; details = hostResult.third(); return new Ternary<>(success, details, new Pair<>(hostsUpdated, hostsSkipped)); } } enableClusterIfDisabled(cluster, disabledClusters); } } catch (AgentUnavailableException | InterruptedException | CloudRuntimeException e) { String err = "Error starting rolling maintenance: " + e.getMessage(); logger.error(err, e); success = false; details = err; return new Ternary<>(success, details, new Pair<>(hostsUpdated, hostsSkipped)); } finally { // Enable back disabled clusters for (Long clusterId : disabledClusters) { Cluster cluster = resourceManager.getCluster(clusterId); if (cluster.getAllocationState() == Grouping.AllocationState.Disabled) { updateCluster(clusterId, "Enabled"); } } generateReportAndFinishingEvent(cmd, success, details, hostsUpdated, hostsSkipped); } success = true; details = "OK"; return new Ternary<>(success, details, new Pair<>(hostsUpdated, hostsSkipped)); } /** * Perform state checks on the hosts in a cluster */ protected void performStateChecks(Cluster cluster, List hosts, Boolean forced, List hostsSkipped) { List hostsToDrop = new ArrayList<>(); for (Host host : hosts) { if (host.getStatus() != Status.Up) { String msg = "Host " + host.getUuid() + " is not connected, state = " + host.getStatus().toString(); if (forced) { hostsSkipped.add(new HostSkipped(host, msg)); hostsToDrop.add(host); continue; } throw new CloudRuntimeException(msg); } if (host.getResourceState() != ResourceState.Enabled) { String msg = "Host " + host.getUuid() + " is not enabled, state = " + host.getResourceState().toString(); if (forced) { hostsSkipped.add(new HostSkipped(host, msg)); hostsToDrop.add(host); continue; } throw new CloudRuntimeException(msg); } } if (CollectionUtils.isNotEmpty(hostsToDrop)) { hosts.removeAll(hostsToDrop); } } /** * Do not allow rolling maintenance if there are VMs in Starting/Stopping/Migrating/Error/Unknown state */ private boolean isMaintenanceAllowedByVMStates(Cluster cluster, List hosts, List hostsSkipped) { for (Host host : hosts) { List notAllowedStates = vmInstanceDao.findByHostInStates(host.getId(), State.Starting, State.Stopping, State.Migrating, State.Error, State.Unknown); if (notAllowedStates.size() > 0) { String msg = "There are VMs in starting/stopping/migrating/error/unknown state, not allowing rolling maintenance in the cluster"; HostSkipped skipped = new HostSkipped(host, msg); hostsSkipped.add(skipped); return false; } } return true; } /** * Start rolling maintenance for a single host * @return tuple: (FAIL, SKIP, DETAILS), where: * - FAIL: True if rolling maintenance must fail * - SKIP: True if host must be skipped * - DETAILS: Information retrieved by the host */ private Ternary startRollingMaintenanceHostInCluster(Cluster cluster, Host host, int timeout, String payload, Boolean forced, Map hostsToAvoidMaintenance, List hostsUpdated, List hostsSkipped) throws InterruptedException, AgentUnavailableException { Ternary result; if (!isMaintenanceScriptDefinedOnHost(host, hostsSkipped)) { String msg = "There is no maintenance script on the host"; hostsSkipped.add(new HostSkipped(host, msg)); return new Ternary<>(false, true, msg); } result = performPreMaintenanceStageOnHost(host, timeout, payload, forced, hostsToAvoidMaintenance, hostsSkipped); if (result.first() || result.second()) { return result; } if (isMaintenanceStageAvoided(host, hostsToAvoidMaintenance, hostsSkipped)) { return new Ternary<>(false, true, "Maintenance stage must be avoided"); } logger.debug("Updating capacity before re-checking capacity"); alertManager.recalculateCapacity(); result = reCheckCapacityBeforeMaintenanceOnHost(cluster, host, forced, hostsSkipped); if (result.first() || result.second()) { return result; } Date startTime = new Date(); putHostIntoMaintenance(host); result = performMaintenanceStageOnHost(host, timeout, payload, forced, hostsToAvoidMaintenance, hostsSkipped); if (result.first() || result.second()) { cancelHostMaintenance(host); return result; } cancelHostMaintenance(host); Date endTime = new Date(); HostUpdated hostUpdated = new HostUpdated(host, startTime, endTime, result.third()); hostsUpdated.add(hostUpdated); result = performPostMaintenanceStageOnHost(host, timeout, payload, forced, hostsToAvoidMaintenance, hostsSkipped); if (result.first() || result.second()) { return result; } return new Ternary<>(false, false, "Completed rolling maintenance on host " + host.getUuid()); } /** * Perform Post-Maintenance stage on host * @return tuple: (FAIL, SKIP, DETAILS), where: * - FAIL: True if rolling maintenance must fail * - SKIP: True if host must be skipped * - DETAILS: Information retrieved by the host after executing the stage * @throws InterruptedException */ private Ternary performPostMaintenanceStageOnHost(Host host, int timeout, String payload, Boolean forced, Map hostsToAvoidMaintenance, List hostsSkipped) throws InterruptedException { Ternary result = performStageOnHost(host, Stage.PostMaintenance, timeout, payload, forced); if (!result.first()) { if (forced) { String msg = "Post-maintenance script failed: " + result.second(); hostsSkipped.add(new HostSkipped(host, msg)); return new Ternary<>(true, true, msg); } return new Ternary<>(true, false, result.second()); } return new Ternary<>(false, false, result.second()); } /** * Cancel maintenance mode on host * @param host host */ private void cancelHostMaintenance(Host host) { if (!resourceManager.cancelMaintenance(host.getId())) { String message = "Could not cancel maintenance on host " + host.getUuid(); logger.error(message); throw new CloudRuntimeException(message); } } /** * Perform Maintenance stage on host * @return tuple: (FAIL, SKIP, DETAILS), where: * - FAIL: True if rolling maintenance must fail * - SKIP: True if host must be skipped * - DETAILS: Information retrieved by the host after executing the stage * @throws InterruptedException */ private Ternary performMaintenanceStageOnHost(Host host, int timeout, String payload, Boolean forced, Map hostsToAvoidMaintenance, List hostsSkipped) throws InterruptedException { Ternary result = performStageOnHost(host, Stage.Maintenance, timeout, payload, forced); if (!result.first()) { if (forced) { String msg = "Maintenance script failed: " + result.second(); hostsSkipped.add(new HostSkipped(host, msg)); return new Ternary<>(true, true, msg); } return new Ternary<>(true, false, result.second()); } return new Ternary<>(false, false, result.second()); } /** * Puts host into maintenance and waits for its completion * @param host host * @throws InterruptedException * @throws AgentUnavailableException */ private void putHostIntoMaintenance(Host host) throws InterruptedException, AgentUnavailableException { logger.debug(String.format("Trying to set %s into maintenance", host)); PrepareForMaintenanceCmd cmd = new PrepareForMaintenanceCmd(); cmd.setId(host.getId()); resourceManager.maintain(cmd); waitForHostInMaintenance(host.getId()); } /** * Enable back disabled cluster * @param cluster cluster to enable if it has been disabled * @param disabledClusters set of disabled clusters */ private void enableClusterIfDisabled(Cluster cluster, Set disabledClusters) { if (cluster.getAllocationState() == Grouping.AllocationState.Disabled && disabledClusters.contains(cluster.getId())) { updateCluster(cluster.getId(), "Enabled"); } } /** * Re-check capacity to ensure the host can transit into maintenance state * @return tuple: (FAIL, SKIP, DETAILS), where: * - FAIL: True if rolling maintenance must fail * - SKIP: True if host must be skipped * - DETAILS: Information retrieved after capacity checks */ private Ternary reCheckCapacityBeforeMaintenanceOnHost(Cluster cluster, Host host, Boolean forced, List hostsSkipped) { Pair capacityCheckBeforeMaintenance = performCapacityChecksBeforeHostInMaintenance(host, cluster); if (!capacityCheckBeforeMaintenance.first()) { String errorMsg = String.format("Capacity check failed for %s: %s", host, capacityCheckBeforeMaintenance.second()); if (forced) { logger.info(String.format("Skipping %s as: %s", host, errorMsg)); hostsSkipped.add(new HostSkipped(host, errorMsg)); return new Ternary<>(true, true, capacityCheckBeforeMaintenance.second()); } return new Ternary<>(true, false, capacityCheckBeforeMaintenance.second()); } return new Ternary<>(false, false, capacityCheckBeforeMaintenance.second()); } /** * Indicates if the maintenance stage must be avoided */ private boolean isMaintenanceStageAvoided(Host host, Map hostsToAvoidMaintenance, List hostsSkipped) { if (hostsToAvoidMaintenance.containsKey(host.getId())) { HostSkipped hostSkipped = new HostSkipped(host, hostsToAvoidMaintenance.get(host.getId())); hostsSkipped.add(hostSkipped); logger.debug(String.format("%s is in avoid maintenance list [hosts skipped: %d], skipping its maintenance.", host, hostsSkipped.size())); return true; } return false; } /** * Perform Pre-Maintenance stage on host * @return tuple: (FAIL, SKIP, DETAILS), where: * - FAIL: True if rolling maintenance must fail * - SKIP: True if host must be skipped * - DETAILS: Information retrieved by the host after executing the stage * @throws InterruptedException */ private Ternary performPreMaintenanceStageOnHost(Host host, int timeout, String payload, Boolean forced, Map hostsToAvoidMaintenance, List hostsSkipped) throws InterruptedException { Ternary result = performStageOnHost(host, Stage.PreMaintenance, timeout, payload, forced); if (!result.first()) { if (forced) { String msg = "Pre-maintenance script failed: " + result.second(); hostsSkipped.add(new HostSkipped(host, msg)); return new Ternary<>(true, true, result.second()); } return new Ternary<>(true, false, result.second()); } if (result.third() && !hostsToAvoidMaintenance.containsKey(host.getId())) { logHostAddedToAvoidMaintenanceSet(host); hostsToAvoidMaintenance.put(host.getId(), "Pre-maintenance stage set to avoid maintenance"); } return new Ternary<>(false, false, result.second()); } /** * Disable cluster (if hasn't been disabled yet) * @param cluster cluster to disable * @param disabledClusters set of disabled cluster ids. cluster is added if it is disabled */ private void disableClusterIfEnabled(Cluster cluster, Set disabledClusters) { if (cluster.getAllocationState() == Grouping.AllocationState.Enabled && !disabledClusters.contains(cluster.getId())) { updateCluster(cluster.getId(), "Disabled"); disabledClusters.add(cluster.getId()); } } private boolean isMaintenanceScriptDefinedOnHost(Host host, List hostsSkipped) { try { RollingMaintenanceAnswer answer = (RollingMaintenanceAnswer) agentManager.send(host.getId(), new RollingMaintenanceCommand(true)); return answer.isMaintenaceScriptDefined(); } catch (AgentUnavailableException | OperationTimedoutException e) { String msg = String.format("Could not check for maintenance script on %s due to: %s", host, e.getMessage()); logger.error(msg, e); return false; } } /** * Execute stage on host * @return tuple: (SUCCESS, DETAILS, AVOID_MAINTENANCE) where: * - SUCCESS: True if stage is successful * - DETAILS: Information retrieved by the host after executing the stage * - AVOID_MAINTENANCE: True if maintenance stage must be avoided */ private Ternary performStageOnHost(Host host, Stage stage, int timeout, String payload, Boolean forced) throws InterruptedException { Ternary result = sendRollingMaintenanceCommandToHost(host, stage, timeout, payload); if (!result.first() && !forced) { throw new CloudRuntimeException("Stage: " + stage.toString() + " failed on host " + host.getUuid() + ": " + result.second()); } return result; } /** * Send rolling maintenance command to a host to perform a certain stage specified in cmd * @return tuple: (SUCCESS, DETAILS, AVOID_MAINTENANCE) where: * - SUCCESS: True if stage is successful * - DETAILS: Information retrieved by the host after executing the stage * - AVOID_MAINTENANCE: True if maintenance stage must be avoided */ private Ternary sendRollingMaintenanceCommandToHost(Host host, Stage stage, int timeout, String payload) throws InterruptedException { boolean completed = false; Answer answer = null; long timeSpent = 0L; long pingInterval = KvmRollingMaintenancePingInterval.value() * 1000L; boolean avoidMaintenance = false; RollingMaintenanceCommand cmd = new RollingMaintenanceCommand(stage.toString()); cmd.setWait(timeout); cmd.setPayload(payload); while (!completed && timeSpent < timeout * 1000L) { try { answer = agentManager.send(host.getId(), cmd); } catch (AgentUnavailableException | OperationTimedoutException e) { // Agent may be restarted on the scripts - continue polling until it is up String msg = String.format("Cannot send command to %s, waiting %sms - %s", host, pingInterval, e.getMessage()); logger.warn(msg, e); cmd.setStarted(true); Thread.sleep(pingInterval); timeSpent += pingInterval; continue; } cmd.setStarted(true); RollingMaintenanceAnswer rollingMaintenanceAnswer = (RollingMaintenanceAnswer) answer; completed = rollingMaintenanceAnswer.isFinished(); if (!completed) { Thread.sleep(pingInterval); timeSpent += pingInterval; } else { avoidMaintenance = rollingMaintenanceAnswer.isAvoidMaintenance(); } } if (timeSpent >= timeout * 1000L) { return new Ternary<>(false, "Timeout exceeded for rolling maintenance on host " + host.getUuid() + " and stage " + stage.toString(), avoidMaintenance); } return new Ternary<>(answer.getResult(), answer.getDetails(), avoidMaintenance); } /** * Pre flight checks on hosts */ private void performPreFlightChecks(List hosts, int timeout, String payload, Boolean forced, Map hostsToAvoidMaintenance) throws InterruptedException { for (Host host : hosts) { Ternary result = performStageOnHost(host, Stage.PreFlight, timeout, payload, forced); if (result.third() && !hostsToAvoidMaintenance.containsKey(host.getId())) { logHostAddedToAvoidMaintenanceSet(host); hostsToAvoidMaintenance.put(host.getId(), "Pre-flight stage set to avoid maintenance"); } } } private void logHostAddedToAvoidMaintenanceSet(Host host) { logger.debug(String.format("%s added to the avoid maintenance set.", host)); } /** * Capacity checks on hosts */ private void performCapacityChecks(Cluster cluster, List hosts, Boolean forced) { for (Host host : hosts) { Pair result = performCapacityChecksBeforeHostInMaintenance(host, cluster); if (!result.first() && !forced) { throw new CloudRuntimeException(String.format("Capacity check failed for %s : %s", host, result.second())); } } } /** * Check if there is enough capacity for host to enter maintenance */ private Pair performCapacityChecksBeforeHostInMaintenance(Host host, Cluster cluster) { List hosts = hostDao.findByClusterId(cluster.getId()); List hostsInCluster = hosts.stream() .filter(x -> x.getId() != host.getId() && x.getClusterId().equals(cluster.getId()) && x.getResourceState() == ResourceState.Enabled && x.getStatus() == Status.Up) .collect(Collectors.toList()); if (CollectionUtils.isEmpty(hostsInCluster)) { throw new CloudRuntimeException("No host available in cluster " + cluster.getUuid() + " (" + cluster.getName() + ") to support host " + host.getUuid() + " (" + host.getName() + ") in maintenance"); } List vmsRunning = vmInstanceDao.listByHostId(host.getId()); if (CollectionUtils.isEmpty(vmsRunning)) { return new Pair<>(true, "OK"); } List hostTags = hostTagsDao.getHostTags(host.getId()); int successfullyCheckedVmMigrations = 0; for (VMInstanceVO runningVM : vmsRunning) { boolean canMigrateVm = false; Ternary cpuSpeedAndRamSize = getComputeResourcesCpuSpeedAndRamSize(runningVM); Integer cpu = cpuSpeedAndRamSize.first(); Integer speed = cpuSpeedAndRamSize.second(); Integer ramSize = cpuSpeedAndRamSize.third(); if (ObjectUtils.anyNull(cpu, speed, ramSize)) { logger.warn("Cannot fetch compute resources for the VM {}, skipping it from the capacity check", runningVM); continue; } ServiceOfferingVO serviceOffering = serviceOfferingDao.findById(runningVM.getServiceOfferingId()); for (Host hostInCluster : hostsInCluster) { if (!checkHostTags(hostTags, hostTagsDao.getHostTags(hostInCluster.getId()), serviceOffering.getHostTag())) { logger.debug("Host tags mismatch between {} and {} Skipping it from the capacity check", host, hostInCluster); continue; } DeployDestination deployDestination = new DeployDestination(null, null, null, host); VirtualMachineProfileImpl vmProfile = new VirtualMachineProfileImpl(runningVM); boolean affinityChecks = true; for (AffinityGroupProcessor affinityProcessor : _affinityProcessors) { affinityChecks = affinityChecks && affinityProcessor.check(vmProfile, deployDestination); } if (!affinityChecks) { logger.debug("Affinity check failed between {} and {} Skipping it from the capacity check", host, hostInCluster); continue; } boolean maxGuestLimit = capacityManager.checkIfHostReachMaxGuestLimit(host); boolean hostHasCPUCapacity = capacityManager.checkIfHostHasCpuCapability(hostInCluster.getId(), cpu, speed); int cpuRequested = cpu * speed; long ramRequested = ramSize * 1024L * 1024L; ClusterDetailsVO clusterDetailsCpuOvercommit = clusterDetailsDao.findDetail(cluster.getId(), "cpuOvercommitRatio"); ClusterDetailsVO clusterDetailsRamOvercommmt = clusterDetailsDao.findDetail(cluster.getId(), "memoryOvercommitRatio"); Float cpuOvercommitRatio = Float.parseFloat(clusterDetailsCpuOvercommit.getValue()); Float memoryOvercommitRatio = Float.parseFloat(clusterDetailsRamOvercommmt.getValue()); boolean hostHasCapacity = capacityManager.checkIfHostHasCapacity(hostInCluster, cpuRequested, ramRequested, false, cpuOvercommitRatio, memoryOvercommitRatio, false); if (!maxGuestLimit && hostHasCPUCapacity && hostHasCapacity) { canMigrateVm = true; break; } } if (!canMigrateVm) { String msg = String.format("%s cannot be migrated away from %s to any other host in the cluster", runningVM, host); logger.error(msg); return new Pair<>(false, msg); } successfullyCheckedVmMigrations++; } if (successfullyCheckedVmMigrations != vmsRunning.size()) { String migrationCheckDetails = String.format("%s cannot enter maintenance mode as capacity check failed for hosts in cluster %s", host, cluster); return new Pair<>(false, migrationCheckDetails); } return new Pair<>(true, "OK"); } protected Ternary getComputeResourcesCpuSpeedAndRamSize(VMInstanceVO runningVM) { ServiceOfferingVO serviceOffering = serviceOfferingDao.findById(runningVM.getServiceOfferingId()); Integer cpu = serviceOffering.getCpu(); Integer speed = serviceOffering.getSpeed(); Integer ramSize = serviceOffering.getRamSize(); if (!serviceOffering.isDynamic()) { return new Ternary<>(cpu, speed, ramSize); } List vmDetails = userVmDetailsDao.listDetails(runningVM.getId()); if (CollectionUtils.isEmpty(vmDetails)) { return new Ternary<>(cpu, speed, ramSize); } for (UserVmDetailVO vmDetail : vmDetails) { if (StringUtils.isBlank(vmDetail.getName()) || StringUtils.isBlank(vmDetail.getValue())) { continue; } if (cpu == null && VmDetailConstants.CPU_NUMBER.equals(vmDetail.getName())) { cpu = Integer.valueOf(vmDetail.getValue()); } else if (speed == null && VmDetailConstants.CPU_SPEED.equals(vmDetail.getName())) { speed = Integer.valueOf(vmDetail.getValue()); } else if (ramSize == null && VmDetailConstants.MEMORY.equals(vmDetail.getName())) { ramSize = Integer.valueOf(vmDetail.getValue()); } } return new Ternary<>(cpu, speed, ramSize); } /** * Check hosts tags */ private boolean checkHostTags(List hostTags, List hostInClusterTags, String offeringTag) { if ((CollectionUtils.isEmpty(hostTags) && CollectionUtils.isEmpty(hostInClusterTags)) || StringUtils.isBlank(offeringTag)) { return true; } else if ((CollectionUtils.isNotEmpty(hostTags) && CollectionUtils.isEmpty(hostInClusterTags)) || (CollectionUtils.isEmpty(hostTags) && CollectionUtils.isNotEmpty(hostInClusterTags))) { return false; } else { return hostInClusterTags.parallelStream().anyMatch(hostTagVO -> offeringTag.equals(hostTagVO.getTag())); } } /** * Retrieve all the hosts in 'Up' state within the scope for starting rolling maintenance */ protected Map> getHostsByClusterForRollingMaintenance(ResourceType type, List ids) { Set hosts = new HashSet<>(); List hostsInScope = null; for (Long id : ids) { if (type == ResourceType.Host) { hostsInScope = Collections.singletonList(hostDao.findById(id)); } else if (type == ResourceType.Cluster) { hostsInScope = hostDao.findByClusterId(id); } else if (type == ResourceType.Pod) { hostsInScope = hostDao.findByPodId(id); } else if (type == ResourceType.Zone) { hostsInScope = hostDao.findByDataCenterId(id); } List hostsUp = hostsInScope.stream() .filter(x -> x.getHypervisorType() == Hypervisor.HypervisorType.KVM) .collect(Collectors.toList()); hosts.addAll(hostsUp); } return hosts.stream().collect(Collectors.groupingBy(Host::getClusterId)); } @Override public Pair> getResourceTypeIdPair(StartRollingMaintenanceCmd cmd) { return getResourceTypeAndIdPair(cmd.getPodIds(), cmd.getClusterIds(), cmd.getZoneIds(), cmd.getHostIds()); } /* Wait for to be in maintenance mode */ private void waitForHostInMaintenance(long hostId) throws CloudRuntimeException, InterruptedException { HostVO host = hostDao.findById(hostId); long timeout = KvmRollingMaintenanceWaitForMaintenanceTimeout.value() * 1000L; long timeSpent = 0; long step = 30 * 1000L; while (timeSpent < timeout && host.getResourceState() != ResourceState.Maintenance) { Thread.sleep(step); timeSpent += step; host = hostDao.findById(hostId); } if (host.getResourceState() != ResourceState.Maintenance) { String errorMsg = "Timeout: waited " + timeout + "ms for host " + host.getUuid() + "(" + host.getName() + ")" + " to be in Maintenance state, but after timeout it is in " + host.getResourceState().toString() + " state"; logger.error(errorMsg); throw new CloudRuntimeException(errorMsg); } logger.debug("Host " + host.getUuid() + "(" + host.getName() + ") is in maintenance"); } @Override public String getConfigComponentName() { return RollingMaintenanceManagerImpl.class.getSimpleName(); } @Override public ConfigKey[] getConfigKeys() { return new ConfigKey[] {KvmRollingMaintenanceStageTimeout, KvmRollingMaintenancePingInterval, KvmRollingMaintenanceWaitForMaintenanceTimeout}; } }