mirror of
https://github.com/apache/cloudstack.git
synced 2025-11-02 20:02:29 +01:00
CLOUDSTACK-2918: In a scaled-up environment, hosts fail to come up after a management server restart in a clustered setup
Summary of changes in the fix: (1) Optimized the host scan logic: instead of iterating over each cluster individually, the host scan is now done for a batch of clusters. (2) Made the host scan task interval configurable.
This commit is contained in:
parent
06f73092fb
commit
aed49572c7
@ -21,6 +21,7 @@ import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TimeZone;
|
||||
@ -116,6 +117,7 @@ public class HostDaoImpl extends GenericDaoBase<HostVO, Long> implements HostDao
|
||||
|
||||
protected SearchBuilder<HostVO> HostsForReconnectSearch;
|
||||
protected GenericSearchBuilder<HostVO, Long> ClustersOwnedByMSSearch;
|
||||
protected GenericSearchBuilder<HostVO, Long> ClustersForHostsNotOwnedByAnyMSSearch;
|
||||
protected GenericSearchBuilder<ClusterVO, Long> AllClustersSearch;
|
||||
protected SearchBuilder<HostVO> HostsInClusterSearch;
|
||||
|
||||
@ -264,7 +266,7 @@ public class HostDaoImpl extends GenericDaoBase<HostVO, Long> implements HostDao
|
||||
UnmanagedDirectConnectSearch.and("server", UnmanagedDirectConnectSearch.entity().getManagementServerId(), SearchCriteria.Op.NULL);
|
||||
UnmanagedDirectConnectSearch.and("lastPinged", UnmanagedDirectConnectSearch.entity().getLastPinged(), SearchCriteria.Op.LTEQ);
|
||||
UnmanagedDirectConnectSearch.and("resourceStates", UnmanagedDirectConnectSearch.entity().getResourceState(), SearchCriteria.Op.NIN);
|
||||
UnmanagedDirectConnectSearch.and("cluster", UnmanagedDirectConnectSearch.entity().getClusterId(), SearchCriteria.Op.EQ);
|
||||
UnmanagedDirectConnectSearch.and("clusterIn", UnmanagedDirectConnectSearch.entity().getClusterId(), SearchCriteria.Op.IN);
|
||||
/*
|
||||
* UnmanagedDirectConnectSearch.op(SearchCriteria.Op.OR, "managementServerId",
|
||||
* UnmanagedDirectConnectSearch.entity().getManagementServerId(), SearchCriteria.Op.EQ);
|
||||
@ -353,6 +355,13 @@ public class HostDaoImpl extends GenericDaoBase<HostVO, Long> implements HostDao
|
||||
ClustersOwnedByMSSearch.and("server", ClustersOwnedByMSSearch.entity().getManagementServerId(), SearchCriteria.Op.EQ);
|
||||
ClustersOwnedByMSSearch.done();
|
||||
|
||||
ClustersForHostsNotOwnedByAnyMSSearch = createSearchBuilder(Long.class);
|
||||
ClustersForHostsNotOwnedByAnyMSSearch.select(null, Func.DISTINCT, ClustersForHostsNotOwnedByAnyMSSearch.entity().getClusterId());
|
||||
ClustersForHostsNotOwnedByAnyMSSearch.and("resource", ClustersForHostsNotOwnedByAnyMSSearch.entity().getResource(), SearchCriteria.Op.NNULL);
|
||||
ClustersForHostsNotOwnedByAnyMSSearch.and("cluster", ClustersForHostsNotOwnedByAnyMSSearch.entity().getClusterId(), SearchCriteria.Op.NNULL);
|
||||
ClustersForHostsNotOwnedByAnyMSSearch.and("server", ClustersForHostsNotOwnedByAnyMSSearch.entity().getManagementServerId(), SearchCriteria.Op.NULL);
|
||||
ClustersForHostsNotOwnedByAnyMSSearch.done();
|
||||
|
||||
AllClustersSearch = _clusterDao.createSearchBuilder(Long.class);
|
||||
AllClustersSearch.select(null, Func.NATIVE, AllClustersSearch.entity().getId());
|
||||
AllClustersSearch.and("managed", AllClustersSearch.entity().getManagedState(), SearchCriteria.Op.EQ);
|
||||
@ -409,10 +418,17 @@ public class HostDaoImpl extends GenericDaoBase<HostVO, Long> implements HostDao
|
||||
sc.setParameters("lastPinged", lastPingSecondsAfter);
|
||||
sc.setParameters("status", Status.Disconnected, Status.Down, Status.Alert);
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
List<HostVO> hosts = lockRows(sc, null, true); // exclusive lock
|
||||
for (HostVO host : hosts) {
|
||||
host.setManagementServerId(null);
|
||||
update(host.getId(), host);
|
||||
sb.append(host.getId());
|
||||
sb.append(" ");
|
||||
}
|
||||
|
||||
if (s_logger.isTraceEnabled()) {
|
||||
s_logger.trace("Following hosts got reset: " + sb.toString());
|
||||
}
|
||||
}
|
||||
|
||||
@ -427,6 +443,16 @@ public class HostDaoImpl extends GenericDaoBase<HostVO, Long> implements HostDao
|
||||
return clusters;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns the IDs of the clusters that contain hosts not owned by any management server (MS)
|
||||
*/
|
||||
// Builds a criteria object from ClustersForHostsNotOwnedByAnyMSSearch (no parameters are set on it)
// and runs it through customSearch with a null filter, returning the resulting list of Long values.
// The search builder is defined to select distinct cluster ids of hosts whose resource and cluster
// are non-null and whose management server id is null.
private List<Long> findClustersForHostsNotOwnedByAnyManagementServer() {
|
||||
SearchCriteria<Long> sc = ClustersForHostsNotOwnedByAnyMSSearch.create();
|
||||
|
||||
List<Long> clusters = customSearch(sc, null);
|
||||
return clusters;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns a list of all cluster Ids
|
||||
*/
|
||||
@ -459,55 +485,100 @@ public class HostDaoImpl extends GenericDaoBase<HostVO, Long> implements HostDao
|
||||
public List<HostVO> findAndUpdateDirectAgentToLoad(long lastPingSecondsAfter, Long limit, long managementServerId) {
|
||||
Transaction txn = Transaction.currentTxn();
|
||||
|
||||
txn.start();
|
||||
if (s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Resetting hosts suitable for reconnect");
|
||||
}
|
||||
// reset hosts that are suitable candidates for reconnect
|
||||
txn.start();
|
||||
resetHosts(managementServerId, lastPingSecondsAfter);
|
||||
txn.commit();
|
||||
if (s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Completed resetting hosts suitable for reconnect");
|
||||
}
|
||||
|
||||
List<Long> clusters = findClustersOwnedByManagementServer(managementServerId);
|
||||
List<Long> allClusters = listAllClusters();
|
||||
|
||||
SearchCriteria<HostVO> sc = UnmanagedDirectConnectSearch.create();
|
||||
sc.setParameters("lastPinged", lastPingSecondsAfter);
|
||||
sc.setJoinParameters("ClusterManagedSearch", "managed", Managed.ManagedState.Managed);
|
||||
List<HostVO> assignedHosts = new ArrayList<HostVO>();
|
||||
List<Long> remainingClusters = new ArrayList<Long>();
|
||||
|
||||
// handle clusters already owned by @managementServerId
|
||||
txn.start();
|
||||
for (Long clusterId : allClusters) {
|
||||
if (clusters.contains(clusterId)) { // host belongs to clusters owned by @managementServerId
|
||||
sc.setParameters("cluster", clusterId);
|
||||
if (s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Acquiring hosts for clusters already owned by this management server");
|
||||
}
|
||||
List<Long> clusters = findClustersOwnedByManagementServer(managementServerId);
|
||||
if (clusters.size() > 0) {
|
||||
// handle clusters already owned by @managementServerId
|
||||
SearchCriteria<HostVO> sc = UnmanagedDirectConnectSearch.create();
|
||||
sc.setParameters("lastPinged", lastPingSecondsAfter);
|
||||
sc.setJoinParameters("ClusterManagedSearch", "managed", Managed.ManagedState.Managed);
|
||||
sc.setParameters("clusterIn", clusters.toArray());
|
||||
List<HostVO> unmanagedHosts = lockRows(sc, new Filter(HostVO.class, "clusterId", true, 0L, limit), true); // host belongs to clusters owned by @managementServerId
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (HostVO host : unmanagedHosts) {
|
||||
host.setManagementServerId(managementServerId);
|
||||
update(host.getId(), host);
|
||||
assignedHosts.add(host);
|
||||
sb.append(host.getId());
|
||||
sb.append(" ");
|
||||
}
|
||||
if (s_logger.isTraceEnabled()) {
|
||||
s_logger.trace("Following hosts got acquired for clusters already owned: " + sb.toString());
|
||||
}
|
||||
}
|
||||
if (s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Completed acquiring hosts for clusters already owned by this management server");
|
||||
}
|
||||
|
||||
if (assignedHosts.size() < limit) {
|
||||
if (s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Acquiring hosts for clusters not owned by any management server");
|
||||
}
|
||||
// for remaining hosts not owned by any MS check if they can be owned (by owning full cluster)
|
||||
clusters = findClustersForHostsNotOwnedByAnyManagementServer();
|
||||
List<Long> updatedClusters = clusters;
|
||||
if (clusters.size() > limit) {
|
||||
updatedClusters = clusters.subList(0, limit.intValue());
|
||||
}
|
||||
if (updatedClusters.size() > 0) {
|
||||
SearchCriteria<HostVO> sc = UnmanagedDirectConnectSearch.create();
|
||||
sc.setParameters("lastPinged", lastPingSecondsAfter);
|
||||
sc.setJoinParameters("ClusterManagedSearch", "managed", Managed.ManagedState.Managed);
|
||||
sc.setParameters("clusterIn", updatedClusters.toArray());
|
||||
List<HostVO> unmanagedHosts = lockRows(sc, null, true);
|
||||
|
||||
// group hosts based on cluster
|
||||
Map<Long, List<HostVO>> hostMap = new HashMap<Long, List<HostVO>>();
|
||||
for (HostVO host : unmanagedHosts) {
|
||||
host.setManagementServerId(managementServerId);
|
||||
update(host.getId(), host);
|
||||
assignedHosts.add(host);
|
||||
if (hostMap.get(host.getClusterId()) == null) {
|
||||
hostMap.put(host.getClusterId(), new ArrayList<HostVO>());
|
||||
}
|
||||
hostMap.get(host.getClusterId()).add(host);
|
||||
}
|
||||
} else {
|
||||
remainingClusters.add(clusterId);
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (Long clusterId : hostMap.keySet()) {
|
||||
if (canOwnCluster(clusterId)) { // cluster is not owned by any other MS, so @managementServerId can own it
|
||||
List<HostVO> hostList = hostMap.get(clusterId);
|
||||
for (HostVO host : hostList) {
|
||||
host.setManagementServerId(managementServerId);
|
||||
update(host.getId(), host);
|
||||
assignedHosts.add(host);
|
||||
sb.append(host.getId());
|
||||
sb.append(" ");
|
||||
}
|
||||
}
|
||||
if (assignedHosts.size() > limit) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (s_logger.isTraceEnabled()) {
|
||||
s_logger.trace("Following hosts got acquired from newly owned clusters: " + sb.toString());
|
||||
}
|
||||
}
|
||||
if (s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Completed acquiring hosts for clusters not owned by any management server");
|
||||
}
|
||||
}
|
||||
txn.commit();
|
||||
|
||||
// for remaining clusters check if they can be owned
|
||||
for (Long clusterId : remainingClusters) {
|
||||
txn.start();
|
||||
sc.setParameters("cluster", clusterId);
|
||||
List<HostVO> unmanagedHosts = lockRows(sc, null, true);
|
||||
if (canOwnCluster(clusterId)) { // cluster is not owned by any other MS, so @managementServerId can own it
|
||||
for (HostVO host : unmanagedHosts) {
|
||||
host.setManagementServerId(managementServerId);
|
||||
update(host.getId(), host);
|
||||
assignedHosts.add(host);
|
||||
}
|
||||
}
|
||||
txn.commit();
|
||||
}
|
||||
|
||||
return assignedHosts;
|
||||
}
|
||||
|
||||
|
||||
@Override @DB
|
||||
public List<HostVO> findAndUpdateApplianceToLoad(long lastPingSecondsAfter, long managementServerId) {
|
||||
Transaction txn = Transaction.currentTxn();
|
||||
|
||||
@ -99,6 +99,7 @@ public class ClusteredAgentManagerImpl extends AgentManagerImpl implements Clust
|
||||
public final static long SCAN_INTERVAL = 90000; // 90 seconds, it takes 60 sec for xenserver to fail login
|
||||
public final static int ACQUIRE_GLOBAL_LOCK_TIMEOUT_FOR_COOPERATION = 5; // 5 seconds
|
||||
public long _loadSize = 100;
|
||||
protected int _directAgentScanInterval = 90; // 90 seconds
|
||||
protected Set<Long> _agentToTransferIds = new HashSet<Long>();
|
||||
|
||||
@Inject
|
||||
@ -134,6 +135,9 @@ public class ClusteredAgentManagerImpl extends AgentManagerImpl implements Clust
|
||||
String value = params.get(Config.DirectAgentLoadSize.key());
|
||||
_loadSize = NumbersUtil.parseInt(value, 16);
|
||||
|
||||
value = params.get(Config.DirectAgentScanInterval.key());
|
||||
_directAgentScanInterval = NumbersUtil.parseInt(value, 90); // defaulted to 90 seconds
|
||||
|
||||
ClusteredAgentAttache.initialize(this);
|
||||
|
||||
_clusterMgr.registerListener(this);
|
||||
@ -146,7 +150,10 @@ public class ClusteredAgentManagerImpl extends AgentManagerImpl implements Clust
|
||||
if (!super.start()) {
|
||||
return false;
|
||||
}
|
||||
_timer.schedule(new DirectAgentScanTimerTask(), STARTUP_DELAY, SCAN_INTERVAL);
|
||||
_timer.schedule(new DirectAgentScanTimerTask(), STARTUP_DELAY, _directAgentScanInterval * 1000);
|
||||
if (s_logger.isDebugEnabled()) {
|
||||
s_logger.debug("Scheduled direct agent scan task to run at an interval of " + _directAgentScanInterval + " seconds");
|
||||
}
|
||||
|
||||
// schedule transfer scan executor - if agent LB is enabled
|
||||
if (_clusterMgr.isAgentRebalanceEnabled()) {
|
||||
|
||||
@ -356,6 +356,7 @@ public enum Config {
|
||||
|
||||
ResourceCountCheckInterval("Advanced", ManagementServer.class, Long.class, "resourcecount.check.interval", "0", "Time (in seconds) to wait before retrying resource count check task. Default is 0 which is to never run the task", "Seconds"),
|
||||
DirectAgentLoadSize("Advanced", ManagementServer.class, Integer.class, "direct.agent.load.size", "16", "The number of direct agents to load each time", null),
|
||||
DirectAgentScanInterval("Advanced", ManagementServer.class, Integer.class, "direct.agent.scan.interval", "90", "Time interval (in seconds) to run the direct agent scan task", null),
|
||||
|
||||
//disabling lb as cluster sync does not work with distributed cluster
|
||||
AgentLbEnable("Advanced", ManagementServer.class, Boolean.class, "agent.lb.enabled", "false", "If agent load balancing enabled in cluster setup", null),
|
||||
|
||||
@ -2152,6 +2152,8 @@ INSERT IGNORE INTO `cloud`.`configuration` VALUES ('Advanced', 'DEFAULT', 'manag
|
||||
INSERT IGNORE INTO `cloud`.`configuration` VALUES ('Advanced', 'DEFAULT', 'management-server', 'execute.in.sequence.hypervisor.commands', 'false', 'If set to true, StartCommand, StopCommand, CopyVolumeCommand, CreateCommand will be synchronized on the agent side. If set to false, these commands become asynchronous. Default value is false.');
|
||||
INSERT IGNORE INTO `cloud`.`configuration` VALUES ('Advanced', 'DEFAULT', 'management-server', 'execute.in.sequence.network.element.commands', 'false', 'If set to true, DhcpEntryCommand, SavePasswordCommand, UserDataCommand, VmDataCommand will be synchronized on the agent side. If set to false, these commands become asynchronous. Default value is false.');
|
||||
|
||||
INSERT IGNORE INTO `cloud`.`configuration` VALUES ('Advanced', 'DEFAULT', 'management-server', 'direct.agent.scan.interval', 90, 'Time interval (in seconds) to run the direct agent scan task.');
|
||||
|
||||
ALTER TABLE `cloud`.`vm_template` ADD COLUMN `dynamically_scalable` tinyint(1) unsigned NOT NULL DEFAULT 0 COMMENT 'true if template contains XS/VMWare tools inorder to support dynamic scaling of VM cpu/memory';
|
||||
UPDATE `cloud`.`vm_template` SET dynamically_scalable = 1 WHERE name = "CentOS 5.6(64-bit) no GUI (XenServer)" AND type = "BUILTIN";
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user