2 fixes for Agent Load Balancer:

* when a management server dies and the other management servers are notified about it, each running management server has to clean up the host_transfer records belonging to the dead management server
* schedule the agent load balancing task only when the agent load (the ratio of managed routing agents to all routing agents in the system) reaches "agent.load.threshold" - 0.7 (70%) by default

Conflicts:

	server/src/com/cloud/configuration/Config.java
	setup/db/db/schema-228to229.sql
This commit is contained in:
alena 2011-07-21 14:47:58 -07:00
parent abd3321077
commit 307741edcd
7 changed files with 55 additions and 11 deletions

View File

@ -567,7 +567,7 @@ public class ClusteredAgentManagerImpl extends AgentManagerImpl implements Clust
//cancel all transfer tasks
s_transferExecutor.shutdownNow();
cleanupTransferMap();
cleanupTransferMap(_nodeId);
return super.stop();
}
@ -695,6 +695,8 @@ public class ClusteredAgentManagerImpl extends AgentManagerImpl implements Clust
for (ManagementServerHostVO vo : nodeList) {
s_logger.info("Marking hosts as disconnected on Management server" + vo.getMsid());
_hostDao.markHostsAsDisconnected(vo.getMsid());
s_logger.info("Deleting entries from op_host_transfer table for Management server " + vo.getMsid());
cleanupTransferMap(vo.getMsid());
}
}
@ -1071,14 +1073,14 @@ public class ClusteredAgentManagerImpl extends AgentManagerImpl implements Clust
return true;
}
protected void cleanupTransferMap() {
List<HostTransferMapVO> hostsJoingingCluster = _hostTransferDao.listHostsJoiningCluster(_nodeId);
protected void cleanupTransferMap(long msId) {
List<HostTransferMapVO> hostsJoingingCluster = _hostTransferDao.listHostsJoiningCluster(msId);
for (HostTransferMapVO hostJoingingCluster : hostsJoingingCluster) {
_hostTransferDao.remove(hostJoingingCluster.getId());
}
List<HostTransferMapVO> hostsLeavingCluster = _hostTransferDao.listHostsLeavingCluster(_nodeId);
List<HostTransferMapVO> hostsLeavingCluster = _hostTransferDao.listHostsLeavingCluster(msId);
for (HostTransferMapVO hostLeavingCluster : hostsLeavingCluster) {
_hostTransferDao.remove(hostLeavingCluster.getId());
}

View File

@ -127,6 +127,9 @@ public class ClusterManagerImpl implements ClusterManager {
private String _name;
private String _clusterNodeIP = "127.0.0.1";
private boolean _agentLBEnabled = false;
private double _connectedAgentsThreshold = 0.7;
private static boolean _agentLbHappened = false;
public ClusterManagerImpl() {
clusterPeers = new HashMap<String, ClusterService>();
@ -607,6 +610,26 @@ public class ClusterManagerImpl implements ClusterManager {
}
peerScan();
//The agent lb task is scheduled and executed only once, and only when the ratio of managed routing agents to all routing agents reaches _connectedAgentsThreshold
if (_agentLBEnabled && !_agentLbHappened) {
List<HostVO> allManagedRoutingAgents = _hostDao.listManagedRoutingAgents();
List<HostVO> allAgents = _hostDao.listAllRoutingAgents();
double allHostsCount = allAgents.size();
double managedHostsCount = allManagedRoutingAgents.size();
if (allHostsCount > 0.0) {
double load = managedHostsCount/allHostsCount;
if (load >= _connectedAgentsThreshold) {
s_logger.debug("Scheduling agent rebalancing task as the average agent load " + load + " is more than the threshold " + _connectedAgentsThreshold);
_rebalanceService.scheduleRebalanceAgents();
_agentLbHappened = true;
} else {
s_logger.trace("Not scheduling agent rebalancing task as the averages load " + load + " is less than the threshold " + _connectedAgentsThreshold);
}
}
}
} catch(CloudRuntimeException e) {
s_logger.error("Runtime DB exception ", e.getCause());
@ -937,10 +960,6 @@ public class ClusterManagerImpl implements ClusterManager {
_heartbeatScheduler.scheduleAtFixedRate(getHeartbeatTask(), heartbeatInterval, heartbeatInterval, TimeUnit.MILLISECONDS);
_notificationExecutor.submit(getNotificationTask());
//Initiate agent rebalancing after the host is in UP state
if (_agentLBEnabled) {
_rebalanceService.scheduleRebalanceAgents();
}
} catch (Throwable e) {
s_logger.error("Unexpected exception : ", e);
@ -1064,6 +1083,12 @@ public class ClusterManagerImpl implements ClusterManager {
_agentLBEnabled = Boolean.valueOf(configDao.getValue(Config.AgentLbEnable.key()));
String connectedAgentsThreshold = configs.get("agent.load.threshold");
if (connectedAgentsThreshold != null) {
_connectedAgentsThreshold = Double.parseDouble(connectedAgentsThreshold);
}
this.registerListener(new LockMasterListener(_msId));
checkConflicts();
@ -1191,5 +1216,4 @@ public class ClusterManagerImpl implements ClusterManager {
public boolean isAgentRebalanceEnabled() {
return _agentLBEnabled;
}
}

View File

@ -51,6 +51,7 @@ public class HostTransferMapDaoImpl extends GenericDaoBase<HostTransferMapVO, Lo
IntermediateStateSearch = createSearchBuilder();
IntermediateStateSearch.and("futureOwner", IntermediateStateSearch.entity().getFutureOwner(), SearchCriteria.Op.EQ);
IntermediateStateSearch.and("initialOwner", IntermediateStateSearch.entity().getInitialOwner(), SearchCriteria.Op.EQ);
IntermediateStateSearch.and("state", IntermediateStateSearch.entity().getState(), SearchCriteria.Op.IN);
IntermediateStateSearch.done();

View File

@ -241,7 +241,8 @@ public enum Config {
EncodeApiResponse("Advanced", ManagementServer.class, Boolean.class, "encode.api.response", "false", "Do UTF-8 encoding for the api response, false by default", null),
DnsBasicZoneUpdates("Advanced", NetworkManager.class, String.class, "network.dns.basiczone.updates", "all", "This parameter can take 2 values: all (default) and pod. It defines if DHCP/DNS requests have to be send to all dhcp servers in cloudstack, or only to the one in the same pod", "all,pod"),
ClusterMessageTimeOutSeconds("Advanced", ManagementServer.class, Integer.class, "cluster.message.timeout.seconds", "300", "Time (in seconds) to wait before a inter-management server message post times out.", null);
ClusterMessageTimeOutSeconds("Advanced", ManagementServer.class, Integer.class, "cluster.message.timeout.seconds", "300", "Time (in seconds) to wait before a inter-management server message post times out.", null),
AgentLoadThreshold("Advanced", ManagementServer.class, Float.class, "agent.load.threshold", "0.7", "Percentage (as a value between 0 and 1) of connected agents after which agent load balancing will start happening", null);
private final String _category;
private final Class<?> _componentClass;

View File

@ -184,4 +184,6 @@ public interface HostDao extends GenericDao<HostVO, Long> {
List<HostVO> listByManagementServer(long msId);
List<HostVO> listSecondaryStorageVM(long dcId);
List<HostVO> listAllRoutingAgents();
}

View File

@ -104,6 +104,7 @@ public class HostDaoImpl extends GenericDaoBase<HostVO, Long> implements HostDao
protected final GenericSearchBuilder<HostVO, Long> CountRoutingByDc;
protected final SearchBuilder<HostTransferMapVO> HostTransferSearch;
protected final SearchBuilder<ClusterVO> ClusterManagedSearch;
protected final SearchBuilder<HostVO> RoutingSearch;
protected final Attribute _statusAttr;
protected final Attribute _msIdAttr;
@ -295,6 +296,10 @@ public class HostDaoImpl extends GenericDaoBase<HostVO, Long> implements HostDao
ManagedRoutingServersSearch.and("type", ManagedRoutingServersSearch.entity().getType(), SearchCriteria.Op.EQ);
ManagedRoutingServersSearch.done();
RoutingSearch = createSearchBuilder();
RoutingSearch.and("type", RoutingSearch.entity().getType(), SearchCriteria.Op.EQ);
RoutingSearch.done();
_statusAttr = _allAttributes.get("status");
_msIdAttr = _allAttributes.get("managementServerId");
_pingTimeAttr = _allAttributes.get("lastPinged");
@ -942,4 +947,11 @@ public class HostDaoImpl extends GenericDaoBase<HostVO, Long> implements HostDao
return listBy(sc);
}
/**
 * Lists the hosts whose type is {@code Type.Routing}.
 *
 * @return the hosts found by {@code RoutingSearch} with its "type" parameter
 *         set to {@code Type.Routing}
 */
@Override
public List<HostVO> listAllRoutingAgents() {
    // RoutingSearch filters on the host type only (single EQ condition on "type").
    final SearchCriteria<HostVO> routingCriteria = RoutingSearch.create();
    routingCriteria.setParameters("type", Type.Routing);
    return listBy(routingCriteria);
}
}

View File

@ -13,3 +13,5 @@ ALTER TABLE `cloud`.`op_host_capacity` DROP FOREIGN KEY `fk_op_host_capacity__po
ALTER TABLE `cloud`.`op_host_capacity` DROP FOREIGN KEY `fk_op_host_capacity__data_center_id`;
ALTER TABLE `cloud`.`op_host_capacity` DROP FOREIGN KEY `fk_op_host_capacity__cluster_id`;
INSERT IGNORE INTO configuration VALUES ('Advanced', 'DEFAULT', 'management-server', 'agent.load.threshold', '0.70', 'Percentage (as a value between 0 and 1) of connected agents after which agent load balancing will start happening');