CLOUDSTACK-8666: Put host in Alert state only after alert.wait timeout

Instead of putting the host to Alert state immediately, the investigators should be allowed to run for some time based on alert.wait global config.
At the end of this interval if the host state still cannot be determined then put the host in Alert. Also updated some of the log messages.

This closes #621
This commit is contained in:
Koushik Das 2015-07-23 17:57:51 +05:30
parent 7febdb58b5
commit 090db05821
2 changed files with 19 additions and 15 deletions

View File

@ -824,25 +824,30 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
/* OK, we are going to the bad status, let's see what happened */
s_logger.info("Investigating why host " + hostId + " has disconnected with event " + event);
final Status determinedState = investigate(attache);
Status determinedState = investigate(attache);
// if state cannot be determined do nothing and bail out
if (determinedState == null) {
s_logger.warn("Agent state cannot be determined, do nothing");
return false;
if (((System.currentTimeMillis() >> 10) - host.getLastPinged()) > AlertWait.value()) {
s_logger.warn("Agent " + hostId + " state cannot be determined for more than " + AlertWait + "(" + AlertWait.value() + ") seconds, will go to Alert state");
determinedState = Status.Alert;
} else {
s_logger.warn("Agent " + hostId + " state cannot be determined, do nothing");
return false;
}
}
final Status currentStatus = host.getStatus();
s_logger.info("The state determined is " + determinedState);
s_logger.info("The agent " + hostId + " state determined is " + determinedState);
if (determinedState == Status.Down) {
s_logger.error("Host is down: " + host.getId() + "-" + host.getName() + ". Starting HA on the VMs");
String message = "Host is down: " + host.getId() + "-" + host.getName() + ". Starting HA on the VMs";
s_logger.error(message);
if (host.getType() != Host.Type.SecondaryStorage && host.getType() != Host.Type.ConsoleProxy) {
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host disconnected, " + host.getId(),
"Host is down: " + host.getId() + "-" + host.getName() + ". Starting HA on the VMs");
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host down, " + host.getId(), message);
}
event = Status.Event.HostDown;
} else if (determinedState == Status.Up) {
/* Got ping response from host, bring it back*/
/* Got ping response from host, bring it back */
s_logger.info("Agent is determined to be up and running");
agentStatusTransitTo(host, Status.Event.Ping, _nodeId);
return false;
@ -850,10 +855,10 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
s_logger.warn("Agent is disconnected but the host is still up: " + host.getId() + "-" + host.getName());
if (currentStatus == Status.Disconnected) {
if ((System.currentTimeMillis() >> 10) - host.getLastPinged() > AlertWait.value()) {
s_logger.warn("Host " + host.getId() + " has been disconnected pass the time it should be disconnected.");
s_logger.warn("Host " + host.getId() + " has been disconnected past the wait time it should be disconnected.");
event = Status.Event.WaitedTooLong;
} else {
s_logger.debug("Host has been determined to be disconnected but it hasn't passed the wait time yet.");
s_logger.debug("Host " + host.getId() + " has been determined to be disconnected but it hasn't passed the wait time yet.");
return false;
}
} else if (currentStatus == Status.Up) {
@ -862,7 +867,7 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
final String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName();
if (host.getType() != Host.Type.SecondaryStorage && host.getType() != Host.Type.ConsoleProxy) {
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host disconnected, " + hostDesc,
"If the agent for host [" + hostDesc + "] is not restarted within " + AlertWait + " seconds, HA will begin on the VMs");
"If the agent for host [" + hostDesc + "] is not restarted within " + AlertWait + " seconds, host will go to Alert state");
}
event = Status.Event.AgentDisconnected;
}
@ -872,11 +877,10 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
final HostPodVO podVO = _podDao.findById(host.getPodId());
final String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName();
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host in ALERT state, " + hostDesc,
"In availability zone " + host.getDataCenterId() + ", " + host.getId() + "-" + host.getName()
+ " disconnect due to event " + event + ", ms can't determine the host status" );
"In availability zone " + host.getDataCenterId() + ", host is in alert state: " + host.getId() + "-" + host.getName());
}
} else {
s_logger.debug("The next status of Agent " + host.getId() + " is not Alert, no need to investigate what happened");
s_logger.debug("The next status of agent " + host.getId() + " is not Alert, no need to investigate what happened");
}
}
handleDisconnectWithoutInvestigation(attache, event, true, true);

View File

@ -220,7 +220,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
}
}
return Status.Alert;
return hostState;
}
@Override