mirror of
https://github.com/apache/cloudstack.git
synced 2025-11-03 04:12:31 +01:00
CLOUDSTACK-8666: Put host in Alert state only after alert.wait timeout
Instead of putting the host into the Alert state immediately, the investigators should be allowed to run for some time, based on the alert.wait global configuration setting. If the host state still cannot be determined at the end of this interval, the host is then put into the Alert state. Also updated some of the log messages. This closes #621.
This commit is contained in:
parent
7febdb58b5
commit
090db05821
@ -824,25 +824,30 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
|
|||||||
/* OK, we are going to the bad status, let's see what happened */
|
/* OK, we are going to the bad status, let's see what happened */
|
||||||
s_logger.info("Investigating why host " + hostId + " has disconnected with event " + event);
|
s_logger.info("Investigating why host " + hostId + " has disconnected with event " + event);
|
||||||
|
|
||||||
final Status determinedState = investigate(attache);
|
Status determinedState = investigate(attache);
|
||||||
// if state cannot be determined do nothing and bail out
|
// if state cannot be determined do nothing and bail out
|
||||||
if (determinedState == null) {
|
if (determinedState == null) {
|
||||||
s_logger.warn("Agent state cannot be determined, do nothing");
|
if (((System.currentTimeMillis() >> 10) - host.getLastPinged()) > AlertWait.value()) {
|
||||||
return false;
|
s_logger.warn("Agent " + hostId + " state cannot be determined for more than " + AlertWait + "(" + AlertWait.value() + ") seconds, will go to Alert state");
|
||||||
|
determinedState = Status.Alert;
|
||||||
|
} else {
|
||||||
|
s_logger.warn("Agent " + hostId + " state cannot be determined, do nothing");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
final Status currentStatus = host.getStatus();
|
final Status currentStatus = host.getStatus();
|
||||||
s_logger.info("The state determined is " + determinedState);
|
s_logger.info("The agent " + hostId + " state determined is " + determinedState);
|
||||||
|
|
||||||
if (determinedState == Status.Down) {
|
if (determinedState == Status.Down) {
|
||||||
s_logger.error("Host is down: " + host.getId() + "-" + host.getName() + ". Starting HA on the VMs");
|
String message = "Host is down: " + host.getId() + "-" + host.getName() + ". Starting HA on the VMs";
|
||||||
|
s_logger.error(message);
|
||||||
if (host.getType() != Host.Type.SecondaryStorage && host.getType() != Host.Type.ConsoleProxy) {
|
if (host.getType() != Host.Type.SecondaryStorage && host.getType() != Host.Type.ConsoleProxy) {
|
||||||
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host disconnected, " + host.getId(),
|
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host down, " + host.getId(), message);
|
||||||
"Host is down: " + host.getId() + "-" + host.getName() + ". Starting HA on the VMs");
|
|
||||||
}
|
}
|
||||||
event = Status.Event.HostDown;
|
event = Status.Event.HostDown;
|
||||||
} else if (determinedState == Status.Up) {
|
} else if (determinedState == Status.Up) {
|
||||||
/* Got ping response from host, bring it back*/
|
/* Got ping response from host, bring it back */
|
||||||
s_logger.info("Agent is determined to be up and running");
|
s_logger.info("Agent is determined to be up and running");
|
||||||
agentStatusTransitTo(host, Status.Event.Ping, _nodeId);
|
agentStatusTransitTo(host, Status.Event.Ping, _nodeId);
|
||||||
return false;
|
return false;
|
||||||
@ -850,10 +855,10 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
|
|||||||
s_logger.warn("Agent is disconnected but the host is still up: " + host.getId() + "-" + host.getName());
|
s_logger.warn("Agent is disconnected but the host is still up: " + host.getId() + "-" + host.getName());
|
||||||
if (currentStatus == Status.Disconnected) {
|
if (currentStatus == Status.Disconnected) {
|
||||||
if ((System.currentTimeMillis() >> 10) - host.getLastPinged() > AlertWait.value()) {
|
if ((System.currentTimeMillis() >> 10) - host.getLastPinged() > AlertWait.value()) {
|
||||||
s_logger.warn("Host " + host.getId() + " has been disconnected pass the time it should be disconnected.");
|
s_logger.warn("Host " + host.getId() + " has been disconnected past the wait time it should be disconnected.");
|
||||||
event = Status.Event.WaitedTooLong;
|
event = Status.Event.WaitedTooLong;
|
||||||
} else {
|
} else {
|
||||||
s_logger.debug("Host has been determined to be disconnected but it hasn't passed the wait time yet.");
|
s_logger.debug("Host " + host.getId() + " has been determined to be disconnected but it hasn't passed the wait time yet.");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} else if (currentStatus == Status.Up) {
|
} else if (currentStatus == Status.Up) {
|
||||||
@ -862,7 +867,7 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
|
|||||||
final String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName();
|
final String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName();
|
||||||
if (host.getType() != Host.Type.SecondaryStorage && host.getType() != Host.Type.ConsoleProxy) {
|
if (host.getType() != Host.Type.SecondaryStorage && host.getType() != Host.Type.ConsoleProxy) {
|
||||||
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host disconnected, " + hostDesc,
|
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host disconnected, " + hostDesc,
|
||||||
"If the agent for host [" + hostDesc + "] is not restarted within " + AlertWait + " seconds, HA will begin on the VMs");
|
"If the agent for host [" + hostDesc + "] is not restarted within " + AlertWait + " seconds, host will go to Alert state");
|
||||||
}
|
}
|
||||||
event = Status.Event.AgentDisconnected;
|
event = Status.Event.AgentDisconnected;
|
||||||
}
|
}
|
||||||
@ -872,11 +877,10 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl
|
|||||||
final HostPodVO podVO = _podDao.findById(host.getPodId());
|
final HostPodVO podVO = _podDao.findById(host.getPodId());
|
||||||
final String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName();
|
final String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + dcVO.getName() + ", pod: " + podVO.getName();
|
||||||
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host in ALERT state, " + hostDesc,
|
_alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host in ALERT state, " + hostDesc,
|
||||||
"In availability zone " + host.getDataCenterId() + ", " + host.getId() + "-" + host.getName()
|
"In availability zone " + host.getDataCenterId() + ", host is in alert state: " + host.getId() + "-" + host.getName());
|
||||||
+ " disconnect due to event " + event + ", ms can't determine the host status" );
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
s_logger.debug("The next status of Agent " + host.getId() + " is not Alert, no need to investigate what happened");
|
s_logger.debug("The next status of agent " + host.getId() + " is not Alert, no need to investigate what happened");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
handleDisconnectWithoutInvestigation(attache, event, true, true);
|
handleDisconnectWithoutInvestigation(attache, event, true, true);
|
||||||
|
|||||||
@ -220,7 +220,7 @@ public class HighAvailabilityManagerImpl extends ManagerBase implements HighAvai
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return Status.Alert;
|
return hostState;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user