check_heartbeat and pingtest now execute through SSH rather than XAPI, because XAPI may hang when the master host is down

This commit is contained in:
Anthony Xu 2014-03-25 10:42:31 -07:00
parent 7b08bb7cab
commit 88c1da679c
5 changed files with 78 additions and 48 deletions

View File

@ -406,17 +406,28 @@ public abstract class CitrixResourceBase implements ServerResource, HypervisorRe
}
protected boolean pingXenServer() {
protected boolean pingXAPI() {
Connection conn = getConnection();
try {
Host host = Host.getByUuid(conn, _host.uuid);
if( !host.getEnabled(conn) ) {
s_logger.debug("Host " + _host.ip + " is not enabled!");
return false;
}
} catch (Exception e) {
s_logger.debug("cannot get host enabled status, host " + _host.ip + " due to " + e.toString(), e);
return false;
}
try {
callHostPlugin(conn, "echo", "main");
return true;
} catch (Exception e) {
s_logger.debug("cannot ping host " + _host.ip + " due to " + e.toString(), e);
return false;
}
return false;
return true;
}
protected String logX(XenAPIObject obj, String msg) {
return new StringBuilder("Host ").append(_host.ip).append(" ").append(obj.toWireString()).append(": ").append(msg).toString();
}
@ -2006,12 +2017,24 @@ public abstract class CitrixResourceBase implements ServerResource, HypervisorRe
}
private boolean doPingTest(Connection conn, final String computingHostIp) {
String args = "-h " + computingHostIp;
String result = callHostPlugin(conn, "vmops", "pingtest", "args", args);
if (result == null || result.isEmpty()) {
com.trilead.ssh2.Connection sshConnection = new com.trilead.ssh2.Connection(_host.ip, 22);
try {
sshConnection.connect(null, 60000, 60000);
if (!sshConnection.authenticateWithPassword(_username, _password.peek())) {
throw new CloudRuntimeException("Unable to authenticate");
}
String cmd = "ping -c 2 " + computingHostIp;
if (!SSHCmdHelper.sshExecuteCmd(sshConnection, cmd)) {
throw new CloudRuntimeException("Cannot ping host " + computingHostIp + " from host " + _host.ip);
}
return true;
} catch (Exception e) {
s_logger.warn("Catch exception " + e.toString(), e);
return false;
} finally {
sshConnection.close();
}
return true;
}
protected CheckOnHostAnswer execute(CheckOnHostCommand cmd) {
@ -2238,7 +2261,7 @@ public abstract class CitrixResourceBase implements ServerResource, HypervisorRe
}
protected CheckHealthAnswer execute(CheckHealthCommand cmd) {
boolean result = pingXenServer();
boolean result = pingXAPI();
return new CheckHealthAnswer(cmd, result);
}
@ -4341,9 +4364,9 @@ public abstract class CitrixResourceBase implements ServerResource, HypervisorRe
@Override
public PingCommand getCurrentStatus(long id) {
try {
if (!pingXenServer()) {
if (!pingXAPI()) {
Thread.sleep(1000);
if (!pingXenServer()) {
if (!pingXAPI()) {
s_logger.warn(" can not ping xenserver " + _host.uuid);
return null;
}

View File

@ -73,12 +73,10 @@ public class XenServer56FP1Resource extends XenServer56Resource {
protected FenceAnswer execute(FenceCommand cmd) {
Connection conn = getConnection();
try {
String result = callHostPluginPremium(conn, "check_heartbeat", "host", cmd.getHostGuid(), "interval", Integer.toString(_heartbeatInterval * 2));
if (!result.contains("> DEAD <")) {
if (check_heartbeat(cmd.getHostGuid())) {
s_logger.debug("Heart beat is still going so unable to fence");
return new FenceAnswer(cmd, false, "Heartbeat is still going on unable to fence");
}
Set<VM> vms = VM.getByNameLabel(conn, cmd.getVmName());
for (VM vm : vms) {
Set<VDI> vdis = new HashSet<VDI>();

View File

@ -28,6 +28,7 @@ import com.cloud.agent.api.StartupCommand;
import com.cloud.resource.ServerResource;
import com.cloud.utils.exception.CloudRuntimeException;
import com.cloud.utils.script.Script;
import com.cloud.utils.ssh.SSHCmdHelper;
import com.xensource.xenapi.Connection;
import com.xensource.xenapi.Host;
import com.xensource.xenapi.Network;
@ -208,15 +209,37 @@ public class XenServer56Resource extends CitrixResourceBase {
}
}
/**
 * Runs {@code /opt/cloud/bin/check_heartbeat.sh} on this resource's host over SSH
 * (port 22, using {@code _username} and the head of {@code _password}) to decide
 * whether the host identified by {@code hostuuid} is still writing its heartbeat.
 *
 * <p>The script is invoked with the host uuid and twice {@code _heartbeatInterval}
 * as its arguments; the verdict is taken from the result of
 * {@code SSHCmdHelper.sshExecuteCmd}.
 *
 * <p>The result is deliberately tri-state. Callers must test for {@code null}
 * before using the value in a boolean context, otherwise auto-unboxing throws a
 * {@link NullPointerException}.
 *
 * @param hostuuid uuid of the host whose heartbeat is checked; it is concatenated
 *                 into the remote command line unescaped
 *                 (NOTE(review): presumably always a server-generated uuid - confirm)
 * @return {@code Boolean.TRUE} if the remote command succeeded (heartbeat still going),
 *         {@code Boolean.FALSE} if it failed (heartbeat gone), or {@code null} if the
 *         check itself could not be carried out (connect/authentication failure or any
 *         other exception)
 */
protected Boolean check_heartbeat(String hostuuid) {
    com.trilead.ssh2.Connection sshConnection = new com.trilead.ssh2.Connection(_host.ip, 22);
    try {
        // Both timeout arguments are 60000 (milliseconds per the trilead API - confirm).
        sshConnection.connect(null, 60000, 60000);
        if (!sshConnection.authenticateWithPassword(_username, _password.peek())) {
            throw new CloudRuntimeException("Unable to authenticate");
        }
        String shcmd = "/opt/cloud/bin/check_heartbeat.sh " + hostuuid + " " + (_heartbeatInterval * 2);
        if (!SSHCmdHelper.sshExecuteCmd(sshConnection, shcmd)) {
            s_logger.debug("Heart beat is gone so dead.");
            return false;
        }
        s_logger.debug("Heart beat is still going");
        return true;
    } catch (Exception e) {
        // Pass the throwable so the stack trace is logged, matching the other
        // SSH/XAPI failure handlers in these resources.
        s_logger.debug("health check failed due to catch exception " + e.toString(), e);
        return null;
    } finally {
        sshConnection.close();
    }
}
protected FenceAnswer execute(FenceCommand cmd) {
Connection conn = getConnection();
try {
String result = callHostPluginPremium(conn, "check_heartbeat", "host", cmd.getHostGuid(), "interval", Integer.toString(_heartbeatInterval * 2));
if (!result.contains("> DEAD <")) {
if (check_heartbeat(cmd.getHostGuid())) {
s_logger.debug("Heart beat is still going so unable to fence");
return new FenceAnswer(cmd, false, "Heartbeat is still going on unable to fence");
}
Set<VM> vms = VM.getByNameLabel(conn, cmd.getVmName());
for (VM vm : vms) {
synchronized (_cluster.intern()) {
@ -236,6 +259,7 @@ public class XenServer56Resource extends CitrixResourceBase {
}
}
@Override
protected boolean transferManagementNetwork(Connection conn, Host host, PIF src, PIF.Record spr, PIF dest) throws XmlRpcException, XenAPIException {
dest.reconfigureIp(conn, spr.ipConfigurationMode, spr.IP, spr.netmask, spr.gateway, spr.DNS);
@ -269,33 +293,29 @@ public class XenServer56Resource extends CitrixResourceBase {
@Override
public StartupCommand[] initialize() {
pingXenServer();
pingXAPI();
StartupCommand[] cmds = super.initialize();
return cmds;
}
@Override
protected CheckOnHostAnswer execute(CheckOnHostCommand cmd) {
try {
Connection conn = getConnection();
String result = callHostPluginPremium(conn, "check_heartbeat", "host", cmd.getHost().getGuid(), "interval", Integer.toString(_heartbeatInterval * 2));
if (result == null) {
return new CheckOnHostAnswer(cmd, "Unable to call plugin");
}
if (result.contains("> DEAD <")) {
s_logger.debug("Heart beat is gone so dead.");
return new CheckOnHostAnswer(cmd, false, "Heart Beat is done");
} else if (result.contains("> ALIVE <")) {
s_logger.debug("Heart beat is still going");
return new CheckOnHostAnswer(cmd, true, "Heartbeat is still going");
}
return new CheckOnHostAnswer(cmd, null, "Unable to determine");
} catch (Exception e) {
s_logger.warn("Unable to fence", e);
return new CheckOnHostAnswer(cmd, e.getMessage());
Boolean alive = check_heartbeat(cmd.getHost().getGuid());
String msg = "";
if (alive == null) {
msg = " cannot determine ";
} else if ( alive == true) {
msg = "Heart beat is still going";
} else {
msg = "Heart beat is gone so dead.";
}
s_logger.debug(msg);
return new CheckOnHostAnswer(cmd, alive, msg);
}
public XenServer56Resource() {
super();
}

View File

@ -72,3 +72,4 @@ do
done
echo "=====> DEAD <======"
exit 1

View File

@ -123,17 +123,6 @@ def setup_heartbeat_file(session, args):
txt = ''
return txt
@echo
def check_heartbeat(session, args):
host = args['host']
interval = args['interval']
try:
cmd = ["bash", "/opt/cloud/bin/check_heartbeat.sh", host, interval]
txt = util.pread2(cmd)
except:
txt=''
return txt
@echo
def heartbeat(session, args):
@ -156,5 +145,4 @@ def asmonitor(session, args):
return 'fail'
if __name__ == "__main__":
XenAPIPlugin.dispatch({"forceShutdownVM":forceShutdownVM, "upgrade_snapshot":upgrade_snapshot, "create_privatetemplate_from_snapshot":create_privatetemplate_from_snapshot, "copy_vhd_to_secondarystorage":copy_vhd_to_secondarystorage, "copy_vhd_from_secondarystorage":copy_vhd_from_secondarystorage, "setup_heartbeat_sr":setup_heartbeat_sr, "setup_heartbeat_file":setup_heartbeat_file, "check_heartbeat":check_heartbeat, "heartbeat": heartbeat, "asmonitor": asmonitor})
XenAPIPlugin.dispatch({"forceShutdownVM":forceShutdownVM, "upgrade_snapshot":upgrade_snapshot, "create_privatetemplate_from_snapshot":create_privatetemplate_from_snapshot, "copy_vhd_to_secondarystorage":copy_vhd_to_secondarystorage, "copy_vhd_from_secondarystorage":copy_vhd_from_secondarystorage, "setup_heartbeat_sr":setup_heartbeat_sr, "setup_heartbeat_file":setup_heartbeat_file, "heartbeat": heartbeat, "asmonitor": asmonitor})