CLOUDSTACK-10310 Fix KVM reboot on storage issue (#2722)

This commit is contained in:
Slair1 2018-08-20 03:28:03 -05:00 committed by dahn
parent 9b772db0f1
commit 023dcec5ef
3 changed files with 15 additions and 6 deletions

View File

@ -34,7 +34,8 @@ public class KVMHABase {
protected static String s_heartBeatPath;
protected long _heartBeatUpdateTimeout = 60000;
protected long _heartBeatUpdateFreq = 60000;
protected long _heartBeatUpdateMaxRetry = 3;
protected long _heartBeatUpdateMaxTries = 5;
protected long _heartBeatUpdateRetrySleep = 15000;
public static enum PoolType {
PrimaryStorage, SecondaryStorage

View File

@ -119,7 +119,8 @@ public class KVMHAMonitor extends KVMHABase implements Runnable {
}
String result = null;
for (int i = 0; i < 5; i++) {
// Try multiple times, sleeping between tries, to make sure the failure isn't a short-lived transient error
for (int i = 1; i <= _heartBeatUpdateMaxTries; i++) {
Script cmd = new Script(s_heartBeatPath, _heartBeatUpdateTimeout, s_logger);
cmd.add("-i", primaryStoragePool._poolIp);
cmd.add("-p", primaryStoragePool._poolMountSourcePath);
@ -127,14 +128,21 @@ public class KVMHAMonitor extends KVMHABase implements Runnable {
cmd.add("-h", _hostIP);
result = cmd.execute();
if (result != null) {
s_logger.warn("write heartbeat failed: " + result + ", retry: " + i);
s_logger.warn("write heartbeat failed: " + result + ", try: " + i + " of " + _heartBeatUpdateMaxTries);
try {
Thread.sleep(_heartBeatUpdateRetrySleep);
} catch (InterruptedException e) {
s_logger.debug("[ignored] interupted between heartbeat retries.");
}
} else {
break;
}
}
if (result != null) {
s_logger.warn("write heartbeat failed: " + result + "; reboot the host");
// Stop cloudstack-agent if the heartbeat file cannot be written.
// This will raise an alert on the management server.
s_logger.warn("write heartbeat failed: " + result + "; stopping cloudstack-agent");
Script cmd = new Script(s_heartBeatPath, _heartBeatUpdateTimeout, s_logger);
cmd.add("-i", primaryStoragePool._poolIp);
cmd.add("-p", primaryStoragePool._poolMountSourcePath);

View File

@ -155,10 +155,10 @@ then
exit 0
elif [ "$cflag" == "1" ]
then
/usr/bin/logger -t heartbeat "kvmheartbeat.sh rebooted system because it was unable to write the heartbeat to the storage."
/usr/bin/logger -t heartbeat "kvmheartbeat.sh stopped cloudstack-agent because it was unable to write the heartbeat to the storage."
sync &
sleep 5
echo b > /proc/sysrq-trigger
service cloudstack-agent stop
exit $?
else
write_hbLog