CLOUDSTACK-10310 Fix KVM reboot on storage issue (#2722)

This commit is contained in:
Slair1 2018-08-20 03:28:03 -05:00 committed by dahn
parent 9b772db0f1
commit 023dcec5ef
3 changed files with 15 additions and 6 deletions

View File

@ -34,7 +34,8 @@ public class KVMHABase {
protected static String s_heartBeatPath; protected static String s_heartBeatPath;
protected long _heartBeatUpdateTimeout = 60000; protected long _heartBeatUpdateTimeout = 60000;
protected long _heartBeatUpdateFreq = 60000; protected long _heartBeatUpdateFreq = 60000;
protected long _heartBeatUpdateMaxRetry = 3; protected long _heartBeatUpdateMaxTries = 5;
protected long _heartBeatUpdateRetrySleep = 15000;
public static enum PoolType { public static enum PoolType {
PrimaryStorage, SecondaryStorage PrimaryStorage, SecondaryStorage

View File

@ -119,7 +119,8 @@ public class KVMHAMonitor extends KVMHABase implements Runnable {
} }
String result = null; String result = null;
for (int i = 0; i < 5; i++) { // Try multiple times, sleeping between tries, to make sure the failure isn't just a short-lived transient error
for (int i = 1; i <= _heartBeatUpdateMaxTries; i++) {
Script cmd = new Script(s_heartBeatPath, _heartBeatUpdateTimeout, s_logger); Script cmd = new Script(s_heartBeatPath, _heartBeatUpdateTimeout, s_logger);
cmd.add("-i", primaryStoragePool._poolIp); cmd.add("-i", primaryStoragePool._poolIp);
cmd.add("-p", primaryStoragePool._poolMountSourcePath); cmd.add("-p", primaryStoragePool._poolMountSourcePath);
@ -127,14 +128,21 @@ public class KVMHAMonitor extends KVMHABase implements Runnable {
cmd.add("-h", _hostIP); cmd.add("-h", _hostIP);
result = cmd.execute(); result = cmd.execute();
if (result != null) { if (result != null) {
s_logger.warn("write heartbeat failed: " + result + ", retry: " + i); s_logger.warn("write heartbeat failed: " + result + ", try: " + i + " of " + _heartBeatUpdateMaxTries);
try {
Thread.sleep(_heartBeatUpdateRetrySleep);
} catch (InterruptedException e) {
s_logger.debug("[ignored] interupted between heartbeat retries.");
}
} else { } else {
break; break;
} }
} }
if (result != null) { if (result != null) {
s_logger.warn("write heartbeat failed: " + result + "; reboot the host"); // Stop cloudstack-agent if the heartbeat file cannot be written.
// This will raise an alert on the management server.
s_logger.warn("write heartbeat failed: " + result + "; stopping cloudstack-agent");
Script cmd = new Script(s_heartBeatPath, _heartBeatUpdateTimeout, s_logger); Script cmd = new Script(s_heartBeatPath, _heartBeatUpdateTimeout, s_logger);
cmd.add("-i", primaryStoragePool._poolIp); cmd.add("-i", primaryStoragePool._poolIp);
cmd.add("-p", primaryStoragePool._poolMountSourcePath); cmd.add("-p", primaryStoragePool._poolMountSourcePath);

View File

@ -155,10 +155,10 @@ then
exit 0 exit 0
elif [ "$cflag" == "1" ] elif [ "$cflag" == "1" ]
then then
/usr/bin/logger -t heartbeat "kvmheartbeat.sh rebooted system because it was unable to write the heartbeat to the storage." /usr/bin/logger -t heartbeat "kvmheartbeat.sh stopped cloudstack-agent because it was unable to write the heartbeat to the storage."
sync & sync &
sleep 5 sleep 5
echo b > /proc/sysrq-trigger service cloudstack-agent stop
exit $? exit $?
else else
write_hbLog write_hbLog