From 023dcec5ef2e38091c0aacda1e0fae67fd6c4553 Mon Sep 17 00:00:00 2001 From: Slair1 Date: Mon, 20 Aug 2018 03:28:03 -0500 Subject: [PATCH] CLOUDSTACK-10310 Fix KVM reboot on storage issue (#2722) --- .../cloud/hypervisor/kvm/resource/KVMHABase.java | 3 ++- .../hypervisor/kvm/resource/KVMHAMonitor.java | 14 +++++++++++--- scripts/vm/hypervisor/kvm/kvmheartbeat.sh | 4 ++-- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHABase.java b/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHABase.java index be5ab396d19..f180848a8d5 100644 --- a/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHABase.java +++ b/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHABase.java @@ -34,7 +34,8 @@ public class KVMHABase { protected static String s_heartBeatPath; protected long _heartBeatUpdateTimeout = 60000; protected long _heartBeatUpdateFreq = 60000; - protected long _heartBeatUpdateMaxRetry = 3; + protected long _heartBeatUpdateMaxTries = 5; + protected long _heartBeatUpdateRetrySleep = 15000; public static enum PoolType { PrimaryStorage, SecondaryStorage diff --git a/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java b/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java index 0cebb4c9b00..8a11b7fc962 100644 --- a/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java +++ b/plugins/hypervisors/kvm/src/com/cloud/hypervisor/kvm/resource/KVMHAMonitor.java @@ -119,7 +119,8 @@ public class KVMHAMonitor extends KVMHABase implements Runnable { } String result = null; - for (int i = 0; i < 5; i++) { + // Try multiple times, but sleep in between tries to ensure it isn't a short lived transient error + for (int i = 1; i <= _heartBeatUpdateMaxTries; i++) { Script cmd = new Script(s_heartBeatPath, _heartBeatUpdateTimeout, s_logger); cmd.add("-i", primaryStoragePool._poolIp); cmd.add("-p", primaryStoragePool._poolMountSourcePath); @@ -127,14 +128,21 @@ public class KVMHAMonitor extends KVMHABase implements Runnable { cmd.add("-h", _hostIP); result = cmd.execute(); if (result != null) { - s_logger.warn("write heartbeat failed: " + result + ", retry: " + i); + s_logger.warn("write heartbeat failed: " + result + ", try: " + i + " of " + _heartBeatUpdateMaxTries); + try { + Thread.sleep(_heartBeatUpdateRetrySleep); + } catch (InterruptedException e) { + s_logger.debug("[ignored] interupted between heartbeat retries."); + } } else { break; } } if (result != null) { - s_logger.warn("write heartbeat failed: " + result + "; reboot the host"); + // Stop cloudstack-agent if can't write to heartbeat file. + // This will raise an alert on the mgmt server + s_logger.warn("write heartbeat failed: " + result + "; stopping cloudstack-agent"); Script cmd = new Script(s_heartBeatPath, _heartBeatUpdateTimeout, s_logger); cmd.add("-i", primaryStoragePool._poolIp); cmd.add("-p", primaryStoragePool._poolMountSourcePath); diff --git a/scripts/vm/hypervisor/kvm/kvmheartbeat.sh b/scripts/vm/hypervisor/kvm/kvmheartbeat.sh index 7c8ee67f30c..30ca72a2aa9 100755 --- a/scripts/vm/hypervisor/kvm/kvmheartbeat.sh +++ b/scripts/vm/hypervisor/kvm/kvmheartbeat.sh @@ -155,10 +155,10 @@ then exit 0 elif [ "$cflag" == "1" ] then - /usr/bin/logger -t heartbeat "kvmheartbeat.sh rebooted system because it was unable to write the heartbeat to the storage." + /usr/bin/logger -t heartbeat "kvmheartbeat.sh stopped cloudstack-agent because it was unable to write the heartbeat to the storage." sync & sleep 5 - echo b > /proc/sysrq-trigger + service cloudstack-agent stop exit $? else write_hbLog