Improve HA logs (#5241)

Co-authored-by: GutoVeronezi <daniel@scclouds.com.br>
This commit is contained in:
Daniel Augusto Veronezi Salvador 2021-07-30 16:13:16 -03:00 committed by GitHub
parent 0d8b4de1b2
commit 82df04ecc8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 31 additions and 25 deletions

View File

@ -16,9 +16,9 @@
// under the License. // under the License.
package com.cloud.hypervisor.kvm.resource; package com.cloud.hypervisor.kvm.resource;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.concurrent.Callable; import java.util.concurrent.Callable;
import java.util.stream.Collectors;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
@ -27,13 +27,13 @@ import com.cloud.utils.script.Script;
public class KVMHAChecker extends KVMHABase implements Callable<Boolean> { public class KVMHAChecker extends KVMHABase implements Callable<Boolean> {
private static final Logger s_logger = Logger.getLogger(KVMHAChecker.class); private static final Logger s_logger = Logger.getLogger(KVMHAChecker.class);
private List<NfsStoragePool> _pools; private List<NfsStoragePool> nfsStoragePools;
private String _hostIP; private String hostIp;
private long _heartBeatCheckerTimeout = 360000; /* 6 minutes */ private long heartBeatCheckerTimeout = 360000; // 6 minutes
public KVMHAChecker(List<NfsStoragePool> pools, String host) { public KVMHAChecker(List<NfsStoragePool> pools, String host) {
this._pools = pools; this.nfsStoragePools = pools;
this._hostIP = host; this.hostIp = host;
} }
/* /*
@ -42,35 +42,40 @@ public class KVMHAChecker extends KVMHABase implements Callable<Boolean> {
*/ */
@Override @Override
public Boolean checkingHeartBeat() { public Boolean checkingHeartBeat() {
List<Boolean> results = new ArrayList<Boolean>(); boolean validResult = false;
for (NfsStoragePool pool : _pools) {
Script cmd = new Script(s_heartBeatPath, _heartBeatCheckerTimeout, s_logger); String hostAndPools = String.format("host IP [%s] in pools [%s]", hostIp, nfsStoragePools.stream().map(pool -> pool._poolIp).collect(Collectors.joining(", ")));
s_logger.debug(String.format("Checking heart beat with KVMHAChecker for %s", hostAndPools));
for (NfsStoragePool pool : nfsStoragePools) {
Script cmd = new Script(s_heartBeatPath, heartBeatCheckerTimeout, s_logger);
cmd.add("-i", pool._poolIp); cmd.add("-i", pool._poolIp);
cmd.add("-p", pool._poolMountSourcePath); cmd.add("-p", pool._poolMountSourcePath);
cmd.add("-m", pool._mountDestPath); cmd.add("-m", pool._mountDestPath);
cmd.add("-h", _hostIP); cmd.add("-h", hostIp);
cmd.add("-r"); cmd.add("-r");
cmd.add("-t", String.valueOf(_heartBeatUpdateFreq / 1000)); cmd.add("-t", String.valueOf(_heartBeatUpdateFreq / 1000));
OutputInterpreter.OneLineParser parser = new OutputInterpreter.OneLineParser(); OutputInterpreter.OneLineParser parser = new OutputInterpreter.OneLineParser();
String result = cmd.execute(parser); String result = cmd.execute(parser);
s_logger.debug("KVMHAChecker pool: " + pool._poolIp); String parsedLine = parser.getLine();
s_logger.debug("KVMHAChecker result: " + result);
s_logger.debug("KVMHAChecker parser: " + parser.getLine()); s_logger.debug(String.format("Checking heart beat with KVMHAChecker [{command=\"%s\", result: \"%s\", log: \"%s\", pool: \"%s\"}].", cmd.toString(), result, parsedLine,
if (result == null && parser.getLine().contains("> DEAD <")) { pool._poolIp));
s_logger.debug("read heartbeat failed: ");
results.add(false); if (result == null && parsedLine.contains("DEAD")) {
s_logger.warn(String.format("Checking heart beat with KVMHAChecker command [%s] returned [%s]. [%s]. It may cause a shutdown of host IP [%s].", cmd.toString(),
result, parsedLine, hostIp));
} else { } else {
results.add(true); validResult = true;
} }
} }
for (Boolean r : results) { if (!validResult) {
if (r) { s_logger.warn(String.format("All checks with KVMHAChecker for %s considered it as dead. It may cause a shutdown of the host.", hostAndPools));
return true;
}
} }
return false; return validResult;
} }
@Override @Override

View File

@ -138,7 +138,7 @@ check_hbLog() {
diff=`expr $now - $hb` diff=`expr $now - $hb`
if [ $diff -gt $interval ] if [ $diff -gt $interval ]
then then
return 1 return $diff
fi fi
return 0 return 0
} }
@ -146,11 +146,12 @@ check_hbLog() {
if [ "$rflag" == "1" ] if [ "$rflag" == "1" ]
then then
check_hbLog check_hbLog
if [ $? == 0 ] diff=$?
if [ $diff == 0 ]
then then
echo "=====> ALIVE <=====" echo "=====> ALIVE <====="
else else
echo "=====> DEAD <======" echo "=====> Considering host as DEAD because last write on [$hbFile] was [$diff] seconds ago, but the max interval is [$interval] <======"
fi fi
exit 0 exit 0
elif [ "$cflag" == "1" ] elif [ "$cflag" == "1" ]