From 2993c993632454ffe8be39607b94181834942f39 Mon Sep 17 00:00:00 2001 From: Sina Kashipazha Date: Fri, 8 Dec 2023 15:21:06 +0100 Subject: [PATCH] Add missing hosts info to the prometheus exporter output. (#8328) Sometimes the hostStats object of the agents becomes null in the management server. It is a rare situation, and we haven't found the root cause yet, but it occurs occasionally in our CloudStack deployments with many hosts. The hostStat is null, even though the agent is UP and hosting multiple VMs. It is possible to access the VM consoles and execute tasks on them. This pull request doesn't address the issue directly; rather it displays those hosts in Prometheus so we can restart the agent and get the necessary information. --- .../metrics/PrometheusExporterImpl.java | 85 ++++++++++++++----- 1 file changed, 62 insertions(+), 23 deletions(-) diff --git a/plugins/integrations/prometheus/src/main/java/org/apache/cloudstack/metrics/PrometheusExporterImpl.java b/plugins/integrations/prometheus/src/main/java/org/apache/cloudstack/metrics/PrometheusExporterImpl.java index 3b111da5961..17fbd48181a 100644 --- a/plugins/integrations/prometheus/src/main/java/org/apache/cloudstack/metrics/PrometheusExporterImpl.java +++ b/plugins/integrations/prometheus/src/main/java/org/apache/cloudstack/metrics/PrometheusExporterImpl.java @@ -82,6 +82,24 @@ public class PrometheusExporterImpl extends ManagerBase implements PrometheusExp private static final String ONLINE = "online"; private static final String OFFLINE = "offline"; + enum MissingInfoFilter { + Host_Stats("hostStats"), + CPU_CAPACITY("cpuCapacity"), + MEM_CAPACITY("memCapacity"), + CORE_CAPACITY("coreCapacity"); + + private final String name; + + MissingInfoFilter(String name){ + this.name = name; + } + + @Override + public String toString() { + return name; + } + } + private static List metricsItems = new ArrayList<>(); @Inject @@ -129,8 +147,6 @@ public class PrometheusExporterImpl extends ManagerBase implements PrometheusExp Map upHosts = new HashMap<>(); Map downHosts = new HashMap<>(); - HostStats hostStats; - for (final HostVO host : hostDao.listAll()) { if (host == null || host.getType() != Host.Type.Routing || host.getDataCenterId() != dcId) { continue; @@ -147,8 +163,6 @@ public class PrometheusExporterImpl extends ManagerBase implements PrometheusExp int isDedicated = (dr != null) ? 1 : 0; metricsList.add(new ItemHostIsDedicated(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), isDedicated)); - String hostTags = markTagMaps(host, totalHosts, upHosts, downHosts); - hostStats = ApiDBUtils.getHostStatistics(host.getId()); // Get account, domain details for dedicated hosts if (isDedicated == 1) { @@ -160,16 +174,22 @@ public class PrometheusExporterImpl extends ManagerBase implements PrometheusExp metricsList.add(new ItemHostDedicatedToAccount(zoneName, host.getName(), accountName, domain.getPath(), isDedicated)); } + String hostTags = markTagMaps(host, totalHosts, upHosts, downHosts); + HostStats hostStats = ApiDBUtils.getHostStatistics(host.getId()); + + if (hostStats == null){ + metricsList.add(new MissingHostInfo(zoneName, host.getName(), MissingInfoFilter.Host_Stats)); + } + final String cpuFactor = String.valueOf(CapacityManager.CpuOverprovisioningFactor.valueIn(host.getClusterId())); final CapacityVO cpuCapacity = capacityDao.findByHostIdType(host.getId(), Capacity.CAPACITY_TYPE_CPU); - final double cpuUsedMhz = hostStats.getCpuUtilization() * host.getCpus() * host.getSpeed() / 100.0 ; - if (host.isInMaintenanceStates()) { - metricsList.add(new ItemHostCpu(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), cpuFactor, ALLOCATED, 0L, isDedicated, hostTags)); - metricsList.add(new ItemHostCpu(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), cpuFactor, USED, 0L, isDedicated, hostTags)); - metricsList.add(new ItemHostCpu(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), cpuFactor, TOTAL, 0L, isDedicated, hostTags)); + if (cpuCapacity == null && !host.isInMaintenanceStates()){ + metricsList.add(new MissingHostInfo(zoneName, host.getName(), MissingInfoFilter.CPU_CAPACITY)); } - else if (cpuCapacity != null && cpuCapacity.getCapacityState() == CapacityState.Enabled) { + + if (hostStats != null && cpuCapacity != null && cpuCapacity.getCapacityState() == CapacityState.Enabled) { + final double cpuUsedMhz = hostStats.getCpuUtilization() * host.getCpus() * host.getSpeed() / 100.0 ; metricsList.add(new ItemHostCpu(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), cpuFactor, ALLOCATED, cpuCapacity.getUsedCapacity(), isDedicated, hostTags)); metricsList.add(new ItemHostCpu(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), cpuFactor, USED, cpuUsedMhz, isDedicated, hostTags)); metricsList.add(new ItemHostCpu(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), cpuFactor, TOTAL, cpuCapacity.getTotalCapacity(), isDedicated, hostTags)); @@ -181,12 +201,12 @@ public class PrometheusExporterImpl extends ManagerBase implements PrometheusExp final String memoryFactor = String.valueOf(CapacityManager.MemOverprovisioningFactor.valueIn(host.getClusterId())); final CapacityVO memCapacity = capacityDao.findByHostIdType(host.getId(), Capacity.CAPACITY_TYPE_MEMORY); - if (host.isInMaintenanceStates()) { - metricsList.add(new ItemHostMemory(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), memoryFactor, ALLOCATED, 0L, isDedicated, hostTags)); - metricsList.add(new ItemHostMemory(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), memoryFactor, USED, 0, isDedicated, hostTags)); - metricsList.add(new ItemHostMemory(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), memoryFactor, TOTAL, 0L, isDedicated, hostTags)); + + if (memCapacity == null && !host.isInMaintenanceStates()){ + metricsList.add(new MissingHostInfo(zoneName, host.getName(), MissingInfoFilter.MEM_CAPACITY)); } - else if (memCapacity != null && memCapacity.getCapacityState() == CapacityState.Enabled) { + + if (hostStats != null && memCapacity != null && memCapacity.getCapacityState() == CapacityState.Enabled) { metricsList.add(new ItemHostMemory(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), memoryFactor, ALLOCATED, memCapacity.getUsedCapacity(), isDedicated, hostTags)); metricsList.add(new ItemHostMemory(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), memoryFactor, USED, hostStats.getUsedMemory(), isDedicated, hostTags)); metricsList.add(new ItemHostMemory(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), memoryFactor, TOTAL, memCapacity.getTotalCapacity(), isDedicated, hostTags)); @@ -197,13 +217,13 @@ public class PrometheusExporterImpl extends ManagerBase implements PrometheusExp } metricsList.add(new ItemHostVM(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), vmDao.listByHostId(host.getId()).size())); - final CapacityVO coreCapacity = capacityDao.findByHostIdType(host.getId(), Capacity.CAPACITY_TYPE_CPU_CORE); - if (host.isInMaintenanceStates()) { - metricsList.add(new ItemVMCore(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), USED, 0L, isDedicated, hostTags)); - metricsList.add(new ItemVMCore(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), TOTAL, 0L, isDedicated, hostTags)); + + if (coreCapacity == null && !host.isInMaintenanceStates()){ + metricsList.add(new MissingHostInfo(zoneName, host.getName(), MissingInfoFilter.CORE_CAPACITY)); } - else if (coreCapacity != null && coreCapacity.getCapacityState() == CapacityState.Enabled) { + + if (hostStats != null && coreCapacity != null && coreCapacity.getCapacityState() == CapacityState.Enabled) { metricsList.add(new ItemVMCore(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), USED, coreCapacity.getUsedCapacity(), isDedicated, hostTags)); metricsList.add(new ItemVMCore(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), TOTAL, coreCapacity.getTotalCapacity(), isDedicated, hostTags)); } else { @@ -213,17 +233,17 @@ public class PrometheusExporterImpl extends ManagerBase implements PrometheusExp } final List cpuCapacity = capacityDao.findCapacityBy((int) Capacity.CAPACITY_TYPE_CPU, dcId, null, null); - if (cpuCapacity != null && cpuCapacity.size() > 0) { + if (cpuCapacity != null && !cpuCapacity.isEmpty()) { metricsList.add(new ItemHostCpu(zoneName, zoneUuid, null, null, null, null, ALLOCATED, cpuCapacity.get(0).getAllocatedCapacity() != null ? cpuCapacity.get(0).getAllocatedCapacity() : 0, 0, "")); } final List memCapacity = capacityDao.findCapacityBy((int) Capacity.CAPACITY_TYPE_MEMORY, dcId, null, null); - if (memCapacity != null && memCapacity.size() > 0) { + if (memCapacity != null && !memCapacity.isEmpty()) { metricsList.add(new ItemHostMemory(zoneName, zoneUuid, null, null, null, null, ALLOCATED, memCapacity.get(0).getAllocatedCapacity() != null ? memCapacity.get(0).getAllocatedCapacity() : 0, 0, "")); } final List coreCapacity = capacityDao.findCapacityBy((int) Capacity.CAPACITY_TYPE_CPU_CORE, dcId, null, null); - if (coreCapacity != null && coreCapacity.size() > 0) { + if (coreCapacity != null && !coreCapacity.isEmpty()) { metricsList.add(new ItemVMCore(zoneName, zoneUuid, null, null, null, ALLOCATED, coreCapacity.get(0).getAllocatedCapacity() != null ? coreCapacity.get(0).getAllocatedCapacity() : 0, 0, "")); } @@ -626,6 +646,25 @@ public class PrometheusExporterImpl extends ManagerBase implements PrometheusExp } } + class MissingHostInfo extends Item { + + String zoneName; + String hostName; + MissingInfoFilter filter; + + public MissingHostInfo(String zoneName, String hostname, MissingInfoFilter filter) { + super("cloudstack_host_missing_info"); + this.zoneName = zoneName; + this.hostName = hostname; + this.filter = filter; + } + + @Override + public String toMetricsString() { + return String.format("%s{zone=\"%s\",hostname=\"%s\",filter=\"%s\"} -1", name, zoneName, hostName, filter); + } + } + class ItemHostCpu extends Item { String zoneName; String zoneUuid;