Add missing hosts info to the prometheus exporter output. (#8328)

Sometimes the hostStats object of an agent becomes null in the management server. This is rare and we haven't found the root cause yet, but it does occur occasionally in our CloudStack deployments with many hosts.

The hostStats object is null even though the agent is UP and hosting multiple VMs; it is still possible to access the VM consoles and execute tasks on them.

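This pull request doesn't address the root cause directly; rather, it reports the affected hosts in the Prometheus exporter output (as a `cloudstack_host_missing_info` metric) so that we can identify them, restart the agent, and get the necessary information again.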
This commit is contained in:
Sina Kashipazha 2023-12-08 15:21:06 +01:00 committed by GitHub
parent c599011ef5
commit 2993c99363
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -82,6 +82,24 @@ public class PrometheusExporterImpl extends ManagerBase implements PrometheusExp
private static final String ONLINE = "online";
private static final String OFFLINE = "offline";
/**
 * Identifies which piece of per-host information could not be obtained while
 * building the metrics list.
 *
 * <p>Each constant carries a short label; {@link #toString()} returns that label
 * rather than the constant's identifier, so formatting a constant with {@code %s}
 * yields the label.
 */
enum MissingInfoFilter {
    /** Host statistics were not available for the host. */
    Host_Stats("hostStats"),
    /** No CPU capacity record was found for the host. */
    CPU_CAPACITY("cpuCapacity"),
    /** No memory capacity record was found for the host. */
    MEM_CAPACITY("memCapacity"),
    /** No CPU-core capacity record was found for the host. */
    CORE_CAPACITY("coreCapacity");

    /** Text returned by {@link #toString()} for this constant. */
    private final String label;

    MissingInfoFilter(final String label) {
        this.label = label;
    }

    /**
     * @return the label associated with this constant (not the constant's identifier)
     */
    @Override
    public String toString() {
        return this.label;
    }
}
private static List<Item> metricsItems = new ArrayList<>();
@Inject
@ -129,8 +147,6 @@ public class PrometheusExporterImpl extends ManagerBase implements PrometheusExp
Map<String, Integer> upHosts = new HashMap<>();
Map<String, Integer> downHosts = new HashMap<>();
HostStats hostStats;
for (final HostVO host : hostDao.listAll()) {
if (host == null || host.getType() != Host.Type.Routing || host.getDataCenterId() != dcId) {
continue;
@ -147,8 +163,6 @@ public class PrometheusExporterImpl extends ManagerBase implements PrometheusExp
int isDedicated = (dr != null) ? 1 : 0;
metricsList.add(new ItemHostIsDedicated(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), isDedicated));
String hostTags = markTagMaps(host, totalHosts, upHosts, downHosts);
hostStats = ApiDBUtils.getHostStatistics(host.getId());
// Get account, domain details for dedicated hosts
if (isDedicated == 1) {
@ -160,16 +174,22 @@ public class PrometheusExporterImpl extends ManagerBase implements PrometheusExp
metricsList.add(new ItemHostDedicatedToAccount(zoneName, host.getName(), accountName, domain.getPath(), isDedicated));
}
String hostTags = markTagMaps(host, totalHosts, upHosts, downHosts);
HostStats hostStats = ApiDBUtils.getHostStatistics(host.getId());
if (hostStats == null){
metricsList.add(new MissingHostInfo(zoneName, host.getName(), MissingInfoFilter.Host_Stats));
}
final String cpuFactor = String.valueOf(CapacityManager.CpuOverprovisioningFactor.valueIn(host.getClusterId()));
final CapacityVO cpuCapacity = capacityDao.findByHostIdType(host.getId(), Capacity.CAPACITY_TYPE_CPU);
final double cpuUsedMhz = hostStats.getCpuUtilization() * host.getCpus() * host.getSpeed() / 100.0 ;
if (host.isInMaintenanceStates()) {
metricsList.add(new ItemHostCpu(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), cpuFactor, ALLOCATED, 0L, isDedicated, hostTags));
metricsList.add(new ItemHostCpu(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), cpuFactor, USED, 0L, isDedicated, hostTags));
metricsList.add(new ItemHostCpu(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), cpuFactor, TOTAL, 0L, isDedicated, hostTags));
if (cpuCapacity == null && !host.isInMaintenanceStates()){
metricsList.add(new MissingHostInfo(zoneName, host.getName(), MissingInfoFilter.CPU_CAPACITY));
}
else if (cpuCapacity != null && cpuCapacity.getCapacityState() == CapacityState.Enabled) {
if (hostStats != null && cpuCapacity != null && cpuCapacity.getCapacityState() == CapacityState.Enabled) {
final double cpuUsedMhz = hostStats.getCpuUtilization() * host.getCpus() * host.getSpeed() / 100.0 ;
metricsList.add(new ItemHostCpu(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), cpuFactor, ALLOCATED, cpuCapacity.getUsedCapacity(), isDedicated, hostTags));
metricsList.add(new ItemHostCpu(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), cpuFactor, USED, cpuUsedMhz, isDedicated, hostTags));
metricsList.add(new ItemHostCpu(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), cpuFactor, TOTAL, cpuCapacity.getTotalCapacity(), isDedicated, hostTags));
@ -181,12 +201,12 @@ public class PrometheusExporterImpl extends ManagerBase implements PrometheusExp
final String memoryFactor = String.valueOf(CapacityManager.MemOverprovisioningFactor.valueIn(host.getClusterId()));
final CapacityVO memCapacity = capacityDao.findByHostIdType(host.getId(), Capacity.CAPACITY_TYPE_MEMORY);
if (host.isInMaintenanceStates()) {
metricsList.add(new ItemHostMemory(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), memoryFactor, ALLOCATED, 0L, isDedicated, hostTags));
metricsList.add(new ItemHostMemory(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), memoryFactor, USED, 0, isDedicated, hostTags));
metricsList.add(new ItemHostMemory(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), memoryFactor, TOTAL, 0L, isDedicated, hostTags));
if (memCapacity == null && !host.isInMaintenanceStates()){
metricsList.add(new MissingHostInfo(zoneName, host.getName(), MissingInfoFilter.MEM_CAPACITY));
}
else if (memCapacity != null && memCapacity.getCapacityState() == CapacityState.Enabled) {
if (hostStats != null && memCapacity != null && memCapacity.getCapacityState() == CapacityState.Enabled) {
metricsList.add(new ItemHostMemory(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), memoryFactor, ALLOCATED, memCapacity.getUsedCapacity(), isDedicated, hostTags));
metricsList.add(new ItemHostMemory(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), memoryFactor, USED, hostStats.getUsedMemory(), isDedicated, hostTags));
metricsList.add(new ItemHostMemory(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), memoryFactor, TOTAL, memCapacity.getTotalCapacity(), isDedicated, hostTags));
@ -197,13 +217,13 @@ public class PrometheusExporterImpl extends ManagerBase implements PrometheusExp
}
metricsList.add(new ItemHostVM(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), vmDao.listByHostId(host.getId()).size()));
final CapacityVO coreCapacity = capacityDao.findByHostIdType(host.getId(), Capacity.CAPACITY_TYPE_CPU_CORE);
if (host.isInMaintenanceStates()) {
metricsList.add(new ItemVMCore(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), USED, 0L, isDedicated, hostTags));
metricsList.add(new ItemVMCore(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), TOTAL, 0L, isDedicated, hostTags));
if (coreCapacity == null && !host.isInMaintenanceStates()){
metricsList.add(new MissingHostInfo(zoneName, host.getName(), MissingInfoFilter.CORE_CAPACITY));
}
else if (coreCapacity != null && coreCapacity.getCapacityState() == CapacityState.Enabled) {
if (hostStats != null && coreCapacity != null && coreCapacity.getCapacityState() == CapacityState.Enabled) {
metricsList.add(new ItemVMCore(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), USED, coreCapacity.getUsedCapacity(), isDedicated, hostTags));
metricsList.add(new ItemVMCore(zoneName, zoneUuid, host.getName(), host.getUuid(), host.getPrivateIpAddress(), TOTAL, coreCapacity.getTotalCapacity(), isDedicated, hostTags));
} else {
@ -213,17 +233,17 @@ public class PrometheusExporterImpl extends ManagerBase implements PrometheusExp
}
final List<CapacityDaoImpl.SummedCapacity> cpuCapacity = capacityDao.findCapacityBy((int) Capacity.CAPACITY_TYPE_CPU, dcId, null, null);
if (cpuCapacity != null && cpuCapacity.size() > 0) {
if (cpuCapacity != null && !cpuCapacity.isEmpty()) {
metricsList.add(new ItemHostCpu(zoneName, zoneUuid, null, null, null, null, ALLOCATED, cpuCapacity.get(0).getAllocatedCapacity() != null ? cpuCapacity.get(0).getAllocatedCapacity() : 0, 0, ""));
}
final List<CapacityDaoImpl.SummedCapacity> memCapacity = capacityDao.findCapacityBy((int) Capacity.CAPACITY_TYPE_MEMORY, dcId, null, null);
if (memCapacity != null && memCapacity.size() > 0) {
if (memCapacity != null && !memCapacity.isEmpty()) {
metricsList.add(new ItemHostMemory(zoneName, zoneUuid, null, null, null, null, ALLOCATED, memCapacity.get(0).getAllocatedCapacity() != null ? memCapacity.get(0).getAllocatedCapacity() : 0, 0, ""));
}
final List<CapacityDaoImpl.SummedCapacity> coreCapacity = capacityDao.findCapacityBy((int) Capacity.CAPACITY_TYPE_CPU_CORE, dcId, null, null);
if (coreCapacity != null && coreCapacity.size() > 0) {
if (coreCapacity != null && !coreCapacity.isEmpty()) {
metricsList.add(new ItemVMCore(zoneName, zoneUuid, null, null, null, ALLOCATED, coreCapacity.get(0).getAllocatedCapacity() != null ? coreCapacity.get(0).getAllocatedCapacity() : 0, 0, ""));
}
@ -626,6 +646,25 @@ public class PrometheusExporterImpl extends ManagerBase implements PrometheusExp
}
}
/**
 * Metric item that flags a host for which some piece of information is missing.
 *
 * <p>It is registered under the metric name {@code cloudstack_host_missing_info}
 * and always renders with the constant value {@code -1}; the {@code filter} label
 * tells which information was missing.
 */
class MissingHostInfo extends Item {

    String zoneName;
    String hostName;
    MissingInfoFilter filter;

    /**
     * @param zoneName name of the zone, emitted as the {@code zone} label
     * @param hostname name of the host, emitted as the {@code hostname} label
     * @param filter   which information is missing, emitted as the {@code filter} label
     */
    public MissingHostInfo(String zoneName, String hostname, MissingInfoFilter filter) {
        super("cloudstack_host_missing_info");
        this.filter = filter;
        this.hostName = hostname;
        this.zoneName = zoneName;
    }

    /**
     * Renders this item as a single metric line with three labels and the value -1.
     *
     * @return the formatted metric line
     */
    @Override
    public String toMetricsString() {
        // `name` is presumably the metric name stored by the Item superclass — confirm against Item.
        final String template = "%s{zone=\"%s\",hostname=\"%s\",filter=\"%s\"} -1";
        return String.format(template, name, zoneName, hostName, filter);
    }
}
class ItemHostCpu extends Item {
String zoneName;
String zoneUuid;