mirror of
https://github.com/apache/cloudstack.git
synced 2025-10-26 08:42:29 +01:00
* [routers] distiction between fatal failure and warning or unknown on healthchecks * UI status for router health checks * status from scripts varied * automation signalled errors * revert removal of update sql * upgradeversion * move config item and further cleanup * handling services better * backwards compatible response --------- Co-authored-by: Daan Hoogland <dahn@apache.org>
413 lines
13 KiB
Python
Executable File
413 lines
13 KiB
Python
Executable File
#!/usr/bin/python
|
|
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
from configparser import ConfigParser
|
|
from subprocess import *
|
|
from datetime import datetime
|
|
import time
|
|
import os
|
|
import logging
|
|
import json
|
|
from os import sys, path
|
|
from health_checks.utility import getHealthChecksData
|
|
|
|
class StatusCodes:
|
|
SUCCESS = 0
|
|
FAILED = 1
|
|
INVALID_INP = 2
|
|
RUNNING = 3
|
|
STOPPED = 4
|
|
STARTING = 5
|
|
|
|
# see com.cloud.network.VirtualNetworkApplianceService.RouterHealthStatus and make sure to keep it aligned
|
|
class RouterHealthStatus:
|
|
SUCCESS = "SUCCESS"
|
|
FAILED = "FAILED"
|
|
WARNING = "WARNING"
|
|
UNKNOWN = "UNKNOWN"
|
|
|
|
class Log:
|
|
INFO = 'INFO'
|
|
ALERT = 'ALERT'
|
|
CRIT = 'CRIT'
|
|
NOTIF = 'NOTIF'
|
|
|
|
class Config:
|
|
SLEEP_SEC = 1
|
|
RETRY_ITERATIONS = 10
|
|
RETRY_FOR_RESTART = 5
|
|
MONITOR_LOG = '/var/log/monitor.log'
|
|
HEALTH_CHECKS_DIR = 'health_checks'
|
|
MONITOR_RESULT_FILE_SUFFIX = 'monitor_results.json'
|
|
FAILING_CHECKS_FILE = 'failing_health_checks'
|
|
|
|
def getServicesConfig( config_file_path = "/etc/monitor.conf" ):
|
|
"""
|
|
Reads the process configuration from the config file.
|
|
Config file contains the processes to be monitored.
|
|
|
|
"""
|
|
process_dict = {}
|
|
parser = ConfigParser()
|
|
parser.read( config_file_path )
|
|
|
|
|
|
for section in parser.sections():
|
|
process_dict[section] = {}
|
|
|
|
for name, value in parser.items(section):
|
|
process_dict[section][name] = value
|
|
printd (" %s = %r" % (name, value))
|
|
|
|
return process_dict
|
|
|
|
def printd (msg):
|
|
"""
|
|
prints the debug messages
|
|
"""
|
|
|
|
#for debug
|
|
#print msg
|
|
|
|
f= open(Config.MONITOR_LOG, 'w' if not path.isfile(Config.MONITOR_LOG) else 'r+')
|
|
f.seek(0, 2)
|
|
f.write(str(msg)+"\n")
|
|
f.close()
|
|
print(str(msg))
|
|
|
|
def raisealert(severity, msg, process_name=None):
|
|
""" Writes the alert message"""
|
|
|
|
#timeStr=str(time.ctime())
|
|
if process_name is not None:
|
|
log = '['+severity +']'+" " + '['+process_name+']' + " " + msg +"\n"
|
|
else:
|
|
log = '['+severity+']' + " " + msg +"\n"
|
|
|
|
logging.basicConfig(level=logging.INFO,filename='/var/log/routerServiceMonitor.log',format='%(asctime)s %(message)s')
|
|
logging.info(log)
|
|
msg = 'logger -t monit '+ log
|
|
pout = Popen(msg, shell=True, stdout=PIPE)
|
|
print("[Alert] " + msg)
|
|
|
|
|
|
def isPidMatchPidFile(pidfile, pids):
|
|
""" Compares the running process pid with the pid in pid file.
|
|
If a process with multiple pids then it matches with pid file
|
|
"""
|
|
|
|
if pids is None or isinstance(pids,list) != True or len(pids) == 0:
|
|
printd ("Invalid Arguments")
|
|
return StatusCodes.FAILED
|
|
if not path.isfile(pidfile):
|
|
#It seems there is no pid file for this service
|
|
printd("The pid file "+pidfile+" is not there for this process")
|
|
return StatusCodes.FAILED
|
|
|
|
fd=None
|
|
try:
|
|
fd = open(pidfile,'r')
|
|
except:
|
|
printd("pid file: "+ pidfile +" open failed")
|
|
return StatusCodes.FAILED
|
|
|
|
|
|
inp = fd.read()
|
|
|
|
if not inp:
|
|
fd.close()
|
|
return StatusCodes.FAILED
|
|
|
|
printd("file content of pidfile " + pidfile + " = " + str(inp).strip())
|
|
printd(pids)
|
|
tocheck_pid = inp.strip()
|
|
for item in pids:
|
|
if str(tocheck_pid) == item.strip():
|
|
printd("pid file matched")
|
|
fd.close()
|
|
return StatusCodes.SUCCESS
|
|
|
|
fd.close()
|
|
return StatusCodes.FAILED
|
|
|
|
def checkProcessRunningStatus(process_name, pidFile):
|
|
printd("checking the process " + process_name)
|
|
cmd = ''
|
|
pids = []
|
|
cmd = 'pidof ' + process_name
|
|
printd(cmd)
|
|
|
|
#cmd = 'service ' + process_name + ' status'
|
|
pout = Popen(cmd, shell=True, stdout=PIPE)
|
|
exitStatus = pout.wait()
|
|
temp_out = pout.communicate()[0].decode()
|
|
|
|
#check there is only one pid or not
|
|
if exitStatus == 0:
|
|
pids = temp_out.strip().split(' ')
|
|
printd("pid(s) of process %s are %s " %(process_name, pids))
|
|
|
|
#there is more than one process so match the pid file
|
|
#if not matched set pidFileMatched=False
|
|
printd("Checking pid file")
|
|
if isPidMatchPidFile(pidFile, pids) == StatusCodes.SUCCESS:
|
|
return True,pids
|
|
|
|
printd("pid of exit status %s" %exitStatus)
|
|
|
|
return False,pids
|
|
|
|
def restartService(service_name):
|
|
|
|
cmd = 'service ' + service_name + ' restart'
|
|
cout = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
|
|
return_val = cout.wait()
|
|
|
|
if return_val == 0:
|
|
printd("The service " + service_name +" recovered successfully ")
|
|
msg="The process " +service_name+" is recovered successfully "
|
|
raisealert(Log.INFO,msg,service_name)
|
|
return True
|
|
else:
|
|
printd("process restart failed ....")
|
|
|
|
return False
|
|
|
|
def checkProcessStatus( process ):
|
|
"""
|
|
Check the process running status, if not running tries to restart
|
|
Returns the process status and if it was restarted
|
|
"""
|
|
process_name = process.get('processname')
|
|
service_name = process.get('servicename')
|
|
pidfile = process.get('pidfile')
|
|
#temp_out = None
|
|
restartFailed=False
|
|
pidFileMatched=False
|
|
pids=''
|
|
cmd=''
|
|
if process_name is None:
|
|
printd ("\n Invalid Process Name")
|
|
return StatusCodes.INVALID_INP, False
|
|
|
|
status, pids = checkProcessRunningStatus(process_name, pidfile)
|
|
|
|
if status == True:
|
|
printd("The process is running ....")
|
|
return StatusCodes.RUNNING, False
|
|
else:
|
|
printd("Process %s is not running trying to recover" %process_name)
|
|
#Retry the process state for few seconds
|
|
|
|
for i in range(1, Config.RETRY_ITERATIONS):
|
|
time.sleep(Config.SLEEP_SEC)
|
|
|
|
if i < Config.RETRY_FOR_RESTART: # this is just for trying few more times
|
|
|
|
status, pids = checkProcessRunningStatus(process_name, pidfile)
|
|
if status == True:
|
|
raisealert(Log.ALERT, "The process detected as running", process_name)
|
|
break
|
|
else:
|
|
printd("Process %s is not running checking the status again..." %process_name)
|
|
continue
|
|
else:
|
|
msg="The process " +process_name+" is not running trying recover "
|
|
raisealert(Log.INFO,process_name,msg)
|
|
|
|
if service_name == 'apache2':
|
|
# Killing apache2 process with this the main service will not start
|
|
for pid in pids:
|
|
cmd = 'kill -9 '+pid
|
|
printd(cmd)
|
|
Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
|
|
|
|
if restartService(service_name) == True:
|
|
break
|
|
else:
|
|
restartFailed = True
|
|
continue
|
|
#for end here
|
|
|
|
if restartFailed == True:
|
|
msg="The process %s recover failed "%process_name
|
|
raisealert(Log.ALERT,process_name,msg)
|
|
|
|
printd("Restart failed after number of retries")
|
|
return StatusCodes.STOPPED, False
|
|
|
|
return StatusCodes.RUNNING, True
|
|
|
|
|
|
def monitProcess( processes_info ):
|
|
"""
|
|
Monitors the processes which got from the config file
|
|
"""
|
|
checkStartTime = time.time()
|
|
service_status = {}
|
|
failing_services = []
|
|
if len( processes_info ) == 0:
|
|
printd("No config items provided - means a redundant VR or a VPC Router")
|
|
return service_status, failing_services
|
|
|
|
print("[Process Info] " + json.dumps(processes_info))
|
|
|
|
#time for noting process down time
|
|
csec = repr(time.time()).split('.')[0]
|
|
|
|
for process,properties in list(processes_info.items()):
|
|
printd ("---------------------------\nchecking the service %s\n---------------------------- " %process)
|
|
serviceName = process + ".service"
|
|
processStatus, wasRestarted = checkProcessStatus(properties)
|
|
routerHealth = RouterHealthStatus.UNKNOWN
|
|
|
|
match processStatus:
|
|
case StatusCodes.RUNNING:
|
|
routerHealth = RouterHealthStatus.SUCCESS
|
|
routerMessage = "service is running" + (", was restarted" if wasRestarted else "")
|
|
case StatusCodes.STARTING:
|
|
routerHealth = RouterHealthStatus.WARNING
|
|
routerMessage = "service is starting at " + str(csec)
|
|
case StatusCodes.STOPPED:
|
|
routerHealth = RouterHealthStatus.WARNING
|
|
routerMessage = "service down at last check " + str(csec)
|
|
case StatusCodes.SUCCESS:
|
|
routerHealth = RouterHealthStatus.UNKNOWN
|
|
routerMessage = "service exisits but no status"
|
|
case StatusCodes.FAILED | StatusCodes.INVALID_INP:
|
|
routerHealth = RouterHealthStatus.FAILED
|
|
routerMessage = "service down at last check " + str(csec)
|
|
|
|
printd( "\n Service %s is status == " % routerHealth)
|
|
checkEndTime = time.time()
|
|
service_status[serviceName] = {
|
|
"success": routerHealth,
|
|
"lastUpdate": str(int(checkStartTime * 1000)),
|
|
"lastRunDuration": str((checkEndTime - checkStartTime) * 1000),
|
|
"message": routerMessage
|
|
}
|
|
if routerHealth != RouterHealthStatus.SUCCESS:
|
|
failing_services.append(serviceName)
|
|
|
|
return service_status, failing_services
|
|
|
|
|
|
def execute(script, checkType = "basic"):
|
|
checkStartTime = time.time()
|
|
cmd = "./" + script + " " + checkType
|
|
printd ("Executing health check script command: " + cmd)
|
|
|
|
pout = Popen(cmd, shell=True, stdout=PIPE)
|
|
exitStatus = pout.wait()
|
|
output = pout.communicate()[0].decode().strip()
|
|
checkEndTime = time.time()
|
|
|
|
# we run all scripts and have to ignore the ones that do nothing
|
|
if not len(output) > 0 and exitStatus == 0:
|
|
return {}
|
|
|
|
routerHealth = RouterHealthStatus.SUCCESS
|
|
match exitStatus:
|
|
case 1:
|
|
routerHealth = RouterHealthStatus.FAILED
|
|
case 2:
|
|
routerHealth = RouterHealthStatus.WARNING
|
|
case 3:
|
|
routerHealth = RouterHealthStatus.UNKNOWN
|
|
|
|
printd("Ended execution of " + script)
|
|
return {
|
|
"success": routerHealth,
|
|
"lastUpdate": str(int(checkStartTime * 1000)),
|
|
"lastRunDuration": str((checkEndTime - checkStartTime) * 1000),
|
|
"message": output
|
|
}
|
|
|
|
def main(checkType = "basic"):
|
|
startTime = time.time()
|
|
'''
|
|
Step1 : Get Services Config
|
|
'''
|
|
printd("monitoring started")
|
|
configDict = getServicesConfig()
|
|
|
|
'''
|
|
Step2: Monitor services and Raise Alerts
|
|
'''
|
|
monitResult = {}
|
|
failingChecks = []
|
|
if checkType == "basic":
|
|
monitResult, failingChecks = monitProcess(configDict)
|
|
|
|
'''
|
|
Step3: Run health check scripts as needed
|
|
'''
|
|
hc_data = getHealthChecksData()
|
|
|
|
if hc_data is not None and "health_checks_enabled" in hc_data and hc_data['health_checks_enabled']:
|
|
hc_exclude = hc_data["excluded_health_checks"] if "excluded_health_checks" in hc_data else []
|
|
for f in os.listdir(Config.HEALTH_CHECKS_DIR):
|
|
if f in hc_exclude:
|
|
continue
|
|
fpath = path.join(Config.HEALTH_CHECKS_DIR, f)
|
|
if path.isfile(fpath) and os.access(fpath, os.X_OK):
|
|
ret = execute(fpath, checkType)
|
|
if len(ret) == 0:
|
|
continue
|
|
if "success" in ret and ret["success"].upper() == RouterHealthStatus.FAILED:
|
|
failingChecks.append(f)
|
|
monitResult[f] = ret
|
|
|
|
'''
|
|
Step4: Write results to the json file for admins/management server to read
|
|
'''
|
|
|
|
endTime = time.time()
|
|
monitResult["lastRun"] = {
|
|
"start": str(datetime.fromtimestamp(startTime)),
|
|
"end": str(datetime.fromtimestamp(endTime)),
|
|
"duration": str(endTime - startTime)
|
|
}
|
|
|
|
with open(checkType + "_" + Config.MONITOR_RESULT_FILE_SUFFIX, 'w') as f:
|
|
json.dump(monitResult, f, ensure_ascii=False)
|
|
|
|
failChecksFile = checkType + "_" + Config.FAILING_CHECKS_FILE
|
|
if len(failingChecks) > 0:
|
|
fcs = ""
|
|
for fc in failingChecks:
|
|
fcs = fcs + fc + ","
|
|
fcs = fcs[0:-1]
|
|
with open(failChecksFile, 'w') as f:
|
|
f.write(fcs)
|
|
elif path.isfile(failChecksFile):
|
|
os.remove(failChecksFile)
|
|
|
|
if __name__ == "__main__":
|
|
checkType = "basic"
|
|
if len(sys.argv) == 2:
|
|
if sys.argv[1] == "advanced":
|
|
main("advanced")
|
|
elif sys.argv[1] == "basic":
|
|
main("basic")
|
|
else:
|
|
printd("Error: Unknown type of test: " + sys.argv)
|
|
else:
|
|
main("basic")
|
|
main("advanced")
|