#!/usr/bin/python # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. from configparser import ConfigParser from subprocess import * from datetime import datetime import time import os import logging import json from os import sys, path from health_checks.utility import getHealthChecksData class StatusCodes: SUCCESS = 0 FAILED = 1 INVALID_INP = 2 RUNNING = 3 STOPPED = 4 STARTING = 5 # see com.cloud.network.VirtualNetworkApplianceService.RouterHealthStatus and make sure to keep it aligned class RouterHealthStatus: SUCCESS = "SUCCESS" FAILED = "FAILED" WARNING = "WARNING" UNKNOWN = "UNKNOWN" class Log: INFO = 'INFO' ALERT = 'ALERT' CRIT = 'CRIT' NOTIF = 'NOTIF' class Config: SLEEP_SEC = 1 RETRY_ITERATIONS = 10 RETRY_FOR_RESTART = 5 MONITOR_LOG = '/var/log/monitor.log' HEALTH_CHECKS_DIR = 'health_checks' MONITOR_RESULT_FILE_SUFFIX = 'monitor_results.json' FAILING_CHECKS_FILE = 'failing_health_checks' def getServicesConfig( config_file_path = "/etc/monitor.conf" ): """ Reads the process configuration from the config file. Config file contains the processes to be monitored. """ process_dict = {} parser = ConfigParser() parser.read( config_file_path ) for section in parser.sections(): process_dict[section] = {} for name, value in parser.items(section): process_dict[section][name] = value printd (" %s = %r" % (name, value)) return process_dict def printd (msg): """ prints the debug messages """ #for debug #print msg f= open(Config.MONITOR_LOG, 'w' if not path.isfile(Config.MONITOR_LOG) else 'r+') f.seek(0, 2) f.write(str(msg)+"\n") f.close() print(str(msg)) def raisealert(severity, msg, process_name=None): """ Writes the alert message""" #timeStr=str(time.ctime()) if process_name is not None: log = '['+severity +']'+" " + '['+process_name+']' + " " + msg +"\n" else: log = '['+severity+']' + " " + msg +"\n" logging.basicConfig(level=logging.INFO,filename='/var/log/routerServiceMonitor.log',format='%(asctime)s %(message)s') logging.info(log) msg = 'logger -t monit '+ log pout = Popen(msg, shell=True, stdout=PIPE) print("[Alert] " + msg) def isPidMatchPidFile(pidfile, pids): """ Compares the running process pid with the pid in pid file. If a process with multiple pids then it matches with pid file """ if pids is None or isinstance(pids,list) != True or len(pids) == 0: printd ("Invalid Arguments") return StatusCodes.FAILED if not path.isfile(pidfile): #It seems there is no pid file for this service printd("The pid file "+pidfile+" is not there for this process") return StatusCodes.FAILED fd=None try: fd = open(pidfile,'r') except: printd("pid file: "+ pidfile +" open failed") return StatusCodes.FAILED inp = fd.read() if not inp: fd.close() return StatusCodes.FAILED printd("file content of pidfile " + pidfile + " = " + str(inp).strip()) printd(pids) tocheck_pid = inp.strip() for item in pids: if str(tocheck_pid) == item.strip(): printd("pid file matched") fd.close() return StatusCodes.SUCCESS fd.close() return StatusCodes.FAILED def checkProcessRunningStatus(process_name, pidFile): printd("checking the process " + process_name) cmd = '' pids = [] cmd = 'pidof ' + process_name printd(cmd) #cmd = 'service ' + process_name + ' status' pout = Popen(cmd, shell=True, stdout=PIPE) exitStatus = pout.wait() temp_out = pout.communicate()[0].decode() #check there is only one pid or not if exitStatus == 0: pids = temp_out.strip().split(' ') printd("pid(s) of process %s are %s " %(process_name, pids)) #there is more than one process so match the pid file #if not matched set pidFileMatched=False printd("Checking pid file") if isPidMatchPidFile(pidFile, pids) == StatusCodes.SUCCESS: return True,pids printd("pid of exit status %s" %exitStatus) return False,pids def restartService(service_name): cmd = 'service ' + service_name + ' restart' cout = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT) return_val = cout.wait() if return_val == 0: printd("The service " + service_name +" recovered successfully ") msg="The process " +service_name+" is recovered successfully " raisealert(Log.INFO,msg,service_name) return True else: printd("process restart failed ....") return False def checkProcessStatus( process ): """ Check the process running status, if not running tries to restart Returns the process status and if it was restarted """ process_name = process.get('processname') service_name = process.get('servicename') pidfile = process.get('pidfile') #temp_out = None restartFailed=False pidFileMatched=False pids='' cmd='' if process_name is None: printd ("\n Invalid Process Name") return StatusCodes.INVALID_INP, False status, pids = checkProcessRunningStatus(process_name, pidfile) if status == True: printd("The process is running ....") return StatusCodes.RUNNING, False else: printd("Process %s is not running trying to recover" %process_name) #Retry the process state for few seconds for i in range(1, Config.RETRY_ITERATIONS): time.sleep(Config.SLEEP_SEC) if i < Config.RETRY_FOR_RESTART: # this is just for trying few more times status, pids = checkProcessRunningStatus(process_name, pidfile) if status == True: raisealert(Log.ALERT, "The process detected as running", process_name) break else: printd("Process %s is not running checking the status again..." %process_name) continue else: msg="The process " +process_name+" is not running trying recover " raisealert(Log.INFO,process_name,msg) if service_name == 'apache2': # Killing apache2 process with this the main service will not start for pid in pids: cmd = 'kill -9 '+pid printd(cmd) Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT) if restartService(service_name) == True: break else: restartFailed = True continue #for end here if restartFailed == True: msg="The process %s recover failed "%process_name raisealert(Log.ALERT,process_name,msg) printd("Restart failed after number of retries") return StatusCodes.STOPPED, False return StatusCodes.RUNNING, True def monitProcess( processes_info ): """ Monitors the processes which got from the config file """ checkStartTime = time.time() service_status = {} failing_services = [] if len( processes_info ) == 0: printd("No config items provided - means a redundant VR or a VPC Router") return service_status, failing_services print("[Process Info] " + json.dumps(processes_info)) #time for noting process down time csec = repr(time.time()).split('.')[0] for process,properties in list(processes_info.items()): printd ("---------------------------\nchecking the service %s\n---------------------------- " %process) serviceName = process + ".service" processStatus, wasRestarted = checkProcessStatus(properties) routerHealth = RouterHealthStatus.UNKNOWN match processStatus: case StatusCodes.RUNNING: routerHealth = RouterHealthStatus.SUCCESS routerMessage = "service is running" + (", was restarted" if wasRestarted else "") case StatusCodes.STARTING: routerHealth = RouterHealthStatus.WARNING routerMessage = "service is starting at " + str(csec) case StatusCodes.STOPPED: routerHealth = RouterHealthStatus.WARNING routerMessage = "service down at last check " + str(csec) case StatusCodes.SUCCESS: routerHealth = RouterHealthStatus.UNKNOWN routerMessage = "service exisits but no status" case StatusCodes.FAILED | StatusCodes.INVALID_INP: routerHealth = RouterHealthStatus.FAILED routerMessage = "service down at last check " + str(csec) printd( "\n Service %s is status == " % routerHealth) checkEndTime = time.time() service_status[serviceName] = { "success": routerHealth, "lastUpdate": str(int(checkStartTime * 1000)), "lastRunDuration": str((checkEndTime - checkStartTime) * 1000), "message": routerMessage } if routerHealth != RouterHealthStatus.SUCCESS: failing_services.append(serviceName) return service_status, failing_services def execute(script, checkType = "basic"): checkStartTime = time.time() cmd = "./" + script + " " + checkType printd ("Executing health check script command: " + cmd) pout = Popen(cmd, shell=True, stdout=PIPE) exitStatus = pout.wait() output = pout.communicate()[0].decode().strip() checkEndTime = time.time() # we run all scripts and have to ignore the ones that do nothing if not len(output) > 0 and exitStatus == 0: return {} routerHealth = RouterHealthStatus.SUCCESS match exitStatus: case 1: routerHealth = RouterHealthStatus.FAILED case 2: routerHealth = RouterHealthStatus.WARNING case 3: routerHealth = RouterHealthStatus.UNKNOWN printd("Ended execution of " + script) return { "success": routerHealth, "lastUpdate": str(int(checkStartTime * 1000)), "lastRunDuration": str((checkEndTime - checkStartTime) * 1000), "message": output } def main(checkType = "basic"): startTime = time.time() ''' Step1 : Get Services Config ''' printd("monitoring started") configDict = getServicesConfig() ''' Step2: Monitor services and Raise Alerts ''' monitResult = {} failingChecks = [] if checkType == "basic": monitResult, failingChecks = monitProcess(configDict) ''' Step3: Run health check scripts as needed ''' hc_data = getHealthChecksData() if hc_data is not None and "health_checks_enabled" in hc_data and hc_data['health_checks_enabled']: hc_exclude = hc_data["excluded_health_checks"] if "excluded_health_checks" in hc_data else [] for f in os.listdir(Config.HEALTH_CHECKS_DIR): if f in hc_exclude: continue fpath = path.join(Config.HEALTH_CHECKS_DIR, f) if path.isfile(fpath) and os.access(fpath, os.X_OK): ret = execute(fpath, checkType) if len(ret) == 0: continue if "success" in ret and ret["success"].upper() == RouterHealthStatus.FAILED: failingChecks.append(f) monitResult[f] = ret ''' Step4: Write results to the json file for admins/management server to read ''' endTime = time.time() monitResult["lastRun"] = { "start": str(datetime.fromtimestamp(startTime)), "end": str(datetime.fromtimestamp(endTime)), "duration": str(endTime - startTime) } with open(checkType + "_" + Config.MONITOR_RESULT_FILE_SUFFIX, 'w') as f: json.dump(monitResult, f, ensure_ascii=False) failChecksFile = checkType + "_" + Config.FAILING_CHECKS_FILE if len(failingChecks) > 0: fcs = "" for fc in failingChecks: fcs = fcs + fc + "," fcs = fcs[0:-1] with open(failChecksFile, 'w') as f: f.write(fcs) elif path.isfile(failChecksFile): os.remove(failChecksFile) if __name__ == "__main__": checkType = "basic" if len(sys.argv) == 2: if sys.argv[1] == "advanced": main("advanced") elif sys.argv[1] == "basic": main("basic") else: printd("Error: Unknown type of test: " + sys.argv) else: main("basic") main("advanced")