mirror of
				https://github.com/apache/cloudstack.git
				synced 2025-10-26 08:42:29 +01:00 
			
		
		
		
	
		
			
				
	
	
		
			393 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			393 lines
		
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/python
 | |
| # Licensed to the Apache Software Foundation (ASF) under one
 | |
| # or more contributor license agreements.  See the NOTICE file
 | |
| # distributed with this work for additional information
 | |
| # regarding copyright ownership.  The ASF licenses this file
 | |
| # to you under the Apache License, Version 2.0 (the
 | |
| # "License"); you may not use this file except in compliance
 | |
| # with the License.  You may obtain a copy of the License at
 | |
| #
 | |
| #   http://www.apache.org/licenses/LICENSE-2.0
 | |
| #
 | |
| # Unless required by applicable law or agreed to in writing,
 | |
| # software distributed under the License is distributed on an
 | |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 | |
| # KIND, either express or implied.  See the License for the
 | |
| # specific language governing permissions and limitations
 | |
| # under the License.
 | |
| 
 | |
| from configparser import ConfigParser
 | |
| from subprocess import *
 | |
| from datetime import datetime
 | |
| import time
 | |
| import os
 | |
| import logging
 | |
| import json
 | |
| from os import sys, path
 | |
| from health_checks.utility import getHealthChecksData
 | |
| 
 | |
class StatusCodes:
    """Integer result codes returned by the monitoring helpers."""

    # Generic outcome of a helper (e.g. the pid-file comparison).
    SUCCESS = 0
    FAILED = 1
    # The process entry handed to a check had no usable process name.
    INVALID_INP = 2
    # Process states reported by the process status check.
    RUNNING = 3
    STOPPED = 4
    # Declared for completeness; not referenced in the visible code.
    STARTING = 5
 | |
| 
 | |
class Log:
    """Severity tags; raisealert() wraps them in square brackets."""

    INFO = 'INFO'
    ALERT = 'ALERT'
    CRIT = 'CRIT'
    NOTIF = 'NOTIF'
 | |
| 
 | |
class Config:
    """Tunable constants and file names used by the monitor."""

    # Seconds slept between two iterations of the recovery loop.
    SLEEP_SEC = 1
    # Upper (exclusive) bound of the recovery loop counter, which starts at 1.
    RETRY_ITERATIONS = 10
    # Iterations below this number only re-check the process; from this
    # number on the service is restarted instead.
    RETRY_FOR_RESTART = 5

    # File that printd() appends every debug message to.
    MONITOR_LOG = '/var/log/monitor.log'

    # Directory (relative to the working directory) scanned for executable
    # health check scripts.
    HEALTH_CHECKS_DIR = 'health_checks'
    # Result files are named "<checkType>_" + this suffix.
    MONITOR_RESULT_FILE_SUFFIX = 'monitor_results.json'
    # Failing-check list files are named "<checkType>_" + this name.
    FAILING_CHECKS_FILE = 'failing_health_checks'
 | |
| 
 | |
def getServicesConfig(config_file_path="/etc/monitor.conf"):
    """
    Load the monitored-process definitions from an INI style config file.

    Every section of the file becomes one entry of the returned dict,
    mapping the section name to a dict of that section's option names and
    values.  Each option is also echoed through printd().  A file that
    cannot be read yields an empty dict, because ConfigParser.read() skips
    unreadable files silently.
    """
    cfg = ConfigParser()
    cfg.read(config_file_path)

    services = {}
    for section_name in cfg.sections():
        options = services[section_name] = {}
        for key, val in cfg.items(section_name):
            options[key] = val
            printd(" %s = %r" % (key, val))

    return services
 | |
| 
 | |
def printd(msg):
    """
    Write a debug message to the monitor log and echo it on stdout.

    msg is converted with str() and appended, followed by a newline, to
    Config.MONITOR_LOG; the same text is then printed.  Errors raised while
    opening or writing the log file propagate to the caller.
    """
    # Append mode creates the file when it is missing and always writes at
    # the end, so no existence check or explicit seek is needed, and the
    # context manager closes the handle even when the write fails.
    with open(Config.MONITOR_LOG, 'a') as log_file:
        log_file.write(str(msg) + "\n")
    print(str(msg))
 | |
| 
 | |
def raisealert(severity, msg, process_name=None):
    """
    Record an alert in the router monitor log and forward it to syslog.

    The alert line is "[severity] [process_name] msg"; the process part is
    left out when process_name is None.  The line is written through the
    logging module to /var/log/routerServiceMonitor.log, passed to the
    ``logger`` command with the tag "monit", and echoed on stdout.

    Args:
        severity: severity tag, e.g. one of the Log constants.
        msg: alert text.
        process_name: optional name of the process the alert is about.
    """
    if process_name is not None:
        log = '[' + severity + ']' + " " + '[' + process_name + ']' + " " + msg + "\n"
    else:
        log = '[' + severity + ']' + " " + msg + "\n"

    # basicConfig() only installs a handler on its first call; later calls
    # are no-ops because the root logger is already configured.
    logging.basicConfig(level=logging.INFO,
                        filename='/var/log/routerServiceMonitor.log',
                        format='%(asctime)s %(message)s')
    logging.info(log)

    # The alert is passed as a single argument instead of being spliced
    # into a shell command line, so brackets, quotes or other shell
    # metacharacters in the message are never globbed or interpreted.
    try:
        run(['logger', '-t', 'monit', log.rstrip("\n")],
            stdout=DEVNULL, check=False)
    except OSError as err:
        # A missing or unusable logger binary must not stop the monitor.
        logging.warning("could not run logger: %s", err)

    # The echoed text keeps the historical "logger -t monit ..." form.
    print("[Alert] " + 'logger -t monit ' + log)
 | |
| 
 | |
| 
 | |
def isPidMatchPidFile(pidfile, pids):
    """
    Check whether the pid stored in a pid file is one of the running pids.

    Args:
        pidfile: path of the pid file; may be None or empty when the
            configuration does not define one.
        pids: non-empty list of pid strings of the running process.

    Returns:
        StatusCodes.SUCCESS when the stripped pid file content equals one
        of the (stripped) entries of pids.  StatusCodes.FAILED otherwise:
        invalid pids argument, missing/unreadable/empty pid file, or no
        matching pid.
    """
    if not isinstance(pids, list) or len(pids) == 0:
        printd("Invalid Arguments")
        return StatusCodes.FAILED

    # A missing pidfile setting is treated like a missing file; passing
    # None to path.isfile() would raise TypeError.
    if not pidfile or not path.isfile(pidfile):
        # It seems there is no pid file for this service
        printd("The pid file " + str(pidfile) + " is not there for this process")
        return StatusCodes.FAILED

    try:
        # The context manager closes the file on every path, including a
        # failing read.
        with open(pidfile, 'r') as fd:
            inp = fd.read()
    except (OSError, ValueError):
        # ValueError covers undecodable content (UnicodeDecodeError).
        printd("pid file: " + str(pidfile) + " open failed")
        return StatusCodes.FAILED

    if not inp:
        return StatusCodes.FAILED

    tocheck_pid = inp.strip()
    printd("file content of pidfile " + str(pidfile) + " = " + tocheck_pid)
    printd(pids)

    if any(str(item).strip() == tocheck_pid for item in pids):
        printd("pid file matched")
        return StatusCodes.SUCCESS

    return StatusCodes.FAILED
 | |
| 
 | |
def checkProcessRunningStatus(process_name, pidFile):
    """
    Determine whether a process is running and matches its pid file.

    Runs ``pidof <process_name>`` and, when it exits with status 0,
    compares the reported pids with the content of pidFile.

    Args:
        process_name: name handed to pidof.
        pidFile: path of the pid file of the process.

    Returns:
        A (running, pids) tuple.  running is True only when pidof succeeded
        and the pid file matches one of the pids.  pids is the list of pid
        strings reported by pidof, or an empty list when pidof failed.
    """
    printd("checking the process " + process_name)
    pids = []
    # NOTE(review): process_name comes from the monitor config and is
    # interpolated into a shell command line.
    cmd = 'pidof ' + process_name
    printd(cmd)

    pout = Popen(cmd, shell=True, stdout=PIPE)
    # communicate() drains the pipe and then reaps the child; calling
    # wait() first can block forever once the pipe buffer fills up.
    temp_out = pout.communicate()[0].decode()
    exitStatus = pout.returncode

    if exitStatus == 0:
        # split() without arguments never produces empty entries.
        pids = temp_out.split()
        printd("pid(s) of process %s are %s " % (process_name, pids))

        # There may be more than one pid, so match against the pid file.
        printd("Checking pid file")
        if isPidMatchPidFile(pidFile, pids) == StatusCodes.SUCCESS:
            return True, pids

    printd("pid of exit status %s" % exitStatus)

    return False, pids
 | |
| 
 | |
def restartService(service_name):
    """
    Restart a service through ``service <service_name> restart``.

    Args:
        service_name: name of the service; None or an empty string is
            reported as a failed restart instead of raising.

    Returns:
        True when the restart command exits with status 0 (an INFO alert is
        raised in that case), False otherwise.
    """
    if not service_name:
        printd("process restart failed ....")
        return False

    # NOTE(review): service_name comes from the monitor config and is
    # interpolated into a shell command line.
    cmd = 'service ' + service_name + ' restart'
    cout = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
    # communicate() drains the combined stdout/stderr pipe; a bare wait()
    # can block forever when the command prints more than the pipe holds.
    raw_output = cout.communicate()[0]
    return_val = cout.returncode

    if return_val != 0:
        printd("process restart failed ....")
        details = raw_output.decode(errors="replace").strip()
        if details:
            # Keep the command output to make the failure diagnosable.
            printd(details)
        return False

    printd("The service " + service_name + " recovered successfully ")
    msg = "The process " + service_name + " is recovered successfully "
    raisealert(Log.INFO, msg, service_name)
    return True
 | |
| 
 | |
def checkProcessStatus( process ):
    """
    Check the process running status, if not running tries to restart.

    Args:
        process: dict of one config section; the keys 'processname',
            'servicename' and 'pidfile' are read.

    Returns:
        A (status, was_restarted) tuple:
        - (StatusCodes.INVALID_INP, False) when no process name is set;
        - (StatusCodes.RUNNING, False) when the process is running, either
          on the first check or on one of the re-checks;
        - (StatusCodes.RUNNING, True) when a service restart succeeded;
        - (StatusCodes.STOPPED, False) when the process could not be
          recovered.
    """
    process_name = process.get('processname')
    service_name = process.get('servicename')
    pidfile = process.get('pidfile')

    if process_name is None:
        printd("\n Invalid Process Name")
        return StatusCodes.INVALID_INP, False

    status, pids = checkProcessRunningStatus(process_name, pidfile)
    if status:
        printd("The process is running ....")
        return StatusCodes.RUNNING, False

    printd("Process %s is not running trying to recover" % process_name)

    recovered = False
    wasRestarted = False

    # The first iterations only re-check the process state; from
    # Config.RETRY_FOR_RESTART on, every iteration tries a restart.
    for i in range(1, Config.RETRY_ITERATIONS):
        time.sleep(Config.SLEEP_SEC)

        if i < Config.RETRY_FOR_RESTART:
            status, pids = checkProcessRunningStatus(process_name, pidfile)
            if status:
                raisealert(Log.ALERT, "The process detected as running", process_name)
                recovered = True
                break
            printd("Process %s is not running checking the status again..." % process_name)
            continue

        msg = "The process " + process_name + " is not running trying recover "
        raisealert(Log.INFO, msg, process_name)

        if service_name == 'apache2':
            # Leftover apache2 processes prevent the main service from
            # starting, so kill them and wait for each kill to finish
            # before the restart is attempted.
            for pid in pids:
                cmd = 'kill -9 ' + pid
                printd(cmd)
                Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT).communicate()

        if restartService(service_name):
            # A success after earlier failed attempts still counts as
            # recovered.
            recovered = True
            wasRestarted = True
            break

    if not recovered:
        msg = "The process %s recover failed " % process_name
        raisealert(Log.ALERT, msg, process_name)

        printd("Restart failed after number of retries")
        return StatusCodes.STOPPED, False

    return StatusCodes.RUNNING, wasRestarted
 | |
| 
 | |
| 
 | |
def monitProcess(processes_info):
    """
    Check every configured process and collect one result record each.

    processes_info maps a config section name to its option dict.  The
    function returns a (service_status, failing_services) tuple:
    service_status maps "<section>.service" to a record with the keys
    "success", "lastUpdate", "lastRunDuration" and "message", and
    failing_services lists the "<section>.service" names whose process is
    not running.  Both are empty when processes_info is empty.
    """
    started_at = time.time()
    statuses = {}
    failed = []

    if not len(processes_info):
        printd("No config items provided - means a redundant VR or a VPC Router")
        return statuses, failed

    print("[Process Info] " + json.dumps(processes_info))

    # Whole-second timestamp used in the "service down" message.
    down_stamp = repr(time.time()).partition('.')[0]

    def record(ok_text, text):
        # Both timing fields are measured from the start of this call.
        finished_at = time.time()
        return {
            "success": ok_text,
            "lastUpdate": str(int(started_at * 1000)),
            "lastRunDuration": str((finished_at - started_at) * 1000),
            "message": text,
        }

    for section, options in processes_info.items():
        printd("---------------------------\nchecking the service %s\n---------------------------- " % section)
        unit_name = section + ".service"
        state, restarted = checkProcessStatus(options)

        if state == StatusCodes.RUNNING:
            suffix = ", was restarted" if restarted else ""
            statuses[unit_name] = record("true", "service is running" + suffix)
            continue

        printd("\n Service %s is not Running" % section)
        statuses[unit_name] = record("false", "service down at last check " + str(down_stamp))
        failed.append(unit_name)

    return statuses, failed
 | |
| 
 | |
| 
 | |
def execute(script, checkType = "basic"):
    """
    Run one health check script and turn its outcome into a result record.

    The script is started as ``./<script> <checkType>`` through the shell.

    Args:
        script: path of the script, relative to the working directory.
        checkType: argument passed to the script.

    Returns:
        An empty dict when the script exits with status 0 without printing
        anything (the check is skipped).  Otherwise a dict with the keys
        "success" ("true" for exit status 0, else "false"), "lastUpdate",
        "lastRunDuration" and "message" (the stripped script output).
    """
    checkStartTime = time.time()
    cmd = "./" + script + " " + checkType
    printd("Executing health check script command: " + cmd)

    pout = Popen(cmd, shell=True, stdout=PIPE)
    # communicate() drains stdout and then reaps the child; calling wait()
    # first can block forever when the script prints more than the pipe
    # buffer holds.  Undecodable bytes are replaced so that odd script
    # output cannot abort the whole monitoring run.
    output = pout.communicate()[0].decode(errors="replace").strip()
    exitStatus = pout.returncode
    checkEndTime = time.time()

    succeeded = exitStatus == 0
    if succeeded and len(output) == 0:
        return {}  # Skip script if no output is received

    if succeeded:
        printd("Successful execution of " + script)
    else:
        printd("Script execution failed " + script)

    return {
        "success": "true" if succeeded else "false",
        "lastUpdate": str(int(checkStartTime * 1000)),
        "lastRunDuration": str((checkEndTime - checkStartTime) * 1000),
        "message": output
    }
 | |
| 
 | |
def main(checkType = "basic"):
    """
    Run one monitoring pass and persist its results.

    Steps:
      1. read the services config;
      2. for the "basic" check type, monitor the configured processes;
      3. when health checks are enabled, run every executable file of
         Config.HEALTH_CHECKS_DIR that is not excluded;
      4. write all results as JSON to
         "<checkType>_" + Config.MONITOR_RESULT_FILE_SUFFIX, and write the
         comma separated names of the failing checks to
         "<checkType>_" + Config.FAILING_CHECKS_FILE, or remove that file
         when nothing failed.
    """
    run_started = time.time()

    # Step 1: get the services config.
    printd("monitoring started")
    services = getServicesConfig()

    # Step 2: monitor services and raise alerts.
    if checkType == "basic":
        results, failing = monitProcess(services)
    else:
        results, failing = {}, []

    # Step 3: run health check scripts as needed.
    hc_data = getHealthChecksData()
    checks_enabled = (
        hc_data is not None
        and "health_checks_enabled" in hc_data
        and hc_data['health_checks_enabled']
    )
    if checks_enabled:
        if "excluded_health_checks" in hc_data:
            excluded = hc_data["excluded_health_checks"]
        else:
            excluded = []

        for entry in os.listdir(Config.HEALTH_CHECKS_DIR):
            if entry in excluded:
                continue
            script_path = path.join(Config.HEALTH_CHECKS_DIR, entry)
            if not (path.isfile(script_path) and os.access(script_path, os.X_OK)):
                continue

            outcome = execute(script_path, checkType)
            if len(outcome) == 0:
                # The script asked to be skipped.
                continue
            if "success" in outcome and outcome["success"].lower() == "false":
                failing.append(entry)
            results[entry] = outcome

    # Step 4: write results for admins / the management server to read.
    run_ended = time.time()
    results["lastRun"] = {
        "start": str(datetime.fromtimestamp(run_started)),
        "end": str(datetime.fromtimestamp(run_ended)),
        "duration": str(run_ended - run_started)
    }

    result_file = checkType + "_" + Config.MONITOR_RESULT_FILE_SUFFIX
    with open(result_file, 'w') as out:
        json.dump(results, out, ensure_ascii=False)

    failChecksFile = checkType + "_" + Config.FAILING_CHECKS_FILE
    if len(failing) > 0:
        with open(failChecksFile, 'w') as out:
            out.write(",".join(failing))
    elif path.isfile(failChecksFile):
        os.remove(failChecksFile)
 | |
| 
 | |
if __name__ == "__main__":
    # With exactly one argument, run only the requested check type;
    # with any other argument count, run both passes.
    if len(sys.argv) == 2:
        requested = sys.argv[1]
        if requested in ("advanced", "basic"):
            main(requested)
        else:
            # Report the offending argument itself; concatenating the
            # whole sys.argv list to a str raises TypeError.
            printd("Error: Unknown type of test: " + requested)
    else:
        main("basic")
        main("advanced")
 |