#!/usr/bin/python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

"""
Service monitor.

Reads the list of processes to watch from a config file, checks each one
with ``pidof`` and its pid file, tries to restart the services that are
down, and remembers the ones that could not be recovered so that they are
skipped for Config.MONIT_AFTER_MINS minutes.
"""

try:
    from ConfigParser import SafeConfigParser
except ImportError:
    # Python 3 renamed the module; its ConfigParser is the old SafeConfigParser.
    from configparser import ConfigParser as SafeConfigParser
from subprocess import *
from os import path
import time
import os
import logging


class StatusCodes:
    """ Return codes shared by the monitor functions """
    SUCCESS = 0
    FAILED = 1
    INVALID_INP = 2
    RUNNING = 3
    STOPPED = 4
    STARTING = 5


class Log:
    """ Severity tags used in alert messages """
    INFO = 'INFO'
    ALERT = 'ALERT'
    CRIT = 'CRIT'
    NOTIF = 'NOTIF'


class Config:
    """ Tunables of the monitor """
    # minutes a failed process is left alone before it is monitored again
    MONIT_AFTER_MINS = 30
    # seconds to sleep between two retries
    SLEEP_SEC = 1
    # upper bound (exclusive) of the retry loop counter
    RETRY_ITERATIONS = 10
    # retries below this count only re-check the status, the rest restart
    RETRY_FOR_RESTART = 5
    MONITOR_LOG = '/var/log/monitor.log'
    UNMONIT_PS_FILE = '/etc/unmonit_psList.txt'
    # debug messages are dropped unless this is switched on
    DEBUG = False


def getConfig(config_file_path="/etc/monitor.conf"):
    """
    Reads the process configuration from the config file.
    Config file contains the processes to be monitored.

    Returns a dict of {section name: {option name: value}}. A missing or
    unreadable file gives an empty dict.
    """
    parser = SafeConfigParser()
    parser.read(config_file_path)

    process_dict = {}
    for section in parser.sections():
        process_dict[section] = dict(parser.items(section))

    return process_dict


def printd(msg):
    """
    prints the debug messages

    Debug output is off by default; when Config.DEBUG is set the message
    is appended to Config.MONITOR_LOG. Always returns 0.
    """
    if not getattr(Config, 'DEBUG', False):
        return 0

    try:
        with open(Config.MONITOR_LOG, 'a') as log_file:
            log_file.write(str(msg) + "\n")
    except (IOError, OSError):
        # debug logging is best effort, it must never stop the monitor
        pass

    return 0


def raisealert(severity, msg, process_name=None):
    """
    Writes the alert message

    The message is written to /var/log/routerServiceMonitor.log and sent
    to syslog through the ``logger`` command with the tag "monit".
    """
    if process_name is not None:
        log = '[' + severity + ']' + " " + '[' + process_name + ']' + " " + msg + "\n"
    else:
        log = '[' + severity + ']' + " " + msg + "\n"

    try:
        logging.basicConfig(level=logging.INFO, filename='/var/log/routerServiceMonitor.log', format='%(asctime)s %(message)s')
        logging.info(log)
    except (IOError, OSError):
        # an unwritable log file must not prevent the syslog alert below
        printd("Failed to write the alert to the service monitor log")

    # Argument list instead of a shell string: the shell would treat
    # "[INFO]" as a glob pattern and interpret any metacharacter in msg.
    try:
        Popen(['logger', '-t', 'monit', log.strip()], stdout=PIPE).communicate()
    except OSError:
        printd("Failed to run logger")


def isPidMatchPidFile(pidfile, pids):
    """
    Compares the running process pid with the pid in pid file.
    If a process with multiple pids then it matches with pid file

    Returns StatusCodes.SUCCESS when one of pids equals the content of
    pidfile, StatusCodes.FAILED in every other case.
    """
    if not isinstance(pids, list) or not pids:
        printd("Invalid Arguments")
        return StatusCodes.FAILED

    if not pidfile or not path.isfile(pidfile):
        # It seems there is no pid file for this service
        printd("The pid file %s is not there for this process" % pidfile)
        return StatusCodes.FAILED

    try:
        with open(pidfile, 'r') as fd:
            inp = fd.read()
    except (IOError, OSError):
        printd("pid file: " + pidfile + " open failed")
        return StatusCodes.FAILED

    tocheck_pid = inp.strip()
    if not tocheck_pid:
        # an empty pid file must not match an empty pid token
        return StatusCodes.FAILED

    printd("file content " + str(inp))
    printd(pids)

    for item in pids:
        if tocheck_pid == str(item).strip():
            printd("pid file matched")
            return StatusCodes.SUCCESS

    return StatusCodes.FAILED


def checkProcessRunningStatus(process_name, pidFile):
    """
    Checks with pidof whether the process is running and whether one of
    its pids is the one recorded in pidFile.

    Returns a tuple (running, pids) where pids is the list of pid strings
    reported by pidof (empty when pidof found nothing).
    """
    printd("checking the process " + process_name)
    pids = []

    cmd = ['pidof'] + process_name.split()
    printd(' '.join(cmd))

    try:
        pout = Popen(cmd, stdout=PIPE, universal_newlines=True)
        # communicate() drains the pipe before waiting, wait() alone can
        # block for ever once the pipe buffer is full
        temp_out = pout.communicate()[0]
        exitStatus = pout.returncode
    except OSError:
        printd("Failed to run pidof")
        return False, pids

    if exitStatus == 0:
        pids = temp_out.split()
        printd("pid(s) of process %s are %s " % (process_name, pids))

        # there may be more than one process so match the pid file
        printd("Checking pid file")
        if isPidMatchPidFile(pidFile, pids) == StatusCodes.SUCCESS:
            return True, pids

    printd("pid of exit status %s" % exitStatus)

    return False, pids


def restartService(service_name):
    """
    Restarts the service with "service <name> restart".

    Returns True when the command exits with 0, False otherwise.
    """
    if not service_name:
        printd("No service name given, nothing to restart")
        return False

    try:
        cout = Popen(['service', service_name, 'restart'], stdout=PIPE, stderr=STDOUT)
        # read the output so that a chatty init script can not block on the pipe
        cout.communicate()
        return_val = cout.returncode
    except OSError:
        printd("Failed to run the service command")
        return False

    if return_val == 0:
        printd("The service " + service_name + " recovered successfully ")
        msg = "The process " + service_name + " is recovered successfully "
        raisealert(Log.INFO, msg, service_name)
        return True

    printd("process restart failed ....")
    return False


def _killPids(pids):
    """ Sends SIGKILL to every pid of the list and waits for kill to finish """
    for pid in pids:
        pid = str(pid).strip()
        if not pid:
            continue
        printd('kill -9 ' + pid)
        try:
            Popen(['kill', '-9', pid], stdout=PIPE, stderr=STDOUT).communicate()
        except OSError:
            printd("Failed to run kill for pid " + pid)


def checkProcessStatus(process):
    """
    Check the process running status, if not running tries to restart

    process is the dict of one config section; the keys read here are
    'processname', 'servicename' and 'pidfile'.

    Returns StatusCodes.RUNNING when the process runs or was recovered,
    StatusCodes.STOPPED when it could not be recovered and
    StatusCodes.INVALID_INP when no process name is configured.
    """
    process_name = process.get('processname')
    service_name = process.get('servicename')
    pidfile = process.get('pidfile')

    if process_name is None:
        printd("\n Invalid Process Name")
        return StatusCodes.INVALID_INP

    status, pids = checkProcessRunningStatus(process_name, pidfile)
    if status:
        printd("The process is running ....")
        return StatusCodes.RUNNING

    printd("Process %s is not running trying to recover" % process_name)

    # Track the outcome of the LAST attempt: an early failed restart must
    # not be reported once a later one succeeded.
    recovered = False

    # Retry the process state for few seconds
    for i in range(1, Config.RETRY_ITERATIONS):
        time.sleep(Config.SLEEP_SEC)

        if i < Config.RETRY_FOR_RESTART:
            # this is just for trying few more times
            status, pids = checkProcessRunningStatus(process_name, pidfile)
            if status:
                raisealert(Log.ALERT, "The process detected as running", process_name)
                recovered = True
                break
            printd("Process %s is not running checking the status again..." % process_name)
            continue

        msg = "The process " + process_name + " is not running trying recover "
        raisealert(Log.INFO, msg, process_name)

        if service_name == 'apache2':
            # Killing apache2 process with this the main service will not start
            _killPids(pids)

        if restartService(service_name):
            recovered = True
            break

    if not recovered:
        msg = "The process %s recover failed " % process_name
        raisealert(Log.ALERT, msg, process_name)

        printd("Restart failed after number of retries")
        return StatusCodes.STOPPED

    return StatusCodes.RUNNING


def monitProcess(processes_info):
    """
    Monitors the processes which got from the config file

    Processes that can not be recovered are written to
    Config.UNMONIT_PS_FILE with the time of the failure and are skipped
    until Config.MONIT_AFTER_MINS minutes have passed.

    Returns StatusCodes.INVALID_INP for an empty input, None otherwise.
    """
    if not processes_info:
        printd("Invalid Input")
        return StatusCodes.INVALID_INP

    dict_unmonit = {}
    umonit_update = {}

    if not path.isfile(Config.UNMONIT_PS_FILE):
        printd('Unmonit File not exist')
    else:
        # load the dictionary with unmonit process list; the loader
        # returns a status code instead of a dict when it fails
        loaded = loadPsFromUnMonitFile()
        if isinstance(loaded, dict):
            dict_unmonit = loaded

    # time for noting process down time
    csec = str(int(time.time()))

    for process, properties in processes_info.items():
        printd("checking the service %s \n" % process)

        # skip the process if its time stamp is less than Config.MONIT_AFTER_MINS
        if process in dict_unmonit:
            ts = dict_unmonit[process]
            if not checkPsTimeStampForMonitor(csec, ts, properties):
                # keep the original time stamp, otherwise it is lost as
                # soon as the file is rewritten for another process
                umonit_update[process] = ts
                continue

        if checkProcessStatus(properties) != StatusCodes.RUNNING:
            printd("\n Service %s is not Running" % process)
            # add this process into unmonit list
            printd("updating the service for unmonit %s\n" % process)
            umonit_update[process] = csec

    # if dict is not empty write to file else delete it
    if umonit_update:
        writePsListToUnmonitFile(umonit_update)
    else:
        removeFile(Config.UNMONIT_PS_FILE)


def checkPsTimeStampForMonitor(csec, ts, process):
    """
    Tells whether a process that failed at ts may be monitored again.

    csec is the current time and ts the failure time, both in seconds.
    process is the config dict of the process (a plain name is accepted
    too) and is only used for the alert text.

    Returns False while less than Config.MONIT_AFTER_MINS minutes have
    passed, True otherwise. An unreadable time stamp returns True so that
    the process is monitored rather than ignored for ever.
    """
    try:
        elapsed_sec = int(csec) - int(ts)
    except (TypeError, ValueError):
        printd("Invalid time stamp %s" % str(ts))
        return True

    printd("Time difference=%s" % str(elapsed_sec))
    tmin = elapsed_sec // 60

    if tmin < Config.MONIT_AFTER_MINS:
        if isinstance(process, dict):
            ps_name = process.get('processname', process)
        else:
            ps_name = process
        raisealert(Log.ALERT, "The %s get monitor after %s minutes " % (ps_name, Config.MONIT_AFTER_MINS))
        printd('process will be monitored after %s min' % (str(int(Config.MONIT_AFTER_MINS) - tmin)))
        return False

    return True


def removeFile(fileName):
    """ Removes the file if it is there """
    if path.isfile(fileName):
        printd("Removing the file %s" % fileName)
        try:
            os.remove(fileName)
        except OSError:
            printd("Failed to remove the file %s" % fileName)


def loadPsFromUnMonitFile():
    """
    Loads the unmonit list from Config.UNMONIT_PS_FILE.

    The file holds comma separated "name:timestamp" entries. Returns a
    dict {name: timestamp string}, or StatusCodes.FAILED when the file
    can not be read or is empty. Entries without a name or a ':' are
    ignored.
    """
    try:
        with open(Config.UNMONIT_PS_FILE) as fd:
            ps = fd.read()
    except (IOError, OSError):
        printd("Failed to open file %s " % (Config.UNMONIT_PS_FILE))
        return StatusCodes.FAILED

    if not ps:
        printd("File %s content is empty " % Config.UNMONIT_PS_FILE)
        return StatusCodes.FAILED

    printd(ps)

    dict_unmonit = {}
    for entry in ps.split(','):
        fields = entry.strip().split(':')
        if len(fields) < 2 or not fields[0]:
            continue
        dict_unmonit[fields[0]] = fields[1]

    return dict_unmonit


def writePsListToUnmonitFile(umonit_update):
    """
    Writes the unmonit list to Config.UNMONIT_PS_FILE as comma terminated
    "name:timestamp" entries.

    Returns StatusCodes.FAILED when the file can not be written.
    """
    printd("Write updated unmonit list to file")
    line = ''.join(['%s:%s,' % (name, umonit_update[name]) for name in umonit_update])
    printd(line)

    try:
        with open(Config.UNMONIT_PS_FILE, 'w') as fd:
            fd.write(line)
    except (IOError, OSError):
        printd("Failed to open file %s " % Config.UNMONIT_PS_FILE)
        return StatusCodes.FAILED


def is_emtpy(struct):
    """
    Checks wether the given struct is empty or not
    """
    return not struct


def main():
    """ Entry point: loads the config and monitors the listed processes """
    # Step1 : Get Config
    printd("monitoring started")
    temp_dict = getConfig()

    # Step2: Monitor and Raise Alert
    monitProcess(temp_dict)


if __name__ == "__main__":
    main()