mirror of
https://github.com/apache/cloudstack.git
synced 2025-10-26 08:42:29 +01:00
CLOUDSTACK-5164 Unmonit for 30 minutes for a failed process
This commit is contained in:
parent
ab2c38c050
commit
0be4a685e8
@ -64,7 +64,7 @@ crontab -l | grep -v monitorServices.py | crontab -
|
|||||||
create_config $config
|
create_config $config
|
||||||
|
|
||||||
#add cron job
|
#add cron job
|
||||||
(crontab -l ;echo -e "SHELL=/bin/bash\nPATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin\n */1 * * * * /usr/bin/python /root/monitorServices.py") | crontab -
|
(crontab -l ;echo -e "SHELL=/bin/bash\nPATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin\n */3 * * * * /usr/bin/python /root/monitorServices.py") | crontab -
|
||||||
|
|
||||||
|
|
||||||
unlock_exit 0 $lock $locked
|
unlock_exit 0 $lock $locked
|
||||||
|
|||||||
@ -19,14 +19,13 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
__author__ = 'jayapalreddy'
|
|
||||||
|
|
||||||
from ConfigParser import SafeConfigParser
|
from ConfigParser import SafeConfigParser
|
||||||
from subprocess import *
|
from subprocess import *
|
||||||
from os import path
|
from os import path
|
||||||
import time
|
import time
|
||||||
|
import os
|
||||||
|
|
||||||
monitor_log='/var/log/monitor.log'
|
|
||||||
class StatusCodes:
|
class StatusCodes:
|
||||||
SUCCESS = 0
|
SUCCESS = 0
|
||||||
FAILED = 1
|
FAILED = 1
|
||||||
@ -35,42 +34,58 @@ class StatusCodes:
|
|||||||
STOPPED = 4
|
STOPPED = 4
|
||||||
STARTING = 5
|
STARTING = 5
|
||||||
|
|
||||||
class Log:
    """Severity labels passed as the first argument of raisealert()."""

    INFO = 'INFO'
    ALERT = 'ALERT'
    CRIT = 'CRIT'
    NOTIF = 'NOTIF'
|
||||||
|
|
||||||
|
class Config:
    """Tunable constants used by the service monitor."""

    # Minutes a process is left unmonitored after it was recorded as
    # not running.
    MONIT_AFTER_MINS = 30
    # Seconds slept between retries.
    SLEEP_SEC = 1
    # Upper bound handed to range() for the retry loop in
    # checkProcessStatus.
    RETRY_ITERATIONS = 10
    # Retry iterations below this index only re-check the process state;
    # from this index on a service restart is attempted.
    RETRY_FOR_RESTART = 5
    # File the debug messages are written to.
    MONITOR_LOG = '/var/log/monitor.log'
    # File holding the "name:timestamp," list of unmonitored processes.
    UNMONIT_PS_FILE = '/etc/unmonit_psList.txt'
|
||||||
|
|
||||||
|
|
||||||
def getConfig( config_file_path = "/etc/monitor.conf" ):
    """
    Reads the process configuration from the config file.
    Config file contains the processes to be monitored.

    config_file_path -- path of the INI-style file to parse.

    Returns a dict that maps every section name to a dict of that
    section's option-name/value pairs.  A file that cannot be opened is
    skipped by parser.read(), so the result is then an empty dict.
    """
    parser = SafeConfigParser()
    parser.read( config_file_path )

    # One inner dict per section; parser.items() already yields
    # (name, value) pairs, so no manual copy loop is needed.
    return dict((section, dict(parser.items(section)))
                for section in parser.sections())
|
||||||
|
|
||||||
def printd (msg, debug=False):
    """
    prints the debug messages

    msg   -- any object; its str() form is written followed by a newline.
    debug -- when False (the default) nothing is written, which keeps
             the previous behaviour where logging was switched off by
             an early return.  Pass True to append the message to
             Config.MONITOR_LOG.

    Always returns 0.
    """
    if not debug:
        return 0

    # Append mode creates the log when it is missing and always writes
    # at the end, replacing the former 'r+' open followed by seek(0, 2).
    with open(Config.MONITOR_LOG, 'a') as f:
        f.write(str(msg)+"\n")
    return 0
|
||||||
|
|
||||||
def raisealert(severity, msg, process_name=None):
|
def raisealert(severity, msg, process_name=None):
|
||||||
|
""" Writes the alert message"""
|
||||||
|
|
||||||
#timeStr=str(time.ctime())
|
#timeStr=str(time.ctime())
|
||||||
if process_name is not None:
|
if process_name is not None:
|
||||||
log = '['+severity +']'+" " + '['+process_name+']' + " " + msg +"\n"
|
log = '['+severity +']'+" " + '['+process_name+']' + " " + msg +"\n"
|
||||||
@ -82,9 +97,12 @@ def raisealert(severity, msg, process_name=None):
|
|||||||
|
|
||||||
|
|
||||||
def isPidMatchPidFile(pidfile, pids):
|
def isPidMatchPidFile(pidfile, pids):
|
||||||
|
""" Compares the running process pid with the pid in pid file.
|
||||||
|
If a process with multiple pids then it matches with pid file
|
||||||
|
"""
|
||||||
|
|
||||||
if pids is None or isinstance(pids,list) != True or len(pids) == 0:
|
if pids is None or isinstance(pids,list) != True or len(pids) == 0:
|
||||||
print "Invalid Arguments"
|
printd ("Invalid Arguments")
|
||||||
return StatusCodes.FAILED
|
return StatusCodes.FAILED
|
||||||
if not path.isfile(pidfile):
|
if not path.isfile(pidfile):
|
||||||
#It seems there is no pid file for this service
|
#It seems there is no pid file for this service
|
||||||
@ -100,12 +118,18 @@ def isPidMatchPidFile(pidfile, pids):
|
|||||||
|
|
||||||
|
|
||||||
inp = fd.read()
|
inp = fd.read()
|
||||||
|
|
||||||
|
if not inp:
|
||||||
|
fd.close()
|
||||||
|
return StatusCodes.FAILED
|
||||||
|
|
||||||
printd("file content "+str(inp))
|
printd("file content "+str(inp))
|
||||||
printd(pids)
|
printd(pids)
|
||||||
tocheck_pid = inp.strip()
|
tocheck_pid = inp.strip()
|
||||||
for item in pids:
|
for item in pids:
|
||||||
if str(tocheck_pid) == item.strip():
|
if str(tocheck_pid) == item.strip():
|
||||||
printd("pid file matched")
|
printd("pid file matched")
|
||||||
|
fd.close()
|
||||||
return StatusCodes.SUCCESS
|
return StatusCodes.SUCCESS
|
||||||
|
|
||||||
fd.close()
|
fd.close()
|
||||||
@ -114,19 +138,22 @@ def isPidMatchPidFile(pidfile, pids):
|
|||||||
|
|
||||||
|
|
||||||
def checkProcessStatus( process ):
|
def checkProcessStatus( process ):
|
||||||
|
"""
|
||||||
|
Check the process running status, if not running tries to restart
|
||||||
|
"""
|
||||||
process_name = process.get('processname')
|
process_name = process.get('processname')
|
||||||
service_name = process.get('servicename')
|
service_name = process.get('servicename')
|
||||||
pidfile = process.get('pidfile')
|
pidfile = process.get('pidfile')
|
||||||
#temp_out = None
|
#temp_out = None
|
||||||
restartFailed=False
|
restartFailed=False
|
||||||
pidFileMatched=1
|
pidFileMatched=False
|
||||||
|
pids=''
|
||||||
cmd=''
|
cmd=''
|
||||||
if process_name is None:
|
if process_name is None:
|
||||||
print "\n Invalid Process Name"
|
printd ("\n Invalid Process Name")
|
||||||
return StatusCodes.INVALID_INP
|
return StatusCodes.INVALID_INP
|
||||||
else:
|
else:
|
||||||
msg="checking the process " + process_name
|
printd("checking the process " + process_name)
|
||||||
printd(msg)
|
|
||||||
cmd = 'pidof ' + process_name
|
cmd = 'pidof ' + process_name
|
||||||
printd(cmd)
|
printd(cmd)
|
||||||
#cmd = 'service ' + process_name + ' status'
|
#cmd = 'service ' + process_name + ' status'
|
||||||
@ -136,20 +163,19 @@ def checkProcessStatus( process ):
|
|||||||
|
|
||||||
#check there is only one pid or not
|
#check there is only one pid or not
|
||||||
if exitStatus == 0:
|
if exitStatus == 0:
|
||||||
|
pids = temp_out.split(' ')
|
||||||
msg="pids: " +temp_out;
|
msg="pids: " +temp_out;
|
||||||
printd(msg)
|
printd(msg)
|
||||||
pids = temp_out.split(' ')
|
|
||||||
|
|
||||||
#there is more than one process so match the pid file
|
#there is more than one process so match the pid file
|
||||||
#if not matched set pidFileMatched=0
|
#if not matched set pidFileMatched=False
|
||||||
printd("Checking pid file")
|
printd("Checking pid file")
|
||||||
if isPidMatchPidFile(pidfile, pids) == StatusCodes.SUCCESS:
|
if isPidMatchPidFile(pidfile, pids) == StatusCodes.SUCCESS:
|
||||||
pidFileMatched = 1;
|
pidFileMatched = True;
|
||||||
else:
|
else:
|
||||||
pidFileMatched = 0;
|
pidFileMatched = False;
|
||||||
|
|
||||||
printd(pidFileMatched)
|
if exitStatus == 0 and pidFileMatched == True:
|
||||||
if exitStatus == 0 and pidFileMatched == 1:
|
|
||||||
printd("The process is running ....")
|
printd("The process is running ....")
|
||||||
return StatusCodes.RUNNING
|
return StatusCodes.RUNNING
|
||||||
else:
|
else:
|
||||||
@ -157,28 +183,28 @@ def checkProcessStatus( process ):
|
|||||||
msg="The process " + process_name +" is not running trying recover "
|
msg="The process " + process_name +" is not running trying recover "
|
||||||
printd(msg)
|
printd(msg)
|
||||||
#Retry the process state for few seconds
|
#Retry the process state for few seconds
|
||||||
for i in range(1,10):
|
for i in range(1, Config.RETRY_ITERATIONS):
|
||||||
pout = Popen(cmd, shell=True, stdout=PIPE)
|
pout = Popen(cmd, shell=True, stdout=PIPE)
|
||||||
exitStatus = pout.wait()
|
exitStatus = pout.wait()
|
||||||
temp_out = pout.communicate()[0]
|
temp_out = pout.communicate()[0]
|
||||||
|
|
||||||
if i < 5: # this is just for trying few more times
|
if i < Config.RETRY_FOR_RESTART: # this is just for trying few more times
|
||||||
if exitStatus == 0:
|
if exitStatus == 0:
|
||||||
pids = temp_out.split(' ')
|
pids = temp_out.split(' ')
|
||||||
|
|
||||||
if isPidMatchPidFile(pidfile, pids) == StatusCodes.SUCCESS:
|
if isPidMatchPidFile(pidfile, pids) == StatusCodes.SUCCESS:
|
||||||
pidFileMatched = 1;
|
pidFileMatched = True;
|
||||||
printd("pid file is matched ...")
|
printd("pid file is matched ...")
|
||||||
raisealert(log.ALERT, "The process detected as running", process_name)
|
raisealert(Log.ALERT, "The process detected as running", process_name)
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
printd("pid file is not matched ...")
|
printd("pid file is not matched ...")
|
||||||
pidFileMatched = 0;
|
pidFileMatched = False;
|
||||||
|
time.sleep(Config.SLEEP_SEC)
|
||||||
continue
|
continue
|
||||||
time.sleep(1)
|
|
||||||
else:
|
else:
|
||||||
msg="The process " +process_name+" is not running trying recover "
|
msg="The process " +process_name+" is not running trying recover "
|
||||||
raisealert(log.INFO,process_name,msg)
|
raisealert(Log.INFO,process_name,msg)
|
||||||
|
|
||||||
if service_name == 'apache2':
|
if service_name == 'apache2':
|
||||||
# Killing apache2 process with this the main service will not start
|
# Killing apache2 process with this the main service will not start
|
||||||
@ -189,7 +215,7 @@ def checkProcessStatus( process ):
|
|||||||
|
|
||||||
cmd = 'service ' + service_name + ' restart'
|
cmd = 'service ' + service_name + ' restart'
|
||||||
|
|
||||||
time.sleep(1)
|
time.sleep(Config.SLEEP_SEC)
|
||||||
#return_val= check_call(cmd , shell=True)
|
#return_val= check_call(cmd , shell=True)
|
||||||
|
|
||||||
cout = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
|
cout = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
|
||||||
@ -198,37 +224,135 @@ def checkProcessStatus( process ):
|
|||||||
if return_val == 0:
|
if return_val == 0:
|
||||||
printd("The process" + process_name +" recovered successfully ")
|
printd("The process" + process_name +" recovered successfully ")
|
||||||
msg="The process " +process_name+" is recovered successfully "
|
msg="The process " +process_name+" is recovered successfully "
|
||||||
raisealert(log.INFO,msg,process_name)
|
raisealert(Log.INFO,msg,process_name)
|
||||||
|
|
||||||
break;
|
break;
|
||||||
else:
|
else:
|
||||||
#retry restarting the process for few tries
|
#retry restarting the process for few tries
|
||||||
printd("process restart failing trying again ....")
|
printd("process restart failing trying again ....")
|
||||||
restartFailed=True
|
restartFailed=True
|
||||||
time.sleep(1)
|
time.sleep(Config.SLEEP_SEC)
|
||||||
continue
|
continue
|
||||||
#for end here
|
#for end here
|
||||||
|
|
||||||
if restartFailed == True:
|
if restartFailed == True:
|
||||||
msg="The process %s recover failed "%process_name
|
msg="The process %s recover failed "%process_name
|
||||||
raisealert(log.ALERT,process_name,msg)
|
raisealert(Log.ALERT,process_name,msg)
|
||||||
|
|
||||||
printd("Restart failed after number of retries")
|
printd("Restart failed after number of retries")
|
||||||
return StatusCodes.STOPPED
|
return StatusCodes.STOPPED
|
||||||
|
|
||||||
return StatusCodes.RUNNING
|
return StatusCodes.RUNNING
|
||||||
|
|
||||||
def raiseAlert( process_name ):
|
|
||||||
print "process name %s is raised "%process_name
|
|
||||||
|
|
||||||
def monitProcess( processes_info ):
    """
    Monitors the processes which got from the config file

    processes_info -- dict mapping a process label to the properties
                      dict handed to checkProcessStatus().

    A process that is found not running is recorded, together with the
    current time, in Config.UNMONIT_PS_FILE and is then skipped until
    Config.MONIT_AFTER_MINS minutes have passed.  The file is removed
    once no process is recorded any more.

    Returns StatusCodes.INVALID_INP for an empty processes_info,
    otherwise None.
    """
    if len( processes_info ) == 0:
        printd("Invalid Input")
        return StatusCodes.INVALID_INP

    dict_unmonit = {}
    umonit_update = {}

    if not path.isfile(Config.UNMONIT_PS_FILE):
        printd('Unmonit File not exist')
    else:
        #load the dictionary with unmonit process list
        dict_unmonit = loadPsFromUnMonitFile()
        # The loader may hand back a status code instead of a dict when
        # the file could not be read; treat that as "nothing recorded".
        if not isinstance(dict_unmonit, dict):
            dict_unmonit = {}

    #time for noting process down time
    csec = int(time.time())

    for process, properties in processes_info.items():
        #skip the process if its time stamp is less than Config.MONIT_AFTER_MINS
        printd ("checking the process %s \n" %process)

        if process in dict_unmonit:
            ts = dict_unmonit[process]
            try:
                down_sec = csec - int(ts)
            except (TypeError, ValueError):
                # Unusable time stamp: monitor the process again rather
                # than abort the whole run.
                printd("Ignoring invalid unmonit time stamp for %s" %process)
                down_sec = None

            if down_sec is not None:
                printd("Time difference=%s" %str(down_sec))
                # Floor division keeps whole minutes on Python 2 and 3.
                tmin = down_sec // 60

                if tmin < Config.MONIT_AFTER_MINS:
                    raisealert(Log.ALERT, "The %s get monitor after %s minutes " %(process, Config.MONIT_AFTER_MINS))
                    printd('process will be monitored after %s min' %(str(int(Config.MONIT_AFTER_MINS) - tmin)))
                    # Carry the entry over: the file is rewritten from
                    # umonit_update, so dropping it here would restart
                    # monitoring of this process too early.
                    umonit_update[process] = ts
                    continue

        if checkProcessStatus( properties) != StatusCodes.RUNNING:
            printd( "\n Process %s is not Running"%process)
            #add this process into unmonit list
            printd ("updating the process for unmonit %s\n" %process)
            umonit_update[process] = csec

    #if dict is not empty write to file else delete it
    if umonit_update:
        writePsListToUnmonitFile(umonit_update)
    elif path.isfile(Config.UNMONIT_PS_FILE):
        printd("Removing the file %s" %Config.UNMONIT_PS_FILE)
        os.remove(Config.UNMONIT_PS_FILE)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def loadPsFromUnMonitFile():
    """
    Loads the unmonitored process list from Config.UNMONIT_PS_FILE.

    The file holds comma separated "name:timestamp" entries.

    Returns a dict mapping each process name to its time stamp string.
    An empty dict is returned when the file cannot be read or is empty,
    so the caller can always treat the result as a dict.
    """
    dict_unmonit = {}

    try:
        # The with block closes the file on every path, including the
        # early returns below.
        with open(Config.UNMONIT_PS_FILE) as fd:
            ps = fd.read()
    except (IOError, OSError):
        printd("Failed to open file %s " %(Config.UNMONIT_PS_FILE))
        return dict_unmonit

    if not ps:
        printd("File %s content is empty " %Config.UNMONIT_PS_FILE)
        return dict_unmonit

    printd(ps)
    for entry in ps.split(','):
        fields = entry.strip().split(':')
        # Skip the empty piece after the trailing comma and anything
        # that is not a "name:timestamp" pair.
        if len(fields) < 2 or not fields[0]:
            continue
        dict_unmonit[fields[0]] = fields[1]

    return dict_unmonit
|
||||||
|
|
||||||
|
|
||||||
|
def writePsListToUnmonitFile(umonit_update):
    """
    Writes the unmonitored process list to Config.UNMONIT_PS_FILE.

    umonit_update -- dict mapping a process name to the time stamp at
                     which it was found not running.

    The previous file content is replaced by one "name:timestamp,"
    entry per process.

    Returns StatusCodes.FAILED when the file cannot be opened and
    StatusCodes.SUCCESS once the list is written.  An error raised by
    the write itself is not caught.
    """
    printd("Write updated unmonit list to file")
    line = ''.join([str(name) + ":" + str(umonit_update[name]) + ','
                    for name in umonit_update])
    printd(line)

    try:
        fd = open(Config.UNMONIT_PS_FILE, 'w')
    except (IOError, OSError):
        printd("Failed to open file %s " %Config.UNMONIT_PS_FILE)
        return StatusCodes.FAILED

    # The with block closes the file even if the write raises.
    with fd:
        fd.write(line)
    return StatusCodes.SUCCESS
|
||||||
|
|
||||||
|
|
||||||
|
def is_emtpy(struct):
    """
    Checks whether the given struct is empty or not.

    Returns True when struct is falsy (for example an empty dict, list
    or string, or None) and False otherwise.
    """
    return not struct
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
'''
|
'''
|
||||||
@ -238,14 +362,11 @@ def main():
|
|||||||
printd("monitoring started")
|
printd("monitoring started")
|
||||||
temp_dict = getConfig()
|
temp_dict = getConfig()
|
||||||
|
|
||||||
'''
|
|
||||||
Step2: Get Previous Run Log
|
|
||||||
'''
|
|
||||||
|
|
||||||
'''
|
'''
|
||||||
Step3: Monitor and Raise Alert
|
Step2: Monitor and Raise Alert
|
||||||
'''
|
'''
|
||||||
#raisealert(log.INFO, 'Monit started')
|
#raisealert(Log.INFO, 'Monit started')
|
||||||
monitProcess( temp_dict )
|
monitProcess( temp_dict )
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user