Add watchdog for verifier processes.
Add a watchdog in parent process to check verifier child processes, and restart if needed. Change-Id: Icacc4c046a8f4ba949499780cdc4724c9fd54fba
This commit is contained in:
parent
2b4535c8a2
commit
c389b8f8a9
@ -184,6 +184,7 @@ class BaseVerifierTestCase(StacktachBaseTestCase):
|
|||||||
start = datetime.datetime.utcnow()
|
start = datetime.datetime.utcnow()
|
||||||
self.mox.StubOutWithMock(self.verifier_without_notifications, '_utcnow')
|
self.mox.StubOutWithMock(self.verifier_without_notifications, '_utcnow')
|
||||||
self.verifier_without_notifications._utcnow().AndReturn(start)
|
self.verifier_without_notifications._utcnow().AndReturn(start)
|
||||||
|
self.verifier_without_notifications._utcnow().AndReturn(start)
|
||||||
settle_offset = {SETTLE_UNITS: SETTLE_TIME}
|
settle_offset = {SETTLE_UNITS: SETTLE_TIME}
|
||||||
ending_max = start - datetime.timedelta(**settle_offset)
|
ending_max = start - datetime.timedelta(**settle_offset)
|
||||||
self.mox.StubOutWithMock(self.verifier_without_notifications, 'verify_for_range')
|
self.mox.StubOutWithMock(self.verifier_without_notifications, 'verify_for_range')
|
||||||
@ -231,6 +232,7 @@ class BaseVerifierTestCase(StacktachBaseTestCase):
|
|||||||
start = datetime.datetime.utcnow()
|
start = datetime.datetime.utcnow()
|
||||||
self.mox.StubOutWithMock(self.verifier_with_notifications, '_utcnow')
|
self.mox.StubOutWithMock(self.verifier_with_notifications, '_utcnow')
|
||||||
self.verifier_with_notifications._utcnow().AndReturn(start)
|
self.verifier_with_notifications._utcnow().AndReturn(start)
|
||||||
|
self.verifier_with_notifications._utcnow().AndReturn(start)
|
||||||
settle_offset = {SETTLE_UNITS: SETTLE_TIME}
|
settle_offset = {SETTLE_UNITS: SETTLE_TIME}
|
||||||
ending_max = start - datetime.timedelta(**settle_offset)
|
ending_max = start - datetime.timedelta(**settle_offset)
|
||||||
self.mox.StubOutWithMock(self.verifier_with_notifications, 'verify_for_range')
|
self.mox.StubOutWithMock(self.verifier_with_notifications, 'verify_for_range')
|
||||||
|
@ -108,13 +108,17 @@ def _is_alphanumeric(attr_name, attr_value, exist_id, instance_uuid):
|
|||||||
|
|
||||||
|
|
||||||
class Verifier(object):
|
class Verifier(object):
|
||||||
def __init__(self, config, pool=None, reconciler=None):
|
def __init__(self, config, pool=None, reconciler=None, stats=None):
|
||||||
self.config = config
|
self.config = config
|
||||||
self.pool = pool or multiprocessing.Pool(config.pool_size())
|
self.pool = pool or multiprocessing.Pool(config.pool_size())
|
||||||
self.enable_notifications = config.enable_notifications()
|
self.enable_notifications = config.enable_notifications()
|
||||||
self.reconciler = reconciler
|
self.reconciler = reconciler
|
||||||
self.results = []
|
self.results = []
|
||||||
self.failed = []
|
self.failed = []
|
||||||
|
if stats is None:
|
||||||
|
self.stats = {}
|
||||||
|
else:
|
||||||
|
self.stats = stats
|
||||||
|
|
||||||
def clean_results(self):
|
def clean_results(self):
|
||||||
pending = []
|
pending = []
|
||||||
@ -147,6 +151,7 @@ class Verifier(object):
|
|||||||
settle_units = self.config.settle_units()
|
settle_units = self.config.settle_units()
|
||||||
settle_time = self.config.settle_time()
|
settle_time = self.config.settle_time()
|
||||||
while self._keep_running():
|
while self._keep_running():
|
||||||
|
self.stats['timestamp'] = self._utcnow()
|
||||||
with transaction.commit_on_success():
|
with transaction.commit_on_success():
|
||||||
now = self._utcnow()
|
now = self._utcnow()
|
||||||
kwargs = {settle_units: settle_time}
|
kwargs = {settle_units: settle_time}
|
||||||
@ -175,6 +180,7 @@ class Verifier(object):
|
|||||||
def callback(result):
|
def callback(result):
|
||||||
attempt = 0
|
attempt = 0
|
||||||
while attempt < 2:
|
while attempt < 2:
|
||||||
|
self.stats['timestamp'] = self._utcnow()
|
||||||
try:
|
try:
|
||||||
(verified, exist) = result
|
(verified, exist) = result
|
||||||
if verified:
|
if verified:
|
||||||
@ -198,6 +204,10 @@ class Verifier(object):
|
|||||||
logger.exception(msg)
|
logger.exception(msg)
|
||||||
break
|
break
|
||||||
attempt += 1
|
attempt += 1
|
||||||
|
self.stats['timestamp'] = self._utcnow()
|
||||||
|
total = self.stats.get('total_processed', 0) + 1
|
||||||
|
self.stats['total_processed'] = total
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self._run(callback=callback)
|
self._run(callback=callback)
|
||||||
except Exception, e:
|
except Exception, e:
|
||||||
|
@ -66,6 +66,10 @@ def pool_size():
|
|||||||
return config['pool_size']
|
return config['pool_size']
|
||||||
|
|
||||||
|
|
||||||
|
def process_timeout(default=0):
|
||||||
|
return config.get('process_timeout', default)
|
||||||
|
|
||||||
|
|
||||||
def durable_queue():
|
def durable_queue():
|
||||||
return config['rabbit']['durable_queue']
|
return config['rabbit']['durable_queue']
|
||||||
|
|
||||||
|
@ -158,8 +158,8 @@ def _verify(exists):
|
|||||||
|
|
||||||
|
|
||||||
class GlanceVerifier(Verifier):
|
class GlanceVerifier(Verifier):
|
||||||
def __init__(self, config, pool=None):
|
def __init__(self, config, pool=None, stats=None):
|
||||||
super(GlanceVerifier, self).__init__(config, pool=pool)
|
super(GlanceVerifier, self).__init__(config, pool=pool, stats=stats)
|
||||||
|
|
||||||
def verify_exists(self, grouped_exists, callback, verifying_status):
|
def verify_exists(self, grouped_exists, callback, verifying_status):
|
||||||
count = len(grouped_exists)
|
count = len(grouped_exists)
|
||||||
|
@ -298,10 +298,6 @@ def _verify(exist, validation_level):
|
|||||||
|
|
||||||
|
|
||||||
class NovaVerifier(base_verifier.Verifier):
|
class NovaVerifier(base_verifier.Verifier):
|
||||||
def __init__(self, config, pool=None, reconciler=None):
|
|
||||||
super(NovaVerifier, self).__init__(config,
|
|
||||||
pool=pool,
|
|
||||||
reconciler=reconciler)
|
|
||||||
|
|
||||||
def send_verified_notification(self, exist, connection, exchange,
|
def send_verified_notification(self, exist, connection, exchange,
|
||||||
routing_keys=None):
|
routing_keys=None):
|
||||||
|
@ -5,9 +5,9 @@
|
|||||||
# to you under the Apache License, Version 2.0 (the
|
# to you under the Apache License, Version 2.0 (the
|
||||||
# "License"); you may not use this file except in compliance
|
# "License"); you may not use this file except in compliance
|
||||||
# with the License. You may obtain a copy of the License at
|
# with the License. You may obtain a copy of the License at
|
||||||
#
|
#
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
#
|
#
|
||||||
# Unless required by applicable law or agreed to in writing,
|
# Unless required by applicable law or agreed to in writing,
|
||||||
# software distributed under the License is distributed on an
|
# software distributed under the License is distributed on an
|
||||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
@ -19,8 +19,9 @@ import json
|
|||||||
import os
|
import os
|
||||||
import signal
|
import signal
|
||||||
import sys
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
from multiprocessing import Process
|
from multiprocessing import Process, Manager
|
||||||
|
|
||||||
POSSIBLE_TOPDIR = os.path.normpath(os.path.join(os.path.abspath(sys.argv[0]),
|
POSSIBLE_TOPDIR = os.path.normpath(os.path.join(os.path.abspath(sys.argv[0]),
|
||||||
os.pardir, os.pardir))
|
os.pardir, os.pardir))
|
||||||
@ -38,61 +39,142 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
process = None
|
processes = {}
|
||||||
log_listener = None
|
log_listener = None
|
||||||
processes = []
|
|
||||||
stacklog.set_default_logger_name('verifier')
|
stacklog.set_default_logger_name('verifier')
|
||||||
|
|
||||||
|
DEFAULT_PROC_TIMEOUT = 3600
|
||||||
|
RUNNING = True
|
||||||
|
|
||||||
def _get_parent_logger():
|
def _get_parent_logger():
|
||||||
return stacklog.get_logger('verifier', is_parent=True)
|
return stacklog.get_logger('verifier', is_parent=True)
|
||||||
|
|
||||||
|
|
||||||
def kill_time(signal, frame):
|
|
||||||
log_listener.end()
|
|
||||||
print "dying ..."
|
|
||||||
for process in processes:
|
|
||||||
process.terminate()
|
|
||||||
print "rose"
|
|
||||||
for process in processes:
|
|
||||||
process.join()
|
|
||||||
print "bud"
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
|
|
||||||
def _load_nova_reconciler():
|
def _load_nova_reconciler():
|
||||||
config_loc = verifier_config.reconciler_config()
|
config_loc = verifier_config.reconciler_config()
|
||||||
with open(config_loc, 'r') as rec_config_file:
|
with open(config_loc, 'r') as rec_config_file:
|
||||||
rec_config = json.load(rec_config_file)
|
rec_config = json.load(rec_config_file)
|
||||||
return reconciler.Reconciler(rec_config)
|
return reconciler.Reconciler(rec_config)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
def make_and_start_verifier(exchange):
|
|
||||||
# Gotta create it and run it this way so things don't get
|
|
||||||
# lost when the process is forked.
|
|
||||||
verifier = None
|
|
||||||
if exchange == "nova":
|
|
||||||
reconcile = verifier_config.reconcile()
|
|
||||||
reconciler = None
|
|
||||||
if reconcile:
|
|
||||||
reconciler = _load_nova_reconciler()
|
|
||||||
verifier = nova_verifier.NovaVerifier(verifier_config,
|
|
||||||
reconciler=reconciler)
|
|
||||||
elif exchange == "glance":
|
|
||||||
verifier = glance_verifier.GlanceVerifier(verifier_config)
|
|
||||||
|
|
||||||
verifier.run()
|
def make_and_start_verifier(exchange, stats=None):
|
||||||
|
# Gotta create it and run it this way so things don't get
|
||||||
|
# lost when the process is forked.
|
||||||
|
verifier = None
|
||||||
|
if exchange == "nova":
|
||||||
|
reconcile = verifier_config.reconcile()
|
||||||
|
reconciler = None
|
||||||
|
if reconcile:
|
||||||
|
reconciler = _load_nova_reconciler()
|
||||||
|
verifier = nova_verifier.NovaVerifier(verifier_config,
|
||||||
|
reconciler=reconciler,
|
||||||
|
stats=stats)
|
||||||
|
elif exchange == "glance":
|
||||||
|
verifier = glance_verifier.GlanceVerifier(verifier_config,
|
||||||
|
stats=stats)
|
||||||
|
|
||||||
verifier_config.load()
|
verifier.run()
|
||||||
log_listener = stacklog.LogListener(_get_parent_logger())
|
|
||||||
log_listener.start()
|
|
||||||
|
def create_proc_table(manager):
|
||||||
for exchange in verifier_config.topics().keys():
|
for exchange in verifier_config.topics().keys():
|
||||||
process = Process(target=make_and_start_verifier, args=(exchange,))
|
stats = manager.dict()
|
||||||
process.start()
|
proc_info = dict(process=None,
|
||||||
processes.append(process)
|
pid=0,
|
||||||
|
exchange=exchange,
|
||||||
|
stats=stats)
|
||||||
|
processes[exchange] = proc_info
|
||||||
|
|
||||||
if len(processes) > 0:
|
|
||||||
# Only pause parent process if there are children running.
|
def is_alive(proc_info):
|
||||||
# Otherwise just end...
|
process = proc_info['process']
|
||||||
signal.signal(signal.SIGINT, kill_time)
|
if not proc_info['pid'] or process is None:
|
||||||
signal.signal(signal.SIGTERM, kill_time)
|
return False
|
||||||
signal.pause()
|
return process.is_alive()
|
||||||
|
|
||||||
|
|
||||||
|
def needs_restart(proc_info):
|
||||||
|
timeout = verifier_config.process_timeout(DEFAULT_PROC_TIMEOUT)
|
||||||
|
process = proc_info['process']
|
||||||
|
stats = proc_info['stats']
|
||||||
|
age = datetime.datetime.utcnow() - stats['timestamp']
|
||||||
|
if timeout and (age > datetime.timedelta(seconds=timeout)):
|
||||||
|
process.terminate()
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def start_proc(proc_info):
|
||||||
|
logger = _get_parent_logger()
|
||||||
|
if is_alive(proc_info):
|
||||||
|
if needs_restart(proc_info):
|
||||||
|
logger.warning("Child process %s (%s) terminated due to "
|
||||||
|
"heartbeat timeout. Restarting..." % (proc_info['pid'],
|
||||||
|
proc_info['exchange']))
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
stats = proc_info['stats']
|
||||||
|
stats['timestamp'] = datetime.datetime.utcnow()
|
||||||
|
stats['total_processed'] = 0
|
||||||
|
stats['processed'] = 0
|
||||||
|
args = (proc_info['exchange'], stats)
|
||||||
|
process = Process(target=make_and_start_verifier, args=args)
|
||||||
|
process.daemon = True
|
||||||
|
process.start()
|
||||||
|
proc_info['pid'] = process.pid
|
||||||
|
proc_info['process'] = process
|
||||||
|
logger.info("Started child process %s (%s)" % (proc_info['pid'],
|
||||||
|
proc_info['exchange']))
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def check_or_start_all():
|
||||||
|
for proc_name in sorted(processes.keys()):
|
||||||
|
if RUNNING:
|
||||||
|
start_proc(processes[proc_name])
|
||||||
|
|
||||||
|
|
||||||
|
def stop_all():
|
||||||
|
procs = sorted(processes.keys())
|
||||||
|
for pname in procs:
|
||||||
|
process = processes[pname]['process']
|
||||||
|
if process is not None:
|
||||||
|
process.terminate()
|
||||||
|
for pname in procs:
|
||||||
|
process = processes[pname]['process']
|
||||||
|
if process is not None:
|
||||||
|
process.join()
|
||||||
|
processes[pname]['process'] = None
|
||||||
|
processes[pname]['pid'] = 0
|
||||||
|
|
||||||
|
|
||||||
|
def kill_time(signal, frame):
|
||||||
|
global RUNNING
|
||||||
|
RUNNING = False
|
||||||
|
stop_all()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
verifier_config.load()
|
||||||
|
|
||||||
|
logger = _get_parent_logger()
|
||||||
|
log_listener = stacklog.LogListener(logger)
|
||||||
|
log_listener.start()
|
||||||
|
manager = Manager()
|
||||||
|
|
||||||
|
create_proc_table(manager)
|
||||||
|
|
||||||
|
signal.signal(signal.SIGINT, kill_time)
|
||||||
|
signal.signal(signal.SIGTERM, kill_time)
|
||||||
|
|
||||||
|
logger.info("Starting Verifiers...")
|
||||||
|
while RUNNING:
|
||||||
|
check_or_start_all()
|
||||||
|
time.sleep(30)
|
||||||
|
logger.info("Verifiers Shutting down...")
|
||||||
|
|
||||||
|
#make sure.
|
||||||
|
stop_all()
|
||||||
|
|
||||||
|
log_listener.end()
|
||||||
|
sys.exit(0)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user