Catch exceptions during scheduled runs

Occasionally an exception occurs during a scheduled run; we don't know
why yet. When this happens, the scheduler isn't restarted, so the pool
mgm has to be restarted.

This change catches the exception, logs it, and restarts the scheduler.

Change-Id: I78f57c089831e5ec4a5f084957f859b28cff9c3a
This commit is contained in:
Andrew Hutchings 2013-03-12 19:47:18 +00:00
parent cd50311a1e
commit 687d6b6756

View File

@ -76,20 +76,25 @@ class Server(object):
def failed_nodes(self): def failed_nodes(self):
""" check list of failures """ """ check list of failures """
with self.rlock: with self.rlock:
self.logger.info('Checking log of failed node uploads') try:
nodes = self.node_list.get() self.logger.info('Checking log of failed node uploads')
if len(nodes) == 0: nodes = self.node_list.get()
self.logger.info('Node log empty') if len(nodes) == 0:
else: self.logger.info('Node log empty')
api = self.driver_class(self.args.api_server, self.logger)
if api.is_online():
self.logger.info(
'Connected to {url}'.format(url=api.get_url())
)
for node in nodes:
self.retest_node(node, api)
else: else:
self.logger.error('No working API server found') api = self.driver_class(self.args.api_server, self.logger)
if api.is_online():
self.logger.info(
'Connected to {url}'.format(url=api.get_url())
)
for node in nodes:
self.retest_node(node, api)
else:
self.logger.error('No working API server found')
except Exception:
self.logger.exception(
'Uncaught exception during failed node check'
)
self.reset_failed_scheduler() self.reset_failed_scheduler()
def retest_node(self, node_id, api): def retest_node(self, node_id, api):
@ -168,28 +173,32 @@ class Server(object):
def check_nodes(self): def check_nodes(self):
""" check if known nodes are used """ """ check if known nodes are used """
with self.rlock: with self.rlock:
self.logger.info('Checking if new nodes are needed') try:
api = self.driver_class(self.args.api_server, self.logger) self.logger.info('Checking if new nodes are needed')
if api.is_online(): api = self.driver_class(self.args.api_server, self.logger)
self.logger.info( if api.is_online():
'Connected to {url}'.format(url=api.get_url())
)
free_count = api.get_free_count()
if free_count is None:
self.reset_scheduler()
return
if free_count < self.args.nodes:
# we need to build new nodes
nodes_required = self.args.nodes - free_count
self.logger.info( self.logger.info(
'Building {nodes} nodes' 'Connected to {url}'.format(url=api.get_url())
.format(nodes=nodes_required)
) )
self.build_nodes(nodes_required, api) free_count = api.get_free_count()
if free_count is None:
self.reset_scheduler()
return
if free_count < self.args.nodes:
# we need to build new nodes
nodes_required = self.args.nodes - free_count
self.logger.info(
'Building {nodes} nodes'
.format(nodes=nodes_required)
)
self.build_nodes(nodes_required, api)
else:
self.logger.info('No new nodes required')
else: else:
self.logger.info('No new nodes required') self.logger.error('No working API server found')
else: except Exception:
self.logger.error('No working API server found') self.logger.exception('Uncaught exception during node check')
self.reset_scheduler() self.reset_scheduler()
def reset_scheduler(self): def reset_scheduler(self):