Catch exceptions during scheduled runs
Occasionally an exception happens during a run; we don't know why yet. When this happens, the scheduler isn't restarted, causing the pool mgm to require a restart. This change catches the exception, logs it, and restarts the scheduler. Change-Id: I78f57c089831e5ec4a5f084957f859b28cff9c3a
This commit is contained in:
parent
cd50311a1e
commit
687d6b6756
@ -76,20 +76,25 @@ class Server(object):
|
|||||||
def failed_nodes(self):
|
def failed_nodes(self):
|
||||||
""" check list of failures """
|
""" check list of failures """
|
||||||
with self.rlock:
|
with self.rlock:
|
||||||
self.logger.info('Checking log of failed node uploads')
|
try:
|
||||||
nodes = self.node_list.get()
|
self.logger.info('Checking log of failed node uploads')
|
||||||
if len(nodes) == 0:
|
nodes = self.node_list.get()
|
||||||
self.logger.info('Node log empty')
|
if len(nodes) == 0:
|
||||||
else:
|
self.logger.info('Node log empty')
|
||||||
api = self.driver_class(self.args.api_server, self.logger)
|
|
||||||
if api.is_online():
|
|
||||||
self.logger.info(
|
|
||||||
'Connected to {url}'.format(url=api.get_url())
|
|
||||||
)
|
|
||||||
for node in nodes:
|
|
||||||
self.retest_node(node, api)
|
|
||||||
else:
|
else:
|
||||||
self.logger.error('No working API server found')
|
api = self.driver_class(self.args.api_server, self.logger)
|
||||||
|
if api.is_online():
|
||||||
|
self.logger.info(
|
||||||
|
'Connected to {url}'.format(url=api.get_url())
|
||||||
|
)
|
||||||
|
for node in nodes:
|
||||||
|
self.retest_node(node, api)
|
||||||
|
else:
|
||||||
|
self.logger.error('No working API server found')
|
||||||
|
except Exception:
|
||||||
|
self.logger.exception(
|
||||||
|
'Uncaught exception during failed node check'
|
||||||
|
)
|
||||||
self.reset_failed_scheduler()
|
self.reset_failed_scheduler()
|
||||||
|
|
||||||
def retest_node(self, node_id, api):
|
def retest_node(self, node_id, api):
|
||||||
@ -168,28 +173,32 @@ class Server(object):
|
|||||||
def check_nodes(self):
|
def check_nodes(self):
|
||||||
""" check if known nodes are used """
|
""" check if known nodes are used """
|
||||||
with self.rlock:
|
with self.rlock:
|
||||||
self.logger.info('Checking if new nodes are needed')
|
try:
|
||||||
api = self.driver_class(self.args.api_server, self.logger)
|
self.logger.info('Checking if new nodes are needed')
|
||||||
if api.is_online():
|
api = self.driver_class(self.args.api_server, self.logger)
|
||||||
self.logger.info(
|
if api.is_online():
|
||||||
'Connected to {url}'.format(url=api.get_url())
|
|
||||||
)
|
|
||||||
free_count = api.get_free_count()
|
|
||||||
if free_count is None:
|
|
||||||
self.reset_scheduler()
|
|
||||||
return
|
|
||||||
if free_count < self.args.nodes:
|
|
||||||
# we need to build new nodes
|
|
||||||
nodes_required = self.args.nodes - free_count
|
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
'Building {nodes} nodes'
|
'Connected to {url}'.format(url=api.get_url())
|
||||||
.format(nodes=nodes_required)
|
|
||||||
)
|
)
|
||||||
self.build_nodes(nodes_required, api)
|
free_count = api.get_free_count()
|
||||||
|
if free_count is None:
|
||||||
|
self.reset_scheduler()
|
||||||
|
return
|
||||||
|
if free_count < self.args.nodes:
|
||||||
|
# we need to build new nodes
|
||||||
|
nodes_required = self.args.nodes - free_count
|
||||||
|
self.logger.info(
|
||||||
|
'Building {nodes} nodes'
|
||||||
|
.format(nodes=nodes_required)
|
||||||
|
)
|
||||||
|
self.build_nodes(nodes_required, api)
|
||||||
|
else:
|
||||||
|
self.logger.info('No new nodes required')
|
||||||
else:
|
else:
|
||||||
self.logger.info('No new nodes required')
|
self.logger.error('No working API server found')
|
||||||
else:
|
except Exception:
|
||||||
self.logger.error('No working API server found')
|
self.logger.exception('Uncaught exception during node check')
|
||||||
|
|
||||||
self.reset_scheduler()
|
self.reset_scheduler()
|
||||||
|
|
||||||
def reset_scheduler(self):
|
def reset_scheduler(self):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user