Add failed node handling

There are two cases where we can fail with node building:

1. The node errors or times out
2. Sending to the API server fails

In these cases the node is recorded in a separate file.  A new timer
periodically retests the recorded nodes and adds them to the API server or
deletes them as required.

This also adds a dummy API driver that always says 5 nodes are needed and
echoes the node data sent to it to the log file.

Fixes bug #1080863

Change-Id: I9a67de2be5336abdfcad5a3bb372c467cb57f4b1
This commit is contained in:
Andrew Hutchings 2012-11-21 16:08:05 +00:00
parent 7e3ac2cd2e
commit c9709df427
9 changed files with 288 additions and 40 deletions

View File

@ -65,6 +65,7 @@ Pool Manager Section
[mgm]
pid = /var/run/libra/libra_mgm.pid
logfile = /var/log/libra/libra_mgm.log
datadir = /etc/libra/
nova_auth_url = https://region-a.geo-1.identity.hpcloudsvc.com:35357/v2.0/
nova_user = username
nova_pass = password
@ -77,6 +78,7 @@ Pool Manager Section
api_server = 10.0.0.1:8889 10.0.0.2:8889
nodes = 10
check_interval = 5
failed_interval = 15
node_basename = 'libra'
@ -167,8 +169,13 @@ Pool Manager Command Line Options
.. option:: --check_interval <CHECK_INTERVAL>
How often to check the API server to see if noew nodes are needed
(in value is minutes)
How often to check the API server to see if new nodes are needed
(value is in minutes)
.. option:: --failed_interval <FAILED_INTERVAL>
How often to check the list of failed node uploads to see if the nodes
are now in a good state (value is in minutes)
.. option:: -d, --debug
@ -196,6 +203,10 @@ Pool Manager Command Line Options
file is */var/log/libra/libra_worker.log*. When not in daemon mode,
logging will go to STDOUT unless a log file is specified.
.. option:: --datadir <DATADIR>
The data directory used to store things such as the failed node list.
.. option:: -n, --nodaemon
Do not run as a daemon. This option is useful for debugging purposes

View File

@ -38,6 +38,7 @@ logfile = /var/log/libra/libra_worker.log
[mgm]
pid = /var/run/libra/libra_mgm.pid
logfile = /var/log/libra/libra_mgm.log
datadir = /etc/libra/
nova_auth_url = https://region-a.geo-1.identity.hpcloudsvc.com:35357/v2.0/
nova_user = username
nova_pass = password
@ -50,4 +51,5 @@ nova_image_size = standard.medium
api_server = 10.0.0.1:8889 10.0.0.2:8889
nodes = 10
check_interval = 5
failed_interval = 15
node_basename = 'libra'

View File

@ -14,7 +14,8 @@
# Mapping of --driver options to a class
known_drivers = {
'hp_rest': 'libra.mgm.drivers.hp_rest.driver.HPRestDriver'
'hp_rest': 'libra.mgm.drivers.hp_rest.driver.HPRestDriver',
'dummy': 'libra.mgm.drivers.dummy.driver.DummyDriver'
}

View File

@ -0,0 +1,13 @@
# Copyright 2012 Hewlett-Packard Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

View File

@ -0,0 +1,35 @@
# Copyright 2012 Hewlett-Packard Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
from libra.mgm.drivers.base import MgmDriver
class DummyDriver(MgmDriver):
    """
    Pool manager driver used for testing.

    Every call returns a fixed answer; add_node only writes the node data
    it receives to the logger.
    """

    def __init__(self, addresses, logger):
        # 'addresses' is accepted but never used; only the logger is kept.
        self.logger = logger

    def get_url(self):
        """ Return a fixed placeholder in place of a connection URL """
        return 'Dummy Connection'

    def is_online(self):
        """ Always report the dummy server as online """
        return True

    def get_free_count(self):
        """ Always return 5 """
        return 5

    def add_node(self, node_data):
        """ Log the node data and return a (True, response text) pair """
        message = 'Dummy API send of {0}'.format(node_data)
        self.logger.info(message)
        return True, 'test response'

View File

@ -22,10 +22,12 @@ import sys
import os
import threading
from novaclient import exceptions
from libra.openstack.common import importutils
from libra.mgm.nova import Node
from libra.mgm.nova import Node, BuildError
from libra.common.options import Options, setup_logging
from libra.mgm.drivers.base import known_drivers
from libra.mgm.node_list import NodeList, AccessDenied
class Server(object):
@ -33,8 +35,14 @@ class Server(object):
self.logger = logger
self.args = args
self.ct = None
self.ft = None
self.api = None
self.driver_class = None
try:
self.node_list = NodeList(self.args.datadir)
except AccessDenied as exc:
self.logger.error(exc)
self.shutdown(True)
def main(self):
self.logger.info(
@ -59,9 +67,102 @@ class Server(object):
# at the same time.
self.rlock = threading.RLock()
self.check_nodes()
self.failed_nodes()
while True:
time.sleep(1)
def failed_nodes(self):
    """
    Check the list of failed nodes and retest every entry.

    Runs under self.rlock.  When the stored list is non-empty an API
    driver is created from self.driver_class and, if it reports itself
    online, each stored node is passed to retest_node().

    The failed-node timer is always re-armed, even when reading the list
    or retesting a node raises.  Otherwise a single unexpected exception
    in the timer thread would stop all future retests.  The exception
    itself is not swallowed and still propagates to the caller.
    """
    try:
        with self.rlock:
            self.logger.info('Checking log of failed node uploads')
            nodes = self.node_list.get()
            if not nodes:
                self.logger.info('Node log empty')
            else:
                api = self.driver_class(self.args.api_server, self.logger)
                if api.is_online():
                    self.logger.info(
                        'Connected to {url}'.format(url=api.get_url())
                    )
                    for node in nodes:
                        self.retest_node(node, api)
                else:
                    self.logger.error('No working API server found')
    finally:
        # Schedule the next run no matter how this one ended.
        self.reset_failed_scheduler()
def retest_node(self, node_id, api):
"""
Re-check a single node taken from the failed node list.

Asks Nova for the status of node_id and then:
- removes the node from the list if Nova raises NotFound
- sends the node to api.add_node() if its status is ACTIVE
- leaves it alone if its status starts with BUILD
- asks Nova to delete it for any other status

node_id -- ID handed to nova.status() / nova.delete() and stored in
           self.node_list
api     -- driver object providing add_node()

Returns None on every path; failures are reported via the logger.
"""
# A new Nova connection is created on every call, using the
# credentials and image settings from the command line / config.
try:
nova = Node(
self.args.nova_user,
self.args.nova_pass,
self.args.nova_tenant,
self.args.nova_auth_url,
self.args.nova_region,
self.args.nova_keyname,
self.args.nova_secgroup,
self.args.nova_image,
self.args.nova_image_size,
node_basename=self.args.node_basename
)
except Exception as exc:
self.logger.error(
'Error initialising Nova connection {exc}'
.format(exc=exc)
)
return
self.logger.info('Retrying node {0}'.format(node_id))
try:
resp, status = nova.status(node_id)
except exceptions.NotFound:
# The node is gone from Nova, so there is nothing left to retry.
self.logger.info(
'Node {0} no longer exists, removing from list'
.format(node_id)
)
self.node_list.delete(node_id)
return
except exceptions.ClientException as exc:
# NOTE(review): 'exc' is bound but unused; the message logs
# sys.exc_info()[0], i.e. the exception class.
self.logger.error(
'Error getting status from Nova, exception {exc}'
.format(exc=sys.exc_info()[0])
)
return
# Only HTTP 200 and 203 responses are treated as a usable status reply.
if resp['status'] not in('200', '203'):
self.logger.error(
'Error geting status from Nova, error {0}'
.format(resp['status'])
)
return
# From here on 'status' is the 'server' dictionary of the response body.
status = status['server']
if status['status'] == 'ACTIVE':
name = status['name']
body = self.build_node_data(status)
# 'status' is reused for the success flag returned by add_node().
status, response = api.add_node(body)
if not status:
# The node stays in the list so a later run can retry the upload.
self.logger.error(
'Could not upload node {name} to API server'
.format(name=name)
)
else:
self.node_list.delete(node_id)
# NOTE(review): indentation is lost in this view; presumably this
# success message belongs to the else branch above -- confirm.
self.logger.info('Node {0} added to API server'.format(name))
return
elif status['status'].startswith('BUILD'):
self.logger.info(
'Node {0} still building, ignoring'.format(node_id)
)
return
else:
self.logger.info(
'Node {0} is bad, deleting'.format(node_id)
)
status, msg = nova.delete(node_id)
if not status:
self.logger.error(msg)
else:
self.logger.info('Delete successful')
# NOTE(review): indentation is lost in this view; confirm whether the
# list entry is removed only after a successful Nova delete or always.
self.node_list.delete(node_id)
def check_nodes(self):
""" check if known nodes are used """
with self.rlock:
@ -90,12 +191,30 @@ class Server(object):
self.reset_scheduler()
def reset_scheduler(self):
self.logger.info('Sleeping for {mins} minutes'
self.logger.info('Node check timer sleeping for {mins} minutes'
.format(mins=self.args.check_interval))
self.ct = threading.Timer(60 * int(self.args.check_interval),
self.check_nodes, ())
self.ct.start()
def reset_failed_scheduler(self):
    """
    Arm a one-shot timer that calls failed_nodes() after
    args.failed_interval minutes and keep it in self.ft.
    """
    minutes = self.args.failed_interval
    self.logger.info(
        'Node failed timer sleeping for {mins} minutes'.format(mins=minutes)
    )
    # threading.Timer takes seconds; the interval is configured in minutes.
    delay = 60 * int(minutes)
    timer = threading.Timer(delay, self.failed_nodes, ())
    self.ft = timer
    timer.start()
def build_node_data(self, data):
    """
    Reduce a Nova server record to the name/address dictionary that is
    sent to the API server.
    """
    name = data['name']
    for address in data['addresses']['private']:
        # Skip addresses in the 10.x range and stop at the first other one.
        if address['addr'].startswith('10.'):
            continue
        break
    # 'address' is the entry the loop stopped on, or the last entry when
    # every address starts with '10.'.
    return {'name': name, 'address': address['addr']}
def build_nodes(self, count, api):
try:
nova = Node(
@ -116,31 +235,28 @@ class Server(object):
)
return
while count > 0:
status, data = nova.build()
if not status:
self.logger.error(data)
try:
data = nova.build()
except BuildError as exc:
self.logger.error('{0}, node {1}'
.format(exc.msg, exc.node_name)
)
if exc.node_id > 0:
self.logger.info('Storing node to try again later')
self.node_list.add(exc.node_id)
self.logger.warning('Aborting node building')
return
body = {}
body['name'] = data['name']
addresses = data['addresses']['private']
for address in addresses:
if not address['addr'].startswith('10.'):
break
body['address'] = address['addr']
body = self.build_node_data(data)
self.logger.info('Adding node {name} on {ip}'
.format(name=body['name'], ip=body['address']))
# TODO: store failed uploads to API server to retry
status, response = api.add_node(body)
if not status:
self.logger.error(
'Could not upload node {name} to API server, deleting'
'Could not upload node {name} to API server'
.format(name=data['name'])
)
status, response = nova.delete(data['id'])
if not status:
self.logger.error(response)
else:
self.logger.info('Delete succeeded')
self.logger.info('Storing node to try again later')
self.node_list.add(data['id'])
self.logger.warning('Aborting node building')
return
count = count - 1
@ -153,6 +269,8 @@ class Server(object):
def shutdown(self, error):
if self.ct:
self.ct.cancel()
if self.ft:
self.ft.cancel()
if not error:
self.logger.info('Safely shutting down')
@ -168,6 +286,10 @@ def main():
'--api_server', action='append', metavar='HOST:POST',
help='a list of API servers to connect to (for HP REST API driver)'
)
options.parser.add_argument(
'--datadir', dest='datadir',
help='directory to store data files'
)
options.parser.add_argument(
'--nodes', type=int, default=1,
help='number of nodes'
@ -176,6 +298,11 @@ def main():
'--check_interval', type=int, default=5,
help='how often to check if new nodes are needed (in minutes)'
)
options.parser.add_argument(
'--failed_interval', type=int, default=15,
help='how often to retest nodes that failed to get added to the API'
' server (in minutes)'
)
options.parser.add_argument(
'--driver', dest='driver',
choices=known_drivers.keys(), default='hp_rest',
@ -227,6 +354,7 @@ def main():
args = options.run()
required_args = [
'datadir',
'nova_image', 'nova_image_size', 'nova_secgroup', 'nova_keyname',
'nova_tenant', 'nova_region', 'nova_user', 'nova_pass', 'nova_auth_url'
]

49
libra/mgm/node_list.py Normal file
View File

@ -0,0 +1,49 @@
# Copyright 2012 Hewlett-Packard Development Company, L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import pickle
import os
class AccessDenied(Exception):
    """ Raised by NodeList when the data directory is not writable """
class NodeList(object):
    """
    List of items persisted as a pickle file (node_log.dat) inside a
    data directory.  Every operation re-reads or rewrites the whole file.
    """

    def __init__(self, path):
        """
        path -- directory that holds node_log.dat

        Raises AccessDenied if the directory is not writable.
        """
        if not os.access(path, os.W_OK):
            msg = 'Do not have permission to write to {0}'.format(path)
            raise AccessDenied(msg)
        self.file_name = os.path.join(path, 'node_log.dat')

    def add(self, item):
        """ Append item to the stored list """
        data = self.get()
        data.append(item)
        self.put(data)

    def delete(self, item):
        """
        Remove the first occurrence of item from the stored list.

        Raises ValueError if item is not in the list.
        """
        data = self.get()
        data.remove(item)
        self.put(data)

    def get(self):
        """ Return the stored list, or an empty list if it cannot be read """
        # IOError is raised if the file does not exist yet
        try:
            with open(self.file_name, "rb") as handle:
                # NOTE: pickle is only safe on trusted data; the data
                # directory must not be writable by untrusted users.
                return pickle.load(handle)
        except IOError:
            return []

    def put(self, data):
        """ Overwrite the file with data """
        with open(self.file_name, "wb") as handle:
            pickle.dump(data, handle)

View File

@ -25,6 +25,16 @@ class NotFound(Exception):
pass
class BuildError(Exception):
    """
    Raised when building a node fails.

    msg       -- description of the failure, also returned by str()
    node_name -- name of the node that was being built
    node_id   -- ID of the node, or 0 (the default) when none is supplied
    """

    def __init__(self, msg, node_name, node_id=0):
        # Initialise the base class so that args, repr() and pickling
        # carry the message as they do for any other exception.
        super(BuildError, self).__init__(msg)
        self.msg = msg
        self.node_name = node_name
        self.node_id = node_id

    def __str__(self):
        return self.msg
class Node(object):
def __init__(self, username, password, tenant, auth_url, region, keyname,
secgroup, image, node_type, node_basename=None):
@ -60,8 +70,9 @@ class Node(object):
try:
body = self._create(node_id)
except exceptions.ClientException:
return False, 'Error creating node {nid} exception {exc}'.format(
nid=node_id, exc=sys.exc_info()[0]
raise BuildError(
'Error creating node, exception {exc}'
.format(exc=sys.exc_info()[0]), node_id
)
server_id = body['server']['id']
@ -69,19 +80,19 @@ class Node(object):
waits = 40
while waits > 0:
time.sleep(3)
status = self._status(server_id)
resp, status = self.status(server_id)
status = status['server']
if status['status'] == 'ACTIVE':
return True, status
return status
elif not status['status'].startswith('BUILD'):
return False, 'Error spawning node {nid} status {stat}'.format(
node=node_id, stat=status['status']
raise BuildError(
'Error spawning node, status {stat}'
.format(stat=status['status']),
node_id, server_id,
)
waits = waits - 1
return (False,
'Timeout creating node, uuid: {nid}, server ID: {sid}'
.format(nid=node_id, sid=server_id)
)
raise BuildError('Timeout creating node', node_id, server_id)
def delete(self, node_id):
""" delete a node """
@ -119,11 +130,11 @@ class Node(object):
resp, body = self.nova.post(url, body=body)
return body
def _status(self, node_id):
def status(self, node_id):
""" used to keep scanning to see if node is up """
url = "/servers/{0}".format(node_id)
resp, body = self.nova.get(url)
return body['server']
return resp, body
def _delete(self, node_id):
""" delete a nova node, return 204 succeed """

View File

@ -5,7 +5,7 @@ import httplib2
import json
import mock_objects
from libra.mgm.nova import Node
from libra.mgm.nova import Node, BuildError
fake_response = httplib2.Response({"status": '200'})
fake_bad_response = httplib2.Response({"status": '500'})
@ -42,16 +42,14 @@ class TestLBaaSMgmNova(unittest.TestCase):
def testCreateNode(self):
with mock.patch.object(httplib2.Http, "request", mock_request):
with mock.patch('time.time', mock.Mock(return_value=1234)):
resp, data = self.api.build()
self.assertTrue(resp)
data = self.api.build()
self.assertEqual(data['id'], 417773)
def testCreateNodeFail(self):
with mock.patch.object(httplib2.Http, "request", mock_bad_request):
with mock.patch('time.time', mock.Mock(return_value=1234)):
resp, data = self.api.build()
self.assertFalse(resp)
self.assertRegexpMatches(data, 'Error creating')
with self.assertRaises(BuildError):
data = self.api.build()
def testDeleteNodeFail(self):
with mock.patch.object(httplib2.Http, "request", mock_bad_request):