diff --git a/doc/config.rst b/doc/config.rst index 603b30a8..3a60d05d 100644 --- a/doc/config.rst +++ b/doc/config.rst @@ -65,6 +65,7 @@ Pool Manager Section [mgm] pid = /var/run/libra/libra_mgm.pid logfile = /var/log/libra/libra_mgm.log + datadir = /etc/libra/ nova_auth_url = https://region-a.geo-1.identity.hpcloudsvc.com:35357/v2.0/ nova_user = username nova_pass = password @@ -77,6 +78,7 @@ Pool Manager Section api_server = 10.0.0.1:8889 10.0.0.2:8889 nodes = 10 check_interval = 5 + failed_interval = 15 node_basename = 'libra' @@ -167,8 +169,13 @@ Pool Manager Command Line Options .. option:: --check_interval - How often to check the API server to see if noew nodes are needed - (in value is minutes) + How often to check the API server to see if new nodes are needed + (value is in minutes) + + .. option:: --failed_interval + + How often to check the list of failed node uploads to see if the nodes + are now in a good state (value is in minutes) .. option:: -d, --debug @@ -196,6 +203,10 @@ Pool Manager Command Line Options file is */var/log/libra/libra_worker.log*. When not in daemon mode, logging will go to STDOUT unless a log file is specified. + .. option:: --datadir + + The data directory used to store things such as the failed node list. + .. option:: -n, --nodaemon Do not run as a daemon.
This option is useful for debugging purposes diff --git a/etc/sample_libra.cfg b/etc/sample_libra.cfg index efece789..50eb8943 100644 --- a/etc/sample_libra.cfg +++ b/etc/sample_libra.cfg @@ -38,6 +38,7 @@ logfile = /var/log/libra/libra_worker.log [mgm] pid = /var/run/libra/libra_mgm.pid logfile = /var/log/libra/libra_mgm.log +datadir = /etc/libra/ nova_auth_url = https://region-a.geo-1.identity.hpcloudsvc.com:35357/v2.0/ nova_user = username nova_pass = password @@ -50,4 +51,5 @@ nova_image_size = standard.medium api_server = 10.0.0.1:8889 10.0.0.2:8889 nodes = 10 check_interval = 5 +failed_interval = 15 node_basename = 'libra' diff --git a/libra/mgm/drivers/base.py b/libra/mgm/drivers/base.py index 5140cef3..4e70117b 100644 --- a/libra/mgm/drivers/base.py +++ b/libra/mgm/drivers/base.py @@ -14,7 +14,8 @@ # Mapping of --driver options to a class known_drivers = { - 'hp_rest': 'libra.mgm.drivers.hp_rest.driver.HPRestDriver' + 'hp_rest': 'libra.mgm.drivers.hp_rest.driver.HPRestDriver', + 'dummy': 'libra.mgm.drivers.dummy.driver.DummyDriver' } diff --git a/libra/mgm/drivers/dummy/__init__.py b/libra/mgm/drivers/dummy/__init__.py new file mode 100644 index 00000000..582348cb --- /dev/null +++ b/libra/mgm/drivers/dummy/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2012 Hewlett-Packard Development Company, L.P. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
diff --git a/libra/mgm/drivers/dummy/driver.py b/libra/mgm/drivers/dummy/driver.py new file mode 100644 index 00000000..fcb07a4b --- /dev/null +++ b/libra/mgm/drivers/dummy/driver.py @@ -0,0 +1,35 @@ +# Copyright 2012 Hewlett-Packard Development Company, L.P. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations + +from libra.mgm.drivers.base import MgmDriver + + +class DummyDriver(MgmDriver): + """ + Pool manager dummy driver for testing + """ + def __init__(self, addresses, logger): + self.logger = logger + + def get_free_count(self): + return 5 + + def is_online(self): + return True + + def add_node(self, node_data): + self.logger.info('Dummy API send of {0}'.format(node_data)) + return True, 'test response' + + def get_url(self): + return 'Dummy Connection' diff --git a/libra/mgm/mgm.py b/libra/mgm/mgm.py index d4250e03..73971eca 100644 --- a/libra/mgm/mgm.py +++ b/libra/mgm/mgm.py @@ -22,10 +22,12 @@ import sys import os import threading +from novaclient import exceptions from libra.openstack.common import importutils -from libra.mgm.nova import Node +from libra.mgm.nova import Node, BuildError from libra.common.options import Options, setup_logging from libra.mgm.drivers.base import known_drivers +from libra.mgm.node_list import NodeList, AccessDenied class Server(object): @@ -33,8 +35,14 @@ class Server(object): self.logger = logger self.args = args self.ct = None + self.ft = None self.api = None self.driver_class = None + try: + self.node_list = NodeList(self.args.datadir) + except 
AccessDenied as exc: + self.logger.error(exc) + self.shutdown(True) def main(self): self.logger.info( @@ -59,9 +67,102 @@ class Server(object): # at the same time. self.rlock = threading.RLock() self.check_nodes() + self.failed_nodes() while True: time.sleep(1) + def failed_nodes(self): + """ check list of failures """ + with self.rlock: + self.logger.info('Checking log of failed node uploads') + nodes = self.node_list.get() + if len(nodes) == 0: + self.logger.info('Node log empty') + else: + api = self.driver_class(self.args.api_server, self.logger) + if api.is_online(): + self.logger.info( + 'Connected to {url}'.format(url=api.get_url()) + ) + for node in nodes: + self.retest_node(node, api) + else: + self.logger.error('No working API server found') + self.reset_failed_scheduler() + + def retest_node(self, node_id, api): + try: + nova = Node( + self.args.nova_user, + self.args.nova_pass, + self.args.nova_tenant, + self.args.nova_auth_url, + self.args.nova_region, + self.args.nova_keyname, + self.args.nova_secgroup, + self.args.nova_image, + self.args.nova_image_size, + node_basename=self.args.node_basename + ) + except Exception as exc: + self.logger.error( + 'Error initialising Nova connection {exc}' + .format(exc=exc) + ) + return + self.logger.info('Retrying node {0}'.format(node_id)) + try: + resp, status = nova.status(node_id) + except exceptions.NotFound: + self.logger.info( + 'Node {0} no longer exists, removing from list' + .format(node_id) + ) + self.node_list.delete(node_id) + return + except exceptions.ClientException as exc: + self.logger.error( + 'Error getting status from Nova, exception {exc}' + .format(exc=sys.exc_info()[0]) + ) + return + + if resp['status'] not in('200', '203'): + self.logger.error( + 'Error geting status from Nova, error {0}' + .format(resp['status']) + ) + return + status = status['server'] + if status['status'] == 'ACTIVE': + name = status['name'] + body = self.build_node_data(status) + status, response = api.add_node(body) + 
if not status: + self.logger.error( + 'Could not upload node {name} to API server' + .format(name=name) + ) + else: + self.node_list.delete(node_id) + self.logger.info('Node {0} added to API server'.format(name)) + return + elif status['status'].startswith('BUILD'): + self.logger.info( + 'Node {0} still building, ignoring'.format(node_id) + ) + return + else: + self.logger.info( + 'Node {0} is bad, deleting'.format(node_id) + ) + status, msg = nova.delete(node_id) + if not status: + self.logger.error(msg) + else: + self.logger.info('Delete successful') + self.node_list.delete(node_id) + def check_nodes(self): """ check if known nodes are used """ with self.rlock: @@ -90,12 +191,30 @@ class Server(object): self.reset_scheduler() def reset_scheduler(self): - self.logger.info('Sleeping for {mins} minutes' + self.logger.info('Node check timer sleeping for {mins} minutes' .format(mins=self.args.check_interval)) self.ct = threading.Timer(60 * int(self.args.check_interval), self.check_nodes, ()) self.ct.start() + def reset_failed_scheduler(self): + self.logger.info('Node failed timer sleeping for {mins} minutes' + .format(mins=self.args.failed_interval)) + self.ft = threading.Timer(60 * int(self.args.failed_interval), + self.failed_nodes, ()) + self.ft.start() + + def build_node_data(self, data): + """ Build the API data from the node data """ + body = {} + body['name'] = data['name'] + addresses = data['addresses']['private'] + for address in addresses: + if not address['addr'].startswith('10.'): + break + body['address'] = address['addr'] + return body + def build_nodes(self, count, api): try: nova = Node( @@ -116,31 +235,28 @@ class Server(object): ) return while count > 0: - status, data = nova.build() - if not status: - self.logger.error(data) + try: + data = nova.build() + except BuildError as exc: + self.logger.error('{0}, node {1}' + .format(exc.msg, exc.node_name) + ) + if exc.node_id > 0: + self.logger.info('Storing node to try again later') + 
self.node_list.add(exc.node_id) + self.logger.warning('Aborting node building') return - body = {} - body['name'] = data['name'] - addresses = data['addresses']['private'] - for address in addresses: - if not address['addr'].startswith('10.'): - break - body['address'] = address['addr'] + body = self.build_node_data(data) self.logger.info('Adding node {name} on {ip}' .format(name=body['name'], ip=body['address'])) - # TODO: store failed uploads to API server to retry status, response = api.add_node(body) if not status: self.logger.error( - 'Could not upload node {name} to API server, deleting' + 'Could not upload node {name} to API server' .format(name=data['name']) ) - status, response = nova.delete(data['id']) - if not status: - self.logger.error(response) - else: - self.logger.info('Delete succeeded') + self.logger.info('Storing node to try again later') + self.node_list.add(data['id']) self.logger.warning('Aborting node building') return count = count - 1 @@ -153,6 +269,8 @@ class Server(object): def shutdown(self, error): if self.ct: self.ct.cancel() + if self.ft: + self.ft.cancel() if not error: self.logger.info('Safely shutting down') @@ -168,6 +286,10 @@ def main(): '--api_server', action='append', metavar='HOST:POST', help='a list of API servers to connect to (for HP REST API driver)' ) + options.parser.add_argument( + '--datadir', dest='datadir', + help='directory to store data files' + ) options.parser.add_argument( '--nodes', type=int, default=1, help='number of nodes' @@ -176,6 +298,11 @@ def main(): '--check_interval', type=int, default=5, help='how often to check if new nodes are needed (in minutes)' ) + options.parser.add_argument( + '--failed_interval', type=int, default=15, + help='how often to retest nodes that failed to get added to the API' + ' server (in minutes)' + ) options.parser.add_argument( '--driver', dest='driver', choices=known_drivers.keys(), default='hp_rest', @@ -227,6 +354,7 @@ def main(): args = options.run() required_args = [ + 
'datadir', 'nova_image', 'nova_image_size', 'nova_secgroup', 'nova_keyname', 'nova_tenant', 'nova_region', 'nova_user', 'nova_pass', 'nova_auth_url' ] diff --git a/libra/mgm/node_list.py b/libra/mgm/node_list.py new file mode 100644 index 00000000..c0914290 --- /dev/null +++ b/libra/mgm/node_list.py @@ -0,0 +1,49 @@ +# Copyright 2012 Hewlett-Packard Development Company, L.P. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import pickle +import os + + +class AccessDenied(Exception): + pass + + +class NodeList(object): + def __init__(self, path): + if not os.access(path, os.W_OK): + msg = 'Do not have permission to write to {0}'.format(path) + raise AccessDenied(msg) + + self.file_name = '{0}/node_log.dat'.format(path) + + def add(self, item): + data = self.get() + data.append(item) + self.put(data) + + def delete(self, item): + data = self.get() + data.remove(item) + self.put(data) + + def get(self): + # Attribute error is thrown if file is non-existent + try: + return pickle.load(open(self.file_name, "rb")) + except IOError: + return [] + + def put(self, data): + pickle.dump(data, open(self.file_name, "wb")) diff --git a/libra/mgm/nova.py b/libra/mgm/nova.py index 33fb15af..81c6e3d6 100644 --- a/libra/mgm/nova.py +++ b/libra/mgm/nova.py @@ -25,6 +25,16 @@ class NotFound(Exception): pass +class BuildError(Exception): + def __init__(self, msg, node_name, node_id=0): + self.msg = msg + self.node_name = node_name + self.node_id = node_id + + def __str__(self): + return 
self.msg + + class Node(object): def __init__(self, username, password, tenant, auth_url, region, keyname, secgroup, image, node_type, node_basename=None): @@ -60,8 +70,9 @@ class Node(object): try: body = self._create(node_id) except exceptions.ClientException: - return False, 'Error creating node {nid} exception {exc}'.format( - nid=node_id, exc=sys.exc_info()[0] + raise BuildError( + 'Error creating node, exception {exc}' + .format(exc=sys.exc_info()[0]), node_id ) server_id = body['server']['id'] @@ -69,19 +80,19 @@ class Node(object): waits = 40 while waits > 0: time.sleep(3) - status = self._status(server_id) + resp, status = self.status(server_id) + status = status['server'] if status['status'] == 'ACTIVE': - return True, status + return status elif not status['status'].startswith('BUILD'): - return False, 'Error spawning node {nid} status {stat}'.format( - node=node_id, stat=status['status'] + raise BuildError( + 'Error spawning node, status {stat}' + .format(stat=status['status']), + node_id, server_id, ) waits = waits - 1 - return (False, - 'Timeout creating node, uuid: {nid}, server ID: {sid}' - .format(nid=node_id, sid=server_id) - ) + raise BuildError('Timeout creating node', node_id, server_id) def delete(self, node_id): """ delete a node """ @@ -119,11 +130,11 @@ class Node(object): resp, body = self.nova.post(url, body=body) return body - def _status(self, node_id): + def status(self, node_id): """ used to keep scanning to see if node is up """ url = "/servers/{0}".format(node_id) resp, body = self.nova.get(url) - return body['server'] + return resp, body def _delete(self, node_id): """ delete a nova node, return 204 succeed """ diff --git a/tests/test_lbaas_mgm.py b/tests/test_lbaas_mgm.py index 3b8cdc3b..c045aa32 100644 --- a/tests/test_lbaas_mgm.py +++ b/tests/test_lbaas_mgm.py @@ -5,7 +5,7 @@ import httplib2 import json import mock_objects -from libra.mgm.nova import Node +from libra.mgm.nova import Node, BuildError fake_response = 
httplib2.Response({"status": '200'}) fake_bad_response = httplib2.Response({"status": '500'}) @@ -42,16 +42,14 @@ class TestLBaaSMgmNova(unittest.TestCase): def testCreateNode(self): with mock.patch.object(httplib2.Http, "request", mock_request): with mock.patch('time.time', mock.Mock(return_value=1234)): - resp, data = self.api.build() - self.assertTrue(resp) + data = self.api.build() self.assertEqual(data['id'], 417773) def testCreateNodeFail(self): with mock.patch.object(httplib2.Http, "request", mock_bad_request): with mock.patch('time.time', mock.Mock(return_value=1234)): - resp, data = self.api.build() - self.assertFalse(resp) - self.assertRegexpMatches(data, 'Error creating') + with self.assertRaises(BuildError): + data = self.api.build() def testDeleteNodeFail(self): with mock.patch.object(httplib2.Http, "request", mock_bad_request):