Add failed node handling
There are two cases where node building can fail: 1. The node errors or times out. 2. Sending to the API server fails. In these cases a new timer is triggered which tests these nodes and adds or deletes them as required. The failed nodes are stored in a separate file. This also adds a dummy API driver that always says 5 nodes are needed and echoes the node data sent to it to the log file. Fixes bug #1080863 Change-Id: I9a67de2be5336abdfcad5a3bb372c467cb57f4b1
This commit is contained in:
parent
7e3ac2cd2e
commit
c9709df427
@ -65,6 +65,7 @@ Pool Manager Section
|
||||
[mgm]
|
||||
pid = /var/run/libra/libra_mgm.pid
|
||||
logfile = /var/log/libra/libra_mgm.log
|
||||
datadir = /etc/libra/
|
||||
nova_auth_url = https://region-a.geo-1.identity.hpcloudsvc.com:35357/v2.0/
|
||||
nova_user = username
|
||||
nova_pass = password
|
||||
@ -77,6 +78,7 @@ Pool Manager Section
|
||||
api_server = 10.0.0.1:8889 10.0.0.2:8889
|
||||
nodes = 10
|
||||
check_interval = 5
|
||||
failed_interval = 15
|
||||
node_basename = 'libra'
|
||||
|
||||
|
||||
@ -167,8 +169,13 @@ Pool Manager Command Line Options
|
||||
|
||||
.. option:: --check_interval <CHECK_INTERVAL>
|
||||
|
||||
How often to check the API server to see if noew nodes are needed
|
||||
(in value is minutes)
|
||||
How often to check the API server to see if new nodes are needed
|
||||
(value is in minutes)
|
||||
|
||||
.. option:: --failed_interval <FAILED_INTERVAL>
|
||||
|
||||
How often to check the list of failed node uploads to see if the nodes
|
||||
are now in a good state (value is in minutes)
|
||||
|
||||
.. option:: -d, --debug
|
||||
|
||||
@ -196,6 +203,10 @@ Pool Manager Command Line Options
|
||||
file is */var/log/libra/libra_worker.log*. When not in daemon mode,
|
||||
logging will go to STDOUT unless a log file is specified.
|
||||
|
||||
.. option:: --datadir <DATADIR>
|
||||
|
||||
The data directory used to store things such as the failed node list.
|
||||
|
||||
.. option:: -n, --nodaemon
|
||||
|
||||
Do not run as a daemon. This option is useful for debugging purposes
|
||||
|
@ -38,6 +38,7 @@ logfile = /var/log/libra/libra_worker.log
|
||||
[mgm]
|
||||
pid = /var/run/libra/libra_mgm.pid
|
||||
logfile = /var/log/libra/libra_mgm.log
|
||||
datadir = /etc/libra/
|
||||
nova_auth_url = https://region-a.geo-1.identity.hpcloudsvc.com:35357/v2.0/
|
||||
nova_user = username
|
||||
nova_pass = password
|
||||
@ -50,4 +51,5 @@ nova_image_size = standard.medium
|
||||
api_server = 10.0.0.1:8889 10.0.0.2:8889
|
||||
nodes = 10
|
||||
check_interval = 5
|
||||
failed_interval = 15
|
||||
node_basename = 'libra'
|
||||
|
@ -14,7 +14,8 @@
|
||||
|
||||
# Mapping of --driver options to a class
|
||||
known_drivers = {
|
||||
'hp_rest': 'libra.mgm.drivers.hp_rest.driver.HPRestDriver'
|
||||
'hp_rest': 'libra.mgm.drivers.hp_rest.driver.HPRestDriver',
|
||||
'dummy': 'libra.mgm.drivers.dummy.driver.DummyDriver'
|
||||
}
|
||||
|
||||
|
||||
|
13
libra/mgm/drivers/dummy/__init__.py
Normal file
13
libra/mgm/drivers/dummy/__init__.py
Normal file
@ -0,0 +1,13 @@
|
||||
# Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
35
libra/mgm/drivers/dummy/driver.py
Normal file
35
libra/mgm/drivers/dummy/driver.py
Normal file
@ -0,0 +1,35 @@
|
||||
# Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
|
||||
from libra.mgm.drivers.base import MgmDriver
|
||||
|
||||
|
||||
class DummyDriver(MgmDriver):
    """Pool manager dummy driver for testing.

    Every answer is canned: the free count is always 5, the driver always
    reports itself online, and node data handed to ``add_node`` is only
    written to the log.
    """

    def __init__(self, addresses, logger):
        # ``addresses`` is not used by the dummy driver; only the logger
        # is kept.
        self.logger = logger

    def get_free_count(self):
        """Return the fixed count of 5."""
        return 5

    def is_online(self):
        """Always report the dummy API as online."""
        return True

    def add_node(self, node_data):
        """Log ``node_data`` and return a canned ``(True, text)`` result."""
        message = 'Dummy API send of {0}'.format(node_data)
        self.logger.info(message)
        return True, 'test response'

    def get_url(self):
        """Return the placeholder connection string."""
        return 'Dummy Connection'
|
166
libra/mgm/mgm.py
166
libra/mgm/mgm.py
@ -22,10 +22,12 @@ import sys
|
||||
import os
|
||||
import threading
|
||||
|
||||
from novaclient import exceptions
|
||||
from libra.openstack.common import importutils
|
||||
from libra.mgm.nova import Node
|
||||
from libra.mgm.nova import Node, BuildError
|
||||
from libra.common.options import Options, setup_logging
|
||||
from libra.mgm.drivers.base import known_drivers
|
||||
from libra.mgm.node_list import NodeList, AccessDenied
|
||||
|
||||
|
||||
class Server(object):
|
||||
@ -33,8 +35,14 @@ class Server(object):
|
||||
self.logger = logger
|
||||
self.args = args
|
||||
self.ct = None
|
||||
self.ft = None
|
||||
self.api = None
|
||||
self.driver_class = None
|
||||
try:
|
||||
self.node_list = NodeList(self.args.datadir)
|
||||
except AccessDenied as exc:
|
||||
self.logger.error(exc)
|
||||
self.shutdown(True)
|
||||
|
||||
def main(self):
|
||||
self.logger.info(
|
||||
@ -59,9 +67,102 @@ class Server(object):
|
||||
# at the same time.
|
||||
self.rlock = threading.RLock()
|
||||
self.check_nodes()
|
||||
self.failed_nodes()
|
||||
while True:
|
||||
time.sleep(1)
|
||||
|
||||
def failed_nodes(self):
    """Retest every node in the failed node list, then re-arm the timer.

    While holding ``self.rlock`` the stored node list is read; if it is not
    empty an API driver is created and, when that driver is online, each
    stored node is passed to ``retest_node``. The failed-node timer is
    rescheduled afterwards in every case.
    """
    with self.rlock:
        self.logger.info('Checking log of failed node uploads')
        pending = self.node_list.get()
        if len(pending) > 0:
            api = self.driver_class(self.args.api_server, self.logger)
            if not api.is_online():
                self.logger.error('No working API server found')
            else:
                connected = 'Connected to {url}'.format(url=api.get_url())
                self.logger.info(connected)
                for node_id in pending:
                    self.retest_node(node_id, api)
        else:
            self.logger.info('Node log empty')
    # NOTE(review): indentation is not visible in this view; the re-arm is
    # placed after the lock is released -- confirm against the repository.
    self.reset_failed_scheduler()
|
||||
|
||||
def retest_node(self, node_id, api):
    """Re-check one node from the failed list and add, keep or delete it.

    The node's status is fetched from Nova and handled as follows:

    * Nova reports the node as not found: it is removed from the list.
    * status ``ACTIVE``: the node data is sent to the API server; the node
      is removed from the list only if that upload succeeds.
    * status starting with ``BUILD``: nothing is done, the node stays in
      the list.
    * anything else: the node is deleted from Nova and, if that delete
      succeeds, removed from the list.

    Errors talking to Nova are logged and leave the list untouched.

    :param node_id: ID of the Nova server to retest.
    :param api: API driver instance providing ``add_node``.
    """
    try:
        nova = Node(
            self.args.nova_user,
            self.args.nova_pass,
            self.args.nova_tenant,
            self.args.nova_auth_url,
            self.args.nova_region,
            self.args.nova_keyname,
            self.args.nova_secgroup,
            self.args.nova_image,
            self.args.nova_image_size,
            node_basename=self.args.node_basename
        )
    except Exception as exc:
        self.logger.error(
            'Error initialising Nova connection {exc}'.format(exc=exc)
        )
        return

    self.logger.info('Retrying node {0}'.format(node_id))
    try:
        resp, body = nova.status(node_id)
    except exceptions.NotFound:
        self.logger.info(
            'Node {0} no longer exists, removing from list'.format(node_id)
        )
        self.node_list.delete(node_id)
        return
    except exceptions.ClientException as exc:
        # Log the exception itself (as the Nova init error above does)
        # rather than only its class.
        self.logger.error(
            'Error getting status from Nova, exception {exc}'
            .format(exc=exc)
        )
        return

    if resp['status'] not in ('200', '203'):
        self.logger.error(
            'Error getting status from Nova, error {0}'
            .format(resp['status'])
        )
        return

    server = body['server']
    state = server['status']

    if state == 'ACTIVE':
        name = server['name']
        uploaded, _response = api.add_node(self.build_node_data(server))
        if not uploaded:
            self.logger.error(
                'Could not upload node {name} to API server'
                .format(name=name)
            )
        else:
            self.node_list.delete(node_id)
            self.logger.info('Node {0} added to API server'.format(name))
        return

    if state.startswith('BUILD'):
        self.logger.info(
            'Node {0} still building, ignoring'.format(node_id)
        )
        return

    self.logger.info('Node {0} is bad, deleting'.format(node_id))
    deleted, msg = nova.delete(node_id)
    if not deleted:
        # Keep the node in the list so the delete is retried on the next
        # pass instead of leaving the server behind in Nova.
        # NOTE(review): indentation is not visible in this view; confirm
        # the list entry was only dropped on a successful delete.
        self.logger.error(msg)
        return
    self.logger.info('Delete successful')
    self.node_list.delete(node_id)
|
||||
|
||||
def check_nodes(self):
|
||||
""" check if known nodes are used """
|
||||
with self.rlock:
|
||||
@ -90,12 +191,30 @@ class Server(object):
|
||||
self.reset_scheduler()
|
||||
|
||||
def reset_scheduler(self):
|
||||
self.logger.info('Sleeping for {mins} minutes'
|
||||
self.logger.info('Node check timer sleeping for {mins} minutes'
|
||||
.format(mins=self.args.check_interval))
|
||||
self.ct = threading.Timer(60 * int(self.args.check_interval),
|
||||
self.check_nodes, ())
|
||||
self.ct.start()
|
||||
|
||||
def reset_failed_scheduler(self):
    """Start a timer that calls ``failed_nodes`` after the failed interval.

    ``self.args.failed_interval`` is read as minutes; the new timer is
    stored in ``self.ft``.
    """
    minutes = self.args.failed_interval
    self.logger.info(
        'Node failed timer sleeping for {mins} minutes'.format(mins=minutes)
    )
    delay_seconds = 60 * int(minutes)
    self.ft = threading.Timer(delay_seconds, self.failed_nodes, ())
    self.ft.start()
|
||||
|
||||
def build_node_data(self, data):
    """Build the API data from the node data.

    :param data: node record; ``data['name']`` and the list
        ``data['addresses']['private']`` (entries with an ``'addr'`` key)
        are read.
    :returns: dict with the keys ``'name'`` and ``'address'``.
    :raises ValueError: if the private address list is empty (previously
        this surfaced as an ``UnboundLocalError``).
    """
    addresses = data['addresses']['private']
    if not addresses:
        raise ValueError(
            'Node {0} has no private addresses'.format(data['name'])
        )

    # Use the first address that is not in 10.x; if every address is in
    # 10.x, fall back to the last one listed.
    chosen = addresses[-1]
    for address in addresses:
        if not address['addr'].startswith('10.'):
            chosen = address
            break

    return {'name': data['name'], 'address': chosen['addr']}
|
||||
|
||||
def build_nodes(self, count, api):
|
||||
try:
|
||||
nova = Node(
|
||||
@ -116,31 +235,28 @@ class Server(object):
|
||||
)
|
||||
return
|
||||
while count > 0:
|
||||
status, data = nova.build()
|
||||
if not status:
|
||||
self.logger.error(data)
|
||||
try:
|
||||
data = nova.build()
|
||||
except BuildError as exc:
|
||||
self.logger.error('{0}, node {1}'
|
||||
.format(exc.msg, exc.node_name)
|
||||
)
|
||||
if exc.node_id > 0:
|
||||
self.logger.info('Storing node to try again later')
|
||||
self.node_list.add(exc.node_id)
|
||||
self.logger.warning('Aborting node building')
|
||||
return
|
||||
body = {}
|
||||
body['name'] = data['name']
|
||||
addresses = data['addresses']['private']
|
||||
for address in addresses:
|
||||
if not address['addr'].startswith('10.'):
|
||||
break
|
||||
body['address'] = address['addr']
|
||||
body = self.build_node_data(data)
|
||||
self.logger.info('Adding node {name} on {ip}'
|
||||
.format(name=body['name'], ip=body['address']))
|
||||
# TODO: store failed uploads to API server to retry
|
||||
status, response = api.add_node(body)
|
||||
if not status:
|
||||
self.logger.error(
|
||||
'Could not upload node {name} to API server, deleting'
|
||||
'Could not upload node {name} to API server'
|
||||
.format(name=data['name'])
|
||||
)
|
||||
status, response = nova.delete(data['id'])
|
||||
if not status:
|
||||
self.logger.error(response)
|
||||
else:
|
||||
self.logger.info('Delete succeeded')
|
||||
self.logger.info('Storing node to try again later')
|
||||
self.node_list.add(data['id'])
|
||||
self.logger.warning('Aborting node building')
|
||||
return
|
||||
count = count - 1
|
||||
@ -153,6 +269,8 @@ class Server(object):
|
||||
def shutdown(self, error):
|
||||
if self.ct:
|
||||
self.ct.cancel()
|
||||
if self.ft:
|
||||
self.ft.cancel()
|
||||
|
||||
if not error:
|
||||
self.logger.info('Safely shutting down')
|
||||
@ -168,6 +286,10 @@ def main():
|
||||
'--api_server', action='append', metavar='HOST:POST',
|
||||
help='a list of API servers to connect to (for HP REST API driver)'
|
||||
)
|
||||
options.parser.add_argument(
|
||||
'--datadir', dest='datadir',
|
||||
help='directory to store data files'
|
||||
)
|
||||
options.parser.add_argument(
|
||||
'--nodes', type=int, default=1,
|
||||
help='number of nodes'
|
||||
@ -176,6 +298,11 @@ def main():
|
||||
'--check_interval', type=int, default=5,
|
||||
help='how often to check if new nodes are needed (in minutes)'
|
||||
)
|
||||
options.parser.add_argument(
|
||||
'--failed_interval', type=int, default=15,
|
||||
help='how often to retest nodes that failed to get added to the API'
|
||||
' server (in minutes)'
|
||||
)
|
||||
options.parser.add_argument(
|
||||
'--driver', dest='driver',
|
||||
choices=known_drivers.keys(), default='hp_rest',
|
||||
@ -227,6 +354,7 @@ def main():
|
||||
args = options.run()
|
||||
|
||||
required_args = [
|
||||
'datadir',
|
||||
'nova_image', 'nova_image_size', 'nova_secgroup', 'nova_keyname',
|
||||
'nova_tenant', 'nova_region', 'nova_user', 'nova_pass', 'nova_auth_url'
|
||||
]
|
||||
|
49
libra/mgm/node_list.py
Normal file
49
libra/mgm/node_list.py
Normal file
@ -0,0 +1,49 @@
|
||||
# Copyright 2012 Hewlett-Packard Development Company, L.P.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
import pickle
|
||||
import os
|
||||
|
||||
|
||||
class AccessDenied(Exception):
    """Raised when the data directory cannot be written to."""
|
||||
|
||||
|
||||
class NodeList(object):
    """List of items persisted with pickle in ``node_log.dat``.

    The file lives in the directory given to the constructor. Every
    ``add``/``delete`` reads the whole list, changes it and writes it back.
    """

    def __init__(self, path):
        """Check that ``path`` is writable and remember the data file name.

        :param path: directory that holds ``node_log.dat``.
        :raises AccessDenied: if ``path`` is not writable.
        """
        if not os.access(path, os.W_OK):
            msg = 'Do not have permission to write to {0}'.format(path)
            raise AccessDenied(msg)

        self.file_name = os.path.join(path, 'node_log.dat')

    def add(self, item):
        """Append ``item`` to the stored list."""
        data = self.get()
        data.append(item)
        self.put(data)

    def delete(self, item):
        """Remove ``item`` from the stored list.

        :raises ValueError: if ``item`` is not in the list.
        """
        data = self.get()
        data.remove(item)
        self.put(data)

    def get(self):
        """Return the stored list, or ``[]`` if there is nothing to read."""
        # IOError: the file does not exist yet or cannot be read.
        # EOFError: the file exists but is empty.
        # NOTE(review): pickle must only be used on trusted files; keep
        # the data directory writable by this service only.
        try:
            with open(self.file_name, "rb") as handle:
                return pickle.load(handle)
        except (IOError, EOFError):
            return []

    def put(self, data):
        """Overwrite the data file with ``data``."""
        with open(self.file_name, "wb") as handle:
            pickle.dump(data, handle)
|
@ -25,6 +25,16 @@ class NotFound(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class BuildError(Exception):
    """Raised when building a node fails.

    :param msg: description of the failure; also the ``str()`` of the
        exception.
    :param node_name: name of the node that was being built.
    :param node_id: ID of the created server, or 0 (the default) when none
        is supplied. Callers store IDs greater than 0 to retry later.
    """

    def __init__(self, msg, node_name, node_id=0):
        # Initialise the base class so ``args`` and ``repr()`` carry the
        # message instead of being empty.
        super(BuildError, self).__init__(msg)
        self.msg = msg
        self.node_name = node_name
        self.node_id = node_id

    def __str__(self):
        return self.msg
|
||||
|
||||
|
||||
class Node(object):
|
||||
def __init__(self, username, password, tenant, auth_url, region, keyname,
|
||||
secgroup, image, node_type, node_basename=None):
|
||||
@ -60,8 +70,9 @@ class Node(object):
|
||||
try:
|
||||
body = self._create(node_id)
|
||||
except exceptions.ClientException:
|
||||
return False, 'Error creating node {nid} exception {exc}'.format(
|
||||
nid=node_id, exc=sys.exc_info()[0]
|
||||
raise BuildError(
|
||||
'Error creating node, exception {exc}'
|
||||
.format(exc=sys.exc_info()[0]), node_id
|
||||
)
|
||||
|
||||
server_id = body['server']['id']
|
||||
@ -69,19 +80,19 @@ class Node(object):
|
||||
waits = 40
|
||||
while waits > 0:
|
||||
time.sleep(3)
|
||||
status = self._status(server_id)
|
||||
resp, status = self.status(server_id)
|
||||
status = status['server']
|
||||
if status['status'] == 'ACTIVE':
|
||||
return True, status
|
||||
return status
|
||||
elif not status['status'].startswith('BUILD'):
|
||||
return False, 'Error spawning node {nid} status {stat}'.format(
|
||||
node=node_id, stat=status['status']
|
||||
raise BuildError(
|
||||
'Error spawning node, status {stat}'
|
||||
.format(stat=status['status']),
|
||||
node_id, server_id,
|
||||
)
|
||||
waits = waits - 1
|
||||
|
||||
return (False,
|
||||
'Timeout creating node, uuid: {nid}, server ID: {sid}'
|
||||
.format(nid=node_id, sid=server_id)
|
||||
)
|
||||
raise BuildError('Timeout creating node', node_id, server_id)
|
||||
|
||||
def delete(self, node_id):
|
||||
""" delete a node """
|
||||
@ -119,11 +130,11 @@ class Node(object):
|
||||
resp, body = self.nova.post(url, body=body)
|
||||
return body
|
||||
|
||||
def _status(self, node_id):
|
||||
def status(self, node_id):
|
||||
""" used to keep scanning to see if node is up """
|
||||
url = "/servers/{0}".format(node_id)
|
||||
resp, body = self.nova.get(url)
|
||||
return body['server']
|
||||
return resp, body
|
||||
|
||||
def _delete(self, node_id):
|
||||
""" delete a nova node, return 204 succeed """
|
||||
|
@ -5,7 +5,7 @@ import httplib2
|
||||
import json
|
||||
|
||||
import mock_objects
|
||||
from libra.mgm.nova import Node
|
||||
from libra.mgm.nova import Node, BuildError
|
||||
|
||||
fake_response = httplib2.Response({"status": '200'})
|
||||
fake_bad_response = httplib2.Response({"status": '500'})
|
||||
@ -42,16 +42,14 @@ class TestLBaaSMgmNova(unittest.TestCase):
|
||||
def testCreateNode(self):
|
||||
with mock.patch.object(httplib2.Http, "request", mock_request):
|
||||
with mock.patch('time.time', mock.Mock(return_value=1234)):
|
||||
resp, data = self.api.build()
|
||||
self.assertTrue(resp)
|
||||
data = self.api.build()
|
||||
self.assertEqual(data['id'], 417773)
|
||||
|
||||
def testCreateNodeFail(self):
|
||||
with mock.patch.object(httplib2.Http, "request", mock_bad_request):
|
||||
with mock.patch('time.time', mock.Mock(return_value=1234)):
|
||||
resp, data = self.api.build()
|
||||
self.assertFalse(resp)
|
||||
self.assertRegexpMatches(data, 'Error creating')
|
||||
with self.assertRaises(BuildError):
|
||||
data = self.api.build()
|
||||
|
||||
def testDeleteNodeFail(self):
|
||||
with mock.patch.object(httplib2.Http, "request", mock_bad_request):
|
||||
|
Loading…
x
Reference in New Issue
Block a user