diff --git a/README b/README index 70b3bb7..9a11764 100644 --- a/README +++ b/README @@ -1,3 +1,66 @@ StackTach is a debugging tool for OpenStack Nova. -It takes events from AMQP and sends them to the StackTach server for web display. +-------------------- + +It takes events from AMQP and inserts them in a database for the StackTach django project server for web display. +stacktach workers consume the monitor.info and monitor.error rabbit messape queues. + +It's important that they keep running otherwise the queues fill up and slow down the +whole nova environment. + +-------------------- + +There's a django app in the 'stacktach' directory that lets you see what's +going on in your nova install, then there's two flavors of worker processes +that collect the amqp messages for displaying in the stacktach django apps. + +The app was originally designed as a service so that different customers could +monitor their nova deployments, so it has 'tenants'. + +After installing the django app like most django apps, if you navigate to it, +it'll prompt you to make a new tenant. It'll be given the number 1. You can +view it in the browser by appending /1 onto the end of the url. + +That tenant ID then needs to be passed to one of the worker setups. + +-------------------- + +The original worker setup uses the single script 'worker.py'. It depends on +the 'kombu' lib to talk to your amqp server and uses eventlet green threads, +one for each nova deploy. + +-------------------- + +The newer version consists of the scripts: + + * start_workers.py - Starts up sub-processes of worker_new.py's + * worker_conf.py - Holds the configs for your nova deploys + * worker_new.py - The actual worker code + +This version was written to address stability issues in the original. + +It depends on 'amqplib' for talking to the amqp broker. + +It uses subprocess instead of threading to fire up the workers, one sub-proc +for each nova deploy. + +The newer version seems to have a memory leak, so needs restarting +occasionally, but seems to be overall more stabler than the original. + +-------------------- + +Before use, put your nova install detail into 'worker_conf.py' + +If you need to start them manually: + +sudo -i +cd /path/to/stacktach/install +export DJANGO_SETTINGS_MODULE=stacktach.settings +python start_workers.py + +The start_workers.py imports the list of deployments to consume from worker_conf.py, +then it starts a sub process of worker.py for each deploy. + +Nice way to see the logs in a screen session: + + ls *.log | grep "global\|cell" | xargs watch --differences tail -n2 diff --git a/stacktach/templates/stacktach/base.html b/stacktach/templates/base.html similarity index 100% rename from stacktach/templates/stacktach/base.html rename to stacktach/templates/base.html diff --git a/stacktach/templates/stacktach/data.html b/stacktach/templates/data.html similarity index 100% rename from stacktach/templates/stacktach/data.html rename to stacktach/templates/data.html diff --git a/stacktach/templates/stacktach/expand.html b/stacktach/templates/expand.html similarity index 100% rename from stacktach/templates/stacktach/expand.html rename to stacktach/templates/expand.html diff --git a/stacktach/templates/stacktach/host_status.html b/stacktach/templates/host_status.html similarity index 100% rename from stacktach/templates/stacktach/host_status.html rename to stacktach/templates/host_status.html diff --git a/stacktach/templates/stacktach/index.html b/stacktach/templates/index.html similarity index 100% rename from stacktach/templates/stacktach/index.html rename to stacktach/templates/index.html diff --git a/stacktach/templates/stacktach/instance_status.html b/stacktach/templates/instance_status.html similarity index 100% rename from stacktach/templates/stacktach/instance_status.html rename to stacktach/templates/instance_status.html diff --git a/stacktach/templates/stacktach/new_tenant.html b/stacktach/templates/new_tenant.html similarity index 100% rename from stacktach/templates/stacktach/new_tenant.html rename to stacktach/templates/new_tenant.html diff --git a/stacktach/templates/stacktach/rows.html b/stacktach/templates/rows.html similarity index 100% rename from stacktach/templates/stacktach/rows.html rename to stacktach/templates/rows.html diff --git a/stacktach/templates/stacktach/welcome.html b/stacktach/templates/welcome.html similarity index 100% rename from stacktach/templates/stacktach/welcome.html rename to stacktach/templates/welcome.html diff --git a/stacktach/views.py b/stacktach/views.py index 2a91071..d58227e 100644 --- a/stacktach/views.py +++ b/stacktach/views.py @@ -10,12 +10,10 @@ from stacktach import models import datetime import json -import logging import pprint import random import sys -logger = logging.getLogger(__name__) VERSION = 4 @@ -33,7 +31,11 @@ def _monitor_message(routing_key, body): publisher = body['publisher_id'] parts = publisher.split('.') service = parts[0] - host = parts[1] + if len(parts) > 1: + host = ".".join(parts[1:]) + else: + host = None + #logging.error("publisher=%s, host=%s" % (publisher, host)) payload = body['payload'] request_spec = payload.get('request_spec', None) instance = payload.get('instance_id', None) @@ -80,9 +82,15 @@ def _parse(tenant, args, json_args): return {} values['tenant'] = tenant - when = body['_context_timestamp'] try: - when = datetime.datetime.strptime(when, "%Y-%m-%dT%H:%M:%S.%f") + when = body['timestamp'] + except KeyError: + when = body['_context_timestamp'] # Old way of doing it + try: + try: + when = datetime.datetime.strptime(when, "%Y-%m-%d %H:%M:%S.%f") + except ValueError: + when = datetime.datetime.strptime(when, "%Y-%m-%dT%H:%M:%S.%f") # Old way of doing it values['microseconds'] = when.microsecond except Exception, e: pass @@ -169,21 +177,21 @@ def _default_context(state): context = dict(utc=datetime.datetime.utcnow(), state=state) return context - + def welcome(request): state = _reset_state(request) - return render_to_response('stacktach/welcome.html', _default_context(state)) + return render_to_response('welcome.html', _default_context(state)) @tenant_check def home(request, tenant_id): state = _get_state(request, tenant_id) - return render_to_response('stacktach/index.html', _default_context(state)) + return render_to_response('index.html', _default_context(state)) def logout(request): del request.session['state'] - return render_to_response('stacktach/welcome.html', _default_context(None)) + return render_to_response('welcome.html', _default_context(None)) @csrf_protect @@ -200,7 +208,7 @@ def new_tenant(request): else: form = models.TenantForm() context['form'] = form - return render_to_response('stacktach/new_tenant.html', context, + return render_to_response('new_tenant.html', context, context_instance=template.RequestContext(request)) @@ -212,7 +220,7 @@ def data(request, tenant_id): c = _default_context(state) fields = _parse(state.tenant, args, raw_args) c['cooked_args'] = fields - return render_to_response('stacktach/data.html', c) + return render_to_response('data.html', c) @tenant_check @@ -229,13 +237,13 @@ def details(request, tenant_id, column, row_id): from_time = value - datetime.timedelta(minutes=1) to_time = value + datetime.timedelta(minutes=1) rows = rows.filter(when__range=(from_time, to_time)) - + rows = rows.order_by('-when', '-microseconds')[:200] _post_process_raw_data(rows, state, highlight=row_id) c['rows'] = rows c['allow_expansion'] = True c['show_absolute_time'] = True - return render_to_response('stacktach/rows.html', c) + return render_to_response('rows.html', c) @tenant_check @@ -246,7 +254,7 @@ def expand(request, tenant_id, row_id): payload = json.loads(row.json) pp = pprint.PrettyPrinter() c['payload'] = pp.pformat(payload) - return render_to_response('stacktach/expand.html', c) + return render_to_response('expand.html', c) @tenant_check @@ -257,7 +265,7 @@ def host_status(request, tenant_id): order_by('-when', '-microseconds')[:20] _post_process_raw_data(hosts, state) c['rows'] = hosts - return render_to_response('stacktach/host_status.html', c) + return render_to_response('host_status.html', c) @tenant_check @@ -270,9 +278,9 @@ def search(request, tenant_id): if column != None and value != None: rows = models.RawData.objects.filter(tenant=tenant_id).\ filter(**{column:value}).\ - order_by('-when', '-microseconds') + order_by('-when', '-microseconds')[:22] _post_process_raw_data(rows, state) c['rows'] = rows c['allow_expansion'] = True c['show_absolute_time'] = True - return render_to_response('stacktach/rows.html', c) + return render_to_response('rows.html', c) diff --git a/start_workers.py b/start_workers.py new file mode 100644 index 0000000..db3a023 --- /dev/null +++ b/start_workers.py @@ -0,0 +1,30 @@ +# Copyright 2012 Openstack LLC. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +from multiprocessing import Process +from worker import run +from worker_conf import DEPLOYMENTS + + +if __name__ == '__main__': + processes = [] + for deployment in DEPLOYMENTS: + if deployment.get('enabled', True): + process = Process(target=run, args=(deployment,)) + process.daemon = True + process.start() + processes.append(process) + for process in processes: + process.join() diff --git a/worker_conf.py b/worker_conf.py new file mode 100644 index 0000000..7a1f98f --- /dev/null +++ b/worker_conf.py @@ -0,0 +1,29 @@ +# Copyright 2012 Openstack LLC. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +# This is a sample conf file. Use it as a guide to make your own + +DEPLOYMENTS = [ + # My fun conf + dict( + tenant_id=1, # This is the stacktach tenant, not an openstack tenant + name="my-fun-nova-deploy", + url='http://stacktach.my-fun-nova-deploy.com', # The url for the base of the django app + rabbit_host="1.2.3.4", # ip/host name of the amqp server to listen to + rabbit_port=5672, + rabbit_userid="amqp-user-1", + rabbit_password="some secret password", + rabbit_virtual_host="amqp-vhost"), +] diff --git a/worker_new.py b/worker_new.py new file mode 100755 index 0000000..27b995c --- /dev/null +++ b/worker_new.py @@ -0,0 +1,82 @@ +# Copyright 2012 Openstack LLC. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import amqplib.client_0_8 as amqp +import json +import socket +import time + +from stacktach import models, views + + +class NovaConsumer(object): + def __init__(self, channel, tenant_id, logger): + self.channel = channel + self.tenant = models.Tenant.objects.get(tenant_id=tenant_id) + self.logger = logger + channel.basic_consume('monitor.info', callback=self.onMessage) + channel.basic_consume('monitor.error', callback=self.onMessage) + + def onMessage(self, message): + routing_key = message.delivery_info['routing_key'] + args = (routing_key, json.loads(message.body)) + asJson = json.dumps(args) + + #from pprint import pformat + #self.logger.debug("Saving %s", pformat(args)) + views._parse(self.tenant, args, asJson) + self.logger.debug("Recorded %s ", routing_key) + self.channel.basic_ack(message.delivery_tag) + + +def run(deployment, logger): + tenant_id = deployment.get('tenant_id', 1) + host = deployment.get('rabbit_host', 'localhost') + port = deployment.get('rabbit_port', 5672) + user_id = deployment.get('rabbit_userid', 'rabbit') + password = deployment.get('rabbit_password', 'rabbit') + virtual_host = deployment.get('rabbit_virtual_host', '/') + + logger.info("Rabbit: %s %s %s %s" % + (host, port, user_id, virtual_host)) + + + while 1: + try: + conn = amqp.Connection(host, userid=user_id, password=password, virtual_host=virtual_host) + + ch = conn.channel() + ch.access_request(virtual_host, active=True, read=True) + + ch.exchange_declare('nova', type='topic', durable=True, auto_delete=False) + ch.queue_declare('monitor.info', durable=True, auto_delete=False, exclusive=False) + ch.queue_declare('monitor.error', durable=True, auto_delete=False, exclusive=False) + ch.queue_bind('monitor.info', 'nova') + ch.queue_bind('monitor.error', 'nova') + consumer = NovaConsumer(ch, tenant_id, logger) + # + # Loop as long as the channel has callbacks registered + # + while ch.callbacks: + ch.wait() + break + except socket.error, e: + logger.warn("Socket error: %s" % e) + time.sleep(5) + continue + + ch.close() + conn.close() + logger.info("Finished")