# stacktach/reports/error_details.py
# 2013-04-30 16:23:10 -05:00
#
# 228 lines
# 7.5 KiB
# Python

import datetime
import json
import sys
import time
import os
sys.path.append(os.environ.get('STACKTACH_INSTALL_DIR', '/stacktach'))
from stacktach import datetime_to_decimal as dt
from stacktach import image_type
from stacktach import models
# This file is a stand-alone report script; bail out if it is imported.
if __name__ != '__main__':
    sys.exit(1)

# Day to report on: yesterday (UTC) by default, or the YYYY-MM-DD date given
# as the single command-line argument.  Only the year/month/day attributes
# are read later, so it does not matter that the default is a ``date`` while
# a parsed argument is a ``datetime``.
yesterday = datetime.datetime.utcnow().date() - datetime.timedelta(days=1)
if len(sys.argv) == 2:
    try:
        t = time.strptime(sys.argv[1], "%Y-%m-%d")
    except ValueError as e:
        # strptime raises ValueError when the argument does not match the
        # expected format; show the reason and how to call the script.
        print(e)
        print("Usage: python error_details.py YYYY-MM-DD (the end date)")
        sys.exit(1)
    else:
        yesterday = datetime.datetime(*t[:6])
# Reporting window: `length` hours starting at midnight of `yesterday`.
# `end` is one second short of the full span (23:59:59 for a 24 hour window).
hours = 0
length = 24
start = datetime.datetime(yesterday.year, yesterday.month, yesterday.day)
end = start + datetime.timedelta(hours=length) - datetime.timedelta(seconds=1)

# The same bounds in the decimal form used to filter RawData.when.
dstart, dend = dt.dt_to_decimal(start), dt.dt_to_decimal(end)

# Skeleton of the report: the first element is metadata, failed requests are
# appended after it.
instance_map = {}  # { uuid : [request_id, request_id, ...] }
metadata = dict(report_format='json', instances=instance_map)
report = [metadata]  # Tell Stacky to format as JSON

# Deployment id -> deployment name.
deployments = dict((deploy.id, deploy.name)
                   for deploy in models.Deployment.objects.all())

# Get all the instances that have changed in the last N hours ...
updates = (models.RawData.objects
           .filter(event='compute.instance.update',
                   when__gt=dstart, when__lte=dend)
           .values('instance')
           .distinct())

expiry = 60 * 60  # 1 hour

# Operation names searched for (as substrings) in each event name.
cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot']

# Accumulators filled in while the requests are classified.
codes = {}           # exception code -> count
failures = {}        # (operation, image type, cell) -> failed request count
successes = {}       # (operation, image type, cell) -> good request count
causes = {}          # ((operation, image type, cell), failure type) -> count
durations = {}       # (operation, image type, cell) -> (min, max, count, total)
error_messages = {}  # str(exception) -> count
tenant_issues = {}   # tenant -> failed request count
# Classify every request made against each instance that changed during the
# reporting window.  Each request is either counted as a success or appended
# to `report` as a failed request; the module-level accumulators (durations,
# successes, failures, tenant_issues, error_messages, codes, causes and
# instance_map) are updated along the way.
for uuid_dict in updates:
    uuid = uuid_dict['instance']

    # All the unique Request ID's for this instance during that timespan.
    reqs = models.RawData.objects.filter(instance=uuid,
                                         when__gt=dstart, when__lte=dend)\
                                 .values('request_id').distinct()

    req_list = []  # request ids of this instance that failed
    for req_dict in reqs:
        req = req_dict['request_id']

        # Every event of the request (except 'exists' events), oldest first.
        raws = list(models.RawData.objects.filter(request_id=req)
                    .exclude(event='compute.instance.exists')
                    .values("id", "when", "routing_key", "old_state",
                            "state", "tenant", "event", "image_type",
                            "deployment")
                    .order_by('when'))

        _start = None
        _when = None
        err_id = None
        failure_type = None
        operation = "n/a"
        tenant = 0
        cell = "n/a"
        image_type_num = 0

        for raw in raws:
            _when = raw['when']
            _old_state = raw['old_state']
            _state = raw['state']

            if _start is None:
                _start = _when

            # An event whose routing key contains 'error' marks the request
            # as failed and becomes the event to report on.
            if 'error' in raw['routing_key']:
                err_id = raw['id']
                failure_type = 'http'

            # So does a transition into the error state ...
            if _old_state != 'error' and _state == 'error':
                failure_type = 'state'
                err_id = raw['id']

            # ... while leaving the error state for anything other than
            # 'deleted' clears whatever failure was recorded so far.
            if _old_state == 'error' and \
                    _state not in ('deleted', 'error'):
                failure_type = None
                err_id = None

            if raw['tenant']:
                tenant = raw['tenant']

            # The first known command found in the event name gives the
            # operation; the deployment of that event gives the cell.
            for cmd in cmds:
                if cmd in raw['event']:
                    operation = cmd
                    cell = deployments.get(raw['deployment'], "n/a")
                    break

            if raw['image_type']:
                image_type_num |= raw['image_type']

        # No usable events for this request: nothing to classify.
        if _start is None:
            continue

        # Time between the first and the last event of the request.
        diff = _when - _start

        # Anything longer than 3600 (reported as ">60") counts as a failure
        # even when no error was seen.
        if diff > 3600 and failure_type is None:
            failure_type = ">60"

        key = (operation, image_type_num, cell)

        # Track durations for all attempts, good and bad ...
        duration_min, duration_max, duration_count, duration_total = \
            durations.get(key, (9999999, 0, 0, 0))
        duration_min = min(duration_min, diff)
        duration_max = max(duration_max, diff)
        duration_count += 1
        duration_total += diff
        durations[key] = (duration_min, duration_max, duration_count,
                          duration_total)

        if not failure_type:
            successes[key] = successes.get(key, 0) + 1
        else:
            req_list.append(req)
            instance_map[uuid] = req_list

            failed_request = {}
            failed_request['req'] = req
            failed_request['duration'] = "%.2f minutes" % (diff/60)
            failed_request['operation'] = operation
            failed_request['platform'] = image_type.readable(image_type_num)
            failures[key] = failures.get(key, 0) + 1
            tenant_issues[tenant] = tenant_issues.get(tenant, 0) + 1

            if err_id:
                err = models.RawData.objects.get(id=err_id)
                queue, body = json.loads(err.json)
                payload = body['payload']

                # Add error information to failed request report
                failed_request['event_id'] = err.id
                failed_request['tenant'] = err.tenant
                failed_request['service'] = err.service
                failed_request['host'] = err.host
                failed_request['deployment'] = err.deployment.name
                failed_request['event'] = err.event
                failed_request['when'] = str(dt.dt_from_decimal(err.when))

                exc = payload.get('exception')
                if exc:
                    # group the messages ...
                    failed_request['exception'] = exc
                    exc_str = str(exc)
                    error_messages[exc_str] = \
                        error_messages.get(exc_str, 0) + 1

                    # extract the code, if any ...  The lookup only makes
                    # sense when the exception (and its 'kwargs') are
                    # dicts; any other shape simply has no code instead of
                    # aborting the whole report.
                    kwargs = exc.get('kwargs') if isinstance(exc, dict) \
                        else None
                    code = kwargs.get('code') if isinstance(kwargs, dict) \
                        else None
                    if code:
                        codes[code] = codes.get(code, 0) + 1
                        failure_type = code

            # Recorded for every failed request, including ">60" ones that
            # have no error event.
            # NOTE(review): the flattened source does not show whether this
            # assignment belongs here or inside the `if err_id:` branch --
            # confirm against the original file.
            failed_request['failure_type'] = failure_type

            # Second pass over the same events, this time as model objects,
            # to list the state/task transitions of the failed request.
            detail_rows = models.RawData.objects.filter(request_id=req)\
                                .exclude(event='compute.instance.exists')\
                                .order_by('when')
            failed_request['details'] = [
                {'host': row.host,
                 'event': row.event,
                 'old_state': row.old_state,
                 'state': row.state,
                 'old_task': row.old_task,
                 'task': row.task}
                for row in detail_rows]

            report.append(failed_request)

            cause_key = (key, failure_type)
            causes[cause_key] = causes.get(cause_key, 0) + 1
# Store the finished report: the list built above is serialized to JSON and
# saved as a JsonReport row covering the start/end of the reporting window.
report = models.JsonReport(
    json=json.dumps(report),
    created=dt.dt_to_decimal(datetime.datetime.utcnow()),
    period_start=start,
    period_end=end,
    version=1,
    name='Error detail report')
report.save()