328 lines
10 KiB
Python
328 lines
10 KiB
Python
import datetime
|
|
import json
|
|
import sys
|
|
import time
|
|
|
|
import prettytable
|
|
|
|
sys.path.append("/stacktach")
|
|
|
|
from stacktach import datetime_to_decimal as dt
|
|
from stacktach import image_type
|
|
from stacktach import models
|
|
|
|
|
|
# This file is a command-line report generator; refuse to be imported.
if __name__ != '__main__':
    sys.exit(1)

# Report on yesterday (UTC) unless a YYYY-MM-DD date is given as the only
# command-line argument.
yesterday = datetime.datetime.utcnow().date() - datetime.timedelta(days=1)
if len(sys.argv) == 2:
    try:
        yesterday = datetime.datetime.strptime(sys.argv[1], "%Y-%m-%d")
    except Exception as e:
        print(e)
        print("Usage: python requests.py YYYY-MM-DD (the end date)")
        sys.exit(1)

hours = 0
length = 24

# The reporting window runs from midnight of the chosen day up to
# 23:59:59 of the same day (for the default length of 24 hours).
start = datetime.datetime(yesterday.year, yesterday.month, yesterday.day)
end = start + datetime.timedelta(hours=length - 1, minutes=59, seconds=59)

instance_map = {}  # { uuid : [request_id, request_id, ...] }
# The first report entry is metadata that tells Stacky not to format results;
# every following entry is a line of report text.
metadata = {'raw_text': True, 'instances': instance_map}
report = [metadata,
          "Generating report for %s to %s" % (start, end)]

# Window bounds converted with dt_to_decimal for comparison against
# RawData.when.
dstart, dend = dt.dt_to_decimal(start), dt.dt_to_decimal(end)

codes = {}

# Deployment id -> deployment name.
deployments = dict((deploy.id, deploy.name)
                   for deploy in models.Deployment.objects.all())

# Get all the instances that have changed inside the reporting window ...
updates = (models.RawData.objects
           .filter(event='compute.instance.update',
                   when__gt=dstart, when__lte=dend)
           .values('instance')
           .distinct())

expiry = 60 * 60  # 1 hour
cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot']

# Accumulators filled in by the main loop and rendered at the end.
failures = {}
causes = {}
durations = {}
error_messages = {}
successes = {}
tenant_issues = {}
|
|
|
|
# Walk every instance that changed inside the reporting window, classify each
# of its requests as a success or a failure, and append a detail section to
# the report for every failure.
for uuid_dict in updates:
    uuid = uuid_dict['instance']

    # All the unique Request ID's for this instance during that timespan.
    reqs = (models.RawData.objects
            .filter(instance=uuid, when__gt=dstart, when__lte=dend)
            .values('request_id')
            .distinct())

    req_list = []  # request ids of this instance that ended up as failures
    for req_dict in reqs:
        req = req_dict['request_id']

        # Every event recorded for the request, oldest first;
        # compute.instance.exists events are excluded.
        raws = list(models.RawData.objects
                    .filter(request_id=req)
                    .exclude(event='compute.instance.exists')
                    .values("id", "when", "routing_key", "old_state",
                            "state", "tenant", "event", "image_type",
                            "deployment")
                    .order_by('when'))

        _start = None
        err_id = None
        failure_type = None

        operation = "n/a"
        tenant = 0
        cell = "n/a"
        image_type_num = 0

        _when = None

        for raw in raws:
            _when = raw['when']
            _old_state = raw['old_state']
            _state = raw['state']
            _event = raw['event']

            if not _start:
                _start = _when

            # An event published on an error routing key marks the request
            # as failed ...
            if 'error' in raw['routing_key']:
                err_id = raw['id']
                failure_type = 'http'

            # ... and so does a transition into the error state.
            if _old_state != 'error' and _state == 'error':
                failure_type = 'state'
                err_id = raw['id']

            # Leaving the error state for anything other than 'deleted'
            # clears a previously recorded failure.
            if _old_state == 'error' and _state not in ('deleted', 'error'):
                failure_type = None
                err_id = None

            if raw['tenant']:
                tenant = raw['tenant']

            # The first known command named in the event decides the
            # operation; the cell is the deployment that reported it.
            for cmd in cmds:
                if cmd in _event:
                    operation = cmd
                    cell = deployments.get(raw['deployment'], "n/a")
                    break

            # Accumulate the image-type bits seen across the request.
            if raw['image_type']:
                image_type_num |= raw['image_type']

        if not _start:
            continue

        # Elapsed time between the first and the last event of the request.
        diff = _when - _start

        # A request that ran longer than the expiry window counts as a
        # failure even if nothing reported an error.
        if diff > expiry and failure_type is None:
            failure_type = ">60"

        key = (operation, image_type_num, cell)

        # Track durations for all attempts, good and bad ...
        duration_min, duration_max, duration_count, duration_total = \
            durations.get(key, (9999999, 0, 0, 0))
        durations[key] = (min(duration_min, diff),
                          max(duration_max, diff),
                          duration_count + 1,
                          duration_total + diff)

        if not failure_type:
            successes[key] = successes.get(key, 0) + 1
            continue

        # Everything below handles a failed request.
        req_list.append(req)
        instance_map[uuid] = req_list

        report.append('')
        report.append("------ %s ----------" % uuid)
        report.append("Req: %s" % req)
        report.append("Duration: %.2f minutes" % (diff / 60))
        report.append("Operation: %s" % operation)
        report.append("Platform: %s" % image_type.readable(image_type_num))
        failures[key] = failures.get(key, 0) + 1
        tenant_issues[tenant] = tenant_issues.get(tenant, 0) + 1

        if err_id:
            # Load the full record of the event that flagged the failure.
            err = models.RawData.objects.get(id=err_id)
            _queue, body = json.loads(err.json)
            payload = body['payload']

            report.append("Event ID: %s" % err.id)
            report.append("Tenant: %s" % err.tenant)
            report.append("Service: %s" % err.service)
            report.append("Host: %s" % err.host)
            report.append("Deployment: %s" % err.deployment.name)
            report.append("Event: %s" % err.event)
            report.append("When: %s" % dt.dt_from_decimal(err.when))
            exc = payload.get('exception')
            if exc:
                # group the messages ...
                exc_str = str(exc)
                report.append("Exception: %s" % exc_str)
                error_messages[exc_str] = error_messages.get(exc_str, 0) + 1

                # extract the code, if any ...
                # NOTE(review): assumes the exception payload is a dict;
                # confirm against the notification format.
                code = exc.get('kwargs', {}).get('code')
                if code:
                    codes[code] = codes.get(code, 0) + 1
                    # A return code is a more specific cause than
                    # 'http' / 'state' / '>60'.
                    failure_type = code
        report.append("Failure Type: %s" % failure_type)

        report.append('')
        report.append("Details:")
        # Re-read the request's events as full model objects; the detail
        # lines need fields the summary query above did not select.
        raws = (models.RawData.objects
                .filter(request_id=req)
                .exclude(event='compute.instance.exists')
                .order_by('when'))

        for raw in raws:
            report.append("H: %s E:%s, S:(%s->%s) T:(%s->%s)" %
                          (raw.host, raw.event,
                           raw.old_state, raw.state, raw.old_task,
                           raw.task))
        report.append('---------------------------------------')
        cause_key = (key, failure_type)
        causes[cause_key] = causes.get(cause_key, 0) + 1
|
|
|
|
|
|
def dump_breakdown(totals, label):
    """Append *label* followed by a Category/Count table to the report.

    ``totals`` maps a category to its count. The rendered table is sorted
    by the Count column. Output goes to the module-level ``report`` list;
    nothing is returned.
    """
    table = prettytable.PrettyTable(["Category", "Count"])
    for category, tally in totals.items():
        table.add_row([category, tally])
    report.append(label)
    table.sortby = 'Count'
    rendered = table.get_string()
    report.append(rendered)
|
|
|
|
|
|
def dump_summary(info, label):
    """Append a count/timing table for *info* to the report.

    ``info`` maps ``(operation, image_type_bits, cell)`` keys to a request
    count. Timing columns are read from the module-level ``durations`` dict,
    which uses the same keys and tracks all attempts, good and bad. After
    the main table, three roll-up tables (by operation, by cell and by
    platform) are appended through ``dump_breakdown``.

    Returns the sum of all counts in ``info``.
    """
    report.append("-- %s by operation by cell by platform --" % (label,))
    p = prettytable.PrettyTable(["Operation", "Cell", "Platform", "Count",
                                 "Min", "Max", "Avg"])
    for c in ["Count", "Min", "Max", "Avg"]:
        p.align[c] = 'r'

    total = 0
    op_totals = {}
    cell_totals = {}
    platform_totals = {}
    for key, count in info.items():
        operation, platform, cell = key
        readable = image_type.readable(platform)
        text = ", ".join(readable) if readable else "n/a"

        _min, _max, _count, _total = durations[key]
        _avg = float(_total) / float(_count)
        _fmin = dt.sec_to_str(_min)
        _fmax = dt.sec_to_str(_max)
        # Min, max and total share one unit, so the average is formatted
        # unscaled like the other two columns. Multiplying it by 100 made
        # the Avg column exceed Max.
        _favg = dt.sec_to_str(_avg)

        op_totals[operation] = op_totals.get(operation, 0) + count
        cell_totals[cell] = cell_totals.get(cell, 0) + count
        platform_totals[text] = platform_totals.get(text, 0) + count

        p.add_row([operation, cell, text, count, _fmin, _fmax, _favg])
        total += count
    p.sortby = 'Count'
    report.append(p.get_string())

    dump_breakdown(op_totals, "Total %s by Operation" % label)
    dump_breakdown(cell_totals, "Total %s by Cell" % label)
    dump_breakdown(platform_totals, "Total %s by Platform" % label)

    report.append('')
    return total
|
|
|
|
|
|
# Render the success and failure summaries; each call returns its total.
good = dump_summary(successes, "Success")
bad = dump_summary(failures, "Failures")
report.append("""
SUMMARY

=====================================================
Total Success: %d Total Failure: %d

""" % (good, bad))

# Failures per tenant.
tenant_table = prettytable.PrettyTable(["Tenant", "Count"])
for tenant_id, tenant_count in tenant_issues.items():
    tenant_table.add_row([tenant_id, tenant_count])
tenant_table.sortby = 'Count'
report.append("""
-- Errors by Tenant --
%s""" % tenant_table.get_string())

# Occurrences of each return code extracted from exception payloads.
code_table = prettytable.PrettyTable(["Return Code", "Count"])
for return_code, code_count in codes.items():
    code_table.add_row([return_code, code_count])
code_table.sortby = 'Count'
report.append("""
-- Return code counts --
%s""" % code_table.get_string())

# Failure cause per (operation, platform, cell) combination.
cause_table = prettytable.PrettyTable(["Cause", "Operation", "Cell",
                                       "Platform", "Count"])
for (op_key, cause), cause_count in causes.items():
    operation, platform, cell = op_key
    readable = image_type.readable(platform)
    text = ", ".join(readable) if readable else "n/a"
    cause_table.add_row([cause, operation, cell, text, cause_count])
cause_table.sortby = 'Count'
report.append("""
-- Cause breakdown --
%s""" % cause_table.get_string())

# Distinct exception texts, truncated to 80 characters for display.
message_table = prettytable.PrettyTable(["Count", "Message"])
for message, message_count in error_messages.items():
    message_table.add_row([message_count, message[:80]])
message_table.sortby = 'Count'
report.append("""
-- Error Message Counts --
%s""" % message_table.get_string())

# Echo the report text; the first entry is the metadata dict, not a line.
for line in report[1:]:
    print(line)

# Persist the whole report (metadata included) as a JsonReport row.
values = {
    'json': json.dumps(report),
    'created': dt.dt_to_decimal(datetime.datetime.utcnow()),
    'period_start': start,
    'period_end': end,
    'version': 1,
    'name': 'Error detail report',
}
report = models.JsonReport(**values)
report.save()
|