From c74bd48a48a8a87fe312ebba7da9a33387dcc262 Mon Sep 17 00:00:00 2001 From: Thomas Maddox Date: Wed, 12 Jun 2013 16:39:04 -0500 Subject: [PATCH 1/4] first checkin for os_type; grabbing from notification json --- reports/pretty.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/reports/pretty.py b/reports/pretty.py index 7867592..81c013b 100644 --- a/reports/pretty.py +++ b/reports/pretty.py @@ -77,6 +77,7 @@ def make_report(yesterday=None, start_hour=0, hours=24, percentile=97, failure_type = None operation = "aux" + os_type = "other" image_type_num = 0 for raw in raws: @@ -99,6 +100,15 @@ def make_report(yesterday=None, start_hour=0, hours=24, percentile=97, operation = cmd break + # Brace yourself. We are now painfully digging into the + # notification to get the os_type attribute + if os_type == "other" and raw.json: + notification = json.loads(raw.json) + if notification[1]: + os_type = notification[1].get('payload', {})\ + .get('image_meta', {})\ + .get('os_type', "other") + if raw.image_type: image_type_num |= raw.image_type @@ -117,7 +127,7 @@ def make_report(yesterday=None, start_hour=0, hours=24, percentile=97, if diff > too_long and failure_type == None: failure_type = too_long_col - key = (operation, image) + key = (operation, image, os_type) # Track durations for all attempts, good and bad ... 
_durations = durations.get(key, []) @@ -150,7 +160,7 @@ def make_report(yesterday=None, start_hour=0, hours=24, percentile=97, report.append(details) failure_types = ["4xx", "5xx", too_long_col, "state"] - cols = ["Operation", "Image", "Min", "Max", "Med", "%d%%" % percentile, + cols = ["Operation", "Image", "OS", "Min", "Max", "Med", "%d%%" % percentile, "Requests"] for failure_type in failure_types: cols.append("%s" % failure_type) @@ -161,7 +171,7 @@ def make_report(yesterday=None, start_hour=0, hours=24, percentile=97, failure_totals = {} for key, count in attempts.iteritems(): total += count - operation, image = key + operation, image, os_type = key breakdown = failures.get(key, {}) this_failure_pair = [] @@ -199,7 +209,7 @@ def make_report(yesterday=None, start_hour=0, hours=24, percentile=97, _fmedian = dt.sec_to_str(_median) _fpercentile = dt.sec_to_str(_percentile) - row = [operation, image, _fmin, _fmax, _fmedian, _fpercentile, count] + row = [operation, image, os_type, _fmin, _fmax, _fmedian, _fpercentile, count] for failure_count, failure_percentage in this_failure_pair: row.append(failure_count) row.append(failure_percentage) @@ -316,7 +326,7 @@ if __name__ == '__main__': for row in raw_report[2:]: frow = row[:] - for col in [8, 10, 12, 14]: + for col in [9, 11, 13, 15]: frow[col] = "%.1f%%" % (row[col] * 100.0) p.add_row(frow) print p From 6676e85c0fe42ba52156e93b9ec907ceff30776a Mon Sep 17 00:00:00 2001 From: Thomas Maddox Date: Thu, 13 Jun 2013 12:47:34 -0500 Subject: [PATCH 2/4] Changed to use image_type bit field to determine os_type --- reports/pretty.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/reports/pretty.py b/reports/pretty.py index 81c013b..851b0d7 100644 --- a/reports/pretty.py +++ b/reports/pretty.py @@ -77,7 +77,6 @@ def make_report(yesterday=None, start_hour=0, hours=24, percentile=97, failure_type = None operation = "aux" - os_type = "other" image_type_num = 0 for raw in raws: @@ -112,12 +111,20 @@ def 
make_report(yesterday=None, start_hour=0, hours=24, percentile=97, if raw.image_type: image_type_num |= raw.image_type + # Get image (base or snapshot) from image_type bit field image = "?" if image_type.isset(image_type_num, image_type.BASE_IMAGE): image = "base" if image_type.isset(image_type_num, image_type.SNAPSHOT_IMAGE): image = "snap" + #Get os_type from image_type bit field + os_type = "other" + if image_type.isset(image_type_num, image_type.LINUX_IMAGE): + os_type = "linux" + if image_type.isset(image_type_num, image_type.WINDOWS_IMAGE): + os_type = "windows" + if not start: continue From 8ecd7a4a819f7db79b02747c17ceeab9cefa6a7d Mon Sep 17 00:00:00 2001 From: Thomas Maddox Date: Thu, 20 Jun 2013 11:07:43 -0500 Subject: [PATCH 3/4] removed notification dive --- reports/pretty.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/reports/pretty.py b/reports/pretty.py index 851b0d7..436fb48 100644 --- a/reports/pretty.py +++ b/reports/pretty.py @@ -99,15 +99,6 @@ def make_report(yesterday=None, start_hour=0, hours=24, percentile=97, operation = cmd break - # Brace yourself. 
We are now painfully digging into the - # notification to get the os_type attribute - if os_type == "other" and raw.json: - notification = json.loads(raw.json) - if notification[1]: - os_type = notification[1].get('payload', {})\ - .get('image_meta', {})\ - .get('os_type', "other") - if raw.image_type: image_type_num |= raw.image_type From 93a07235442b455c03598c93b432f92e84b0c986 Mon Sep 17 00:00:00 2001 From: Thomas Maddox Date: Fri, 21 Jun 2013 13:08:03 -0500 Subject: [PATCH 4/4] clean up --- reports/error_details.py | 71 +++++++++++++++++----------------------- reports/pretty.py | 24 +++++++------- 2 files changed, 42 insertions(+), 53 deletions(-) diff --git a/reports/error_details.py b/reports/error_details.py index 1fd883d..a45bfd5 100644 --- a/reports/error_details.py +++ b/reports/error_details.py @@ -17,6 +17,16 @@ if __name__ != '__main__': # To mask unique identifiers for categorizing notifications def mask_msg(text): + # Needs order because of how precedence affects masking. + # + # Example: REQ_ID has a UUID in it, but the meaning is different + # in this context, so best to grab those first. + # + # LG_NUM usually represents a memory size; with the number of flavors + # this can create a lot of noise. + # + # The intent is to remove noise from unimportant subtleties + masking_regex = ( (1, 'REQ_ID', r"req-[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}" @@ -117,14 +127,21 @@ if __name__ == '__main__': day=yesterday.day) end = start + datetime.timedelta(hours=length-1, minutes=59, seconds=59) + deployments = {} + instance_map = {} # { uuid : [request_id, request_id, ...] 
} exception_counts = {} # { exception_message : count } event_counts = {} # { event_name : count } - metadata = {'report_format': 'json', - 'instances': instance_map, - 'exception_counts': exception_counts, - 'event_counts': event_counts - } + tenant_issues = {} + codes = {} + metadata = { + 'report_format': 'json', + 'instances': instance_map, + 'exception_counts': exception_counts, + 'event_counts': event_counts, + 'tenant_issues': tenant_issues, + 'codes': codes, + } # Tell Stacky to format as JSON and set placeholders for various summaries report = [metadata] @@ -132,8 +149,6 @@ if __name__ == '__main__': dstart = dt.dt_to_decimal(start) dend = dt.dt_to_decimal(end) - codes = {} - deployments = {} for deploy in models.Deployment.objects.all(): deployments[deploy.id] = deploy.name @@ -145,12 +160,6 @@ if __name__ == '__main__': expiry = 60 * 60 # 1 hour cmds = ['create', 'rebuild', 'rescue', 'resize', 'snapshot'] - failures = {} - causes = {} - durations = {} - successes = {} - tenant_issues = {} - for uuid_dict in updates: uuid = uuid_dict['instance'] @@ -224,42 +233,24 @@ if __name__ == '__main__': if not _start: continue - image = "?" - if image_type.isset(image_type_num, image_type.BASE_IMAGE): - image = "base" - if image_type.isset(image_type_num, image_type.SNAPSHOT_IMAGE): - image = "snap" - _end = _when diff = _end - _start - if diff > 3600 and failure_type is None: - failure_type = ">60" + if diff > 1800 and failure_type is None: + failure_type = ">30" - key = (operation, image_type_num, cell) - - # Track durations for all attempts, good and bad ... 
- duration_min, duration_max, duration_count, duration_total = \ - durations.get(key, (9999999, 0, 0, 0)) - duration_min = min(duration_min, diff) - duration_max = max(duration_max, diff) - duration_count += 1 - duration_total += diff - durations[key] = (duration_min, duration_max, duration_count, - duration_total) - - if not failure_type: - successes[key] = successes.get(key, 0) + 1 - else: + if failure_type: + key = (operation, image_type_num, cell) failed_request = {} message = [] # For exception message masking req_list.append(req) instance_map[uuid] = req_list failed_request['req'] = req + failed_request['uuid'] = uuid + failed_request['tenant'] = tenant failed_request['duration'] = "%.2f minutes" % (diff/60) failed_request['operation'] = operation failed_request['platform'] = image_type.readable(image_type_num) - failures[key] = failures.get(key, 0) + 1 tenant_issues[tenant] = tenant_issues.get(tenant, 0) + 1 if err_id: @@ -296,12 +287,12 @@ if __name__ == '__main__': codes[code] = codes.get(code, 0) + 1 failure_type = code failed_request['failure_type'] = failure_type + raws = models.RawData.objects.filter(request_id=req)\ .exclude(event='compute.instance.exists')\ .order_by('when') failed_request['details'] = [] - for raw in raws: failure_detail = {} failure_detail['host'] = raw.host @@ -310,13 +301,11 @@ if __name__ == '__main__': failure_detail['state'] = raw.state failure_detail['old_task'] = raw.old_task failure_detail['task'] = raw.task + failed_request['details'].append(failure_detail) report.append(failed_request) - cause_key = (key, failure_type) - causes[cause_key] = causes.get(cause_key, 0) + 1 - # Assign values to store in DB values = {'json': json.dumps(report), 'created': dt.dt_to_decimal(datetime.datetime.utcnow()), diff --git a/reports/pretty.py b/reports/pretty.py index 436fb48..654fc40 100644 --- a/reports/pretty.py +++ b/reports/pretty.py @@ -16,17 +16,16 @@ from stacktach import models def make_report(yesterday=None, start_hour=0, 
hours=24, percentile=97, store=False, region=None, too_long=1800): if not yesterday: - yesterday = datetime.datetime.utcnow().date() - \ - datetime.timedelta(days=1) + yesterday = datetime.datetime.utcnow().date() -\ + datetime.timedelta(days=1) rstart = datetime.datetime(year=yesterday.year, month=yesterday.month, - day=yesterday.day, hour=start_hour) + day=yesterday.day, hour=start_hour) rend = rstart + datetime.timedelta(hours=hours-1, minutes=59, seconds=59) dstart = dt.dt_to_decimal(rstart) dend = dt.dt_to_decimal(rend) - codes = {} too_long_col = '> %d' % (too_long / 60) cells = [] @@ -87,11 +86,12 @@ def make_report(yesterday=None, start_hour=0, hours=24, percentile=97, err = raw failure_type = 'http' - if raw.old_state != 'error' and raw.state == 'error': + if failure_type != 'state' and raw.old_state != 'error'\ + and raw.state == 'error': failure_type = 'state' if raw.old_state == 'error' and \ - (not raw.state in ['deleted', 'error']): + (not raw.state in ['deleted', 'error']): failure_type = None for cmd in cmds: @@ -110,7 +110,7 @@ def make_report(yesterday=None, start_hour=0, hours=24, percentile=97, image = "snap" #Get os_type from image_type bit field - os_type = "other" + os_type = "?" 
if image_type.isset(image_type_num, image_type.LINUX_IMAGE): os_type = "linux" if image_type.isset(image_type_num, image_type.WINDOWS_IMAGE): @@ -122,7 +122,7 @@ def make_report(yesterday=None, start_hour=0, hours=24, percentile=97, end = raw.when diff = end - start - if diff > too_long and failure_type == None: + if diff > too_long and failure_type is None: failure_type = too_long_col key = (operation, image, os_type) @@ -158,7 +158,7 @@ def make_report(yesterday=None, start_hour=0, hours=24, percentile=97, report.append(details) failure_types = ["4xx", "5xx", too_long_col, "state"] - cols = ["Operation", "Image", "OS", "Min", "Max", "Med", "%d%%" % percentile, + cols = ["Operation", "Image", "OS Type", "Min", "Max", "Med", "%d%%" % percentile, "Requests"] for failure_type in failure_types: cols.append("%s" % failure_type) @@ -178,7 +178,7 @@ def make_report(yesterday=None, start_hour=0, hours=24, percentile=97, # Sum for grand totals. failure_count = breakdown.get(failure_type, 0) failure_totals[failure_type] = \ - failure_totals.get(failure_type, 0) + failure_count + failure_totals.get(failure_type, 0) + failure_count # Failure percentage for this attempt. percentage = float(failure_count) / float(count) @@ -231,9 +231,9 @@ def valid_date(date): try: t = time.strptime(date, "%Y-%m-%d") return datetime.datetime(*t[:6]) - except Exception, e: + except Exception: raise argparse.ArgumentTypeError( - "'%s' is not in YYYY-MM-DD format." % date) + "'%s' is not in YYYY-MM-DD format." % date) if __name__ == '__main__':