Merge pull request #12 from adobdin/issue9

merge "Issue9" branch
Dmitry 2016-05-03 13:41:34 -07:00
commit 485a69d91e
7 changed files with 385 additions and 439 deletions

File: config.yaml (default configuration)

@@ -1,23 +1,29 @@
 ssh_opts:
-  - -oConnectTimeout=2
-  - -oStrictHostKeyChecking=no
-  - -oUserKnownHostsFile=/dev/null
-  - -oLogLevel=error
-  - -lroot
-  - -oBatchMode=yes
+  - '-oConnectTimeout=2'
+  - '-oStrictHostKeyChecking=no'
+  - '-oUserKnownHostsFile=/dev/null'
+  - '-oLogLevel=error'
+  - '-lroot'
+  - '-oBatchMode=yes'
 env_vars:
-  - OPENRC=/root/openrc
-  - IPTABLES_STR="iptables -nvL"
-fuelip: 127.0.0.1
-rqdir: ./rq
+  - 'OPENRC=/root/openrc'
+  - 'IPTABLES_STR="iptables -nvL"'
+fuelip: '127.0.0.1'
+rqdir: './rq'
 soft_filter:
   status: ['ready']
 timeout: 15
 compress_timeout: 3600
-log_files:
-  path: /var/log
-  filter:
-    default:
-      include: '(.)*'
-      exclude: '[-_]\d{8}$|atop[-_]|\.gz$'
+log_path: '/var/log'
+log_filter:
+  include: '(.)*'
+  exclude: '[-_]\d{8}$|atop[-_]|\.gz$'
+# by_role:
+#   contrail:
+#     log_filter:
+#       include: 'contrail'
+# by_node_id:
+#   3:
+#     env_vars:
+#       OPENRC: '/root/openrc'
+#       IPTABLES_STR: 'iptables -L'
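For reference, a minimal sketch of how the new commented-out override sections behave (mirroring Node.override_conf in timmy/nodes.py below: by_role settings are aggregated, by_node_id settings replace the defaults; the config dict here is illustrative):

# Sketch of the override semantics introduced in this PR; 'conf'
# mimics a parsed config.yaml, names follow the diff above.
conf = {
    'log_filter': {'include': '(.)*'},
    'by_role': {'contrail': {'log_filter': {'include': 'contrail'}}},
    'by_node_id': {3: {'env_vars': ['OPENRC=/root/openrc']}},
}

def effective_log_filters(node_id, roles, conf):
    # role-specific filters are appended to the default list,
    # id-specific settings override the attribute outright
    filters = [conf['log_filter']]
    for role in roles:
        role_conf = conf.get('by_role', {}).get(role, {})
        if 'log_filter' in role_conf:
            filters.append(role_conf['log_filter'])
    node_conf = conf.get('by_node_id', {}).get(node_id, {})
    if 'log_filter' in node_conf:
        filters = [node_conf['log_filter']]
    return filters

print(effective_log_filters(3, ['contrail'], conf))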

File: setup.py

@@ -17,7 +17,4 @@ setup(name='timmy',
       packages=["timmy"],
       data_files=rqfiles,
       include_package_data=True,
-      entry_points = {
-          'console_scripts': ['timmy = timmy.cli:main']
-      }
-      )
+      entry_points={'console_scripts': ['timmy=timmy.cli:main']})
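The reformatted entry_points is behaviorally identical. After installation, setuptools generates a 'timmy' console script roughly equivalent to this (illustrative, not the literal generated wrapper):

# Rough equivalent of the console script generated for
# 'timmy=timmy.cli:main'.
import sys
from timmy.cli import main

if __name__ == '__main__':
    sys.exit(main())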

File: timmy/cli.py

@@ -16,15 +16,16 @@
 # under the License.

 import argparse
-import timmy
 from timmy import nodes
 import logging
 import sys
 import os
 from timmy.conf import Conf
 from timmy import flock
+from timmy.tools import interrupt_wrapper


+@interrupt_wrapper
 def main(argv=None):
     if argv is None:
         argv = sys.argv
@@ -42,10 +43,12 @@ def main(argv=None):
     parser.add_argument('-l', '--logs',
                         help='collect logs from nodes',
                         action='store_true', dest='getlogs')
+    parser.add_argument('-L', '--logs-maxthreads', type=int, default=100,
+                        help="maximum simultaneous log collection operations")
     parser.add_argument('--only-logs',
                         action='store_true',
                         help='Collect only logs from fuel-node')
-    parser.add_argument('--log-file',
+    parser.add_argument('--log-file', default=None,
                         help='timmy log file')
     parser.add_argument('--fake-logs',
                         help="Do not collect logs, only calculate size",
@@ -63,12 +66,7 @@ def main(argv=None):
         loglevel = logging.DEBUG
     else:
         loglevel = logging.INFO
-    if args.log_file:
-        logfile = args.log_file
-    else:
-        logfile = None
-    logging.basicConfig(
-        filename=logfile,
+    logging.basicConfig(filename=args.log_file,
                         level=loglevel,
                         format='%(asctime)s %(levelname)s %(message)s')
     config = Conf()
@@ -92,15 +90,19 @@ def main(argv=None):
         lf = '/tmp/timmy-logs.lock'
         lock = flock.FLock(lf)
         if lock.lock():
-            n.get_node_file_list()
-            n.calculate_log_size()
-            if n.is_enough_space(config.archives):
-                n.create_log_archives(config.archives,
-                                      config.compress_timeout,
-                                      fake=args.fake_logs)
-            lock.unlock()
+            try:
+                n.get_node_file_list()
+                n.calculate_log_size()
+                if n.is_enough_space(config.archives):
+                    n.archive_logs(config.archives,
+                                   config.compress_timeout,
+                                   maxthreads=args.logs_maxthreads,
+                                   fake=args.fake_logs)
+            finally:
+                lock.unlock()
         else:
-            logging.warning('Unable to obtain lock %s, skipping "logs"-part' % lf)
+            logging.warning('Unable to obtain lock %s, skipping "logs"-part' %
+                            lf)
     logging.info("Nodes:\n%s" % n)
     print(n)
     return 0
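The new try/finally guarantees the lock is released even if log collection raises, and @interrupt_wrapper turns a Ctrl-C into a clean exit instead of a traceback. A standalone sketch of the combined pattern (the wrapper has the same shape as the one added in timmy/tools.py; the body is illustrative):

import logging

def interrupt_wrapper(f):
    # Same shape as timmy.tools.interrupt_wrapper in this PR.
    def wrapper(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except KeyboardInterrupt:
            logging.warning('Interrupted, exiting.')
    return wrapper

@interrupt_wrapper
def main():
    print('lock acquired')        # stands in for flock.FLock(...).lock()
    try:
        print('collecting logs')  # may raise or be interrupted
    finally:
        print('lock released')    # always runs now

main()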

File: timmy/conf.py

@@ -20,9 +20,9 @@ class Conf(object):
     compress_timeout = 3600
     archives = '/tmp/timmy/archives'
     cmds_archive = ''
-    log_files = {}
-    log_files['filter'] = {'default': {'include': "(.)*", 'exclude': '[-_]\d{8}$|atop[-_]|\.gz$'}}
-    log_files['path'] = '/var/log/'
+    log_path = '/var/log'
+    log_filter = {'include': '',
+                  'exclude': '[-_]\d{8}$|atop[-_]|\.gz$'}

     def __init__(self, **entries):
         self.__dict__.update(entries)
@@ -38,16 +38,19 @@ class Conf(object):
             conf = yaml.load(f)
             return Conf(**conf)
         except IOError as e:
-            logging.error("load_conf: I/O error(%s): %s" % (e.errno, e.strerror))
+            logging.error("load_conf: I/O error(%s): %s" %
+                          (e.errno, e.strerror))
             sys.exit(1)
         except ValueError:
             logging.error("load_conf: Could not convert data")
             sys.exit(1)
         except yaml.parser.ParserError as e:
-            logging.error("load_conf: Could not parse %s:\n%s" % (filename, str(e)))
+            logging.error("load_conf: Could not parse %s:\n%s" %
+                          (filename, str(e)))
             sys.exit(1)
         except:
-            logging.error("load_conf: Unexpected error: %s" % sys.exc_info()[0])
+            logging.error("load_conf: Unexpected error: %s" %
+                          sys.exc_info()[0])
             sys.exit(1)
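Conf keeps class-level defaults and load_conf overlays the parsed YAML via __dict__.update, so any top-level key in the config becomes an attribute. A minimal sketch of that behavior (standalone; the values are illustrative):

class Conf(object):
    # class-level defaults, as in timmy/conf.py
    timeout = 15
    log_path = '/var/log'
    log_filter = {'include': '', 'exclude': r'[-_]\d{8}$|atop[-_]|\.gz$'}

    def __init__(self, **entries):
        # every top-level YAML key becomes an instance attribute,
        # shadowing the class default of the same name
        self.__dict__.update(entries)

conf = Conf(**{'timeout': 30, 'fuelip': '10.20.0.2'})
print(conf.timeout)   # 30 (overridden)
print(conf.log_path)  # '/var/log' (class default)
print(conf.fuelip)    # '10.20.0.2' (new attribute)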

File: timmy/flock.py

@@ -46,7 +46,8 @@ class FLock:
             self.lockfd = os.open(self.lockfile,
                                   os.O_TRUNC | os.O_CREAT | os.O_RDWR)
-            # Acquire exclusive lock on the file, but don't block waiting for it
+            # Acquire exclusive lock on the file,
+            # but don't block waiting for it
             fcntl.flock(self.lockfd, fcntl.LOCK_EX | fcntl.LOCK_NB)
             # Writing to file is pointless, nobody can see it
@@ -54,7 +55,8 @@ class FLock:
             return True
         except (OSError, IOError), e:
-            # Lock cannot be acquired is okay, everything else reraise exception
+            # Lock cannot be acquired is okay,
+            # everything else reraise exception
             if e.errno in (errno.EACCES, errno.EAGAIN):
                 return False
             else:
@@ -67,7 +69,7 @@ class FLock:
             os.unlink(self.lockfile)
             # Just in case, let's not leak file descriptors
             os.close(self.lockfd)
-        except (OSError, IOError), e:
+        except (OSError, IOError):
             # Ignore error destroying lock file. See class doc about how
             # lockfile can be erased and everything still works normally.
             pass
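FLock itself only gets style fixes here. For context, the non-blocking acquire it wraps looks roughly like this (the fcntl calls are the real ones; the class is reduced to its essentials):

import errno
import fcntl
import os

lockfd = os.open('/tmp/timmy-demo.lock',
                 os.O_TRUNC | os.O_CREAT | os.O_RDWR)
try:
    # LOCK_NB makes flock fail immediately instead of blocking,
    # which is why FLock.lock() can return False to the caller
    fcntl.flock(lockfd, fcntl.LOCK_EX | fcntl.LOCK_NB)
    print('lock acquired')
except (OSError, IOError) as e:
    if e.errno in (errno.EACCES, errno.EAGAIN):
        print('already locked by another process')
    else:
        raise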

File: timmy/nodes.py

@@ -24,9 +24,8 @@ import json
 import os
 import logging
 import sys
-import threading
 import re
-from tools import *
+import tools

 ckey = 'cmds'
 fkey = 'files'
@@ -36,6 +35,9 @@ varlogdir = '/var/log'

 class Node(object):
+    override_by_id = ['ssh_opts', 'env_vars', 'log_path', 'log_filter']
+    aggregate_by_role = ['log_path', 'log_filter']
+
     def __init__(self, node_id, mac, cluster, roles, os_platform,
                  online, status, ip, conf):
         self.node_id = node_id
@@ -51,28 +53,33 @@ class Node(object):
         self.logsize = 0
         self.flogs = {}
         self.mapcmds = {}
+        self.logs = {}
         self.set_conf(conf)

-    def set_conf(self, conf):
-        logging.info(conf.ssh_opts)
-        self.ssh_opts = " ".join(conf.ssh_opts)
-        self.env_vars = " ".join(conf.env_vars)
-        self.log_files = conf.log_files
-        self.timeout = conf.timeout
-        try:
-            conf.by_node_id
-        except:
-            return
-        if self.node_id in conf.by_node_id:
-            if 'ssh_opts' in conf.by_node_id[self.node_id]:
-                self.ssh_opts = " ".join(conf.by_node_id[self.node_id]['ssh_opts'])
-            if 'env_vars' in conf.by_node_id[self.node_id]:
-                self.env_vars = " ".join(conf.by_node_id[self.node_id]['env_vars'])
-            if 'log_files' in conf.by_node_id[self.node_id]:
-                self.log_files = conf.by_node_id[self.node_id]['log_files']
+    def override_conf(self, conf):
+        for field in Node.aggregate_by_role:
+            for role in self.roles:
+                try:
+                    getattr(self, field).append(conf.by_role[self.role][field])
+                except:
+                    pass
+        for field in Node.override_by_id:
+            try:
+                setattr(self, field, conf.by_node_id[self.node_id][field])
+            except:
+                pass
+
+    def set_conf(self, conf):
+        self.ssh_opts = conf.ssh_opts
+        self.env_vars = conf.env_vars
+        self.log_path = list([conf.log_path])
+        self.log_filter = list([conf.log_filter])
+        self.timeout = conf.timeout
+        self.override_conf(conf)

     def set_files(self, dirname, key, ds, version):
         files = []
+        dfs = 'default'
         for role in self.roles:
             if 'by-role' in ds[key] and role in ds[key]['by-role'].keys():
                 for f in ds[key]['by-role'][role]:
@@ -86,9 +93,9 @@ class Node(object):
                 for f in ds[key]['by-os'][self.os_platform].keys():
                     files += [os.path.join(dirname, key, 'by-os',
                                            self.os_platform, f)]
-        if 'default' in ds[key] and 'default' in ds[key]['default']:
-            for f in ds[key]['default']['default'].keys():
-                files += [os.path.join(dirname, key, 'default', 'default', f)]
+        if dfs in ds[key] and dfs in ds[key][dfs]:
+            for f in ds[key][dfs][dfs].keys():
+                files += [os.path.join(dirname, key, dfs, dfs, f)]
         self.files[key] = sorted(set(files))
         logging.debug('set_files:\nkey: %s, node: %s, file_list: %s' %
                       (key, self.node_id, self.files[key]))
@@ -112,7 +119,8 @@ class Node(object):
     def add_files(self, dirname, key, ds):
         for role in self.roles:
-            if 'once-by-role' in ds[key] and role in ds[key]['once-by-role'].keys():
+            if ('once-by-role' in ds[key] and
+                    role in ds[key]['once-by-role'].keys()):
                 for f in ds[key]['once-by-role'][role]:
                     self.files[key] += [os.path.join(dirname, key,
                                                      'once-by-role', role, f)]
@@ -125,11 +133,11 @@ class Node(object):
         cl = 'cluster-%s' % self.cluster
         logging.debug('%s/%s/%s/%s' % (odir, label, cl, sn))
         ddir = os.path.join(odir, label, cl, sn)
-        mdir(ddir)
+        tools.mdir(ddir)
         for f in self.files[label]:
             logging.info('node:%s(%s), exec: %s' % (self.node_id, self.ip, f))
             if not fake:
-                outs, errs, code = ssh_node(ip=self.ip,
+                outs, errs, code = tools.ssh_node(ip=self.ip,
                                             filename=f,
                                             ssh_opts=self.ssh_opts,
                                             env_vars=self.env_vars,
@@ -154,7 +162,7 @@ class Node(object):
     def exec_simple_cmd(self, cmd, infile, outfile, timeout=15, fake=False):
         logging.info('node:%s(%s), exec: %s' % (self.node_id, self.ip, cmd))
         if not fake:
-            outs, errs, code = ssh_node(ip=self.ip,
+            outs, errs, code = tools.ssh_node(ip=self.ip,
                                         command=cmd,
                                         ssh_opts=self.ssh_opts,
                                         env_vars=self.env_vars,
@@ -171,7 +179,7 @@ class Node(object):
                      (self.node_id, self.ip, label))
         cmd = 'du -b %s' % self.data[label].replace('\n', ' ')
         logging.info('node: %s, logs du-cmd: %s' % (self.node_id, cmd))
-        outs, errs, code = ssh_node(ip=self.ip,
+        outs, errs, code = tools.ssh_node(ip=self.ip,
                                     command=cmd,
                                     sshopts=sshopts,
                                     sshvars='',
@@ -194,14 +202,14 @@ class Node(object):
         logging.info("node: %s, ip: %s, size: %s" %
                      (self.node_id, self.ip, self.logsize))

-    def get_files(self, label, sshopts, odir='info', timeout=15):
+    def get_files(self, label, odir='info', timeout=15):
         logging.info('node:%s(%s), filelist: %s' %
                      (self.node_id, self.ip, label))
         sn = 'node-%s' % self.node_id
         cl = 'cluster-%s' % self.cluster
         ddir = os.path.join(odir, label, cl, sn)
-        mdir(ddir)
-        outs, errs, code = get_files_rsync(ip=self.ip,
+        tools.mdir(ddir)
+        outs, errs, code = tools.get_files_rsync(ip=self.ip,
                                            data=self.data[label],
                                            ssh_opts=self.ssh_opts,
                                            dpath=ddir,
@@ -224,95 +232,24 @@ class Node(object):
         logging.debug('node: %s, key: %s, data:\n%s' %
                       (self.node_id, key, self.data[key]))

-    def apply_include_filter(self, lfilter):
-        logging.info('apply_include_filter: node: %s, filter: %s' % (self.node_id, lfilter))
-        flogs = {}
-        if 'include' in lfilter and lfilter['include'] is not None:
-            for f in self.dulogs.splitlines():
-                try:
-                    if ('include' in lfilter and re.search(lfilter['include'], f)):
-                        flogs[f.split("\t")[1]] = int(f.split("\t")[0])
-                    else:
-                        logging.debug("filter %s by %s" % (f, lfilter))
-                except re.error as e:
-                    logging.error('logs_include_filter: filter: %s, str: %s, re.error: %s' %
-                                  (lfilter, f, str(e)))
-                    sys.exit(5)
-            self.flogs.update(flogs)
-            return True
-        else:
-            return False
-
-    def apply_exclude_filter(self, lfilter):
-        logging.info('apply_exclude_filter: node: %s, filter: %s' % (self.node_id, lfilter))
-        rflogs = []
-        if 'exclude' in lfilter and lfilter['exclude'] is None:
-            return True
-        if 'exclude' in lfilter and lfilter['exclude'] is not None:
-            for f in self.flogs:
-                try:
-                    if re.search(lfilter['exclude'], f):
-                        rflogs.append(f)
-                        logging.info('logs_exclude_filter: %s' % f)
-                except re.error as e:
-                    logging.error('logs_include_filter: filter: %s, str: %s, re.error: %s' %
-                                  (lfilter, f, str(e)))
-                    sys.exit(5)
-            for f in rflogs:
-                logging.debug('apply_exclude_filter: node: %s remove file: %s from log list' % (self.node_id, f))
-                self.flogs.pop(f, None)
-            return True
-        else:
-            return False
-
-    def logs_filter(self, filterconf):
-        brstr = 'by_role'
-        flogs = {}
-        logging.info('logs_filter: node: %s, filter: %s' % (self.node_id, filterconf))
-        bynodeidinc = False
-        bynodeidexc = False
-        # need to check the following logic:
-        if 'by_node_id' in filterconf and self.node_id in filterconf['by_node_id']:
-            if self.apply_include_filter(filterconf['by_node_id'][self.node_id]):
-                bynodeidinc = True
-            if self.apply_exclude_filter(filterconf['by_node_id'][self.node_id]):
-                bynodeidexc = True
-        if bynodeidinc:
-            return
-        if bynodeidexc:
-            return
-        byrole = False
-        if brstr in filterconf:
-            for role in self.roles:
-                if role in filterconf[brstr].keys():
-                    logging.info('logs_filter: apply filter for role %s' % role)
-                    byrole = True
-                    if self.apply_include_filter(filterconf[brstr][role]):
-                        byrole = True
-        if not byrole:
-            if 'default' in filterconf:
-                self.apply_include_filter(filterconf['default'])
-            else:
-                # unexpected
-                logging.warning('default log filter is not defined')
-                self.flogs = {}
-        byrole = False
-        if brstr in filterconf:
-            for role in self.roles:
-                if role in filterconf[brstr].keys():
-                    logging.info('logs_filter: apply filter for role %s' % role)
-                    if self.apply_exclude_filter(filterconf[brstr][role]):
-                        byrole = True
-        if not byrole:
-            if 'default' in filterconf:
-                logging.info('logs_filter: apply default exclude filter')
-                self.apply_exclude_filter(filterconf['default'])
-
-    def log_size_from_find(self, path, sshopts, timeout=5):
-        cmd = ("find '%s' -type f -exec du -b {} +" % (path))
-        logging.info('log_size_from_find: node: %s, logs du-cmd: %s' % (self.node_id, cmd))
-        outs, errs, code = ssh_node(ip=self.ip,
+    def logs_filter(self):
+        result = {}
+        for re_pair in self.log_filter:
+            for f, s in self.logs.items():
+                if (('include' not in re_pair or
+                     re.search(re_pair['include'], f)) and
+                    ('exclude' not in re_pair or
+                     not re.search(re_pair['exclude'], f))):
+                    result[f] = s
+        self.logs = result
+
+    def logs_populate(self, timeout=5):
+        got_logs = False
+        for path in self.log_path:
+            cmd = ("find '%s' -type f -exec du -b {} +" % (path))
+            logging.info('logs_populate: node: %s, logs du-cmd: %s' %
+                         (self.node_id, cmd))
+            outs, errs, code = tools.ssh_node(ip=self.ip,
                                     command=cmd,
                                     ssh_opts=self.ssh_opts,
                                     env_vars='',
@@ -321,11 +258,15 @@ class Node(object):
             logging.error("node: %s, ip: %s, command: %s, "
                           "timeout code: %s, error message: %s" %
                           (self.node_id, self.ip, cmd, code, errs))
-            self.dulogs = ""
-            return False
-        self.dulogs = outs
-        logging.info('log_size_from_find: dulogs: %s' % (self.dulogs))
-        return True
+            break
+        if len(outs):
+            got_logs = True
+            for line in outs.split('\n'):
+                if '\t' in line:
+                    size, filename = line.split('\t')
+                    self.logs[filename] = int(size)
+        logging.debug('logs_populate: logs: %s' % (self.logs))
+        return got_logs

     def print_files(self):
         for k in self.files.keys():
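logs_populate replaces log_size_from_find: instead of stashing the raw find/du output in self.dulogs for later filtering, it parses it immediately into the new self.logs dict. A standalone sketch of that parsing (the sample output is illustrative):

# 'du -b' prints "<size>\t<path>" per file; logs_populate splits on
# the tab and builds {path: size}.
outs = '1024\t/var/log/messages\n2048\t/var/log/syslog\n'

logs = {}
for line in outs.split('\n'):
    if '\t' in line:
        size, filename = line.split('\t')
        logs[filename] = int(size)

print(logs)  # {'/var/log/messages': 1024, '/var/log/syslog': 2048}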
@@ -349,14 +290,14 @@ class Nodes(object):
     """Class nodes """

     def __init__(self, cluster, extended, conf, filename=None):
-        import_subprocess()
         self.dirname = conf.rqdir.rstrip('/')
         if (not os.path.exists(self.dirname)):
             logging.error("directory %s doesn't exist" % (self.dirname))
             sys.exit(1)
-        self.files = get_dir_structure(conf.rqdir)[os.path.basename(self.dirname)]
+        dn = os.path.basename(self.dirname)
+        self.files = tools.get_dir_structure(conf.rqdir)[dn]
         if (conf.fuelip is None) or (conf.fuelip == ""):
-            logging.error('Nodes: looks like fuelip is not set(%s)' % conf.fuelip)
+            logging.error('looks like fuelip is not set(%s)' % conf.fuelip)
             sys.exit(7)
         self.fuelip = conf.fuelip
         self.conf = conf
@@ -375,7 +316,6 @@ class Nodes(object):
         self.load_nodes(conf)
         self.get_version()

-
     def __str__(self):
         s = "#node-id, cluster, admin-ip, mac, os, roles, online, status\n"
         for node in sorted(self.nodes.values(), key=lambda x: x.node_id):
@@ -397,7 +337,7 @@ class Nodes(object):
                              online=True,
                              ip=self.fuelip,
                              conf=conf)
-        nodes_json, err, code = ssh_node(ip=self.fuelip,
+        nodes_json, err, code = tools.ssh_node(ip=self.fuelip,
                                          command=fuel_node_cmd,
                                          ssh_opts=fuelnode.ssh_opts,
                                          env_vars="",
@@ -410,17 +350,20 @@ class Nodes(object):
     def pass_hard_filter(self, node):
         if self.conf.hard_filter:
-            if self.conf.hard_filter.status and (node.status not in self.conf.hard_filter.status):
-                logging.info("hard filter by status: excluding node-%s" % node.node_id)
+            if (self.conf.hard_filter.status and
+                    (node.status not in self.conf.hard_filter.status)):
+                logging.info("hard filter by status: excluding node-%s" %
+                             node.node_id)
                 return False
             if (isinstance(self.conf.hard_filter.online, bool) and
-                    (bool(node.online) != bool(self.conf.hard_filter.online))):
-                logging.info("hard filter by online: excluding node-%s" % node.node_id)
+                    (bool(node.online) != self.conf.hard_filter.online)):
+                logging.info("hard filter by online: excluding node-%s" %
+                             node.node_id)
                 return False
             if (self.conf.hard_filter.node_ids and
-                    ((int(node.node_id) not in self.conf.hard_filter.node_ids) and
-                     (str(node.node_id) not in self.conf.hard_filter.node_ids))):
-                logging.info("hard filter by ids: excluding node-%s" % node.node_id)
+                    (int(node.node_id) not in self.conf.hard_filter.node_ids)):
+                logging.info("hard filter by ids: excluding node-%s" %
+                             node.node_id)
                 return False
             if self.conf.hard_filter.roles:
                 ok_roles = []
@@ -428,7 +371,8 @@ class Nodes(object):
                     if role in self.conf.hard_filter.roles:
                         ok_roles.append(role)
                 if not ok_roles:
-                    logging.info("hard filter by roles: excluding node-%s" % node.node_id)
+                    logging.info("hard filter by roles: excluding node-%s" %
+                                 node.node_id)
                     return False
         return True
@@ -468,9 +412,8 @@ class Nodes(object):
     def get_version(self):
         cmd = "awk -F ':' '/release/ {print \$2}' /etc/nailgun/version.yaml"
-        logging.info('get_version:%s' % self.conf.ssh_opts)
         fuelnode = self.nodes[self.fuelip]
-        release, err, code = ssh_node(ip=fuelnode.ip,
+        release, err, code = tools.ssh_node(ip=fuelnode.ip,
                                       command=cmd,
                                       ssh_opts=fuelnode.ssh_opts,
                                       env_vars="",
@@ -485,23 +428,23 @@ class Nodes(object):
     def get_release(self):
         cmd = "awk -F ':' '/fuel_version/ {print \$2}' /etc/astute.yaml"
         for node in self.nodes.values():
-            # skip master
             if node.node_id == 0:
+                # skip master
                 node.release = self.version
             if (node.node_id != 0) and (node.status == 'ready'):
-                release, err, code = ssh_node(ip=node.ip,
-                                              command=cmd,
-                                              sshopts=self.sshopts,
-                                              sshvars='',
-                                              timeout=self.timeout,
-                                              filename=None)
+                release, err, code = tools.ssh_node(ip=node.ip,
+                                                    command=cmd,
+                                                    ssh_opts=node.sshopts,
+                                                    timeout=node.timeout)
                 if code != 0:
-                    logging.warning("get_release: node: %s: Can't get node release" %
-                                    (node.node_id))
-                    node.release = self.version
+                    logging.warning("get_release: node: %s: %s" %
+                                    (node.node_id, "Can't get node release"))
+                    node.release = None
                     continue
-                else:
                 node.release = release.strip('\n "\'')
-                logging.info("get_release: node: %s, release: %s" % (node.node_id, node.release))
+                logging.info("get_release: node: %s, release: %s" %
+                             (node.node_id, node.release))

     def get_node_file_list(self):
         for key in self.files.keys():
@@ -531,71 +474,61 @@ class Nodes(object):
         for node in self.nodes.values():
             logging.debug('%s' % node.files[ckey])

+    def exec_filter(self, node):
+        f = self.conf.soft_filter
+        if f:
+            result = (((not f.status) or (node.status in f.status)) and
+                      ((not f.roles) or (node.role in f.roles)) and
+                      ((not f.node_ids) or (node.node_id in f.node_ids)))
+        else:
+            result = True
+        return result and (((self.cluster and node.cluster != 0 and
+                             str(self.cluster) == str(node.cluster)) or not
+                            self.cluster) and node.online)
+
     def launch_ssh(self, odir='info', timeout=15, fake=False):
         lock = flock.FLock('/tmp/timmy-cmds.lock')
         if not lock.lock():
             logging.warning('Unable to obtain lock, skipping "cmds"-part')
             return ''
-        label = ckey
-        threads = []
-        sem = threading.BoundedSemaphore(value=100)
-        for node in self.nodes.values():
-            if (self.cluster and str(self.cluster) != str(node.cluster) and
-                    node.cluster != 0):
-                continue
-            if node.status in self.conf.soft_filter.status and node.online:
-                sem.acquire(True)
-                t = threading.Thread(target=semaphore_release,
-                                     args=(sem,
-                                           node.exec_cmd,
-                                           node.node_id,
-                                           [label,
-                                            odir,
-                                            fake]))
-                threads.append(t)
-                t.start()
-        for t in threads:
-            t.join()
-        lock.unlock()
-
-    def filter_logs(self):
-        for node in self.nodes.values():
-            if (self.cluster and str(self.cluster) != str(node.cluster) and
-                    node.cluster != 0):
-                continue
-            if node.status in self.conf.soft_filter.status and node.online:
-                node.logs_filter(self.conf.log_files['filter'])
-                logging.debug('filter logs: node-%s: filtered logs: %s' %
-                              (node.node_id, node.flogs))
+        try:
+            label = ckey
+            run_items = []
+            for n in [n for n in self.nodes.values() if self.exec_filter(n)]:
+                run_items.append(tools.RunItem(target=n.exec_cmd,
+                                               args={'label': label,
+                                                     'odir': odir,
+                                                     'fake': fake}))
+            tools.run_batch(run_items, 100)
+        finally:
+            lock.unlock()

     def calculate_log_size(self, timeout=15):
-        lsize = 0
-        for node in self.nodes.values():
-            if (self.cluster and str(self.cluster) != str(node.cluster) and
-                    node.cluster != 0):
-                continue
-            if node.status in self.conf.soft_filter.status and node.online:
-                if not node.log_size_from_find(self.conf.log_files['path'], 5):
-                    logging.warning("can't get log file list from node %s" % node.node_id)
-        self.filter_logs()
-        for node in self.nodes.values():
-            for f in node.flogs:
-                lsize += node.flogs[f]
-            for fl in sorted(node.flogs.items(), key=lambda x: x[1]):
-                logging.debug(fl)
-        logging.info('Full log size on nodes(with fuel): %s bytes' % lsize)
-        self.alogsize = lsize / 1024
+        total_size = 0
+        for node in [n for n in self.nodes.values() if self.exec_filter(n)]:
+            if not node.logs_populate(5):
+                logging.warning("can't get log file list from node %s" %
+                                node.node_id)
+            else:
+                node.logs_filter()
+                logging.debug('filter logs: node-%s: filtered logs: %s' %
+                              (node.node_id, node.logs))
+                total_size += sum(node.logs.values())
+        logging.info('Full log size on nodes(with fuel): %s bytes' %
+                     total_size)
+        self.alogsize = total_size / 1024

     def is_enough_space(self, directory, coefficient=1.2):
-        mdir(directory)
-        outs, errs, code = free_space(directory, timeout=1)
+        tools.mdir(directory)
+        outs, errs, code = tools.free_space(directory, timeout=1)
         if code != 0:
             logging.error("Can't get free space: %s" % errs)
             return False
         try:
             fs = int(outs.rstrip('\n'))
         except:
-            logging.error("is_enough_space: can't get free space\nouts: %s" % outs)
+            logging.error("is_enough_space: can't get free space\nouts: %s" %
+                          outs)
             return False
         logging.info('logsize: %s Kb, free space: %s Kb' % (self.alogsize, fs))
         if (self.alogsize*coefficient > fs):
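The new exec_filter consolidates the cluster/soft-filter/online checks that were previously copy-pasted into each loop. A reduced, standalone sketch of the predicate (the node and filter objects are faked with simple classes):

class F:  # stand-in for conf.soft_filter
    status = ['ready']
    roles = []
    node_ids = []

class N:  # stand-in for a Node
    status, role, node_id, cluster, online = 'ready', 'compute', 4, 1, True

def exec_filter(node, f, cluster):
    # empty filter lists are falsy, so they match everything
    if f:
        result = (((not f.status) or (node.status in f.status)) and
                  ((not f.roles) or (node.role in f.roles)) and
                  ((not f.node_ids) or (node.node_id in f.node_ids)))
    else:
        result = True
    # the node must be online; with a cluster filter set, it must
    # also belong to that cluster
    return result and (((cluster and node.cluster != 0 and
                         str(cluster) == str(node.cluster)) or not
                        cluster) and node.online)

print(exec_filter(N(), F(), 1))  # True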
@@ -606,9 +539,9 @@ class Nodes(object):
     def create_archive_general(self, directory, outfile, timeout):
         cmd = "tar jcf '%s' -C %s %s" % (outfile, directory, ".")
-        mdir(self.conf.archives)
+        tools.mdir(self.conf.archives)
         logging.debug("create_archive_general: cmd: %s" % cmd)
-        outs, errs, code = launch_cmd(command=cmd,
+        outs, errs, code = tools.launch_cmd(command=cmd,
                                       timeout=timeout)
         if code != 0:
             logging.error("Can't create archive %s" % (errs))
@@ -617,10 +550,11 @@ class Nodes(object):
         '''Returns interface speed through which logs will be dowloaded'''
         for node in self.nodes.values():
             if not (node.ip == 'localhost' or node.ip.startswith('127.')):
-                cmd = "cat /sys/class/net/$(/sbin/ip -o route get %s | cut -d' ' -f3)/speed" % node.ip
-                out, err, code = launch_cmd(cmd, node.timeout)
+                cmd = ("%s$(/sbin/ip -o route get %s | cut -d' ' -f3)/speed" %
+                       ('cat /sys/class/net/', node.ip))
+                out, err, code = tools.launch_cmd(cmd, node.timeout)
                 if code != 0:
-                    logging.error("can't get interface speed: error message: %s" % err)
+                    logging.error("can't get interface speed: error: %s" % err)
                     return defspeed
                 try:
                     speed = int(out)
@@ -628,105 +562,65 @@ class Nodes(object):
             speed = defspeed
         return speed

-    def create_log_archives(self, outdir, timeout, fake=False, maxthreads=10, speed=100):
+    def archive_logs(self, outdir, timeout,
+                     fake=False, maxthreads=10, speed=100):
         if fake:
-            logging.info('create_log_archives: skip creating archives(fake:%s)' % fake)
+            logging.info('archive_logs:skip creating archives(fake:%s)' % fake)
             return
-        threads = []
         txtfl = []
         speed = self.find_adm_interface_speed(speed)
-        if len(self.nodes) > maxthreads:
-            speed = int(speed * 0.9 / maxthreads)
-        else:
-            speed = int(speed * 0.9 / len(self.nodes))
-        pythonslowpipe = 'import sys\n'
-        pythonslowpipe += 'import time\n'
-        pythonslowpipe += 'while 1:\n'
-        pythonslowpipe += ' a = sys.stdin.read(int(1250*%s))\n' % speed
-        pythonslowpipe += ' if a:\n'
-        pythonslowpipe += '  sys.stdout.write(a)\n'
-        pythonslowpipe += '  time.sleep(0.01)\n'
-        pythonslowpipe += ' else:\n'
-        pythonslowpipe += '  break\n'
-        sem = threading.BoundedSemaphore(value=maxthreads)
-        for node in self.nodes.values():
-            if (self.cluster and str(self.cluster) != str(node.cluster) and
-                    node.cluster != 0):
-                continue
-            if node.status in self.conf.soft_filter.status and node.online:
-                sem.acquire(True)
-                node.archivelogsfile = os.path.join(outdir,
-                                                    'logs-node-'+str(node.node_id) + '.tar.gz')
-                mdir(outdir)
-                logslistfile = node.archivelogsfile + '.txt'
-                txtfl.append(logslistfile)
-                try:
-                    with open(logslistfile, 'w') as llf:
-                        for line in node.flogs:
-                            llf.write(line+"\0")
-                except:
-                    logging.error("create_archive_logs: Can't write to file %s" % logslistfile)
-                    continue
-                if node.ip == 'localhost' or node.ip.startswith('127.'):
-                    cmd = "tar --gzip --create --file - --null --files-from -"
-                else:
-                    cmd = "tar --gzip --create --file - --null --files-from - | python -c '%s'" % pythonslowpipe
-                t = threading.Thread(target=semaphore_release,
-                                     args=(sem,
-                                           node.exec_simple_cmd,
-                                           node.node_id,
-                                           [cmd,
-                                            logslistfile,
-                                            node.archivelogsfile,
-                                            timeout]
-                                           )
-                                     )
-                threads.append(t)
-                t.start()
-        while True:
-            try:
-                tt = []
-                for t in threads:
-                    if t is not None and t.isAlive():
-                        t.join(1)
-                    else:
-                        tt.append(t)
-                if len(threads) == len(tt):
-                    break
-            except KeyboardInterrupt:
-                # sys.exit(9)
-                killall_children(self.timeout)
-                raise KeyboardInterrupt()
+        speed = int(speed * 0.9 / min(maxthreads, len(self.nodes)))
+        pythonslowpipe = tools.slowpipe % speed
+        run_items = []
+        for node in [n for n in self.nodes.values() if self.exec_filter(n)]:
+            node.archivelogsfile = os.path.join(outdir,
+                                                'logs-node-%s.tar.gz' %
+                                                str(node.node_id))
+            tools.mdir(outdir)
+            logslistfile = node.archivelogsfile + '.txt'
+            txtfl.append(logslistfile)
+            try:
+                with open(logslistfile, 'w') as llf:
+                    for filename in node.logs:
+                        llf.write(filename+"\0")
+            except:
+                logging.error("create_archive_logs: Can't write to file %s" %
+                              logslistfile)
+                continue
+            cmd = "tar --gzip --create --file - --null --files-from -"
+            if not (node.ip == 'localhost' or node.ip.startswith('127.')):
+                cmd = ' '.join([cmd, "| python -c '%s'" % pythonslowpipe])
+            args = {'cmd': cmd,
                    'infile': logslistfile,
+                    'outfile': node.archivelogsfile,
+                    'timeout': timeout}
+            run_items.append(tools.RunItem(target=node.exec_simple_cmd,
+                                           args=args))
+        tools.run_batch(run_items, maxthreads)
         for tfile in txtfl:
             try:
                 os.remove(tfile)
             except:
-                logging.error("create_log_archives: can't delete file %s" % tfile)
+                logging.error("archive_logs: can't delete file %s" % tfile)

     def get_conf_files(self, odir=fkey, timeout=15):
         if fkey not in self.files:
-            logging.warning("get_conf_files: %s directory does not exist" % fkey)
+            logging.warning("get_conf_files: %s directory doesn't exist" %
+                            fkey)
             return
         lock = flock.FLock('/tmp/timmy-files.lock')
         if not lock.lock():
             logging.warning('Unable to obtain lock, skipping "files"-part')
             return ''
-        label = fkey
-        threads = []
-        for node in self.nodes.values():
-            if (self.cluster and str(self.cluster) != str(node.cluster) and
-                    node.cluster != 0):
-                continue
-            if node.status in self.conf.soft_filter.status and node.online:
-                t = threading.Thread(target=node.get_files,
-                                     args=(label,
-                                           odir,))
-                threads.append(t)
-                t.start()
-        for t in threads:
-            t.join()
-        lock.unlock()
+        try:
+            label = fkey
+            run_items = []
+            for n in [n for n in self.nodes.values() if self.exec_filter(n)]:
+                run_items.append(tools.RunItem(target=n.get_files,
+                                               args={'label': label,
+                                                     'odir': odir}))
+            tools.run_batch(run_items, 10)
+        finally:
+            lock.unlock()
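All three fan-out call sites (launch_ssh, archive_logs, get_conf_files) now share one shape: build a list of RunItems, then hand them to run_batch. A reduced sketch of that shape (RunItem/run_batch are added in timmy/tools.py below; the worker is an illustrative stand-in for node.exec_cmd):

import multiprocessing

class RunItem():
    def __init__(self, target, args):
        self.target = target
        self.args = args
        self.process = None

def worker(label, node_id):  # illustrative stand-in for node.exec_cmd
    print('%s on node-%s' % (label, node_id))

if __name__ == '__main__':
    run_items = [RunItem(target=worker,
                         args={'label': 'cmds', 'node_id': i})
                 for i in range(4)]
    # run_batch(run_items, 2) would start at most 2 worker processes
    # at a time and join them all before returning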

File: timmy/tools.py

@@ -22,37 +22,73 @@ tools module
 import os
 import logging
 import sys
+import threading
+import multiprocessing
+import subprocess


-def import_subprocess():
-    if 'subprocess' not in globals():
-        global subprocess
-        global ok_python
-        try:
-            import subprocess32 as subprocess
-            logging.info("using improved subprocess32 module\n")
-            ok_python = True
-        except:
-            import subprocess
-            logging.warning(("Please upgrade the module 'subprocess' to the latest version: "
-                             "https://pypi.python.org/pypi/subprocess32/"))
-            ok_python = True
-            if sys.version_info > (2, 7, 0):
-                ok_python = False
-                logging.warning('this subprocess module does not support timeouts')
-    else:
-        logging.info('subprocess is already loaded')
+slowpipe = '''
+import sys
+import time
+while 1:
+    a = sys.stdin.read(int(1250*%s))
+    if a:
+        sys.stdout.write(a)
+        time.sleep(0.01)
+    else:
+        break
+'''

-def semaphore_release(sema, func, node_id, params):
-    logging.info('start ssh node: %s' % node_id)
-    try:
-        result = func(*params)
-    except:
-        logging.error("failed to launch: %s on node %s" % node_id)
-    finally:
-        sema.release()
-    logging.info('finish ssh node: %s' % node_id)
-    return result
+
+def interrupt_wrapper(f):
+    def wrapper(*args, **kwargs):
+        try:
+            f(*args, **kwargs)
+        except KeyboardInterrupt:
+            logging.warning('Interrupted, exiting.')
+    return wrapper
+
+
+class RunItem():
+    def __init__(self, target, args):
+        self.target = target
+        self.args = args
+        self.process = None
+
+
+class SemaphoreProcess(multiprocessing.Process):
+    def __init__(self, semaphore, target, args):
+        multiprocessing.Process.__init__(self)
+        self.semaphore = semaphore
+        self.target = target
+        self.args = args
+
+    def run(self):
+        try:
+            self.target(**self.args)
+        finally:
+            logging.debug('finished call: %s' % self.target)
+            self.semaphore.release()
+
+
+def run_batch(item_list, maxthreads):
+    semaphore = multiprocessing.BoundedSemaphore(maxthreads)
+    try:
+        for run_item in item_list:
+            semaphore.acquire(True)
+            p = SemaphoreProcess(target=run_item.target,
+                                 semaphore=semaphore,
+                                 args=run_item.args)
+            run_item.process = p
+            p.start()
+        for run_item in item_list:
+            run_item.process.join()
+            run_item.process = None
+    except KeyboardInterrupt:
+        for run_item in item_list:
+            if run_item.process:
+                run_item.process.terminate()
+        raise KeyboardInterrupt()
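The slowpipe template throttles the tar stream on the receiving side: with %s interpolated as the per-node speed budget, each loop iteration reads 1250*speed bytes and then sleeps 0.01 s, so the sustained rate works out to speed Mbit/s (an upper bound, since read/write time is not counted). A quick check of that arithmetic:

# slowpipe reads 1250*speed bytes per 0.01 s tick
speed = 100                          # Mbit/s budget for one node
bytes_per_tick = 1250 * speed
rate_bytes = bytes_per_tick / 0.01   # 12,500,000 B/s
rate_mbit = rate_bytes * 8 / 1e6     # 100.0 Mbit/s
print(rate_mbit)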
def get_dir_structure(rootdir): def get_dir_structure(rootdir):
@@ -85,27 +121,34 @@ def mdir(directory):

 def launch_cmd(command, timeout):
+    def _timeout_terminate(pid):
+        try:
+            os.kill(pid, 15)
+            logging.error("launch_cmd: pid %d killed by timeout" % pid)
+        except:
+            pass
+
     logging.info('launch_cmd: command %s' % command)
     p = subprocess.Popen(command,
                          shell=True,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE)
-    if ok_python:
-        try:
-            outs, errs = p.communicate(timeout=timeout+1)
-        except subprocess.TimeoutExpired:
-            p.kill()
-            outs, errs = p.communicate()
-            logging.error("command: %s err: %s, returned: %s" %
-                          (command, errs, p.returncode))
-    else:
-        try:
-            outs, errs = p.communicate()
-        except:
-            p.kill()
-            outs, errs = p.communicate()
-            logging.error("command: %s err: %s, returned: %s" %
-                          (command, errs, p.returncode))
+    timeout_killer = None
+    try:
+        timeout_killer = threading.Timer(timeout, _timeout_terminate, [p.pid])
+        timeout_killer.start()
+        outs, errs = p.communicate()
+    except:
+        try:
+            p.kill()
+        except:
+            pass
+        outs, errs = p.communicate()
+        logging.error("command: %s err: %s, returned: %s" %
+                      (command, errs, p.returncode))
+    finally:
+        if timeout_killer:
+            timeout_killer.cancel()
     logging.debug("ssh return: err:%s\nouts:%s\ncode:%s" %
                   (errs, outs, p.returncode))
     logging.info("ssh return: err:%s\ncode:%s" %
@@ -115,8 +158,10 @@ def launch_cmd(command, timeout):

 def ssh_node(ip, command, ssh_opts=[], env_vars=[], timeout=15, filename=None,
              inputfile=None, outputfile=None, prefix='nice -n 19 ionice -c 3'):
-    #ssh_opts = " ".join(ssh_opts)
-    #env_vars = " ".join(env_vars)
+    if type(ssh_opts) is list:
+        ssh_opts = ' '.join(ssh_opts)
+    if type(env_vars) is list:
+        env_vars = ' '.join(env_vars)
     if (ip in ['localhost', '127.0.0.1']) or ip.startswith('127.'):
         logging.info("skip ssh")
         bstr = "%s timeout '%s' bash -c " % (
@@ -135,9 +180,12 @@ def ssh_node(ip, command, ssh_opts=[], env_vars=[], timeout=15, filename=None,
         logging.info("ssh_node: inputfile selected, cmd: %s" % cmd)
     if outputfile is not None:
         cmd += ' > "' + outputfile + '"'
+    cmd = ("trap 'kill $pid' 15; " +
+           "trap 'kill $pid' 2; " + cmd + '&:; pid=$!; wait $!')
     outs, errs, code = launch_cmd(cmd, timeout)
     return outs, errs, code

+
 def killall_children(timeout):
     cmd = 'ps -o pid --ppid %d --noheaders' % os.getpid()
     out, errs, code = launch_cmd(cmd, timeout)
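The trap prefix is what makes the command killable by the Timer in launch_cmd: the outer bash backgrounds the payload, records its pid, waits on it, and forwards SIGTERM/SIGINT to that pid instead of orphaning it. Schematically (the payload is illustrative):

cmd = 'sleep 60'  # illustrative payload
wrapped = ("trap 'kill $pid' 15; " +
           "trap 'kill $pid' 2; " + cmd + '&:; pid=$!; wait $!')
print(wrapped)
# bash runs 'sleep 60&' in the background (the ':' is a no-op),
# captures its pid via $!, waits on it, and on SIGTERM/SIGINT
# kills that pid rather than leaving it running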
@@ -166,7 +214,10 @@ def killall_children(timeout):
         except:
             logging.warning('could not kill %s' % p)

+
 def get_files_rsync(ip, data, ssh_opts, dpath, timeout=15):
+    if type(ssh_opts) is list:
+        ssh_opts = ' '.join(ssh_opts)
     if (ip in ['localhost', '127.0.0.1']) or ip.startswith('127.'):
         logging.info("skip ssh rsync")
         cmd = ("timeout '%s' rsync -avzr --files-from=- / '%s'"
@@ -185,15 +236,6 @@ def get_files_rsync(ip, data, ssh_opts, dpath, timeout=15):
                          stdin=subprocess.PIPE,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE)
-    if ok_python:
-        try:
-            outs, errs = p.communicate(input=data, timeout=timeout+1)
-        except subprocess.TimeoutExpired:
-            p.kill()
-            outs, errs = p.communicate()
-            logging.error("ip: %s, command: %s err: %s, returned: %s" %
-                          (ip, cmd, errs, p.returncode))
-    else:
     try:
         outs, errs = p.communicate(input=data)
     except: