# Copyright (c) 2020 CLOUD&HEAT GmbH https://www.cloudandheat.com
# Copyright 2020 VEXXHOST, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Generate data for all routers and their L3 HA states.

There are scenarios where an L3 HA router can end up being active in many
different L3 agents.  This can be tricky to find and can cause chaos in the
system.  While effort should be put into finding the root cause, this script
helps alert on and catch any occurrences of it.
"""

import argparse
import glob
import os
import time

import psutil
from pyroute2 import NetNS

# Suffix of the Keepalived VRRP PID files looked up under ``ha_confs``.
_PID_SUFFIX = '.pid.keepalived-vrrp'

# Interface name prefixes that are inspected inside a router namespace.
_ROUTER_PORT_PREFIXES = ('qg-', 'qr-')


def get_pid_files(state_path):
    """Get all PID files for Keepalived.

    :param state_path: Directory that contains the ``ha_confs`` directory.
    :returns: List of paths matching ``ha_confs/*.pid.keepalived-vrrp``
              (empty when nothing matches).
    """
    pid_glob = os.path.join(state_path, 'ha_confs', '*' + _PID_SUFFIX)
    return glob.glob(pid_glob)


def verify_router_configured(router_id, master):
    """Verify if the router is properly configured on the system.

    Every ``qg-``/``qr-`` link inside the ``qrouter-<router_id>`` namespace
    is checked: a master router (``master == 1``) must have addresses on
    each of them, a non-master router (``master == 0``) must have none.

    :param router_id: Router identifier used to build the namespace name.
    :param master: 1 if the router is expected to be master, 0 otherwise.
    :returns: 1 when the addresses match the expected state, 0 when they do
              not or when opening the namespace raises ``OSError``.
    """
    # NOTE(review): depending on the pyroute2 version, NetNS() may create a
    # missing namespace by default instead of raising -- confirm whether an
    # explicit ``flags`` argument is wanted here.
    try:
        namespace = NetNS('qrouter-%s' % router_id)
    except OSError:
        return 0

    # Always release the namespace handle, even if a netlink query fails.
    try:
        for link in namespace.get_links():
            name = link.get_attr('IFLA_IFNAME')
            if not name or not name.startswith(_ROUTER_PORT_PREFIXES):
                continue

            has_addresses = len(namespace.get_addr(label=name)) != 0

            # Router is not master but addresses are configured.
            if master == 0 and has_addresses:
                return 0

            # Router is master but addresses are not configured.
            if master == 1 and not has_addresses:
                return 0
    finally:
        namespace.close()

    return 1


def _read_router_state(pid_file):
    """Return ``(router_id, master)`` for a running Keepalived, else None.

    ``None`` is returned when the PID file or the state file is missing,
    the PID file does not hold an integer, the process is not running, or
    the process command line does not mention the router ID.
    """
    if pid_file.endswith(_PID_SUFFIX):
        state_path = pid_file[:-len(_PID_SUFFIX)]
    else:
        state_path = pid_file
    router_id = os.path.basename(state_path)

    # The PID file can disappear or be empty while Keepalived (re)starts.
    try:
        with open(pid_file) as pid_fd:
            pid = int(pid_fd.read())
    except (FileNotFoundError, ValueError):
        return None

    # Check if the process is _actually_ running
    if not psutil.pid_exists(pid):
        return None

    # Check if the PID is indeed for the correct router.  The process may
    # exit between the existence check and the lookup.
    try:
        cmdline = " ".join(psutil.Process(pid).cmdline())
    except psutil.NoSuchProcess:
        return None
    if router_id not in cmdline:
        return None

    # Without a state file there is nothing to report for this router.
    try:
        with open(os.path.join(state_path, 'state')) as state_fd:
            master = 1 if 'master' in state_fd.read() else 0
    except FileNotFoundError:
        return None

    return router_id, master


def main():
    """Entry-point for script.

    Emits ``<prefix>_configured`` and ``<prefix>_master`` lines for every
    router with a running Keepalived.  The text is always printed to STDOUT
    and additionally written to ``--output`` when given.  With ``--loop N``
    the collection repeats every N seconds, otherwise it runs once.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--prefix", default="node_openstack_l3_router",
                        help="Prefix of metric")
    parser.add_argument("--state", default="/var/lib/neutron",
                        help="Neutron state path")
    parser.add_argument("--loop", type=int, help="Loop every N seconds")
    parser.add_argument("--output", help="Output file (default to STDOUT)")
    args = parser.parse_args()

    while True:
        lines = []

        # Sorted so the generated text is stable between runs.
        for pid_file in sorted(get_pid_files(args.state)):
            state = _read_router_state(pid_file)
            if state is None:
                continue
            router_id, master = state

            configured = verify_router_configured(router_id, master)
            lines.append('%s_configured{router_id="%s"} %d\n' % (
                args.prefix, router_id, configured
            ))
            lines.append('%s_master{router_id="%s"} %d\n' % (
                args.prefix, router_id, master
            ))

        output = "".join(lines)

        if args.output:
            with open(args.output, 'w') as output_fd:
                output_fd.write(output)
        print(output)

        if args.loop:
            time.sleep(args.loop)
        else:
            break