Mohammed Naser ce39dc5ed7 Fix NetNS creation when PID is re-used
Change-Id: I464d769d58998fcb85d1de6ddc6e11cb6525fdf9
2020-11-05 22:45:06 +00:00

130 lines
3.9 KiB
Python

# Copyright (c) 2020 CLOUD&HEAT GmbH https://www.cloudandheat.com
# Copyright 2020 VEXXHOST, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Generate data for all routers and their L3 HA states.
There are scenarios where an L3 HA router can end up being active in many
different L3 agents. This can be tricky to find and can cause chaos in the
system. While effort should go into finding the root cause, this script helps
alert on and catch any occurrences of it.
"""
import argparse
import glob
import time
import os
import psutil
from pyroute2 import NetNS
def get_pid_files(state_path):
    """Locate the Keepalived VRRP PID files below a Neutron state path.

    Files are matched with the ``*.pid.keepalived-vrrp`` pattern inside the
    ``ha_confs`` directory of ``state_path``.

    :param state_path: Base directory that contains ``ha_confs``.
    :returns: List of matching paths as produced by ``glob.glob`` (empty
              when nothing matches).
    """
    pattern = os.path.join(state_path, 'ha_confs', '*.pid.keepalived-vrrp')
    return glob.glob(pattern)
def verify_router_configured(router_id, master):
    """Verify that the router's interfaces match its HA role on this host.

    Opens the ``qrouter-<router_id>`` network namespace and inspects every
    link whose name starts with ``qg-`` or ``qr-``. When ``master`` is 0 those
    links must carry no addresses; when ``master`` is 1 each of them must
    carry at least one address.

    :param router_id: Router ID used to build the namespace name.
    :param master: 1 if this host holds the master role for the router,
                   0 otherwise.
    :returns: 1 if every inspected link agrees with ``master``; 0 if any
              link disagrees or the namespace cannot be opened (OSError).
    """
    # NOTE(review): some pyroute2 releases default NetNS() to
    # flags=os.O_CREAT, which would create a missing namespace instead of
    # raising OSError -- confirm whether flags=0 is wanted here.
    try:
        namespace = NetNS('qrouter-%s' % router_id)
    except OSError:
        return 0

    configured = 1

    # Close the namespace even when a netlink query raises; otherwise each
    # failing pass would leave the handle open while running with --loop.
    try:
        for link in namespace.get_links():
            name = link.get_attr('IFLA_IFNAME')
            if not name.startswith(('qg-', 'qr-')):
                continue

            has_addr = len(namespace.get_addr(label=name)) != 0

            # Router is not master but addresses are configured.
            if master == 0 and has_addr:
                configured = 0

            # Router is master but addresses are not configured.
            if master == 1 and not has_addr:
                configured = 0
    finally:
        namespace.close()

    return configured
def _read_pid(pid_file):
    """Return the integer PID stored in ``pid_file``, or None if unreadable."""
    try:
        with open(pid_file) as pid_fd:
            return int(pid_fd.read())
    except (OSError, ValueError):
        # The file disappeared after it was globbed, or is empty/garbled.
        return None


def _pid_serves_router(pid, router_id):
    """Return True if ``pid`` is running and its command line names the router."""
    # Check if the process is _actually_ running
    if not psutil.pid_exists(pid):
        return False

    # Check if the PID is indeed for the correct router (the PID may have
    # been re-used by an unrelated process).
    try:
        cmdline = " ".join(psutil.Process(pid).cmdline())
    except psutil.Error:
        # The process exited or became inaccessible after the check above.
        return False
    return router_id in cmdline


def _read_master(state_file):
    """Return 1 if ``state_file`` contains 'master', else 0; None if unreadable."""
    try:
        with open(state_file) as state_fd:
            return 1 if 'master' in state_fd.read() else 0
    except OSError:
        return None


def _collect_metrics(prefix, state):
    """Build the metric text for every router with a live Keepalived process."""
    lines = []

    for pid_file in get_pid_files(state):
        state_path = pid_file.replace('.pid.keepalived-vrrp', '')
        router_id = os.path.basename(state_path)

        pid = _read_pid(pid_file)
        if pid is None or not _pid_serves_router(pid, router_id):
            continue

        master = _read_master(os.path.join(state_path, 'state'))
        if master is None:
            # No readable state file for this router; skip it for this pass.
            continue

        configured = verify_router_configured(router_id, master)

        lines.append('%s_configured{router_id="%s"} %d\n' % (
            prefix,
            router_id,
            configured
        ))
        lines.append('%s_master{router_id="%s"} %d\n' % (
            prefix,
            router_id,
            master
        ))

    return "".join(lines)


def _write_atomically(path, content):
    """Write ``content`` to ``path`` via a temporary sibling file and rename."""
    tmp_path = '%s.tmp' % path
    with open(tmp_path, 'w') as output_fd:
        output_fd.write(content)
    # Renaming within one directory replaces the target in a single step on
    # POSIX, so a concurrent reader never sees a half-written file.
    os.rename(tmp_path, path)


def main():
    """Entry-point for script.

    Parses the command line, then renders the ``<prefix>_configured`` and
    ``<prefix>_master`` metrics for every router found under ``--state``.
    The text is printed to STDOUT and, when ``--output`` is given, also
    written to that file. With ``--loop N`` the pass repeats every N
    seconds; otherwise the script exits after one pass.

    Routers whose PID file, process or state file disappears while a pass is
    running are skipped for that pass instead of aborting the script.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--prefix", default="node_openstack_l3_router",
                        help="Prefix of metric")
    parser.add_argument("--state", default="/var/lib/neutron",
                        help="Neutron state path")
    parser.add_argument("--loop", type=int, help="Loop every N seconds")
    parser.add_argument("--output", help="Output file (default to STDOUT)")
    args = parser.parse_args()

    while True:
        output = _collect_metrics(args.prefix, args.state)

        if args.output:
            _write_atomically(args.output, output)
        # NOTE(review): printing is unconditional because the original has no
        # ``else`` branch here -- confirm that is intended when --output is set.
        print(output)

        if args.loop:
            time.sleep(args.loop)
        else:
            break