130 lines
3.9 KiB
Python
130 lines
3.9 KiB
Python
# Copyright (c) 2020 CLOUD&HEAT GmbH https://www.cloudandheat.com
|
|
# Copyright 2020 VEXXHOST, Inc.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""Generate data for all routers and their L3 HA states.
|
|
|
|
There are scenarios where an L3 HA router can end up being active in many
|
|
different L3 agents. This can be tricky to find and cause chaos in the system,
|
|
while effort should be done in finding the root cause of this, this will help
|
|
alert and catch any occurrences of it.
|
|
"""
|
|
|
|
import argparse
|
|
import glob
|
|
import time
|
|
import os
|
|
|
|
import psutil
|
|
from pyroute2 import NetNS
|
|
|
|
|
|
def get_pid_files(state_path):
    """Get all PID files for Keepalived.

    :param state_path: Neutron state directory; the PID files are looked up
                       in its ``ha_confs`` sub-directory.
    :returns: Sorted list of paths ending in ``.pid.keepalived-vrrp``
              (empty when the directory is missing or holds no such file).
    """

    ha_conf_dir = os.path.join(state_path, 'ha_confs')
    # Escape the directory part so glob metacharacters in the configured
    # state path ('[', '*', '?') are matched literally, not as a pattern.
    pid_glob = os.path.join(glob.escape(ha_conf_dir), '*.pid.keepalived-vrrp')
    # glob returns entries in arbitrary order; sort for a stable output.
    return sorted(glob.glob(pid_glob))
|
|
|
|
|
|
def verify_router_configured(router_id, master):
    """Verify if the router is properly configured on the system.

    The router counts as properly configured when the addresses found on its
    ``qg-``/``qr-`` interfaces agree with its HA state: addresses present
    when it is master, no addresses when it is not.

    :param router_id: Router ID; the namespace opened is ``qrouter-<id>``.
    :param master: 1 if the router is master on this node, 0 otherwise.
    :returns: 1 if every checked interface agrees with ``master``; 0 if any
              interface disagrees or the namespace cannot be opened.
    """

    try:
        # flags=0 only opens an existing namespace. pyroute2's default
        # (os.O_CREAT) creates a missing namespace instead of failing, which
        # would leave stray empty namespaces behind and make the OSError
        # branch below unreachable for the "namespace is gone" case.
        namespace = NetNS('qrouter-%s' % router_id, flags=0)
    except OSError:
        return 0

    configured = 1

    try:
        for link in namespace.get_links():
            name = link.get_attr('IFLA_IFNAME')
            # Only the gateway (qg-) and internal (qr-) ports are checked.
            if not name or not name.startswith(('qg-', 'qr-')):
                continue

            # Materialised so the emptiness check works whether the call
            # hands back a list or a lazy iterator.
            addrs = list(namespace.get_addr(label=name))

            # Router is not master but addresses are configured.
            if master == 0 and addrs:
                configured = 0

            # Router is master but addresses are not configured.
            if master == 1 and not addrs:
                configured = 0
    finally:
        # Always release the netlink socket, even if a query above raises.
        namespace.close()

    return configured
|
|
|
|
|
|
def _router_metrics(pid_file, prefix):
    """Return the metric lines for one Keepalived PID file ('' to skip it)."""

    suffix = '.pid.keepalived-vrrp'
    # Strip the suffix only from the end of the path; a plain str.replace
    # would also rewrite a matching directory name earlier in the path.
    if pid_file.endswith(suffix):
        state_path = pid_file[:-len(suffix)]
    else:
        state_path = pid_file
    state_file = os.path.join(state_path, 'state')
    router_id = os.path.basename(state_path)

    # A PID file that vanished, is empty or holds garbage must not take the
    # whole collection run down; skip just this router.
    try:
        with open(pid_file) as pid_fd:
            pid = int(pid_fd.read())
    except (OSError, ValueError):
        return ''

    # Check if the process is _actually_ running
    if not psutil.pid_exists(pid):
        return ''

    # Check if the PID is indeed for the correct router. The process may
    # exit (or be inaccessible) between the check above and this lookup.
    try:
        cmdline = " ".join(psutil.Process(pid).cmdline())
    except psutil.Error:
        return ''
    if router_id not in cmdline:
        return ''

    try:
        with open(state_file) as state_fd:
            state = state_fd.read()
    except OSError:
        # No readable state file for this router; skip it.
        return ''

    # NOTE(review): 'primary' is accepted in addition to 'master' on the
    # assumption that newer Neutron releases write that keyword instead;
    # confirm against the deployed Neutron version.
    master = 1 if ('master' in state or 'primary' in state) else 0

    configured = verify_router_configured(router_id, master)

    return (
        '%s_configured{router_id="%s"} %d\n'
        '%s_master{router_id="%s"} %d\n'
    ) % (prefix, router_id, configured, prefix, router_id, master)


def _write_atomically(path, content):
    """Write *content* to *path* via a temporary file and a rename."""

    # Writing in place lets a concurrent reader observe a truncated file;
    # renaming a fully written sibling file swaps the content in one step.
    tmp_path = path + '.tmp'
    with open(tmp_path, 'w') as tmp_fd:
        tmp_fd.write(content)
    os.replace(tmp_path, path)


def main():
    """Entry-point for script.

    Parses the command line, then emits ``<prefix>_configured`` and
    ``<prefix>_master`` lines for every router whose Keepalived process is
    running. The result is printed to STDOUT and, when ``--output`` is
    given, also written to that file. With ``--loop N`` the collection is
    repeated every N seconds; otherwise it runs once.
    """

    parser = argparse.ArgumentParser()
    parser.add_argument("--prefix", default="node_openstack_l3_router",
                        help="Prefix of metric")
    parser.add_argument("--state", default="/var/lib/neutron",
                        help="Neutron state path")
    parser.add_argument("--loop", type=int, help="Loop every N seconds")
    parser.add_argument("--output", help="Output file (default to STDOUT)")
    args = parser.parse_args()

    while True:
        output = "".join(
            _router_metrics(pid_file, args.prefix)
            for pid_file in get_pid_files(args.state)
        )

        if args.output:
            _write_atomically(args.output, output)

        # Echoed to STDOUT even when a file is written, as before.
        print(output)

        if not args.loop:
            break
        time.sleep(args.loop)
|