Merge "Add evaluation results counters to /metrics"

This commit is contained in:
Zuul 2025-04-09 16:35:39 +00:00 committed by Gerrit Code Review
commit c373d73290
19 changed files with 557 additions and 11 deletions

View File

@ -0,0 +1,120 @@
#
# Copyright 2024 Red Hat, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import collections
from oslo_log import log
import pecan
from pecan import rest
import wsmeext.pecan as wsme_pecan
from aodh.api.controllers.v2 import base
from aodh.api import rbac
from aodh import evaluator
from aodh.i18n import _
from aodh import profiler
LOG = log.getLogger(__name__)
class EvaluationResultOutput(base.Base):
    """Represents the evaluation result counters of a single alarm."""

    alarm_id = str
    project_id = str
    state_counters = {str: int}

    @classmethod
    def sample(cls):
        """Return an example object for the API documentation."""
        counters = {
            "ok": 20,
            "insufficient data": 5,
            "alarm": 10,
        }
        return cls(alarm_id="b8e17f58-089a-43fc-a96b-e9bcac4d4b53",
                   project_id="2dd8edd6c8c24f49bf04670534f6b357",
                   state_counters=counters)
class MetricsOutput(base.Base):
    """Represents the payload returned by the metrics API endpoint."""

    # The evaluation result counters.
    # NOTE: this could be extended for other metrics in the future.
    evaluation_results = [EvaluationResultOutput]

    @classmethod
    def sample(cls):
        """Return an example object for the API documentation."""
        result = {
            "alarm_id": "b8e17f58-089a-43fc-a96b-e9bcac4d4b53",
            "project_id": "2dd8edd6c8c24f49bf04670534f6b357",
            "state_counters": {
                "ok": 20,
                "insufficient data": 5,
                "alarm": 10,
            },
        }
        return cls(evaluation_results=[result])
@profiler.trace_cls('api')
class MetricsController(rest.RestController):
    """Manages the metrics API endpoint."""

    @staticmethod
    def group_counters(counters):
        """Group counter rows as {project_id: {alarm_id: {state: value}}}."""
        grouped = collections.defaultdict(lambda: collections.defaultdict(dict))
        for counter in counters:
            per_alarm = grouped[counter.project_id][counter.alarm_id]
            per_alarm[counter.state] = counter.value
        return grouped

    @wsme_pecan.wsexpose(MetricsOutput)
    def get_all(self):
        """Return all metrics"""
        # The whole endpoint can be switched off via configuration.
        if not pecan.request.cfg.enable_evaluation_results_metrics:
            raise base.ClientSideError(_(
                "metrics endpoint is disabled"
            ), 403)
        project_id = pecan.request.headers.get('X-Project-Id')
        rbac.enforce('get_metrics', pecan.request.headers,
                     pecan.request.enforcer, {"project_id": project_id})
        LOG.debug('Getting evaluation result counters from database')
        grouped = self.group_counters(
            pecan.request.storage.get_alarm_counters(project_id=project_id))
        # Counters are stored with underscores (e.g. "insufficient_data")
        # while the API reports the canonical state names with spaces.
        alarm_states = [evaluator.UNKNOWN, evaluator.OK, evaluator.ALARM]
        results = []
        for project, alarms in grouped.items():
            for alarm_id, counters in alarms.items():
                state_counters = {}
                for state in alarm_states:
                    state_counters[state] = counters.get(
                        state.replace(" ", "_"), 0)
                results.append(EvaluationResultOutput(
                    project_id=project,
                    alarm_id=alarm_id,
                    state_counters=state_counters))
        content = MetricsOutput()
        content.evaluation_results = results
        return content

View File

@ -20,6 +20,7 @@
from aodh.api.controllers.v2 import alarms
from aodh.api.controllers.v2 import capabilities
from aodh.api.controllers.v2 import metrics
from aodh.api.controllers.v2 import query
from aodh.api.controllers.v2 import quotas
@ -31,3 +32,4 @@ class V2Controller(object):
query = query.QueryController()
capabilities = capabilities.CapabilitiesController()
quotas = quotas.QuotasController()
metrics = metrics.MetricsController()

View File

@ -329,7 +329,19 @@ rules = [
}
],
deprecated_rule=deprecated_delete_quotas
)
),
policy.DocumentedRuleDefault(
name="telemetry:get_metrics",
check_str=PROJECT_READER,
scope_types=['project'],
description='Get all metrics.',
operations=[
{
'path': '/v2/metrics',
'method': 'GET'
}
]
),
]

View File

@ -112,6 +112,11 @@ class Evaluator(object, metaclass=abc.ABCMeta):
self.alarm_change_notifier.info({},
notification, payload)
def _increment_evaluation_result(self, alarm_id, project_id, state):
    """Record one evaluation result tick in storage, when enabled."""
    if not self.conf.enable_evaluation_results_metrics:
        return
    self._storage_conn.increment_alarm_counter(alarm_id, project_id, state)
def _refresh(self, alarm, state, reason, reason_data, always_record=False):
"""Refresh alarm state."""
try:

View File

@ -177,3 +177,7 @@ class ThresholdEvaluator(evaluator.Evaluator):
evaluation = (evaluator.UNKNOWN, None, e.statistics, 0,
e.reason)
self._transition_alarm(alarm, *evaluation)
if evaluation[0] is not None:
self._increment_evaluation_result(alarm.alarm_id,
alarm.project_id,
evaluation[0].replace(" ", "_"))

View File

@ -14,6 +14,7 @@
import itertools
from keystoneauth1 import loading
from oslo_config import cfg
import aodh.api
import aodh.api.controllers.v2.alarm_rules.gnocchi
@ -32,6 +33,12 @@ import aodh.notifier.zaqar
import aodh.service
import aodh.storage
OPTS = [
cfg.BoolOpt('enable_evaluation_results_metrics',
default=False,
help=("Whether metric collection should be enabled.")),
]
def list_opts():
return [
@ -44,7 +51,8 @@ def list_opts():
aodh.evaluator.loadbalancer.OPTS,
aodh.notifier.rest.OPTS,
aodh.queue.OPTS,
aodh.service.OPTS)),
aodh.service.OPTS,
OPTS)),
('api',
itertools.chain(
aodh.api.OPTS,

View File

@ -111,6 +111,16 @@ class Connection(object):
"""Delete an alarm and its history data."""
raise aodh.NotImplementedError('Alarms not implemented')
@staticmethod
def increment_alarm_counter(alarm_id, project_id, state):
    """Increment an alarm evaluation-result counter by one.

    :param alarm_id: id of the alarm the counter belongs to
    :param project_id: id of the project owning the alarm
    :param state: alarm state whose counter is incremented
    :raises aodh.NotImplementedError: if the driver lacks counter support
    """
    raise aodh.NotImplementedError('Alarm counters not implemented')
@staticmethod
def get_alarm_counters(alarm_id=None, project_id=None, state=None):
    """Return the list of counters matching the given optional filters.

    :param alarm_id: filter by alarm id, if not None
    :param project_id: filter by project id, if not None
    :param state: filter by alarm state, if not None
    :raises aodh.NotImplementedError: if the driver lacks counter support
    """
    raise aodh.NotImplementedError('Alarm counters not implemented')
@staticmethod
def get_alarm_changes(alarm_id, on_behalf_of,
user=None, project=None, alarm_type=None,

View File

@ -56,6 +56,9 @@ AVAILABLE_STORAGE_CAPABILITIES = {
'storage': {'production_ready': True},
}
# INTEGER columns are 32-bit signed in both MySQL and PostgreSQL
# (max 2147483647), so wrap the counter back well before it overflows.
COUNTER_RESET_AT_VALUE = 2000000000
def apply_filters(query, model, **filters):
filter_dict = {}
@ -314,6 +317,9 @@ class Connection(base.Connection):
:param alarm_id: ID of the alarm to delete
"""
with _session_for_write() as session:
session.query(models.AlarmCounter).filter(
models.AlarmCounter.alarm_id == alarm_id,
).delete()
session.query(models.Alarm).filter(
models.Alarm.alarm_id == alarm_id,
).delete()
@ -322,6 +328,45 @@ class Connection(base.Connection):
models.AlarmChange.alarm_id == alarm_id,
).delete()
def increment_alarm_counter(self, alarm_id, project_id, state):
    """Increment an alarm evaluation-result counter by one.

    The counter row is created on first use, and the value wraps back
    to 1 once it reaches COUNTER_RESET_AT_VALUE so the 32-bit INTEGER
    column can never overflow.

    :param alarm_id: the id of alarm to which the counter belongs
    :param project_id: the id of the project of the alarm
    :param state: the state of the alarm to increment
    """
    with _session_for_write() as session:
        LOG.debug("Incrementing counter %(state)s for alarm %(alarm_id)s",
                  {'alarm_id': alarm_id, 'state': state})
        counters = self.get_alarm_counters(alarm_id, project_id, state)
        counter_value = counters[0].value if len(counters) == 1 else 0
        new_value = counter_value + 1
        if counter_value >= COUNTER_RESET_AT_VALUE:
            LOG.debug("Resetting counter %(state)s for alarm %(alarm_id)s",
                      {'alarm_id': alarm_id, 'state': state})
            new_value = 1
        if counter_value == 0:
            # First increment: the counter row does not exist yet
            # (persisted counters always hold a value >= 1).
            counter = models.AlarmCounter(
                alarm_id=alarm_id,
                project_id=project_id,
                state=state
            )
            counter.update({'value': new_value})
            session.add(counter)
        else:
            # Existing row: persist the incremented value. This branch
            # must also run for the reset case above — with the previous
            # if/elif/else shape the reset value was never written to
            # the database.
            session.query(models.AlarmCounter).filter(
                models.AlarmCounter.alarm_id == alarm_id,
                models.AlarmCounter.project_id == project_id,
                models.AlarmCounter.state == state,
            ).update({'value': new_value})
        return state
@staticmethod
def _row_to_alarm_change_model(row):
return alarm_api_models.AlarmChange(event_id=row.event_id,
@ -338,6 +383,23 @@ class Connection(base.Connection):
"""Yields a lists of alarms that match filter."""
return self._retrieve_data(filter_expr, orderby, limit, models.Alarm)
def get_alarm_counters(self, alarm_id=None, project_id=None, state=None):
    """Return the list of counters matching the given optional filters.

    :param alarm_id: filter by alarm id, if not None
    :param project_id: filter by project id, if not None
    :param state: filter by alarm state, if not None
    :return: a (possibly empty) list of AlarmCounter rows
    """
    filters = {}
    if alarm_id is not None:
        filters['alarm_id'] = alarm_id
    if project_id is not None:
        filters['project_id'] = project_id
    if state is not None:
        filters['state'] = state
    with _session_for_read() as session:
        # Query.all() always returns a list (never None), so no
        # post-query None check is needed.
        return session.query(models.AlarmCounter).filter_by(
            **filters
        ).all()
def _retrieve_alarm_history(self, query):
return (self._row_to_alarm_change_model(x) for x in query.all())

View File

@ -160,3 +160,14 @@ class Quota(base.Model):
project_id=project_id,
resource=resource,
limit=limit)
class AlarmCounter(base.Model):
    """Counter of evaluation results for one (alarm, project, state)."""

    def __init__(self, alarm_id, project_id, state):
        # New counters always start at zero; the storage driver is
        # responsible for incrementing the value afterwards.
        super(AlarmCounter, self).__init__(alarm_id=alarm_id,
                                           project_id=project_id,
                                           state=state,
                                           value=0)

View File

@ -1,6 +1,6 @@
[alembic]
script_location = aodh.storage.sqlalchemy:alembic
sqlalchemy.url =
sqlalchemy.url = sqlite:///aodh.db
[loggers]
keys = root,sqlalchemy,alembic

View File

@ -16,7 +16,8 @@
from logging.config import fileConfig
from alembic import context
from oslo_db.sqlalchemy import enginefacade
from sqlalchemy import engine_from_config
from sqlalchemy import pool
from aodh.storage.sqlalchemy import models
@ -27,7 +28,8 @@ config = context.config
# Interpret the config file for Python logging.
# This line sets up loggers basically.
fileConfig(config.config_file_name)
if config.attributes.get('configure_logger', True):
fileConfig(config.config_file_name)
# add your model's MetaData object here
# for 'autogenerate' support
@ -53,9 +55,13 @@ def run_migrations_offline():
script output.
"""
conf = config.conf
context.configure(url=conf.database.connection,
target_metadata=target_metadata)
url = config.get_main_option("sqlalchemy.url")
context.configure(
url=url,
target_metadata=target_metadata,
literal_binds=True,
dialect_opts={"paramstyle": "named"},
)
with context.begin_transaction():
context.run_migrations()
@ -71,15 +77,21 @@ def run_migrations_online():
connectable = config.attributes.get('connection', None)
if connectable is None:
engine = enginefacade.writer.get_engine()
with engine.connect() as connection:
# only create Engine if we don't have a Connection from the outside
connectable = engine_from_config(
config.get_section(config.config_ini_section),
prefix="sqlalchemy.",
poolclass=pool.NullPool,
)
with connectable.connect() as connection:
context.configure(
connection=connection,
target_metadata=target_metadata
target_metadata=target_metadata,
)
with context.begin_transaction():
context.run_migrations()
else:
context.configure(
connection=connectable,

View File

@ -0,0 +1,67 @@
# Copyright 2025 OpenStack Foundation
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
#
"""added_counter_table
Revision ID: 008
Revises: 007
Create Date: 2025-01-15 10:28:02.087788
"""
# revision identifiers, used by Alembic.
revision = '008'
down_revision = '007'
branch_labels = None
depends_on = None
from alembic import op
import sqlalchemy as sa
def upgrade():
    """Create the alarm_counter table together with its lookup indexes."""
    op.create_table(
        'alarm_counter',
        sa.Column('id', sa.String(length=36), nullable=False),
        sa.Column('alarm_id', sa.String(length=128), nullable=False),
        sa.Column('project_id', sa.String(length=128), nullable=False),
        sa.Column('state', sa.String(length=128), nullable=False),
        sa.Column('value', sa.Integer(), nullable=False),
        sa.ForeignKeyConstraint(
            ['alarm_id'],
            ['alarm.alarm_id'],
            name='alarm_fkey_ref',
        ),
        sa.PrimaryKeyConstraint('id'),
        sa.UniqueConstraint('alarm_id', 'project_id', 'state')
    )
    # One non-unique lookup index per filterable column.
    for column in ('alarm_id', 'project_id', 'state'):
        op.create_index(
            'ix_alarm_counter_%s' % column,
            'alarm_counter',
            [column],
            unique=False
        )

View File

@ -141,3 +141,23 @@ class Quota(Base):
project_id = Column(String(128), nullable=False)
resource = Column(String(50), nullable=False)
limit = Column(Integer, nullable=False)
class AlarmCounter(Base):
    """Number of times an alarm was evaluated to a particular state."""
    __tablename__ = 'alarm_counter'
    __table_args__ = (
        # One counter row per (alarm, project, state) triple.
        sa.UniqueConstraint('alarm_id', 'project_id', 'state'),
        Index('ix_%s_alarm_id' % __tablename__,
              'alarm_id'),
        Index('ix_%s_state' % __tablename__,
              'state'),
        Index('ix_%s_project_id' % __tablename__,
              'project_id'),
    )
    # Surrogate primary key; generated UUID string.
    id = Column(String(36), primary_key=True, default=uuidutils.generate_uuid)
    alarm_id = Column(String(128), sa.ForeignKey('alarm.alarm_id'),
                      nullable=False)
    project_id = Column(String(128), nullable=False)
    # Alarm state name this counter tracks (stored with underscores).
    state = Column(String(128), nullable=False)
    value = Column(Integer, nullable=False)

View File

@ -6,3 +6,4 @@
"admin_or_owner": "rule:context_is_admin or project_id:%(project_id)s"
"default": "rule:admin_or_owner"
"telemetry:get_alarms": "role:admin"
"telemetry:get_metrics": "role:admin"

View File

@ -0,0 +1,120 @@
#
# Copyright 2024 Red Hat, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
import os
import webtest
from aodh.api import app
from aodh.storage import models
from aodh.tests import constants
from aodh.tests.functional.api import v2
def getTestAlarm(alarm_id, project_id, user_id):
    """Build a gnocchi aggregation-threshold alarm for use in tests."""
    rule = dict(comparison_operator='gt',
                threshold=2.0,
                aggregation_method='mean',
                evaluation_periods=60,
                granularity=1,
                metrics=[
                    '41869681-5776-46d6-91ed-cccc43b6e4e3',
                    'a1fb80f4-c242-4f57-87c6-68f47521059e'
                ])
    constraints = [dict(name='testcons',
                        start='0 11 * * *',
                        duration=300)]
    return models.Alarm(name='name1',
                        type='gnocchi_aggregation_by_metrics_threshold',
                        enabled=True,
                        alarm_id=alarm_id,
                        description='a',
                        state='insufficient data',
                        state_reason='Not evaluated',
                        severity='critical',
                        state_timestamp=constants.MIN_DATETIME,
                        timestamp=constants.MIN_DATETIME,
                        ok_actions=[],
                        insufficient_data_actions=[],
                        alarm_actions=[],
                        repeat_actions=True,
                        user_id=user_id,
                        project_id=project_id,
                        time_constraints=constraints,
                        rule=rule)
class TestMetrics(v2.FunctionalTest):
    """Functional tests for the /v2/metrics endpoint."""

    def setUp(self):
        super(TestMetrics, self).setUp()
        self.project_id = "some_project_id"
        self.alarm_id = "some_alarm_id"
        self.user_id = "some_user_id"
        self.role = "reader"
        self.auth_headers = {'X-User-Id': self.user_id,
                             'X-Project-Id': self.project_id,
                             'X-Roles': self.role}
        self.alarm_conn.create_alarm(getTestAlarm(
            self.alarm_id, self.project_id, self.user_id))
        # One "ok" evaluation and two "insufficient_data" evaluations.
        for state in ("ok", "insufficient_data", "insufficient_data"):
            self.alarm_conn.increment_alarm_counter(
                self.alarm_id, self.project_id, state)

    def test_get_all_metrics_inside_project(self):
        expected = {
            "evaluation_results": [{
                "alarm_id": self.alarm_id,
                "project_id": self.project_id,
                "state_counters": {
                    "ok": 1,
                    "insufficient data": 2,
                    "alarm": 0
                }
            }]
        }
        response = self.get_json('/metrics', headers=self.auth_headers)
        self.assertEqual(expected, response)

    def test_get_all_metrics_forbidden(self):
        # Switch to a policy file that denies the request for this role.
        pf = os.path.abspath('aodh/tests/functional/api/v2/policy.yaml-test')
        self.CONF.set_override('policy_file', pf, group='oslo_policy')
        self.CONF.set_override('auth_mode', None, group='api')
        self.app = webtest.TestApp(app.load_app(self.CONF))
        response = self.get_json('/metrics',
                                 expect_errors=True,
                                 status=403,
                                 headers=self.auth_headers)
        self.assertEqual(403, response.status_code)
        self.assertEqual('RBAC Authorization Failed',
                         response.json['error_message']['faultstring'])

View File

@ -96,6 +96,7 @@ class TestBase(test_base.BaseTestCase,
conf = service.prepare_service(argv=[], config_files=[])
self.CONF = self.useFixture(fixture_config.Config(conf)).conf
self.CONF.set_override('connection', db_url, group="database")
self.CONF.set_override('enable_evaluation_results_metrics', True)
manager = self.DRIVER_MANAGERS.get(self.engine)
if not manager:

View File

@ -245,9 +245,30 @@ class AlarmTest(AlarmTestBase):
def test_delete(self):
    """Deleting an alarm must also remove its evaluation counters."""
    self.add_some_alarms()
    victim = list(self.alarm_conn.get_alarms(name='orange-alert'))[0]
    key = (victim.alarm_id, victim.project_id, "ok")
    self.alarm_conn.increment_alarm_counter(*key)
    self.assertEqual(1, self.alarm_conn.get_alarm_counters(*key)[0].value)
    self.alarm_conn.delete_alarm(victim.alarm_id)
    survivors = list(self.alarm_conn.get_alarms())
    self.assertEqual(2, len(survivors))
    # The counter must be gone together with its alarm.
    self.assertEqual([], self.alarm_conn.get_alarm_counters(*key))
    for s in survivors:
        self.assertNotEqual(victim.name, s.name)
@ -506,3 +527,55 @@ class ComplexAlarmHistoryQueryTest(AlarmTestBase):
alarm_models.AlarmChange.RULE_CHANGE,
alarm_models.AlarmChange.STATE_TRANSITION],
[h.type for h in history])
class AlarmCounterTest(AlarmTestBase):
    """Storage-level tests for the alarm evaluation-result counters."""

    def test_get_value_of_empty_counter(self):
        # A counter that was never incremented yields no rows at all.
        self.assertEqual([], self.alarm_conn.get_alarm_counters(
            "some_alarm_id",
            "some_project_id",
            "some_empty_unused_counter"))

    def test_counter_increment(self):
        self.add_some_alarms()
        alarm = list(self.alarm_conn.get_alarms(name='orange-alert'))[0]
        key = (alarm.alarm_id, alarm.project_id,
               "counter_for_increment_testing")
        self.assertEqual([], self.alarm_conn.get_alarm_counters(*key))
        # Increment in two batches and check the running total after each,
        # which covers both the create and the update path.
        total = 0
        for batch in (5, 3):
            for _ in range(batch):
                self.alarm_conn.increment_alarm_counter(*key)
            total += batch
            self.assertEqual(
                total,
                self.alarm_conn.get_alarm_counters(*key)[0].value)

View File

@ -150,12 +150,21 @@ class TestGnocchiEvaluatorBase(base.TestEvaluatorBase):
self._assert_all_alarms('ok')
def _test_simple_insufficient(self):
self.conf.set_override('enable_evaluation_results_metrics', True)
self._set_all_alarms('ok')
self._evaluate_all_alarms()
self._assert_all_alarms('insufficient data')
expected = [mock.call(alarm) for alarm in self.alarms]
update_calls = self.storage_conn.update_alarm.call_args_list
self.assertEqual(expected, update_calls)
expected = [mock.call(
alarm.alarm_id,
alarm.project_id,
"insufficient_data")
for alarm in self.alarms]
counter_increments = (
self.storage_conn.increment_alarm_counter.call_args_list)
self.assertEqual(expected, counter_increments)
expected = [mock.call(
alarm,
'ok',

View File

@ -0,0 +1,9 @@
---
features:
- |
Added collection of alarm evaluation counters. These show the number of
times each alarm was evaluated as ``alarm``, ``ok`` and ``insufficient data``.
These counters are exposed via the ``/v2/metrics`` API endpoint.
This feature can be enabled or disabled with
the ``[DEFAULT].enable_evaluation_results_metrics`` configuration option.
It is disabled by default.