Fix bugs and improve accuracy of alarms and clusters
Co-Authored-By: Patrick Petit <ppetit@mirantis.com>
Co-Authored-By: Eric Lemoine <elemoine@mirantis.com>
Change-Id: I784544984b6415316e06e5587e1184e31547bd10
diff --git a/nova/files/grafana_dashboards/nova_influxdb.json b/nova/files/grafana_dashboards/nova_influxdb.json
index 84a6287..92b4ed6 100644
--- a/nova/files/grafana_dashboards/nova_influxdb.json
+++ b/nova/files/grafana_dashboards/nova_influxdb.json
@@ -280,7 +280,7 @@
{
"key": "cluster_name",
"operator": "=",
- "value": "nova-compute"
+ "value": "nova-data"
}
]
}
diff --git a/nova/meta/heka.yml b/nova/meta/heka.yml
index aca4f9c..ba88bf1 100644
--- a/nova/meta/heka.yml
+++ b/nova/meta/heka.yml
@@ -101,34 +101,54 @@
window: 60
periods: 0
function: last
- {%- for nova_service in ('cert', 'consoleauth', 'compute', 'conductor', 'scheduler') %}
- nova_{{ nova_service }}_one_down:
- description: 'At least one Nova {{ nova_service }} is down'
+ {%- for nova_service in ('cert', 'consoleauth', 'conductor', 'scheduler') %}
+ nova_{{ nova_service }}_two_up:
+ description: 'There is one or more Nova {{ nova_service }} down'
severity: warning
+ logical_operator: and
rules:
- metric: openstack_nova_services
field:
service: {{ nova_service }}
+ state: up
+ relational_operator: '>='
+ threshold: 2
+ window: 60
+ periods: 0
+ function: last
+ - metric: openstack_nova_services
+ field:
+ service: {{ nova_service }}
state: down
relational_operator: '>'
threshold: 0
window: 60
periods: 0
function: last
- nova_{{ nova_service }}_majority_down:
- description: 'Majority of Nova {{ nova_service }}s are down'
+ nova_{{ nova_service }}_one_up:
+ description: 'Only one Nova {{ nova_service }} is still up'
severity: critical
+ logical_operator: and
rules:
- - metric: openstack_nova_services_percent
+ - metric: openstack_nova_services
field:
service: {{ nova_service }}
state: up
- relational_operator: '<='
- threshold: 50
+ relational_operator: '=='
+ threshold: 1
window: 60
periods: 0
function: last
- nova_{{ nova_service }}_all_down:
+ - metric: openstack_nova_services
+ field:
+ service: {{ nova_service }}
+ state: '== down || == disabled'
+ relational_operator: '>'
+ threshold: 0
+ window: 60
+ periods: 0
+ function: last
+ nova_{{ nova_service }}_zero_up:
description: 'All Nova {{ nova_service }}s are down'
severity: down
rules:
@@ -142,6 +162,55 @@
periods: 0
function: last
{%- endfor %}
+ nova_compute_some_down:
+ description: 'Some Nova computes are down'
+ severity: warning
+ logical_operator: and
+ rules:
+ - metric: openstack_nova_services_percent
+ field:
+ service: compute
+ state: down
+ relational_operator: '>'
+ threshold: 0
+ window: 60
+ periods: 0
+ function: last
+ - metric: openstack_nova_services_percent
+ field:
+ service: compute
+ state: up
+ relational_operator: '>='
+ threshold: 50
+ window: 60
+ periods: 0
+ function: last
+ nova_compute_majority_down:
+ description: 'Majority of Nova computes are down'
+ severity: critical
+ rules:
+ - metric: openstack_nova_services_percent
+ field:
+ service: compute
+ state: up
+ relational_operator: '<='
+ threshold: 50
+ window: 60
+ periods: 0
+ function: last
+ nova_compute_all_down:
+ description: 'All Nova computes are down'
+ severity: down
+ rules:
+ - metric: openstack_nova_services_percent
+ field:
+ service: compute
+ state: down
+ relational_operator: '=='
+ threshold: 100
+ window: 60
+ periods: 0
+ function: last
nova_total_free_vcpu_warning:
description: 'There is no VCPU available for new instances'
severity: warning
@@ -166,18 +235,18 @@
alarm:
{%- if pillar.nova.controller is defined %}
nova_api_check:
- alerting: true
+ alerting: enabled
triggers:
- nova_api_check_failed
dimension:
service: nova-api-check
- {%- for nova_service in ('cert', 'consoleauth', 'compute', 'conductor', 'scheduler') %}
+ {%- for nova_service in ('cert', 'consoleauth', 'conductor', 'scheduler') %}
nova_{{ nova_service }}:
- alerting: true
+ alerting: enabled
triggers:
- - nova_{{ nova_service }}_all_down
- - nova_{{ nova_service }}_majority_down
- - nova_{{ nova_service }}_one_down
+ - nova_{{ nova_service }}_zero_up
+ - nova_{{ nova_service }}_one_up
+ - nova_{{ nova_service }}_two_up
dimension:
service: nova-{{ nova_service }}
{%- endfor %}
@@ -193,6 +262,14 @@
- nova_total_free_memory_warning
dimension:
service: nova-free-memory
+ nova_compute:
+ alerting: enabled
+ triggers:
+ - nova_compute_all_down
+ - nova_compute_majority_down
+ - nova_compute_some_down
+ dimension:
+ service: nova-compute
{%- endif %}
aggregator:
alarm_cluster:
@@ -205,7 +282,7 @@
members:
- nova_logs_compute
dimension:
- service: nova-compute
+ service: nova-data
nagios_host: 01-service-clusters
nova_logs:
policy: highest_severity
@@ -239,7 +316,7 @@
dimension:
service: nova-control
nagios_host: 01-service-clusters
- {%- for nova_service in ('cert', 'consoleauth', 'compute', 'conductor', 'scheduler') %}
+ {%- for nova_service in ('cert', 'consoleauth', 'conductor', 'scheduler') %}
nova_{{ nova_service }}:
policy: highest_severity
alerting: enabled
@@ -259,7 +336,7 @@
members:
- nova_free_vcpu
dimension:
- service: nova-compute
+ service: nova-data
nagios_host: 01-service-clusters
nova_free_memory:
policy: highest_severity
@@ -269,7 +346,17 @@
members:
- nova_free_memory
dimension:
+ service: nova-data
+ nagios_host: 01-service-clusters
+ nova_compute:
+ policy: highest_severity
+ alerting: enabled
+ match:
service: nova-compute
+ members:
+ - nova_compute
+ dimension:
+ service: nova-data
nagios_host: 01-service-clusters
nova_control:
policy: highest_severity
@@ -280,21 +367,22 @@
- nova_logs
- nova_api_endpoint
- nova_api_check
- {%- for nova_service in ('cert', 'consoleauth', 'compute', 'conductor', 'scheduler') %}
+ {%- for nova_service in ('cert', 'consoleauth', 'conductor', 'scheduler') %}
- nova_{{ nova_service }}
{%- endfor %}
dimension:
cluster_name: nova-control
nagios_host: 00-top-clusters
- nova_compute:
+ nova_data:
policy: highest_severity
alerting: enabled_with_notification
match:
- service: nova-compute
+ service: nova-data
members:
- nova_logs_compute
- nova_free_vcpu
- nova_free_memory
+ - nova_compute
dimension:
- cluster_name: nova-compute
+ cluster_name: nova-data
nagios_host: 00-top-clusters