Alerts rationalization for Apache
Change-Id: I2b4c812b8738d0a86dc4d3f36085c2bdc3931283
Closes-Bug: PROD-19656
diff --git a/apache/meta/prometheus.yml b/apache/meta/prometheus.yml
index 0abe6ab..897713c 100644
--- a/apache/meta/prometheus.yml
+++ b/apache/meta/prometheus.yml
@@ -1,26 +1,41 @@
{%- from "apache/map.jinja" import server with context %}
{%- if server.get('enabled', False) %}
-{%- raw %}
+
server:
alert:
- ApacheDown:
+ ApacheServiceDown:  # Fires for every apache_up series whose value is not 1.
if: >-
apache_up != 1
+ {%- raw %}
labels:
- severity: warning
+ severity: minor
service: apache
annotations:
- summary: 'Apache service down'
- description: 'Apache service is down on node {{ $labels.host }}'
- ApacheIdleWorkersShortage:
+ summary: "Apache service is down"
+ description: "The Apache service on the {{ $labels.host }} node is down."
+ {%- endraw %}
+ ApacheServiceOutage:  # Fires when, per cluster, every apache_up series equals 0; cluster is taken from the leading non-digit part of the host label.
+ if: >-
+ count(label_replace(apache_up, "cluster", "$1", "host", "([^0-9]+).+")) by (cluster) == count(label_replace(apache_up == 0, "cluster", "$1", "host", "([^0-9]+).+")) by (cluster)
+ {%- raw %}
+ labels:
+ severity: critical
+ service: apache
+ annotations:
+ summary: "Apache service outage"
+ description: "All Apache services within the {{ $labels.cluster }} cluster are down."
+ {%- endraw %}
+ ApacheWorkersAbsent:  # Fires when apache_IdleWorkers has been 0 for 2 minutes (see "for" below).
if: >-
apache_IdleWorkers == 0
+ {%- raw %}
+ for: 2m
labels:
- severity: warning
+ severity: minor
service: apache
annotations:
- summary: 'Apache idle workers shortage'
- description: 'Apache idle workers shortage on node {{ $labels.host }}'
-{%- endraw %}
+ summary: "Apache has no available idle workers"
+ description: "The Apache service on the {{ $labels.host }} node has no available workers for at least 2 minutes."
+ {%- endraw %}
{%- endif %}