Alerts rationalization for Elasticsearch
Change-Id: I1ab91bb31e622dd2cfafa6c6f16895700727d286
Closes-Bug: PROD-19796
diff --git a/elasticsearch/map.jinja b/elasticsearch/map.jinja
index 781c018..930a2f9 100644
--- a/elasticsearch/map.jinja
+++ b/elasticsearch/map.jinja
@@ -44,5 +44,7 @@
'default': {
'service_failed_warning_threshold_percent': 0.3,
'service_failed_critical_threshold_percent': 0.6,
+ 'service_disk_space_watermark_minor_threshold_percent': 0.6,
+ 'service_disk_space_watermark_major_threshold_percent': 0.75,
},
}, grain='os_family', merge=salt['pillar.get']('elasticsearch:monitoring')) %}
diff --git a/elasticsearch/meta/prometheus.yml b/elasticsearch/meta/prometheus.yml
index 0e4d7e0..f688991 100644
--- a/elasticsearch/meta/prometheus.yml
+++ b/elasticsearch/meta/prometheus.yml
@@ -4,89 +4,101 @@
server:
alert:
{%- if client.get('enabled', False) %}
-{%- raw %}
- ElasticsearchClusterHealthStatusYellow:
+ ElasticsearchClusterHealthStatusMajor:
if: >-
elasticsearch_cluster_health_status == 2
+ {%- raw %}
+ for: 2m
labels:
- severity: warning
+ severity: major
service: elasticsearch
annotations:
- summary: Elasticsearch cluster status is YELLOW
- description: >-
- The Elasticsearch cluster status is YELLOW for the last 5 minutes.
- ElasticsearchClusterHealthStatusRed:
+ summary: "Elasticsearch cluster status is YELLOW"
+ description: "The Elasticsearch cluster status is YELLOW for at least 2 minutes."
+ {%- endraw %}
+ ElasticsearchClusterHealthStatusCritical:
if: >-
elasticsearch_cluster_health_status == 3
+ {%- raw %}
+ for: 2m
labels:
severity: critical
service: elasticsearch
annotations:
- summary: 'Elasticsearch cluster status is RED'
- description: >-
- The Elasticsearch cluster status is RED for the last 5 minutes.
-{%- endraw %}
+ summary: "Elasticsearch cluster status is RED"
+ description: "The Elasticsearch cluster status is RED for at least 2 minutes."
+ {%- endraw %}
{%- endif %}
{%- if server.get('enabled', False) %}
-{%- raw %}
- ElasticsearchInfo:
+ ElasticsearchServiceDown:
if: >-
elasticsearch_up{host=~'.*'} == 0
+ {%- raw %}
labels:
- severity: info
+ severity: minor
service: elasticsearch
annotations:
- summary: 'Elasticsearch service is down'
- description: 'Elasticsearch service is down on node {{ $labels.host }}'
- ElasticsearchWarning:
+ summary: "Elasticsearch service is down"
+ description: "The Elasticsearch service on the {{ $labels.host }} node is down."
+ {%- endraw %}
+ ElasticsearchServiceDownMinor:
if: >-
- count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {% endraw %} {{ monitoring.service_failed_warning_threshold_percent }} {% raw %}
+ count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {{ monitoring.service_failed_warning_threshold_percent }}
+ {%- raw %}
+ for: 2m
labels:
- severity: warning
+ severity: minor
service: elasticsearch
annotations:
- summary: 'More than {%- endraw %} {{monitoring.service_failed_warning_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down'
- description: 'More than {%- endraw %} {{monitoring.service_failed_warning_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down'
- ElasticsearchCritical:
+ summary: "{%- endraw %}{{monitoring.service_failed_warning_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down"
+ description: "{{ $value }} Elasticsearch services are down for at least 2 minutes."
+ {%- endraw %}
+ ElasticsearchServiceDownMajor:
if: >-
- count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {% endraw %} {{ monitoring.service_failed_critical_threshold_percent }} {% raw %}
+ count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {{ monitoring.service_failed_critical_threshold_percent }}
+ {%- raw %}
+ for: 2m
labels:
- severity: critical
+ severity: major
service: elasticsearch
annotations:
- summary: 'More than {%- endraw %} {{monitoring.service_failed_critical_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down'
- description: 'More than {%- endraw %} {{monitoring.service_failed_critical_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down'
- ElasticsearchDown:
+ summary: "{%- endraw %}{{monitoring.service_failed_critical_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down"
+ description: "{{ $value }} Elasticsearch services are down for at least 2 minutes."
+ {%- endraw %}
+ ElasticsearchServiceOutage:
if: >-
count(elasticsearch_up{host=~'.*'} == 0) == count(elasticsearch_up{host=~'.*'})
- labels:
- severity: down
- service: elasticsearch
- annotations:
- summary: 'All Elasticsearch services are down'
- description: 'All Elasticsearch services are down'
- ElasticsearchClusterDiskLowWaterMark:
- if: >-
- (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 85
- for: 5m
- labels:
- severity: warning
- service: elasticsearch
- annotations:
- summary: 'Elasticsearch low disk watermark [85%] exceeded on node {{ $labels.host}} instance {{ $labels.instance }}'
- description: >-
- Elasticsearch will not allocate new shards to node {{ $labels.host }}
- ElasticsearchClusterDiskHighWaterMark:
- if: >-
- (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 90
- for: 5m
- annotations:
- summary: 'Elasticsearch high disk watermark [90%] exceeded on node {{ $labels.host}} instance {{ $labels.instance }}'
- description: >-
- Elasticsearch will not allocate new shards to node {{ $labels.host }} and will attempt to relocate shards to another node
+ {%- raw %}
labels:
severity: critical
service: elasticsearch
-{%- endraw %}
+ annotations:
+ summary: "Elasticsearch cluster outage"
+ description: "All Elasticsearch services within the cluster are down."
+ {%- endraw %}
+ ElasticsearchDiskWaterMarkMinor:
+ if: >-
+ (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100 >= {{monitoring.service_disk_space_watermark_minor_threshold_percent}} * 100
+ {%- raw %}
+ for: 5m
+ labels:
+ severity: minor
+ service: elasticsearch
+ annotations:
+ summary: "Elasticsearch uses {% endraw %}{{monitoring.service_disk_space_watermark_minor_threshold_percent*100}}%{%- raw %} of disk space"
+ description: "Elasticsearch uses {{ $value }}% of disk space on the '{{ $labels.instance }}' instance on the {{ $labels.host }} node for at least 5 minutes."
+ {%- endraw %}
+ ElasticsearchDiskWaterMarkMajor:
+ if: >-
+ (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100 >= {{monitoring.service_disk_space_watermark_major_threshold_percent}} * 100
+ {%- raw %}
+ for: 5m
+ labels:
+ severity: major
+ service: elasticsearch
+ annotations:
+ summary: "Elasticsearch uses {% endraw %}{{monitoring.service_disk_space_watermark_major_threshold_percent*100}}%{%- raw %} of disk space"
+ description: "Elasticsearch uses {{ $value }}% of disk space on the '{{ $labels.instance }}' instance on the {{ $labels.host }} node for at least 5 minutes."
+ {%- endraw %}
{%- endif %}
{%- endif %}