server:
alert:
{%- if client.get('enabled', False) %}
-{%- raw %}
- ElasticsearchClusterHealthStatusYellow:
+ # Cluster-health alerts; rendered only when client.get('enabled', False) is true.
+ # Matches elasticsearch_cluster_health_status == 2 with a 2m hold period;
+ # the annotations report this state as YELLOW.
+ ElasticsearchClusterHealthStatusMajor:
if: >-
elasticsearch_cluster_health_status == 2
+ {%- raw %}
+ for: 2m
labels:
- severity: warning
+ severity: major
service: elasticsearch
annotations:
- summary: Elasticsearch cluster status is YELLOW
- description: >-
- The Elasticsearch cluster status is YELLOW for the last 5 minutes.
- ElasticsearchClusterHealthStatusRed:
+ summary: "Elasticsearch cluster status is YELLOW"
+ description: "The Elasticsearch cluster status is YELLOW for 2 minutes."
+ {%- endraw %}
+ # Matches elasticsearch_cluster_health_status == 3 with a 2m hold period;
+ # the annotations report this state as RED.
+ ElasticsearchClusterHealthStatusCritical:
if: >-
elasticsearch_cluster_health_status == 3
+ {%- raw %}
+ for: 2m
labels:
severity: critical
service: elasticsearch
annotations:
- summary: 'Elasticsearch cluster status is RED'
- description: >-
- The Elasticsearch cluster status is RED for the last 5 minutes.
-{%- endraw %}
+ summary: "Elasticsearch cluster status is RED"
+ description: "The Elasticsearch cluster status is RED for 2 minutes."
+ {%- endraw %}
{%- endif %}
{%- if server.get('enabled', False) %}
-{%- raw %}
- ElasticsearchInfo:
+ # Service-availability alerts; rendered only when server.get('enabled', False) is true.
+ # Per-series check: matches every elasticsearch_up series whose value is 0.
+ # No hold period is set; the description names the node via the host label.
+ ElasticsearchServiceDown:
if: >-
elasticsearch_up{host=~'.*'} == 0
+ {%- raw %}
labels:
- severity: info
+ severity: minor
service: elasticsearch
annotations:
- summary: 'Elasticsearch service is down'
- description: 'Elasticsearch service is down on node {{ $labels.host }}'
- ElasticsearchWarning:
+ summary: "Elasticsearch service is down"
+ description: "The Elasticsearch service on the {{ $labels.host }} node is down."
+ {%- endraw %}
+ # Matches when the count of down series reaches
+ # monitoring.service_failed_warning_threshold_percent times the count of all
+ # elasticsearch_up series, with a 2m hold period. The threshold is rendered by
+ # Jinja, so the expression sits before the raw block starts.
+ # NOTE(review): the summary multiplies the threshold by 100, so despite its
+ # name it appears to be a fraction rather than a percentage - confirm.
+ ElasticsearchServiceDownMinor:
if: >-
- count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {% endraw %} {{ monitoring.service_failed_warning_threshold_percent }} {% raw %}
+ count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {{ monitoring.service_failed_warning_threshold_percent }}
+ {%- raw %}
+ for: 2m
labels:
- severity: warning
+ severity: minor
service: elasticsearch
annotations:
- summary: 'More than {%- endraw %} {{monitoring.service_failed_warning_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down'
- description: 'More than {%- endraw %} {{monitoring.service_failed_warning_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down'
- ElasticsearchCritical:
+ summary: "{%- endraw %}{{monitoring.service_failed_warning_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down"
+ description: "{{ $value }} Elasticsearch services are down for 2 minutes."
+ {%- endraw %}
+ # Same comparison as ElasticsearchServiceDownMinor, but against
+ # monitoring.service_failed_critical_threshold_percent and with severity major.
+ ElasticsearchServiceDownMajor:
if: >-
- count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {% endraw %} {{ monitoring.service_failed_critical_threshold_percent }} {% raw %}
+ count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {{ monitoring.service_failed_critical_threshold_percent }}
+ {%- raw %}
+ for: 2m
labels:
- severity: critical
+ severity: major
service: elasticsearch
annotations:
- summary: 'More than {%- endraw %} {{monitoring.service_failed_critical_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down'
- description: 'More than {%- endraw %} {{monitoring.service_failed_critical_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down'
- ElasticsearchDown:
+ summary: "{%- endraw %}{{monitoring.service_failed_critical_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down"
+ description: "{{ $value }} Elasticsearch services are down for 2 minutes."
+ {%- endraw %}
+ # Matches when the count of down series equals the count of all
+ # elasticsearch_up series, i.e. every reported service is down. No hold period is set.
+ ElasticsearchServiceOutage:
if: >-
count(elasticsearch_up{host=~'.*'} == 0) == count(elasticsearch_up{host=~'.*'})
+ {%- raw %}
labels:
- severity: down
+ severity: critical
service: elasticsearch
annotations:
- summary: 'All Elasticsearch services are down'
- description: 'All Elasticsearch services are down'
- ElasticsearchClusterDiskLowWaterMark:
+ summary: "Elasticsearch cluster outage"
+ description: "All Elasticsearch services within the cluster are down."
+ {%- endraw %}
+ # Disk usage per (host, instance): (total - available) / total, scaled to a
+ # percentage. The scaling matters because the description prints the alert
+ # value followed by a percent sign; with a bare 0..1 ratio it would read
+ # e.g. "0.87%". The pillar threshold is treated as a fraction (the summary
+ # already multiplies it by 100), so it is scaled by 100 here to match.
+ # The alert uses a 5m hold period and severity minor.
+ ElasticsearchDiskWaterMarkMinor:
if: >-
- (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 85
+ (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100 >= {{ monitoring.service_disk_space_watermark_minor_threshold_percent * 100 }}
+ {%- raw %}
for: 5m
labels:
- severity: warning
+ severity: minor
service: elasticsearch
annotations:
- summary: 'Elasticsearch low disk watermark [85%] exceeded on node {{ $labels.host}} instance {{ $labels.instance }}'
- description: >-
- Elasticsearch will not allocate new shards to node {{ $labels.host }}
- ElasticsearchClusterDiskHighWaterMark:
+ summary: "Elasticsearch uses {%- endraw %} {{monitoring.service_disk_space_watermark_minor_threshold_percent*100}}%{%- raw %} of disk space"
+ description: "The Elasticsearch '{{ $labels.instance }}' instance uses {{ $value }}% of disk space on the {{ $labels.host }} node for 5 minutes."
+ {%- endraw %}
+ # Disk usage per (host, instance): (total - available) / total, scaled to a
+ # percentage. The scaling matters because the description prints the alert
+ # value followed by a percent sign; with a bare 0..1 ratio it would read
+ # e.g. "0.92%". The pillar threshold is treated as a fraction (the summary
+ # already multiplies it by 100), so it is scaled by 100 here to match.
+ # The alert uses a 5m hold period and severity major.
+ ElasticsearchDiskWaterMarkMajor:
if: >-
- (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 90
+ (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100 >= {{ monitoring.service_disk_space_watermark_major_threshold_percent * 100 }}
+ {%- raw %}
for: 5m
- annotations:
- summary: 'Elasticsearch high disk watermark [90%] exceeded on node {{ $labels.host}} instance {{ $labels.instance }}'
- description: >-
- Elasticsearch will not allocate new shards to node {{ $labels.host }} and will attempt to relocate shards to another node
labels:
- severity: critical
+ severity: major
service: elasticsearch
-{%- endraw %}
+ annotations:
+ summary: "Elasticsearch uses {%- endraw %} {{monitoring.service_disk_space_watermark_major_threshold_percent*100}}%{%- raw %} of disk space"
+ description: "The Elasticsearch '{{ $labels.instance }}' instance uses {{ $value }}% of disk space on the {{ $labels.host }} node for 5 minutes."
+ {%- endraw %}
{%- endif %}
{%- endif %}