Change-Id: I1ab91bb31e622dd2cfafa6c6f16895700727d286
Closes-Bug: PROD-19796
'default': {
'service_failed_warning_threshold_percent': 0.3,
'service_failed_critical_threshold_percent': 0.6,
'default': {
'service_failed_warning_threshold_percent': 0.3,
'service_failed_critical_threshold_percent': 0.6,
+ 'service_disk_space_watermark_minor_threshold_percent': 0.6,
+ 'service_disk_space_watermark_major_threshold_percent': 0.75,
},
}, grain='os_family', merge=salt['pillar.get']('elasticsearch:monitoring')) %}
},
}, grain='os_family', merge=salt['pillar.get']('elasticsearch:monitoring')) %}
server:
alert:
{%- if client.get('enabled', False) %}
server:
alert:
{%- if client.get('enabled', False) %}
-{%- raw %}
- ElasticsearchClusterHealthStatusYellow:
+ ElasticsearchClusterHealthStatusMajor:
if: >-
elasticsearch_cluster_health_status == 2
if: >-
elasticsearch_cluster_health_status == 2
service: elasticsearch
annotations:
service: elasticsearch
annotations:
- summary: Elasticsearch cluster status is YELLOW
- description: >-
- The Elasticsearch cluster status is YELLOW for the last 5 minutes.
- ElasticsearchClusterHealthStatusRed:
+ summary: "Elasticsearch cluster status is YELLOW"
+ description: "The Elasticsearch cluster status is YELLOW for at least 2 minutes."
+ {%- endraw %}
+ ElasticsearchClusterHealthStatusCritical:
if: >-
elasticsearch_cluster_health_status == 3
if: >-
elasticsearch_cluster_health_status == 3
labels:
severity: critical
service: elasticsearch
annotations:
labels:
severity: critical
service: elasticsearch
annotations:
- summary: 'Elasticsearch cluster status is RED'
- description: >-
- The Elasticsearch cluster status is RED for the last 5 minutes.
-{%- endraw %}
+ summary: "Elasticsearch cluster status is RED"
+ description: "The Elasticsearch cluster status is RED for at least 2 minutes."
+ {%- endraw %}
{%- endif %}
{%- if server.get('enabled', False) %}
{%- endif %}
{%- if server.get('enabled', False) %}
-{%- raw %}
- ElasticsearchInfo:
+ ElasticsearchServiceDown:
if: >-
elasticsearch_up{host=~'.*'} == 0
if: >-
elasticsearch_up{host=~'.*'} == 0
service: elasticsearch
annotations:
service: elasticsearch
annotations:
- summary: 'Elasticsearch service is down'
- description: 'Elasticsearch service is down on node {{ $labels.host }}'
- ElasticsearchWarning:
+ summary: "Elasticsearch service is down"
+ description: "The Elasticsearch service on the {{ $labels.host }} node is down."
+ {%- endraw %}
+ ElasticsearchServiceDownMinor:
- count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {% endraw %} {{ monitoring.service_failed_warning_threshold_percent }} {% raw %}
+ count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {{ monitoring.service_failed_warning_threshold_percent }}
+ {%- raw %}
+ for: 2m
service: elasticsearch
annotations:
service: elasticsearch
annotations:
- summary: 'More than {%- endraw %} {{monitoring.service_failed_warning_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down'
- description: 'More than {%- endraw %} {{monitoring.service_failed_warning_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down'
- ElasticsearchCritical:
+ summary: "{%- endraw %}{{monitoring.service_failed_warning_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down"
+ description: "{{ $value }} Elasticsearch services are down for at least 2 minutes."
+ {%- endraw %}
+ ElasticsearchServiceDownMajor:
- count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {% endraw %} {{ monitoring.service_failed_critical_threshold_percent }} {% raw %}
+ count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {{ monitoring.service_failed_critical_threshold_percent }}
+ {%- raw %}
+ for: 2m
service: elasticsearch
annotations:
service: elasticsearch
annotations:
- summary: 'More than {%- endraw %} {{monitoring.service_failed_critical_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down'
- description: 'More than {%- endraw %} {{monitoring.service_failed_critical_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down'
- ElasticsearchDown:
+ summary: "{%- endraw %}{{monitoring.service_failed_critical_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down"
+ description: "{{ $value }} Elasticsearch services are down for at least 2 minutes."
+ {%- endraw %}
+ ElasticsearchServiceOutage:
if: >-
count(elasticsearch_up{host=~'.*'} == 0) == count(elasticsearch_up{host=~'.*'})
if: >-
count(elasticsearch_up{host=~'.*'} == 0) == count(elasticsearch_up{host=~'.*'})
service: elasticsearch
annotations:
service: elasticsearch
annotations:
- summary: 'All Elasticsearch services are down'
- description: 'All Elasticsearch services are down'
- ElasticsearchClusterDiskLowWaterMark:
+ summary: "Elasticsearch cluster outage"
+ description: "All Elasticsearch services within the cluster are down."
+ {%- endraw %}
+ ElasticsearchDiskWaterMarkMinor:
- (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 85
+ (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= {{monitoring.service_disk_space_watermark_minor_threshold_percent * 100}}
+ {%- raw %}
service: elasticsearch
annotations:
service: elasticsearch
annotations:
- summary: 'Elasticsearch low disk watermark [85%] exceeded on node {{ $labels.host}} instance {{ $labels.instance }}'
- description: >-
- Elasticsearch will not allocate new shards to node {{ $labels.host }}
- ElasticsearchClusterDiskHighWaterMark:
+ summary: "Elasticsearch uses {%- endraw %}{{monitoring.service_disk_space_watermark_minor_threshold_percent*100}}%{%- raw %} of disk space"
+ description: "Elasticsearch uses {{ $value }}% of disk space on the '{{ $labels.instance }}' instance on the {{ $labels.host }} node for at least 5 minutes."
+ {%- endraw %}
+ ElasticsearchDiskWaterMarkMajor:
- (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 90
+ (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= {{monitoring.service_disk_space_watermark_major_threshold_percent * 100}}
+ {%- raw %}
- annotations:
- summary: 'Elasticsearch high disk watermark [90%] exceeded on node {{ $labels.host}} instance {{ $labels.instance }}'
- description: >-
- Elasticsearch will not allocate new shards to node {{ $labels.host }} and will attempt to relocate shards to another node
+ annotations:
+ summary: "Elasticsearch uses {%- endraw %}{{monitoring.service_disk_space_watermark_major_threshold_percent*100}}%{%- raw %} of disk space"
+ description: "Elasticsearch uses {{ $value }}% of disk space on the '{{ $labels.instance }}' instance on the {{ $labels.host }} node for at least 5 minutes."
+ {%- endraw %}
{%- endif %}
{%- endif %}
{%- endif %}
{%- endif %}