From: Mateusz Matuszkowiak
Date: Fri, 11 May 2018 12:19:09 +0000 (+0200)
Subject: Alerts rationalization for Elasticsearch
X-Git-Url: https://gerrit.mcp.mirantis.com/gitweb?p=salt-formulas%2Felasticsearch.git;a=commitdiff_plain;h=92b11253e83f785e42c5b65dce57c4f29eb1fe9c

Alerts rationalization for Elasticsearch

Rename the Elasticsearch Prometheus alerts and move their severities to
the minor/major/critical scale. Cluster-health and partial service-down
alerts now have to hold for 2 minutes ("for: 2m") before firing.

The disk watermark thresholds are no longer hard-coded (85/90). They are
read from the "monitoring" map as fractions:

  service_disk_space_watermark_minor_threshold_percent  (default 0.6)
  service_disk_space_watermark_major_threshold_percent  (default 0.75)

Both can be overridden through the elasticsearch:monitoring pillar.

Change-Id: I1ab91bb31e622dd2cfafa6c6f16895700727d286
Closes-Bug: PROD-19796
---
Review notes (ignored when the patch is applied):
 * The disk watermark summaries printed the service_failed_* thresholds;
   they now print the disk watermark thresholds the rules compare against.
 * The disk usage ratio is multiplied by 100 again (and the threshold is
   scaled to match) so that "{{ $value }}%" in the description is a real
   percentage instead of a 0..1 ratio.
 * "{% endraw %}" (no "-") is used in those summaries so the space after
   "uses" is not trimmed from the rendered text.
 * The "index" blob ids below are the ones from the original change and do
   not reflect the amended content.

diff --git a/elasticsearch/map.jinja b/elasticsearch/map.jinja
index 781c018..930a2f9 100644
--- a/elasticsearch/map.jinja
+++ b/elasticsearch/map.jinja
@@ -44,5 +44,7 @@ RedHat:
   'default': {
     'service_failed_warning_threshold_percent': 0.3,
     'service_failed_critical_threshold_percent': 0.6,
+    'service_disk_space_watermark_minor_threshold_percent': 0.6,
+    'service_disk_space_watermark_major_threshold_percent': 0.75,
   },
 }, grain='os_family', merge=salt['pillar.get']('elasticsearch:monitoring')) %}
diff --git a/elasticsearch/meta/prometheus.yml b/elasticsearch/meta/prometheus.yml
index 0e4d7e0..f688991 100644
--- a/elasticsearch/meta/prometheus.yml
+++ b/elasticsearch/meta/prometheus.yml
@@ -4,89 +4,102 @@
 server:
   alert:
 {%- if client.get('enabled', False) %}
-{%- raw %}
-    ElasticsearchClusterHealthStatusYellow:
+    ElasticsearchClusterHealthStatusMajor:
       if: >-
         elasticsearch_cluster_health_status == 2
+      {%- raw %}
+      for: 2m
       labels:
-        severity: warning
+        severity: major
         service: elasticsearch
       annotations:
-        summary: Elasticsearch cluster status is YELLOW
-        description: >-
-          The Elasticsearch cluster status is YELLOW for the last 5 minutes.
-    ElasticsearchClusterHealthStatusRed:
+        summary: "Elasticsearch cluster status is YELLOW"
+        description: "The Elasticsearch cluster status is YELLOW for at least 2 minutes."
+      {%- endraw %}
+    ElasticsearchClusterHealthStatusCritical:
       if: >-
         elasticsearch_cluster_health_status == 3
+      {%- raw %}
+      for: 2m
       labels:
         severity: critical
         service: elasticsearch
       annotations:
-        summary: 'Elasticsearch cluster status is RED'
-        description: >-
-          The Elasticsearch cluster status is RED for the last 5 minutes.
-{%- endraw %}
+        summary: "Elasticsearch cluster status is RED"
+        description: "The Elasticsearch cluster status is RED for at least 2 minutes."
+      {%- endraw %}
 {%- endif %}
 {%- if server.get('enabled', False) %}
-{%- raw %}
-    ElasticsearchInfo:
+    ElasticsearchServiceDown:
       if: >-
         elasticsearch_up{host=~'.*'} == 0
+      {%- raw %}
       labels:
-        severity: info
+        severity: minor
         service: elasticsearch
       annotations:
-        summary: 'Elasticsearch service is down'
-        description: 'Elasticsearch service is down on node {{ $labels.host }}'
-    ElasticsearchWarning:
+        summary: "Elasticsearch service is down"
+        description: "The Elasticsearch service on the {{ $labels.host }} node is down."
+      {%- endraw %}
+    ElasticsearchServiceDownMinor:
       if: >-
-        count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {% endraw %} {{ monitoring.service_failed_warning_threshold_percent }} {% raw %}
+        count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {{ monitoring.service_failed_warning_threshold_percent }}
+      {%- raw %}
+      for: 2m
       labels:
-        severity: warning
+        severity: minor
         service: elasticsearch
       annotations:
-        summary: 'More than {%- endraw %} {{monitoring.service_failed_warning_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down'
-        description: 'More than {%- endraw %} {{monitoring.service_failed_warning_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down'
-    ElasticsearchCritical:
+        summary: "{%- endraw %}{{monitoring.service_failed_warning_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down"
+        description: "{{ $value }} Elasticsearch services are down for at least 2 minutes."
+      {%- endraw %}
+    ElasticsearchServiceDownMajor:
       if: >-
-        count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {% endraw %} {{ monitoring.service_failed_critical_threshold_percent }} {% raw %}
+        count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {{ monitoring.service_failed_critical_threshold_percent }}
+      {%- raw %}
+      for: 2m
       labels:
-        severity: critical
+        severity: major
         service: elasticsearch
       annotations:
-        summary: 'More than {%- endraw %} {{monitoring.service_failed_critical_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down'
-        description: 'More than {%- endraw %} {{monitoring.service_failed_critical_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down'
-    ElasticsearchDown:
+        summary: "{%- endraw %}{{monitoring.service_failed_critical_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down"
+        description: "{{ $value }} Elasticsearch services are down for at least 2 minutes."
+      {%- endraw %}
+    ElasticsearchServiceOutage:
       if: >-
         count(elasticsearch_up{host=~'.*'} == 0) == count(elasticsearch_up{host=~'.*'})
+      {%- raw %}
       labels:
-        severity: down
+        severity: critical
         service: elasticsearch
       annotations:
-        summary: 'All Elasticsearch services are down'
-        description: 'All Elasticsearch services are down'
-    ElasticsearchClusterDiskLowWaterMark:
+        summary: "Elasticsearch cluster outage"
+        description: "All Elasticsearch services within the cluster are down."
+      {%- endraw %}
+    {#- Disk usage is scaled to percent so that $value matches the "%" printed in the annotations. #}
+    ElasticsearchDiskWaterMarkMinor:
       if: >-
-        (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 85
+        (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100 >= {{ monitoring.service_disk_space_watermark_minor_threshold_percent * 100 }}
+      {%- raw %}
       for: 5m
       labels:
-        severity: warning
+        severity: minor
         service: elasticsearch
       annotations:
-        summary: 'Elasticsearch low disk watermark [85%] exceeded on node {{ $labels.host}} instance {{ $labels.instance }}'
-        description: >-
-          Elasticsearch will not allocate new shards to node {{ $labels.host }}
-    ElasticsearchClusterDiskHighWaterMark:
+        summary: "Elasticsearch uses {% endraw %}{{ monitoring.service_disk_space_watermark_minor_threshold_percent * 100 }}%{%- raw %} of disk space"
+        description: "Elasticsearch uses {{ $value }}% of disk space on the '{{ $labels.instance }}' instance on the {{ $labels.host }} node for at least 5 minutes."
+      {%- endraw %}
+    ElasticsearchDiskWaterMarkMajor:
       if: >-
-        (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 90
+        (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100 >= {{ monitoring.service_disk_space_watermark_major_threshold_percent * 100 }}
+      {%- raw %}
       for: 5m
-      annotations:
-        summary: 'Elasticsearch high disk watermark [90%] exceeded on node {{ $labels.host}} instance {{ $labels.instance }}'
-        description: >-
-          Elasticsearch will not allocate new shards to node {{ $labels.host }} and will attempt to relocate shards to another node
       labels:
-        severity: critical
+        severity: major
         service: elasticsearch
-{%- endraw %}
+      annotations:
+        summary: "Elasticsearch uses {% endraw %}{{ monitoring.service_disk_space_watermark_major_threshold_percent * 100 }}%{%- raw %} of disk space"
+        description: "Elasticsearch uses {{ $value }}% of disk space on the '{{ $labels.instance }}' instance on the {{ $labels.host }} node for at least 5 minutes."
+      {%- endraw %}
 {%- endif %}
 {%- endif %}