X-Git-Url: https://gerrit.mcp.mirantis.com/gitweb?p=salt-formulas%2Felasticsearch.git;a=blobdiff_plain;f=elasticsearch%2Fmeta%2Fprometheus.yml;h=f6889916c06a2444bedceb334f70b6baecf2def2;hp=f0aa983b0d997c2ef7bbf3498220598f7a9edcab;hb=92b11253e83f785e42c5b65dce57c4f29eb1fe9c;hpb=29fb4553518f6471f322f0704d5c5fd23375afe0

Review notes (free-form commentary placed before the "diff --git" line):
  * Line breaks and indentation were restored from a whitespace-collapsed copy
    of this patch. Indentation is inferred from the YAML nesting, and blank
    context lines may have been lost, so re-check the hunk line counts before
    applying.
  * ElasticsearchDiskWaterMarkMinor/Major: the summaries now interpolate the
    disk-space watermark thresholds instead of the service-failure thresholds,
    and a space follows "{%- endraw %}" because its "-" strips the whitespace
    that precedes it ("uses85%" otherwise).
  * ElasticsearchDiskWaterMarkMinor/Major: both sides of the comparison are
    scaled by 100 so that "{{ $value }}%" in the description is a percentage
    rather than a 0..1 ratio. This assumes the thresholds are fractions, as
    the "*100" in the summaries implies.

diff --git a/elasticsearch/meta/prometheus.yml b/elasticsearch/meta/prometheus.yml
index f0aa983..f688991 100644
--- a/elasticsearch/meta/prometheus.yml
+++ b/elasticsearch/meta/prometheus.yml
@@ -1,66 +1,104 @@
 {%- if pillar.elasticsearch.server is defined or pillar.elasticsearch.client is defined %}
-{%- from "elasticsearch/map.jinja" import server, client with context %}
+{%- from "elasticsearch/map.jinja" import server, client, monitoring with context %}
 server:
   alert:
 {%- if client.get('enabled', False) %}
-{%- raw %}
-    ElasticsearchClusterHealthStatusYellow:
+    ElasticsearchClusterHealthStatusMajor:
       if: >-
         elasticsearch_cluster_health_status == 2
+      {%- raw %}
+      for: 2m
       labels:
-        severity: warning
+        severity: major
         service: elasticsearch
       annotations:
-        summary: Elasticsearch cluster status is YELLOW
-        description: >-
-          The Elasticsearch cluster status is YELLOW for the last 5 minutes.
-    ElasticsearchClusterHealthStatusRed:
+        summary: "Elasticsearch cluster status is YELLOW"
+        description: "The Elasticsearch cluster status is YELLOW for at least 2 minutes."
+      {%- endraw %}
+    ElasticsearchClusterHealthStatusCritical:
       if: >-
         elasticsearch_cluster_health_status == 3
+      {%- raw %}
+      for: 2m
       labels:
         severity: critical
         service: elasticsearch
       annotations:
-        summary: 'Elasticsearch cluster status is RED'
-        description: >-
-          The Elasticsearch cluster status is RED for the last 5 minutes.
-{%- endraw %}
+        summary: "Elasticsearch cluster status is RED"
+        description: "The Elasticsearch cluster status is RED for at least 2 minutes."
+      {%- endraw %}
 {%- endif %}
 {%- if server.get('enabled', False) %}
-{%- raw %}
-    ElasticsearchDown:
+    ElasticsearchServiceDown:
       if: >-
-        elasticsearch_up{host=~'.*'} != 1
+        elasticsearch_up{host=~'.*'} == 0
+      {%- raw %}
       labels:
-        severity: warning
+        severity: minor
         service: elasticsearch
       annotations:
-        summary: 'Elasticsearch service down'
-        description: 'Elasticsearch service is down on node {{ $labels.host }}'
-    ElasticsearchClusterDiskLowWaterMark:
+        summary: "Elasticsearch service is down"
+        description: "The Elasticsearch service on the {{ $labels.host }} node is down."
+      {%- endraw %}
+    ElasticsearchServiceDownMinor:
       if: >-
-        (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 85
-      for: 5m
+        count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {{ monitoring.service_failed_warning_threshold_percent }}
+      {%- raw %}
+      for: 2m
       labels:
-        severity: warning
+        severity: minor
         service: elasticsearch
       annotations:
-        summary: 'Elasticsearch low disk watermark [85%] exceeded on node {{ $labels.host}} instance {{ $labels.instance }}'
-        description: >-
-          Elasticsearch will not allocate new shards to node {{ $labels.host }}
-
-    ElasticsearchClusterDiskHighWaterMark:
+        summary: "{%- endraw %}{{monitoring.service_failed_warning_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down"
+        description: "{{ $value }} Elasticsearch services are down for at least 2 minutes."
+      {%- endraw %}
+    ElasticsearchServiceDownMajor:
       if: >-
-        (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 90
-      for: 5m
+        count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {{ monitoring.service_failed_critical_threshold_percent }}
+      {%- raw %}
+      for: 2m
+      labels:
+        severity: major
+        service: elasticsearch
       annotations:
-        summary: 'Elasticsearch high disk watermark [90%] exceeded on node {{ $labels.host}} instance {{ $labels.instance }}'
-        description: >-
-          Elasticsearch will not allocate new shards to node {{ $labels.host }} and will attempt to relocate shards to another node
+        summary: "{%- endraw %}{{monitoring.service_failed_critical_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down"
+        description: "{{ $value }} Elasticsearch services are down for at least 2 minutes."
+      {%- endraw %}
+    ElasticsearchServiceOutage:
+      if: >-
+        count(elasticsearch_up{host=~'.*'} == 0) == count(elasticsearch_up{host=~'.*'})
+      {%- raw %}
       labels:
         severity: critical
         service: elasticsearch
-{%- endraw %}
+      annotations:
+        summary: "Elasticsearch cluster outage"
+        description: "All Elasticsearch services within the cluster are down."
+      {%- endraw %}
+    ElasticsearchDiskWaterMarkMinor:
+      if: >-
+        (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100 >= {{monitoring.service_disk_space_watermark_minor_threshold_percent}} * 100
+      {%- raw %}
+      for: 5m
+      labels:
+        severity: minor
+        service: elasticsearch
+      annotations:
+        summary: "Elasticsearch uses {%- endraw %} {{monitoring.service_disk_space_watermark_minor_threshold_percent*100}}%{%- raw %} of disk space"
+        description: "Elasticsearch uses {{ $value }}% of disk space on the '{{ $labels.instance }}' instance on the {{ $labels.host }} node for at least 5 minutes."
+      {%- endraw %}
+    ElasticsearchDiskWaterMarkMajor:
+      if: >-
+        (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100 >= {{monitoring.service_disk_space_watermark_major_threshold_percent}} * 100
+      {%- raw %}
+      for: 5m
+      labels:
+        severity: major
+        service: elasticsearch
+      annotations:
+        summary: "Elasticsearch uses {%- endraw %} {{monitoring.service_disk_space_watermark_major_threshold_percent*100}}%{%- raw %} of disk space"
+        description: "Elasticsearch uses {{ $value }}% of disk space on the '{{ $labels.instance }}' instance on the {{ $labels.host }} node for at least 5 minutes."
+      {%- endraw %}
 {%- endif %}
 {%- endif %}