X-Git-Url: https://gerrit.mcp.mirantis.com/gitweb?p=salt-formulas%2Felasticsearch.git;a=blobdiff_plain;f=elasticsearch%2Fmeta%2Fprometheus.yml;h=46ab505612c1af82e8692b94f25b06250b1a6615;hp=464111d1feb7d32f57ffe58f84a08944b1222af3;hb=694a03ca0d12290a20fca5ac74053234e373842c;hpb=85c27aa47cd904d8b487a169d0e56b4c268b9259

diff --git a/elasticsearch/meta/prometheus.yml b/elasticsearch/meta/prometheus.yml
index 464111d..46ab505 100644
--- a/elasticsearch/meta/prometheus.yml
+++ b/elasticsearch/meta/prometheus.yml
@@ -1,58 +1,108 @@
-{%- if pillar.elasticsearch.server is defined %}
-{% raw %}
+{%- if pillar.elasticsearch.server is defined or pillar.elasticsearch.client is defined %}
+{%- from "elasticsearch/map.jinja" import server, client, monitoring with context %}
+
 server:
   alert:
-    ElasticsearchDown:
+{%- if client.get('enabled', False) %}
+    ElasticsearchClusterHealthStatusMajor:
+      if: >-
+        elasticsearch_cluster_health_status == 2
+      {%- raw %}
+      for: 2m
+      labels:
+        severity: major
+        service: elasticsearch
+      annotations:
+        summary: "Elasticsearch cluster status is YELLOW"
+        description: "The Elasticsearch cluster status is YELLOW for 2 minutes."
+      {%- endraw %}
+    ElasticsearchClusterHealthStatusCritical:
+      if: >-
+        elasticsearch_cluster_health_status == 3
+      {%- raw %}
+      for: 2m
+      labels:
+        severity: critical
+        service: elasticsearch
+      annotations:
+        summary: "Elasticsearch cluster status is RED"
+        description: "The Elasticsearch cluster status is RED for 2 minutes."
+      {%- endraw %}
+{%- endif %}
+{%- if server.get('enabled', False) %}
+    ElasticsearchServiceDown:
+      if: >-
+        elasticsearch_up{host=~'.*'} == 0
+      {%- raw %}
+      labels:
+        severity: minor
+        service: elasticsearch
+      annotations:
+        summary: "Elasticsearch service is down"
+        description: "The Elasticsearch service on the {{ $labels.host }} node is down."
+      {%- endraw %}
+    ElasticsearchServiceDownMinor:
       if: >-
-        elasticsearch_up != 1
+        count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {{ monitoring.service_failed_warning_threshold_percent }}
+      {%- raw %}
+      for: 2m
       labels:
-        severity: warning
+        severity: minor
         service: elasticsearch
       annotations:
-        summary: 'Elasticsearch service down'
-        description: 'Elasticsearch service is down on node {{ $labels.host }}'
-    ElasticsearchClusterHealthStatusYellow:
+        summary: "{%- endraw %}{{monitoring.service_failed_warning_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down"
+        description: "{{ $value }} Elasticsearch services are down for 2 minutes."
+      {%- endraw %}
+    ElasticsearchServiceDownMajor:
       if: >-
-        max_over_time(elasticsearch_cluster_health_status[5m]) == 2
+        count(elasticsearch_up{host=~'.*'} == 0) >= count(elasticsearch_up{host=~'.*'}) * {{ monitoring.service_failed_critical_threshold_percent }}
+      {%- raw %}
+      for: 2m
       labels:
-        severity: warning
+        severity: major
         service: elasticsearch
       annotations:
-        summary: Elasticsearch cluster status is YELLOW
-        description: >-
-          The Elasticsearch cluster status is YELLOW for the last 5 minutes.
-    ElasticsearchClusterHealthStatusRed:
+        summary: "{%- endraw %}{{monitoring.service_failed_critical_threshold_percent*100}}%{%- raw %} of Elasticsearch services are down"
+        description: "{{ $value }} Elasticsearch services are down for 2 minutes."
+      {%- endraw %}
+    ElasticsearchServiceOutage:
       if: >-
-        max_over_time(elasticsearch_cluster_health_status[5m]) == 3
+        count(elasticsearch_up{host=~'.*'} == 0) == count(elasticsearch_up{host=~'.*'})
+      {%- raw %}
       labels:
         severity: critical
         service: elasticsearch
       annotations:
-        summary: 'Elasticsearch cluster status is RED'
-        description: >-
-          The Elasticsearch cluster status is RED for the last 5 minutes.
-    ElasticsearchClusterDiskLowWaterMark:
+        summary: "Elasticsearch cluster outage"
+        description: "All Elasticsearch services within the cluster are down."
+      {%- endraw %}
+    # The watermark threshold is a fraction (the summary multiplies it by 100), so the
+    # usage ratio is scaled by 100 to make $value a real percentage in the description.
+    ElasticsearchDiskWaterMarkMinor:
       if: >-
-        (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 85
+        (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= {{monitoring.service_disk_space_watermark_minor_threshold_percent*100}}
+      {%- raw %}
       for: 5m
       labels:
-        severity: warning
+        severity: minor
         service: elasticsearch
       annotations:
-        summary: 'Elasticsearch low disk watermark [85%] exceeded on node {{ $labels.host}} instance {{ $labels.instance }}'
-        description: >-
-          Elasticsearch will not allocate new shards to node {{ $labels.host }}
-
-    ElasticsearchClusterDiskHighWaterMark:
+        summary: "Elasticsearch uses {%- endraw %} {{monitoring.service_disk_space_watermark_minor_threshold_percent*100}}%{%- raw %} of disk space"
+        description: "The Elasticsearch '{{ $labels.instance }}' instance uses {{ $value }}% of disk space on the {{ $labels.host }} node for 5 minutes."
+      {%- endraw %}
+    # The watermark threshold is a fraction (the summary multiplies it by 100), so the
+    # usage ratio is scaled by 100 to make $value a real percentage in the description.
+    ElasticsearchDiskWaterMarkMajor:
       if: >-
-        (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= 90
+        (max(elasticsearch_fs_total_total_in_bytes) by (host, instance) - max(elasticsearch_fs_total_available_in_bytes) by (host, instance)) / max(elasticsearch_fs_total_total_in_bytes) by (host, instance) * 100.0 >= {{monitoring.service_disk_space_watermark_major_threshold_percent*100}}
+      {%- raw %}
       for: 5m
-      annotations:
-        summary: 'Elasticsearch high disk watermark [90%] exceeded on node {{ $labels.host}} instance {{ $labels.instance }}'
-        description: >-
-          Elasticsearch will not allocate new shards to node {{ $labels.host }} and will attempt to relocate shards to another node
       labels:
-        severity: critical
+        severity: major
         service: elasticsearch
-{% endraw %}
+      annotations:
+        summary: "Elasticsearch uses {%- endraw %} {{monitoring.service_disk_space_watermark_major_threshold_percent*100}}%{%- raw %} of disk space"
+        description: "The Elasticsearch '{{ $labels.instance }}' instance uses {{ $value }}% of disk space on the {{ $labels.host }} node for 5 minutes."
+      {%- endraw %}
+{%- endif %}
 {%- endif %}