{#- Source blob: d08b38584040ed508aa5fb06fcea84ada57c8da7 -#}
{%- if pillar.opencontrail is defined %}
{%- from "opencontrail/map.jinja" import control, collector, compute, config, database, web, monitoring, version with context %}
{#-
  JMX exporter registration for Cassandra. Rendered only when the database
  role is enabled, its cassandra key is truthy and exporters is defined.
  packages and template are taken from exporters.jmx when present there,
  otherwise from the defaults given in the two set statements below.
#}
{%- if database.get('enabled', False) and
database.get('cassandra', False) and
exporters is defined %}
{%- set packages = exporters.get('jmx', {}).get('packages', ('jmx-exporter', )) %}
{%- set template = exporters.get('jmx', {}).get('template', 'opencontrail/files/jmx-exporter-default') %}
{#- Both names are assigned just above, so the two "is defined" guards inside the block always pass. #}
{%- load_yaml as new_exporters_cfg %}
exporters:
jmx:
enabled: true
{%- if packages is defined %}
packages:
{%- for pkg in packages %}
- {{ pkg }}
{%- endfor %}
{%- endif %}
{%- if template is defined %}
template: {{ template }}
{%- endif %}
services:
cassandra:
enabled: true
template: opencontrail/files/jmx_exporter_cassandra.conf
jmx_bind:
address: localhost
{#-
  JMX port: 7199 before version 4.0; from 4.0 on, 7198 for database type
  control or cluster and 7199 for analytics.
  NOTE(review): from 4.0 on, any other (or unset) type emits no port key at
  all - confirm that this is intended.
#}
{%- if version < 4.0 %}
port: 7199
{%- else %}
{%- if database.get('type') == 'control' or database.get('type') == 'cluster' %}
port: 7198
{%- elif database.get('type') == 'analytics' %}
port: 7199
{%- endif %}
{%- endif %}
bind:
address: 0.0.0.0
port: 9111
{%- endload %}
{#- Emit the mapping parsed by load_yaml above through the yaml filter. #}
{{ new_exporters_cfg|yaml(False) }}
{%- endif %}
server:
alert:
# ContrailApiDown: http_response_status is 0 for an endpoint whose name matches
# "contrail.*", sustained for 2 minutes (severity minor).
ContrailApiDown:
if: >-
http_response_status{name=~"contrail.*"} == 0
{%- raw %}
for: 2m
labels:
severity: minor
service: contrail
annotations:
summary: "{{ $labels.name }} API endpoint is not accessible"
description: "The {{ $labels.name }} API endpoint on the {{ $labels.host }} node is not accessible for 2 minutes."
{%- endraw %}
# ContrailApiDownMinor: per endpoint name, the count of "contrail.*" endpoints
# with http_response_status == 0 is at least
# monitoring.services_failed_warning_threshold_percent times the total count,
# for 2 minutes (severity minor). The annotations print the threshold times 100.
ContrailApiDownMinor:
if: >-
count(http_response_status{name=~"contrail.*"} == 0) by (name) >= count(http_response_status{name=~"contrail.*"}) by (name) *{{ monitoring.services_failed_warning_threshold_percent }}
{%- raw %}
for: 2m
labels:
severity: minor
service: contrail
annotations:
summary: "{%- endraw %}{{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.name }} API endpoints are not accessible"
description: "{{ $value }} {{ $labels.name }} API endpoints (>= {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %}) are not accessible for 2 minutes."
{%- endraw %}
# ContrailApiDownMajor: per endpoint name, the count of "contrail.*" endpoints
# with http_response_status == 0 is at least
# monitoring.services_failed_critical_threshold_percent times the total count,
# for 2 minutes (severity major). The annotations print the threshold times 100.
# The summary starts directly with the percentage (no leading blank), matching
# the ContrailApiDownMinor summary.
ContrailApiDownMajor:
if: >-
count(http_response_status{name=~"contrail.*"} == 0) by (name) >= count(http_response_status{name=~"contrail.*"}) by (name) *{{ monitoring.services_failed_critical_threshold_percent }}
{%- raw %}
for: 2m
labels:
severity: major
service: contrail
annotations:
summary: "{%- endraw %}{{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.name }} API endpoints are not accessible"
description: "{{ $value }} {{ $labels.name }} API endpoints (>= {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %}) are not accessible for 2 minutes."
{%- endraw %}
# ContrailApiOutage: per endpoint name, the count of "contrail.*" endpoints
# with http_response_status == 0 equals the total count, for 2 minutes
# (severity critical).
ContrailApiOutage:
if: >-
count(http_response_status{name=~"contrail.*"} == 0) by (name) == count(http_response_status{name=~"contrail.*"}) by (name)
{%- raw %}
for: 2m
labels:
severity: critical
service: contrail
annotations:
summary: "{{ $labels.name }} API outage"
description: "The {{ $labels.name }} API is not accessible for all available endpoints for 2 minutes."
{%- endraw %}
# ContrailProcessDown: procstat_running is 0 for a process whose name matches
# "contrail.*" (severity minor). No "for" duration is set on this alert.
ContrailProcessDown:
if: >-
procstat_running{process_name=~"contrail.*"} == 0
labels:
severity: minor
service: contrail
annotations:
{%- raw %}
summary: "{{ $labels.process_name }} process is down"
description: "The {{ $labels.process_name }} process on the {{ $labels.host }} node is down."
{%- endraw %}
# ContrailProcessDownMinor: per process_name, the count of "contrail.*"
# processes with procstat_running == 0 is at least
# monitoring.services_failed_warning_threshold_percent times the total count
# (severity minor). The annotations print the threshold times 100.
ContrailProcessDownMinor:
if: >-
count(procstat_running{process_name=~"contrail.*"} == 0) by (process_name) >= {{ monitoring.services_failed_warning_threshold_percent }}*count(procstat_running{process_name=~"contrail.*"}) by (process_name)
labels:
severity: minor
service: contrail
annotations:
{%- raw %}
summary: "{%- endraw %}{{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.process_name }} processes are down"
description: "{{ $value }} {{ $labels.process_name }} processes (>= {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %}) are down."
{%- endraw %}
# ContrailProcessDownMajor: per process_name, the count of "contrail.*"
# processes with procstat_running == 0 is at least
# monitoring.services_failed_critical_threshold_percent times the total count
# (severity major). The annotations print the threshold times 100.
ContrailProcessDownMajor:
if: >-
count(procstat_running{process_name=~"contrail.*"} == 0) by (process_name) >= {{ monitoring.services_failed_critical_threshold_percent }}*count(procstat_running{process_name=~"contrail.*"}) by (process_name)
labels:
severity: major
service: contrail
annotations:
{%- raw %}
summary: "{%- endraw %}{{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.process_name }} processes are down"
description: "{{ $value }} {{ $labels.process_name }} processes (>= {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %}) are down."
{%- endraw %}
# ContrailProcessOutage: per process_name, the count of "contrail.*" processes
# with procstat_running == 0 equals the total count (severity critical).
# The expression aggregates by process_name, so that is the only label left on
# the result; the summary therefore references process_name, not name.
# NOTE: the raw section opened under annotations is not closed in this alert;
# the alerts that follow depend on it staying open.
ContrailProcessOutage:
if: >-
count(procstat_running{process_name=~"contrail.*"} == 0) by (process_name) == count(procstat_running{process_name=~"contrail.*"}) by (process_name)
labels:
severity: critical
service: contrail
annotations:
{%- raw %}
summary: "{{ $labels.process_name }} service outage"
description: "All {{ $labels.process_name }} processes are down."
# ContrailBGPSessionsNoEstablished: the per-host maximum of
# contrail_bgp_session_count is 0 for 2 minutes (severity warning).
ContrailBGPSessionsNoEstablished:
if: >-
max(contrail_bgp_session_count) by (host) == 0
for: 2m
labels:
severity: warning
service: contrail
annotations:
summary: "No established OpenContrail BGP sessions"
description: "There are no established OpenContrail BGP sessions on the {{ $labels.host }} node for 2 minutes."
# ContrailBGPSessionsNoActive: the per-host maximum of
# contrail_bgp_session_up_count is 0 for 2 minutes (severity warning).
ContrailBGPSessionsNoActive:
if: >-
max(contrail_bgp_session_up_count) by (host) == 0
for: 2m
labels:
severity: warning
service: contrail
annotations:
summary: "No active OpenContrail BGP sessions"
description: "There are no active OpenContrail BGP sessions on the {{ $labels.host }} node for 2 minutes."
# ContrailBGPSessionsDown: the per-host minimum of
# contrail_bgp_session_down_count is above 0 for 2 minutes (severity warning).
ContrailBGPSessionsDown:
if: >-
min(contrail_bgp_session_down_count) by (host) > 0
for: 2m
labels:
severity: warning
service: contrail
annotations:
summary: "OpenContrail BGP sessions are down"
description: "{{ $value }} OpenContrail BGP sessions on the {{ $labels.host }} node are down for 2 minutes."
# ContrailXMPPSessionsMissingEstablished: twice the number of
# contrail_vrouter_xmpp series exceeds the sum of contrail_xmpp_session_up_count
# for 2 minutes (severity warning). The alert value is that difference.
ContrailXMPPSessionsMissingEstablished:
if: >-
count(contrail_vrouter_xmpp) * 2 - sum(contrail_xmpp_session_up_count) > 0
for: 2m
labels:
severity: warning
service: contrail
annotations:
summary: "Missing established OpenContrail XMPP sessions"
description: "{{ $value }} established OpenContrail XMPP sessions are missing on the compute cluster for 2 minutes."
# ContrailXMPPSessionsMissing: twice the number of contrail_vrouter_xmpp series
# exceeds the sum of contrail_xmpp_session_count for 2 minutes (severity
# warning). The alert value is that difference.
ContrailXMPPSessionsMissing:
if: >-
count(contrail_vrouter_xmpp) * 2 - sum(contrail_xmpp_session_count) > 0
for: 2m
labels:
severity: warning
service: contrail
annotations:
summary: "Missing OpenContrail XMPP sessions"
description: "{{ $value }} OpenContrail XMPP sessions are missing on the compute cluster for 2 minutes."
# ContrailXMPPSessionsDown: the per-host minimum of
# contrail_xmpp_session_down_count is above 0 for 2 minutes (severity warning).
ContrailXMPPSessionsDown:
if: >-
min(contrail_xmpp_session_down_count) by (host) > 0
for: 2m
labels:
severity: warning
service: contrail
annotations:
summary: "OpenContrail XMPP sessions are down"
description: "{{ $value }} OpenContrail XMPP sessions on the {{ $labels.host }} node are down for 2 minutes."
# ContrailXMPPSessionsTooHigh: the per-host minimum of
# contrail_xmpp_session_count is at least
# monitoring.xmpp_sessions_too_high_threshold for 2 minutes (severity warning).
# The raw section left open by the preceding alerts is closed right after the
# alert name so the threshold can be rendered.
ContrailXMPPSessionsTooHigh:
{%- endraw %}
if: >-
{%- set xmpp_toohigh_threshold = monitoring.xmpp_sessions_too_high_threshold %}
min(contrail_xmpp_session_count) by (host) >= {{ xmpp_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
severity: warning
service: contrail
annotations:
summary: "OpenContrail XMPP sessions reached the limit of {%- endraw %} {{ xmpp_toohigh_threshold }}{%- raw %}"
description: "{{ $value }} OpenContrail XMPP sessions on the {{ $labels.host }} node are open for 2 minutes."
{%- endraw %}
# ContrailXMPPSessionsChangesTooHigh: the absolute delta of
# contrail_xmpp_session_count over 2 minutes is at least
# monitoring.xmpp_sessions_variation_threshold (severity warning, no "for").
# The left-trimming end-of-raw tag in the summary removes the blank before it,
# so a blank is kept after the tag to separate "of" from the threshold value
# (same form as the other "reached the limit of" summaries).
# NOTE: the raw section opened after the expression is not closed in this
# alert; the next alert depends on it staying open.
ContrailXMPPSessionsChangesTooHigh:
if: >-
{%- set xmpp_variation_threshold = monitoring.xmpp_sessions_variation_threshold %}
abs(delta(contrail_xmpp_session_count[2m])) >= {{ xmpp_variation_threshold }}
{%- raw %}
labels:
severity: warning
service: contrail
annotations:
summary: "OpenContrail XMPP sessions changes reached the limit of {%- endraw %} {{ xmpp_variation_threshold }}{%- raw %}"
description: "The OpenContrail XMPP sessions on the {{ $labels.host }} node have changed {{ $value }} times."
# ContrailVrouterXMPPSessionsZero: the per-host minimum of
# contrail_vrouter_xmpp is 0 for 2 minutes (severity warning).
ContrailVrouterXMPPSessionsZero:
if: >-
min(contrail_vrouter_xmpp) by (host) == 0
for: 2m
labels:
severity: warning
service: contrail
annotations:
summary: "No OpenContrail vRouter XMPP sessions"
description: "There are no OpenContrail vRouter XMPP sessions on the {{ $labels.host }} node for 2 minutes."
{%- endraw %}
# ContrailVrouterXMPPSessionsTooHigh: the per-host minimum of
# contrail_vrouter_xmpp is at least
# monitoring.vrouter_xmpp_sessions_too_high_threshold for 2 minutes
# (severity warning).
ContrailVrouterXMPPSessionsTooHigh:
if: >-
{%- set vrouter_xmpp_toohigh_threshold = monitoring.vrouter_xmpp_sessions_too_high_threshold %}
min(contrail_vrouter_xmpp) by (host) >= {{ vrouter_xmpp_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
severity: warning
service: contrail
annotations:
summary: "OpenContrail vRouter XMPP sessions reached the limit of {%- endraw %} {{ vrouter_xmpp_toohigh_threshold }}{%- raw %}"
description: "{{ $value }} OpenContrail vRouter XMPP sessions are open on the {{ $labels.host }} node for 2 minutes."
{%- endraw %}
# ContrailVrouterXMPPSessionsChangesTooHigh: the absolute delta of
# contrail_vrouter_xmpp over 2 minutes is at least
# monitoring.vrouter_xmpp_sessions_variation_threshold (severity warning,
# no "for").
# The left-trimming end-of-raw tag in the summary removes the blank before it,
# so a blank is kept after the tag to separate "of" from the threshold value.
ContrailVrouterXMPPSessionsChangesTooHigh:
if: >-
{%- set vrouter_xmpp_variation_threshold = monitoring.vrouter_xmpp_sessions_variation_threshold %}
abs(delta(contrail_vrouter_xmpp[2m])) >= {{ vrouter_xmpp_variation_threshold }}
{%- raw %}
labels:
severity: warning
service: contrail
annotations:
summary: "OpenContrail vRouter XMPP sessions changes reached the limit of {%- endraw %} {{ vrouter_xmpp_variation_threshold }}{%- raw %}"
description: "The OpenContrail vRouter XMPP sessions on the {{ $labels.host }} node have changed {{ $value }} times."
{%- endraw %}
{%- if version < 4.0 %}
# ContrailVrouterDNSXMPPSessionsZero: the per-host minimum of
# contrail_vrouter_dns_xmpp is 0 for 2 minutes (severity warning).
# Sits inside the enclosing "version < 4.0" conditional.
ContrailVrouterDNSXMPPSessionsZero:
if: >-
min(contrail_vrouter_dns_xmpp) by (host) == 0
for: 2m
{%- raw %}
labels:
severity: warning
service: contrail
annotations:
summary: "No OpenContrail vRouter DNS-XMPP sessions"
description: "There are no OpenContrail vRouter DNS-XMPP sessions on the {{ $labels.host }} node for 2 minutes."
{%- endraw %}
# ContrailVrouterDNSXMPPSessionsTooHigh: the per-host minimum of
# contrail_vrouter_dns_xmpp is at least
# monitoring.vrouter_dns_xmpp_sessions_too_high_threshold for 2 minutes
# (severity warning). Sits inside the enclosing "version < 4.0" conditional.
ContrailVrouterDNSXMPPSessionsTooHigh:
if: >-
{%- set vrouter_dns_xmpp_toohigh_threshold = monitoring.vrouter_dns_xmpp_sessions_too_high_threshold %}
min(contrail_vrouter_dns_xmpp) by (host) >= {{ vrouter_dns_xmpp_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
severity: warning
service: contrail
annotations:
summary: "OpenContrail vRouter DNS-XMPP sessions reached the limit of {%- endraw %} {{ vrouter_dns_xmpp_toohigh_threshold }}{%- raw %}"
description: "{{ $value }} OpenContrail vRouter DNS-XMPP sessions are open on the {{ $labels.host }} node for 2 minutes."
{%- endraw %}
# ContrailVrouterDNSXMPPSessionsChangesTooHigh: the absolute delta of
# contrail_vrouter_dns_xmpp over 2 minutes is at least
# monitoring.vrouter_dns_xmpp_sessions_variation_threshold (severity warning,
# no "for"). Sits inside the enclosing "version < 4.0" conditional.
# The left-trimming end-of-raw tag in the summary removes the blank before it,
# so a blank is kept after the tag to separate "of" from the threshold value.
ContrailVrouterDNSXMPPSessionsChangesTooHigh:
if: >-
{%- set vrouter_dns_xmpp_variation_threshold = monitoring.vrouter_dns_xmpp_sessions_variation_threshold %}
abs(delta(contrail_vrouter_dns_xmpp[2m])) >= {{ vrouter_dns_xmpp_variation_threshold }}
{%- raw %}
labels:
severity: warning
service: contrail
annotations:
summary: "OpenContrail vRouter DNS-XMPP sessions changes reached the limit of {%- endraw %} {{ vrouter_dns_xmpp_variation_threshold }}{%- raw %}"
description: "The OpenContrail vRouter DNS-XMPP sessions on the {{ $labels.host }} node have changed {{ $value }} times."
{%- endraw %}
{%- endif %}
# ContrailVrouterLLSSessionsTooHigh: the per-host minimum of
# contrail_vrouter_lls is at least monitoring.vrouter_lls_too_high_threshold
# for 2 minutes (severity warning).
ContrailVrouterLLSSessionsTooHigh:
if: >-
{%- set vrouter_lls_toohigh_threshold = monitoring.vrouter_lls_too_high_threshold %}
min(contrail_vrouter_lls) by (host) >= {{ vrouter_lls_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
severity: warning
service: contrail
annotations:
summary: "OpenContrail vRouter LLS sessions reached the limit of {%- endraw %} {{ vrouter_lls_toohigh_threshold }}{%- raw %}"
description: "{{ $value }} OpenContrail vRouter LLS sessions are open on the {{ $labels.host }} node for 2 minutes."
{%- endraw %}
# ContrailVrouterLLSSessionsChangesTooHigh: the absolute delta of
# contrail_vrouter_lls over 2 minutes is at least
# monitoring.vrouter_lls_variation_threshold (severity warning, no "for").
ContrailVrouterLLSSessionsChangesTooHigh:
if: >-
{%- set vrouter_lls_variation_threshold = monitoring.vrouter_lls_variation_threshold %}
abs(delta(contrail_vrouter_lls[2m])) >= {{ vrouter_lls_variation_threshold }}
{%- raw %}
labels:
severity: warning
service: contrail
annotations:
summary: "OpenContrail vRouter LLS sessions changes reached the limit of {%- endraw %} {{ vrouter_lls_variation_threshold }}{%- raw %}"
description: "The OpenContrail vRouter LLS sessions on the {{ $labels.host }} node have changed {{ $value }} times."
{%- endraw %}
# ContrailFlowsActiveTooHigh: the derivative of contrail_vrouter_flows_active
# over 5 minutes is at least monitoring.vrouter_flows_active_too_high_threshold
# for 2 minutes (severity warning).
ContrailFlowsActiveTooHigh:
if: >-
{%- set vrouter_flows_active_toohigh_threshold = monitoring.vrouter_flows_active_too_high_threshold %}
deriv(contrail_vrouter_flows_active[5m]) >= {{ vrouter_flows_active_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
severity: warning
service: contrail
annotations:
summary: "OpenContrail vRouter active flows reached the limit of {%- endraw %} {{ vrouter_flows_active_toohigh_threshold }}{%- raw %}"
description: "{{ $value }} OpenContrail vRouter flows per second on the {{ $labels.host }} node are active for 2 minutes."
{%- endraw %}
# ContrailFlowsDiscardedTooHigh: the 5-minute rate of
# contrail_vrouter_flows_discard is at least
# monitoring.vrouter_flows_discard_too_high_threshold for 2 minutes
# (severity warning).
ContrailFlowsDiscardedTooHigh:
if: >-
{%- set vrouter_flows_discard_toohigh_threshold = monitoring.vrouter_flows_discard_too_high_threshold %}
rate(contrail_vrouter_flows_discard[5m]) >= {{ vrouter_flows_discard_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
severity: warning
service: contrail
annotations:
summary: "OpenContrail vRouter discarded flows reached the limit of {%- endraw %} {{ vrouter_flows_discard_toohigh_threshold }}{%- raw %}/s"
description: "The average per-second rate of discarded OpenContrail vRouter flows on the {{ $labels.host }} node is {{ $value }} for 2 minutes."
{%- endraw %}
# ContrailFlowsDroppedTooHigh: the 5-minute rate of
# contrail_vrouter_flows_flow_action_drop is at least
# monitoring.vrouter_flows_flow_action_drop_too_high_threshold for 2 minutes
# (severity warning). This alert carries "enabled: false".
# The threshold variable set here is also read by the summary of
# ContrailFlowsFragErrTooHigh further down.
ContrailFlowsDroppedTooHigh:
enabled: false
if: >-
{%- set vrouter_flows_flow_action_drop_toohigh_threshold = monitoring.vrouter_flows_flow_action_drop_too_high_threshold %}
rate(contrail_vrouter_flows_flow_action_drop[5m]) >= {{ vrouter_flows_flow_action_drop_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
severity: warning
service: contrail
annotations:
summary: "OpenContrail vRouter dropped flows reached the limit of {%- endraw %} {{ vrouter_flows_flow_action_drop_toohigh_threshold }}{%- raw %}/s"
description: "The average per-second rate of dropped OpenContrail vRouter flows on the {{ $labels.host }} node is {{ $value }} for 2 minutes."
{%- endraw %}
# ContrailFlowsFragErrTooHigh: the per-host minimum of
# contrail_vrouter_flows_frag_err is at least
# monitoring.vrouter_flows_frag_err_too_high_threshold for 2 minutes
# (severity warning).
# The summary prints the same frag_err threshold that the expression compares
# against, so the reported limit matches the condition that fired.
ContrailFlowsFragErrTooHigh:
if: >-
{%- set vrouter_flows_frag_err_toohigh_threshold = monitoring.vrouter_flows_frag_err_too_high_threshold %}
min(contrail_vrouter_flows_frag_err) by (host) >= {{ vrouter_flows_frag_err_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
severity: warning
service: contrail
annotations:
summary: "OpenContrail vRouter flows with fragment errors reached the limit of {%- endraw %} {{ vrouter_flows_frag_err_toohigh_threshold }}{%- raw %}"
description: "{{ $value }} OpenContrail vRouter flows on the {{ $labels.host }} node had fragment errors for 2 minutes."
{%- endraw %}
# ContrailFlowsNextHopInvalidTooHigh: the 5-minute rate of
# contrail_vrouter_flows_invalid_nh is at least
# monitoring.vrouter_flows_invalid_nh_too_high_threshold for 2 minutes
# (severity warning).
ContrailFlowsNextHopInvalidTooHigh:
if: >-
{%- set vrouter_flows_invalid_nh_toohigh_threshold = monitoring.vrouter_flows_invalid_nh_too_high_threshold %}
rate(contrail_vrouter_flows_invalid_nh[5m]) >= {{ vrouter_flows_invalid_nh_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
severity: warning
service: contrail
annotations:
summary: "OpenContrail vRouter flows with an invalid next hop reached the limit of {%- endraw %} {{ vrouter_flows_invalid_nh_toohigh_threshold }}{%- raw %}/s"
description: "The average per-second rate of OpenContrail vRouter flows with an invalid next hop on the {{ $labels.host }} node is {{ $value }} for 2 minutes."
{%- endraw %}
# ContrailFlowsInterfaceInvalidTooHigh: the 5-minute rate of
# contrail_vrouter_flows_composite_invalid_interface is at least
# monitoring.vrouter_flows_composite_invalid_interface_too_high_threshold
# for 2 minutes (severity warning).
ContrailFlowsInterfaceInvalidTooHigh:
if: >-
{%- set vrouter_flows_composite_invalid_interface_toohigh_threshold = monitoring.vrouter_flows_composite_invalid_interface_too_high_threshold %}
rate(contrail_vrouter_flows_composite_invalid_interface[5m]) >= {{ vrouter_flows_composite_invalid_interface_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
severity: warning
service: contrail
annotations:
summary: "OpenContrail vRouter flows with an invalid composite interface reached the limit of {%- endraw %} {{ vrouter_flows_composite_invalid_interface_toohigh_threshold }}{%- raw %}/s"
description: "The average per-second rate of OpenContrail vRouter flows with an invalid composite interface on the {{ $labels.host }} node is {{ $value }} for 2 minutes."
{%- endraw %}
# ContrailFlowsLabelInvalidTooHigh: the per-host minimum of
# contrail_vrouter_flows_invalid_label is at least
# monitoring.vrouter_flows_invalid_label_too_high_threshold for 2 minutes
# (severity warning).
# The description names the invalid-label condition this alert measures,
# in line with its summary and metric.
ContrailFlowsLabelInvalidTooHigh:
if: >-
{%- set vrouter_flows_invalid_label_toohigh_threshold = monitoring.vrouter_flows_invalid_label_too_high_threshold %}
min(contrail_vrouter_flows_invalid_label) by (host) >= {{ vrouter_flows_invalid_label_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
severity: warning
service: contrail
annotations:
summary: "OpenContrail vRouter flows with an invalid label reached the limit of {%- endraw %} {{ vrouter_flows_invalid_label_toohigh_threshold }}{%- raw %}"
description: "{{ $value }} OpenContrail vRouter flows on the {{ $labels.host }} node had an invalid label for 2 minutes."
{%- endraw %}
# ContrailFlowsQueueSizeExceededTooHigh: the 5-minute rate of
# contrail_vrouter_flows_flow_queue_limit_exceeded is at least
# monitoring.vrouter_flows_flow_queue_limit_exceeded_too_high_threshold
# for 2 minutes (severity warning).
ContrailFlowsQueueSizeExceededTooHigh:
if: >-
{%- set vrouter_flows_flow_queue_limit_exceeded_toohigh_threshold = monitoring.vrouter_flows_flow_queue_limit_exceeded_too_high_threshold %}
rate(contrail_vrouter_flows_flow_queue_limit_exceeded[5m]) >= {{ vrouter_flows_flow_queue_limit_exceeded_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
severity: warning
service: contrail
annotations:
summary: "OpenContrail vRouter flows exceeding the queue size reached the limit of {%- endraw %} {{ vrouter_flows_flow_queue_limit_exceeded_toohigh_threshold }}{%- raw %}/s"
description: "The average per-second rate of OpenContrail vRouter flows exceeding the queue size on the {{ $labels.host }} node is {{ $value }} for 2 minutes."
{%- endraw %}
# ContrailFlowsTableFullTooHigh: the per-host minimum of
# contrail_vrouter_flows_flow_table_full is at least
# monitoring.vrouter_flows_flow_table_full_too_high_threshold for 2 minutes
# (severity warning).
ContrailFlowsTableFullTooHigh:
if: >-
{%- set vrouter_flows_flow_table_full_toohigh_threshold = monitoring.vrouter_flows_flow_table_full_too_high_threshold %}
min(contrail_vrouter_flows_flow_table_full) by (host) >= {{ vrouter_flows_flow_table_full_toohigh_threshold }}
{%- raw %}
for: 2m
labels:
severity: warning
service: contrail
annotations:
summary: "OpenContrail vRouter flows with full table reached the limit of {%- endraw %} {{ vrouter_flows_flow_table_full_toohigh_threshold }}{%- raw %}"
description: "{{ $value }} OpenContrail vRouter flows on the {{ $labels.host }} node had a full table for 2 minutes."
{%- endraw %}
{%- if web.get('enabled', False) and web.get('cache', {}).get('engine', '') == 'redis' %}
{%- raw %}
# RedisServiceDown: procstat_running for process "redis-server" is 0
# (severity minor). Rendered only inside the enclosing conditional, which
# requires the web role to be enabled with cache engine "redis".
RedisServiceDown:
if: >-
procstat_running{process_name="redis-server"} == 0
labels:
severity: minor
service: redis
annotations:
summary: "Redis service is down"
description: "The Redis service on the {{ $labels.host }} node is down."
{%- endraw %}
# RedisServiceDownMinor: the count of "redis-server" processes with
# procstat_running == 0 is at least
# monitoring.services_failed_warning_threshold_percent times the total count
# (severity minor). The annotations print the threshold times 100.
RedisServiceDownMinor:
if: >-
count(procstat_running{process_name="redis-server"} == 0) >= count(procstat_running{process_name="redis-server"}) *{{ monitoring.services_failed_warning_threshold_percent }}
{%- raw %}
labels:
severity: minor
service: redis
annotations:
summary: "{%- endraw %}{{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of Redis services are down"
description: "{{ $value }} Redis services are down (at least {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%)."
# RedisServiceDownMajor: the count of "redis-server" processes with
# procstat_running == 0 is at least
# monitoring.services_failed_critical_threshold_percent times the total count
# (severity major). The annotations print the threshold times 100.
RedisServiceDownMajor:
if: >-
count(procstat_running{process_name="redis-server"} == 0) >= count(procstat_running{process_name="redis-server"}) *{{ monitoring.services_failed_critical_threshold_percent }}
{%- raw %}
labels:
severity: major
service: redis
annotations:
summary: "{%- endraw %}{{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of Redis services are down"
description: "{{ $value }} Redis services are down (at least {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %})."
# RedisServiceOutage: the count of "redis-server" processes with
# procstat_running == 0 equals the total count (severity critical).
RedisServiceOutage:
if: >-
count(procstat_running{process_name="redis-server"} == 0) == count(procstat_running{process_name="redis-server"})
labels:
severity: critical
service: redis
annotations:
summary: "Redis service outage"
description: "All Redis services are down."
{%- endraw %}
{%- endif %}
{%- if database.get('enabled', False) %}
{%- raw %}
# CassandraServiceDown: procstat_running for process "cassandra-server" is 0
# (severity minor). Rendered only inside the enclosing conditional, which
# requires the database role to be enabled.
CassandraServiceDown:
if: >-
procstat_running{process_name="cassandra-server"} == 0
labels:
severity: minor
service: cassandra
annotations:
summary: "Cassandra service is down"
description: "The Cassandra service on the {{ $labels.host }} node is down."
{%- endraw %}
# CassandraServiceDownMinor: the count of "cassandra-server" processes with
# procstat_running == 0 is at least
# monitoring.services_failed_warning_threshold_percent times the total count
# (severity minor). The annotations print the threshold times 100.
CassandraServiceDownMinor:
if: >-
count(procstat_running{process_name="cassandra-server"} == 0) >= count(procstat_running{process_name="cassandra-server"}) *{{ monitoring.services_failed_warning_threshold_percent }}
{%- raw %}
labels:
severity: minor
service: cassandra
annotations:
summary: "{%- endraw %}{{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of Cassandra services are down"
description: "{{ $value }} Cassandra services are down (at least {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%)."
# CassandraServiceDownMajor: the count of "cassandra-server" processes with
# procstat_running == 0 is at least
# monitoring.services_failed_critical_threshold_percent times the total count
# (severity major). The annotations print the threshold times 100.
CassandraServiceDownMajor:
if: >-
count(procstat_running{process_name="cassandra-server"} == 0) >= count(procstat_running{process_name="cassandra-server"}) *{{ monitoring.services_failed_critical_threshold_percent }}
{%- raw %}
labels:
severity: major
service: cassandra
annotations:
summary: "{%- endraw %}{{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of Cassandra services are down"
description: "{{ $value }} Cassandra services are down (at least {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %})."
# CassandraServiceOutage: the count of "cassandra-server" processes with
# procstat_running == 0 equals the total count (severity critical).
CassandraServiceOutage:
if: >-
count(procstat_running{process_name="cassandra-server"} == 0) == count(procstat_running{process_name="cassandra-server"})
labels:
severity: critical
service: cassandra
annotations:
summary: "Cassandra service outage"
description: "All Cassandra services are down."
# KafkaServiceDown: procstat_running for process "kafka-server" is 0
# (severity minor).
KafkaServiceDown:
if: >-
procstat_running{process_name="kafka-server"} == 0
labels:
severity: minor
service: kafka
annotations:
summary: "Kafka service is down"
description: "The Kafka service on the {{ $labels.host }} node is down."
{%- endraw %}
# KafkaServiceDownMinor: the count of "kafka-server" processes with
# procstat_running == 0 is at least
# monitoring.services_failed_warning_threshold_percent times the total count
# (severity minor). The annotations print the threshold times 100.
KafkaServiceDownMinor:
if: >-
count(procstat_running{process_name="kafka-server"} == 0) >= count(procstat_running{process_name="kafka-server"}) *{{ monitoring.services_failed_warning_threshold_percent }}
labels:
severity: minor
service: kafka
annotations:
{%- raw %}
summary: "{%- endraw %}{{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of Kafka services are down"
description: "{{ $value }} Kafka services are down (at least {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %})."
{%- endraw %}
# KafkaServiceDownMajor: the count of "kafka-server" processes with
# procstat_running == 0 is at least
# monitoring.services_failed_critical_threshold_percent times the total count
# (severity major). The annotations print the threshold times 100.
# The raw section is closed at the end of this alert, as KafkaServiceDownMinor
# does. Left open, it would run up to the first end-of-raw tag, which sits
# inside the ZookeeperServiceDown description, and the inline raw tag written
# there would be emitted as literal text in that alert.
KafkaServiceDownMajor:
if: >-
count(procstat_running{process_name="kafka-server"} == 0) >= count(procstat_running{process_name="kafka-server"}) *{{ monitoring.services_failed_critical_threshold_percent }}
{%- raw %}
labels:
severity: major
service: kafka
annotations:
summary: "{%- endraw %}{{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of Kafka services are down"
description: "{{ $value }} Kafka services are down (at least {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %})."
{%- endraw %}
# KafkaServiceOutage: the count of "kafka-server" processes with
# procstat_running == 0 equals the total count (severity critical).
KafkaServiceOutage:
if: >-
count(procstat_running{process_name="kafka-server"} == 0) == count(procstat_running{process_name="kafka-server"})
labels:
severity: critical
service: kafka
annotations:
summary: "Kafka service outage"
description: "All Kafka services are down."
# ZookeeperServiceDown: zookeeper_up is 0 for 2 minutes (severity minor).
ZookeeperServiceDown:
if: >-
zookeeper_up == 0
for: 2m
labels:
severity: minor
service: zookeeper
annotations:
summary: "Zookeeper service is down"
description: "The Zookeeper service on the {% raw %}{{ $labels.host }}{% endraw %} node is down for 2 minutes."
# ZookeeperServiceErrorWarning: zookeeper_service_health is 0 for 2 minutes
# (severity warning).
ZookeeperServiceErrorWarning:
if: >-
zookeeper_service_health == 0
for: 2m
labels:
severity: warning
service: zookeeper
annotations:
summary: "Zookeeper service error"
description: "The Zookeeper service on the {% raw %}{{ $labels.host }}{% endraw %} node is not responding for 2 minutes."
# ZookeeperServicesDownMinor: the count of zookeeper_up series equal to 0 is at
# least monitoring.services_failed_warning_threshold_percent times the total
# count, for 2 minutes (severity minor). The annotations print the threshold
# times 100.
ZookeeperServicesDownMinor:
if: >-
count(zookeeper_up == 0) >= count(zookeeper_up) * {{ monitoring.services_failed_warning_threshold_percent }}
for: 2m
labels:
severity: minor
service: zookeeper
annotations:
summary: "{{ monitoring.services_failed_warning_threshold_percent*100 }}% of Zookeeper services are down"
description: "{% raw %}{{ $value }} Zookeeper services (>= {% endraw %} {{ monitoring.services_failed_warning_threshold_percent*100 }}%) are down for 2 minutes."
# ZookeeperServicesDownMajor: the count of zookeeper_up series equal to 0 is at
# least monitoring.services_failed_critical_threshold_percent times the total
# count, for 2 minutes (severity major). The annotations print the threshold
# times 100.
ZookeeperServicesDownMajor:
if: >-
count(zookeeper_up == 0) >= count(zookeeper_up) * {{ monitoring.services_failed_critical_threshold_percent }}
for: 2m
labels:
severity: major
service: zookeeper
annotations:
summary: "{{ monitoring.services_failed_critical_threshold_percent*100 }}% of Zookeeper services are down"
description: "{% raw %}{{ $value }} Zookeeper services (>= {% endraw %} {{ monitoring.services_failed_critical_threshold_percent*100 }}%) are down for 2 minutes."
# ZookeeperServiceOutage: the count of zookeeper_up series equal to 0 equals
# the total count, for 2 minutes (severity critical).
ZookeeperServiceOutage:
if: >-
count(zookeeper_up == 0) == count(zookeeper_up)
for: 2m
labels:
severity: critical
service: zookeeper
annotations:
summary: "Zookeeper service outage"
description: "All Zookeeper services are down for 2 minutes."
{%- endif %}
{#- When exporters is defined, pull in prometheus/_exporters_config.sls; that file is not visible here. #}
{%- if exporters is defined %}
{%- include "prometheus/_exporters_config.sls" %}
{%- endif %}
{%- endif %}