| {%- if pillar.opencontrail is defined %} |
| {%- from "opencontrail/map.jinja" import control, collector, compute, config, database, web, monitoring, version with context %} |
| |
| {#- Publish a JMX exporter definition for Cassandra. Rendered only when the database role is enabled, its cassandra flag is set, and an exporters variable exists in the render context (it is not defined in this file). #} |
| {%- if database.get('enabled', False) and |
| database.get('cassandra', False) and |
| exporters is defined %} |
| {#- Package list and config template fall back to the defaults below unless exporters.jmx overrides them. #} |
| {%- set packages = exporters.get('jmx', {}).get('packages', ('jmx-exporter', )) %} |
| {%- set template = exporters.get('jmx', {}).get('template', 'opencontrail/files/jmx-exporter-default') %} |
| {#- The YAML between load_yaml and endload is parsed into new_exporters_cfg instead of being written straight to the output. #} |
| {%- load_yaml as new_exporters_cfg %} |
| exporters: |
| jmx: |
| enabled: true |
| {#- packages and template are always assigned by the set statements above, so both is-defined guards always pass. #} |
| {%- if packages is defined %} |
| packages: |
| {%- for pkg in packages %} |
| - {{ pkg }} |
| {%- endfor %} |
| {%- endif %} |
| {%- if template is defined %} |
| template: {{ template }} |
| {%- endif %} |
| services: |
| cassandra: |
| enabled: true |
| template: opencontrail/files/jmx_exporter_cassandra.conf |
| jmx_bind: |
| address: localhost |
| {#- Port selection: 7199 for versions older than 4.0; otherwise 7198 for control or cluster databases and 7199 for analytics. NOTE(review): for 4.0 and later with any other database type no port key is rendered; confirm this is intended. #} |
| {%- if version < 4.0 %} |
| port: 7199 |
| {%- else %} |
| {%- if database.get('type') == 'control' or database.get('type') == 'cluster' %} |
| port: 7198 |
| {%- elif database.get('type') == 'analytics' %} |
| port: 7199 |
| {%- endif %} |
| {%- endif %} |
| {#- Presumably the address and port the exporter itself listens on; confirm against the consumer of this structure. #} |
| bind: |
| address: 0.0.0.0 |
| port: 9111 |
| {%- endload %} |
| {#- Write the loaded structure into the rendered output through the yaml filter. NOTE(review): the False argument presumably disables flow style; confirm. #} |
| {{ new_exporters_cfg|yaml(False) }} |
| {%- endif %} |
| server: |
| alert: |
| # ContrailApiDown: minor alert when a contrail* http_response_status series equals 0 for 2 minutes. |
| ContrailApiDown: |
| if: >- |
| http_response_status{name=~"contrail.*"} == 0 |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: minor |
| service: contrail |
| annotations: |
| summary: "{{ $labels.name }} API endpoint is not accessible" |
| description: "The {{ $labels.name }} API endpoint on the {{ $labels.host }} node is not accessible for 2 minutes." |
| {%- endraw %} |
| # ContrailApiDownMinor: minor alert when, per name, the number of contrail* checks at 0 reaches services_failed_warning_threshold_percent times the number of checks, for 2 minutes. |
| ContrailApiDownMinor: |
| if: >- |
| count(http_response_status{name=~"contrail.*"} == 0) by (name) >= count(http_response_status{name=~"contrail.*"}) by (name) *{{ monitoring.services_failed_warning_threshold_percent }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: minor |
| service: contrail |
| annotations: |
| summary: "{%- endraw %}{{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.name }} API endpoints are not accessible" |
| description: "{{ $value }} {{ $labels.name }} API endpoints (>= {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %}) are not accessible for 2 minutes." |
| {%- endraw %} |
| # ContrailApiDownMajor: major alert when, per name, the number of contrail* checks at 0 reaches services_failed_critical_threshold_percent times the number of checks, for 2 minutes. |
| # The summary starts directly with the percentage (no blank after the endraw tag) so the rendered text has no leading space. |
| ContrailApiDownMajor: |
| if: >- |
| count(http_response_status{name=~"contrail.*"} == 0) by (name) >= count(http_response_status{name=~"contrail.*"}) by (name) *{{ monitoring.services_failed_critical_threshold_percent }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: major |
| service: contrail |
| annotations: |
| summary: "{%- endraw %}{{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.name }} API endpoints are not accessible" |
| description: "{{ $value }} {{ $labels.name }} API endpoints (>= {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %}) are not accessible for 2 minutes." |
| {%- endraw %} |
| # ContrailApiOutage: critical alert when, per name, every contrail* http_response_status series equals 0 for 2 minutes. |
| ContrailApiOutage: |
| if: >- |
| count(http_response_status{name=~"contrail.*"} == 0) by (name) == count(http_response_status{name=~"contrail.*"}) by (name) |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: critical |
| service: contrail |
| annotations: |
| summary: "{{ $labels.name }} API outage" |
| description: "The {{ $labels.name }} API is not accessible for all available endpoints for 2 minutes." |
| {%- endraw %} |
| # ContrailProcessDown: minor alert when procstat_running equals 0 for a contrail* process; no hold period is configured. |
| ContrailProcessDown: |
| if: >- |
| procstat_running{process_name=~"contrail.*"} == 0 |
| labels: |
| severity: minor |
| service: contrail |
| annotations: |
| {%- raw %} |
| summary: "{{ $labels.process_name }} process is down" |
| description: "The {{ $labels.process_name }} process on the {{ $labels.host }} node is down." |
| {%- endraw %} |
| # ContrailProcessDownMinor: minor alert when, per process_name, the number of contrail* processes at 0 reaches services_failed_warning_threshold_percent times the number of processes. |
| ContrailProcessDownMinor: |
| if: >- |
| count(procstat_running{process_name=~"contrail.*"} == 0) by (process_name) >= {{ monitoring.services_failed_warning_threshold_percent }}*count(procstat_running{process_name=~"contrail.*"}) by (process_name) |
| labels: |
| severity: minor |
| service: contrail |
| annotations: |
| {%- raw %} |
| summary: "{%- endraw %}{{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of {{ $labels.process_name }} processes are down" |
| description: "{{ $value }} {{ $labels.process_name }} processes (>= {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %}) are down." |
| {%- endraw %} |
| # ContrailProcessDownMajor: major alert when, per process_name, the number of contrail* processes at 0 reaches services_failed_critical_threshold_percent times the number of processes. |
| ContrailProcessDownMajor: |
| if: >- |
| count(procstat_running{process_name=~"contrail.*"} == 0) by (process_name) >= {{ monitoring.services_failed_critical_threshold_percent }}*count(procstat_running{process_name=~"contrail.*"}) by (process_name) |
| labels: |
| severity: major |
| service: contrail |
| annotations: |
| {%- raw %} |
| summary: "{%- endraw %}{{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of {{ $labels.process_name }} processes are down" |
| description: "{{ $value }} {{ $labels.process_name }} processes (>= {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %}) are down." |
| {%- endraw %} |
| # ContrailProcessOutage: critical alert when, per process_name, every contrail* procstat_running series equals 0. |
| # The expression aggregates by process_name, so that is the only label the annotations can reference. |
| # The raw section opened under annotations is not closed in this block; the alerts that follow rely on it staying open until the next endraw tag. |
| ContrailProcessOutage: |
| if: >- |
| count(procstat_running{process_name=~"contrail.*"} == 0) by (process_name) == count(procstat_running{process_name=~"contrail.*"}) by (process_name) |
| labels: |
| severity: critical |
| service: contrail |
| annotations: |
| {%- raw %} |
| summary: "{{ $labels.process_name }} service outage" |
| description: "All {{ $labels.process_name }} processes are down." |
| # ContrailBGPSessionsNoEstablished: warning when the per-host maximum of contrail_bgp_session_count equals 0 for 2 minutes. |
| ContrailBGPSessionsNoEstablished: |
| if: >- |
| max(contrail_bgp_session_count) by (host) == 0 |
| for: 2m |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "No established OpenContrail BGP sessions" |
| description: "There are no established OpenContrail BGP sessions on the {{ $labels.host }} node for 2 minutes." |
| # ContrailBGPSessionsNoActive: warning when the per-host maximum of contrail_bgp_session_up_count equals 0 for 2 minutes. |
| ContrailBGPSessionsNoActive: |
| if: >- |
| max(contrail_bgp_session_up_count) by (host) == 0 |
| for: 2m |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "No active OpenContrail BGP sessions" |
| description: "There are no active OpenContrail BGP sessions on the {{ $labels.host }} node for 2 minutes." |
| # ContrailBGPSessionsDown: warning when the per-host minimum of contrail_bgp_session_down_count is above 0 for 2 minutes. |
| ContrailBGPSessionsDown: |
| if: >- |
| min(contrail_bgp_session_down_count) by (host) > 0 |
| for: 2m |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "OpenContrail BGP sessions are down" |
| description: "{{ $value }} OpenContrail BGP sessions on the {{ $labels.host }} node are down for 2 minutes." |
| # ContrailXMPPSessionsMissingEstablished: warning when twice the number of contrail_vrouter_xmpp series exceeds the sum of contrail_xmpp_session_up_count for 2 minutes. |
| ContrailXMPPSessionsMissingEstablished: |
| if: >- |
| count(contrail_vrouter_xmpp) * 2 - sum(contrail_xmpp_session_up_count) > 0 |
| for: 2m |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "Missing established OpenContrail XMPP sessions" |
| description: "{{ $value }} established OpenContrail XMPP sessions are missing on the compute cluster for 2 minutes." |
| # ContrailXMPPSessionsMissing: warning when twice the number of contrail_vrouter_xmpp series exceeds the sum of contrail_xmpp_session_count for 2 minutes. |
| ContrailXMPPSessionsMissing: |
| if: >- |
| count(contrail_vrouter_xmpp) * 2 - sum(contrail_xmpp_session_count) > 0 |
| for: 2m |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "Missing OpenContrail XMPP sessions" |
| description: "{{ $value }} OpenContrail XMPP sessions are missing on the compute cluster for 2 minutes." |
| # ContrailXMPPSessionsDown: warning when the per-host minimum of contrail_xmpp_session_down_count is above 0 for 2 minutes. |
| ContrailXMPPSessionsDown: |
| if: >- |
| min(contrail_xmpp_session_down_count) by (host) > 0 |
| for: 2m |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "OpenContrail XMPP sessions are down" |
| description: "{{ $value }} OpenContrail XMPP sessions on the {{ $labels.host }} node are down for 2 minutes." |
| # ContrailXMPPSessionsTooHigh: warning when the per-host minimum of contrail_xmpp_session_count reaches monitoring.xmpp_sessions_too_high_threshold for 2 minutes. |
| ContrailXMPPSessionsTooHigh: |
| {%- endraw %} |
| if: >- |
| {%- set xmpp_toohigh_threshold = monitoring.xmpp_sessions_too_high_threshold %} |
| min(contrail_xmpp_session_count) by (host) >= {{ xmpp_toohigh_threshold }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "OpenContrail XMPP sessions reached the limit of {%- endraw %} {{ xmpp_toohigh_threshold }}{%- raw %}" |
| description: "{{ $value }} OpenContrail XMPP sessions on the {{ $labels.host }} node are open for 2 minutes." |
| {%- endraw %} |
| # ContrailXMPPSessionsChangesTooHigh: warning when the absolute 2-minute delta of contrail_xmpp_session_count reaches monitoring.xmpp_sessions_variation_threshold. |
| # The endraw tag in the summary strips the blank in front of it, so a space is kept after the tag to separate the threshold from the preceding word. |
| # The raw section re-opened in the summary is not closed in this block; the next alert closes it. |
| ContrailXMPPSessionsChangesTooHigh: |
| if: >- |
| {%- set xmpp_variation_threshold = monitoring.xmpp_sessions_variation_threshold %} |
| abs(delta(contrail_xmpp_session_count[2m])) >= {{ xmpp_variation_threshold }} |
| {%- raw %} |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "OpenContrail XMPP sessions changes reached the limit of {%- endraw %} {{ xmpp_variation_threshold }}{%- raw %}" |
| description: "The OpenContrail XMPP sessions on the {{ $labels.host }} node have changed {{ $value }} times." |
| # ContrailVrouterXMPPSessionsZero: warning when the per-host minimum of contrail_vrouter_xmpp equals 0 for 2 minutes. |
| ContrailVrouterXMPPSessionsZero: |
| if: >- |
| min(contrail_vrouter_xmpp) by (host) == 0 |
| for: 2m |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "No OpenContrail vRouter XMPP sessions" |
| description: "There are no OpenContrail vRouter XMPP sessions on the {{ $labels.host }} node for 2 minutes." |
| {%- endraw %} |
| # ContrailVrouterXMPPSessionsTooHigh: warning when the per-host minimum of contrail_vrouter_xmpp reaches monitoring.vrouter_xmpp_sessions_too_high_threshold for 2 minutes. |
| ContrailVrouterXMPPSessionsTooHigh: |
| if: >- |
| {%- set vrouter_xmpp_toohigh_threshold = monitoring.vrouter_xmpp_sessions_too_high_threshold %} |
| min(contrail_vrouter_xmpp) by (host) >= {{ vrouter_xmpp_toohigh_threshold }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "OpenContrail vRouter XMPP sessions reached the limit of {%- endraw %} {{ vrouter_xmpp_toohigh_threshold }}{%- raw %}" |
| description: "{{ $value }} OpenContrail vRouter XMPP sessions are open on the {{ $labels.host }} node for 2 minutes." |
| {%- endraw %} |
| # ContrailVrouterXMPPSessionsChangesTooHigh: warning when the absolute 2-minute delta of contrail_vrouter_xmpp reaches monitoring.vrouter_xmpp_sessions_variation_threshold. |
| # The endraw tag in the summary strips the blank in front of it, so a space is kept after the tag to separate the threshold from the preceding word. |
| ContrailVrouterXMPPSessionsChangesTooHigh: |
| if: >- |
| {%- set vrouter_xmpp_variation_threshold = monitoring.vrouter_xmpp_sessions_variation_threshold %} |
| abs(delta(contrail_vrouter_xmpp[2m])) >= {{ vrouter_xmpp_variation_threshold }} |
| {%- raw %} |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "OpenContrail vRouter XMPP sessions changes reached the limit of {%- endraw %} {{ vrouter_xmpp_variation_threshold }}{%- raw %}" |
| description: "The OpenContrail vRouter XMPP sessions on the {{ $labels.host }} node have changed {{ $value }} times." |
| {%- endraw %} |
| {%- if version < 4.0 %} |
| # The DNS-XMPP alerts from here to the matching endif are only rendered for versions older than 4.0. |
| # ContrailVrouterDNSXMPPSessionsZero: warning when the per-host minimum of contrail_vrouter_dns_xmpp equals 0 for 2 minutes. |
| ContrailVrouterDNSXMPPSessionsZero: |
| if: >- |
| min(contrail_vrouter_dns_xmpp) by (host) == 0 |
| for: 2m |
| {%- raw %} |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "No OpenContrail vRouter DNS-XMPP sessions" |
| description: "There are no OpenContrail vRouter DNS-XMPP sessions on the {{ $labels.host }} node for 2 minutes." |
| {%- endraw %} |
| # ContrailVrouterDNSXMPPSessionsTooHigh: warning when the per-host minimum of contrail_vrouter_dns_xmpp reaches monitoring.vrouter_dns_xmpp_sessions_too_high_threshold for 2 minutes. |
| ContrailVrouterDNSXMPPSessionsTooHigh: |
| if: >- |
| {%- set vrouter_dns_xmpp_toohigh_threshold = monitoring.vrouter_dns_xmpp_sessions_too_high_threshold %} |
| min(contrail_vrouter_dns_xmpp) by (host) >= {{ vrouter_dns_xmpp_toohigh_threshold }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "OpenContrail vRouter DNS-XMPP sessions reached the limit of {%- endraw %} {{ vrouter_dns_xmpp_toohigh_threshold }}{%- raw %}" |
| description: "{{ $value }} OpenContrail vRouter DNS-XMPP sessions are open on the {{ $labels.host }} node for 2 minutes." |
| {%- endraw %} |
| # ContrailVrouterDNSXMPPSessionsChangesTooHigh: warning when the absolute 2-minute delta of contrail_vrouter_dns_xmpp reaches monitoring.vrouter_dns_xmpp_sessions_variation_threshold. |
| # The endraw tag in the summary strips the blank in front of it, so a space is kept after the tag to separate the threshold from the preceding word. |
| ContrailVrouterDNSXMPPSessionsChangesTooHigh: |
| if: >- |
| {%- set vrouter_dns_xmpp_variation_threshold = monitoring.vrouter_dns_xmpp_sessions_variation_threshold %} |
| abs(delta(contrail_vrouter_dns_xmpp[2m])) >= {{ vrouter_dns_xmpp_variation_threshold }} |
| {%- raw %} |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "OpenContrail vRouter DNS-XMPP sessions changes reached the limit of {%- endraw %} {{ vrouter_dns_xmpp_variation_threshold }}{%- raw %}" |
| description: "The OpenContrail vRouter DNS-XMPP sessions on the {{ $labels.host }} node have changed {{ $value }} times." |
| {%- endraw %} |
| {%- endif %} |
| # ContrailVrouterLLSSessionsTooHigh: warning when the per-host minimum of contrail_vrouter_lls reaches monitoring.vrouter_lls_too_high_threshold for 2 minutes. |
| ContrailVrouterLLSSessionsTooHigh: |
| if: >- |
| {%- set vrouter_lls_toohigh_threshold = monitoring.vrouter_lls_too_high_threshold %} |
| min(contrail_vrouter_lls) by (host) >= {{ vrouter_lls_toohigh_threshold }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "OpenContrail vRouter LLS sessions reached the limit of {%- endraw %} {{ vrouter_lls_toohigh_threshold }}{%- raw %}" |
| description: "{{ $value }} OpenContrail vRouter LLS sessions are open on the {{ $labels.host }} node for 2 minutes." |
| {%- endraw %} |
| # ContrailVrouterLLSSessionsChangesTooHigh: warning when the absolute 2-minute delta of contrail_vrouter_lls reaches monitoring.vrouter_lls_variation_threshold. |
| ContrailVrouterLLSSessionsChangesTooHigh: |
| if: >- |
| {%- set vrouter_lls_variation_threshold = monitoring.vrouter_lls_variation_threshold %} |
| abs(delta(contrail_vrouter_lls[2m])) >= {{ vrouter_lls_variation_threshold }} |
| {%- raw %} |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "OpenContrail vRouter LLS sessions changes reached the limit of {%- endraw %} {{ vrouter_lls_variation_threshold }}{%- raw %}" |
| description: "The OpenContrail vRouter LLS sessions on the {{ $labels.host }} node have changed {{ $value }} times." |
| {%- endraw %} |
| # ContrailFlowsActiveTooHigh: warning when the 5-minute derivative of contrail_vrouter_flows_active reaches monitoring.vrouter_flows_active_too_high_threshold for 2 minutes. |
| ContrailFlowsActiveTooHigh: |
| if: >- |
| {%- set vrouter_flows_active_toohigh_threshold = monitoring.vrouter_flows_active_too_high_threshold %} |
| deriv(contrail_vrouter_flows_active[5m]) >= {{ vrouter_flows_active_toohigh_threshold }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "OpenContrail vRouter active flows reached the limit of {%- endraw %} {{ vrouter_flows_active_toohigh_threshold }}{%- raw %}" |
| description: "{{ $value }} OpenContrail vRouter flows per second on the {{ $labels.host }} node are active for 2 minutes." |
| {%- endraw %} |
| # ContrailFlowsDiscardedTooHigh: warning when the 5-minute rate of contrail_vrouter_flows_discard reaches monitoring.vrouter_flows_discard_too_high_threshold for 2 minutes. |
| ContrailFlowsDiscardedTooHigh: |
| if: >- |
| {%- set vrouter_flows_discard_toohigh_threshold = monitoring.vrouter_flows_discard_too_high_threshold %} |
| rate(contrail_vrouter_flows_discard[5m]) >= {{ vrouter_flows_discard_toohigh_threshold }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "OpenContrail vRouter discarded flows reached the limit of {%- endraw %} {{ vrouter_flows_discard_toohigh_threshold }}{%- raw %}/s" |
| description: "The average per-second rate of discarded OpenContrail vRouter flows on the {{ $labels.host }} node is {{ $value }} for 2 minutes." |
| {%- endraw %} |
| # ContrailFlowsDroppedTooHigh: warning when the 5-minute rate of contrail_vrouter_flows_flow_action_drop reaches monitoring.vrouter_flows_flow_action_drop_too_high_threshold for 2 minutes. |
| # This definition carries enabled: false. |
| ContrailFlowsDroppedTooHigh: |
| enabled: false |
| if: >- |
| {%- set vrouter_flows_flow_action_drop_toohigh_threshold = monitoring.vrouter_flows_flow_action_drop_too_high_threshold %} |
| rate(contrail_vrouter_flows_flow_action_drop[5m]) >= {{ vrouter_flows_flow_action_drop_toohigh_threshold }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "OpenContrail vRouter dropped flows reached the limit of {%- endraw %} {{ vrouter_flows_flow_action_drop_toohigh_threshold }}{%- raw %}/s" |
| description: "The average per-second rate of dropped OpenContrail vRouter flows on the {{ $labels.host }} node is {{ $value }} for 2 minutes." |
| {%- endraw %} |
| # ContrailFlowsFragErrTooHigh: warning when the per-host minimum of contrail_vrouter_flows_frag_err reaches monitoring.vrouter_flows_frag_err_too_high_threshold for 2 minutes. |
| # The summary reports the same frag_err threshold that the expression compares against. |
| ContrailFlowsFragErrTooHigh: |
| if: >- |
| {%- set vrouter_flows_frag_err_toohigh_threshold = monitoring.vrouter_flows_frag_err_too_high_threshold %} |
| min(contrail_vrouter_flows_frag_err) by (host) >= {{ vrouter_flows_frag_err_toohigh_threshold }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "OpenContrail vRouter flows with fragment errors reached the limit of {%- endraw %} {{ vrouter_flows_frag_err_toohigh_threshold }}{%- raw %}" |
| description: "{{ $value }} OpenContrail vRouter flows on the {{ $labels.host }} node had fragment errors for 2 minutes." |
| {%- endraw %} |
| # ContrailFlowsNextHopInvalidTooHigh: warning when the 5-minute rate of contrail_vrouter_flows_invalid_nh reaches monitoring.vrouter_flows_invalid_nh_too_high_threshold for 2 minutes. |
| ContrailFlowsNextHopInvalidTooHigh: |
| if: >- |
| {%- set vrouter_flows_invalid_nh_toohigh_threshold = monitoring.vrouter_flows_invalid_nh_too_high_threshold %} |
| rate(contrail_vrouter_flows_invalid_nh[5m]) >= {{ vrouter_flows_invalid_nh_toohigh_threshold }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "OpenContrail vRouter flows with an invalid next hop reached the limit of {%- endraw %} {{ vrouter_flows_invalid_nh_toohigh_threshold }}{%- raw %}/s" |
| description: "The average per-second rate of OpenContrail vRouter flows with an invalid next hop on the {{ $labels.host }} node is {{ $value }} for 2 minutes." |
| {%- endraw %} |
| # ContrailFlowsInterfaceInvalidTooHigh: warning when the 5-minute rate of contrail_vrouter_flows_composite_invalid_interface reaches monitoring.vrouter_flows_composite_invalid_interface_too_high_threshold for 2 minutes. |
| ContrailFlowsInterfaceInvalidTooHigh: |
| if: >- |
| {%- set vrouter_flows_composite_invalid_interface_toohigh_threshold = monitoring.vrouter_flows_composite_invalid_interface_too_high_threshold %} |
| rate(contrail_vrouter_flows_composite_invalid_interface[5m]) >= {{ vrouter_flows_composite_invalid_interface_toohigh_threshold }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "OpenContrail vRouter flows with an invalid composite interface reached the limit of {%- endraw %} {{ vrouter_flows_composite_invalid_interface_toohigh_threshold }}{%- raw %}/s" |
| description: "The average per-second rate of OpenContrail vRouter flows with an invalid composite interface on the {{ $labels.host }} node is {{ $value }} for 2 minutes." |
| {%- endraw %} |
| # ContrailFlowsLabelInvalidTooHigh: warning when the per-host minimum of contrail_vrouter_flows_invalid_label reaches monitoring.vrouter_flows_invalid_label_too_high_threshold for 2 minutes. |
| # The description names the invalid-label condition that the expression and summary refer to. |
| ContrailFlowsLabelInvalidTooHigh: |
| if: >- |
| {%- set vrouter_flows_invalid_label_toohigh_threshold = monitoring.vrouter_flows_invalid_label_too_high_threshold %} |
| min(contrail_vrouter_flows_invalid_label) by (host) >= {{ vrouter_flows_invalid_label_toohigh_threshold }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "OpenContrail vRouter flows with an invalid label reached the limit of {%- endraw %} {{ vrouter_flows_invalid_label_toohigh_threshold }}{%- raw %}" |
| description: "{{ $value }} OpenContrail vRouter flows on the {{ $labels.host }} node had an invalid label for 2 minutes." |
| {%- endraw %} |
| # ContrailFlowsQueueSizeExceededTooHigh: warning when the 5-minute rate of contrail_vrouter_flows_flow_queue_limit_exceeded reaches monitoring.vrouter_flows_flow_queue_limit_exceeded_too_high_threshold for 2 minutes. |
| ContrailFlowsQueueSizeExceededTooHigh: |
| if: >- |
| {%- set vrouter_flows_flow_queue_limit_exceeded_toohigh_threshold = monitoring.vrouter_flows_flow_queue_limit_exceeded_too_high_threshold %} |
| rate(contrail_vrouter_flows_flow_queue_limit_exceeded[5m]) >= {{ vrouter_flows_flow_queue_limit_exceeded_toohigh_threshold }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "OpenContrail vRouter flows exceeding the queue size reached the limit of {%- endraw %} {{ vrouter_flows_flow_queue_limit_exceeded_toohigh_threshold }}{%- raw %}/s" |
| description: "The average per-second rate of OpenContrail vRouter flows exceeding the queue size on the {{ $labels.host }} node is {{ $value }} for 2 minutes." |
| {%- endraw %} |
| # ContrailFlowsTableFullTooHigh: warning when the per-host minimum of contrail_vrouter_flows_flow_table_full reaches monitoring.vrouter_flows_flow_table_full_too_high_threshold for 2 minutes. |
| ContrailFlowsTableFullTooHigh: |
| if: >- |
| {%- set vrouter_flows_flow_table_full_toohigh_threshold = monitoring.vrouter_flows_flow_table_full_too_high_threshold %} |
| min(contrail_vrouter_flows_flow_table_full) by (host) >= {{ vrouter_flows_flow_table_full_toohigh_threshold }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: warning |
| service: contrail |
| annotations: |
| summary: "OpenContrail vRouter flows with full table reached the limit of {%- endraw %} {{ vrouter_flows_flow_table_full_toohigh_threshold }}{%- raw %}" |
| description: "{{ $value }} OpenContrail vRouter flows on the {{ $labels.host }} node had a full table for 2 minutes." |
| {%- endraw %} |
| {%- if web.get('enabled', False) and web.get('cache', {}).get('engine', '') == 'redis' %} |
| {%- raw %} |
| # The Redis alerts from here to the matching endif are only rendered when the web role is enabled and its cache engine is redis. |
| # RedisServiceDown: minor alert when procstat_running equals 0 for the redis-server process; no hold period is configured. |
| RedisServiceDown: |
| if: >- |
| procstat_running{process_name="redis-server"} == 0 |
| labels: |
| severity: minor |
| service: redis |
| annotations: |
| summary: "Redis service is down" |
| description: "The Redis service on the {{ $labels.host }} node is down." |
| {%- endraw %} |
| # RedisServiceDownMinor: minor alert when the number of redis-server processes at 0 reaches services_failed_warning_threshold_percent times the number of processes. |
| RedisServiceDownMinor: |
| if: >- |
| count(procstat_running{process_name="redis-server"} == 0) >= count(procstat_running{process_name="redis-server"}) *{{ monitoring.services_failed_warning_threshold_percent }} |
| {%- raw %} |
| labels: |
| severity: minor |
| service: redis |
| annotations: |
| summary: "{%- endraw %}{{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of Redis services are down" |
| description: "{{ $value }} Redis services are down (at least {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%)." |
| # RedisServiceDownMajor: major alert when the number of redis-server processes at 0 reaches services_failed_critical_threshold_percent times the number of processes. |
| RedisServiceDownMajor: |
| if: >- |
| count(procstat_running{process_name="redis-server"} == 0) >= count(procstat_running{process_name="redis-server"}) *{{ monitoring.services_failed_critical_threshold_percent }} |
| {%- raw %} |
| labels: |
| severity: major |
| service: redis |
| annotations: |
| summary: "{%- endraw %}{{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of Redis services are down" |
| description: "{{ $value }} Redis services are down (at least {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %})." |
| # RedisServiceOutage: critical alert when every redis-server procstat_running series equals 0. |
| RedisServiceOutage: |
| if: >- |
| count(procstat_running{process_name="redis-server"} == 0) == count(procstat_running{process_name="redis-server"}) |
| labels: |
| severity: critical |
| service: redis |
| annotations: |
| summary: "Redis service outage" |
| description: "All Redis services are down." |
| {%- endraw %} |
| {%- endif %} |
| {%- if database.get('enabled', False) %} |
| {%- raw %} |
| # The Cassandra, Kafka and Zookeeper alerts from here to the matching endif are only rendered when the database role is enabled. |
| # CassandraServiceDown: minor alert when procstat_running equals 0 for the cassandra-server process; no hold period is configured. |
| CassandraServiceDown: |
| if: >- |
| procstat_running{process_name="cassandra-server"} == 0 |
| labels: |
| severity: minor |
| service: cassandra |
| annotations: |
| summary: "Cassandra service is down" |
| description: "The Cassandra service on the {{ $labels.host }} node is down." |
| {%- endraw %} |
| # CassandraServiceDownMinor: minor alert when the number of cassandra-server processes at 0 reaches services_failed_warning_threshold_percent times the number of processes. |
| CassandraServiceDownMinor: |
| if: >- |
| count(procstat_running{process_name="cassandra-server"} == 0) >= count(procstat_running{process_name="cassandra-server"}) *{{ monitoring.services_failed_warning_threshold_percent }} |
| {%- raw %} |
| labels: |
| severity: minor |
| service: cassandra |
| annotations: |
| summary: "{%- endraw %}{{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of Cassandra services are down" |
| description: "{{ $value }} Cassandra services are down (at least {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%)." |
| # CassandraServiceDownMajor: major alert when the number of cassandra-server processes at 0 reaches services_failed_critical_threshold_percent times the number of processes. |
| CassandraServiceDownMajor: |
| if: >- |
| count(procstat_running{process_name="cassandra-server"} == 0) >= count(procstat_running{process_name="cassandra-server"}) *{{ monitoring.services_failed_critical_threshold_percent }} |
| {%- raw %} |
| labels: |
| severity: major |
| service: cassandra |
| annotations: |
| summary: "{%- endraw %}{{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of Cassandra services are down" |
| description: "{{ $value }} Cassandra services are down (at least {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %})." |
| # CassandraServiceOutage: critical alert when every cassandra-server procstat_running series equals 0. |
| CassandraServiceOutage: |
| if: >- |
| count(procstat_running{process_name="cassandra-server"} == 0) == count(procstat_running{process_name="cassandra-server"}) |
| labels: |
| severity: critical |
| service: cassandra |
| annotations: |
| summary: "Cassandra service outage" |
| description: "All Cassandra services are down." |
| # KafkaServiceDown: minor alert when procstat_running equals 0 for the kafka-server process; no hold period is configured. |
| KafkaServiceDown: |
| if: >- |
| procstat_running{process_name="kafka-server"} == 0 |
| labels: |
| severity: minor |
| service: kafka |
| annotations: |
| summary: "Kafka service is down" |
| description: "The Kafka service on the {{ $labels.host }} node is down." |
| {%- endraw %} |
| # KafkaServiceDownMinor: minor alert when the number of kafka-server processes at 0 reaches services_failed_warning_threshold_percent times the number of processes. |
| KafkaServiceDownMinor: |
| if: >- |
| count(procstat_running{process_name="kafka-server"} == 0) >= count(procstat_running{process_name="kafka-server"}) *{{ monitoring.services_failed_warning_threshold_percent }} |
| labels: |
| severity: minor |
| service: kafka |
| annotations: |
| {%- raw %} |
| summary: "{%- endraw %}{{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %} of Kafka services are down" |
| description: "{{ $value }} Kafka services are down (at least {%- endraw %} {{monitoring.services_failed_warning_threshold_percent*100}}%{%- raw %})." |
| {%- endraw %} |
| # KafkaServiceDownMajor: major alert when the number of kafka-server processes at 0 reaches services_failed_critical_threshold_percent times the number of processes. |
| KafkaServiceDownMajor: |
| if: >- |
| count(procstat_running{process_name="kafka-server"} == 0) >= count(procstat_running{process_name="kafka-server"}) *{{ monitoring.services_failed_critical_threshold_percent }} |
| {%- raw %} |
| labels: |
| severity: major |
| service: kafka |
| annotations: |
| summary: "{%- endraw %}{{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %} of Kafka services are down" |
| description: "{{ $value }} Kafka services are down (at least {%- endraw %} {{monitoring.services_failed_critical_threshold_percent*100}}%{%- raw %})." |
| # KafkaServiceOutage: critical alert when every kafka-server procstat_running series equals 0. |
| # The raw section still open from the previous alert is closed at the end of this block, so the inline raw markers used by later alerts are parsed as tags instead of being emitted as literal text. |
| KafkaServiceOutage: |
| if: >- |
| count(procstat_running{process_name="kafka-server"} == 0) == count(procstat_running{process_name="kafka-server"}) |
| labels: |
| severity: critical |
| service: kafka |
| annotations: |
| summary: "Kafka service outage" |
| description: "All Kafka services are down." |
| {%- endraw %} |
| # ZookeeperServiceDown: minor alert when zookeeper_up equals 0 for 2 minutes. |
| ZookeeperServiceDown: |
| if: >- |
| zookeeper_up == 0 |
| for: 2m |
| labels: |
| severity: minor |
| service: zookeeper |
| annotations: |
| summary: "Zookeeper service is down" |
| description: "The Zookeeper service on the {% raw %}{{ $labels.host }}{% endraw %} node is down for 2 minutes." |
| # ZookeeperServiceErrorWarning: warning when zookeeper_service_health equals 0 for 2 minutes. |
| ZookeeperServiceErrorWarning: |
| if: >- |
| zookeeper_service_health == 0 |
| for: 2m |
| labels: |
| severity: warning |
| service: zookeeper |
| annotations: |
| summary: "Zookeeper service error" |
| description: "The Zookeeper service on the {% raw %}{{ $labels.host }}{% endraw %} node is not responding for 2 minutes." |
| # ZookeeperServicesDownMinor: minor alert when the number of zookeeper_up series at 0 reaches services_failed_warning_threshold_percent times the number of series, for 2 minutes. |
| # The raw text in the description already ends with a blank, so the percentage follows the closing tag directly to avoid a doubled space in the rendered text. |
| ZookeeperServicesDownMinor: |
| if: >- |
| count(zookeeper_up == 0) >= count(zookeeper_up) * {{ monitoring.services_failed_warning_threshold_percent }} |
| for: 2m |
| labels: |
| severity: minor |
| service: zookeeper |
| annotations: |
| summary: "{{ monitoring.services_failed_warning_threshold_percent*100 }}% of Zookeeper services are down" |
| description: "{% raw %}{{ $value }} Zookeeper services (>= {% endraw %}{{ monitoring.services_failed_warning_threshold_percent*100 }}%) are down for 2 minutes." |
| # ZookeeperServicesDownMajor: major alert when the number of zookeeper_up series at 0 reaches services_failed_critical_threshold_percent times the number of series, for 2 minutes. |
| # The raw text in the description already ends with a blank, so the percentage follows the closing tag directly to avoid a doubled space in the rendered text. |
| ZookeeperServicesDownMajor: |
| if: >- |
| count(zookeeper_up == 0) >= count(zookeeper_up) * {{ monitoring.services_failed_critical_threshold_percent }} |
| for: 2m |
| labels: |
| severity: major |
| service: zookeeper |
| annotations: |
| summary: "{{ monitoring.services_failed_critical_threshold_percent*100 }}% of Zookeeper services are down" |
| description: "{% raw %}{{ $value }} Zookeeper services (>= {% endraw %}{{ monitoring.services_failed_critical_threshold_percent*100 }}%) are down for 2 minutes." |
| # ZookeeperServiceOutage: critical alert when every zookeeper_up series equals 0 for 2 minutes. |
| ZookeeperServiceOutage: |
| if: >- |
| count(zookeeper_up == 0) == count(zookeeper_up) |
| for: 2m |
| labels: |
| severity: critical |
| service: zookeeper |
| annotations: |
| summary: "Zookeeper service outage" |
| description: "All Zookeeper services are down for 2 minutes." |
| {%- endif %} |
| |
| {#- Pull in the shared exporter configuration template, but only when an exporters variable exists in the render context. #} |
| {%- if exporters is defined %} |
| {%- include "prometheus/_exporters_config.sls" %} |
| {%- endif %} |
| {%- endif %} |