linux/meta/prometheus.yml - salt-formulas/linux - Gitiles

 {%- from "linux/map.jinja" import monitoring, network with context %}
 server:
   alert:
     {%- raw %}
     # I/O wait above 40% sustained for 10 minutes.
     SystemCpuIoWaitWarning:
       if: 'cpu_usage_iowait > 40'
       for: 10m
       labels: {severity: warning, service: system}
       annotations:
         summary: 'CPU waited for I/O 40% of time'
         description: 'The CPU on the {{ $labels.host }} node spent 40% of time waiting for I/O.'
     # I/O wait above 50% sustained for 10 minutes.
     SystemCpuIoWaitCritical:
       if: 'cpu_usage_iowait > 50'
       for: 10m
       labels: {severity: critical, service: system}
       annotations:
         summary: 'CPU waited for I/O 50% of time'
         description: 'The CPU on the {{ $labels.host }} node spent 50% of time waiting for I/O.'
     {%- endraw %}
     {#- Warning and critical CPU steal thresholds, read from the monitoring map and cast to float. #}
     {%- set cpu_steal_warn = monitoring.cpu_steal_percentage.warn|float %}
     {%- set cpu_steal_crit = monitoring.cpu_steal_percentage.crit|float %}
     # Fires when cpu_usage_steal stays above the warning threshold for 5 minutes.
     SystemCpuStealTimeWarning:
       if: >-
         cpu_usage_steal > {{ cpu_steal_warn }}
       for: 5m
       labels:
         severity: warning
         service: system
       annotations:
         summary: "CPU steal time warning"
         description: "The CPU steal time was above {{ cpu_steal_warn }}% on the {%- raw %} {{ $labels.host }}{%- endraw %} node for 5 minutes."
     # Fires when cpu_usage_steal stays above the critical threshold for 5 minutes.
     SystemCpuStealTimeCritical:
       if: >-
         cpu_usage_steal > {{ cpu_steal_crit }}
       for: 5m
       labels:
         severity: critical
         service: system
       annotations:
         summary: "CPU steal time critical"
         # The raw section opened inside this description is closed by the
         # endraw tag at the top of the alert that follows.
         description: "The CPU steal time was above {{ cpu_steal_crit }}% on the {%- raw %} {{ $labels.host }} node for 5 minutes."
     # Fires when CPU usage (100 minus the 5-minute average of cpu_usage_idle
     # for cpu-total) stays above the threshold for 2 minutes.
     SystemCpuFullWarning:
       {%- endraw %}
       {#- Warning threshold from the monitoring map, cast to float. #}
       {%- set cpu_usage_threshold = monitoring.cpu_usage_percentage.warn|float %}
       if: >-
         100 - avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) > {{ cpu_usage_threshold }}
       {%- raw %}
       for: 2m
       labels:
         severity: warning
         service: system
       annotations:
         # The threshold is rendered by Jinja; everything else in these strings
         # is passed through untouched for Prometheus to expand.
         summary: "{%- endraw %}{{ cpu_usage_threshold }}{%- raw %}% CPU usage"
         description: "The average CPU usage on the {{ $labels.host }} node is {{ $value }}% for 2 minutes."
       {%- endraw %}
     {% if not ([
         salt['pillar.get']('nova:compute:enabled', False)
     ]|select('equalto', True)|list) %} {# glorified `not any(<iterable>)` condition #}
     # Fires when the 15-minute load per CPU stays above the warning threshold
     # for 5 minutes; hosts matching .*cmp[0-9]+ are excluded.
     SystemLoadTooHighWarning:
       {%- set load_threshold = monitoring.system_load_threshold.warn|float %}
       if: >-
         system_load15{host!~".*cmp[0-9]+"} / system_n_cpus > {{ load_threshold }}
       {%- raw %}
       for: 5m
       labels:
         severity: warning
         service: system
       annotations:
         # Keep the blank after the endraw tag: its left-strip marker removes the
         # one before it, which would glue the threshold onto the word "is".
         summary: "System load is {%- endraw %} {{ load_threshold }}{%- raw %}"
         description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes."
       {%- endraw %}
     # Fires when the 15-minute load per CPU stays above the critical threshold
     # for 5 minutes; hosts matching .*cmp[0-9]+ are excluded.
     SystemLoadTooHighCritical:
       {%- set load_threshold = monitoring.system_load_threshold.crit|float %}
       if: >-
         system_load15{host!~".*cmp[0-9]+"} / system_n_cpus > {{ load_threshold }}
       {%- raw %}
       for: 5m
       labels:
         severity: critical
         service: system
       annotations:
         # Keep the blank after the endraw tag: its left-strip marker removes the
         # one before it, which would glue the threshold onto the word "is".
         summary: "System load is {%- endraw %} {{ load_threshold }}{%- raw %}"
         description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes."
       {%- endraw %}
     {% endif %}
     # Fires when disk_used_percent is at or above the warning threshold for
     # 2 minutes; series with mode="ro" are excluded.
     SystemDiskFullWarning:
       {#- Warning threshold from the monitoring map, cast to float. #}
       {%- set disk_threshold = monitoring.disk_usage_percentage.warn|float %}
       if: >-
         disk_used_percent{mode!="ro"} >= {{ disk_threshold }}
       {%- raw %}
       for: 2m
       labels:
         severity: warning
         service: system
       annotations:
         # The blank after the endraw tag restores the space that its left-strip
         # marker removes in front of the threshold.
         summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
         description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes."
     # Same check as the warning alert, evaluated against the major threshold.
     SystemDiskFullMajor:
       {%- endraw %}
       {#- disk_threshold is rebound here to the major level. #}
       {%- set disk_threshold = monitoring.disk_usage_percentage.major|float %}
       if: >-
         disk_used_percent{mode!="ro"} >= {{ disk_threshold }}
       {%- raw %}
       for: 2m
       labels:
         severity: major
         service: system
       annotations:
         # The raw section reopened in the summary stays open past the end of
         # this alert.
         summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
         description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes."
     # Fires when the percentage of used inodes (100 * used / total) is at or
     # above the warning threshold for 2 minutes; series with mode="ro" are excluded.
     SystemDiskInodesFullWarning:
       {%- endraw %}
       {#- Warning threshold from the monitoring map, cast to float. #}
       {%- set inodes_threshold = monitoring.inodes_usage_percentage.warn|float %}
       if: >-
         100 * disk_inodes_used{mode!="ro"} / disk_inodes_total{mode!="ro"} >= {{ inodes_threshold }}
       for: 2m
       labels:
         severity: warning
         service: system
       annotations:
         # The raw section opened in the summary is closed at the top of the
         # alert that follows.
         summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
         description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes."
     # Same check as the warning alert, evaluated against the major threshold.
     SystemDiskInodesFullMajor:
       {%- endraw %}
       {#- inodes_threshold is rebound here to the major level. #}
       {%- set inodes_threshold = monitoring.inodes_usage_percentage.major|float %}
       if: >-
         100 * disk_inodes_used{mode!="ro"} / disk_inodes_total{mode!="ro"} >= {{ inodes_threshold }}
       for: 2m
       labels:
         severity: major
         service: system
       annotations:
         # The raw section opened in the summary stays open across the static
         # alerts that follow.
         summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
         description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes."
     # Any 1-minute increase of hdd_errors_total, held for 5 minutes.
     SystemDiskErrorsTooHigh:
       if: 'increase(hdd_errors_total[1m]) > 0'
       for: 5m
       labels: {severity: warning, service: system}
       annotations:
         summary: 'Disk {{ $labels.device }} is failing'
         description: 'The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for 5 minutes.'
     # diskio_weighted_io_time grew by more than 2000 over 10 minutes
     # (reported as 2 seconds in the annotations).
     SystemDiskBacklogWarning:
       if: 'increase(diskio_weighted_io_time[10m]) > 2000'
       labels: {severity: warning, service: system}
       annotations:
         summary: 'Disk {{ $labels.name }} requests waited 2 seconds'
         description: 'I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node waited in total 2 seconds on the device during the last 10 minutes.'
     # diskio_weighted_io_time grew by more than 5000 over 10 minutes
     # (reported as 5 seconds in the annotations).
     SystemDiskBacklogCritical:
       if: 'increase(diskio_weighted_io_time[10m]) > 5000'
       labels: {severity: critical, service: system}
       annotations:
         summary: 'Disk {{ $labels.name }} requests waited 5 seconds'
         description: 'I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node waited in total 5 seconds on the device during the last 10 minutes.'
     # diskio_io_time grew by more than 0.9 * 10 * 60 * 1000 over 10 minutes,
     # i.e. 90% of the window if the metric counts milliseconds (assumed).
     SystemDiskRequestQueuedWarning:
       if: 'increase(diskio_io_time[10m]) > 0.9 * 10 * 60 * 1000'
       labels: {severity: warning, service: system}
       annotations:
         summary: 'Disk {{ $labels.name }} requests were queued for 90% of time'
         description: 'I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node spent in queue 90% of the device time during the last 10 minutes.'
     # Same check with a 0.98 factor (98% of the window).
     SystemDiskRequestQueuedCritical:
       if: 'increase(diskio_io_time[10m]) > 0.98 * 10 * 60 * 1000'
       labels: {severity: critical, service: system}
       annotations:
         summary: 'Disk {{ $labels.name }} requests were queued for 98% of time'
         description: 'I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node spent in queue 98% of the device time during the last 10 minutes.'
     # Both conditions must hold for 2 minutes: more than 90% used and less
     # than 8 * 2^30 available.
     SystemMemoryFullWarning:
       if: 'mem_used_percent > 90 and mem_available < 8 * 2^30'
       for: 2m
       labels: {severity: warning, service: system}
       annotations:
         summary: 'More than 90% of memory is used and less than 8 GB of memory is available'
         description: 'The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes.'
     # Both conditions must hold for 2 minutes: more than 95% used and less
     # than 4 * 2^30 available.
     SystemMemoryFullMajor:
       if: 'mem_used_percent > 95 and mem_available < 4 * 2^30'
       for: 2m
       labels: {severity: major, service: system}
       annotations:
         summary: 'More than 95% of memory is used and less than 4 GB of memory is available'
         description: 'The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes.'
     # Fires when the 1-minute increase of net_drop_in exceeds the threshold,
     # unless bond_slave_active equals 0 for the same host and interface.
     SystemRxPacketsDroppedTooHigh:
     {%- endraw %}
       {#- Threshold is taken from the monitoring map as-is (no float cast). #}
       {%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_threshold.warn %}
       if: >-
         increase(net_drop_in[1m]) > {{ net_rx_dropped_threshold }} unless on (host,interface) bond_slave_active == 0
       labels:
         severity: warning
         service: system
       annotations:
         # The raw section opened in the summary is closed at the top of the
         # alert that follows.
         summary: "{{ net_rx_dropped_threshold }}{%- raw %} received packets were dropped"
         description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
     # Fires when the 1-minute increase of net_drop_out exceeds the threshold.
     SystemTxPacketsDroppedTooHigh:
       {%- endraw %}
       {#- Threshold is taken from the monitoring map as-is (no float cast). #}
       {%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_threshold.warn %}
       if: >-
         increase(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
       labels:
         severity: warning
         service: system
       annotations:
         # The raw section opened in the summary stays open across the static
         # alerts that follow.
         summary: "{{ net_tx_dropped_threshold }}{%- raw %} transmitted packets were dropped"
         description: "{{ $value }} packets transmitted by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
     # procstat_running for process_name="cron" equals 0 for 2 minutes.
     CronProcessDown:
       if: 'procstat_running{process_name="cron"} == 0'
       for: 2m
       labels: {severity: critical, service: system}
       annotations:
         summary: 'Cron process is down'
         description: 'The cron process on the {{ $labels.host }} node is down.'
     # procstat_running for process_name="sshd" equals 0 for 2 minutes.
     SshdProcessDown:
       if: 'procstat_running{process_name="sshd"} == 0'
       for: 2m
       labels: {severity: critical, service: system}
       annotations:
         summary: 'SSH process is down'
         description: 'The SSH process on the {{ $labels.host }} node is down.'
     # Fires when the 5-minute increase of failed_logins_total exceeds the threshold.
     SshFailedLoginsTooHigh:
       {%- endraw %}
       {#- Threshold is taken from the monitoring map as-is (no float cast). #}
       {%- set threshold = monitoring.failed_auths_threshold.warn %}
       if: >-
         increase(failed_logins_total[5m]) > {{ threshold }}
       labels:
         severity: warning
         service: system
       annotations:
         # The raw section opened in the summary stays open across the static
         # alerts that follow.
         summary: "{{ threshold }}{%- raw %} failed SSH logins"
         description: "{{ $value }} failed SSH login attempts on the {{ $labels.host }} node during the last 5 minutes."
     # The floored 10-minute increase of nstat_packet_drop is above 0.
     PacketsDroppedByCpuWarning:
       if: 'floor(increase(nstat_packet_drop[10m])) > 0'
       labels: {severity: warning, service: system}
       annotations:
         summary: 'Increased number of CPU dropped packets'
         description: 'The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 10 minutes.'
     # The floored 10-minute increase of nstat_packet_drop is above 100.
     PacketsDroppedByCpuMinor:
       if: 'floor(increase(nstat_packet_drop[10m])) > 100'
       labels: {severity: minor, service: system}
       annotations:
         summary: 'CPU dropped more than 100 packets'
         description: 'The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 10 minutes.'
     # Fires when the highest per-second rate of nstat_time_squeeze (5-minute
     # window, aggregated without the cpu label) stays above the threshold for
     # 7 minutes.
     NetdevBudgetRanOutsWarning:
       {%- endraw %}
       {#- Threshold is taken from the monitoring map as-is (no float cast). #}
       {%- set squeeze_rate_threshold = monitoring.netdev_budget_squeeze_rate %}
       if: >-
         max(rate(nstat_time_squeeze[5m])) without (cpu) > {{ squeeze_rate_threshold }}
       for: 7m
       labels:
         severity: warning
         service: system
       annotations:
         # The raw section opened in the summary is closed right after the
         # description below.
         summary: "CPU terminated {{ squeeze_rate_threshold }}{%- raw %} net_rx_action loops per second"
         description: "The rate of net_rx_action loops terminations on the {{ $labels.host }} node is {{ $value }} per second during the last 7 minutes. Modify the net.core.netdev_budget and net.core.netdev_budget_usecs kernel parameters."
       {%- endraw %}
     {%- if network.bridge == 'openvswitch' %}
       {%- raw %}
     # procstat_memory_vms of ovs-vswitchd divided by mem_total of the same
     # host is above 0.2 for 5 minutes.
     ProcessOVSVswitchdMemoryWarning:
       if: >-
         procstat_memory_vms{process_name="ovs-vswitchd"} / on(host) mem_total > 0.2
       for: 5m
       labels: {severity: warning, service: ovs}
       annotations:
         summary: 'ovs-vswitchd takes more than 20% of system memory'
         description: 'ovs-vswitchd takes more than 20% of system memory'
     # Same ratio above 0.3 for 5 minutes.
     ProcessOVSVswitchdMemoryCritical:
       if: >-
         procstat_memory_vms{process_name="ovs-vswitchd"} / on(host) mem_total > 0.3
       for: 5m
       labels: {severity: critical, service: ovs}
       annotations:
         summary: 'ovs-vswitchd takes more than 30% of system memory'
         description: 'ovs-vswitchd takes more than 30% of system memory'
       {%- endraw %}
     {%- endif %}
 {#- Collect the names of enabled interfaces of type bond from the network map. #}
 {%- set bond_interfaces = [] %}
 {%- for name, iface in network.interface.items() if iface.type == 'bond' and iface.enabled == True %}
   {%- do bond_interfaces.append(name) %}
 {%- endfor %}
 {%- if bond_interfaces|length > 0 %}
   {%- raw %}
     # A bond reports bond_status below 1.
     BondInterfaceDown:
       if: 'bond_status < 1'
       labels: {severity: critical, service: system}
       annotations:
         summary: '{{ $labels.bond }} bond interface is down'
         description: 'The {{ $labels.bond }} bond interface on the {{ $labels.host }} node has all ifaces down.'
     # A single bond slave reports bond_slave_status below 1.
     BondInterfaceSlaveDown:
       if: 'bond_slave_status < 1'
       labels: {severity: warning, service: system}
       annotations:
         summary: '{{ $labels.bond }} bond interface slave {{ $labels.interface }} is down'
         description: 'The {{ $labels.bond }} bond interface slave {{ $labels.interface }} on the {{ $labels.host }} node is down.'
     # Fires when the summed bond_slave_status of a bond is at most half of
     # its number of slaves.
     BondInterfaceSlaveDownMajor:
       # Both sides are aggregated by (bond,host). A bare count() yields one
       # series without labels, which can never match the left-hand side
       # under on (bond,host), so the alert would never fire.
       if: >-
         sum(bond_slave_status) by (bond,host) <= on (bond,host) 0.5 * count(bond_slave_status) by (bond,host)
       labels:
         severity: major
         service: system
       annotations:
         summary: "50% of bond interface slaves {{ $labels.bond }} are down"
         # NOTE(review): a filtering comparison keeps the left-hand value, so
         # $value is the sum of bond_slave_status rather than a count of down
         # slaves - confirm the wording of the description.
         description: "{{ $value }} {{ $labels.bond }} bond interface slaves on the {{ $labels.host }} node are down."
     # Exactly one bond_slave_status series exists for a (bond, host) pair.
     BondInterfaceSingleSlave:
       if: 'count(bond_slave_status) by (bond,host) == 1'
       labels: {severity: major, service: system}
       annotations:
         summary: 'The {{ $labels.bond }} bond interface has only one slave'
         description: 'The {{ $labels.bond }} bond interface on the {{ $labels.host }} node has only one slave.'
   {%- endraw %}
 {%- endif %}
	{%- from "linux/map.jinja" import monitoring, network with context %}
	server:
	alert:
	{%- raw %}
	SystemCpuIoWaitWarning:
	if: >-
	cpu_usage_iowait > 40
	for: 10m
	labels:
	severity: warning
	service: system
	annotations:
	summary: "CPU waited for I/O 40% of time"
	description: "The CPU on the {{ $labels.host }} node spent 40% of time waiting for I/O."
	SystemCpuIoWaitCritical:
	if: >-
	cpu_usage_iowait > 50
	for: 10m
	labels:
	severity: critical
	service: system
	annotations:
	summary: "CPU waited for I/O 50% of time"
	description: "The CPU on the {{ $labels.host }} node spent 50% of time waiting for I/O."
	{%- endraw %}
	{%- set cpu_steal_warn = monitoring.cpu_steal_percentage.warn\|float %}
	{%- set cpu_steal_crit = monitoring.cpu_steal_percentage.crit\|float %}
	SystemCpuStealTimeWarning:
	if: >-
	cpu_usage_steal > {{ cpu_steal_warn }}
	for: 5m
	labels:
	severity: warning
	service: system
	annotations:
	summary: "CPU steal time warning"
	description: "The CPU steal time was above {{ cpu_steal_warn }}% on the {%- raw %} {{ $labels.host }}{%- endraw %} node for 5 minutes."
	SystemCpuStealTimeCritical:
	if: >-
	cpu_usage_steal > {{ cpu_steal_crit }}
	for: 5m
	labels:
	severity: critical
	service: system
	annotations:
	summary: "CPU steal time critical"
	description: "The CPU steal time was above {{ cpu_steal_crit }}% on the {%- raw %} {{ $labels.host }} node for 5 minutes."
	SystemCpuFullWarning:
	{%- endraw %}
	{%- set cpu_usage_threshold = monitoring.cpu_usage_percentage.warn\|float %}
	if: >-
	100 - avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) > {{ cpu_usage_threshold }}
	{% raw %}
	for: 2m
	labels:
	severity: warning
	service: system
	annotations:
	summary: "{%- endraw %}{{ cpu_usage_threshold }}{%- raw %}% CPU usage"
	description: "The average CPU usage on the {{ $labels.host }} node is {{ $value }}% for 2 minutes."
	{%- endraw %}
	{% if not ([
	salt['pillar.get']('nova:compute:enabled', False)
	]\|select('equalto', True)\|list) %} {# glorified `not any(<iterable>)` condition #}
	SystemLoadTooHighWarning:
	{%- set load_threshold = monitoring.system_load_threshold.warn\|float %}
	if: >-
	system_load15{host!~".*cmp[0-9]+"} / system_n_cpus > {{ load_threshold }}
	{%- raw %}
	for: 5m
	labels:
	severity: warning
	service: system
	annotations:
	summary: "System load is {%- endraw %}{{ load_threshold }}{%- raw %}"
	description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes."
	{%- endraw %}
	SystemLoadTooHighCritical:
	{%- set load_threshold = monitoring.system_load_threshold.crit\|float %}
	if: >-
	system_load15{host!~".*cmp[0-9]+"} / system_n_cpus > {{ load_threshold }}
	{%- raw %}
	for: 5m
	labels:
	severity: critical
	service: system
	annotations:
	summary: "System load is {%- endraw %}{{ load_threshold }}{%- raw %}"
	description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes."
	{%- endraw %}
	{% endif %}
	SystemDiskFullWarning:
	{%- set disk_threshold = monitoring.disk_usage_percentage.warn\|float %}
	if: >-
	disk_used_percent{mode!="ro"} >= {{ disk_threshold }}
	{%- raw %}
	for: 2m
	labels:
	severity: warning
	service: system
	annotations:
	summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
	description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes."
	SystemDiskFullMajor:
	{%- endraw %}
	{%- set disk_threshold = monitoring.disk_usage_percentage.major\|float %}
	if: >-
	disk_used_percent{mode!="ro"} >= {{ disk_threshold }}
	{%- raw %}
	for: 2m
	labels:
	severity: major
	service: system
	annotations:
	summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
	description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes."
	SystemDiskInodesFullWarning:
	{%- endraw %}
	{%- set inodes_threshold = monitoring.inodes_usage_percentage.warn\|float %}
	if: >-
	100 * disk_inodes_used{mode!="ro"} / disk_inodes_total{mode!="ro"} >= {{ inodes_threshold }}
	for: 2m
	labels:
	severity: warning
	service: system
	annotations:
	summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
	description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes."
	SystemDiskInodesFullMajor:
	{%- endraw %}
	{%- set inodes_threshold = monitoring.inodes_usage_percentage.major\|float %}
	if: >-
	100 * disk_inodes_used{mode!="ro"} / disk_inodes_total{mode!="ro"} >= {{ inodes_threshold }}
	for: 2m
	labels:
	severity: major
	service: system
	annotations:
	summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
	description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes."
	SystemDiskErrorsTooHigh:
	if: >-
	increase(hdd_errors_total[1m]) > 0
	for: 5m
	labels:
	severity: warning
	service: system
	annotations:
	summary: "Disk {{ $labels.device }} is failing"
	description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for 5 minutes."
	SystemDiskBacklogWarning:
	if: >-
	increase(diskio_weighted_io_time[10m]) > 2000
	labels:
	severity: warning
	service: system
	annotations:
	summary: "Disk {{ $labels.name }} requests waited 2 seconds"
	description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node waited in total 2 seconds on the device during the last 10 minutes."
	SystemDiskBacklogCritical:
	if: >-
	increase(diskio_weighted_io_time[10m]) > 5000
	labels:
	severity: critical
	service: system
	annotations:
	summary: "Disk {{ $labels.name }} requests waited 5 seconds"
	description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node waited in total 5 seconds on the device during the last 10 minutes."
	SystemDiskRequestQueuedWarning:
	if: >-
	increase(diskio_io_time[10m]) > 0.9 * 10 * 60 * 1000
	labels:
	severity: warning
	service: system
	annotations:
	summary: "Disk {{ $labels.name }} requests were queued for 90% of time"
	description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node spent in queue 90% of the device time during the last 10 minutes."
	SystemDiskRequestQueuedCritical:
	if: >-
	increase(diskio_io_time[10m]) > 0.98 * 10 * 60 * 1000
	labels:
	severity: critical
	service: system
	annotations:
	summary: "Disk {{ $labels.name }} requests were queued for 98% of time"
	description: "I/O requests for the {{ $labels.name }} disk on the {{ $labels.host }} node spent in queue 98% of the device time during the last 10 minutes."
	SystemMemoryFullWarning:
	if: >-
	mem_used_percent > 90 and mem_available < 8 * 2^30
	for: 2m
	labels:
	severity: warning
	service: system
	annotations:
	summary: "More than 90% of memory is used and less than 8 GB of memory is available"
	description: "The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes."
	SystemMemoryFullMajor:
	if: >-
	mem_used_percent > 95 and mem_available < 4 * 2^30
	for: 2m
	labels:
	severity: major
	service: system
	annotations:
	summary: "More than 95% of memory is used and less than 4 GB of memory is available"
	description: "The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes."
	SystemRxPacketsDroppedTooHigh:
	{%- endraw %}
	{%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_threshold.warn %}
	if: >-
	increase(net_drop_in[1m]) > {{ net_rx_dropped_threshold }} unless on (host,interface) bond_slave_active == 0
	labels:
	severity: warning
	service: system
	annotations:
	summary: "{{ net_rx_dropped_threshold }}{%- raw %} received packets were dropped"
	description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
	SystemTxPacketsDroppedTooHigh:
	{%- endraw %}
	{%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_threshold.warn %}
	if: >-
	increase(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
	labels:
	severity: warning
	service: system
	annotations:
	summary: "{{ net_tx_dropped_threshold }}{%- raw %} transmitted packets were dropped"
	description: "{{ $value }} packets transmitted by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
	CronProcessDown:
	if: >-
	procstat_running{process_name="cron"} == 0
	for: 2m
	labels:
	severity: critical
	service: system
	annotations:
	summary: "Cron process is down"
	description: "The cron process on the {{ $labels.host }} node is down."
	SshdProcessDown:
	if: >-
	procstat_running{process_name="sshd"} == 0
	for: 2m
	labels:
	severity: critical
	service: system
	annotations:
	summary: "SSH process is down"
	description: "The SSH process on the {{ $labels.host }} node is down."
	SshFailedLoginsTooHigh:
	{%- endraw %}
	{%- set threshold = monitoring.failed_auths_threshold.warn %}
	if: >-
	increase(failed_logins_total[5m]) > {{ threshold }}
	labels:
	severity: warning
	service: system
	annotations:
	summary: "{{ threshold }}{%- raw %} failed SSH logins"
	description: "{{ $value }} failed SSH login attempts on the {{ $labels.host }} node during the last 5 minutes."
	PacketsDroppedByCpuWarning:
	if: >-
	floor(increase(nstat_packet_drop[10m])) > 0
	labels:
	severity: warning
	service: system
	annotations:
	summary: "Increased number of CPU dropped packets"
	description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 10 minutes."
	PacketsDroppedByCpuMinor:
	if: >-
	floor(increase(nstat_packet_drop[10m])) > 100
	labels:
	severity: minor
	service: system
	annotations:
	summary: "CPU dropped more than 100 packets"
	description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 10 minutes."
	NetdevBudgetRanOutsWarning:
	{%- endraw %}
	{%- set squeeze_rate_threshold = monitoring.netdev_budget_squeeze_rate %}
	if: >-
	max(rate(nstat_time_squeeze[5m])) without (cpu) > {{ squeeze_rate_threshold }}
	for: 7m
	labels:
	severity: warning
	service: system
	annotations:
	summary: "CPU terminated {{ squeeze_rate_threshold }}{%- raw %} net_rx_action loops per second"
	description: "The rate of net_rx_action loops terminations on the {{ $labels.host }} node is {{ $value }} per second during the last 7 minutes. Modify the net.core.netdev_budget and net.core.netdev_budget_usecs kernel parameters."
	{%- endraw %}
	{%- if network.bridge == 'openvswitch' %}
	{%- raw %}
	ProcessOVSVswitchdMemoryWarning:
	if: procstat_memory_vms{process_name="ovs-vswitchd"} / on(host) mem_total > 0.2
	for: 5m
	labels:
	severity: warning
	service: ovs
	annotations:
	summary: "ovs-vswitchd takes more than 20% of system memory"
	description: "ovs-vswitchd takes more than 20% of system memory"
	ProcessOVSVswitchdMemoryCritical:
	if: procstat_memory_vms{process_name="ovs-vswitchd"} / on(host) mem_total > 0.3
	for: 5m
	labels:
	severity: critical
	service: ovs
	annotations:
	summary: "ovs-vswitchd takes more than 30% of system memory"
	description: "ovs-vswitchd takes more than 30% of system memory"
	{%- endraw %}
	{%- endif %}
	{%- set bond_interfaces = [] %}
	{%- for interface_name, interface in network.interface.items() %}
	{%- if interface.type == 'bond' and interface.enabled == True %}
	{%- do bond_interfaces.append(interface_name) %}
	{%- endif %}
	{%- endfor %}
	{%- if bond_interfaces\|length > 0 %}
	{%- raw %}
	BondInterfaceDown:
	if: >-
	bond_status < 1
	labels:
	severity: critical
	service: system
	annotations:
	summary: "{{ $labels.bond }} bond interface is down"
	description: "The {{ $labels.bond }} bond interface on the {{ $labels.host }} node has all ifaces down."
	BondInterfaceSlaveDown:
	if: >-
	bond_slave_status < 1
	labels:
	severity: warning
	service: system
	annotations:
	summary: "{{ $labels.bond }} bond interface slave {{ $labels.interface }} is down"
	description: "The {{ $labels.bond }} bond interface slave {{ $labels.interface }} on the {{ $labels.host }} node is down."
	BondInterfaceSlaveDownMajor:
	if: >-
	sum(bond_slave_status) by (bond,host) <= on (bond,host) 0.5 * count(bond_slave_status)
	labels:
	severity: major
	service: system
	annotations:
	summary: "50% of bond interface slaves {{ $labels.bond }} are down"
	description: "{{ $value }} {{ $labels.bond }} bond interface slaves on the {{ $labels.host }} node are down."
	BondInterfaceSingleSlave:
	if: >-
	count(bond_slave_status) by (bond,host) == 1
	labels:
	severity: major
	service: system
	annotations:
	summary: "The {{ $labels.bond }} bond interface has only one slave"
	description: "The {{ $labels.bond }} bond interface on the {{ $labels.host }} node has only one slave."
	{%- endraw %}
	{%- endif %}