| {%- from "linux/map.jinja" import monitoring, network with context %} |
| server: |
| alert: |
| {%- set cpu_steal_warn = monitoring.cpu_steal_percentage.warn|float %} |
| {%- set cpu_steal_crit = monitoring.cpu_steal_percentage.crit|float %} |
| SystemCpuStealTimeWarning: |
| if: >- |
| cpu_usage_steal > {{ cpu_steal_warn }} |
| for: 5m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "CPU steal time warning" |
| description: "The CPU steal time was above {{ cpu_steal_warn }}% on the {%- raw %} {{ $labels.host }}{%- endraw %} node for 5 minutes." |
| SystemCpuStealTimeCritical: |
| if: >- |
| cpu_usage_steal > {{ cpu_steal_crit }} |
| for: 5m |
| labels: |
| severity: critical |
| service: system |
| annotations: |
| summary: "CPU steal time critical" |
| description: "The CPU steal time was above {{ cpu_steal_crit }}% on the {%- raw %} {{ $labels.host }} node for 5 minutes." |
| SystemCpuFullWarning: |
| {%- endraw %} |
| {%- set cpu_usage_threshold = monitoring.cpu_usage_percentage.warn|float %} |
| if: >- |
| 100 - avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) > {{ cpu_usage_threshold }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "{%- endraw %}{{ cpu_usage_threshold }}{%- raw %}% CPU usage" |
| description: "The average CPU usage on the {{ $labels.host }} node is {{ $value }}% for 2 minutes." |
| SystemLoadTooHighWarning: |
| {%- endraw %} |
| {%- set load_threshold = monitoring.system_load_threshold.warn|float %} |
| if: >- |
| system_load5 / system_n_cpus > {{ load_threshold }} |
| {%- raw %} |
| for: 5m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "System load is {%- endraw %}{{ load_threshold }}{%- raw %}" |
| description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes." |
| SystemLoadTooHighCritical: |
| {%- endraw %} |
| {%- set load_threshold = monitoring.system_load_threshold.crit|float %} |
| if: >- |
| system_load5 / system_n_cpus > {{ load_threshold }} |
| {%- raw %} |
| for: 5m |
| labels: |
| severity: critical |
| service: system |
| annotations: |
| summary: "System load is {%- endraw %}{{ load_threshold }}{%- raw %}" |
| description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes." |
| SystemDiskFullWarning: |
| {%- endraw %} |
| {%- set disk_threshold = monitoring.disk_usage_percentage.warn|float %} |
| if: >- |
| disk_used_percent >= {{ disk_threshold }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full" |
| description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes." |
| SystemDiskFullMajor: |
| {%- endraw %} |
| {%- set disk_threshold = monitoring.disk_usage_percentage.major|float %} |
| if: >- |
| disk_used_percent >= {{ disk_threshold }} |
| {%- raw %} |
| for: 2m |
| labels: |
| severity: major |
| service: system |
| annotations: |
| summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full" |
| description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes." |
| SystemDiskInodesFullWarning: |
| {%- endraw %} |
| {%- set inodes_threshold = monitoring.inodes_usage_percentage.warn|float %} |
| if: >- |
| 100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }} |
| for: 2m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used" |
| description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes." |
| SystemDiskInodesFullMajor: |
| {%- endraw %} |
| {%- set inodes_threshold = monitoring.inodes_usage_percentage.major|float %} |
| if: >- |
| 100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }} |
| for: 2m |
| labels: |
| severity: major |
| service: system |
| annotations: |
| summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used" |
| description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes." |
| SystemDiskErrorsTooHigh: |
| if: >- |
| increase(hdd_errors_total[1m]) > 0 |
| for: 5m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "Disk {{ $labels.device }} is failing" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for 5 minutes." |
| {%- endraw %} |
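| {#- |
| The SMART alerts below are rendered only on bare-metal nodes, where the `virtual` grain |
| reports 'physical'; virtual machines report their hypervisor type instead. A node can be |
| checked with: salt-call grains.get virtual |
| -#} |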
| {%- if grains.get('virtual', None) == 'physical' %} |
| {%- raw %} |
| SystemSMARTDiskUDMACrcErrorsTooHigh: |
| if: >- |
| increase(smart_device_udma_crc_errors[1m]) > 0 |
| for: 5m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "The {{ $labels.device }} disk has UDMA CRC errors" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting UDMA CRC errors for 5 minutes." |
| SystemSMARTDiskHealthStatus: |
| if: >- |
| smart_device_health_ok == 0 |
| for: 1m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "The {{ $labels.device }} disk has bad health" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting a bad health status for 1 minute." |
| SystemSMARTDiskReadErrorRate: |
| if: >- |
| increase(smart_device_read_error_rate[1m]) > 0 |
| for: 5m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "The {{ $labels.device }} disk has read errors" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting an increased read error rate for 5 minutes." |
| SystemSMARTDiskSeekErrorRate: |
| if: >- |
| increase(smart_device_seek_error_rate[1m]) > 0 |
| for: 5m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "The {{ $labels.device }} disk has seek errors" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting an increased seek error rate for 5 minutes." |
| SystemSMARTDiskTemperatureHigh: |
| if: >- |
| smart_device_temp_c >= 60 |
| for: 5m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "The {{ $labels.device }} disk temperature is high" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has a temperature of {{ $value }}C for 5 minutes." |
| SystemSMARTDiskReallocatedSectorsCount: |
| if: >- |
| smart_attribute_raw_value{name="Reallocated_Sector_Ct"} > 10 |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "The {{ $labels.device }} disk has reallocated sectors" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has reallocated {{ $value }} sectors." |
| SystemSMARTDiskCurrentPendingSectors: |
| if: >- |
| smart_attribute_raw_value{name="Current_Pending_Sector"} > 0 |
| labels: |
| severity: major |
| service: system |
| annotations: |
| summary: "The {{ $labels.device }} disk has current pending sectors" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has {{ $value }} 'current pending' sectors." |
| SystemSMARTDiskReportedUncorrectableErrors: |
| if: >- |
| smart_attribute_raw_value{name="Reported_Uncorrect"} > 0 |
| labels: |
| severity: major |
| service: system |
| annotations: |
| summary: "The {{ $labels.device }} disk has reported uncorrectable errors" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has {{ $value }} 'reported uncorrectable' errors." |
| SystemSMARTDiskOfflineUncorrectableSectors: |
| if: >- |
| smart_attribute_raw_value{name="Offline_Uncorrectable"} > 0 |
| labels: |
| severity: major |
| service: system |
| annotations: |
| summary: "The {{ $labels.device }} disk has offline uncorrectable sectors" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has {{ $value }} 'offline uncorrectable' sectors." |
| SystemSMARTDiskEndToEndError: |
| if: >- |
| smart_attribute_raw_value{name="End-to-End_Error"} > 0 |
| labels: |
| severity: major |
| service: system |
| annotations: |
| summary: "The {{ $labels.device }} disk has end-to-end errors" |
| description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has {{ $value }} 'end-to-end' errors." |
| {%- endraw %} |
| {%- endif %} |
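| {#- |
| Note on the memory alerts below: `^` is the PromQL power operator, so 8 * 2^30 and |
| 4 * 2^30 express 8 GiB and 4 GiB in bytes. |
| -#} |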
| {%- raw %} |
| SystemMemoryFullWarning: |
| if: >- |
| mem_used_percent > 90 and mem_available < 8 * 2^30 |
| for: 2m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "More than 90% of memory is used or less than 8 GB of memory is available" |
| description: "The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes." |
| SystemMemoryFullMajor: |
| if: >- |
| mem_used_percent > 95 and mem_available < 4 * 2^30 |
| for: 2m |
| labels: |
| severity: major |
| service: system |
| annotations: |
| summary: "More than 95% of memory is used or less than 4 GB of memory is available" |
| description: "The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes." |
| SystemRxPacketsDroppedTooHigh: |
| {%- endraw %} |
| {%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_threshold.warn %} |
| if: >- |
| increase(net_drop_in[1m]) > {{ net_rx_dropped_threshold }} unless on (host,interface) bond_slave_active == 0 |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "{{ net_rx_dropped_threshold }}{%- raw %} received packets were dropped" |
| description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute." |
| SystemTxPacketsDroppedTooHigh: |
| {%- endraw %} |
| {%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_threshold.warn %} |
| if: >- |
| increase(net_drop_out[1m]) > {{ net_tx_dropped_threshold }} |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "{{ net_tx_dropped_threshold }}{%- raw %} transmitted packets were dropped" |
| description: "{{ $value }} packets transmitted by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute." |
| CronProcessDown: |
| if: >- |
| procstat_running{process_name="cron"} == 0 |
| labels: |
| severity: critical |
| service: system |
| annotations: |
| summary: "Cron process is down" |
| description: "The cron process on the {{ $labels.host }} node is down." |
| SshdProcessDown: |
| if: >- |
| procstat_running{process_name="sshd"} == 0 |
| labels: |
| severity: critical |
| service: system |
| annotations: |
| summary: "SSH process is down" |
| description: "The SSH process on the {{ $labels.host }} node is down." |
| SshFailedLoginsTooHigh: |
| {%- endraw %} |
| {%- set threshold = monitoring.failed_auths_threshold.warn %} |
| if: >- |
| increase(failed_logins_total[5m]) > {{ threshold }} |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "{{ threshold }}{%- raw %} failed SSH logins" |
| description: "{{ $value }} failed SSH login attempts on the {{ $labels.host }} node during the last 5 minutes." |
| PacketsDroppedByCpuWarning: |
| if: >- |
| floor(increase(nstat_packet_drop[10m])) > 0 |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "Increased number of CPU dropped packets" |
| description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 10 minutes." |
| PacketsDroppedByCpuMinor: |
| if: >- |
| floor(increase(nstat_packet_drop[10m])) > 100 |
| labels: |
| severity: minor |
| service: system |
| annotations: |
| summary: "CPU dropped more than 100 packets" |
| description: "The {{ $labels.cpu }} CPU on the {{ $labels.host }} node dropped {{ $value }} packets during the last 10 minutes." |
| NetdevBudgetRanOutsWarning: |
| {%- endraw %} |
| {%- set squeeze_rate_threshold = monitoring.netdev_budget_squeeze_rate %} |
| if: >- |
| max(rate(nstat_time_squeeze[5m])) without (cpu) > {{ squeeze_rate_threshold }} |
| for: 7m |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "CPU terminated {{ squeeze_rate_threshold }}{%- raw %} net_rx_action loops per second" |
| description: "The rate of net_rx_action loops terminations on the {{ $labels.host }} node is {{ $value }} per second during the last 7 minutes. Modify the net.core.netdev_budget and net.core.netdev_budget_usecs kernel parameters." |
| {%- endraw %} |
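| {#- |
| The ovs-vswitchd memory alerts are rendered only when the formula is configured with |
| Open vSwitch as the bridge backend (network.bridge == 'openvswitch' in linux/map.jinja). |
| -#} |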
| {%- if network.bridge == 'openvswitch' %} |
| {%- raw %} |
| ProcessOVSVswitchdMemoryWarning: |
| if: procstat_memory_vms{process_name="ovs-vswitchd"} / on(host) mem_total > 0.2 |
| for: 5m |
| labels: |
| severity: warning |
| service: ovs |
| annotations: |
| summary: "ovs-vswitchd takes more than 20% of system memory" |
| description: "ovs-vswitchd takes more than 20% of system memory" |
| ProcessOVSVswitchdMemoryCritical: |
| if: procstat_memory_vms{process_name="ovs-vswitchd"} / on(host) mem_total > 0.3 |
| for: 5m |
| labels: |
| severity: critical |
| service: ovs |
| annotations: |
| summary: "ovs-vswitchd takes more than 30% of system memory" |
| description: "ovs-vswitchd takes more than 30% of system memory" |
| {%- endraw %} |
| {%- endif %} |
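| {#- |
| The loop below collects the bond interfaces that are defined and enabled under |
| network.interface; the bonding alerts are rendered only when at least one such interface |
| exists on the node. |
| -#} |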
| {%- set bond_interfaces = [] %} |
| {%- for interface_name, interface in network.interface.items() %} |
| {%- if interface.type == 'bond' and interface.enabled == True %} |
| {%- do bond_interfaces.append(interface_name) %} |
| {%- endif %} |
| {%- endfor %} |
| {%- if bond_interfaces|length > 0 %} |
| {%- raw %} |
| BondInterfaceDown: |
| if: >- |
| bond_status < 1 |
| labels: |
| severity: critical |
| service: system |
| annotations: |
| summary: "{{ $labels.bond }} bond interface is down" |
| description: "The {{ $labels.bond }} bond interface on the {{ $labels.host }} node has all ifaces down." |
| BondInterfaceSlaveDown: |
| if: >- |
| bond_slave_status < 1 |
| labels: |
| severity: warning |
| service: system |
| annotations: |
| summary: "{{ $labels.bond }} bond interface slave {{ $labels.interface }} is down" |
| description: "The {{ $labels.bond }} bond interface slave {{ $labels.interface }} on the {{ $labels.host }} node is down." |
| BondInterfaceSlaveDownMajor: |
| if: >- |
| sum(bond_slave_status) by (bond,host) <= on (bond,host) 0.5 * count(bond_slave_status) by (bond,host) |
| labels: |
| severity: major |
| service: system |
| annotations: |
| summary: "50% of bond interface slaves {{ $labels.bond }} are down" |
| description: "{{ $value }} {{ $labels.bond }} bond interface slaves on the {{ $labels.host }} node are down." |
| BondInterfaceSingleSlave: |
| if: >- |
| count(bond_slave_status) by (bond,host) == 1 |
| labels: |
| severity: major |
| service: system |
| annotations: |
| summary: "The {{ $labels.bond }} bond interface has only one slave" |
| description: "The {{ $labels.bond }} bond interface on the {{ $labels.host }} node has only one slave." |
| {%- endraw %} |
| {%- endif %} |