linux/meta/prometheus.yml - salt-formulas/linux - Gitiles

 {%- from "linux/map.jinja" import monitoring with context %}
 server:
   alert:
     # Warning: CPU usage (100 minus the 5m average of cpu_usage_idle for
     # cpu-total) stays above the configured warn threshold for 2 minutes.
     SystemCpuFullWarning:
       {%- set cpu_usage_threshold = monitoring.cpu_usage_percentage.warn|float %}
       for: 2m
       labels:
         severity: warning
         service: system
       if: '100 - avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) > {{ cpu_usage_threshold }}'
       annotations:
         summary: '{{ cpu_usage_threshold }}% CPU usage'
         {%- raw %}
         description: 'The average CPU usage on the {{ $labels.host }} node is {{ $value }}% for 2 minutes.'
     # Warning: system_load5 divided by system_n_cpus stays above the
     # configured warn threshold for 5 minutes.
     SystemLoadTooHighWarning:
       {%- endraw %}
       {%- set load_threshold = monitoring.system_load_threshold.warn|float %}
       if: >-
         system_load5 / system_n_cpus > {{ load_threshold }}
       {%- raw %}
       for: 5m
       labels:
         severity: warning
         service: system
       annotations:
         # Keep the space after the closing raw tag: that tag trims the
         # whitespace in front of it, so the separator between "is" and the
         # threshold has to come after the tag.
         summary: "System load is {%- endraw %} {{ load_threshold }}{%- raw %}"
         description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes."
     # Critical: system_load5 divided by system_n_cpus stays above the
     # configured crit threshold for 5 minutes.
     SystemLoadTooHighCritical:
       {%- endraw %}
       {%- set load_threshold = monitoring.system_load_threshold.crit|float %}
       if: >-
         system_load5 / system_n_cpus > {{ load_threshold }}
       {%- raw %}
       for: 5m
       labels:
         # Severity follows the rule name and the crit threshold it uses.
         severity: critical
         service: system
       annotations:
         # Keep the space after the closing raw tag: that tag trims the
         # whitespace in front of it, so the separator between "is" and the
         # threshold has to come after the tag.
         summary: "System load is {%- endraw %} {{ load_threshold }}{%- raw %}"
         description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes."
     # Warning: disk_used_percent stays at or above the configured warn
     # threshold for 2 minutes.
     SystemDiskFullWarning:
       for: 2m
       labels:
         severity: warning
         service: system
       {%- endraw %}
       {%- set disk_threshold = monitoring.disk_usage_percentage.warn|float %}
       if: 'disk_used_percent >= {{ disk_threshold }}'
       annotations:
         summary: 'Disk partition {% raw %}{{ $labels.path }}{% endraw %} is {{ disk_threshold }}% full'
         {%- raw %}
         description: 'The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes.'
     # Major: disk_used_percent stays at or above the configured major
     # threshold for 2 minutes.
     SystemDiskFullMajor:
       for: 2m
       labels:
         severity: major
         service: system
       {%- endraw %}
       {%- set disk_threshold = monitoring.disk_usage_percentage.major|float %}
       if: 'disk_used_percent >= {{ disk_threshold }}'
       annotations:
         summary: 'Disk partition {% raw %}{{ $labels.path }}{% endraw %} is {{ disk_threshold }}% full'
         {%- raw %}
         description: 'The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes.'
     # Warning: inode usage (100 * disk_inodes_used / disk_inodes_total) stays
     # at or above the configured warn threshold for 2 minutes.
     SystemDiskInodesFullWarning:
       for: 2m
       labels:
         severity: warning
         service: system
       {%- endraw %}
       {%- set inodes_threshold = monitoring.inodes_usage_percentage.warn|float %}
       if: '100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }}'
       annotations:
         summary: '{{ inodes_threshold }}{% raw %}% of inodes for {{ $labels.path }} are used'
         description: 'The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes.'
     # Major: inode usage (100 * disk_inodes_used / disk_inodes_total) stays
     # at or above the configured major threshold for 2 minutes.
     SystemDiskInodesFullMajor:
       for: 2m
       labels:
         severity: major
         service: system
       {%- endraw %}
       {%- set inodes_threshold = monitoring.inodes_usage_percentage.major|float %}
       if: '100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }}'
       annotations:
         summary: '{{ inodes_threshold }}{% raw %}% of inodes for {{ $labels.path }} are used'
         description: 'The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes.'
     # Warning: the 1m increase of hdd_errors_total stays above zero for
     # 5 minutes.
     SystemDiskErrorsTooHigh:
       for: 5m
       labels:
         severity: warning
         service: system
       if: 'increase(hdd_errors_total[1m]) > 0'
       annotations:
         summary: 'Disk {{ $labels.device }} is failing'
         description: 'The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for 5 minutes.'
     # Warning: mem_used_percent stays at or above the configured warn
     # threshold for 2 minutes.
     SystemMemoryFullWarning:
       for: 2m
       labels:
         severity: warning
         service: system
       {%- endraw %}
       {%- set mem_threshold = monitoring.memory_usage_percentage.warn|float %}
       if: 'mem_used_percent >= {{ mem_threshold }}'
       annotations:
         summary: '{{ mem_threshold }}{% raw %}% of memory is used'
         description: 'The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes.'
     # Major: mem_used_percent stays at or above the configured major
     # threshold for 2 minutes.
     SystemMemoryFullMajor:
       for: 2m
       labels:
         severity: major
         service: system
       {%- endraw %}
       {%- set mem_threshold = monitoring.memory_usage_percentage.major|float %}
       if: 'mem_used_percent >= {{ mem_threshold }}'
       annotations:
         summary: '{{ mem_threshold }}{% raw %}% of memory is used'
         description: 'The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes.'
     # Warning: swap_used_percent stays at or above the configured warn
     # threshold for 2 minutes.
     SystemSwapFullWarning:
       for: 2m
       labels:
         severity: warning
         service: system
       {%- endraw %}
       {%- set swap_threshold = monitoring.swap_usage_percentage.warn|float %}
       if: 'swap_used_percent >= {{ swap_threshold }}'
       annotations:
         summary: '{{ swap_threshold }}{% raw %}% of swap is used'
         description: 'The swap on the {{ $labels.host }} node is {{ $value }}% used for 2 minutes.'
     # Minor: swap_used_percent stays at or above the configured minor
     # threshold for 2 minutes.
     SystemSwapFullMinor:
       for: 2m
       labels:
         severity: minor
         service: system
       {%- endraw %}
       {%- set swap_threshold = monitoring.swap_usage_percentage.minor|float %}
       if: 'swap_used_percent >= {{ swap_threshold }}'
       annotations:
         summary: '{{ swap_threshold }}{% raw %}% of swap is used'
         description: 'The swap on the {{ $labels.host }} node is {{ $value }}% used for 2 minutes.'
     # Warning: the 1m increase of net_drop_in exceeds the configured warn
     # threshold. The rule has no "for" clause.
     SystemRxPacketsDroppedTooHigh:
       labels:
         severity: warning
         service: system
       {%- endraw %}
       {%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_threshold.warn %}
       if: >-
         increase(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
       annotations:
         summary: "{{ net_rx_dropped_threshold }}{% raw %} received packets were dropped"
         description: '{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute.'
     # Major: the 1m increase of net_drop_in stays above zero for 10 minutes.
     SystemRxPacketsDroppedLongTermTooHigh:
       for: 10m
       labels:
         severity: major
         service: system
       if: 'increase(net_drop_in[1m]) > 0'
       annotations:
         summary: 'Received packets long term dropping'
         description: '{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last 10 minutes.'
     # Warning: the 1m increase of net_drop_out exceeds the configured warn
     # threshold. The rule has no "for" clause.
     SystemTxPacketsDroppedTooHigh:
       labels:
         severity: warning
         service: system
       {%- endraw %}
       {%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_threshold.warn %}
       if: >-
         increase(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
       annotations:
         summary: "{{ net_tx_dropped_threshold }}{% raw %} transmitted packets were dropped"
         description: '{{ $value }} packets transmitted by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute.'
     # Critical: procstat_running for the cron process name equals 0.
     CronProcessDown:
       labels:
         severity: critical
         service: system
       if: 'procstat_running{process_name="cron"} == 0'
       annotations:
         summary: 'Cron process is down'
         description: 'The cron process on the {{ $labels.host }} node is down.'
     # Critical: procstat_running for the sshd process name equals 0.
     SshdProcessDown:
       labels:
         severity: critical
         service: system
       if: 'procstat_running{process_name="sshd"} == 0'
       annotations:
         summary: 'SSH process is down'
         description: 'The SSH process on the {{ $labels.host }} node is down.'
     # Warning: the 5m increase of failed_logins_total exceeds the configured
     # warn threshold. The rule has no "for" clause.
     SshFailedLoginsTooHigh:
       labels:
         severity: warning
         service: system
       {%- endraw %}
       {%- set threshold = monitoring.failed_auths_threshold.warn %}
       if: >-
         increase(failed_logins_total[5m]) > {{ threshold }}
       annotations:
         summary: "{{ threshold }}{% raw %} failed SSH logins"
         description: '{{ $value }} failed SSH login attempts on the {{ $labels.host }} node during the last 5 minutes.'
 {%- endraw %}
 {%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
 {%- raw %}
     # Critical: bond_status is below 1.
     BondInterfaceDown:
       labels:
         severity: critical
         service: system
       if: 'bond_status < 1'
       annotations:
         summary: '{{ $labels.bond }} bond interface is down'
         description: 'The {{ $labels.bond }} bond interface on the {{ $labels.host }} node has all ifaces down.'
     # Warning: bond_slave_status is below 1 for a bond slave interface.
     BondInterfaceSlaveDown:
       labels:
         severity: warning
         service: system
       if: 'bond_slave_status < 1'
       annotations:
         summary: '{{ $labels.bond }} bond interface slave {{ $labels.interface }} is down'
         description: 'The {{ $labels.bond }} bond interface slave {{ $labels.interface }} on the {{ $labels.host }} node is down.'
     # Major: the summed bond_slave_status of a bond is at most half of its
     # slave count. Both sides are aggregated by (bond,host); an ungrouped
     # count yields one label-less series that the on (bond,host) matching
     # can never pair with the per-bond sums, so the rule would never fire.
     # NOTE(review): $value is the left-hand sum, i.e. the number of slaves
     # reporting 1, while the description words it as slaves that are down.
     BondInterfaceSlaveDownMajor:
       if: >-
         sum(bond_slave_status) by (bond,host) <= on (bond,host) 0.5 * count(bond_slave_status) by (bond,host)
       labels:
         severity: major
         service: system
       annotations:
         summary: "50% of bond interface slaves {{ $labels.bond }} are down"
         description: "{{ $value }} {{ $labels.bond }} bond interface slaves on the {{ $labels.host }} node are down."
 {% endraw %}
 {%- endif %}
	{#- Prometheus alert rules for Linux system metrics. Thresholds are read from the monitoring map imported from linux/map.jinja. #}
 {%- from "linux/map.jinja" import monitoring with context %}
 server:
   alert:
     # Warning: CPU usage (100 minus the 5m average of cpu_usage_idle for
     # cpu-total) stays above the configured warn threshold for 2 minutes.
     SystemCpuFullWarning:
       {%- set cpu_usage_threshold = monitoring.cpu_usage_percentage.warn|float %}
       if: >-
         100 - avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) > {{ cpu_usage_threshold }}
       {% raw %}
       for: 2m
       labels:
         severity: warning
         service: system
       annotations:
         summary: "{%- endraw %}{{ cpu_usage_threshold }}{%- raw %}% CPU usage"
         description: "The average CPU usage on the {{ $labels.host }} node is {{ $value }}% for 2 minutes."
     # Warning: system_load5 divided by system_n_cpus stays above the
     # configured warn threshold for 5 minutes.
     SystemLoadTooHighWarning:
       {%- endraw %}
       {%- set load_threshold = monitoring.system_load_threshold.warn|float %}
       if: >-
         system_load5 / system_n_cpus > {{ load_threshold }}
       {%- raw %}
       for: 5m
       labels:
         severity: warning
         service: system
       annotations:
         # The space sits after the closing raw tag because that tag trims
         # the whitespace in front of it.
         summary: "System load is {%- endraw %} {{ load_threshold }}{%- raw %}"
         description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes."
     # Critical: system_load5 divided by system_n_cpus stays above the
     # configured crit threshold for 5 minutes.
     SystemLoadTooHighCritical:
       {%- endraw %}
       {%- set load_threshold = monitoring.system_load_threshold.crit|float %}
       if: >-
         system_load5 / system_n_cpus > {{ load_threshold }}
       {%- raw %}
       for: 5m
       labels:
         severity: critical
         service: system
       annotations:
         # The space sits after the closing raw tag because that tag trims
         # the whitespace in front of it.
         summary: "System load is {%- endraw %} {{ load_threshold }}{%- raw %}"
         description: "The system load per CPU on the {{ $labels.host }} node is {{ $value }} for 5 minutes."
     # Warning: disk_used_percent at or above the warn threshold for 2 minutes.
     SystemDiskFullWarning:
       {%- endraw %}
       {%- set disk_threshold = monitoring.disk_usage_percentage.warn|float %}
       if: >-
         disk_used_percent >= {{ disk_threshold }}
       {%- raw %}
       for: 2m
       labels:
         severity: warning
         service: system
       annotations:
         summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
         description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes."
     # Major: disk_used_percent at or above the major threshold for 2 minutes.
     SystemDiskFullMajor:
       {%- endraw %}
       {%- set disk_threshold = monitoring.disk_usage_percentage.major|float %}
       if: >-
         disk_used_percent >= {{ disk_threshold }}
       {%- raw %}
       for: 2m
       labels:
         severity: major
         service: system
       annotations:
         summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
         description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for 2 minutes."
     # Warning: inode usage (100 * used / total) at or above the warn
     # threshold for 2 minutes.
     SystemDiskInodesFullWarning:
       {%- endraw %}
       {%- set inodes_threshold = monitoring.inodes_usage_percentage.warn|float %}
       if: >-
         100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }}
       for: 2m
       labels:
         severity: warning
         service: system
       annotations:
         summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
         description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes."
     # Major: inode usage (100 * used / total) at or above the major
     # threshold for 2 minutes.
     SystemDiskInodesFullMajor:
       {%- endraw %}
       {%- set inodes_threshold = monitoring.inodes_usage_percentage.major|float %}
       if: >-
         100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }}
       for: 2m
       labels:
         severity: major
         service: system
       annotations:
         summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
         description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for 2 minutes."
     # Warning: the 1m increase of hdd_errors_total stays above zero for
     # 5 minutes.
     SystemDiskErrorsTooHigh:
       if: >-
         increase(hdd_errors_total[1m]) > 0
       for: 5m
       labels:
         severity: warning
         service: system
       annotations:
         summary: "Disk {{ $labels.device }} is failing"
         description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for 5 minutes."
     # Warning: mem_used_percent at or above the warn threshold for 2 minutes.
     SystemMemoryFullWarning:
       {%- endraw %}
       {%- set mem_threshold = monitoring.memory_usage_percentage.warn|float %}
       if: >-
         mem_used_percent >= {{ mem_threshold }}
       for: 2m
       labels:
         severity: warning
         service: system
       annotations:
         summary: "{{ mem_threshold }}{%- raw %}% of memory is used"
         description: "The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes."
     # Major: mem_used_percent at or above the major threshold for 2 minutes.
     SystemMemoryFullMajor:
       {%- endraw %}
       {%- set mem_threshold = monitoring.memory_usage_percentage.major|float %}
       if: >-
         mem_used_percent >= {{ mem_threshold }}
       for: 2m
       labels:
         severity: major
         service: system
       annotations:
         summary: "{{ mem_threshold }}{%- raw %}% of memory is used"
         description: "The {{ $labels.host }} node uses {{ $value }}% of memory for 2 minutes."
     # Warning: swap_used_percent at or above the warn threshold for 2 minutes.
     SystemSwapFullWarning:
       {%- endraw %}
       {%- set swap_threshold = monitoring.swap_usage_percentage.warn|float %}
       if: >-
         swap_used_percent >= {{ swap_threshold }}
       for: 2m
       labels:
         severity: warning
         service: system
       annotations:
         summary: "{{ swap_threshold }}{%- raw %}% of swap is used"
         description: "The swap on the {{ $labels.host }} node is {{ $value }}% used for 2 minutes."
     # Minor: swap_used_percent at or above the minor threshold for 2 minutes.
     SystemSwapFullMinor:
       {%- endraw %}
       {%- set swap_threshold = monitoring.swap_usage_percentage.minor|float %}
       if: >-
         swap_used_percent >= {{ swap_threshold }}
       for: 2m
       labels:
         severity: minor
         service: system
       annotations:
         summary: "{{ swap_threshold }}{%- raw %}% of swap is used"
         description: "The swap on the {{ $labels.host }} node is {{ $value }}% used for 2 minutes."
     # Warning: the 1m increase of net_drop_in exceeds the warn threshold.
     SystemRxPacketsDroppedTooHigh:
       {%- endraw %}
       {%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_threshold.warn %}
       if: >-
         increase(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
       labels:
         severity: warning
         service: system
       annotations:
         summary: "{{ net_rx_dropped_threshold }}{%- raw %} received packets were dropped"
         description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
     # Major: the 1m increase of net_drop_in stays above zero for 10 minutes.
     SystemRxPacketsDroppedLongTermTooHigh:
       if: >-
         increase(net_drop_in[1m]) > 0
       for: 10m
       labels:
         severity: major
         service: system
       annotations:
         summary: "Received packets long term dropping"
         description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last 10 minutes."
     # Warning: the 1m increase of net_drop_out exceeds the warn threshold.
     SystemTxPacketsDroppedTooHigh:
       {%- endraw %}
       {%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_threshold.warn %}
       if: >-
         increase(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
       labels:
         severity: warning
         service: system
       annotations:
         summary: "{{ net_tx_dropped_threshold }}{%- raw %} transmitted packets were dropped"
         description: "{{ $value }} packets transmitted by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
     # Critical: procstat_running for the cron process name equals 0.
     CronProcessDown:
       if: >-
         procstat_running{process_name="cron"} == 0
       labels:
         severity: critical
         service: system
       annotations:
         summary: "Cron process is down"
         description: "The cron process on the {{ $labels.host }} node is down."
     # Critical: procstat_running for the sshd process name equals 0.
     SshdProcessDown:
       if: >-
         procstat_running{process_name="sshd"} == 0
       labels:
         severity: critical
         service: system
       annotations:
         summary: "SSH process is down"
         description: "The SSH process on the {{ $labels.host }} node is down."
     # Warning: the 5m increase of failed_logins_total exceeds the warn
     # threshold.
     SshFailedLoginsTooHigh:
       {%- endraw %}
       {%- set threshold = monitoring.failed_auths_threshold.warn %}
       if: >-
         increase(failed_logins_total[5m]) > {{ threshold }}
       labels:
         severity: warning
         service: system
       annotations:
         summary: "{{ threshold }}{%- raw %} failed SSH logins"
         description: "{{ $value }} failed SSH login attempts on the {{ $labels.host }} node during the last 5 minutes."
 {%- endraw %}
 {%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
 {%- raw %}
     # Critical: bond_status is below 1.
     BondInterfaceDown:
       if: >-
         bond_status < 1
       labels:
         severity: critical
         service: system
       annotations:
         summary: "{{ $labels.bond }} bond interface is down"
         description: "The {{ $labels.bond }} bond interface on the {{ $labels.host }} node has all ifaces down."
     # Warning: bond_slave_status is below 1 for a bond slave interface.
     BondInterfaceSlaveDown:
       if: >-
         bond_slave_status < 1
       labels:
         severity: warning
         service: system
       annotations:
         summary: "{{ $labels.bond }} bond interface slave {{ $labels.interface }} is down"
         description: "The {{ $labels.bond }} bond interface slave {{ $labels.interface }} on the {{ $labels.host }} node is down."
     # Major: the summed bond_slave_status of a bond is at most half of its
     # slave count; both sides are aggregated by (bond,host) so that the
     # on (bond,host) matching can pair them.
     BondInterfaceSlaveDownMajor:
       if: >-
         sum(bond_slave_status) by (bond,host) <= on (bond,host) 0.5 * count(bond_slave_status) by (bond,host)
       labels:
         severity: major
         service: system
       annotations:
         summary: "50% of bond interface slaves {{ $labels.bond }} are down"
         description: "{{ $value }} {{ $labels.bond }} bond interface slaves on the {{ $labels.host }} node are down."
 {% endraw %}
 {%- endif %}