Merge pull request #150 from horakmar/repo-key-via-proxy
Workaround for fetching repo keys via proxy: point Acquire::https::proxy at an http:// URL so HTTPS repository traffic is tunnelled through the proxy instead of requiring a proxy that speaks TLS itself.
diff --git a/linux/files/95proxies b/linux/files/95proxies
index 5bbdaea..0f689c9 100644
--- a/linux/files/95proxies
+++ b/linux/files/95proxies
@@ -1,4 +1,4 @@
{%- from "linux/map.jinja" import network with context %}
Acquire::http::proxy "http://{{ network.proxy.host }}:{{ network.proxy.port }}/";
Acquire::ftp::proxy "ftp://{{ network.proxy.host }}:{{ network.proxy.port }}/";
-Acquire::https::proxy "https://{{ network.proxy.host }}:{{ network.proxy.port }}/";
\ No newline at end of file
+Acquire::https::proxy "http://{{ network.proxy.host }}:{{ network.proxy.port }}/";
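
For reference, a minimal sketch of the pillar data this template consumes and the apt configuration it should render to, assuming the usual linux:network:proxy pillar layout and a hypothetical proxy.example.local:3128:

    linux:
      network:
        proxy:
          host: proxy.example.local
          port: 3128

    Acquire::http::proxy "http://proxy.example.local:3128/";
    Acquire::ftp::proxy "ftp://proxy.example.local:3128/";
    Acquire::https::proxy "http://proxy.example.local:3128/";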
diff --git a/linux/map.jinja b/linux/map.jinja
index 4236a14..a526b5b 100644
--- a/linux/map.jinja
+++ b/linux/map.jinja
@@ -282,6 +282,9 @@
{% set monitoring = salt['grains.filter_by']({
'default': {
+ 'bond_status': {
+ 'interfaces': False
+ },
'zombie': {
'warn': 3,
'crit': 7,
@@ -307,30 +310,43 @@
'interface_regex': '^[a-z0-9]+$',
'ignore_selected': False,
},
- 'bond_status': {
- 'interfaces': False
+ 'cpu_usage_percentage': {
+ 'warn': 90.0,
},
- 'cpu_idle_percentage': {
- 'warn': 10.0,
+ 'memory_usage_percentage': {
+ 'warn': 90.0,
+ 'major': 95.0,
},
- 'free_memory_percentage': {
- 'warn': 10.0,
- 'crit': 5.0,
+ 'disk_usage_percentage': {
+ 'warn': 85.0,
+ 'major': 95.0,
},
- 'load_5': {
- 'warn': 3,
+ 'swap_usage_percentage': {
+ 'warn': 50.0,
+ 'minor': 90.0,
},
- 'rx_packets_dropped_rate': {
- 'warn': 100,
+ 'inodes_usage_percentage': {
+ 'warn': 85.0,
+ 'major': 95.0,
},
- 'tx_packets_dropped_rate': {
- 'warn': 100,
+ 'system_load_threshold': {
+ 'warn': 1,
+ 'crit': 2,
+ },
+ 'rx_packets_dropped_threshold': {
+ 'warn': 100,
+ },
+ 'tx_packets_dropped_threshold': {
+ 'warn': 100,
},
'swap_in_rate': {
- 'warn': 1024 * 1024,
+ 'warn': 1024 * 1024,
},
'swap_out_rate': {
- 'warn': 1024 * 1024,
+ 'warn': 1024 * 1024,
+ },
+ 'failed_auths_threshold': {
+ 'warn': 5,
},
},
}, grain='os_family', merge=salt['pillar.get']('linux:monitoring')) %}
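
The defaults above are merged with salt['pillar.get']('linux:monitoring'), so any threshold can be overridden per deployment from pillar; a minimal sketch with hypothetical values:

    linux:
      monitoring:
        cpu_usage_percentage:
          warn: 95.0
        disk_usage_percentage:
          warn: 80.0
          major: 90.0
        system_load_threshold:
          warn: 1.5
          crit: 3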
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index 5d75f74..695c736 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml
@@ -1,176 +1,242 @@
{%- from "linux/map.jinja" import monitoring with context %}
server:
alert:
- SystemCpuIdleTooLow:
- {%- set cpu_idle_threshold = monitoring.cpu_idle_percentage.warn|float %}
- if: avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) < {{ cpu_idle_threshold }}
+ SystemCpuFullWarning:
+ {%- set cpu_usage_threshold = monitoring.cpu_usage_percentage.warn|float %}
+ if: >-
+ 100 - avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) > {{ cpu_usage_threshold }}
{% raw %}
+ for: 2m
labels:
severity: warning
service: system
annotations:
- summary: 'Idle CPU usage too low on {{ $labels.host }}'
- description: 'The average idle CPU usage is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ cpu_idle_threshold}}%).'
- SystemDiskSpaceTooLow:
- if: 'predict_linear(disk_free[1h], 8*3600) < 0'
- {% raw %}
- for: 15m
+ summary: "{%- endraw %}{{ cpu_usage_threshold }}{%- raw %}% CPU usage"
+ description: "The average CPU usage on the {{ $labels.host }} node is {{ $value }}% for at least 2 minutes."
+ SystemLoadTooHighWarning:
+ {%- endraw %}
+ {%- set load_threshold = monitoring.system_load_threshold.warn|float %}
+ if: >-
+ system_load5 / system_n_cpus > {{ load_threshold }}
+ {%- raw %}
+ for: 5m
labels:
severity: warning
service: system
annotations:
- summary: 'Free space for {{ $labels.path }} too low on {{ $labels.host }}'
- description: 'The disk partition ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}.'
- {% endraw %}
- SystemFreeOpenFilesTooLow:
- if: 'predict_linear(linux_sysctl_fs_file_nr[1h], 8*3600) > linux_sysctl_fs_file_max'
- {% raw %}
+ summary: "System load is {%- endraw %}{{ load_threshold }}{%- raw %}"
+ description: "System load per CPU on the {{ $labels.host }} node is {{ $value }} for at least 5 minutes."
+ SystemLoadTooHighCritical:
+ {%- endraw %}
+ {%- set load_threshold = monitoring.system_load_threshold.crit|float %}
+ if: >-
+ system_load5 / system_n_cpus > {{ load_threshold }}
+ {%- raw %}
+ for: 5m
labels:
severity: warning
service: system
annotations:
- summary: 'Free open files for {{ $labels.path }} too low on {{ $labels.host }}'
- description: 'Host {{ $labels.host }}) will run out of free open files in less than 8 hours.'
- {% endraw %}
- SystemDiskErrors:
- if: 'increase(hdd_errors_total[5m]) > 0'
- {% raw %}
- labels:
- severity: critical
- service: system
- annotations:
- summary: 'Disk {{ $labels.device }} is failing'
- description: 'The disk ({{ $labels.device }}) is reporting errors on {{ $labels.host }}.'
- {% endraw %}
- SystemDiskSpaceFull:
- if: 'disk_used_percent >= 99 and disk_inodes_total > 0'
- {% raw %}
- labels:
- severity: critical
- service: system
- annotations:
- summary: 'Disk partition {{ $labels.path }} full on {{ $labels.host }}'
- description: 'The disk partition ({{ $labels.path }}) is used at {{ $value }}% on {{ $labels.host }}.'
- {% endraw %}
- SystemDiskInodesTooLow:
- if: 'predict_linear(disk_inodes_free[1h], 8*3600) < 0'
- {% raw %}
- for: 15m
+ summary: "System load is {%- endraw %}{{ load_threshold }}{%- raw %}"
+ description: "System load per CPU on the {{ $labels.host }} node is {{ $value }} for at least 5 minutes."
+ SystemDiskFullWarning:
+ {%- endraw %}
+ {%- set disk_threshold = monitoring.disk_usage_percentage.warn|float %}
+ if: >-
+ disk_used_percent >= {{ disk_threshold }}
+ {%- raw %}
+ for: 2m
labels:
severity: warning
service: system
annotations:
- summary: 'Free inodes for {{ $labels.path }} too low on {{ $labels.host }}'
- description: 'The disk inodes ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}.'
- {% endraw %}
- SystemDiskInodesFull:
- if: 'disk_inodes_used / disk_inodes_total >= 0.99'
- {% raw %}
+ summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
+ description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for at least 2 minutes."
+ SystemDiskFullMajor:
+ {%- endraw %}
+ {%- set disk_threshold = monitoring.disk_usage_percentage.major|float %}
+ if: >-
+ disk_used_percent >= {{ disk_threshold }}
+ {%- raw %}
+ for: 2m
labels:
- severity: critical
+ severity: major
service: system
annotations:
- summary: 'Inodes for {{ $labels.path }} full on {{ $labels.host }}'
- description: 'The disk inodes ({{ $labels.path }}) are used at {{ $value }}% on {{ $labels.host }}.'
- {% endraw %}
- SystemMemoryAvailableLow:
- {%- set mem_avail_warn_threshold = monitoring.free_memory_percentage.warn|float %}
- if: avg_over_time(mem_available_percent[5m]) < {{ mem_avail_warn_threshold }}
- {% raw %}
+ summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
+ description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for at least 2 minutes."
+ SystemDiskInodesFullWarning:
+ {%- endraw %}
+ {%- set inodes_threshold = monitoring.inodes_usage_percentage.warn|float %}
+ if: >-
+ 100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }}
+ for: 2m
labels:
severity: warning
service: system
annotations:
- summary: 'Free memory low on {{ $labels.host }}'
- description: 'The percentage of free memory is low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ mem_avail_warn_threshold }}%).'
- SystemMemoryAvailableTooLow:
- {%- set mem_avail_crit_threshold = monitoring.free_memory_percentage.crit|float %}
- if: avg_over_time(mem_available_percent[5m]) < {{ mem_avail_crit_threshold }}
- {% raw %}
+ summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
+ description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for at least 2 minutes."
+ SystemDiskInodesFullMajor:
+ {%- endraw %}
+ {%- set inodes_threshold = monitoring.inodes_usage_percentage.major|float %}
+ if: >-
+ 100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }}
+ for: 2m
labels:
- severity: critical
+ severity: major
service: system
annotations:
- summary: 'Free memory too low on {{ $labels.host }}'
- description: 'The percentage of free memory is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ mem_avail_crit_threshold }}%).'
- SystemLoad5TooHigh:
- if: system_load5 / system_n_cpus > {{ monitoring.load_5.warn }}
- {% raw %}
+ summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
+ description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for at least 2 minutes."
+ SystemDiskErrorsTooHigh:
+ if: >-
+ increase(hdd_errors_total[1m]) > 0
+ for: 5m
labels:
severity: warning
service: system
annotations:
- summary: 'High system load (5m) on {{ $labels.host }}'
- description: 'The 5-minutes system load is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ monitoring.load_5.warn }}).'
+ summary: "Disk {{ $labels.device }} is failing"
+ description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for at least 5 minutes."
+ SystemMemoryFullWarning:
+ {%- endraw %}
+ {%- set mem_threshold = monitoring.memory_usage_percentage.warn|float %}
+ if: >-
+ mem_used_percent >= {{ mem_threshold }}
+ for: 2m
+ labels:
+ severity: warning
+ service: system
+ annotations:
+ summary: "{{ mem_threshold }}{%- raw %}% of memory is used"
+ description: "The {{ $labels.host }} node uses {{ $value }}% of memory for at least 2 minutes."
+ SystemMemoryFullMajor:
+ {%- endraw %}
+ {%- set mem_threshold = monitoring.memory_usage_percentage.major|float %}
+ if: >-
+ mem_used_percent >= {{ mem_threshold }}
+ for: 2m
+ labels:
+ severity: major
+ service: system
+ annotations:
+ summary: "{{ mem_threshold }}{%- raw %}% of memory is used"
+ description: "The {{ $labels.host }} node uses {{ $value }}% of memory for at least 2 minutes."
+ SystemSwapFullWarning:
+ {%- endraw %}
+ {%- set swap_threshold = monitoring.swap_usage_percentage.warn|float %}
+ if: >-
+ swap_used_percent >= {{ swap_threshold }}
+ for: 2m
+ labels:
+ severity: warning
+ service: system
+ annotations:
+ summary: "{{ swap_threshold }}{%- raw %}% of swap is used"
+ description: "The swap on the {{ $labels.host }} node is {{ $value }}% used for at least 2 minutes."
+ SystemSwapFullMinor:
+ {%- endraw %}
+ {%- set swap_threshold = monitoring.swap_usage_percentage.minor|float %}
+ if: >-
+ swap_used_percent >= {{ swap_threshold }}
+ for: 2m
+ labels:
+ severity: minor
+ service: system
+ annotations:
+ summary: "{{ swap_threshold }}{%- raw %}% of swap is used"
+ description: "The swap on the {{ $labels.host }} node is {{ $value }}% used for at least 2 minutes."
SystemRxPacketsDroppedTooHigh:
- {%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_rate.warn %}
- if: rate(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
- {% raw %}
+ {%- endraw %}
+ {%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_threshold.warn %}
+ if: >-
+ increase(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
labels:
- severity: critical
+ severity: warning
service: system
annotations:
- summary: 'Too many received packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
- description: 'The rate of received packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}/sec, threshold={% endraw %}{{ net_rx_dropped_threshold }}/sec)'
+ summary: "{{ net_rx_dropped_threshold }}{%- raw %} received packets were dropped"
+ description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
+ SystemRxPacketsDroppedLongTermTooHigh:
+ if: >-
+ increase(net_drop_in[1m]) > 0
+ for: 10m
+ labels:
+ severity: major
+ service: system
+ annotations:
+ summary: "Received packets long term dropping"
+ description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last 10 minutes."
SystemTxPacketsDroppedTooHigh:
- {%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_rate.warn %}
- if: rate(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
- {% raw %}
+ {%- endraw %}
+ {%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_threshold.warn %}
+ if: >-
+ increase(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
+ labels:
+ severity: warning
+ service: system
+ annotations:
+ summary: "{{ net_tx_dropped_threshold }}{%- raw %} transmitted packets were dropped"
+ description: "{{ $value }} packets transmitted by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
+ CronProcessDown:
+ if: >-
+ procstat_running{process_name="cron"} == 0
labels:
severity: critical
service: system
annotations:
- summary: 'Too many transmitted packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
- description: 'The rate of transmitted packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}/sec, threshold={% endraw %}{{ net_tx_dropped_threshold }}/sec)'
- SystemSwapIn:
- {%- set swap_in_threshold = monitoring.swap_in_rate.warn %}
- if: rate(swap_in[2m]) > {{ swap_in_threshold }}
- {% raw %}
+ summary: "Cron process is down"
+ description: "The cron process on the {{ $labels.host }} node is down."
+ SshdProcessDown:
+ if: >-
+ procstat_running{process_name="sshd"} == 0
+ labels:
+ severity: critical
+ service: system
+ annotations:
+ summary: "SSH process is down"
+ description: "The SSH process on the {{ $labels.host }} node is down."
+ SshFailedLoginsTooHigh:
+ {%- endraw %}
+ {%- set threshold = monitoring.failed_auths_threshold.warn %}
+ if: >-
+ increase(failed_logins_total[5m]) > {{ threshold }}
labels:
severity: warning
service: system
annotations:
- summary: 'Swap input throughput too high on {{ $labels.host }}'
- description: 'The rate of swap input bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_in_threshold }}b/s).'
- SystemSwapOut:
- {%- set swap_out_threshold = monitoring.swap_out_rate.warn %}
- if: rate(swap_out[2m]) > {{ swap_out_threshold }}
- {% raw %}
- labels:
- severity: warning
- service: system
- annotations:
- summary: 'Swap output throughput too high on {{ $labels.host }}'
- description: 'The rate of swap output bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_out_threshold }}b/s).'
+ summary: "{{ threshold }}{%- raw %} failed SSH logins"
+ description: "{{ $value }} failed SSH login attempts on the {{ $labels.host }} node during the last 5 minutes."
+{%- endraw %}
{%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
+{%- raw %}
BondInterfaceDown:
- if: 'bond_status < 1'
- {% raw %}
+ if: >-
+ bond_status < 1
labels:
severity: critical
service: system
annotations:
- summary: 'Bond status interface {{ $labels.bond }} is DOWN on {{ $labels.host }}'
- description: 'The bond interface ({{ $labels.bond }) has all ifaces in a down state on {{ $labels.host }}.'
- {% endraw %}
- BondSlaveInterfacesMinimum:
- if: '(sum(bond_slave_status) BY (bond,host)) / (count(bond_slave_status) BY (bond,host)) <= 0.5'
- {% raw %}
- labels:
- severity: critical
- service: system
- annotations:
- summary: 'At least half of Bond slave interfaces {{ $labels.bond }} are DOWN on {{ $labels.host }}'
- description: 'The bond interface ({{ $labels.bond }) has at least half of slave ifaces in a down state on {{ $labels.host }}.'
- {% endraw %}
- BondSlaveInterfaceStatus:
- if: 'bond_slave_status < 1'
- {% raw %}
+ summary: "{{ $labels.bond }} bond interface is down"
+ description: "The {{ $labels.bond }} bond interface on the {{ $labels.host }} node has all ifaces down."
+ BondInterfaceSlaveDown:
+ if: >-
+ bond_slave_status < 1
labels:
severity: warning
service: system
annotations:
- summary: 'Bond slave interface {{ $labels.interface }} is DOWN on {{ $labels.host }} for {{ $labels.bond }}'
- description: 'The bond slave interface ({{ $labels.interface }) is in DOWN state for {{ $labels.bond }} on {{ $labels.host }}.'
- {% endraw %}
+ summary: "{{ $labels.bond }} bond interface slave {{ $labels.interface }} is down"
+ description: "The {{ $labels.bond }} bond interface slave {{ $labels.interface }} on the {{ $labels.host }} node is down."
+ BondInterfaceSlaveDownMajor:
+ if: >-
+ sum(bond_slave_status) by (bond,host) <= on (bond,host) 0.5 * count(bond_slave_status)
+ labels:
+ severity: major
+ service: system
+ annotations:
+ summary: "50% of bond interface slaves {{ $labels.bond }} are down"
+ description: "{{ $value }} {{ $labels.bond }} bond interface slaves on the {{ $labels.host }} node are down."
+{% endraw %}
{%- endif %}
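
Because the default leaves bond_status.interfaces set to False, the Bond* alerts above (and the Telegraf bond input below) only render once bond monitoring is enabled from pillar, for example with a hypothetical bond0:

    linux:
      monitoring:
        bond_status:
          interfaces:
            - bond0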
diff --git a/linux/meta/telegraf.yml b/linux/meta/telegraf.yml
index 854bf26..0c39da1 100644
--- a/linux/meta/telegraf.yml
+++ b/linux/meta/telegraf.yml
@@ -23,6 +23,12 @@
processes:
swap:
system:
+ procstat:
+ process:
+ sshd:
+ exe: sshd
+ cron:
+ exe: cron
linux_sysctl_fs:
{%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
bond:
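
The new procstat entries watch the sshd and cron processes that the SshdProcessDown and CronProcessDown alerts above check. Assuming the stock Telegraf procstat plugin, the rendered input should look roughly like:

    [[inputs.procstat]]
      exe = "sshd"

    [[inputs.procstat]]
      exe = "cron"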
diff --git a/linux/network/dpdk.sls b/linux/network/dpdk.sls
index e16f9c9..c92bc82 100644
--- a/linux/network/dpdk.sls
+++ b/linux/network/dpdk.sls
@@ -138,7 +138,7 @@
After=openvswitch-switch.service
{# enforce ip address and mtu for ovs dpdk br-prv #}
-/etc/network/interfaces.d/ifcfg-{{ interface_name }}:
+/etc/network/interfaces.u/ifcfg-{{ interface_name }}:
file.managed:
- contents: |
auto {{ interface_name }}
@@ -148,6 +148,7 @@
{%- if interface.mtu is defined %}
mtu {{ interface.mtu }}
{%- endif %}
+ - makedirs: True
- require:
- file: /etc/systemd/system/ifup@{{ interface_name }}.service.d/override.conf
diff --git a/linux/network/interface.sls b/linux/network/interface.sls
index 21ec084..f2691a5 100644
--- a/linux/network/interface.sls
+++ b/linux/network/interface.sls
@@ -357,6 +357,9 @@
gateway: {{ route.gateway }}
{%- endif %}
{%- endfor %}
+ {%- if interface.noifupdown is defined %}
+ - require_reboot: {{ interface.noifupdown }}
+ {%- endif %}
{%- endif %}
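
With this change, an interface that sets noifupdown in pillar has the flag forwarded to the network.managed state as require_reboot. A minimal interface pillar sketch with hypothetical addressing:

    linux:
      network:
        interface:
          eth1:
            enabled: true
            type: eth
            proto: static
            address: 10.0.0.10
            netmask: 255.255.255.0
            noifupdown: true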