Merge pull request #150 from horakmar/repo-key-via-proxy

Workaround for fetching repo keys via proxy.
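
Some apt versions cannot connect to a proxy over TLS, so an Acquire::https::proxy URL
using the https:// scheme fails and repository keys served over HTTPS cannot be fetched
through the proxy. Pointing the HTTPS proxy at the same http:// URL lets apt tunnel those
requests through the proxy with CONNECT instead. As a rough illustration only (hypothetical
pillar values; the file presumably lands in /etc/apt/apt.conf.d/95proxies), the template
would render as:

    // hypothetical values: network.proxy.host=proxy.example.local, network.proxy.port=3128
    Acquire::http::proxy "http://proxy.example.local:3128/";
    Acquire::ftp::proxy "ftp://proxy.example.local:3128/";
    Acquire::https::proxy "http://proxy.example.local:3128/";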
diff --git a/linux/files/95proxies b/linux/files/95proxies
index 5bbdaea..0f689c9 100644
--- a/linux/files/95proxies
+++ b/linux/files/95proxies
@@ -1,4 +1,4 @@
 {%- from "linux/map.jinja" import network with context %}
 Acquire::http::proxy "http://{{ network.proxy.host }}:{{ network.proxy.port }}/";
 Acquire::ftp::proxy "ftp://{{ network.proxy.host }}:{{ network.proxy.port }}/";
-Acquire::https::proxy "https://{{ network.proxy.host }}:{{ network.proxy.port }}/";
\ No newline at end of file
+Acquire::https::proxy "http://{{ network.proxy.host }}:{{ network.proxy.port }}/";
diff --git a/linux/map.jinja b/linux/map.jinja
index 4236a14..a526b5b 100644
--- a/linux/map.jinja
+++ b/linux/map.jinja
@@ -282,6 +282,9 @@
 
 {% set monitoring = salt['grains.filter_by']({
     'default': {
+        'bond_status': {
+            'interfaces': False
+        },
         'zombie': {
             'warn': 3,
             'crit': 7,
@@ -307,30 +310,43 @@
             'interface_regex': '^[a-z0-9]+$',
             'ignore_selected': False,
         },
-        'bond_status': {
-            'interfaces': False
+        'cpu_usage_percentage': {
+            'warn': 90.0,
         },
-        'cpu_idle_percentage': {
-              'warn': 10.0,
+        'memory_usage_percentage': {
+            'warn': 90.0,
+            'major': 95.0,
         },
-        'free_memory_percentage': {
-              'warn': 10.0,
-              'crit': 5.0,
+        'disk_usage_percentage': {
+            'warn': 85.0,
+            'major': 95.0,
         },
-        'load_5': {
-              'warn': 3,
+        'swap_usage_percentage': {
+            'warn': 50.0,
+            'minor': 90.0,
         },
-        'rx_packets_dropped_rate': {
-              'warn': 100,
+        'inodes_usage_percentage': {
+            'warn': 85.0,
+            'major': 95.0,
         },
-        'tx_packets_dropped_rate': {
-              'warn': 100,
+        'system_load_threshold': {
+            'warn': 1,
+            'crit': 2,
+        },
+        'rx_packets_dropped_threshold': {
+            'warn': 100,
+        },
+        'tx_packets_dropped_threshold': {
+            'warn': 100,
         },
         'swap_in_rate': {
-              'warn': 1024 * 1024,
+            'warn': 1024 * 1024,
         },
         'swap_out_rate': {
-              'warn': 1024 * 1024,
+            'warn': 1024 * 1024,
+        },
+        'failed_auths_threshold': {
+            'warn': 5,
         },
     },
 }, grain='os_family', merge=salt['pillar.get']('linux:monitoring')) %}
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index 5d75f74..695c736 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml
@@ -1,176 +1,242 @@
 {%- from "linux/map.jinja" import monitoring with context %}
 server:
   alert:
-    SystemCpuIdleTooLow:
-      {%- set cpu_idle_threshold = monitoring.cpu_idle_percentage.warn|float %}
-      if: avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) < {{ cpu_idle_threshold }}
+    SystemCpuFullWarning:
+      {%- set cpu_usage_threshold = monitoring.cpu_usage_percentage.warn|float %}
+      if: >-
+        100 - avg_over_time(cpu_usage_idle{cpu="cpu-total"}[5m]) > {{ cpu_usage_threshold }}
       {% raw %}
+      for: 2m
       labels:
         severity: warning
         service: system
       annotations:
-        summary: 'Idle CPU usage too low on {{ $labels.host }}'
-        description: 'The average idle CPU usage is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ cpu_idle_threshold}}%).'
-    SystemDiskSpaceTooLow:
-      if: 'predict_linear(disk_free[1h], 8*3600) < 0'
-      {% raw %}
-      for: 15m
+        summary: "{%- endraw %}{{ cpu_usage_threshold }}{%- raw %}% CPU usage"
+        description: "The average CPU usage on the {{ $labels.host }} node is {{ $value }}% for at least 2 minutes."
+    SystemLoadTooHighWarning:
+      {%- endraw %}
+      {%- set load_threshold = monitoring.system_load_threshold.warn|float %}
+      if: >-
+        system_load5 / system_n_cpus > {{ load_threshold }}
+      {%- raw %}
+      for: 5m
       labels:
         severity: warning
         service: system
       annotations:
-        summary: 'Free space for {{ $labels.path }} too low on {{ $labels.host }}'
-        description: 'The disk partition ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}.'
-      {% endraw %}
-    SystemFreeOpenFilesTooLow:
-      if: 'predict_linear(linux_sysctl_fs_file_nr[1h], 8*3600) > linux_sysctl_fs_file_max'
-      {% raw %}
+        summary: "System load is {%- endraw %}{{ load_threshold }}{%- raw %}"
+        description: "System load per CPU on the {{ $labels.host }} node is {{ $value }} for at least 5 minutes."
+    SystemLoadTooHighCritical:
+      {%- endraw %}
+      {%- set load_threshold = monitoring.system_load_threshold.crit|float %}
+      if: >-
+        system_load5 / system_n_cpus > {{ load_threshold }}
+      {%- raw %}
+      for: 5m
       labels:
         severity: warning
         service: system
       annotations:
-        summary: 'Free open files for {{ $labels.path }} too low on {{ $labels.host }}'
-        description: 'Host {{ $labels.host }}) will run out of free open files in less than 8 hours.'
-      {% endraw %}
-    SystemDiskErrors:
-      if: 'increase(hdd_errors_total[5m]) > 0'
-      {% raw %}
-      labels:
-        severity: critical
-        service: system
-      annotations:
-        summary: 'Disk {{ $labels.device }} is failing'
-        description: 'The disk ({{ $labels.device }}) is reporting errors on {{ $labels.host }}.'
-      {% endraw %}
-    SystemDiskSpaceFull:
-      if: 'disk_used_percent >= 99 and disk_inodes_total > 0'
-      {% raw %}
-      labels:
-        severity: critical
-        service: system
-      annotations:
-        summary: 'Disk partition {{ $labels.path }} full on {{ $labels.host }}'
-        description: 'The disk partition ({{ $labels.path }}) is used at {{ $value }}% on {{ $labels.host }}.'
-      {% endraw %}
-    SystemDiskInodesTooLow:
-      if: 'predict_linear(disk_inodes_free[1h], 8*3600) < 0'
-      {% raw %}
-      for: 15m
+        summary: "System load is {%- endraw %}{{ load_threshold }}{%- raw %}"
+        description: "System load per CPU on the {{ $labels.host }} node is {{ $value }} for at least 5 minutes."
+    SystemDiskFullWarning:
+      {%- endraw %}
+      {%- set disk_threshold = monitoring.disk_usage_percentage.warn|float %}
+      if: >-
+        disk_used_percent >= {{ disk_threshold }}
+      {%- raw %}
+      for: 2m
       labels:
         severity: warning
         service: system
       annotations:
-        summary: 'Free inodes for {{ $labels.path }} too low on {{ $labels.host }}'
-        description: 'The disk inodes ({{ $labels.path }}) will be full in less than 8 hours on {{ $labels.host }}.'
-      {% endraw %}
-    SystemDiskInodesFull:
-      if: 'disk_inodes_used / disk_inodes_total >= 0.99'
-      {% raw %}
+        summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
+        description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for at least 2 minutes."
+    SystemDiskFullMajor:
+      {%- endraw %}
+      {%- set disk_threshold = monitoring.disk_usage_percentage.major|float %}
+      if: >-
+        disk_used_percent >= {{ disk_threshold }}
+      {%- raw %}
+      for: 2m
       labels:
-        severity: critical
+        severity: major
         service: system
       annotations:
-        summary: 'Inodes for {{ $labels.path }} full on {{ $labels.host }}'
-        description: 'The disk inodes ({{ $labels.path }}) are used at {{ $value }}% on {{ $labels.host }}.'
-      {% endraw %}
-    SystemMemoryAvailableLow:
-      {%- set mem_avail_warn_threshold = monitoring.free_memory_percentage.warn|float %}
-      if: avg_over_time(mem_available_percent[5m]) < {{ mem_avail_warn_threshold }}
-      {% raw %}
+        summary: "Disk partition {{ $labels.path }} is {%- endraw %} {{ disk_threshold }}{%- raw %}% full"
+        description: "The disk partition ({{ $labels.path }}) on the {{ $labels.host }} node is {{ $value }}% full for at least 2 minutes."
+    SystemDiskInodesFullWarning:
+      {%- endraw %}
+      {%- set inodes_threshold = monitoring.inodes_usage_percentage.warn|float %}
+      if: >-
+        100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }}
+      for: 2m
       labels:
         severity: warning
         service: system
       annotations:
-        summary: 'Free memory low on {{ $labels.host }}'
-        description: 'The percentage of free memory is low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ mem_avail_warn_threshold }}%).'
-    SystemMemoryAvailableTooLow:
-      {%- set mem_avail_crit_threshold = monitoring.free_memory_percentage.crit|float %}
-      if: avg_over_time(mem_available_percent[5m]) < {{ mem_avail_crit_threshold }}
-      {% raw %}
+        summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
+        description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for at least 2 minutes."
+    SystemDiskInodesFullMajor:
+      {%- endraw %}
+      {%- set inodes_threshold = monitoring.inodes_usage_percentage.major|float %}
+      if: >-
+        100 * disk_inodes_used / disk_inodes_total >= {{ inodes_threshold }}
+      for: 2m
       labels:
-        severity: critical
+        severity: major
         service: system
       annotations:
-        summary: 'Free memory too low on {{ $labels.host }}'
-        description: 'The percentage of free memory is too low on node {{ $labels.host }} (current value={{ $value }}%, threshold={% endraw %}{{ mem_avail_crit_threshold }}%).'
-    SystemLoad5TooHigh:
-      if: system_load5 / system_n_cpus > {{ monitoring.load_5.warn }}
-      {% raw %}
+        summary: "{{ inodes_threshold }}{%- raw %}% of inodes for {{ $labels.path }} are used"
+        description: "The {{ $labels.host }} node uses {{ $value }}% of disk inodes in the {{ $labels.path }} volume for at least 2 minutes."
+    SystemDiskErrorsTooHigh:
+      if: >-
+        increase(hdd_errors_total[1m]) > 0
+      for: 5m
       labels:
         severity: warning
         service: system
       annotations:
-        summary: 'High system load (5m) on {{ $labels.host }}'
-        description: 'The 5-minutes system load is too high on node {{ $labels.host }} (current value={{ $value }}, threshold={% endraw %}{{ monitoring.load_5.warn }}).'
+        summary: "Disk {{ $labels.device }} is failing"
+        description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for at least 5 minutes."
+    SystemMemoryFullWarning:
+      {%- endraw %}
+      {%- set mem_threshold = monitoring.memory_usage_percentage.warn|float %}
+      if: >-
+        mem_used_percent >= {{ mem_threshold }}
+      for: 2m
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "{{ mem_threshold }}{%- raw %}% of memory is used"
+        description: "The {{ $labels.host }} node uses {{ $value }}% of memory for at least 2 minutes."
+    SystemMemoryFullMajor:
+      {%- endraw %}
+      {%- set mem_threshold = monitoring.memory_usage_percentage.major|float %}
+      if: >-
+        mem_used_percent >= {{ mem_threshold }}
+      for: 2m
+      labels:
+        severity: major
+        service: system
+      annotations:
+        summary: "{{ mem_threshold }}{%- raw %}% of memory is used"
+        description: "The {{ $labels.host }} node uses {{ $value }}% of memory for at least 2 minutes."
+    SystemSwapFullWarning:
+      {%- endraw %}
+      {%- set swap_threshold = monitoring.swap_usage_percentage.warn|float %}
+      if: >-
+        swap_used_percent >= {{ swap_threshold }}
+      for: 2m
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "{{ swap_threshold }}{%- raw %}% of swap is used"
+        description: "The swap on the {{ $labels.host }} node is {{ $value }}% used for at least 2 minutes."
+    SystemSwapFullMinor:
+      {%- endraw %}
+      {%- set swap_threshold = monitoring.swap_usage_percentage.minor|float %}
+      if: >-
+        swap_used_percent >= {{ swap_threshold }}
+      for: 2m
+      labels:
+        severity: minor
+        service: system
+      annotations:
+        summary: "{{ swap_threshold }}{%- raw %}% of swap is used"
+        description: "The swap on the {{ $labels.host }} node is {{ $value }}% used for at least 2 minutes."
     SystemRxPacketsDroppedTooHigh:
-      {%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_rate.warn %}
-      if: rate(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
-      {% raw %}
+      {%- endraw %}
+      {%- set net_rx_dropped_threshold = monitoring.rx_packets_dropped_threshold.warn %}
+      if: >-
+        increase(net_drop_in[1m]) > {{ net_rx_dropped_threshold }}
       labels:
-        severity: critical
+        severity: warning
         service: system
       annotations:
-        summary: 'Too many received packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
-        description: 'The rate of received packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}/sec, threshold={% endraw %}{{ net_rx_dropped_threshold }}/sec)'
+        summary: "{{ net_rx_dropped_threshold }}{%- raw %} received packets were dropped"
+        description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
+    SystemRxPacketsDroppedLongTermTooHigh:
+      if: >-
+        increase(net_drop_in[1m]) > 0
+      for: 10m
+      labels:
+        severity: major
+        service: system
+      annotations:
+        summary: "Received packets long term dropping"
+        description: "{{ $value }} packets received by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last 10 minutes."
     SystemTxPacketsDroppedTooHigh:
-      {%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_rate.warn %}
-      if: rate(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
-      {% raw %}
+      {%- endraw %}
+      {%- set net_tx_dropped_threshold = monitoring.tx_packets_dropped_threshold.warn %}
+      if: >-
+        increase(net_drop_out[1m]) > {{ net_tx_dropped_threshold }}
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "{{ net_tx_dropped_threshold }}{%- raw %} transmitted packets were dropped"
+        description: "{{ $value }} packets transmitted by the {{ $labels.interface }} interface on the {{ $labels.host }} node were dropped during the last minute."
+    CronProcessDown:
+      if: >-
+        procstat_running{process_name="cron"} == 0
       labels:
         severity: critical
         service: system
       annotations:
-        summary: 'Too many transmitted packets dropped on {{ $labels.host }} for interface {{ $labels.interface }}'
-        description: 'The rate of transmitted packets which are dropped is too high on node {{ $labels.host }} for interface {{ $labels.interface }} (current value={{ $value }}/sec, threshold={% endraw %}{{ net_tx_dropped_threshold }}/sec)'
-    SystemSwapIn:
-      {%- set swap_in_threshold = monitoring.swap_in_rate.warn %}
-      if: rate(swap_in[2m]) > {{ swap_in_threshold }}
-      {% raw %}
+        summary: "Cron process is down"
+        description: "The cron process on the {{ $labels.host }} node is down."
+    SshdProcessDown:
+      if: >-
+        procstat_running{process_name="sshd"} == 0
+      labels:
+        severity: critical
+        service: system
+      annotations:
+        summary: "SSH process is down"
+        description: "The SSH process on the {{ $labels.host }} node is down."
+    SshFailedLoginsTooHigh:
+      {%- endraw %}
+      {%- set threshold = monitoring.failed_auths_threshold.warn %}
+      if: >-
+        increase(failed_logins_total[5m]) > {{ threshold }}
       labels:
         severity: warning
         service: system
       annotations:
-        summary: 'Swap input throughput too high on {{ $labels.host }}'
-        description: 'The rate of swap input bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_in_threshold }}b/s).'
-    SystemSwapOut:
-      {%- set swap_out_threshold = monitoring.swap_out_rate.warn %}
-      if: rate(swap_out[2m]) > {{ swap_out_threshold }}
-      {% raw %}
-      labels:
-        severity: warning
-        service: system
-      annotations:
-        summary: 'Swap output throughput too high on {{ $labels.host }}'
-        description: 'The rate of swap output bytes is too high on node {{ $labels.host }} (current value={{ $value }}b/s, threshold={% endraw %}{{ swap_out_threshold }}b/s).'
+        summary: "{{ threshold }}{%- raw %} failed SSH logins"
+        description: "{{ $value }} failed SSH login attempts on the {{ $labels.host }} node during the last 5 minutes."
+{%- endraw %}
 {%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
+{%- raw %}
     BondInterfaceDown:
-      if: 'bond_status < 1'
-      {% raw %}
+      if: >-
+        bond_status < 1
       labels:
         severity: critical
         service: system
       annotations:
-        summary: 'Bond status interface {{ $labels.bond }} is DOWN on {{ $labels.host }}'
-        description: 'The bond interface ({{ $labels.bond }) has all ifaces in a down state on {{ $labels.host }}.'
-      {% endraw %}
-    BondSlaveInterfacesMinimum:
-      if: '(sum(bond_slave_status) BY (bond,host)) / (count(bond_slave_status) BY (bond,host)) <= 0.5'
-      {% raw %}
-      labels:
-        severity: critical
-        service: system
-      annotations:
-        summary: 'At least half of Bond slave interfaces {{ $labels.bond }} are DOWN on {{ $labels.host }}'
-        description: 'The bond interface ({{ $labels.bond }) has at least half of slave ifaces in a down state on {{ $labels.host }}.'
-      {% endraw %}
-    BondSlaveInterfaceStatus:
-      if: 'bond_slave_status < 1'
-      {% raw %}
+        summary: "{{ $labels.bond }} bond interface is down"
+        description: "The {{ $labels.bond }} bond interface on the {{ $labels.host }} node has all ifaces down."
+    BondInterfaceSlaveDown:
+      if: >-
+        bond_slave_status < 1
       labels:
         severity: warning
         service: system
       annotations:
-        summary: 'Bond slave interface {{ $labels.interface }} is DOWN on {{ $labels.host }} for {{ $labels.bond }}'
-        description: 'The bond slave interface ({{ $labels.interface }) is in DOWN state for {{ $labels.bond }} on {{ $labels.host }}.'
-      {% endraw %}
+        summary: "{{ $labels.bond }} bond interface slave {{ $labels.interface }} is down"
+        description: "The {{ $labels.bond }} bond interface slave {{ $labels.interface }} on the {{ $labels.host }} node is down."
+    BondInterfaceSlaveDownMajor:
+      if: >-
+        sum(bond_slave_status) by (bond,host) <= on (bond,host) 0.5 * count(bond_slave_status) by (bond,host)
+      labels:
+        severity: major
+        service: system
+      annotations:
+        summary: "50% of bond interface slaves {{ $labels.bond }} are down"
+        description: "{{ $value }} {{ $labels.bond }} bond interface slaves on the {{ $labels.host }} node are down."
+{% endraw %}
 {%- endif %}
diff --git a/linux/meta/telegraf.yml b/linux/meta/telegraf.yml
index 854bf26..0c39da1 100644
--- a/linux/meta/telegraf.yml
+++ b/linux/meta/telegraf.yml
@@ -23,6 +23,12 @@
     processes:
     swap:
     system:
+    procstat:
+      process:
+        sshd:
+          exe: sshd
+        cron:
+          exe: cron
     linux_sysctl_fs:
 {%- if monitoring.bond_status.interfaces is defined and monitoring.bond_status.interfaces %}
     bond:
diff --git a/linux/network/dpdk.sls b/linux/network/dpdk.sls
index e16f9c9..c92bc82 100644
--- a/linux/network/dpdk.sls
+++ b/linux/network/dpdk.sls
@@ -138,7 +138,7 @@
         After=openvswitch-switch.service
 
 {# enforce ip address and mtu for ovs dpdk br-prv #}
-/etc/network/interfaces.d/ifcfg-{{ interface_name }}:
+/etc/network/interfaces.u/ifcfg-{{ interface_name }}:
   file.managed:
     - contents: |
         auto {{ interface_name }}
@@ -148,6 +148,7 @@
         {%- if interface.mtu is defined %}
         mtu {{ interface.mtu }}
         {%- endif %}
+    - makedirs: True
     - require:
       - file: /etc/systemd/system/ifup@{{ interface_name }}.service.d/override.conf
 
diff --git a/linux/network/interface.sls b/linux/network/interface.sls
index 21ec084..f2691a5 100644
--- a/linux/network/interface.sls
+++ b/linux/network/interface.sls
@@ -357,6 +357,9 @@
       gateway: {{ route.gateway }}
       {%- endif %}
     {%- endfor %}
+  {%- if interface.noifupdown is defined %}
+  - require_reboot: {{ interface.noifupdown }}
+  {%- endif %}
 
 {%- endif %}