Merge "Handle min kernel requirement for some sysctl parameters" into release/2019.2.0
diff --git a/linux/files/smart_telegraf.conf b/linux/files/smart_telegraf.conf
new file mode 100644
index 0000000..e787120
--- /dev/null
+++ b/linux/files/smart_telegraf.conf
@@ -0,0 +1,21 @@
+[[inputs.smart]]{#- Telegraf SMART input section; every option below is emitted only when the matching key is set in `values`. #}
+{%- include 'telegraf/files/input/_common.conf' %}
+{%- if values.path is defined %}
+  path = "{{ values.path }}"
+{%- endif %}
+{%- if values.use_sudo is defined %}
+  use_sudo = {{ values.use_sudo|lower }}{#- lower-cased so that True/False renders as true/false #}
+{%- endif %}
+{%- if values.nocheck is defined %}
+  nocheck = "{{ values.nocheck }}"
+{%- endif %}
+{%- if values.attributes is defined %}
+  attributes = {{ values.attributes|lower }}{#- lower-cased so that True/False renders as true/false #}
+{%- endif %}
+{%- if values.excludes is defined %}
+  excludes = {{ values.excludes | json }}{#- serialised with the json filter; presumably a list of device names - confirm #}
+{%- endif %}
+{%- if values.devices is defined %}
+  devices = {{ values.devices | json }}{#- serialised with the json filter; presumably a list of device names - confirm #}
+{%- endif %}
+{%- include 'telegraf/files/input/_filters.conf' %}
diff --git a/linux/meta/fluentd.yml b/linux/meta/fluentd.yml
index f6d6720..60d2ef5 100644
--- a/linux/meta/fluentd.yml
+++ b/linux/meta/fluentd.yml
@@ -29,7 +29,16 @@
             label:
               - name: host
                 value: ${Hostname}
+          metric_hdd_errors_exclude:  # grep filter on metric.hdd_errors records, referenced by the parser below
+            tag: metric.hdd_errors
+            type: grep
+            # Regexp: https://regex101.com/r/ZRMX9j/3
+            exclude:  # presumably drops records whose Payload matches the regexp (fluentd grep filter) - confirm
+              - name: Payload
+                regexp: (virDomainNetFind|libvirt|(At|De)tach(ing)?\ volume)
           metric_hdd_errors_parse:
+            require:  # presumably orders this parser after the exclude filter - confirm against the fluentd formula
+              - metric_hdd_errors_exclude
             tag: metric.hdd_errors
             type: parser
             key_name: Payload
@@ -87,6 +96,14 @@
               - name: ident
                 regexp: '^(.*)$'
                 result: $1.systemd
+  {%- if pillar.get('telegraf', {}).get('agent', {}).get('enabled', False) %}
+          push_to_telegraf:  # rendered only when telegraf:agent:enabled is set in pillar
+            require_in:  # presumably places this match ahead of push_to_default - confirm against the fluentd formula
+              - push_to_default
+            tag: 'telegraf.systemd'
+            type: relabel
+            label: telegraf  # target label for events tagged telegraf.systemd
+  {%- endif %}
           push_to_default:
             tag: '*.systemd'
             type: copy
@@ -119,7 +136,16 @@
     label:
       default_metric:
         filter:
+          metric_hdd_errors_exclude:  # grep filter on metric.hdd_errors records, referenced by the parser below
+            tag: metric.hdd_errors
+            type: grep
+            # Regexp: https://regex101.com/r/ZRMX9j/3
+            exclude:  # presumably drops records whose Payload matches the regexp (fluentd grep filter) - confirm
+              - name: Payload
+                regexp: (virDomainNetFind|libvirt|(At|De)tach(ing)?\ volume)
           metric_hdd_errors_parse:
+            require:  # presumably orders this parser after the exclude filter - confirm against the fluentd formula
+              - metric_hdd_errors_exclude
             tag: metric.hdd_errors
             type: parser
             key_name: Payload
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index f405367..c202548 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml
@@ -99,8 +99,62 @@
       annotations:
         summary: "Disk {{ $labels.device }} is failing"
         description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for 5 minutes."
-    SystemMemoryFullWarning:
       {%- endraw %}
+  {%- if grains.get('virtual', None) == 'physical' %}
+    {%- raw %}
+    SystemSMARTDiskUDMACrcErrorsTooHigh:  # SMART alerts are rendered only when the 'virtual' grain equals 'physical'
+      if: >-
+        increase(smart_device_udma_crc_errors[1m]) > 0
+      for: 5m  # the 1-minute increase must stay above zero for 5 minutes before the alert fires
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "The {{ $labels.device }} disk has UDMA CRC errors"
+        description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting UDMA CRC errors for 5 minutes."
+    SystemSMARTDiskHealthStatus:  # fires when smart_device_health_ok reads 0 for a device
+      if: >-
+        smart_device_health_ok == 0
+      for: 1m  # shorter hold time than the other SMART alerts in this file (1 minute instead of 5)
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "The {{ $labels.device }} disk has bad health"
+        description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting a bad health status for 1 minute."
+    SystemSMARTDiskReadErrorRate:  # fires when smart_device_read_error_rate keeps increasing
+      if: >-
+        increase(smart_device_read_error_rate[1m]) > 0
+      for: 5m  # the 1-minute increase must stay above zero for 5 minutes before the alert fires
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "The {{ $labels.device }} disk has read errors"
+        description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting an increased read error rate for 5 minutes."
+    SystemSMARTDiskSeekErrorRate:  # fires when smart_device_seek_error_rate keeps increasing
+      if: >-
+        increase(smart_device_seek_error_rate[1m]) > 0
+      for: 5m  # the 1-minute increase must stay above zero for 5 minutes before the alert fires
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "The {{ $labels.device }} disk has seek errors"
+        description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting an increased seek error rate for 5 minutes."
+    SystemSMARTDiskTemperatureHigh:  # fires when smart_device_temp_c is at or above the fixed threshold of 60
+      if: >-
+        smart_device_temp_c >= 60
+      for: 5m  # the reading must stay at or above the threshold for 5 minutes before the alert fires
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "The {{ $labels.device }} disk temperature is high"
+        description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has a temperature of {{ $value }}C for 5 minutes."
+    {%- endraw %}
+  {%- endif %}
+    SystemMemoryFullWarning:
       {%- set mem_threshold = monitoring.memory_usage_percentage.warn|float %}
       if: >-
         mem_used_percent >= {{ mem_threshold }}
@@ -232,8 +286,29 @@
       annotations:
         summary: "CPU terminated {{ squeeze_rate_threshold }}{%- raw %} net_rx_action loops per second"
         description: "The rate of net_rx_action loops terminations on the {{ $labels.host }} node is {{ $value }} per second during the last 7 minutes. Modify the net.core.netdev_budget and net.core.netdev_budget_usecs kernel parameters."
-{%- endraw -%}
-
+      {%- endraw %}
+    {%- if network.bridge == 'openvswitch' %}
+      {%- raw %}
+    ProcessOVSVswitchdMemoryWarning:  # ovs-vswitchd virtual memory size (procstat_memory_vms) above 20% of mem_total, matched per host
+      if: procstat_memory_vms{process_name="ovs-vswitchd"} / on(host) mem_total > 0.2
+      for: 5m
+      labels:
+        severity: warning
+        service: ovs
+      annotations:
+        summary: "ovs-vswitchd takes more than 20% of system memory"
+        description: "The virtual memory size of the ovs-vswitchd process on the {{ $labels.host }} node exceeds 20% of the total system memory for 5 minutes."
+    ProcessOVSVswitchdMemoryCritical:  # same expression as the warning alert, with the threshold raised to 30%
+      if: procstat_memory_vms{process_name="ovs-vswitchd"} / on(host) mem_total > 0.3
+      for: 5m
+      labels:
+        severity: critical
+        service: ovs
+      annotations:
+        summary: "ovs-vswitchd takes more than 30% of system memory"
+        description: "The virtual memory size of the ovs-vswitchd process on the {{ $labels.host }} node exceeds 30% of the total system memory for 5 minutes."
+      {%- endraw %}
+    {%- endif %}
 {%- set bond_interfaces = [] %}
 {%- for interface_name, interface in network.interface.items() %}
   {%- if interface.type == 'bond' and interface.enabled == True %}
diff --git a/linux/meta/telegraf.yml b/linux/meta/telegraf.yml
index 52b4fe7..934824a 100644
--- a/linux/meta/telegraf.yml
+++ b/linux/meta/telegraf.yml
@@ -1,6 +1,11 @@
 {%- from "linux/map.jinja" import network with context %}
 agent:
   input:
+  {%- if grains.get('virtual', None) == 'physical' %}
+    smart:  # SMART input, rendered only when the 'virtual' grain equals 'physical'
+      template: linux/files/smart_telegraf.conf
+      path: "/usr/sbin/smartctl"  # presumably consumed by the template as values.path - confirm
+  {%- endif %}
     cpu:
       percpu: false
       totalcpu: true
@@ -33,6 +38,10 @@
           exe: sshd
         cron:
           exe: cron
+{%- if network.bridge == 'openvswitch' %}
+        ovs-vswitchd:  # extra process entry, added only when network.bridge is 'openvswitch'
+          exe: ovs-vswitchd
+{%- endif %}
     linux_sysctl_fs:
 {%- set bond_interfaces = [] %}
 {%- for interface_name, interface in network.interface.items() %}
diff --git a/linux/network/interface.sls b/linux/network/interface.sls
index 771bd5e..2cfdb3b 100644
--- a/linux/network/interface.sls
+++ b/linux/network/interface.sls
@@ -104,8 +104,8 @@
 
 add_int_{{ int_name }}_to_ovs_bridge_{{ interface_name }}:
   cmd.run:
-    - unless: ovs-vsctl show | grep {{ int_name }}
     - name: ovs-vsctl{%- if network.ovs_nowait %} --no-wait{%- endif %} add-port {{ interface_name }} {{ int_name }}
+    - unless: ovs-vsctl list-ports {{ interface_name }} | grep -qFx {{ int_name }}  # skip add-port when the port is already listed on this bridge (-F fixed string, -x whole-line match)
 {%- endif %}
 
 {%- endfor %}
diff --git a/linux/network/resolv.sls b/linux/network/resolv.sls
index 965ed2f..bac6ebf 100644
--- a/linux/network/resolv.sls
+++ b/linux/network/resolv.sls
@@ -11,6 +11,11 @@
   - require:
     - pkg: resolvconf
 
+ensure_resolvconf_symlink:  # presumably restores the /etc/resolv.conf symlink handled by resolvconf - confirm
+  cmd.run:
+  - name: dpkg-reconfigure -fnoninteractive resolvconf  # noninteractive frontend selected via -f
+  - unless: test -L /etc/resolv.conf  # command is skipped while /etc/resolv.conf is already a symbolic link
+
   {%- if network.resolv is defined %}
 /etc/resolvconf/resolv.conf.d/base:
   file.managed: