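Added SMART disk monitoring for physical devices

Add a Telegraf "smart" input template and Prometheus alerts for SMART
disk health status, temperature, and read, seek and UDMA CRC errors.
Both the input and the alerts are only configured when the "virtual"
grain equals "physical".

Change-Id: I2ff434136de0bcbf22d1d69aaf88283b956cfb5b
Related-Bug: PROD-27914
(cherry picked from commit 6fd01e5cd1de81fde7248549970a39358725782c)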
diff --git a/linux/files/smart_telegraf.conf b/linux/files/smart_telegraf.conf
new file mode 100644
index 0000000..e787120
--- /dev/null
+++ b/linux/files/smart_telegraf.conf
@@ -0,0 +1,21 @@
+[[inputs.smart]]{# Telegraf SMART input section; each option below is written only when it is defined on `values`. #}
+{%- include 'telegraf/files/input/_common.conf' %}{# shared fragment, not visible in this patch #}
+{%- if values.path is defined %}{# rendered as a quoted string #}
+  path = "{{ values.path }}"
+{%- endif %}
+{%- if values.use_sudo is defined %}{# lower-cased, so a rendered True/False becomes true/false #}
+  use_sudo = {{ values.use_sudo|lower }}
+{%- endif %}
+{%- if values.nocheck is defined %}{# rendered as a quoted string #}
+  nocheck = "{{ values.nocheck }}"
+{%- endif %}
+{%- if values.attributes is defined %}{# lower-cased in the same way as use_sudo #}
+  attributes = {{ values.attributes|lower }}
+{%- endif %}
+{%- if values.excludes is defined %}{# serialised with the json filter; presumably a list - confirm #}
+  excludes = {{ values.excludes | json }}
+{%- endif %}
+{%- if values.devices is defined %}{# serialised with the json filter; presumably a list - confirm #}
+  devices = {{ values.devices | json }}
+{%- endif %}
+{%- include 'telegraf/files/input/_filters.conf' %}
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index f405367..6e4551f 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml
@@ -99,8 +99,62 @@
       annotations:
         summary: "Disk {{ $labels.device }} is failing"
         description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting errors for 5 minutes."
-    SystemMemoryFullWarning:
       {%- endraw %}
+  {%- if grains.get('virtual', None) == 'physical' %}{# SMART alerts are defined only when the 'virtual' grain equals 'physical' #}
+    {%- raw %}
+    SystemSMARTDiskUDMACrcErrorsTooHigh:  # UDMA CRC error count keeps growing
+      if: >-
+        increase(smart_device_udma_crc_errors[1m]) > 0
+      for: 5m  # the 1m increase must stay above zero for 5 minutes before the alert fires
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "The {{ $labels.device }} disk has UDMA CRC errors"
+        description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting UDMA CRC errors for 5 minutes."
+    SystemSMARTDiskHealthStatus:  # smart_device_health_ok reports 0 for a disk
+      if: >-
+        smart_device_health_ok == 0
+      for: 1m  # the condition must hold for 1 minute before the alert fires
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "The {{ $labels.device }} disk has bad health"
+        description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting a bad health status for 1 minute."
+    SystemSMARTDiskReadErrorRate:  # smart_device_read_error_rate keeps growing
+      if: >-
+        increase(smart_device_read_error_rate[1m]) > 0
+      for: 5m  # the 1m increase must stay above zero for 5 minutes before the alert fires
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "The {{ $labels.device }} disk has read errors"
+        description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting an increased read error rate for 5 minutes."
+    SystemSMARTDiskSeekErrorRate:  # smart_device_seek_error_rate keeps growing
+      if: >-
+        increase(smart_device_seek_error_rate[1m]) > 0
+      for: 5m  # the 1m increase must stay above zero for 5 minutes before the alert fires
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "The {{ $labels.device }} disk has seek errors"
+        description: "The {{ $labels.device }} disk on the {{ $labels.host }} node is reporting an increased seek error rate for 5 minutes."
+    SystemSMARTDiskTemperatureHigh:  # smart_device_temp_c is at or above 60
+      if: >-
+        smart_device_temp_c >= 60
+      for: 5m  # the condition must hold for 5 minutes before the alert fires
+      labels:
+        severity: warning
+        service: system
+      annotations:
+        summary: "The {{ $labels.device }} disk temperature is high"
+        description: "The {{ $labels.device }} disk on the {{ $labels.host }} node has a temperature of {{ $value }}C for 5 minutes."
+    {%- endraw %}
+  {%- endif %}{# end of the SMART alerts that depend on the 'virtual' grain #}
+    SystemMemoryFullWarning:
       {%- set mem_threshold = monitoring.memory_usage_percentage.warn|float %}
       if: >-
         mem_used_percent >= {{ mem_threshold }}
diff --git a/linux/meta/telegraf.yml b/linux/meta/telegraf.yml
index 52b4fe7..1d83ca6 100644
--- a/linux/meta/telegraf.yml
+++ b/linux/meta/telegraf.yml
@@ -1,6 +1,11 @@
 {%- from "linux/map.jinja" import network with context %}
 agent:
   input:
+  {%- if grains.get('virtual', None) == 'physical' %}{# SMART input is configured only when the 'virtual' grain equals 'physical' #}
+    smart:
+      template: linux/files/smart_telegraf.conf  # template file added by this change
+      path: "/usr/sbin/smartctl"  # presumably read by the template as values.path - confirm
+  {%- endif %}
     cpu:
       percpu: false
       totalcpu: true