Add Prometheus monitoring

This change adds an alert that triggers when Heka stops sending data to
its configured outputs. This should cover the cases where Heka is down
or stuck.

Change-Id: I59b578d5366d0f5bd584eb4cccea7355765e9dc7
Depends-On: Ic13fd1b6fe4008771b93057bf2cbf0b78825516f
diff --git a/heka/_common.sls b/heka/_common.sls
index a46b820..fc1a651 100644
--- a/heka/_common.sls
+++ b/heka/_common.sls
@@ -26,6 +26,11 @@
   - require:
     - user: heka_user
 
+/usr/local/bin/monitor_heka_queues.sh:  # installs the Heka queue-size reporting script
+  file.managed:
+  - source: salt://heka/files/monitor_heka_queues.sh
+  - mode: 755  # executable: the script is run directly as a command
+
 heka_user:
   user.present:
   - name: heka
diff --git a/heka/files/monitor_heka_queues.sh b/heka/files/monitor_heka_queues.sh
new file mode 100755
index 0000000..7360440
--- /dev/null
+++ b/heka/files/monitor_heka_queues.sh
@@ -0,0 +1,13 @@
+#!/bin/bash -e
+#
+# This script reports the size of each Heka output queue.
+
+for q in /var/cache/*/output_queue/*;
+do
+  DATA=$(du -sb "$q")
+  SIZE=$(echo $DATA | cut -d " " -f 1)
+  QUEUE=$(echo $DATA | cut -d " " -f 2)
+  if [[ -n "${SIZE}" && -n "${QUEUE}" ]]; then
+    echo "heka_output_queue_size,queue=${QUEUE} value=${SIZE}"
+  fi
+done
diff --git a/heka/meta/prometheus.yml b/heka/meta/prometheus.yml
new file mode 100644
index 0000000..daf3bd8
--- /dev/null
+++ b/heka/meta/prometheus.yml
@@ -0,0 +1,13 @@
+server:
+  alert:
+    HekaOutputQueueStalled:  # fires when a queue's size shows no net change over the last hour
+      if: >-
+        delta(heka_output_queue_size[1h]) == 0
+      labels:
+        severity: warning
+        service: heka
+      annotations:
+        summary: 'Heka queue stalled'  # raw/endraw below keeps the $labels placeholders away from Jinja
+{%- raw %}
+        description: 'The {{ $labels.queue }} queue is stalled on node {{ $labels.host }} for more than 1 hour. The corresponding Heka service is either down or stuck.'
+{%- endraw %}
diff --git a/heka/meta/telegraf.yml b/heka/meta/telegraf.yml
new file mode 100644
index 0000000..d8c7b13
--- /dev/null
+++ b/heka/meta/telegraf.yml
@@ -0,0 +1,8 @@
+agent:
+  input:
+    monitor_heka:  # collects the Heka output queue sizes through an external command
+      template: telegraf/files/input/exec.conf
+      commands:
+      - /usr/local/bin/monitor_heka_queues.sh
+      data_format: influx  # the script prints "heka_output_queue_size,queue=... value=..." lines
+      interval: 60s
diff --git a/metadata/service/support.yml b/metadata/service/support.yml
index 14a0a96..15b34cb 100644
--- a/metadata/service/support.yml
+++ b/metadata/service/support.yml
@@ -5,7 +5,11 @@
         enabled: true
       heka:
         enabled: true
+      prometheus:
+        enabled: true
       sensu:
         enabled: true
       sphinx:
         enabled: true
+      telegraf:
+        enabled: true