Merge "Add Prometheus monitoring"
diff --git a/heka/_common.sls b/heka/_common.sls
index a46b820..fc1a651 100644
--- a/heka/_common.sls
+++ b/heka/_common.sls
@@ -26,6 +26,12 @@
   - require:
     - user: heka_user
 
+# Queue size reporter executed by the Telegraf exec input.
+/usr/local/bin/monitor_heka_queues.sh:
+  file.managed:
+  - mode: 755
+  - source: salt://heka/files/monitor_heka_queues.sh
+
 heka_user:
   user.present:
   - name: heka
diff --git a/heka/files/monitor_heka_queues.sh b/heka/files/monitor_heka_queues.sh
new file mode 100755
index 0000000..7360440
--- /dev/null
+++ b/heka/files/monitor_heka_queues.sh
@@ -0,0 +1,25 @@
+#!/bin/bash -e
+#
+# This script reports the size of each Heka output queue.
+#
+# Each queue is printed as one InfluxDB line protocol point:
+#   heka_output_queue_size,queue=<path> value=<bytes>
+
+# Without nullglob an unmatched pattern stays in the loop as a literal string,
+# du fails on it and the script exits non-zero instead of reporting nothing.
+shopt -s nullglob
+
+for q in /var/cache/*/output_queue/*;
+do
+  # du separates size and path with a tab, which is cut's default delimiter.
+  # A du failure leaves SIZE empty and the queue is skipped by the check below.
+  SIZE=$(du -sb "$q" | cut -f 1)
+  # Line protocol tag values need commas, equals signs and spaces escaped
+  # with a backslash.
+  QUEUE=${q//,/\\,}
+  QUEUE=${QUEUE//=/\\=}
+  QUEUE=${QUEUE// /\\ }
+  if [[ -n "${SIZE}" && -n "${QUEUE}" ]]; then
+    echo "heka_output_queue_size,queue=${QUEUE} value=${SIZE}"
+  fi
+done
diff --git a/heka/meta/prometheus.yml b/heka/meta/prometheus.yml
new file mode 100644
index 0000000..daf3bd8
--- /dev/null
+++ b/heka/meta/prometheus.yml
@@ -0,0 +1,16 @@
+# Prometheus alerting rules for Heka.
+server:
+  alert:
+    # Raised when a queue size series shows no change across a one-hour window.
+    HekaOutputQueueStalled:
+      if: 'delta(heka_output_queue_size[1h]) == 0'
+      labels:
+        service: heka
+        severity: warning
+      # The raw block below keeps Jinja from expanding the double-brace
+      # placeholders used in the description.
+      annotations:
+        summary: Heka queue stalled
+{%- raw %}
+        description: 'The {{ $labels.queue }} queue is stalled on node {{ $labels.host }} for more than 1 hour. The corresponding Heka service is either down or stuck.'
+{%- endraw %}
diff --git a/heka/meta/telegraf.yml b/heka/meta/telegraf.yml
new file mode 100644
index 0000000..d8c7b13
--- /dev/null
+++ b/heka/meta/telegraf.yml
@@ -0,0 +1,11 @@
+# Telegraf exec input that collects the Heka output queue sizes.
+agent:
+  input:
+    monitor_heka:
+      template: telegraf/files/input/exec.conf
+      # Poll once a minute.
+      interval: 60s
+      # The script prints InfluxDB line protocol.
+      data_format: influx
+      commands:
+      - /usr/local/bin/monitor_heka_queues.sh
diff --git a/metadata/service/support.yml b/metadata/service/support.yml
index 14a0a96..15b34cb 100644
--- a/metadata/service/support.yml
+++ b/metadata/service/support.yml
@@ -5,7 +5,11 @@
         enabled: true
       heka:
         enabled: true
+      prometheus:
+        enabled: true
       sensu:
         enabled: true
       sphinx:
         enabled: true
+      telegraf:
+        enabled: true