Merge "Add Prometheus monitoring"
diff --git a/heka/_common.sls b/heka/_common.sls
index a46b820..fc1a651 100644
--- a/heka/_common.sls
+++ b/heka/_common.sls
@@ -26,6 +26,11 @@
- require:
- user: heka_user
+/usr/local/bin/monitor_heka_queues.sh:
+  file.managed:
+  - source: salt://heka/files/monitor_heka_queues.sh
+  - mode: '0755'  # quoted so YAML keeps it a string; executable because Telegraf's exec input runs it
+
heka_user:
user.present:
- name: heka
diff --git a/heka/files/monitor_heka_queues.sh b/heka/files/monitor_heka_queues.sh
new file mode 100755
index 0000000..7360440
--- /dev/null
+++ b/heka/files/monitor_heka_queues.sh
@@ -0,0 +1,13 @@
+#!/bin/bash -e
+#
+# Print one "heka_output_queue_size,queue=<path> value=<bytes>" line per Heka output queue.
+
+shopt -s nullglob  # no queue directories: run zero iterations instead of failing on the literal glob
+for q in /var/cache/*/output_queue/*;
+do
+    SIZE=$(du -sb "$q" | cut -f 1)  # du prints "<bytes><TAB><path>"; empty if du fails, so the queue is skipped
+    QUEUE=$(sed 's/[ ,=]/\\&/g' <<< "$q")  # backslash-escape space, comma and '=' in the tag value
+    if [[ -n "${SIZE}" && -n "${QUEUE}" ]]; then
+        echo "heka_output_queue_size,queue=${QUEUE} value=${SIZE}"
+    fi
+done
diff --git a/heka/meta/prometheus.yml b/heka/meta/prometheus.yml
new file mode 100644
index 0000000..daf3bd8
--- /dev/null
+++ b/heka/meta/prometheus.yml
@@ -0,0 +1,13 @@
+server:
+  alert:
+    HekaOutputQueueStalled:  # fires when a queue's reported size shows zero change over the last hour
+      if: >-
+        delta(heka_output_queue_size[1h]) == 0
+      labels:
+        service: heka
+        severity: warning
+      annotations:  # the raw block below keeps Jinja from expanding the label placeholders
+        summary: "Heka queue stalled"
+{%- raw %}
+        description: "The {{ $labels.queue }} queue is stalled on node {{ $labels.host }} for more than 1 hour. The corresponding Heka service is either down or stuck."
+{%- endraw %}
diff --git a/heka/meta/telegraf.yml b/heka/meta/telegraf.yml
new file mode 100644
index 0000000..d8c7b13
--- /dev/null
+++ b/heka/meta/telegraf.yml
@@ -0,0 +1,8 @@
+agent:
+  input:
+    monitor_heka:  # exec input that runs the Heka queue-size script
+      interval: 60s
+      data_format: influx  # the script prints "measurement,tag=value field=value" lines
+      template: telegraf/files/input/exec.conf
+      commands:
+      - /usr/local/bin/monitor_heka_queues.sh
diff --git a/metadata/service/support.yml b/metadata/service/support.yml
index 14a0a96..15b34cb 100644
--- a/metadata/service/support.yml
+++ b/metadata/service/support.yml
@@ -5,7 +5,11 @@
enabled: true
heka:
enabled: true
+ prometheus:
+ enabled: true
sensu:
enabled: true
sphinx:
enabled: true
+ telegraf:
+ enabled: true