Merge "Add alerts and targets for prometheus LTS and relay"
diff --git a/metadata/service/support.yml b/metadata/service/support.yml
index df2b7c3..e4d3cb4 100644
--- a/metadata/service/support.yml
+++ b/metadata/service/support.yml
@@ -5,3 +5,5 @@
         enabled: true
       grafana:
         enabled: true
+      telegraf:
+        enabled: true
diff --git a/prometheus/meta/prometheus.yml b/prometheus/meta/prometheus.yml
index 80768e2..4451e8f 100644
--- a/prometheus/meta/prometheus.yml
+++ b/prometheus/meta/prometheus.yml
@@ -1,5 +1,5 @@
 {%- if pillar.prometheus is defined %}
-{%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring with context %}
+{%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring, relay with context %}
 server:
   alert:
 {%- if server.get('enabled', False) %}
@@ -117,4 +117,105 @@
         description: "An average of {{ $value }} Alertmanager {{ $labels.integration }} alerts on the {{ $labels.instance }} instance are invalid for 2 minutes."
 {%- endraw %}
 {%- endif %}
+{%- if relay.get('enabled', False) %}{#- Relay alert rules are rendered only when relay.enabled is truthy. #}
+{%- raw %}
+    PrometheusRelayServiceDown:  # minor: a prometheus-relay procstat series reports 0 running processes
+      if: >-
+        procstat_running{process_name="prometheus-relay"} == 0
+      for: 2m
+      labels:
+        severity: minor
+        service: prometheus
+      annotations:
+        summary: "Prometheus relay service is down"
+        description: "The Prometheus relay service on the {{$labels.host}} node is down for 2 minutes."
+    PrometheusRelayServiceDownMajor:  # major: series at 0 make up at least half of all prometheus-relay series
+      if: >-
+        count(procstat_running{process_name="prometheus-relay"} == 0) >= count(procstat_running{process_name="prometheus-relay"}) * 0.5
+      for: 2m
+      labels:
+        severity: major
+        service: prometheus
+      annotations:
+        summary: "50% of Prometheus relay services are down"
+        description: "{{ $value }} of Prometheus relay services (>= 50%) are down for 2 minutes."
+    PrometheusRelayServiceOutage:  # critical: every prometheus-relay series is at 0
+      if: >-
+        count(procstat_running{process_name="prometheus-relay"} == 0) == count(procstat_running{process_name="prometheus-relay"})
+      for: 2m
+      labels:
+        severity: critical
+        service: prometheus
+      annotations:
+        summary: "Prometheus relay service outage"
+        description: "All Prometheus relay services are down for 2 minutes."
+{%- endraw %}
+{%- endif %}
+{%- if server.get("enabled", False) and not server.get("is_container", True) %}{#- LTS alert rules: rendered only for an enabled server with is_container explicitly falsy. #}
+{%- raw %}
+    PrometheusLTSServiceDown:  # minor: a prometheus procstat series reports 0 running processes
+      if: >-
+        procstat_running{process_name="prometheus"} == 0
+      for: 2m
+      labels:
+        severity: minor
+        service: prometheus
+      annotations:
+        summary: "Prometheus Long Term Storage service is down"
+        description: "The Prometheus Long Term Storage service on the {{$labels.host}} node is down for 2 minutes."
+    PrometheusLTSServiceDownMajor:  # major: series at 0 make up at least half of all prometheus series; key must differ from the relay rule
+      if: >-
+        count(procstat_running{process_name="prometheus"} == 0) >= count(procstat_running{process_name="prometheus"}) * 0.5
+      for: 2m
+      labels:
+        severity: major
+        service: prometheus
+      annotations:
+        summary: "50% of Prometheus Long Term Storage services are down"
+        description: "{{ $value }} of Prometheus Long Term Storage services (>= 50%) are down for 2 minutes."
+    PrometheusLTSServiceOutage:  # critical: every prometheus series is at 0; key must differ from the relay rule
+      if: >-
+        count(procstat_running{process_name="prometheus"} == 0) == count(procstat_running{process_name="prometheus"})
+      for: 2m
+      labels:
+        severity: critical
+        service: prometheus
+      annotations:
+        summary: "Prometheus Long Term Storage service outage"
+        description: "All Prometheus Long Term Storage services are down for 2 minutes."
+{%- endraw %}
+{%- endif %}
+{%- if server.get("enabled", False) and not server.get("is_container", True) and relay.get('enabled', False) %}
+  {%- set addresses = [] %}{#- Candidate scrape addresses; only the first entry is used below. #}
+  {%- if server.get('bind', {}).address is defined and not server.bind.address.startswith('127') and server.bind.address != '0.0.0.0' %}
+    {%- do addresses.append(server.bind.address) %}
+  {%- endif %}
+  {%- for address in grains['fqdn_ip4'] %}{#- Then append the fqdn_ip4 grain addresses that do not start with 127. #}
+    {%- if not address.startswith('127') %}
+      {%- do addresses.append(address) %}
+    {%- endif %}
+  {%- endfor %}
+  target:  # templated string values are quoted so the YAML loader cannot retype them
+    static:
+      prometheus_lts:
+        enabled: True
+        endpoint:
+          - address: "{{ addresses[0] }}"
+            port: {{ server.bind.port }}
+        relabel_configs:
+          - regex: "{{ addresses[0] }}:{{ server.bind.port }}"
+            replacement: "{{ grains['host'] }}"
+            source_labels: "__address__"
+            target_label: "host"
+      prometheus_relay:
+        enabled: True
+        endpoint:
+          - address: "{{ addresses[0] }}"
+            port: {{ relay.bind.port }}
+        relabel_configs:
+          - regex: "{{ addresses[0] }}:{{ relay.bind.port }}"
+            replacement: "{{ grains['host'] }}"
+            source_labels: "__address__"
+            target_label: "host"
+{%- endif %}
 {%- endif %}
diff --git a/prometheus/meta/telegraf.yml b/prometheus/meta/telegraf.yml
new file mode 100644
index 0000000..3cce8a9
--- /dev/null
+++ b/prometheus/meta/telegraf.yml
@@ -0,0 +1,17 @@
+{%- if pillar.prometheus is defined %}
+{%- from "prometheus/map.jinja" import server, relay with context %}
+agent:
+  input:
+  {%- if relay.get('enabled', False) or (server.get("enabled", False) and not server.get("is_container", True)) %}
+    procstat:  # rendered when either component below is present; each entry has its own guard
+      process:
+    {%- if relay.get('enabled', False) %}
+        prometheus-relay:
+          pattern: '/usr/bin/prometheus-relay'
+    {%- endif %}
+    {%- if server.get("enabled", False) and not server.get("is_container", True) %}
+        prometheus:
+          pattern: '/usr/bin/prometheus[^-]'  # [^-] keeps this from also matching /usr/bin/prometheus-relay
+    {%- endif %}
+  {%- endif %}
+{%- endif %}