{%- if pillar.prometheus is defined %}
{%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring, relay, sf_notifier with context %}
server:
  alert:
{%- if server.get('enabled', False) %}
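{#- Generic scrape-health alerts, emitted whenever the Prometheus server is enabled. -#}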
{% raw %}
    PrometheusTargetDown:
      if: up != 1
      for: 2m
      labels:
        severity: critical
        service: prometheus
      annotations:
        summary: "Prometheus target is down"
        description: "The Prometheus target for the {{ $labels.job }} job on the {{ or $labels.host $labels.instance }} node is down for 2 minutes."
    PrometheusTargetSamplesOrderWarning:
      if: increase(prometheus_target_scrapes_sample_out_of_order_total[1m]) > 0
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus samples are out of order"
        description: "{{ $value }} Prometheus samples on the {{ $labels.instance }} instance are out of order (as measured over the last minute)."
    PrometheusTargetSamplesBoundsWarning:
      if: increase(prometheus_target_scrapes_sample_out_of_bounds_total[1m]) > 0
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus sample timestamps are out of bounds"
        description: "{{ $value }} Prometheus samples on the {{ $labels.instance }} instance have timestamps out of bounds (as measured over the last minute)."
    PrometheusTargetSamplesDuplicateWarning:
      if: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]) > 0
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus samples have duplicate timestamps"
        description: "{{ $value }} Prometheus samples on the {{ $labels.instance }} instance have duplicate timestamps (as measured over the last minute)."
{% endraw %}
{%- if server.version == 1.7 %}
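{#- Rushed mode only exists in the Prometheus 1.x local storage engine; the
    metric is absent from the 2.x TSDB, hence the version guard. -#}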
{% raw %}
    PrometheusDataIngestionWarning:
      if: prometheus_local_storage_rushed_mode != 0
      for: 10m
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus is in rushed mode"
        description: "The Prometheus service on the {{ $labels.instance }} instance has not been keeping up with the data ingestion speed for 10 minutes and entered rushed mode."
{% endraw %}
{%- endif %}
{%- endif %}
{%- if server.get('config', {}).get('remote_write') %}
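{#- Remote-write queue saturation alert. The threshold is pillar-driven; a
    minimal sketch of an override, assuming the default map.jinja layout
    (the value 90 is illustrative only):
      prometheus:
        monitoring:
          prometheus:
            remote_storage_queue_full_percent: 90
-#}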
    PrometheusRemoteStorageQueueFullWarning:
{%- set threshold = monitoring.prometheus.remote_storage_queue_full_percent|float %}
      if: >-
        prometheus_remote_storage_queue_length / prometheus_remote_storage_queue_capacity * 100 > {{ threshold }}
{%- raw %}
      for: 2m
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus remote storage queue is {%- endraw %} {{ threshold }}{%- raw %}% full"
        description: "The Prometheus remote storage queue on the {{ $labels.instance }} instance is {{ $value }}% full for 2 minutes."
{%- endraw %}
{%- endif %}
{%- if remote_storage_adapter.get('enabled', False) %}
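{#- Ratio alerts for the remote storage adapter: samples sent vs. received,
    and samples the backend ignored vs. sent. Thresholds come from the
    monitoring pillar (sent_vs_received_ratio, ignored_vs_sent_ratio). -#}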
    RemoteStorageAdapterMetricsSendingWarning:
{%- set threshold = monitoring.remote_storage_adapter.sent_vs_received_ratio|float %}
      if: >-
        increase(sent_samples_total{job="remote_storage_adapter"}[1m]) / on (job, instance) increase(received_samples_total[1m]) < {{ threshold }}
{% raw %}
      labels:
        severity: warning
        service: remote_storage_adapter
      annotations:
        summary: "Remote storage adapter metrics sent/received ratio reached the limit of {%- endraw %} {{ threshold }}{%- raw %}"
        description: "The sent-to-received ratio of remote storage adapter metrics on the {{ $labels.instance }} instance is {{ $value }}."
{% endraw %}
    RemoteStorageAdapterMetricsIgnoredWarning:
{%- set threshold = monitoring.remote_storage_adapter.ignored_vs_sent_ratio|float %}
      if: >-
        increase(prometheus_influxdb_ignored_samples_total{job="remote_storage_adapter"}[1m]) / on (job, instance) increase(sent_samples_total[1m]) >= {{ threshold }}
{% raw %}
      labels:
        severity: warning
        service: remote_storage_adapter
      annotations:
        summary: "{%- endraw %}{{ threshold }}{%- raw %}% of remote storage adapter metrics are invalid"
        description: "{{ $value }}% of remote storage adapter metrics on the {{ $labels.instance }} instance are invalid."
{%- endraw %}
{%- endif %}
{%- if alertmanager.get('enabled', False) %}
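{#- Alertmanager self-monitoring: failed notifications and invalid alerts,
    counted with increase() over a 2-minute window. -#}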
{%- raw %}
    AlertmanagerNotificationFailureWarning:
      if: >-
        increase(alertmanager_notifications_failed_total[2m]) > 0
      for: 2m
      labels:
        severity: warning
        service: alertmanager
      annotations:
        summary: "Alertmanager notifications fail"
        description: "{{ $value }} Alertmanager {{ $labels.integration }} notifications on the {{ $labels.instance }} instance failed (as measured over the last 2 minutes)."
    AlertmanagerAlertsInvalidWarning:
      if: >-
        increase(alertmanager_alerts_invalid_total[2m]) > 0
      for: 2m
      labels:
        severity: warning
        service: alertmanager
      annotations:
        summary: "Alertmanager alerts are invalid"
        description: "{{ $value }} Alertmanager {{ $labels.integration }} alerts on the {{ $labels.instance }} instance are invalid (as measured over the last 2 minutes)."
{%- endraw %}
{%- endif %}
{%- if relay.get('enabled', False) %}
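{#- Relay process liveness, based on the procstat_running process-check
    metric (1 while the named process runs, 0 otherwise): one node down,
    at least half of the nodes down, and full outage. -#}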
{%- raw %}
    PrometheusRelayServiceDown:
      if: >-
        procstat_running{process_name="prometheus-relay"} == 0
      for: 2m
      labels:
        severity: minor
        service: prometheus
      annotations:
        summary: "Prometheus relay service is down"
        description: "The Prometheus relay service on the {{ $labels.host }} node is down for 2 minutes."
    PrometheusRelayServiceDownMajor:
      if: >-
        count(procstat_running{process_name="prometheus-relay"} == 0) >= count(procstat_running{process_name="prometheus-relay"}) * 0.5
      for: 2m
      labels:
        severity: major
        service: prometheus
      annotations:
        summary: "50% of Prometheus relay services are down"
        description: "{{ $value }} Prometheus relay services (at least 50% of the total) are down for 2 minutes."
    PrometheusRelayServiceOutage:
      if: >-
        count(procstat_running{process_name="prometheus-relay"} == 0) == count(procstat_running{process_name="prometheus-relay"})
      for: 2m
      labels:
        severity: critical
        service: prometheus
      annotations:
        summary: "Prometheus relay service outage"
        description: "All Prometheus relay services are down for 2 minutes."
{%- endraw %}
{%- endif %}
{%- if server.get("enabled", False) and not server.get("is_container", True) %}
{%- raw %}
    PrometheusLTSServiceDown:
      if: >-
        procstat_running{process_name="prometheus"} == 0
      for: 2m
      labels:
        severity: minor
        service: prometheus
      annotations:
        summary: "Prometheus Long Term Storage service is down"
        description: "The Prometheus Long Term Storage service on the {{ $labels.host }} node is down for 2 minutes."
    PrometheusLTSServiceDownMajor:
      if: >-
        count(procstat_running{process_name="prometheus"} == 0) >= count(procstat_running{process_name="prometheus"}) * 0.5
      for: 2m
      labels:
        severity: major
        service: prometheus
      annotations:
        summary: "50% of Prometheus Long Term Storage services are down"
        description: "{{ $value }} Prometheus Long Term Storage services (at least 50% of the total) are down for 2 minutes."
    PrometheusLTSServiceOutage:
      if: >-
        count(procstat_running{process_name="prometheus"} == 0) == count(procstat_running{process_name="prometheus"})
      for: 2m
      labels:
        severity: critical
        service: prometheus
      annotations:
        summary: "Prometheus Long Term Storage service outage"
        description: "All Prometheus Long Term Storage services are down for 2 minutes."
{%- endraw %}
{%- endif %}
{%- if sf_notifier.get('enabled', False) %}
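{#- sf-notifier health: absent(sf_auth_ok) fires when the service exports no
    metrics at all, while sf_auth_ok == 0 fires when it runs but Salesforce
    authentication fails. -#}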
{%- raw %}
    SfNotifierDown:
      if: >-
        absent(sf_auth_ok) == 1
      for: 2m
      labels:
        severity: critical
        service: sf-notifier
      annotations:
        summary: "Sf-notifier service is down"
        description: "The sf-notifier service is down for 2 minutes."
    SfNotifierAuthFailure:
      if: >-
        sf_auth_ok == 0
      for: 2m
      labels:
        severity: critical
        service: sf-notifier
      annotations:
        summary: "Sf-notifier authentication failure"
        description: "The sf-notifier service fails to authenticate to Salesforce for 2 minutes."
{%- endraw %}
{%- endif %}
{%- if (server.get("enabled", False) and not server.get("is_container", True) and relay.get('enabled', False)) or sf_notifier.get('enabled', False) %}
  target:
{%- if sf_notifier.get('enabled', False) %}
    dns:
      enabled: true
      endpoint:
        - name: 'sf_notifier'
          domain:
            - 'tasks.monitoring_sf_notifier'
          type: A
          port: {{ sf_notifier.uwsgi.bind_port }}
{%- endif %}
| {%- if server.get("enabled", False) and not server.get("is_container", True) and relay.get('enabled', False) %} |
{%- set addresses = [] %}
{%- if server.get('bind', {}).address is defined and not server.bind.address.startswith('127') and server.bind.address != '0.0.0.0' %}
{%- do addresses.append(server.bind.address) %}
{%- endif %}
{%- for address in grains['fqdn_ip4'] %}
{%- if not address.startswith('127') %}
{%- do addresses.append(address) %}
{%- endif %}
{%- endfor %}
    static:
      prometheus_lts:
        enabled: True
        endpoint:
          - address: {{ addresses[0] }}
            port: {{ server.bind.port }}
        relabel_configs:
          - regex: {{ addresses[0] }}:{{ server.bind.port }}
            replacement: {{ grains['host'] }}
            source_labels: "__address__"
            target_label: "host"
      prometheus_relay:
        enabled: True
        endpoint:
          - address: {{ addresses[0] }}
            port: {{ relay.bind.port }}
        relabel_configs:
          - regex: {{ addresses[0] }}:{{ relay.bind.port }}
            replacement: {{ grains['host'] }}
            source_labels: "__address__"
            target_label: "host"
{%- endif %}
{%- endif %}
{%- endif %}