{%- if pillar.prometheus is defined %}
{%- from "prometheus/map.jinja" import server, alertmanager, remote_storage_adapter, monitoring, relay, sf_notifier with context %}
server:
  alert:
{%- if server.get('enabled', False) %}
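{#- Generic scrape-health alerts, emitted whenever the Prometheus server is enabled. -#}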
{% raw %}
    PrometheusTargetDown:
      if: up != 1
      for: 2m
      labels:
        severity: critical
        service: prometheus
      annotations:
        summary: "Prometheus target is down"
        description: "The Prometheus target for the {{ $labels.job }} job on the {{ or $labels.host $labels.instance }} node is down for 2 minutes."
    PrometheusTargetSamplesOrderWarning:
      if: increase(prometheus_target_scrapes_sample_out_of_order_total[1m]) > 0
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus samples are out of order"
        description: "{{ $value }} Prometheus samples on the {{ $labels.instance }} instance are out of order (as measured over the last minute)."
    PrometheusTargetSamplesBoundsWarning:
      if: increase(prometheus_target_scrapes_sample_out_of_bounds_total[1m]) > 0
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus sample timestamps are out of bounds"
        description: "{{ $value }} Prometheus samples on the {{ $labels.instance }} instance have timestamps out of bounds (as measured over the last minute)."
    PrometheusTargetSamplesDuplicateWarning:
      if: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]) > 0
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus samples have duplicate timestamps"
        description: "{{ $value }} Prometheus samples on the {{ $labels.instance }} instance have duplicate timestamps (as measured over the last minute)."
{% endraw %}
{%- if server.version == 1.7 %}
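{#- Rushed mode only exists in the Prometheus 1.x local storage engine; the
    metric is absent from the 2.x TSDB, hence the version guard. -#}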
{% raw %}
    PrometheusDataIngestionWarning:
      if: prometheus_local_storage_rushed_mode != 0
      for: 10m
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus is in rushed mode"
        description: "The Prometheus service on the {{ $labels.instance }} instance has not been keeping up with the data ingestion speed for 10 minutes and entered rushed mode."
{% endraw %}
{%- endif %}
{%- endif %}
{%- if server.get('config', {}).get('remote_write') %}
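{#- Remote-write queue saturation alert. The threshold is pillar-driven; a
    minimal sketch of an override, assuming the default map.jinja layout
    (the value 90 is illustrative only):
      prometheus:
        monitoring:
          prometheus:
            remote_storage_queue_full_percent: 90
-#}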
    PrometheusRemoteStorageQueueFullWarning:
{%- set threshold = monitoring.prometheus.remote_storage_queue_full_percent|float %}
      if: >-
        prometheus_remote_storage_queue_length / prometheus_remote_storage_queue_capacity * 100 > {{ threshold }}
{%- raw %}
      for: 2m
      labels:
        severity: warning
        service: prometheus
      annotations:
        summary: "Prometheus remote storage queue is {%- endraw %} {{ threshold }}{%- raw %}% full"
        description: "The Prometheus remote storage queue on the {{ $labels.instance }} instance is {{ $value }}% full for 2 minutes."
{%- endraw %}
{%- endif %}
{%- if remote_storage_adapter.get('enabled', False) %}
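{#- Ratio alerts for the remote storage adapter: samples sent vs. received,
    and samples the backend ignored vs. sent. Thresholds come from the
    monitoring pillar (sent_vs_received_ratio, ignored_vs_sent_ratio). -#}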
    RemoteStorageAdapterMetricsSendingWarning:
{%- set threshold = monitoring.remote_storage_adapter.sent_vs_received_ratio|float %}
      if: >-
        increase(sent_samples_total{job="remote_storage_adapter"}[1m]) / on (job, instance) increase(received_samples_total[1m]) < {{ threshold }}
{% raw %}
      labels:
        severity: warning
        service: remote_storage_adapter
      annotations:
        summary: "Remote storage adapter metrics sent/received ratio reached the limit of {%- endraw %} {{ threshold }}{%- raw %}"
        description: "The sent-to-received ratio of remote storage adapter metrics on the {{ $labels.instance }} instance is {{ $value }}."
{% endraw %}
    RemoteStorageAdapterMetricsIgnoredWarning:
{%- set threshold = monitoring.remote_storage_adapter.ignored_vs_sent_ratio|float %}
      if: >-
        increase(prometheus_influxdb_ignored_samples_total{job="remote_storage_adapter"}[1m]) / on (job, instance) increase(sent_samples_total[1m]) >= {{ threshold }}
{% raw %}
      labels:
        severity: warning
        service: remote_storage_adapter
      annotations:
        summary: "{%- endraw %}{{ threshold }}{%- raw %}% of remote storage adapter metrics are invalid"
        description: "{{ $value }}% of remote storage adapter metrics on the {{ $labels.instance }} instance are invalid."
{%- endraw %}
{%- endif %}
{%- if alertmanager.get('enabled', False) %}
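{#- Alertmanager self-monitoring: failed notifications and invalid alerts,
    counted with increase() over a 2-minute window. -#}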
{%- raw %}
    AlertmanagerNotificationFailureWarning:
      if: >-
        increase(alertmanager_notifications_failed_total[2m]) > 0
      for: 2m
      labels:
        severity: warning
        service: alertmanager
      annotations:
        summary: "Alertmanager notifications fail"
        description: "{{ $value }} Alertmanager {{ $labels.integration }} notifications on the {{ $labels.instance }} instance failed (as measured over the last 2 minutes)."
    AlertmanagerAlertsInvalidWarning:
      if: >-
        increase(alertmanager_alerts_invalid_total[2m]) > 0
      for: 2m
      labels:
        severity: warning
        service: alertmanager
      annotations:
        summary: "Alertmanager alerts are invalid"
        description: "{{ $value }} Alertmanager {{ $labels.integration }} alerts on the {{ $labels.instance }} instance are invalid (as measured over the last 2 minutes)."
{%- endraw %}
{%- endif %}
{%- if relay.get('enabled', False) %}
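{#- Relay process liveness, based on the procstat_running process-check
    metric (1 while the named process runs, 0 otherwise): one node down,
    at least half of the nodes down, and full outage. -#}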
{%- raw %}
    PrometheusRelayServiceDown:
      if: >-
        procstat_running{process_name="prometheus-relay"} == 0
      for: 2m
      labels:
        severity: minor
        service: prometheus
      annotations:
        summary: "Prometheus relay service is down"
        description: "The Prometheus relay service on the {{ $labels.host }} node is down for 2 minutes."
    PrometheusRelayServiceDownMajor:
      if: >-
        count(procstat_running{process_name="prometheus-relay"} == 0) >= count(procstat_running{process_name="prometheus-relay"}) * 0.5
      for: 2m
      labels:
        severity: major
        service: prometheus
      annotations:
        summary: "50% of Prometheus relay services are down"
        description: "{{ $value }} Prometheus relay services (at least 50% of the total) are down for 2 minutes."
    PrometheusRelayServiceOutage:
      if: >-
        count(procstat_running{process_name="prometheus-relay"} == 0) == count(procstat_running{process_name="prometheus-relay"})
      for: 2m
      labels:
        severity: critical
        service: prometheus
      annotations:
        summary: "Prometheus relay service outage"
        description: "All Prometheus relay services are down for 2 minutes."
{%- endraw %}
{%- endif %}
{%- if server.get("enabled", False) and not server.get("is_container", True) %}
{%- raw %}
    PrometheusLTSServiceDown:
      if: >-
        procstat_running{process_name="prometheus"} == 0
      for: 2m
      labels:
        severity: minor
        service: prometheus
      annotations:
        summary: "Prometheus Long Term Storage service is down"
        description: "The Prometheus Long Term Storage service on the {{ $labels.host }} node is down for 2 minutes."
    PrometheusLTSServiceDownMajor:
      if: >-
        count(procstat_running{process_name="prometheus"} == 0) >= count(procstat_running{process_name="prometheus"}) * 0.5
      for: 2m
      labels:
        severity: major
        service: prometheus
      annotations:
        summary: "50% of Prometheus Long Term Storage services are down"
        description: "{{ $value }} Prometheus Long Term Storage services (at least 50% of the total) are down for 2 minutes."
    PrometheusLTSServiceOutage:
      if: >-
        count(procstat_running{process_name="prometheus"} == 0) == count(procstat_running{process_name="prometheus"})
      for: 2m
      labels:
        severity: critical
        service: prometheus
      annotations:
        summary: "Prometheus Long Term Storage service outage"
        description: "All Prometheus Long Term Storage services are down for 2 minutes."
{%- endraw %}
{%- endif %}
{%- if sf_notifier.get('enabled', False) %}
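{#- sf-notifier health: absent(sf_auth_ok) fires when the service exports no
    metrics at all, while sf_auth_ok == 0 fires when it runs but Salesforce
    authentication fails. -#}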
{%- raw %}
    SfNotifierDown:
      if: >-
        absent(sf_auth_ok) == 1
      for: 2m
      labels:
        severity: critical
        service: sf-notifier
      annotations:
        summary: "Sf-notifier service is down"
        description: "The sf-notifier service is down for 2 minutes."
    SfNotifierAuthFailure:
      if: >-
        sf_auth_ok == 0
      for: 2m
      labels:
        severity: critical
        service: sf-notifier
      annotations:
        summary: "Sf-notifier authentication failure"
        description: "The sf-notifier service fails to authenticate to Salesforce for 2 minutes."
{%- endraw %}
{%- endif %}
{%- if (server.get("enabled", False) and not server.get("is_container", True) and relay.get('enabled', False)) or sf_notifier.get('enabled', False) %}
  target:
{%- if sf_notifier.get('enabled', False) %}
    dns:
      enabled: true
      endpoint:
        - name: 'sf_notifier'
          domain:
            - 'tasks.monitoring_sf_notifier'
          type: A
          port: {{ sf_notifier.uwsgi.bind_port }}
{%- endif %}
| {%- if server.get("enabled", False) and not server.get("is_container", True) and relay.get('enabled', False) %} |
{%- set addresses = [] %}
{%- if server.get('bind', {}).address is defined and not server.bind.address.startswith('127') and server.bind.address != '0.0.0.0' %}
{%- do addresses.append(server.bind.address) %}
{%- endif %}
{%- for address in grains['fqdn_ip4'] %}
{%- if not address.startswith('127') %}
{%- do addresses.append(address) %}
{%- endif %}
{%- endfor %}
    static:
      prometheus_lts:
        enabled: True
        endpoint:
          - address: {{ addresses[0] }}
            port: {{ server.bind.port }}
        relabel_configs:
          - regex: {{ addresses[0] }}:{{ server.bind.port }}
            replacement: {{ grains['host'] }}
            source_labels: "__address__"
            target_label: "host"
      prometheus_relay:
        enabled: True
        endpoint:
          - address: {{ addresses[0] }}
            port: {{ relay.bind.port }}
        relabel_configs:
          - regex: {{ addresses[0] }}:{{ relay.bind.port }}
            replacement: {{ grains['host'] }}
            source_labels: "__address__"
            target_label: "host"
{%- endif %}
{%- endif %}
{%- endif %}