Generate Prometheus metrics from systemd logs
Change-Id: I5a8ccb235d36c1b4115794904f373a5704c2296d
diff --git a/linux/meta/fluentd.yml b/linux/meta/fluentd.yml
index 6b4041c..c65d3fb 100644
--- a/linux/meta/fluentd.yml
+++ b/linux/meta/fluentd.yml
@@ -5,6 +5,49 @@
gem: ['fluent-plugin-systemd']
config:
label:
+ default_metric:  # Label whose filters turn records tagged metric.* into Prometheus counters.
+ filter:
+ metric_failed_user:  # Counter for records tagged metric.failed_user (Payload starting with "Invalid user").
+ tag: metric.failed_user
+ type: prometheus
+ metric:
+ - name: failed_logins_total
+ type: counter
+ desc: The total number of failed logins.
+ label:
+ - name: host
+ value: ${Hostname}  # Presumably filled from the record's Hostname field by the prometheus filter - TODO confirm.
+ metric_out_of_memory:  # Counter for records tagged metric.out_of_memory (Payload starting with "Out of memory").
+ tag: metric.out_of_memory
+ type: prometheus
+ metric:
+ - name: out_of_memory_total
+ type: counter
+ desc: The total number of OOM.
+ label:
+ - name: host
+ value: ${Hostname}
+ metric_hdd_errors_parse:  # Parses Payload of metric.hdd_errors records to pull out the disk device name.
+ tag: metric.hdd_errors
+ type: parser  # NOTE(review): fluentd's parser filter drops the other record fields unless reserve_data is enabled - confirm Hostname survives for metric_hdd_errors.
+ key_name: Payload
+ parser:
+ type: regexp
+ format: '/(?<device>[sv]d[a-z]+\d*)/'  # Named group "device" matches names such as sda, sdb1 or vda2.
+ metric_hdd_errors:  # Counter for metric.hdd_errors records, labelled per host and per device.
+ tag: metric.hdd_errors
+ require:  # Presumably orders this filter after the parse step so the device field exists - confirm against the fluentd formula.
+ - metric_hdd_errors_parse
+ type: prometheus
+ metric:
+ - name: hdd_errors_total
+ type: counter
+ desc: The total number of hdd errors.
+ label:
+ - name: host
+ value: ${Hostname}
+ - name: device
+ value: ${device}  # Refers to the "device" capture group produced by metric_hdd_errors_parse.
systemd:
input:
systemd:
@@ -37,11 +80,33 @@
tag: systemd.source
type: rewrite_tag_filter
rule:
- - name: service
- regexp: '^(.*)\.(.*)$'
+ - name: ident
+ regexp: '^(.*)$'
result: __TAG__.$1
push_to_default:
tag: 'systemd.source.*'
+ type: copy  # Fan out each record to every store listed below instead of a single relabel.
+ store:
+ - type: relabel  # First store: keep sending records to the default_output label, as before this change.
+ label: default_output
+ - type: rewrite_tag_filter  # Second store: re-tag records whose Payload matches one of the rules as metric.*.
+ rule:
+ - name: Payload
+ regexp: '^Invalid user'
+ result: metric.failed_user
+ - name: Payload
+ regexp: '^Out of memory'
+ result: metric.out_of_memory
+ - name: Payload  # Disk error, word order "error ... device".
+ regexp: >-  # NOTE(review): inside a folded scalar the single quotes are literal characters of the value - confirm the rendered fluentd config expects them.
+ 'error.+[sv]d[a-z]+\d*'
+ result: metric.hdd_errors
+ - name: Payload  # Disk error, word order "device ... error"; same target tag as the rule above.
+ regexp: >-  # NOTE(review): same quoting caveat as the previous rule.
+ '[sv]d[a-z]+\d*.+error'
+ result: metric.hdd_errors
+ push_to_metric:  # Routes the re-tagged metric records to the default_metric label.
+ tag: 'metric.**'  # Covers the metric.failed_user, metric.out_of_memory and metric.hdd_errors tags set by the rewrite rules.
type: relabel
- label: default_output
+ label: default_metric
{%- endif %}
diff --git a/linux/meta/prometheus.yml b/linux/meta/prometheus.yml
index e8a26e3..b2911d5 100644
--- a/linux/meta/prometheus.yml
+++ b/linux/meta/prometheus.yml
@@ -32,6 +32,16 @@
summary: 'Free open files for {{ $labels.path }} too low on {{ $labels.host }}'
description: 'Host {{ $labels.host }}) will run out of free open files in less than 8 hours.'
{% endraw %}
+ SystemDiskErrors:  # Alert raised when the hdd_errors_total counter grew during the last 5 minutes.
+ if: 'increase(hdd_errors_total[5m]) > 0'  # NOTE(review): increase() needs at least two samples in the window, so the first error of a brand-new series may not fire - confirm this is acceptable.
+ {% raw %}
+ labels:
+ severity: critical
+ service: system
+ annotations:
+ summary: 'Disk {{ $labels.device }} is failing'
+ description: 'The disk ({{ $labels.device }}) is reporting errors on {{ $labels.host }}.'
+ {% endraw %}
SystemDiskSpaceFull:
if: 'disk_used_percent >= 99 and disk_inodes_total > 0'
{% raw %}