Add support of mcelog service

Change-Id: I32c83d63e7f359704ab6cc77dec07a1617880fbb
Prod-Related: PROD-20137
diff --git a/README.rst b/README.rst
index 5b7c72a..216755b 100644
--- a/README.rst
+++ b/README.rst
@@ -918,6 +918,18 @@
           logpath: "/var/log/atop"
           outfile: "/var/log/atop/daily.log"
 
+Linux with mcelog service:
+
+.. code-block:: yaml
+
+    linux:
+      system:
+        mcelog:
+          enabled: true
+          logging:
+            syslog: true
+            syslog_error: true
+
 RHEL / CentOS
 ^^^^^^^^^^^^^
 
diff --git a/linux/files/mcelog.conf b/linux/files/mcelog.conf
new file mode 100644
index 0000000..2b2302f
--- /dev/null
+++ b/linux/files/mcelog.conf
@@ -0,0 +1,199 @@
+{%- from "linux/map.jinja" import system with context %}
+{%- set mcelog = system.mcelog %}
+#
+# Example config file for mcelog
+# mcelog is the user space backend that decodes and process machine check events
+# (cpu hardware errors) reported by the CPU to the kernel
+#
+
+# general format
+#optionname = value
+# white space is not allowed in value currently, except at the end where it is dropped
+#
+
+# In general all command line options that are not commands work here.
+# See man mcelog or mcelog --help for a list.
+# e.g. to enable the --no-syslog option use
+#no-syslog = yes   (or no to disable)
+# when the option has a argument
+#logfile = /tmp/logfile
+# below are the options which are not command line options.
+
+# Set CPU type for which mcelog decodes events:
+#cpu = type
+# For valid values for type please see mcelog --help.
+# If this value is set incorrectly the decoded output will be likely incorrect.
+# By default when this parameter is not set mcelog uses the CPU it is running on
+# on very new kernels the mcelog events reported by the kernel also carry
+# the CPU type which is used too when available and not overriden.
+
+# Enable daemon mode:
+#daemon = yes
+# By default mcelog just processes the currently pending events and exits.
+# In daemon mode it will keep running as a daemon in the background and poll
+# the kernel for events and then decode them.
+
+# Filter out known broken events by default.
+filter = yes
+# Don't log memory errors individually.
+# They still get accounted if that is enabled.
+#filter-memory-errors = yes
+
+# output in undecoded raw format to be easier machine readable
+# (default is decoded).
+#raw = yes
+
+# Set CPU Mhz to decode uptime from time stamp counter (output
+# unreliable, not needed on new kernels which report the event time
+# directly. A lot of systems don't have a linear time stamp clock
+# and the output is wrong then.
+# Normally mcelog tries to figure out if it the TSC is reliable
+# and only uses the current frequency then.
+# Setting a frequency forces timestamp decoding.
+# This setting is obsolete with modern kernels which report the time
+# directly.
+#cpumhz = 1800.00
+
+# log output options
+# Log decoded machine checks in syslog (default stdout or syslog for daemon)
+#syslog = yes
+# Log decoded machine checks in syslog with error level
+#syslog-error = yes
+# Never log anything to syslog
+#no-syslog = yes
+# Append log output to logfile instead of stdout. Only when no syslog logging is active
+#logfile = filename
+
+{%- if mcelog.logging is defined %}
+
+{%- if mcelog.logging.syslog is defined %}
+syslog = {{ 'yes' if mcelog.logging.syslog else 'no' }}
+{%- endif %}
+{%- if mcelog.logging.syslog_error is defined %}
+syslog-error = {{ 'yes' if mcelog.logging.syslog_error else 'no' }}
+{%- endif %}
+{%- if mcelog.logging.no_syslog is defined %}
+no-syslog = {{ 'yes' if mcelog.logging.no_syslog else 'no' }}
+{%- endif %}
+{%- if mcelog.logging.logfile is defined %}
+logfile = {{ mcelog.logging.logfile }}
+{%- endif %}
+
+{%- endif %}
+# Use SMBIOS information to decode DIMMs (needs root).
+# This function is not recommended to use right now and generally not needed.
+# The exception is memdb prepopulation, which is configured separately below.
+#dmi = no
+
+# When in daemon mode run as this user after set up.
+# Note that the triggers will run as this user too.
+# Setting this to non root will mean that triggers cannot take some corrective
+# action, like offlining objects.
+#run-credentials-user = root
+
+# group to run as daemon with
+# default to the group of the run-credentials-user
+#run-credentials-group = nobody
+
+[server]
+# user allowed to access client socket.
+# when set to * match any
+# root is always allowed to access.
+# default: root only
+client-user = root
+# group allowed to access mcelog
+# When no group is configured any group matches (but still user checking).
+# when set to * match any
+#client-group = root
+# Path to the unix socket for client<->server communication.
+# When no socket-path is configured the server will not start
+#socket-path = /var/run/mcelog-client
+# When mcelog starts it checks if a server is already running. This configures the timeout
+# for this check.
+#initial-ping-timeout = 2
+#
+[dimm]
+# Is the in memory DIMM error tracking enabled?
+# Only works on systems with integrated memory controller and
+# which are supported.
+# Only takes effect in daemon mode.
+dimm-tracking-enabled = yes
+# Use DMI information from the BIOS to prepopulate DIMM database.
+# Note this might not work with all BIOS and requires mcelog to run as root.
+# Alternative is to let mcelog create DIMM objects on demand.
+dmi-prepopulate = yes
+#
+# Execute these triggers when the rate of corrected or uncorrected
+# Errors per DIMM exceeds the threshold.
+# Note when the hardware does not report DIMMs this might also
+# be per channel.
+# The default of 10/24h is reasonable for server quality
+# DDR3 DIMMs as of 2009/10.
+#uc-error-trigger = dimm-error-trigger
+uc-error-threshold = 1 / 24h
+#ce-error-trigger = dimm-error-trigger
+ce-error-threshold = 10 / 24h
+
+[socket]
+# Enable memory error accounting per socket.
+socket-tracking-enabled = yes
+
+# Threshold and trigger for uncorrected memory errors on a socket.
+# mem-uc-error-trigger = socket-memory-error-trigger
+
+mem-uc-error-threshold = 100 / 24h
+
+# Trigger script for corrected memory errors on a socket.
+mem-ce-error-trigger = socket-memory-error-trigger
+
+# Threshold on when to trigger a correct error for the socket.
+
+mem-ce-error-threshold = 100 / 24h
+
+#  Log socket error threshold explicitely?
+mem-ce-error-log = yes
+
+# Trigger script for uncorrected bus error events
+bus-uc-threshold-trigger = bus-error-trigger
+
+# Trigger script for uncorrected IOMCA erors
+iomca-threshold-trigger = iomca-error-trigger
+
+# Trigger script for other uncategorized errors
+unknown-threshold-trigger = unknown-error-trigger
+
+[cache]
+# Processing of cache error thresholds reported by Intel CPUs.
+cache-threshold-trigger = cache-error-trigger
+
+# Should cache threshold events be logged explicitely?
+cache-threshold-log = yes
+
+[page]
+# Memory error accouting per 4K memory page.
+# Threshold for the correct memory errors trigger script.
+memory-ce-threshold = 10 / 24h
+
+# Trigger script for corrected errors.
+# memory-ce-trigger = page-error-trigger
+
+# Should page threshold events be logged explicitely?
+memory-ce-log = yes
+
+# specify the internal action in mcelog to exceeding a page error threshold
+# this is done in addition to executing the trigger script if available
+# off      no action
+# account  only account errors
+# soft     try to soft-offline page without killing any processes
+#          This requires an uptodate kernel. Might not be successfull.
+# hard     try to hard-offline page by killing processes
+#          Requires an uptodate kernel. Might not be successfull.
+# soft-then-hard   First try to soft offline, then try hard offlining
+#memory-ce-action = off|account|soft|hard|soft-then-hard
+memory-ce-action = soft
+
+[trigger]
+# Maximum number of running triggers
+children-max = 2
+# execute triggers in this directory
+directory = /etc/mcelog
diff --git a/linux/system/init.sls b/linux/system/init.sls
index c1b13e4..ad3681a 100644
--- a/linux/system/init.sls
+++ b/linux/system/init.sls
@@ -117,3 +117,6 @@
 {%- if system.banner is defined %}
 - linux.system.banner
 {%- endif %}
+{%- if system.mcelog is defined %}
+- linux.system.mcelog
+{%- endif %}
diff --git a/linux/system/mcelog.sls b/linux/system/mcelog.sls
new file mode 100644
index 0000000..c2d0fd4
--- /dev/null
+++ b/linux/system/mcelog.sls
@@ -0,0 +1,32 @@
+{%- from "linux/map.jinja" import system with context %}
+{%- if system.enabled %}
+
+{%- if system.get('mcelog',{}).get('enabled', False) %}
+
+mcelog_packages:
+  pkg.installed:
+    - name: mcelog
+
+mcelog_conf:
+  file.managed:
+    - name: /etc/mcelog/mcelog.conf
+    - source: salt://linux/files/mcelog.conf
+    - template: jinja
+    - user: root
+    - group: root
+    - mode: 644
+    - require:
+      - pkg: mcelog_packages
+
+mce_service:
+  service.running:
+  - name: mcelog
+  - enable: true
+  - require:
+    - pkg: mcelog_packages
+  - watch:
+    - file: mcelog_conf
+
+{%- endif %}
+
+{%- endif %}
diff --git a/tests/pillar/system.sls b/tests/pillar/system.sls
index eb6201e..8aeb9d7 100644
--- a/tests/pillar/system.sls
+++ b/tests/pillar/system.sls
@@ -369,3 +369,8 @@
       interval: 20
       logpath: "/var/mylog/atop"
       outfile: "/var/mylog/atop/daily.log"
+    mcelog:
+      enabled: true
+      logging:
+        syslog: true
+        syslog_error: true