Log collector module
New:
- [Done] multiple namespace selector
- [Done] keyword-based pod selector
- [Done] per-pod logs syntax detection and parsing
- [Deferred] in-place filtering for shorter logs
- [Done] individual logs timestamp detection
- [Done] Unix-time-based timestamp sorting
- [Done] Single file logs output using common format
- [Done] add all log types from all MOS namespaces and pods
Update:
- resource preparation can be skipped per module
- updated log collection using multiple threads
- new setting LOG_COLLECT_THREADS
Fixes:
- Network MTU fix
- Faster cmd execution on single pod
- Ceph benchmark validations
- Ceph benchmark report sorting
- Daemonset deployment with nodes skipped
- Network tree debugging script
- Tree depth limiter, i.e. stackoverflow prevention
Related-PROD: PROD-36845
Change-Id: Icf229ac62078c6418ab4dbdff12b0d27ed42af1d
diff --git a/cfg_checker/common/decorators.py b/cfg_checker/common/decorators.py
index 1b39460..d507ae5 100644
--- a/cfg_checker/common/decorators.py
+++ b/cfg_checker/common/decorators.py
@@ -4,6 +4,7 @@
import time
from cfg_checker.common import logger, logger_cli
+from cfg_checker.common.exception import KubeException
def retry(exceptions, total_tries=5, initial_wait=1, backoff_factor=2):
@@ -29,7 +30,8 @@
print_args = args if args else "no args"
if _tries == 1:
msg = "... {} failed after {} tries".format(
- f.__name___,
+ # dirty hack to get name
+ str(f).split(" ")[1],
total_tries
)
logger_cli.info(msg)
@@ -39,10 +41,11 @@
kwargs
)
)
- raise
+ _tries = 0
+ raise KubeException(msg)
msg = "... {}; Exception: {}.\n" \
"... retrying in {} seconds!".format(
- f.__name__,
+ str(f).split(" ")[1],
e,
_delay
)
diff --git a/cfg_checker/common/kube_utils.py b/cfg_checker/common/kube_utils.py
index f4c38ef..e6b9922 100644
--- a/cfg_checker/common/kube_utils.py
+++ b/cfg_checker/common/kube_utils.py
@@ -11,6 +11,7 @@
from kubernetes import client as kclient, config as kconfig, watch
from kubernetes.stream import stream
from kubernetes.client.rest import ApiException
+from urllib3.exceptions import MaxRetryError
from time import time, sleep
from cfg_checker.common import logger, logger_cli
@@ -965,7 +966,11 @@
_svc
)
- def get_pod_logs(self, podname, ns):
+ def list_namespaces(self):
+ return self.CoreV1.list_namespace()
+
+ @retry(ApiException, initial_wait=2)
+ def get_pod_logs(self, podname, container, ns, tail_lines=50):
# Params
# read log of the specified Pod # noqa: E501 This method makes a synchronous HTTP request by default. To make an asynchronous HTTP request, please pass async_req=True
@@ -1022,10 +1027,23 @@
# If the method is called asynchronously, returns the request
# thread.
- return self.CoreV1.read_namespaced_pod_log(
- podname,
- ns,
- # timestamps=True,
- tail_lines=50,
- # pretty=True
- )
+ try:
+ return self.CoreV1.read_namespaced_pod_log(
+ name=podname,
+ namespace=ns,
+ container=container,
+ timestamps=True,
+ tail_lines=tail_lines,
+ # pretty=True,
+ _request_timeout=(1, 5)
+ )
+ except MaxRetryError as e:
+ logger_cli.warning(
+ "WARNING: Failed to retrieve log {}/{}:{}:\n{}".format(
+ ns,
+ podname,
+ container,
+ e.reason
+ )
+ )
+ return ""
diff --git a/cfg_checker/common/other.py b/cfg_checker/common/other.py
index 4c3ef04..987168f 100644
--- a/cfg_checker/common/other.py
+++ b/cfg_checker/common/other.py
@@ -193,5 +193,18 @@
return getattr(obj, attr, *args)
return functools.reduce(_getattr, [obj] + attr.split('.'))
+ @staticmethod
+ def split_option_type(size):
+ # I know, but it is faster then regex
+ _numbers = [48, 49, 50, 51, 52, 53, 54, 55, 56, 57]
+ _s_int = "0"
+ _s_type = ""
+ for ch in size:
+ if ord(ch) in _numbers:
+ _s_int += ch
+ else:
+ _s_type += ch
+ return int(_s_int), _s_type
+
utils = Utils()
diff --git a/cfg_checker/common/settings.py b/cfg_checker/common/settings.py
index 08d4163..27482ff 100644
--- a/cfg_checker/common/settings.py
+++ b/cfg_checker/common/settings.py
@@ -209,6 +209,7 @@
self.mcp_host = _get_env_value('MCP_ENV_HOST', None)
self.salt_port = _get_env_value('MCP_SALT_PORT', '6969')
self.threads = int(_get_env_value('MCP_THREADS', "5"))
+ self.sage_threads = int(_get_env_value('LOG_COLLECT_THREADS', "15"))
self.script_execution_timeout = int(
_get_env_value('MCP_SCRIPT_RUN_TIMEOUT', "300")
)
@@ -489,3 +490,5 @@
# Init vars that is specific to detected envs only
logger_cli.debug("... loading detected environment type vars")
self._init_env_values()
+ # Set internal resource preparation flag default
+ self.prepare_qa_resources = True