blob: 71a24ac02ba07ea3e3e2ce600ed0531b1a20eba3 [file] [log] [blame]
koder aka kdanilov3b4da8b2016-10-17 00:17:53 +03001import os
koder aka kdanilov962ee5f2016-12-19 02:40:08 +02002import json
koder aka kdanilov70227062016-11-26 23:23:21 +02003import time
4import array
koder aka kdanilov962ee5f2016-12-19 02:40:08 +02005import logging
koder aka kdanilov70227062016-11-26 23:23:21 +02006import threading
koder aka kdanilov962ee5f2016-12-19 02:40:08 +02007import traceback
8import subprocess
koder aka kdanilov3b4da8b2016-10-17 00:17:53 +03009
10
# Name of this module (presumably used by the agent that loads it -- TODO confirm).
mod_name = "sensors"
__version__ = (0, 1)


logger = logging.getLogger("agent.sensors")
# Registry of sensor functions, filled by the ``provides`` decorator:
# {sensor name: callable}. ``collect`` looks sensors up here by name.
SensorsMap = {}
17
18
def provides(name):
    """Build a decorator that registers a sensor function under ``name``.

    The decorated function is stored in the module-level ``SensorsMap``
    with ``name`` as the key and is returned unchanged.
    """
    def register(sensor_func):
        SensorsMap[name] = sensor_func
        return sensor_func

    return register
24
25
def is_dev_accepted(name, disallowed_prefixes, allowed_prefixes):
    """Tell whether ``name`` passes the prefix filters.

    A name is rejected when it starts with any of ``disallowed_prefixes``.
    Otherwise it is accepted when ``allowed_prefixes`` is None or when it
    starts with at least one of ``allowed_prefixes``.  A filter equal to
    None is not applied at all.
    """
    if disallowed_prefixes is not None:
        for prefix in disallowed_prefixes:
            if name.startswith(prefix):
                return False

    if allowed_prefixes is None:
        return True

    for prefix in allowed_prefixes:
        if name.startswith(prefix):
            return True
    return False
38
39
def get_pid_list(disallowed_prefixes, allowed_prefixes):
    """Return the pids (as strings) listed in /proc that pass the filters.

    Entries of /proc that are not all-digit names are ignored.  A pid that
    is itself contained in ``disallowed_prefixes`` is always dropped.
    When ``allowed_prefixes`` is None every remaining pid is returned;
    otherwise a pid is kept only if it is contained in ``allowed_prefixes``
    or if its process name starts with one of the ``allowed_prefixes``.
    """
    excluded = [] if disallowed_prefixes is None else disallowed_prefixes
    candidates = (entry
                  for entry in os.listdir('/proc')
                  if entry.isdigit() and entry not in excluded)

    if allowed_prefixes is None:
        # no whitelist: everything that is not explicitly excluded
        return list(candidates)

    selected = []
    for pid in candidates:
        proc_name = get_pid_name(pid)
        if pid in allowed_prefixes:
            selected.append(pid)
            continue
        for prefix in allowed_prefixes:
            if proc_name.startswith(prefix):
                selected.append(pid)
                break
    return selected
58
59
def get_pid_name(pid):
    """Return the executable name of process ``pid``.

    The name is the basename of the first token of argv[0] read from
    /proc/<pid>/cmdline.

    Returns:
        - the name, when the command line is available;
        - "<NO NAME>" when the command line is empty;
        - "no_such_process" when the file cannot be read (callers expect a
          string in every case).
    """
    try:
        with open(os.path.join('/proc/', str(pid), 'cmdline'), 'r') as pidfile:
            cmdline = pidfile.read()
    except IOError:
        # upstream wait any string, no matter if we couldn't read proc
        return "no_such_process"

    # Arguments in cmdline are separated by NUL bytes, not by whitespace.
    # Splitting on whitespace only would keep all arguments glued together,
    # so basename() could pick up a piece of a later argument.
    argv0 = cmdline.split('\x00', 1)[0]

    # Keep only the first whitespace-separated token, as before, so that
    # rewritten process titles ("name: some description") still give the
    # leading word.
    tokens = argv0.split()
    if not tokens:
        # no cmd returned
        return "<NO NAME>"
    return os.path.basename(tokens[0])
73
74
# Columns of a /proc/diskstats line, numbered from 1:
# 1 - major number
# 2 - minor number
# 3 - device name
# 4 - reads completed successfully
# 5 - reads merged
# 6 - sectors read
# 7 - time spent reading (ms)
# 8 - writes completed
# 9 - writes merged
# 10 - sectors written
# 11 - time spent writing (ms)
# 12 - I/Os currently in progress
# 13 - time spent doing I/Os (ms)
# 14 - weighted time spent doing I/Os (ms)

# Items are (index, sensor name, flag).  The index is 0-based into the
# whitespace-split line, i.e. one less than the column number listed above.
# The third element is ignored by io_stat.
# NOTE(review): 'io_time' uses index 13, which is column 14 (weighted time)
# in the list above, not column 13 -- confirm this is intended.
io_values_pos = [
    (3, 'reads_completed', True),
    (5, 'sectors_read', True),
    (6, 'rtime', True),
    (7, 'writes_completed', True),
    (9, 'sectors_written', True),
    (10, 'wtime', True),
    (11, 'io_queue', False),
    (13, 'io_time', True)
]
100
101
@provides("block-io")
def io_stat(config, disallowed_prefixes=('ram', 'loop'), allowed_prefixes=None):
    """Read block device counters from /proc/diskstats.

    Parameters:
        config: sensor configuration; not used by this sensor.
        disallowed_prefixes: devices whose name starts with one of these
            prefixes are skipped.
        allowed_prefixes: when not None, only devices whose name starts
            with one of these prefixes are reported.

    Returns:
        dict {"<device>.<counter name>": int} with one entry per accepted
        device and per item of ``io_values_pos``.
    """
    results = {}
    # 'with' closes the file deterministically; this function is called
    # repeatedly, so the descriptor must not be left to the garbage collector
    with open('/proc/diskstats') as stats_fd:
        for line in stats_fd:
            vals = line.split()
            dev_name = vals[2]

            dev_ok = is_dev_accepted(dev_name,
                                     disallowed_prefixes,
                                     allowed_prefixes)

            # Names ending with a digit are skipped (e.g. partitions such
            # as sda1).  NOTE(review): this also drops whole devices whose
            # name ends with a digit -- confirm that is intended.
            if not dev_ok or dev_name[-1].isdigit():
                continue

            for pos, name, _ in io_values_pos:
                results["{0}.{1}".format(dev_name, name)] = int(vals[pos])
    return results
117
118
# Counters of a /proc/net/dev line that follow the "<interface>:" prefix,
# numbered from 0 (this is the index space used by net_stat, which splits
# the line at the first ':' before splitting the counters):
# 0 - bytes received
# 1 - packets received
# 2 - receive errors
# 3 - receive drops
# 4 - receive fifo errors
# 5 - receive frame errors
# 6 - compressed packets received
# 7 - multicast frames received
# 8 - bytes transmitted
# 9 - packets transmitted
# 10 and further - transmit error counters

# Items are (index, sensor name, flag); the third element is ignored by
# net_stat.
net_values_pos = [
    (0, 'recv_bytes', True),
    (1, 'recv_packets', True),
    (8, 'send_bytes', True),
    (9, 'send_packets', True),
]
140
141
@provides("net-io")
def net_stat(config, disallowed_prefixes=('docker', 'lo'), allowed_prefixes=('eth',)):
    """Read network interface counters from /proc/net/dev.

    Parameters:
        config: sensor configuration; not used by this sensor.
        disallowed_prefixes: interfaces whose name starts with one of these
            prefixes are skipped.
        allowed_prefixes: when not None, only interfaces whose name starts
            with one of these prefixes are reported.

    Returns:
        dict {"<interface>.<counter name>": int} with one entry per
        accepted interface and per item of ``net_values_pos``.
    """
    results = {}

    # 'with' closes the file deterministically; this function is called
    # repeatedly, so the descriptor must not be left to the garbage collector
    with open('/proc/net/dev') as dev_fd:
        # the first two lines are table headers
        lines = dev_fd.readlines()[2:]

    for line in lines:
        dev_name, stats = line.split(":", 1)
        dev_name = dev_name.strip()
        vals = stats.split()

        dev_ok = is_dev_accepted(dev_name,
                                 disallowed_prefixes,
                                 allowed_prefixes)

        # skip names of the form "<name>.<number>"
        if '.' in dev_name and dev_name.split('.')[-1].isdigit():
            dev_ok = False

        if dev_ok:
            for pos, name, _ in net_values_pos:
                results["{0}.{1}".format(dev_name, name)] = int(vals[pos])
    return results
162
163
def pid_stat(pid):
    """Return total cpu time (utime + stime) used by process ``pid``.

    The value is read from /proc/<pid>/stat and returned as a float, in
    the units the kernel reports there.

    Raises:
        IOError: the stat file cannot be opened (e.g. the process is gone).
    """
    with open(os.path.join('/proc/', str(pid), 'stat'), 'r') as pidfile:
        stat_data = pidfile.read()

    # The second field is the command name in parentheses and may itself
    # contain spaces or ')', which breaks positional indexing of a plain
    # split().  Everything after the LAST ')' is safe to split.
    fields = stat_data.rpartition(')')[2].split()

    # fields[0] is field 3 of the file (state), so
    # utime (field 14) is fields[11] and stime (field 15) is fields[12]
    utime = fields[11]
    stime = fields[12]

    # count total process used time
    return float(int(utime) + int(stime))
175
176
@provides("perprocess-cpu")
def pscpu_stat(config, disallowed_prefixes=None, allowed_prefixes=None):
    """Report cpu time used by every selected process.

    Processes are selected with ``get_pid_list``.  The result maps
    "<process name>.<pid>" to the value returned by ``pid_stat``.
    ``config`` is not used.
    """
    cpu_times = {}
    # TODO(koder): fixed list of PID's must be given
    for pid in get_pid_list(disallowed_prefixes, allowed_prefixes):
        try:
            used = pid_stat(pid)
            sensor_name = "{0}.{1}".format(get_pid_name(pid), pid)
        except IOError:
            # the process may have terminated in the meantime, skip it
            continue
        cpu_times[sensor_name] = used
    return cpu_times
koder aka kdanilov3b4da8b2016-10-17 00:17:53 +0300188
189
def get_mem_stats(pid):
    """Return memory data of pid in format (private, shared).

    Values are summed over all mappings listed in /proc/<pid>/smaps and
    are expressed in the units used by that file.

    Raises:
        IOError: the smaps file cannot be opened (permission denied or the
            process is gone).
    """
    shared = 0
    private = 0
    pss = 0

    # add 0.5KiB as this avg error due to truncation
    pss_adjust = 0.5

    # 'with' closes the file deterministically; this function is called for
    # every process on every collection round
    with open('/proc/{0}/smaps'.format(pid)) as smaps_fd:
        for line in smaps_fd:
            if line.startswith("Shared"):
                shared += int(line.split()[1])
            elif line.startswith("Private"):
                private += int(line.split()[1])
            elif line.startswith("Pss:"):
                # Match the "Pss:" field exactly: kernels that also emit
                # Pss_* detail lines (e.g. Pss_Dirty) would otherwise be
                # counted into the total a second time.
                pss += float(line.split()[1]) + pss_adjust

    # Note Shared + Private = Rss above
    # The Rss in smaps includes video card mem etc.

    if pss != 0:
        # when Pss is available, derive the shared part from it
        shared = int(pss - private)

    return (private, shared)
220
221
def get_ram_size():
    """Return the number found in the first line of /proc/meminfo.

    The second whitespace-separated token of that line is converted to
    int (the original docstring describes it as the RAM size in Kb).
    """
    with open("/proc/meminfo") as meminfo:
        first_line = meminfo.readline()
    fields = first_line.split()
    return int(fields[1])
227
228
@provides("perprocess-ram")
def psram_stat(config, disallowed_prefixes=None, allowed_prefixes=None):
    """Report memory usage of every selected process.

    Processes are selected with ``get_pid_list``.  For each one, four
    entries prefixed with "<process name>(<pid>)" are produced:
    ".private_mem", ".shared_mem", ".used_mem" (private + shared) and
    ".mem_usage_percent" (used memory relative to ``get_ram_size()``).
    ``config`` is not used.
    """
    results = {}
    # TODO(koder): fixed list of PID's must be given
    for pid in get_pid_list(disallowed_prefixes, allowed_prefixes):
        try:
            proc_name = get_pid_name(pid)
            private, shared = get_mem_stats(pid)
            total = private + shared
            ram_total = get_ram_size()
            usage = float(total) / ram_total
        except IOError:
            # permission denied or the process died
            continue

        prefix = "{0}({1})".format(proc_name, pid)
        results[prefix + ".private_mem"] = private
        results[prefix + ".shared_mem"] = shared
        results[prefix + ".used_mem"] = total
        results[prefix + ".mem_usage_percent"] = int(usage * 100)
    return results
253
# Tokens of a "cpu" line of /proc/stat, numbered from 0:
# 0 - cpu name
# 1 - user: normal processes executing in user mode
# 2 - nice: niced processes executing in user mode
# 3 - system: processes executing in kernel mode
# 4 - idle: twiddling thumbs
# 5 - iowait: waiting for I/O to complete
# 6 - irq: servicing interrupts
# 7 - softirq: servicing softirqs

# Items are (index, sensor name, flag); the index refers to the list above.
# The third element is ignored by syscpu_stat.
cpu_values_pos = [
    (1, 'user_processes', True),
    (2, 'nice_processes', True),
    (3, 'system_processes', True),
    (4, 'idle_time', True),
]
269
270
@provides("system-cpu")
def syscpu_stat(config, disallowed_prefixes=None, allowed_prefixes=None):
    """Read system-wide cpu counters from /proc/stat and /proc/loadavg.

    None of the parameters is used; they exist so that all sensors share
    one call signature.

    Returns:
        dict with:
        - "cpu.<name>" for every item of ``cpu_values_pos``, taken from the
          aggregated "cpu" line;
        - "cpu.procs_blocked", taken from the "procs_blocked" line;
        - "cpu.procs_queue_x10": (runnable processes - 1) per core,
          multiplied by 10 and truncated to int.
    """
    results = {}

    # calculate core count
    core_count = 0

    # 'with' closes the file deterministically; this function is called on
    # every collection round
    with open('/proc/stat') as stat_fd:
        for line in stat_fd:
            vals = line.split()
            if not vals:
                # tolerate blank lines
                continue
            dev_name = vals[0]

            if dev_name == 'cpu':
                for pos, name, _ in cpu_values_pos:
                    sensor_name = "{0}.{1}".format(dev_name, name)
                    results[sensor_name] = int(vals[pos])
            elif dev_name == 'procs_blocked':
                results["cpu.procs_blocked"] = int(vals[1])
            elif dev_name.startswith('cpu'):
                # per-core line: cpu0, cpu1, ...
                core_count += 1

    # procs in queue: the 4th token of /proc/loadavg is "<runnable>/<total>"
    TASKSPOS = 3
    with open('/proc/loadavg') as loadavg_fd:
        vals = loadavg_fd.read().split()
    ready_procs = vals[TASKSPOS].partition('/')[0]

    # dec on current proc; guard against a /proc/stat without per-core
    # lines, which would otherwise divide by zero
    procs_queue = (float(ready_procs) - 1) / max(core_count, 1)
    results["cpu.procs_queue_x10"] = int(procs_queue * 10)

    return results
302
303
# /proc/meminfo fields reported by sysram_stat when the caller does not
# pass its own allowed_prefixes
ram_fields = [
    'MemTotal',
    'MemFree',
    'Buffers',
    'Cached',
    'SwapCached',
    'Dirty',
    'Writeback',
    'SwapTotal',
    'SwapFree'
]
316
317
@provides("system-ram")
def sysram_stat(config, disallowed_prefixes=None, allowed_prefixes=None):
    """Read memory counters from /proc/meminfo.

    Parameters:
        config: sensor configuration; not used by this sensor.
        disallowed_prefixes: fields whose name starts with one of these
            prefixes are skipped.
        allowed_prefixes: field-name prefixes to report; defaults to
            ``ram_fields`` when None.

    Returns:
        dict {"ram.<field>": int}.  When both MemTotal and MemFree were
        collected, "ram.usage_percent" (used memory as an integer percent
        of MemTotal) is added as well.
    """
    if allowed_prefixes is None:
        allowed_prefixes = ram_fields

    results = {}

    # 'with' closes the file deterministically; this function is called on
    # every collection round
    with open('/proc/meminfo') as meminfo_fd:
        for line in meminfo_fd:
            vals = line.split()
            dev_name = vals[0].rstrip(":")

            dev_ok = is_dev_accepted(dev_name,
                                     disallowed_prefixes,
                                     allowed_prefixes)

            if dev_ok:
                results["ram.{0}".format(dev_name)] = int(vals[1])

    if 'ram.MemFree' in results and 'ram.MemTotal' in results:
        # The stored values are plain ints (they have no ".value"
        # attribute), and the ratio must be scaled by 100 to be a percent.
        total = results['ram.MemTotal']
        used = total - results['ram.MemFree']
        if total:
            results["ram.usage_percent"] = int(float(used) * 100 / total)

    return results
koder aka kdanilov70227062016-11-26 23:23:21 +0200343
344
@provides("ceph")
def ceph_stat(config, disallowed_prefixes=None, allowed_prefixes=None):
    """Collect performance counters from ceph OSD admin sockets.

    Parameters:
        config: dict with the keys
            'osds'     - iterable of OSD ids to query,
            'cluster'  - cluster name, used to build the socket path,
            'counters' - iterable of '/'-separated paths into the JSON
                         document printed by "perf dump".
        disallowed_prefixes, allowed_prefixes: not used by this sensor.

    Returns:
        dict {"osd<id>.<counter path with '/' replaced by '.'>": value}.

    Raises:
        subprocess.CalledProcessError: the ceph command failed.
        KeyError: a counter path is missing from the dump.
    """
    results = {}

    def get_val(dct, path):
        # walk the nested dicts one path component at a time
        for part in path.split('/'):
            dct = dct[part]
        return dct

    for osd_id in config['osds']:
        asok = '/var/run/ceph/{}-osd.{}.asok'.format(config['cluster'], osd_id)
        # Pass the command as an argument list: no shell is involved, so
        # values taken from the config cannot be interpreted as shell syntax
        out = subprocess.check_output(['ceph', 'daemon', asok, 'perf', 'dump'])
        data = json.loads(out)
        for key_name in config['counters']:
            results["osd{}.{}".format(osd_id, key_name.replace("/", "."))] = get_val(data, key_name)

    return results
363
364
class SensorsData(object):
    """State shared between the background collector thread and the rpc_* calls."""

    def __init__(self):
        # set to True to ask the background thread to finish
        self.stop = False
        # traceback text stored by the background thread when it fails
        self.exception = None
        # temporary file to store results
        self.data_fd = None
        # {str: array[data]}
        self.data = {}
        # collection timestamps.
        # NOTE(review): 'f' is a single-precision C float; time.time()
        # values lose precision in it.  The typecode is part of the bytes
        # returned to RPC callers, so confirm with the consumer before
        # changing it.
        self.collected_at = array.array("f")
        # guards data / collected_at and is used to wake the thread up
        self.cond = threading.Condition()
373
374
def collect(sensors_config):
    """Run every configured sensor once.

    ``sensors_config`` maps a sensor name (a key of ``SensorsMap``) to its
    config dict.  The config is passed to the sensor as ``config``; its
    optional "allow" / "disallow" entries are passed as
    ``allowed_prefixes`` / ``disallowed_prefixes``.

    Returns {sensor name: value returned by the sensor function}.
    """
    optional_args = (("allow", "allowed_prefixes"),
                     ("disallow", "disallowed_prefixes"))

    readings = {}
    for sensor_name, sensor_cfg in sensors_config.items():
        kwargs = {'config': sensor_cfg}
        for cfg_key, arg_name in optional_args:
            if cfg_key in sensor_cfg:
                kwargs[arg_name] = sensor_cfg[cfg_key]
        readings[sensor_name] = SensorsMap[sensor_name](**kwargs)
    return readings
koder aka kdanilov70227062016-11-26 23:23:21 +0200388
389
# TODO(koder): a lot code here can be optimized and cached, but nobody cares (c)
def sensors_bg_thread(sensors_config, sdata):
    """Body of the background thread: collect all sensors once per second.

    Every round appends the collection start time to ``sdata.collected_at``
    and every sensor value to the matching array in ``sdata.data`` (keyed
    by (source name, sensor name)), both under ``sdata.cond``.  The loop
    ends when ``sdata.stop`` becomes true.  Any exception is logged and
    its traceback text is stored in ``sdata.exception``.
    """
    try:
        deadline = time.time()

        # TODO: handle exceptions here
        while not sdata.stop:
            remaining = deadline - time.time()
            if remaining > 0:
                # sleep until the next round, or until notified
                with sdata.cond:
                    sdata.cond.wait(remaining)

            deadline += 1.0

            if sdata.stop:
                break

            started_at = time.time()
            snapshot = collect(sensors_config)
            finished_at = time.time()

            if finished_at - started_at > 0.1:
                # TODO(koder): need to signal that something in not really ok with sensor collecting
                pass

            # TODO: need to pack data time after time to avoid long operations on next updates request
            with sdata.cond:
                sdata.collected_at.append(started_at)
                for source_name, readings in snapshot.items():
                    for sensor_name, value in readings.items():
                        key = (source_name, sensor_name)
                        series = sdata.data.get(key)
                        if series is None:
                            sdata.data[key] = array.array('L', [value])
                        else:
                            series.append(value)
    except Exception:
        logger.exception("In sensor BG thread")
        sdata.exception = traceback.format_exc()
koder aka kdanilov70227062016-11-26 23:23:21 +0200428
429
# Background collector thread; None while no collection is running.
sensors_thread = None
# State shared with the background thread; None while no collection is running.
sdata = None # type: SensorsData
432
433
def rpc_start(sensors_config):
    """Start the background sensor collection thread.

    ``sensors_config`` is handed to ``sensors_bg_thread`` unchanged.

    Raises ValueError when array.array('L') items are not 8 bytes wide on
    this platform, or when a collection thread is already registered.
    """
    global sensors_thread
    global sdata

    item_size = array.array('L').itemsize
    if item_size != 8:
        message = ("Python array.array('L') items should be 8 bytes in size, not {}."
                   " Can't provide sensors on this platform. Disable sensors in config and retry")
        raise ValueError(message.format(item_size))

    if sensors_thread is not None:
        raise ValueError("Thread already running")

    sdata = SensorsData()
    worker = threading.Thread(target=sensors_bg_thread, args=(sensors_config, sdata))
    # do not keep the interpreter alive because of this thread
    worker.daemon = True
    sensors_thread = worker
    worker.start()
450
451
def rpc_get_updates():
    """Return the data collected since the previous call and reset the buffers.

    Returns:
        (data, collected_at) where ``data`` maps (source name, sensor name)
        to the raw bytes of the value array, and ``collected_at`` is the
        raw bytes of the timestamps array.

    Raises:
        ValueError: collection has not been started.
        Exception: the background thread failed; the message is its
            traceback text.
    """
    if sdata is None:
        raise ValueError("No sensor thread running")

    def as_bytes(arr):
        # array.tostring() was removed in Python 3.9, while tobytes() does
        # not exist on Python 2: use whichever this interpreter has
        serialize = arr.tobytes if hasattr(arr, "tobytes") else arr.tostring
        return serialize()

    with sdata.cond:
        if sdata.exception:
            raise Exception(sdata.exception)
        res = sdata.data
        collected_at = sdata.collected_at
        # swap in fresh empty arrays of the same types, so the background
        # thread keeps appending while the old ones are serialized below
        sdata.collected_at = array.array(collected_at.typecode)
        sdata.data = {name: array.array(val.typecode) for name, val in res.items()}

    bres = {key: as_bytes(data) for key, data in res.items()}
    return bres, as_bytes(collected_at)
koder aka kdanilov70227062016-11-26 23:23:21 +0200466
467
def rpc_stop():
    """Stop the background thread and return the data not yet fetched.

    Returns:
        (data, collected_at) where ``data`` maps (source name, sensor name)
        to the raw bytes of the value array, and ``collected_at`` is the
        raw bytes of the timestamps array.

    Raises:
        ValueError: no collection thread is registered.
        Exception: the background thread failed; the message is its
            traceback text.
    """
    global sensors_thread
    global sdata

    if sensors_thread is None:
        raise ValueError("No sensor thread running")

    def as_bytes(arr):
        # array.tostring() was removed in Python 3.9, while tobytes() does
        # not exist on Python 2: use whichever this interpreter has
        serialize = arr.tobytes if hasattr(arr, "tobytes") else arr.tostring
        return serialize()

    sdata.stop = True
    with sdata.cond:
        # wake the thread up if it is waiting for the next round
        sdata.cond.notify_all()

    sensors_thread.join()

    # Reset the module state BEFORE reporting a failure: the thread is
    # finished either way, and leaving the globals set would make every
    # later rpc_start() fail with "Thread already running".
    finished = sdata
    sensors_thread = None
    sdata = None

    if finished.exception:
        raise Exception(finished.exception)

    bres = {key: as_bytes(data) for key, data in finished.data.items()}
    return bres, as_bytes(finished.collected_at)