blob: 9153bef2c983ec78776e287edc8a34aecf785dcb [file] [log] [blame]
pwyckoff99b000b2008-04-03 19:30:55 +00001// Copyright (c) 2006- Facebook
2// Distributed under the Thrift Software License
3//
4// See accompanying file LICENSE or visit the Thrift site at:
5// http://developers.facebook.com/thrift/
6
7#include <sys/time.h>
8
9#include "FacebookBase.h"
10#include "ServiceTracker.h"
David Reissf4335df2009-02-07 02:36:46 +000011#include "concurrency/ThreadManager.h"
pwyckoff99b000b2008-04-03 19:30:55 +000012
13using namespace std;
14using namespace facebook::fb303;
David Reiss858519a2009-02-07 02:36:50 +000015using namespace apache::thrift::concurrency;
pwyckoff99b000b2008-04-03 19:30:55 +000016
17
18uint64_t ServiceTracker::CHECKPOINT_MINIMUM_INTERVAL_SECONDS = 60;
19int ServiceTracker::LOG_LEVEL = 5;
20
21
22ServiceTracker::ServiceTracker(facebook::fb303::FacebookBase *handler,
23 void (*logMethod)(int, const string &),
24 bool featureCheckpoint,
25 bool featureStatusCheck,
26 bool featureThreadCheck,
27 Stopwatch::Unit stopwatchUnit)
28 : handler_(handler), logMethod_(logMethod),
29 featureCheckpoint_(featureCheckpoint),
30 featureStatusCheck_(featureStatusCheck),
31 featureThreadCheck_(featureThreadCheck),
32 stopwatchUnit_(stopwatchUnit),
33 checkpointServices_(0)
34{
35 if (featureCheckpoint_) {
36 time_t now = time(NULL);
37 checkpointTime_ = now;
38 } else {
39 checkpointTime_ = 0;
40 }
41}
42
43/**
44 * Registers the beginning of a "service method": basically, any of
45 * the implementations of Thrift remote procedure calls that a
46 * FacebookBase handler is handling. Controls concurrent
47 * services and reports statistics (via log and via fb303 counters).
48 * Throws an exception if the server is not ready to handle service
49 * methods yet.
50 *
51 * note: The relationship between startService() and finishService()
52 * is currently defined so that a call to finishService() should only
53 * be matched to this call to startService() if this method returns
54 * without exception. It wouldn't be a problem to implement things
55 * the other way, so that *every* start needed a finish, but this
56 * convention was chosen to match the way an object's constructor and
57 * destructor work together, i.e. to work well with ServiceMethod
58 * objects.
59 *
60 * @param const ServiceMethod &serviceMethod A reference to the ServiceMethod
61 * object instantiated at the start
62 * of the service method.
pwyckoff99b000b2008-04-03 19:30:55 +000063 */
64void
65ServiceTracker::startService(const ServiceMethod &serviceMethod)
66{
67 // note: serviceMethod.timer_ automatically starts at construction.
68
69 // log service start
70 logMethod_(5, serviceMethod.signature_);
71
72 // check handler ready
73 if (featureStatusCheck_ && !serviceMethod.featureLogOnly_) {
74 // note: Throwing exceptions before counting statistics. See note
75 // in method header.
76 // note: A STOPPING server is not accepting new connections, but it
77 // is still handling any already-connected threads -- so from the
78 // service method's point of view, a status of STOPPING is a green
79 // light.
80 facebook::fb303::fb_status status = handler_->getStatus();
81 if (status != facebook::fb303::ALIVE
82 && status != facebook::fb303::STOPPING) {
83 if (status == facebook::fb303::STARTING) {
84 throw ServiceException("Server starting up; please try again later");
85 } else {
86 throw ServiceException("Server not alive; please try again later");
87 }
88 }
89 }
90
91 // check server threads
92 if (featureThreadCheck_ && !serviceMethod.featureLogOnly_) {
93 // note: Might want to put these messages in reportCheckpoint() if
94 // log is getting spammed.
95 if (threadManager_ != NULL) {
96 size_t idle_count = threadManager_->idleWorkerCount();
97 if (idle_count == 0) {
98 stringstream message;
99 message << "service " << serviceMethod.signature_
100 << ": all threads (" << threadManager_->workerCount()
101 << ") in use";
102 logMethod_(3, message.str());
103 }
104 }
105 }
106}
107
108/**
109 * Logs a significant step in the middle of a "service method"; see
110 * startService.
111 *
112 * @param const ServiceMethod &serviceMethod A reference to the ServiceMethod
113 * object instantiated at the start
114 * of the service method.
115 * @return int64_t Elapsed units (see stopwatchUnit_) since ServiceMethod
116 * instantiation.
pwyckoff99b000b2008-04-03 19:30:55 +0000117 */
118int64_t
119ServiceTracker::stepService(const ServiceMethod &serviceMethod,
120 const string &stepName)
121{
122 stringstream message;
123 string elapsed_label;
124 int64_t elapsed = serviceMethod.timer_.elapsedUnits(stopwatchUnit_,
125 &elapsed_label);
126 message << serviceMethod.signature_
127 << ' ' << stepName
128 << " [" << elapsed_label << ']';
129 logMethod_(5, message.str());
130 return elapsed;
131}
132
133/**
134 * Registers the end of a "service method"; see startService().
135 *
136 * @param const ServiceMethod &serviceMethod A reference to the ServiceMethod
137 * object instantiated at the start
138 * of the service method.
pwyckoff99b000b2008-04-03 19:30:55 +0000139 */
140void
141ServiceTracker::finishService(const ServiceMethod &serviceMethod)
142{
143 // log end of service
144 stringstream message;
145 string duration_label;
146 int64_t duration = serviceMethod.timer_.elapsedUnits(stopwatchUnit_,
147 &duration_label);
148 message << serviceMethod.signature_
149 << " finish [" << duration_label << ']';
150 logMethod_(5, message.str());
151
152 // count, record, and maybe report service statistics
153 if (!serviceMethod.featureLogOnly_) {
154
155 if (!featureCheckpoint_) {
156
157 // lifetime counters
158 // (note: No need to lock statisticsMutex_ if not doing checkpoint;
159 // FacebookService::incrementCounter() is already thread-safe.)
160 handler_->incrementCounter("lifetime_services");
161
162 } else {
163
164 statisticsMutex_.lock();
165 // note: No exceptions expected from this code block. Wrap in a try
166 // just to be safe.
167 try {
168
169 // lifetime counters
170 // note: Good to synchronize this with the increment of
171 // checkpoint services, even though incrementCounter() is
172 // already thread-safe, for the sake of checkpoint reporting
173 // consistency (i.e. since the last checkpoint,
174 // lifetime_services has incremented by checkpointServices_).
175 handler_->incrementCounter("lifetime_services");
176
177 // checkpoint counters
178 checkpointServices_++;
179 checkpointDuration_ += duration;
180
181 // per-service timing
182 // note kjv: According to my tests it is very slightly faster to
183 // call insert() once (and detect not-found) than calling find()
184 // and then maybe insert (if not-found). However, the difference
185 // is tiny for small maps like this one, and the code for the
186 // faster solution is slightly less readable. Also, I wonder if
187 // the instantiation of the (often unused) pair to insert makes
188 // the first algorithm slower after all.
189 map<string, pair<uint64_t, uint64_t> >::iterator iter;
190 iter = checkpointServiceDuration_.find(serviceMethod.name_);
191 if (iter != checkpointServiceDuration_.end()) {
192 iter->second.first++;
193 iter->second.second += duration;
194 } else {
195 checkpointServiceDuration_.insert(make_pair(serviceMethod.name_,
196 make_pair(1, duration)));
197 }
198
199 // maybe report checkpoint
200 // note: ...if it's been long enough since the last report.
201 time_t now = time(NULL);
202 uint64_t check_interval = now - checkpointTime_;
203 if (check_interval >= CHECKPOINT_MINIMUM_INTERVAL_SECONDS) {
204 reportCheckpoint();
205 }
206
207 } catch (...) {
208 statisticsMutex_.unlock();
209 throw;
210 }
211 statisticsMutex_.unlock();
212
213 }
214 }
215}
216
217/**
218 * Logs some statistics gathered since the last call to this method.
219 *
220 * note: Thread race conditions on this method could cause
221 * misreporting and/or undefined behavior; the caller must protect
222 * uses of the object variables (and calls to this method) with a
223 * mutex.
224 *
pwyckoff99b000b2008-04-03 19:30:55 +0000225 */
226void
227ServiceTracker::reportCheckpoint()
228{
229 time_t now = time(NULL);
230
231 uint64_t check_count = checkpointServices_;
232 uint64_t check_interval = now - checkpointTime_;
233 uint64_t check_duration = checkpointDuration_;
234
235 // export counters for timing of service methods (by service name)
236 handler_->setCounter("checkpoint_time", check_interval);
237 map<string, pair<uint64_t, uint64_t> >::iterator iter;
238 uint64_t count;
239 for (iter = checkpointServiceDuration_.begin();
240 iter != checkpointServiceDuration_.end();
241 iter++) {
242 count = iter->second.first;
243 handler_->setCounter(string("checkpoint_count_") + iter->first, count);
244 if (count == 0) {
245 handler_->setCounter(string("checkpoint_speed_") + iter->first,
246 0);
247 } else {
248 handler_->setCounter(string("checkpoint_speed_") + iter->first,
249 iter->second.second / count);
250 }
251 }
252
253 // reset checkpoint variables
254 // note: Clearing the map while other threads are using it might
255 // cause undefined behavior.
256 checkpointServiceDuration_.clear();
257 checkpointTime_ = now;
258 checkpointServices_ = 0;
259 checkpointDuration_ = 0;
260
261 // get lifetime variables
262 uint64_t life_count = handler_->getCounter("lifetime_services");
263 uint64_t life_interval = now - handler_->aliveSince();
264
265 // log checkpoint
266 stringstream message;
267 message << "checkpoint_time:" << check_interval
268 << " checkpoint_services:" << check_count
269 << " checkpoint_speed_sum:" << check_duration
270 << " lifetime_time:" << life_interval
271 << " lifetime_services:" << life_count;
272 if (featureThreadCheck_ && threadManager_ != NULL) {
273 size_t worker_count = threadManager_->workerCount();
274 size_t idle_count = threadManager_->idleWorkerCount();
275 message << " total_workers:" << worker_count
276 << " active_workers:" << (worker_count - idle_count);
277 }
278 logMethod_(4, message.str());
279}
280
281/**
282 * Remembers the thread manager used in the server, for monitoring thread
283 * activity.
284 *
285 * @param shared_ptr<ThreadManager> threadManager The server's thread manager.
pwyckoff99b000b2008-04-03 19:30:55 +0000286 */
287void
288ServiceTracker::setThreadManager(boost::shared_ptr<ThreadManager>
289 threadManager)
290{
291 threadManager_ = threadManager;
292}
293
294/**
295 * Logs messages to stdout; the passed message will be logged if the
296 * passed level is less than or equal to LOG_LEVEL.
297 *
298 * This is the default logging method used by the ServiceTracker. An
299 * alternate logging method (that accepts the same parameters) may be
300 * specified to the constructor.
301 *
302 * @param int level A level associated with the message: higher levels
303 * are used to indicate higher levels of detail.
304 * @param string message The message to log.
305 */
306void
307ServiceTracker::defaultLogMethod(int level, const string &message)
308{
309 if (level <= LOG_LEVEL) {
310 string level_string;
311 time_t now = time(NULL);
312 char now_pretty[26];
313 ctime_r(&now, now_pretty);
314 now_pretty[24] = '\0';
315 switch (level) {
316 case 1:
317 level_string = "CRITICAL";
318 break;
319 case 2:
320 level_string = "ERROR";
321 break;
322 case 3:
323 level_string = "WARNING";
324 break;
325 case 5:
326 level_string = "DEBUG";
327 break;
328 case 4:
329 default:
330 level_string = "INFO";
331 break;
332 }
333 cout << '[' << level_string << "] [" << now_pretty << "] "
334 << message << endl;
335 }
336}
337
338
339/**
340 * Creates a Stopwatch, which can report the time elapsed since its
341 * creation.
342 *
pwyckoff99b000b2008-04-03 19:30:55 +0000343 */
344Stopwatch::Stopwatch()
345{
346 gettimeofday(&startTime_, NULL);
347}
348
349void
350Stopwatch::reset()
351{
352 gettimeofday(&startTime_, NULL);
353}
354
355uint64_t
356Stopwatch::elapsedUnits(Stopwatch::Unit unit, string *label) const
357{
358 timeval now_time;
359 gettimeofday(&now_time, NULL);
360 time_t duration_secs = now_time.tv_sec - startTime_.tv_sec;
361
362 uint64_t duration_units;
363 switch (unit) {
364 case UNIT_SECONDS:
365 duration_units = duration_secs
366 + (now_time.tv_usec - startTime_.tv_usec + 500000) / 1000000;
367 if (NULL != label) {
368 stringstream ss_label;
369 ss_label << duration_units << " secs";
370 label->assign(ss_label.str());
371 }
372 break;
373 case UNIT_MICROSECONDS:
374 duration_units = duration_secs * 1000000
375 + now_time.tv_usec - startTime_.tv_usec;
376 if (NULL != label) {
377 stringstream ss_label;
378 ss_label << duration_units << " us";
379 label->assign(ss_label.str());
380 }
381 break;
382 case UNIT_MILLISECONDS:
383 default:
384 duration_units = duration_secs * 1000
385 + (now_time.tv_usec - startTime_.tv_usec + 500) / 1000;
386 if (NULL != label) {
387 stringstream ss_label;
388 ss_label << duration_units << " ms";
389 label->assign(ss_label.str());
390 }
391 break;
392 }
393 return duration_units;
394}
395
396/**
397 * Creates a ServiceMethod, used for tracking a single service method
398 * invocation (via the ServiceTracker). The passed name of the
399 * ServiceMethod is used to group statistics (e.g. counts and durations)
400 * for similar invocations; the passed signature is used to uniquely
401 * identify the particular invocation in the log.
402 *
403 * note: A version of this constructor is provided that automatically
404 * forms a signature the name and a passed numeric id. Silly, sure,
405 * but commonly used, since it often saves the caller a line or two of
406 * code.
407 *
408 * @param ServiceTracker *tracker The service tracker that will track this
409 * ServiceMethod.
410 * @param const string &name The service method name (usually independent
411 * of service method parameters).
412 * @param const string &signature A signature uniquely identifying the method
413 * invocation (usually name plus parameters).
pwyckoff99b000b2008-04-03 19:30:55 +0000414 */
415ServiceMethod::ServiceMethod(ServiceTracker *tracker,
416 const string &name,
417 const string &signature,
418 bool featureLogOnly)
419 : tracker_(tracker), name_(name), signature_(signature),
420 featureLogOnly_(featureLogOnly)
421{
422 // note: timer_ automatically starts at construction.
423
424 // invoke tracker to start service
425 // note: Might throw. If it throws, then this object's destructor
426 // won't be called, which is according to plan: finishService() is
427 // only supposed to be matched to startService() if startService()
428 // returns without error.
429 tracker_->startService(*this);
430}
431
432ServiceMethod::ServiceMethod(ServiceTracker *tracker,
433 const string &name,
434 uint64_t id,
435 bool featureLogOnly)
436 : tracker_(tracker), name_(name), featureLogOnly_(featureLogOnly)
437{
438 // note: timer_ automatically starts at construction.
439 stringstream ss_signature;
440 ss_signature << name << " (" << id << ')';
441 signature_ = ss_signature.str();
442
443 // invoke tracker to start service
444 // note: Might throw. If it throws, then this object's destructor
445 // won't be called, which is according to plan: finishService() is
446 // only supposed to be matched to startService() if startService()
447 // returns without error.
448 tracker_->startService(*this);
449}
450
451ServiceMethod::~ServiceMethod()
452{
453 // invoke tracker to finish service
454 // note: Not expecting an exception from this code, but
455 // finishService() might conceivably throw an out-of-memory
456 // exception.
457 try {
458 tracker_->finishService(*this);
459 } catch (...) {
460 // don't throw
461 }
462}
463
464uint64_t
465ServiceMethod::step(const std::string &stepName)
466{
467 return tracker_->stepService(*this, stepName);
468}