blob: 8dc150d85682c45c7f56f8ce034ef3d0991322a4 [file] [log] [blame]
pwyckoff99b000b2008-04-03 19:30:55 +00001// Copyright (c) 2006- Facebook
2// Distributed under the Thrift Software License
3//
4// See accompanying file LICENSE or visit the Thrift site at:
5// http://developers.facebook.com/thrift/
6
7#include <sys/time.h>
8
9#include "FacebookBase.h"
10#include "ServiceTracker.h"
11#include "thrift/concurrency/ThreadManager.h"
12
13using namespace std;
14using namespace facebook::fb303;
15using namespace facebook::thrift::concurrency;
16
17
18uint64_t ServiceTracker::CHECKPOINT_MINIMUM_INTERVAL_SECONDS = 60;
19int ServiceTracker::LOG_LEVEL = 5;
20
21
22ServiceTracker::ServiceTracker(facebook::fb303::FacebookBase *handler,
23 void (*logMethod)(int, const string &),
24 bool featureCheckpoint,
25 bool featureStatusCheck,
26 bool featureThreadCheck,
27 Stopwatch::Unit stopwatchUnit)
28 : handler_(handler), logMethod_(logMethod),
29 featureCheckpoint_(featureCheckpoint),
30 featureStatusCheck_(featureStatusCheck),
31 featureThreadCheck_(featureThreadCheck),
32 stopwatchUnit_(stopwatchUnit),
33 checkpointServices_(0)
34{
35 if (featureCheckpoint_) {
36 time_t now = time(NULL);
37 checkpointTime_ = now;
38 } else {
39 checkpointTime_ = 0;
40 }
41}
42
43/**
44 * Registers the beginning of a "service method": basically, any of
45 * the implementations of Thrift remote procedure calls that a
46 * FacebookBase handler is handling. Controls concurrent
47 * services and reports statistics (via log and via fb303 counters).
48 * Throws an exception if the server is not ready to handle service
49 * methods yet.
50 *
51 * note: The relationship between startService() and finishService()
52 * is currently defined so that a call to finishService() should only
53 * be matched to this call to startService() if this method returns
54 * without exception. It wouldn't be a problem to implement things
55 * the other way, so that *every* start needed a finish, but this
56 * convention was chosen to match the way an object's constructor and
57 * destructor work together, i.e. to work well with ServiceMethod
58 * objects.
59 *
60 * @param const ServiceMethod &serviceMethod A reference to the ServiceMethod
61 * object instantiated at the start
62 * of the service method.
63 * @author Karl Voskuil
64 */
65void
66ServiceTracker::startService(const ServiceMethod &serviceMethod)
67{
68 // note: serviceMethod.timer_ automatically starts at construction.
69
70 // log service start
71 logMethod_(5, serviceMethod.signature_);
72
73 // check handler ready
74 if (featureStatusCheck_ && !serviceMethod.featureLogOnly_) {
75 // note: Throwing exceptions before counting statistics. See note
76 // in method header.
77 // note: A STOPPING server is not accepting new connections, but it
78 // is still handling any already-connected threads -- so from the
79 // service method's point of view, a status of STOPPING is a green
80 // light.
81 facebook::fb303::fb_status status = handler_->getStatus();
82 if (status != facebook::fb303::ALIVE
83 && status != facebook::fb303::STOPPING) {
84 if (status == facebook::fb303::STARTING) {
85 throw ServiceException("Server starting up; please try again later");
86 } else {
87 throw ServiceException("Server not alive; please try again later");
88 }
89 }
90 }
91
92 // check server threads
93 if (featureThreadCheck_ && !serviceMethod.featureLogOnly_) {
94 // note: Might want to put these messages in reportCheckpoint() if
95 // log is getting spammed.
96 if (threadManager_ != NULL) {
97 size_t idle_count = threadManager_->idleWorkerCount();
98 if (idle_count == 0) {
99 stringstream message;
100 message << "service " << serviceMethod.signature_
101 << ": all threads (" << threadManager_->workerCount()
102 << ") in use";
103 logMethod_(3, message.str());
104 }
105 }
106 }
107}
108
109/**
110 * Logs a significant step in the middle of a "service method"; see
111 * startService.
112 *
113 * @param const ServiceMethod &serviceMethod A reference to the ServiceMethod
114 * object instantiated at the start
115 * of the service method.
116 * @return int64_t Elapsed units (see stopwatchUnit_) since ServiceMethod
117 * instantiation.
118 * @author Karl Voskuil
119 */
120int64_t
121ServiceTracker::stepService(const ServiceMethod &serviceMethod,
122 const string &stepName)
123{
124 stringstream message;
125 string elapsed_label;
126 int64_t elapsed = serviceMethod.timer_.elapsedUnits(stopwatchUnit_,
127 &elapsed_label);
128 message << serviceMethod.signature_
129 << ' ' << stepName
130 << " [" << elapsed_label << ']';
131 logMethod_(5, message.str());
132 return elapsed;
133}
134
135/**
136 * Registers the end of a "service method"; see startService().
137 *
138 * @param const ServiceMethod &serviceMethod A reference to the ServiceMethod
139 * object instantiated at the start
140 * of the service method.
141 * @author Karl Voskuil
142 */
143void
144ServiceTracker::finishService(const ServiceMethod &serviceMethod)
145{
146 // log end of service
147 stringstream message;
148 string duration_label;
149 int64_t duration = serviceMethod.timer_.elapsedUnits(stopwatchUnit_,
150 &duration_label);
151 message << serviceMethod.signature_
152 << " finish [" << duration_label << ']';
153 logMethod_(5, message.str());
154
155 // count, record, and maybe report service statistics
156 if (!serviceMethod.featureLogOnly_) {
157
158 if (!featureCheckpoint_) {
159
160 // lifetime counters
161 // (note: No need to lock statisticsMutex_ if not doing checkpoint;
162 // FacebookService::incrementCounter() is already thread-safe.)
163 handler_->incrementCounter("lifetime_services");
164
165 } else {
166
167 statisticsMutex_.lock();
168 // note: No exceptions expected from this code block. Wrap in a try
169 // just to be safe.
170 try {
171
172 // lifetime counters
173 // note: Good to synchronize this with the increment of
174 // checkpoint services, even though incrementCounter() is
175 // already thread-safe, for the sake of checkpoint reporting
176 // consistency (i.e. since the last checkpoint,
177 // lifetime_services has incremented by checkpointServices_).
178 handler_->incrementCounter("lifetime_services");
179
180 // checkpoint counters
181 checkpointServices_++;
182 checkpointDuration_ += duration;
183
184 // per-service timing
185 // note kjv: According to my tests it is very slightly faster to
186 // call insert() once (and detect not-found) than calling find()
187 // and then maybe insert (if not-found). However, the difference
188 // is tiny for small maps like this one, and the code for the
189 // faster solution is slightly less readable. Also, I wonder if
190 // the instantiation of the (often unused) pair to insert makes
191 // the first algorithm slower after all.
192 map<string, pair<uint64_t, uint64_t> >::iterator iter;
193 iter = checkpointServiceDuration_.find(serviceMethod.name_);
194 if (iter != checkpointServiceDuration_.end()) {
195 iter->second.first++;
196 iter->second.second += duration;
197 } else {
198 checkpointServiceDuration_.insert(make_pair(serviceMethod.name_,
199 make_pair(1, duration)));
200 }
201
202 // maybe report checkpoint
203 // note: ...if it's been long enough since the last report.
204 time_t now = time(NULL);
205 uint64_t check_interval = now - checkpointTime_;
206 if (check_interval >= CHECKPOINT_MINIMUM_INTERVAL_SECONDS) {
207 reportCheckpoint();
208 }
209
210 } catch (...) {
211 statisticsMutex_.unlock();
212 throw;
213 }
214 statisticsMutex_.unlock();
215
216 }
217 }
218}
219
220/**
221 * Logs some statistics gathered since the last call to this method.
222 *
223 * note: Thread race conditions on this method could cause
224 * misreporting and/or undefined behavior; the caller must protect
225 * uses of the object variables (and calls to this method) with a
226 * mutex.
227 *
228 * @author Karl Voskuil
229 */
230void
231ServiceTracker::reportCheckpoint()
232{
233 time_t now = time(NULL);
234
235 uint64_t check_count = checkpointServices_;
236 uint64_t check_interval = now - checkpointTime_;
237 uint64_t check_duration = checkpointDuration_;
238
239 // export counters for timing of service methods (by service name)
240 handler_->setCounter("checkpoint_time", check_interval);
241 map<string, pair<uint64_t, uint64_t> >::iterator iter;
242 uint64_t count;
243 for (iter = checkpointServiceDuration_.begin();
244 iter != checkpointServiceDuration_.end();
245 iter++) {
246 count = iter->second.first;
247 handler_->setCounter(string("checkpoint_count_") + iter->first, count);
248 if (count == 0) {
249 handler_->setCounter(string("checkpoint_speed_") + iter->first,
250 0);
251 } else {
252 handler_->setCounter(string("checkpoint_speed_") + iter->first,
253 iter->second.second / count);
254 }
255 }
256
257 // reset checkpoint variables
258 // note: Clearing the map while other threads are using it might
259 // cause undefined behavior.
260 checkpointServiceDuration_.clear();
261 checkpointTime_ = now;
262 checkpointServices_ = 0;
263 checkpointDuration_ = 0;
264
265 // get lifetime variables
266 uint64_t life_count = handler_->getCounter("lifetime_services");
267 uint64_t life_interval = now - handler_->aliveSince();
268
269 // log checkpoint
270 stringstream message;
271 message << "checkpoint_time:" << check_interval
272 << " checkpoint_services:" << check_count
273 << " checkpoint_speed_sum:" << check_duration
274 << " lifetime_time:" << life_interval
275 << " lifetime_services:" << life_count;
276 if (featureThreadCheck_ && threadManager_ != NULL) {
277 size_t worker_count = threadManager_->workerCount();
278 size_t idle_count = threadManager_->idleWorkerCount();
279 message << " total_workers:" << worker_count
280 << " active_workers:" << (worker_count - idle_count);
281 }
282 logMethod_(4, message.str());
283}
284
285/**
286 * Remembers the thread manager used in the server, for monitoring thread
287 * activity.
288 *
289 * @param shared_ptr<ThreadManager> threadManager The server's thread manager.
290 * @author Karl Voskuil
291 */
292void
293ServiceTracker::setThreadManager(boost::shared_ptr<ThreadManager>
294 threadManager)
295{
296 threadManager_ = threadManager;
297}
298
299/**
300 * Logs messages to stdout; the passed message will be logged if the
301 * passed level is less than or equal to LOG_LEVEL.
302 *
303 * This is the default logging method used by the ServiceTracker. An
304 * alternate logging method (that accepts the same parameters) may be
305 * specified to the constructor.
306 *
307 * @param int level A level associated with the message: higher levels
308 * are used to indicate higher levels of detail.
309 * @param string message The message to log.
310 */
311void
312ServiceTracker::defaultLogMethod(int level, const string &message)
313{
314 if (level <= LOG_LEVEL) {
315 string level_string;
316 time_t now = time(NULL);
317 char now_pretty[26];
318 ctime_r(&now, now_pretty);
319 now_pretty[24] = '\0';
320 switch (level) {
321 case 1:
322 level_string = "CRITICAL";
323 break;
324 case 2:
325 level_string = "ERROR";
326 break;
327 case 3:
328 level_string = "WARNING";
329 break;
330 case 5:
331 level_string = "DEBUG";
332 break;
333 case 4:
334 default:
335 level_string = "INFO";
336 break;
337 }
338 cout << '[' << level_string << "] [" << now_pretty << "] "
339 << message << endl;
340 }
341}
342
343
344/**
345 * Creates a Stopwatch, which can report the time elapsed since its
346 * creation.
347 *
348 * @author Karl Voskuil
349 */
350Stopwatch::Stopwatch()
351{
352 gettimeofday(&startTime_, NULL);
353}
354
355void
356Stopwatch::reset()
357{
358 gettimeofday(&startTime_, NULL);
359}
360
361uint64_t
362Stopwatch::elapsedUnits(Stopwatch::Unit unit, string *label) const
363{
364 timeval now_time;
365 gettimeofday(&now_time, NULL);
366 time_t duration_secs = now_time.tv_sec - startTime_.tv_sec;
367
368 uint64_t duration_units;
369 switch (unit) {
370 case UNIT_SECONDS:
371 duration_units = duration_secs
372 + (now_time.tv_usec - startTime_.tv_usec + 500000) / 1000000;
373 if (NULL != label) {
374 stringstream ss_label;
375 ss_label << duration_units << " secs";
376 label->assign(ss_label.str());
377 }
378 break;
379 case UNIT_MICROSECONDS:
380 duration_units = duration_secs * 1000000
381 + now_time.tv_usec - startTime_.tv_usec;
382 if (NULL != label) {
383 stringstream ss_label;
384 ss_label << duration_units << " us";
385 label->assign(ss_label.str());
386 }
387 break;
388 case UNIT_MILLISECONDS:
389 default:
390 duration_units = duration_secs * 1000
391 + (now_time.tv_usec - startTime_.tv_usec + 500) / 1000;
392 if (NULL != label) {
393 stringstream ss_label;
394 ss_label << duration_units << " ms";
395 label->assign(ss_label.str());
396 }
397 break;
398 }
399 return duration_units;
400}
401
402/**
403 * Creates a ServiceMethod, used for tracking a single service method
404 * invocation (via the ServiceTracker). The passed name of the
405 * ServiceMethod is used to group statistics (e.g. counts and durations)
406 * for similar invocations; the passed signature is used to uniquely
407 * identify the particular invocation in the log.
408 *
409 * note: A version of this constructor is provided that automatically
410 * forms a signature the name and a passed numeric id. Silly, sure,
411 * but commonly used, since it often saves the caller a line or two of
412 * code.
413 *
414 * @param ServiceTracker *tracker The service tracker that will track this
415 * ServiceMethod.
416 * @param const string &name The service method name (usually independent
417 * of service method parameters).
418 * @param const string &signature A signature uniquely identifying the method
419 * invocation (usually name plus parameters).
420 * @author Karl Voskuil
421 */
422ServiceMethod::ServiceMethod(ServiceTracker *tracker,
423 const string &name,
424 const string &signature,
425 bool featureLogOnly)
426 : tracker_(tracker), name_(name), signature_(signature),
427 featureLogOnly_(featureLogOnly)
428{
429 // note: timer_ automatically starts at construction.
430
431 // invoke tracker to start service
432 // note: Might throw. If it throws, then this object's destructor
433 // won't be called, which is according to plan: finishService() is
434 // only supposed to be matched to startService() if startService()
435 // returns without error.
436 tracker_->startService(*this);
437}
438
439ServiceMethod::ServiceMethod(ServiceTracker *tracker,
440 const string &name,
441 uint64_t id,
442 bool featureLogOnly)
443 : tracker_(tracker), name_(name), featureLogOnly_(featureLogOnly)
444{
445 // note: timer_ automatically starts at construction.
446 stringstream ss_signature;
447 ss_signature << name << " (" << id << ')';
448 signature_ = ss_signature.str();
449
450 // invoke tracker to start service
451 // note: Might throw. If it throws, then this object's destructor
452 // won't be called, which is according to plan: finishService() is
453 // only supposed to be matched to startService() if startService()
454 // returns without error.
455 tracker_->startService(*this);
456}
457
458ServiceMethod::~ServiceMethod()
459{
460 // invoke tracker to finish service
461 // note: Not expecting an exception from this code, but
462 // finishService() might conceivably throw an out-of-memory
463 // exception.
464 try {
465 tracker_->finishService(*this);
466 } catch (...) {
467 // don't throw
468 }
469}
470
471uint64_t
472ServiceMethod::step(const std::string &stepName)
473{
474 return tracker_->stepService(*this, stepName);
475}