blob: c20a0683fd7e72eefc3bd783d751c06d9db5a851 [file] [log] [blame]
Gavin McDonald0b75e1a2010-10-28 02:12:01 +00001/*
2 * Licensed to the Apache Software Foundation (ASF) under one
3 * or more contributor license agreements. See the NOTICE file
4 * distributed with this work for additional information
5 * regarding copyright ownership. The ASF licenses this file
6 * to you under the Apache License, Version 2.0 (the
7 * "License"); you may not use this file except in compliance
8 * with the License. You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing,
13 * software distributed under the License is distributed on an
14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15 * KIND, either express or implied. See the License for the
16 * specific language governing permissions and limitations
17 * under the License.
18 */
19
20#include <sys/time.h>
21
22#include "FacebookBase.h"
23#include "ServiceTracker.h"
24#include "concurrency/ThreadManager.h"
25
26using namespace std;
27using namespace facebook::fb303;
28using namespace apache::thrift::concurrency;
29
30
31uint64_t ServiceTracker::CHECKPOINT_MINIMUM_INTERVAL_SECONDS = 60;
32int ServiceTracker::LOG_LEVEL = 5;
33
34
35ServiceTracker::ServiceTracker(facebook::fb303::FacebookBase *handler,
36 void (*logMethod)(int, const string &),
37 bool featureCheckpoint,
38 bool featureStatusCheck,
39 bool featureThreadCheck,
40 Stopwatch::Unit stopwatchUnit)
41 : handler_(handler), logMethod_(logMethod),
42 featureCheckpoint_(featureCheckpoint),
43 featureStatusCheck_(featureStatusCheck),
44 featureThreadCheck_(featureThreadCheck),
45 stopwatchUnit_(stopwatchUnit),
46 checkpointServices_(0)
47{
48 if (featureCheckpoint_) {
49 time_t now = time(NULL);
50 checkpointTime_ = now;
51 } else {
52 checkpointTime_ = 0;
53 }
54}
55
56/**
57 * Registers the beginning of a "service method": basically, any of
58 * the implementations of Thrift remote procedure calls that a
59 * FacebookBase handler is handling. Controls concurrent
60 * services and reports statistics (via log and via fb303 counters).
61 * Throws an exception if the server is not ready to handle service
62 * methods yet.
63 *
64 * note: The relationship between startService() and finishService()
65 * is currently defined so that a call to finishService() should only
66 * be matched to this call to startService() if this method returns
67 * without exception. It wouldn't be a problem to implement things
68 * the other way, so that *every* start needed a finish, but this
69 * convention was chosen to match the way an object's constructor and
70 * destructor work together, i.e. to work well with ServiceMethod
71 * objects.
72 *
73 * @param const ServiceMethod &serviceMethod A reference to the ServiceMethod
74 * object instantiated at the start
75 * of the service method.
76 */
77void
78ServiceTracker::startService(const ServiceMethod &serviceMethod)
79{
80 // note: serviceMethod.timer_ automatically starts at construction.
81
82 // log service start
83 logMethod_(5, serviceMethod.signature_);
84
85 // check handler ready
86 if (featureStatusCheck_ && !serviceMethod.featureLogOnly_) {
87 // note: Throwing exceptions before counting statistics. See note
88 // in method header.
89 // note: A STOPPING server is not accepting new connections, but it
90 // is still handling any already-connected threads -- so from the
91 // service method's point of view, a status of STOPPING is a green
92 // light.
93 facebook::fb303::fb_status status = handler_->getStatus();
94 if (status != facebook::fb303::ALIVE
95 && status != facebook::fb303::STOPPING) {
96 if (status == facebook::fb303::STARTING) {
97 throw ServiceException("Server starting up; please try again later");
98 } else {
99 throw ServiceException("Server not alive; please try again later");
100 }
101 }
102 }
103
104 // check server threads
105 if (featureThreadCheck_ && !serviceMethod.featureLogOnly_) {
106 // note: Might want to put these messages in reportCheckpoint() if
107 // log is getting spammed.
108 if (threadManager_ != NULL) {
109 size_t idle_count = threadManager_->idleWorkerCount();
110 if (idle_count == 0) {
111 stringstream message;
112 message << "service " << serviceMethod.signature_
113 << ": all threads (" << threadManager_->workerCount()
114 << ") in use";
115 logMethod_(3, message.str());
116 }
117 }
118 }
119}
120
121/**
122 * Logs a significant step in the middle of a "service method"; see
123 * startService.
124 *
125 * @param const ServiceMethod &serviceMethod A reference to the ServiceMethod
126 * object instantiated at the start
127 * of the service method.
128 * @return int64_t Elapsed units (see stopwatchUnit_) since ServiceMethod
129 * instantiation.
130 */
131int64_t
132ServiceTracker::stepService(const ServiceMethod &serviceMethod,
133 const string &stepName)
134{
135 stringstream message;
136 string elapsed_label;
137 int64_t elapsed = serviceMethod.timer_.elapsedUnits(stopwatchUnit_,
138 &elapsed_label);
139 message << serviceMethod.signature_
140 << ' ' << stepName
141 << " [" << elapsed_label << ']';
142 logMethod_(5, message.str());
143 return elapsed;
144}
145
146/**
147 * Registers the end of a "service method"; see startService().
148 *
149 * @param const ServiceMethod &serviceMethod A reference to the ServiceMethod
150 * object instantiated at the start
151 * of the service method.
152 */
153void
154ServiceTracker::finishService(const ServiceMethod &serviceMethod)
155{
156 // log end of service
157 stringstream message;
158 string duration_label;
159 int64_t duration = serviceMethod.timer_.elapsedUnits(stopwatchUnit_,
160 &duration_label);
161 message << serviceMethod.signature_
162 << " finish [" << duration_label << ']';
163 logMethod_(5, message.str());
164
165 // count, record, and maybe report service statistics
166 if (!serviceMethod.featureLogOnly_) {
167
168 if (!featureCheckpoint_) {
169
170 // lifetime counters
171 // (note: No need to lock statisticsMutex_ if not doing checkpoint;
172 // FacebookService::incrementCounter() is already thread-safe.)
173 handler_->incrementCounter("lifetime_services");
174
175 } else {
176
177 statisticsMutex_.lock();
178 // note: No exceptions expected from this code block. Wrap in a try
179 // just to be safe.
180 try {
181
182 // lifetime counters
183 // note: Good to synchronize this with the increment of
184 // checkpoint services, even though incrementCounter() is
185 // already thread-safe, for the sake of checkpoint reporting
186 // consistency (i.e. since the last checkpoint,
187 // lifetime_services has incremented by checkpointServices_).
188 handler_->incrementCounter("lifetime_services");
189
190 // checkpoint counters
191 checkpointServices_++;
192 checkpointDuration_ += duration;
193
194 // per-service timing
195 // note kjv: According to my tests it is very slightly faster to
196 // call insert() once (and detect not-found) than calling find()
197 // and then maybe insert (if not-found). However, the difference
198 // is tiny for small maps like this one, and the code for the
199 // faster solution is slightly less readable. Also, I wonder if
200 // the instantiation of the (often unused) pair to insert makes
201 // the first algorithm slower after all.
202 map<string, pair<uint64_t, uint64_t> >::iterator iter;
203 iter = checkpointServiceDuration_.find(serviceMethod.name_);
204 if (iter != checkpointServiceDuration_.end()) {
205 iter->second.first++;
206 iter->second.second += duration;
207 } else {
208 checkpointServiceDuration_.insert(make_pair(serviceMethod.name_,
209 make_pair(1, duration)));
210 }
211
212 // maybe report checkpoint
213 // note: ...if it's been long enough since the last report.
214 time_t now = time(NULL);
215 uint64_t check_interval = now - checkpointTime_;
216 if (check_interval >= CHECKPOINT_MINIMUM_INTERVAL_SECONDS) {
217 reportCheckpoint();
218 }
219
220 } catch (...) {
221 statisticsMutex_.unlock();
222 throw;
223 }
224 statisticsMutex_.unlock();
225
226 }
227 }
228}
229
230/**
231 * Logs some statistics gathered since the last call to this method.
232 *
233 * note: Thread race conditions on this method could cause
234 * misreporting and/or undefined behavior; the caller must protect
235 * uses of the object variables (and calls to this method) with a
236 * mutex.
237 *
238 */
239void
240ServiceTracker::reportCheckpoint()
241{
242 time_t now = time(NULL);
243
244 uint64_t check_count = checkpointServices_;
245 uint64_t check_interval = now - checkpointTime_;
246 uint64_t check_duration = checkpointDuration_;
247
248 // export counters for timing of service methods (by service name)
249 handler_->setCounter("checkpoint_time", check_interval);
250 map<string, pair<uint64_t, uint64_t> >::iterator iter;
251 uint64_t count;
252 for (iter = checkpointServiceDuration_.begin();
253 iter != checkpointServiceDuration_.end();
254 iter++) {
255 count = iter->second.first;
256 handler_->setCounter(string("checkpoint_count_") + iter->first, count);
257 if (count == 0) {
258 handler_->setCounter(string("checkpoint_speed_") + iter->first,
259 0);
260 } else {
261 handler_->setCounter(string("checkpoint_speed_") + iter->first,
262 iter->second.second / count);
263 }
264 }
265
266 // reset checkpoint variables
267 // note: Clearing the map while other threads are using it might
268 // cause undefined behavior.
269 checkpointServiceDuration_.clear();
270 checkpointTime_ = now;
271 checkpointServices_ = 0;
272 checkpointDuration_ = 0;
273
274 // get lifetime variables
275 uint64_t life_count = handler_->getCounter("lifetime_services");
276 uint64_t life_interval = now - handler_->aliveSince();
277
278 // log checkpoint
279 stringstream message;
280 message << "checkpoint_time:" << check_interval
281 << " checkpoint_services:" << check_count
282 << " checkpoint_speed_sum:" << check_duration
283 << " lifetime_time:" << life_interval
284 << " lifetime_services:" << life_count;
285 if (featureThreadCheck_ && threadManager_ != NULL) {
286 size_t worker_count = threadManager_->workerCount();
287 size_t idle_count = threadManager_->idleWorkerCount();
288 message << " total_workers:" << worker_count
289 << " active_workers:" << (worker_count - idle_count);
290 }
291 logMethod_(4, message.str());
292}
293
294/**
295 * Remembers the thread manager used in the server, for monitoring thread
296 * activity.
297 *
298 * @param shared_ptr<ThreadManager> threadManager The server's thread manager.
299 */
300void
301ServiceTracker::setThreadManager(boost::shared_ptr<ThreadManager>
302 threadManager)
303{
304 threadManager_ = threadManager;
305}
306
307/**
308 * Logs messages to stdout; the passed message will be logged if the
309 * passed level is less than or equal to LOG_LEVEL.
310 *
311 * This is the default logging method used by the ServiceTracker. An
312 * alternate logging method (that accepts the same parameters) may be
313 * specified to the constructor.
314 *
315 * @param int level A level associated with the message: higher levels
316 * are used to indicate higher levels of detail.
317 * @param string message The message to log.
318 */
319void
320ServiceTracker::defaultLogMethod(int level, const string &message)
321{
322 if (level <= LOG_LEVEL) {
323 string level_string;
324 time_t now = time(NULL);
325 char now_pretty[26];
326 ctime_r(&now, now_pretty);
327 now_pretty[24] = '\0';
328 switch (level) {
329 case 1:
330 level_string = "CRITICAL";
331 break;
332 case 2:
333 level_string = "ERROR";
334 break;
335 case 3:
336 level_string = "WARNING";
337 break;
338 case 5:
339 level_string = "DEBUG";
340 break;
341 case 4:
342 default:
343 level_string = "INFO";
344 break;
345 }
346 cout << '[' << level_string << "] [" << now_pretty << "] "
347 << message << endl;
348 }
349}
350
351
352/**
353 * Creates a Stopwatch, which can report the time elapsed since its
354 * creation.
355 *
356 */
357Stopwatch::Stopwatch()
358{
359 gettimeofday(&startTime_, NULL);
360}
361
362void
363Stopwatch::reset()
364{
365 gettimeofday(&startTime_, NULL);
366}
367
368uint64_t
369Stopwatch::elapsedUnits(Stopwatch::Unit unit, string *label) const
370{
371 timeval now_time;
372 gettimeofday(&now_time, NULL);
373 time_t duration_secs = now_time.tv_sec - startTime_.tv_sec;
374
375 uint64_t duration_units;
376 switch (unit) {
377 case UNIT_SECONDS:
378 duration_units = duration_secs
379 + (now_time.tv_usec - startTime_.tv_usec + 500000) / 1000000;
380 if (NULL != label) {
381 stringstream ss_label;
382 ss_label << duration_units << " secs";
383 label->assign(ss_label.str());
384 }
385 break;
386 case UNIT_MICROSECONDS:
387 duration_units = duration_secs * 1000000
388 + now_time.tv_usec - startTime_.tv_usec;
389 if (NULL != label) {
390 stringstream ss_label;
391 ss_label << duration_units << " us";
392 label->assign(ss_label.str());
393 }
394 break;
395 case UNIT_MILLISECONDS:
396 default:
397 duration_units = duration_secs * 1000
398 + (now_time.tv_usec - startTime_.tv_usec + 500) / 1000;
399 if (NULL != label) {
400 stringstream ss_label;
401 ss_label << duration_units << " ms";
402 label->assign(ss_label.str());
403 }
404 break;
405 }
406 return duration_units;
407}
408
409/**
410 * Creates a ServiceMethod, used for tracking a single service method
411 * invocation (via the ServiceTracker). The passed name of the
412 * ServiceMethod is used to group statistics (e.g. counts and durations)
413 * for similar invocations; the passed signature is used to uniquely
414 * identify the particular invocation in the log.
415 *
416 * note: A version of this constructor is provided that automatically
417 * forms a signature the name and a passed numeric id. Silly, sure,
418 * but commonly used, since it often saves the caller a line or two of
419 * code.
420 *
421 * @param ServiceTracker *tracker The service tracker that will track this
422 * ServiceMethod.
423 * @param const string &name The service method name (usually independent
424 * of service method parameters).
425 * @param const string &signature A signature uniquely identifying the method
426 * invocation (usually name plus parameters).
427 */
428ServiceMethod::ServiceMethod(ServiceTracker *tracker,
429 const string &name,
430 const string &signature,
431 bool featureLogOnly)
432 : tracker_(tracker), name_(name), signature_(signature),
433 featureLogOnly_(featureLogOnly)
434{
435 // note: timer_ automatically starts at construction.
436
437 // invoke tracker to start service
438 // note: Might throw. If it throws, then this object's destructor
439 // won't be called, which is according to plan: finishService() is
440 // only supposed to be matched to startService() if startService()
441 // returns without error.
442 tracker_->startService(*this);
443}
444
445ServiceMethod::ServiceMethod(ServiceTracker *tracker,
446 const string &name,
447 uint64_t id,
448 bool featureLogOnly)
449 : tracker_(tracker), name_(name), featureLogOnly_(featureLogOnly)
450{
451 // note: timer_ automatically starts at construction.
452 stringstream ss_signature;
453 ss_signature << name << " (" << id << ')';
454 signature_ = ss_signature.str();
455
456 // invoke tracker to start service
457 // note: Might throw. If it throws, then this object's destructor
458 // won't be called, which is according to plan: finishService() is
459 // only supposed to be matched to startService() if startService()
460 // returns without error.
461 tracker_->startService(*this);
462}
463
464ServiceMethod::~ServiceMethod()
465{
466 // invoke tracker to finish service
467 // note: Not expecting an exception from this code, but
468 // finishService() might conceivably throw an out-of-memory
469 // exception.
470 try {
471 tracker_->finishService(*this);
472 } catch (...) {
473 // don't throw
474 }
475}
476
477uint64_t
478ServiceMethod::step(const std::string &stepName)
479{
480 return tracker_->stepService(*this, stepName);
481}