blob: a2e670c768b3708cf3508afe107d2ed9c2617789 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
#include <sys/time.h>
#include "FacebookBase.h"
#include "ServiceTracker.h"
#include <thrift/concurrency/ThreadManager.h>
using namespace std;
using namespace facebook::fb303;
using namespace apache::thrift::concurrency;
// Minimum number of seconds that must have passed since the last checkpoint
// before finishService() calls reportCheckpoint() again.
uint64_t ServiceTracker::CHECKPOINT_MINIMUM_INTERVAL_SECONDS = 60;
// Verbosity threshold used by defaultLogMethod(): a message is printed when
// its level is less than or equal to this value (5 is labelled DEBUG there).
int ServiceTracker::LOG_LEVEL = 5;
/**
 * Creates a ServiceTracker bound to a handler.
 *
 * @param handler            Handler whose status is queried and whose
 *                           counters are updated by this tracker.
 * @param logMethod          Function invoked as logMethod(level, message)
 *                           for every message the tracker emits.
 * @param featureCheckpoint  When true, finishService() accumulates
 *                           checkpoint statistics and periodically calls
 *                           reportCheckpoint().
 * @param featureStatusCheck When true, startService() throws unless the
 *                           handler's status is ALIVE or STOPPING.
 * @param featureThreadCheck When true, startService() logs a warning when
 *                           the thread manager has no idle workers.
 * @param stopwatchUnit      Unit passed to Stopwatch::elapsedUnits() when
 *                           timing service methods.
 */
ServiceTracker::ServiceTracker(facebook::fb303::FacebookBase *handler,
                               void (*logMethod)(int, const string &),
                               bool featureCheckpoint,
                               bool featureStatusCheck,
                               bool featureThreadCheck,
                               Stopwatch::Unit stopwatchUnit)
  : handler_(handler), logMethod_(logMethod),
    featureCheckpoint_(featureCheckpoint),
    featureStatusCheck_(featureStatusCheck),
    featureThreadCheck_(featureThreadCheck),
    stopwatchUnit_(stopwatchUnit),
    checkpointServices_(0)
{
  // The checkpoint clock only starts when checkpointing is enabled.
  checkpointTime_ = featureCheckpoint_ ? time(nullptr) : 0;
  // finishService() accumulates into checkpointDuration_ with += before the
  // first reportCheckpoint() resets it, so it must start from a known value.
  // Assigned here (not in the initializer list) so the fix does not depend
  // on the member declaration order in the header.
  checkpointDuration_ = 0;
}
/**
* Registers the beginning of a "service method": basically, any of
* the implementations of Thrift remote procedure calls that a
* FacebookBase handler is handling. Controls concurrent
* services and reports statistics (via log and via fb303 counters).
* Throws an exception if the server is not ready to handle service
* methods yet.
*
* note: The relationship between startService() and finishService()
* is currently defined so that a call to finishService() should only
* be matched to this call to startService() if this method returns
* without exception. It wouldn't be a problem to implement things
* the other way, so that *every* start needed a finish, but this
* convention was chosen to match the way an object's constructor and
* destructor work together, i.e. to work well with ServiceMethod
* objects.
*
* @param const ServiceMethod &serviceMethod A reference to the ServiceMethod
* object instantiated at the start
* of the service method.
*/
void
ServiceTracker::startService(const ServiceMethod &serviceMethod)
{
// note: serviceMethod.timer_ automatically starts at construction.
// log service start
logMethod_(5, serviceMethod.signature_);
// check handler ready
if (featureStatusCheck_ && !serviceMethod.featureLogOnly_) {
// note: Throwing exceptions before counting statistics. See note
// in method header.
// note: A STOPPING server is not accepting new connections, but it
// is still handling any already-connected threads -- so from the
// service method's point of view, a status of STOPPING is a green
// light.
facebook::fb303::fb_status status = handler_->getStatus();
if (status != facebook::fb303::ALIVE
&& status != facebook::fb303::STOPPING) {
if (status == facebook::fb303::STARTING) {
throw ServiceException("Server starting up; please try again later");
} else {
throw ServiceException("Server not alive; please try again later");
}
}
}
// check server threads
if (featureThreadCheck_ && !serviceMethod.featureLogOnly_) {
// note: Might want to put these messages in reportCheckpoint() if
// log is getting spammed.
if (threadManager_ != nullptr) {
size_t idle_count = threadManager_->idleWorkerCount();
if (idle_count == 0) {
stringstream message;
message << "service " << serviceMethod.signature_
<< ": all threads (" << threadManager_->workerCount()
<< ") in use";
logMethod_(3, message.str());
}
}
}
}
/**
* Logs a significant step in the middle of a "service method"; see
* startService.
*
* @param const ServiceMethod &serviceMethod A reference to the ServiceMethod
* object instantiated at the start
* of the service method.
* @return int64_t Elapsed units (see stopwatchUnit_) since ServiceMethod
* instantiation.
*/
int64_t
ServiceTracker::stepService(const ServiceMethod &serviceMethod,
const string &stepName)
{
stringstream message;
string elapsed_label;
int64_t elapsed = serviceMethod.timer_.elapsedUnits(stopwatchUnit_,
&elapsed_label);
message << serviceMethod.signature_
<< ' ' << stepName
<< " [" << elapsed_label << ']';
logMethod_(5, message.str());
return elapsed;
}
/**
* Registers the end of a "service method"; see startService().
*
* @param const ServiceMethod &serviceMethod A reference to the ServiceMethod
* object instantiated at the start
* of the service method.
*/
void
ServiceTracker::finishService(const ServiceMethod &serviceMethod)
{
// log end of service
stringstream message;
string duration_label;
int64_t duration = serviceMethod.timer_.elapsedUnits(stopwatchUnit_,
&duration_label);
message << serviceMethod.signature_
<< " finish [" << duration_label << ']';
logMethod_(5, message.str());
// count, record, and maybe report service statistics
if (!serviceMethod.featureLogOnly_) {
if (!featureCheckpoint_) {
// lifetime counters
// (note: No need to lock statisticsMutex_ if not doing checkpoint;
// FacebookService::incrementCounter() is already thread-safe.)
handler_->incrementCounter("lifetime_services");
} else {
statisticsMutex_.lock();
// note: No exceptions expected from this code block. Wrap in a try
// just to be safe.
try {
// lifetime counters
// note: Good to synchronize this with the increment of
// checkpoint services, even though incrementCounter() is
// already thread-safe, for the sake of checkpoint reporting
// consistency (i.e. since the last checkpoint,
// lifetime_services has incremented by checkpointServices_).
handler_->incrementCounter("lifetime_services");
// checkpoint counters
checkpointServices_++;
checkpointDuration_ += duration;
// per-service timing
// note kjv: According to my tests it is very slightly faster to
// call insert() once (and detect not-found) than calling find()
// and then maybe insert (if not-found). However, the difference
// is tiny for small maps like this one, and the code for the
// faster solution is slightly less readable. Also, I wonder if
// the instantiation of the (often unused) pair to insert makes
// the first algorithm slower after all.
map<string, pair<uint64_t, uint64_t> >::iterator iter;
iter = checkpointServiceDuration_.find(serviceMethod.name_);
if (iter != checkpointServiceDuration_.end()) {
iter->second.first++;
iter->second.second += duration;
} else {
checkpointServiceDuration_.insert(make_pair(serviceMethod.name_,
make_pair(1, duration)));
}
// maybe report checkpoint
// note: ...if it's been long enough since the last report.
time_t now = time(nullptr);
uint64_t check_interval = now - checkpointTime_;
if (check_interval >= CHECKPOINT_MINIMUM_INTERVAL_SECONDS) {
reportCheckpoint();
}
} catch (...) {
statisticsMutex_.unlock();
throw;
}
statisticsMutex_.unlock();
}
}
}
/**
* Logs some statistics gathered since the last call to this method.
*
* note: Thread race conditions on this method could cause
* misreporting and/or undefined behavior; the caller must protect
* uses of the object variables (and calls to this method) with a
* mutex.
*
*/
void
ServiceTracker::reportCheckpoint()
{
time_t now = time(nullptr);
uint64_t check_count = checkpointServices_;
uint64_t check_interval = now - checkpointTime_;
uint64_t check_duration = checkpointDuration_;
// export counters for timing of service methods (by service name)
handler_->setCounter("checkpoint_time", check_interval);
map<string, pair<uint64_t, uint64_t> >::iterator iter;
uint64_t count;
for (iter = checkpointServiceDuration_.begin();
iter != checkpointServiceDuration_.end();
++iter) {
count = iter->second.first;
handler_->setCounter(string("checkpoint_count_") + iter->first, count);
if (count == 0) {
handler_->setCounter(string("checkpoint_speed_") + iter->first,
0);
} else {
handler_->setCounter(string("checkpoint_speed_") + iter->first,
iter->second.second / count);
}
}
// reset checkpoint variables
// note: Clearing the map while other threads are using it might
// cause undefined behavior.
checkpointServiceDuration_.clear();
checkpointTime_ = now;
checkpointServices_ = 0;
checkpointDuration_ = 0;
// get lifetime variables
uint64_t life_count = handler_->getCounter("lifetime_services");
uint64_t life_interval = now - handler_->aliveSince();
// log checkpoint
stringstream message;
message << "checkpoint_time:" << check_interval
<< " checkpoint_services:" << check_count
<< " checkpoint_speed_sum:" << check_duration
<< " lifetime_time:" << life_interval
<< " lifetime_services:" << life_count;
if (featureThreadCheck_ && threadManager_ != nullptr) {
size_t worker_count = threadManager_->workerCount();
size_t idle_count = threadManager_->idleWorkerCount();
message << " total_workers:" << worker_count
<< " active_workers:" << (worker_count - idle_count);
}
logMethod_(4, message.str());
}
/**
 * Stores the server's thread manager so that startService() and
 * reportCheckpoint() can inspect worker counts.
 *
 * @param threadManager The server's thread manager.
 */
void
ServiceTracker::setThreadManager(
  boost::shared_ptr<ThreadManager> threadManager)
{
  this->threadManager_ = threadManager;
}
/**
 * Default logging method of the ServiceTracker: writes
 * "[<LEVEL>] [<timestamp>] <message>" to stdout when the passed level is
 * less than or equal to LOG_LEVEL, and does nothing otherwise.
 *
 * A different function with the same parameters may be passed to the
 * ServiceTracker constructor instead.
 *
 * @param level   Level of the message; higher levels mean more detail.
 *                1 = CRITICAL, 2 = ERROR, 3 = WARNING, 5 = DEBUG; 4 and
 *                any other value are labelled INFO.
 * @param message The message to log.
 */
void
ServiceTracker::defaultLogMethod(int level, const string &message)
{
  // Too detailed for the configured verbosity: drop the message.
  if (level > LOG_LEVEL) {
    return;
  }

  // Labels for levels 1..5; anything outside that range falls back to INFO.
  static const char *const kLevelNames[] = {
    "CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"
  };
  const char *severity =
    (level >= 1 && level <= 5) ? kLevelNames[level - 1] : "INFO";

  // ctime_r() output ends in a newline at index 24; cut it off.
  time_t stamp = time(nullptr);
  char stamp_text[26];
  ctime_r(&stamp, stamp_text);
  stamp_text[24] = '\0';

  cout << '[' << severity << "] [" << stamp_text << "] "
       << message << endl;
}
/**
 * Creates a Stopwatch and starts it immediately, so that it can report
 * the time elapsed since its creation.
 */
Stopwatch::Stopwatch()
{
  // Starting the watch is the same operation as restarting it.
  reset();
}
void
Stopwatch::reset()
{
gettimeofday(&startTime_, nullptr);
}
/**
 * Reports the time elapsed since the stopwatch was started (or last
 * reset), rounded to the nearest whole unit.
 *
 * @param unit  Unit of the result: UNIT_SECONDS, UNIT_MICROSECONDS or
 *              UNIT_MILLISECONDS; any other value is treated as
 *              milliseconds.
 * @param label Optional output; when not null it receives the result
 *              followed by " secs", " us" or " ms".
 * @return uint64_t Elapsed time in the requested unit; 0 if the current
 *                  time of day is earlier than the recorded start time.
 */
uint64_t
Stopwatch::elapsedUnits(Stopwatch::Unit unit, string *label) const
{
  timeval now_time;
  gettimeofday(&now_time, nullptr);

  // Work on one signed 64-bit microsecond total. The tv_usec difference may
  // be negative, and rounding it separately from the seconds (integer
  // division truncates toward zero) would round the wrong way. Widening
  // before the multiplication also avoids overflow where time_t is 32-bit.
  int64_t elapsed_us =
    (static_cast<int64_t>(now_time.tv_sec)
     - static_cast<int64_t>(startTime_.tv_sec)) * 1000000
    + (static_cast<int64_t>(now_time.tv_usec)
       - static_cast<int64_t>(startTime_.tv_usec));
  // gettimeofday() follows the wall clock, which can be set backwards;
  // report zero rather than a wrapped-around unsigned value.
  if (elapsed_us < 0) {
    elapsed_us = 0;
  }

  uint64_t duration_units;
  const char *suffix;
  switch (unit) {
  case UNIT_SECONDS:
    duration_units = static_cast<uint64_t>((elapsed_us + 500000) / 1000000);
    suffix = " secs";
    break;
  case UNIT_MICROSECONDS:
    duration_units = static_cast<uint64_t>(elapsed_us);
    suffix = " us";
    break;
  case UNIT_MILLISECONDS:
  default:
    duration_units = static_cast<uint64_t>((elapsed_us + 500) / 1000);
    suffix = " ms";
    break;
  }

  if (nullptr != label) {
    stringstream ss_label;
    ss_label << duration_units << suffix;
    label->assign(ss_label.str());
  }
  return duration_units;
}
/**
 * Creates a ServiceMethod, which tracks a single service method
 * invocation through a ServiceTracker. The name groups statistics (e.g.
 * counts and durations) of similar invocations; the signature identifies
 * this particular invocation in the log.
 *
 * note: Another version of this constructor automatically forms the
 * signature from the name and a passed numeric id. Silly, sure, but
 * commonly used, since it often saves the caller a line or two of code.
 *
 * @param tracker        The service tracker that will track this
 *                       ServiceMethod.
 * @param name           The service method name (usually independent of
 *                       service method parameters).
 * @param signature      A signature uniquely identifying the method
 *                       invocation (usually name plus parameters).
 * @param featureLogOnly When true, the tracker only logs this invocation:
 *                       it skips the status and thread checks on start and
 *                       the statistics on finish.
 */
ServiceMethod::ServiceMethod(ServiceTracker *tracker,
                             const string &name,
                             const string &signature,
                             bool featureLogOnly)
  : tracker_(tracker),
    name_(name),
    signature_(signature),
    featureLogOnly_(featureLogOnly)
{
  // note: timer_ is already running; it starts at construction.
  // Register the start with the tracker. This may throw, in which case the
  // destructor never runs -- intentionally, because finishService() must
  // only follow a startService() that returned without error.
  tracker_->startService(*this);
}
/**
 * Creates a ServiceMethod whose signature is derived from its name and a
 * numeric id, in the form "<name> (<id>)".
 *
 * @param tracker        The service tracker that will track this
 *                       ServiceMethod.
 * @param name           The service method name, used to group statistics.
 * @param id             Numeric id appended to the name to form the
 *                       signature.
 * @param featureLogOnly When true, the tracker only logs this invocation:
 *                       it skips the status and thread checks on start and
 *                       the statistics on finish.
 */
ServiceMethod::ServiceMethod(ServiceTracker *tracker,
                             const string &name,
                             uint64_t id,
                             bool featureLogOnly)
  : tracker_(tracker), name_(name), featureLogOnly_(featureLogOnly)
{
  // note: timer_ is already running; it starts at construction.
  ostringstream composed;
  composed << name << " (" << id << ')';
  signature_ = composed.str();

  // Register the start with the tracker. This may throw, in which case the
  // destructor never runs -- intentionally, because finishService() must
  // only follow a startService() that returned without error.
  tracker_->startService(*this);
}
/**
 * Reports the end of the service method to the tracker. Never throws.
 */
ServiceMethod::~ServiceMethod()
{
  try {
    tracker_->finishService(*this);
  }
  catch (...) {
    // Deliberately swallowed: finishService() is not expected to throw
    // (an out-of-memory exception is conceivable), and a destructor must
    // not let an exception escape.
  }
}
/**
 * Logs an intermediate step of this service method via the tracker's
 * stepService().
 *
 * @param stepName Name of the step to log.
 * @return uint64_t The elapsed units reported by the tracker.
 */
uint64_t
ServiceMethod::step(const std::string &stepName)
{
  const int64_t elapsed = tracker_->stepService(*this, stepName);
  return static_cast<uint64_t>(elapsed);
}