Fail and retry logic for TSocketPool

Summary: Replicate the PHP TSocketPool logic: if opening a connection to a server fails more than the allowed number of consecutive times, mark that server as down until the retry interval has elapsed

Reviewed By: aditya

Test Plan: compiling thrift - any good test ideas?

Revert: OK

DiffCamp Revision: 8381


git-svn-id: https://svn.apache.org/repos/asf/incubator/thrift/trunk@665534 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/lib/cpp/src/transport/TSocketPool.cpp b/lib/cpp/src/transport/TSocketPool.cpp
index 235d060..af7303b 100644
--- a/lib/cpp/src/transport/TSocketPool.cpp
+++ b/lib/cpp/src/transport/TSocketPool.cpp
@@ -14,6 +14,26 @@
 using namespace std;
 
 /**
+ * TSocketPoolServer implementation
+ *
+ * @author Akhil Wable <akhil@facebook.com>
+ */
+TSocketPoolServer::TSocketPoolServer()
+  : host_(""),
+    port_(0),
+    lastFailTime_(0),
+    consecutiveFailures_(0) {}
+
+/**
+ * Constructor for TSocketPoolServer
+ */
+TSocketPoolServer::TSocketPoolServer(const std::string &host, int port)
+  : host_(host),
+    port_(port),
+    lastFailTime_(0),
+    consecutiveFailures_(0) {}
+
+/**
  * TSocketPool implementation.
  *
  * @author Jason Sobel <jsobel@facebook.com>
@@ -38,13 +58,15 @@
 }
 
 TSocketPool::TSocketPool(const vector<pair<string, int> > servers) : TSocket(),
-  servers_(servers),
   numRetries_(1),
   retryInterval_(60),
   maxConsecutiveFailures_(1),
   randomize_(true),
   alwaysTryLast_(true)
 {
+  for (unsigned i = 0; i < servers.size(); ++i) {
+    addServer(servers[i].first, servers[i].second);
+  }
 }
 
 TSocketPool::TSocketPool(const string& host, int port) : TSocket(),
@@ -62,7 +84,7 @@
 }
 
 void TSocketPool::addServer(const string& host, int port) {
-  servers_.push_back(pair<string, int>(host, port));
+  servers_.push_back(TSocketPoolServer(host, port));
 }
 
 void TSocketPool::setNumRetries(int numRetries) {
@@ -92,20 +114,45 @@
     std::random_shuffle(servers_.begin(), servers_.end());
   }
 
-  for (unsigned int i = 0; i < servers_.size(); ++i) {
-    host_ = servers_[i].first;
-    port_ = servers_[i].second;
+  unsigned int numServers = servers_.size();
+  for (unsigned int i = 0; i < numServers; ++i) {
 
-    for (int j = 0; j < numRetries_; ++j) {
-      try {
-        TSocket::open();
+    TSocketPoolServer &server = servers_[i];
+    bool retryIntervalPassed = (server.lastFailTime_ == 0);
+    bool isLastServer = alwaysTryLast_ ? (i == (numServers - 1)) : false;
 
-        // success
-        return;
-      } catch (TException e) {
-        // connection failed
+    if (server.lastFailTime_ > 0) {
+      // The server was marked as down, so check if enough time has elapsed to retry
+      int elapsedTime = time(NULL) - server.lastFailTime_;
+      if (elapsedTime > retryInterval_) {
+        retryIntervalPassed = true;
       }
     }
+
+    if (retryIntervalPassed || isLastServer) {
+      for (int j = 0; j < numRetries_; ++j) {
+        try {
+          TSocket::open();
+
+          // reset lastFailTime_ if required
+          if (server.lastFailTime_) {
+            server.lastFailTime_ = 0;
+          }
+
+          // success
+          return;
+        } catch (TException e) {
+          // connection failed
+        }
+      }
+    }
+
+    ++server.consecutiveFailures_;
+    if (server.consecutiveFailures_ > maxConsecutiveFailures_) {
+      // Mark server as down
+      server.consecutiveFailures_ = 0;
+      server.lastFailTime_ = time(NULL);
+    }
   }
 
   GlobalOutput("TSocketPool::open: all connections failed");