THRIFT-153. Proper handling of strings with escapes (in IDL)

- Recognize and parse escape characters in .thrift files.
- Escape strings used as constants in generated source files.


git-svn-id: https://svn.apache.org/repos/asf/incubator/thrift/trunk@758922 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/compiler/cpp/src/generate/t_cocoa_generator.cc b/compiler/cpp/src/generate/t_cocoa_generator.cc
index f2121ca..04b0ad7 100644
--- a/compiler/cpp/src/generate/t_cocoa_generator.cc
+++ b/compiler/cpp/src/generate/t_cocoa_generator.cc
@@ -1794,7 +1794,7 @@
     t_base_type::t_base tbase = ((t_base_type*)type)->get_base();
     switch (tbase) {
     case t_base_type::TYPE_STRING:
-      render << "@\"" + value->get_string() + "\"";
+      render << "@\"" << get_escaped_string(value) << '"';
       break;
     case t_base_type::TYPE_BOOL:
       render << ((value->get_integer() > 0) ? "YES" : "NO");
diff --git a/compiler/cpp/src/generate/t_cpp_generator.cc b/compiler/cpp/src/generate/t_cpp_generator.cc
index 5ad390a..c9b2055 100644
--- a/compiler/cpp/src/generate/t_cpp_generator.cc
+++ b/compiler/cpp/src/generate/t_cpp_generator.cc
@@ -538,7 +538,7 @@
     t_base_type::t_base tbase = ((t_base_type*)type)->get_base();
     switch (tbase) {
     case t_base_type::TYPE_STRING:
-      render << "\"" + value->get_string() + "\"";
+      render << '"' << get_escaped_string(value) << '"';
       break;
     case t_base_type::TYPE_BOOL:
       render << ((value->get_integer() > 0) ? "true" : "false");
diff --git a/compiler/cpp/src/generate/t_csharp_generator.cc b/compiler/cpp/src/generate/t_csharp_generator.cc
index 6387906..f4c6f89 100644
--- a/compiler/cpp/src/generate/t_csharp_generator.cc
+++ b/compiler/cpp/src/generate/t_csharp_generator.cc
@@ -330,7 +330,7 @@
     t_base_type::t_base tbase = ((t_base_type*)type)->get_base();
     switch (tbase) {
       case t_base_type::TYPE_STRING:
-        render << "\"" + value->get_string() + "\"";
+        render << '"' << get_escaped_string(value) << '"';
         break;
       case t_base_type::TYPE_BOOL:
         render << ((value->get_integer() > 0) ? "true" : "false");
diff --git a/compiler/cpp/src/generate/t_erl_generator.cc b/compiler/cpp/src/generate/t_erl_generator.cc
index e7648ba..1027cd2 100644
--- a/compiler/cpp/src/generate/t_erl_generator.cc
+++ b/compiler/cpp/src/generate/t_erl_generator.cc
@@ -315,7 +315,7 @@
     t_base_type::t_base tbase = ((t_base_type*)type)->get_base();
     switch (tbase) {
     case t_base_type::TYPE_STRING:
-      out << "\"" << value->get_string() << "\"";
+      out << '"' << get_escaped_string(value) << '"';
       break;
     case t_base_type::TYPE_BOOL:
       out << (value->get_integer() > 0 ? "true" : "false");
diff --git a/compiler/cpp/src/generate/t_generator.cc b/compiler/cpp/src/generate/t_generator.cc
index 6766cd1..a04446b 100644
--- a/compiler/cpp/src/generate/t_generator.cc
+++ b/compiler/cpp/src/generate/t_generator.cc
@@ -59,6 +59,19 @@
   close_generator();
 }
 
+string t_generator::escape_string(const string &in) const {
+  string result = "";
+  for (string::const_iterator it = in.begin(); it < in.end(); it++) {
+    std::map<char, std::string>::const_iterator res = escape_.find(*it);
+    if (res != escape_.end()) {
+      result.append(res->second);
+    } else {
+      result.push_back(*it);
+    }
+  }
+  return result;
+}
+
 void t_generator::generate_consts(vector<t_const*> consts) {
   vector<t_const*>::iterator c_iter;
   for (c_iter = consts.begin(); c_iter != consts.end(); ++c_iter) {
diff --git a/compiler/cpp/src/generate/t_generator.h b/compiler/cpp/src/generate/t_generator.h
index 060e656..8fdf5ff 100644
--- a/compiler/cpp/src/generate/t_generator.h
+++ b/compiler/cpp/src/generate/t_generator.h
@@ -27,6 +27,11 @@
     indent_ = 0;
     program_ = program;
     program_name_ = get_program_name(program);
+    escape_['\n'] = "\\n";
+    escape_['\r'] = "\\r";
+    escape_['\t'] = "\\t";
+    escape_['"']  = "\\\"";
+    escape_['\\'] = "\\\\";
   }
 
   virtual ~t_generator() {}
@@ -45,6 +50,16 @@
                                   const std::string& line_prefix,
                                   const std::string& contents,
                                   const std::string& comment_end);
+
+  /**
+   * Escape string to use one in generated sources.
+   */
+  virtual std::string escape_string(const std::string &in) const;
+
+  std::string get_escaped_string(t_const_value* constval) {
+    return escape_string(constval->get_string());
+  }
+
  protected:
 
   /**
@@ -184,6 +199,11 @@
    */
   std::string out_dir_base_;
 
+  /**
+   * Map of characters to escape in string literals.
+   */
+  std::map<char, std::string> escape_;
+
  private:
   /**
    * Current code indentation level
diff --git a/compiler/cpp/src/generate/t_hs_generator.cc b/compiler/cpp/src/generate/t_hs_generator.cc
index 18a07f0..94fb959 100644
--- a/compiler/cpp/src/generate/t_hs_generator.cc
+++ b/compiler/cpp/src/generate/t_hs_generator.cc
@@ -303,7 +303,7 @@
     t_base_type::t_base tbase = ((t_base_type*)type)->get_base();
     switch (tbase) {
     case t_base_type::TYPE_STRING:
-      out << "\"" << value->get_string() << "\"";
+      out << '"' << get_escaped_string(value) << '"';
       break;
     case t_base_type::TYPE_BOOL:
       out << (value->get_integer() > 0 ? "True" : "False");
diff --git a/compiler/cpp/src/generate/t_html_generator.cc b/compiler/cpp/src/generate/t_html_generator.cc
index fc05709..671019e 100644
--- a/compiler/cpp/src/generate/t_html_generator.cc
+++ b/compiler/cpp/src/generate/t_html_generator.cc
@@ -32,6 +32,12 @@
     : t_generator(program)
   {
     out_dir_base_ = "gen-html";
+    escape_.clear();
+    escape_['&']  = "&amp;";
+    escape_['<']  = "&lt;";
+    escape_['>']  = "&gt;";
+    escape_['"']  = "&quot;";
+    escape_['\''] = "&apos;";
   }
 
   void generate_program();
@@ -396,7 +402,7 @@
     f_out_ << tvalue->get_double();
     break;
   case t_const_value::CV_STRING:
-    f_out_ << "\"" << tvalue->get_string() << "\"";
+    f_out_ << '"' << get_escaped_string(tvalue) << '"';
     break;
   case t_const_value::CV_MAP:
     {
diff --git a/compiler/cpp/src/generate/t_java_generator.cc b/compiler/cpp/src/generate/t_java_generator.cc
index e258a47..6e2d0b3 100644
--- a/compiler/cpp/src/generate/t_java_generator.cc
+++ b/compiler/cpp/src/generate/t_java_generator.cc
@@ -519,7 +519,7 @@
     t_base_type::t_base tbase = ((t_base_type*)type)->get_base();
     switch (tbase) {
     case t_base_type::TYPE_STRING:
-      render << "\"" + value->get_string() + "\"";
+      render << '"' << get_escaped_string(value) << '"';
       break;
     case t_base_type::TYPE_BOOL:
       render << ((value->get_integer() > 0) ? "true" : "false");
diff --git a/compiler/cpp/src/generate/t_ocaml_generator.cc b/compiler/cpp/src/generate/t_ocaml_generator.cc
index a60e00c..8c8f0c0 100644
--- a/compiler/cpp/src/generate/t_ocaml_generator.cc
+++ b/compiler/cpp/src/generate/t_ocaml_generator.cc
@@ -361,7 +361,7 @@
     t_base_type::t_base tbase = ((t_base_type*)type)->get_base();
     switch (tbase) {
     case t_base_type::TYPE_STRING:
-      out << "\"" << value->get_string() << "\"";
+      out << '"' << get_escaped_string(value) << '"';
       break;
     case t_base_type::TYPE_BOOL:
       out << (value->get_integer() > 0 ? "true" : "false");
diff --git a/compiler/cpp/src/generate/t_perl_generator.cc b/compiler/cpp/src/generate/t_perl_generator.cc
index 02ac779..be72891 100644
--- a/compiler/cpp/src/generate/t_perl_generator.cc
+++ b/compiler/cpp/src/generate/t_perl_generator.cc
@@ -31,6 +31,8 @@
     : t_oop_generator(program)
   {
     out_dir_base_ = "gen-perl";
+    escape_['$'] = "\\$";
+    escape_['@'] = "\\@";
   }
 
   /**
@@ -328,7 +330,7 @@
     t_base_type::t_base tbase = ((t_base_type*)type)->get_base();
     switch (tbase) {
     case t_base_type::TYPE_STRING:
-      out << "'" << value->get_string() << "'";
+      out << '"' << get_escaped_string(value) << '"';
       break;
     case t_base_type::TYPE_BOOL:
       out << (value->get_integer() > 0 ? "1" : "0");
@@ -1796,5 +1798,4 @@
   throw "INVALID TYPE IN type_to_enum: " + type->get_name();
 }
 
-
 THRIFT_REGISTER_GENERATOR(perl, "Perl", "");
diff --git a/compiler/cpp/src/generate/t_php_generator.cc b/compiler/cpp/src/generate/t_php_generator.cc
index f74076c..7b3e617 100644
--- a/compiler/cpp/src/generate/t_php_generator.cc
+++ b/compiler/cpp/src/generate/t_php_generator.cc
@@ -51,6 +51,7 @@
     }
 
     out_dir_base_ = (binary_inline_ ? "gen-phpi" : "gen-php");
+    escape_['$'] = "\\$";
   }
 
   /**
@@ -371,7 +372,7 @@
     t_base_type::t_base tbase = ((t_base_type*)type)->get_base();
     switch (tbase) {
     case t_base_type::TYPE_STRING:
-      out << "'" << value->get_string() << "'";
+      out << '"' << get_escaped_string(value) << '"';
       break;
     case t_base_type::TYPE_BOOL:
       out << (value->get_integer() > 0 ? "true" : "false");
diff --git a/compiler/cpp/src/generate/t_py_generator.cc b/compiler/cpp/src/generate/t_py_generator.cc
index cde70a9..59a4705 100644
--- a/compiler/cpp/src/generate/t_py_generator.cc
+++ b/compiler/cpp/src/generate/t_py_generator.cc
@@ -387,7 +387,7 @@
     t_base_type::t_base tbase = ((t_base_type*)type)->get_base();
     switch (tbase) {
     case t_base_type::TYPE_STRING:
-      out << "'" << value->get_string() << "'";
+      out << '"' << get_escaped_string(value) << '"';
       break;
     case t_base_type::TYPE_BOOL:
       out << (value->get_integer() > 0 ? "True" : "False");
diff --git a/compiler/cpp/src/generate/t_rb_generator.cc b/compiler/cpp/src/generate/t_rb_generator.cc
index d3ac711..2a7f7d9 100644
--- a/compiler/cpp/src/generate/t_rb_generator.cc
+++ b/compiler/cpp/src/generate/t_rb_generator.cc
@@ -338,7 +338,7 @@
     t_base_type::t_base tbase = ((t_base_type*)type)->get_base();
     switch (tbase) {
     case t_base_type::TYPE_STRING:
-      out << "%q\"" << value->get_string() << '"';
+      out << "%q\"" << get_escaped_string(value) << '"';
       break;
     case t_base_type::TYPE_BOOL:
       out << (value->get_integer() > 0 ? "true" : "false");
diff --git a/compiler/cpp/src/generate/t_st_generator.cc b/compiler/cpp/src/generate/t_st_generator.cc
index 960db63..76a8c9d 100644
--- a/compiler/cpp/src/generate/t_st_generator.cc
+++ b/compiler/cpp/src/generate/t_st_generator.cc
@@ -348,7 +348,7 @@
     t_base_type::t_base tbase = ((t_base_type*)type)->get_base();
     switch (tbase) {
     case t_base_type::TYPE_STRING:
-      out << "'" << value->get_string() << "'";
+      out << '"' << get_escaped_string(value) << '"';
       break;
     case t_base_type::TYPE_BOOL:
       out << (value->get_integer() > 0 ? "true" : "false");
diff --git a/compiler/cpp/src/thriftl.ll b/compiler/cpp/src/thriftl.ll
index 0b4e960..a700ca3 100644
--- a/compiler/cpp/src/thriftl.ll
+++ b/compiler/cpp/src/thriftl.ll
@@ -14,6 +14,7 @@
 
 %{
 
+#include <string>
 #include <errno.h>
 
 #include "main.h"
@@ -58,10 +59,8 @@
 comment       ("//"[^\n]*)
 unixcomment   ("#"[^\n]*)
 symbol        ([:;\,\{\}\(\)\=<>\[\]])
-dliteral      ("\""[^"]*"\"")
-sliteral      ("'"[^']*"'")
 st_identifier ([a-zA-Z-][\.a-zA-Z_0-9-]*)
-
+literal_begin (['\"])
 
 %%
 
@@ -222,17 +221,56 @@
   return tok_st_identifier;
 }
 
-{dliteral} {
-  yylval.id = strdup(yytext+1);
-  yylval.id[strlen(yylval.id)-1] = '\0';
-  return tok_literal;
+{literal_begin} {
+  char mark = yytext[0];
+  std::string result;
+  for(;;)
+  {
+    int ch = yyinput();
+    switch (ch) {
+      case EOF:
+        yyerror("End of file while read string at %d\n", yylineno);
+        exit(1);
+      case '\n':
+        yyerror("End of line while read string at %d\n", yylineno - 1);
+        exit(1);
+      case '\\':
+        ch = yyinput();
+        switch (ch) {
+          case 'r':
+            result.push_back('\r');
+            continue;
+          case 'n':
+            result.push_back('\n');
+            continue;
+          case 't':
+            result.push_back('\t');
+            continue;
+          case '"':
+            result.push_back('"');
+            continue;
+          case '\'':
+            result.push_back('\'');
+            continue;
+          case '\\':
+            result.push_back('\\');
+            continue;
+          default:
+            yyerror("Bad escape character\n");
+            return -1;
+        }
+        break;
+      default:
+        if (ch == mark) {
+          yylval.id = strdup(result.c_str());
+          return tok_literal;
+        } else {
+          result.push_back(ch);
+        }
+    }
+  }
 }
 
-{sliteral} {
-  yylval.id = strdup(yytext+1);
-  yylval.id[strlen(yylval.id)-1] = '\0';
-  return tok_literal;
-}
 
 {doctext} {
  /* This does not show up in the parse tree. */