THRIFT-2375 Excessive <br>'s in generated HTML
Patch: Jens Geyer
diff --git a/compiler/cpp/src/generate/t_html_generator.cc b/compiler/cpp/src/generate/t_html_generator.cc
index 701fdd1..8ac7ddb 100644
--- a/compiler/cpp/src/generate/t_html_generator.cc
+++ b/compiler/cpp/src/generate/t_html_generator.cc
@@ -70,6 +70,8 @@
escape_['>'] = "&gt;";
escape_['"'] = "&quot;";
escape_['\''] = "&apos;";
+
+ init_allowed__markup();
}
void generate_program();
@@ -79,12 +81,14 @@
std::vector<t_program*>& finished);
void generate_index();
std::string escape_html(std::string const & str);
+ std::string escape_html_tags(std::string const & str);
void generate_css();
void generate_css_content(std::ofstream & f_target);
void generate_style_tag();
std::string make_file_link( std::string name);
bool is_utf8_sequence(std::string const & str, size_t firstpos);
void detect_input_encoding(std::string const & str, size_t firstpos);
+ void init_allowed__markup();
/**
* Program-level generation functions
@@ -106,7 +110,7 @@
std::ofstream f_out_;
std::string current_file_;
input_type input_type_;
-
+ std::map<std::string, int> allowed_markup;
bool standalone_;
};
@@ -395,21 +399,7 @@
*/
void t_html_generator::print_doc(t_doc* tdoc) {
if (tdoc->has_doc()) {
- string doc = tdoc->get_doc();
- size_t index;
- while ((index = doc.find_first_of("\r\n")) != string::npos) {
- if (index == 0) {
- f_out_ << "<p/>" << endl;
- } else {
- f_out_ << escape_html( doc.substr(0, index)) << endl;
- }
- if (index + 1 < doc.size() && doc.at(index) != doc.at(index + 1) &&
- (doc.at(index + 1) == '\r' || doc.at(index + 1) == '\n')) {
- index++;
- }
- doc = doc.substr(index + 1);
- }
- f_out_ << escape_html(doc) << "<br/>";
+ f_out_ << escape_html(tdoc->get_doc()) << "<br/>"; // whole doc text is escaped in one call; exactly one <br/> follows it
}
}
@@ -462,12 +452,141 @@
input_type_ = INPUT_PLAIN;
}
-std::string t_html_generator::escape_html(std::string const & str) {
+/**
+ * Fills allowed_markup with the markup that may appear in doc comments.
+ * Keys are lower-case tag names; a closing tag has its own key with a
+ * leading '/'. The mapped value is always 1. Any previous content of the
+ * map is discarded.
+ *
+ * NOTE(review): identifiers containing "__" are reserved in C++; renaming
+ * this method would also require changing its declaration and call site.
+ */
+void t_html_generator::init_allowed__markup() {
+  // standalone tags
+  static const char* const standalone_tags[] = {
+    "br",
+    "br/",
+    "img"
+  };
+
+  // paired tags, registered below as both "name" and "/name"
+  static const char* const paired_tags[] = {
+    "b",
+    "u",
+    "i",
+    "s",
+    "big",
+    "small",
+    "sup",
+    "sub",
+    "pre",
+    "tt",
+    "ul",
+    "ol",
+    "li",
+    "a",
+    "p",
+    "code",
+    "dl",
+    "dt",
+    "dd",
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6"
+  };
+
+  allowed_markup.clear();
+  // register every standalone tag exactly as listed
+  for (size_t i = 0; i < sizeof(standalone_tags) / sizeof(standalone_tags[0]); ++i) {
+    allowed_markup[standalone_tags[i]] = 1;
+  }
+  // register the opening and the closing form of every paired tag
+  for (size_t i = 0; i < sizeof(paired_tags) / sizeof(paired_tags[0]); ++i) {
+    const std::string tag_name(paired_tags[i]);
+    allowed_markup[tag_name] = 1;
+    allowed_markup["/" + tag_name] = 1;
+  }
+}
+
+/**
+ * Escapes markup in a doc string. A tag whose lower-cased first word is a
+ * key of allowed_markup (e.g. "b", "/b", "br/") is passed through unchanged,
+ * including its attributes; every other '<' or '>' is replaced by its HTML
+ * entity so that it shows up as literal text. A tag that is still open at
+ * the end of the input is treated as if it had been closed there.
+ *
+ * @param str doc-comment text to process
+ * @return    the text with all non-whitelisted markup escaped
+ */
+std::string t_html_generator::escape_html_tags(std::string const & str) {
+  std::ostringstream result;
+  size_t firstpos = 0;
+  while( firstpos < str.length()) {
+    // look for the next markup delimiter
+    const size_t lastpos = str.find_first_of("<>", firstpos);
+    if( lastpos == std::string::npos) {
+      // no more markup: copy the remainder and stop
+      result << str.substr( firstpos);
+      break;
+    }
+    // copy what we got so far
+    result << str.substr( firstpos, lastpos-firstpos);
+    firstpos = lastpos + 1;
+
+    // tag end without corresponding begin
+    if( '>' == str[lastpos]) {
+      result << "&gt;";
+      continue;
+    }
+
+    // extract the tag: everything up to the closing '>' or the end of input
+    std::ostringstream tagstream;
+    while( firstpos < str.length()) {
+      const char c = str[firstpos];
+      ++firstpos;
+      if('<'==c) {
+        tagstream << "&lt;"; // nested begin?
+      } else if('>'==c) {
+        break;
+      } else {
+        tagstream << c; // not very efficient, but tags should be quite short
+      }
+    }
+
+    // the lookup key is the first word of the tag, compared case-insensitively
+    const std::string tag_content = tagstream.str();
+    std::string tag_key = tag_content;
+    const size_t first_white = tag_key.find_first_of(" \t\f\v\n\r");
+    if( first_white != std::string::npos) {
+      tag_key.erase(first_white);
+    }
+    for (std::string::size_type i=0; i<tag_key.length(); ++i) {
+      // tolower() is undefined for negative char values, so go via unsigned char
+      const unsigned char uc = static_cast<unsigned char>(tag_key[i]);
+      tag_key[i] = static_cast<char>(tolower(uc));
+    }
+
+    // we allow for several markup in docstrings, all else will become escaped
+    if( allowed_markup.find(tag_key) != allowed_markup.end()) {
+      result << "<" << tag_content << ">";
+    } else {
+      result << "&lt;" << tag_content << "&gt;";
+      pverbose("illegal markup <%s> in doc-comment\n", tag_key.c_str());
+    }
+  }
+
+  return result.str();
+}
+
+std::string t_html_generator::escape_html(std::string const & str) {
// the generated HTML header says it is UTF-8 encoded
// if UTF-8 input has been detected before, we don't need to change anything
if( input_type_ == INPUT_UTF8) {
- return str;
+ return escape_html_tags(str);
}
// convert unsafe chars to their &#<num>; equivalent
@@ -495,6 +614,11 @@
firstpos = lastpos;
}
+ // reached the end?
+ if( firstpos >= str.length()) {
+ break;
+ }
+
// some control code?
if( (0 <= ic) && (31 >= ic))
{
@@ -502,10 +626,10 @@
{
case '\r' :
case '\n' :
- result << "<br/>";
- break;
case '\t' :
- result << " ";
+ result << c;
+ break;
+ default: // silently consume all other ctrl chars
break;
}
++firstpos;
@@ -538,7 +662,7 @@
}
}
- return result.str();
+ return escape_html_tags(result.str());
}
/**