Replaced regular expressions with find & replace, where possible.
continuous-integration/drone/push Build is failing Details

This commit is contained in:
tastytea 2019-05-17 05:43:17 +02:00
parent 27b3d1cc55
commit f9563cddcd
Signed by: tastytea
GPG Key ID: CFC39497F1B26E07
3 changed files with 40 additions and 15 deletions

View File

@ -1,6 +1,6 @@
cmake_minimum_required (VERSION 3.2)
project(remwharead
VERSION 0.1.1
VERSION 0.1.2
LANGUAGES CXX
)

View File

@ -98,29 +98,53 @@ const string URI::strip_html(const string &html)
{
string out;
out = regex_replace(html, regex("<script[^<]+"), ""); // Remove JavaScript.
out = regex_replace(out, regex("<style[^<]+"), ""); // Remove CSS.
out = remove_html_tags(out); // Remove tags.
out = regex_replace(out, regex("\r"), ""); // Remove CR.
// Drop <script> and <style> elements together with their content first,
// so that their bodies do not survive as plain text once tags are stripped.
out = remove_html_tags(html, "script"); // Remove JavaScript.
out = remove_html_tags(out, "style"); // Remove CSS.
out = remove_html_tags(out); // Remove tags.
size_t pos = 0;
// Resume the search at the position of the last removal instead of
// rescanning from the start of the string on every iteration.
while ((pos = out.find('\r', pos)) != std::string::npos) // Remove CR.
{
    out.erase(pos, 1);
}
out = regex_replace(out, regex("\\s+\n"), "\n"); // Remove trailing space.
out = regex_replace(out, regex("\n{2,}"), "\n"); // Reduce newlines.
return unescape_html(out);
}
const string URI::remove_html_tags(const string &html)
const string URI::remove_html_tags(const string &html, const string &tag)
{
// NOTE: I did this with regex_replace before, but libstdc++ segfaulted.
string out;
size_t pos = 0;
while (pos != std::string::npos)
if (tag.empty())
{
size_t startpos = html.find('<', pos);
size_t endpos = html.find('>', startpos);
out += html.substr(pos, startpos - pos);
pos = endpos;
if (pos != std::string::npos)
size_t pos = 0;
while (pos != std::string::npos)
{
++pos;
size_t startpos = html.find('<', pos);
size_t endpos = html.find('>', startpos);
out += html.substr(pos, startpos - pos);
pos = endpos;
if (pos != std::string::npos)
{
++pos;
}
}
}
else
{
size_t pos = 0;
out = html;
while ((pos = out.find("<" + tag)) != std::string::npos)
{
size_t endpos = out.find("</" + tag, pos);
if (endpos == std::string::npos)
{
break;
}
endpos += 3 + tag.length(); // tag + </ + >
out.replace(pos, endpos - pos, "");
}
}

View File

@ -44,7 +44,8 @@ protected:
const string extract_title(const string &html);
const string extract_description(const string &html);
const string strip_html(const string &html);
const string remove_html_tags(const string &html);
//! Remove all HTML tags. If tag is not empty, remove tag and its content.
const string remove_html_tags(const string &html, const string &tag = "");
const string unescape_html(const string &html);
const string remove_newlines(const string &text);
};