Add helpers::unescape_html() & tests.
This commit is contained in:
parent
7f31d897cf
commit
8a9be5d45b
147
src/helpers.cpp
147
src/helpers.cpp
|
@ -5,6 +5,7 @@
|
|||
#include <codecvt>
|
||||
#include <locale>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
namespace epubgrep::helpers
|
||||
|
@ -35,4 +36,150 @@ std::string urldecode(const std::string_view url)
|
|||
return decoded;
|
||||
}
|
||||
|
||||
namespace
{

//! Append the UTF-8 encoding of @p codepoint to @p output.
//!
//! @return `false`, leaving @p output untouched, if @p codepoint is not a
//!         Unicode scalar value (a surrogate or a value above U+10FFFF).
bool entity_append_utf8(std::string &output, const char32_t codepoint)
{
    const bool is_surrogate{codepoint >= 0xD800 && codepoint <= 0xDFFF};
    if (is_surrogate || codepoint > 0x10FFFF)
    {
        return false;
    }

    if (codepoint < 0x80)
    {
        output += static_cast<char>(codepoint);
    }
    else if (codepoint < 0x800)
    {
        output += static_cast<char>(0xC0 | (codepoint >> 6));
        output += static_cast<char>(0x80 | (codepoint & 0x3F));
    }
    else if (codepoint < 0x10000)
    {
        output += static_cast<char>(0xE0 | (codepoint >> 12));
        output += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        output += static_cast<char>(0x80 | (codepoint & 0x3F));
    }
    else
    {
        output += static_cast<char>(0xF0 | (codepoint >> 18));
        output += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
        output += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
        output += static_cast<char>(0x80 | (codepoint & 0x3F));
    }

    return true;
}

//! Decode the part of a numeric character reference that follows "&#" and
//! precedes ";" and append the resulting character to @p output.
//!
//! Accepts 1 to 8 decimal digits, or 'x' / 'X' followed by 1 to 8
//! hexadecimal digits.
//!
//! @return `false`, leaving @p output untouched, if @p reference is malformed
//!         or does not denote a valid code point.
bool entity_append_numeric(std::string &output, std::string_view reference)
{
    unsigned long base{10};
    if (!reference.empty()
        && (reference.front() == 'x' || reference.front() == 'X'))
    {
        base = 16;
        reference.remove_prefix(1);
    }

    // At most 8 digits, so the value always fits into 32 bits.
    if (reference.empty() || reference.size() > 8)
    {
        return false;
    }

    unsigned long value{0};
    for (const char digit : reference)
    {
        unsigned long digit_value{0};
        if (digit >= '0' && digit <= '9')
        {
            digit_value = static_cast<unsigned long>(digit - '0');
        }
        else if (base == 16 && digit >= 'a' && digit <= 'f')
        {
            digit_value = static_cast<unsigned long>(digit - 'a') + 10;
        }
        else if (base == 16 && digit >= 'A' && digit <= 'F')
        {
            digit_value = static_cast<unsigned long>(digit - 'A') + 10;
        }
        else
        {
            return false; // Not a digit of this base.
        }
        value = value * base + digit_value;
    }

    return entity_append_utf8(output, static_cast<char32_t>(value));
}

} // namespace

//! Replace HTML character references in @p html with UTF-8 text.
//!
//! Handles named references from the table below (`&amp;`), decimal
//! references (`&#38;`) and hexadecimal references (`&#x26;`, `&#X26;`).
//! Anything that is not a complete, valid reference (unknown name, missing
//! ';', bad digits, invalid code point) is copied to the result unchanged.
//!
//! @param html Text that may contain character references.
//! @return A copy of @p html with all recognised references decoded.
std::string unescape_html(const std::string_view html)
{
    // Source: https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_
    //         entity_references#Character_entity_references_in_HTML
    // Built once; the table is read-only afterwards.
    static const std::map<std::string_view, char32_t>
        names{{"exclamation", 0x0021}, {"quot", 0x0022}, {"percent", 0x0025},
              {"amp", 0x0026}, {"apos", 0x0027}, {"add", 0x002B},
              {"lt", 0x003C}, {"equal", 0x003D}, {"gt", 0x003E},
              // Standard HTML spellings of the non-standard names above.
              {"excl", 0x0021}, {"percnt", 0x0025}, {"plus", 0x002B},
              {"equals", 0x003D},
              {"nbsp", 0x00A0}, {"iexcl", 0x00A1}, {"cent", 0x00A2},
              {"pound", 0x00A3}, {"curren", 0x00A4}, {"yen", 0x00A5},
              {"brvbar", 0x00A6}, {"sect", 0x00A7}, {"uml", 0x00A8},
              {"copy", 0x00A9}, {"ordf", 0x00AA}, {"laquo", 0x00AB},
              {"not", 0x00AC}, {"shy", 0x00AD}, {"reg", 0x00AE},
              {"macr", 0x00AF}, {"deg", 0x00B0}, {"plusmn", 0x00B1},
              {"sup2", 0x00B2}, {"sup3", 0x00B3}, {"acute", 0x00B4},
              {"micro", 0x00B5}, {"para", 0x00B6}, {"middot", 0x00B7},
              {"cedil", 0x00B8}, {"sup1", 0x00B9}, {"ordm", 0x00BA},
              {"raquo", 0x00BB}, {"frac14", 0x00BC}, {"frac12", 0x00BD},
              {"frac34", 0x00BE}, {"iquest", 0x00BF}, {"Agrave", 0x00C0},
              {"Aacute", 0x00C1}, {"Acirc", 0x00C2}, {"Atilde", 0x00C3},
              {"Auml", 0x00C4}, {"Aring", 0x00C5}, {"AElig", 0x00C6},
              {"Ccedil", 0x00C7}, {"Egrave", 0x00C8}, {"Eacute", 0x00C9},
              {"Ecirc", 0x00CA}, {"Euml", 0x00CB}, {"Igrave", 0x00CC},
              {"Iacute", 0x00CD}, {"Icirc", 0x00CE}, {"Iuml", 0x00CF},
              {"ETH", 0x00D0}, {"Ntilde", 0x00D1}, {"Ograve", 0x00D2},
              {"Oacute", 0x00D3}, {"Ocirc", 0x00D4}, {"Otilde", 0x00D5},
              {"Ouml", 0x00D6}, {"times", 0x00D7}, {"Oslash", 0x00D8},
              {"Ugrave", 0x00D9}, {"Uacute", 0x00DA}, {"Ucirc", 0x00DB},
              {"Uuml", 0x00DC}, {"Yacute", 0x00DD}, {"THORN", 0x00DE},
              {"szlig", 0x00DF}, {"agrave", 0x00E0}, {"aacute", 0x00E1},
              {"acirc", 0x00E2}, {"atilde", 0x00E3}, {"auml", 0x00E4},
              {"aring", 0x00E5}, {"aelig", 0x00E6}, {"ccedil", 0x00E7},
              {"egrave", 0x00E8}, {"eacute", 0x00E9}, {"ecirc", 0x00EA},
              {"euml", 0x00EB}, {"igrave", 0x00EC}, {"iacute", 0x00ED},
              {"icirc", 0x00EE}, {"iuml", 0x00EF}, {"eth", 0x00F0},
              {"ntilde", 0x00F1}, {"ograve", 0x00F2}, {"oacute", 0x00F3},
              {"ocirc", 0x00F4}, {"otilde", 0x00F5}, {"ouml", 0x00F6},
              {"divide", 0x00F7}, {"oslash", 0x00F8}, {"ugrave", 0x00F9},
              {"uacute", 0x00FA}, {"ucirc", 0x00FB}, {"uuml", 0x00FC},
              {"yacute", 0x00FD}, {"thorn", 0x00FE}, {"yuml", 0x00FF},
              {"OElig", 0x0152}, {"oelig", 0x0153}, {"Scaron", 0x0160},
              {"scaron", 0x0161}, {"Yuml", 0x0178}, {"fnof", 0x0192},
              {"circ", 0x02C6}, {"tilde", 0x02DC}, {"Alpha", 0x0391},
              {"Beta", 0x0392}, {"Gamma", 0x0393}, {"Delta", 0x0394},
              {"Epsilon", 0x0395}, {"Zeta", 0x0396}, {"Eta", 0x0397},
              {"Theta", 0x0398}, {"Iota", 0x0399}, {"Kappa", 0x039A},
              {"Lambda", 0x039B}, {"Mu", 0x039C}, {"Nu", 0x039D},
              {"Xi", 0x039E}, {"Omicron", 0x039F}, {"Pi", 0x03A0},
              {"Rho", 0x03A1}, {"Sigma", 0x03A3}, {"Tau", 0x03A4},
              {"Upsilon", 0x03A5}, {"Phi", 0x03A6}, {"Chi", 0x03A7},
              {"Psi", 0x03A8}, {"Omega", 0x03A9}, {"alpha", 0x03B1},
              {"beta", 0x03B2}, {"gamma", 0x03B3}, {"delta", 0x03B4},
              {"epsilon", 0x03B5}, {"zeta", 0x03B6}, {"eta", 0x03B7},
              {"theta", 0x03B8}, {"iota", 0x03B9}, {"kappa", 0x03BA},
              {"lambda", 0x03BB}, {"mu", 0x03BC}, {"nu", 0x03BD},
              {"xi", 0x03BE}, {"omicron", 0x03BF}, {"pi", 0x03C0},
              {"rho", 0x03C1}, {"sigmaf", 0x03C2}, {"sigma", 0x03C3},
              {"tau", 0x03C4}, {"upsilon", 0x03C5}, {"phi", 0x03C6},
              {"chi", 0x03C7}, {"psi", 0x03C8}, {"omega", 0x03C9},
              {"thetasym", 0x03D1}, {"upsih", 0x03D2}, {"piv", 0x03D6},
              {"ensp", 0x2002}, {"emsp", 0x2003}, {"thinsp", 0x2009},
              {"zwnj", 0x200C}, {"zwj", 0x200D}, {"lrm", 0x200E},
              {"rlm", 0x200F}, {"ndash", 0x2013}, {"mdash", 0x2014},
              {"horbar", 0x2015}, {"lsquo", 0x2018}, {"rsquo", 0x2019},
              {"sbquo", 0x201A}, {"ldquo", 0x201C}, {"rdquo", 0x201D},
              {"bdquo", 0x201E}, {"dagger", 0x2020}, {"Dagger", 0x2021},
              {"bull", 0x2022}, {"hellip", 0x2026}, {"permil", 0x2030},
              {"prime", 0x2032}, {"Prime", 0x2033}, {"lsaquo", 0x2039},
              {"rsaquo", 0x203A}, {"oline", 0x203E}, {"frasl", 0x2044},
              {"euro", 0x20AC}, {"image", 0x2111}, {"weierp", 0x2118},
              {"real", 0x211C}, {"trade", 0x2122}, {"alefsym", 0x2135},
              {"larr", 0x2190}, {"uarr", 0x2191}, {"rarr", 0x2192},
              {"darr", 0x2193}, {"harr", 0x2194}, {"crarr", 0x21B5},
              {"lArr", 0x21D0}, {"uArr", 0x21D1}, {"rArr", 0x21D2},
              {"dArr", 0x21D3}, {"hArr", 0x21D4}, {"forall", 0x2200},
              {"part", 0x2202}, {"exist", 0x2203}, {"empty", 0x2205},
              {"nabla", 0x2207}, {"isin", 0x2208}, {"notin", 0x2209},
              {"ni", 0x220B}, {"prod", 0x220F}, {"sum", 0x2211},
              {"minus", 0x2212}, {"lowast", 0x2217}, {"radic", 0x221A},
              {"prop", 0x221D}, {"infin", 0x221E}, {"ang", 0x2220},
              {"and", 0x2227}, {"or", 0x2228}, {"cap", 0x2229},
              {"cup", 0x222A}, {"int", 0x222B}, {"there4", 0x2234},
              {"sim", 0x223C}, {"cong", 0x2245}, {"asymp", 0x2248},
              {"ne", 0x2260}, {"equiv", 0x2261}, {"le", 0x2264},
              {"ge", 0x2265}, {"sub", 0x2282}, {"sup", 0x2283},
              {"nsub", 0x2284}, {"sube", 0x2286}, {"supe", 0x2287},
              {"oplus", 0x2295}, {"otimes", 0x2297}, {"perp", 0x22A5},
              {"sdot", 0x22C5}, {"lceil", 0x2308}, {"rceil", 0x2309},
              {"lfloor", 0x230A}, {"rfloor", 0x230B}, {"lang", 0x2329},
              {"rang", 0x232A}, {"loz", 0x25CA}, {"spades", 0x2660},
              {"clubs", 0x2663}, {"hearts", 0x2665}, {"diams", 0x2666}};

    // Upper bound for the text between '&' and ';'. Longer than every name
    // in the table and every numeric reference; it keeps the search for ';'
    // bounded so that input full of stray '&' is still processed in linear
    // time.
    constexpr std::size_t max_reference_length{32};

    std::string output;
    output.reserve(html.size());

    std::size_t pos{0};
    while (pos < html.size())
    {
        const std::size_t ampersand{html.find('&', pos)};
        if (ampersand == std::string_view::npos)
        {
            break;
        }
        // Copy the plain text in front of the candidate reference.
        output.append(html.data() + pos, ampersand - pos);

        const std::string_view window{
            html.substr(ampersand + 1, max_reference_length + 1)};
        const std::size_t semicolon{window.find(';')};

        bool decoded{false};
        if (semicolon != std::string_view::npos)
        {
            const std::string_view reference{window.substr(0, semicolon)};
            if (!reference.empty() && reference.front() == '#')
            {
                decoded = entity_append_numeric(output, reference.substr(1));
            }
            else if (const auto found{names.find(reference)};
                     found != names.end())
            {
                decoded = entity_append_utf8(output, found->second);
            }
        }

        if (decoded)
        {
            pos = ampersand + 1 + semicolon + 1; // Continue after the ';'.
        }
        else
        {
            // Not a reference we understand: keep the '&' and carry on right
            // behind it, so the following text is copied verbatim.
            output += '&';
            pos = ampersand + 1;
        }
    }

    // Append the rest of the text, after the last '&' that was handled.
    if (pos < html.size())
    {
        output.append(html.data() + pos, html.size() - pos);
    }

    return output;
}
|
||||
|
||||
} // namespace epubgrep::helpers
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
#define EPUBGREP_HELPERS_HPP
|
||||
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
namespace epubgrep::helpers
|
||||
{
|
||||
|
||||
|
@ -11,6 +12,9 @@ namespace epubgrep::helpers
|
|||
//! Decode percent-encoded text. Used for restricted characters in URLs.
|
||||
[[nodiscard]] std::string urldecode(std::string_view url);
|
||||
|
||||
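//! Un-escape HTML character references such as &amp;amp; and &amp;nbsp; (written here escaped: the entities for '&' and the no-break space).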
|
||||
[[nodiscard]] std::string unescape_html(std::string_view html);
|
||||
|
||||
} // namespace epubgrep::helpers
|
||||
|
||||
#endif // EPUBGREP_HELPERS_HPP
|
||||
|
|
|
@ -61,4 +61,51 @@ SCENARIO("Helpers work as intended")
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Checks that unescape_html() decodes named and numbered character
// references without throwing. Relies on the `exception` flag declared
// earlier in the enclosing SCENARIO.
SECTION("unescape_html() doesn't fail and returns the decoded text")
{
    GIVEN("A text with a named entity in it.")
    {
        // The input has to contain a real entity, otherwise the function
        // returns it unchanged and nothing is tested.
        std::string encoded_text{"Sleepy &amp; ready for bed"};
        std::string decoded_text{};

        try
        {
            decoded_text = epubgrep::helpers::unescape_html(encoded_text);
        }
        catch (const std::exception &)
        {
            exception = true;
        }

        THEN("No exception is thrown")
        AND_THEN("It returns the unescaped text")
        {
            REQUIRE_FALSE(exception);
            REQUIRE(decoded_text == "Sleepy & ready for bed");
        }
    }

    GIVEN("A text with numbered entities in it.")
    {
        // One decimal (&#38; is '&') and one hexadecimal (&#x62; is 'b')
        // reference.
        std::string encoded_text{"Sleepy &#38; ready for &#x62;ed"};
        std::string decoded_text{};

        try
        {
            decoded_text = epubgrep::helpers::unescape_html(encoded_text);
        }
        catch (const std::exception &)
        {
            exception = true;
        }

        THEN("No exception is thrown")
        AND_THEN("It returns the unescaped text")
        {
            REQUIRE_FALSE(exception);
            REQUIRE(decoded_text == "Sleepy & ready for bed");
        }
    }
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user