Add support for hexadecimal numeric entities (1-8 digits)

This commit is contained in:
tastytea 2018-04-30 00:56:55 +02:00
parent b312e89c64
commit f57ff976e5
Signed by: tastytea
GPG Key ID: 59346E0EA35C67E5
2 changed files with 15 additions and 5 deletions

View File

@ -1,6 +1,6 @@
cmake_minimum_required (VERSION 3.7)
project (mastorss
VERSION 0.5.12
VERSION 0.5.13
LANGUAGES CXX
)

View File

@ -39,13 +39,23 @@ void unescape_html(string &str)
string html = str;
str = "";
// Used to convert int to utf-8 char
std::wstring_convert<std::codecvt_utf8<char16_t>, char16_t> u8c;
std::regex reentity("&#(\\d{2,4});");
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> u8c;
std::regex re_entity("&#(x)?(\\d{1,8});");
std::smatch match;
while (std::regex_search(html, match, reentity))
while (std::regex_search(html, match, re_entity))
{
str += match.prefix().str() + u8c.to_bytes(std::stoi(match[1].str()));
char32_t codepoint = 0;
// 'x' in front of the number means it's hexadecimal, else decimal.
if (match[1].length() == 1)
{
codepoint = std::stoi(match[2].str(), nullptr, 16);
}
else
{
codepoint = std::stoi(match[2].str(), nullptr, 10);
}
str += match.prefix().str() + u8c.to_bytes(codepoint);
html = match.suffix().str();
}
str += html;