From f57ff976e5d2de220a60dafc92af28ed3d5223e3 Mon Sep 17 00:00:00 2001 From: tastytea Date: Mon, 30 Apr 2018 00:56:55 +0200 Subject: [PATCH] support for hexadecimal numbered entities, 1-8 digits --- CMakeLists.txt | 2 +- src/parse.cpp | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index deac81a..e8b23a3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required (VERSION 3.7) project (mastorss - VERSION 0.5.12 + VERSION 0.5.13 LANGUAGES CXX ) diff --git a/src/parse.cpp b/src/parse.cpp index b3fee77..0ef7567 100644 --- a/src/parse.cpp +++ b/src/parse.cpp @@ -39,13 +39,23 @@ void unescape_html(string &str) string html = str; str = ""; // Used to convert int to utf-8 char - std::wstring_convert, char16_t> u8c; - std::regex reentity("&#(\\d{2,4});"); + std::wstring_convert, char32_t> u8c; + std::regex re_entity("&#(x)?(\\d{1,8});"); std::smatch match; - while (std::regex_search(html, match, reentity)) + while (std::regex_search(html, match, re_entity)) { - str += match.prefix().str() + u8c.to_bytes(std::stoi(match[1].str())); + char32_t codepoint = 0; + // 'x' in front of the number means it's hexadecimal, else decimal. + if (match[1].length() == 1) + { + codepoint = std::stoi(match[2].str(), nullptr, 16); + } + else + { + codepoint = std::stoi(match[2].str(), nullptr, 10); + } + str += match.prefix().str() + u8c.to_bytes(codepoint); html = match.suffix().str(); } str += html;